From f0cd34061300dc501ace5261cb527a6deabe1000 Mon Sep 17 00:00:00 2001
From: Dou Liyang <douly.fnst@cn.fujitsu.com>
Date: Wed, 6 Sep 2017 16:18:32 -0700
Subject: [PATCH 001/119] metag/numa: remove the unused parent_node() macro

Commit a7be6e5a7f8d ("mm: drop useless local parameters of
__register_one_node()") removes the last user of parent_node().

The parent_node() macro in the metag architecture is therefore unused.

Remove it for cleanup.

Link: http://lkml.kernel.org/r/1501076076-1974-4-git-send-email-douly.fnst@cn.fujitsu.com
Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
Reported-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: James Hogan <james.hogan@imgtec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/metag/include/asm/topology.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/metag/include/asm/topology.h b/arch/metag/include/asm/topology.h
index e95f874ded1b..707c7f7b6bea 100644
--- a/arch/metag/include/asm/topology.h
+++ b/arch/metag/include/asm/topology.h
@@ -4,7 +4,6 @@
 #ifdef CONFIG_NUMA
 
 #define cpu_to_node(cpu)	((void)(cpu), 0)
-#define parent_node(node)	((void)(node), 0)
 
 #define cpumask_of_node(node)	((void)node, cpu_online_mask)
 

From b2770da6425406cf3f6d3fddbf9086b1db0106a1 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Wed, 6 Sep 2017 16:18:35 -0700
Subject: [PATCH 002/119] mm: add vm_insert_mixed_mkwrite()

When servicing mmap() reads from file holes the current DAX code
allocates a page cache page of all zeroes and places the struct page
pointer in the mapping->page_tree radix tree.  This has three major
drawbacks:

1) It consumes memory unnecessarily. For every 4k page that is read via
   a DAX mmap() over a hole, we allocate a new page cache page. This
   means that if you read 1GiB worth of pages, you end up using 1GiB of
   zeroed memory.

2) It is slower than using a common zero page because each page fault
   has more work to do. Instead of just inserting a common zero page we
   have to allocate a page cache page, zero it, and then insert it.

3) The fact that we had to check for both DAX exceptional entries and
   for page cache pages in the radix tree made the DAX code more
   complex.

This series solves these issues by following the lead of the DAX PMD
code and using a common 4k zero page instead.  This reduces memory usage
and decreases latencies for some workloads, and it simplifies the DAX
code, removing over 100 lines in total.

This patch (of 5):

To be able to use the common 4k zero page in DAX we need to have our PTE
fault path look more like our PMD fault path where a PTE entry can be
marked as dirty and writeable as it is first inserted rather than
waiting for a follow-up dax_pfn_mkwrite() => finish_mkwrite_fault()
call.

Right now we can rely on having a dax_pfn_mkwrite() call because we can
distinguish between these two cases in do_wp_page():

	case 1: 4k zero page => writable DAX storage
	case 2: read-only DAX storage => writeable DAX storage

This distinction is made via vm_normal_page().  vm_normal_page()
returns false for the common 4k zero page, though, just as it does for
DAX ptes.  Instead of special casing the DAX + 4k zero page case we will
simplify our DAX PTE page fault sequence so that it matches our DAX PMD
sequence, and get rid of the dax_pfn_mkwrite() helper.  We will instead
use dax_iomap_fault() to handle write-protection faults.

This means that insert_pfn() needs to follow the lead of
insert_pfn_pmd() and allow us to pass in a 'mkwrite' flag.  If 'mkwrite'
is set insert_pfn() will do the work that was previously done by
wp_page_reuse() as part of the dax_pfn_mkwrite() call path.
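
For illustration, a caller that wants a dirty, writeable PTE on write
faults can then pick between the two helpers based on the fault flags,
as the DAX conversion later in this series does in dax_insert_mapping():

	if (vmf->flags & FAULT_FLAG_WRITE)
		return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
	else
		return vm_insert_mixed(vma, vaddr, pfn);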

Link: http://lkml.kernel.org/r/20170724170616.25810-2-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  2 ++
 mm/memory.c        | 50 +++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c1f6c95f3496..eb5e4bc946cc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2294,6 +2294,8 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn, pgprot_t pgprot);
 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 			pfn_t pfn);
+int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
+			pfn_t pfn);
 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
 
 
diff --git a/mm/memory.c b/mm/memory.c
index 56e48e4593cb..71c0b6f98a62 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1676,7 +1676,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
 EXPORT_SYMBOL(vm_insert_page);
 
 static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
-			pfn_t pfn, pgprot_t prot)
+			pfn_t pfn, pgprot_t prot, bool mkwrite)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	int retval;
@@ -1688,14 +1688,35 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 	if (!pte)
 		goto out;
 	retval = -EBUSY;
-	if (!pte_none(*pte))
-		goto out_unlock;
+	if (!pte_none(*pte)) {
+		if (mkwrite) {
+			/*
+			 * For read faults on private mappings the PFN passed
+			 * in may not match the PFN we have mapped if the
+			 * mapped PFN is a writeable COW page.  In the mkwrite
+			 * case we are creating a writable PTE for a shared
+			 * mapping and we expect the PFNs to match.
+			 */
+			if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
+				goto out_unlock;
+			entry = *pte;
+			goto out_mkwrite;
+		} else
+			goto out_unlock;
+	}
 
 	/* Ok, finally just insert the thing.. */
 	if (pfn_t_devmap(pfn))
 		entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
 	else
 		entry = pte_mkspecial(pfn_t_pte(pfn, prot));
+
+out_mkwrite:
+	if (mkwrite) {
+		entry = pte_mkyoung(entry);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	}
+
 	set_pte_at(mm, addr, pte, entry);
 	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
 
@@ -1766,14 +1787,15 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
 
 	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
 
-	ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
+	ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
+			false);
 
 	return ret;
 }
 EXPORT_SYMBOL(vm_insert_pfn_prot);
 
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
-			pfn_t pfn)
+static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+			pfn_t pfn, bool mkwrite)
 {
 	pgprot_t pgprot = vma->vm_page_prot;
 
@@ -1802,10 +1824,24 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 		page = pfn_to_page(pfn_t_to_pfn(pfn));
 		return insert_page(vma, addr, page, pgprot);
 	}
-	return insert_pfn(vma, addr, pfn, pgprot);
+	return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
+}
+
+int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+			pfn_t pfn)
+{
+	return __vm_insert_mixed(vma, addr, pfn, false);
+
 }
 EXPORT_SYMBOL(vm_insert_mixed);
 
+int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
+			pfn_t pfn)
+{
+	return __vm_insert_mixed(vma, addr, pfn, true);
+}
+EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
+
 /*
  * maps a range of physical memory into the requested pages. the old
  * mappings are removed. any references to nonexistent pages results

From e30331ff05f689f8f2faeb51664299c4d7841f15 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Wed, 6 Sep 2017 16:18:39 -0700
Subject: [PATCH 003/119] dax: relocate some dax functions

dax_load_hole() will soon need to call dax_insert_mapping_entry(), so it
needs to be moved further down in dax.c, below that function's definition.

dax_wake_mapping_entry_waiter() will soon be removed from dax.h and be
made static to dax.c, so we need to move its definition above all its
callers.

Link: http://lkml.kernel.org/r/20170724170616.25810-3-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c | 138 +++++++++++++++++++++++++++----------------------------
 1 file changed, 69 insertions(+), 69 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index ab925dc6647a..b8882b5ce6ed 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -120,6 +120,31 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo
 	return autoremove_wake_function(wait, mode, sync, NULL);
 }
 
+/*
+ * We do not necessarily hold the mapping->tree_lock when we call this
+ * function so it is possible that 'entry' is no longer a valid item in the
+ * radix tree.  This is okay because all we really need to do is to find the
+ * correct waitqueue where tasks might be waiting for that old 'entry' and
+ * wake them.
+ */
+void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+		pgoff_t index, void *entry, bool wake_all)
+{
+	struct exceptional_entry_key key;
+	wait_queue_head_t *wq;
+
+	wq = dax_entry_waitqueue(mapping, index, entry, &key);
+
+	/*
+	 * Checking for locked entry and prepare_to_wait_exclusive() happens
+	 * under mapping->tree_lock, ditto for entry handling in our callers.
+	 * So at this point all tasks that could have seen our entry locked
+	 * must be in the waitqueue and the following check will see them.
+	 */
+	if (waitqueue_active(wq))
+		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
+}
+
 /*
  * Check whether the given slot is locked. The function must be called with
  * mapping->tree_lock held
@@ -392,31 +417,6 @@ restart:
 	return entry;
 }
 
-/*
- * We do not necessarily hold the mapping->tree_lock when we call this
- * function so it is possible that 'entry' is no longer a valid item in the
- * radix tree.  This is okay because all we really need to do is to find the
- * correct waitqueue where tasks might be waiting for that old 'entry' and
- * wake them.
- */
-void dax_wake_mapping_entry_waiter(struct address_space *mapping,
-		pgoff_t index, void *entry, bool wake_all)
-{
-	struct exceptional_entry_key key;
-	wait_queue_head_t *wq;
-
-	wq = dax_entry_waitqueue(mapping, index, entry, &key);
-
-	/*
-	 * Checking for locked entry and prepare_to_wait_exclusive() happens
-	 * under mapping->tree_lock, ditto for entry handling in our callers.
-	 * So at this point all tasks that could have seen our entry locked
-	 * must be in the waitqueue and the following check will see them.
-	 */
-	if (waitqueue_active(wq))
-		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
-}
-
 static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 					  pgoff_t index, bool trunc)
 {
@@ -468,50 +468,6 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 	return __dax_invalidate_mapping_entry(mapping, index, false);
 }
 
-/*
- * The user has performed a load from a hole in the file.  Allocating
- * a new page in the file would cause excessive storage usage for
- * workloads with sparse files.  We allocate a page cache page instead.
- * We'll kick it out of the page cache if it's ever written to,
- * otherwise it will simply fall out of the page cache under memory
- * pressure without ever having been dirtied.
- */
-static int dax_load_hole(struct address_space *mapping, void **entry,
-			 struct vm_fault *vmf)
-{
-	struct inode *inode = mapping->host;
-	struct page *page;
-	int ret;
-
-	/* Hole page already exists? Return it...  */
-	if (!radix_tree_exceptional_entry(*entry)) {
-		page = *entry;
-		goto finish_fault;
-	}
-
-	/* This will replace locked radix tree entry with a hole page */
-	page = find_or_create_page(mapping, vmf->pgoff,
-				   vmf->gfp_mask | __GFP_ZERO);
-	if (!page) {
-		ret = VM_FAULT_OOM;
-		goto out;
-	}
-
-finish_fault:
-	vmf->page = page;
-	ret = finish_fault(vmf);
-	vmf->page = NULL;
-	*entry = page;
-	if (!ret) {
-		/* Grab reference for PTE that is now referencing the page */
-		get_page(page);
-		ret = VM_FAULT_NOPAGE;
-	}
-out:
-	trace_dax_load_hole(inode, vmf, ret);
-	return ret;
-}
-
 static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
 		sector_t sector, size_t size, struct page *to,
 		unsigned long vaddr)
@@ -941,6 +897,50 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
 
+/*
+ * The user has performed a load from a hole in the file.  Allocating
+ * a new page in the file would cause excessive storage usage for
+ * workloads with sparse files.  We allocate a page cache page instead.
+ * We'll kick it out of the page cache if it's ever written to,
+ * otherwise it will simply fall out of the page cache under memory
+ * pressure without ever having been dirtied.
+ */
+static int dax_load_hole(struct address_space *mapping, void **entry,
+			 struct vm_fault *vmf)
+{
+	struct inode *inode = mapping->host;
+	struct page *page;
+	int ret;
+
+	/* Hole page already exists? Return it...  */
+	if (!radix_tree_exceptional_entry(*entry)) {
+		page = *entry;
+		goto finish_fault;
+	}
+
+	/* This will replace locked radix tree entry with a hole page */
+	page = find_or_create_page(mapping, vmf->pgoff,
+				   vmf->gfp_mask | __GFP_ZERO);
+	if (!page) {
+		ret = VM_FAULT_OOM;
+		goto out;
+	}
+
+finish_fault:
+	vmf->page = page;
+	ret = finish_fault(vmf);
+	vmf->page = NULL;
+	*entry = page;
+	if (!ret) {
+		/* Grab reference for PTE that is now referencing the page */
+		get_page(page);
+		ret = VM_FAULT_NOPAGE;
+	}
+out:
+	trace_dax_load_hole(inode, vmf, ret);
+	return ret;
+}
+
 static bool dax_range_is_aligned(struct block_device *bdev,
 				 unsigned int offset, unsigned int length)
 {

From 91d25ba8a6b0d810dc844cebeedc53029118ce3e Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Wed, 6 Sep 2017 16:18:43 -0700
Subject: [PATCH 004/119] dax: use common 4k zero page for dax mmap reads

When servicing mmap() reads from file holes the current DAX code
allocates a page cache page of all zeroes and places the struct page
pointer in the mapping->page_tree radix tree.

This has three major drawbacks:

1) It consumes memory unnecessarily. For every 4k page that is read via
   a DAX mmap() over a hole, we allocate a new page cache page. This
   means that if you read 1GiB worth of pages, you end up using 1GiB of
   zeroed memory. This is easily visible by looking at the overall
   memory consumption of the system or by looking at /proc/[pid]/smaps:

	7f62e72b3000-7f63272b3000 rw-s 00000000 103:00 12   /root/dax/data
	Size:            1048576 kB
	Rss:             1048576 kB
	Pss:             1048576 kB
	Shared_Clean:          0 kB
	Shared_Dirty:          0 kB
	Private_Clean:   1048576 kB
	Private_Dirty:         0 kB
	Referenced:      1048576 kB
	Anonymous:             0 kB
	LazyFree:              0 kB
	AnonHugePages:         0 kB
	ShmemPmdMapped:        0 kB
	Shared_Hugetlb:        0 kB
	Private_Hugetlb:       0 kB
	Swap:                  0 kB
	SwapPss:               0 kB
	KernelPageSize:        4 kB
	MMUPageSize:           4 kB
	Locked:                0 kB

2) It is slower than using a common zero page because each page fault
   has more work to do. Instead of just inserting a common zero page we
   have to allocate a page cache page, zero it, and then insert it. Here
   are the average latencies of dax_load_hole() as measured by ftrace on
   a random test box:

    Old method, using zeroed page cache pages:	3.4 us
    New method, using the common 4k zero page:	0.8 us

   This was the average latency over 1 GiB of sequential reads done by
   this simple fio script:

     [global]
     size=1G
     filename=/root/dax/data
     fallocate=none
     [io]
     rw=read
     ioengine=mmap

3) The fact that we had to check for both DAX exceptional entries and
   for page cache pages in the radix tree made the DAX code more
   complex.

Solve these issues by following the lead of the DAX PMD code and using a
common 4k zero page instead.  As with the PMD code we will now insert a
DAX exceptional entry into the radix tree instead of a struct page
pointer which allows us to remove all the special casing in the DAX
code.

Note that we do still pretty aggressively check for regular pages in the
DAX radix tree, especially where we take action based on the bits set in
the page.  If we ever find a regular page in our radix tree now, that
most likely means that someone besides DAX is inserting pages (which has
happened lots of times in the past), and we want to find that out early
and fail loudly.

This solution also removes the extra memory consumption.  Here is that
same /proc/[pid]/smaps after 1GiB of reading from a hole with the new
code:

	7f2054a74000-7f2094a74000 rw-s 00000000 103:00 12   /root/dax/data
	Size:            1048576 kB
	Rss:                   0 kB
	Pss:                   0 kB
	Shared_Clean:          0 kB
	Shared_Dirty:          0 kB
	Private_Clean:         0 kB
	Private_Dirty:         0 kB
	Referenced:            0 kB
	Anonymous:             0 kB
	LazyFree:              0 kB
	AnonHugePages:         0 kB
	ShmemPmdMapped:        0 kB
	Shared_Hugetlb:        0 kB
	Private_Hugetlb:       0 kB
	Swap:                  0 kB
	SwapPss:               0 kB
	KernelPageSize:        4 kB
	MMUPageSize:           4 kB
	Locked:                0 kB

Overall system memory consumption is similarly improved.

Another major change is that we remove dax_pfn_mkwrite() from our fault
flow, and instead rely on the page fault itself to make the PTE dirty
and writeable.  The following description from the patch adding the
vm_insert_mixed_mkwrite() call explains this a little more:

   "To be able to use the common 4k zero page in DAX we need to have our
    PTE fault path look more like our PMD fault path where a PTE entry
    can be marked as dirty and writeable as it is first inserted rather
    than waiting for a follow-up dax_pfn_mkwrite() =>
    finish_mkwrite_fault() call.

    Right now we can rely on having a dax_pfn_mkwrite() call because we
    can distinguish between these two cases in do_wp_page():

            case 1: 4k zero page => writable DAX storage
            case 2: read-only DAX storage => writeable DAX storage

    This distinction is made via vm_normal_page(). vm_normal_page()
    returns false for the common 4k zero page, though, just as it does
    for DAX ptes. Instead of special casing the DAX + 4k zero page case
    we will simplify our DAX PTE page fault sequence so that it matches
    our DAX PMD sequence, and get rid of the dax_pfn_mkwrite() helper.
    We will instead use dax_iomap_fault() to handle write-protection
    faults.

    This means that insert_pfn() needs to follow the lead of
    insert_pfn_pmd() and allow us to pass in a 'mkwrite' flag. If
    'mkwrite' is set insert_pfn() will do the work that was previously
    done by wp_page_reuse() as part of the dax_pfn_mkwrite() call path"
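
As a concrete sketch of the result (error handling trimmed), the
read-from-hole path in dax_load_hole() now just inserts a
RADIX_DAX_ZERO_PAGE radix tree entry and maps the shared zero page
read-only, as in the fs/dax.c hunk below:

	zero_page = ZERO_PAGE(0);
	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
			RADIX_DAX_ZERO_PAGE);
	vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));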

Link: http://lkml.kernel.org/r/20170724170616.25810-4-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/dax.txt |   5 +-
 fs/dax.c                          | 243 ++++++++++--------------------
 fs/ext2/file.c                    |  25 +--
 fs/ext4/file.c                    |  32 +---
 fs/xfs/xfs_file.c                 |   2 +-
 include/linux/dax.h               |  12 +-
 include/trace/events/fs_dax.h     |   2 -
 7 files changed, 86 insertions(+), 235 deletions(-)

diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index a7e6e14aeb08..3be3b266be41 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -63,9 +63,8 @@ Filesystem support consists of
 - implementing an mmap file operation for DAX files which sets the
   VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
   include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These
-  handlers should probably call dax_iomap_fault() (for fault and page_mkwrite
-  handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate
-  iomap operations.
+  handlers should probably call dax_iomap_fault() passing the appropriate
+  fault size and iomap operations.
 - calling iomap_zero_range() passing appropriate iomap operations instead of
   block_truncate_page() for DAX files
 - ensuring that there is sufficient locking between reads, writes,
diff --git a/fs/dax.c b/fs/dax.c
index b8882b5ce6ed..ab67ae30ccbf 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -66,7 +66,7 @@ static int dax_is_pte_entry(void *entry)
 
 static int dax_is_zero_entry(void *entry)
 {
-	return (unsigned long)entry & RADIX_DAX_HZP;
+	return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
 }
 
 static int dax_is_empty_entry(void *entry)
@@ -206,7 +206,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
 	for (;;) {
 		entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
 					  &slot);
-		if (!entry || !radix_tree_exceptional_entry(entry) ||
+		if (!entry ||
+		    WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
 		    !slot_locked(mapping, slot)) {
 			if (slotp)
 				*slotp = slot;
@@ -241,14 +242,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping,
 }
 
 static void put_locked_mapping_entry(struct address_space *mapping,
-				     pgoff_t index, void *entry)
+		pgoff_t index)
 {
-	if (!radix_tree_exceptional_entry(entry)) {
-		unlock_page(entry);
-		put_page(entry);
-	} else {
-		dax_unlock_mapping_entry(mapping, index);
-	}
+	dax_unlock_mapping_entry(mapping, index);
 }
 
 /*
@@ -258,7 +254,7 @@ static void put_locked_mapping_entry(struct address_space *mapping,
 static void put_unlocked_mapping_entry(struct address_space *mapping,
 				       pgoff_t index, void *entry)
 {
-	if (!radix_tree_exceptional_entry(entry))
+	if (!entry)
 		return;
 
 	/* We have to wake up next waiter for the radix tree entry lock */
@@ -266,15 +262,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
 }
 
 /*
- * Find radix tree entry at given index. If it points to a page, return with
- * the page locked. If it points to the exceptional entry, return with the
- * radix tree entry locked. If the radix tree doesn't contain given index,
- * create empty exceptional entry for the index and return with it locked.
+ * Find radix tree entry at given index. If it points to an exceptional entry,
+ * return it with the radix tree entry locked. If the radix tree doesn't
+ * contain given index, create an empty exceptional entry for the index and
+ * return with it locked.
  *
  * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
  * either return that locked entry or will return an error.  This error will
- * happen if there are any 4k entries (either zero pages or DAX entries)
- * within the 2MiB range that we are requesting.
+ * happen if there are any 4k entries within the 2MiB range that we are
+ * requesting.
  *
  * We always favor 4k entries over 2MiB entries. There isn't a flow where we
  * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
@@ -301,18 +297,21 @@ restart:
 	spin_lock_irq(&mapping->tree_lock);
 	entry = get_unlocked_mapping_entry(mapping, index, &slot);
 
+	if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
+		entry = ERR_PTR(-EIO);
+		goto out_unlock;
+	}
+
 	if (entry) {
 		if (size_flag & RADIX_DAX_PMD) {
-			if (!radix_tree_exceptional_entry(entry) ||
-			    dax_is_pte_entry(entry)) {
+			if (dax_is_pte_entry(entry)) {
 				put_unlocked_mapping_entry(mapping, index,
 						entry);
 				entry = ERR_PTR(-EEXIST);
 				goto out_unlock;
 			}
 		} else { /* trying to grab a PTE entry */
-			if (radix_tree_exceptional_entry(entry) &&
-			    dax_is_pmd_entry(entry) &&
+			if (dax_is_pmd_entry(entry) &&
 			    (dax_is_zero_entry(entry) ||
 			     dax_is_empty_entry(entry))) {
 				pmd_downgrade = true;
@@ -346,7 +345,7 @@ restart:
 				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
 		if (err) {
 			if (pmd_downgrade)
-				put_locked_mapping_entry(mapping, index, entry);
+				put_locked_mapping_entry(mapping, index);
 			return ERR_PTR(err);
 		}
 		spin_lock_irq(&mapping->tree_lock);
@@ -396,21 +395,6 @@ restart:
 		spin_unlock_irq(&mapping->tree_lock);
 		return entry;
 	}
-	/* Normal page in radix tree? */
-	if (!radix_tree_exceptional_entry(entry)) {
-		struct page *page = entry;
-
-		get_page(page);
-		spin_unlock_irq(&mapping->tree_lock);
-		lock_page(page);
-		/* Page got truncated? Retry... */
-		if (unlikely(page->mapping != mapping)) {
-			unlock_page(page);
-			put_page(page);
-			goto restart;
-		}
-		return page;
-	}
 	entry = lock_slot(mapping, slot);
  out_unlock:
 	spin_unlock_irq(&mapping->tree_lock);
@@ -426,7 +410,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 
 	spin_lock_irq(&mapping->tree_lock);
 	entry = get_unlocked_mapping_entry(mapping, index, NULL);
-	if (!entry || !radix_tree_exceptional_entry(entry))
+	if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
 		goto out;
 	if (!trunc &&
 	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
@@ -508,47 +492,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 				      unsigned long flags)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	int error = 0;
-	bool hole_fill = false;
 	void *new_entry;
 	pgoff_t index = vmf->pgoff;
 
 	if (vmf->flags & FAULT_FLAG_WRITE)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
-	/* Replacing hole page with block mapping? */
-	if (!radix_tree_exceptional_entry(entry)) {
-		hole_fill = true;
-		/*
-		 * Unmap the page now before we remove it from page cache below.
-		 * The page is locked so it cannot be faulted in again.
-		 */
-		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
-				    PAGE_SIZE, 0);
-		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
-		if (error)
-			return ERR_PTR(error);
-	} else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
-		/* replacing huge zero page with PMD block mapping */
-		unmap_mapping_range(mapping,
-			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
+	if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
+		/* we are replacing a zero page with block mapping */
+		if (dax_is_pmd_entry(entry))
+			unmap_mapping_range(mapping,
+					(vmf->pgoff << PAGE_SHIFT) & PMD_MASK,
+					PMD_SIZE, 0);
+		else /* pte entry */
+			unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+					PAGE_SIZE, 0);
 	}
 
 	spin_lock_irq(&mapping->tree_lock);
 	new_entry = dax_radix_locked_entry(sector, flags);
 
-	if (hole_fill) {
-		__delete_from_page_cache(entry, NULL);
-		/* Drop pagecache reference */
-		put_page(entry);
-		error = __radix_tree_insert(page_tree, index,
-				dax_radix_order(new_entry), new_entry);
-		if (error) {
-			new_entry = ERR_PTR(error);
-			goto unlock;
-		}
-		mapping->nrexceptional++;
-	} else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
 		/*
 		 * Only swap our new entry into the radix tree if the current
 		 * entry is a zero page or an empty entry.  If a normal PTE or
@@ -565,23 +529,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 		WARN_ON_ONCE(ret != entry);
 		__radix_tree_replace(page_tree, node, slot,
 				     new_entry, NULL, NULL);
+		entry = new_entry;
 	}
+
 	if (vmf->flags & FAULT_FLAG_WRITE)
 		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
- unlock:
+
 	spin_unlock_irq(&mapping->tree_lock);
-	if (hole_fill) {
-		radix_tree_preload_end();
-		/*
-		 * We don't need hole page anymore, it has been replaced with
-		 * locked radix tree entry now.
-		 */
-		if (mapping->a_ops->freepage)
-			mapping->a_ops->freepage(entry);
-		unlock_page(entry);
-		put_page(entry);
-	}
-	return new_entry;
+	return entry;
 }
 
 static inline unsigned long
@@ -683,7 +638,7 @@ static int dax_writeback_one(struct block_device *bdev,
 	spin_lock_irq(&mapping->tree_lock);
 	entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
 	/* Entry got punched out / reallocated? */
-	if (!entry2 || !radix_tree_exceptional_entry(entry2))
+	if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
 		goto put_unlocked;
 	/*
 	 * Entry got reallocated elsewhere? No need to writeback. We have to
@@ -755,7 +710,7 @@ static int dax_writeback_one(struct block_device *bdev,
 	trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
  dax_unlock:
 	dax_read_unlock(id);
-	put_locked_mapping_entry(mapping, index, entry);
+	put_locked_mapping_entry(mapping, index);
 	return ret;
 
  put_unlocked:
@@ -830,11 +785,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
 static int dax_insert_mapping(struct address_space *mapping,
 		struct block_device *bdev, struct dax_device *dax_dev,
-		sector_t sector, size_t size, void **entryp,
+		sector_t sector, size_t size, void *entry,
 		struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	unsigned long vaddr = vmf->address;
-	void *entry = *entryp;
 	void *ret, *kaddr;
 	pgoff_t pgoff;
 	int id, rc;
@@ -855,87 +809,44 @@ static int dax_insert_mapping(struct address_space *mapping,
 	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
 	if (IS_ERR(ret))
 		return PTR_ERR(ret);
-	*entryp = ret;
 
 	trace_dax_insert_mapping(mapping->host, vmf, ret);
-	return vm_insert_mixed(vma, vaddr, pfn);
+	if (vmf->flags & FAULT_FLAG_WRITE)
+		return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+	else
+		return vm_insert_mixed(vma, vaddr, pfn);
 }
 
-/**
- * dax_pfn_mkwrite - handle first write to DAX page
- * @vmf: The description of the fault
- */
-int dax_pfn_mkwrite(struct vm_fault *vmf)
-{
-	struct file *file = vmf->vma->vm_file;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	void *entry, **slot;
-	pgoff_t index = vmf->pgoff;
-
-	spin_lock_irq(&mapping->tree_lock);
-	entry = get_unlocked_mapping_entry(mapping, index, &slot);
-	if (!entry || !radix_tree_exceptional_entry(entry)) {
-		if (entry)
-			put_unlocked_mapping_entry(mapping, index, entry);
-		spin_unlock_irq(&mapping->tree_lock);
-		trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
-		return VM_FAULT_NOPAGE;
-	}
-	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
-	entry = lock_slot(mapping, slot);
-	spin_unlock_irq(&mapping->tree_lock);
-	/*
-	 * If we race with somebody updating the PTE and finish_mkwrite_fault()
-	 * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
-	 * the fault in either case.
-	 */
-	finish_mkwrite_fault(vmf);
-	put_locked_mapping_entry(mapping, index, entry);
-	trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE);
-	return VM_FAULT_NOPAGE;
-}
-EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
-
 /*
- * The user has performed a load from a hole in the file.  Allocating
- * a new page in the file would cause excessive storage usage for
- * workloads with sparse files.  We allocate a page cache page instead.
- * We'll kick it out of the page cache if it's ever written to,
- * otherwise it will simply fall out of the page cache under memory
- * pressure without ever having been dirtied.
+ * The user has performed a load from a hole in the file.  Allocating a new
+ * page in the file would cause excessive storage usage for workloads with
+ * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
+ * If this page is ever written to we will re-fault and change the mapping to
+ * point to real DAX storage instead.
  */
-static int dax_load_hole(struct address_space *mapping, void **entry,
+static int dax_load_hole(struct address_space *mapping, void *entry,
 			 struct vm_fault *vmf)
 {
 	struct inode *inode = mapping->host;
-	struct page *page;
-	int ret;
+	unsigned long vaddr = vmf->address;
+	int ret = VM_FAULT_NOPAGE;
+	struct page *zero_page;
+	void *entry2;
 
-	/* Hole page already exists? Return it...  */
-	if (!radix_tree_exceptional_entry(*entry)) {
-		page = *entry;
-		goto finish_fault;
-	}
-
-	/* This will replace locked radix tree entry with a hole page */
-	page = find_or_create_page(mapping, vmf->pgoff,
-				   vmf->gfp_mask | __GFP_ZERO);
-	if (!page) {
+	zero_page = ZERO_PAGE(0);
+	if (unlikely(!zero_page)) {
 		ret = VM_FAULT_OOM;
 		goto out;
 	}
 
-finish_fault:
-	vmf->page = page;
-	ret = finish_fault(vmf);
-	vmf->page = NULL;
-	*entry = page;
-	if (!ret) {
-		/* Grab reference for PTE that is now referencing the page */
-		get_page(page);
-		ret = VM_FAULT_NOPAGE;
+	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+			RADIX_DAX_ZERO_PAGE);
+	if (IS_ERR(entry2)) {
+		ret = VM_FAULT_SIGBUS;
+		goto out;
 	}
+
+	vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
 out:
 	trace_dax_load_hole(inode, vmf, ret);
 	return ret;
@@ -1223,7 +1134,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 			major = VM_FAULT_MAJOR;
 		}
 		error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
-				sector, PAGE_SIZE, &entry, vmf->vma, vmf);
+				sector, PAGE_SIZE, entry, vmf->vma, vmf);
 		/* -EBUSY is fine, somebody else faulted on the same PTE */
 		if (error == -EBUSY)
 			error = 0;
@@ -1231,7 +1142,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
-			vmf_ret = dax_load_hole(mapping, &entry, vmf);
+			vmf_ret = dax_load_hole(mapping, entry, vmf);
 			goto finish_iomap;
 		}
 		/*FALLTHRU*/
@@ -1258,7 +1169,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
 	}
  unlock_entry:
-	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+	put_locked_mapping_entry(mapping, vmf->pgoff);
  out:
 	trace_dax_pte_fault_done(inode, vmf, vmf_ret);
 	return vmf_ret;
@@ -1272,7 +1183,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
 
 static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
-		loff_t pos, void **entryp)
+		loff_t pos, void *entry)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	const sector_t sector = dax_iomap_sector(iomap, pos);
@@ -1303,11 +1214,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
 		goto unlock_fallback;
 	dax_read_unlock(id);
 
-	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
+	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
 			RADIX_DAX_PMD);
 	if (IS_ERR(ret))
 		goto fallback;
-	*entryp = ret;
 
 	trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
 	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
@@ -1321,7 +1231,7 @@ fallback:
 }
 
 static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
-		void **entryp)
+		void *entry)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
@@ -1336,11 +1246,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 	if (unlikely(!zero_page))
 		goto fallback;
 
-	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
-			RADIX_DAX_PMD | RADIX_DAX_HZP);
+	ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
 	if (IS_ERR(ret))
 		goto fallback;
-	*entryp = ret;
 
 	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
 	if (!pmd_none(*(vmf->pmd))) {
@@ -1416,10 +1325,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 		goto fallback;
 
 	/*
-	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
-	 * the tree, for instance), it will return -EEXIST and we just fall
-	 * back to 4k entries.
+	 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
+	 * 2MiB zero page entry or a DAX PMD.  If it can't (because a 4k page
+	 * is already in the tree, for instance), it will return -EEXIST and
+	 * we just fall back to 4k entries.
 	 */
 	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
 	if (IS_ERR(entry))
@@ -1452,13 +1361,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
-		result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
+		result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
 			break;
-		result = dax_pmd_load_hole(vmf, &iomap, &entry);
+		result = dax_pmd_load_hole(vmf, &iomap, entry);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -1481,7 +1390,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 				&iomap);
 	}
  unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff, entry);
+	put_locked_mapping_entry(mapping, pgoff);
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, vmf->pmd, vmf->address);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index d34d32bdc944..ff3a3636a5ca 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -107,29 +107,6 @@ static int ext2_dax_fault(struct vm_fault *vmf)
 	return ret;
 }
 
-static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf)
-{
-	struct inode *inode = file_inode(vmf->vma->vm_file);
-	struct ext2_inode_info *ei = EXT2_I(inode);
-	loff_t size;
-	int ret;
-
-	sb_start_pagefault(inode->i_sb);
-	file_update_time(vmf->vma->vm_file);
-	down_read(&ei->dax_sem);
-
-	/* check that the faulting page hasn't raced with truncate */
-	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (vmf->pgoff >= size)
-		ret = VM_FAULT_SIGBUS;
-	else
-		ret = dax_pfn_mkwrite(vmf);
-
-	up_read(&ei->dax_sem);
-	sb_end_pagefault(inode->i_sb);
-	return ret;
-}
-
 static const struct vm_operations_struct ext2_dax_vm_ops = {
 	.fault		= ext2_dax_fault,
 	/*
@@ -138,7 +115,7 @@ static const struct vm_operations_struct ext2_dax_vm_ops = {
 	 * will always fail and fail back to regular faults.
 	 */
 	.page_mkwrite	= ext2_dax_fault,
-	.pfn_mkwrite	= ext2_dax_pfn_mkwrite,
+	.pfn_mkwrite	= ext2_dax_fault,
 };
 
 static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 0d7cf0cc9b87..f28ac999dfba 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -311,41 +311,11 @@ static int ext4_dax_fault(struct vm_fault *vmf)
 	return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
 }
 
-/*
- * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
- * handler we check for races agaist truncate. Note that since we cycle through
- * i_mmap_sem, we are sure that also any hole punching that began before we
- * were called is finished by now and so if it included part of the file we
- * are working on, our pte will get unmapped and the check for pte_same() in
- * wp_pfn_shared() fails. Thus fault gets retried and things work out as
- * desired.
- */
-static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
-{
-	struct inode *inode = file_inode(vmf->vma->vm_file);
-	struct super_block *sb = inode->i_sb;
-	loff_t size;
-	int ret;
-
-	sb_start_pagefault(sb);
-	file_update_time(vmf->vma->vm_file);
-	down_read(&EXT4_I(inode)->i_mmap_sem);
-	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (vmf->pgoff >= size)
-		ret = VM_FAULT_SIGBUS;
-	else
-		ret = dax_pfn_mkwrite(vmf);
-	up_read(&EXT4_I(inode)->i_mmap_sem);
-	sb_end_pagefault(sb);
-
-	return ret;
-}
-
 static const struct vm_operations_struct ext4_dax_vm_ops = {
 	.fault		= ext4_dax_fault,
 	.huge_fault	= ext4_dax_huge_fault,
 	.page_mkwrite	= ext4_dax_fault,
-	.pfn_mkwrite	= ext4_dax_pfn_mkwrite,
+	.pfn_mkwrite	= ext4_dax_fault,
 };
 #else
 #define ext4_dax_vm_ops	ext4_file_vm_ops
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c4893e226fd8..62db8ffa83b9 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1130,7 +1130,7 @@ xfs_filemap_pfn_mkwrite(
 	if (vmf->pgoff >= size)
 		ret = VM_FAULT_SIGBUS;
 	else if (IS_DAX(inode))
-		ret = dax_pfn_mkwrite(vmf);
+		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
 	sb_end_pagefault(inode->i_sb);
 	return ret;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index df97b7af7e2c..b3518559f0da 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -91,18 +91,17 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev);
 
 /*
  * We use lowest available bit in exceptional entry for locking, one bit for
- * the entry size (PMD) and two more to tell us if the entry is a huge zero
- * page (HZP) or an empty entry that is just used for locking.  In total four
- * special bits.
+ * the entry size (PMD) and two more to tell us if the entry is a zero page or
+ * an empty entry that is just used for locking.  In total four special bits.
  *
- * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and
- * EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
+ * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
+ * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
  * block allocation.
  */
 #define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
 #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
 #define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
-#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
 #define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
 
 static inline unsigned long dax_radix_sector(void *entry)
@@ -153,7 +152,6 @@ static inline unsigned int dax_radix_order(void *entry)
 	return 0;
 }
 #endif
-int dax_pfn_mkwrite(struct vm_fault *vmf);
 
 static inline bool dax_mapping(struct address_space *mapping)
 {
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index 08bb3ed18dcc..fbc4a06f7310 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -190,8 +190,6 @@ DEFINE_EVENT(dax_pte_fault_class, name, \
 
 DEFINE_PTE_FAULT_EVENT(dax_pte_fault);
 DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done);
-DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry);
-DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite);
 DEFINE_PTE_FAULT_EVENT(dax_load_hole);
 
 TRACE_EVENT(dax_insert_mapping,

From d01ad197ac3b50a99ea668697acefe12e73c5fea Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Wed, 6 Sep 2017 16:18:47 -0700
Subject: [PATCH 005/119] dax: remove DAX code from page_cache_tree_insert()

Now that we no longer insert struct page pointers in DAX radix trees we
can remove the special casing for DAX in page_cache_tree_insert().

This also allows us to make dax_wake_mapping_entry_waiter() local to
fs/dax.c, removing it from dax.h.

Link: http://lkml.kernel.org/r/20170724170616.25810-5-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Suggested-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c            |  2 +-
 include/linux/dax.h |  2 --
 mm/filemap.c        | 13 ++-----------
 3 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index ab67ae30ccbf..8f70e3b59b91 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -127,7 +127,7 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo
  * correct waitqueue where tasks might be waiting for that old 'entry' and
  * wake them.
  */
-void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 		pgoff_t index, void *entry, bool wake_all)
 {
 	struct exceptional_entry_key key;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index b3518559f0da..319e0d997446 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -123,8 +123,6 @@ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 				      pgoff_t index);
-void dax_wake_mapping_entry_waiter(struct address_space *mapping,
-		pgoff_t index, void *entry, bool wake_all);
 
 #ifdef CONFIG_FS_DAX
 int __dax_zero_page_range(struct block_device *bdev,
diff --git a/mm/filemap.c b/mm/filemap.c
index 65b4b6e7f7bd..dad935769055 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -130,17 +130,8 @@ static int page_cache_tree_insert(struct address_space *mapping,
 			return -EEXIST;
 
 		mapping->nrexceptional--;
-		if (!dax_mapping(mapping)) {
-			if (shadowp)
-				*shadowp = p;
-		} else {
-			/* DAX can replace empty locked entry with a hole */
-			WARN_ON_ONCE(p !=
-				dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
-			/* Wakeup waiters for exceptional entry lock */
-			dax_wake_mapping_entry_waiter(mapping, page->index, p,
-						      true);
-		}
+		if (shadowp)
+			*shadowp = p;
 	}
 	__radix_tree_replace(&mapping->page_tree, node, slot, page,
 			     workingset_update_node, mapping);

From 527b19d0808e75fbba896beb2435c2b4d6bcd32a Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Wed, 6 Sep 2017 16:18:51 -0700
Subject: [PATCH 006/119] dax: move all DAX radix tree defs to fs/dax.c

Now that we no longer insert struct page pointers in DAX radix trees the
page cache code no longer needs to know anything about DAX exceptional
entries.  Move all the DAX exceptional entry definitions from dax.h to
fs/dax.c.
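
For reference, the now-private helpers encode entries as before; for
example, dax_insert_mapping_entry() builds the locked entry for the PMD
zero-page path roughly as:

	/* locked 2MiB zero-page entry for sector 0 */
	entry = dax_radix_locked_entry(0, RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);

	/* radix tree insertion order: PMD_SHIFT - PAGE_SHIFT (9 with 4k pages) */
	order = dax_radix_order(entry);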

Link: http://lkml.kernel.org/r/20170724170616.25810-6-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Suggested-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c            | 34 ++++++++++++++++++++++++++++++++++
 include/linux/dax.h | 41 -----------------------------------------
 2 files changed, 34 insertions(+), 41 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 8f70e3b59b91..c576f6181dc8 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -54,6 +54,40 @@ static int __init init_dax_wait_table(void)
 }
 fs_initcall(init_dax_wait_table);
 
+/*
+ * We use lowest available bit in exceptional entry for locking, one bit for
+ * the entry size (PMD) and two more to tell us if the entry is a zero page or
+ * an empty entry that is just used for locking.  In total four special bits.
+ *
+ * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
+ * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
+ * block allocation.
+ */
+#define RADIX_DAX_SHIFT		(RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
+#define RADIX_DAX_ENTRY_LOCK	(1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
+#define RADIX_DAX_PMD		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
+#define RADIX_DAX_ZERO_PAGE	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+#define RADIX_DAX_EMPTY		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
+
+static unsigned long dax_radix_sector(void *entry)
+{
+	return (unsigned long)entry >> RADIX_DAX_SHIFT;
+}
+
+static void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
+{
+	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
+			((unsigned long)sector << RADIX_DAX_SHIFT) |
+			RADIX_DAX_ENTRY_LOCK);
+}
+
+static unsigned int dax_radix_order(void *entry)
+{
+	if ((unsigned long)entry & RADIX_DAX_PMD)
+		return PMD_SHIFT - PAGE_SHIFT;
+	return 0;
+}
+
 static int dax_is_pmd_entry(void *entry)
 {
 	return (unsigned long)entry & RADIX_DAX_PMD;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 319e0d997446..eb0bff6f1eab 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -89,33 +89,6 @@ void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
 void dax_write_cache(struct dax_device *dax_dev, bool wc);
 bool dax_write_cache_enabled(struct dax_device *dax_dev);
 
-/*
- * We use lowest available bit in exceptional entry for locking, one bit for
- * the entry size (PMD) and two more to tell us if the entry is a zero page or
- * an empty entry that is just used for locking.  In total four special bits.
- *
- * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
- * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
- * block allocation.
- */
-#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
-#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
-#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
-#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
-#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
-
-static inline unsigned long dax_radix_sector(void *entry)
-{
-	return (unsigned long)entry >> RADIX_DAX_SHIFT;
-}
-
-static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
-{
-	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
-			((unsigned long)sector << RADIX_DAX_SHIFT) |
-			RADIX_DAX_ENTRY_LOCK);
-}
-
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops);
 int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
@@ -137,20 +110,6 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
 }
 #endif
 
-#ifdef CONFIG_FS_DAX_PMD
-static inline unsigned int dax_radix_order(void *entry)
-{
-	if ((unsigned long)entry & RADIX_DAX_PMD)
-		return PMD_SHIFT - PAGE_SHIFT;
-	return 0;
-}
-#else
-static inline unsigned int dax_radix_order(void *entry)
-{
-	return 0;
-}
-#endif
-
 static inline bool dax_mapping(struct address_space *mapping)
 {
 	return mapping->host && IS_DAX(mapping->host);

From a2e050f5a9a9bd2b632d67bd06d87088e6a02dae Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Wed, 6 Sep 2017 16:18:54 -0700
Subject: [PATCH 007/119] dax: explain how read(2)/write(2) addresses are
 validated

Add a comment explaining how the user addresses provided to read(2) and
write(2) are validated in the DAX I/O path.

We call dax_copy_from_iter() or copy_to_iter() on these without calling
access_ok() first in the DAX code, and there was a concern that the user
might be able to read/write to arbitrary kernel addresses with this
path.
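
Roughly, the read(2) side of the chain looks like this (the write(2)
side validates in vfs_write() before reaching dax_copy_from_iter()):

	vfs_read()
	  access_ok()                    /* user buffer validated here */
	  ...
	  dax_iomap_rw()
	    dax_iomap_actor()
	      copy_to_iter()             /* copies into the validated buffer */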

Link: http://lkml.kernel.org/r/20170816173615.10098-1-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/dax.c b/fs/dax.c
index c576f6181dc8..5b2eac3ef077 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1004,6 +1004,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		if (map_len > end - pos)
 			map_len = end - pos;
 
+		/*
+		 * The userspace address for the memory copy has already been
+		 * validated via access_ok() in either vfs_read() or
+		 * vfs_write(), depending on which operation we are doing.
+		 */
 		if (iov_iter_rw(iter) == WRITE)
 			map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
 					map_len, iter);

From 917f34526c4123ccd93e4373719c645c85020a5a Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Wed, 6 Sep 2017 16:18:58 -0700
Subject: [PATCH 008/119] dax: use PG_PMD_COLOUR instead of open coding

Use ~PG_PMD_COLOUR in dax_entry_waitqueue() instead of open coding an
equivalent page offset mask.
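
For example, with 4k pages and 2MiB PMD mappings the mask works out to:

	#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)	/* 511 */

	/* round the page offset down to the start of its PMD */
	index &= ~PG_PMD_COLOUR;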

Link: http://lkml.kernel.org/r/20170822222436.18926-2-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: "Slusarz, Marcin" <marcin.slusarz@intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 5b2eac3ef077..23674cda0b70 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -42,6 +42,9 @@
 #define DAX_WAIT_TABLE_BITS 12
 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
 
+/* The 'colour' (ie low bits) within a PMD of a page offset.  */
+#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
+
 static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
 
 static int __init init_dax_wait_table(void)
@@ -132,7 +135,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
 	 * the range covered by the PMD map to the same bit lock.
 	 */
 	if (dax_is_pmd_entry(entry))
-		index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
+		index &= ~PG_PMD_COLOUR;
 
 	key->mapping = mapping;
 	key->entry_start = index;
@@ -1215,12 +1218,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 }
 
 #ifdef CONFIG_FS_DAX_PMD
-/*
- * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
- * more often than one might expect in the below functions.
- */
-#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
-
 static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
 		loff_t pos, void *entry)
 {

From 2f52074d35135ecf3fb719f3430d72c17ae07287 Mon Sep 17 00:00:00 2001
From: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Date: Wed, 6 Sep 2017 16:19:01 -0700
Subject: [PATCH 009/119] dax: initialize variable pfn before using it

dax_pmd_insert_mapping() contains the following code:

        pfn_t pfn;
        if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
            goto fallback;
        /* ... */
    fallback:
      trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);

When the condition in the if statement fails, the function calls
trace_dax_pmd_insert_mapping_fallback() with an uninitialized pfn value.

This issue has been found while building the kernel with clang.  The
compiler reported:

    fs/dax.c:1280:6: error: variable 'pfn' is used uninitialized
    whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized]
        if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
            ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    fs/dax.c:1310:60: note: uninitialized use occurs here
      trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
                                                                     ^~~
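
A minimal standalone illustration of the fix (the actual change is in the
diff below): an empty initializer list zero-initializes the structure, so
the tracepoint argument is well-defined even on the early fallback path:

	pfn_t pfn = {};		/* all members start out as zero */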

Link: http://lkml.kernel.org/r/20170903083000.587-1-nicolas.iooss_linux@m4x.org
Signed-off-by: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/dax.c b/fs/dax.c
index 23674cda0b70..6afcacb3a87b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1230,7 +1230,7 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
 	void *ret = NULL, *kaddr;
 	long length = 0;
 	pgoff_t pgoff;
-	pfn_t pfn;
+	pfn_t pfn = {};
 	int id;
 
 	if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)

From 6124c04c1344497e0cbfb505ddbd3b83090a4d51 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Wed, 6 Sep 2017 16:19:05 -0700
Subject: [PATCH 010/119] modpost: simplify sec_name()

There is code duplication between sec_name() and sech_name().  Simplify
sec_name() by re-using sech_name().  Also, move them up to remove the
forward declaration of sec_name().

Link: http://lkml.kernel.org/r/1502248721-22009-1-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Jessica Yu <jeyu@redhat.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Heinrich Schuchardt <xypron.glpk@gmx.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/mod/modpost.c | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 48397feb08fb..b920d186ad4a 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -261,7 +261,17 @@ static enum export export_no(const char *s)
 	return export_unknown;
 }
 
-static const char *sec_name(struct elf_info *elf, int secindex);
+static const char *sech_name(struct elf_info *elf, Elf_Shdr *sechdr)
+{
+	return (void *)elf->hdr +
+		elf->sechdrs[elf->secindex_strings].sh_offset +
+		sechdr->sh_name;
+}
+
+static const char *sec_name(struct elf_info *elf, int secindex)
+{
+	return sech_name(elf, &elf->sechdrs[secindex]);
+}
 
 #define strstarts(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0)
 
@@ -775,21 +785,6 @@ static const char *sym_name(struct elf_info *elf, Elf_Sym *sym)
 		return "(unknown)";
 }
 
-static const char *sec_name(struct elf_info *elf, int secindex)
-{
-	Elf_Shdr *sechdrs = elf->sechdrs;
-	return (void *)elf->hdr +
-		elf->sechdrs[elf->secindex_strings].sh_offset +
-		sechdrs[secindex].sh_name;
-}
-
-static const char *sech_name(struct elf_info *elf, Elf_Shdr *sechdr)
-{
-	return (void *)elf->hdr +
-		elf->sechdrs[elf->secindex_strings].sh_offset +
-		sechdr->sh_name;
-}
-
 /* The pattern is an array of simple patterns.
  * "foo" will match an exact string equal to "foo"
  * "*foo" will match a string that ends with "foo"

From 01ffb56bc1cb27f0d9fd50be096446a8d1ab6354 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 6 Sep 2017 16:19:08 -0700
Subject: [PATCH 011/119] ocfs2: make ocfs2_set_acl() static

The function is never called outside of fs/ocfs2/acl.c.

Link: http://lkml.kernel.org/r/20170801141252.19675-2-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Mark Fasheh <mfasheh@versity.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <jiangqi903@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/acl.c | 2 +-
 fs/ocfs2/acl.h | 7 -------
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index e50a387959bf..40b5cc97f7b0 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -221,7 +221,7 @@ out:
 /*
  * Set the access or default ACL of an inode.
  */
-int ocfs2_set_acl(handle_t *handle,
+static int ocfs2_set_acl(handle_t *handle,
 			 struct inode *inode,
 			 struct buffer_head *di_bh,
 			 int type,
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 2783a75b3999..7be0bb756286 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -28,13 +28,6 @@ struct ocfs2_acl_entry {
 
 struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
 int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-int ocfs2_set_acl(handle_t *handle,
-			 struct inode *inode,
-			 struct buffer_head *di_bh,
-			 int type,
-			 struct posix_acl *acl,
-			 struct ocfs2_alloc_context *meta_ac,
-			 struct ocfs2_alloc_context *data_ac);
 extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *);
 extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
 			  struct buffer_head *, struct buffer_head *,

From 964f14a0d350486d17cfd24b3b7dc4f7c4bdc4d3 Mon Sep 17 00:00:00 2001
From: Jun Piao <piaojun@huawei.com>
Date: Wed, 6 Sep 2017 16:19:11 -0700
Subject: [PATCH 012/119] ocfs2: clean up some dead code

Clean up some unused functions and parameters.

Link: http://lkml.kernel.org/r/598A5E21.2080807@huawei.com
Signed-off-by: Jun Piao <piaojun@huawei.com>
Reviewed-by: Alex Chen <alex.chen@huawei.com>
Cc: Mark Fasheh <mfasheh@versity.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <jiangqi903@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/alloc.c             | 22 +++++++------------
 fs/ocfs2/alloc.h             |  3 +--
 fs/ocfs2/cluster/heartbeat.c | 42 ++++--------------------------------
 fs/ocfs2/dir.c               |  2 +-
 fs/ocfs2/file.c              |  7 ------
 fs/ocfs2/journal.c           |  1 -
 fs/ocfs2/move_extents.c      |  2 +-
 fs/ocfs2/ocfs2.h             |  4 +---
 fs/ocfs2/refcounttree.c      |  2 +-
 fs/ocfs2/suballoc.c          |  2 +-
 fs/ocfs2/super.c             |  1 -
 fs/ocfs2/xattr.c             |  2 +-
 12 files changed, 19 insertions(+), 71 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index fb15a96df0b6..a177eae3aa1a 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -955,8 +955,7 @@ int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
 /*
  * How many free extents have we got before we need more meta data?
  */
-int ocfs2_num_free_extents(struct ocfs2_super *osb,
-			   struct ocfs2_extent_tree *et)
+int ocfs2_num_free_extents(struct ocfs2_extent_tree *et)
 {
 	int retval;
 	struct ocfs2_extent_list *el = NULL;
@@ -1933,14 +1932,12 @@ out:
  * the new changes.
  *
  * left_rec: the record on the left.
- * left_child_el: is the child list pointed to by left_rec
  * right_rec: the record to the right of left_rec
  * right_child_el: is the child list pointed to by right_rec
  *
  * By definition, this only works on interior nodes.
  */
 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
-				  struct ocfs2_extent_list *left_child_el,
 				  struct ocfs2_extent_rec *right_rec,
 				  struct ocfs2_extent_list *right_child_el)
 {
@@ -2003,7 +2000,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
 	 */
 	BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
 
-	ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
+	ocfs2_adjust_adjacent_records(&root_el->l_recs[i],
 				      &root_el->l_recs[i + 1], right_el);
 }
 
@@ -2060,8 +2057,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
 		el = right_path->p_node[i].el;
 		right_rec = &el->l_recs[0];
 
-		ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
-					      right_el);
+		ocfs2_adjust_adjacent_records(left_rec, right_rec, right_el);
 
 		ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
 		ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
@@ -2509,7 +2505,7 @@ out_ret_path:
 
 static int ocfs2_update_edge_lengths(handle_t *handle,
 				     struct ocfs2_extent_tree *et,
-				     int subtree_index, struct ocfs2_path *path)
+				     struct ocfs2_path *path)
 {
 	int i, idx, ret;
 	struct ocfs2_extent_rec *rec;
@@ -2755,8 +2751,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
 	if (del_right_subtree) {
 		ocfs2_unlink_subtree(handle, et, left_path, right_path,
 				     subtree_index, dealloc);
-		ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
-						left_path);
+		ret = ocfs2_update_edge_lengths(handle, et, left_path);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3060,8 +3055,7 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
 
 		ocfs2_unlink_subtree(handle, et, left_path, path,
 				     subtree_index, dealloc);
-		ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
-						left_path);
+		ret = ocfs2_update_edge_lengths(handle, et, left_path);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4790,7 +4784,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 	if (mark_unwritten)
 		flags = OCFS2_EXT_UNWRITTEN;
 
-	free_extents = ocfs2_num_free_extents(osb, et);
+	free_extents = ocfs2_num_free_extents(et);
 	if (free_extents < 0) {
 		status = free_extents;
 		mlog_errno(status);
@@ -5668,7 +5662,7 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
 
 	*ac = NULL;
 
-	num_free_extents = ocfs2_num_free_extents(osb, et);
+	num_free_extents = ocfs2_num_free_extents(et);
 	if (num_free_extents < 0) {
 		ret = num_free_extents;
 		mlog_errno(ret);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 4a5152ec88a3..27b75cf32cfa 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -144,8 +144,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
 			     struct ocfs2_cached_dealloc_ctxt *dealloc,
 			     u64 refcount_loc, bool refcount_tree_locked);
 
-int ocfs2_num_free_extents(struct ocfs2_super *osb,
-			   struct ocfs2_extent_tree *et);
+int ocfs2_num_free_extents(struct ocfs2_extent_tree *et);
 
 /*
  * how many new metadata chunks would an allocation need at maximum?
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index ffe003982d95..56ac07cd35f6 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -505,8 +505,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
 	}
 }
 
-static void o2hb_wait_on_io(struct o2hb_region *reg,
-			    struct o2hb_bio_wait_ctxt *wc)
+static void o2hb_wait_on_io(struct o2hb_bio_wait_ctxt *wc)
 {
 	o2hb_bio_wait_dec(wc, 1);
 	wait_for_completion(&wc->wc_io_complete);
@@ -608,7 +607,7 @@ static int o2hb_read_slots(struct o2hb_region *reg,
 	status = 0;
 
 bail_and_wait:
-	o2hb_wait_on_io(reg, &wc);
+	o2hb_wait_on_io(&wc);
 	if (wc.wc_error && !status)
 		status = wc.wc_error;
 
@@ -1162,7 +1161,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 	 * before we can go to steady state.  This ensures that
 	 * people we find in our steady state have seen us.
 	 */
-	o2hb_wait_on_io(reg, &write_wc);
+	o2hb_wait_on_io(&write_wc);
 	if (write_wc.wc_error) {
 		/* Do not re-arm the write timeout on I/O error - we
 		 * can't be sure that the new block ever made it to
@@ -1275,7 +1274,7 @@ static int o2hb_thread(void *data)
 		o2hb_prepare_block(reg, 0);
 		ret = o2hb_issue_node_write(reg, &write_wc);
 		if (ret == 0)
-			o2hb_wait_on_io(reg, &write_wc);
+			o2hb_wait_on_io(&write_wc);
 		else
 			mlog_errno(ret);
 	}
@@ -2576,22 +2575,6 @@ void o2hb_unregister_callback(const char *region_uuid,
 }
 EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
 
-int o2hb_check_node_heartbeating(u8 node_num)
-{
-	unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
-
-	o2hb_fill_node_map(testing_map, sizeof(testing_map));
-	if (!test_bit(node_num, testing_map)) {
-		mlog(ML_HEARTBEAT,
-		     "node (%u) does not have heartbeating enabled.\n",
-		     node_num);
-		return 0;
-	}
-
-	return 1;
-}
-EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
-
 int o2hb_check_node_heartbeating_no_sem(u8 node_num)
 {
 	unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -2626,23 +2609,6 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num)
 }
 EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback);
 
-/* Makes sure our local node is configured with a node number, and is
- * heartbeating. */
-int o2hb_check_local_node_heartbeating(void)
-{
-	u8 node_num;
-
-	/* if this node was set then we have networking */
-	node_num = o2nm_this_node();
-	if (node_num == O2NM_MAX_NODES) {
-		mlog(ML_HEARTBEAT, "this node has not been configured.\n");
-		return 0;
-	}
-
-	return o2hb_check_node_heartbeating(node_num);
-}
-EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating);
-
 /*
  * this is just a hack until we get the plumbing which flips file systems
  * read only and drops the hb ref instead of killing the node dead.
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 3ecb9f337b7d..febe6312ceff 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3249,7 +3249,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
 		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir),
 					      parent_fe_bh);
-		num_free_extents = ocfs2_num_free_extents(osb, &et);
+		num_free_extents = ocfs2_num_free_extents(&et);
 		if (num_free_extents < 0) {
 			status = num_free_extents;
 			mlog_errno(status);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index bfeb647459d9..b4512fad99ed 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -713,13 +713,6 @@ leave:
 	return status;
 }
 
-int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
-		u32 clusters_to_add, int mark_unwritten)
-{
-	return __ocfs2_extend_allocation(inode, logical_start,
-			clusters_to_add, mark_unwritten);
-}
-
 /*
  * While a write will already be ordering the data, a truncate will not.
  * Thus, we need to explicitly order the zeroed pages.
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index d5e5fa7f0743..36304434eacf 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1348,7 +1348,6 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
 	ocfs2_schedule_truncate_log_flush(osb, 0);
 
 	osb->local_alloc_copy = NULL;
-	osb->dirty = 0;
 
 	/* queue to recover orphan slots for all offline slots */
 	ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index e52a2852d50d..7eb3b0a6347e 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -175,7 +175,7 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode,
 	unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	num_free_extents = ocfs2_num_free_extents(osb, et);
+	num_free_extents = ocfs2_num_free_extents(et);
 	if (num_free_extents < 0) {
 		ret = num_free_extents;
 		mlog_errno(ret);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 0c39d71c67a1..9a50f222ac97 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -320,7 +320,6 @@ struct ocfs2_super
 	u64 system_dir_blkno;
 	u64 bitmap_blkno;
 	u32 bitmap_cpg;
-	u8 *uuid;
 	char *uuid_str;
 	u32 uuid_hash;
 	u8 *vol_label;
@@ -388,9 +387,8 @@ struct ocfs2_super
 	unsigned int	osb_resv_level;
 	unsigned int	osb_dir_resv_level;
 
-	/* Next three fields are for local node slot recovery during
+	/* Next two fields are for local node slot recovery during
 	 * mount. */
-	int dirty;
 	struct ocfs2_dinode *local_alloc_copy;
 	struct ocfs2_quota_recovery *quota_rec;
 
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index f8933cb53d68..ab156e35ec00 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2851,7 +2851,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb,
 					int *credits)
 {
 	int ret = 0, meta_add = 0;
-	int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
+	int num_free_extents = ocfs2_num_free_extents(et);
 
 	if (num_free_extents < 0) {
 		ret = num_free_extents;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 6ad3533940ba..71f22c8fbffd 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -2700,7 +2700,7 @@ int ocfs2_lock_allocators(struct inode *inode,
 
 	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
 
-	num_free_extents = ocfs2_num_free_extents(osb, et);
+	num_free_extents = ocfs2_num_free_extents(et);
 	if (num_free_extents < 0) {
 		ret = num_free_extents;
 		mlog_errno(ret);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 83005f486451..3f936be379a9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2486,7 +2486,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
 	if (dirty) {
 		/* Recovery will be completed after we've mounted the
 		 * rest of the volume. */
-		osb->dirty = 1;
 		osb->local_alloc_copy = local_alloc;
 		local_alloc = NULL;
 	}
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index f70c3778d600..5fdf269ba82e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -6800,7 +6800,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators(
 		*credits += 1;
 
 	/* count in the xattr tree change. */
-	num_free_extents = ocfs2_num_free_extents(osb, xt_et);
+	num_free_extents = ocfs2_num_free_extents(xt_et);
 	if (num_free_extents < 0) {
 		ret = num_free_extents;
 		mlog_errno(ret);

From ea37df54d2b7950d607800ee417a1d59b95068c2 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Wed, 6 Sep 2017 16:19:15 -0700
Subject: [PATCH 013/119] slub: tidy up initialization ordering

 - free_kmem_cache_nodes() frees the cache node before nulling out a
   reference to it

 - init_kmem_cache_nodes() publishes the cache node before initializing
   it

Neither of these matters at runtime because the cache nodes cannot be
looked up by any other thread.  But it's neater and more consistent to
reorder these.
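
A short sketch of the resulting pattern (it mirrors the hunks below rather
than adding anything new): unpublish before freeing, and initialize before
publishing:

	/* teardown: hide the node from any would-be observer, then free */
	s->node[node] = NULL;
	kmem_cache_free(kmem_cache_node, n);

	/* setup: only publish a fully initialized node */
	init_kmem_cache_node(n);
	s->node[node] = n;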

Link: http://lkml.kernel.org/r/20170707083408.40410-1-glider@google.com
Signed-off-by: Alexander Potapenko <glider@google.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slub.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index e8b4e31162ca..3e90d791dd41 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3358,8 +3358,8 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
 	struct kmem_cache_node *n;
 
 	for_each_kmem_cache_node(s, node, n) {
-		kmem_cache_free(kmem_cache_node, n);
 		s->node[node] = NULL;
+		kmem_cache_free(kmem_cache_node, n);
 	}
 }
 
@@ -3389,8 +3389,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
 			return 0;
 		}
 
-		s->node[node] = n;
 		init_kmem_cache_node(n);
+		s->node[node] = n;
 	}
 	return 1;
 }

From 2482ddec670fb83717d129012bc558777cb159f7 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 6 Sep 2017 16:19:18 -0700
Subject: [PATCH 014/119] mm: add SLUB free list pointer obfuscation

This SLUB free list pointer obfuscation code is modified from Brad
Spengler/PaX Team's code in the last public patch of grsecurity/PaX
based on my understanding of the code.  Changes or omissions from the
original code are mine and don't reflect the original grsecurity/PaX
code.

This adds a per-cache random value to SLUB caches that is XORed with
their freelist pointer address and value.  This adds nearly zero
overhead and frustrates the very common heap overflow exploitation
method of overwriting freelist pointers.
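
A simplified sketch of the scheme (mirroring the freelist_ptr() helper
added below): the stored free pointer is the XOR of the real pointer, the
per-cache secret and the address the pointer is stored at, and the same
XOR recovers it:

	/* obfuscate on store */
	stored = (void *)((unsigned long)ptr ^ s->random ^ ptr_addr);

	/* de-obfuscate on load: XOR is its own inverse */
	ptr = (void *)((unsigned long)stored ^ s->random ^ ptr_addr);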

A recent example of the attack is written up here:

  http://cyseclabs.com/blog/cve-2016-6187-heap-off-by-one-exploit

and there is a section dedicated to the technique in the book "A Guide to
Kernel Exploitation: Attacking the Core".

This is based on patches by Daniel Micay, and refactored to minimize the
use of #ifdef.

With 200-count cycles of "hackbench -g 20 -l 1000" I saw the following
run times:

 before:
	mean 10.11882499999999999995
	variance .03320378329145728642
	stdev .18221905304181911048

 after:
	mean 10.12654000000000000014
	variance .04700556623115577889
	stdev .21680767106160192064

The difference gets lost in the noise, but if the above is to be taken
literally, using CONFIG_SLAB_FREELIST_HARDENED is 0.07% slower.

Link: http://lkml.kernel.org/r/20170802180609.GA66807@beast
Signed-off-by: Kees Cook <keescook@chromium.org>
Suggested-by: Daniel Micay <danielmicay@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Tycho Andersen <tycho@docker.com>
Cc: Alexander Popov <alex.popov@linux.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slub_def.h |  4 ++++
 init/Kconfig             |  9 +++++++++
 mm/slub.c                | 42 +++++++++++++++++++++++++++++++++++-----
 3 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index cc0faf3a90be..0783b622311e 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -115,6 +115,10 @@ struct kmem_cache {
 #endif
 #endif
 
+#ifdef CONFIG_SLAB_FREELIST_HARDENED
+	unsigned long random;
+#endif
+
 #ifdef CONFIG_NUMA
 	/*
 	 * Defragmentation by allocating from a remote node.
diff --git a/init/Kconfig b/init/Kconfig
index 5f0ef850e808..78cb2461012e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1576,6 +1576,15 @@ config SLAB_FREELIST_RANDOM
 	  security feature reduces the predictability of the kernel slab
 	  allocator against heap overflows.
 
+config SLAB_FREELIST_HARDENED
+	bool "Harden slab freelist metadata"
+	depends on SLUB
+	help
+	  Many kernel heap attacks try to target slab cache metadata and
+	  other infrastructure. This option makes minor performance
+	  sacrifices to harden the kernel slab allocator against common
+	  freelist exploit methods.
+
 config SLUB_CPU_PARTIAL
 	default y
 	depends on SLUB && SMP
diff --git a/mm/slub.c b/mm/slub.c
index 3e90d791dd41..6c87c2c6af24 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -34,6 +34,7 @@
 #include <linux/stacktrace.h>
 #include <linux/prefetch.h>
 #include <linux/memcontrol.h>
+#include <linux/random.h>
 
 #include <trace/events/kmem.h>
 
@@ -238,30 +239,58 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
  * 			Core slab cache functions
  *******************************************************************/
 
+/*
+ * Returns freelist pointer (ptr). With hardening, this is obfuscated
+ * with an XOR of the address where the pointer is held and a per-cache
+ * random number.
+ */
+static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
+				 unsigned long ptr_addr)
+{
+#ifdef CONFIG_SLAB_FREELIST_HARDENED
+	return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr);
+#else
+	return ptr;
+#endif
+}
+
+/* Returns the freelist pointer recorded at location ptr_addr. */
+static inline void *freelist_dereference(const struct kmem_cache *s,
+					 void *ptr_addr)
+{
+	return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr),
+			    (unsigned long)ptr_addr);
+}
+
 static inline void *get_freepointer(struct kmem_cache *s, void *object)
 {
-	return *(void **)(object + s->offset);
+	return freelist_dereference(s, object + s->offset);
 }
 
 static void prefetch_freepointer(const struct kmem_cache *s, void *object)
 {
-	prefetch(object + s->offset);
+	if (object)
+		prefetch(freelist_dereference(s, object + s->offset));
 }
 
 static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
 {
+	unsigned long freepointer_addr;
 	void *p;
 
 	if (!debug_pagealloc_enabled())
 		return get_freepointer(s, object);
 
-	probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
-	return p;
+	freepointer_addr = (unsigned long)object + s->offset;
+	probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p));
+	return freelist_ptr(s, p, freepointer_addr);
 }
 
 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
 {
-	*(void **)(object + s->offset) = fp;
+	unsigned long freeptr_addr = (unsigned long)object + s->offset;
+
+	*(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
 }
 
 /* Loop over all objects in a slab */
@@ -3563,6 +3592,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
 {
 	s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
 	s->reserved = 0;
+#ifdef CONFIG_SLAB_FREELIST_HARDENED
+	s->random = get_random_long();
+#endif
 
 	if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU))
 		s->reserved = sizeof(struct rcu_head);

From ce6fa91b93630396ca220c33dd38ffc62686d499 Mon Sep 17 00:00:00 2001
From: Alexander Popov <alex.popov@linux.com>
Date: Wed, 6 Sep 2017 16:19:22 -0700
Subject: [PATCH 015/119] mm/slub.c: add a naive detection of double free or
 corruption

Add an assertion similar to the "fasttop" check in the GNU C Library
allocator as part of the SLAB_FREELIST_HARDENED feature.  An object added
to a singly linked freelist should not point to itself.  That helps to
detect some double free errors (e.g. CVE-2017-2636) without slub_debug
and KASAN.
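
A hedged illustration of the simplest case this catches: with an immediate
double free (and no allocation from the same cache in between on that
CPU), the object being freed is already the freelist head, so the free
path would have to make it point to itself:

	kfree(p);
	kfree(p);	/* set_freepointer() sees object == fp and the
			 * new BUG_ON() below fires */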

Link: http://lkml.kernel.org/r/1502468246-1262-1-git-send-email-alex.popov@linux.com
Signed-off-by: Alexander Popov <alex.popov@linux.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Paul E McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Tycho Andersen <tycho@docker.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slub.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mm/slub.c b/mm/slub.c
index 6c87c2c6af24..16a60f871f39 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -290,6 +290,10 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
 {
 	unsigned long freeptr_addr = (unsigned long)object + s->offset;
 
+#ifdef CONFIG_SLAB_FREELIST_HARDENED
+	BUG_ON(object == fp); /* naive detection of double free or corruption */
+#endif
+
 	*(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
 }
 

From d460acb5bdffc19b492b70b8f416c24dc03c474e Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Wed, 6 Sep 2017 16:19:26 -0700
Subject: [PATCH 016/119] mm: track actual nr_scanned during shrink_slab()

Some shrinkers may only be able to free a bunch of objects at a time,
and so free more than the requested nr_to_scan in one pass.

Other shrinkers, meanwhile, may find themselves unable to scan as many
objects as they counted, and so underreport.  Account for the extra
freed/scanned objects against the total number of objects we intend to
scan, otherwise we may end up penalising the slab far more than
intended.  Similarly, we want to add the underperforming scan to the
deferred pass so that we try harder and harder in future passes.
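
A hypothetical shrinker sketch (my_shrink_scan() and my_free_one_batch()
are invented names, not part of this patch) showing how a scan callback
that works in coarse batches would report its actual progress via the new
field:

	static unsigned long my_shrink_scan(struct shrinker *shrinker,
					    struct shrink_control *sc)
	{
		unsigned long freed = 0, scanned = 0;

		while (scanned < sc->nr_to_scan) {
			/* a batch may scan/free more than requested */
			scanned += my_free_one_batch(&freed);
		}
		sc->nr_scanned = scanned;	/* report actual work done */
		return freed;
	}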

Link: http://lkml.kernel.org/r/20170822135325.9191-1-chris@chris-wilson.co.uk
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Shaohua Li <shli@fb.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/shrinker.h | 7 +++++++
 mm/vmscan.c              | 7 ++++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 4fcacd915d45..51d189615bda 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -18,6 +18,13 @@ struct shrink_control {
 	 */
 	unsigned long nr_to_scan;
 
+	/*
+	 * How many objects did scan_objects process?
+	 * This defaults to nr_to_scan before every call, but the callee
+	 * should track its actual progress.
+	 */
+	unsigned long nr_scanned;
+
 	/* current node being shrunk (for NUMA aware shrinkers) */
 	int nid;
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f957afe900ec..095817820a56 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -393,14 +393,15 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 		unsigned long nr_to_scan = min(batch_size, total_scan);
 
 		shrinkctl->nr_to_scan = nr_to_scan;
+		shrinkctl->nr_scanned = nr_to_scan;
 		ret = shrinker->scan_objects(shrinker, shrinkctl);
 		if (ret == SHRINK_STOP)
 			break;
 		freed += ret;
 
-		count_vm_events(SLABS_SCANNED, nr_to_scan);
-		total_scan -= nr_to_scan;
-		scanned += nr_to_scan;
+		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
+		total_scan -= shrinkctl->nr_scanned;
+		scanned += shrinkctl->nr_scanned;
 
 		cond_resched();
 	}

From 912d572d63b8cd19c303f357b627de492bd6bdef Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Wed, 6 Sep 2017 16:19:30 -0700
Subject: [PATCH 017/119] drm/i915: wire up shrinkctl->nr_scanned

shrink_slab() allows us to report back the number of objects we
successfully scanned (out of the target shrinkctl->nr_to_scan).  As we
report the number of pages owned by each GEM object as a separate item
to the shrinker, we cannot precisely control the number of shrinker
objects we scan on each pass; and indeed may free more than requested.
If we fail to tell the shrinker about the number of objects we process,
it will continue to hold a grudge against us as any objects left
unscanned are added to the next reclaim -- and so we will keep on
"unfairly" shrinking our own slab in comparison to other slabs.

Link: http://lkml.kernel.org/r/20170822135325.9191-2-chris@chris-wilson.co.uk
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Shaohua Li <shli@fb.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/gpu/drm/i915/i915_debugfs.c      |  4 ++--
 drivers/gpu/drm/i915/i915_drv.h          |  1 +
 drivers/gpu/drm/i915/i915_gem.c          |  4 ++--
 drivers/gpu/drm/i915/i915_gem_gtt.c      |  2 +-
 drivers/gpu/drm/i915/i915_gem_shrinker.c | 24 ++++++++++++++++++------
 5 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index a36216bd2a84..e4d4b6b41e26 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4308,10 +4308,10 @@ i915_drop_caches_set(void *data, u64 val)
 
 	fs_reclaim_acquire(GFP_KERNEL);
 	if (val & DROP_BOUND)
-		i915_gem_shrink(dev_priv, LONG_MAX, I915_SHRINK_BOUND);
+		i915_gem_shrink(dev_priv, LONG_MAX, NULL, I915_SHRINK_BOUND);
 
 	if (val & DROP_UNBOUND)
-		i915_gem_shrink(dev_priv, LONG_MAX, I915_SHRINK_UNBOUND);
+		i915_gem_shrink(dev_priv, LONG_MAX, NULL, I915_SHRINK_UNBOUND);
 
 	if (val & DROP_SHRINK_ALL)
 		i915_gem_shrink_all(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 60267e375e88..bd74641ab7f6 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3742,6 +3742,7 @@ i915_gem_object_create_internal(struct drm_i915_private *dev_priv,
 /* i915_gem_shrinker.c */
 unsigned long i915_gem_shrink(struct drm_i915_private *dev_priv,
 			      unsigned long target,
+			      unsigned long *nr_scanned,
 			      unsigned flags);
 #define I915_SHRINK_PURGEABLE 0x1
 #define I915_SHRINK_UNBOUND 0x2
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b9e8e0d6e97b..287c6ead95b3 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2354,7 +2354,7 @@ rebuild_st:
 				goto err_sg;
 			}
 
-			i915_gem_shrink(dev_priv, 2 * page_count, *s++);
+			i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
 			cond_resched();
 
 			/* We've tried hard to allocate the memory by reaping
@@ -5015,7 +5015,7 @@ int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
 	 * the objects as well, see i915_gem_freeze()
 	 */
 
-	i915_gem_shrink(dev_priv, -1UL, I915_SHRINK_UNBOUND);
+	i915_gem_shrink(dev_priv, -1UL, NULL, I915_SHRINK_UNBOUND);
 	i915_gem_drain_freed_objects(dev_priv);
 
 	mutex_lock(&dev_priv->drm.struct_mutex);
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index d60f38adc4c4..6c6b8e8592aa 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -2062,7 +2062,7 @@ int i915_gem_gtt_prepare_pages(struct drm_i915_gem_object *obj,
 		 */
 		GEM_BUG_ON(obj->mm.pages == pages);
 	} while (i915_gem_shrink(to_i915(obj->base.dev),
-				 obj->base.size >> PAGE_SHIFT,
+				 obj->base.size >> PAGE_SHIFT, NULL,
 				 I915_SHRINK_BOUND |
 				 I915_SHRINK_UNBOUND |
 				 I915_SHRINK_ACTIVE));
diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
index 77fb39808131..74002b2d1b6f 100644
--- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
@@ -136,6 +136,7 @@ static bool unsafe_drop_pages(struct drm_i915_gem_object *obj)
  * i915_gem_shrink - Shrink buffer object caches
  * @dev_priv: i915 device
  * @target: amount of memory to make available, in pages
+ * @nr_scanned: optional output for number of pages scanned (incremental)
  * @flags: control flags for selecting cache types
  *
  * This function is the main interface to the shrinker. It will try to release
@@ -158,7 +159,9 @@ static bool unsafe_drop_pages(struct drm_i915_gem_object *obj)
  */
 unsigned long
 i915_gem_shrink(struct drm_i915_private *dev_priv,
-		unsigned long target, unsigned flags)
+		unsigned long target,
+		unsigned long *nr_scanned,
+		unsigned flags)
 {
 	const struct {
 		struct list_head *list;
@@ -169,6 +172,7 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
 		{ NULL, 0 },
 	}, *phase;
 	unsigned long count = 0;
+	unsigned long scanned = 0;
 	bool unlock;
 
 	if (!shrinker_lock(dev_priv, &unlock))
@@ -249,6 +253,7 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
 					count += obj->base.size >> PAGE_SHIFT;
 				}
 				mutex_unlock(&obj->mm.lock);
+				scanned += obj->base.size >> PAGE_SHIFT;
 			}
 		}
 		list_splice_tail(&still_in_list, phase->list);
@@ -261,6 +266,8 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
 
 	shrinker_unlock(dev_priv, unlock);
 
+	if (nr_scanned)
+		*nr_scanned += scanned;
 	return count;
 }
 
@@ -283,7 +290,7 @@ unsigned long i915_gem_shrink_all(struct drm_i915_private *dev_priv)
 	unsigned long freed;
 
 	intel_runtime_pm_get(dev_priv);
-	freed = i915_gem_shrink(dev_priv, -1UL,
+	freed = i915_gem_shrink(dev_priv, -1UL, NULL,
 				I915_SHRINK_BOUND |
 				I915_SHRINK_UNBOUND |
 				I915_SHRINK_ACTIVE);
@@ -329,23 +336,28 @@ i915_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc)
 	unsigned long freed;
 	bool unlock;
 
+	sc->nr_scanned = 0;
+
 	if (!shrinker_lock(dev_priv, &unlock))
 		return SHRINK_STOP;
 
 	freed = i915_gem_shrink(dev_priv,
 				sc->nr_to_scan,
+				&sc->nr_scanned,
 				I915_SHRINK_BOUND |
 				I915_SHRINK_UNBOUND |
 				I915_SHRINK_PURGEABLE);
 	if (freed < sc->nr_to_scan)
 		freed += i915_gem_shrink(dev_priv,
-					 sc->nr_to_scan - freed,
+					 sc->nr_to_scan - sc->nr_scanned,
+					 &sc->nr_scanned,
 					 I915_SHRINK_BOUND |
 					 I915_SHRINK_UNBOUND);
 	if (freed < sc->nr_to_scan && current_is_kswapd()) {
 		intel_runtime_pm_get(dev_priv);
 		freed += i915_gem_shrink(dev_priv,
-					 sc->nr_to_scan - freed,
+					 sc->nr_to_scan - sc->nr_scanned,
+					 &sc->nr_scanned,
 					 I915_SHRINK_ACTIVE |
 					 I915_SHRINK_BOUND |
 					 I915_SHRINK_UNBOUND);
@@ -354,7 +366,7 @@ i915_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc)
 
 	shrinker_unlock(dev_priv, unlock);
 
-	return freed;
+	return sc->nr_scanned ? freed : SHRINK_STOP;
 }
 
 static bool
@@ -453,7 +465,7 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr
 		goto out;
 
 	intel_runtime_pm_get(dev_priv);
-	freed_pages += i915_gem_shrink(dev_priv, -1UL,
+	freed_pages += i915_gem_shrink(dev_priv, -1UL, NULL,
 				       I915_SHRINK_BOUND |
 				       I915_SHRINK_UNBOUND |
 				       I915_SHRINK_ACTIVE |

From c11525830f92dea58353ace0f074ece5d9ef37c8 Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Wed, 6 Sep 2017 16:19:33 -0700
Subject: [PATCH 018/119] mm/memory_hotplug: just build zonelist for newly
 added node

Commit 9adb62a5df9c ("mm/hotplug: correctly setup fallback zonelists
when creating new pgdat") tries to build the correct zonelist for a
newly added node, while it is not necessary to rebuild it for already
existing nodes.

In build_zonelists(), it will iterate on nodes with memory.  For a newly
added node, it will not have memory until node_states_set_node() is
called in online_pages().

This patch avoids rebuilding the zonelists for already existing nodes.

build_zonelists_node() uses managed_zone(zone) checks, so it should not
include empty zones anyway.  So effectively we avoid some pointless work
under stop_machine().
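
A comment-level sketch of the resulting flow in __build_all_zonelists()
(paraphrasing the hunk below, not a separate change):

	if (self && !node_online(self->node_id)) {
		/* hot-added node, no memory present yet: building its
		 * own zonelist is enough */
		build_zonelists(self);
	} else {
		/* otherwise rebuild for every online node, as before */
		for_each_online_node(nid)
			build_zonelists(NODE_DATA(nid));
	}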

[akpm@linux-foundation.org: tweak comment text]
[akpm@linux-foundation.org: coding-style tweak, per Vlastimil]
Link: http://lkml.kernel.org/r/20170626035822.50155-1-richard.weiyang@gmail.com
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jiang Liu <liuj97@gmail.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9327a940e373..dcc8a1cf55b6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5282,14 +5282,18 @@ static int __build_all_zonelists(void *data)
 	memset(node_load, 0, sizeof(node_load));
 #endif
 
+	/*
+	 * This node is hotadded and no memory is yet present.   So just
+	 * building zonelists is fine - no need to touch other nodes.
+	 */
 	if (self && !node_online(self->node_id)) {
 		build_zonelists(self);
-	}
+	} else {
+		for_each_online_node(nid) {
+			pg_data_t *pgdat = NODE_DATA(nid);
 
-	for_each_online_node(nid) {
-		pg_data_t *pgdat = NODE_DATA(nid);
-
-		build_zonelists(pgdat);
+			build_zonelists(pgdat);
+		}
 	}
 
 	/*

From e5e68930263377c6d4f6da0ff06f36b55d83a83f Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 6 Sep 2017 16:19:37 -0700
Subject: [PATCH 019/119] mm, memory_hotplug: display allowed zones in the
 preferred ordering

Prior to commit f1dd2cd13c4b ("mm, memory_hotplug: do not associate
hotadded memory to zones until online") we used to allow to change the
valid zone types of a memory block if it is adjacent to a different zone
type.

This fact was reflected in memoryNN/valid_zones by the ordering of
printed zones.  The first one was default (echo online > memoryNN/state)
and the other one could be onlined explicitly by online_{movable,kernel}.

This behavior was removed by the said patch and as such the ordering was
not all that important.  In most cases a kernel zone would be default
anyway.  The only exception is movable_node handled by "mm,
memory_hotplug: support movable_node for hotpluggable nodes".

Let's reintroduce this behavior again because a later patch will remove
the zone overlap restriction and so the user will be allowed to online a
kernel resp. movable block regardless of its placement.  The original
behavior will then become significant again because it would be
non-trivial for users to see what the default zone to online into is.

The implementation is really simple.  Pull zone selection out of
move_pfn_range into a zone_for_pfn_range helper and use it in
show_valid_zones to display the zone for default onlining and then both
kernel and movable if they are allowed.  The default online zone is not
duplicated.

Link: http://lkml.kernel.org/r/20170714121233.16861-2-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Reza Arbab <arbab@linux.vnet.ibm.com>
Cc: Yasuaki Ishimatsu <yasu.isimatu@gmail.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Kani Toshimitsu <toshi.kani@hpe.com>
Cc: <slaoub@gmail.com>
Cc: Daniel Kiper <daniel.kiper@oracle.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/memory.c          | 33 ++++++++++-----
 include/linux/memory_hotplug.h |  2 +-
 mm/memory_hotplug.c            | 73 +++++++++++++++++++---------------
 3 files changed, 65 insertions(+), 43 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index c7c4e0325cdb..26383af9900c 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -388,6 +388,22 @@ static ssize_t show_phys_device(struct device *dev,
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
+static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn,
+		unsigned long nr_pages, int online_type,
+		struct zone *default_zone)
+{
+	struct zone *zone;
+
+	if (!allow_online_pfn_range(nid, start_pfn, nr_pages, online_type))
+		return;
+
+	zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
+	if (zone != default_zone) {
+		strcat(buf, " ");
+		strcat(buf, zone->name);
+	}
+}
+
 static ssize_t show_valid_zones(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
@@ -395,7 +411,7 @@ static ssize_t show_valid_zones(struct device *dev,
 	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 	unsigned long valid_start_pfn, valid_end_pfn;
-	bool append = false;
+	struct zone *default_zone;
 	int nid;
 
 	/*
@@ -418,16 +434,13 @@ static ssize_t show_valid_zones(struct device *dev,
 	}
 
 	nid = pfn_to_nid(start_pfn);
-	if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) {
-		strcat(buf, default_zone_for_pfn(nid, start_pfn, nr_pages)->name);
-		append = true;
-	}
+	default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages);
+	strcat(buf, default_zone->name);
 
-	if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) {
-		if (append)
-			strcat(buf, " ");
-		strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name);
-	}
+	print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL,
+			default_zone);
+	print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE,
+			default_zone);
 out:
 	strcat(buf, "\n");
 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index c8a5056a5ae0..5e6e4cc36ff4 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -319,6 +319,6 @@ extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
 					  unsigned long pnum);
 extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages,
 		int online_type);
-extern struct zone *default_zone_for_pfn(int nid, unsigned long pfn,
+extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
 		unsigned long nr_pages);
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8dccc317aac2..e342624622a1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -773,31 +773,6 @@ static void node_states_set_node(int node, struct memory_notify *arg)
 	node_set_state(node, N_MEMORY);
 }
 
-bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)
-{
-	struct pglist_data *pgdat = NODE_DATA(nid);
-	struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
-	struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages);
-
-	/*
-	 * TODO there shouldn't be any inherent reason to have ZONE_NORMAL
-	 * physically before ZONE_MOVABLE. All we need is they do not
-	 * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE
-	 * though so let's stick with it for simplicity for now.
-	 * TODO make sure we do not overlap with ZONE_DEVICE
-	 */
-	if (online_type == MMOP_ONLINE_KERNEL) {
-		if (zone_is_empty(movable_zone))
-			return true;
-		return movable_zone->zone_start_pfn >= pfn + nr_pages;
-	} else if (online_type == MMOP_ONLINE_MOVABLE) {
-		return zone_end_pfn(default_zone) <= pfn;
-	}
-
-	/* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */
-	return online_type == MMOP_ONLINE_KEEP;
-}
-
 static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
 		unsigned long nr_pages)
 {
@@ -856,7 +831,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone,
  * If no kernel zone covers this pfn range it will automatically go
  * to the ZONE_NORMAL.
  */
-struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
+static struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
 		unsigned long nr_pages)
 {
 	struct pglist_data *pgdat = NODE_DATA(nid);
@@ -872,6 +847,31 @@ struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
 	return &pgdat->node_zones[ZONE_NORMAL];
 }
 
+bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)
+{
+	struct pglist_data *pgdat = NODE_DATA(nid);
+	struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
+	struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages);
+
+	/*
+	 * TODO there shouldn't be any inherent reason to have ZONE_NORMAL
+	 * physically before ZONE_MOVABLE. All we need is they do not
+	 * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE
+	 * though so let's stick with it for simplicity for now.
+	 * TODO make sure we do not overlap with ZONE_DEVICE
+	 */
+	if (online_type == MMOP_ONLINE_KERNEL) {
+		if (zone_is_empty(movable_zone))
+			return true;
+		return movable_zone->zone_start_pfn >= pfn + nr_pages;
+	} else if (online_type == MMOP_ONLINE_MOVABLE) {
+		return zone_end_pfn(default_zone) <= pfn;
+	}
+
+	/* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */
+	return online_type == MMOP_ONLINE_KEEP;
+}
+
 static inline bool movable_pfn_range(int nid, struct zone *default_zone,
 		unsigned long start_pfn, unsigned long nr_pages)
 {
@@ -885,12 +885,8 @@ static inline bool movable_pfn_range(int nid, struct zone *default_zone,
 	return !zone_intersects(default_zone, start_pfn, nr_pages);
 }
 
-/*
- * Associates the given pfn range with the given node and the zone appropriate
- * for the given online type.
- */
-static struct zone * __meminit move_pfn_range(int online_type, int nid,
-		unsigned long start_pfn, unsigned long nr_pages)
+struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
+		unsigned long nr_pages)
 {
 	struct pglist_data *pgdat = NODE_DATA(nid);
 	struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages);
@@ -909,6 +905,19 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid,
 		zone = &pgdat->node_zones[ZONE_MOVABLE];
 	}
 
+	return zone;
+}
+
+/*
+ * Associates the given pfn range with the given node and the zone appropriate
+ * for the given online type.
+ */
+static struct zone * __meminit move_pfn_range(int online_type, int nid,
+		unsigned long start_pfn, unsigned long nr_pages)
+{
+	struct zone *zone;
+
+	zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
 	move_pfn_range_to_zone(zone, start_pfn, nr_pages);
 	return zone;
 }

From c6f03e2903c9ecd8fd709a5b3fa8cf0a8ae0b3da Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 6 Sep 2017 16:19:40 -0700
Subject: [PATCH 020/119] mm, memory_hotplug: remove zone restrictions

Historically we have enforced that any kernel zone (e.g ZONE_NORMAL) has
to precede the Movable zone in the physical memory range.  The purpose
of the movable zone is, however, not bound to any physical memory
restriction.  It merely defines a class of migrateable and reclaimable
memory.

There are users (e.g.  CMA) who might want to reserve specific physical
memory ranges for their own purpose.  Moreover our pfn walkers have to
be prepared for zones overlapping in the physical range already because
we do support interleaving NUMA nodes and therefore zones can interleave
as well.  This means we can allow each memory block to be associated
with a different zone.

Loosen the current onlining semantic and allow an explicit onlining type
on any memblock.  That means that online_{kernel,movable} will be allowed
regardless of the physical address of the memblock, as long as it is
offline of course.  This might result in the movable zone overlapping with
other kernel zones.  Default onlining then becomes a bit tricky but
still sensible.  echo online > memoryXY/state will online the given
block to

	1) the default zone if the given range is outside of any zone
	2) the enclosing zone if such a zone doesn't interleave with
	   any other zone
	3) the default zone if more zones interleave for this range

where default zone is movable zone only if movable_node is enabled
otherwise it is a kernel zone.

Here is an example of the semantic (movable_node is not present, but it
works in an analogous way).  We start with the following memblocks, all of
them offline:
them offline:

  memory34/valid_zones:Normal Movable
  memory35/valid_zones:Normal Movable
  memory36/valid_zones:Normal Movable
  memory37/valid_zones:Normal Movable
  memory38/valid_zones:Normal Movable
  memory39/valid_zones:Normal Movable
  memory40/valid_zones:Normal Movable
  memory41/valid_zones:Normal Movable

Now, we online block 34 in default mode and block 37 as movable

  root@test1:/sys/devices/system/node/node1# echo online > memory34/state
  root@test1:/sys/devices/system/node/node1# echo online_movable > memory37/state
  memory34/valid_zones:Normal
  memory35/valid_zones:Normal Movable
  memory36/valid_zones:Normal Movable
  memory37/valid_zones:Movable
  memory38/valid_zones:Normal Movable
  memory39/valid_zones:Normal Movable
  memory40/valid_zones:Normal Movable
  memory41/valid_zones:Normal Movable

As we can see all other blocks can still be onlined both into Normal and
Movable zones and the Normal is default because the Movable zone spans
only block37 now.

  root@test1:/sys/devices/system/node/node1# echo online_movable > memory41/state
  memory34/valid_zones:Normal
  memory35/valid_zones:Normal Movable
  memory36/valid_zones:Normal Movable
  memory37/valid_zones:Movable
  memory38/valid_zones:Movable Normal
  memory39/valid_zones:Movable Normal
  memory40/valid_zones:Movable Normal
  memory41/valid_zones:Movable

Now the default zone for blocks 37-41 has changed because movable zone
spans that range.

  root@test1:/sys/devices/system/node/node1# echo online_kernel > memory39/state
  memory34/valid_zones:Normal
  memory35/valid_zones:Normal Movable
  memory36/valid_zones:Normal Movable
  memory37/valid_zones:Movable
  memory38/valid_zones:Normal Movable
  memory39/valid_zones:Normal
  memory40/valid_zones:Movable Normal
  memory41/valid_zones:Movable

Note that block 39 now belongs to the zone Normal and so block 38
falls into Normal by default as well.

For completeness

  root@test1:/sys/devices/system/node/node1# for i in memory[34]?
  do
	echo online > $i/state 2>/dev/null
  done

  memory34/valid_zones:Normal
  memory35/valid_zones:Normal
  memory36/valid_zones:Normal
  memory37/valid_zones:Movable
  memory38/valid_zones:Normal
  memory39/valid_zones:Normal
  memory40/valid_zones:Movable
  memory41/valid_zones:Movable

Implementation-wise the change is quite straightforward.  We can get rid
of allow_online_pfn_range altogether.  online_pages allows only offline
nodes already.  The original default_zone_for_pfn will become
default_kernel_zone_for_pfn.  The new default_zone_for_pfn implements the
above semantic.  zone_for_pfn_range is slightly reorganized to implement
the kernel and movable online types explicitly, and MMOP_ONLINE_KEEP
becomes a catch-all default behavior.
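
As a comment-level summary of the selection implemented below (a
paraphrase of the new code, not an addition to it):

	/*
	 * zone_for_pfn_range(online_type, nid, start_pfn, nr_pages):
	 *   MMOP_ONLINE_KERNEL  -> default kernel zone for the range
	 *   MMOP_ONLINE_MOVABLE -> the node's ZONE_MOVABLE
	 *   MMOP_ONLINE_KEEP    -> the intersecting zone if exactly one
	 *                          of {kernel, movable} intersects the
	 *                          range; otherwise movable if
	 *                          movable_node is enabled, else the
	 *                          kernel zone
	 */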

Link: http://lkml.kernel.org/r/20170714121233.16861-3-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Reza Arbab <arbab@linux.vnet.ibm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Yasuaki Ishimatsu <yasu.isimatu@gmail.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Kani Toshimitsu <toshi.kani@hpe.com>
Cc: <slaoub@gmail.com>
Cc: Daniel Kiper <daniel.kiper@oracle.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: <linux-api@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/memory.c |  3 --
 mm/memory_hotplug.c   | 74 ++++++++++++++-----------------------------
 2 files changed, 23 insertions(+), 54 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 26383af9900c..4e3b61cda520 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -394,9 +394,6 @@ static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn,
 {
 	struct zone *zone;
 
-	if (!allow_online_pfn_range(nid, start_pfn, nr_pages, online_type))
-		return;
-
 	zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
 	if (zone != default_zone) {
 		strcat(buf, " ");
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e342624622a1..3e69984346da 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -831,7 +831,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone,
  * If no kernel zone covers this pfn range it will automatically go
  * to the ZONE_NORMAL.
  */
-static struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
+static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
 		unsigned long nr_pages)
 {
 	struct pglist_data *pgdat = NODE_DATA(nid);
@@ -847,65 +847,40 @@ static struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
 	return &pgdat->node_zones[ZONE_NORMAL];
 }
 
-bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)
+static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
+		unsigned long nr_pages)
 {
-	struct pglist_data *pgdat = NODE_DATA(nid);
-	struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
-	struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages);
+	struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
+			nr_pages);
+	struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
+	bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
+	bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);
 
 	/*
-	 * TODO there shouldn't be any inherent reason to have ZONE_NORMAL
-	 * physically before ZONE_MOVABLE. All we need is they do not
-	 * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE
-	 * though so let's stick with it for simplicity for now.
-	 * TODO make sure we do not overlap with ZONE_DEVICE
+	 * We inherit the existing zone in a simple case where zones do not
+	 * overlap in the given range
 	 */
-	if (online_type == MMOP_ONLINE_KERNEL) {
-		if (zone_is_empty(movable_zone))
-			return true;
-		return movable_zone->zone_start_pfn >= pfn + nr_pages;
-	} else if (online_type == MMOP_ONLINE_MOVABLE) {
-		return zone_end_pfn(default_zone) <= pfn;
-	}
+	if (in_kernel ^ in_movable)
+		return (in_kernel) ? kernel_zone : movable_zone;
 
-	/* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */
-	return online_type == MMOP_ONLINE_KEEP;
-}
-
-static inline bool movable_pfn_range(int nid, struct zone *default_zone,
-		unsigned long start_pfn, unsigned long nr_pages)
-{
-	if (!allow_online_pfn_range(nid, start_pfn, nr_pages,
-				MMOP_ONLINE_KERNEL))
-		return true;
-
-	if (!movable_node_is_enabled())
-		return false;
-
-	return !zone_intersects(default_zone, start_pfn, nr_pages);
+	/*
+	 * If the range doesn't belong to any zone or two zones overlap in the
+	 * given range then we use movable zone only if movable_node is
+	 * enabled because we always online to a kernel zone by default.
+	 */
+	return movable_node_enabled ? movable_zone : kernel_zone;
 }
 
 struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
 		unsigned long nr_pages)
 {
-	struct pglist_data *pgdat = NODE_DATA(nid);
-	struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages);
+	if (online_type == MMOP_ONLINE_KERNEL)
+		return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
 
-	if (online_type == MMOP_ONLINE_KEEP) {
-		struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
-		/*
-		 * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use
-		 * movable zone if that is not possible (e.g. we are within
-		 * or past the existing movable zone). movable_node overrides
-		 * this default and defaults to movable zone
-		 */
-		if (movable_pfn_range(nid, zone, start_pfn, nr_pages))
-			zone = movable_zone;
-	} else if (online_type == MMOP_ONLINE_MOVABLE) {
-		zone = &pgdat->node_zones[ZONE_MOVABLE];
-	}
+	if (online_type == MMOP_ONLINE_MOVABLE)
+		return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
 
-	return zone;
+	return default_zone_for_pfn(nid, start_pfn, nr_pages);
 }
 
 /*
@@ -934,9 +909,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	struct memory_notify arg;
 
 	nid = pfn_to_nid(pfn);
-	if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))
-		return -EINVAL;
-
 	/* associate pfn range with the zone */
 	zone = move_pfn_range(online_type, nid, pfn, nr_pages);
 

From 4ebbe7f7fc99260afd51759e35dbfdd6010dc697 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Wed, 6 Sep 2017 16:19:44 -0700
Subject: [PATCH 021/119] zram: clean up duplicated code in __zram_bvec_write

Patch series "writeback incompressible pages to storage", v1.

zram is useful for saving memory with compressible pages, but sometimes the
workload changes and the system ends up with lots of incompressible pages,
which is very harmful for zram.

This patch series adds a writeback feature to zram: an admin can set up a
backing block device, and once zram finds incompressible pages (1/4 comp
ratio) it writes them out to that device instead of keeping them in memory.

[1-3] are just clean-ups and [4-8] enable the feature step by step.  [4-8]
are not logically bisectable (i.e., they are split into logical units);
I tried to keep each step compiling without breakage, but the split is
mainly meant to ease review.

This patch (of 9):

__zram_bvec_write has some duplicated logic for the zram metadata handling
of same_page|compressed_page.  This patch aims to clean it up without any
behavior change.

[xieyisheng1@huawei.com: fix compr_data_size stat]
  Link: http://lkml.kernel.org/r/1502707447-6944-1-git-send-email-xieyisheng1@huawei.com
Link: http://lkml.kernel.org/r/1496019048-27016-1-git-send-email-minchan@kernel.org
Link: http://lkml.kernel.org/r/1498459987-24562-2-git-send-email-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Juneho Choi <juno.choi@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 56 ++++++++++++++---------------------
 1 file changed, 23 insertions(+), 33 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 3b1b6340ba13..f1a94f449c6d 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -453,30 +453,6 @@ static bool zram_same_page_read(struct zram *zram, u32 index,
 	return false;
 }
 
-static bool zram_same_page_write(struct zram *zram, u32 index,
-					struct page *page)
-{
-	unsigned long element;
-	void *mem = kmap_atomic(page);
-
-	if (page_same_filled(mem, &element)) {
-		kunmap_atomic(mem);
-		/* Free memory associated with this sector now. */
-		zram_slot_lock(zram, index);
-		zram_free_page(zram, index);
-		zram_set_flag(zram, index, ZRAM_SAME);
-		zram_set_element(zram, index, element);
-		zram_slot_unlock(zram, index);
-
-		atomic64_inc(&zram->stats.same_pages);
-		atomic64_inc(&zram->stats.pages_stored);
-		return true;
-	}
-	kunmap_atomic(mem);
-
-	return false;
-}
-
 static void zram_meta_free(struct zram *zram, u64 disksize)
 {
 	size_t num_pages = disksize >> PAGE_SHIFT;
@@ -685,14 +661,23 @@ compress_again:
 static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
 {
 	int ret;
-	unsigned long handle;
-	unsigned int comp_len;
-	void *src, *dst;
+	unsigned long handle = 0;
+	unsigned int comp_len = 0;
+	void *src, *dst, *mem;
 	struct zcomp_strm *zstrm;
 	struct page *page = bvec->bv_page;
+	unsigned long element = 0;
+	enum zram_pageflags flags = 0;
 
-	if (zram_same_page_write(zram, index, page))
-		return 0;
+	mem = kmap_atomic(page);
+	if (page_same_filled(mem, &element)) {
+		kunmap_atomic(mem);
+		/* Free memory associated with this sector now */
+		atomic64_inc(&zram->stats.same_pages);
+		flags = ZRAM_SAME;
+		goto out;
+	}
+	kunmap_atomic(mem);
 
 	zstrm = zcomp_stream_get(zram->comp);
 	ret = zram_compress(zram, &zstrm, page, &handle, &comp_len);
@@ -712,19 +697,24 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
 
 	zcomp_stream_put(zram->comp);
 	zs_unmap_object(zram->mem_pool, handle);
-
+	atomic64_add(comp_len, &zram->stats.compr_data_size);
+out:
 	/*
 	 * Free memory associated with this sector
 	 * before overwriting unused sectors.
 	 */
 	zram_slot_lock(zram, index);
 	zram_free_page(zram, index);
-	zram_set_handle(zram, index, handle);
-	zram_set_obj_size(zram, index, comp_len);
+	if (flags == ZRAM_SAME) {
+		zram_set_flag(zram, index, ZRAM_SAME);
+		zram_set_element(zram, index, element);
+	} else {
+		zram_set_handle(zram, index, handle);
+		zram_set_obj_size(zram, index, comp_len);
+	}
 	zram_slot_unlock(zram, index);
 
 	/* Update stats */
-	atomic64_add(comp_len, &zram->stats.compr_data_size);
 	atomic64_inc(&zram->stats.pages_stored);
 	return 0;
 }

From 97ec7c8bd5d029b2c3e40355c1204197094e9ba1 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Wed, 6 Sep 2017 16:19:47 -0700
Subject: [PATCH 022/119] zram: inline zram_compress

zram_compress does several things: compression, entry allocation and limit
checking.  I split it out just for readability, but it hurts
modularization. :(

So this patch removes the zram_compress function and inlines it into
__zram_bvec_write for upcoming patches.

Link: http://lkml.kernel.org/r/1498459987-24562-3-git-send-email-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Juneho Choi <juno.choi@lge.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 64 ++++++++++++-----------------------
 1 file changed, 22 insertions(+), 42 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index f1a94f449c6d..7b16ab0b923a 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -589,25 +589,38 @@ out:
 	return ret;
 }
 
-static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
-			struct page *page,
-			unsigned long *out_handle, unsigned int *out_comp_len)
+static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
 {
 	int ret;
-	unsigned int comp_len;
-	void *src;
 	unsigned long alloced_pages;
 	unsigned long handle = 0;
+	unsigned int comp_len = 0;
+	void *src, *dst, *mem;
+	struct zcomp_strm *zstrm;
+	struct page *page = bvec->bv_page;
+	unsigned long element = 0;
+	enum zram_pageflags flags = 0;
+
+	mem = kmap_atomic(page);
+	if (page_same_filled(mem, &element)) {
+		kunmap_atomic(mem);
+		/* Free memory associated with this sector now. */
+		flags = ZRAM_SAME;
+		atomic64_inc(&zram->stats.same_pages);
+		goto out;
+	}
+	kunmap_atomic(mem);
 
 compress_again:
+	zstrm = zcomp_stream_get(zram->comp);
 	src = kmap_atomic(page);
-	ret = zcomp_compress(*zstrm, src, &comp_len);
+	ret = zcomp_compress(zstrm, src, &comp_len);
 	kunmap_atomic(src);
 
 	if (unlikely(ret)) {
+		zcomp_stream_put(zram->comp);
 		pr_err("Compression failed! err=%d\n", ret);
-		if (handle)
-			zs_free(zram->mem_pool, handle);
+		zs_free(zram->mem_pool, handle);
 		return ret;
 	}
 
@@ -639,7 +652,6 @@ compress_again:
 		handle = zs_malloc(zram->mem_pool, comp_len,
 				GFP_NOIO | __GFP_HIGHMEM |
 				__GFP_MOVABLE);
-		*zstrm = zcomp_stream_get(zram->comp);
 		if (handle)
 			goto compress_again;
 		return -ENOMEM;
@@ -649,43 +661,11 @@ compress_again:
 	update_used_max(zram, alloced_pages);
 
 	if (zram->limit_pages && alloced_pages > zram->limit_pages) {
+		zcomp_stream_put(zram->comp);
 		zs_free(zram->mem_pool, handle);
 		return -ENOMEM;
 	}
 
-	*out_handle = handle;
-	*out_comp_len = comp_len;
-	return 0;
-}
-
-static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
-{
-	int ret;
-	unsigned long handle = 0;
-	unsigned int comp_len = 0;
-	void *src, *dst, *mem;
-	struct zcomp_strm *zstrm;
-	struct page *page = bvec->bv_page;
-	unsigned long element = 0;
-	enum zram_pageflags flags = 0;
-
-	mem = kmap_atomic(page);
-	if (page_same_filled(mem, &element)) {
-		kunmap_atomic(mem);
-		/* Free memory associated with this sector now */
-		atomic64_inc(&zram->stats.same_pages);
-		flags = ZRAM_SAME;
-		goto out;
-	}
-	kunmap_atomic(mem);
-
-	zstrm = zcomp_stream_get(zram->comp);
-	ret = zram_compress(zram, &zstrm, page, &handle, &comp_len);
-	if (ret) {
-		zcomp_stream_put(zram->comp);
-		return ret;
-	}
-
 	dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
 
 	src = zstrm->buffer;

From 693dc1ce25b8c8fa33f930d47cd8f926eeb90812 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Wed, 6 Sep 2017 16:19:50 -0700
Subject: [PATCH 023/119] zram: rename zram_decompress_page to __zram_bvec_read

The zram_decompress_page name is not proper because the function doesn't
decompress if the page was a dedup hit or was stored without compression.

Use a more abstract name that is consistent with the write-path function
__zram_bvec_write.

Link: http://lkml.kernel.org/r/1498459987-24562-4-git-send-email-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Juneho Choi <juno.choi@lge.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 7b16ab0b923a..7afe0d8487c0 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -518,7 +518,7 @@ static void zram_free_page(struct zram *zram, size_t index)
 	zram_set_obj_size(zram, index, 0);
 }
 
-static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
+static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index)
 {
 	int ret;
 	unsigned long handle;
@@ -570,7 +570,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
 			return -ENOMEM;
 	}
 
-	ret = zram_decompress_page(zram, page, index);
+	ret = __zram_bvec_read(zram, page, index);
 	if (unlikely(ret))
 		goto out;
 
@@ -718,7 +718,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
 		if (!page)
 			return -ENOMEM;
 
-		ret = zram_decompress_page(zram, page, index);
+		ret = __zram_bvec_read(zram, page, index);
 		if (ret)
 			goto out;
 

From 013bf95a83ec760a2afc37fabd6bf13a9cdae205 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Wed, 6 Sep 2017 16:19:54 -0700
Subject: [PATCH 024/119] zram: add interface to specify backing device

For the writeback feature, the user should set up a backing device before
zram starts working.

This patch enables the interface via /sys/block/zramX/backing_dev.

Currently, it supports only a block device, but it could be enhanced to
support a regular file as well.
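
For illustration only, a user-space snippet that sets the attribute and
reads it back might look like the sketch below; /dev/sdb and zram0 are
placeholders, and CONFIG_ZRAM_WRITEBACK plus root privileges are assumed.

  #include <stdio.h>

  int main(void)
  {
      const char *attr = "/sys/block/zram0/backing_dev";
      char cur[256];
      FILE *f;

      /* Must happen before disksize is set; /dev/sdb is a placeholder. */
      f = fopen(attr, "w");
      if (!f || fputs("/dev/sdb\n", f) == EOF)
          perror("set backing_dev");
      if (f)
          fclose(f);

      /* Reads back the device path, or "none" if not configured. */
      f = fopen(attr, "r");
      if (f && fgets(cur, sizeof(cur), f))
          printf("backing_dev: %s", cur);
      if (f)
          fclose(f);
      return 0;
  }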

Link: http://lkml.kernel.org/r/1498459987-24562-5-git-send-email-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Juneho Choi <juno.choi@lge.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 142 ++++++++++++++++++++++++++++++++++
 drivers/block/zram/zram_drv.h |   5 ++
 2 files changed, 147 insertions(+)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 7afe0d8487c0..81eb81f17ef9 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -270,6 +270,141 @@ static ssize_t mem_used_max_store(struct device *dev,
 	return len;
 }
 
+#ifdef CONFIG_ZRAM_WRITEBACK
+static bool zram_wb_enabled(struct zram *zram)
+{
+	return zram->backing_dev;
+}
+
+static void reset_bdev(struct zram *zram)
+{
+	struct block_device *bdev;
+
+	if (!zram_wb_enabled(zram))
+		return;
+
+	bdev = zram->bdev;
+	if (zram->old_block_size)
+		set_blocksize(bdev, zram->old_block_size);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+	/* hope filp_close flush all of IO */
+	filp_close(zram->backing_dev, NULL);
+	zram->backing_dev = NULL;
+	zram->old_block_size = 0;
+	zram->bdev = NULL;
+}
+
+static ssize_t backing_dev_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+	struct file *file = zram->backing_dev;
+	char *p;
+	ssize_t ret;
+
+	down_read(&zram->init_lock);
+	if (!zram_wb_enabled(zram)) {
+		memcpy(buf, "none\n", 5);
+		up_read(&zram->init_lock);
+		return 5;
+	}
+
+	p = file_path(file, buf, PAGE_SIZE - 1);
+	if (IS_ERR(p)) {
+		ret = PTR_ERR(p);
+		goto out;
+	}
+
+	ret = strlen(p);
+	memmove(buf, p, ret);
+	buf[ret++] = '\n';
+out:
+	up_read(&zram->init_lock);
+	return ret;
+}
+
+static ssize_t backing_dev_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	char *file_name;
+	struct file *backing_dev = NULL;
+	struct inode *inode;
+	struct address_space *mapping;
+	unsigned int old_block_size = 0;
+	struct block_device *bdev = NULL;
+	int err;
+	struct zram *zram = dev_to_zram(dev);
+
+	file_name = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!file_name)
+		return -ENOMEM;
+
+	down_write(&zram->init_lock);
+	if (init_done(zram)) {
+		pr_info("Can't setup backing device for initialized device\n");
+		err = -EBUSY;
+		goto out;
+	}
+
+	strlcpy(file_name, buf, len);
+
+	backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0);
+	if (IS_ERR(backing_dev)) {
+		err = PTR_ERR(backing_dev);
+		backing_dev = NULL;
+		goto out;
+	}
+
+	mapping = backing_dev->f_mapping;
+	inode = mapping->host;
+
+	/* Support only block device in this moment */
+	if (!S_ISBLK(inode->i_mode)) {
+		err = -ENOTBLK;
+		goto out;
+	}
+
+	bdev = bdgrab(I_BDEV(inode));
+	err = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram);
+	if (err < 0)
+		goto out;
+
+	old_block_size = block_size(bdev);
+	err = set_blocksize(bdev, PAGE_SIZE);
+	if (err)
+		goto out;
+
+	reset_bdev(zram);
+
+	zram->old_block_size = old_block_size;
+	zram->bdev = bdev;
+	zram->backing_dev = backing_dev;
+	up_write(&zram->init_lock);
+
+	pr_info("setup backing device %s\n", file_name);
+	kfree(file_name);
+
+	return len;
+out:
+	if (bdev)
+		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+
+	if (backing_dev)
+		filp_close(backing_dev, NULL);
+
+	up_write(&zram->init_lock);
+
+	kfree(file_name);
+
+	return err;
+}
+
+#else
+static bool zram_wb_enabled(struct zram *zram) { return false; }
+static inline void reset_bdev(struct zram *zram) {};
+#endif
+
+
 /*
  * We switched to per-cpu streams and this attr is not needed anymore.
  * However, we will keep it around for some time, because:
@@ -953,6 +1088,7 @@ static void zram_reset_device(struct zram *zram)
 	zram_meta_free(zram, disksize);
 	memset(&zram->stats, 0, sizeof(zram->stats));
 	zcomp_destroy(comp);
+	reset_bdev(zram);
 }
 
 static ssize_t disksize_store(struct device *dev,
@@ -1078,6 +1214,9 @@ static DEVICE_ATTR_WO(mem_limit);
 static DEVICE_ATTR_WO(mem_used_max);
 static DEVICE_ATTR_RW(max_comp_streams);
 static DEVICE_ATTR_RW(comp_algorithm);
+#ifdef CONFIG_ZRAM_WRITEBACK
+static DEVICE_ATTR_RW(backing_dev);
+#endif
 
 static struct attribute *zram_disk_attrs[] = {
 	&dev_attr_disksize.attr,
@@ -1088,6 +1227,9 @@ static struct attribute *zram_disk_attrs[] = {
 	&dev_attr_mem_used_max.attr,
 	&dev_attr_max_comp_streams.attr,
 	&dev_attr_comp_algorithm.attr,
+#ifdef CONFIG_ZRAM_WRITEBACK
+	&dev_attr_backing_dev.attr,
+#endif
 	&dev_attr_io_stat.attr,
 	&dev_attr_mm_stat.attr,
 	&dev_attr_debug_stat.attr,
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index e34e44d02e3e..113a41118918 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -115,5 +115,10 @@ struct zram {
 	 * zram is claimed so open request will be failed
 	 */
 	bool claim; /* Protected by bdev->bd_mutex */
+#ifdef CONFIG_ZRAM_WRITEBACK
+	struct file *backing_dev;
+	struct block_device *bdev;
+	unsigned int old_block_size;
+#endif
 };
 #endif

From 1363d4662a0d28dfdb81ef426c88c9a8dbf7c338 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Wed, 6 Sep 2017 16:19:57 -0700
Subject: [PATCH 025/119] zram: add free space management in backing device

With a backing device, zram needs to manage the free space of that device.

This patch adds very naive bitmap logic to manage the free space.  However,
it should be simple enough considering how infrequently incompressible
pages show up in zram.
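
A rough user-space sketch of the same idea is below.  The real code uses a
kvzalloc'ed bitmap with find_next_zero_bit under a spinlock; this toy
version uses a bool array and no locking, and it likewise never hands out
slot 0 so that the value 0 can mean "no entry".

  #include <stdbool.h>
  #include <stdio.h>

  #define NR_PAGES 64            /* arbitrary backing device size, in pages */
  static bool bitmap[NR_PAGES];   /* one bit per page slot in the real code */

  /* Returns a free slot, or 0 when the backing device is full. */
  static unsigned long get_entry(void)
  {
      for (unsigned long i = 1; i < NR_PAGES; i++) {  /* skip slot 0 */
          if (!bitmap[i]) {
              bitmap[i] = true;
              return i;
          }
      }
      return 0;
  }

  static void put_entry(unsigned long entry)
  {
      if (!bitmap[entry])
          fprintf(stderr, "double free of slot %lu\n", entry);
      bitmap[entry] = false;
  }

  int main(void)
  {
      unsigned long a = get_entry(), b = get_entry();

      printf("allocated slots %lu and %lu\n", a, b);      /* 1 and 2 */
      put_entry(a);
      printf("slot %lu is reused first\n", get_entry());  /* 1 again */
      return 0;
  }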

Link: http://lkml.kernel.org/r/1498459987-24562-6-git-send-email-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Juneho Choi <juno.choi@lge.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 48 ++++++++++++++++++++++++++++++++++-
 drivers/block/zram/zram_drv.h |  3 +++
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 81eb81f17ef9..fcdbbb1e7745 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -292,6 +292,9 @@ static void reset_bdev(struct zram *zram)
 	zram->backing_dev = NULL;
 	zram->old_block_size = 0;
 	zram->bdev = NULL;
+
+	kvfree(zram->bitmap);
+	zram->bitmap = NULL;
 }
 
 static ssize_t backing_dev_show(struct device *dev,
@@ -330,7 +333,8 @@ static ssize_t backing_dev_store(struct device *dev,
 	struct file *backing_dev = NULL;
 	struct inode *inode;
 	struct address_space *mapping;
-	unsigned int old_block_size = 0;
+	unsigned int bitmap_sz, old_block_size = 0;
+	unsigned long nr_pages, *bitmap = NULL;
 	struct block_device *bdev = NULL;
 	int err;
 	struct zram *zram = dev_to_zram(dev);
@@ -369,16 +373,27 @@ static ssize_t backing_dev_store(struct device *dev,
 	if (err < 0)
 		goto out;
 
+	nr_pages = i_size_read(inode) >> PAGE_SHIFT;
+	bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);
+	bitmap = kvzalloc(bitmap_sz, GFP_KERNEL);
+	if (!bitmap) {
+		err = -ENOMEM;
+		goto out;
+	}
+
 	old_block_size = block_size(bdev);
 	err = set_blocksize(bdev, PAGE_SIZE);
 	if (err)
 		goto out;
 
 	reset_bdev(zram);
+	spin_lock_init(&zram->bitmap_lock);
 
 	zram->old_block_size = old_block_size;
 	zram->bdev = bdev;
 	zram->backing_dev = backing_dev;
+	zram->bitmap = bitmap;
+	zram->nr_pages = nr_pages;
 	up_write(&zram->init_lock);
 
 	pr_info("setup backing device %s\n", file_name);
@@ -386,6 +401,9 @@ static ssize_t backing_dev_store(struct device *dev,
 
 	return len;
 out:
+	if (bitmap)
+		kvfree(bitmap);
+
 	if (bdev)
 		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 
@@ -399,6 +417,34 @@ out:
 	return err;
 }
 
+static unsigned long get_entry_bdev(struct zram *zram)
+{
+	unsigned long entry;
+
+	spin_lock(&zram->bitmap_lock);
+	/* skip 0 bit to confuse zram.handle = 0 */
+	entry = find_next_zero_bit(zram->bitmap, zram->nr_pages, 1);
+	if (entry == zram->nr_pages) {
+		spin_unlock(&zram->bitmap_lock);
+		return 0;
+	}
+
+	set_bit(entry, zram->bitmap);
+	spin_unlock(&zram->bitmap_lock);
+
+	return entry;
+}
+
+static void put_entry_bdev(struct zram *zram, unsigned long entry)
+{
+	int was_set;
+
+	spin_lock(&zram->bitmap_lock);
+	was_set = test_and_clear_bit(entry, zram->bitmap);
+	spin_unlock(&zram->bitmap_lock);
+	WARN_ON_ONCE(!was_set);
+}
+
 #else
 static bool zram_wb_enabled(struct zram *zram) { return false; }
 static inline void reset_bdev(struct zram *zram) {};
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 113a41118918..707aec0a2681 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -119,6 +119,9 @@ struct zram {
 	struct file *backing_dev;
 	struct block_device *bdev;
 	unsigned int old_block_size;
+	unsigned long *bitmap;
+	unsigned long nr_pages;
+	spinlock_t bitmap_lock;
 #endif
 };
 #endif

From ae85a8075c5b025b9d503554ddc480a346a24536 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Wed, 6 Sep 2017 16:20:00 -0700
Subject: [PATCH 026/119] zram: identify asynchronous IO's return value

For upcoming asynchronous IO like writeback, zram_rw_page should be aware
of whether the requested IO completed synchronously, was submitted
successfully, or failed with an error.

To that end, zram_bvec_rw has three return values:

-errno: an error occurred
     0: the IO request was done synchronously
     1: the IO request was submitted successfully
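
A caller has to distinguish all three cases.  The user-space sketch below
only shows how such 0/1/-errno values are consumed; zram_bvec_rw() is
replaced by a trivial stub.

  #include <errno.h>
  #include <stdio.h>

  /* Stub standing in for zram_bvec_rw(): <0 error, 0 done, 1 submitted. */
  static int fake_bvec_rw(int scenario)
  {
      return scenario;
  }

  static int rw_page(int scenario)
  {
      int ret = fake_bvec_rw(scenario);

      if (ret < 0)
          return ret;   /* propagate the error to the caller */

      switch (ret) {
      case 0:
          puts("done synchronously: end the page IO right away");
          break;
      case 1:
          puts("submitted: completion will be signalled asynchronously");
          ret = 0;
          break;
      }
      return ret;
  }

  int main(void)
  {
      rw_page(0);
      rw_page(1);
      printf("error path returns %d\n", rw_page(-EIO));
      return 0;
  }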

Link: http://lkml.kernel.org/r/1498459987-24562-7-git-send-email-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Juneho Choi <juno.choi@lge.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index fcdbbb1e7745..8975f75f113d 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -772,7 +772,7 @@ out:
 
 static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
 {
-	int ret;
+	int ret = 0;
 	unsigned long alloced_pages;
 	unsigned long handle = 0;
 	unsigned int comp_len = 0;
@@ -877,7 +877,7 @@ out:
 
 	/* Update stats */
 	atomic64_inc(&zram->stats.pages_stored);
-	return 0;
+	return ret;
 }
 
 static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
@@ -959,6 +959,11 @@ static void zram_bio_discard(struct zram *zram, u32 index,
 	}
 }
 
+/*
+ * Returns errno if it has some problem. Otherwise return 0 or 1.
+ * Returns 0 if IO request was done synchronously
+ * Returns 1 if IO request was successfully submitted.
+ */
 static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
 			int offset, bool is_write)
 {
@@ -980,7 +985,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
 
 	generic_end_io_acct(rw_acct, &zram->disk->part0, start_time);
 
-	if (unlikely(ret)) {
+	if (unlikely(ret < 0)) {
 		if (!is_write)
 			atomic64_inc(&zram->stats.failed_reads);
 		else
@@ -1073,7 +1078,7 @@ static void zram_slot_free_notify(struct block_device *bdev,
 static int zram_rw_page(struct block_device *bdev, sector_t sector,
 		       struct page *page, bool is_write)
 {
-	int offset, err = -EIO;
+	int offset, ret;
 	u32 index;
 	struct zram *zram;
 	struct bio_vec bv;
@@ -1082,7 +1087,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
 
 	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
 		atomic64_inc(&zram->stats.invalid_io);
-		err = -EINVAL;
+		ret = -EINVAL;
 		goto out;
 	}
 
@@ -1093,7 +1098,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
 	bv.bv_len = PAGE_SIZE;
 	bv.bv_offset = 0;
 
-	err = zram_bvec_rw(zram, &bv, index, offset, is_write);
+	ret = zram_bvec_rw(zram, &bv, index, offset, is_write);
 out:
 	/*
 	 * If I/O fails, just return error(ie, non-zero) without
@@ -1103,9 +1108,20 @@ out:
 	 * bio->bi_end_io does things to handle the error
 	 * (e.g., SetPageError, set_page_dirty and extra works).
 	 */
-	if (err == 0)
+	if (unlikely(ret < 0))
+		return ret;
+
+	switch (ret) {
+	case 0:
 		page_endio(page, is_write, 0);
-	return err;
+		break;
+	case 1:
+		ret = 0;
+		break;
+	default:
+		WARN_ON(1);
+	}
+	return ret;
 }
 
 static void zram_reset_device(struct zram *zram)

From db8ffbd4e7634cc537c8d32e73e7ce0f06248645 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Wed, 6 Sep 2017 16:20:03 -0700
Subject: [PATCH 027/119] zram: write incompressible pages to backing device

This patch enables write IO to transfer data to the backing device.  For
that, it implements a write_to_bdev function which creates a new bio and
chains it with the parent bio to make the parent bio asynchronous.

For rw_page, which has no parent bio, it submits its own bio and handles IO
completion via zram_page_end_io.

Also, this patch defines a new flag, ZRAM_WB, to mark a written page for
later read IO.
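
The resulting write-path control flow, reduced to a user-space sketch:
compression, the backing device write and the 3/4-page threshold are all
mocked here, and only the ordering of the decisions is taken from the patch.

  #include <stdbool.h>
  #include <stdio.h>

  #define PAGE_SZ   4096
  #define MAX_ZPAGE (PAGE_SZ / 4 * 3)   /* mirrors max_zpage_size */

  enum result { STORED_SAME, STORED_COMPRESSED, STORED_UNCOMPRESSED, WRITTEN_BACK };

  /* Mocked helpers; the real driver compresses and submits a chained bio. */
  static bool page_same_filled(void)  { return false; }
  static unsigned int compress(void)  { return PAGE_SZ; }   /* incompressible */
  static int write_to_bdev(void)      { return 0; }         /* 0 == success */

  static enum result store_page(bool wb_enabled)
  {
      bool allow_wb = true;
      unsigned int comp_len;

      if (page_same_filled())
          return STORED_SAME;
  again:
      comp_len = compress();
      if (comp_len > MAX_ZPAGE) {
          if (wb_enabled && allow_wb) {
              if (!write_to_bdev())
                  return WRITTEN_BACK;   /* slot gets ZRAM_WB */
              allow_wb = false;          /* retry in memory once */
              goto again;
          }
          return STORED_UNCOMPRESSED;    /* kept in memory as a full page */
      }
      return STORED_COMPRESSED;
  }

  int main(void)
  {
      printf("with writeback: %d, without: %d\n",
             store_page(true), store_page(false));
      return 0;
  }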

[xieyisheng1@huawei.com: fix typo in comment]
  Link: http://lkml.kernel.org/r/1502707447-6944-2-git-send-email-xieyisheng1@huawei.com
Link: http://lkml.kernel.org/r/1498459987-24562-8-git-send-email-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com>
Cc: Juneho Choi <juno.choi@lge.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 113 ++++++++++++++++++++++++++++++----
 drivers/block/zram/zram_drv.h |   3 +-
 2 files changed, 103 insertions(+), 13 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 8975f75f113d..195b3372241c 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -445,9 +445,76 @@ static void put_entry_bdev(struct zram *zram, unsigned long entry)
 	WARN_ON_ONCE(!was_set);
 }
 
+void zram_page_end_io(struct bio *bio)
+{
+	struct page *page = bio->bi_io_vec[0].bv_page;
+
+	page_endio(page, op_is_write(bio_op(bio)),
+			blk_status_to_errno(bio->bi_status));
+	bio_put(bio);
+}
+
+static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
+					u32 index, struct bio *parent,
+					unsigned long *pentry)
+{
+	struct bio *bio;
+	unsigned long entry;
+
+	bio = bio_alloc(GFP_ATOMIC, 1);
+	if (!bio)
+		return -ENOMEM;
+
+	entry = get_entry_bdev(zram);
+	if (!entry) {
+		bio_put(bio);
+		return -ENOSPC;
+	}
+
+	bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
+	bio->bi_bdev = zram->bdev;
+	if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+					bvec->bv_offset)) {
+		bio_put(bio);
+		put_entry_bdev(zram, entry);
+		return -EIO;
+	}
+
+	if (!parent) {
+		bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
+		bio->bi_end_io = zram_page_end_io;
+	} else {
+		bio->bi_opf = parent->bi_opf;
+		bio_chain(bio, parent);
+	}
+
+	submit_bio(bio);
+	*pentry = entry;
+
+	return 0;
+}
+
+static void zram_wb_clear(struct zram *zram, u32 index)
+{
+	unsigned long entry;
+
+	zram_clear_flag(zram, index, ZRAM_WB);
+	entry = zram_get_element(zram, index);
+	zram_set_element(zram, index, 0);
+	put_entry_bdev(zram, entry);
+}
+
 #else
 static bool zram_wb_enabled(struct zram *zram) { return false; }
 static inline void reset_bdev(struct zram *zram) {};
+static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
+					u32 index, struct bio *parent,
+					unsigned long *pentry)
+
+{
+	return -EIO;
+}
+static void zram_wb_clear(struct zram *zram, u32 index) {}
 #endif
 
 
@@ -672,7 +739,13 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
  */
 static void zram_free_page(struct zram *zram, size_t index)
 {
-	unsigned long handle = zram_get_handle(zram, index);
+	unsigned long handle;
+
+	if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) {
+		zram_wb_clear(zram, index);
+		atomic64_dec(&zram->stats.pages_stored);
+		return;
+	}
 
 	/*
 	 * No memory is allocated for same element filled pages.
@@ -686,6 +759,7 @@ static void zram_free_page(struct zram *zram, size_t index)
 		return;
 	}
 
+	handle = zram_get_handle(zram, index);
 	if (!handle)
 		return;
 
@@ -770,7 +844,8 @@ out:
 	return ret;
 }
 
-static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
+static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
+				u32 index, struct bio *bio)
 {
 	int ret = 0;
 	unsigned long alloced_pages;
@@ -781,6 +856,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
 	struct page *page = bvec->bv_page;
 	unsigned long element = 0;
 	enum zram_pageflags flags = 0;
+	bool allow_wb = true;
 
 	mem = kmap_atomic(page);
 	if (page_same_filled(mem, &element)) {
@@ -805,8 +881,20 @@ compress_again:
 		return ret;
 	}
 
-	if (unlikely(comp_len > max_zpage_size))
+	if (unlikely(comp_len > max_zpage_size)) {
+		if (zram_wb_enabled(zram) && allow_wb) {
+			zcomp_stream_put(zram->comp);
+			ret = write_to_bdev(zram, bvec, index, bio, &element);
+			if (!ret) {
+				flags = ZRAM_WB;
+				ret = 1;
+				goto out;
+			}
+			allow_wb = false;
+			goto compress_again;
+		}
 		comp_len = PAGE_SIZE;
+	}
 
 	/*
 	 * handle allocation has 2 paths:
@@ -866,10 +954,11 @@ out:
 	 */
 	zram_slot_lock(zram, index);
 	zram_free_page(zram, index);
-	if (flags == ZRAM_SAME) {
-		zram_set_flag(zram, index, ZRAM_SAME);
+
+	if (flags) {
+		zram_set_flag(zram, index, flags);
 		zram_set_element(zram, index, element);
-	} else {
+	}  else {
 		zram_set_handle(zram, index, handle);
 		zram_set_obj_size(zram, index, comp_len);
 	}
@@ -881,7 +970,7 @@ out:
 }
 
 static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
-				u32 index, int offset)
+				u32 index, int offset, struct bio *bio)
 {
 	int ret;
 	struct page *page = NULL;
@@ -914,7 +1003,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
 		vec.bv_offset = 0;
 	}
 
-	ret = __zram_bvec_write(zram, &vec, index);
+	ret = __zram_bvec_write(zram, &vec, index, bio);
 out:
 	if (is_partial_io(bvec))
 		__free_page(page);
@@ -965,7 +1054,7 @@ static void zram_bio_discard(struct zram *zram, u32 index,
  * Returns 1 if IO request was successfully submitted.
  */
 static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
-			int offset, bool is_write)
+			int offset, bool is_write, struct bio *bio)
 {
 	unsigned long start_time = jiffies;
 	int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ;
@@ -980,7 +1069,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
 		flush_dcache_page(bvec->bv_page);
 	} else {
 		atomic64_inc(&zram->stats.num_writes);
-		ret = zram_bvec_write(zram, bvec, index, offset);
+		ret = zram_bvec_write(zram, bvec, index, offset, bio);
 	}
 
 	generic_end_io_acct(rw_acct, &zram->disk->part0, start_time);
@@ -1024,7 +1113,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
 			bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
 							unwritten);
 			if (zram_bvec_rw(zram, &bv, index, offset,
-					op_is_write(bio_op(bio))) < 0)
+					op_is_write(bio_op(bio)), bio) < 0)
 				goto out;
 
 			bv.bv_offset += bv.bv_len;
@@ -1098,7 +1187,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
 	bv.bv_len = PAGE_SIZE;
 	bv.bv_offset = 0;
 
-	ret = zram_bvec_rw(zram, &bv, index, offset, is_write);
+	ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL);
 out:
 	/*
 	 * If I/O fails, just return error(ie, non-zero) without
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 707aec0a2681..31762db861e3 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -60,9 +60,10 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3;
 
 /* Flags for zram pages (table[page_no].value) */
 enum zram_pageflags {
-	/* Page consists entirely of zeros */
+	/* Page consists the same element */
 	ZRAM_SAME = ZRAM_FLAG_SHIFT,
 	ZRAM_ACCESS,	/* page is now accessed */
+	ZRAM_WB,	/* page is stored on backing_device */
 
 	__NR_ZRAM_PAGEFLAGS,
 };

From 8e654f8fbff52ac483fb69957222853d7e2fc588 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Wed, 6 Sep 2017 16:20:07 -0700
Subject: [PATCH 028/119] zram: read page from backing device

This patch enables read IO from the backing device.  For the feature, it
implements two read functions to transfer data from backing storage.

One is an asynchronous IO function and the other is a synchronous one.

The reason synchronous IO is needed is the partial-write case, which must
complete the read IO before overwriting the partial data.

We could make the partial IO case asynchronous too, but at the moment I
don't feel like adding more complexity for such a rare use case, so I want
to keep it simple.

[xieyisheng1@huawei.com: read_from_bdev_async(): return 1 to avoid calling page_endio() in zram_rw_page()]
  Link: http://lkml.kernel.org/r/1502707447-6944-1-git-send-email-xieyisheng1@huawei.com
Link: http://lkml.kernel.org/r/1498459987-24562-9-git-send-email-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com>
Cc: Juneho Choi <juno.choi@lge.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 123 ++++++++++++++++++++++++++++++++--
 1 file changed, 118 insertions(+), 5 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 195b3372241c..4ddc18a0a04f 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -454,6 +454,95 @@ void zram_page_end_io(struct bio *bio)
 	bio_put(bio);
 }
 
+/*
+ * Returns 1 if the submission is successful.
+ */
+static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
+			unsigned long entry, struct bio *parent)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_ATOMIC, 1);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
+	bio->bi_bdev = zram->bdev;
+	if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) {
+		bio_put(bio);
+		return -EIO;
+	}
+
+	if (!parent) {
+		bio->bi_opf = REQ_OP_READ;
+		bio->bi_end_io = zram_page_end_io;
+	} else {
+		bio->bi_opf = parent->bi_opf;
+		bio_chain(bio, parent);
+	}
+
+	submit_bio(bio);
+	return 1;
+}
+
+struct zram_work {
+	struct work_struct work;
+	struct zram *zram;
+	unsigned long entry;
+	struct bio *bio;
+};
+
+#if PAGE_SIZE != 4096
+static void zram_sync_read(struct work_struct *work)
+{
+	struct bio_vec bvec;
+	struct zram_work *zw = container_of(work, struct zram_work, work);
+	struct zram *zram = zw->zram;
+	unsigned long entry = zw->entry;
+	struct bio *bio = zw->bio;
+
+	read_from_bdev_async(zram, &bvec, entry, bio);
+}
+
+/*
+ * Block layer want one ->make_request_fn to be active at a time
+ * so if we use chained IO with parent IO in same context,
+ * it's a deadlock. To avoid, it, it uses worker thread context.
+ */
+static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
+				unsigned long entry, struct bio *bio)
+{
+	struct zram_work work;
+
+	work.zram = zram;
+	work.entry = entry;
+	work.bio = bio;
+
+	INIT_WORK_ONSTACK(&work.work, zram_sync_read);
+	queue_work(system_unbound_wq, &work.work);
+	flush_work(&work.work);
+	destroy_work_on_stack(&work.work);
+
+	return 1;
+}
+#else
+static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
+				unsigned long entry, struct bio *bio)
+{
+	WARN_ON(1);
+	return -EIO;
+}
+#endif
+
+static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
+			unsigned long entry, struct bio *parent, bool sync)
+{
+	if (sync)
+		return read_from_bdev_sync(zram, bvec, entry, parent);
+	else
+		return read_from_bdev_async(zram, bvec, entry, parent);
+}
+
 static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
 					u32 index, struct bio *parent,
 					unsigned long *pentry)
@@ -514,6 +603,12 @@ static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
 {
 	return -EIO;
 }
+
+static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
+			unsigned long entry, struct bio *parent, bool sync)
+{
+	return -EIO;
+}
 static void zram_wb_clear(struct zram *zram, u32 index) {}
 #endif
 
@@ -773,13 +868,31 @@ static void zram_free_page(struct zram *zram, size_t index)
 	zram_set_obj_size(zram, index, 0);
 }
 
-static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index)
+static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
+				struct bio *bio, bool partial_io)
 {
 	int ret;
 	unsigned long handle;
 	unsigned int size;
 	void *src, *dst;
 
+	if (zram_wb_enabled(zram)) {
+		zram_slot_lock(zram, index);
+		if (zram_test_flag(zram, index, ZRAM_WB)) {
+			struct bio_vec bvec;
+
+			zram_slot_unlock(zram, index);
+
+			bvec.bv_page = page;
+			bvec.bv_len = PAGE_SIZE;
+			bvec.bv_offset = 0;
+			return read_from_bdev(zram, &bvec,
+					zram_get_element(zram, index),
+					bio, partial_io);
+		}
+		zram_slot_unlock(zram, index);
+	}
+
 	if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE))
 		return 0;
 
@@ -812,7 +925,7 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index)
 }
 
 static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
-				u32 index, int offset)
+				u32 index, int offset, struct bio *bio)
 {
 	int ret;
 	struct page *page;
@@ -825,7 +938,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
 			return -ENOMEM;
 	}
 
-	ret = __zram_bvec_read(zram, page, index);
+	ret = __zram_bvec_read(zram, page, index, bio, is_partial_io(bvec));
 	if (unlikely(ret))
 		goto out;
 
@@ -988,7 +1101,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
 		if (!page)
 			return -ENOMEM;
 
-		ret = __zram_bvec_read(zram, page, index);
+		ret = __zram_bvec_read(zram, page, index, bio, true);
 		if (ret)
 			goto out;
 
@@ -1065,7 +1178,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
 
 	if (!is_write) {
 		atomic64_inc(&zram->stats.num_reads);
-		ret = zram_bvec_read(zram, bvec, index, offset);
+		ret = zram_bvec_read(zram, bvec, index, offset, bio);
 		flush_dcache_page(bvec->bv_page);
 	} else {
 		atomic64_inc(&zram->stats.num_writes);

From 5a47074f0279421778f97b1b1e75686696a5f42a Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Wed, 6 Sep 2017 16:20:10 -0700
Subject: [PATCH 029/119] zram: add config and doc file for writeback feature

This patch adds documentation and a Kconfig entry for the writeback feature.

Link: http://lkml.kernel.org/r/1498459987-24562-10-git-send-email-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Juneho Choi <juno.choi@lge.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/ABI/testing/sysfs-block-zram |  8 ++++++++
 Documentation/blockdev/zram.txt            | 11 +++++++++++
 drivers/block/zram/Kconfig                 | 12 ++++++++++++
 3 files changed, 31 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram
index 451b6d882b2c..c1513c756af1 100644
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@@ -90,3 +90,11 @@ Description:
 		device's debugging info useful for kernel developers. Its
 		format is not documented intentionally and may change
 		anytime without any notice.
+
+What:		/sys/block/zram<id>/backing_dev
+Date:		June 2017
+Contact:	Minchan Kim <minchan@kernel.org>
+Description:
+		The backing_dev file is read-write and set up backing
+		device for zram to write incompressible pages.
+		For using, user should enable CONFIG_ZRAM_WRITEBACK.
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 4fced8a21307..257e65714c6a 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -168,6 +168,7 @@ max_comp_streams  RW    the number of possible concurrent compress operations
 comp_algorithm    RW    show and change the compression algorithm
 compact           WO    trigger memory compaction
 debug_stat        RO    this file is used for zram debugging purposes
+backing_dev	  RW	set up backend storage for zram to write out
 
 
 User space is advised to use the following files to read the device statistics.
@@ -231,5 +232,15 @@ line of text and contains the following stats separated by whitespace:
 	resets the disksize to zero. You must set the disksize again
 	before reusing the device.
 
+* Optional Feature
+
+= writeback
+
+With incompressible pages, there is no memory saving with zram.
+Instead, with CONFIG_ZRAM_WRITEBACK, zram can write incompressible page
+to backing storage rather than keeping it in memory.
+User should set up backing device via /sys/block/zramX/backing_dev
+before disksize setting.
+
 Nitin Gupta
 ngupta@vflare.org
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index b8ecba6dcd3b..7cd4a8ec3c8f 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -13,3 +13,15 @@ config ZRAM
 	  disks and maybe many more.
 
 	  See zram.txt for more information.
+
+config ZRAM_WRITEBACK
+       bool "Write back incompressible page to backing device"
+       depends on ZRAM
+       default n
+       help
+	 With incompressible page, there is no memory saving to keep it
+	 in memory. Instead, write it out to backing device.
+	 For this feature, admin should set up backing device via
+	 /sys/block/zramX/backing_dev.
+
+	 See zram.txt for more infomration.

From c9bff3eebc09be23fbc868f5e6731666d23cbea3 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 6 Sep 2017 16:20:13 -0700
Subject: [PATCH 030/119] mm, page_alloc: rip out ZONELIST_ORDER_ZONE

Patch series "cleanup zonelists initialization", v1.

This series is aimed at cleaning up the zonelists initialization code we
have.  The primary motivation was bug report [2]; it got resolved, but the
usage of stop_machine is just too ugly to live.  Most patches are
straightforward, but 3 of them need special consideration.

Patch 1 removes zone-ordered zonelists completely.  I am CCing linux-api
because this is a user-visible change.  As I argue in the patch
description, I do not think we have a strong usecase for it these days.
I have kept the sysctl in place and warn in the log if somebody tries to
configure zone ordering.  If somebody has a real usecase for it, we can
revert this patch, but I do not expect anybody will actually notice
runtime differences.  This patch is not strictly needed for the rest, but
it made patch 6 easier to implement.

Patch 7 removes stop_machine from build_all_zonelists without adding any
special synchronization between iterators and the updater, which I
_believe_ is acceptable, as explained in the changelog.  I hope I am not
missing anything.

Patch 8 then removes zonelists_mutex, which is kind of ugly as well and
not really needed AFAICS, but care should be taken when double-checking my
thinking.

This patch (of 9):

Supporting zone-ordered zonelists costs us a lot of code while the
usefulness is arguable, if existent at all.  Mel has already made node
ordering the default on 64b systems.  32b systems are still using
ZONELIST_ORDER_ZONE because it is considered better to fall back to a
different NUMA node rather than consume precious lowmem zones.

This argument is, however, weakened by the fact that memory reclaim has
been reworked to be node rather than zone oriented.  This means that
lowmem requests already have to skip over all highmem pages on the LRUs,
and so zone ordering doesn't save much reclaim time.  So the only
advantage of zone ordering is under light memory pressure, when highmem
requests never hit lowmem zones and lowmem pressure doesn't need to
reclaim.

Considering that 32b NUMA systems are rather suboptimal already and it is
generally advisable to use a 64b kernel on such HW, I believe we should
rather care about code maintainability and just get rid of
ZONELIST_ORDER_ZONE altogether.  Keep the sysctl in place and warn if
somebody tries to set zone ordering either from the kernel command line or
the sysctl.

[mhocko@suse.com: reading vm.numa_zonelist_order will never terminate]
Link: http://lkml.kernel.org/r/20170721143915.14161-2-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Abdul Haleem <abdhalee@linux.vnet.ibm.com>
Cc: <linux-api@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 .../admin-guide/kernel-parameters.txt         |   2 +-
 Documentation/sysctl/vm.txt                   |   4 +-
 Documentation/vm/numa                         |   7 +-
 include/linux/mmzone.h                        |   2 +-
 mm/page_alloc.c                               | 179 ++----------------
 5 files changed, 28 insertions(+), 166 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6996b7727b85..86b0e8ec8ad7 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2783,7 +2783,7 @@
 			Allowed values are enable and disable
 
 	numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
-			one of ['zone', 'node', 'default'] can be specified
+			'node', 'default' can be specified
 			This can be set from sysctl after boot.
 			See Documentation/sysctl/vm.txt for details.
 
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 48244c42ff52..9baf66a9ef4e 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -572,7 +572,9 @@ See Documentation/nommu-mmap.txt for more information.
 
 numa_zonelist_order
 
-This sysctl is only for NUMA.
+This sysctl is only for NUMA and it is deprecated. Anything but
+Node order will fail!
+
 'where the memory is allocated from' is controlled by zonelists.
 (This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation.
  you may be able to read ZONE_DMA as ZONE_DMA32...)
diff --git a/Documentation/vm/numa b/Documentation/vm/numa
index a08f71647714..a31b85b9bb88 100644
--- a/Documentation/vm/numa
+++ b/Documentation/vm/numa
@@ -79,11 +79,8 @@ memory, Linux must decide whether to order the zonelists such that allocations
 fall back to the same zone type on a different node, or to a different zone
 type on the same node.  This is an important consideration because some zones,
 such as DMA or DMA32, represent relatively scarce resources.  Linux chooses
-a default zonelist order based on the sizes of the various zone types relative
-to the total memory of the node and the total memory of the system.  The
-default zonelist order may be overridden using the numa_zonelist_order kernel
-boot parameter or sysctl.  [see Documentation/admin-guide/kernel-parameters.rst and
-Documentation/sysctl/vm.txt]
+a default Node ordered zonelist. This means it tries to fallback to other zones
+from the same node before using remote nodes which are ordered by NUMA distance.
 
 By default, Linux will attempt to satisfy memory allocation requests from the
 node to which the CPU that executes the request is assigned.  Specifically,
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fc14b8b3f6ce..bfdc37b77d88 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -896,7 +896,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
 extern int numa_zonelist_order_handler(struct ctl_table *, int,
 			void __user *, size_t *, loff_t *);
 extern char numa_zonelist_order[];
-#define NUMA_ZONELIST_ORDER_LEN 16	/* string buffer size */
+#define NUMA_ZONELIST_ORDER_LEN	16
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dcc8a1cf55b6..6b23df1be909 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4858,52 +4858,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
 	return nr_zones;
 }
 
-
-/*
- *  zonelist_order:
- *  0 = automatic detection of better ordering.
- *  1 = order by ([node] distance, -zonetype)
- *  2 = order by (-zonetype, [node] distance)
- *
- *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
- *  the same zonelist. So only NUMA can configure this param.
- */
-#define ZONELIST_ORDER_DEFAULT  0
-#define ZONELIST_ORDER_NODE     1
-#define ZONELIST_ORDER_ZONE     2
-
-/* zonelist order in the kernel.
- * set_zonelist_order() will set this to NODE or ZONE.
- */
-static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
-static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
-
-
 #ifdef CONFIG_NUMA
-/* The value user specified ....changed by config */
-static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
-/* string for sysctl */
-#define NUMA_ZONELIST_ORDER_LEN	16
-char numa_zonelist_order[16] = "default";
-
-/*
- * interface for configure zonelist ordering.
- * command line option "numa_zonelist_order"
- *	= "[dD]efault	- default, automatic configuration.
- *	= "[nN]ode 	- order by node locality, then by zone within node
- *	= "[zZ]one      - order by zone, then by locality within zone
- */
 
 static int __parse_numa_zonelist_order(char *s)
 {
-	if (*s == 'd' || *s == 'D') {
-		user_zonelist_order = ZONELIST_ORDER_DEFAULT;
-	} else if (*s == 'n' || *s == 'N') {
-		user_zonelist_order = ZONELIST_ORDER_NODE;
-	} else if (*s == 'z' || *s == 'Z') {
-		user_zonelist_order = ZONELIST_ORDER_ZONE;
-	} else {
-		pr_warn("Ignoring invalid numa_zonelist_order value:  %s\n", s);
+	/*
+	 * We used to support different zonlists modes but they turned
+	 * out to be just not useful. Let's keep the warning in place
+	 * if somebody still use the cmd line parameter so that we do
+	 * not fail it silently
+	 */
+	if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
+		pr_warn("Ignoring unsupported numa_zonelist_order value:  %s\n", s);
 		return -EINVAL;
 	}
 	return 0;
@@ -4911,19 +4877,15 @@ static int __parse_numa_zonelist_order(char *s)
 
 static __init int setup_numa_zonelist_order(char *s)
 {
-	int ret;
-
 	if (!s)
 		return 0;
 
-	ret = __parse_numa_zonelist_order(s);
-	if (ret == 0)
-		strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
-
-	return ret;
+	return __parse_numa_zonelist_order(s);
 }
 early_param("numa_zonelist_order", setup_numa_zonelist_order);
 
+char numa_zonelist_order[] = "Node";
+
 /*
  * sysctl handler for numa_zonelist_order
  */
@@ -4931,42 +4893,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *length,
 		loff_t *ppos)
 {
-	char saved_string[NUMA_ZONELIST_ORDER_LEN];
+	char *str;
 	int ret;
-	static DEFINE_MUTEX(zl_order_mutex);
 
-	mutex_lock(&zl_order_mutex);
-	if (write) {
-		if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
-			ret = -EINVAL;
-			goto out;
-		}
-		strcpy(saved_string, (char *)table->data);
-	}
-	ret = proc_dostring(table, write, buffer, length, ppos);
-	if (ret)
-		goto out;
-	if (write) {
-		int oldval = user_zonelist_order;
+	if (!write)
+		return proc_dostring(table, write, buffer, length, ppos);
+	str = memdup_user_nul(buffer, 16);
+	if (IS_ERR(str))
+		return PTR_ERR(str);
 
-		ret = __parse_numa_zonelist_order((char *)table->data);
-		if (ret) {
-			/*
-			 * bogus value.  restore saved string
-			 */
-			strncpy((char *)table->data, saved_string,
-				NUMA_ZONELIST_ORDER_LEN);
-			user_zonelist_order = oldval;
-		} else if (oldval != user_zonelist_order) {
-			mem_hotplug_begin();
-			mutex_lock(&zonelists_mutex);
-			build_all_zonelists(NULL, NULL);
-			mutex_unlock(&zonelists_mutex);
-			mem_hotplug_done();
-		}
-	}
-out:
-	mutex_unlock(&zl_order_mutex);
+	ret = __parse_numa_zonelist_order(str);
+	kfree(str);
 	return ret;
 }
 
@@ -5075,70 +5012,12 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
  */
 static int node_order[MAX_NUMNODES];
 
-static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
-{
-	int pos, j, node;
-	int zone_type;		/* needs to be signed */
-	struct zone *z;
-	struct zonelist *zonelist;
-
-	zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
-	pos = 0;
-	for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
-		for (j = 0; j < nr_nodes; j++) {
-			node = node_order[j];
-			z = &NODE_DATA(node)->node_zones[zone_type];
-			if (managed_zone(z)) {
-				zoneref_set_zone(z,
-					&zonelist->_zonerefs[pos++]);
-				check_highest_zone(zone_type);
-			}
-		}
-	}
-	zonelist->_zonerefs[pos].zone = NULL;
-	zonelist->_zonerefs[pos].zone_idx = 0;
-}
-
-#if defined(CONFIG_64BIT)
-/*
- * Devices that require DMA32/DMA are relatively rare and do not justify a
- * penalty to every machine in case the specialised case applies. Default
- * to Node-ordering on 64-bit NUMA machines
- */
-static int default_zonelist_order(void)
-{
-	return ZONELIST_ORDER_NODE;
-}
-#else
-/*
- * On 32-bit, the Normal zone needs to be preserved for allocations accessible
- * by the kernel. If processes running on node 0 deplete the low memory zone
- * then reclaim will occur more frequency increasing stalls and potentially
- * be easier to OOM if a large percentage of the zone is under writeback or
- * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
- * Hence, default to zone ordering on 32-bit.
- */
-static int default_zonelist_order(void)
-{
-	return ZONELIST_ORDER_ZONE;
-}
-#endif /* CONFIG_64BIT */
-
-static void set_zonelist_order(void)
-{
-	if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
-		current_zonelist_order = default_zonelist_order();
-	else
-		current_zonelist_order = user_zonelist_order;
-}
-
 static void build_zonelists(pg_data_t *pgdat)
 {
 	int i, node, load;
 	nodemask_t used_mask;
 	int local_node, prev_node;
 	struct zonelist *zonelist;
-	unsigned int order = current_zonelist_order;
 
 	/* initialize zonelists */
 	for (i = 0; i < MAX_ZONELISTS; i++) {
@@ -5168,15 +5047,7 @@ static void build_zonelists(pg_data_t *pgdat)
 
 		prev_node = node;
 		load--;
-		if (order == ZONELIST_ORDER_NODE)
-			build_zonelists_in_node_order(pgdat, node);
-		else
-			node_order[i++] = node;	/* remember order */
-	}
-
-	if (order == ZONELIST_ORDER_ZONE) {
-		/* calculate node order -- i.e., DMA last! */
-		build_zonelists_in_zone_order(pgdat, i);
+		build_zonelists_in_node_order(pgdat, node);
 	}
 
 	build_thisnode_zonelists(pgdat);
@@ -5204,11 +5075,6 @@ static void setup_min_unmapped_ratio(void);
 static void setup_min_slab_ratio(void);
 #else	/* CONFIG_NUMA */
 
-static void set_zonelist_order(void)
-{
-	current_zonelist_order = ZONELIST_ORDER_ZONE;
-}
-
 static void build_zonelists(pg_data_t *pgdat)
 {
 	int node, local_node;
@@ -5348,8 +5214,6 @@ build_all_zonelists_init(void)
  */
 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
 {
-	set_zonelist_order();
-
 	if (system_state == SYSTEM_BOOTING) {
 		build_all_zonelists_init();
 	} else {
@@ -5375,9 +5239,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
 	else
 		page_group_by_mobility_disabled = 0;
 
-	pr_info("Built %i zonelists in %s order, mobility grouping %s.  Total pages: %ld\n",
+	pr_info("Built %i zonelists, mobility grouping %s.  Total pages: %ld\n",
 		nr_online_nodes,
-		zonelist_order_name[current_zonelist_order],
 		page_group_by_mobility_disabled ? "off" : "on",
 		vm_total_pages);
 #ifdef CONFIG_NUMA

From afb6ebb3faeb382f7c8b4478f2a84cee37bb8610 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 6 Sep 2017 16:20:17 -0700
Subject: [PATCH 031/119] mm, page_alloc: remove boot pageset initialization
 from memory hotplug

boot_pageset is a boot time hack which gets superseded by normal
pagesets later in the boot process.  It makes zero sense to reinitialize
it again and again during memory hotplug.

Link: http://lkml.kernel.org/r/20170721143915.14161-3-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 50 ++++++++++++++++++++++++++-----------------------
 1 file changed, 27 insertions(+), 23 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6b23df1be909..94e64784a8be 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5141,7 +5141,7 @@ DEFINE_MUTEX(zonelists_mutex);
 static int __build_all_zonelists(void *data)
 {
 	int nid;
-	int cpu;
+	int __maybe_unused cpu;
 	pg_data_t *self = data;
 
 #ifdef CONFIG_NUMA
@@ -5162,6 +5162,31 @@ static int __build_all_zonelists(void *data)
 		}
 	}
 
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+	for_each_possible_cpu(cpu) {
+		/*
+		 * We now know the "local memory node" for each node--
+		 * i.e., the node of the first zone in the generic zonelist.
+		 * Set up numa_mem percpu variable for on-line cpus.  During
+		 * boot, only the boot cpu should be on-line;  we'll init the
+		 * secondary cpus' numa_mem as they come on-line.  During
+		 * node/memory hotplug, we'll fixup all on-line cpus.
+		 */
+		if (cpu_online(cpu))
+			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
+	}
+#endif
+
+	return 0;
+}
+
+static noinline void __init
+build_all_zonelists_init(void)
+{
+	int cpu;
+
+	__build_all_zonelists(NULL);
+
 	/*
 	 * Initialize the boot_pagesets that are going to be used
 	 * for bootstrapping processors. The real pagesets for
@@ -5175,30 +5200,9 @@ static int __build_all_zonelists(void *data)
 	 * needs the percpu allocator in order to allocate its pagesets
 	 * (a chicken-egg dilemma).
 	 */
-	for_each_possible_cpu(cpu) {
+	for_each_possible_cpu(cpu)
 		setup_pageset(&per_cpu(boot_pageset, cpu), 0);
 
-#ifdef CONFIG_HAVE_MEMORYLESS_NODES
-		/*
-		 * We now know the "local memory node" for each node--
-		 * i.e., the node of the first zone in the generic zonelist.
-		 * Set up numa_mem percpu variable for on-line cpus.  During
-		 * boot, only the boot cpu should be on-line;  we'll init the
-		 * secondary cpus' numa_mem as they come on-line.  During
-		 * node/memory hotplug, we'll fixup all on-line cpus.
-		 */
-		if (cpu_online(cpu))
-			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
-#endif
-	}
-
-	return 0;
-}
-
-static noinline void __init
-build_all_zonelists_init(void)
-{
-	__build_all_zonelists(NULL);
 	mminit_verify_zonelist();
 	cpuset_init_current_mems_allowed();
 }

From d9c9a0b9729c35c96ef7bee4ded56d9441bebd58 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 6 Sep 2017 16:20:20 -0700
Subject: [PATCH 032/119] mm, page_alloc: do not set_cpu_numa_mem on empty
 nodes initialization

__build_all_zonelists reinitializes each online cpu's local node for
CONFIG_HAVE_MEMORYLESS_NODES.  This makes sense because previously
memoryless nodes could gain some memory during memory hotplug and so
the local node should be changed for CPUs close to such a node.  It
makes less sense to do that unconditionally for a newly created NUMA
node which is still offline and without any memory.

Let's also simplify the cpu loop and use for_each_online_cpu instead of
an explicit cpu_online check for all possible cpus.

Link: http://lkml.kernel.org/r/20170721143915.14161-4-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94e64784a8be..94fb4370e000 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5160,10 +5160,8 @@ static int __build_all_zonelists(void *data)
 
 			build_zonelists(pgdat);
 		}
-	}
 
 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
-	for_each_possible_cpu(cpu) {
 		/*
 		 * We now know the "local memory node" for each node--
 		 * i.e., the node of the first zone in the generic zonelist.
@@ -5172,10 +5170,10 @@ static int __build_all_zonelists(void *data)
 		 * secondary cpus' numa_mem as they come on-line.  During
 		 * node/memory hotplug, we'll fixup all on-line cpus.
 		 */
-		if (cpu_online(cpu))
+		for_each_online_cpu(cpu)
 			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
-	}
 #endif
+	}
 
 	return 0;
 }

From 72675e131eb418c78980c1e683c0c25a25b61221 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 6 Sep 2017 16:20:24 -0700
Subject: [PATCH 033/119] mm, memory_hotplug: drop zone from
 build_all_zonelists

build_all_zonelists gets a zone parameter to initialize the zone's
pagesets.  There is only a single user which passes a non-NULL zone
parameter, and that one doesn't really need the rest of
build_all_zonelists (see commit 6dcd73d7011b ("memory-hotplug: allocate
zone's pcp before onlining pages")).

Therefore remove setup_zone_pageset from build_all_zonelists and call
it from its only user directly.  This also removes a pointless zonelist
rebuild, which is always good.

Link: http://lkml.kernel.org/r/20170721143915.14161-5-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |  2 +-
 init/main.c            |  2 +-
 mm/internal.h          |  1 +
 mm/memory_hotplug.c    | 10 +++++-----
 mm/page_alloc.c        | 13 +++----------
 5 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index bfdc37b77d88..551f68bec2fa 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -771,7 +771,7 @@ static inline bool is_dev_zone(const struct zone *zone)
 #include <linux/memory_hotplug.h>
 
 extern struct mutex zonelists_mutex;
-void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
+void build_all_zonelists(pg_data_t *pgdat);
 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 			 int classzone_idx, unsigned int alloc_flags,
diff --git a/init/main.c b/init/main.c
index 8828fc148670..a21a1a8708a8 100644
--- a/init/main.c
+++ b/init/main.c
@@ -542,7 +542,7 @@ asmlinkage __visible void __init start_kernel(void)
 	boot_cpu_state_init();
 	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
 
-	build_all_zonelists(NULL, NULL);
+	build_all_zonelists(NULL);
 	page_alloc_init();
 
 	pr_notice("Kernel command line: %s\n", boot_command_line);
diff --git a/mm/internal.h b/mm/internal.h
index 4ef49fc55e58..781c0d54d75a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -525,4 +525,5 @@ static inline bool is_migrate_highatomic_page(struct page *page)
 	return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC;
 }
 
+void setup_zone_pageset(struct zone *zone);
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3e69984346da..c4df7d3c64d1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -929,7 +929,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	mutex_lock(&zonelists_mutex);
 	if (!populated_zone(zone)) {
 		need_zonelists_rebuild = 1;
-		build_all_zonelists(NULL, zone);
+		setup_zone_pageset(zone);
 	}
 
 	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
@@ -950,7 +950,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	if (onlined_pages) {
 		node_states_set_node(nid, &arg);
 		if (need_zonelists_rebuild)
-			build_all_zonelists(NULL, NULL);
+			build_all_zonelists(NULL);
 		else
 			zone_pcp_update(zone);
 	}
@@ -1028,7 +1028,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 	 * to access not-initialized zonelist, build here.
 	 */
 	mutex_lock(&zonelists_mutex);
-	build_all_zonelists(pgdat, NULL);
+	build_all_zonelists(pgdat);
 	mutex_unlock(&zonelists_mutex);
 
 	/*
@@ -1084,7 +1084,7 @@ int try_online_node(int nid)
 
 	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
 		mutex_lock(&zonelists_mutex);
-		build_all_zonelists(NULL, NULL);
+		build_all_zonelists(NULL);
 		mutex_unlock(&zonelists_mutex);
 	}
 
@@ -1704,7 +1704,7 @@ repeat:
 	if (!populated_zone(zone)) {
 		zone_pcp_reset(zone);
 		mutex_lock(&zonelists_mutex);
-		build_all_zonelists(NULL, NULL);
+		build_all_zonelists(NULL);
 		mutex_unlock(&zonelists_mutex);
 	} else
 		zone_pcp_update(zone);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94fb4370e000..2523d5b3b22f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5129,7 +5129,6 @@ static void build_zonelists(pg_data_t *pgdat)
 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
 static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
-static void setup_zone_pageset(struct zone *zone);
 
 /*
  * Global mutex to protect against size modification of zonelists
@@ -5209,20 +5208,14 @@ build_all_zonelists_init(void)
  * Called with zonelists_mutex held always
  * unless system_state == SYSTEM_BOOTING.
  *
- * __ref due to (1) call of __meminit annotated setup_zone_pageset
- * [we're only called with non-NULL zone through __meminit paths] and
- * (2) call of __init annotated helper build_all_zonelists_init
+ * __ref due to call of __init annotated helper build_all_zonelists_init
  * [protected by SYSTEM_BOOTING].
  */
-void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
+void __ref build_all_zonelists(pg_data_t *pgdat)
 {
 	if (system_state == SYSTEM_BOOTING) {
 		build_all_zonelists_init();
 	} else {
-#ifdef CONFIG_MEMORY_HOTPLUG
-		if (zone)
-			setup_zone_pageset(zone);
-#endif
 		/* we have to stop all cpus to guarantee there is no user
 		   of zonelist */
 		stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL);
@@ -5496,7 +5489,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu)
 	pageset_set_high_and_batch(zone, pcp);
 }
 
-static void __meminit setup_zone_pageset(struct zone *zone)
+void __meminit setup_zone_pageset(struct zone *zone)
 {
 	int cpu;
 	zone->pageset = alloc_percpu(struct per_cpu_pageset);

From 34ad1296571f7a004a761e3afc18e79428a726a8 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 6 Sep 2017 16:20:27 -0700
Subject: [PATCH 034/119] mm, memory_hotplug: remove explicit
 build_all_zonelists from try_online_node

try_online_node calls hotadd_new_pgdat, which already calls
build_all_zonelists, so the additional call is redundant.  Even though
hotadd_new_pgdat will only initialize the zonelists of the new node,
this is the right thing to do because such a node doesn't have any
memory, so other zonelists would ignore all the zones from this node
anyway.

Link: http://lkml.kernel.org/r/20170721143915.14161-6-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c4df7d3c64d1..2f0c7ebc7624 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1081,13 +1081,6 @@ int try_online_node(int nid)
 	node_set_online(nid);
 	ret = register_one_node(nid);
 	BUG_ON(ret);
-
-	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
-		mutex_lock(&zonelists_mutex);
-		build_all_zonelists(NULL);
-		mutex_unlock(&zonelists_mutex);
-	}
-
 out:
 	mem_hotplug_done();
 	return ret;

From 9d3be21bf9c0b849a13e6b51e9c2ce7ccdf50851 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 6 Sep 2017 16:20:30 -0700
Subject: [PATCH 035/119] mm, page_alloc: simplify zonelist initialization

build_zonelists gradually builds zonelists from the nearest to the most
distant node.  As we do not know how many populated zones we will have
in each node, we rely on the _zoneref to terminate the initialized part
of the zonelist with a NULL zone.  While this is functionally correct
it is quite suboptimal because we cannot allow updaters to race with
zonelist users: they could see an empty zonelist and fail the
allocation or, in the worst case, hit the OOM killer.

We can do much better, though.  We can store the node ordering into an
already existing node_order array and then give this array to
build_zonelists_in_node_order and do the whole initialization at once.
Zonelist consumers might still see a half-initialized state but that
is much more tolerable because the list will not be empty; in the worst
case they would either see some zone twice or skip over some zone(s),
which shouldn't lead to immediate failures.
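
As an illustrative sketch (not part of this patch), a fallback-list
reader only walks the NULL-terminated zoneref array, so the worst a
concurrent in-place rebuild can do is make it see a zone twice or skip
one, assuming the terminating entry is written last:

	struct zoneref *z;

	/* reader side: walk until the NULL-zone terminator */
	for (z = zonelist->_zonerefs; z->zone; z++) {
		/* try to satisfy the allocation from z->zone; a
		 * duplicated entry just retries the same zone and a
		 * skipped zone is simply not considered this time */
	}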

While at it, let's simplify build_zonelists_node, which is rather
confusing now.  It gets an index into the zoneref array and returns the
updated index for the next iteration.  Let's rename the function to
build_zonerefs_node to better reflect its purpose and give it the
zoneref array to update.  The function doesn't take the index anymore.
It just returns the number of added zones so that the caller can
advance the zoneref array start for the next update.

This patch alone doesn't introduce any functional change yet; it is
merely preparatory work for later changes.

Link: http://lkml.kernel.org/r/20170721143915.14161-7-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 81 +++++++++++++++++++++++++------------------------
 1 file changed, 41 insertions(+), 40 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2523d5b3b22f..36a2f18c5e0a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4839,18 +4839,17 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
  *
  * Add all populated zones of a node to the zonelist.
  */
-static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
-				int nr_zones)
+static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
 {
 	struct zone *zone;
 	enum zone_type zone_type = MAX_NR_ZONES;
+	int nr_zones = 0;
 
 	do {
 		zone_type--;
 		zone = pgdat->node_zones + zone_type;
 		if (managed_zone(zone)) {
-			zoneref_set_zone(zone,
-				&zonelist->_zonerefs[nr_zones++]);
+			zoneref_set_zone(zone, &zonerefs[nr_zones++]);
 			check_highest_zone(zone_type);
 		}
 	} while (zone_type);
@@ -4977,17 +4976,24 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
  * This results in maximum locality--normal zone overflows into local
  * DMA zone, if any--but risks exhausting DMA zone.
  */
-static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
+		unsigned nr_nodes)
 {
-	int j;
-	struct zonelist *zonelist;
+	struct zoneref *zonerefs;
+	int i;
 
-	zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
-	for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
-		;
-	j = build_zonelists_node(NODE_DATA(node), zonelist, j);
-	zonelist->_zonerefs[j].zone = NULL;
-	zonelist->_zonerefs[j].zone_idx = 0;
+	zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+
+	for (i = 0; i < nr_nodes; i++) {
+		int nr_zones;
+
+		pg_data_t *node = NODE_DATA(node_order[i]);
+
+		nr_zones = build_zonerefs_node(node, zonerefs);
+		zonerefs += nr_zones;
+	}
+	zonerefs->zone = NULL;
+	zonerefs->zone_idx = 0;
 }
 
 /*
@@ -4995,13 +5001,14 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
  */
 static void build_thisnode_zonelists(pg_data_t *pgdat)
 {
-	int j;
-	struct zonelist *zonelist;
+	struct zoneref *zonerefs;
+	int nr_zones;
 
-	zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
-	j = build_zonelists_node(pgdat, zonelist, 0);
-	zonelist->_zonerefs[j].zone = NULL;
-	zonelist->_zonerefs[j].zone_idx = 0;
+	zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
+	nr_zones = build_zonerefs_node(pgdat, zonerefs);
+	zonerefs += nr_zones;
+	zonerefs->zone = NULL;
+	zonerefs->zone_idx = 0;
 }
 
 /*
@@ -5010,21 +5017,13 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
  * exhausted, but results in overflowing to remote node while memory
  * may still exist in local DMA zone.
  */
-static int node_order[MAX_NUMNODES];
 
 static void build_zonelists(pg_data_t *pgdat)
 {
-	int i, node, load;
+	static int node_order[MAX_NUMNODES];
+	int node, load, nr_nodes = 0;
 	nodemask_t used_mask;
 	int local_node, prev_node;
-	struct zonelist *zonelist;
-
-	/* initialize zonelists */
-	for (i = 0; i < MAX_ZONELISTS; i++) {
-		zonelist = pgdat->node_zonelists + i;
-		zonelist->_zonerefs[0].zone = NULL;
-		zonelist->_zonerefs[0].zone_idx = 0;
-	}
 
 	/* NUMA-aware ordering of nodes */
 	local_node = pgdat->node_id;
@@ -5033,8 +5032,6 @@ static void build_zonelists(pg_data_t *pgdat)
 	nodes_clear(used_mask);
 
 	memset(node_order, 0, sizeof(node_order));
-	i = 0;
-
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
 		/*
 		 * We don't want to pressure a particular node.
@@ -5045,11 +5042,12 @@ static void build_zonelists(pg_data_t *pgdat)
 		    node_distance(local_node, prev_node))
 			node_load[node] = load;
 
+		node_order[nr_nodes++] = node;
 		prev_node = node;
 		load--;
-		build_zonelists_in_node_order(pgdat, node);
 	}
 
+	build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
 	build_thisnode_zonelists(pgdat);
 }
 
@@ -5078,13 +5076,14 @@ static void setup_min_slab_ratio(void);
 static void build_zonelists(pg_data_t *pgdat)
 {
 	int node, local_node;
-	enum zone_type j;
-	struct zonelist *zonelist;
+	struct zoneref *zonerefs;
+	int nr_zones;
 
 	local_node = pgdat->node_id;
 
-	zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
-	j = build_zonelists_node(pgdat, zonelist, 0);
+	zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+	nr_zones = build_zonerefs_node(pgdat, zonerefs);
+	zonerefs += nr_zones;
 
 	/*
 	 * Now we build the zonelist so that it contains the zones
@@ -5097,16 +5096,18 @@ static void build_zonelists(pg_data_t *pgdat)
 	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
 		if (!node_online(node))
 			continue;
-		j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+		nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+		zonerefs += nr_zones;
 	}
 	for (node = 0; node < local_node; node++) {
 		if (!node_online(node))
 			continue;
-		j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+		nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+		zonerefs += nr_zones;
 	}
 
-	zonelist->_zonerefs[j].zone = NULL;
-	zonelist->_zonerefs[j].zone_idx = 0;
+	zonerefs->zone = NULL;
+	zonerefs->zone_idx = 0;
 }
 
 #endif	/* CONFIG_NUMA */

From 11cd8638c37f6c400cc472cc52b6eccb505aba6e Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 6 Sep 2017 16:20:34 -0700
Subject: [PATCH 036/119] mm, page_alloc: remove stop_machine from
 build_all_zonelists

build_all_zonelists has been (ab)using stop_machine to make sure that
zonelists do not change while somebody is looking at them.  This is
just a gross hack because a) it complicates the context from which we
can call build_all_zonelists (see 3f906ba23689 ("mm/memory-hotplug:
switch locking to a percpu rwsem")), b) it is not really necessary,
especially after "mm, page_alloc: simplify zonelist initialization",
and c) it doesn't really provide the protection it claims (see below).

Updates of the zonelists happen very seldom, basically only when a zone
becomes populated during memory online or when it loses all the memory
during offline.  A racing iteration over zonelists could either miss a
zone or try to work on one zone twice.  Both of these are something we
can live with occasionally because there will always be at least one
zone visible so we are not likely to fail allocation too easily for
example.

Please note that the original stop_machine approach doesn't really
provide better exclusion because the iteration might be interrupted
halfway through (unless the whole iteration runs with preemption
disabled, which is not the case most of the time), so some zones could
still be seen twice or a zone could be missed.

I have run the pathological online/offline of a single memblock in the
movable zone while stressing the same small node with some memory
pressure.

Node 1, zone      DMA
  pages free     0
        min      0
        low      0
        high     0
        spanned  0
        present  0
        managed  0
        protection: (0, 943, 943, 943)
Node 1, zone    DMA32
  pages free     227310
        min      8294
        low      10367
        high     12440
        spanned  262112
        present  262112
        managed  241436
        protection: (0, 0, 0, 0)
Node 1, zone   Normal
  pages free     0
        min      0
        low      0
        high     0
        spanned  0
        present  0
        managed  0
        protection: (0, 0, 0, 1024)
Node 1, zone  Movable
  pages free     32722
        min      85
        low      117
        high     149
        spanned  32768
        present  32768
        managed  32768
        protection: (0, 0, 0, 0)

root@test1:/sys/devices/system/node/node1# while true
do
	echo offline > memory34/state
	echo online_movable > memory34/state
done

root@test1:/mnt/data/test/linux-3.7-rc5# numactl --preferred=1 make -j4

and it survived without any unexpected behavior.  While this is not
really great test coverage, it should exercise the allocation path
quite a lot.

Link: http://lkml.kernel.org/r/20170721143915.14161-8-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 36a2f18c5e0a..e3086d0fd945 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5137,8 +5137,7 @@ static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
  */
 DEFINE_MUTEX(zonelists_mutex);
 
-/* return values int ....just for stop_machine() */
-static int __build_all_zonelists(void *data)
+static void __build_all_zonelists(void *data)
 {
 	int nid;
 	int __maybe_unused cpu;
@@ -5174,8 +5173,6 @@ static int __build_all_zonelists(void *data)
 			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
 #endif
 	}
-
-	return 0;
 }
 
 static noinline void __init
@@ -5217,9 +5214,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
 	if (system_state == SYSTEM_BOOTING) {
 		build_all_zonelists_init();
 	} else {
-		/* we have to stop all cpus to guarantee there is no user
-		   of zonelist */
-		stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL);
+		__build_all_zonelists(pgdat);
 		/* cpuset refresh routine should be here */
 	}
 	vm_total_pages = nr_free_pagecache_pages();

From b93e0f329e24f3615aa551fd9b99a75fb7c9195f Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 6 Sep 2017 16:20:37 -0700
Subject: [PATCH 037/119] mm, memory_hotplug: get rid of zonelists_mutex

zonelists_mutex was introduced by commit 4eaf3f64397c ("mem-hotplug: fix
potential race while building zonelist for new populated zone") to
protect zonelist building from races.  This is no longer needed though
because both memory online and offline are fully serialized.  New users
have grown since then.

Notably setup_per_zone_wmarks wants to prevent races between memory
hotplug, khugepaged setup and manual min_free_kbytes updates via sysctl
(see cfd3da1e49bb ("mm: Serialize access to min_free_kbytes")).  Let's
add a private lock for that purpose.  This will not prevent us from
seeing a memory hotplug operation halfway through, but that shouldn't
be a big deal because memory hotplug will update watermarks explicitly
so we will eventually get a full picture.  The lock just makes sure we
won't race when updating watermarks, which could lead to weird results.

Also __build_all_zonelists manipulates global data so add a private lock
for it as well.  This doesn't seem to be necessary today but it is more
robust to have a lock there.

While we are at it make sure we document that memory online/offline
depends on a full serialization either via mem_hotplug_begin() or
device_lock.

Link: http://lkml.kernel.org/r/20170721143915.14161-9-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Haicheng Li <haicheng.li@linux.intel.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |  1 -
 mm/memory_hotplug.c    | 12 ++----------
 mm/page_alloc.c        | 18 +++++++++---------
 3 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 551f68bec2fa..e7e92c8f4883 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -770,7 +770,6 @@ static inline bool is_dev_zone(const struct zone *zone)
 
 #include <linux/memory_hotplug.h>
 
-extern struct mutex zonelists_mutex;
 void build_all_zonelists(pg_data_t *pgdat);
 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2f0c7ebc7624..73bf17df6899 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -897,7 +897,7 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid,
 	return zone;
 }
 
-/* Must be protected by mem_hotplug_begin() */
+/* Must be protected by mem_hotplug_begin() or a device_lock */
 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
 {
 	unsigned long flags;
@@ -926,7 +926,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	 * This means the page allocator ignores this zone.
 	 * So, zonelist must be updated after online.
 	 */
-	mutex_lock(&zonelists_mutex);
 	if (!populated_zone(zone)) {
 		need_zonelists_rebuild = 1;
 		setup_zone_pageset(zone);
@@ -937,7 +936,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	if (ret) {
 		if (need_zonelists_rebuild)
 			zone_pcp_reset(zone);
-		mutex_unlock(&zonelists_mutex);
 		goto failed_addition;
 	}
 
@@ -955,8 +953,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 			zone_pcp_update(zone);
 	}
 
-	mutex_unlock(&zonelists_mutex);
-
 	init_per_zone_wmark_min();
 
 	if (onlined_pages) {
@@ -1027,9 +1023,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 	 * The node we allocated has no zone fallback lists. For avoiding
 	 * to access not-initialized zonelist, build here.
 	 */
-	mutex_lock(&zonelists_mutex);
 	build_all_zonelists(pgdat);
-	mutex_unlock(&zonelists_mutex);
 
 	/*
 	 * zone->managed_pages is set to an approximate value in
@@ -1696,9 +1690,7 @@ repeat:
 
 	if (!populated_zone(zone)) {
 		zone_pcp_reset(zone);
-		mutex_lock(&zonelists_mutex);
 		build_all_zonelists(NULL);
-		mutex_unlock(&zonelists_mutex);
 	} else
 		zone_pcp_update(zone);
 
@@ -1724,7 +1716,7 @@ failed_removal:
 	return ret;
 }
 
-/* Must be protected by mem_hotplug_begin() */
+/* Must be protected by mem_hotplug_begin() or a device_lock */
 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
 	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e3086d0fd945..0bea94af0423 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5131,17 +5131,14 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
 static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
 
-/*
- * Global mutex to protect against size modification of zonelists
- * as well as to serialize pageset setup for the new populated zone.
- */
-DEFINE_MUTEX(zonelists_mutex);
-
 static void __build_all_zonelists(void *data)
 {
 	int nid;
 	int __maybe_unused cpu;
 	pg_data_t *self = data;
+	static DEFINE_SPINLOCK(lock);
+
+	spin_lock(&lock);
 
 #ifdef CONFIG_NUMA
 	memset(node_load, 0, sizeof(node_load));
@@ -5173,6 +5170,8 @@ static void __build_all_zonelists(void *data)
 			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
 #endif
 	}
+
+	spin_unlock(&lock);
 }
 
 static noinline void __init
@@ -5203,7 +5202,6 @@ build_all_zonelists_init(void)
 }
 
 /*
- * Called with zonelists_mutex held always
  * unless system_state == SYSTEM_BOOTING.
  *
  * __ref due to call of __init annotated helper build_all_zonelists_init
@@ -6939,9 +6937,11 @@ static void __setup_per_zone_wmarks(void)
  */
 void setup_per_zone_wmarks(void)
 {
-	mutex_lock(&zonelists_mutex);
+	static DEFINE_SPINLOCK(lock);
+
+	spin_lock(&lock);
 	__setup_per_zone_wmarks();
-	mutex_unlock(&zonelists_mutex);
+	spin_unlock(&lock);
 }
 
 /*

From b95046b0472f7a805fa28fbcfc7205a76ff7a7d0 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 6 Sep 2017 16:20:41 -0700
Subject: [PATCH 038/119] mm, sparse, page_ext: drop ugly N_HIGH_MEMORY
 branches for allocations

Commit f52407ce2dea ("memory hotplug: alloc page from other node in
memory online") has introduced N_HIGH_MEMORY checks to only use NUMA
aware allocations when there is some memory present because the
respective node might not have any memory yet at the time and so it
could fail or even OOM.

Things have changed since then though.  Zonelists are now always
initialized before we do any allocations even for hotplug (see
959ecc48fc75 ("mm/memory_hotplug.c: fix building of node hotplug
zonelist")).

Therefore these checks are not really needed.  In fact, the caller of
the allocator should never care whether the node is populated because
that might change at any time.

Link: http://lkml.kernel.org/r/20170721143915.14161-10-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Toshi Kani <toshi.kani@hpe.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_ext.c       |  5 +----
 mm/sparse-vmemmap.c | 11 +++--------
 mm/sparse.c         | 10 +++-------
 3 files changed, 7 insertions(+), 19 deletions(-)

diff --git a/mm/page_ext.c b/mm/page_ext.c
index 88ccc044b09a..714ce79256c5 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -222,10 +222,7 @@ static void *__meminit alloc_page_ext(size_t size, int nid)
 		return addr;
 	}
 
-	if (node_state(nid, N_HIGH_MEMORY))
-		addr = vzalloc_node(size, nid);
-	else
-		addr = vzalloc(size);
+	addr = vzalloc_node(size, nid);
 
 	return addr;
 }
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index c50b1a14d55e..d1a39b8051e0 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -54,14 +54,9 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
 	if (slab_is_available()) {
 		struct page *page;
 
-		if (node_state(node, N_HIGH_MEMORY))
-			page = alloc_pages_node(
-				node, GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
-				get_order(size));
-		else
-			page = alloc_pages(
-				GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
-				get_order(size));
+		page = alloc_pages_node(node,
+			GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
+			get_order(size));
 		if (page)
 			return page_address(page);
 		return NULL;
diff --git a/mm/sparse.c b/mm/sparse.c
index 7b4be3fd5cac..a9783acf2bb9 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -65,14 +65,10 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid)
 	unsigned long array_size = SECTIONS_PER_ROOT *
 				   sizeof(struct mem_section);
 
-	if (slab_is_available()) {
-		if (node_state(nid, N_HIGH_MEMORY))
-			section = kzalloc_node(array_size, GFP_KERNEL, nid);
-		else
-			section = kzalloc(array_size, GFP_KERNEL);
-	} else {
+	if (slab_is_available())
+		section = kzalloc_node(array_size, GFP_KERNEL, nid);
+	else
 		section = memblock_virt_alloc_node(array_size, nid);
-	}
 
 	return section;
 }

From dab4ead1a9d88361c85a8209c7e23a8fd124e8d7 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Wed, 6 Sep 2017 16:20:44 -0700
Subject: [PATCH 039/119] mm, page_owner: make init_pages_in_zone() faster

In init_pages_in_zone() we currently use the generic set_page_owner()
function to initialize page_owner info for early allocated pages.  This
means we needlessly do lookup_page_ext() twice for each page, and more
importantly save_stack(), which has to unwind the stack and find the
corresponding stack depot handle.  Because the stack is always the same
for the initialization, unwind it once in init_pages_in_zone() and reuse
the handle.  Also avoid the repeated lookup_page_ext().

This can significantly reduce boot times with page_owner=on on large
machines, especially for kernels built without frame pointer, where the
stack unwinding is noticeably slower.
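
A rough before/after sketch of the per-page loop in init_pages_in_zone()
(simplified from the diff below):

	/* before: every early page unwound and depot-saved the same stack */
	set_page_owner(page, 0, 0);	/* lookup_page_ext() + save_stack() */

	/* after: a static "early" handle is created once at init time and
	 * only stored per page */
	__set_page_owner_handle(page_ext, early_handle, 0, 0);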

[vbabka@suse.cz: don't duplicate code of __set_page_owner(), per Michal Hocko]
[akpm@linux-foundation.org: coding-style fixes]
[vbabka@suse.cz: create statically allocated fake stack trace for early allocated pages, per Michal]
  Link: http://lkml.kernel.org/r/45813564-2342-fc8d-d31a-f4b68a724325@suse.cz
Link: http://lkml.kernel.org/r/20170720134029.25268-2-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Laura Abbott <labbott@redhat.com>
Cc: Vinayak Menon <vinmenon@codeaurora.org>
Cc: zhong jiang <zhongjiang@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_owner.c | 52 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/mm/page_owner.c b/mm/page_owner.c
index 0fd9dcf2c5dc..33634f74d0b2 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -30,6 +30,7 @@ DEFINE_STATIC_KEY_FALSE(page_owner_inited);
 
 static depot_stack_handle_t dummy_handle;
 static depot_stack_handle_t failure_handle;
+static depot_stack_handle_t early_handle;
 
 static void init_early_allocated_pages(void);
 
@@ -53,7 +54,7 @@ static bool need_page_owner(void)
 	return true;
 }
 
-static noinline void register_dummy_stack(void)
+static __always_inline depot_stack_handle_t create_dummy_stack(void)
 {
 	unsigned long entries[4];
 	struct stack_trace dummy;
@@ -64,21 +65,22 @@ static noinline void register_dummy_stack(void)
 	dummy.skip = 0;
 
 	save_stack_trace(&dummy);
-	dummy_handle = depot_save_stack(&dummy, GFP_KERNEL);
+	return depot_save_stack(&dummy, GFP_KERNEL);
+}
+
+static noinline void register_dummy_stack(void)
+{
+	dummy_handle = create_dummy_stack();
 }
 
 static noinline void register_failure_stack(void)
 {
-	unsigned long entries[4];
-	struct stack_trace failure;
+	failure_handle = create_dummy_stack();
+}
 
-	failure.nr_entries = 0;
-	failure.max_entries = ARRAY_SIZE(entries);
-	failure.entries = &entries[0];
-	failure.skip = 0;
-
-	save_stack_trace(&failure);
-	failure_handle = depot_save_stack(&failure, GFP_KERNEL);
+static noinline void register_early_stack(void)
+{
+	early_handle = create_dummy_stack();
 }
 
 static void init_page_owner(void)
@@ -88,6 +90,7 @@ static void init_page_owner(void)
 
 	register_dummy_stack();
 	register_failure_stack();
+	register_early_stack();
 	static_branch_enable(&page_owner_inited);
 	init_early_allocated_pages();
 }
@@ -165,17 +168,13 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags)
 	return handle;
 }
 
-noinline void __set_page_owner(struct page *page, unsigned int order,
-					gfp_t gfp_mask)
+static inline void __set_page_owner_handle(struct page_ext *page_ext,
+	depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask)
 {
-	struct page_ext *page_ext = lookup_page_ext(page);
 	struct page_owner *page_owner;
 
-	if (unlikely(!page_ext))
-		return;
-
 	page_owner = get_page_owner(page_ext);
-	page_owner->handle = save_stack(gfp_mask);
+	page_owner->handle = handle;
 	page_owner->order = order;
 	page_owner->gfp_mask = gfp_mask;
 	page_owner->last_migrate_reason = -1;
@@ -183,6 +182,19 @@ noinline void __set_page_owner(struct page *page, unsigned int order,
 	__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
 }
 
+noinline void __set_page_owner(struct page *page, unsigned int order,
+					gfp_t gfp_mask)
+{
+	struct page_ext *page_ext = lookup_page_ext(page);
+	depot_stack_handle_t handle;
+
+	if (unlikely(!page_ext))
+		return;
+
+	handle = save_stack(gfp_mask);
+	__set_page_owner_handle(page_ext, handle, order, gfp_mask);
+}
+
 void __set_page_owner_migrate_reason(struct page *page, int reason)
 {
 	struct page_ext *page_ext = lookup_page_ext(page);
@@ -565,12 +577,12 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
 			if (unlikely(!page_ext))
 				continue;
 
-			/* Maybe overraping zone */
+			/* Maybe overlapping zone */
 			if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
 				continue;
 
 			/* Found early allocated page */
-			set_page_owner(page, 0, 0);
+			__set_page_owner_handle(page_ext, early_handle, 0, 0);
 			count++;
 		}
 	}

From 0fc542b7dd90a7080fc1a8c846d13a4ddad509ba Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Wed, 6 Sep 2017 16:20:48 -0700
Subject: [PATCH 040/119] mm, page_ext: periodically reschedule during
 page_ext_init()

page_ext_init() can take long on large machines, so add a cond_resched()
point after each section is processed.  This will allow moving the init
to a later point at boot without triggering lockup reports.

Link: http://lkml.kernel.org/r/20170720134029.25268-3-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Laura Abbott <labbott@redhat.com>
Cc: Vinayak Menon <vinmenon@codeaurora.org>
Cc: zhong jiang <zhongjiang@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_ext.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/page_ext.c b/mm/page_ext.c
index 714ce79256c5..32f18911deda 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -406,6 +406,7 @@ void __init page_ext_init(void)
 				continue;
 			if (init_section_page_ext(pfn, nid))
 				goto oom;
+			cond_resched();
 		}
 	}
 	hotplug_memory_notifier(page_ext_callback, 0);

From 10903027948d768d9639b31e9a555802e2dabafc Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Wed, 6 Sep 2017 16:20:51 -0700
Subject: [PATCH 041/119] mm, page_owner: don't grab zone->lock for
 init_pages_in_zone()

init_pages_in_zone() is run under zone->lock, which means a long lock
time and disabled interrupts on large machines.  This is currently not
an issue since it runs early in boot, but a later patch will change
that.

However, like other pfn scanners, we don't actually need zone->lock even
when other cpus are running.  The only potentially dangerous operation
here is reading bogus buddy page owner due to race, and we already know
how to handle that.  The worst that can happen is that we skip some
early allocated pages, which should not affect the debugging power of
page_owner noticeably.

Link: http://lkml.kernel.org/r/20170720134029.25268-4-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Laura Abbott <labbott@redhat.com>
Cc: Vinayak Menon <vinmenon@codeaurora.org>
Cc: zhong jiang <zhongjiang@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_owner.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/mm/page_owner.c b/mm/page_owner.c
index 33634f74d0b2..8e2d7137510c 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -562,11 +562,17 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
 				continue;
 
 			/*
-			 * We are safe to check buddy flag and order, because
-			 * this is init stage and only single thread runs.
+			 * To avoid having to grab zone->lock, be a little
+			 * careful when reading buddy page order. The only
+			 * danger is that we skip too much and potentially miss
+			 * some early allocated pages, which is better than
+			 * heavy lock contention.
 			 */
 			if (PageBuddy(page)) {
-				pfn += (1UL << page_order(page)) - 1;
+				unsigned long order = page_order_unsafe(page);
+
+				if (order > 0 && order < MAX_ORDER)
+					pfn += (1UL << order) - 1;
 				continue;
 			}
 
@@ -585,6 +591,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
 			__set_page_owner_handle(page_ext, early_handle, 0, 0);
 			count++;
 		}
+		cond_resched();
 	}
 
 	pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
@@ -595,15 +602,12 @@ static void init_zones_in_node(pg_data_t *pgdat)
 {
 	struct zone *zone;
 	struct zone *node_zones = pgdat->node_zones;
-	unsigned long flags;
 
 	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
 		if (!populated_zone(zone))
 			continue;
 
-		spin_lock_irqsave(&zone->lock, flags);
 		init_pages_in_zone(pgdat, zone);
-		spin_unlock_irqrestore(&zone->lock, flags);
 	}
 }
 

From dba58d3b8c5045ad89c1c95d33d01451e3964db7 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 6 Sep 2017 16:20:55 -0700
Subject: [PATCH 042/119] mm/mremap: fail map duplication attempts for private
 mappings

mremap will attempt to create a 'duplicate' mapping if old_size == 0 is
specified.  In the case of private mappings, mremap will actually create
a fresh separate private mapping unrelated to the original.  This does
not fit with the design semantics of mremap as the intention is to
create a new mapping based on the original.

Therefore, return EINVAL in the case where an attempt is made to
duplicate a private mapping.  Also, print a warning message (once) if
such an attempt is made.
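
As a hedged userspace illustration (not from this patch; the page size
and error handling are simplified), the now-rejected duplication
attempt looks like this:

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	size_t len = 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* old_size == 0 asks mremap() to 'duplicate' the mapping; for a
	 * private mapping this now fails with EINVAL (and warns once). */
	void *dup = mremap(p, 0, len, MREMAP_MAYMOVE);
	if (dup == MAP_FAILED)
		printf("mremap: %s\n", strerror(errno));
	return 0;
}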

Link: http://lkml.kernel.org/r/cb9d9f6a-7095-582f-15a5-62643d65c736@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mremap.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/mm/mremap.c b/mm/mremap.c
index 3f23715d3c69..7395564daa6c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -384,6 +384,19 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 	if (!vma || vma->vm_start > addr)
 		return ERR_PTR(-EFAULT);
 
+	/*
+	 * !old_len is a special case where an attempt is made to 'duplicate'
+	 * a mapping.  This makes no sense for private mappings as it will
+	 * instead create a fresh/new mapping unrelated to the original.  This
+	 * is contrary to the basic idea of mremap which creates new mappings
+	 * based on the original.  There are no known use cases for this
+	 * behavior.  As a result, fail such attempts.
+	 */
+	if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
+		pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap.  This is not supported.\n", current->comm, current->pid);
+		return ERR_PTR(-EINVAL);
+	}
+
 	if (is_vm_hugetlb_page(vma))
 		return ERR_PTR(-EINVAL);
 

From 09180ca4b3bc7b1ffdbae62a75f716dfc0685861 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Wed, 6 Sep 2017 16:20:58 -0700
Subject: [PATCH 043/119] mm/gup: make __gup_device_* require THP

These functions are the only bits of generic code that use
{pud,pmd}_pfn() without checking for CONFIG_TRANSPARENT_HUGEPAGE.  This
works fine on x86, the only arch with devmap support, since the *_pfn()
functions are always defined there, but this isn't true for every
architecture.

Link: http://lkml.kernel.org/r/20170626063833.11094-1-oohall@gmail.com
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/gup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/gup.c b/mm/gup.c
index 23f01c40c88f..33d651deeae2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1352,7 +1352,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 }
 #endif /* __HAVE_ARCH_PTE_SPECIAL */
 
-#ifdef __HAVE_ARCH_PTE_DEVMAP
+#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 		unsigned long end, struct page **pages, int *nr)
 {

From 9b19df292c666b57c407fed2496827c6aba05be2 Mon Sep 17 00:00:00 2001
From: Punit Agrawal <punit.agrawal@arm.com>
Date: Wed, 6 Sep 2017 16:21:01 -0700
Subject: [PATCH 044/119] mm/hugetlb.c: make huge_pte_offset() consistent and
 document behaviour

When walking the page tables to resolve an address that points to a
!p*d_present() entry, huge_pte_offset() returns inconsistent values
depending on the page table level (PUD or PMD).

It returns NULL in the case of a PUD entry while in the case of a PMD
entry, it returns a pointer to the page table entry.

A similar inconsistency exists when handling swap entries - it returns
NULL for a PUD entry while a pointer to the pte_t is returned for a PMD
entry.

Update huge_pte_offset() to make the behaviour consistent - return a
pointer to the pte_t for hugepage or swap entries.  Only return NULL in
instances where we have a p*d_none() entry and the size parameter
doesn't match the hugepage size at this level of the page table.

Document the behaviour to clarify the expected behaviour of this
function.  This is to set clear semantics for architecture specific
implementations of huge_pte_offset().

Discussions on the arm64 implementation of huge_pte_offset()
(http://www.spinics.net/lists/linux-mm/msg133699.html) showed that there
is benefit from returning a pte_t* in the case of p*d_none().

The fault handling code in hugetlb_fault() can handle p*d_none() entries
and saves an extra round trip to huge_pte_alloc().  Other callers of
huge_pte_offset() should be ok as well.
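
For illustration only (not part of this patch), a caller following the
documented semantics might look like the sketch below, where 'h' is
assumed to be the relevant struct hstate:

	pte_t *ptep = huge_pte_offset(mm, addr, huge_page_size(h));

	if (!ptep) {
		/* p*d_none() at the level matching sz: nothing mapped,
		 * fall back to huge_pte_alloc() as hugetlb_fault() does */
	} else if (!pte_present(huge_ptep_get(ptep))) {
		/* swap or migration entry: now returned as a pte_t *
		 * instead of NULL, so it can be handled directly */
	}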

[punit.agrawal@arm.com: v2]
Link: http://lkml.kernel.org/r/20170725154114.24131-2-punit.agrawal@arm.com
Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 31e207cb399b..1d54a131bdd5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4600,6 +4600,15 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 	return pte;
 }
 
+/*
+ * huge_pte_offset() - Walk the page table to resolve the hugepage
+ * entry at address @addr
+ *
+ * Return: Pointer to page table or swap entry (PUD or PMD) for
+ * address @addr, or NULL if a p*d_none() entry is encountered and the
+ * size @sz doesn't match the hugepage size at this level of the page
+ * table.
+ */
 pte_t *huge_pte_offset(struct mm_struct *mm,
 		       unsigned long addr, unsigned long sz)
 {
@@ -4614,13 +4623,22 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
 	p4d = p4d_offset(pgd, addr);
 	if (!p4d_present(*p4d))
 		return NULL;
+
 	pud = pud_offset(p4d, addr);
-	if (!pud_present(*pud))
+	if (sz != PUD_SIZE && pud_none(*pud))
 		return NULL;
-	if (pud_huge(*pud))
+	/* hugepage or swap? */
+	if (pud_huge(*pud) || !pud_present(*pud))
 		return (pte_t *)pud;
+
 	pmd = pmd_offset(pud, addr);
-	return (pte_t *) pmd;
+	if (sz != PMD_SIZE && pmd_none(*pmd))
+		return NULL;
+	/* hugepage or swap? */
+	if (pmd_huge(*pmd) || !pmd_present(*pmd))
+		return (pte_t *)pmd;
+
+	return NULL;
 }
 
 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */

From 4647706ebeee6e50f7b9f922b095f4ec94d581c3 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Wed, 6 Sep 2017 16:21:05 -0700
Subject: [PATCH 045/119] mm: always flush VMA ranges affected by
 zap_page_range

Nadav Amit reports that zap_page_range only specifies that the caller
protect the VMA list but does not specify whether it is held for read
or write, with callers using either.  madvise holds mmap_sem for read,
meaning that a parallel zap operation can unmap PTEs which are then
potentially skipped by madvise, which can then return with stale TLB
entries present.  While the API could be extended, it would be a
difficult API to use.  This patch causes zap_page_range() to always
consider flushing the full affected range.  For small ranges or
sparsely populated mappings, this may result in one additional spurious
TLB flush.  For larger ranges, it is possible that the TLB has already
been flushed and the overhead is negligible.  Either way, this approach
is safer overall and avoids stale entries being present when madvise
returns.

This can be illustrated with the following program provided by Nadav
Amit and slightly modified.  With the patch applied, it has an exit code
of 0 indicating a stale TLB entry did not leak to userspace.

---8<---

volatile int sync_step = 0;
volatile char *p;

static inline unsigned long rdtsc()
{
	unsigned long hi, lo;
	__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
	 return lo | (hi << 32);
}

static inline void wait_rdtsc(unsigned long cycles)
{
	unsigned long tsc = rdtsc();

	while (rdtsc() - tsc < cycles);
}

void *big_madvise_thread(void *ign)
{
	sync_step = 1;
	while (sync_step != 2);
	madvise((void*)p, PAGE_SIZE * N_PAGES, MADV_DONTNEED);
}

int main(void)
{
	pthread_t aux_thread;

	p = mmap(0, PAGE_SIZE * N_PAGES, PROT_READ|PROT_WRITE,
		 MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);

	memset((void*)p, 8, PAGE_SIZE * N_PAGES);

	pthread_create(&aux_thread, NULL, big_madvise_thread, NULL);
	while (sync_step != 1);

	*p = 8;		// Cache in TLB
	sync_step = 2;
	wait_rdtsc(100000);
	madvise((void*)p, PAGE_SIZE, MADV_DONTNEED);
	printf("data: %d (%s)\n", *p, (*p == 8 ? "stale, broken" : "cleared, fine"));
	return *p == 8 ? -1 : 0;
}
---8<---

Link: http://lkml.kernel.org/r/20170725101230.5v7gvnjmcnkzzql3@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reported-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/mm/memory.c b/mm/memory.c
index 71c0b6f98a62..1416485e278c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1513,8 +1513,20 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
 	tlb_gather_mmu(&tlb, mm, start, end);
 	update_hiwater_rss(mm);
 	mmu_notifier_invalidate_range_start(mm, start, end);
-	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
+	for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
 		unmap_single_vma(&tlb, vma, start, end, NULL);
+
+		/*
+		 * zap_page_range does not specify whether mmap_sem should be
+		 * held for read or write. That allows parallel zap_page_range
+		 * operations to unmap a PTE and defer a flush meaning that
+		 * this call observes pte_none and fails to flush the TLB.
+		 * Rather than adding a complex API, ensure that no stale
+		 * TLB entries exist when this call returns.
+		 */
+		flush_tlb_range(vma, start, end);
+	}
+
 	mmu_notifier_invalidate_range_end(mm, start, end);
 	tlb_finish_mmu(&tlb, start, end);
 }

From 77ff465799c60294e248000cd22ae8171da3304c Mon Sep 17 00:00:00 2001
From: Hui Zhu <zhuhui@xiaomi.com>
Date: Wed, 6 Sep 2017 16:21:08 -0700
Subject: [PATCH 046/119] zsmalloc: zs_page_migrate: skip unnecessary loops but
 not return -EBUSY if zspage is not inuse

Getting -EBUSY from zs_page_migrate will make migration slow (retry) or
fail (zs_page_putback will schedule free_work via schedule_work, but it
cannot guarantee success).

I noticed this issue because my kernel is patched with
https://lkml.org/lkml/2014/5/28/113, which removes the retry in
__alloc_contig_migrate_range.

This retry handles the -EBUSY because it re-isolates the page and
re-calls migrate_pages.  Without it, cma_alloc fails at once with
-EBUSY.

According to the review from Minchan Kim in
https://lkml.org/lkml/2014/5/28/113, I updated the patch to skip the
unnecessary loops but not return -EBUSY if the zspage is not in use.

Following is what I got with highalloc-performance in a vbox with 2 cpu
1G memory 512 zram as swap.  And the swappiness is set to 100.

                                  orig         new
Minor Faults                  50805113    50830235
Major Faults                     43918       56530
Swap Ins                         42087       55680
Swap Outs                        89718      104700
Allocation stalls                    0           0
DMA allocs                       57787       52364
DMA32 allocs                  47964599    48043563
Normal allocs                        0           0
Movable allocs                       0           0
Direct pages scanned             45493       23167
Kswapd pages scanned           1565222     1725078
Kswapd pages reclaimed         1342222     1503037
Direct pages reclaimed           45615       25186
Kswapd efficiency                  85%         87%
Kswapd velocity               1897.101    1949.042
Direct efficiency                 100%        108%
Direct velocity                 55.139      26.175
Percentage direct scans             2%          1%
Zone normal velocity          1952.240    1975.217
Zone dma32 velocity              0.000       0.000
Zone dma velocity                0.000       0.000
Page writes by reclaim       89764.000  105233.000
Page writes file                    46         533
Page writes anon                 89718      104700
Page reclaim immediate           21457        3699
Sector Reads                   3259688     3441368
Sector Writes                  3667252     3754836
Page rescued immediate               0           0
Slabs scanned                  1042872     1160855
Direct inode steals               8042       10089
Kswapd inode steals              54295       29170
Kswapd skipped wait                  0           0
THP fault alloc                    175         154
THP collapse alloc                 226         289
THP splits                           0           0
THP fault fallback                  11          14
THP collapse fail                    3           2
Compaction stalls                  536         646
Compaction success                 322         358
Compaction failures                214         288
Page migrate success            119608      111063
Page migrate failure              2723        2593
Compaction pages isolated       250179      232652
Compaction migrate scanned     9131832     9942306
Compaction free scanned        2093272     2613998
Compaction cost                    192         189
NUMA alloc hit                47124555    47193990
NUMA alloc miss                      0           0
NUMA interleave hit                  0           0
NUMA alloc local              47124555    47193990
NUMA base PTE updates                0           0
NUMA huge PMD updates                0           0
NUMA page range updates              0           0
NUMA hint faults                     0           0
NUMA hint local faults               0           0
NUMA hint local percent            100         100
NUMA pages migrated                  0           0
AutoNUMA cost                       0%          0%

[akpm@linux-foundation.org: remove newline, per Minchan]
Link: http://lkml.kernel.org/r/1500889535-19648-1-git-send-email-zhuhui@xiaomi.com
Signed-off-by: Hui Zhu <zhuhui@xiaomi.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/zsmalloc.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 308acb9d814b..62457eb82330 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1983,8 +1983,11 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage,
 
 	spin_lock(&class->lock);
 	if (!get_zspage_inuse(zspage)) {
-		ret = -EBUSY;
-		goto unlock_class;
+		/*
+		 * Set "offset" to end of the page so that every loops
+		 * skips unnecessary object scanning.
+		 */
+		offset = PAGE_SIZE;
 	}
 
 	pos = offset;
@@ -2052,7 +2055,6 @@ unpin_objects:
 		}
 	}
 	kunmap_atomic(s_addr);
-unlock_class:
 	spin_unlock(&class->lock);
 	migrate_write_unlock(zspage);
 

From db73ee0d463799223244e96e7b7eea73b4a6ec31 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 6 Sep 2017 16:21:11 -0700
Subject: [PATCH 047/119] mm, vmscan: do not loop on too_many_isolated for ever

Tetsuo Handa has reported[1][2][3] that direct reclaimers might get
stuck in the too_many_isolated loop basically forever because the last
few pages on the LRU lists are isolated by kswapd, which is stuck on fs
locks when doing the pageout or slab reclaim.  This in turn means that
there is nobody to actually trigger the OOM killer and the system is
basically unusable.

too_many_isolated was introduced by commit 35cd78156c49 ("vmscan:
throttle direct reclaim when too many pages are isolated already") to
prevent premature OOM killer invocations, because back then a lack of
reclaim progress could indeed trigger the OOM killer too early.

But since the oom detection rework in commit 0a0337e0d1d1 ("mm, oom:
rework oom detection") the allocation/reclaim retry loop considers all
the reclaimable pages and throttles the allocation at that layer so we
can loosen the direct reclaim throttling.

Make the shrink_inactive_list loop over too_many_isolated bounded so
that it returns immediately when the situation hasn't resolved after
the first sleep.

Replace congestion_wait by a simple schedule_timeout_interruptible
because we are not really waiting on the IO congestion in this path.

Please note that this patch can theoretically cause the OOM killer to
trigger earlier while there are many pages isolated for reclaim that is
making progress only very slowly.  This would be obvious from the OOM
report, as the number of isolated pages is printed there.  If we ever
hit this, should_reclaim_retry should consider those numbers in the
evaluation in one way or another.

[1] http://lkml.kernel.org/r/201602092349.ACG81273.OSVtMJQHLOFOFF@I-love.SAKURA.ne.jp
[2] http://lkml.kernel.org/r/201702212335.DJB30777.JOFMHSFtVLQOOF@I-love.SAKURA.ne.jp
[3] http://lkml.kernel.org/r/201706300914.CEH95859.FMQOLVFHJFtOOS@I-love.SAKURA.ne.jp

[mhocko@suse.com: switch to uninterruptible sleep]
  Link: http://lkml.kernel.org/r/20170724065048.GB25221@dhcp22.suse.cz
Link: http://lkml.kernel.org/r/20170710074842.23175-1-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Tested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 095817820a56..1638814c7848 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1743,9 +1743,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	int file = is_file_lru(lru);
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+	bool stalled = false;
 
 	while (unlikely(too_many_isolated(pgdat, file, sc))) {
-		congestion_wait(BLK_RW_ASYNC, HZ/10);
+		if (stalled)
+			return 0;
+
+		/* wait a bit for the reclaimer. */
+		msleep(100);
+		stalled = true;
 
 		/* We are about to die and free our memory. Return now. */
 		if (fatal_signal_pending(current))

From 26b433d0da062d6e19d75350c0171d3cf8ff560d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 6 Sep 2017 16:21:15 -0700
Subject: [PATCH 048/119] fscache: remove unused ->now_uncached callback

Patch series "Ranged pagevec lookup", v2.

In this series I make pagevec_lookup() update the index (to be
consistent with pagevec_lookup_tag() and also as a preparation for
ranged lookups), provide a ranged variant of pagevec_lookup() and use
it in places where it makes sense.  This not only removes some common
code but is also a measurable performance win for some use cases (see
patch 4/10) where the radix tree is sparse and searching for & grabbing
a page after the end of the range has measurable overhead.

This patch (of 10):

The callback doesn't ever get called.  Remove it.

Link: http://lkml.kernel.org/r/20170726114704.7626-2-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 .../filesystems/caching/netfs-api.txt         |  2 -
 fs/9p/cache.c                                 | 29 -------------
 fs/afs/cache.c                                | 43 -------------------
 fs/ceph/cache.c                               | 31 -------------
 fs/cifs/cache.c                               | 31 -------------
 fs/nfs/fscache-index.c                        | 40 -----------------
 include/linux/fscache.h                       |  9 ----
 7 files changed, 185 deletions(-)

diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
index aed6b94160b1..0eb31de3a2c1 100644
--- a/Documentation/filesystems/caching/netfs-api.txt
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -151,8 +151,6 @@ To define an object, a structure of the following type should be filled out:
 		void (*mark_pages_cached)(void *cookie_netfs_data,
 					  struct address_space *mapping,
 					  struct pagevec *cached_pvec);
-
-		void (*now_uncached)(void *cookie_netfs_data);
 	};
 
 This has the following fields:
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 103ca5e1267b..64c58eb26159 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -151,34 +151,6 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
 	return FSCACHE_CHECKAUX_OKAY;
 }
 
-static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
-{
-	struct v9fs_inode *v9inode = cookie_netfs_data;
-	struct pagevec pvec;
-	pgoff_t first;
-	int loop, nr_pages;
-
-	pagevec_init(&pvec, 0);
-	first = 0;
-
-	for (;;) {
-		nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping,
-					  first,
-					  PAGEVEC_SIZE - pagevec_count(&pvec));
-		if (!nr_pages)
-			break;
-
-		for (loop = 0; loop < nr_pages; loop++)
-			ClearPageFsCache(pvec.pages[loop]);
-
-		first = pvec.pages[nr_pages - 1]->index + 1;
-
-		pvec.nr = nr_pages;
-		pagevec_release(&pvec);
-		cond_resched();
-	}
-}
-
 const struct fscache_cookie_def v9fs_cache_inode_index_def = {
 	.name		= "9p.inode",
 	.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
@@ -186,7 +158,6 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = {
 	.get_attr	= v9fs_cache_inode_get_attr,
 	.get_aux	= v9fs_cache_inode_get_aux,
 	.check_aux	= v9fs_cache_inode_check_aux,
-	.now_uncached	= v9fs_cache_inode_now_uncached,
 };
 
 void v9fs_cache_inode_get_cookie(struct inode *inode)
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index 577763c3d88b..1fe855191261 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -39,7 +39,6 @@ static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
 static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
 						       const void *buffer,
 						       uint16_t buflen);
-static void afs_vnode_cache_now_uncached(void *cookie_netfs_data);
 
 struct fscache_netfs afs_cache_netfs = {
 	.name			= "afs",
@@ -75,7 +74,6 @@ struct fscache_cookie_def afs_vnode_cache_index_def = {
 	.get_attr		= afs_vnode_cache_get_attr,
 	.get_aux		= afs_vnode_cache_get_aux,
 	.check_aux		= afs_vnode_cache_check_aux,
-	.now_uncached		= afs_vnode_cache_now_uncached,
 };
 
 /*
@@ -359,44 +357,3 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
 	_leave(" = SUCCESS");
 	return FSCACHE_CHECKAUX_OKAY;
 }
-
-/*
- * indication the cookie is no longer uncached
- * - this function is called when the backing store currently caching a cookie
- *   is removed
- * - the netfs should use this to clean up any markers indicating cached pages
- * - this is mandatory for any object that may have data
- */
-static void afs_vnode_cache_now_uncached(void *cookie_netfs_data)
-{
-	struct afs_vnode *vnode = cookie_netfs_data;
-	struct pagevec pvec;
-	pgoff_t first;
-	int loop, nr_pages;
-
-	_enter("{%x,%x,%Lx}",
-	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version);
-
-	pagevec_init(&pvec, 0);
-	first = 0;
-
-	for (;;) {
-		/* grab a bunch of pages to clean */
-		nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping,
-					  first,
-					  PAGEVEC_SIZE - pagevec_count(&pvec));
-		if (!nr_pages)
-			break;
-
-		for (loop = 0; loop < nr_pages; loop++)
-			ClearPageFsCache(pvec.pages[loop]);
-
-		first = pvec.pages[nr_pages - 1]->index + 1;
-
-		pvec.nr = nr_pages;
-		pagevec_release(&pvec);
-		cond_resched();
-	}
-
-	_leave("");
-}
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 337f88673ed9..174d6e6569a8 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -194,36 +194,6 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux(
 	return FSCACHE_CHECKAUX_OKAY;
 }
 
-static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data)
-{
-	struct ceph_inode_info* ci = cookie_netfs_data;
-	struct pagevec pvec;
-	pgoff_t first;
-	int loop, nr_pages;
-
-	pagevec_init(&pvec, 0);
-	first = 0;
-
-	dout("ceph inode 0x%p now uncached", ci);
-
-	while (1) {
-		nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first,
-					  PAGEVEC_SIZE - pagevec_count(&pvec));
-
-		if (!nr_pages)
-			break;
-
-		for (loop = 0; loop < nr_pages; loop++)
-			ClearPageFsCache(pvec.pages[loop]);
-
-		first = pvec.pages[nr_pages - 1]->index + 1;
-
-		pvec.nr = nr_pages;
-		pagevec_release(&pvec);
-		cond_resched();
-	}
-}
-
 static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
 	.name		= "CEPH.inode",
 	.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
@@ -231,7 +201,6 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
 	.get_attr	= ceph_fscache_inode_get_attr,
 	.get_aux	= ceph_fscache_inode_get_aux,
 	.check_aux	= ceph_fscache_inode_check_aux,
-	.now_uncached	= ceph_fscache_inode_now_uncached,
 };
 
 void ceph_fscache_register_inode_cookie(struct inode *inode)
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 6c665bf4a27c..2c14020e5e1d 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -292,36 +292,6 @@ fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data,
 	return FSCACHE_CHECKAUX_OKAY;
 }
 
-static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data)
-{
-	struct cifsInodeInfo *cifsi = cookie_netfs_data;
-	struct pagevec pvec;
-	pgoff_t first;
-	int loop, nr_pages;
-
-	pagevec_init(&pvec, 0);
-	first = 0;
-
-	cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi);
-
-	for (;;) {
-		nr_pages = pagevec_lookup(&pvec,
-					  cifsi->vfs_inode.i_mapping, first,
-					  PAGEVEC_SIZE - pagevec_count(&pvec));
-		if (!nr_pages)
-			break;
-
-		for (loop = 0; loop < nr_pages; loop++)
-			ClearPageFsCache(pvec.pages[loop]);
-
-		first = pvec.pages[nr_pages - 1]->index + 1;
-
-		pvec.nr = nr_pages;
-		pagevec_release(&pvec);
-		cond_resched();
-	}
-}
-
 const struct fscache_cookie_def cifs_fscache_inode_object_def = {
 	.name		= "CIFS.uniqueid",
 	.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
@@ -329,5 +299,4 @@ const struct fscache_cookie_def cifs_fscache_inode_object_def = {
 	.get_attr	= cifs_fscache_inode_get_attr,
 	.get_aux	= cifs_fscache_inode_get_aux,
 	.check_aux	= cifs_fscache_inode_check_aux,
-	.now_uncached	= cifs_fscache_inode_now_uncached,
 };
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 777b055063f6..3025fe8584a0 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -251,45 +251,6 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
 	return FSCACHE_CHECKAUX_OKAY;
 }
 
-/*
- * Indication from FS-Cache that the cookie is no longer cached
- * - This function is called when the backing store currently caching a cookie
- *   is removed
- * - The netfs should use this to clean up any markers indicating cached pages
- * - This is mandatory for any object that may have data
- */
-static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data)
-{
-	struct nfs_inode *nfsi = cookie_netfs_data;
-	struct pagevec pvec;
-	pgoff_t first;
-	int loop, nr_pages;
-
-	pagevec_init(&pvec, 0);
-	first = 0;
-
-	dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi);
-
-	for (;;) {
-		/* grab a bunch of pages to unmark */
-		nr_pages = pagevec_lookup(&pvec,
-					  nfsi->vfs_inode.i_mapping,
-					  first,
-					  PAGEVEC_SIZE - pagevec_count(&pvec));
-		if (!nr_pages)
-			break;
-
-		for (loop = 0; loop < nr_pages; loop++)
-			ClearPageFsCache(pvec.pages[loop]);
-
-		first = pvec.pages[nr_pages - 1]->index + 1;
-
-		pvec.nr = nr_pages;
-		pagevec_release(&pvec);
-		cond_resched();
-	}
-}
-
 /*
  * Get an extra reference on a read context.
  * - This function can be absent if the completion function doesn't require a
@@ -330,7 +291,6 @@ const struct fscache_cookie_def nfs_fscache_inode_object_def = {
 	.get_attr	= nfs_fscache_inode_get_attr,
 	.get_aux	= nfs_fscache_inode_get_aux,
 	.check_aux	= nfs_fscache_inode_check_aux,
-	.now_uncached	= nfs_fscache_inode_now_uncached,
 	.get_context	= nfs_fh_get_context,
 	.put_context	= nfs_fh_put_context,
 };
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 115bb81912cc..f4ff47d4a893 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -143,15 +143,6 @@ struct fscache_cookie_def {
 	void (*mark_page_cached)(void *cookie_netfs_data,
 				 struct address_space *mapping,
 				 struct page *page);
-
-	/* indicate the cookie is no longer cached
-	 * - this function is called when the backing store currently caching
-	 *   a cookie is removed
-	 * - the netfs should use this to clean up any markers indicating
-	 *   cached pages
-	 * - this is mandatory for any object that may have data
-	 */
-	void (*now_uncached)(void *cookie_netfs_data);
 };
 
 /*

From d72dc8a25afc71ce90ee92bdd77550e9beb85d4d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 6 Sep 2017 16:21:18 -0700
Subject: [PATCH 049/119] mm: make pagevec_lookup() update index

Make pagevec_lookup() (and the underlying find_get_pages()) update the
index to the next page where iteration should continue.  Most callers
want this, and pagevec_lookup_tag() already does it.
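
For illustration only (this sketch is not part of the patch and the
function name is made up), a converted caller now looks roughly like
the following; the manual index advancement that callers used to
open-code is gone:

#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/sched.h>

/* Sketch of the caller pattern after this change (kernel context). */
static void walk_mapping(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t index = 0;
	unsigned i, nr;

	pagevec_init(&pvec, 0);
	while ((nr = pagevec_lookup(&pvec, mapping, &index,
				    PAGEVEC_SIZE))) {
		for (i = 0; i < nr; i++) {
			/* ... process pvec.pages[i] ... */
		}
		/*
		 * No "index = pvec.pages[nr - 1]->index + 1;" here:
		 * pagevec_lookup() has already advanced @index past the
		 * last page it returned.
		 */
		pagevec_release(&pvec);
		cond_resched();
	}
}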

Link: http://lkml.kernel.org/r/20170726114704.7626-3-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c             | 10 ++++------
 fs/ext4/file.c          |  4 +---
 fs/ext4/inode.c         |  8 ++------
 fs/fscache/page.c       |  5 ++---
 fs/hugetlbfs/inode.c    | 17 ++++++++---------
 fs/nilfs2/page.c        |  3 +--
 fs/ramfs/file-nommu.c   |  2 +-
 include/linux/pagemap.h |  2 +-
 include/linux/pagevec.h |  2 +-
 mm/filemap.c            | 11 ++++++++---
 mm/swap.c               |  5 +++--
 11 files changed, 32 insertions(+), 37 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 5715dac7821f..5b20893708e2 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1633,13 +1633,12 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
 
 	end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
 	pagevec_init(&pvec, 0);
-	while (index <= end && pagevec_lookup(&pvec, bd_mapping, index,
+	while (index <= end && pagevec_lookup(&pvec, bd_mapping, &index,
 			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
-			index = page->index;
-			if (index > end)
+			if (page->index > end)
 				break;
 			if (!page_has_buffers(page))
 				continue;
@@ -1670,7 +1669,6 @@ unlock_page:
 		}
 		pagevec_release(&pvec);
 		cond_resched();
-		index++;
 	}
 }
 EXPORT_SYMBOL(clean_bdev_aliases);
@@ -3552,7 +3550,8 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
 		unsigned want, nr_pages, i;
 
 		want = min_t(unsigned, end - index, PAGEVEC_SIZE);
-		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want);
+		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, &index,
+					  want);
 		if (nr_pages == 0)
 			break;
 
@@ -3594,7 +3593,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
 		if (nr_pages < want)
 			break;
 
-		index = pvec.pages[i - 1]->index + 1;
 		pagevec_release(&pvec);
 	} while (index < end);
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f28ac999dfba..1c73b21fdc13 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -468,7 +468,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
 		unsigned long nr_pages;
 
 		num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;
-		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
+		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, &index,
 					  (pgoff_t)num);
 		if (nr_pages == 0)
 			break;
@@ -536,8 +536,6 @@ next:
 		/* The no. of pages is less than our desired, we are done. */
 		if (nr_pages < num)
 			break;
-
-		index = pvec.pages[i - 1]->index + 1;
 		pagevec_release(&pvec);
 	} while (index <= end);
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c774bdc22759..b3ce1c6d9f23 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1720,7 +1720,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
 
 	pagevec_init(&pvec, 0);
 	while (index <= end) {
-		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+		nr_pages = pagevec_lookup(&pvec, mapping, &index, PAGEVEC_SIZE);
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
@@ -1737,7 +1737,6 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
 			}
 			unlock_page(page);
 		}
-		index = pvec.pages[nr_pages - 1]->index + 1;
 		pagevec_release(&pvec);
 	}
 }
@@ -2348,7 +2347,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
 
 	pagevec_init(&pvec, 0);
 	while (start <= end) {
-		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
+		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, &start,
 					  PAGEVEC_SIZE);
 		if (nr_pages == 0)
 			break;
@@ -2357,8 +2356,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
 
 			if (page->index > end)
 				break;
-			/* Up to 'end' pages must be contiguous */
-			BUG_ON(page->index != start);
 			bh = head = page_buffers(page);
 			do {
 				if (lblk < mpd->map.m_lblk)
@@ -2403,7 +2400,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
 				pagevec_release(&pvec);
 				return err;
 			}
-			start++;
 		}
 		pagevec_release(&pvec);
 	}
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c8c4f79c7ce1..83018861dcd2 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -1178,11 +1178,10 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
 	pagevec_init(&pvec, 0);
 	next = 0;
 	do {
-		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
+		if (!pagevec_lookup(&pvec, mapping, &next, PAGEVEC_SIZE))
 			break;
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
-			next = page->index;
 			if (PageFsCache(page)) {
 				__fscache_wait_on_page_write(cookie, page);
 				__fscache_uncache_page(cookie, page);
@@ -1190,7 +1189,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
 		}
 		pagevec_release(&pvec);
 		cond_resched();
-	} while (++next);
+	} while (next);
 
 	_leave("");
 }
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 28d2753be094..b9678ce91e25 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -401,7 +401,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 	const pgoff_t end = lend >> huge_page_shift(h);
 	struct vm_area_struct pseudo_vma;
 	struct pagevec pvec;
-	pgoff_t next;
+	pgoff_t next, index;
 	int i, freed = 0;
 	long lookup_nr = PAGEVEC_SIZE;
 	bool truncate_op = (lend == LLONG_MAX);
@@ -420,7 +420,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 		/*
 		 * When no more pages are found, we are done.
 		 */
-		if (!pagevec_lookup(&pvec, mapping, next, lookup_nr))
+		if (!pagevec_lookup(&pvec, mapping, &next, lookup_nr))
 			break;
 
 		for (i = 0; i < pagevec_count(&pvec); ++i) {
@@ -432,13 +432,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 			 * only possible in the punch hole case as end is
 			 * max page offset in the truncate case.
 			 */
-			next = page->index;
-			if (next >= end)
+			index = page->index;
+			if (index >= end)
 				break;
 
 			hash = hugetlb_fault_mutex_hash(h, current->mm,
 							&pseudo_vma,
-							mapping, next, 0);
+							mapping, index, 0);
 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 			/*
@@ -455,8 +455,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
 				i_mmap_lock_write(mapping);
 				hugetlb_vmdelete_list(&mapping->i_mmap,
-					next * pages_per_huge_page(h),
-					(next + 1) * pages_per_huge_page(h));
+					index * pages_per_huge_page(h),
+					(index + 1) * pages_per_huge_page(h));
 				i_mmap_unlock_write(mapping);
 			}
 
@@ -475,14 +475,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 			freed++;
 			if (!truncate_op) {
 				if (unlikely(hugetlb_unreserve_pages(inode,
-							next, next + 1, 1)))
+							index, index + 1, 1)))
 					hugetlb_fix_reserve_counts(inode);
 			}
 
 			unlock_page(page);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 		}
-		++next;
 		huge_pagevec_release(&pvec);
 		cond_resched();
 	}
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index f11a3ad2df0c..382a36c72d72 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -312,10 +312,9 @@ void nilfs_copy_back_pages(struct address_space *dmap,
 
 	pagevec_init(&pvec, 0);
 repeat:
-	n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE);
+	n = pagevec_lookup(&pvec, smap, &index, PAGEVEC_SIZE);
 	if (!n)
 		return;
-	index = pvec.pages[n - 1]->index + 1;
 
 	for (i = 0; i < pagevec_count(&pvec); i++) {
 		struct page *page = pvec.pages[i], *dpage;
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 2ef7ce75c062..3ac1f2387083 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -228,7 +228,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
 	if (!pages)
 		goto out_free;
 
-	nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
+	nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages);
 	if (nr != lpages)
 		goto out_free_pages; /* leave if some pages were missing */
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 79b36f57c3ba..249b1b5964c7 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -353,7 +353,7 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
 unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
 			  unsigned int nr_entries, struct page **entries,
 			  pgoff_t *indices);
-unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
+unsigned find_get_pages(struct address_space *mapping, pgoff_t *start,
 			unsigned int nr_pages, struct page **pages);
 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
 			       unsigned int nr_pages, struct page **pages);
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index b45d391b4540..c395a5bb58b2 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -28,7 +28,7 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec,
 				pgoff_t *indices);
 void pagevec_remove_exceptionals(struct pagevec *pvec);
 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
-		pgoff_t start, unsigned nr_pages);
+		pgoff_t *start, unsigned nr_pages);
 unsigned pagevec_lookup_tag(struct pagevec *pvec,
 		struct address_space *mapping, pgoff_t *index, int tag,
 		unsigned nr_pages);
diff --git a/mm/filemap.c b/mm/filemap.c
index dad935769055..ab9011408d81 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -403,7 +403,7 @@ bool filemap_range_has_page(struct address_space *mapping,
 		return false;
 
 	pagevec_init(&pvec, 0);
-	if (!pagevec_lookup(&pvec, mapping, index, 1))
+	if (!pagevec_lookup(&pvec, mapping, &index, 1))
 		return false;
 	ret = (pvec.pages[0]->index <= end);
 	pagevec_release(&pvec);
@@ -1569,10 +1569,11 @@ export:
  *
  * The search returns a group of mapping-contiguous pages with ascending
  * indexes.  There may be holes in the indices due to not-present pages.
+ * We also update @start to index the next page for the traversal.
  *
  * find_get_pages() returns the number of pages which were found.
  */
-unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
+unsigned find_get_pages(struct address_space *mapping, pgoff_t *start,
 			    unsigned int nr_pages, struct page **pages)
 {
 	struct radix_tree_iter iter;
@@ -1583,7 +1584,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 		return 0;
 
 	rcu_read_lock();
-	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) {
 		struct page *head, *page;
 repeat:
 		page = radix_tree_deref_slot(slot);
@@ -1625,6 +1626,10 @@ repeat:
 	}
 
 	rcu_read_unlock();
+
+	if (ret)
+		*start = pages[ret - 1]->index + 1;
+
 	return ret;
 }
 
diff --git a/mm/swap.c b/mm/swap.c
index 60b1d2a75852..4bffd1198ce5 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -957,12 +957,13 @@ void pagevec_remove_exceptionals(struct pagevec *pvec)
  * reference against the pages in @pvec.
  *
  * The search returns a group of mapping-contiguous pages with ascending
- * indexes.  There may be holes in the indices due to not-present pages.
+ * indexes.  There may be holes in the indices due to not-present pages. We
+ * also update @start to index the next page for the traversal.
  *
  * pagevec_lookup() returns the number of pages which were found.
  */
 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
-		pgoff_t start, unsigned nr_pages)
+		pgoff_t *start, unsigned nr_pages)
 {
 	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
 	return pagevec_count(pvec);

From b947cee4b96306037e166ff1ea5156c0ecdd7d91 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 6 Sep 2017 16:21:21 -0700
Subject: [PATCH 050/119] mm: implement find_get_pages_range()

Implement a variant of find_get_pages() that stops iterating at a given
index.  This may be a substantial performance gain if the mapping is
sparse; see the following commit for details.  Furthermore, lots of
users of this function (through pagevec_lookup()) actually want a range
lookup, and all of them are currently open-coding it.

Also create a corresponding pagevec_lookup_range() function.
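
For illustration, a range walk with the new helper looks roughly like
this (a sketch only, kernel context assumed; walk_range and its
variable names are illustrative, not from the patch):

#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/sched.h>

/* Sketch: visit all pages of @mapping with indexes in [start, end]. */
static void walk_range(struct address_space *mapping,
		       pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t index = start;
	unsigned i;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup_range(&pvec, mapping, &index, end,
				    PAGEVEC_SIZE)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			/*
			 * Every page here has page->index <= end, so the
			 * "if (page->index > end) break;" check callers
			 * used to open-code is no longer needed.
			 */
			/* ... process pvec.pages[i] ... */
		}
		pagevec_release(&pvec);
		cond_resched();
	}
}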

Link: http://lkml.kernel.org/r/20170726114704.7626-4-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 12 ++++++++++--
 include/linux/pagevec.h | 13 +++++++++++--
 mm/filemap.c            | 42 +++++++++++++++++++++++++++++------------
 mm/swap.c               | 22 +++++++++++++--------
 4 files changed, 65 insertions(+), 24 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 249b1b5964c7..5bbd6780f205 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -353,8 +353,16 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
 unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
 			  unsigned int nr_entries, struct page **entries,
 			  pgoff_t *indices);
-unsigned find_get_pages(struct address_space *mapping, pgoff_t *start,
-			unsigned int nr_pages, struct page **pages);
+unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
+			pgoff_t end, unsigned int nr_pages,
+			struct page **pages);
+static inline unsigned find_get_pages(struct address_space *mapping,
+			pgoff_t *start, unsigned int nr_pages,
+			struct page **pages)
+{
+	return find_get_pages_range(mapping, start, (pgoff_t)-1, nr_pages,
+				    pages);
+}
 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
 			       unsigned int nr_pages, struct page **pages);
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index c395a5bb58b2..7df056910437 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -27,8 +27,17 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec,
 				pgoff_t start, unsigned nr_entries,
 				pgoff_t *indices);
 void pagevec_remove_exceptionals(struct pagevec *pvec);
-unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
-		pgoff_t *start, unsigned nr_pages);
+unsigned pagevec_lookup_range(struct pagevec *pvec,
+			      struct address_space *mapping,
+			      pgoff_t *start, pgoff_t end, unsigned nr_pages);
+static inline unsigned pagevec_lookup(struct pagevec *pvec,
+				      struct address_space *mapping,
+				      pgoff_t *start, unsigned nr_pages)
+{
+	return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1,
+				    nr_pages);
+}
+
 unsigned pagevec_lookup_tag(struct pagevec *pvec,
 		struct address_space *mapping, pgoff_t *index, int tag,
 		unsigned nr_pages);
diff --git a/mm/filemap.c b/mm/filemap.c
index ab9011408d81..129883f160a7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1557,24 +1557,29 @@ export:
 }
 
 /**
- * find_get_pages - gang pagecache lookup
+ * find_get_pages_range - gang pagecache lookup
  * @mapping:	The address_space to search
  * @start:	The starting page index
+ * @end:	The final page index (inclusive)
  * @nr_pages:	The maximum number of pages
  * @pages:	Where the resulting pages are placed
  *
- * find_get_pages() will search for and return a group of up to
- * @nr_pages pages in the mapping.  The pages are placed at @pages.
- * find_get_pages() takes a reference against the returned pages.
+ * find_get_pages_range() will search for and return a group of up to @nr_pages
+ * pages in the mapping starting at index @start and up to index @end
+ * (inclusive).  The pages are placed at @pages.  find_get_pages_range() takes
+ * a reference against the returned pages.
  *
  * The search returns a group of mapping-contiguous pages with ascending
  * indexes.  There may be holes in the indices due to not-present pages.
  * We also update @start to index the next page for the traversal.
  *
- * find_get_pages() returns the number of pages which were found.
+ * find_get_pages_range() returns the number of pages which were found. If this
+ * number is smaller than @nr_pages, the end of specified range has been
+ * reached.
  */
-unsigned find_get_pages(struct address_space *mapping, pgoff_t *start,
-			    unsigned int nr_pages, struct page **pages)
+unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
+			      pgoff_t end, unsigned int nr_pages,
+			      struct page **pages)
 {
 	struct radix_tree_iter iter;
 	void **slot;
@@ -1586,6 +1591,9 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t *start,
 	rcu_read_lock();
 	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) {
 		struct page *head, *page;
+
+		if (iter.index > end)
+			break;
 repeat:
 		page = radix_tree_deref_slot(slot);
 		if (unlikely(!page))
@@ -1621,15 +1629,25 @@ repeat:
 		}
 
 		pages[ret] = page;
-		if (++ret == nr_pages)
-			break;
+		if (++ret == nr_pages) {
+			*start = pages[ret - 1]->index + 1;
+			goto out;
+		}
 	}
 
+	/*
+	 * We come here when there is no page beyond @end. We take care to not
+	 * overflow the index @start as it confuses some of the callers. This
+	 * breaks the iteration when there is page at index -1 but that is
+	 * already broken anyway.
+	 */
+	if (end == (pgoff_t)-1)
+		*start = (pgoff_t)-1;
+	else
+		*start = end + 1;
+out:
 	rcu_read_unlock();
 
-	if (ret)
-		*start = pages[ret - 1]->index + 1;
-
 	return ret;
 }
 
diff --git a/mm/swap.c b/mm/swap.c
index 4bffd1198ce5..e06e9aa2478e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -946,29 +946,35 @@ void pagevec_remove_exceptionals(struct pagevec *pvec)
 }
 
 /**
- * pagevec_lookup - gang pagecache lookup
+ * pagevec_lookup_range - gang pagecache lookup
  * @pvec:	Where the resulting pages are placed
  * @mapping:	The address_space to search
  * @start:	The starting page index
+ * @end:	The final page index
  * @nr_pages:	The maximum number of pages
  *
- * pagevec_lookup() will search for and return a group of up to @nr_pages pages
- * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
+ * pagevec_lookup_range() will search for and return a group of up to @nr_pages
+ * pages in the mapping starting from index @start and upto index @end
+ * (inclusive).  The pages are placed in @pvec.  pagevec_lookup() takes a
  * reference against the pages in @pvec.
  *
  * The search returns a group of mapping-contiguous pages with ascending
  * indexes.  There may be holes in the indices due to not-present pages. We
  * also update @start to index the next page for the traversal.
  *
- * pagevec_lookup() returns the number of pages which were found.
+ * pagevec_lookup_range() returns the number of pages which were found. If this
+ * number is smaller than @nr_pages, the end of specified range has been
+ * reached.
  */
-unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
-		pgoff_t *start, unsigned nr_pages)
+unsigned pagevec_lookup_range(struct pagevec *pvec,
+		struct address_space *mapping, pgoff_t *start, pgoff_t end,
+		unsigned nr_pages)
 {
-	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
+	pvec->nr = find_get_pages_range(mapping, start, end, nr_pages,
+					pvec->pages);
 	return pagevec_count(pvec);
 }
-EXPORT_SYMBOL(pagevec_lookup);
+EXPORT_SYMBOL(pagevec_lookup_range);
 
 unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
 		pgoff_t *index, int tag, unsigned nr_pages)

From c10f778ddfc161f5c58a8d6de4ad92235ea2eeba Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 6 Sep 2017 16:21:24 -0700
Subject: [PATCH 051/119] fs: fix performance regression in
 clean_bdev_aliases()

Commit e64855c6cfaa ("fs: Add helper to clean bdev aliases under a bh
and use it") added a wrapper for clean_bdev_aliases() that invalidates
bdev aliases underlying a single buffer head.

However this has caused a performance regression for bonnie++ benchmark
on ext4 filesystem when delayed allocation is turned off (ext3 mode) -
average of 3 runs:

  Hmean SeqOut Char  164787.55 (  0.00%) 107189.06 (-34.95%)
  Hmean SeqOut Block 219883.89 (  0.00%) 168870.32 (-23.20%)

The reason for this regression is that clean_bdev_aliases() is slower
when called for a single block, because the pagevec_lookup() it uses
will end up iterating through the radix tree until it finds a page
(which may take a while), even though we are only interested in whether
there's a page at a particular index.

Fix the problem by using pagevec_lookup_range() instead, which avoids
the needless iteration.

Fixes: e64855c6cfaa ("fs: Add helper to clean bdev aliases under a bh and use it")
Link: http://lkml.kernel.org/r/20170726114704.7626-5-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 5b20893708e2..7e531bb356bd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1627,19 +1627,18 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
 	struct pagevec pvec;
 	pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
 	pgoff_t end;
-	int i;
+	int i, count;
 	struct buffer_head *bh;
 	struct buffer_head *head;
 
 	end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
 	pagevec_init(&pvec, 0);
-	while (index <= end && pagevec_lookup(&pvec, bd_mapping, &index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
-		for (i = 0; i < pagevec_count(&pvec); i++) {
+	while (pagevec_lookup_range(&pvec, bd_mapping, &index, end,
+				    PAGEVEC_SIZE)) {
+		count = pagevec_count(&pvec);
+		for (i = 0; i < count; i++) {
 			struct page *page = pvec.pages[i];
 
-			if (page->index > end)
-				break;
 			if (!page_has_buffers(page))
 				continue;
 			/*
@@ -1669,6 +1668,9 @@ unlock_page:
 		}
 		pagevec_release(&pvec);
 		cond_resched();
+		/* End of range already reached? */
+		if (index > end || !index)
+			break;
 	}
 }
 EXPORT_SYMBOL(clean_bdev_aliases);

From dec0da7b608253b004daea1da5bd82b3b292ba0f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 6 Sep 2017 16:21:27 -0700
Subject: [PATCH 052/119] ext4: use pagevec_lookup_range() in
 ext4_find_unwritten_pgoff()

Use pagevec_lookup_range() in ext4_find_unwritten_pgoff() since we are
interested only in pages in the given range.  Simplify the logic now
that we no longer get pages outside the range and the index is advanced
automatically.

Link: http://lkml.kernel.org/r/20170726114704.7626-6-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/file.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1c73b21fdc13..726344809721 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -464,12 +464,11 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
 
 	pagevec_init(&pvec, 0);
 	do {
-		int i, num;
+		int i;
 		unsigned long nr_pages;
 
-		num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;
-		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, &index,
-					  (pgoff_t)num);
+		nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
+					&index, end, PAGEVEC_SIZE);
 		if (nr_pages == 0)
 			break;
 
@@ -488,9 +487,6 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
 				goto out;
 			}
 
-			if (page->index > end)
-				goto out;
-
 			lock_page(page);
 
 			if (unlikely(page->mapping != inode->i_mapping)) {
@@ -533,12 +529,10 @@ next:
 			unlock_page(page);
 		}
 
-		/* The no. of pages is less than our desired, we are done. */
-		if (nr_pages < num)
-			break;
 		pagevec_release(&pvec);
 	} while (index <= end);
 
+	/* There are no pages upto endoff - that would be a hole in there. */
 	if (whence == SEEK_HOLE && lastoff < endoff) {
 		found = 1;
 		*offset = lastoff;

From 2b85a6171d1370903d65cc8b84cefca26a16b5e4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 6 Sep 2017 16:21:30 -0700
Subject: [PATCH 053/119] ext4: use pagevec_lookup_range() in writeback code

Both occurrences of pagevec_lookup() actually want only pages from a
given range.  Use pagevec_lookup_range() for the lookup.

Link: http://lkml.kernel.org/r/20170726114704.7626-7-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/inode.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b3ce1c6d9f23..6103ce0430df 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1720,13 +1720,13 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
 
 	pagevec_init(&pvec, 0);
 	while (index <= end) {
-		nr_pages = pagevec_lookup(&pvec, mapping, &index, PAGEVEC_SIZE);
+		nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end,
+						PAGEVEC_SIZE);
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
-			if (page->index > end)
-				break;
+
 			BUG_ON(!PageLocked(page));
 			BUG_ON(PageWriteback(page));
 			if (invalidate) {
@@ -2347,15 +2347,13 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
 
 	pagevec_init(&pvec, 0);
 	while (start <= end) {
-		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, &start,
-					  PAGEVEC_SIZE);
+		nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
+						&start, end, PAGEVEC_SIZE);
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
-			if (page->index > end)
-				break;
 			bh = head = page_buffers(page);
 			do {
 				if (lblk < mpd->map.m_lblk)

From 48f2301c07b299f5bcd64563d90215a8ba73e154 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 6 Sep 2017 16:21:33 -0700
Subject: [PATCH 054/119] hugetlbfs: use pagevec_lookup_range() in
 remove_inode_hugepages()

We want only pages from the given range in remove_inode_hugepages().
Use pagevec_lookup_range() instead of pagevec_lookup().

Link: http://lkml.kernel.org/r/20170726114704.7626-8-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nadia Yvette Chambers <nyc@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b9678ce91e25..8931236f3ef4 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -403,7 +403,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 	struct pagevec pvec;
 	pgoff_t next, index;
 	int i, freed = 0;
-	long lookup_nr = PAGEVEC_SIZE;
 	bool truncate_op = (lend == LLONG_MAX);
 
 	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
@@ -411,31 +410,18 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 	pagevec_init(&pvec, 0);
 	next = start;
 	while (next < end) {
-		/*
-		 * Don't grab more pages than the number left in the range.
-		 */
-		if (end - next < lookup_nr)
-			lookup_nr = end - next;
-
 		/*
 		 * When no more pages are found, we are done.
 		 */
-		if (!pagevec_lookup(&pvec, mapping, &next, lookup_nr))
+		if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1,
+					  PAGEVEC_SIZE))
 			break;
 
 		for (i = 0; i < pagevec_count(&pvec); ++i) {
 			struct page *page = pvec.pages[i];
 			u32 hash;
 
-			/*
-			 * The page (index) could be beyond end.  This is
-			 * only possible in the punch hole case as end is
-			 * max page offset in the truncate case.
-			 */
 			index = page->index;
-			if (index >= end)
-				break;
-
 			hash = hugetlb_fault_mutex_hash(h, current->mm,
 							&pseudo_vma,
 							mapping, index, 0);

From 8338141f0f51325de5fd096e4179d01dac6f3655 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 6 Sep 2017 16:21:37 -0700
Subject: [PATCH 055/119] fs: use pagevec_lookup_range() in
 page_cache_seek_hole_data()

We want only pages from the given range in page_cache_seek_hole_data().
Use pagevec_lookup_range() instead of pagevec_lookup() and remove
unnecessary code.

Note that the check for getting fewer pages than desired can be removed
because the index gets updated by pagevec_lookup_range().

Link: http://lkml.kernel.org/r/20170726114704.7626-9-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 7e531bb356bd..8770b58ca569 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3549,11 +3549,10 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
 	pagevec_init(&pvec, 0);
 
 	do {
-		unsigned want, nr_pages, i;
+		unsigned nr_pages, i;
 
-		want = min_t(unsigned, end - index, PAGEVEC_SIZE);
-		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, &index,
-					  want);
+		nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
+						end - 1, PAGEVEC_SIZE);
 		if (nr_pages == 0)
 			break;
 
@@ -3574,10 +3573,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
 			    lastoff < page_offset(page))
 				goto check_range;
 
-			/* Searching done if the page index is out of range. */
-			if (page->index >= end)
-				goto not_found;
-
 			lock_page(page);
 			if (likely(page->mapping == inode->i_mapping) &&
 			    page_has_buffers(page)) {
@@ -3590,11 +3585,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
 			unlock_page(page);
 			lastoff = page_offset(page) + PAGE_SIZE;
 		}
-
-		/* Searching done if fewer pages returned than wanted. */
-		if (nr_pages < want)
-			break;
-
 		pagevec_release(&pvec);
 	} while (index < end);
 

From f7b68046873724129798c405e1a4e326b409c08f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 6 Sep 2017 16:21:40 -0700
Subject: [PATCH 056/119] mm: use find_get_pages_range() in
 filemap_range_has_page()

We want only pages from the given range in filemap_range_has_page();
furthermore, we want at most a single page.

So use find_get_pages_range() instead of pagevec_lookup() and remove
unnecessary code.

Link: http://lkml.kernel.org/r/20170726114704.7626-10-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 129883f160a7..84617a05db50 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -393,8 +393,7 @@ bool filemap_range_has_page(struct address_space *mapping,
 {
 	pgoff_t index = start_byte >> PAGE_SHIFT;
 	pgoff_t end = end_byte >> PAGE_SHIFT;
-	struct pagevec pvec;
-	bool ret;
+	struct page *page;
 
 	if (end_byte < start_byte)
 		return false;
@@ -402,12 +401,10 @@ bool filemap_range_has_page(struct address_space *mapping,
 	if (mapping->nrpages == 0)
 		return false;
 
-	pagevec_init(&pvec, 0);
-	if (!pagevec_lookup(&pvec, mapping, &index, 1))
+	if (!find_get_pages_range(mapping, &index, end, 1, &page))
 		return false;
-	ret = (pvec.pages[0]->index <= end);
-	pagevec_release(&pvec);
-	return ret;
+	put_page(page);
+	return true;
 }
 EXPORT_SYMBOL(filemap_range_has_page);
 

From 397162ffa2ed1cadffe05c324c6ddc53647f9c62 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 6 Sep 2017 16:21:43 -0700
Subject: [PATCH 057/119] mm: remove nr_pages argument from
 pagevec_lookup{,_range}()

All users of pagevec_lookup() and pagevec_lookup_range() now pass
PAGEVEC_SIZE as the desired number of pages.

Just drop the argument.

Link: http://lkml.kernel.org/r/20170726114704.7626-11-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c             | 5 ++---
 fs/ext4/file.c          | 2 +-
 fs/ext4/inode.c         | 5 ++---
 fs/fscache/page.c       | 2 +-
 fs/hugetlbfs/inode.c    | 3 +--
 fs/nilfs2/page.c        | 2 +-
 include/linux/pagevec.h | 7 +++----
 mm/swap.c               | 5 ++---
 8 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 8770b58ca569..50da0e102ca0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1633,8 +1633,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
 
 	end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
 	pagevec_init(&pvec, 0);
-	while (pagevec_lookup_range(&pvec, bd_mapping, &index, end,
-				    PAGEVEC_SIZE)) {
+	while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
 		count = pagevec_count(&pvec);
 		for (i = 0; i < count; i++) {
 			struct page *page = pvec.pages[i];
@@ -3552,7 +3551,7 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
 		unsigned nr_pages, i;
 
 		nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
-						end - 1, PAGEVEC_SIZE);
+						end - 1);
 		if (nr_pages == 0)
 			break;
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 726344809721..484c1326dd6a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -468,7 +468,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
 		unsigned long nr_pages;
 
 		nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
-					&index, end, PAGEVEC_SIZE);
+					&index, end);
 		if (nr_pages == 0)
 			break;
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6103ce0430df..ce68a3483666 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1720,8 +1720,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
 
 	pagevec_init(&pvec, 0);
 	while (index <= end) {
-		nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end,
-						PAGEVEC_SIZE);
+		nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end);
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
@@ -2348,7 +2347,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
 	pagevec_init(&pvec, 0);
 	while (start <= end) {
 		nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
-						&start, end, PAGEVEC_SIZE);
+						&start, end);
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 83018861dcd2..0ad3fd3ad0b4 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -1178,7 +1178,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
 	pagevec_init(&pvec, 0);
 	next = 0;
 	do {
-		if (!pagevec_lookup(&pvec, mapping, &next, PAGEVEC_SIZE))
+		if (!pagevec_lookup(&pvec, mapping, &next))
 			break;
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8931236f3ef4..7c02b3f738e1 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -413,8 +413,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 		/*
 		 * When no more pages are found, we are done.
 		 */
-		if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1,
-					  PAGEVEC_SIZE))
+		if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1))
 			break;
 
 		for (i = 0; i < pagevec_count(&pvec); ++i) {
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 382a36c72d72..8616c46d33da 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -312,7 +312,7 @@ void nilfs_copy_back_pages(struct address_space *dmap,
 
 	pagevec_init(&pvec, 0);
 repeat:
-	n = pagevec_lookup(&pvec, smap, &index, PAGEVEC_SIZE);
+	n = pagevec_lookup(&pvec, smap, &index);
 	if (!n)
 		return;
 
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 7df056910437..4dcd5506f1ed 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -29,13 +29,12 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec,
 void pagevec_remove_exceptionals(struct pagevec *pvec);
 unsigned pagevec_lookup_range(struct pagevec *pvec,
 			      struct address_space *mapping,
-			      pgoff_t *start, pgoff_t end, unsigned nr_pages);
+			      pgoff_t *start, pgoff_t end);
 static inline unsigned pagevec_lookup(struct pagevec *pvec,
 				      struct address_space *mapping,
-				      pgoff_t *start, unsigned nr_pages)
+				      pgoff_t *start)
 {
-	return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1,
-				    nr_pages);
+	return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1);
 }
 
 unsigned pagevec_lookup_tag(struct pagevec *pvec,
diff --git a/mm/swap.c b/mm/swap.c
index e06e9aa2478e..62d96b8e5eb3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -967,10 +967,9 @@ void pagevec_remove_exceptionals(struct pagevec *pvec)
  * reached.
  */
 unsigned pagevec_lookup_range(struct pagevec *pvec,
-		struct address_space *mapping, pgoff_t *start, pgoff_t end,
-		unsigned nr_pages)
+		struct address_space *mapping, pgoff_t *start, pgoff_t end)
 {
-	pvec->nr = find_get_pages_range(mapping, start, end, nr_pages,
+	pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE,
 					pvec->pages);
 	return pagevec_count(pvec);
 }

From 63677c745d63ba75ef97a7728c85168ddd6c0040 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Wed, 6 Sep 2017 16:21:47 -0700
Subject: [PATCH 058/119] mm, memcg: reset memory.low during memcg offlining

A removed memory cgroup with a defined memory.low and some remaining
pagecache has very low chances of being freed.

If a cgroup has been removed, there is likely no memory pressure inside
it, and its pagecache is protected from external pressure by the
defined low limit.  The cgroup will be freed only after all of its
pages have been reclaimed, and that will not happen as long as there is
other reclaimable memory in the system.  That means there is a good
chance that the cold pagecache will stay in memory for an indefinite
amount of time, wasting system resources.
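
To illustrate why such pagecache is effectively unreclaimable, here is
a simplified sketch of the reclaim-side check (based on the
mem_cgroup_low() helper in this kernel; the scan_control details are
approximate):

	/*
	 * Sketch: while iterating cgroups during reclaim, a cgroup whose
	 * usage is below its memory.low is skipped, so a dead cgroup with
	 * a non-zero low keeps its cold pagecache around indefinitely.
	 */
	if (mem_cgroup_low(root, memcg)) {
		if (!sc->memcg_low_reclaim) {
			sc->memcg_low_skipped = 1;
			continue;	/* protected, don't reclaim */
		}
	}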

This problem was fixed earlier by fa06235b8eb0 ("cgroup: reset css on
destruction"), but that is not the best way to do it, as we can't really
reset all limits/counters during cgroup offlining.

Link: http://lkml.kernel.org/r/20170727130428.28856-1-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e09741af816f..2926c44519b6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4319,6 +4319,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	}
 	spin_unlock(&memcg->event_list_lock);
 
+	memcg->low = 0;
+
 	memcg_offline_kmem(memcg);
 	wb_memcg_offline(memcg);
 

From 65f3975f3584eee2da88b11f06f66e2d39fd30d0 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Wed, 6 Sep 2017 16:21:50 -0700
Subject: [PATCH 059/119] cgroup: revert fa06235b8eb0 ("cgroup: reset css on
 destruction")

Commit fa06235b8eb0 ("cgroup: reset css on destruction") caused the
css_reset callback to be called from the offlining path.  Although it
solves the problem mentioned in the commit description ("For instance,
memory cgroup needs to reset memory.low, otherwise pages charged to a
dead cgroup might never get reclaimed."), generally speaking, it's not
correct.

An offline cgroup can still be a resource domain, and we shouldn't grant
it more resources than it had before deletion.

For instance, if an offline memory cgroup has dirty pages, we should
still apply i/o limits during writeback.

The css_reset callback is designed to return the cgroup to its original
state, which means resetting all limits and counters.  It's something
different from offlining, and we shouldn't use it from the offlining
path.  Instead, we should adjust the necessary settings from the
per-controller css_offline callbacks (e.g. reset memory.low).
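
For reference, both callbacks are members of struct cgroup_subsys; a
rough sketch of the relevant members (list abbreviated):

	struct cgroup_subsys {
		int (*css_online)(struct cgroup_subsys_state *css);
		void (*css_offline)(struct cgroup_subsys_state *css);
		void (*css_released)(struct cgroup_subsys_state *css);
		void (*css_free)(struct cgroup_subsys_state *css);
		/* return the css to its original state (limits, counters) */
		void (*css_reset)(struct cgroup_subsys_state *css);
		...
	};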

Link: http://lkml.kernel.org/r/20170727130428.28856-2-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup/cgroup.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index df2e0f14a95d..f64fc967a9ef 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4100,9 +4100,6 @@ static void offline_css(struct cgroup_subsys_state *css)
 	if (!(css->flags & CSS_ONLINE))
 		return;
 
-	if (ss->css_reset)
-		ss->css_reset(css);
-
 	if (ss->css_offline)
 		ss->css_offline(css);
 

From f907c26a9166a1f4076940b9146d54bd1043db77 Mon Sep 17 00:00:00 2001
From: Arvind Yadav <arvind.yadav.cs@gmail.com>
Date: Wed, 6 Sep 2017 16:21:53 -0700
Subject: [PATCH 060/119] mm/ksm.c: constify attribute_group structures

attribute_group structures are not supposed to change at runtime.  All
functions in <linux/sysfs.h> that work with an attribute_group take a
const attribute_group.  So mark the non-const structs as const.
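
For example (a sketch with made-up names, not code from this patch), a
group can be declared const and still be registered as usual, since
sysfs_create_group() takes a pointer to const:

	static struct attribute *example_attrs[] = {
		&example_attr.attr,
		NULL,
	};

	static const struct attribute_group example_attr_group = {
		.attrs = example_attrs,
		.name  = "example",
	};

	/* int sysfs_create_group(struct kobject *kobj,
	 *			  const struct attribute_group *grp); */
	err = sysfs_create_group(kobj, &example_attr_group);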

Link: http://lkml.kernel.org/r/1501157167-3706-2-git-send-email-arvind.yadav.cs@gmail.com
Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/ksm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index db20f8436bc3..15dd7415f7b3 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -3043,7 +3043,7 @@ static struct attribute *ksm_attrs[] = {
 	NULL,
 };
 
-static struct attribute_group ksm_attr_group = {
+static const struct attribute_group ksm_attr_group = {
 	.attrs = ksm_attrs,
 	.name = "ksm",
 };

From 1fdaaa2329d15559de8446a8d3d4d72635704eab Mon Sep 17 00:00:00 2001
From: Arvind Yadav <arvind.yadav.cs@gmail.com>
Date: Wed, 6 Sep 2017 16:21:56 -0700
Subject: [PATCH 061/119] mm/slub.c: constify attribute_group structures

attribute_group structures are not supposed to change at runtime.  All
functions in <linux/sysfs.h> that work with an attribute_group take a
const attribute_group.  So mark the non-const structs as const.

Link: http://lkml.kernel.org/r/1501157186-3749-1-git-send-email-arvind.yadav.cs@gmail.com
Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/slub.c b/mm/slub.c
index 16a60f871f39..ddb04576b342 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5459,7 +5459,7 @@ static struct attribute *slab_attrs[] = {
 	NULL
 };
 
-static struct attribute_group slab_attr_group = {
+static const struct attribute_group slab_attr_group = {
 	.attrs = slab_attrs,
 };
 

From fd147cbb6a337673cc267358a602d23f085325f6 Mon Sep 17 00:00:00 2001
From: Arvind Yadav <arvind.yadav.cs@gmail.com>
Date: Wed, 6 Sep 2017 16:21:59 -0700
Subject: [PATCH 062/119] mm/page_idle.c: constify attribute_group structures

attribute_group structures are not supposed to change at runtime.  All
functions in <linux/sysfs.h> that work with an attribute_group take a
const attribute_group.  So mark the non-const structs as const.

Link: http://lkml.kernel.org/r/1501157221-3832-1-git-send-email-arvind.yadav.cs@gmail.com
Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_idle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_idle.c b/mm/page_idle.c
index 1b0f48c62316..4bd03a8d809e 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -204,7 +204,7 @@ static struct bin_attribute *page_idle_bin_attrs[] = {
 	NULL,
 };
 
-static struct attribute_group page_idle_attr_group = {
+static const struct attribute_group page_idle_attr_group = {
 	.bin_attrs = page_idle_bin_attrs,
 	.name = "page_idle",
 };

From 8aa95a21bc51205112b3a416810666616287b49b Mon Sep 17 00:00:00 2001
From: Arvind Yadav <arvind.yadav.cs@gmail.com>
Date: Wed, 6 Sep 2017 16:22:03 -0700
Subject: [PATCH 063/119] mm/huge_memory.c: constify attribute_group structures

attribute_group structures are not supposed to change at runtime.  All
functions in <linux/sysfs.h> that work with an attribute_group take a
const attribute_group.  So mark the non-const structs as const.

Link: http://lkml.kernel.org/r/1501157240-3876-1-git-send-email-arvind.yadav.cs@gmail.com
Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3644ff918434..7f0b3f6db923 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -328,7 +328,7 @@ static struct attribute *hugepage_attr[] = {
 	NULL,
 };
 
-static struct attribute_group hugepage_attr_group = {
+static const struct attribute_group hugepage_attr_group = {
 	.attrs = hugepage_attr,
 };
 

From 67e5ed969944d7c6d93f658a188240cf60c49f71 Mon Sep 17 00:00:00 2001
From: Arvind Yadav <arvind.yadav.cs@gmail.com>
Date: Wed, 6 Sep 2017 16:22:06 -0700
Subject: [PATCH 064/119] mm/hugetlb.c: constify attribute_group structures

attribute_group structures are not supposed to change at runtime.  All
functions in <linux/sysfs.h> that work with an attribute_group take a
const attribute_group.  So mark the non-const structs as const.

Link: http://lkml.kernel.org/r/1501157260-3922-1-git-send-email-arvind.yadav.cs@gmail.com
Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1d54a131bdd5..cafd60316e68 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2569,13 +2569,13 @@ static struct attribute *hstate_attrs[] = {
 	NULL,
 };
 
-static struct attribute_group hstate_attr_group = {
+static const struct attribute_group hstate_attr_group = {
 	.attrs = hstate_attrs,
 };
 
 static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
 				    struct kobject **hstate_kobjs,
-				    struct attribute_group *hstate_attr_group)
+				    const struct attribute_group *hstate_attr_group)
 {
 	int retval;
 	int hi = hstate_index(h);
@@ -2633,7 +2633,7 @@ static struct attribute *per_node_hstate_attrs[] = {
 	NULL,
 };
 
-static struct attribute_group per_node_hstate_attr_group = {
+static const struct attribute_group per_node_hstate_attr_group = {
 	.attrs = per_node_hstate_attrs,
 };
 

From 04fecbf51b3cb79a628d50b65797c4866342b8d2 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Wed, 6 Sep 2017 16:22:09 -0700
Subject: [PATCH 065/119] mm: memcontrol: use int for event/state parameter in
 several functions

Several functions use an enum type as the parameter for an event/state,
but are called in some locations with an argument of a different enum
type.  Adjust the interface of these functions to reality by changing
the parameter to int.

This fixes a ton of enum-conversion warnings that are generated when
building the kernel with clang.
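
As an illustration of the warning being silenced (a sketch, not code
from the patch): a helper declared with enum memcg_stat_item is also
called with enum node_stat_item values, which clang reports as an
implicit enum conversion.

	/* Before: parameter declared with one enum type ... */
	static inline void mod_memcg_state(struct mem_cgroup *memcg,
					   enum memcg_stat_item idx, int val);

	/* ... but called with a value of another enum type: */
	mod_memcg_state(memcg, NR_FILE_MAPPED, -1);	/* enum node_stat_item */

	/* After: plain int, documented by a comment */
	/* idx can be of type enum memcg_stat_item or node_stat_item */
	static inline void mod_memcg_state(struct mem_cgroup *memcg,
					   int idx, int val);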

[mka@chromium.org: also change parameter type of inc/dec/mod_memcg_page_state()]
  Link: http://lkml.kernel.org/r/20170728213442.93823-1-mka@chromium.org
Link: http://lkml.kernel.org/r/20170727211004.34435-1-mka@chromium.org
Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Doug Anderson <dianders@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 52 +++++++++++++++++++++++---------------
 mm/memcontrol.c            |  4 ++-
 2 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9b15a4bcfa77..69966c461d1c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -488,8 +488,9 @@ struct mem_cgroup *lock_page_memcg(struct page *page);
 void __unlock_page_memcg(struct mem_cgroup *memcg);
 void unlock_page_memcg(struct page *page);
 
+/* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
-					     enum memcg_stat_item idx)
+					     int idx)
 {
 	long val = 0;
 	int cpu;
@@ -503,15 +504,17 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
 	return val;
 }
 
+/* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void __mod_memcg_state(struct mem_cgroup *memcg,
-				     enum memcg_stat_item idx, int val)
+				     int idx, int val)
 {
 	if (!mem_cgroup_disabled())
 		__this_cpu_add(memcg->stat->count[idx], val);
 }
 
+/* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void mod_memcg_state(struct mem_cgroup *memcg,
-				   enum memcg_stat_item idx, int val)
+				   int idx, int val)
 {
 	if (!mem_cgroup_disabled())
 		this_cpu_add(memcg->stat->count[idx], val);
@@ -535,14 +538,14 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
  * Kernel pages are an exception to this, since they'll never move.
  */
 static inline void __mod_memcg_page_state(struct page *page,
-					  enum memcg_stat_item idx, int val)
+					  int idx, int val)
 {
 	if (page->mem_cgroup)
 		__mod_memcg_state(page->mem_cgroup, idx, val);
 }
 
 static inline void mod_memcg_page_state(struct page *page,
-					enum memcg_stat_item idx, int val)
+					int idx, int val)
 {
 	if (page->mem_cgroup)
 		mod_memcg_state(page->mem_cgroup, idx, val);
@@ -632,8 +635,9 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
 		this_cpu_add(memcg->stat->events[idx], count);
 }
 
+/* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void count_memcg_page_event(struct page *page,
-					  enum memcg_stat_item idx)
+					  int idx)
 {
 	if (page->mem_cgroup)
 		count_memcg_events(page->mem_cgroup, idx, 1);
@@ -846,31 +850,31 @@ static inline bool mem_cgroup_oom_synchronize(bool wait)
 }
 
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
-					     enum memcg_stat_item idx)
+					     int idx)
 {
 	return 0;
 }
 
 static inline void __mod_memcg_state(struct mem_cgroup *memcg,
-				     enum memcg_stat_item idx,
+				     int idx,
 				     int nr)
 {
 }
 
 static inline void mod_memcg_state(struct mem_cgroup *memcg,
-				   enum memcg_stat_item idx,
+				   int idx,
 				   int nr)
 {
 }
 
 static inline void __mod_memcg_page_state(struct page *page,
-					  enum memcg_stat_item idx,
+					  int idx,
 					  int nr)
 {
 }
 
 static inline void mod_memcg_page_state(struct page *page,
-					enum memcg_stat_item idx,
+					int idx,
 					int nr)
 {
 }
@@ -924,7 +928,7 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
 }
 
 static inline void count_memcg_page_event(struct page *page,
-					  enum memcg_stat_item idx)
+					  int idx)
 {
 }
 
@@ -934,26 +938,30 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
 }
 #endif /* CONFIG_MEMCG */
 
+/* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void __inc_memcg_state(struct mem_cgroup *memcg,
-				     enum memcg_stat_item idx)
+				     int idx)
 {
 	__mod_memcg_state(memcg, idx, 1);
 }
 
+/* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void __dec_memcg_state(struct mem_cgroup *memcg,
-				     enum memcg_stat_item idx)
+				     int idx)
 {
 	__mod_memcg_state(memcg, idx, -1);
 }
 
+/* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void __inc_memcg_page_state(struct page *page,
-					  enum memcg_stat_item idx)
+					  int idx)
 {
 	__mod_memcg_page_state(page, idx, 1);
 }
 
+/* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void __dec_memcg_page_state(struct page *page,
-					  enum memcg_stat_item idx)
+					  int idx)
 {
 	__mod_memcg_page_state(page, idx, -1);
 }
@@ -982,26 +990,30 @@ static inline void __dec_lruvec_page_state(struct page *page,
 	__mod_lruvec_page_state(page, idx, -1);
 }
 
+/* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void inc_memcg_state(struct mem_cgroup *memcg,
-				   enum memcg_stat_item idx)
+				   int idx)
 {
 	mod_memcg_state(memcg, idx, 1);
 }
 
+/* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void dec_memcg_state(struct mem_cgroup *memcg,
-				   enum memcg_stat_item idx)
+				   int idx)
 {
 	mod_memcg_state(memcg, idx, -1);
 }
 
+/* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void inc_memcg_page_state(struct page *page,
-					enum memcg_stat_item idx)
+					int idx)
 {
 	mod_memcg_page_state(page, idx, 1);
 }
 
+/* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void dec_memcg_page_state(struct page *page,
-					enum memcg_stat_item idx)
+					int idx)
 {
 	mod_memcg_page_state(page, idx, -1);
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2926c44519b6..3d3f7b1686ac 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -550,10 +550,12 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
  * value, and reading all cpu value can be performance bottleneck in some
  * common workload, threshold and synchronization as vmstat[] should be
  * implemented.
+ *
+ * The parameter idx can be of type enum memcg_event_item or vm_event_item.
  */
 
 static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
-				      enum memcg_event_item event)
+				      int event)
 {
 	unsigned long val = 0;
 	int cpu;

From a3aea839e42ef8d76bb58091ab7f5a45a85ea299 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 6 Sep 2017 16:22:12 -0700
Subject: [PATCH 066/119] mm, THP, swap: support to clear swap cache flag for
 THP swapped out

Patch series "mm, THP, swap: Delay splitting THP after swapped out", v3.

This is the second step of the THP (Transparent Huge Page) swap
optimization.  In the first step, splitting the huge page is delayed
from almost the beginning of swap-out to after the swap space has been
allocated for the THP and the THP has been added to the swap cache.  In
the second step, the split is delayed further, to after swap-out has
finished.  The plan is to delay splitting the THP step by step and
finally avoid splitting it during swap-out altogether, swapping the THP
out and in as a whole.

In this patchset, more of the operations involved in reclaiming an
anonymous THP, such as TLB flushing, writing the THP to the swap
device, and removing the THP from the swap cache, are batched, so that
the performance of anonymous THP swap-out is improved.

During the development, the following scenarios/code paths have been
checked,

 - swap out/in
 - swap off
 - write protect page fault
 - madvise_free
 - process exit
 - split huge page

With the patchset, the swap-out throughput improves by 42% (from about
5.81GB/s to about 8.25GB/s) in the vm-scalability swap-w-seq test case
with 16 processes.  At the same time, the IPI count (which reflects TLB
flushing) is reduced by about 78.9%.  The test was done on a Xeon E5 v3
system.  The swap device used is a RAM-simulated PMEM (persistent
memory) device.  To test sequential swap-out, the test case creates 8
processes which sequentially allocate and write to anonymous pages
until the RAM and part of the swap device are used up.

Below is the part of the cover letter for the first step patchset of THP
swap optimization which applies to all steps.

=========================

Recently, the performance of storage devices has improved so fast that
we cannot saturate the disk bandwidth with a single logical CPU when
doing page swap-out, even on a high-end server machine, because the
performance of the storage device has improved faster than that of a
single logical CPU.  And it seems that this trend will not change in
the near future.  On the other hand, THP is becoming more and more
popular because of increased memory sizes.  So it becomes necessary to
optimize THP swap performance.

The advantages of the THP swap support include:

 - Batch the swap operations for the THP to reduce TLB flushing and lock
   acquiring/releasing, including allocating/freeing the swap space,
   adding/deleting to/from the swap cache, and writing/reading the swap
   space, etc. This will help improve the performance of the THP swap.

 - The THP swap space read/write will be 2M sequential IO. It is
   particularly helpful for swap reads, which are usually 4k random
   IO. This will improve the performance of the THP swap too.

 - It will help with memory fragmentation, especially when THP is
   heavily used by applications. The 2M of contiguous pages will be
   freed up after the THP is swapped out.

 - It will improve THP utilization on systems with swap turned on,
   because the speed at which khugepaged collapses normal pages into a
   THP is quite slow. If the THP is split during swap-out, it will take
   quite a long time for the normal pages to collapse back into a THP
   after being swapped in. High THP utilization also helps the
   efficiency of page based memory management.

There are some concerns regarding THP swap-in, mainly because the
possibly enlarged read/write IO size (for swap in/out) may put more
overhead on the storage device.  To deal with that, THP swap-in should
be turned on only when necessary.

For example, it can be selected via "always/never/madvise" logic, to be
turned on globally, turned off globally, or turned on only for VMA with
MADV_HUGEPAGE, etc.

This patch (of 12):

Previously, swapcache_free_cluster() was used only in the error path of
shrink_page_list() to free the just-allocated swap cluster if the THP
(Transparent Huge Page) failed to be split.  In this patch, it is
enhanced to clear the swap cache flag (SWAP_HAS_CACHE) for a swap
cluster that holds the contents of a swapped-out THP.

This will be used by the support for delaying the THP split until after
swap-out.  Because swapping a THP back in as a whole isn't supported
yet, after clearing the swap cache flag the swap cluster backing the
swapped-out THP will be split, so that the swap slots in the cluster
can be swapped in as normal pages later.
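
The core idea can be summarised with a hypothetical helper (not part of
the patch; SWAPFILE_CLUSTER and SWAP_HAS_CACHE are the existing kernel
constants): the huge swap cluster may be freed as a whole only if every
slot in it is held solely by the swap cache.

	static bool cluster_only_cached(unsigned char *map)
	{
		unsigned int i;

		/* one swap_map[] byte per slot: count plus flag bits */
		for (i = 0; i < SWAPFILE_CLUSTER; i++)
			if (map[i] != SWAP_HAS_CACHE)
				return false;
		return true;
	}

Otherwise the cache reference is dropped slot by slot, as the hunk
below does.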

Link: http://lkml.kernel.org/r/20170724051840.2309-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Ross Zwisler <ross.zwisler@intel.com> [for brd.c, zram_drv.c, pmem.c]
Cc: Vishal L Verma <vishal.l.verma@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swapfile.c | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6ba4aab2db0b..c32e9b23d642 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1168,22 +1168,40 @@ static void swapcache_free_cluster(swp_entry_t entry)
 	struct swap_cluster_info *ci;
 	struct swap_info_struct *si;
 	unsigned char *map;
-	unsigned int i;
+	unsigned int i, free_entries = 0;
+	unsigned char val;
 
-	si = swap_info_get(entry);
+	si = _swap_info_get(entry);
 	if (!si)
 		return;
 
 	ci = lock_cluster(si, offset);
 	map = si->swap_map + offset;
 	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-		VM_BUG_ON(map[i] != SWAP_HAS_CACHE);
-		map[i] = 0;
+		val = map[i];
+		VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+		if (val == SWAP_HAS_CACHE)
+			free_entries++;
+	}
+	if (!free_entries) {
+		for (i = 0; i < SWAPFILE_CLUSTER; i++)
+			map[i] &= ~SWAP_HAS_CACHE;
 	}
 	unlock_cluster(ci);
-	mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
-	swap_free_cluster(si, idx);
-	spin_unlock(&si->lock);
+	if (free_entries == SWAPFILE_CLUSTER) {
+		spin_lock(&si->lock);
+		ci = lock_cluster(si, offset);
+		memset(map, 0, SWAPFILE_CLUSTER);
+		unlock_cluster(ci);
+		mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+		swap_free_cluster(si, idx);
+		spin_unlock(&si->lock);
+	} else if (free_entries) {
+		for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
+			if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
+				free_swap_slot(entry);
+		}
+	}
 }
 #else
 static inline void swapcache_free_cluster(swp_entry_t entry)

From e07098294adfd03d582af7626752255e3d170393 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 6 Sep 2017 16:22:16 -0700
Subject: [PATCH 067/119] mm, THP, swap: support to reclaim swap space for THP
 swapped out

Normally a swap slot can be reclaimed when its swap count reaches
SWAP_HAS_CACHE.  But for a swap slot which is backing a THP, all swap
slots backing that THP must be reclaimed together, because a slot may
be used again when the THP is swapped out again later.  So the swap
slots backing one THP can be reclaimed together only when the swap
count for all of them has reached SWAP_HAS_CACHE.  In this patch, the
functions to check whether the swap count for all swap slots backing
one THP has reached SWAP_HAS_CACHE are implemented and used when
checking whether a swap slot can be reclaimed.

To make it easier to determine whether a swap slot is backing a THP, a
new swap cluster flag named CLUSTER_FLAG_HUGE is added to mark a swap
cluster which is backing a THP (Transparent Huge Page).  Because
swapping a THP in as a whole isn't supported yet, the CLUSTER_FLAG_HUGE
flag will be cleared after the THP is deleted from the swap cache (for
example, when swap-out has finished), so that the normal pages inside
the THP can be swapped in individually.

[ying.huang@intel.com: fix swap_page_trans_huge_swapped on HDD]
  Link: http://lkml.kernel.org/r/874ltsm0bi.fsf@yhuang-dev.intel.com
Link: http://lkml.kernel.org/r/20170724051840.2309-3-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Ross Zwisler <ross.zwisler@intel.com> [for brd.c, zram_drv.c, pmem.c]
Cc: Vishal L Verma <vishal.l.verma@intel.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |  1 +
 mm/swapfile.c        | 78 ++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 72 insertions(+), 7 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index d83d28e53e62..964b4f1fba4a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -188,6 +188,7 @@ struct swap_cluster_info {
 };
 #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
 #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
+#define CLUSTER_FLAG_HUGE 4 /* This cluster is backing a transparent huge page */
 
 /*
  * We assign a cluster to each CPU, so each CPU can allocate swap entry from
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c32e9b23d642..164d9624d7d2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -265,6 +265,16 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
 	info->data = 0;
 }
 
+static inline bool cluster_is_huge(struct swap_cluster_info *info)
+{
+	return info->flags & CLUSTER_FLAG_HUGE;
+}
+
+static inline void cluster_clear_huge(struct swap_cluster_info *info)
+{
+	info->flags &= ~CLUSTER_FLAG_HUGE;
+}
+
 static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
 						     unsigned long offset)
 {
@@ -846,7 +856,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
 	offset = idx * SWAPFILE_CLUSTER;
 	ci = lock_cluster(si, offset);
 	alloc_cluster(si, idx);
-	cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0);
+	cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
 
 	map = si->swap_map + offset;
 	for (i = 0; i < SWAPFILE_CLUSTER; i++)
@@ -1176,6 +1186,7 @@ static void swapcache_free_cluster(swp_entry_t entry)
 		return;
 
 	ci = lock_cluster(si, offset);
+	VM_BUG_ON(!cluster_is_huge(ci));
 	map = si->swap_map + offset;
 	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
 		val = map[i];
@@ -1187,6 +1198,7 @@ static void swapcache_free_cluster(swp_entry_t entry)
 		for (i = 0; i < SWAPFILE_CLUSTER; i++)
 			map[i] &= ~SWAP_HAS_CACHE;
 	}
+	cluster_clear_huge(ci);
 	unlock_cluster(ci);
 	if (free_entries == SWAPFILE_CLUSTER) {
 		spin_lock(&si->lock);
@@ -1350,6 +1362,54 @@ out:
 	return count;
 }
 
+#ifdef CONFIG_THP_SWAP
+static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
+					 swp_entry_t entry)
+{
+	struct swap_cluster_info *ci;
+	unsigned char *map = si->swap_map;
+	unsigned long roffset = swp_offset(entry);
+	unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
+	int i;
+	bool ret = false;
+
+	ci = lock_cluster_or_swap_info(si, offset);
+	if (!ci || !cluster_is_huge(ci)) {
+		if (map[roffset] != SWAP_HAS_CACHE)
+			ret = true;
+		goto unlock_out;
+	}
+	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+		if (map[offset + i] != SWAP_HAS_CACHE) {
+			ret = true;
+			break;
+		}
+	}
+unlock_out:
+	unlock_cluster_or_swap_info(si, ci);
+	return ret;
+}
+
+static bool page_swapped(struct page *page)
+{
+	swp_entry_t entry;
+	struct swap_info_struct *si;
+
+	if (likely(!PageTransCompound(page)))
+		return page_swapcount(page) != 0;
+
+	page = compound_head(page);
+	entry.val = page_private(page);
+	si = _swap_info_get(entry);
+	if (si)
+		return swap_page_trans_huge_swapped(si, entry);
+	return false;
+}
+#else
+#define swap_page_trans_huge_swapped(si, entry)	swap_swapcount(si, entry)
+#define page_swapped(page)			(page_swapcount(page) != 0)
+#endif
+
 /*
  * We can write to an anon page without COW if there are no other references
  * to it.  And as a side-effect, free up its swap: because the old content
@@ -1404,7 +1464,7 @@ int try_to_free_swap(struct page *page)
 		return 0;
 	if (PageWriteback(page))
 		return 0;
-	if (page_swapcount(page))
+	if (page_swapped(page))
 		return 0;
 
 	/*
@@ -1425,6 +1485,7 @@ int try_to_free_swap(struct page *page)
 	if (pm_suspended_storage())
 		return 0;
 
+	page = compound_head(page);
 	delete_from_swap_cache(page);
 	SetPageDirty(page);
 	return 1;
@@ -1446,7 +1507,8 @@ int free_swap_and_cache(swp_entry_t entry)
 	p = _swap_info_get(entry);
 	if (p) {
 		count = __swap_entry_free(p, entry, 1);
-		if (count == SWAP_HAS_CACHE) {
+		if (count == SWAP_HAS_CACHE &&
+		    !swap_page_trans_huge_swapped(p, entry)) {
 			page = find_get_page(swap_address_space(entry),
 					     swp_offset(entry));
 			if (page && !trylock_page(page)) {
@@ -1463,7 +1525,8 @@ int free_swap_and_cache(swp_entry_t entry)
 		 */
 		if (PageSwapCache(page) && !PageWriteback(page) &&
 		    (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
-		    !swap_swapcount(p, entry)) {
+		    !swap_page_trans_huge_swapped(p, entry)) {
+			page = compound_head(page);
 			delete_from_swap_cache(page);
 			SetPageDirty(page);
 		}
@@ -2017,7 +2080,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
 				.sync_mode = WB_SYNC_NONE,
 			};
 
-			swap_writepage(page, &wbc);
+			swap_writepage(compound_head(page), &wbc);
 			lock_page(page);
 			wait_on_page_writeback(page);
 		}
@@ -2030,8 +2093,9 @@ int try_to_unuse(unsigned int type, bool frontswap,
 		 * delete, since it may not have been written out to swap yet.
 		 */
 		if (PageSwapCache(page) &&
-		    likely(page_private(page) == entry.val))
-			delete_from_swap_cache(page);
+		    likely(page_private(page) == entry.val) &&
+		    !page_swapped(page))
+			delete_from_swap_cache(compound_head(page));
 
 		/*
 		 * So we could skip searching mms once swap count went

From ba3c4ce6def4915093be80585ff69f780630f32f Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 6 Sep 2017 16:22:19 -0700
Subject: [PATCH 068/119] mm, THP, swap: make reuse_swap_page() works for THP
 swapped out

After adding support for delaying the THP (Transparent Huge Page) split
until after swap-out, it is possible that some page table mappings of
the THP have been turned into swap entries.  So reuse_swap_page() needs
to check the swap count in addition to the map count it checked before.
This patch does that.

In the huge PMD write-protect fault handler, the swap count needs to be
checked in addition to the page map count, so the page lock needs to be
acquired as well (in addition to the page table lock) when calling
reuse_swap_page().

[ying.huang@intel.com: silence a compiler warning]
  Link: http://lkml.kernel.org/r/87bmnzizjy.fsf@yhuang-dev.intel.com
Link: http://lkml.kernel.org/r/20170724051840.2309-4-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Ross Zwisler <ross.zwisler@intel.com> [for brd.c, zram_drv.c, pmem.c]
Cc: Vishal L Verma <vishal.l.verma@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |   4 +-
 mm/huge_memory.c     |  16 ++++++-
 mm/memory.c          |   6 +--
 mm/swapfile.c        | 102 +++++++++++++++++++++++++++++++++++++++----
 4 files changed, 113 insertions(+), 15 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 964b4f1fba4a..7176ba780e83 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -510,8 +510,8 @@ static inline int swp_swapcount(swp_entry_t entry)
 	return 0;
 }
 
-#define reuse_swap_page(page, total_mapcount) \
-	(page_trans_huge_mapcount(page, total_mapcount) == 1)
+#define reuse_swap_page(page, total_map_swapcount) \
+	(page_trans_huge_mapcount(page, total_map_swapcount) == 1)
 
 static inline int try_to_free_swap(struct page *page)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7f0b3f6db923..02dfe635c9fe 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1240,15 +1240,29 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 	 * We can only reuse the page if nobody else maps the huge page or it's
 	 * part.
 	 */
-	if (page_trans_huge_mapcount(page, NULL) == 1) {
+	if (!trylock_page(page)) {
+		get_page(page);
+		spin_unlock(vmf->ptl);
+		lock_page(page);
+		spin_lock(vmf->ptl);
+		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+			unlock_page(page);
+			put_page(page);
+			goto out_unlock;
+		}
+		put_page(page);
+	}
+	if (reuse_swap_page(page, NULL)) {
 		pmd_t entry;
 		entry = pmd_mkyoung(orig_pmd);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
 			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
 		ret |= VM_FAULT_WRITE;
+		unlock_page(page);
 		goto out_unlock;
 	}
+	unlock_page(page);
 	get_page(page);
 	spin_unlock(vmf->ptl);
 alloc:
diff --git a/mm/memory.c b/mm/memory.c
index 1416485e278c..3dd8bb46391b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2619,7 +2619,7 @@ static int do_wp_page(struct vm_fault *vmf)
 	 * not dirty accountable.
 	 */
 	if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
-		int total_mapcount;
+		int total_map_swapcount;
 		if (!trylock_page(vmf->page)) {
 			get_page(vmf->page);
 			pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2634,8 +2634,8 @@ static int do_wp_page(struct vm_fault *vmf)
 			}
 			put_page(vmf->page);
 		}
-		if (reuse_swap_page(vmf->page, &total_mapcount)) {
-			if (total_mapcount == 1) {
+		if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
+			if (total_map_swapcount == 1) {
 				/*
 				 * The page is all ours. Move it to
 				 * our anon_vma so the rmap code will
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 164d9624d7d2..2bfbfb87123a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1405,9 +1405,89 @@ static bool page_swapped(struct page *page)
 		return swap_page_trans_huge_swapped(si, entry);
 	return false;
 }
+
+static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
+					 int *total_swapcount)
+{
+	int i, map_swapcount, _total_mapcount, _total_swapcount;
+	unsigned long offset = 0;
+	struct swap_info_struct *si;
+	struct swap_cluster_info *ci = NULL;
+	unsigned char *map = NULL;
+	int mapcount, swapcount = 0;
+
+	/* hugetlbfs shouldn't call it */
+	VM_BUG_ON_PAGE(PageHuge(page), page);
+
+	if (likely(!PageTransCompound(page))) {
+		mapcount = atomic_read(&page->_mapcount) + 1;
+		if (total_mapcount)
+			*total_mapcount = mapcount;
+		if (PageSwapCache(page))
+			swapcount = page_swapcount(page);
+		if (total_swapcount)
+			*total_swapcount = swapcount;
+		return mapcount + swapcount;
+	}
+
+	page = compound_head(page);
+
+	_total_mapcount = _total_swapcount = map_swapcount = 0;
+	if (PageSwapCache(page)) {
+		swp_entry_t entry;
+
+		entry.val = page_private(page);
+		si = _swap_info_get(entry);
+		if (si) {
+			map = si->swap_map;
+			offset = swp_offset(entry);
+		}
+	}
+	if (map)
+		ci = lock_cluster(si, offset);
+	for (i = 0; i < HPAGE_PMD_NR; i++) {
+		mapcount = atomic_read(&page[i]._mapcount) + 1;
+		_total_mapcount += mapcount;
+		if (map) {
+			swapcount = swap_count(map[offset + i]);
+			_total_swapcount += swapcount;
+		}
+		map_swapcount = max(map_swapcount, mapcount + swapcount);
+	}
+	unlock_cluster(ci);
+	if (PageDoubleMap(page)) {
+		map_swapcount -= 1;
+		_total_mapcount -= HPAGE_PMD_NR;
+	}
+	mapcount = compound_mapcount(page);
+	map_swapcount += mapcount;
+	_total_mapcount += mapcount;
+	if (total_mapcount)
+		*total_mapcount = _total_mapcount;
+	if (total_swapcount)
+		*total_swapcount = _total_swapcount;
+
+	return map_swapcount;
+}
 #else
 #define swap_page_trans_huge_swapped(si, entry)	swap_swapcount(si, entry)
 #define page_swapped(page)			(page_swapcount(page) != 0)
+
+static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
+					 int *total_swapcount)
+{
+	int mapcount, swapcount = 0;
+
+	/* hugetlbfs shouldn't call it */
+	VM_BUG_ON_PAGE(PageHuge(page), page);
+
+	mapcount = page_trans_huge_mapcount(page, total_mapcount);
+	if (PageSwapCache(page))
+		swapcount = page_swapcount(page);
+	if (total_swapcount)
+		*total_swapcount = swapcount;
+	return mapcount + swapcount;
+}
 #endif
 
 /*
@@ -1416,23 +1496,27 @@ static bool page_swapped(struct page *page)
  * on disk will never be read, and seeking back there to write new content
  * later would only waste time away from clustering.
  *
- * NOTE: total_mapcount should not be relied upon by the caller if
+ * NOTE: total_map_swapcount should not be relied upon by the caller if
  * reuse_swap_page() returns false, but it may be always overwritten
  * (see the other implementation for CONFIG_SWAP=n).
  */
-bool reuse_swap_page(struct page *page, int *total_mapcount)
+bool reuse_swap_page(struct page *page, int *total_map_swapcount)
 {
-	int count;
+	int count, total_mapcount, total_swapcount;
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	if (unlikely(PageKsm(page)))
 		return false;
-	count = page_trans_huge_mapcount(page, total_mapcount);
-	if (count <= 1 && PageSwapCache(page)) {
-		count += page_swapcount(page);
-		if (count != 1)
-			goto out;
+	count = page_trans_huge_map_swapcount(page, &total_mapcount,
+					      &total_swapcount);
+	if (total_map_swapcount)
+		*total_map_swapcount = total_mapcount + total_swapcount;
+	if (count == 1 && PageSwapCache(page) &&
+	    (likely(!PageTransCompound(page)) ||
+	     /* The remaining swap count will be freed soon */
+	     total_swapcount == page_swapcount(page))) {
 		if (!PageWriteback(page)) {
+			page = compound_head(page);
 			delete_from_swap_cache(page);
 			SetPageDirty(page);
 		} else {
@@ -1448,7 +1532,7 @@ bool reuse_swap_page(struct page *page, int *total_mapcount)
 			spin_unlock(&p->lock);
 		}
 	}
-out:
+
 	return count <= 1;
 }
 

From f0eea189e8e969b66e03bac8a7d92888ba267854 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 6 Sep 2017 16:22:23 -0700
Subject: [PATCH 069/119] mm, THP, swap: don't allocate huge cluster for file
 backed swap device

It's hard to write a whole transparent huge page (THP) to a file-backed
swap device during swap-out, and file-backed swap devices aren't very
popular.  So huge cluster allocation for file-backed swap devices is
disabled.

Link: http://lkml.kernel.org/r/20170724051840.2309-5-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Ross Zwisler <ross.zwisler@intel.com> [for brd.c, zram_drv.c, pmem.c]
Cc: Vishal L Verma <vishal.l.verma@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swapfile.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2bfbfb87123a..267b1fe41844 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -948,9 +948,10 @@ start_over:
 			spin_unlock(&si->lock);
 			goto nextsi;
 		}
-		if (cluster)
-			n_ret = swap_alloc_cluster(si, swp_entries);
-		else
+		if (cluster) {
+			if (!(si->flags & SWP_FILE))
+				n_ret = swap_alloc_cluster(si, swp_entries);
+		} else
 			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
 						    n_goal, swp_entries);
 		spin_unlock(&si->lock);

From 98cc093cba1e925eb34963dedb5f1684f1bdb2f4 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 6 Sep 2017 16:22:27 -0700
Subject: [PATCH 070/119] block, THP: make block_device_operations.rw_page
 support THP

The .rw_page method in struct block_device_operations is used by the
swap subsystem to read/write the page contents from/into the
corresponding swap slot in the swap device.  To support the THP
(Transparent Huge Page) swap optimization, .rw_page is enhanced to
read/write a THP if possible.
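
For reference, the hook being extended looks roughly like this in this
kernel (sketch of the relevant member of struct
block_device_operations):

	struct block_device_operations {
		...
		/* read or write one page (now possibly a THP) at 'sector' */
		int (*rw_page)(struct block_device *bdev, sector_t sector,
			       struct page *page, bool is_write);
		...
	};

Drivers that cannot handle a THP simply return -ENOTSUPP, as the brd
and zram hunks below do.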

Link: http://lkml.kernel.org/r/20170724051840.2309-6-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Reviewed-by: Ross Zwisler <ross.zwisler@intel.com> [for brd.c, zram_drv.c, pmem.c]
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Vishal L Verma <vishal.l.verma@intel.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/brd.c           |  6 +++++-
 drivers/block/zram/zram_drv.c |  2 ++
 drivers/nvdimm/btt.c          |  4 +++-
 drivers/nvdimm/pmem.c         | 37 ++++++++++++++++++++++++++---------
 4 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 104b71c0490d..5d9ed0616413 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -326,7 +326,11 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 		       struct page *page, bool is_write)
 {
 	struct brd_device *brd = bdev->bd_disk->private_data;
-	int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
+	int err;
+
+	if (PageTransHuge(page))
+		return -ENOTSUPP;
+	err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
 	page_endio(page, is_write, err);
 	return err;
 }
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 4ddc18a0a04f..4a0438c4ef2a 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1285,6 +1285,8 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
 	struct zram *zram;
 	struct bio_vec bv;
 
+	if (PageTransHuge(page))
+		return -ENOTSUPP;
 	zram = bdev->bd_disk->private_data;
 
 	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 14323faf8bd9..60491641a8d6 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1241,8 +1241,10 @@ static int btt_rw_page(struct block_device *bdev, sector_t sector,
 {
 	struct btt *btt = bdev->bd_disk->private_data;
 	int rc;
+	unsigned int len;
 
-	rc = btt_do_bvec(btt, NULL, page, PAGE_SIZE, 0, is_write, sector);
+	len = hpage_nr_pages(page) * PAGE_SIZE;
+	rc = btt_do_bvec(btt, NULL, page, len, 0, is_write, sector);
 	if (rc == 0)
 		page_endio(page, is_write, 0);
 
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index f7099adaabc0..e9aa453da50c 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -80,22 +80,40 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
 static void write_pmem(void *pmem_addr, struct page *page,
 		unsigned int off, unsigned int len)
 {
-	void *mem = kmap_atomic(page);
+	unsigned int chunk;
+	void *mem;
 
-	memcpy_flushcache(pmem_addr, mem + off, len);
-	kunmap_atomic(mem);
+	while (len) {
+		mem = kmap_atomic(page);
+		chunk = min_t(unsigned int, len, PAGE_SIZE);
+		memcpy_flushcache(pmem_addr, mem + off, chunk);
+		kunmap_atomic(mem);
+		len -= chunk;
+		off = 0;
+		page++;
+		pmem_addr += PAGE_SIZE;
+	}
 }
 
 static blk_status_t read_pmem(struct page *page, unsigned int off,
 		void *pmem_addr, unsigned int len)
 {
+	unsigned int chunk;
 	int rc;
-	void *mem = kmap_atomic(page);
+	void *mem;
 
-	rc = memcpy_mcsafe(mem + off, pmem_addr, len);
-	kunmap_atomic(mem);
-	if (rc)
-		return BLK_STS_IOERR;
+	while (len) {
+		mem = kmap_atomic(page);
+		chunk = min_t(unsigned int, len, PAGE_SIZE);
+		rc = memcpy_mcsafe(mem + off, pmem_addr, chunk);
+		kunmap_atomic(mem);
+		if (rc)
+			return BLK_STS_IOERR;
+		len -= chunk;
+		off = 0;
+		page++;
+		pmem_addr += PAGE_SIZE;
+	}
 	return BLK_STS_OK;
 }
 
@@ -188,7 +206,8 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 	struct pmem_device *pmem = bdev->bd_queue->queuedata;
 	blk_status_t rc;
 
-	rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, is_write, sector);
+	rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
+			  0, is_write, sector);
 
 	/*
 	 * The ->rw_page interface is subtle and tricky.  The core

From 225311a46411c37e20e73d99f4382f141e12f6f9 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 6 Sep 2017 16:22:30 -0700
Subject: [PATCH 071/119] mm: test code to write THP to swap device as a whole

To support delaying the THP (Transparent Huge Page) split until after
swap-out, we need to enhance the swap writing code to write a THP as a
whole.  This will improve swap write IO performance.

As Ming Lei <ming.lei@redhat.com> pointed out, this should be based on
multipage bvec support, which hasn't been merged yet.  So this patch is
only for testing the functionality of the other patches in the series,
and will be reimplemented after multipage bvec support is merged.

Link: http://lkml.kernel.org/r/20170724051840.2309-7-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Ross Zwisler <ross.zwisler@intel.com> [for brd.c, zram_drv.c, pmem.c]
Cc: Shaohua Li <shli@kernel.org>
Cc: Vishal L Verma <vishal.l.verma@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bio.h           |  8 ++++++++
 include/linux/page-flags.h    |  4 ++--
 include/linux/vm_event_item.h |  1 +
 mm/page_io.c                  | 21 ++++++++++++++++-----
 mm/vmstat.c                   |  1 +
 5 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7b1cf4ba0902..1f0720de8990 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -38,7 +38,15 @@
 #define BIO_BUG_ON
 #endif
 
+#ifdef CONFIG_THP_SWAP
+#if HPAGE_PMD_NR > 256
+#define BIO_MAX_PAGES		HPAGE_PMD_NR
+#else
 #define BIO_MAX_PAGES		256
+#endif
+#else
+#define BIO_MAX_PAGES		256
+#endif
 
 #define bio_prio(bio)			(bio)->bi_ioprio
 #define bio_set_prio(bio, prio)		((bio)->bi_ioprio = prio)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index d33e3280c8ad..ba2d470d2d0a 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -303,8 +303,8 @@ PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
  * Only test-and-set exist for PG_writeback.  The unconditional operators are
  * risky: they bypass page accounting.
  */
-TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND)
-	TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND)
+TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL)
+	TESTSCFLAG(Writeback, writeback, PF_NO_TAIL)
 PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)
 
 /* PG_readahead is only used for reads; PG_reclaim is only for writes */
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 37e8d31a4632..c75024e80eed 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -85,6 +85,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #endif
 		THP_ZERO_PAGE_ALLOC,
 		THP_ZERO_PAGE_ALLOC_FAILED,
+		THP_SWPOUT,
 #endif
 #ifdef CONFIG_MEMORY_BALLOON
 		BALLOON_INFLATE,
diff --git a/mm/page_io.c b/mm/page_io.c
index 5f61b54ee1f3..20139b90125a 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -28,16 +28,18 @@
 static struct bio *get_swap_bio(gfp_t gfp_flags,
 				struct page *page, bio_end_io_t end_io)
 {
+	int i, nr = hpage_nr_pages(page);
 	struct bio *bio;
 
-	bio = bio_alloc(gfp_flags, 1);
+	bio = bio_alloc(gfp_flags, nr);
 	if (bio) {
 		bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
 		bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
 		bio->bi_end_io = end_io;
 
-		bio_add_page(bio, page, PAGE_SIZE, 0);
-		BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE);
+		for (i = 0; i < nr; i++)
+			bio_add_page(bio, page + i, PAGE_SIZE, 0);
+		VM_BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE * nr);
 	}
 	return bio;
 }
@@ -262,6 +264,15 @@ static sector_t swap_page_sector(struct page *page)
 	return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
 }
 
+static inline void count_swpout_vm_event(struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (unlikely(PageTransHuge(page)))
+		count_vm_event(THP_SWPOUT);
+#endif
+	count_vm_events(PSWPOUT, hpage_nr_pages(page));
+}
+
 int __swap_writepage(struct page *page, struct writeback_control *wbc,
 		bio_end_io_t end_write_func)
 {
@@ -313,7 +324,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 
 	ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
 	if (!ret) {
-		count_vm_event(PSWPOUT);
+		count_swpout_vm_event(page);
 		return 0;
 	}
 
@@ -326,7 +337,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 		goto out;
 	}
 	bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
-	count_vm_event(PSWPOUT);
+	count_swpout_vm_event(page);
 	set_page_writeback(page);
 	unlock_page(page);
 	submit_bio(bio);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9a4441bbeef2..bccf426453cd 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1071,6 +1071,7 @@ const char * const vmstat_text[] = {
 #endif
 	"thp_zero_page_alloc",
 	"thp_zero_page_alloc_failed",
+	"thp_swpout",
 #endif
 #ifdef CONFIG_MEMORY_BALLOON
 	"balloon_inflate",

From 59807685a7e77e8c8fe5925613968841538d53d7 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 6 Sep 2017 16:22:34 -0700
Subject: [PATCH 072/119] mm, THP, swap: support splitting THP for THP swap out

After adding swap-out support for THP (Transparent Huge Page), it is
possible that a THP in the swap cache (partly swapped out) needs to be
split.  To split such a THP, the swap cluster backing the THP needs to
be split too, that is, the CLUSTER_FLAG_HUGE flag needs to be cleared
for the swap cluster.  This patch implements that.

Because writing the THP to swap requires that it stay a huge page for
the duration of the write, the PageWriteback flag is checked before
splitting.

Link: http://lkml.kernel.org/r/20170724051840.2309-8-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Ross Zwisler <ross.zwisler@intel.com> [for brd.c, zram_drv.c, pmem.c]
Cc: Vishal L Verma <vishal.l.verma@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |  9 +++++++++
 mm/huge_memory.c     | 10 +++++++++-
 mm/swapfile.c        | 15 +++++++++++++++
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7176ba780e83..461cf107ad52 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -527,6 +527,15 @@ static inline swp_entry_t get_swap_page(struct page *page)
 
 #endif /* CONFIG_SWAP */
 
+#ifdef CONFIG_THP_SWAP
+extern int split_swap_cluster(swp_entry_t entry);
+#else
+static inline int split_swap_cluster(swp_entry_t entry)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_MEMCG
 static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 02dfe635c9fe..772048c233d1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2481,6 +2481,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
+	if (PageWriteback(page))
+		return -EBUSY;
+
 	if (PageAnon(head)) {
 		/*
 		 * The caller does not necessarily hold an mmap_sem that would
@@ -2558,7 +2561,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 			__dec_node_page_state(page, NR_SHMEM_THPS);
 		spin_unlock(&pgdata->split_queue_lock);
 		__split_huge_page(page, list, flags);
-		ret = 0;
+		if (PageSwapCache(head)) {
+			swp_entry_t entry = { .val = page_private(head) };
+
+			ret = split_swap_cluster(entry);
+		} else
+			ret = 0;
 	} else {
 		if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
 			pr_alert("total_mapcount: %u, page_count(): %u\n",
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 267b1fe41844..42eff9e4e972 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1216,6 +1216,21 @@ static void swapcache_free_cluster(swp_entry_t entry)
 		}
 	}
 }
+
+int split_swap_cluster(swp_entry_t entry)
+{
+	struct swap_info_struct *si;
+	struct swap_cluster_info *ci;
+	unsigned long offset = swp_offset(entry);
+
+	si = _swap_info_get(entry);
+	if (!si)
+		return -EBUSY;
+	ci = lock_cluster(si, offset);
+	cluster_clear_huge(ci);
+	unlock_cluster(ci);
+	return 0;
+}
 #else
 static inline void swapcache_free_cluster(swp_entry_t entry)
 {

From 3e14a57b2416b7c94189b95baffd673cf5e0d0a3 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 6 Sep 2017 16:22:37 -0700
Subject: [PATCH 073/119] memcg, THP, swap: support move mem cgroup charge for
 THP swapped out

A PTE-mapped THP (Transparent Huge Page) is ignored when moving a
memory cgroup charge.  But for a THP which is in the swap cache, the
current implementation may move the memory cgroup charge for the swap
entry of a tail page.  That isn't correct, because the swap charges for
all sub-pages of a THP should be moved together.  Following the
handling of PTE-mapped THPs, the memory cgroup charge move for the swap
entry of a tail page of a THP is now ignored too.

Link: http://lkml.kernel.org/r/20170724051840.2309-9-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Rik van Riel <riel@redhat.com>
Cc: Ross Zwisler <ross.zwisler@intel.com> [for brd.c, zram_drv.c, pmem.c]
Cc: Shaohua Li <shli@kernel.org>
Cc: Vishal L Verma <vishal.l.verma@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3d3f7b1686ac..2ae315e4d9f6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4639,8 +4639,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		if (!ret || !target)
 			put_page(page);
 	}
-	/* There is a swap entry and a page doesn't exist or isn't charged */
-	if (ent.val && !ret &&
+	/*
+	 * There is a swap entry and a page doesn't exist or isn't charged.
+	 * But we cannot move a tail-page in a THP.
+	 */
+	if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
 	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
 		ret = MC_TARGET_SWAP;
 		if (target)

From abe2895b76047bf5430990f2584cd91f76692218 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 6 Sep 2017 16:22:41 -0700
Subject: [PATCH 074/119] memcg, THP, swap: avoid duplicated charge for THP in
 swap cache

For a THP (Transparent Huge Page), tail_page->mem_cgroup is NULL, so to
check whether the page is already charged we need to check the head
page.  This was not an issue before because a THP could never be in the
swap cache.  But now that we delay splitting a THP until after it has
been swapped out, it is possible.

Link: http://lkml.kernel.org/r/20170724051840.2309-10-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Rik van Riel <riel@redhat.com>
Cc: Ross Zwisler <ross.zwisler@intel.com> [for brd.c, zram_drv.c, pmem.c]
Cc: Shaohua Li <shli@kernel.org>
Cc: Vishal L Verma <vishal.l.verma@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2ae315e4d9f6..e7f47a38fe4f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5430,7 +5430,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 		 * in turn serializes uncharging.
 		 */
 		VM_BUG_ON_PAGE(!PageLocked(page), page);
-		if (page->mem_cgroup)
+		if (compound_head(page)->mem_cgroup)
 			goto out;
 
 		if (do_swap_account) {

From d6810d730022016d9c0f389452b86b035dba1492 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 6 Sep 2017 16:22:45 -0700
Subject: [PATCH 075/119] memcg, THP, swap: make mem_cgroup_swapout() support
 THP

This patch makes mem_cgroup_swapout() work for transparent huge pages
(THP), moving the memory cgroup charge from memory to swap for the
whole THP.

This will be used for THP swap support, where a THP may be swapped out
as a whole to a set of (HPAGE_PMD_NR) contiguous swap slots on the swap
device.

Link: http://lkml.kernel.org/r/20170724051840.2309-11-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Rik van Riel <riel@redhat.com>
Cc: Ross Zwisler <ross.zwisler@intel.com> [for brd.c, zram_drv.c, pmem.c]
Cc: Shaohua Li <shli@kernel.org>
Cc: Vishal L Verma <vishal.l.verma@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e7f47a38fe4f..c1f9b79817d7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4654,8 +4654,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
- * We don't consider swapping or file mapped pages because THP does not
- * support them for now.
+ * We don't consider PMD mapped swapping or file mapped pages because THP does
+ * not support them for now.
  * Caller should make sure that pmd_trans_huge(pmd) is true.
  */
 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
@@ -5913,6 +5913,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
 void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 {
 	struct mem_cgroup *memcg, *swap_memcg;
+	unsigned int nr_entries;
 	unsigned short oldid;
 
 	VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5933,19 +5934,24 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	 * ancestor for the swap instead and transfer the memory+swap charge.
 	 */
 	swap_memcg = mem_cgroup_id_get_online(memcg);
-	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1);
+	nr_entries = hpage_nr_pages(page);
+	/* Get references for the tail pages, too */
+	if (nr_entries > 1)
+		mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
+	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
+				   nr_entries);
 	VM_BUG_ON_PAGE(oldid, page);
-	mem_cgroup_swap_statistics(swap_memcg, 1);
+	mem_cgroup_swap_statistics(swap_memcg, nr_entries);
 
 	page->mem_cgroup = NULL;
 
 	if (!mem_cgroup_is_root(memcg))
-		page_counter_uncharge(&memcg->memory, 1);
+		page_counter_uncharge(&memcg->memory, nr_entries);
 
 	if (memcg != swap_memcg) {
 		if (!mem_cgroup_is_root(swap_memcg))
-			page_counter_charge(&swap_memcg->memsw, 1);
-		page_counter_uncharge(&memcg->memsw, 1);
+			page_counter_charge(&swap_memcg->memsw, nr_entries);
+		page_counter_uncharge(&memcg->memsw, nr_entries);
 	}
 
 	/*
@@ -5955,7 +5961,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	 * only synchronisation we have for udpating the per-CPU variables.
 	 */
 	VM_BUG_ON(!irqs_disabled());
-	mem_cgroup_charge_statistics(memcg, page, false, -1);
+	mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
+				     -nr_entries);
 	memcg_check_events(memcg, page);
 
 	if (!mem_cgroup_is_root(memcg))

From bd4c82c22c367e068acb1ec9ec02be2fac3e09e2 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 6 Sep 2017 16:22:49 -0700
Subject: [PATCH 076/119] mm, THP, swap: delay splitting THP after swapped out

In this patch, splitting a transparent huge page (THP) during swap-out
is delayed from right after adding the THP to the swap cache until after
the swap-out finishes.  With this change, more of the operations
involved in reclaiming an anonymous THP, such as writing the THP to the
swap device and removing it from the swap cache, can be batched, which
improves the performance of anonymous THP swap-out.

This is the second step of the THP swap support.  The plan is to delay
splitting the THP step by step and eventually avoid splitting it at all.

With the patchset, the swap-out throughput improves by 42% (from about
5.81GB/s to about 8.25GB/s) in the vm-scalability swap-w-seq test case
with 16 processes.  At the same time, the IPI count (which reflects TLB
flushing) is reduced by about 78.9%.  The test was done on a Xeon E5 v3
system, with a RAM-simulated PMEM (persistent memory) device as the swap
device.  To test sequential swap-out, the test case creates 8 processes
which sequentially allocate and write to anonymous pages until the RAM
and part of the swap device are used up.
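
For reference, the access pattern of the test processes boils down to a
sequential anonymous writer.  The sketch below is illustrative only and
is not the vm-scalability source; the chunk size is an arbitrary
assumption:

#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	size_t chunk = 1UL << 30;	/* 1GiB per step, illustrative only */
	char *buf;

	/* Sequentially allocate and dirty anonymous memory so that
	 * reclaim eventually has to swap (possibly THP) pages out. */
	while ((buf = malloc(chunk)) != NULL)
		memset(buf, 0xa5, chunk);

	pause();			/* keep the memory resident/swapped */
	return 0;
}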

Link: http://lkml.kernel.org/r/20170724051840.2309-12-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Ross Zwisler <ross.zwisler@intel.com> [for brd.c, zram_drv.c, pmem.c]
Cc: Vishal L Verma <vishal.l.verma@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 95 +++++++++++++++++++++++++++++------------------------
 1 file changed, 52 insertions(+), 43 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1638814c7848..6fbf707c0ce2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -536,7 +536,9 @@ static inline int is_page_cache_freeable(struct page *page)
 	 * that isolated the page, the page cache radix tree and
 	 * optional buffer heads at page->private.
 	 */
-	return page_count(page) - page_has_private(page) == 2;
+	int radix_pins = PageTransHuge(page) && PageSwapCache(page) ?
+		HPAGE_PMD_NR : 1;
+	return page_count(page) - page_has_private(page) == 1 + radix_pins;
 }
 
 static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
@@ -666,6 +668,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 			    bool reclaimed)
 {
 	unsigned long flags;
+	int refcount;
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(mapping != page_mapping(page));
@@ -696,11 +699,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 	 * Note that if SetPageDirty is always performed via set_page_dirty,
 	 * and thus under tree_lock, then this ordering is not required.
 	 */
-	if (!page_ref_freeze(page, 2))
+	if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
+		refcount = 1 + HPAGE_PMD_NR;
+	else
+		refcount = 2;
+	if (!page_ref_freeze(page, refcount))
 		goto cannot_free;
 	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
 	if (unlikely(PageDirty(page))) {
-		page_ref_unfreeze(page, 2);
+		page_ref_unfreeze(page, refcount);
 		goto cannot_free;
 	}
 
@@ -1122,58 +1129,56 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * Try to allocate it some swap space here.
 		 * Lazyfree page could be freed directly
 		 */
-		if (PageAnon(page) && PageSwapBacked(page) &&
-		    !PageSwapCache(page)) {
-			if (!(sc->gfp_mask & __GFP_IO))
-				goto keep_locked;
-			if (PageTransHuge(page)) {
-				/* cannot split THP, skip it */
-				if (!can_split_huge_page(page, NULL))
-					goto activate_locked;
-				/*
-				 * Split pages without a PMD map right
-				 * away. Chances are some or all of the
-				 * tail pages can be freed without IO.
-				 */
-				if (!compound_mapcount(page) &&
-				    split_huge_page_to_list(page, page_list))
-					goto activate_locked;
-			}
-			if (!add_to_swap(page)) {
-				if (!PageTransHuge(page))
-					goto activate_locked;
-				/* Split THP and swap individual base pages */
-				if (split_huge_page_to_list(page, page_list))
-					goto activate_locked;
-				if (!add_to_swap(page))
-					goto activate_locked;
-			}
+		if (PageAnon(page) && PageSwapBacked(page)) {
+			if (!PageSwapCache(page)) {
+				if (!(sc->gfp_mask & __GFP_IO))
+					goto keep_locked;
+				if (PageTransHuge(page)) {
+					/* cannot split THP, skip it */
+					if (!can_split_huge_page(page, NULL))
+						goto activate_locked;
+					/*
+					 * Split pages without a PMD map right
+					 * away. Chances are some or all of the
+					 * tail pages can be freed without IO.
+					 */
+					if (!compound_mapcount(page) &&
+					    split_huge_page_to_list(page,
+								    page_list))
+						goto activate_locked;
+				}
+				if (!add_to_swap(page)) {
+					if (!PageTransHuge(page))
+						goto activate_locked;
+					/* Fallback to swap normal pages */
+					if (split_huge_page_to_list(page,
+								    page_list))
+						goto activate_locked;
+					if (!add_to_swap(page))
+						goto activate_locked;
+				}
 
-			/* XXX: We don't support THP writes */
-			if (PageTransHuge(page) &&
-				  split_huge_page_to_list(page, page_list)) {
-				delete_from_swap_cache(page);
-				goto activate_locked;
+				may_enter_fs = 1;
+
+				/* Adding to swap updated mapping */
+				mapping = page_mapping(page);
 			}
-
-			may_enter_fs = 1;
-
-			/* Adding to swap updated mapping */
-			mapping = page_mapping(page);
 		} else if (unlikely(PageTransHuge(page))) {
 			/* Split file THP */
 			if (split_huge_page_to_list(page, page_list))
 				goto keep_locked;
 		}
 
-		VM_BUG_ON_PAGE(PageTransHuge(page), page);
-
 		/*
 		 * The page is mapped into the page tables of one or more
 		 * processes. Try to unmap it here.
 		 */
 		if (page_mapped(page)) {
-			if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) {
+			enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
+
+			if (unlikely(PageTransHuge(page)))
+				flags |= TTU_SPLIT_HUGE_PMD;
+			if (!try_to_unmap(page, flags)) {
 				nr_unmap_fail++;
 				goto activate_locked;
 			}
@@ -1313,7 +1318,11 @@ free_it:
 		 * Is there need to periodically free_page_list? It would
 		 * appear not as the counts should be low
 		 */
-		list_add(&page->lru, &free_pages);
+		if (unlikely(PageTransHuge(page))) {
+			mem_cgroup_uncharge(page);
+			(*get_compound_page_dtor(page))(page);
+		} else
+			list_add(&page->lru, &free_pages);
 		continue;
 
 activate_locked:

From fe490cc0fe9e6ee48cc48bb5dc463bc5f0f1428f Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 6 Sep 2017 16:22:52 -0700
Subject: [PATCH 077/119] mm, THP, swap: add THP swapping out fallback counting

When swapping out a THP (Transparent Huge Page), instead of swapping it
out as a whole, we sometimes have to fall back to splitting the THP into
normal pages before swapping, for example because no free swap cluster
is available or a cgroup limit is exceeded.  To count these fallbacks, a
new VM event, THP_SWPOUT_FALLBACK, is added and incremented whenever we
fall back to splitting the THP.
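
The new counter appears next to thp_swpout in /proc/vmstat.  A minimal
userspace sketch for reading the pair (assuming the usual "name value"
per-line format of /proc/vmstat):

#include <stdio.h>
#include <string.h>

/* Sketch: scan /proc/vmstat for the two THP swap-out counters. */
int main(void)
{
	char name[64];
	unsigned long long val, swpout = 0, fallback = 0;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return 1;
	while (fscanf(f, "%63s %llu", name, &val) == 2) {
		if (!strcmp(name, "thp_swpout"))
			swpout = val;
		else if (!strcmp(name, "thp_swpout_fallback"))
			fallback = val;
	}
	fclose(f);
	printf("thp_swpout=%llu thp_swpout_fallback=%llu\n", swpout, fallback);
	return 0;
}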

Link: http://lkml.kernel.org/r/20170724051840.2309-13-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Ross Zwisler <ross.zwisler@intel.com> [for brd.c, zram_drv.c, pmem.c]
Cc: Vishal L Verma <vishal.l.verma@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vm_event_item.h | 1 +
 mm/vmscan.c                   | 3 +++
 mm/vmstat.c                   | 1 +
 3 files changed, 5 insertions(+)

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index c75024e80eed..e02820fc2861 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -86,6 +86,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		THP_ZERO_PAGE_ALLOC,
 		THP_ZERO_PAGE_ALLOC_FAILED,
 		THP_SWPOUT,
+		THP_SWPOUT_FALLBACK,
 #endif
 #ifdef CONFIG_MEMORY_BALLOON
 		BALLOON_INFLATE,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6fbf707c0ce2..13d711dd8776 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1154,6 +1154,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 					if (split_huge_page_to_list(page,
 								    page_list))
 						goto activate_locked;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+					count_vm_event(THP_SWPOUT_FALLBACK);
+#endif
 					if (!add_to_swap(page))
 						goto activate_locked;
 				}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index bccf426453cd..e131b51654c7 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1072,6 +1072,7 @@ const char * const vmstat_text[] = {
 	"thp_zero_page_alloc",
 	"thp_zero_page_alloc_failed",
 	"thp_swpout",
+	"thp_swpout_fallback",
 #endif
 #ifdef CONFIG_MEMORY_BALLOON
 	"balloon_inflate",

From b1cc94ab2f2ba31fcb2c59df0b9cf03f6d720553 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Wed, 6 Sep 2017 16:22:56 -0700
Subject: [PATCH 078/119] shmem: shmem_charge: verify max_block is not exceeded
 before inode update

Patch series "userfaultfd: enable zeropage support for shmem".

These patches enable support for UFFDIO_ZEROPAGE for shared memory.

The first two patches are not strictly related to userfaultfd, they are
just minor refactoring to reduce amount of code duplication.

This patch (of 7):

Currently we update the inode and shmem_inode_info before verifying
that used_blocks will not exceed max_blocks, and undo the update if it
would.  Let's switch the order and verify the block count before
updating the inode and shmem_inode_info.

Link: http://lkml.kernel.org/r/1497939652-16528-2-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index fbcb3c96a186..35b524085c44 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -266,6 +266,14 @@ bool shmem_charge(struct inode *inode, long pages)
 
 	if (shmem_acct_block(info->flags, pages))
 		return false;
+
+	if (sbinfo->max_blocks) {
+		if (percpu_counter_compare(&sbinfo->used_blocks,
+					   sbinfo->max_blocks - pages) > 0)
+			goto unacct;
+		percpu_counter_add(&sbinfo->used_blocks, pages);
+	}
+
 	spin_lock_irqsave(&info->lock, flags);
 	info->alloced += pages;
 	inode->i_blocks += pages * BLOCKS_PER_PAGE;
@@ -273,20 +281,11 @@ bool shmem_charge(struct inode *inode, long pages)
 	spin_unlock_irqrestore(&info->lock, flags);
 	inode->i_mapping->nrpages += pages;
 
-	if (!sbinfo->max_blocks)
-		return true;
-	if (percpu_counter_compare(&sbinfo->used_blocks,
-				sbinfo->max_blocks - pages) > 0) {
-		inode->i_mapping->nrpages -= pages;
-		spin_lock_irqsave(&info->lock, flags);
-		info->alloced -= pages;
-		shmem_recalc_inode(inode);
-		spin_unlock_irqrestore(&info->lock, flags);
-		shmem_unacct_blocks(info->flags, pages);
-		return false;
-	}
-	percpu_counter_add(&sbinfo->used_blocks, pages);
 	return true;
+
+unacct:
+	shmem_unacct_blocks(info->flags, pages);
+	return false;
 }
 
 void shmem_uncharge(struct inode *inode, long pages)

From 0f0796945614b7523987f7eea32407421af4b1ee Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Wed, 6 Sep 2017 16:22:59 -0700
Subject: [PATCH 079/119] shmem: introduce shmem_inode_acct_block

The shmem_acct_block() call and the update of used_blocks follow one
another in all the places they are used.  Combine the two into a helper
function.

Link: http://lkml.kernel.org/r/1497939652-16528-3-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 102 ++++++++++++++++++++++++-----------------------------
 1 file changed, 46 insertions(+), 56 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 35b524085c44..b7d84c4f2a5c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -188,6 +188,38 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
 		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
 }
 
+static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+
+	if (shmem_acct_block(info->flags, pages))
+		return false;
+
+	if (sbinfo->max_blocks) {
+		if (percpu_counter_compare(&sbinfo->used_blocks,
+					   sbinfo->max_blocks - pages) > 0)
+			goto unacct;
+		percpu_counter_add(&sbinfo->used_blocks, pages);
+	}
+
+	return true;
+
+unacct:
+	shmem_unacct_blocks(info->flags, pages);
+	return false;
+}
+
+static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+
+	if (sbinfo->max_blocks)
+		percpu_counter_sub(&sbinfo->used_blocks, pages);
+	shmem_unacct_blocks(info->flags, pages);
+}
+
 static const struct super_operations shmem_ops;
 static const struct address_space_operations shmem_aops;
 static const struct file_operations shmem_file_operations;
@@ -249,31 +281,20 @@ static void shmem_recalc_inode(struct inode *inode)
 
 	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
 	if (freed > 0) {
-		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
-		if (sbinfo->max_blocks)
-			percpu_counter_add(&sbinfo->used_blocks, -freed);
 		info->alloced -= freed;
 		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
-		shmem_unacct_blocks(info->flags, freed);
+		shmem_inode_unacct_blocks(inode, freed);
 	}
 }
 
 bool shmem_charge(struct inode *inode, long pages)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
-	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 	unsigned long flags;
 
-	if (shmem_acct_block(info->flags, pages))
+	if (!shmem_inode_acct_block(inode, pages))
 		return false;
 
-	if (sbinfo->max_blocks) {
-		if (percpu_counter_compare(&sbinfo->used_blocks,
-					   sbinfo->max_blocks - pages) > 0)
-			goto unacct;
-		percpu_counter_add(&sbinfo->used_blocks, pages);
-	}
-
 	spin_lock_irqsave(&info->lock, flags);
 	info->alloced += pages;
 	inode->i_blocks += pages * BLOCKS_PER_PAGE;
@@ -282,16 +303,11 @@ bool shmem_charge(struct inode *inode, long pages)
 	inode->i_mapping->nrpages += pages;
 
 	return true;
-
-unacct:
-	shmem_unacct_blocks(info->flags, pages);
-	return false;
 }
 
 void shmem_uncharge(struct inode *inode, long pages)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
-	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 	unsigned long flags;
 
 	spin_lock_irqsave(&info->lock, flags);
@@ -300,9 +316,7 @@ void shmem_uncharge(struct inode *inode, long pages)
 	shmem_recalc_inode(inode);
 	spin_unlock_irqrestore(&info->lock, flags);
 
-	if (sbinfo->max_blocks)
-		percpu_counter_sub(&sbinfo->used_blocks, pages);
-	shmem_unacct_blocks(info->flags, pages);
+	shmem_inode_unacct_blocks(inode, pages);
 }
 
 /*
@@ -1451,9 +1465,10 @@ static struct page *shmem_alloc_page(gfp_t gfp,
 }
 
 static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
-		struct shmem_inode_info *info, struct shmem_sb_info *sbinfo,
+		struct inode *inode,
 		pgoff_t index, bool huge)
 {
+	struct shmem_inode_info *info = SHMEM_I(inode);
 	struct page *page;
 	int nr;
 	int err = -ENOSPC;
@@ -1462,14 +1477,8 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
 		huge = false;
 	nr = huge ? HPAGE_PMD_NR : 1;
 
-	if (shmem_acct_block(info->flags, nr))
+	if (!shmem_inode_acct_block(inode, nr))
 		goto failed;
-	if (sbinfo->max_blocks) {
-		if (percpu_counter_compare(&sbinfo->used_blocks,
-					sbinfo->max_blocks - nr) > 0)
-			goto unacct;
-		percpu_counter_add(&sbinfo->used_blocks, nr);
-	}
 
 	if (huge)
 		page = shmem_alloc_hugepage(gfp, info, index);
@@ -1482,10 +1491,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
 	}
 
 	err = -ENOMEM;
-	if (sbinfo->max_blocks)
-		percpu_counter_add(&sbinfo->used_blocks, -nr);
-unacct:
-	shmem_unacct_blocks(info->flags, nr);
+	shmem_inode_unacct_blocks(inode, nr);
 failed:
 	return ERR_PTR(err);
 }
@@ -1750,10 +1756,9 @@ repeat:
 		}
 
 alloc_huge:
-		page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
-				index, true);
+		page = shmem_alloc_and_acct_page(gfp, inode, index, true);
 		if (IS_ERR(page)) {
-alloc_nohuge:		page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
+alloc_nohuge:		page = shmem_alloc_and_acct_page(gfp, inode,
 					index, false);
 		}
 		if (IS_ERR(page)) {
@@ -1875,10 +1880,7 @@ clear:
 	 * Error recovery.
 	 */
 unacct:
-	if (sbinfo->max_blocks)
-		percpu_counter_sub(&sbinfo->used_blocks,
-				1 << compound_order(page));
-	shmem_unacct_blocks(info->flags, 1 << compound_order(page));
+	shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
 
 	if (PageTransHuge(page)) {
 		unlock_page(page);
@@ -2214,7 +2216,6 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
 {
 	struct inode *inode = file_inode(dst_vma->vm_file);
 	struct shmem_inode_info *info = SHMEM_I(inode);
-	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 	struct address_space *mapping = inode->i_mapping;
 	gfp_t gfp = mapping_gfp_mask(mapping);
 	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
@@ -2226,19 +2227,13 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	int ret;
 
 	ret = -ENOMEM;
-	if (shmem_acct_block(info->flags, 1))
+	if (!shmem_inode_acct_block(inode, 1))
 		goto out;
-	if (sbinfo->max_blocks) {
-		if (percpu_counter_compare(&sbinfo->used_blocks,
-					   sbinfo->max_blocks) >= 0)
-			goto out_unacct_blocks;
-		percpu_counter_inc(&sbinfo->used_blocks);
-	}
 
 	if (!*pagep) {
 		page = shmem_alloc_page(gfp, info, pgoff);
 		if (!page)
-			goto out_dec_used_blocks;
+			goto out_unacct_blocks;
 
 		page_kaddr = kmap_atomic(page);
 		ret = copy_from_user(page_kaddr, (const void __user *)src_addr,
@@ -2248,9 +2243,7 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
 		/* fallback to copy_from_user outside mmap_sem */
 		if (unlikely(ret)) {
 			*pagep = page;
-			if (sbinfo->max_blocks)
-				percpu_counter_add(&sbinfo->used_blocks, -1);
-			shmem_unacct_blocks(info->flags, 1);
+			shmem_inode_unacct_blocks(inode, 1);
 			/* don't free the page */
 			return -EFAULT;
 		}
@@ -2313,11 +2306,8 @@ out_release_uncharge:
 out_release:
 	unlock_page(page);
 	put_page(page);
-out_dec_used_blocks:
-	if (sbinfo->max_blocks)
-		percpu_counter_add(&sbinfo->used_blocks, -1);
 out_unacct_blocks:
-	shmem_unacct_blocks(info->flags, 1);
+	shmem_inode_unacct_blocks(inode, 1);
 	goto out;
 }
 

From 8d10396342063c79e92c4e46215370ab7b988569 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Wed, 6 Sep 2017 16:23:02 -0700
Subject: [PATCH 080/119] userfaultfd: shmem: add shmem_mfill_zeropage_pte for
 userfaultfd support

shmem_mfill_zeropage_pte is the low level routine that implements the
userfaultfd UFFDIO_ZEROPAGE command.  Since for shmem mappings zero
pages are always allocated and accounted, the new method is a slight
extension of the existing shmem_mcopy_atomic_pte.
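
For context, here is a hedged sketch of how the userfaultfd monitor
ends up driving this path from userspace; 'uffd', 'addr' and 'len' are
assumed to come from the usual userfaultfd read loop and a registered
shmem mapping, and the helper name is purely illustrative:

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>

/* Resolve a missing-page fault in a registered shmem range with a zero
 * page instead of copying data in (sketch, error details omitted). */
static int resolve_with_zeropage(int uffd, unsigned long addr, unsigned long len)
{
	struct uffdio_zeropage zp = {
		.range = { .start = addr, .len = len },
		.mode  = 0,
	};

	if (ioctl(uffd, UFFDIO_ZEROPAGE, &zp) == -1)
		return -1;
	/* zp.zeropage reports how many bytes were zero-filled */
	return 0;
}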

Link: http://lkml.kernel.org/r/1497939652-16528-4-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/shmem_fs.h |  6 ++++
 mm/shmem.c               | 60 +++++++++++++++++++++++++++++-----------
 2 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index a7d6bd2a918f..b6c3540e07bc 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -137,9 +137,15 @@ extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
 				  unsigned long dst_addr,
 				  unsigned long src_addr,
 				  struct page **pagep);
+extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
+				    pmd_t *dst_pmd,
+				    struct vm_area_struct *dst_vma,
+				    unsigned long dst_addr);
 #else
 #define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
 			       src_addr, pagep)        ({ BUG(); 0; })
+#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \
+				 dst_addr)      ({ BUG(); 0; })
 #endif
 
 #endif
diff --git a/mm/shmem.c b/mm/shmem.c
index b7d84c4f2a5c..64bdc91187f7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2207,12 +2207,13 @@ bool shmem_mapping(struct address_space *mapping)
 	return mapping->a_ops == &shmem_aops;
 }
 
-int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
-			   pmd_t *dst_pmd,
-			   struct vm_area_struct *dst_vma,
-			   unsigned long dst_addr,
-			   unsigned long src_addr,
-			   struct page **pagep)
+static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
+				  pmd_t *dst_pmd,
+				  struct vm_area_struct *dst_vma,
+				  unsigned long dst_addr,
+				  unsigned long src_addr,
+				  bool zeropage,
+				  struct page **pagep)
 {
 	struct inode *inode = file_inode(dst_vma->vm_file);
 	struct shmem_inode_info *info = SHMEM_I(inode);
@@ -2235,17 +2236,22 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
 		if (!page)
 			goto out_unacct_blocks;
 
-		page_kaddr = kmap_atomic(page);
-		ret = copy_from_user(page_kaddr, (const void __user *)src_addr,
-				     PAGE_SIZE);
-		kunmap_atomic(page_kaddr);
+		if (!zeropage) {	/* mcopy_atomic */
+			page_kaddr = kmap_atomic(page);
+			ret = copy_from_user(page_kaddr,
+					     (const void __user *)src_addr,
+					     PAGE_SIZE);
+			kunmap_atomic(page_kaddr);
 
-		/* fallback to copy_from_user outside mmap_sem */
-		if (unlikely(ret)) {
-			*pagep = page;
-			shmem_inode_unacct_blocks(inode, 1);
-			/* don't free the page */
-			return -EFAULT;
+			/* fallback to copy_from_user outside mmap_sem */
+			if (unlikely(ret)) {
+				*pagep = page;
+				shmem_inode_unacct_blocks(inode, 1);
+				/* don't free the page */
+				return -EFAULT;
+			}
+		} else {		/* mfill_zeropage_atomic */
+			clear_highpage(page);
 		}
 	} else {
 		page = *pagep;
@@ -2311,6 +2317,28 @@ out_unacct_blocks:
 	goto out;
 }
 
+int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
+			   pmd_t *dst_pmd,
+			   struct vm_area_struct *dst_vma,
+			   unsigned long dst_addr,
+			   unsigned long src_addr,
+			   struct page **pagep)
+{
+	return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+				      dst_addr, src_addr, false, pagep);
+}
+
+int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
+			     pmd_t *dst_pmd,
+			     struct vm_area_struct *dst_vma,
+			     unsigned long dst_addr)
+{
+	struct page *page = NULL;
+
+	return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+				      dst_addr, 0, true, &page);
+}
+
 #ifdef CONFIG_TMPFS
 static const struct inode_operations shmem_symlink_inode_operations;
 static const struct inode_operations shmem_short_symlink_operations;

From 3217d3c79b5d7aabf62daa4db8cf757abedc9f28 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Wed, 6 Sep 2017 16:23:06 -0700
Subject: [PATCH 081/119] userfaultfd: mcopy_atomic: introduce mfill_atomic_pte
 helper

Shuffle the code a bit to improve readability.

Link: http://lkml.kernel.org/r/1497939652-16528-5-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/userfaultfd.c | 46 ++++++++++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 8bcb501bce60..48c015c80120 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -371,6 +371,34 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 				      bool zeropage);
 #endif /* CONFIG_HUGETLB_PAGE */
 
+static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
+						pmd_t *dst_pmd,
+						struct vm_area_struct *dst_vma,
+						unsigned long dst_addr,
+						unsigned long src_addr,
+						struct page **page,
+						bool zeropage)
+{
+	ssize_t err;
+
+	if (vma_is_anonymous(dst_vma)) {
+		if (!zeropage)
+			err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
+					       dst_addr, src_addr, page);
+		else
+			err = mfill_zeropage_pte(dst_mm, dst_pmd,
+						 dst_vma, dst_addr);
+	} else {
+		err = -EINVAL; /* if zeropage is true return -EINVAL */
+		if (likely(!zeropage))
+			err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
+						     dst_vma, dst_addr,
+						     src_addr, page);
+	}
+
+	return err;
+}
+
 static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 					      unsigned long dst_start,
 					      unsigned long src_start,
@@ -487,22 +515,8 @@ retry:
 		BUG_ON(pmd_none(*dst_pmd));
 		BUG_ON(pmd_trans_huge(*dst_pmd));
 
-		if (vma_is_anonymous(dst_vma)) {
-			if (!zeropage)
-				err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
-						       dst_addr, src_addr,
-						       &page);
-			else
-				err = mfill_zeropage_pte(dst_mm, dst_pmd,
-							 dst_vma, dst_addr);
-		} else {
-			err = -EINVAL; /* if zeropage is true return -EINVAL */
-			if (likely(!zeropage))
-				err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
-							     dst_vma, dst_addr,
-							     src_addr, &page);
-		}
-
+		err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+				       src_addr, &page, zeropage);
 		cond_resched();
 
 		if (unlikely(err == -EFAULT)) {

From 8fb44e5403ca86e33411dfa12dd298ed5ab1c3f7 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Wed, 6 Sep 2017 16:23:09 -0700
Subject: [PATCH 082/119] userfaultfd: shmem: wire up shmem_mfill_zeropage_pte

For shmem VMAs we can use shmem_mfill_zeropage_pte() for UFFDIO_ZEROPAGE.

Link: http://lkml.kernel.org/r/1497939652-16528-6-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/userfaultfd.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 48c015c80120..81192701964d 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -389,11 +389,13 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
 			err = mfill_zeropage_pte(dst_mm, dst_pmd,
 						 dst_vma, dst_addr);
 	} else {
-		err = -EINVAL; /* if zeropage is true return -EINVAL */
-		if (likely(!zeropage))
+		if (!zeropage)
 			err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
 						     dst_vma, dst_addr,
 						     src_addr, page);
+		else
+			err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
+						       dst_vma, dst_addr);
 	}
 
 	return err;

From ce53e8e6f2cb02d69f0a1d7954e10d3231bc75f5 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Wed, 6 Sep 2017 16:23:12 -0700
Subject: [PATCH 083/119] userfaultfd: report UFFDIO_ZEROPAGE as available for
 shmem VMAs

Now that shmem VMAs can be filled with the zero page via userfaultfd,
we can report that UFFDIO_ZEROPAGE is available for those VMAs.
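
From userspace the change is visible in the ioctl bitmap returned by
UFFDIO_REGISTER; a short sketch, assuming uffd and a shmem mapping at
addr/len are already set up (the helper name is illustrative):

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>

/* Register the range and report whether UFFDIO_ZEROPAGE is advertised. */
static int zeropage_supported(int uffd, unsigned long addr, unsigned long len)
{
	struct uffdio_register reg = {
		.range = { .start = addr, .len = len },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};

	if (ioctl(uffd, UFFDIO_REGISTER, &reg) == -1)
		return -1;
	return !!(reg.ioctls & (1ULL << _UFFDIO_ZEROPAGE));
}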

Link: http://lkml.kernel.org/r/1497939652-16528-7-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 886085b47c75..01a85e2660b8 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1194,7 +1194,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	struct uffdio_register __user *user_uffdio_register;
 	unsigned long vm_flags, new_flags;
 	bool found;
-	bool non_anon_pages;
+	bool basic_ioctls;
 	unsigned long start, end, vma_end;
 
 	user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1260,7 +1260,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	 * Search for not compatible vmas.
 	 */
 	found = false;
-	non_anon_pages = false;
+	basic_ioctls = false;
 	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
 		cond_resched();
 
@@ -1299,8 +1299,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		/*
 		 * Note vmas containing huge pages
 		 */
-		if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur))
-			non_anon_pages = true;
+		if (is_vm_hugetlb_page(cur))
+			basic_ioctls = true;
 
 		found = true;
 	}
@@ -1371,7 +1371,7 @@ out_unlock:
 		 * userland which ioctls methods are guaranteed to
 		 * succeed on this range.
 		 */
-		if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC :
+		if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
 			     UFFD_API_RANGE_IOCTLS,
 			     &user_uffdio_register->ioctls))
 			ret = -EFAULT;

From 824f973904a1108806fa0fbe15dc93ee9ecd9e0a Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Wed, 6 Sep 2017 16:23:16 -0700
Subject: [PATCH 084/119] userfaultfd: selftest: enable testing of
 UFFDIO_ZEROPAGE for shmem

Link: http://lkml.kernel.org/r/1497939652-16528-8-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/vm/userfaultfd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 1eae79ae5b4e..b0c9263f5df7 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -198,7 +198,7 @@ static struct uffd_test_ops anon_uffd_test_ops = {
 };
 
 static struct uffd_test_ops shmem_uffd_test_ops = {
-	.expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC,
+	.expected_ioctls = ANON_EXPECTED_IOCTLS,
 	.allocate_area	= shmem_allocate_area,
 	.release_pages	= shmem_release_pages,
 };

From de23abd15175fb170270d485f267caebe86fb0b4 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 6 Sep 2017 16:23:19 -0700
Subject: [PATCH 085/119] fs/sync.c: remove unnecessary NULL f_mapping check in
 sync_file_range

The fsync codepath assumes that f_mapping can never be NULL, but
sync_file_range still has a check for that.

Remove the check from sync_file_range, as I don't see how you'd ever get
a NULL pointer there.

Link: http://lkml.kernel.org/r/20170525110509.9434-1-jlayton@redhat.com
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/sync.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/sync.c b/fs/sync.c
index 2a54c1f22035..cf57f33fcd16 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -335,11 +335,6 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
 		goto out_put;
 
 	mapping = f.file->f_mapping;
-	if (!mapping) {
-		ret = -EINVAL;
-		goto out_put;
-	}
-
 	ret = 0;
 	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
 		ret = filemap_fdatawait_range(mapping, offset, endbyte);

From a446d6f9ce545248c62ab9694d05ac6f68563706 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 6 Sep 2017 16:23:22 -0700
Subject: [PATCH 086/119] include/linux/fs.h: remove unneeded forward
 definition of mm_struct

Link: http://lkml.kernel.org/r/20170525102927.6163-1-jlayton@redhat.com
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index cc2e0f5a8fd1..baea880c5c03 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1270,8 +1270,6 @@ extern void f_delown(struct file *filp);
 extern pid_t f_getown(struct file *filp);
 extern int send_sigurg(struct fown_struct *fown);
 
-struct mm_struct;
-
 /*
  *	Umount options
  */

From e652f694598273c5d749687032d1534a30e6a3a5 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 6 Sep 2017 16:23:25 -0700
Subject: [PATCH 087/119] mm: hugetlb: define system call hugetlb size
 encodings in single file

Patch series "Consolidate system call hugetlb page size encodings".

These patches are the result of discussions in
https://lkml.org/lkml/2017/3/8/548.  The following changes are made in the
patch set:

1) Put all the log2 encoded huge page size definitions in a common
   header file.  The idea is to have a set of definitions that can be
   used as the basis for system call specific definitions such as
   MAP_HUGE_* and SHM_HUGE_*.

2) Remove MAP_HUGE_* definitions in arch specific files.  All these
   definitions are the same.  Consolidate all definitions in the primary
   user header file (uapi/linux/mman.h).

3) Remove SHM_HUGE_* definitions intended for user space from kernel
   header file, and add to user (uapi/linux/shm.h) header file.  Add
   definitions for all known huge page size encodings as in mmap.

This patch (of 3):

If hugetlb pages are requested in the mmap or shmget system calls, a
huge page size other than the default can be specified.  This is
accomplished by encoding the log2 of the huge page size in the upper
bits of the flag argument.  asm-generic and arch-specific headers all
define the same values for these encodings.

Put common definitions in a single header file.  The primary uapi header
files for mmap and shm will use these definitions as a basis for
definitions specific to those system calls.
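
As a concrete example of the encoding: a 2MB huge page is 2^21 bytes,
so its encoding is 21 placed in bits [26:31].  A small illustrative
helper (not part of this patch) that derives the encoding from a
power-of-two size:

#define HUGETLB_FLAG_ENCODE_SHIFT	26

/* Illustrative only: log2 of a power-of-two size, shifted into place. */
static inline unsigned long hugetlb_encode_size(unsigned long size)
{
	unsigned int log2sz = 0;

	while (size > 1) {
		size >>= 1;
		log2sz++;
	}
	return (unsigned long)log2sz << HUGETLB_FLAG_ENCODE_SHIFT;
}

/* hugetlb_encode_size(2UL << 20) == 21 << 26 == HUGETLB_FLAG_ENCODE_2MB */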

Link: http://lkml.kernel.org/r/1501527386-10736-2-git-send-email-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Davidlohr Bueso <dbueso@suse.de>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/asm-generic/hugetlb_encode.h | 34 +++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 include/uapi/asm-generic/hugetlb_encode.h

diff --git a/include/uapi/asm-generic/hugetlb_encode.h b/include/uapi/asm-generic/hugetlb_encode.h
new file mode 100644
index 000000000000..e4732d3c2998
--- /dev/null
+++ b/include/uapi/asm-generic/hugetlb_encode.h
@@ -0,0 +1,34 @@
+#ifndef _ASM_GENERIC_HUGETLB_ENCODE_H_
+#define _ASM_GENERIC_HUGETLB_ENCODE_H_
+
+/*
+ * Several system calls take a flag to request "hugetlb" huge pages.
+ * Without further specification, these system calls will use the
+ * system's default huge page size.  If a system supports multiple
+ * huge page sizes, the desired huge page size can be specified in
+ * bits [26:31] of the flag arguments.  The value in these 6 bits
+ * will encode the log2 of the huge page size.
+ *
+ * The following definitions are associated with this huge page size
+ * encoding in flag arguments.  System call specific header files
+ * that use this encoding should include this file.  They can then
+ * provide definitions based on these with their own specific prefix.
+ * for example:
+ * #define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT
+ */
+
+#define HUGETLB_FLAG_ENCODE_SHIFT	26
+#define HUGETLB_FLAG_ENCODE_MASK	0x3f
+
+#define HUGETLB_FLAG_ENCODE_64KB	(16 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_512KB	(19 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_1MB		(20 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_2MB		(21 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_8MB		(23 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_16MB	(24 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_256MB	(28 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_1GB		(30 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_2GB		(31 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_16GB	(34 << HUGETLB_FLAG_ENCODE_SHIFT)
+
+#endif /* _ASM_GENERIC_HUGETLB_ENCODE_H_ */

From aafd4562dfee81a40ba21b5ea3cf5e06664bc7f6 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 6 Sep 2017 16:23:29 -0700
Subject: [PATCH 088/119] mm: arch: consolidate mmap hugetlb size encodings

A non-default huge page size can be encoded in the flags argument of the
mmap system call.  The definitions for these encodings are in arch
specific header files.  However, all architectures use the same values.

Consolidate all the definitions in the primary user header file
(uapi/linux/mman.h).  Include definitions for all known huge page sizes.
Use the generic encoding definitions in hugetlb_encode.h as the basis
for these definitions.
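
For example, a userspace mapping that explicitly asks for 2MB hugetlb
pages would look like the sketch below; it assumes 2MB hugetlb pages
have been reserved on the system and falls back to the uapi values if
the libc headers are older:

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>

#ifndef MAP_HUGE_2MB			/* from <linux/mman.h> after this patch */
#define MAP_HUGE_SHIFT	26
#define MAP_HUGE_2MB	(21 << MAP_HUGE_SHIFT)
#endif

int main(void)
{
	size_t len = 2UL << 20;		/* one 2MB huge page */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB,
		       -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");		/* e.g. no 2MB pages reserved */
		return 1;
	}
	munmap(p, len);
	return 0;
}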

Link: http://lkml.kernel.org/r/1501527386-10736-3-git-send-email-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Davidlohr Bueso <dbueso@suse.de>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/uapi/asm/mman.h     | 11 -----------
 arch/mips/include/uapi/asm/mman.h      | 11 -----------
 arch/parisc/include/uapi/asm/mman.h    | 11 -----------
 arch/powerpc/include/uapi/asm/mman.h   | 16 ----------------
 arch/x86/include/uapi/asm/mman.h       |  3 ---
 arch/xtensa/include/uapi/asm/mman.h    | 11 -----------
 include/uapi/asm-generic/mman-common.h | 11 -----------
 include/uapi/linux/mman.h              | 22 ++++++++++++++++++++++
 8 files changed, 22 insertions(+), 74 deletions(-)

diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 02760f6e6ca4..13b52aad3c43 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -67,17 +67,6 @@
 /* compatibility flags */
 #define MAP_FILE	0
 
-/*
- * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define MAP_HUGE_SHIFT	26
-#define MAP_HUGE_MASK	0x3f
-
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 655e2fb5395b..398eebcc3541 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -94,17 +94,6 @@
 /* compatibility flags */
 #define MAP_FILE	0
 
-/*
- * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define MAP_HUGE_SHIFT	26
-#define MAP_HUGE_MASK	0x3f
-
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 9a9c2fe4be50..b87fbe3f338a 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -64,17 +64,6 @@
 #define MAP_FILE	0
 #define MAP_VARIABLE	0
 
-/*
- * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define MAP_HUGE_SHIFT	26
-#define MAP_HUGE_MASK	0x3f
-
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h
index ab45cc2f3101..03c06ba7464f 100644
--- a/arch/powerpc/include/uapi/asm/mman.h
+++ b/arch/powerpc/include/uapi/asm/mman.h
@@ -29,20 +29,4 @@
 #define MAP_STACK	0x20000		/* give out an address that is best suited for process/thread stacks */
 #define MAP_HUGETLB	0x40000		/* create a huge page mapping */
 
-/*
- * When MAP_HUGETLB is set, bits [26:31] of the flags argument to mmap(2),
- * encode the log2 of the huge page size. A value of zero indicates that the
- * default huge page size should be used. To use a non-default huge page size,
- * one of these defines can be used, or the size can be encoded by hand. Note
- * that on most systems only a subset, or possibly none, of these sizes will be
- * available.
- */
-#define MAP_HUGE_512KB	(19 << MAP_HUGE_SHIFT)	/* 512KB HugeTLB Page */
-#define MAP_HUGE_1MB	(20 << MAP_HUGE_SHIFT)	/* 1MB   HugeTLB Page */
-#define MAP_HUGE_2MB	(21 << MAP_HUGE_SHIFT)	/* 2MB   HugeTLB Page */
-#define MAP_HUGE_8MB	(23 << MAP_HUGE_SHIFT)	/* 8MB   HugeTLB Page */
-#define MAP_HUGE_16MB	(24 << MAP_HUGE_SHIFT)	/* 16MB  HugeTLB Page */
-#define MAP_HUGE_1GB	(30 << MAP_HUGE_SHIFT)	/* 1GB   HugeTLB Page */
-#define MAP_HUGE_16GB	(34 << MAP_HUGE_SHIFT)	/* 16GB  HugeTLB Page */
-
 #endif /* _UAPI_ASM_POWERPC_MMAN_H */
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index 39bca7fac087..3be08f07695c 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -3,9 +3,6 @@
 
 #define MAP_32BIT	0x40		/* only give out 32bit addresses */
 
-#define MAP_HUGE_2MB    (21 << MAP_HUGE_SHIFT)
-#define MAP_HUGE_1GB    (30 << MAP_HUGE_SHIFT)
-
 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 /*
  * Take the 4 protection key bits out of the vma->vm_flags
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 24365b30aae9..8ce77a2e9bab 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -106,17 +106,6 @@
 /* compatibility flags */
 #define MAP_FILE	0
 
-/*
- * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define MAP_HUGE_SHIFT	26
-#define MAP_HUGE_MASK	0x3f
-
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 8c27db0c5c08..d248f3c335b5 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -61,17 +61,6 @@
 /* compatibility flags */
 #define MAP_FILE	0
 
-/*
- * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define MAP_HUGE_SHIFT	26
-#define MAP_HUGE_MASK	0x3f
-
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index ade4acd3a90c..a937480d7cd3 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -2,6 +2,7 @@
 #define _UAPI_LINUX_MMAN_H
 
 #include <asm/mman.h>
+#include <asm-generic/hugetlb_encode.h>
 
 #define MREMAP_MAYMOVE	1
 #define MREMAP_FIXED	2
@@ -10,4 +11,25 @@
 #define OVERCOMMIT_ALWAYS		1
 #define OVERCOMMIT_NEVER		2
 
+/*
+ * Huge page size encoding when MAP_HUGETLB is specified, and a huge page
+ * size other than the default is desired.  See hugetlb_encode.h.
+ * All known huge page size encodings are provided here.  It is the
+ * responsibility of the application to know which sizes are supported on
+ * the running system.  See mmap(2) man page for details.
+ */
+#define MAP_HUGE_SHIFT	HUGETLB_FLAG_ENCODE_SHIFT
+#define MAP_HUGE_MASK	HUGETLB_FLAG_ENCODE_MASK
+
+#define MAP_HUGE_64KB	HUGETLB_FLAG_ENCODE_64KB
+#define MAP_HUGE_512KB	HUGETLB_FLAG_ENCODE_512KB
+#define MAP_HUGE_1MB	HUGETLB_FLAG_ENCODE_1MB
+#define MAP_HUGE_2MB	HUGETLB_FLAG_ENCODE_2MB
+#define MAP_HUGE_8MB	HUGETLB_FLAG_ENCODE_8MB
+#define MAP_HUGE_16MB	HUGETLB_FLAG_ENCODE_16MB
+#define MAP_HUGE_256MB	HUGETLB_FLAG_ENCODE_256MB
+#define MAP_HUGE_1GB	HUGETLB_FLAG_ENCODE_1GB
+#define MAP_HUGE_2GB	HUGETLB_FLAG_ENCODE_2GB
+#define MAP_HUGE_16GB	HUGETLB_FLAG_ENCODE_16GB
+
 #endif /* _UAPI_LINUX_MMAN_H */

From 4da243ac1cf6aeb30b7c555d56208982d66d6d33 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 6 Sep 2017 16:23:33 -0700
Subject: [PATCH 089/119] mm: shm: use new hugetlb size encoding definitions

Use the common definitions from hugetlb_encode.h header file for
encoding hugetlb size definitions in shmget system call flags.

In addition, move these definitions from the internal (kernel) to user
(uapi) header file.
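
The shmget() side then mirrors mmap(); a hedged sketch requesting a 2MB
hugetlb segment (again assuming 2MB hugetlb pages are reserved, with
fallback definitions for older libc headers):

#define _GNU_SOURCE
#include <sys/ipc.h>
#include <sys/shm.h>
#include <stdio.h>

#ifndef SHM_HUGETLB
#define SHM_HUGETLB	04000		/* segment will use huge TLB pages */
#endif
#ifndef SHM_HUGE_2MB			/* from uapi <linux/shm.h> after this patch */
#define SHM_HUGE_SHIFT	26
#define SHM_HUGE_2MB	(21 << SHM_HUGE_SHIFT)
#endif

int main(void)
{
	size_t len = 2UL << 20;		/* one 2MB huge page */
	int id = shmget(IPC_PRIVATE, len,
			IPC_CREAT | SHM_HUGETLB | SHM_HUGE_2MB | 0600);

	if (id == -1) {
		perror("shmget");
		return 1;
	}
	shmctl(id, IPC_RMID, NULL);	/* clean up the segment */
	return 0;
}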

Link: http://lkml.kernel.org/r/1501527386-10736-4-git-send-email-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Davidlohr Bueso <dbueso@suse.de>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/shm.h      | 17 -----------------
 include/uapi/linux/shm.h | 31 +++++++++++++++++++++++++++++--
 2 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/include/linux/shm.h b/include/linux/shm.h
index 0fb7061ec54c..21a5e6c43385 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -27,23 +27,6 @@ struct shmid_kernel /* private to the kernel */
 /* shm_mode upper byte flags */
 #define	SHM_DEST	01000	/* segment will be destroyed on last detach */
 #define SHM_LOCKED      02000   /* segment will not be swapped */
-#define SHM_HUGETLB     04000   /* segment will use huge TLB pages */
-#define SHM_NORESERVE   010000  /* don't check for reservations */
-
-/* Bits [26:31] are reserved */
-
-/*
- * When SHM_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define SHM_HUGE_SHIFT  26
-#define SHM_HUGE_MASK   0x3f
-#define SHM_HUGE_2MB    (21 << SHM_HUGE_SHIFT)
-#define SHM_HUGE_1GB    (30 << SHM_HUGE_SHIFT)
 
 #ifdef CONFIG_SYSVIPC
 struct sysv_shm {
diff --git a/include/uapi/linux/shm.h b/include/uapi/linux/shm.h
index 1fbf24ea37fd..cf23c873719d 100644
--- a/include/uapi/linux/shm.h
+++ b/include/uapi/linux/shm.h
@@ -3,6 +3,7 @@
 
 #include <linux/ipc.h>
 #include <linux/errno.h>
+#include <asm-generic/hugetlb_encode.h>
 #ifndef __KERNEL__
 #include <unistd.h>
 #endif
@@ -40,11 +41,37 @@ struct shmid_ds {
 /* Include the definition of shmid64_ds and shminfo64 */
 #include <asm/shmbuf.h>
 
-/* permission flag for shmget */
+/*
+ * shmget() shmflg values.
+ */
+/* The bottom nine bits are the same as open(2) mode flags */
 #define SHM_R		0400	/* or S_IRUGO from <linux/stat.h> */
 #define SHM_W		0200	/* or S_IWUGO from <linux/stat.h> */
+/* Bits 9 & 10 are IPC_CREAT and IPC_EXCL */
+#define SHM_HUGETLB	04000	/* segment will use huge TLB pages */
+#define SHM_NORESERVE	010000	/* don't check for reservations */
 
-/* mode for attach */
+/*
+ * Huge page size encoding when SHM_HUGETLB is specified, and a huge page
+ * size other than the default is desired.  See hugetlb_encode.h
+ */
+#define SHM_HUGE_SHIFT	HUGETLB_FLAG_ENCODE_SHIFT
+#define SHM_HUGE_MASK	HUGETLB_FLAG_ENCODE_MASK
+
+#define SHM_HUGE_64KB	HUGETLB_FLAG_ENCODE_64KB
+#define SHM_HUGE_512KB	HUGETLB_FLAG_ENCODE_512KB
+#define SHM_HUGE_1MB	HUGETLB_FLAG_ENCODE_1MB
+#define SHM_HUGE_2MB	HUGETLB_FLAG_ENCODE_2MB
+#define SHM_HUGE_8MB	HUGETLB_FLAG_ENCODE_8MB
+#define SHM_HUGE_16MB	HUGETLB_FLAG_ENCODE_16MB
+#define SHM_HUGE_256MB	HUGETLB_FLAG_ENCODE_256MB
+#define SHM_HUGE_1GB	HUGETLB_FLAG_ENCODE_1GB
+#define SHM_HUGE_2GB	HUGETLB_FLAG_ENCODE_2GB
+#define SHM_HUGE_16GB	HUGETLB_FLAG_ENCODE_16GB
+
+/*
+ * shmat() shmflg values
+ */
 #define	SHM_RDONLY	010000	/* read-only access */
 #define	SHM_RND		020000	/* round attach address to SHMLBA boundary */
 #define	SHM_REMAP	040000	/* take-over region on attach */

From c41f012ade0b95b0a6e25c7150673e0554736165 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 6 Sep 2017 16:23:36 -0700
Subject: [PATCH 090/119] mm: rename global_page_state to
 global_zone_page_state

global_page_state is error-prone, as a recent bug report pointed out [1].
It only returns proper values for zone-based counters, as the enum it
takes suggests.  We already have global_node_page_state, so let's rename
global_page_state to global_zone_page_state to be more explicit here.
All existing users seem to be correct:

$ git grep "global_page_state(NR_" | sed 's@.*(\(NR_[A-Z_]*\)).*@\1@' | sort | uniq -c
      2 NR_BOUNCE
      2 NR_FREE_CMA_PAGES
     11 NR_FREE_PAGES
      1 NR_KERNEL_STACK_KB
      1 NR_MLOCK
      2 NR_PAGETABLE

This patch shouldn't introduce any functional change.

[1] http://lkml.kernel.org/r/201707260628.v6Q6SmaS030814@www262.sakura.ne.jp

Link: http://lkml.kernel.org/r/20170801134256.5400-2-hannes@cmpxchg.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/meminfo.c      | 10 +++++-----
 include/linux/swap.h   |  4 ++--
 include/linux/vmstat.h |  4 ++--
 mm/mmap.c              |  6 +++---
 mm/nommu.c             |  4 ++--
 mm/page-writeback.c    |  4 ++--
 mm/page_alloc.c        | 12 ++++++------
 mm/util.c              |  2 +-
 mm/vmstat.c            |  4 ++--
 9 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 509a61668d90..cdd979724c74 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -80,7 +80,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "Active(file):   ", pages[LRU_ACTIVE_FILE]);
 	show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
 	show_val_kb(m, "Unevictable:    ", pages[LRU_UNEVICTABLE]);
-	show_val_kb(m, "Mlocked:        ", global_page_state(NR_MLOCK));
+	show_val_kb(m, "Mlocked:        ", global_zone_page_state(NR_MLOCK));
 
 #ifdef CONFIG_HIGHMEM
 	show_val_kb(m, "HighTotal:      ", i.totalhigh);
@@ -114,9 +114,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "SUnreclaim:     ",
 		    global_node_page_state(NR_SLAB_UNRECLAIMABLE));
 	seq_printf(m, "KernelStack:    %8lu kB\n",
-		   global_page_state(NR_KERNEL_STACK_KB));
+		   global_zone_page_state(NR_KERNEL_STACK_KB));
 	show_val_kb(m, "PageTables:     ",
-		    global_page_state(NR_PAGETABLE));
+		    global_zone_page_state(NR_PAGETABLE));
 #ifdef CONFIG_QUICKLIST
 	show_val_kb(m, "Quicklists:     ", quicklist_total_size());
 #endif
@@ -124,7 +124,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "NFS_Unstable:   ",
 		    global_node_page_state(NR_UNSTABLE_NFS));
 	show_val_kb(m, "Bounce:         ",
-		    global_page_state(NR_BOUNCE));
+		    global_zone_page_state(NR_BOUNCE));
 	show_val_kb(m, "WritebackTmp:   ",
 		    global_node_page_state(NR_WRITEBACK_TEMP));
 	show_val_kb(m, "CommitLimit:    ", vm_commit_limit());
@@ -151,7 +151,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #ifdef CONFIG_CMA
 	show_val_kb(m, "CmaTotal:       ", totalcma_pages);
 	show_val_kb(m, "CmaFree:        ",
-		    global_page_state(NR_FREE_CMA_PAGES));
+		    global_zone_page_state(NR_FREE_CMA_PAGES));
 #endif
 
 	hugetlb_report_meminfo(m);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 461cf107ad52..76f1632eea5a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -263,8 +263,8 @@ extern unsigned long totalreserve_pages;
 extern unsigned long nr_free_buffer_pages(void);
 extern unsigned long nr_free_pagecache_pages(void);
 
-/* Definition of global_page_state not available yet */
-#define nr_free_pages() global_page_state(NR_FREE_PAGES)
+/* Definition of global_zone_page_state not available yet */
+#define nr_free_pages() global_zone_page_state(NR_FREE_PAGES)
 
 
 /* linux/mm/swap.c */
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index b3d85f30d424..97e11ab573f0 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -123,7 +123,7 @@ static inline void node_page_state_add(long x, struct pglist_data *pgdat,
 	atomic_long_add(x, &vm_node_stat[item]);
 }
 
-static inline unsigned long global_page_state(enum zone_stat_item item)
+static inline unsigned long global_zone_page_state(enum zone_stat_item item)
 {
 	long x = atomic_long_read(&vm_zone_stat[item]);
 #ifdef CONFIG_SMP
@@ -199,7 +199,7 @@ extern unsigned long sum_zone_node_page_state(int node,
 extern unsigned long node_page_state(struct pglist_data *pgdat,
 						enum node_stat_item item);
 #else
-#define sum_zone_node_page_state(node, item) global_page_state(item)
+#define sum_zone_node_page_state(node, item) global_zone_page_state(item)
 #define node_page_state(node, item) global_node_page_state(item)
 #endif /* CONFIG_NUMA */
 
diff --git a/mm/mmap.c b/mm/mmap.c
index f19efcf75418..9800e29763f4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3514,7 +3514,7 @@ static int init_user_reserve(void)
 {
 	unsigned long free_kbytes;
 
-	free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+	free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
 
 	sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
 	return 0;
@@ -3535,7 +3535,7 @@ static int init_admin_reserve(void)
 {
 	unsigned long free_kbytes;
 
-	free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+	free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
 
 	sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
 	return 0;
@@ -3579,7 +3579,7 @@ static int reserve_mem_notifier(struct notifier_block *nb,
 
 		break;
 	case MEM_OFFLINE:
-		free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+		free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
 
 		if (sysctl_user_reserve_kbytes > free_kbytes) {
 			init_user_reserve();
diff --git a/mm/nommu.c b/mm/nommu.c
index fc184f597d59..53d5175a5c14 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1962,7 +1962,7 @@ static int __meminit init_user_reserve(void)
 {
 	unsigned long free_kbytes;
 
-	free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+	free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
 
 	sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
 	return 0;
@@ -1983,7 +1983,7 @@ static int __meminit init_admin_reserve(void)
 {
 	unsigned long free_kbytes;
 
-	free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+	free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
 
 	sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
 	return 0;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bf050ab025b7..0b9c5cbe8eba 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -363,7 +363,7 @@ static unsigned long global_dirtyable_memory(void)
 {
 	unsigned long x;
 
-	x = global_page_state(NR_FREE_PAGES);
+	x = global_zone_page_state(NR_FREE_PAGES);
 	/*
 	 * Pages reserved for the kernel should not be considered
 	 * dirtyable, to prevent a situation where reclaim has to
@@ -1405,7 +1405,7 @@ void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
  * will look to see if it needs to start dirty throttling.
  *
  * If dirty_poll_interval is too low, big NUMA machines will call the expensive
- * global_page_state() too often. So scale it near-sqrt to the safety margin
+ * global_zone_page_state() too often. So scale it near-sqrt to the safety margin
  * (the number of pages we may dirty without exceeding the dirty limits).
  */
 static unsigned long dirty_poll_interval(unsigned long dirty,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0bea94af0423..a4562c058ec4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4509,7 +4509,7 @@ long si_mem_available(void)
 	 * Estimate the amount of memory available for userspace allocations,
 	 * without causing swapping.
 	 */
-	available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
+	available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
 
 	/*
 	 * Not all the page cache can be freed, otherwise the system will
@@ -4538,7 +4538,7 @@ void si_meminfo(struct sysinfo *val)
 {
 	val->totalram = totalram_pages;
 	val->sharedram = global_node_page_state(NR_SHMEM);
-	val->freeram = global_page_state(NR_FREE_PAGES);
+	val->freeram = global_zone_page_state(NR_FREE_PAGES);
 	val->bufferram = nr_blockdev_pages();
 	val->totalhigh = totalhigh_pages;
 	val->freehigh = nr_free_highpages();
@@ -4673,11 +4673,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 		global_node_page_state(NR_SLAB_UNRECLAIMABLE),
 		global_node_page_state(NR_FILE_MAPPED),
 		global_node_page_state(NR_SHMEM),
-		global_page_state(NR_PAGETABLE),
-		global_page_state(NR_BOUNCE),
-		global_page_state(NR_FREE_PAGES),
+		global_zone_page_state(NR_PAGETABLE),
+		global_zone_page_state(NR_BOUNCE),
+		global_zone_page_state(NR_FREE_PAGES),
 		free_pcp,
-		global_page_state(NR_FREE_CMA_PAGES));
+		global_zone_page_state(NR_FREE_CMA_PAGES));
 
 	for_each_online_pgdat(pgdat) {
 		if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
diff --git a/mm/util.c b/mm/util.c
index 9ecddf568fe3..34e57fae959d 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -614,7 +614,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 		return 0;
 
 	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
-		free = global_page_state(NR_FREE_PAGES);
+		free = global_zone_page_state(NR_FREE_PAGES);
 		free += global_node_page_state(NR_FILE_PAGES);
 
 		/*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e131b51654c7..ba9b202e8500 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1502,7 +1502,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 	if (!v)
 		return ERR_PTR(-ENOMEM);
 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-		v[i] = global_page_state(i);
+		v[i] = global_zone_page_state(i);
 	v += NR_VM_ZONE_STAT_ITEMS;
 
 	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
@@ -1591,7 +1591,7 @@ int vmstat_refresh(struct ctl_table *table, int write,
 	 * which can equally be echo'ed to or cat'ted from (by root),
 	 * can be used to update the stats just before reading them.
 	 *
-	 * Oh, and since global_page_state() etc. are so careful to hide
+	 * Oh, and since global_zone_page_state() etc. are so careful to hide
 	 * transiently negative values, report an error here if any of
 	 * the stats is negative, so we know to go looking for imbalance.
 	 */

From 2d6d6f5a09a96cc1fec7ed992b825e05f64cb50e Mon Sep 17 00:00:00 2001
From: Prakash Sangappa <prakash.sangappa@oracle.com>
Date: Wed, 6 Sep 2017 16:23:39 -0700
Subject: [PATCH 091/119] mm: userfaultfd: add feature to request for a signal
 delivery

In some cases, the userfaultfd mechanism should just deliver a SIGBUS
signal to the faulting process instead of the page-fault event.  Dealing
with page-fault events using a monitor thread can be an overhead in these
cases.  For example, applications like databases could use the signaling
mechanism for robustness purposes.

Databases use hugetlbfs for performance reasons.  Files on a hugetlbfs
filesystem are created and huge pages are allocated using the fallocate()
API.  Pages are deallocated/freed using fallocate() hole punching support.
These files are mmapped and accessed by many processes as shared memory.
The database keeps track of which offsets in the hugetlbfs file have
pages allocated.

Any access to a mapped address over a hole in the file, which can occur
due to bugs in the application, is considered invalid, and the process is
expected to simply receive a SIGBUS.  However, currently when a hole in
the file is accessed via the mapped address, kernel/mm attempts to
automatically allocate a page at page fault time, implicitly filling the
hole in the file.  This may not be the desired behavior for applications
like databases that want to explicitly manage page allocations of
hugetlbfs files.

Using the userfaultfd mechanism with this support to get a signal, a
database application can prevent pages from being allocated implicitly
when processes access a mapped address over holes in the file.

This patch adds the UFFD_FEATURE_SIGBUS feature to the userfaultfd
mechanism to request delivery of a SIGBUS signal instead of a page-fault
event.
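
For example, an application could enable the behavior roughly like this
(a minimal sketch; it assumes headers with UFFD_FEATURE_SIGBUS, i.e. this
patch applied, and error handling is trimmed):

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/userfaultfd.h>

	static int uffd_open_sigbus(void)
	{
		int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
		struct uffdio_api api = {
			.api = UFFD_API,
			.features = UFFD_FEATURE_SIGBUS,
		};

		if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) == -1)
			return -1;
		return uffd;
	}

Any region subsequently registered with UFFDIO_REGISTER then raises
SIGBUS on a missing-page access instead of queueing a page-fault event.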

See following for previous discussion about the database requirement
leading to this proposal as suggested by Andrea.

http://www.spinics.net/lists/linux-mm/msg129224.html

Link: http://lkml.kernel.org/r/1501552446-748335-2-git-send-email-prakash.sangappa@oracle.com
Signed-off-by: Prakash Sangappa <prakash.sangappa@oracle.com>
Reviewed-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c                 |  3 +++
 include/uapi/linux/userfaultfd.h | 10 +++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 01a85e2660b8..5fd4d846691f 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -370,6 +370,9 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
 	VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
 
+	if (ctx->features & UFFD_FEATURE_SIGBUS)
+		goto out;
+
 	/*
 	 * If it's already released don't get it. This avoids to loop
 	 * in __get_user_pages if userfaultfd_release waits on the
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 3b059530dac9..d39d5db56771 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -23,7 +23,8 @@
 			   UFFD_FEATURE_EVENT_REMOVE |	\
 			   UFFD_FEATURE_EVENT_UNMAP |		\
 			   UFFD_FEATURE_MISSING_HUGETLBFS |	\
-			   UFFD_FEATURE_MISSING_SHMEM)
+			   UFFD_FEATURE_MISSING_SHMEM |		\
+			   UFFD_FEATURE_SIGBUS)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -153,6 +154,12 @@ struct uffdio_api {
 	 * UFFD_FEATURE_MISSING_SHMEM works the same as
 	 * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem
 	 * (i.e. tmpfs and other shmem based APIs).
+	 *
+	 * UFFD_FEATURE_SIGBUS feature means no page-fault
+	 * (UFFD_EVENT_PAGEFAULT) event will be delivered, instead
+	 * a SIGBUS signal will be sent to the faulting process.
+	 * The application process can enable this behavior by adding
+	 * it to uffdio_api.features.
 	 */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
@@ -161,6 +168,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_MISSING_HUGETLBFS		(1<<4)
 #define UFFD_FEATURE_MISSING_SHMEM		(1<<5)
 #define UFFD_FEATURE_EVENT_UNMAP		(1<<6)
+#define UFFD_FEATURE_SIGBUS			(1<<7)
 	__u64 features;
 
 	__u64 ioctls;

From 81aac3a15ef3e85952daebd199df9c8f1eb24f84 Mon Sep 17 00:00:00 2001
From: Prakash Sangappa <prakash.sangappa@oracle.com>
Date: Wed, 6 Sep 2017 16:23:43 -0700
Subject: [PATCH 092/119] userfaultfd: selftest: add tests for
 UFFD_FEATURE_SIGBUS feature

Add tests for the UFFD_FEATURE_SIGBUS feature.  The tests verify signal
delivery instead of userfault events.  They also test the use of
UFFDIO_COPY to allocate memory and retry accessing the monitored area
after signal delivery.

Also fix a bug in uffd_poll_thread() where 'uffd' is leaked.

Link: http://lkml.kernel.org/r/1501552446-748335-3-git-send-email-prakash.sangappa@oracle.com
Signed-off-by: Prakash Sangappa <prakash.sangappa@oracle.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/vm/userfaultfd.c | 127 ++++++++++++++++++++++-
 1 file changed, 124 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index b0c9263f5df7..7db6299b2f0d 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -66,6 +66,7 @@
 #include <sys/wait.h>
 #include <pthread.h>
 #include <linux/userfaultfd.h>
+#include <setjmp.h>
 
 #ifdef __NR_userfaultfd
 
@@ -408,6 +409,7 @@ static void *uffd_poll_thread(void *arg)
 				userfaults++;
 			break;
 		case UFFD_EVENT_FORK:
+			close(uffd);
 			uffd = msg.arg.fork.ufd;
 			pollfd[0].fd = uffd;
 			break;
@@ -572,6 +574,17 @@ static int userfaultfd_open(int features)
 	return 0;
 }
 
+sigjmp_buf jbuf, *sigbuf;
+
+static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
+{
+	if (sig == SIGBUS) {
+		if (sigbuf)
+			siglongjmp(*sigbuf, 1);
+		abort();
+	}
+}
+
 /*
  * For non-cooperative userfaultfd test we fork() a process that will
  * generate pagefaults, will mremap the area monitored by the
@@ -585,19 +598,59 @@ static int userfaultfd_open(int features)
  * The release of the pages currently generates event for shmem and
  * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
  * for hugetlb.
+ * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
+ * monitored area, generate pagefaults and test that signal is delivered.
+ * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
+ * test robustness use case - we release monitored area, fork a process
+ * that will generate pagefaults and verify signal is generated.
+ * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
+ * feature. Using monitor thread, verify no userfault events are generated.
  */
-static int faulting_process(void)
+static int faulting_process(int signal_test)
 {
 	unsigned long nr;
 	unsigned long long count;
 	unsigned long split_nr_pages;
+	unsigned long lastnr;
+	struct sigaction act;
+	unsigned long signalled = 0;
 
 	if (test_type != TEST_HUGETLB)
 		split_nr_pages = (nr_pages + 1) / 2;
 	else
 		split_nr_pages = nr_pages;
 
+	if (signal_test) {
+		sigbuf = &jbuf;
+		memset(&act, 0, sizeof(act));
+		act.sa_sigaction = sighndl;
+		act.sa_flags = SA_SIGINFO;
+		if (sigaction(SIGBUS, &act, 0)) {
+			perror("sigaction");
+			return 1;
+		}
+		lastnr = (unsigned long)-1;
+	}
+
 	for (nr = 0; nr < split_nr_pages; nr++) {
+		if (signal_test) {
+			if (sigsetjmp(*sigbuf, 1) != 0) {
+				if (nr == lastnr) {
+					fprintf(stderr, "Signal repeated\n");
+					return 1;
+				}
+
+				lastnr = nr;
+				if (signal_test == 1) {
+					if (copy_page(uffd, nr * page_size))
+						signalled++;
+				} else {
+					signalled++;
+					continue;
+				}
+			}
+		}
+
 		count = *area_count(area_dst, nr);
 		if (count != count_verify[nr]) {
 			fprintf(stderr,
@@ -607,6 +660,9 @@ static int faulting_process(void)
 		}
 	}
 
+	if (signal_test)
+		return signalled != split_nr_pages;
+
 	if (test_type == TEST_HUGETLB)
 		return 0;
 
@@ -761,7 +817,7 @@ static int userfaultfd_events_test(void)
 		perror("fork"), exit(1);
 
 	if (!pid)
-		return faulting_process();
+		return faulting_process(0);
 
 	waitpid(pid, &err, 0);
 	if (err)
@@ -778,6 +834,70 @@ static int userfaultfd_events_test(void)
 	return userfaults != nr_pages;
 }
 
+static int userfaultfd_sig_test(void)
+{
+	struct uffdio_register uffdio_register;
+	unsigned long expected_ioctls;
+	unsigned long userfaults;
+	pthread_t uffd_mon;
+	int err, features;
+	pid_t pid;
+	char c;
+
+	printf("testing signal delivery: ");
+	fflush(stdout);
+
+	if (uffd_test_ops->release_pages(area_dst))
+		return 1;
+
+	features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
+	if (userfaultfd_open(features) < 0)
+		return 1;
+	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+
+	uffdio_register.range.start = (unsigned long) area_dst;
+	uffdio_register.range.len = nr_pages * page_size;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+		fprintf(stderr, "register failure\n"), exit(1);
+
+	expected_ioctls = uffd_test_ops->expected_ioctls;
+	if ((uffdio_register.ioctls & expected_ioctls) !=
+	    expected_ioctls)
+		fprintf(stderr,
+			"unexpected missing ioctl for anon memory\n"),
+			exit(1);
+
+	if (faulting_process(1))
+		fprintf(stderr, "faulting process failed\n"), exit(1);
+
+	if (uffd_test_ops->release_pages(area_dst))
+		return 1;
+
+	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL))
+		perror("uffd_poll_thread create"), exit(1);
+
+	pid = fork();
+	if (pid < 0)
+		perror("fork"), exit(1);
+
+	if (!pid)
+		exit(faulting_process(2));
+
+	waitpid(pid, &err, 0);
+	if (err)
+		fprintf(stderr, "faulting process failed\n"), exit(1);
+
+	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+		perror("pipe write"), exit(1);
+	if (pthread_join(uffd_mon, (void **)&userfaults))
+		return 1;
+
+	printf("done.\n");
+	printf(" Signal test userfaults: %ld\n", userfaults);
+	close(uffd);
+	return userfaults != 0;
+}
 static int userfaultfd_stress(void)
 {
 	void *area;
@@ -946,7 +1066,8 @@ static int userfaultfd_stress(void)
 		return err;
 
 	close(uffd);
-	return userfaultfd_zeropage_test() || userfaultfd_events_test();
+	return userfaultfd_zeropage_test() || userfaultfd_sig_test()
+		|| userfaultfd_events_test();
 }
 
 /*

From 67e803281d1ce26daee4f1c0a489cad27b2a583c Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 6 Sep 2017 16:23:46 -0700
Subject: [PATCH 093/119] userfaultfd: selftest: exercise UFFDIO_COPY/ZEROPAGE
 -EEXIST

This retries the UFFDIO_COPY/ZEROPAGE to verify that it returns -EEXIST,
at the first invocation and then again every 10 seconds.

In the filebacked MAP_SHARED case this also verifies the -EEXIST
triggered in the filesystem pagecache insertion, if the offset in the
file was not a hole.

shmem MAP_SHARED tries to index the newly allocated pagecache in the
radix tree before checking the pagetable so it doesn't need any
assistance to exercise that case.

hugetlbfs checks that the pmd is not none before trying to index the
hugetlbfs page in the radix tree, so it requires running UFFDIO_COPY into
an alias mapping (the alternative would be to use MADV_DONTNEED to zap
only the pagetables, but that doesn't work on hugetlbfs).

[akpm@linux-foundation.org: fix uffdio_zeropage(), per Mike Kravetz]
Link: http://lkml.kernel.org/r/20170802165145.22628-3-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Alexey Perevalov <a.perevalov@samsung.com>
Cc: Maxime Coquelin <maxime.coquelin@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/vm/userfaultfd.c | 148 +++++++++++++++++++++--
 1 file changed, 140 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 7db6299b2f0d..4549ae425f3e 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -67,6 +67,7 @@
 #include <pthread.h>
 #include <linux/userfaultfd.h>
 #include <setjmp.h>
+#include <stdbool.h>
 
 #ifdef __NR_userfaultfd
 
@@ -83,11 +84,17 @@ static int bounces;
 #define TEST_SHMEM	3
 static int test_type;
 
+/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
+#define ALARM_INTERVAL_SECS 10
+static volatile bool test_uffdio_copy_eexist = true;
+static volatile bool test_uffdio_zeropage_eexist = true;
+
+static bool map_shared;
 static int huge_fd;
 static char *huge_fd_off0;
 static unsigned long long *count_verify;
 static int uffd, uffd_flags, finished, *pipefd;
-static char *area_src, *area_dst;
+static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
 static char *zeropage;
 pthread_attr_t attr;
 
@@ -126,6 +133,9 @@ static void anon_allocate_area(void **alloc_area)
 	}
 }
 
+static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+}
 
 /* HugeTLB memory */
 static int hugetlb_release_pages(char *rel_area)
@@ -146,17 +156,51 @@ static int hugetlb_release_pages(char *rel_area)
 
 static void hugetlb_allocate_area(void **alloc_area)
 {
+	void *area_alias = NULL;
+	char **alloc_area_alias;
 	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
-				MAP_PRIVATE | MAP_HUGETLB, huge_fd,
-				*alloc_area == area_src ? 0 :
-				nr_pages * page_size);
+			   (map_shared ? MAP_SHARED : MAP_PRIVATE) |
+			   MAP_HUGETLB,
+			   huge_fd, *alloc_area == area_src ? 0 :
+			   nr_pages * page_size);
 	if (*alloc_area == MAP_FAILED) {
 		fprintf(stderr, "mmap of hugetlbfs file failed\n");
 		*alloc_area = NULL;
 	}
 
-	if (*alloc_area == area_src)
+	if (map_shared) {
+		area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+				  MAP_SHARED | MAP_HUGETLB,
+				  huge_fd, *alloc_area == area_src ? 0 :
+				  nr_pages * page_size);
+		if (area_alias == MAP_FAILED) {
+			if (munmap(*alloc_area, nr_pages * page_size) < 0)
+				perror("hugetlb munmap"), exit(1);
+			*alloc_area = NULL;
+			return;
+		}
+	}
+	if (*alloc_area == area_src) {
 		huge_fd_off0 = *alloc_area;
+		alloc_area_alias = &area_src_alias;
+	} else {
+		alloc_area_alias = &area_dst_alias;
+	}
+	if (area_alias)
+		*alloc_area_alias = area_alias;
+}
+
+static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+	if (!map_shared)
+		return;
+	/*
+	 * We can't zap just the pagetable with hugetlbfs because
+	 * MADV_DONTEED won't work. So exercise -EEXIST on a alias
+	 * mapping where the pagetables are not established initially,
+	 * this way we'll exercise the -EEXEC at the fs level.
+	 */
+	*start = (unsigned long) area_dst_alias + offset;
 }
 
 /* Shared memory */
@@ -186,6 +230,7 @@ struct uffd_test_ops {
 	unsigned long expected_ioctls;
 	void (*allocate_area)(void **alloc_area);
 	int (*release_pages)(char *rel_area);
+	void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
 };
 
 #define ANON_EXPECTED_IOCTLS		((1 << _UFFDIO_WAKE) | \
@@ -196,18 +241,21 @@ static struct uffd_test_ops anon_uffd_test_ops = {
 	.expected_ioctls = ANON_EXPECTED_IOCTLS,
 	.allocate_area	= anon_allocate_area,
 	.release_pages	= anon_release_pages,
+	.alias_mapping = noop_alias_mapping,
 };
 
 static struct uffd_test_ops shmem_uffd_test_ops = {
 	.expected_ioctls = ANON_EXPECTED_IOCTLS,
 	.allocate_area	= shmem_allocate_area,
 	.release_pages	= shmem_release_pages,
+	.alias_mapping = noop_alias_mapping,
 };
 
 static struct uffd_test_ops hugetlb_uffd_test_ops = {
 	.expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC,
 	.allocate_area	= hugetlb_allocate_area,
 	.release_pages	= hugetlb_release_pages,
+	.alias_mapping = hugetlb_alias_mapping,
 };
 
 static struct uffd_test_ops *uffd_test_ops;
@@ -332,6 +380,23 @@ static void *locking_thread(void *arg)
 	return NULL;
 }
 
+static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
+			    unsigned long offset)
+{
+	uffd_test_ops->alias_mapping(&uffdio_copy->dst,
+				     uffdio_copy->len,
+				     offset);
+	if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
+		/* real retval in ufdio_copy.copy */
+		if (uffdio_copy->copy != -EEXIST)
+			fprintf(stderr, "UFFDIO_COPY retry error %Ld\n",
+				uffdio_copy->copy), exit(1);
+	} else {
+		fprintf(stderr,	"UFFDIO_COPY retry unexpected %Ld\n",
+			uffdio_copy->copy), exit(1);
+	}
+}
+
 static int copy_page(int ufd, unsigned long offset)
 {
 	struct uffdio_copy uffdio_copy;
@@ -352,8 +417,13 @@ static int copy_page(int ufd, unsigned long offset)
 	} else if (uffdio_copy.copy != page_size) {
 		fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n",
 			uffdio_copy.copy), exit(1);
-	} else
+	} else {
+		if (test_uffdio_copy_eexist) {
+			test_uffdio_copy_eexist = false;
+			retry_copy_page(ufd, &uffdio_copy, offset);
+		}
 		return 1;
+	}
 	return 0;
 }
 
@@ -692,6 +762,23 @@ static int faulting_process(int signal_test)
 	return 0;
 }
 
+static void retry_uffdio_zeropage(int ufd,
+				  struct uffdio_zeropage *uffdio_zeropage,
+				  unsigned long offset)
+{
+	uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
+				     uffdio_zeropage->range.len,
+				     offset);
+	if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
+		if (uffdio_zeropage->zeropage != -EEXIST)
+			fprintf(stderr, "UFFDIO_ZEROPAGE retry error %Ld\n",
+				uffdio_zeropage->zeropage), exit(1);
+	} else {
+		fprintf(stderr, "UFFDIO_ZEROPAGE retry unexpected %Ld\n",
+			uffdio_zeropage->zeropage), exit(1);
+	}
+}
+
 static int uffdio_zeropage(int ufd, unsigned long offset)
 {
 	struct uffdio_zeropage uffdio_zeropage;
@@ -726,8 +813,14 @@ static int uffdio_zeropage(int ufd, unsigned long offset)
 		if (uffdio_zeropage.zeropage != page_size) {
 			fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n",
 				uffdio_zeropage.zeropage), exit(1);
-		} else
+		} else {
+			if (test_uffdio_zeropage_eexist) {
+				test_uffdio_zeropage_eexist = false;
+				retry_uffdio_zeropage(ufd, &uffdio_zeropage,
+						      offset);
+			}
 			return 1;
+		}
 	} else {
 		fprintf(stderr,
 			"UFFDIO_ZEROPAGE succeeded %Ld\n",
@@ -999,6 +1092,15 @@ static int userfaultfd_stress(void)
 			return 1;
 		}
 
+		if (area_dst_alias) {
+			uffdio_register.range.start = (unsigned long)
+				area_dst_alias;
+			if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
+				fprintf(stderr, "register failure alias\n");
+				return 1;
+			}
+		}
+
 		/*
 		 * The madvise done previously isn't enough: some
 		 * uffd_thread could have read userfaults (one of
@@ -1032,9 +1134,17 @@ static int userfaultfd_stress(void)
 
 		/* unregister */
 		if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
-			fprintf(stderr, "register failure\n");
+			fprintf(stderr, "unregister failure\n");
 			return 1;
 		}
+		if (area_dst_alias) {
+			uffdio_register.range.start = (unsigned long) area_dst;
+			if (ioctl(uffd, UFFDIO_UNREGISTER,
+				  &uffdio_register.range)) {
+				fprintf(stderr, "unregister failure alias\n");
+				return 1;
+			}
+		}
 
 		/* verification */
 		if (bounces & BOUNCE_VERIFY) {
@@ -1056,6 +1166,10 @@ static int userfaultfd_stress(void)
 		area_src = area_dst;
 		area_dst = tmp_area;
 
+		tmp_area = area_src_alias;
+		area_src_alias = area_dst_alias;
+		area_dst_alias = tmp_area;
+
 		printf("userfaults:");
 		for (cpu = 0; cpu < nr_cpus; cpu++)
 			printf(" %lu", userfaults[cpu]);
@@ -1102,7 +1216,12 @@ static void set_test_type(const char *type)
 	} else if (!strcmp(type, "hugetlb")) {
 		test_type = TEST_HUGETLB;
 		uffd_test_ops = &hugetlb_uffd_test_ops;
+	} else if (!strcmp(type, "hugetlb_shared")) {
+		map_shared = true;
+		test_type = TEST_HUGETLB;
+		uffd_test_ops = &hugetlb_uffd_test_ops;
 	} else if (!strcmp(type, "shmem")) {
+		map_shared = true;
 		test_type = TEST_SHMEM;
 		uffd_test_ops = &shmem_uffd_test_ops;
 	} else {
@@ -1122,12 +1241,25 @@ static void set_test_type(const char *type)
 		fprintf(stderr, "Impossible to run this test\n"), exit(2);
 }
 
+static void sigalrm(int sig)
+{
+	if (sig != SIGALRM)
+		abort();
+	test_uffdio_copy_eexist = true;
+	test_uffdio_zeropage_eexist = true;
+	alarm(ALARM_INTERVAL_SECS);
+}
+
 int main(int argc, char **argv)
 {
 	if (argc < 4)
 		fprintf(stderr, "Usage: <test type> <MiB> <bounces> [hugetlbfs_file]\n"),
 				exit(1);
 
+	if (signal(SIGALRM, sigalrm) == SIG_ERR)
+		fprintf(stderr, "failed to arm SIGALRM"), exit(1);
+	alarm(ALARM_INTERVAL_SECS);
+
 	set_test_type(argv[1]);
 
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);

From d312cb1e4884c606bafe6499fade2f91ccc2e944 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 6 Sep 2017 16:23:49 -0700
Subject: [PATCH 094/119] userfaultfd: selftest: explicit failure if the SIGBUS
 test failed

Showing zero in the output isn't very self-explanatory as a successful
result.  Show more explicit error output if the test fails.

Link: http://lkml.kernel.org/r/20170802165145.22628-4-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Alexey Perevalov <a.perevalov@samsung.com>
Cc: Maxime Coquelin <maxime.coquelin@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/vm/userfaultfd.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 4549ae425f3e..a2c53a3d223d 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -987,7 +987,9 @@ static int userfaultfd_sig_test(void)
 		return 1;
 
 	printf("done.\n");
-	printf(" Signal test userfaults: %ld\n", userfaults);
+	if (userfaults)
+		fprintf(stderr, "Signal test failed, userfaults: %ld\n",
+			userfaults);
 	close(uffd);
 	return userfaults != 0;
 }

From 2376dd7ceddae67432db055ff3f2b7f4122a919d Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 6 Sep 2017 16:23:53 -0700
Subject: [PATCH 095/119] userfaultfd: call userfaultfd_unmap_prep only if
 __split_vma succeeds

A __split_vma is not a worthy event to report, and it's definitely not
an unmap, so it would be incorrect to report an unmap for the whole
region to the userfaultfd manager if a __split_vma fails.

So only call userfaultfd_unmap_prep after the vma splitting (__split_vma)
is over and do_munmap cannot fail anymore.

Also add unlikely() because it's better to optimize for the vast
majority of apps that aren't using userfaultfd in a non-cooperative way.
Ideally we should also find a way to eliminate the branch entirely when
CONFIG_USERFAULTFD=n, but it would complicate things, so stick to
unlikely() for now.

Link: http://lkml.kernel.org/r/20170802165145.22628-5-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Alexey Perevalov <a.perevalov@samsung.com>
Cc: Maxime Coquelin <maxime.coquelin@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 9800e29763f4..52f6c6b18f40 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2639,13 +2639,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
 	if (vma->vm_start >= end)
 		return 0;
 
-	if (uf) {
-		int error = userfaultfd_unmap_prep(vma, start, end, uf);
-
-		if (error)
-			return error;
-	}
-
 	/*
 	 * If we need to split any vma, do it now to save pain later.
 	 *
@@ -2679,6 +2672,21 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
 	}
 	vma = prev ? prev->vm_next : mm->mmap;
 
+	if (unlikely(uf)) {
+		/*
+		 * If userfaultfd_unmap_prep returns an error the vmas
+		 * will remain splitted, but userland will get a
+		 * highly unexpected error anyway. This is no
+		 * different than the case where the first of the two
+		 * __split_vma fails, but we don't undo the first
+		 * split, despite we could. This is unlikely enough
+		 * failure that it's not worth optimizing it for.
+		 */
+		int error = userfaultfd_unmap_prep(vma, start, end, uf);
+		if (error)
+			return error;
+	}
+
 	/*
 	 * unlock any mlock()ed ranges before detaching vmas
 	 */

From 9d4ac934829ac58c5109c49e6dfe677300e5e652 Mon Sep 17 00:00:00 2001
From: Alexey Perevalov <a.perevalov@samsung.com>
Date: Wed, 6 Sep 2017 16:23:56 -0700
Subject: [PATCH 096/119] userfaultfd: provide pid in userfault msg

This could be useful for calculating per-vCPU downtime during postcopy
live migration.  A side observer or the application itself will be
informed about the task's sleep during userfaultfd processing.

The process's thread id is provided when the user requests it by setting
the UFFD_FEATURE_THREAD_ID bit in uffdio_api.features.

Link: http://lkml.kernel.org/r/20170802165145.22628-6-aarcange@redhat.com
Signed-off-by: Alexey Perevalov <a.perevalov@samsung.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Maxime Coquelin <maxime.coquelin@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c                 |  8 ++++++--
 include/uapi/linux/userfaultfd.h | 10 +++++++---
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 5fd4d846691f..665bf7a930b2 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -178,7 +178,8 @@ static inline void msg_init(struct uffd_msg *msg)
 
 static inline struct uffd_msg userfault_msg(unsigned long address,
 					    unsigned int flags,
-					    unsigned long reason)
+					    unsigned long reason,
+					    unsigned int features)
 {
 	struct uffd_msg msg;
 	msg_init(&msg);
@@ -202,6 +203,8 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
 		 * write protect fault.
 		 */
 		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+	if (features & UFFD_FEATURE_THREAD_ID)
+		msg.arg.pagefault.ptid = task_pid_vnr(current);
 	return msg;
 }
 
@@ -422,7 +425,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
 
 	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
 	uwq.wq.private = current;
-	uwq.msg = userfault_msg(vmf->address, vmf->flags, reason);
+	uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
+			ctx->features);
 	uwq.ctx = ctx;
 	uwq.waken = false;
 
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index d39d5db56771..2b24c28d99a7 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -24,7 +24,8 @@
 			   UFFD_FEATURE_EVENT_UNMAP |		\
 			   UFFD_FEATURE_MISSING_HUGETLBFS |	\
 			   UFFD_FEATURE_MISSING_SHMEM |		\
-			   UFFD_FEATURE_SIGBUS)
+			   UFFD_FEATURE_SIGBUS |		\
+			   UFFD_FEATURE_THREAD_ID)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -79,6 +80,7 @@ struct uffd_msg {
 		struct {
 			__u64	flags;
 			__u64	address;
+			__u32   ptid;
 		} pagefault;
 
 		struct {
@@ -158,8 +160,9 @@ struct uffdio_api {
 	 * UFFD_FEATURE_SIGBUS feature means no page-fault
 	 * (UFFD_EVENT_PAGEFAULT) event will be delivered, instead
 	 * a SIGBUS signal will be sent to the faulting process.
-	 * The application process can enable this behavior by adding
-	 * it to uffdio_api.features.
+	 *
+	 * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will
+	 * be returned, if feature is not requested 0 will be returned.
 	 */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
@@ -169,6 +172,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_MISSING_SHMEM		(1<<5)
 #define UFFD_FEATURE_EVENT_UNMAP		(1<<6)
 #define UFFD_FEATURE_SIGBUS			(1<<7)
+#define UFFD_FEATURE_THREAD_ID			(1<<8)
 	__u64 features;
 
 	__u64 ioctls;

From a36985d31a65d5c0559fb582719e32eaf0ccec3b Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 6 Sep 2017 16:23:59 -0700
Subject: [PATCH 097/119] userfaultfd: provide pid in userfault msg - add feat
 union

No ABI change, but this will make it more explicit to software that ptid
is only available if requested by passing UFFD_FEATURE_THREAD_ID to
UFFDIO_API.  The fact that it's a union also self-documents that it
shouldn't be taken for granted that there's a ptid there.
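
For example, a monitor that requested UFFD_FEATURE_THREAD_ID at
UFFDIO_API time could read the thread id roughly like this (a sketch
only; it assumes headers with the feat union, i.e. this patch applied,
and error handling is trimmed):

	#include <stdio.h>
	#include <unistd.h>
	#include <linux/userfaultfd.h>

	static void report_fault(int uffd)
	{
		struct uffd_msg msg;

		if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
			return;
		if (msg.event == UFFD_EVENT_PAGEFAULT)
			printf("fault at 0x%llx, tid %u\n",
			       (unsigned long long)msg.arg.pagefault.address,
			       msg.arg.pagefault.feat.ptid);
	}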

Link: http://lkml.kernel.org/r/20170802165145.22628-7-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Alexey Perevalov <a.perevalov@samsung.com>
Cc: Maxime Coquelin <maxime.coquelin@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c                 | 2 +-
 include/uapi/linux/userfaultfd.h | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 665bf7a930b2..5419e7da82ba 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -204,7 +204,7 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
 		 */
 		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
 	if (features & UFFD_FEATURE_THREAD_ID)
-		msg.arg.pagefault.ptid = task_pid_vnr(current);
+		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
 	return msg;
 }
 
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 2b24c28d99a7..d6d1f65cb3c3 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -80,7 +80,9 @@ struct uffd_msg {
 		struct {
 			__u64	flags;
 			__u64	address;
-			__u32   ptid;
+			union {
+				__u32 ptid;
+			} feat;
 		} pagefault;
 
 		struct {

From 79b63f12abcbbd2caf7064b294af648a87de07ff Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 6 Sep 2017 16:24:03 -0700
Subject: [PATCH 098/119] mm, hugetlb: do not allocate non-migrateable gigantic
 pages from movable zones

alloc_gigantic_page doesn't consider the movability of the gigantic
hugetlb page when scanning eligible ranges for the allocation.  As 1GB
hugetlb pages are not currently movable, this can break the movable zone
assumption that all allocations are migratable and, as such, break
memory hotplug.

Reorganize the code and use the standard zonelist allocation scheme
that we use for standard hugetlb pages.  htlb_alloc_mask will ensure
that only migratable hugetlb pages will ever see a movable zone.

Link: http://lkml.kernel.org/r/20170803083549.21407-1-mhocko@kernel.org
Fixes: 944d9fec8d7a ("hugetlb: add support for gigantic page allocation at runtime")
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cafd60316e68..34625b257128 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1066,11 +1066,11 @@ static void free_gigantic_page(struct page *page, unsigned int order)
 }
 
 static int __alloc_gigantic_page(unsigned long start_pfn,
-				unsigned long nr_pages)
+				unsigned long nr_pages, gfp_t gfp_mask)
 {
 	unsigned long end_pfn = start_pfn + nr_pages;
 	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
-				  GFP_KERNEL);
+				  gfp_mask);
 }
 
 static bool pfn_range_valid_gigantic(struct zone *z,
@@ -1108,19 +1108,24 @@ static bool zone_spans_last_pfn(const struct zone *zone,
 	return zone_spans_pfn(zone, last_pfn);
 }
 
-static struct page *alloc_gigantic_page(int nid, unsigned int order)
+static struct page *alloc_gigantic_page(int nid, struct hstate *h)
 {
+	unsigned int order = huge_page_order(h);
 	unsigned long nr_pages = 1 << order;
 	unsigned long ret, pfn, flags;
-	struct zone *z;
+	struct zonelist *zonelist;
+	struct zone *zone;
+	struct zoneref *z;
+	gfp_t gfp_mask;
 
-	z = NODE_DATA(nid)->node_zones;
-	for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
-		spin_lock_irqsave(&z->lock, flags);
+	gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+	zonelist = node_zonelist(nid, gfp_mask);
+	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) {
+		spin_lock_irqsave(&zone->lock, flags);
 
-		pfn = ALIGN(z->zone_start_pfn, nr_pages);
-		while (zone_spans_last_pfn(z, pfn, nr_pages)) {
-			if (pfn_range_valid_gigantic(z, pfn, nr_pages)) {
+		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
+		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
+			if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
 				/*
 				 * We release the zone lock here because
 				 * alloc_contig_range() will also lock the zone
@@ -1128,16 +1133,16 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order)
 				 * spinning on this lock, it may win the race
 				 * and cause alloc_contig_range() to fail...
 				 */
-				spin_unlock_irqrestore(&z->lock, flags);
-				ret = __alloc_gigantic_page(pfn, nr_pages);
+				spin_unlock_irqrestore(&zone->lock, flags);
+				ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
 				if (!ret)
 					return pfn_to_page(pfn);
-				spin_lock_irqsave(&z->lock, flags);
+				spin_lock_irqsave(&zone->lock, flags);
 			}
 			pfn += nr_pages;
 		}
 
-		spin_unlock_irqrestore(&z->lock, flags);
+		spin_unlock_irqrestore(&zone->lock, flags);
 	}
 
 	return NULL;
@@ -1150,7 +1155,7 @@ static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
 {
 	struct page *page;
 
-	page = alloc_gigantic_page(nid, huge_page_order(h));
+	page = alloc_gigantic_page(nid, h);
 	if (page) {
 		prep_compound_gigantic_page(page, huge_page_order(h));
 		prep_new_huge_page(h, page, nid);

From 88d6ac40c1c6b2334046d3f89f1911a3c8febd4c Mon Sep 17 00:00:00 2001
From: Wen Yang <wen.yang99@zte.com.cn>
Date: Wed, 6 Sep 2017 16:24:06 -0700
Subject: [PATCH 099/119] mm/vmstat: fix divide error at __fragmentation_index

When order is -1 or too big, *1UL << order* will be 0, which will cause
a divide error.  Although it seems that all callers of
__fragmentation_index() only call it with a valid order, the patch makes
it more robust.

Should prevent reoccurrences of
https://bugzilla.kernel.org/show_bug.cgi?id=196555

Link: http://lkml.kernel.org/r/1501751520-2598-1-git-send-email-wen.yang99@zte.com.cn
Signed-off-by: Wen Yang <wen.yang99@zte.com.cn>
Reviewed-by: Jiang Biao <jiang.biao2@zte.com.cn>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmstat.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/vmstat.c b/mm/vmstat.c
index ba9b202e8500..05de233a6fca 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -870,6 +870,9 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in
 {
 	unsigned long requested = 1UL << order;
 
+	if (WARN_ON_ONCE(order >= MAX_ORDER))
+		return 0;
+
 	if (!info->free_blocks_total)
 		return 0;
 

From c568da282bbc8f09c4b49201177fa259fe184c47 Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Wed, 6 Sep 2017 16:24:09 -0700
Subject: [PATCH 100/119] mm/vmalloc.c: halve the number of comparisons
 performed in pcpu_get_vm_areas()

pcpu_get_vm_areas() checks that the requested ranges do not overlap.  To
make sure of that, only (N^2)/2 comparisons are necessary, while the
current code does N^2.  By starting each inner scan from the next range,
the same goal is achieved and the continue can be removed.

Also,

 - the overlap check of two ranges can be done with a single condition
   (see the sketch below)

 - a typo in a comment is fixed.
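
For reference (not part of the patch itself): two ranges [start, end) and
[start2, end2) overlap exactly when

	start2 < end && start < end2

which is the single condition the new BUG_ON uses.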

Link: http://lkml.kernel.org/r/20170803063822.48702-1-richard.weiyang@gmail.com
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a47e3894c775..fa409c9092be 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2482,7 +2482,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
  * matching slot.  While scanning, if any of the areas overlaps with
  * existing vmap_area, the base address is pulled down to fit the
  * area.  Scanning is repeated till all the areas fit and then all
- * necessary data structres are inserted and the result is returned.
+ * necessary data structures are inserted and the result is returned.
  */
 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 				     const size_t *sizes, int nr_vms,
@@ -2510,15 +2510,11 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 		if (start > offsets[last_area])
 			last_area = area;
 
-		for (area2 = 0; area2 < nr_vms; area2++) {
+		for (area2 = area + 1; area2 < nr_vms; area2++) {
 			unsigned long start2 = offsets[area2];
 			unsigned long end2 = start2 + sizes[area2];
 
-			if (area2 == area)
-				continue;
-
-			BUG_ON(start2 >= start && start2 < end);
-			BUG_ON(end2 <= end && end2 > start);
+			BUG_ON(start2 < end && start < end2);
 		}
 	}
 	last_end = offsets[last_area] + sizes[last_area];

From ab1b597ee0e4208a1db227bb7b2c9512c8234b48 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 6 Sep 2017 16:24:13 -0700
Subject: [PATCH 101/119] mm, devm_memremap_pages: use multi-order radix for
 ZONE_DEVICE lookups

devm_memremap_pages() records mapped ranges in pgmap_radix with an entry
per section's worth of memory (128MB).  The key for each of those
entries is a section number.

This leads to false positives when devm_memremap_pages() is passed a
section-unaligned range, as lookups in the misaligned portion fail to
return NULL.  We can close this hole by using the pfn as the key for
entries in the tree.  The number of entries required to describe a
remapped range is reduced by leveraging multi-order entries.

In practice this approach usually yields just one entry in the tree if
the size and starting address are of the same power-of-2 alignment.
Previously we always needed nr_entries = mapping_size / 128MB.
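
For example (assuming 4KiB pages): a 1GiB mapping whose start is 1GiB
aligned is covered by a single order-18 entry (2^18 pages), whereas the
old per-section scheme needed 1GiB / 128MiB = 8 entries keyed by section
number.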

Link: https://lists.01.org/pipermail/linux-nvdimm/2016-August/006666.html
Link: http://lkml.kernel.org/r/150215410565.39310.13767886055248249438.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/memremap.c | 52 ++++++++++++++++++++++++++++++++++-------------
 mm/Kconfig        |  1 +
 2 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/kernel/memremap.c b/kernel/memremap.c
index 9afdc434fb49..066e73c2fcc9 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -194,18 +194,41 @@ struct page_map {
 	struct vmem_altmap altmap;
 };
 
+static unsigned long order_at(struct resource *res, unsigned long pgoff)
+{
+	unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
+	unsigned long nr_pages, mask;
+
+	nr_pages = PHYS_PFN(resource_size(res));
+	if (nr_pages == pgoff)
+		return ULONG_MAX;
+
+	/*
+	 * What is the largest aligned power-of-2 range available from
+	 * this resource pgoff to the end of the resource range,
+	 * considering the alignment of the current pgoff?
+	 */
+	mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
+	if (!mask)
+		return ULONG_MAX;
+
+	return find_first_bit(&mask, BITS_PER_LONG);
+}
+
+#define foreach_order_pgoff(res, order, pgoff) \
+	for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
+			pgoff += 1UL << order, order = order_at((res), pgoff))
+
 static void pgmap_radix_release(struct resource *res)
 {
-	resource_size_t key, align_start, align_size, align_end;
-
-	align_start = res->start & ~(SECTION_SIZE - 1);
-	align_size = ALIGN(resource_size(res), SECTION_SIZE);
-	align_end = align_start + align_size - 1;
+	unsigned long pgoff, order;
 
 	mutex_lock(&pgmap_lock);
-	for (key = res->start; key <= res->end; key += SECTION_SIZE)
-		radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT);
+	foreach_order_pgoff(res, order, pgoff)
+		radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff);
 	mutex_unlock(&pgmap_lock);
+
+	synchronize_rcu();
 }
 
 static unsigned long pfn_first(struct page_map *page_map)
@@ -268,7 +291,7 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
-	page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT);
+	page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
 	return page_map ? &page_map->pgmap : NULL;
 }
 
@@ -293,12 +316,12 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
 void *devm_memremap_pages(struct device *dev, struct resource *res,
 		struct percpu_ref *ref, struct vmem_altmap *altmap)
 {
-	resource_size_t key, align_start, align_size, align_end;
+	resource_size_t align_start, align_size, align_end;
+	unsigned long pfn, pgoff, order;
 	pgprot_t pgprot = PAGE_KERNEL;
 	struct dev_pagemap *pgmap;
 	struct page_map *page_map;
 	int error, nid, is_ram;
-	unsigned long pfn;
 
 	align_start = res->start & ~(SECTION_SIZE - 1);
 	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -337,11 +360,12 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 	mutex_lock(&pgmap_lock);
 	error = 0;
 	align_end = align_start + align_size - 1;
-	for (key = align_start; key <= align_end; key += SECTION_SIZE) {
+
+	foreach_order_pgoff(res, order, pgoff) {
 		struct dev_pagemap *dup;
 
 		rcu_read_lock();
-		dup = find_dev_pagemap(key);
+		dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff));
 		rcu_read_unlock();
 		if (dup) {
 			dev_err(dev, "%s: %pr collides with mapping for %s\n",
@@ -349,8 +373,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 			error = -EBUSY;
 			break;
 		}
-		error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT,
-				page_map);
+		error = __radix_tree_insert(&pgmap_radix,
+				PHYS_PFN(res->start) + pgoff, order, page_map);
 		if (error) {
 			dev_err(dev, "%s: failed: %d\n", __func__, error);
 			break;
diff --git a/mm/Kconfig b/mm/Kconfig
index 48b1af447fa7..0ded10a22639 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -678,6 +678,7 @@ config ZONE_DEVICE
 	depends on MEMORY_HOTREMOVE
 	depends on SPARSEMEM_VMEMMAP
 	depends on ARCH_HAS_ZONE_DEVICE
+	select RADIX_TREE_MULTIORDER
 
 	help
 	  Device memory hotplug support allows for establishing pmem,

From 749df87bd7bee5a79cef073f5d032ddb2b211de8 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 6 Sep 2017 16:24:16 -0700
Subject: [PATCH 102/119] mm/shmem: add hugetlbfs support to memfd_create()

This patch came out of discussions in this e-mail thread:
  http://lkml.kernel.org/r/1499357846-7481-1-git-send-email-mike.kravetz%40oracle.com

The Oracle JVM team is developing a new garbage collection model.  This
new model requires multiple mappings of the same anonymous memory.  One
straightforward way to accomplish this is with memfd_create().  They can
use the returned fd to create multiple mappings of the same memory.

The JVM today has an option to use (static hugetlb) huge pages.  If this
option is specified, they would like to use the same garbage collection
model requiring multiple mappings to the same memory.  Using hugetlbfs,
it is possible to explicitly mount a filesystem and specify file paths
in order to get an fd that can be used for multiple mappings.  However,
this introduces additional system admin work and coordination.

Ideally they would like to get a hugetlbfs fd without requiring explicit
mounting of a filesystem.  Today, mmap and shmget can make use of
hugetlbfs without explicitly mounting a filesystem.  The patch adds this
functionality to memfd_create.

Add a new flag MFD_HUGETLB to memfd_create() that specifies that the file
to be created resides in the hugetlbfs filesystem.  This is the generic
hugetlbfs filesystem not associated with any specific mount point.  As
with other system calls that request hugetlbfs backed pages, there is
the ability to encode huge page size in the flag arguments.

hugetlbfs does not support sealing operations, therefore specifying
MFD_ALLOW_SEALING with MFD_HUGETLB will result in EINVAL.
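
As a rough usage sketch (not part of this patch), a caller could request
a 2MB hugetlb-backed memfd as below.  The MFD_* flag values come from the
uapi header added here; the direct syscall(), the error handling and the
assumption that 2MB huge pages are configured on the system are this
example's own:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/memfd.h>

	int main(void)
	{
		/* Illustrative sketch: MFD_HUGETLB selects hugetlbfs and
		 * MFD_HUGE_2MB encodes the desired huge page size in the
		 * flags. */
		int fd = syscall(__NR_memfd_create, "jvm_heap",
				 MFD_CLOEXEC | MFD_HUGETLB | MFD_HUGE_2MB);

		if (fd < 0) {
			/* EINVAL is also returned when MFD_ALLOW_SEALING is
			 * combined with MFD_HUGETLB, as described above. */
			perror("memfd_create");
			return 1;
		}
		/* The fd can now be ftruncate()d to a multiple of 2MB and
		 * mmap()ed multiple times. */
		close(fd);
		return 0;
	}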

Of course, the memfd_create(2) man page would need updating if this type of
functionality moves forward.

Link: http://lkml.kernel.org/r/1502149672-7759-2-git-send-email-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/memfd.h | 24 ++++++++++++++++++++++++
 mm/shmem.c                 | 37 +++++++++++++++++++++++++++++++------
 2 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
index 534e364bda92..7f3a722dbd72 100644
--- a/include/uapi/linux/memfd.h
+++ b/include/uapi/linux/memfd.h
@@ -1,8 +1,32 @@
 #ifndef _UAPI_LINUX_MEMFD_H
 #define _UAPI_LINUX_MEMFD_H
 
+#include <asm-generic/hugetlb_encode.h>
+
 /* flags for memfd_create(2) (unsigned int) */
 #define MFD_CLOEXEC		0x0001U
 #define MFD_ALLOW_SEALING	0x0002U
+#define MFD_HUGETLB		0x0004U
+
+/*
+ * Huge page size encoding when MFD_HUGETLB is specified, and a huge page
+ * size other than the default is desired.  See hugetlb_encode.h.
+ * All known huge page size encodings are provided here.  It is the
+ * responsibility of the application to know which sizes are supported on
+ * the running system.  See mmap(2) man page for details.
+ */
+#define MFD_HUGE_SHIFT	HUGETLB_FLAG_ENCODE_SHIFT
+#define MFD_HUGE_MASK	HUGETLB_FLAG_ENCODE_MASK
+
+#define MFD_HUGE_64KB	HUGETLB_FLAG_ENCODE_64KB
+#define MFD_HUGE_512KB	HUGETLB_FLAG_ENCODE_512KB
+#define MFD_HUGE_1MB	HUGETLB_FLAG_ENCODE_1MB
+#define MFD_HUGE_2MB	HUGETLB_FLAG_ENCODE_2MB
+#define MFD_HUGE_8MB	HUGETLB_FLAG_ENCODE_8MB
+#define MFD_HUGE_16MB	HUGETLB_FLAG_ENCODE_16MB
+#define MFD_HUGE_256MB	HUGETLB_FLAG_ENCODE_256MB
+#define MFD_HUGE_1GB	HUGETLB_FLAG_ENCODE_1GB
+#define MFD_HUGE_2GB	HUGETLB_FLAG_ENCODE_2GB
+#define MFD_HUGE_16GB	HUGETLB_FLAG_ENCODE_16GB
 
 #endif /* _UAPI_LINUX_MEMFD_H */
diff --git a/mm/shmem.c b/mm/shmem.c
index 64bdc91187f7..47179bbe9ee7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -34,6 +34,7 @@
 #include <linux/swap.h>
 #include <linux/uio.h>
 #include <linux/khugepaged.h>
+#include <linux/hugetlb.h>
 
 #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
 
@@ -3652,7 +3653,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
 #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
 
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING)
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
 
 SYSCALL_DEFINE2(memfd_create,
 		const char __user *, uname,
@@ -3664,8 +3665,18 @@ SYSCALL_DEFINE2(memfd_create,
 	char *name;
 	long len;
 
-	if (flags & ~(unsigned int)MFD_ALL_FLAGS)
-		return -EINVAL;
+	if (!(flags & MFD_HUGETLB)) {
+		if (flags & ~(unsigned int)MFD_ALL_FLAGS)
+			return -EINVAL;
+	} else {
+		/* Sealing not supported in hugetlbfs (MFD_HUGETLB) */
+		if (flags & MFD_ALLOW_SEALING)
+			return -EINVAL;
+		/* Allow huge page size encoding in flags. */
+		if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
+				(MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
+			return -EINVAL;
+	}
 
 	/* length includes terminating zero */
 	len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
@@ -3696,16 +3707,30 @@ SYSCALL_DEFINE2(memfd_create,
 		goto err_name;
 	}
 
-	file = shmem_file_setup(name, 0, VM_NORESERVE);
+	if (flags & MFD_HUGETLB) {
+		struct user_struct *user = NULL;
+
+		file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
+					HUGETLB_ANONHUGE_INODE,
+					(flags >> MFD_HUGE_SHIFT) &
+					MFD_HUGE_MASK);
+	} else
+		file = shmem_file_setup(name, 0, VM_NORESERVE);
 	if (IS_ERR(file)) {
 		error = PTR_ERR(file);
 		goto err_fd;
 	}
-	info = SHMEM_I(file_inode(file));
 	file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
 	file->f_flags |= O_RDWR | O_LARGEFILE;
-	if (flags & MFD_ALLOW_SEALING)
+
+	if (flags & MFD_ALLOW_SEALING) {
+		/*
+		 * flags check at beginning of function ensures
+		 * this is not a hugetlbfs (MFD_HUGETLB) file.
+		 */
+		info = SHMEM_I(file_inode(file));
 		info->seals &= ~F_SEAL_SEAL;
+	}
 
 	fd_install(fd, file);
 	kfree(name);

From 1f522a4856600ac579765b729178f2b3b6a69129 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 6 Sep 2017 16:24:19 -0700
Subject: [PATCH 103/119] selftests/memfd: add memfd_create hugetlbfs selftest

With the addition of hugetlbfs support in memfd_create, the memfd
selftests should verify correct functionality with hugetlbfs.

Instead of writing a separate memfd hugetlbfs test, modify the
memfd_test program to take an optional argument 'hugetlbfs'.  If the
hugetlbfs argument is specified, basic memfd_create functionality will
be exercised on hugetlbfs.  If hugetlbfs is not specified, the current
functionality of the test is unchanged.

Note that many of the tests in memfd_test exercise file sealing operations.
hugetlbfs does not support file sealing, therefore for hugetlbfs all
sealing related tests are skipped.

In order to test on hugetlbfs, there need to be preallocated huge
pages.  A new script (run_tests) is added.  This script will first run
the existing memfd_create tests.  It will then attempt to allocate the
required number of huge pages before running the hugetlbfs test.  At the
end of testing, it will release any huge pages allocated for testing
purposes.

Link: http://lkml.kernel.org/r/1502495772-24736-3-git-send-email-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/memfd/Makefile     |   2 +-
 tools/testing/selftests/memfd/memfd_test.c | 374 ++++++++++++++++-----
 tools/testing/selftests/memfd/run_tests.sh |  69 ++++
 3 files changed, 358 insertions(+), 87 deletions(-)
 create mode 100644 tools/testing/selftests/memfd/run_tests.sh

diff --git a/tools/testing/selftests/memfd/Makefile b/tools/testing/selftests/memfd/Makefile
index ad8a0897e47f..bc9d02d615da 100644
--- a/tools/testing/selftests/memfd/Makefile
+++ b/tools/testing/selftests/memfd/Makefile
@@ -3,7 +3,7 @@ CFLAGS += -I../../../../include/uapi/
 CFLAGS += -I../../../../include/
 CFLAGS += -I../../../../usr/include/
 
-TEST_PROGS := run_fuse_test.sh
+TEST_PROGS := run_tests.sh
 TEST_GEN_FILES := memfd_test fuse_mnt fuse_test
 
 fuse_mnt.o: CFLAGS += $(shell pkg-config fuse --cflags)
diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index 26546892cd54..f94c6d1fb46f 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -18,12 +18,48 @@
 #include <sys/wait.h>
 #include <unistd.h>
 
+#define MEMFD_STR	"memfd:"
+#define SHARED_FT_STR	"(shared file-table)"
+
 #define MFD_DEF_SIZE 8192
 #define STACK_SIZE 65536
 
+/*
+ * Default is not to test hugetlbfs
+ */
+static int hugetlbfs_test;
+static size_t mfd_def_size = MFD_DEF_SIZE;
+
+/*
+ * Copied from mlock2-tests.c
+ */
+static unsigned long default_huge_page_size(void)
+{
+	unsigned long hps = 0;
+	char *line = NULL;
+	size_t linelen = 0;
+	FILE *f = fopen("/proc/meminfo", "r");
+
+	if (!f)
+		return 0;
+	while (getline(&line, &linelen, f) > 0) {
+		if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
+			hps <<= 10;
+			break;
+		}
+	}
+
+	free(line);
+	fclose(f);
+	return hps;
+}
+
 static int sys_memfd_create(const char *name,
 			    unsigned int flags)
 {
+	if (hugetlbfs_test)
+		flags |= MFD_HUGETLB;
+
 	return syscall(__NR_memfd_create, name, flags);
 }
 
@@ -150,7 +186,7 @@ static void *mfd_assert_mmap_shared(int fd)
 	void *p;
 
 	p = mmap(NULL,
-		 MFD_DEF_SIZE,
+		 mfd_def_size,
 		 PROT_READ | PROT_WRITE,
 		 MAP_SHARED,
 		 fd,
@@ -168,7 +204,7 @@ static void *mfd_assert_mmap_private(int fd)
 	void *p;
 
 	p = mmap(NULL,
-		 MFD_DEF_SIZE,
+		 mfd_def_size,
 		 PROT_READ,
 		 MAP_PRIVATE,
 		 fd,
@@ -223,7 +259,7 @@ static void mfd_assert_read(int fd)
 
 	/* verify PROT_READ *is* allowed */
 	p = mmap(NULL,
-		 MFD_DEF_SIZE,
+		 mfd_def_size,
 		 PROT_READ,
 		 MAP_PRIVATE,
 		 fd,
@@ -232,11 +268,11 @@ static void mfd_assert_read(int fd)
 		printf("mmap() failed: %m\n");
 		abort();
 	}
-	munmap(p, MFD_DEF_SIZE);
+	munmap(p, mfd_def_size);
 
 	/* verify MAP_PRIVATE is *always* allowed (even writable) */
 	p = mmap(NULL,
-		 MFD_DEF_SIZE,
+		 mfd_def_size,
 		 PROT_READ | PROT_WRITE,
 		 MAP_PRIVATE,
 		 fd,
@@ -245,7 +281,7 @@ static void mfd_assert_read(int fd)
 		printf("mmap() failed: %m\n");
 		abort();
 	}
-	munmap(p, MFD_DEF_SIZE);
+	munmap(p, mfd_def_size);
 }
 
 static void mfd_assert_write(int fd)
@@ -254,16 +290,22 @@ static void mfd_assert_write(int fd)
 	void *p;
 	int r;
 
-	/* verify write() succeeds */
-	l = write(fd, "\0\0\0\0", 4);
-	if (l != 4) {
-		printf("write() failed: %m\n");
-		abort();
+	/*
+	 * hugetlbfs does not support write, but we want to
+	 * verify everything else here.
+	 */
+	if (!hugetlbfs_test) {
+		/* verify write() succeeds */
+		l = write(fd, "\0\0\0\0", 4);
+		if (l != 4) {
+			printf("write() failed: %m\n");
+			abort();
+		}
 	}
 
 	/* verify PROT_READ | PROT_WRITE is allowed */
 	p = mmap(NULL,
-		 MFD_DEF_SIZE,
+		 mfd_def_size,
 		 PROT_READ | PROT_WRITE,
 		 MAP_SHARED,
 		 fd,
@@ -273,11 +315,11 @@ static void mfd_assert_write(int fd)
 		abort();
 	}
 	*(char *)p = 0;
-	munmap(p, MFD_DEF_SIZE);
+	munmap(p, mfd_def_size);
 
 	/* verify PROT_WRITE is allowed */
 	p = mmap(NULL,
-		 MFD_DEF_SIZE,
+		 mfd_def_size,
 		 PROT_WRITE,
 		 MAP_SHARED,
 		 fd,
@@ -287,12 +329,12 @@ static void mfd_assert_write(int fd)
 		abort();
 	}
 	*(char *)p = 0;
-	munmap(p, MFD_DEF_SIZE);
+	munmap(p, mfd_def_size);
 
 	/* verify PROT_READ with MAP_SHARED is allowed and a following
 	 * mprotect(PROT_WRITE) allows writing */
 	p = mmap(NULL,
-		 MFD_DEF_SIZE,
+		 mfd_def_size,
 		 PROT_READ,
 		 MAP_SHARED,
 		 fd,
@@ -302,20 +344,20 @@ static void mfd_assert_write(int fd)
 		abort();
 	}
 
-	r = mprotect(p, MFD_DEF_SIZE, PROT_READ | PROT_WRITE);
+	r = mprotect(p, mfd_def_size, PROT_READ | PROT_WRITE);
 	if (r < 0) {
 		printf("mprotect() failed: %m\n");
 		abort();
 	}
 
 	*(char *)p = 0;
-	munmap(p, MFD_DEF_SIZE);
+	munmap(p, mfd_def_size);
 
 	/* verify PUNCH_HOLE works */
 	r = fallocate(fd,
 		      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 		      0,
-		      MFD_DEF_SIZE);
+		      mfd_def_size);
 	if (r < 0) {
 		printf("fallocate(PUNCH_HOLE) failed: %m\n");
 		abort();
@@ -337,7 +379,7 @@ static void mfd_fail_write(int fd)
 
 	/* verify PROT_READ | PROT_WRITE is not allowed */
 	p = mmap(NULL,
-		 MFD_DEF_SIZE,
+		 mfd_def_size,
 		 PROT_READ | PROT_WRITE,
 		 MAP_SHARED,
 		 fd,
@@ -349,7 +391,7 @@ static void mfd_fail_write(int fd)
 
 	/* verify PROT_WRITE is not allowed */
 	p = mmap(NULL,
-		 MFD_DEF_SIZE,
+		 mfd_def_size,
 		 PROT_WRITE,
 		 MAP_SHARED,
 		 fd,
@@ -362,13 +404,13 @@ static void mfd_fail_write(int fd)
 	/* Verify PROT_READ with MAP_SHARED with a following mprotect is not
 	 * allowed. Note that for r/w the kernel already prevents the mmap. */
 	p = mmap(NULL,
-		 MFD_DEF_SIZE,
+		 mfd_def_size,
 		 PROT_READ,
 		 MAP_SHARED,
 		 fd,
 		 0);
 	if (p != MAP_FAILED) {
-		r = mprotect(p, MFD_DEF_SIZE, PROT_READ | PROT_WRITE);
+		r = mprotect(p, mfd_def_size, PROT_READ | PROT_WRITE);
 		if (r >= 0) {
 			printf("mmap()+mprotect() didn't fail as expected\n");
 			abort();
@@ -379,7 +421,7 @@ static void mfd_fail_write(int fd)
 	r = fallocate(fd,
 		      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 		      0,
-		      MFD_DEF_SIZE);
+		      mfd_def_size);
 	if (r >= 0) {
 		printf("fallocate(PUNCH_HOLE) didn't fail as expected\n");
 		abort();
@@ -390,13 +432,13 @@ static void mfd_assert_shrink(int fd)
 {
 	int r, fd2;
 
-	r = ftruncate(fd, MFD_DEF_SIZE / 2);
+	r = ftruncate(fd, mfd_def_size / 2);
 	if (r < 0) {
 		printf("ftruncate(SHRINK) failed: %m\n");
 		abort();
 	}
 
-	mfd_assert_size(fd, MFD_DEF_SIZE / 2);
+	mfd_assert_size(fd, mfd_def_size / 2);
 
 	fd2 = mfd_assert_open(fd,
 			      O_RDWR | O_CREAT | O_TRUNC,
@@ -410,7 +452,7 @@ static void mfd_fail_shrink(int fd)
 {
 	int r;
 
-	r = ftruncate(fd, MFD_DEF_SIZE / 2);
+	r = ftruncate(fd, mfd_def_size / 2);
 	if (r >= 0) {
 		printf("ftruncate(SHRINK) didn't fail as expected\n");
 		abort();
@@ -425,31 +467,31 @@ static void mfd_assert_grow(int fd)
 {
 	int r;
 
-	r = ftruncate(fd, MFD_DEF_SIZE * 2);
+	r = ftruncate(fd, mfd_def_size * 2);
 	if (r < 0) {
 		printf("ftruncate(GROW) failed: %m\n");
 		abort();
 	}
 
-	mfd_assert_size(fd, MFD_DEF_SIZE * 2);
+	mfd_assert_size(fd, mfd_def_size * 2);
 
 	r = fallocate(fd,
 		      0,
 		      0,
-		      MFD_DEF_SIZE * 4);
+		      mfd_def_size * 4);
 	if (r < 0) {
 		printf("fallocate(ALLOC) failed: %m\n");
 		abort();
 	}
 
-	mfd_assert_size(fd, MFD_DEF_SIZE * 4);
+	mfd_assert_size(fd, mfd_def_size * 4);
 }
 
 static void mfd_fail_grow(int fd)
 {
 	int r;
 
-	r = ftruncate(fd, MFD_DEF_SIZE * 2);
+	r = ftruncate(fd, mfd_def_size * 2);
 	if (r >= 0) {
 		printf("ftruncate(GROW) didn't fail as expected\n");
 		abort();
@@ -458,7 +500,7 @@ static void mfd_fail_grow(int fd)
 	r = fallocate(fd,
 		      0,
 		      0,
-		      MFD_DEF_SIZE * 4);
+		      mfd_def_size * 4);
 	if (r >= 0) {
 		printf("fallocate(ALLOC) didn't fail as expected\n");
 		abort();
@@ -467,25 +509,37 @@ static void mfd_fail_grow(int fd)
 
 static void mfd_assert_grow_write(int fd)
 {
-	static char buf[MFD_DEF_SIZE * 8];
+	static char *buf;
 	ssize_t l;
 
-	l = pwrite(fd, buf, sizeof(buf), 0);
-	if (l != sizeof(buf)) {
+	buf = malloc(mfd_def_size * 8);
+	if (!buf) {
+		printf("malloc(%d) failed: %m\n", mfd_def_size * 8);
+		abort();
+	}
+
+	l = pwrite(fd, buf, mfd_def_size * 8, 0);
+	if (l != (mfd_def_size * 8)) {
 		printf("pwrite() failed: %m\n");
 		abort();
 	}
 
-	mfd_assert_size(fd, MFD_DEF_SIZE * 8);
+	mfd_assert_size(fd, mfd_def_size * 8);
 }
 
 static void mfd_fail_grow_write(int fd)
 {
-	static char buf[MFD_DEF_SIZE * 8];
+	static char *buf;
 	ssize_t l;
 
-	l = pwrite(fd, buf, sizeof(buf), 0);
-	if (l == sizeof(buf)) {
+	buf = malloc(mfd_def_size * 8);
+	if (!buf) {
+		printf("malloc(%d) failed: %m\n", mfd_def_size * 8);
+		abort();
+	}
+
+	l = pwrite(fd, buf, mfd_def_size * 8, 0);
+	if (l == (mfd_def_size * 8)) {
 		printf("pwrite() didn't fail as expected\n");
 		abort();
 	}
@@ -543,6 +597,8 @@ static void test_create(void)
 	char buf[2048];
 	int fd;
 
+	printf("%s CREATE\n", MEMFD_STR);
+
 	/* test NULL name */
 	mfd_fail_new(NULL, 0);
 
@@ -570,13 +626,18 @@ static void test_create(void)
 	fd = mfd_assert_new("", 0, MFD_CLOEXEC);
 	close(fd);
 
-	/* verify MFD_ALLOW_SEALING is allowed */
-	fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING);
-	close(fd);
+	if (!hugetlbfs_test) {
+		/* verify MFD_ALLOW_SEALING is allowed */
+		fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING);
+		close(fd);
 
-	/* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */
-	fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC);
-	close(fd);
+		/* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */
+		fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC);
+		close(fd);
+	} else {
+		/* sealing is not supported on hugetlbfs */
+		mfd_fail_new("", MFD_ALLOW_SEALING);
+	}
 }
 
 /*
@@ -587,8 +648,14 @@ static void test_basic(void)
 {
 	int fd;
 
+	/* hugetlbfs does not contain sealing support */
+	if (hugetlbfs_test)
+		return;
+
+	printf("%s BASIC\n", MEMFD_STR);
+
 	fd = mfd_assert_new("kern_memfd_basic",
-			    MFD_DEF_SIZE,
+			    mfd_def_size,
 			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
 
 	/* add basic seals */
@@ -619,7 +686,7 @@ static void test_basic(void)
 
 	/* verify sealing does not work without MFD_ALLOW_SEALING */
 	fd = mfd_assert_new("kern_memfd_basic",
-			    MFD_DEF_SIZE,
+			    mfd_def_size,
 			    MFD_CLOEXEC);
 	mfd_assert_has_seals(fd, F_SEAL_SEAL);
 	mfd_fail_add_seals(fd, F_SEAL_SHRINK |
@@ -629,6 +696,28 @@ static void test_basic(void)
 	close(fd);
 }
 
+/*
+ * hugetlbfs doesn't support seals or write, so just verify grow and shrink
+ * on a hugetlbfs file created via memfd_create.
+ */
+static void test_hugetlbfs_grow_shrink(void)
+{
+	int fd;
+
+	printf("%s HUGETLBFS-GROW-SHRINK\n", MEMFD_STR);
+
+	fd = mfd_assert_new("kern_memfd_seal_write",
+			    mfd_def_size,
+			    MFD_CLOEXEC);
+
+	mfd_assert_read(fd);
+	mfd_assert_write(fd);
+	mfd_assert_shrink(fd);
+	mfd_assert_grow(fd);
+
+	close(fd);
+}
+
 /*
  * Test SEAL_WRITE
  * Test whether SEAL_WRITE actually prevents modifications.
@@ -637,8 +726,17 @@ static void test_seal_write(void)
 {
 	int fd;
 
+	/*
+	 * hugetlbfs does not contain sealing or write support.  Just test
+	 * basic grow and shrink via test_hugetlbfs_grow_shrink.
+	 */
+	if (hugetlbfs_test)
+		return test_hugetlbfs_grow_shrink();
+
+	printf("%s SEAL-WRITE\n", MEMFD_STR);
+
 	fd = mfd_assert_new("kern_memfd_seal_write",
-			    MFD_DEF_SIZE,
+			    mfd_def_size,
 			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
 	mfd_assert_has_seals(fd, 0);
 	mfd_assert_add_seals(fd, F_SEAL_WRITE);
@@ -661,8 +759,14 @@ static void test_seal_shrink(void)
 {
 	int fd;
 
+	/* hugetlbfs does not contain sealing support */
+	if (hugetlbfs_test)
+		return;
+
+	printf("%s SEAL-SHRINK\n", MEMFD_STR);
+
 	fd = mfd_assert_new("kern_memfd_seal_shrink",
-			    MFD_DEF_SIZE,
+			    mfd_def_size,
 			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
 	mfd_assert_has_seals(fd, 0);
 	mfd_assert_add_seals(fd, F_SEAL_SHRINK);
@@ -685,8 +789,14 @@ static void test_seal_grow(void)
 {
 	int fd;
 
+	/* hugetlbfs does not contain sealing support */
+	if (hugetlbfs_test)
+		return;
+
+	printf("%s SEAL-GROW\n", MEMFD_STR);
+
 	fd = mfd_assert_new("kern_memfd_seal_grow",
-			    MFD_DEF_SIZE,
+			    mfd_def_size,
 			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
 	mfd_assert_has_seals(fd, 0);
 	mfd_assert_add_seals(fd, F_SEAL_GROW);
@@ -709,8 +819,14 @@ static void test_seal_resize(void)
 {
 	int fd;
 
+	/* hugetlbfs does not contain sealing support */
+	if (hugetlbfs_test)
+		return;
+
+	printf("%s SEAL-RESIZE\n", MEMFD_STR);
+
 	fd = mfd_assert_new("kern_memfd_seal_resize",
-			    MFD_DEF_SIZE,
+			    mfd_def_size,
 			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
 	mfd_assert_has_seals(fd, 0);
 	mfd_assert_add_seals(fd, F_SEAL_SHRINK | F_SEAL_GROW);
@@ -726,15 +842,52 @@ static void test_seal_resize(void)
 }
 
 /*
- * Test sharing via dup()
- * Test that seals are shared between dupped FDs and they're all equal.
+ * hugetlbfs does not support seals.  Basic test to dup the memfd created
+ * fd and perform some basic operations on it.
  */
-static void test_share_dup(void)
+static void hugetlbfs_dup(char *b_suffix)
 {
 	int fd, fd2;
 
+	printf("%s HUGETLBFS-DUP %s\n", MEMFD_STR, b_suffix);
+
 	fd = mfd_assert_new("kern_memfd_share_dup",
-			    MFD_DEF_SIZE,
+			    mfd_def_size,
+			    MFD_CLOEXEC);
+
+	fd2 = mfd_assert_dup(fd);
+
+	mfd_assert_read(fd);
+	mfd_assert_write(fd);
+
+	mfd_assert_shrink(fd2);
+	mfd_assert_grow(fd2);
+
+	close(fd2);
+	close(fd);
+}
+
+/*
+ * Test sharing via dup()
+ * Test that seals are shared between dupped FDs and they're all equal.
+ */
+static void test_share_dup(char *banner, char *b_suffix)
+{
+	int fd, fd2;
+
+	/*
+	 * hugetlbfs does not contain sealing support.  Perform some
+	 * basic testing on dup'ed fd instead via hugetlbfs_dup.
+	 */
+	if (hugetlbfs_test) {
+		hugetlbfs_dup(b_suffix);
+		return;
+	}
+
+	printf("%s %s %s\n", MEMFD_STR, banner, b_suffix);
+
+	fd = mfd_assert_new("kern_memfd_share_dup",
+			    mfd_def_size,
 			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
 	mfd_assert_has_seals(fd, 0);
 
@@ -768,13 +921,19 @@ static void test_share_dup(void)
  * Test sealing with active mmap()s
  * Modifying seals is only allowed if no other mmap() refs exist.
  */
-static void test_share_mmap(void)
+static void test_share_mmap(char *banner, char *b_suffix)
 {
 	int fd;
 	void *p;
 
+	/* hugetlbfs does not contain sealing support */
+	if (hugetlbfs_test)
+		return;
+
+	printf("%s %s %s\n", MEMFD_STR,  banner, b_suffix);
+
 	fd = mfd_assert_new("kern_memfd_share_mmap",
-			    MFD_DEF_SIZE,
+			    mfd_def_size,
 			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
 	mfd_assert_has_seals(fd, 0);
 
@@ -784,29 +943,66 @@ static void test_share_mmap(void)
 	mfd_assert_has_seals(fd, 0);
 	mfd_assert_add_seals(fd, F_SEAL_SHRINK);
 	mfd_assert_has_seals(fd, F_SEAL_SHRINK);
-	munmap(p, MFD_DEF_SIZE);
+	munmap(p, mfd_def_size);
 
 	/* readable ref allows sealing */
 	p = mfd_assert_mmap_private(fd);
 	mfd_assert_add_seals(fd, F_SEAL_WRITE);
 	mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK);
-	munmap(p, MFD_DEF_SIZE);
+	munmap(p, mfd_def_size);
 
 	close(fd);
 }
 
+/*
+ * Basic test to make sure we can open the hugetlbfs fd via /proc and
+ * perform some simple operations on it.
+ */
+static void hugetlbfs_proc_open(char *b_suffix)
+{
+	int fd, fd2;
+
+	printf("%s HUGETLBFS-PROC-OPEN %s\n", MEMFD_STR, b_suffix);
+
+	fd = mfd_assert_new("kern_memfd_share_open",
+			    mfd_def_size,
+			    MFD_CLOEXEC);
+
+	fd2 = mfd_assert_open(fd, O_RDWR, 0);
+
+	mfd_assert_read(fd);
+	mfd_assert_write(fd);
+
+	mfd_assert_shrink(fd2);
+	mfd_assert_grow(fd2);
+
+	close(fd2);
+	close(fd);
+}
+
 /*
  * Test sealing with open(/proc/self/fd/%d)
  * Via /proc we can get access to a separate file-context for the same memfd.
  * This is *not* like dup(), but like a real separate open(). Make sure the
  * semantics are as expected and we correctly check for RDONLY / WRONLY / RDWR.
  */
-static void test_share_open(void)
+static void test_share_open(char *banner, char *b_suffix)
 {
 	int fd, fd2;
 
+	/*
+	 * hugetlbfs does not contain sealing support.  So test basic
+	 * functionality of using /proc fd via hugetlbfs_proc_open
+	 */
+	if (hugetlbfs_test) {
+		hugetlbfs_proc_open(b_suffix);
+		return;
+	}
+
+	printf("%s %s %s\n", MEMFD_STR, banner, b_suffix);
+
 	fd = mfd_assert_new("kern_memfd_share_open",
-			    MFD_DEF_SIZE,
+			    mfd_def_size,
 			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
 	mfd_assert_has_seals(fd, 0);
 
@@ -841,13 +1037,19 @@ static void test_share_open(void)
  * Test sharing via fork()
  * Test whether seal-modifications work as expected with forked childs.
  */
-static void test_share_fork(void)
+static void test_share_fork(char *banner, char *b_suffix)
 {
 	int fd;
 	pid_t pid;
 
+	/* hugetlbfs does not contain sealing support */
+	if (hugetlbfs_test)
+		return;
+
+	printf("%s %s %s\n", MEMFD_STR, banner, b_suffix);
+
 	fd = mfd_assert_new("kern_memfd_share_fork",
-			    MFD_DEF_SIZE,
+			    mfd_def_size,
 			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
 	mfd_assert_has_seals(fd, 0);
 
@@ -870,40 +1072,40 @@ int main(int argc, char **argv)
 {
 	pid_t pid;
 
-	printf("memfd: CREATE\n");
+	if (argc == 2) {
+		if (!strcmp(argv[1], "hugetlbfs")) {
+			unsigned long hpage_size = default_huge_page_size();
+
+			if (!hpage_size) {
+				printf("Unable to determine huge page size\n");
+				abort();
+			}
+
+			hugetlbfs_test = 1;
+			mfd_def_size = hpage_size * 2;
+		}
+	}
+
 	test_create();
-	printf("memfd: BASIC\n");
 	test_basic();
 
-	printf("memfd: SEAL-WRITE\n");
 	test_seal_write();
-	printf("memfd: SEAL-SHRINK\n");
 	test_seal_shrink();
-	printf("memfd: SEAL-GROW\n");
 	test_seal_grow();
-	printf("memfd: SEAL-RESIZE\n");
 	test_seal_resize();
 
-	printf("memfd: SHARE-DUP\n");
-	test_share_dup();
-	printf("memfd: SHARE-MMAP\n");
-	test_share_mmap();
-	printf("memfd: SHARE-OPEN\n");
-	test_share_open();
-	printf("memfd: SHARE-FORK\n");
-	test_share_fork();
+	test_share_dup("SHARE-DUP", "");
+	test_share_mmap("SHARE-MMAP", "");
+	test_share_open("SHARE-OPEN", "");
+	test_share_fork("SHARE-FORK", "");
 
 	/* Run test-suite in a multi-threaded environment with a shared
 	 * file-table. */
 	pid = spawn_idle_thread(CLONE_FILES | CLONE_FS | CLONE_VM);
-	printf("memfd: SHARE-DUP (shared file-table)\n");
-	test_share_dup();
-	printf("memfd: SHARE-MMAP (shared file-table)\n");
-	test_share_mmap();
-	printf("memfd: SHARE-OPEN (shared file-table)\n");
-	test_share_open();
-	printf("memfd: SHARE-FORK (shared file-table)\n");
-	test_share_fork();
+	test_share_dup("SHARE-DUP", SHARED_FT_STR);
+	test_share_mmap("SHARE-MMAP", SHARED_FT_STR);
+	test_share_open("SHARE-OPEN", SHARED_FT_STR);
+	test_share_fork("SHARE-FORK", SHARED_FT_STR);
 	join_idle_thread(pid);
 
 	printf("memfd: DONE\n");
diff --git a/tools/testing/selftests/memfd/run_tests.sh b/tools/testing/selftests/memfd/run_tests.sh
new file mode 100644
index 000000000000..daabb350697c
--- /dev/null
+++ b/tools/testing/selftests/memfd/run_tests.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# please run as root
+
+#
+# Normal tests requiring no special resources
+#
+./run_fuse_test.sh
+./memfd_test
+
+#
+# To test memfd_create with hugetlbfs, there needs to be hpages_test
+# huge pages free.  Attempt to allocate enough pages to test.
+#
+hpages_test=8
+
+#
+# Get count of free huge pages from /proc/meminfo
+#
+while read name size unit; do
+        if [ "$name" = "HugePages_Free:" ]; then
+                freepgs=$size
+        fi
+done < /proc/meminfo
+
+#
+# If not enough free huge pages for test, attempt to increase
+#
+if [ -n "$freepgs" ] && [ $freepgs -lt $hpages_test ]; then
+	nr_hugepgs=`cat /proc/sys/vm/nr_hugepages`
+	hpages_needed=`expr $hpages_test - $freepgs`
+
+	echo 3 > /proc/sys/vm/drop_caches
+	echo $(( $hpages_needed + $nr_hugepgs )) > /proc/sys/vm/nr_hugepages
+	if [ $? -ne 0 ]; then
+		echo "Please run this test as root"
+		exit 1
+	fi
+	while read name size unit; do
+		if [ "$name" = "HugePages_Free:" ]; then
+			freepgs=$size
+		fi
+	done < /proc/meminfo
+fi
+
+#
+# If still not enough huge pages available, exit.  But, give back any huge
+# pages potentially allocated above.
+#
+if [ $freepgs -lt $hpages_test ]; then
+	# nr_hugepgs non-zero only if we attempted to increase
+	if [ -n "$nr_hugepgs" ]; then
+		echo $nr_hugepgs > /proc/sys/vm/nr_hugepages
+	fi
+	printf "Not enough huge pages available (%d < %d)\n" \
+		$freepgs $hpages_test
+	exit 1
+fi
+
+#
+# Run the hugetlbfs test
+#
+./memfd_test hugetlbfs
+
+#
+# Give back any huge pages allocated for the test
+#
+if [ -n "$nr_hugepgs" ]; then
+	echo $nr_hugepgs > /proc/sys/vm/nr_hugepages
+fi

From f113e64121ba9f4791332248b315d9f57ee33a6b Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj38.park@gmail.com>
Date: Wed, 6 Sep 2017 16:24:23 -0700
Subject: [PATCH 104/119] mm/vmstat.c: fix wrong comment

Comment for pagetypeinfo_showblockcount() is mistakenly duplicated from
pagetypeinfo_show_free()'s comment.  This commit fixes it.

Link: http://lkml.kernel.org/r/20170809185816.11244-1-sj38.park@gmail.com
Fixes: 467c996c1e19 ("Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo")
Signed-off-by: SeongJae Park <sj38.park@gmail.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmstat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 05de233a6fca..85f3a2e04adc 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1255,7 +1255,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
 	seq_putc(m, '\n');
 }
 
-/* Print out the free pages at each order for each migratetype */
+/* Print out the number of pageblocks for each migratetype */
 static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
 {
 	int mtype;

From 894e58c1475a03cd8260be7f28444cc298239432 Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul.park@lge.com>
Date: Wed, 6 Sep 2017 16:24:26 -0700
Subject: [PATCH 105/119] mm/vmalloc.c: don't reinvent the wheel but use
 existing llist API

Although llist provides proper APIs, they are not used here.  Use them
instead of open-coding the list traversal.

Link: http://lkml.kernel.org/r/1502095374-16112-1-git-send-email-byungchul.park@lge.com
Signed-off-by: Byungchul Park <byungchul.park@lge.com>
Cc: zijun_hu <zijun_hu@htc.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fa409c9092be..8a43db6284eb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -49,12 +49,10 @@ static void __vunmap(const void *, int);
 static void free_work(struct work_struct *w)
 {
 	struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
-	struct llist_node *llnode = llist_del_all(&p->list);
-	while (llnode) {
-		void *p = llnode;
-		llnode = llist_next(llnode);
-		__vunmap(p, 1);
-	}
+	struct llist_node *t, *llnode;
+
+	llist_for_each_safe(llnode, t, llist_del_all(&p->list))
+		__vunmap((void *)llnode, 1);
 }
 
 /*** Page table manipulation functions ***/

From cbc65df240c104bf540af1ad58595bf1eaa5ee10 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 6 Sep 2017 16:24:29 -0700
Subject: [PATCH 106/119] mm, swap: add swap readahead hit statistics

Patch series "mm, swap: VMA based swap readahead", v4.

Swap readahead is an important mechanism to reduce swap-in latency.
Although a purely sequential memory access pattern isn't very common
for anonymous memory, spatial locality is still considered valid.

In the original swap readahead implementation, consecutive blocks in
the swap device are read ahead based on a global spatial locality
estimation.  But consecutive blocks in the swap device merely reflect
the order of page reclaim and don't necessarily reflect the access
pattern in virtual memory space.  Different tasks in the system may
have different access patterns, which makes the global spatial
locality estimation incorrect.

In this patchset, when a page fault occurs, the virtual pages near the
fault address are read ahead instead of the swap slots near the
faulting swap slot in the swap device.  This avoids reading ahead
unrelated swap slots.  At the same time, swap readahead is changed to
work per-VMA instead of globally, so that the different access
patterns of different VMAs can be distinguished and different
readahead policies applied accordingly.  The original core readahead
detection and scaling algorithm is reused, because it is an effective
algorithm for detecting spatial locality.

In addition to the swap readahead changes, new sysfs interfaces are
added to show the efficiency of the readahead algorithm and some other
swap statistics.

This new implementation will incur more small random reads.  On SSD,
the improved correctness of the estimation and readahead target should
outweigh the potential increase in overhead; this is also illustrated
in the test results below.  But on HDD, the overhead may outweigh the
benefit, so the original implementation will be used by default.

The tests and results are as follows.

Common test condition
=====================

Test Machine: Xeon E5 v3 (2 sockets, 72 threads, 32G RAM)
Swap device: NVMe disk

Micro-benchmark with combined access pattern
============================================

vm-scalability, sequential swap test case, 4 processes to eat 50G
virtual memory space, repeat the sequential memory writing until 300
seconds.  The first round of writing will trigger swap-out; the
following rounds will trigger sequential swap-in and swap-out.

At the same time, run vm-scalability random swap test case in
background, 8 processes to eat 30G virtual memory space, repeat the
random memory write until 300 seconds.  This will trigger random swap-in
in the background.

This is a combined workload with sequential and random memory accessing
at the same time.  The result (for the sequential workload) is as follows,

			Base		Optimized
			----		---------
throughput		345413 KB/s	414029 KB/s (+19.9%)
latency.average		97.14 us	61.06 us (-37.1%)
latency.50th		2 us		1 us
latency.60th		2 us		1 us
latency.70th		98 us		2 us
latency.80th		160 us		2 us
latency.90th		260 us		217 us
latency.95th		346 us		369 us
latency.99th		1.34 ms		1.09 ms
ra_hit%			52.69%		99.98%

The original swap readahead algorithm is confused by the background
random access workload, so its readahead hit rate is lower.  The
VMA-based readahead algorithm works much better.

Linpack
=======

The test memory size is bigger than RAM to trigger swapping.

			Base		Optimized
			----		---------
elapsed_time		393.49 s	329.88 s (-16.2%)
ra_hit%			86.21%		98.82%

The scores of the base and optimized kernels show no visible change.
But the elapsed time is reduced and the readahead hit rate is
improved, so the optimized kernel runs better during the startup and
tear-down stages.  And the absolute value of the readahead hit rate is
high, showing that spatial locality is still valid in some practical
workloads.

This patch (of 5):

The statistics for total readahead pages and total readahead hits are
recorded and exported via the following sysfs interface.

/sys/kernel/mm/swap/ra_hits
/sys/kernel/mm/swap/ra_total

With them, the efficiency of the swap readahead could be measured, so
that the swap readahead algorithm and parameters could be tuned
accordingly.
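
As a reference sketch (not part of this patch), a hit rate can also be
derived from the swap_ra and swap_ra_hit counters that the mm/vmstat.c
hunk below adds to /proc/vmstat; the parsing and the exact definition
of the percentage are this example's own assumptions:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* Illustrative sketch: scan /proc/vmstat for the two
		 * readahead counters and print a hit percentage. */
		char name[64];
		unsigned long long val, ra = 0, ra_hit = 0;
		FILE *f = fopen("/proc/vmstat", "r");

		if (!f)
			return 1;
		while (fscanf(f, "%63s %llu", name, &val) == 2) {
			if (!strcmp(name, "swap_ra"))
				ra = val;
			else if (!strcmp(name, "swap_ra_hit"))
				ra_hit = val;
		}
		fclose(f);
		if (ra)
			printf("ra_hit%%: %.2f\n", 100.0 * ra_hit / ra);
		return 0;
	}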

[akpm@linux-foundation.org: don't display swap stats if CONFIG_SWAP=n]
Link: http://lkml.kernel.org/r/20170807054038.1843-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vm_event_item.h | 4 ++++
 mm/swap_state.c               | 9 +++++++--
 mm/vmstat.c                   | 4 ++++
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index e02820fc2861..d77bc35278b0 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -105,6 +105,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		VMACACHE_FIND_CALLS,
 		VMACACHE_FIND_HITS,
 		VMACACHE_FULL_FLUSHES,
+#endif
+#ifdef CONFIG_SWAP
+		SWAP_RA,
+		SWAP_RA_HIT,
 #endif
 		NR_VM_EVENT_ITEMS
 };
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b68c93014f50..d1bdb31cab13 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -305,8 +305,10 @@ struct page * lookup_swap_cache(swp_entry_t entry)
 
 	if (page && likely(!PageTransCompound(page))) {
 		INC_CACHE_INFO(find_success);
-		if (TestClearPageReadahead(page))
+		if (TestClearPageReadahead(page)) {
 			atomic_inc(&swapin_readahead_hits);
+			count_vm_event(SWAP_RA_HIT);
+		}
 	}
 
 	INC_CACHE_INFO(find_total);
@@ -516,8 +518,11 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
 						gfp_mask, vma, addr, false);
 		if (!page)
 			continue;
-		if (offset != entry_offset && likely(!PageTransCompound(page)))
+		if (offset != entry_offset &&
+		    likely(!PageTransCompound(page))) {
 			SetPageReadahead(page);
+			count_vm_event(SWAP_RA);
+		}
 		put_page(page);
 	}
 	blk_finish_plug(&plug);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 85f3a2e04adc..c7e4b8458023 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1098,6 +1098,10 @@ const char * const vmstat_text[] = {
 	"vmacache_find_hits",
 	"vmacache_full_flushes",
 #endif
+#ifdef CONFIG_SWAP
+	"swap_ra",
+	"swap_ra_hit",
+#endif
 #endif /* CONFIG_VM_EVENTS_COUNTERS */
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */

From c4fa63092f216737b60c789968371d9960a598e5 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 6 Sep 2017 16:24:33 -0700
Subject: [PATCH 107/119] mm, swap: fix swap readahead marking

In the original implementation, it is possible that existing pages in
the swap cache (not newly read ahead) could be marked as readahead
pages.  This will cause the swap readahead statistics to be wrong and
influence the swap readahead algorithm too.

This is fixed by marking a page as a readahead page only if it is
newly allocated and read from the disk.

When testing with Linpack, after the fix the swap readahead hit rate
increased from ~66% to ~86%.

Link: http://lkml.kernel.org/r/20170807054038.1843-3-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swap_state.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/mm/swap_state.c b/mm/swap_state.c
index d1bdb31cab13..a901afe9da61 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -498,7 +498,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	unsigned long start_offset, end_offset;
 	unsigned long mask;
 	struct blk_plug plug;
-	bool do_poll = true;
+	bool do_poll = true, page_allocated;
 
 	mask = swapin_nr_pages(offset) - 1;
 	if (!mask)
@@ -514,14 +514,18 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	blk_start_plug(&plug);
 	for (offset = start_offset; offset <= end_offset ; offset++) {
 		/* Ok, do the async read-ahead now */
-		page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
-						gfp_mask, vma, addr, false);
+		page = __read_swap_cache_async(
+			swp_entry(swp_type(entry), offset),
+			gfp_mask, vma, addr, &page_allocated);
 		if (!page)
 			continue;
-		if (offset != entry_offset &&
-		    likely(!PageTransCompound(page))) {
-			SetPageReadahead(page);
-			count_vm_event(SWAP_RA);
+		if (page_allocated) {
+			swap_readpage(page, false);
+			if (offset != entry_offset &&
+			    likely(!PageTransCompound(page))) {
+				SetPageReadahead(page);
+				count_vm_event(SWAP_RA);
+			}
 		}
 		put_page(page);
 	}

From ec560175c0b6fce86994bdf036754d48122c5c87 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 6 Sep 2017 16:24:36 -0700
Subject: [PATCH 108/119] mm, swap: VMA based swap readahead

Swap readahead is an important mechanism to reduce swap-in latency.
Although a purely sequential memory access pattern isn't very common
for anonymous memory, spatial locality is still considered valid.

In the original swap readahead implementation, consecutive blocks in
the swap device are read ahead based on a global spatial locality
estimation.  But consecutive blocks in the swap device merely reflect
the order of page reclaim and don't necessarily reflect the access
pattern in virtual memory.  Different tasks in the system may have
different access patterns, which makes the global spatial locality
estimation incorrect.

In this patch, when a page fault occurs, the virtual pages near the
fault address are read ahead instead of the swap slots near the
faulting swap slot in the swap device.  This avoids reading ahead
unrelated swap slots.  At the same time, swap readahead is changed to
work per-VMA instead of globally, so that the different access
patterns of different VMAs can be distinguished and different
readahead policies applied accordingly.  The original core readahead
detection and scaling algorithm is reused, because it is an effective
algorithm for detecting spatial locality.

The tests and results are as follows.

Common test condition
=====================

Test Machine: Xeon E5 v3 (2 sockets, 72 threads, 32G RAM)
Swap device: NVMe disk

Micro-benchmark with combined access pattern
============================================

vm-scalability, sequential swap test case, 4 processes to eat 50G
virtual memory space, repeat the sequential memory writing until 300
seconds.  The first round of writing will trigger swap-out; the
following rounds will trigger sequential swap-in and swap-out.

At the same time, run vm-scalability random swap test case in
background, 8 processes to eat 30G virtual memory space, repeat the
random memory write until 300 seconds.  This will trigger random swap-in
in the background.

This is a combined workload with sequential and random memory accessing
at the same time.  The result (for the sequential workload) is as follows,

			Base		Optimized
			----		---------
throughput		345413 KB/s	414029 KB/s (+19.9%)
latency.average		97.14 us	61.06 us (-37.1%)
latency.50th		2 us		1 us
latency.60th		2 us		1 us
latency.70th		98 us		2 us
latency.80th		160 us		2 us
latency.90th		260 us		217 us
latency.95th		346 us		369 us
latency.99th		1.34 ms		1.09 ms
ra_hit%			52.69%		99.98%

The original swap readahead algorithm is confused by the background
random access workload, so its readahead hit rate is lower.  The
VMA-based readahead algorithm works much better.

Linpack
=======

The test memory size is bigger than RAM to trigger swapping.

			Base		Optimized
			----		---------
elapsed_time		393.49 s	329.88 s (-16.2%)
ra_hit%			86.21%		98.82%

The scores of the base and optimized kernels show no visible change.
But the elapsed time is reduced and the readahead hit rate is
improved, so the optimized kernel runs better during the startup and
tear-down stages.  And the absolute value of the readahead hit rate is
high, showing that spatial locality is still valid in some practical
workloads.

Link: http://lkml.kernel.org/r/20170807054038.1843-4-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h |   1 +
 include/linux/swap.h     |  57 ++++++++++-
 mm/memory.c              |  23 ++++-
 mm/shmem.c               |   2 +-
 mm/swap_state.c          | 215 +++++++++++++++++++++++++++++++++++----
 5 files changed, 273 insertions(+), 25 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 57378c7cb5f8..f45ad815b7d7 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -335,6 +335,7 @@ struct vm_area_struct {
 	struct file * vm_file;		/* File we map to (can be NULL). */
 	void * vm_private_data;		/* was vm_pte (shared mem) */
 
+	atomic_long_t swap_readahead_info;
 #ifndef CONFIG_MMU
 	struct vm_region *vm_region;	/* NOMMU mapping region */
 #endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 76f1632eea5a..61d63379e956 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -251,6 +251,25 @@ struct swap_info_struct {
 	struct swap_cluster_list discard_clusters; /* discard clusters list */
 };
 
+#ifdef CONFIG_64BIT
+#define SWAP_RA_ORDER_CEILING	5
+#else
+/* Avoid stack overflow, because we need to save part of page table */
+#define SWAP_RA_ORDER_CEILING	3
+#define SWAP_RA_PTE_CACHE_SIZE	(1 << SWAP_RA_ORDER_CEILING)
+#endif
+
+struct vma_swap_readahead {
+	unsigned short win;
+	unsigned short offset;
+	unsigned short nr_pte;
+#ifdef CONFIG_64BIT
+	pte_t *ptes;
+#else
+	pte_t ptes[SWAP_RA_PTE_CACHE_SIZE];
+#endif
+};
+
 /* linux/mm/workingset.c */
 void *workingset_eviction(struct address_space *mapping, struct page *page);
 bool workingset_refault(void *shadow);
@@ -350,6 +369,7 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *,
 #define SWAP_ADDRESS_SPACE_SHIFT	14
 #define SWAP_ADDRESS_SPACE_PAGES	(1 << SWAP_ADDRESS_SPACE_SHIFT)
 extern struct address_space *swapper_spaces[];
+extern bool swap_vma_readahead;
 #define swap_address_space(entry)			    \
 	(&swapper_spaces[swp_type(entry)][swp_offset(entry) \
 		>> SWAP_ADDRESS_SPACE_SHIFT])
@@ -362,7 +382,9 @@ extern void __delete_from_swap_cache(struct page *);
 extern void delete_from_swap_cache(struct page *);
 extern void free_page_and_swap_cache(struct page *);
 extern void free_pages_and_swap_cache(struct page **, int);
-extern struct page *lookup_swap_cache(swp_entry_t);
+extern struct page *lookup_swap_cache(swp_entry_t entry,
+				      struct vm_area_struct *vma,
+				      unsigned long addr);
 extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
 			struct vm_area_struct *vma, unsigned long addr,
 			bool do_poll);
@@ -372,6 +394,17 @@ extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
 extern struct page *swapin_readahead(swp_entry_t, gfp_t,
 			struct vm_area_struct *vma, unsigned long addr);
 
+extern struct page *swap_readahead_detect(struct vm_fault *vmf,
+					  struct vma_swap_readahead *swap_ra);
+extern struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
+					   struct vm_fault *vmf,
+					   struct vma_swap_readahead *swap_ra);
+
+static inline bool swap_use_vma_readahead(void)
+{
+	return READ_ONCE(swap_vma_readahead);
+}
+
 /* linux/mm/swapfile.c */
 extern atomic_long_t nr_swap_pages;
 extern long total_swap_pages;
@@ -466,12 +499,32 @@ static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
 	return NULL;
 }
 
+static inline bool swap_use_vma_readahead(void)
+{
+	return false;
+}
+
+static inline struct page *swap_readahead_detect(
+	struct vm_fault *vmf, struct vma_swap_readahead *swap_ra)
+{
+	return NULL;
+}
+
+static inline struct page *do_swap_page_readahead(
+	swp_entry_t fentry, gfp_t gfp_mask,
+	struct vm_fault *vmf, struct vma_swap_readahead *swap_ra)
+{
+	return NULL;
+}
+
 static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
 {
 	return 0;
 }
 
-static inline struct page *lookup_swap_cache(swp_entry_t swp)
+static inline struct page *lookup_swap_cache(swp_entry_t swp,
+					     struct vm_area_struct *vma,
+					     unsigned long addr)
 {
 	return NULL;
 }
diff --git a/mm/memory.c b/mm/memory.c
index 3dd8bb46391b..e87953775e3c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2752,16 +2752,23 @@ EXPORT_SYMBOL(unmap_mapping_range);
 int do_swap_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
-	struct page *page, *swapcache;
+	struct page *page = NULL, *swapcache;
 	struct mem_cgroup *memcg;
+	struct vma_swap_readahead swap_ra;
 	swp_entry_t entry;
 	pte_t pte;
 	int locked;
 	int exclusive = 0;
 	int ret = 0;
+	bool vma_readahead = swap_use_vma_readahead();
 
-	if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
+	if (vma_readahead)
+		page = swap_readahead_detect(vmf, &swap_ra);
+	if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
+		if (page)
+			put_page(page);
 		goto out;
+	}
 
 	entry = pte_to_swp_entry(vmf->orig_pte);
 	if (unlikely(non_swap_entry(entry))) {
@@ -2777,10 +2784,16 @@ int do_swap_page(struct vm_fault *vmf)
 		goto out;
 	}
 	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
-	page = lookup_swap_cache(entry);
+	if (!page)
+		page = lookup_swap_cache(entry, vma_readahead ? vma : NULL,
+					 vmf->address);
 	if (!page) {
-		page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
-					vmf->address);
+		if (vma_readahead)
+			page = do_swap_page_readahead(entry,
+				GFP_HIGHUSER_MOVABLE, vmf, &swap_ra);
+		else
+			page = swapin_readahead(entry,
+				GFP_HIGHUSER_MOVABLE, vma, vmf->address);
 		if (!page) {
 			/*
 			 * Back out if somebody else faulted in this pte
diff --git a/mm/shmem.c b/mm/shmem.c
index 47179bbe9ee7..ace53a582be5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1650,7 +1650,7 @@ repeat:
 
 	if (swap.val) {
 		/* Look it up and read it in.. */
-		page = lookup_swap_cache(swap);
+		page = lookup_swap_cache(swap, NULL, 0);
 		if (!page) {
 			/* Or update major stats only when swapin succeeds?? */
 			if (fault_type) {
diff --git a/mm/swap_state.c b/mm/swap_state.c
index a901afe9da61..3885fef7bdf5 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -37,6 +37,29 @@ static const struct address_space_operations swap_aops = {
 
 struct address_space *swapper_spaces[MAX_SWAPFILES];
 static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
+bool swap_vma_readahead = true;
+
+#define SWAP_RA_MAX_ORDER_DEFAULT	3
+
+static int swap_ra_max_order = SWAP_RA_MAX_ORDER_DEFAULT;
+
+#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
+#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
+#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
+#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)
+
+#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
+#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
+#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)
+
+#define SWAP_RA_VAL(addr, win, hits)				\
+	(((addr) & PAGE_MASK) |					\
+	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
+	 ((hits) & SWAP_RA_HITS_MASK))
+
+/* Initial readahead hits is 4 to start up with a small window */
+#define GET_SWAP_RA_VAL(vma)					\
+	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
 
 #define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
 #define ADD_CACHE_INFO(x, nr)	do { swap_cache_info.x += (nr); } while (0)
@@ -297,21 +320,36 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
  * lock getting page table operations atomic even if we drop the page
  * lock before returning.
  */
-struct page * lookup_swap_cache(swp_entry_t entry)
+struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
+			       unsigned long addr)
 {
 	struct page *page;
+	unsigned long ra_info;
+	int win, hits, readahead;
 
 	page = find_get_page(swap_address_space(entry), swp_offset(entry));
 
-	if (page && likely(!PageTransCompound(page))) {
+	INC_CACHE_INFO(find_total);
+	if (page) {
 		INC_CACHE_INFO(find_success);
-		if (TestClearPageReadahead(page)) {
-			atomic_inc(&swapin_readahead_hits);
+		if (unlikely(PageTransCompound(page)))
+			return page;
+		readahead = TestClearPageReadahead(page);
+		if (vma) {
+			ra_info = GET_SWAP_RA_VAL(vma);
+			win = SWAP_RA_WIN(ra_info);
+			hits = SWAP_RA_HITS(ra_info);
+			if (readahead)
+				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
+			atomic_long_set(&vma->swap_readahead_info,
+					SWAP_RA_VAL(addr, win, hits));
+		}
+		if (readahead) {
 			count_vm_event(SWAP_RA_HIT);
+			if (!vma)
+				atomic_inc(&swapin_readahead_hits);
 		}
 	}
-
-	INC_CACHE_INFO(find_total);
 	return page;
 }
 
@@ -426,22 +464,20 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 	return retpage;
 }
 
-static unsigned long swapin_nr_pages(unsigned long offset)
+static unsigned int __swapin_nr_pages(unsigned long prev_offset,
+				      unsigned long offset,
+				      int hits,
+				      int max_pages,
+				      int prev_win)
 {
-	static unsigned long prev_offset;
-	unsigned int pages, max_pages, last_ra;
-	static atomic_t last_readahead_pages;
-
-	max_pages = 1 << READ_ONCE(page_cluster);
-	if (max_pages <= 1)
-		return 1;
+	unsigned int pages, last_ra;
 
 	/*
 	 * This heuristic has been found to work well on both sequential and
 	 * random loads, swapping to hard disk or to SSD: please don't ask
 	 * what the "+ 2" means, it just happens to work well, that's all.
 	 */
-	pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
+	pages = hits + 2;
 	if (pages == 2) {
 		/*
 		 * We can have no readahead hits to judge by: but must not get
@@ -450,7 +486,6 @@ static unsigned long swapin_nr_pages(unsigned long offset)
 		 */
 		if (offset != prev_offset + 1 && offset != prev_offset - 1)
 			pages = 1;
-		prev_offset = offset;
 	} else {
 		unsigned int roundup = 4;
 		while (roundup < pages)
@@ -462,9 +497,28 @@ static unsigned long swapin_nr_pages(unsigned long offset)
 		pages = max_pages;
 
 	/* Don't shrink readahead too fast */
-	last_ra = atomic_read(&last_readahead_pages) / 2;
+	last_ra = prev_win / 2;
 	if (pages < last_ra)
 		pages = last_ra;
+
+	return pages;
+}
+
+static unsigned long swapin_nr_pages(unsigned long offset)
+{
+	static unsigned long prev_offset;
+	unsigned int hits, pages, max_pages;
+	static atomic_t last_readahead_pages;
+
+	max_pages = 1 << READ_ONCE(page_cluster);
+	if (max_pages <= 1)
+		return 1;
+
+	hits = atomic_xchg(&swapin_readahead_hits, 0);
+	pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
+				  atomic_read(&last_readahead_pages));
+	if (!hits)
+		prev_offset = offset;
 	atomic_set(&last_readahead_pages, pages);
 
 	return pages;
@@ -570,3 +624,130 @@ void exit_swap_address_space(unsigned int type)
 	synchronize_rcu();
 	kvfree(spaces);
 }
+
+static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
+				     unsigned long faddr,
+				     unsigned long lpfn,
+				     unsigned long rpfn,
+				     unsigned long *start,
+				     unsigned long *end)
+{
+	*start = max3(lpfn, PFN_DOWN(vma->vm_start),
+		      PFN_DOWN(faddr & PMD_MASK));
+	*end = min3(rpfn, PFN_DOWN(vma->vm_end),
+		    PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
+}
+
+struct page *swap_readahead_detect(struct vm_fault *vmf,
+				   struct vma_swap_readahead *swap_ra)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	unsigned long swap_ra_info;
+	struct page *page;
+	swp_entry_t entry;
+	unsigned long faddr, pfn, fpfn;
+	unsigned long start, end;
+	pte_t *pte;
+	unsigned int max_win, hits, prev_win, win, left;
+#ifndef CONFIG_64BIT
+	pte_t *tpte;
+#endif
+
+	faddr = vmf->address;
+	entry = pte_to_swp_entry(vmf->orig_pte);
+	if ((unlikely(non_swap_entry(entry))))
+		return NULL;
+	page = lookup_swap_cache(entry, vma, faddr);
+	if (page)
+		return page;
+
+	max_win = 1 << READ_ONCE(swap_ra_max_order);
+	if (max_win == 1) {
+		swap_ra->win = 1;
+		return NULL;
+	}
+
+	fpfn = PFN_DOWN(faddr);
+	swap_ra_info = GET_SWAP_RA_VAL(vma);
+	pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info));
+	prev_win = SWAP_RA_WIN(swap_ra_info);
+	hits = SWAP_RA_HITS(swap_ra_info);
+	swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits,
+					       max_win, prev_win);
+	atomic_long_set(&vma->swap_readahead_info,
+			SWAP_RA_VAL(faddr, win, 0));
+
+	if (win == 1)
+		return NULL;
+
+	/* Copy the PTEs because the page table may be unmapped */
+	if (fpfn == pfn + 1)
+		swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
+	else if (pfn == fpfn + 1)
+		swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
+				  &start, &end);
+	else {
+		left = (win - 1) / 2;
+		swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
+				  &start, &end);
+	}
+	swap_ra->nr_pte = end - start;
+	swap_ra->offset = fpfn - start;
+	pte = vmf->pte - swap_ra->offset;
+#ifdef CONFIG_64BIT
+	swap_ra->ptes = pte;
+#else
+	tpte = swap_ra->ptes;
+	for (pfn = start; pfn != end; pfn++)
+		*tpte++ = *pte++;
+#endif
+
+	return NULL;
+}
+
+struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
+				    struct vm_fault *vmf,
+				    struct vma_swap_readahead *swap_ra)
+{
+	struct blk_plug plug;
+	struct vm_area_struct *vma = vmf->vma;
+	struct page *page;
+	pte_t *pte, pentry;
+	swp_entry_t entry;
+	unsigned int i;
+	bool page_allocated;
+
+	if (swap_ra->win == 1)
+		goto skip;
+
+	blk_start_plug(&plug);
+	for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte;
+	     i++, pte++) {
+		pentry = *pte;
+		if (pte_none(pentry))
+			continue;
+		if (pte_present(pentry))
+			continue;
+		entry = pte_to_swp_entry(pentry);
+		if (unlikely(non_swap_entry(entry)))
+			continue;
+		page = __read_swap_cache_async(entry, gfp_mask, vma,
+					       vmf->address, &page_allocated);
+		if (!page)
+			continue;
+		if (page_allocated) {
+			swap_readpage(page, false);
+			if (i != swap_ra->offset &&
+			    likely(!PageTransCompound(page))) {
+				SetPageReadahead(page);
+				count_vm_event(SWAP_RA);
+			}
+		}
+		put_page(page);
+	}
+	blk_finish_plug(&plug);
+	lru_add_drain();
+skip:
+	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
+				     swap_ra->win == 1);
+}

From d9bfcfdc41e8e5d80f7591f95a09ccce7cb8ad05 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 6 Sep 2017 16:24:40 -0700
Subject: [PATCH 109/119] mm, swap: add sysfs interface for VMA based swap
 readahead

The sysfs interface to control the VMA based swap readahead is added as
follows:

/sys/kernel/mm/swap/vma_ra_enabled

Enable or disable the VMA based swap readahead algorithm; when it is
disabled, the original global swap readahead algorithm is used.

/sys/kernel/mm/swap/vma_ra_max_order

Set the max order of the readahead window size for the VMA based swap
readahead algorithm.

The corresponding ABI documentation is added too.
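
As an illustration only (not part of this patch), the two knobs could be
driven from userspace with a small C program like the sketch below.  The
paths are the ones documented above; the values written are just
examples, and vma_ra_max_order must stay within SWAP_RA_ORDER_CEILING:

	#include <stdio.h>
	#include <stdlib.h>

	/* illustrative sketch: configure VMA based swap readahead */
	static void write_sysfs(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f) {
			perror(path);
			exit(1);
		}
		fputs(val, f);
		fclose(f);
	}

	int main(void)
	{
		/* accepted values are "true"/"1" and "false"/"0" */
		write_sysfs("/sys/kernel/mm/swap/vma_ra_enabled", "true");
		/* readahead window is at most 1 << vma_ra_max_order pages */
		write_sysfs("/sys/kernel/mm/swap/vma_ra_max_order", "3");
		return 0;
	}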

Link: http://lkml.kernel.org/r/20170807054038.1843-5-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 .../ABI/testing/sysfs-kernel-mm-swap          | 26 ++++++
 mm/swap_state.c                               | 80 +++++++++++++++++++
 2 files changed, 106 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-swap

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-swap b/Documentation/ABI/testing/sysfs-kernel-mm-swap
new file mode 100644
index 000000000000..587db52084c7
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-swap
@@ -0,0 +1,26 @@
+What:		/sys/kernel/mm/swap/
+Date:		August 2017
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Interface for swapping
+
+What:		/sys/kernel/mm/swap/vma_ra_enabled
+Date:		August 2017
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Enable/disable VMA based swap readahead.
+
+		If set to true, the VMA based swap readahead algorithm
+		will be used for swappable anonymous pages mapped in a
+		VMA, and the global swap readahead algorithm will
+		still be used for other users such as tmpfs.  If set to
+		false, the global swap readahead algorithm will be
+		used for all swappable pages.
+
+What:		/sys/kernel/mm/swap/vma_ra_max_order
+Date:		August 2017
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	The max readahead size in order for VMA based swap readahead
+
+		VMA based swap readahead algorithm will readahead at
+		most 1 << max_order pages for each readahead.  The
+		real readahead size for each readahead will be scaled
+		according to the estimation algorithm.
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3885fef7bdf5..71ce2d1ccbf7 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -751,3 +751,83 @@ skip:
 	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
 				     swap_ra->win == 1);
 }
+
+#ifdef CONFIG_SYSFS
+static ssize_t vma_ra_enabled_show(struct kobject *kobj,
+				     struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%s\n", swap_vma_readahead ? "true" : "false");
+}
+static ssize_t vma_ra_enabled_store(struct kobject *kobj,
+				      struct kobj_attribute *attr,
+				      const char *buf, size_t count)
+{
+	if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
+		swap_vma_readahead = true;
+	else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
+		swap_vma_readahead = false;
+	else
+		return -EINVAL;
+
+	return count;
+}
+static struct kobj_attribute vma_ra_enabled_attr =
+	__ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
+	       vma_ra_enabled_store);
+
+static ssize_t vma_ra_max_order_show(struct kobject *kobj,
+				     struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", swap_ra_max_order);
+}
+static ssize_t vma_ra_max_order_store(struct kobject *kobj,
+				      struct kobj_attribute *attr,
+				      const char *buf, size_t count)
+{
+	int err, v;
+
+	err = kstrtoint(buf, 10, &v);
+	if (err || v > SWAP_RA_ORDER_CEILING || v <= 0)
+		return -EINVAL;
+
+	swap_ra_max_order = v;
+
+	return count;
+}
+static struct kobj_attribute vma_ra_max_order_attr =
+	__ATTR(vma_ra_max_order, 0644, vma_ra_max_order_show,
+	       vma_ra_max_order_store);
+
+static struct attribute *swap_attrs[] = {
+	&vma_ra_enabled_attr.attr,
+	&vma_ra_max_order_attr.attr,
+	NULL,
+};
+
+static struct attribute_group swap_attr_group = {
+	.attrs = swap_attrs,
+};
+
+static int __init swap_init_sysfs(void)
+{
+	int err;
+	struct kobject *swap_kobj;
+
+	swap_kobj = kobject_create_and_add("swap", mm_kobj);
+	if (!swap_kobj) {
+		pr_err("failed to create swap kobject\n");
+		return -ENOMEM;
+	}
+	err = sysfs_create_group(swap_kobj, &swap_attr_group);
+	if (err) {
+		pr_err("failed to register swap group\n");
+		goto delete_obj;
+	}
+	return 0;
+
+delete_obj:
+	kobject_put(swap_kobj);
+	return err;
+}
+subsys_initcall(swap_init_sysfs);
+#endif

From 81a0298bdfab0203d360df7c9bf690d1d457f999 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 6 Sep 2017 16:24:43 -0700
Subject: [PATCH 110/119] mm, swap: don't use VMA based swap readahead if HDD
 is used as swap

VMA based swap readahead reads ahead the virtual pages that are
contiguous in the virtual address space, while the original swap
readahead reads ahead the swap slots that are contiguous in the swap
device.  Although VMA based swap readahead chooses the swap slots to
read ahead more accurately, it triggers more small random reads, which
may degrade the performance of an HDD (hard disk) heavily and may
ultimately outweigh the benefit.

To avoid this issue, this patch disables VMA based swap readahead and
falls back to the original swap readahead whenever an HDD is used as
swap.
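
Just to sketch the resulting gate in one place (plain C with made-up
helper names, not the kernel code; the real changes are in the diff
below):

	#include <stdatomic.h>
	#include <stdbool.h>

	static atomic_int nr_rotate_swap;	/* rotational swap devices */
	static atomic_bool vma_ra_enabled = true;	/* sysfs toggle */

	/* VMA based readahead is used only when no HDD is used as swap */
	static bool use_vma_readahead(void)
	{
		return atomic_load(&vma_ra_enabled) &&
		       !atomic_load(&nr_rotate_swap);
	}

	static void on_swapon(bool rotational)
	{
		if (rotational)
			atomic_fetch_add(&nr_rotate_swap, 1);
	}

	static void on_swapoff(bool rotational)
	{
		if (rotational)
			atomic_fetch_sub(&nr_rotate_swap, 1);
	}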

Link: http://lkml.kernel.org/r/20170807054038.1843-6-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 11 ++++++-----
 mm/swapfile.c        |  8 +++++++-
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 61d63379e956..9c4ae6f14eea 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -400,16 +400,17 @@ extern struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 					   struct vm_fault *vmf,
 					   struct vma_swap_readahead *swap_ra);
 
-static inline bool swap_use_vma_readahead(void)
-{
-	return READ_ONCE(swap_vma_readahead);
-}
-
 /* linux/mm/swapfile.c */
 extern atomic_long_t nr_swap_pages;
 extern long total_swap_pages;
+extern atomic_t nr_rotate_swap;
 extern bool has_usable_swap(void);
 
+static inline bool swap_use_vma_readahead(void)
+{
+	return READ_ONCE(swap_vma_readahead) && !atomic_read(&nr_rotate_swap);
+}
+
 /* Swap 50% full? Release swapcache more aggressively.. */
 static inline bool vm_swap_full(void)
 {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 42eff9e4e972..4f8b3e08a547 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -96,6 +96,8 @@ static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
 /* Activity counter to indicate that a swapon or swapoff has occurred */
 static atomic_t proc_poll_event = ATOMIC_INIT(0);
 
+atomic_t nr_rotate_swap = ATOMIC_INIT(0);
+
 static inline unsigned char swap_count(unsigned char ent)
 {
 	return ent & ~SWAP_HAS_CACHE;	/* may include SWAP_HAS_CONT flag */
@@ -2569,6 +2571,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	if (p->flags & SWP_CONTINUED)
 		free_swap_count_continuations(p);
 
+	if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
+		atomic_dec(&nr_rotate_swap);
+
 	mutex_lock(&swapon_mutex);
 	spin_lock(&swap_lock);
 	spin_lock(&p->lock);
@@ -3145,7 +3150,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 			cluster = per_cpu_ptr(p->percpu_cluster, cpu);
 			cluster_set_null(&cluster->index);
 		}
-	}
+	} else
+		atomic_inc(&nr_rotate_swap);
 
 	error = swap_cgroup_swapon(p->type, maxpages);
 	if (error)

From d30561c56f4114f7d6595a40498ba364ffa6e28e Mon Sep 17 00:00:00 2001
From: Vitaly Wool <vitalywool@gmail.com>
Date: Wed, 6 Sep 2017 16:24:47 -0700
Subject: [PATCH 111/119] z3fold: use per-cpu unbuddied lists

It's been noted that z3fold doesn't scale well when it's run by a large
number of threads on many cores, which can easily be reproduced with the
fio 'randrw' test with --numjobs=32.  E.g. the result for 1 cluster (4
cores) is:

Run status group 0 (all jobs):
   READ: io=244785MB, aggrb=496883KB/s, minb=15527KB/s, ...
  WRITE: io=246735MB, aggrb=500841KB/s, minb=15651KB/s, ...

While for 8 cores (2 clusters) the result is:

Run status group 0 (all jobs):
   READ: io=244785MB, aggrb=265942KB/s, minb=8310KB/s, ...
  WRITE: io=246735MB, aggrb=268060KB/s, minb=8376KB/s, ...

The bottleneck here is the pool lock, which many threads end up waiting
on.  To reduce that spin lock contention, z3fold can operate only on the
lists local to the current CPU whenever possible.  Due to the nature of
z3fold unbuddied list handling (it only takes the first entry off the
list on the hot path), if the z3fold pool is big enough and well enough
balanced, limiting the search to the local unbuddied list doesn't lead
to a significant compression ratio degradation (2.57x vs 2.65x in our
measurements).

This patch also introduces two worker threads: one for async in-page
object layout optimization and one for releasing freed pages.  This is
done to speed up z3fold_free(), which is often on a hot path.

The fio results for the 8-core case are now the following:

Run status group 0 (all jobs):
   READ: io=244785MB, aggrb=1568.3MB/s, minb=50182KB/s, ...
  WRITE: io=246735MB, aggrb=1580.8MB/s, minb=50582KB/s, ...

So we're in for an almost 6x performance increase.
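
The hot paths roughly follow the sketch below (heavily simplified;
refcounting, the re-check of the list head and most error handling are
left out, and 'chunks' and 'page' are assumed to come from the caller;
the full version is in the diff that follows):

	struct list_head *unbuddied;
	struct z3fold_header *zhdr;

	/* allocation: scan this CPU's unbuddied lists first */
	unbuddied = get_cpu_ptr(pool->unbuddied);
	zhdr = list_first_entry_or_null(&unbuddied[chunks],
					struct z3fold_header, buddy);
	if (zhdr && z3fold_page_trylock(zhdr)) {
		spin_lock(&pool->lock);
		list_del_init(&zhdr->buddy);	/* claim the page */
		spin_unlock(&pool->lock);
	}
	put_cpu_ptr(pool->unbuddied);

	/* free: defer in-page compaction to a per-pool workqueue */
	if (!test_and_set_bit(NEEDS_COMPACTING, &page->private))
		queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);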

Link: http://lkml.kernel.org/r/20170806181443.f9b65018f8bde25ef990f9e8@gmail.com
Signed-off-by: Vitaly Wool <vitalywool@gmail.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/z3fold.c | 485 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 347 insertions(+), 138 deletions(-)

diff --git a/mm/z3fold.c b/mm/z3fold.c
index 54f63c4a809a..486550df32be 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -23,10 +23,13 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/atomic.h>
+#include <linux/sched.h>
 #include <linux/list.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/percpu.h>
 #include <linux/preempt.h>
+#include <linux/workqueue.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/zpool.h>
@@ -48,11 +51,15 @@ enum buddy {
 };
 
 /*
- * struct z3fold_header - z3fold page metadata occupying the first chunk of each
+ * struct z3fold_header - z3fold page metadata occupying first chunks of each
  *			z3fold page, except for HEADLESS pages
- * @buddy:	links the z3fold page into the relevant list in the pool
+ * @buddy:		links the z3fold page into the relevant list in the
+ *			pool
  * @page_lock:		per-page lock
- * @refcount:		reference cound for the z3fold page
+ * @refcount:		reference count for the z3fold page
+ * @work:		work_struct for page layout optimization
+ * @pool:		pointer to the pool which this page belongs to
+ * @cpu:		CPU which this page "belongs" to
  * @first_chunks:	the size of the first buddy in chunks, 0 if free
  * @middle_chunks:	the size of the middle buddy in chunks, 0 if free
  * @last_chunks:	the size of the last buddy in chunks, 0 if free
@@ -62,6 +69,9 @@ struct z3fold_header {
 	struct list_head buddy;
 	spinlock_t page_lock;
 	struct kref refcount;
+	struct work_struct work;
+	struct z3fold_pool *pool;
+	short cpu;
 	unsigned short first_chunks;
 	unsigned short middle_chunks;
 	unsigned short last_chunks;
@@ -92,28 +102,39 @@ struct z3fold_header {
 
 /**
  * struct z3fold_pool - stores metadata for each z3fold pool
- * @lock:	protects all pool fields and first|last_chunk fields of any
- *		z3fold page in the pool
- * @unbuddied:	array of lists tracking z3fold pages that contain 2- buddies;
- *		the lists each z3fold page is added to depends on the size of
- *		its free region.
+ * @name:	pool name
+ * @lock:	protects pool unbuddied/lru lists
+ * @stale_lock:	protects pool stale page list
+ * @unbuddied:	per-cpu array of lists tracking z3fold pages that contain 2-
+ *		buddies; the list each z3fold page is added to depends on
+ *		the size of its free region.
  * @lru:	list tracking the z3fold pages in LRU order by most recently
  *		added buddy.
+ * @stale:	list of pages marked for freeing
  * @pages_nr:	number of z3fold pages in the pool.
  * @ops:	pointer to a structure of user defined operations specified at
  *		pool creation time.
+ * @compact_wq:	workqueue for page layout background optimization
+ * @release_wq:	workqueue for safe page release
+ * @work:	work_struct for safe page release
  *
  * This structure is allocated at pool creation time and maintains metadata
  * pertaining to a particular z3fold pool.
  */
 struct z3fold_pool {
+	const char *name;
 	spinlock_t lock;
-	struct list_head unbuddied[NCHUNKS];
+	spinlock_t stale_lock;
+	struct list_head *unbuddied;
 	struct list_head lru;
+	struct list_head stale;
 	atomic64_t pages_nr;
 	const struct z3fold_ops *ops;
 	struct zpool *zpool;
 	const struct zpool_ops *zpool_ops;
+	struct workqueue_struct *compact_wq;
+	struct workqueue_struct *release_wq;
+	struct work_struct work;
 };
 
 /*
@@ -122,9 +143,10 @@ struct z3fold_pool {
 enum z3fold_page_flags {
 	PAGE_HEADLESS = 0,
 	MIDDLE_CHUNK_MAPPED,
+	NEEDS_COMPACTING,
+	PAGE_STALE
 };
 
-
 /*****************
  * Helpers
 *****************/
@@ -138,14 +160,19 @@ static int size_to_chunks(size_t size)
 #define for_each_unbuddied_list(_iter, _begin) \
 	for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
 
+static void compact_page_work(struct work_struct *w);
+
 /* Initializes the z3fold header of a newly allocated z3fold page */
-static struct z3fold_header *init_z3fold_page(struct page *page)
+static struct z3fold_header *init_z3fold_page(struct page *page,
+					struct z3fold_pool *pool)
 {
 	struct z3fold_header *zhdr = page_address(page);
 
 	INIT_LIST_HEAD(&page->lru);
 	clear_bit(PAGE_HEADLESS, &page->private);
 	clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+	clear_bit(NEEDS_COMPACTING, &page->private);
+	clear_bit(PAGE_STALE, &page->private);
 
 	spin_lock_init(&zhdr->page_lock);
 	kref_init(&zhdr->refcount);
@@ -154,7 +181,10 @@ static struct z3fold_header *init_z3fold_page(struct page *page)
 	zhdr->last_chunks = 0;
 	zhdr->first_num = 0;
 	zhdr->start_middle = 0;
+	zhdr->cpu = -1;
+	zhdr->pool = pool;
 	INIT_LIST_HEAD(&zhdr->buddy);
+	INIT_WORK(&zhdr->work, compact_page_work);
 	return zhdr;
 }
 
@@ -164,21 +194,6 @@ static void free_z3fold_page(struct page *page)
 	__free_page(page);
 }
 
-static void release_z3fold_page(struct kref *ref)
-{
-	struct z3fold_header *zhdr;
-	struct page *page;
-
-	zhdr = container_of(ref, struct z3fold_header, refcount);
-	page = virt_to_page(zhdr);
-
-	if (!list_empty(&zhdr->buddy))
-		list_del(&zhdr->buddy);
-	if (!list_empty(&page->lru))
-		list_del(&page->lru);
-	free_z3fold_page(page);
-}
-
 /* Lock a z3fold page */
 static inline void z3fold_page_lock(struct z3fold_header *zhdr)
 {
@@ -228,6 +243,76 @@ static enum buddy handle_to_buddy(unsigned long handle)
 	return (handle - zhdr->first_num) & BUDDY_MASK;
 }
 
+static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
+{
+	struct page *page = virt_to_page(zhdr);
+	struct z3fold_pool *pool = zhdr->pool;
+
+	WARN_ON(!list_empty(&zhdr->buddy));
+	set_bit(PAGE_STALE, &page->private);
+	spin_lock(&pool->lock);
+	if (!list_empty(&page->lru))
+		list_del(&page->lru);
+	spin_unlock(&pool->lock);
+	if (locked)
+		z3fold_page_unlock(zhdr);
+	spin_lock(&pool->stale_lock);
+	list_add(&zhdr->buddy, &pool->stale);
+	queue_work(pool->release_wq, &pool->work);
+	spin_unlock(&pool->stale_lock);
+}
+
+static void __attribute__((__unused__))
+			release_z3fold_page(struct kref *ref)
+{
+	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
+						refcount);
+	__release_z3fold_page(zhdr, false);
+}
+
+static void release_z3fold_page_locked(struct kref *ref)
+{
+	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
+						refcount);
+	WARN_ON(z3fold_page_trylock(zhdr));
+	__release_z3fold_page(zhdr, true);
+}
+
+static void release_z3fold_page_locked_list(struct kref *ref)
+{
+	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
+					       refcount);
+	spin_lock(&zhdr->pool->lock);
+	list_del_init(&zhdr->buddy);
+	spin_unlock(&zhdr->pool->lock);
+
+	WARN_ON(z3fold_page_trylock(zhdr));
+	__release_z3fold_page(zhdr, true);
+}
+
+static void free_pages_work(struct work_struct *w)
+{
+	struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);
+
+	spin_lock(&pool->stale_lock);
+	while (!list_empty(&pool->stale)) {
+		struct z3fold_header *zhdr = list_first_entry(&pool->stale,
+						struct z3fold_header, buddy);
+		struct page *page = virt_to_page(zhdr);
+
+		list_del(&zhdr->buddy);
+		if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
+			continue;
+		clear_bit(NEEDS_COMPACTING, &page->private);
+		spin_unlock(&pool->stale_lock);
+		cancel_work_sync(&zhdr->work);
+		free_z3fold_page(page);
+		cond_resched();
+		spin_lock(&pool->stale_lock);
+	}
+	spin_unlock(&pool->stale_lock);
+}
+
 /*
  * Returns the number of free chunks in a z3fold page.
  * NB: can't be used with HEADLESS pages.
@@ -252,46 +337,6 @@ static int num_free_chunks(struct z3fold_header *zhdr)
 	return nfree;
 }
 
-/*****************
- * API Functions
-*****************/
-/**
- * z3fold_create_pool() - create a new z3fold pool
- * @gfp:	gfp flags when allocating the z3fold pool structure
- * @ops:	user-defined operations for the z3fold pool
- *
- * Return: pointer to the new z3fold pool or NULL if the metadata allocation
- * failed.
- */
-static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
-		const struct z3fold_ops *ops)
-{
-	struct z3fold_pool *pool;
-	int i;
-
-	pool = kzalloc(sizeof(struct z3fold_pool), gfp);
-	if (!pool)
-		return NULL;
-	spin_lock_init(&pool->lock);
-	for_each_unbuddied_list(i, 0)
-		INIT_LIST_HEAD(&pool->unbuddied[i]);
-	INIT_LIST_HEAD(&pool->lru);
-	atomic64_set(&pool->pages_nr, 0);
-	pool->ops = ops;
-	return pool;
-}
-
-/**
- * z3fold_destroy_pool() - destroys an existing z3fold pool
- * @pool:	the z3fold pool to be destroyed
- *
- * The pool should be emptied before this function is called.
- */
-static void z3fold_destroy_pool(struct z3fold_pool *pool)
-{
-	kfree(pool);
-}
-
 static inline void *mchunk_memmove(struct z3fold_header *zhdr,
 				unsigned short dst_chunk)
 {
@@ -347,6 +392,117 @@ static int z3fold_compact_page(struct z3fold_header *zhdr)
 	return 0;
 }
 
+static void do_compact_page(struct z3fold_header *zhdr, bool locked)
+{
+	struct z3fold_pool *pool = zhdr->pool;
+	struct page *page;
+	struct list_head *unbuddied;
+	int fchunks;
+
+	page = virt_to_page(zhdr);
+	if (locked)
+		WARN_ON(z3fold_page_trylock(zhdr));
+	else
+		z3fold_page_lock(zhdr);
+	if (test_bit(PAGE_STALE, &page->private) ||
+	    !test_and_clear_bit(NEEDS_COMPACTING, &page->private)) {
+		z3fold_page_unlock(zhdr);
+		return;
+	}
+	spin_lock(&pool->lock);
+	list_del_init(&zhdr->buddy);
+	spin_unlock(&pool->lock);
+
+	z3fold_compact_page(zhdr);
+	unbuddied = get_cpu_ptr(pool->unbuddied);
+	fchunks = num_free_chunks(zhdr);
+	if (fchunks < NCHUNKS &&
+	    (!zhdr->first_chunks || !zhdr->middle_chunks ||
+			!zhdr->last_chunks)) {
+		/* the page's not completely free and it's unbuddied */
+		spin_lock(&pool->lock);
+		list_add(&zhdr->buddy, &unbuddied[fchunks]);
+		spin_unlock(&pool->lock);
+		zhdr->cpu = smp_processor_id();
+	}
+	put_cpu_ptr(pool->unbuddied);
+	z3fold_page_unlock(zhdr);
+}
+
+static void compact_page_work(struct work_struct *w)
+{
+	struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
+						work);
+
+	do_compact_page(zhdr, false);
+}
+
+
+/*
+ * API Functions
+ */
+
+/**
+ * z3fold_create_pool() - create a new z3fold pool
+ * @name:	pool name
+ * @gfp:	gfp flags when allocating the z3fold pool structure
+ * @ops:	user-defined operations for the z3fold pool
+ *
+ * Return: pointer to the new z3fold pool or NULL if the metadata allocation
+ * failed.
+ */
+static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
+		const struct z3fold_ops *ops)
+{
+	struct z3fold_pool *pool = NULL;
+	int i, cpu;
+
+	pool = kzalloc(sizeof(struct z3fold_pool), gfp);
+	if (!pool)
+		goto out;
+	spin_lock_init(&pool->lock);
+	spin_lock_init(&pool->stale_lock);
+	pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
+	for_each_possible_cpu(cpu) {
+		struct list_head *unbuddied =
+				per_cpu_ptr(pool->unbuddied, cpu);
+		for_each_unbuddied_list(i, 0)
+			INIT_LIST_HEAD(&unbuddied[i]);
+	}
+	INIT_LIST_HEAD(&pool->lru);
+	INIT_LIST_HEAD(&pool->stale);
+	atomic64_set(&pool->pages_nr, 0);
+	pool->name = name;
+	pool->compact_wq = create_singlethread_workqueue(pool->name);
+	if (!pool->compact_wq)
+		goto out;
+	pool->release_wq = create_singlethread_workqueue(pool->name);
+	if (!pool->release_wq)
+		goto out_wq;
+	INIT_WORK(&pool->work, free_pages_work);
+	pool->ops = ops;
+	return pool;
+
+out_wq:
+	destroy_workqueue(pool->compact_wq);
+out:
+	kfree(pool);
+	return NULL;
+}
+
+/**
+ * z3fold_destroy_pool() - destroys an existing z3fold pool
+ * @pool:	the z3fold pool to be destroyed
+ *
+ * The pool should be emptied before this function is called.
+ */
+static void z3fold_destroy_pool(struct z3fold_pool *pool)
+{
+	destroy_workqueue(pool->release_wq);
+	destroy_workqueue(pool->compact_wq);
+	kfree(pool);
+}
+
 /**
  * z3fold_alloc() - allocates a region of a given size
  * @pool:	z3fold pool from which to allocate
@@ -371,8 +527,9 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
 {
 	int chunks = 0, i, freechunks;
 	struct z3fold_header *zhdr = NULL;
+	struct page *page = NULL;
 	enum buddy bud;
-	struct page *page;
+	bool can_sleep = (gfp & __GFP_RECLAIM) == __GFP_RECLAIM;
 
 	if (!size || (gfp & __GFP_HIGHMEM))
 		return -EINVAL;
@@ -383,23 +540,57 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
 	if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
 		bud = HEADLESS;
 	else {
+		struct list_head *unbuddied;
 		chunks = size_to_chunks(size);
 
+lookup:
 		/* First, try to find an unbuddied z3fold page. */
-		zhdr = NULL;
+		unbuddied = get_cpu_ptr(pool->unbuddied);
 		for_each_unbuddied_list(i, chunks) {
-			spin_lock(&pool->lock);
-			zhdr = list_first_entry_or_null(&pool->unbuddied[i],
+			struct list_head *l = &unbuddied[i];
+
+			zhdr = list_first_entry_or_null(READ_ONCE(l),
 						struct z3fold_header, buddy);
-			if (!zhdr || !z3fold_page_trylock(zhdr)) {
-				spin_unlock(&pool->lock);
+
+			if (!zhdr)
 				continue;
+
+			/* Re-check under lock. */
+			spin_lock(&pool->lock);
+			l = &unbuddied[i];
+			if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
+					struct z3fold_header, buddy)) ||
+			    !z3fold_page_trylock(zhdr)) {
+				spin_unlock(&pool->lock);
+				put_cpu_ptr(pool->unbuddied);
+				goto lookup;
 			}
-			kref_get(&zhdr->refcount);
 			list_del_init(&zhdr->buddy);
+			zhdr->cpu = -1;
 			spin_unlock(&pool->lock);
 
 			page = virt_to_page(zhdr);
+			if (test_bit(NEEDS_COMPACTING, &page->private)) {
+				z3fold_page_unlock(zhdr);
+				zhdr = NULL;
+				put_cpu_ptr(pool->unbuddied);
+				if (can_sleep)
+					cond_resched();
+				goto lookup;
+			}
+
+			/*
+			 * this page could not be removed from its unbuddied
+			 * list while pool lock was held, and then we've taken
+			 * page lock so kref_put could not be called before
+			 * we got here, so it's safe to just call kref_get()
+			 */
+			kref_get(&zhdr->refcount);
+			break;
+		}
+		put_cpu_ptr(pool->unbuddied);
+
+		if (zhdr) {
 			if (zhdr->first_chunks == 0) {
 				if (zhdr->middle_chunks != 0 &&
 				    chunks >= zhdr->start_middle)
@@ -411,32 +602,49 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
 			else if (zhdr->middle_chunks == 0)
 				bud = MIDDLE;
 			else {
-				z3fold_page_unlock(zhdr);
-				spin_lock(&pool->lock);
 				if (kref_put(&zhdr->refcount,
-					     release_z3fold_page))
+					     release_z3fold_page_locked))
 					atomic64_dec(&pool->pages_nr);
-				spin_unlock(&pool->lock);
+				else
+					z3fold_page_unlock(zhdr);
 				pr_err("No free chunks in unbuddied\n");
 				WARN_ON(1);
-				continue;
+				goto lookup;
 			}
 			goto found;
 		}
 		bud = FIRST;
 	}
 
-	/* Couldn't find unbuddied z3fold page, create new one */
-	page = alloc_page(gfp);
+	spin_lock(&pool->stale_lock);
+	zhdr = list_first_entry_or_null(&pool->stale,
+					struct z3fold_header, buddy);
+	/*
+	 * Before allocating a page, let's see if we can take one from the
+	 * stale pages list. cancel_work_sync() can sleep so we must make
+	 * sure it won't be called in case we're in atomic context.
+	 */
+	if (zhdr && (can_sleep || !work_pending(&zhdr->work) ||
+	    !unlikely(work_busy(&zhdr->work)))) {
+		list_del(&zhdr->buddy);
+		clear_bit(NEEDS_COMPACTING, &page->private);
+		spin_unlock(&pool->stale_lock);
+		if (can_sleep)
+			cancel_work_sync(&zhdr->work);
+		page = virt_to_page(zhdr);
+	} else {
+		spin_unlock(&pool->stale_lock);
+		page = alloc_page(gfp);
+	}
+
 	if (!page)
 		return -ENOMEM;
 
 	atomic64_inc(&pool->pages_nr);
-	zhdr = init_z3fold_page(page);
+	zhdr = init_z3fold_page(page, pool);
 
 	if (bud == HEADLESS) {
 		set_bit(PAGE_HEADLESS, &page->private);
-		spin_lock(&pool->lock);
 		goto headless;
 	}
 	z3fold_page_lock(zhdr);
@@ -451,15 +659,21 @@ found:
 		zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
 	}
 
-	spin_lock(&pool->lock);
 	if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
 			zhdr->middle_chunks == 0) {
+		struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
+
 		/* Add to unbuddied list */
 		freechunks = num_free_chunks(zhdr);
-		list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+		spin_lock(&pool->lock);
+		list_add(&zhdr->buddy, &unbuddied[freechunks]);
+		spin_unlock(&pool->lock);
+		zhdr->cpu = smp_processor_id();
+		put_cpu_ptr(pool->unbuddied);
 	}
 
 headless:
+	spin_lock(&pool->lock);
 	/* Add/move z3fold page to beginning of LRU */
 	if (!list_empty(&page->lru))
 		list_del(&page->lru);
@@ -487,7 +701,6 @@ headless:
 static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
 {
 	struct z3fold_header *zhdr;
-	int freechunks;
 	struct page *page;
 	enum buddy bud;
 
@@ -526,25 +739,27 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
 		spin_unlock(&pool->lock);
 		free_z3fold_page(page);
 		atomic64_dec(&pool->pages_nr);
-	} else {
-		if (zhdr->first_chunks != 0 || zhdr->middle_chunks != 0 ||
-		    zhdr->last_chunks != 0) {
-			z3fold_compact_page(zhdr);
-			/* Add to the unbuddied list */
-			spin_lock(&pool->lock);
-			if (!list_empty(&zhdr->buddy))
-				list_del(&zhdr->buddy);
-			freechunks = num_free_chunks(zhdr);
-			list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
-			spin_unlock(&pool->lock);
-		}
-		z3fold_page_unlock(zhdr);
-		spin_lock(&pool->lock);
-		if (kref_put(&zhdr->refcount, release_z3fold_page))
-			atomic64_dec(&pool->pages_nr);
-		spin_unlock(&pool->lock);
+		return;
 	}
 
+	if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
+		atomic64_dec(&pool->pages_nr);
+		return;
+	}
+	if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
+		z3fold_page_unlock(zhdr);
+		return;
+	}
+	if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
+		spin_lock(&pool->lock);
+		list_del_init(&zhdr->buddy);
+		spin_unlock(&pool->lock);
+		zhdr->cpu = -1;
+		do_compact_page(zhdr, true);
+		return;
+	}
+	queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
+	z3fold_page_unlock(zhdr);
 }
 
 /**
@@ -585,9 +800,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
  */
 static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
 {
-	int i, ret = 0, freechunks;
-	struct z3fold_header *zhdr;
-	struct page *page;
+	int i, ret = 0;
+	struct z3fold_header *zhdr = NULL;
+	struct page *page = NULL;
+	struct list_head *pos;
 	unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
 
 	spin_lock(&pool->lock);
@@ -600,16 +816,24 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
 			spin_unlock(&pool->lock);
 			return -EINVAL;
 		}
-		page = list_last_entry(&pool->lru, struct page, lru);
-		list_del_init(&page->lru);
+		list_for_each_prev(pos, &pool->lru) {
+			page = list_entry(pos, struct page, lru);
+			if (test_bit(PAGE_HEADLESS, &page->private))
+				/* candidate found */
+				break;
 
-		zhdr = page_address(page);
-		if (!test_bit(PAGE_HEADLESS, &page->private)) {
-			if (!list_empty(&zhdr->buddy))
-				list_del_init(&zhdr->buddy);
+			zhdr = page_address(page);
+			if (!z3fold_page_trylock(zhdr))
+				continue; /* can't evict at this point */
 			kref_get(&zhdr->refcount);
-			spin_unlock(&pool->lock);
-			z3fold_page_lock(zhdr);
+			list_del_init(&zhdr->buddy);
+			zhdr->cpu = -1;
+		}
+
+		list_del_init(&page->lru);
+		spin_unlock(&pool->lock);
+
+		if (!test_bit(PAGE_HEADLESS, &page->private)) {
 			/*
 			 * We need encode the handles before unlocking, since
 			 * we can race with free that will set
@@ -624,11 +848,14 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
 				middle_handle = encode_handle(zhdr, MIDDLE);
 			if (zhdr->last_chunks)
 				last_handle = encode_handle(zhdr, LAST);
+			/*
+			 * it's safe to unlock here because we hold a
+			 * reference to this page
+			 */
 			z3fold_page_unlock(zhdr);
 		} else {
 			first_handle = encode_handle(zhdr, HEADLESS);
 			last_handle = middle_handle = 0;
-			spin_unlock(&pool->lock);
 		}
 
 		/* Issue the eviction callback(s) */
@@ -652,31 +879,12 @@ next:
 			if (ret == 0) {
 				free_z3fold_page(page);
 				return 0;
-			} else {
-				spin_lock(&pool->lock);
-			}
-		} else {
-			z3fold_page_lock(zhdr);
-			if ((zhdr->first_chunks || zhdr->last_chunks ||
-			     zhdr->middle_chunks) &&
-			    !(zhdr->first_chunks && zhdr->last_chunks &&
-			      zhdr->middle_chunks)) {
-				z3fold_compact_page(zhdr);
-				/* add to unbuddied list */
-				spin_lock(&pool->lock);
-				freechunks = num_free_chunks(zhdr);
-				list_add(&zhdr->buddy,
-					 &pool->unbuddied[freechunks]);
-				spin_unlock(&pool->lock);
-			}
-			z3fold_page_unlock(zhdr);
-			spin_lock(&pool->lock);
-			if (kref_put(&zhdr->refcount, release_z3fold_page)) {
-				spin_unlock(&pool->lock);
-				atomic64_dec(&pool->pages_nr);
-				return 0;
 			}
+		} else if (kref_put(&zhdr->refcount, release_z3fold_page)) {
+			atomic64_dec(&pool->pages_nr);
+			return 0;
 		}
+		spin_lock(&pool->lock);
 
 		/*
 		 * Add to the beginning of LRU.
@@ -795,7 +1003,8 @@ static void *z3fold_zpool_create(const char *name, gfp_t gfp,
 {
 	struct z3fold_pool *pool;
 
-	pool = z3fold_create_pool(gfp, zpool_ops ? &z3fold_zpool_ops : NULL);
+	pool = z3fold_create_pool(name, gfp,
+				zpool_ops ? &z3fold_zpool_ops : NULL);
 	if (pool) {
 		pool->zpool = zpool;
 		pool->zpool_ops = zpool_ops;

From cd04ae1e2dc8e3651b8c427ec1b9500c6eed7b90 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 6 Sep 2017 16:24:50 -0700
Subject: [PATCH 112/119] mm, oom: do not rely on TIF_MEMDIE for memory
 reserves access

For ages we have been relying on the TIF_MEMDIE thread flag to mark OOM
victims and then, among other things, to give these threads full access
to memory reserves.  There are a few shortcomings of this
implementation, though.

First of all, and most seriously, full access to memory reserves is
quite dangerous because it leaves no safety margin for the system to
operate and to take last-resort emergency steps to move on.

Secondly, this flag is per task_struct while the OOM killer operates at
mm_struct granularity, so all processes sharing the given mm are killed.
Giving full access to all these task_structs could lead to quick memory
reserve depletion.  We have tried to reduce this risk by giving
TIF_MEMDIE only to the main thread and to the currently allocating task,
but that doesn't really solve the problem while it surely opens up room
for corner cases - e.g. GFP_NO{FS,IO} requests might loop inside the
allocator without access to memory reserves because a particular thread
was not the group leader.

Now that we have the oom reaper and all oom victims are reapable after
1b51e65eab64 ("oom, oom_reaper: allow to reap mm shared by the
kthreads"), we can be more conservative and grant only partial access to
memory reserves because there is a reasonable chance of parallel memory
freeing.  We still want some access to the reserves because we do not
want other consumers to eat up the victim's freed memory.  oom victims
will still contend with __GFP_HIGH users, but those shouldn't be so
aggressive as to starve oom victims completely.

Introduce an ALLOC_OOM flag and give all tsk_is_oom_victim tasks access
to half of the reserves.  This makes the access to reserves independent
of which task has passed through mark_oom_victim.  Also drop any usage
of TIF_MEMDIE from the page allocator proper and replace it with
tsk_is_oom_victim as well, which finally makes page_alloc.c completely
TIF_MEMDIE free.

CONFIG_MMU=n doesn't have the oom reaper, so let's stick to the original
ALLOC_NO_WATERMARKS approach there.

There is demand to make the oom killer memcg aware, which will imply
many tasks being killed at once.  This change will allow such a use case
without worrying about completely depleting the memory reserves.
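
As a rough illustration of what partial access means for the watermark
check (a sketch with made-up numbers, not the allocator code; see the
__zone_watermark_ok() hunk below):

	/*
	 * With a min watermark of e.g. 4096 pages:
	 *   normal ALLOC_HARDER user:  min -= min / 4  ->  3072 pages
	 *   ALLOC_OOM (oom victim):    min -= min / 2  ->  2048 pages
	 * An oom victim may dip twice as deep into the reserves, but
	 * unlike ALLOC_NO_WATERMARKS it can never consume them completely.
	 */
	static long adjusted_min(long min, bool oom_victim)
	{
		return oom_victim ? min - min / 2 : min - min / 4;
	}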

Link: http://lkml.kernel.org/r/20170810075019.28998-2-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/internal.h   | 11 +++++++
 mm/oom_kill.c   |  9 +++---
 mm/page_alloc.c | 80 ++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 75 insertions(+), 25 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 781c0d54d75a..1df011f62480 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -480,6 +480,17 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 /* Mask to get the watermark bits */
 #define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS-1)
 
+/*
+ * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
+ * cannot assume a reduced access to memory reserves is sufficient for
+ * !MMU
+ */
+#ifdef CONFIG_MMU
+#define ALLOC_OOM		0x08
+#else
+#define ALLOC_OOM		ALLOC_NO_WATERMARKS
+#endif
+
 #define ALLOC_HARDER		0x10 /* try to alloc harder */
 #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
 #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9e8b4f030c1c..c9f3569a76c7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -824,7 +824,8 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
-	 * its children or threads, just set TIF_MEMDIE so it can die quickly
+	 * its children or threads, just give it access to memory reserves
+	 * so it can die quickly
 	 */
 	task_lock(p);
 	if (task_will_free_mem(p)) {
@@ -889,9 +890,9 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	count_memcg_event_mm(mm, OOM_KILL);
 
 	/*
-	 * We should send SIGKILL before setting TIF_MEMDIE in order to prevent
-	 * the OOM victim from depleting the memory reserves from the user
-	 * space under its control.
+	 * We should send SIGKILL before granting access to memory reserves
+	 * in order to prevent the OOM victim from depleting the memory
+	 * reserves from the user space under its control.
 	 */
 	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
 	mark_oom_victim(victim);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a4562c058ec4..a9add06fe768 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2951,7 +2951,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 {
 	long min = mark;
 	int o;
-	const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
+	const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
 
 	/* free_pages may go negative - that's OK */
 	free_pages -= (1 << order) - 1;
@@ -2964,10 +2964,21 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 	 * the high-atomic reserves. This will over-estimate the size of the
 	 * atomic reserve but it avoids a search.
 	 */
-	if (likely(!alloc_harder))
+	if (likely(!alloc_harder)) {
 		free_pages -= z->nr_reserved_highatomic;
-	else
-		min -= min / 4;
+	} else {
+		/*
+		 * OOM victims can try even harder than normal ALLOC_HARDER
+		 * users on the grounds that it's definitely going to be in
+		 * the exit path shortly and free memory. Any allocation it
+		 * makes during the free path will be small and short-lived.
+		 */
+		if (alloc_flags & ALLOC_OOM)
+			min -= min / 2;
+		else
+			min -= min / 4;
+	}
+
 
 #ifdef CONFIG_CMA
 	/* If allocation can't use CMA areas don't use free CMA pages */
@@ -3205,7 +3216,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
 	 * of allowed nodes.
 	 */
 	if (!(gfp_mask & __GFP_NOMEMALLOC))
-		if (test_thread_flag(TIF_MEMDIE) ||
+		if (tsk_is_oom_victim(current) ||
 		    (current->flags & (PF_MEMALLOC | PF_EXITING)))
 			filter &= ~SHOW_MEM_FILTER_NODES;
 	if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
@@ -3668,21 +3679,46 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 	return alloc_flags;
 }
 
-bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+static bool oom_reserves_allowed(struct task_struct *tsk)
 {
-	if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+	if (!tsk_is_oom_victim(tsk))
 		return false;
 
-	if (gfp_mask & __GFP_MEMALLOC)
-		return true;
-	if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
-		return true;
-	if (!in_interrupt() &&
-			((current->flags & PF_MEMALLOC) ||
-			 unlikely(test_thread_flag(TIF_MEMDIE))))
-		return true;
+	/*
+	 * !MMU doesn't have oom reaper so give access to memory reserves
+	 * only to the thread with TIF_MEMDIE set
+	 */
+	if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
+		return false;
 
-	return false;
+	return true;
+}
+
+/*
+ * Distinguish requests which really need access to full memory
+ * reserves from oom victims which can live with a portion of it
+ */
+static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
+{
+	if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+		return 0;
+	if (gfp_mask & __GFP_MEMALLOC)
+		return ALLOC_NO_WATERMARKS;
+	if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
+		return ALLOC_NO_WATERMARKS;
+	if (!in_interrupt()) {
+		if (current->flags & PF_MEMALLOC)
+			return ALLOC_NO_WATERMARKS;
+		else if (oom_reserves_allowed(current))
+			return ALLOC_OOM;
+	}
+
+	return 0;
+}
+
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+	return !!__gfp_pfmemalloc_flags(gfp_mask);
 }
 
 /*
@@ -3835,6 +3871,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned long alloc_start = jiffies;
 	unsigned int stall_timeout = 10 * HZ;
 	unsigned int cpuset_mems_cookie;
+	int reserve_flags;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -3940,15 +3977,16 @@ retry:
 	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
 		wake_all_kswapds(order, ac);
 
-	if (gfp_pfmemalloc_allowed(gfp_mask))
-		alloc_flags = ALLOC_NO_WATERMARKS;
+	reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
+	if (reserve_flags)
+		alloc_flags = reserve_flags;
 
 	/*
 	 * Reset the zonelist iterators if memory policies can be ignored.
 	 * These allocations are high priority and system rather than user
 	 * orientated.
 	 */
-	if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
+	if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
 		ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
 		ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
 					ac->high_zoneidx, ac->nodemask);
@@ -4025,8 +4063,8 @@ retry:
 		goto got_pg;
 
 	/* Avoid allocations with no watermarks from looping endlessly */
-	if (test_thread_flag(TIF_MEMDIE) &&
-	    (alloc_flags == ALLOC_NO_WATERMARKS ||
+	if (tsk_is_oom_victim(current) &&
+	    (alloc_flags == ALLOC_OOM ||
 	     (gfp_mask & __GFP_NOMEMALLOC)))
 		goto nopage;
 

From da99ecf117fce6570bd3989263d68ee0007e1249 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 6 Sep 2017 16:24:53 -0700
Subject: [PATCH 113/119] mm: replace TIF_MEMDIE checks by tsk_is_oom_victim

TIF_MEMDIE is set only for tasks which were either directly selected by
the OOM killer or passed through mark_oom_victim from the allocator
path.  tsk_is_oom_victim is more generic and allows identifying all
tasks (threads) which share the mm with the oom victim.

Please note that the freezer still needs to check TIF_MEMDIE because we
cannot thaw tasks which do not participate in oom_victims counting;
otherwise a !TIF_MEMDIE task could interfere after oom_disable returns.

Link: http://lkml.kernel.org/r/20170810075019.28998-3-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup/cpuset.c | 9 +++++----
 mm/memcontrol.c        | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2f4039bafebb..e7485786db9b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -56,6 +56,7 @@
 #include <linux/time64.h>
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
+#include <linux/oom.h>
 
 #include <linux/uaccess.h>
 #include <linux/atomic.h>
@@ -2500,12 +2501,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  * If we're in interrupt, yes, we can always allocate.  If @node is set in
  * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this
  * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
- * yes.  If current has access to memory reserves due to TIF_MEMDIE, yes.
+ * yes.  If current has access to memory reserves as an oom victim, yes.
  * Otherwise, no.
  *
  * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
  * and do not allow allocations outside the current tasks cpuset
- * unless the task has been OOM killed as is marked TIF_MEMDIE.
+ * unless the task has been OOM killed.
  * GFP_KERNEL allocations are not so marked, so can escape to the
  * nearest enclosing hardwalled ancestor cpuset.
  *
@@ -2528,7 +2529,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  * affect that:
  *	in_interrupt - any node ok (current task context irrelevant)
  *	GFP_ATOMIC   - any node ok
- *	TIF_MEMDIE   - any node ok
+ *	tsk_is_oom_victim   - any node ok
  *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
  *	GFP_USER     - only nodes in current tasks mems allowed ok.
  */
@@ -2546,7 +2547,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
 	 * Allow tasks that have access to memory reserves because they have
 	 * been OOM killed to get memory anywhere.
 	 */
-	if (unlikely(test_thread_flag(TIF_MEMDIE)))
+	if (unlikely(tsk_is_oom_victim(current)))
 		return true;
 	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
 		return false;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c1f9b79817d7..ad15850ee157 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1917,7 +1917,7 @@ retry:
 	 * bypass the last charges so that they can exit quickly and
 	 * free their memory.
 	 */
-	if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+	if (unlikely(tsk_is_oom_victim(current) ||
 		     fatal_signal_pending(current) ||
 		     current->flags & PF_EXITING))
 		goto force;

From a2468cc9bfdff6139f59ca896671e5819ff5f94a Mon Sep 17 00:00:00 2001
From: Aaron Lu <aaron.lu@intel.com>
Date: Wed, 6 Sep 2017 16:24:57 -0700
Subject: [PATCH 114/119] swap: choose swap device according to numa node

If the system has more than one swap device and the swap devices have
node information, we can make use of this information to decide which swap
device to use in get_swap_pages() to get better performance.

The current code uses a priority based list, swap_avail_list, to decide
which swap device to use, and if multiple swap devices share the same
priority, they are used round robin.  This patch changes the previous
single global swap_avail_list into a per-numa-node list, i.e. each numa
node sees its own priority based list of available swap devices.  A swap
device's priority can be promoted on its matching node's
swap_avail_list.

Currently a swap device's priority is set as follows: the user can set a
value >= 0, or the system will pick one starting from -1 and going
downwards.  The priority value in the swap_avail_list is the negation of
the swap device's priority, because plists are sorted from low to high.
The new policy doesn't change the semantics for the priority >= 0 cases;
the previous 'starting from -1 then downwards' now becomes 'starting
from -2 then downwards', and -1 is reserved as the promoted value.

Take a 4-node EX machine as an example: suppose 4 swap devices are
available, each sitting on a different node:
swapA on node 0
swapB on node 1
swapC on node 2
swapD on node 3

After they are all swapped on in the sequence A, B, C, D:

Current behaviour:
their priorities will be:
swapA: -1
swapB: -2
swapC: -3
swapD: -4
And their position in the global swap_avail_list will be:
swapA   -> swapB   -> swapC   -> swapD
prio:1     prio:2     prio:3     prio:4

New behaviour:
their priorities will be (note that -1 is skipped):
swapA: -2
swapB: -3
swapC: -4
swapD: -5
And their positions in the 4 swap_avail_lists[nid] will be:
swap_avail_lists[0]: /* node 0's available swap device list */
swapA   -> swapB   -> swapC   -> swapD
prio:1     prio:3     prio:4     prio:5
swap_avail_lists[1]: /* node 1's available swap device list */
swapB   -> swapA   -> swapC   -> swapD
prio:1     prio:2     prio:4     prio:5
swap_avail_lists[2]: /* node 2's available swap device list */
swapC   -> swapA   -> swapB   -> swapD
prio:1     prio:2     prio:3     prio:5
swap_avail_lists[3]: /* node 3's available swap device list */
swapD   -> swapA   -> swapB   -> swapC
prio:1     prio:2     prio:3     prio:4
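
In code terms, the promotion above boils down to roughly the following
sketch; plist keys are the negated priorities because plists sort from
low to high (the real version is in the _enable_swap_info() hunk below):

	for_each_node(nid) {
		if (p->prio >= 0)			/* user-set priority */
			p->avail_lists[nid].prio = -p->prio;
		else if (swap_node(p) == nid)		/* promote locally */
			p->avail_lists[nid].prio = 1;	/* i.e. priority -1 */
		else
			p->avail_lists[nid].prio = -p->prio;
	}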

To see the effect of the patch, a test is used that starts N processes,
each of which mmaps a region of anonymous memory and then continually
writes to it at random positions to trigger both swap in and swap out.

On a 2-node Skylake EP machine with 64GiB memory, two 170GB SSD drives
are used as swap devices, each attached to a different node.  The
results are:

runtime=30m/processes=32/total test size=128G/each process mmap region=4G
kernel         throughput
vanilla        13306
auto-binding   15169 +14%

runtime=30m/processes=64/total test size=128G/each process mmap region=2G
kernel         throughput
vanilla        11885
auto-binding   14879 +25%

[aaron.lu@intel.com: v2]
  Link: http://lkml.kernel.org/r/20170814053130.GD2369@aaronlu.sh.intel.com
  Link: http://lkml.kernel.org/r/20170816024439.GA10925@aaronlu.sh.intel.com
[akpm@linux-foundation.org: use kmalloc_array()]
Link: http://lkml.kernel.org/r/20170814053130.GD2369@aaronlu.sh.intel.com
Link: http://lkml.kernel.org/r/20170816024439.GA10925@aaronlu.sh.intel.com
Signed-off-by: Aaron Lu <aaron.lu@intel.com>
Cc: "Chen, Tim C" <tim.c.chen@intel.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/vm/swap_numa.txt |  69 +++++++++++++++++++
 include/linux/swap.h           |   2 +-
 mm/swapfile.c                  | 120 ++++++++++++++++++++++++++-------
 3 files changed, 164 insertions(+), 27 deletions(-)
 create mode 100644 Documentation/vm/swap_numa.txt

diff --git a/Documentation/vm/swap_numa.txt b/Documentation/vm/swap_numa.txt
new file mode 100644
index 000000000000..d5960c9124f5
--- /dev/null
+++ b/Documentation/vm/swap_numa.txt
@@ -0,0 +1,69 @@
+Automatically bind swap device to numa node
+-------------------------------------------
+
+If the system has more than one swap device and the swap devices have node
+information, we can make use of this information to decide which swap
+device to use in get_swap_pages() to get better performance.
+
+
+How to use this feature
+-----------------------
+
+Each swap device has a priority and that decides the order in which it is used.
+To make use of automatic binding, there is no need to manipulate priority settings
+for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and
+swapB, with swapA attached to node 0 and swapB attached to node 1, are going
+to be swapped on. Simply swapping them on by doing:
+# swapon /dev/swapA
+# swapon /dev/swapB
+
+Then node 0 will use the two swap devices in the order of swapA then swapB and
+node 1 will use the two swap devices in the order of swapB then swapA. Note
+that the order of them being swapped on doesn't matter.
+
+A more complex example on a 4 node machine. Assume 6 swap devices are going to
+be swapped on: swapA and swapB are attached to node 0, swapC is attached to
+node 1, swapD and swapE are attached to node 2 and swapF is attached to node3.
+The way to swap them on is the same as above:
+# swapon /dev/swapA
+# swapon /dev/swapB
+# swapon /dev/swapC
+# swapon /dev/swapD
+# swapon /dev/swapE
+# swapon /dev/swapF
+
+Then node 0 will use them in the order of:
+swapA/swapB -> swapC -> swapD -> swapE -> swapF
+swapA and swapB will be used in a round robin mode before any other swap device.
+
+node 1 will use them in the order of:
+swapC -> swapA -> swapB -> swapD -> swapE -> swapF
+
+node 2 will use them in the order of:
+swapD/swapE -> swapA -> swapB -> swapC -> swapF
+Similarly, swapD and swapE will be used in a round robin mode before any
+other swap devices.
+
+node 3 will use them in the order of:
+swapF -> swapA -> swapB -> swapC -> swapD -> swapE
+
+
+Implementation details
+----------------------
+
+The current code uses a priority based list, swap_avail_list, to decide
+which swap device to use and if multiple swap devices share the same
+priority, they are used round robin. This change here replaces the single
+global swap_avail_list with a per-numa-node list, i.e. for each numa node,
+it sees its own priority based list of available swap devices. Swap
+device's priority can be promoted on its matching node's swap_avail_list.
+
+The current swap device's priority is set as: user can set a >=0 value,
+or the system will pick one starting from -1 then downwards. The priority
+value in the swap_avail_list is the negated value of the swap device's
+due to plist being sorted from low to high. The new policy doesn't change
+the semantics for priority >=0 cases, the previous starting from -1 then
+downwards now becomes starting from -2 then downwards and -1 is reserved
+as the promoted value. So if multiple swap devices are attached to the same
+node, they will all be promoted to priority -1 on that node's plist and will
+be used round robin before any other swap devices.
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 9c4ae6f14eea..8bf3487fb204 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -212,7 +212,7 @@ struct swap_info_struct {
 	unsigned long	flags;		/* SWP_USED etc: see above */
 	signed short	prio;		/* swap priority of this type */
 	struct plist_node list;		/* entry in swap_active_head */
-	struct plist_node avail_list;	/* entry in swap_avail_head */
+	struct plist_node avail_lists[MAX_NUMNODES];/* entry in swap_avail_heads */
 	signed char	type;		/* strange name for an index */
 	unsigned int	max;		/* extent of the swap_map */
 	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4f8b3e08a547..d483278ee35b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -60,7 +60,7 @@ atomic_long_t nr_swap_pages;
 EXPORT_SYMBOL_GPL(nr_swap_pages);
 /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
 long total_swap_pages;
-static int least_priority;
+static int least_priority = -1;
 
 static const char Bad_file[] = "Bad swap file entry ";
 static const char Unused_file[] = "Unused swap file entry ";
@@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head);
  * is held and the locking order requires swap_lock to be taken
  * before any swap_info_struct->lock.
  */
-static PLIST_HEAD(swap_avail_head);
+struct plist_head *swap_avail_heads;
 static DEFINE_SPINLOCK(swap_avail_lock);
 
 struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -592,6 +592,21 @@ new_cluster:
 	return found_free;
 }
 
+static void __del_from_avail_list(struct swap_info_struct *p)
+{
+	int nid;
+
+	for_each_node(nid)
+		plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
+}
+
+static void del_from_avail_list(struct swap_info_struct *p)
+{
+	spin_lock(&swap_avail_lock);
+	__del_from_avail_list(p);
+	spin_unlock(&swap_avail_lock);
+}
+
 static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
 			     unsigned int nr_entries)
 {
@@ -605,12 +620,22 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
 	if (si->inuse_pages == si->pages) {
 		si->lowest_bit = si->max;
 		si->highest_bit = 0;
-		spin_lock(&swap_avail_lock);
-		plist_del(&si->avail_list, &swap_avail_head);
-		spin_unlock(&swap_avail_lock);
+		del_from_avail_list(si);
 	}
 }
 
+static void add_to_avail_list(struct swap_info_struct *p)
+{
+	int nid;
+
+	spin_lock(&swap_avail_lock);
+	for_each_node(nid) {
+		WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
+		plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
+	}
+	spin_unlock(&swap_avail_lock);
+}
+
 static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
 			    unsigned int nr_entries)
 {
@@ -623,13 +648,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
 		bool was_full = !si->highest_bit;
 
 		si->highest_bit = end;
-		if (was_full && (si->flags & SWP_WRITEOK)) {
-			spin_lock(&swap_avail_lock);
-			WARN_ON(!plist_node_empty(&si->avail_list));
-			if (plist_node_empty(&si->avail_list))
-				plist_add(&si->avail_list, &swap_avail_head);
-			spin_unlock(&swap_avail_lock);
-		}
+		if (was_full && (si->flags & SWP_WRITEOK))
+			add_to_avail_list(si);
 	}
 	atomic_long_add(nr_entries, &nr_swap_pages);
 	si->inuse_pages -= nr_entries;
@@ -910,6 +930,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
 	struct swap_info_struct *si, *next;
 	long avail_pgs;
 	int n_ret = 0;
+	int node;
 
 	/* Only single cluster request supported */
 	WARN_ON_ONCE(n_goal > 1 && cluster);
@@ -929,14 +950,15 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
 	spin_lock(&swap_avail_lock);
 
 start_over:
-	plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
+	node = numa_node_id();
+	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
 		/* requeue si to after same-priority siblings */
-		plist_requeue(&si->avail_list, &swap_avail_head);
+		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
 		spin_unlock(&swap_avail_lock);
 		spin_lock(&si->lock);
 		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
 			spin_lock(&swap_avail_lock);
-			if (plist_node_empty(&si->avail_list)) {
+			if (plist_node_empty(&si->avail_lists[node])) {
 				spin_unlock(&si->lock);
 				goto nextsi;
 			}
@@ -946,7 +968,7 @@ start_over:
 			WARN(!(si->flags & SWP_WRITEOK),
 			     "swap_info %d in list but !SWP_WRITEOK\n",
 			     si->type);
-			plist_del(&si->avail_list, &swap_avail_head);
+			__del_from_avail_list(si);
 			spin_unlock(&si->lock);
 			goto nextsi;
 		}
@@ -975,7 +997,7 @@ nextsi:
 		 * swap_avail_head list then try it, otherwise start over
 		 * if we have not gotten any slots.
 		 */
-		if (plist_node_empty(&next->avail_list))
+		if (plist_node_empty(&next->avail_lists[node]))
 			goto start_over;
 	}
 
@@ -2410,10 +2432,24 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
 	return generic_swapfile_activate(sis, swap_file, span);
 }
 
+static int swap_node(struct swap_info_struct *p)
+{
+	struct block_device *bdev;
+
+	if (p->bdev)
+		bdev = p->bdev;
+	else
+		bdev = p->swap_file->f_inode->i_sb->s_bdev;
+
+	return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
+}
+
 static void _enable_swap_info(struct swap_info_struct *p, int prio,
 				unsigned char *swap_map,
 				struct swap_cluster_info *cluster_info)
 {
+	int i;
+
 	if (prio >= 0)
 		p->prio = prio;
 	else
@@ -2423,7 +2459,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
 	 * low-to-high, while swap ordering is high-to-low
 	 */
 	p->list.prio = -p->prio;
-	p->avail_list.prio = -p->prio;
+	for_each_node(i) {
+		if (p->prio >= 0)
+			p->avail_lists[i].prio = -p->prio;
+		else {
+			if (swap_node(p) == i)
+				p->avail_lists[i].prio = 1;
+			else
+				p->avail_lists[i].prio = -p->prio;
+		}
+	}
 	p->swap_map = swap_map;
 	p->cluster_info = cluster_info;
 	p->flags |= SWP_WRITEOK;
@@ -2442,9 +2487,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
 	 * swap_info_struct.
 	 */
 	plist_add(&p->list, &swap_active_head);
-	spin_lock(&swap_avail_lock);
-	plist_add(&p->avail_list, &swap_avail_head);
-	spin_unlock(&swap_avail_lock);
+	add_to_avail_list(p);
 }
 
 static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -2529,17 +2572,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 		spin_unlock(&swap_lock);
 		goto out_dput;
 	}
-	spin_lock(&swap_avail_lock);
-	plist_del(&p->avail_list, &swap_avail_head);
-	spin_unlock(&swap_avail_lock);
+	del_from_avail_list(p);
 	spin_lock(&p->lock);
 	if (p->prio < 0) {
 		struct swap_info_struct *si = p;
+		int nid;
 
 		plist_for_each_entry_continue(si, &swap_active_head, list) {
 			si->prio++;
 			si->list.prio--;
-			si->avail_list.prio--;
+			for_each_node(nid) {
+				if (si->avail_lists[nid].prio != 1)
+					si->avail_lists[nid].prio--;
+			}
 		}
 		least_priority++;
 	}
@@ -2783,6 +2828,7 @@ static struct swap_info_struct *alloc_swap_info(void)
 {
 	struct swap_info_struct *p;
 	unsigned int type;
+	int i;
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
@@ -2818,7 +2864,8 @@ static struct swap_info_struct *alloc_swap_info(void)
 	}
 	INIT_LIST_HEAD(&p->first_swap_extent.list);
 	plist_node_init(&p->list, 0);
-	plist_node_init(&p->avail_list, 0);
+	for_each_node(i)
+		plist_node_init(&p->avail_lists[i], 0);
 	p->flags = SWP_USED;
 	spin_unlock(&swap_lock);
 	spin_lock_init(&p->lock);
@@ -3060,6 +3107,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	if (!swap_avail_heads)
+		return -ENOMEM;
+
 	p = alloc_swap_info();
 	if (IS_ERR(p))
 		return PTR_ERR(p);
@@ -3645,3 +3695,21 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
 		}
 	}
 }
+
+static int __init swapfile_init(void)
+{
+	int nid;
+
+	swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
+					 GFP_KERNEL);
+	if (!swap_avail_heads) {
+		pr_emerg("Not enough memory for swap heads, swap is disabled\n");
+		return -ENOMEM;
+	}
+
+	for_each_node(nid)
+		plist_head_init(&swap_avail_heads[nid]);
+
+	return 0;
+}
+subsys_initcall(swapfile_init);

From 212925802454672e6cd2949a727f5e2c1377bf06 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 6 Sep 2017 16:25:00 -0700
Subject: [PATCH 115/119] mm: oom: let oom_reap_task and exit_mmap run
 concurrently

This is required purely because exit_aio() may block, so exit_mmap() may
never start if the OOM reaper cannot start running on an mm with
mm_users == 0.

At the same time if the OOM reaper doesn't wait at all for the memory of
the current OOM candidate to be freed by exit_mmap->unmap_vmas, it would
generate a spurious OOM kill.

If it weren't for exit_aio() and similar blocking functions in the last
mmput, it would be enough to change oom_reap_task() so that, when it
finds mm_users == 0, it waits for a timeout or waits for __mmput to set
MMF_OOM_SKIP itself. But exit_mmap is not the only problem here, so the
concurrency of exit_mmap and oom_reap_task is warranted.

It is a non-standard runtime: exit_mmap() runs without mmap_sem, while
oom_reap_task runs with mmap_sem held for reading as usual (much like
MADV_DONTNEED).

The race between the two is solved with a combination of
tsk_is_oom_victim() (serialized by task_lock) and MMF_OOM_SKIP
(serialized by a dummy down_write/up_write cycle along the same lines as
the ksm_exit method).

If oom_reap_task() may be running concurrently during exit_mmap,
exit_mmap will wait for it to finish in down_write (before taking down
mm structures that would make oom_reap_task fail with a use after free).

If exit_mmap comes first, oom_reap_task() will skip the mm because
MMF_OOM_SKIP is already set; in that case all memory has already been
freed and, furthermore, the mm data structures may already have been
taken down by free_pgtables.

[aarcange@redhat.com: incremental one liner]
  Link: http://lkml.kernel.org/r/20170726164319.GC29716@redhat.com
[rientjes@google.com: remove unused mmput_async]
  Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1708141733130.50317@chino.kir.corp.google.com
[aarcange@redhat.com: microoptimization]
  Link: http://lkml.kernel.org/r/20170817171240.GB5066@redhat.com
Link: http://lkml.kernel.org/r/20170726162912.GA29716@redhat.com
Fixes: 26db62f179d1 ("oom: keep mm of the killed task available")
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Reported-by: David Rientjes <rientjes@google.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Michal Hocko <mhocko@suse.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched/mm.h |  6 ------
 kernel/fork.c            | 17 -----------------
 mm/mmap.c                | 18 ++++++++++++++++++
 mm/oom_kill.c            | 15 +++++----------
 4 files changed, 23 insertions(+), 33 deletions(-)

diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2b0a281f9d26..3a19c253bdb1 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -84,12 +84,6 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
 
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
-#ifdef CONFIG_MMU
-/* same as above but performs the slow path from the async context. Can
- * be called from the atomic context as well
- */
-extern void mmput_async(struct mm_struct *);
-#endif
 
 /* Grab a reference to a task's mm, if it is not already going away */
 extern struct mm_struct *get_task_mm(struct task_struct *task);
diff --git a/kernel/fork.c b/kernel/fork.c
index 4e5345c07344..7ed64600da6c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -922,7 +922,6 @@ static inline void __mmput(struct mm_struct *mm)
 	}
 	if (mm->binfmt)
 		module_put(mm->binfmt->module);
-	set_bit(MMF_OOM_SKIP, &mm->flags);
 	mmdrop(mm);
 }
 
@@ -938,22 +937,6 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
-#ifdef CONFIG_MMU
-static void mmput_async_fn(struct work_struct *work)
-{
-	struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
-	__mmput(mm);
-}
-
-void mmput_async(struct mm_struct *mm)
-{
-	if (atomic_dec_and_test(&mm->mm_users)) {
-		INIT_WORK(&mm->async_put_work, mmput_async_fn);
-		schedule_work(&mm->async_put_work);
-	}
-}
-#endif
-
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
  *
diff --git a/mm/mmap.c b/mm/mmap.c
index 52f6c6b18f40..4c5981651407 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -44,6 +44,7 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/moduleparam.h>
 #include <linux/pkeys.h>
+#include <linux/oom.h>
 
 #include <linux/uaccess.h>
 #include <asm/cacheflush.h>
@@ -3001,6 +3002,23 @@ void exit_mmap(struct mm_struct *mm)
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	unmap_vmas(&tlb, vma, 0, -1);
 
+	set_bit(MMF_OOM_SKIP, &mm->flags);
+	if (unlikely(tsk_is_oom_victim(current))) {
+		/*
+		 * Wait for oom_reap_task() to stop working on this
+		 * mm. Because MMF_OOM_SKIP is already set before
+		 * calling down_read(), oom_reap_task() will not run
+		 * on this "mm" post up_write().
+		 *
+		 * tsk_is_oom_victim() cannot be set from under us
+		 * either because current->mm is already set to NULL
+		 * under task_lock before calling mmput and oom_mm is
+		 * set not NULL by the OOM killer only if current->mm
+		 * is found not NULL while holding the task_lock.
+		 */
+		down_write(&mm->mmap_sem);
+		up_write(&mm->mmap_sem);
+	}
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
 	tlb_finish_mmu(&tlb, 0, -1);
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c9f3569a76c7..99736e026712 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -495,11 +495,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 	}
 
 	/*
-	 * increase mm_users only after we know we will reap something so
-	 * that the mmput_async is called only when we have reaped something
-	 * and delayed __mmput doesn't matter that much
+	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
+	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
+	 * under mmap_sem for reading because it serializes against the
+	 * down_write();up_write() cycle in exit_mmap().
 	 */
-	if (!mmget_not_zero(mm)) {
+	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
 		up_read(&mm->mmap_sem);
 		trace_skip_task_reaping(tsk->pid);
 		goto unlock_oom;
@@ -542,12 +543,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 			K(get_mm_counter(mm, MM_SHMEMPAGES)));
 	up_read(&mm->mmap_sem);
 
-	/*
-	 * Drop our reference but make sure the mmput slow path is called from a
-	 * different context because we shouldn't risk we get stuck there and
-	 * put the oom_reaper out of the way.
-	 */
-	mmput_async(mm);
 	trace_finish_task_reaping(tsk->pid);
 unlock_oom:
 	mutex_unlock(&oom_lock);

From c79b57e462b5d2f47afa5f175cf1828f16e18612 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 6 Sep 2017 16:25:04 -0700
Subject: [PATCH 116/119] mm: hugetlb: clear target sub-page last when clearing
 huge page

Huge pages help to reduce the TLB miss rate, but they have a larger
cache footprint, which can sometimes cause problems.  For example, when
clearing a huge page on an x86_64 platform, the cache footprint is 2M.
But a Xeon E5 v3 2699 CPU has 18 cores, 36 threads, and only 45M of LLC
(last level cache).  That is, on average, there is 2.5M of LLC per core
and 1.25M of LLC per thread.

If cache pressure is heavy while clearing the huge page, and we clear
the huge page from beginning to end, the beginning of the huge page may
already have been evicted from the cache by the time we finish clearing
its end.  And the application may well access the beginning of the huge
page right after it has been cleared.

To help with this situation, this patch changes the order in which the
sub-pages of a huge page are cleared.  In quite a few situations we can
get the address that the application will access after the huge page is
cleared, for example in a page fault handler.  Instead of clearing the
huge page from beginning to end, we clear the sub-pages farthest from
the target sub-page first, and clear the target sub-page last.  This
keeps the sub-page to be accessed the most cache-hot, and the sub-pages
around it cache-hot too.  If we cannot know the address the application
will access, the beginning of the huge page is assumed to be the address
the application will access.
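
As an illustration only (a user-space sketch, not kernel code), the
following mirrors the ordering logic added to clear_huge_page() below
and prints the sequence of sub-page indexes that would be cleared for a
given huge page size and target sub-page:

	#include <stdio.h>

	static void print_clear_order(int pages_per_huge_page, int n)
	{
		int i, base, l;

		if (2 * n <= pages_per_huge_page) {
			base = 0;
			l = n;
			/* target in first half: clear the tail first */
			for (i = pages_per_huge_page - 1; i >= 2 * n; i--)
				printf("%d ", i);
		} else {
			base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
			l = pages_per_huge_page - n;
			/* target in second half: clear the head first */
			for (i = 0; i < base; i++)
				printf("%d ", i);
		}
		/* converge on the target sub-page from both sides */
		for (i = 0; i < l; i++)
			printf("%d %d ", base + i, base + 2 * l - 1 - i);
		printf("\n");
	}

	int main(void)
	{
		/* 8 sub-pages, target index 2: prints 7 6 5 4 0 3 1 2 */
		print_clear_order(8, 2);
		return 0;
	}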

With this patch, throughput increases by ~28.3% in the vm-scalability
anon-w-seq test case with 72 processes on a 2-socket Xeon E5 v3 2699
system (36 cores, 72 threads).  The test case creates 72 processes; each
process mmaps a big anonymous memory area and writes to it from
beginning to end.  From the point of view of each process, the other
processes act as an additional workload that generates heavy cache
pressure.  At the same time, the cache miss rate dropped from ~33.4% to
~31.7%, the IPC (instructions per cycle) increased from 0.56 to 0.74,
and the time spent in user space was reduced by ~7.9%.

Christopher Lameter suggested clearing the bytes inside a sub-page from
end to beginning as well, but tests show no visible performance
difference, probably because the size of a page is small compared with
the cache size.

Thanks to Andi Kleen for proposing to use the access address to
determine the order in which sub-pages are cleared.

The hugetlbfs access address could be improved further; that will be
done in another patch.

[ying.huang@intel.com: improve readability of clear_huge_page()]
  Link: http://lkml.kernel.org/r/20170830051842.1397-1-ying.huang@intel.com
Link: http://lkml.kernel.org/r/20170815014618.15842-1-ying.huang@intel.com
Suggested-by: Andi Kleen <andi.kleen@intel.com>
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Jan Kara <jack@suse.cz>
Reviewed-by: Michal Hocko <mhocko@suse.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Nadia Yvette Chambers <nyc@holomorphy.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Shaohua Li <shli@fb.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  2 +-
 mm/huge_memory.c   |  4 ++--
 mm/memory.c        | 42 ++++++++++++++++++++++++++++++++++++++----
 3 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index eb5e4bc946cc..0acea73af839 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2508,7 +2508,7 @@ enum mf_action_page_type {
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
 extern void clear_huge_page(struct page *page,
-			    unsigned long addr,
+			    unsigned long addr_hint,
 			    unsigned int pages_per_huge_page);
 extern void copy_user_huge_page(struct page *dst, struct page *src,
 				unsigned long addr, struct vm_area_struct *vma,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 772048c233d1..0b51e70e0a8b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -567,7 +567,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 		goto release;
 	}
 
-	clear_huge_page(page, haddr, HPAGE_PMD_NR);
+	clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
 	/*
 	 * The memory barrier inside __SetPageUptodate makes sure that
 	 * clear_huge_page writes become visible before the set_pmd_at()
@@ -1305,7 +1305,7 @@ alloc:
 	count_vm_event(THP_FAULT_ALLOC);
 
 	if (!page)
-		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
+		clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
 	else
 		copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
 	__SetPageUptodate(new_page);
diff --git a/mm/memory.c b/mm/memory.c
index e87953775e3c..13ee83b43878 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4417,19 +4417,53 @@ static void clear_gigantic_page(struct page *page,
 	}
 }
 void clear_huge_page(struct page *page,
-		     unsigned long addr, unsigned int pages_per_huge_page)
+		     unsigned long addr_hint, unsigned int pages_per_huge_page)
 {
-	int i;
+	int i, n, base, l;
+	unsigned long addr = addr_hint &
+		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
 
 	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
 		clear_gigantic_page(page, addr, pages_per_huge_page);
 		return;
 	}
 
+	/* Clear sub-page to access last to keep its cache lines hot */
 	might_sleep();
-	for (i = 0; i < pages_per_huge_page; i++) {
+	n = (addr_hint - addr) / PAGE_SIZE;
+	if (2 * n <= pages_per_huge_page) {
+		/* If sub-page to access in first half of huge page */
+		base = 0;
+		l = n;
+		/* Clear sub-pages at the end of huge page */
+		for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
+			cond_resched();
+			clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+		}
+	} else {
+		/* If sub-page to access in second half of huge page */
+		base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
+		l = pages_per_huge_page - n;
+		/* Clear sub-pages at the begin of huge page */
+		for (i = 0; i < base; i++) {
+			cond_resched();
+			clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+		}
+	}
+	/*
+	 * Clear remaining sub-pages in left-right-left-right pattern
+	 * towards the sub-page to access
+	 */
+	for (i = 0; i < l; i++) {
+		int left_idx = base + i;
+		int right_idx = base + 2 * l - 1 - i;
+
 		cond_resched();
-		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+		clear_user_highpage(page + left_idx,
+				    addr + left_idx * PAGE_SIZE);
+		cond_resched();
+		clear_user_highpage(page + right_idx,
+				    addr + right_idx * PAGE_SIZE);
 	}
 }
 

From 493b0e9d945fa9dfe96be93ae41b4ca4b6fdb317 Mon Sep 17 00:00:00 2001
From: Daniel Colascione <dancol@google.com>
Date: Wed, 6 Sep 2017 16:25:08 -0700
Subject: [PATCH 117/119] mm: add /proc/pid/smaps_rollup

/proc/pid/smaps_rollup is a new proc file that improves the performance
of user programs that determine aggregate memory statistics (e.g., total
PSS) of a process.

Android regularly "samples" the memory usage of various processes in
order to balance its memory pool sizes.  This sampling process involves
opening /proc/pid/smaps and summing certain fields.  For very large
processes, sampling memory use this way can take several hundred
milliseconds, due mostly to the overhead of the seq_printf calls in
task_mmu.c.

smaps_rollup improves the situation.  It contains most of the fields of
/proc/pid/smaps, but instead of a set of fields for each VMA, it
contains one synthetic smaps-format entry representing the whole
process.  In that single synthetic
entry, each field is the summation of the corresponding field in all of
the real-smaps VMAs.  Using a common format for smaps_rollup and smaps
allows userspace parsers to repurpose parsers meant for use with
non-rollup smaps for smaps_rollup, and it allows userspace to switch
between smaps_rollup and smaps at runtime (say, based on the
availability of smaps_rollup in a given kernel) with minimal fuss.

By using smaps_rollup instead of smaps, a caller can avoid the
significant overhead of formatting, reading, and parsing each of a large
process's potentially very numerous memory mappings.  For sampling
system_server's PSS in Android, we measured a 12x speedup, representing
a savings of several hundred milliseconds.

One alternative to a new per-process proc file would have been including
PSS information in /proc/pid/status.  We considered this option but
thought that PSS would be too expensive (by a few orders of magnitude)
to collect relative to what's already emitted as part of
/proc/pid/status, and slowing every user of /proc/pid/status for the
sake of readers that happen to want PSS feels wrong.

The code itself works by reusing the existing VMA-walking framework we
use for regular smaps generation and keeping the mem_size_stats
structure around between VMA walks instead of using a fresh one for each
VMA.  In this way, summation happens automatically.  We let seq_file
walk over the VMAs just as it does for regular smaps and just emit
nothing to the seq_file until we hit the last VMA.

Benchmarks:

    using smaps:
    iterations:1000 pid:1163 pss:220023808
    0m29.46s real 0m08.28s user 0m20.98s system

    using smaps_rollup:
    iterations:1000 pid:1163 pss:220702720
    0m04.39s real 0m00.03s user 0m04.31s system
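
For illustration, a minimal user-space reader along these lines (a
sketch, not the benchmark harness used for the numbers above) is enough
to pull the aggregate Pss out of smaps_rollup:

	#include <stdio.h>
	#include <unistd.h>

	/* Return the rolled-up Pss of a process in kB, or -1 on error. */
	static long read_rollup_pss_kb(int pid)
	{
		char path[64], line[256];
		long pss_kb = -1;
		FILE *f;

		snprintf(path, sizeof(path), "/proc/%d/smaps_rollup", pid);
		f = fopen(path, "r");
		if (!f)
			return -1;
		while (fgets(line, sizeof(line), f)) {
			/* a single "Pss:" line covers the whole process */
			if (sscanf(line, "Pss: %ld kB", &pss_kb) == 1)
				break;
		}
		fclose(f);
		return pss_kb;
	}

	int main(void)
	{
		printf("self Pss: %ld kB\n", read_rollup_pss_kb(getpid()));
		return 0;
	}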

We're using the PSS samples we collect asynchronously for
system-management tasks like fine-tuning oom_adj_score, memory use
tracking for debugging, application-level memory-use attribution, and
deciding whether we want to kill large processes during system idle
maintenance windows.  Android has been using PSS for these purposes for
a long time; as the average process VMA count has increased and
devices become more efficiency-conscious, PSS-collection inefficiency
has started to matter more.  IMHO, it'd be a lot safer to optimize the
existing PSS-collection model, which has been fine-tuned over the years,
instead of changing the memory tracking approach entirely to work around
smaps-generation inefficiency.

Tim said:

: There are two main reasons why Android gathers PSS information:
:
: 1. Android devices can show the user the amount of memory used per
:    application via the settings app.  This is a less important use case.
:
: 2. We log PSS to help identify leaks in applications.  We have found
:    an enormous number of bugs (in the Android platform, in Google's own
:    apps, and in third-party applications) using this data.
:
: To do this, system_server (the main process in Android userspace) will
: sample the PSS of a process three seconds after it changes state (for
: example, app is launched and becomes the foreground application) and about
: every ten minutes after that.  The net result is that PSS collection is
: regularly running on at least one process in the system (usually a few
: times a minute while the screen is on, less when screen is off due to
: suspend).  PSS of a process is an incredibly useful stat to track, and we
: aren't going to get rid of it.  We've looked at some very hacky approaches
: using RSS ("take the RSS of the target process, subtract the RSS of the
: zygote process that is the parent of all Android apps") to reduce the
: accounting time, but it regularly overestimated the memory used by 20+
: percent.  Accordingly, I don't think that there's a good alternative to
: using PSS.
:
: We started looking into PSS collection performance after we noticed random
: frequency spikes while a phone's screen was off; occasionally, one of the
: CPU clusters would ramp to a high frequency because there was 200-300ms of
: constant CPU work from a single thread in the main Android userspace
: process.  The work causing the spike (which is reasonable governor
: behavior given the amount of CPU time needed) was always PSS collection.
: As a result, Android is burning more power than we should be on PSS
: collection.
:
: The other issue (and why I'm less sure about improving smaps as a
: long-term solution) is that the number of VMAs per process has increased
: significantly from release to release.  After trying to figure out why we
: were seeing these 200-300ms PSS collection times on Android O but had not
: noticed it in previous versions, we found that the number of VMAs in the
: main system process increased by 50% from Android N to Android O (from
: ~1800 to ~2700) and varying increases in every userspace process.  Android
: M to N also had an increase in the number of VMAs, although not as much.
: I'm not sure why this is increasing so much over time, but thinking about
: ASLR and ways to make ASLR better, I expect that this will continue to
: increase going forward.  I would not be surprised if we hit 5000 VMAs on
: the main Android process (system_server) by 2020.
:
: If we assume that the number of VMAs is going to increase over time, then
: doing anything we can do to reduce the overhead of each VMA during PSS
: collection seems like the right way to go, and that means outputting an
: aggregate statistic (to avoid whatever overhead there is per line in
: writing smaps and in reading each line from userspace).

Link: http://lkml.kernel.org/r/20170812022148.178293-1-dancol@google.com
Signed-off-by: Daniel Colascione <dancol@google.com>
Cc: Tim Murray <timmurray@google.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Sonny Rao <sonnyrao@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/ABI/testing/procfs-smaps_rollup |  31 +++
 fs/proc/base.c                                |   2 +
 fs/proc/internal.h                            |   3 +
 fs/proc/task_mmu.c                            | 192 ++++++++++++------
 4 files changed, 168 insertions(+), 60 deletions(-)
 create mode 100644 Documentation/ABI/testing/procfs-smaps_rollup

diff --git a/Documentation/ABI/testing/procfs-smaps_rollup b/Documentation/ABI/testing/procfs-smaps_rollup
new file mode 100644
index 000000000000..0a54ed0d63c9
--- /dev/null
+++ b/Documentation/ABI/testing/procfs-smaps_rollup
@@ -0,0 +1,31 @@
+What:		/proc/pid/smaps_rollup
+Date:		August 2017
+Contact:	Daniel Colascione <dancol@google.com>
+Description:
+		This file provides pre-summed memory information for a
+		process.  The format is identical to /proc/pid/smaps,
+		except instead of an entry for each VMA in a process,
+		smaps_rollup has a single entry (tagged "[rollup]")
+		for which each field is the sum of the corresponding
+		fields from all the maps in /proc/pid/smaps.
+		For more details, see the procfs man page.
+
+		Typical output looks like this:
+
+		00100000-ff709000 ---p 00000000 00:00 0		 [rollup]
+		Rss:		     884 kB
+		Pss:		     385 kB
+		Shared_Clean:	     696 kB
+		Shared_Dirty:	       0 kB
+		Private_Clean:	     120 kB
+		Private_Dirty:	      68 kB
+		Referenced:	     884 kB
+		Anonymous:	      68 kB
+		LazyFree:	       0 kB
+		AnonHugePages:	       0 kB
+		ShmemPmdMapped:	       0 kB
+		Shared_Hugetlb:	       0 kB
+		Private_Hugetlb:       0 kB
+		Swap:		       0 kB
+		SwapPss:	       0 kB
+		Locked:		     385 kB
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 98fd8f6df851..e5d89a0d0b8a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2931,6 +2931,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_PROC_PAGE_MONITOR
 	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
 	REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
+	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
 	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
 #endif
 #ifdef CONFIG_SECURITY
@@ -3324,6 +3325,7 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_PROC_PAGE_MONITOR
 	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
 	REG("smaps",     S_IRUGO, proc_tid_smaps_operations),
+	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
 	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
 #endif
 #ifdef CONFIG_SECURITY
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index aa2b89071630..2cbfcd32e884 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -269,10 +269,12 @@ extern int proc_remount(struct super_block *, int *, char *);
 /*
  * task_[no]mmu.c
  */
+struct mem_size_stats;
 struct proc_maps_private {
 	struct inode *inode;
 	struct task_struct *task;
 	struct mm_struct *mm;
+	struct mem_size_stats *rollup;
 #ifdef CONFIG_MMU
 	struct vm_area_struct *tail_vma;
 #endif
@@ -288,6 +290,7 @@ extern const struct file_operations proc_tid_maps_operations;
 extern const struct file_operations proc_pid_numa_maps_operations;
 extern const struct file_operations proc_tid_numa_maps_operations;
 extern const struct file_operations proc_pid_smaps_operations;
+extern const struct file_operations proc_pid_smaps_rollup_operations;
 extern const struct file_operations proc_tid_smaps_operations;
 extern const struct file_operations proc_clear_refs_operations;
 extern const struct file_operations proc_pagemap_operations;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fe8f3265e877..b2330aedc63f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -253,6 +253,7 @@ static int proc_map_release(struct inode *inode, struct file *file)
 	if (priv->mm)
 		mmdrop(priv->mm);
 
+	kfree(priv->rollup);
 	return seq_release_private(inode, file);
 }
 
@@ -279,6 +280,23 @@ static int is_stack(struct proc_maps_private *priv,
 		vma->vm_end >= vma->vm_mm->start_stack;
 }
 
+static void show_vma_header_prefix(struct seq_file *m,
+				   unsigned long start, unsigned long end,
+				   vm_flags_t flags, unsigned long long pgoff,
+				   dev_t dev, unsigned long ino)
+{
+	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
+	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
+		   start,
+		   end,
+		   flags & VM_READ ? 'r' : '-',
+		   flags & VM_WRITE ? 'w' : '-',
+		   flags & VM_EXEC ? 'x' : '-',
+		   flags & VM_MAYSHARE ? 's' : 'p',
+		   pgoff,
+		   MAJOR(dev), MINOR(dev), ino);
+}
+
 static void
 show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 {
@@ -301,17 +319,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 
 	start = vma->vm_start;
 	end = vma->vm_end;
-
-	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
-	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
-			start,
-			end,
-			flags & VM_READ ? 'r' : '-',
-			flags & VM_WRITE ? 'w' : '-',
-			flags & VM_EXEC ? 'x' : '-',
-			flags & VM_MAYSHARE ? 's' : 'p',
-			pgoff,
-			MAJOR(dev), MINOR(dev), ino);
+	show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
 
 	/*
 	 * Print the dentry name for named mappings, and a
@@ -430,6 +438,7 @@ const struct file_operations proc_tid_maps_operations = {
 
 #ifdef CONFIG_PROC_PAGE_MONITOR
 struct mem_size_stats {
+	bool first;
 	unsigned long resident;
 	unsigned long shared_clean;
 	unsigned long shared_dirty;
@@ -443,7 +452,9 @@ struct mem_size_stats {
 	unsigned long swap;
 	unsigned long shared_hugetlb;
 	unsigned long private_hugetlb;
+	unsigned long first_vma_start;
 	u64 pss;
+	u64 pss_locked;
 	u64 swap_pss;
 	bool check_shmem_swap;
 };
@@ -719,18 +730,36 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
 
 static int show_smap(struct seq_file *m, void *v, int is_pid)
 {
+	struct proc_maps_private *priv = m->private;
 	struct vm_area_struct *vma = v;
-	struct mem_size_stats mss;
+	struct mem_size_stats mss_stack;
+	struct mem_size_stats *mss;
 	struct mm_walk smaps_walk = {
 		.pmd_entry = smaps_pte_range,
 #ifdef CONFIG_HUGETLB_PAGE
 		.hugetlb_entry = smaps_hugetlb_range,
 #endif
 		.mm = vma->vm_mm,
-		.private = &mss,
 	};
+	int ret = 0;
+	bool rollup_mode;
+	bool last_vma;
 
-	memset(&mss, 0, sizeof mss);
+	if (priv->rollup) {
+		rollup_mode = true;
+		mss = priv->rollup;
+		if (mss->first) {
+			mss->first_vma_start = vma->vm_start;
+			mss->first = false;
+		}
+		last_vma = !m_next_vma(priv, vma);
+	} else {
+		rollup_mode = false;
+		memset(&mss_stack, 0, sizeof(mss_stack));
+		mss = &mss_stack;
+	}
+
+	smaps_walk.private = mss;
 
 #ifdef CONFIG_SHMEM
 	if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
@@ -748,9 +777,9 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 
 		if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
 					!(vma->vm_flags & VM_WRITE)) {
-			mss.swap = shmem_swapped;
+			mss->swap = shmem_swapped;
 		} else {
-			mss.check_shmem_swap = true;
+			mss->check_shmem_swap = true;
 			smaps_walk.pte_hole = smaps_pte_hole;
 		}
 	}
@@ -758,54 +787,71 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 
 	/* mmap_sem is held in m_start */
 	walk_page_vma(vma, &smaps_walk);
+	if (vma->vm_flags & VM_LOCKED)
+		mss->pss_locked += mss->pss;
 
-	show_map_vma(m, vma, is_pid);
+	if (!rollup_mode) {
+		show_map_vma(m, vma, is_pid);
+	} else if (last_vma) {
+		show_vma_header_prefix(
+			m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0);
+		seq_pad(m, ' ');
+		seq_puts(m, "[rollup]\n");
+	} else {
+		ret = SEQ_SKIP;
+	}
 
-	seq_printf(m,
-		   "Size:           %8lu kB\n"
-		   "Rss:            %8lu kB\n"
-		   "Pss:            %8lu kB\n"
-		   "Shared_Clean:   %8lu kB\n"
-		   "Shared_Dirty:   %8lu kB\n"
-		   "Private_Clean:  %8lu kB\n"
-		   "Private_Dirty:  %8lu kB\n"
-		   "Referenced:     %8lu kB\n"
-		   "Anonymous:      %8lu kB\n"
-		   "LazyFree:       %8lu kB\n"
-		   "AnonHugePages:  %8lu kB\n"
-		   "ShmemPmdMapped: %8lu kB\n"
-		   "Shared_Hugetlb: %8lu kB\n"
-		   "Private_Hugetlb: %7lu kB\n"
-		   "Swap:           %8lu kB\n"
-		   "SwapPss:        %8lu kB\n"
-		   "KernelPageSize: %8lu kB\n"
-		   "MMUPageSize:    %8lu kB\n"
-		   "Locked:         %8lu kB\n",
-		   (vma->vm_end - vma->vm_start) >> 10,
-		   mss.resident >> 10,
-		   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
-		   mss.shared_clean  >> 10,
-		   mss.shared_dirty  >> 10,
-		   mss.private_clean >> 10,
-		   mss.private_dirty >> 10,
-		   mss.referenced >> 10,
-		   mss.anonymous >> 10,
-		   mss.lazyfree >> 10,
-		   mss.anonymous_thp >> 10,
-		   mss.shmem_thp >> 10,
-		   mss.shared_hugetlb >> 10,
-		   mss.private_hugetlb >> 10,
-		   mss.swap >> 10,
-		   (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
-		   vma_kernel_pagesize(vma) >> 10,
-		   vma_mmu_pagesize(vma) >> 10,
-		   (vma->vm_flags & VM_LOCKED) ?
-			(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
+	if (!rollup_mode)
+		seq_printf(m,
+			   "Size:           %8lu kB\n"
+			   "KernelPageSize: %8lu kB\n"
+			   "MMUPageSize:    %8lu kB\n",
+			   (vma->vm_end - vma->vm_start) >> 10,
+			   vma_kernel_pagesize(vma) >> 10,
+			   vma_mmu_pagesize(vma) >> 10);
 
-	arch_show_smap(m, vma);
-	show_smap_vma_flags(m, vma);
+
+	if (!rollup_mode || last_vma)
+		seq_printf(m,
+			   "Rss:            %8lu kB\n"
+			   "Pss:            %8lu kB\n"
+			   "Shared_Clean:   %8lu kB\n"
+			   "Shared_Dirty:   %8lu kB\n"
+			   "Private_Clean:  %8lu kB\n"
+			   "Private_Dirty:  %8lu kB\n"
+			   "Referenced:     %8lu kB\n"
+			   "Anonymous:      %8lu kB\n"
+			   "LazyFree:       %8lu kB\n"
+			   "AnonHugePages:  %8lu kB\n"
+			   "ShmemPmdMapped: %8lu kB\n"
+			   "Shared_Hugetlb: %8lu kB\n"
+			   "Private_Hugetlb: %7lu kB\n"
+			   "Swap:           %8lu kB\n"
+			   "SwapPss:        %8lu kB\n"
+			   "Locked:         %8lu kB\n",
+			   mss->resident >> 10,
+			   (unsigned long)(mss->pss >> (10 + PSS_SHIFT)),
+			   mss->shared_clean  >> 10,
+			   mss->shared_dirty  >> 10,
+			   mss->private_clean >> 10,
+			   mss->private_dirty >> 10,
+			   mss->referenced >> 10,
+			   mss->anonymous >> 10,
+			   mss->lazyfree >> 10,
+			   mss->anonymous_thp >> 10,
+			   mss->shmem_thp >> 10,
+			   mss->shared_hugetlb >> 10,
+			   mss->private_hugetlb >> 10,
+			   mss->swap >> 10,
+			   (unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)),
+			   (unsigned long)(mss->pss >> (10 + PSS_SHIFT)));
+
+	if (!rollup_mode) {
+		arch_show_smap(m, vma);
+		show_smap_vma_flags(m, vma);
+	}
 	m_cache_vma(m, vma);
-	return 0;
+	return ret;
 }
 
 static int show_pid_smap(struct seq_file *m, void *v)
@@ -837,6 +883,25 @@ static int pid_smaps_open(struct inode *inode, struct file *file)
 	return do_maps_open(inode, file, &proc_pid_smaps_op);
 }
 
+static int pid_smaps_rollup_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	struct proc_maps_private *priv;
+	int ret = do_maps_open(inode, file, &proc_pid_smaps_op);
+
+	if (ret < 0)
+		return ret;
+	seq = file->private_data;
+	priv = seq->private;
+	priv->rollup = kzalloc(sizeof(*priv->rollup), GFP_KERNEL);
+	if (!priv->rollup) {
+		proc_map_release(inode, file);
+		return -ENOMEM;
+	}
+	priv->rollup->first = true;
+	return 0;
+}
+
 static int tid_smaps_open(struct inode *inode, struct file *file)
 {
 	return do_maps_open(inode, file, &proc_tid_smaps_op);
@@ -849,6 +914,13 @@ const struct file_operations proc_pid_smaps_operations = {
 	.release	= proc_map_release,
 };
 
+const struct file_operations proc_pid_smaps_rollup_operations = {
+	.open		= pid_smaps_rollup_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= proc_map_release,
+};
+
 const struct file_operations proc_tid_smaps_operations = {
 	.open		= tid_smaps_open,
 	.read		= seq_read,

From df3735c5b40fad8d0d28eb8ab065fe955b3347ee Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Wed, 6 Sep 2017 16:25:11 -0700
Subject: [PATCH 118/119] x86,mpx: make mpx depend on x86-64 to free up VMA
 flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "mm,fork,security: introduce MADV_WIPEONFORK", v4.

If a child process accesses memory that was MADV_WIPEONFORK, it will get
zeroes.  The address ranges are still valid, they are just empty.

If a child process accesses memory that was MADV_DONTFORK, it will get a
segmentation fault, since those address ranges are no longer valid in
the child after fork.

Since MADV_DONTFORK also seems to be used to allow very large programs
to fork in systems with strict memory overcommit restrictions, changing
the semantics of MADV_DONTFORK might break existing programs.

The use case is libraries that store or cache information, and want to
know that they need to regenerate it in the child process after fork.

Examples of this would be:
 - systemd/pulseaudio API checks (fail after fork) (replacing a getpid
   check, which is too slow without a PID cache)
 - PKCS#11 API reinitialization check (mandated by specification)
 - glibc's upcoming PRNG (reseed after fork)
 - OpenSSL PRNG (reseed after fork)

The security benefits of a forking server having a re-initialized PRNG in
every child process are pretty obvious.  However, due to libraries
having all kinds of internal state, and programs getting compiled with
many different versions of each library, it is unreasonable to expect
calling programs to re-initialize everything manually after fork.

A further complication is the proliferation of clone flags, programs
bypassing glibc's functions to call clone directly, and programs calling
unshare, causing the glibc pthread_atfork hook to not get called.

It would be better to have the kernel take care of this automatically.

The patchset also adds MADV_KEEPONFORK, to undo the effects of a prior
MADV_WIPEONFORK.

This is similar to the OpenBSD minherit syscall with MAP_INHERIT_ZERO:

    https://man.openbsd.org/minherit.2

This patch (of 2):

MPX only seems to be available on 64-bit CPUs, starting with Skylake and
Goldmont.  Move VM_MPX into the 64-bit-only portion of vma->vm_flags, in
order to free up a VMA flag.

Link: http://lkml.kernel.org/r/20170811212829.29186-2-riel@redhat.com
Signed-off-by: Rik van Riel <riel@redhat.com>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Will Drewry <wad@chromium.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Colm MacCártaigh <colm@allcosts.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig   | 4 +++-
 include/linux/mm.h | 8 ++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index acb366bf6bc1..4b278a33ccbb 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1806,7 +1806,9 @@ config X86_SMAP
 config X86_INTEL_MPX
 	prompt "Intel MPX (Memory Protection Extensions)"
 	def_bool n
-	depends on CPU_SUP_INTEL
+	# Note: only available in 64-bit mode due to VMA flags shortage
+	depends on CPU_SUP_INTEL && X86_64
+	select ARCH_USES_HIGH_VMA_FLAGS
 	---help---
 	  MPX provides hardware features that can be used in
 	  conjunction with compiler-instrumented code to check
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0acea73af839..9efe62032094 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -208,10 +208,12 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_HIGH_ARCH_BIT_1	33	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_2	34	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_3	35	/* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_4	36	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_0	BIT(VM_HIGH_ARCH_BIT_0)
 #define VM_HIGH_ARCH_1	BIT(VM_HIGH_ARCH_BIT_1)
 #define VM_HIGH_ARCH_2	BIT(VM_HIGH_ARCH_BIT_2)
 #define VM_HIGH_ARCH_3	BIT(VM_HIGH_ARCH_BIT_3)
+#define VM_HIGH_ARCH_4	BIT(VM_HIGH_ARCH_BIT_4)
 #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
 
 #if defined(CONFIG_X86)
@@ -235,9 +237,11 @@ extern unsigned int kobjsize(const void *objp);
 # define VM_MAPPED_COPY	VM_ARCH_1	/* T if mapped copy of data (nommu mmap) */
 #endif
 
-#if defined(CONFIG_X86)
+#if defined(CONFIG_X86_INTEL_MPX)
 /* MPX specific bounds table or bounds directory */
-# define VM_MPX		VM_ARCH_2
+# define VM_MPX		VM_HIGH_ARCH_BIT_4
+#else
+# define VM_MPX		VM_NONE
 #endif
 
 #ifndef VM_GROWSUP

From d2cd9ede6e193dd7d88b6d27399e96229a551b19 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Wed, 6 Sep 2017 16:25:15 -0700
Subject: [PATCH 119/119] mm,fork: introduce MADV_WIPEONFORK
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce MADV_WIPEONFORK semantics, which result in a VMA being empty
in the child process after fork.  This differs from MADV_DONTFORK in one
important way.

If a child process accesses memory that was MADV_WIPEONFORK, it will get
zeroes.  The address ranges are still valid, they are just empty.

If a child process accesses memory that was MADV_DONTFORK, it will get a
segmentation fault, since those address ranges are no longer valid in
the child after fork.

Since MADV_DONTFORK also seems to be used to allow very large programs
to fork in systems with strict memory overcommit restrictions, changing
the semantics of MADV_DONTFORK might break existing programs.

MADV_WIPEONFORK only works on private, anonymous VMAs.
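
For illustration, a minimal usage sketch (assuming the MADV_WIPEONFORK
definition added by this patch is visible in the installed uapi
headers):

	#include <sys/mman.h>
	#include <sys/wait.h>
	#include <unistd.h>

	int main(void)
	{
		size_t len = 4096;
		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED)
			return 1;
		p[0] = 42;
		/* only valid on private anonymous mappings */
		if (madvise(p, len, MADV_WIPEONFORK))
			return 1;
		if (fork() == 0)
			_exit(p[0] == 0 ? 0 : 1);	/* child reads zeroes */
		wait(NULL);
		return p[0] == 42 ? 0 : 1;	/* parent keeps its data */
	}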

The use case is libraries that store or cache information, and want to
know that they need to regenerate it in the child process after fork.

Examples of this would be:
 - systemd/pulseaudio API checks (fail after fork) (replacing a getpid
   check, which is too slow without a PID cache)
 - PKCS#11 API reinitialization check (mandated by specification)
 - glibc's upcoming PRNG (reseed after fork)
 - OpenSSL PRNG (reseed after fork)

The security benefits of a forking server having a re-initialized PRNG in
every child process are pretty obvious.  However, due to libraries
having all kinds of internal state, and programs getting compiled with
many different versions of each library, it is unreasonable to expect
calling programs to re-initialize everything manually after fork.

A further complication is the proliferation of clone flags, programs
bypassing glibc's functions to call clone directly, and programs calling
unshare, causing the glibc pthread_atfork hook to not get called.

It would be better to have the kernel take care of this automatically.

The patch also adds MADV_KEEPONFORK, to undo the effects of a prior
MADV_WIPEONFORK.

This is similar to the OpenBSD minherit syscall with MAP_INHERIT_ZERO:

    https://man.openbsd.org/minherit.2

[akpm@linux-foundation.org: numerically order arch/parisc/include/uapi/asm/mman.h #defines]
Link: http://lkml.kernel.org/r/20170811212829.29186-3-riel@redhat.com
Signed-off-by: Rik van Riel <riel@redhat.com>
Reported-by: Florian Weimer <fweimer@redhat.com>
Reported-by: Colm MacCártaigh <colm@allcosts.net>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Cc: <linux-api@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/uapi/asm/mman.h     |  3 +++
 arch/mips/include/uapi/asm/mman.h      |  3 +++
 arch/parisc/include/uapi/asm/mman.h    |  3 +++
 arch/xtensa/include/uapi/asm/mman.h    |  3 +++
 fs/proc/task_mmu.c                     |  1 +
 include/linux/mm.h                     |  2 +-
 include/trace/events/mmflags.h         |  8 +-------
 include/uapi/asm-generic/mman-common.h |  3 +++
 kernel/fork.c                          | 10 ++++++++--
 mm/madvise.c                           | 13 +++++++++++++
 10 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 13b52aad3c43..3b26cc62dadb 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -64,6 +64,9 @@
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	17		/* Clear the MADV_NODUMP flag */
 
+#define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
+#define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 398eebcc3541..da3216007fe0 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -91,6 +91,9 @@
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	17		/* Clear the MADV_NODUMP flag */
 
+#define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
+#define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index b87fbe3f338a..775b5d5e41a1 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -57,6 +57,9 @@
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	70		/* Clear the MADV_NODUMP flag */
 
+#define MADV_WIPEONFORK 71		/* Zero memory on fork, child only */
+#define MADV_KEEPONFORK 72		/* Undo MADV_WIPEONFORK */
+
 #define MADV_HWPOISON     100		/* poison a page for testing */
 #define MADV_SOFT_OFFLINE 101		/* soft offline page for testing */
 
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 8ce77a2e9bab..b15b278aa314 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -103,6 +103,9 @@
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	17		/* Clear the MADV_NODUMP flag */
 
+#define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
+#define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b2330aedc63f..a290966f91ec 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -663,6 +663,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_NORESERVE)]	= "nr",
 		[ilog2(VM_HUGETLB)]	= "ht",
 		[ilog2(VM_ARCH_1)]	= "ar",
+		[ilog2(VM_WIPEONFORK)]	= "wf",
 		[ilog2(VM_DONTDUMP)]	= "dd",
 #ifdef CONFIG_MEM_SOFT_DIRTY
 		[ilog2(VM_SOFTDIRTY)]	= "sd",
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9efe62032094..39db8e54c5d5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -189,7 +189,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_NORESERVE	0x00200000	/* should the VM suppress accounting */
 #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
 #define VM_ARCH_1	0x01000000	/* Architecture-specific flag */
-#define VM_ARCH_2	0x02000000
+#define VM_WIPEONFORK	0x02000000	/* Wipe VMA contents in child. */
 #define VM_DONTDUMP	0x04000000	/* Do not include in the core dump */
 
 #ifdef CONFIG_MEM_SOFT_DIRTY
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 8e50d01c645f..4c2e4737d7bc 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -125,12 +125,6 @@ IF_HAVE_PG_IDLE(PG_idle,		"idle"		)
 #define __VM_ARCH_SPECIFIC_1 {VM_ARCH_1,	"arch_1"	}
 #endif
 
-#if defined(CONFIG_X86)
-#define __VM_ARCH_SPECIFIC_2 {VM_MPX,		"mpx"		}
-#else
-#define __VM_ARCH_SPECIFIC_2 {VM_ARCH_2,	"arch_2"	}
-#endif
-
 #ifdef CONFIG_MEM_SOFT_DIRTY
 #define IF_HAVE_VM_SOFTDIRTY(flag,name) {flag, name },
 #else
@@ -162,7 +156,7 @@ IF_HAVE_PG_IDLE(PG_idle,		"idle"		)
 	{VM_NORESERVE,			"noreserve"	},		\
 	{VM_HUGETLB,			"hugetlb"	},		\
 	__VM_ARCH_SPECIFIC_1				,		\
-	__VM_ARCH_SPECIFIC_2				,		\
+	{VM_WIPEONFORK,			"wipeonfork"	},		\
 	{VM_DONTDUMP,			"dontdump"	},		\
 IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY,	"softdirty"	)		\
 	{VM_MIXEDMAP,			"mixedmap"	},		\
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index d248f3c335b5..203268f9231e 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -58,6 +58,9 @@
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	17		/* Clear the MADV_DONTDUMP flag */
 
+#define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
+#define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 7ed64600da6c..24a4c0be80d5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -657,7 +657,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		retval = dup_userfaultfd(tmp, &uf);
 		if (retval)
 			goto fail_nomem_anon_vma_fork;
-		if (anon_vma_fork(tmp, mpnt))
+		if (tmp->vm_flags & VM_WIPEONFORK) {
+			/* VM_WIPEONFORK gets a clean slate in the child. */
+			tmp->anon_vma = NULL;
+			if (anon_vma_prepare(tmp))
+				goto fail_nomem_anon_vma_fork;
+		} else if (anon_vma_fork(tmp, mpnt))
 			goto fail_nomem_anon_vma_fork;
 		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
 		tmp->vm_next = tmp->vm_prev = NULL;
@@ -701,7 +706,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		rb_parent = &tmp->vm_rb;
 
 		mm->map_count++;
-		retval = copy_page_range(mm, oldmm, mpnt);
+		if (!(tmp->vm_flags & VM_WIPEONFORK))
+			retval = copy_page_range(mm, oldmm, mpnt);
 
 		if (tmp->vm_ops && tmp->vm_ops->open)
 			tmp->vm_ops->open(tmp);
diff --git a/mm/madvise.c b/mm/madvise.c
index 4d7d1e5ddba9..eea1c733286f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -80,6 +80,17 @@ static long madvise_behavior(struct vm_area_struct *vma,
 		}
 		new_flags &= ~VM_DONTCOPY;
 		break;
+	case MADV_WIPEONFORK:
+		/* MADV_WIPEONFORK is only supported on anonymous memory. */
+		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
+			error = -EINVAL;
+			goto out;
+		}
+		new_flags |= VM_WIPEONFORK;
+		break;
+	case MADV_KEEPONFORK:
+		new_flags &= ~VM_WIPEONFORK;
+		break;
 	case MADV_DONTDUMP:
 		new_flags |= VM_DONTDUMP;
 		break;
@@ -696,6 +707,8 @@ madvise_behavior_valid(int behavior)
 #endif
 	case MADV_DONTDUMP:
 	case MADV_DODUMP:
+	case MADV_WIPEONFORK:
+	case MADV_KEEPONFORK:
 #ifdef CONFIG_MEMORY_FAILURE
 	case MADV_SOFT_OFFLINE:
 	case MADV_HWPOISON: