From f412ac08c9861b4791af0145934c22f1458686da Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:41 -0700
Subject: [PATCH] mm: fix rss and mmlist locking

A couple of oddities were guarded by page_table_lock, and are no longer
properly guarded once that lock is split.

The mm_counters of file_rss and anon_rss: in the split-lock case, make
each of them an atomic_t, or an atomic64_t if the architecture supports
it.  Definitions courtesy of Christoph Lameter, who spent considerable
effort on more scalable ways of counting, but found insufficient benefit
in practice.
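
To see why plain increments become unsafe, consider this minimal
userspace sketch (illustrative only, not kernel code; the fault_path
name is made up): the two threads take *different* locks, as with a
split page_table_lock, so the plain counter can lose updates while the
atomic one cannot.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static long plain_rss;          /* unserialized read-modify-write */
    static atomic_long atomic_rss;  /* safe without any shared lock */
    static pthread_mutex_t ptl[2] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
    };

    static void *fault_path(void *arg)
    {
        long cpu = (long)arg;

        for (int i = 0; i < 1000000; i++) {
            /* each thread holds a different "page table lock" */
            pthread_mutex_lock(&ptl[cpu]);
            plain_rss++;                      /* may lose updates */
            atomic_fetch_add(&atomic_rss, 1); /* never loses updates */
            pthread_mutex_unlock(&ptl[cpu]);
        }
        return NULL;
    }

    int main(void)
    {
        pthread_t t[2];

        for (long i = 0; i < 2; i++)
            pthread_create(&t[i], NULL, fault_path, (void *)i);
        for (int i = 0; i < 2; i++)
            pthread_join(t[i], NULL);
        printf("plain %ld atomic %ld (expect 2000000)\n",
               plain_rss, atomic_load(&atomic_rss));
        return 0;
    }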

And adding an mm with swap to the mmlist for swapoff: the list is well
guarded by its own lock, but the unlocked list_empty check now has to
be repeated inside that lock.
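
The interleaving that the repeated check closes off looks like this
(an illustrative trace, not part of the diff below):

    task A: list_empty(&mm->mmlist) -> true   (unlocked, cheap hint)
    task B: list_empty(&mm->mmlist) -> true   (also sees it empty)
    task A: spin_lock(&mmlist_lock); list_add(...); spin_unlock(...)
    task B: spin_lock(&mmlist_lock); list_add(...)
            <- inserts the same mm a second time, corrupting the list

Hence both call sites below recheck list_empty under mmlist_lock before
calling list_add.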

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 42 ++++++++++++++++++++++++++++++++++++++----
 mm/memory.c           |  4 +++-
 mm/rmap.c             |  3 ++-
 3 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 292cb57ce38f..1c30bc308ef1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -249,13 +249,47 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 extern void arch_unmap_area(struct mm_struct *, unsigned long);
 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+/*
+ * The mm counters are not protected by the mm's page_table_lock,
+ * so must be incremented atomically.
+ */
+#ifdef ATOMIC64_INIT
+#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value)
+#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member))
+#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member)
+#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member)
+#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member)
+typedef atomic64_t mm_counter_t;
+#else /* !ATOMIC64_INIT */
+/*
+ * The counters wrap back to 0 at 2^32 * PAGE_SIZE,
+ * that is, at 16TB if using 4kB page size.
+ */
+#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value)
+#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member))
+#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member)
+#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member)
+#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member)
+typedef atomic_t mm_counter_t;
+#endif /* !ATOMIC64_INIT */
+
+#else  /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+/*
+ * The mm counters are protected by the mm's page_table_lock,
+ * so can be incremented directly.
+ */
 #define set_mm_counter(mm, member, value) (mm)->_##member = (value)
 #define get_mm_counter(mm, member) ((mm)->_##member)
 #define add_mm_counter(mm, member, value) (mm)->_##member += (value)
 #define inc_mm_counter(mm, member) (mm)->_##member++
 #define dec_mm_counter(mm, member) (mm)->_##member--
-#define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss)
+typedef unsigned long mm_counter_t;
 
+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+
+#define get_mm_rss(mm)					\
+	(get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
 #define update_hiwater_rss(mm)	do {			\
 	unsigned long _rss = get_mm_rss(mm);		\
 	if ((mm)->hiwater_rss < _rss)			\
@@ -266,8 +300,6 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 		(mm)->hiwater_vm = (mm)->total_vm;	\
 } while (0)
 
-typedef unsigned long mm_counter_t;
-
 struct mm_struct {
 	struct vm_area_struct * mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
@@ -291,7 +323,9 @@ struct mm_struct {
 						 * by mmlist_lock
 						 */
 
-	/* Special counters protected by the page_table_lock */
+	/* Special counters, in some configurations protected by the
+	 * page_table_lock, in other configurations by being atomic.
+	 */
 	mm_counter_t _file_rss;
 	mm_counter_t _anon_rss;
 
diff --git a/mm/memory.c b/mm/memory.c
index e9ef599498b5..d68421dd64ef 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -372,7 +372,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			/* make sure dst_mm is on swapoff's mmlist. */
 			if (unlikely(list_empty(&dst_mm->mmlist))) {
 				spin_lock(&mmlist_lock);
-				list_add(&dst_mm->mmlist, &src_mm->mmlist);
+				if (list_empty(&dst_mm->mmlist))
+					list_add(&dst_mm->mmlist,
+						 &src_mm->mmlist);
 				spin_unlock(&mmlist_lock);
 			}
 		}
diff --git a/mm/rmap.c b/mm/rmap.c
index a33e779d1bd8..a7427bbf57e4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -559,7 +559,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 		swap_duplicate(entry);
 		if (list_empty(&mm->mmlist)) {
 			spin_lock(&mmlist_lock);
-			list_add(&mm->mmlist, &init_mm.mmlist);
+			if (list_empty(&mm->mmlist))
+				list_add(&mm->mmlist, &init_mm.mmlist);
 			spin_unlock(&mmlist_lock);
 		}
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));