ARC changes for v5.15-rc1

- MM rework + Implementing up to 4 paging levels. + Enable STRICT_MM_TYPECHECK + switch pgtable_t back to struct page * - Atomics rework / implementing relaxed accessors - Retiring of legacy MMUv1,v2; ARC750 cores - A few other build errors, typos -----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEEOXpuCuR6hedrdLCJadfx3eKKwl4FAmE0NbUACgkQadfx3eKK wl6ibQ//QXhHlrJiLuzvPsw6XkEwhxmJiSCcoJ1Tl1ZST2i+aLopnbVRy6aGsQWm 6gUxgUHR6SvRSU5c7/VGSCKTIRXNqeK1nfhcdzT1zpOjhrn70lp3JobvFJbjo/Ca 6gZvSuPy9ozdQMKvK294eqnKpNo4AmJEVEJUSAGMUtNH1ikjGqelqKKq+P0lM1uh sGlYr4MBqNu+1oCrOBltnuMLHDRNeHrMmdp6YLaFNSmQ2KRvoJokD25M5NmHrmyI VNGDb9K5c/pSIbJRHdYsR79Ad4M4+0p+kP97cQ1LXP4lfnDWgYtm/wpONQQfST8u zV8BFdz3Qo79AoRqG7DtMVtg3vqWuB30puV6N2mZbF3q6PrqtR47qtxWkrBaZ4Nv xIYmKLPnUP4NlFPRB9EDE3P5SZxcDRug/GlYdR42n6x0InHbKnm/3giu32RfCKDe Xg3KxfAl2sV9Wze8GwFkDqcAC8JTlBCu/qbp83s3Sx/1pqWB3mrGDZUcVrn0+ogC 745mU7ZAvOsYRroL1FhGYav7hxCB1V81aJ2fdf9sOU2tPTDYIgt8Oq0ZQELLi8ZS 1HP+yDzPMpDDdzD6uRvkaqM1FN+TtbIIIwyyFusXYj+R0rXxjy6xC6Zl1Fx52w45 mSkP3OGItxh3C+tAyMWqmndTXJt6B3I/WvIVvVi67eSo1VWnXnA= =hIYy -----END PGP SIGNATURE----- Merge tag 'arc-5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc Pull ARC updates from Vineet Gupta: "Finally a big pile of changes for ARC (atomics/mm). These are from our internal arc64 tree, preparing mainline for eventual arc64 support. I'm spreading them out to avoid tsunami of patches in one release. - MM rework: - Implement up to 4 paging levels - Enable STRICT_MM_TYPECHECK - switch pgtable_t back to 'struct page *' - Atomics rework / implement relaxed accessors - Retire legacy MMUv1,v2; ARC750 cores - A few other build errors, typos" * tag 'arc-5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc: (33 commits) ARC: mm: vmalloc sync from kernel to user table to update PMD ... 
ARC: mm: support 4 levels of page tables ARC: mm: support 3 levels of page tables ARC: mm: switch to asm-generic/pgalloc.h ARC: mm: switch pgtable_t back to struct page * ARC: mm: hack to allow 2 level build with 4 level code ARC: mm: disintegrate pgtable.h into levels and flags ARC: mm: disintegrate mmu.h (arcv2 bits out) ARC: mm: move MMU specific bits out of entry code ... ARC: mm: move MMU specific bits out of ASID allocator ARC: mm: non-functional code movement/cleanup ARC: mm: pmd_populate* to use the canonical set_pmd (and drop pmd_set) ARC: ioremap: use more commonly used PAGE_KERNEL based uncached flag ARC: mm: Enable STRICT_MM_TYPECHECKS ARC: mm: Fixes to allow STRICT_MM_TYPECHECKS ARC: mm: move mmu/cache externs out to setup.h ARC: mm: remove tlb paranoid code ARC: mm: use SCRATCH_DATA0 register for caching pgdir in ARCv2 only ARC: retire MMUv1 and MMUv2 support ARC: retire ARC750 support ...
2021-09-05 11:43:03 -07:00 · 2021-09-05 11:43:03 -07:00 · e07af26266
commit e07af26266
parent 063df71a57 56809a28d4
33 changed files with 1238 additions and 1848 deletions
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@ -116,16 +116,9 @@ choice
 	default ARC_CPU_770 if ISA_ARCOMPACT
 	default ARC_CPU_HS if ISA_ARCV2
 if ISA_ARCOMPACT
 config ARC_CPU_750D
 	bool "ARC750D"
 	select ARC_CANT_LLSC
 	help
 	  Support for ARC750 core
 config ARC_CPU_770
 	bool "ARC770"
 	depends on ISA_ARCOMPACT
 	select ARC_HAS_SWAPE
 	help
 	  Support for ARC770 core introduced with Rel 4.10 (Summer 2011)
@ -135,8 +128,6 @@ config ARC_CPU_770
 	  -Caches: New Prog Model, Region Flush
 	  -Insns: endian swap, load-locked/store-conditional, time-stamp-ctr
 endif #ISA_ARCOMPACT
 config ARC_CPU_HS
 	bool "ARC-HS"
 	depends on ISA_ARCV2
@ -274,33 +265,17 @@ config ARC_DCCM_BASE
 choice
 	prompt "MMU Version"
-	default ARC_MMU_V3 if ARC_CPU_770
+	default ARC_MMU_V3 if ISA_ARCOMPACT
-	default ARC_MMU_V2 if ARC_CPU_750D
+	default ARC_MMU_V4 if ISA_ARCV2
 	default ARC_MMU_V4 if ARC_CPU_HS
 if ISA_ARCOMPACT
 config ARC_MMU_V1
 	bool "MMU v1"
 	help
 	  Orig ARC700 MMU
 config ARC_MMU_V2
 	bool "MMU v2"
 	help
 	  Fixed the deficiency of v1 - possible thrashing in memcpy scenario
 	  when 2 D-TLB and 1 I-TLB entries index into same 2way set.
 config ARC_MMU_V3
 	bool "MMU v3"
-	depends on ARC_CPU_770
+	depends on ISA_ARCOMPACT
 	help
 	  Introduced with ARC700 4.10: New Features
 	  Variable Page size (1k-16k), var JTLB size 128 x (2 or 4)
 	  Shared Address Spaces (SASID)
 endif
 config ARC_MMU_V4
 	bool "MMU v4"
 	depends on ISA_ARCV2
@ -319,7 +294,6 @@ config ARC_PAGE_SIZE_8K
 config ARC_PAGE_SIZE_16K
 	bool "16KB"
 	depends on ARC_MMU_V3 || ARC_MMU_V4
 config ARC_PAGE_SIZE_4K
 	bool "4KB"
@ -340,6 +314,10 @@ config ARC_HUGEPAGE_16M
 endchoice
 config PGTABLE_LEVELS
 	int "Number of Page table levels"
 	default 2
 config ARC_COMPACT_IRQ_LEVELS
 	depends on ISA_ARCOMPACT
 	bool "Setup Timer IRQ as high Priority"
@ -563,9 +541,6 @@ config ARC_DW2_UNWIND
 	  If you don't debug the kernel, you can say N, but we may not be able
 	  to solve problems without frame unwind information
 config ARC_DBG_TLB_PARANOIA
 	bool "Paranoia Checks in Low Level TLB Handlers"
 config ARC_DBG_JUMP_LABEL
 	bool "Paranoid checks in Static Keys (jump labels) code"
 	depends on JUMP_LABEL
--- a/arch/arc/include/asm/atomic-llsc.h
+++ b/arch/arc/include/asm/atomic-llsc.h
@ -0,0 +1,97 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 #ifndef _ASM_ARC_ATOMIC_LLSC_H
 #define _ASM_ARC_ATOMIC_LLSC_H
 #define arch_atomic_set(v, i) WRITE_ONCE(((v)->counter), (i))
 /*
 * ATOMIC_OP - generate arch_atomic_<op>(i, v), which atomically applies
 * <asm_op> with operand @i to v->counter and returns nothing.
 *
 * The LLOCK/SCOND sequence branches back to label 1 (bnz 1b) so that it
 * is retried until the conditional store goes through. No memory barrier
 * is issued here. @c_op is not used by this LLSC flavour.
 *
 * The closing brace deliberately carries no line continuation, so the
 * macro ends here regardless of what follows it.
 */
 #define ATOMIC_OP(op, c_op, asm_op)					\
 static inline void arch_atomic_##op(int i, atomic_t *v)			\
 {									\
 	unsigned int val;						\
 									\
 	__asm__ __volatile__(						\
 	"1:	llock   %[val], [%[ctr]]		\n"		\
 	"	" #asm_op " %[val], %[val], %[i]	\n"		\
 	"	scond   %[val], [%[ctr]]		\n"		\
 	"	bnz     1b				\n"		\
 	: [val]	"=&r"	(val) /* Early clobber to prevent reg reuse */	\
 	: [ctr]	"r"	(&v->counter), /* Not "m": llock only supports reg direct addr mode */	\
 	  [i]	"ir"	(i)						\
 	: "cc");							\
 }
 /*
 * ATOMIC_OP_RETURN - generate arch_atomic_<op>_return_relaxed(i, v).
 *
 * Applies <asm_op> with @i to v->counter inside an LLOCK/SCOND loop and
 * returns the value that was handed to scond, i.e. the new counter value.
 * The body contains no smp_mb(), matching the _relaxed suffix.
 * @c_op is not used by this LLSC flavour.
 */
 #define ATOMIC_OP_RETURN(op, c_op, asm_op)				\
 static inline int arch_atomic_##op##_return_relaxed(int i, atomic_t *v)	\
 {									\
 	unsigned int val;						\
 									\
 	__asm__ __volatile__(						\
 	"1:	llock   %[val], [%[ctr]]		\n"		\
 	"	" #asm_op " %[val], %[val], %[i]	\n"		\
 	"	scond   %[val], [%[ctr]]		\n"		\
 	"	bnz     1b				\n"		\
 	: [val]	"=&r"	(val)						\
 	: [ctr]	"r"	(&v->counter),					\
 	  [i]	"ir"	(i)						\
 	: "cc");							\
 									\
 	return val;							\
 }
 #define arch_atomic_add_return_relaxed		arch_atomic_add_return_relaxed
 #define arch_atomic_sub_return_relaxed		arch_atomic_sub_return_relaxed
 /*
 * ATOMIC_FETCH_OP - generate arch_atomic_fetch_<op>_relaxed(i, v).
 *
 * Loads v->counter into @orig, computes "orig <asm_op> i" into a separate
 * register and stores that back with scond; the value loaded by llock
 * (the old counter value) is returned. Two early-clobber outputs keep
 * orig and val in distinct registers. No smp_mb() is issued here.
 * @c_op is not used by this LLSC flavour.
 */
 #define ATOMIC_FETCH_OP(op, c_op, asm_op)				\
 static inline int arch_atomic_fetch_##op##_relaxed(int i, atomic_t *v)	\
 {									\
 	unsigned int val, orig;						\
 									\
 	__asm__ __volatile__(						\
 	"1:	llock   %[orig], [%[ctr]]		\n"		\
 	"	" #asm_op " %[val], %[orig], %[i]	\n"		\
 	"	scond   %[val], [%[ctr]]		\n"		\
 	"	bnz     1b				\n"		\
 	: [val]	"=&r"	(val),						\
 	  [orig] "=&r" (orig)						\
 	: [ctr]	"r"	(&v->counter),					\
 	  [i]	"ir"	(i)						\
 	: "cc");							\
 									\
 	return orig;							\
 }
 #define arch_atomic_fetch_add_relaxed		arch_atomic_fetch_add_relaxed
 #define arch_atomic_fetch_sub_relaxed		arch_atomic_fetch_sub_relaxed
 #define arch_atomic_fetch_and_relaxed		arch_atomic_fetch_and_relaxed
 #define arch_atomic_fetch_andnot_relaxed	arch_atomic_fetch_andnot_relaxed
 #define arch_atomic_fetch_or_relaxed		arch_atomic_fetch_or_relaxed
 #define arch_atomic_fetch_xor_relaxed		arch_atomic_fetch_xor_relaxed
 /* add/sub get all three flavours: void, <op>_return_relaxed, fetch_<op>_relaxed */
 #define ATOMIC_OPS(op, c_op, asm_op)					\
 	ATOMIC_OP(op, c_op, asm_op)					\
 	ATOMIC_OP_RETURN(op, c_op, asm_op)				\
 	ATOMIC_FETCH_OP(op, c_op, asm_op)
 ATOMIC_OPS(add, +=, add)
 ATOMIC_OPS(sub, -=, sub)
 #undef ATOMIC_OPS
 /* bitwise ops get only the void and fetch_<op>_relaxed flavours */
 #define ATOMIC_OPS(op, c_op, asm_op)					\
 	ATOMIC_OP(op, c_op, asm_op)					\
 	ATOMIC_FETCH_OP(op, c_op, asm_op)
 ATOMIC_OPS(and, &=, and)
 ATOMIC_OPS(andnot, &= ~, bic)
 ATOMIC_OPS(or, |=, or)
 ATOMIC_OPS(xor, ^=, xor)
 /* self-referential define: advertises that arch_atomic_andnot exists */
 #define arch_atomic_andnot		arch_atomic_andnot
 /* the generator macros are private to this header */
 #undef ATOMIC_OPS
 #undef ATOMIC_FETCH_OP
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 #endif
--- a/arch/arc/include/asm/atomic-spinlock.h
+++ b/arch/arc/include/asm/atomic-spinlock.h
@ -0,0 +1,102 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 #ifndef _ASM_ARC_ATOMIC_SPLOCK_H
 #define _ASM_ARC_ATOMIC_SPLOCK_H
 /*
 * Non hardware assisted Atomic-R-M-W
 * Locking would change to irq-disabling only (UP) and spinlocks (SMP)
 */
 /*
 * arch_atomic_set - store @i into v->counter while holding the atomic-ops
 * lock.
 *
 * The store is a single instruction and looks atomic on its own, but every
 * atomic_xxx() API must follow the same locking rules regardless of
 * hardware support: otherwise a "hardware" atomic insn could clobber an
 * "emulated" atomic insn sequence that is in progress under the lock.
 */
 static inline void arch_atomic_set(atomic_t *v, int i)
 {
 	unsigned long irqflags;
 	atomic_ops_lock(irqflags);
 	WRITE_ONCE(v->counter, i);
 	atomic_ops_unlock(irqflags);
 }
 #define arch_atomic_set_release(v, i)	arch_atomic_set((v), (i))
 /*
 * ATOMIC_OP - generate arch_atomic_<op>(i, v): performs
 * "v->counter <c_op> i" between atomic_ops_lock() and atomic_ops_unlock().
 * @asm_op is not used by this lock-based flavour.
 */
 #define ATOMIC_OP(op, c_op, asm_op)					\
 static inline void arch_atomic_##op(int i, atomic_t *v)			\
 {									\
 	unsigned long irqflags;						\
 									\
 	atomic_ops_lock(irqflags);					\
 	v->counter c_op i;						\
 	atomic_ops_unlock(irqflags);					\
 }
 /*
 * ATOMIC_OP_RETURN - generate arch_atomic_<op>_return(i, v).
 *
 * Under the atomic-ops lock: copy v->counter into a local, apply
 * "<c_op> i" to the local, write it back, and return the updated value.
 * The spin lock/unlock pair provides the needed smp_mb() before/after.
 * @asm_op is not used by this lock-based flavour.
 */
 #define ATOMIC_OP_RETURN(op, c_op, asm_op)				\
 static inline int arch_atomic_##op##_return(int i, atomic_t *v)		\
 {									\
 	unsigned long irqflags;						\
 	unsigned int result;						\
 									\
 	atomic_ops_lock(irqflags);					\
 	result = v->counter;						\
 	result c_op i;							\
 	v->counter = result;						\
 	atomic_ops_unlock(irqflags);					\
 									\
 	return result;							\
 }
 /*
 * ATOMIC_FETCH_OP - generate arch_atomic_fetch_<op>(i, v).
 *
 * Under the atomic-ops lock: remember the current v->counter, apply
 * "v->counter <c_op> i", and return the remembered (old) value.
 * The spin lock/unlock pair provides the needed smp_mb() before/after.
 * @asm_op is not used by this lock-based flavour.
 */
 #define ATOMIC_FETCH_OP(op, c_op, asm_op)				\
 static inline int arch_atomic_fetch_##op(int i, atomic_t *v)		\
 {									\
 	unsigned long irqflags;						\
 	unsigned int before;						\
 									\
 	atomic_ops_lock(irqflags);					\
 	before = v->counter;						\
 	v->counter c_op i;						\
 	atomic_ops_unlock(irqflags);					\
 									\
 	return before;							\
 }
 /* add/sub get all three flavours: void, <op>_return, fetch_<op> */
 #define ATOMIC_OPS(op, c_op, asm_op)					\
 	ATOMIC_OP(op, c_op, asm_op)					\
 	ATOMIC_OP_RETURN(op, c_op, asm_op)				\
 	ATOMIC_FETCH_OP(op, c_op, asm_op)
 ATOMIC_OPS(add, +=, add)
 ATOMIC_OPS(sub, -=, sub)
 #undef ATOMIC_OPS
 /* bitwise ops get only the void and fetch_<op> flavours */
 #define ATOMIC_OPS(op, c_op, asm_op)					\
 	ATOMIC_OP(op, c_op, asm_op)					\
 	ATOMIC_FETCH_OP(op, c_op, asm_op)
 ATOMIC_OPS(and, &=, and)
 ATOMIC_OPS(andnot, &= ~, bic)
 ATOMIC_OPS(or, |=, or)
 ATOMIC_OPS(xor, ^=, xor)
 /* self-referential defines: advertise that the andnot variants exist */
 #define arch_atomic_andnot		arch_atomic_andnot
 #define arch_atomic_fetch_andnot	arch_atomic_fetch_andnot
 /* the generator macros are private to this header */
 #undef ATOMIC_OPS
 #undef ATOMIC_FETCH_OP
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 #endif
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@ -17,435 +17,43 @@
 #define arch_atomic_read(v)  READ_ONCE((v)->counter)
 #ifdef CONFIG_ARC_HAS_LLSC
-
+#include <asm/atomic-llsc.h>
 #define arch_atomic_set(v, i) WRITE_ONCE(((v)->counter), (i))
 #define ATOMIC_OP(op, c_op, asm_op)					\
 static inline void arch_atomic_##op(int i, atomic_t *v)			\
 {									\
 	unsigned int val;						\
 									\
 	__asm__ __volatile__(						\
 	"1:	llock   %[val], [%[ctr]]		\n"		\
 	"	" #asm_op " %[val], %[val], %[i]	\n"		\
 	"	scond   %[val], [%[ctr]]		\n"		\
 	"	bnz     1b				\n"		\
 	: [val]	"=&r"	(val) /* Early clobber to prevent reg reuse */	\
 	: [ctr]	"r"	(&v->counter), /* Not "m": llock only supports reg direct addr mode */	\
 	  [i]	"ir"	(i)						\
 	: "cc");							\
 }									\
 #define ATOMIC_OP_RETURN(op, c_op, asm_op)				\
 static inline int arch_atomic_##op##_return(int i, atomic_t *v)		\
 {									\
 	unsigned int val;						\
 									\
 	/*								\
 	 * Explicit full memory barrier needed before/after as		\
 	 * LLOCK/SCOND themselves don't provide any such semantics	\
 	 */								\
 	smp_mb();							\
 									\
 	__asm__ __volatile__(						\
 	"1:	llock   %[val], [%[ctr]]		\n"		\
 	"	" #asm_op " %[val], %[val], %[i]	\n"		\
 	"	scond   %[val], [%[ctr]]		\n"		\
 	"	bnz     1b				\n"		\
 	: [val]	"=&r"	(val)						\
 	: [ctr]	"r"	(&v->counter),					\
 	  [i]	"ir"	(i)						\
 	: "cc");							\
 									\
 	smp_mb();							\
 									\
 	return val;							\
 }
 #define ATOMIC_FETCH_OP(op, c_op, asm_op)				\
 static inline int arch_atomic_fetch_##op(int i, atomic_t *v)		\
 {									\
 	unsigned int val, orig;						\
 									\
 	/*								\
 	 * Explicit full memory barrier needed before/after as		\
 	 * LLOCK/SCOND themselves don't provide any such semantics	\
 	 */								\
 	smp_mb();							\
 									\
 	__asm__ __volatile__(						\
 	"1:	llock   %[orig], [%[ctr]]		\n"		\
 	"	" #asm_op " %[val], %[orig], %[i]	\n"		\
 	"	scond   %[val], [%[ctr]]		\n"		\
 	"	bnz     1b				\n"		\
 	: [val]	"=&r"	(val),						\
 	  [orig] "=&r" (orig)						\
 	: [ctr]	"r"	(&v->counter),					\
 	  [i]	"ir"	(i)						\
 	: "cc");							\
 									\
 	smp_mb();							\
 									\
 	return orig;							\
 }
 #else	/* !CONFIG_ARC_HAS_LLSC */
 #ifndef CONFIG_SMP
 /* violating atomic_xxx API locking protocol in UP for optimization sake */
 #define arch_atomic_set(v, i) WRITE_ONCE(((v)->counter), (i))
 #else
 #include <asm/atomic-spinlock.h>
 #endif
-static inline void arch_atomic_set(atomic_t *v, int i)
+#define arch_atomic_cmpxchg(v, o, n)					\
-{
+({									\
-	/*
+	arch_cmpxchg(&((v)->counter), (o), (n));			\
-	 * Independent of hardware support, all of the atomic_xxx() APIs need
+})
 	 * to follow the same locking rules to make sure that a "hardware"
 	 * atomic insn (e.g. LD) doesn't clobber an "emulated" atomic insn
 	 * sequence
 	 *
 	 * Thus atomic_set() despite being 1 insn (and seemingly atomic)
 	 * requires the locking.
 	 */
 	unsigned long flags;
-	atomic_ops_lock(flags);
+#ifdef arch_cmpxchg_relaxed
-	WRITE_ONCE(v->counter, i);
+#define arch_atomic_cmpxchg_relaxed(v, o, n)				\
-	atomic_ops_unlock(flags);
+({									\
-}
+	arch_cmpxchg_relaxed(&((v)->counter), (o), (n));		\
 })
 #endif
-#define arch_atomic_set_release(v, i)	arch_atomic_set((v), (i))
+#define arch_atomic_xchg(v, n)						\
 ({									\
 	arch_xchg(&((v)->counter), (n));				\
 })
 #ifdef arch_xchg_relaxed
 #define arch_atomic_xchg_relaxed(v, n)					\
 ({									\
 	arch_xchg_relaxed(&((v)->counter), (n));			\
 })
 #endif
 /*
- * Non hardware assisted Atomic-R-M-W
+ * 64-bit atomics
 * Locking would change to irq-disabling only (UP) and spinlocks (SMP)
 */
 #define ATOMIC_OP(op, c_op, asm_op)					\
 static inline void arch_atomic_##op(int i, atomic_t *v)			\
 {									\
 	unsigned long flags;						\
 									\
 	atomic_ops_lock(flags);						\
 	v->counter c_op i;						\
 	atomic_ops_unlock(flags);					\
 }
 #define ATOMIC_OP_RETURN(op, c_op, asm_op)				\
 static inline int arch_atomic_##op##_return(int i, atomic_t *v)		\
 {									\
 	unsigned long flags;						\
 	unsigned long temp;						\
 									\
 	/*								\
 	 * spin lock/unlock provides the needed smp_mb() before/after	\
 	 */								\
 	atomic_ops_lock(flags);						\
 	temp = v->counter;						\
 	temp c_op i;							\
 	v->counter = temp;						\
 	atomic_ops_unlock(flags);					\
 									\
 	return temp;							\
 }
 #define ATOMIC_FETCH_OP(op, c_op, asm_op)				\
 static inline int arch_atomic_fetch_##op(int i, atomic_t *v)		\
 {									\
 	unsigned long flags;						\
 	unsigned long orig;						\
 									\
 	/*								\
 	 * spin lock/unlock provides the needed smp_mb() before/after	\
 	 */								\
 	atomic_ops_lock(flags);						\
 	orig = v->counter;						\
 	v->counter c_op i;						\
 	atomic_ops_unlock(flags);					\
 									\
 	return orig;							\
 }
 #endif /* !CONFIG_ARC_HAS_LLSC */
 #define ATOMIC_OPS(op, c_op, asm_op)					\
 	ATOMIC_OP(op, c_op, asm_op)					\
 	ATOMIC_OP_RETURN(op, c_op, asm_op)				\
 	ATOMIC_FETCH_OP(op, c_op, asm_op)
 ATOMIC_OPS(add, +=, add)
 ATOMIC_OPS(sub, -=, sub)
 #undef ATOMIC_OPS
 #define ATOMIC_OPS(op, c_op, asm_op)					\
 	ATOMIC_OP(op, c_op, asm_op)					\
 	ATOMIC_FETCH_OP(op, c_op, asm_op)
 ATOMIC_OPS(and, &=, and)
 ATOMIC_OPS(andnot, &= ~, bic)
 ATOMIC_OPS(or, |=, or)
 ATOMIC_OPS(xor, ^=, xor)
 #define arch_atomic_andnot		arch_atomic_andnot
 #define arch_atomic_fetch_andnot	arch_atomic_fetch_andnot
 #undef ATOMIC_OPS
 #undef ATOMIC_FETCH_OP
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 #ifdef CONFIG_GENERIC_ATOMIC64
 #include <asm-generic/atomic64.h>
-
+#else
-#else	/* Kconfig ensures this is only enabled with needed h/w assist */
+#include <asm/atomic64-arcv2.h>
-
+#endif
 /*
 * ARCv2 supports 64-bit exclusive load (LLOCKD) / store (SCONDD)
 *  - The address HAS to be 64-bit aligned
 *  - There are 2 semantics involved here:
 *    = exclusive implies no interim update between load/store to same addr
 *    = both words are observed/updated together: this is guaranteed even
 *      for regular 64-bit load (LDD) / store (STD). Thus atomic64_set()
 *      is NOT required to use LLOCKD+SCONDD, STD suffices
 */
 typedef struct {
 	s64 __aligned(8) counter;
 } atomic64_t;
 #define ATOMIC64_INIT(a) { (a) }
 static inline s64 arch_atomic64_read(const atomic64_t *v)
 {
 	s64 val;
 	__asm__ __volatile__(
 	"	ldd   %0, [%1]	\n"
 	: "=r"(val)
 	: "r"(&v->counter));
 	return val;
 }
 static inline void arch_atomic64_set(atomic64_t *v, s64 a)
 {
 	/*
 	 * This could have been a simple assignment in "C" but would need
 	 * explicit volatile. Otherwise gcc optimizers could elide the store
 	 * which borked atomic64 self-test
 	 * In the inline asm version, memory clobber needed for exact same
 	 * reason, to tell gcc about the store.
 	 *
 	 * This however is not needed for sibling atomic64_add() etc since both
 	 * load/store are explicitly done in inline asm. As long as API is used
 	 * for each access, gcc has no way to optimize away any load/store
 	 */
 	__asm__ __volatile__(
 	"	std   %0, [%1]	\n"
 	:
 	: "r"(a), "r"(&v->counter)
 	: "memory");
 }
 #define ATOMIC64_OP(op, op1, op2)					\
 static inline void arch_atomic64_##op(s64 a, atomic64_t *v)		\
 {									\
 	s64 val;							\
 									\
 	__asm__ __volatile__(						\
 	"1:				\n"				\
 	"	llockd  %0, [%1]	\n"				\
 	"	" #op1 " %L0, %L0, %L2	\n"				\
 	"	" #op2 " %H0, %H0, %H2	\n"				\
 	"	scondd   %0, [%1]	\n"				\
 	"	bnz     1b		\n"				\
 	: "=&r"(val)							\
 	: "r"(&v->counter), "ir"(a)					\
 	: "cc");							\
 }									\
 #define ATOMIC64_OP_RETURN(op, op1, op2)		        	\
 static inline s64 arch_atomic64_##op##_return(s64 a, atomic64_t *v)	\
 {									\
 	s64 val;							\
 									\
 	smp_mb();							\
 									\
 	__asm__ __volatile__(						\
 	"1:				\n"				\
 	"	llockd   %0, [%1]	\n"				\
 	"	" #op1 " %L0, %L0, %L2	\n"				\
 	"	" #op2 " %H0, %H0, %H2	\n"				\
 	"	scondd   %0, [%1]	\n"				\
 	"	bnz     1b		\n"				\
 	: [val] "=&r"(val)						\
 	: "r"(&v->counter), "ir"(a)					\
 	: "cc");	/* memory clobber comes from smp_mb() */	\
 									\
 	smp_mb();							\
 									\
 	return val;							\
 }
 #define ATOMIC64_FETCH_OP(op, op1, op2)		        		\
 static inline s64 arch_atomic64_fetch_##op(s64 a, atomic64_t *v)	\
 {									\
 	s64 val, orig;							\
 									\
 	smp_mb();							\
 									\
 	__asm__ __volatile__(						\
 	"1:				\n"				\
 	"	llockd   %0, [%2]	\n"				\
 	"	" #op1 " %L1, %L0, %L3	\n"				\
 	"	" #op2 " %H1, %H0, %H3	\n"				\
 	"	scondd   %1, [%2]	\n"				\
 	"	bnz     1b		\n"				\
 	: "=&r"(orig), "=&r"(val)					\
 	: "r"(&v->counter), "ir"(a)					\
 	: "cc");	/* memory clobber comes from smp_mb() */	\
 									\
 	smp_mb();							\
 									\
 	return orig;							\
 }
 #define ATOMIC64_OPS(op, op1, op2)					\
 	ATOMIC64_OP(op, op1, op2)					\
 	ATOMIC64_OP_RETURN(op, op1, op2)				\
 	ATOMIC64_FETCH_OP(op, op1, op2)
 ATOMIC64_OPS(add, add.f, adc)
 ATOMIC64_OPS(sub, sub.f, sbc)
 ATOMIC64_OPS(and, and, and)
 ATOMIC64_OPS(andnot, bic, bic)
 ATOMIC64_OPS(or, or, or)
 ATOMIC64_OPS(xor, xor, xor)
 #define arch_atomic64_andnot		arch_atomic64_andnot
 #define arch_atomic64_fetch_andnot	arch_atomic64_fetch_andnot
 #undef ATOMIC64_OPS
 #undef ATOMIC64_FETCH_OP
 #undef ATOMIC64_OP_RETURN
 #undef ATOMIC64_OP
 static inline s64
 arch_atomic64_cmpxchg(atomic64_t *ptr, s64 expected, s64 new)
 {
 	s64 prev;
 	smp_mb();
 	__asm__ __volatile__(
 	"1:	llockd  %0, [%1]	\n"
 	"	brne    %L0, %L2, 2f	\n"
 	"	brne    %H0, %H2, 2f	\n"
 	"	scondd  %3, [%1]	\n"
 	"	bnz     1b		\n"
 	"2:				\n"
 	: "=&r"(prev)
 	: "r"(ptr), "ir"(expected), "r"(new)
 	: "cc");	/* memory clobber comes from smp_mb() */
 	smp_mb();
 	return prev;
 }
 static inline s64 arch_atomic64_xchg(atomic64_t *ptr, s64 new)
 {
 	s64 prev;
 	smp_mb();
 	__asm__ __volatile__(
 	"1:	llockd  %0, [%1]	\n"
 	"	scondd  %2, [%1]	\n"
 	"	bnz     1b		\n"
 	"2:				\n"
 	: "=&r"(prev)
 	: "r"(ptr), "r"(new)
 	: "cc");	/* memory clobber comes from smp_mb() */
 	smp_mb();
 	return prev;
 }
 /**
 * arch_atomic64_dec_if_positive - decrement by 1 if old value positive
 * @v: pointer of type atomic64_t
 *
 * The function returns the old value of *v minus 1, even if
 * the atomic variable, v, was not decremented.
 */
 static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
 {
 	s64 val;
 	smp_mb();
 	__asm__ __volatile__(
 	"1:	llockd  %0, [%1]	\n"
 	"	sub.f   %L0, %L0, 1	# w0 - 1, set C on borrow\n"
 	"	sub.c   %H0, %H0, 1	# if C set, w1 - 1\n"
 	"	brlt    %H0, 0, 2f	\n"
 	"	scondd  %0, [%1]	\n"
 	"	bnz     1b		\n"
 	"2:				\n"
 	: "=&r"(val)
 	: "r"(&v->counter)
 	: "cc");	/* memory clobber comes from smp_mb() */
 	smp_mb();
 	return val;
 }
 #define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive
 /**
 * arch_atomic64_fetch_add_unless - add unless the number is a given value
 * @v: pointer of type atomic64_t
 * @a: the amount to add to v...
 * @u: ...unless v is equal to u.
 *
 * Atomically adds @a to @v, if it was not @u.
 * Returns the old value of @v
 */
 static inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
 {
 	s64 old, temp;
 	smp_mb();
 	__asm__ __volatile__(
 	"1:	llockd  %0, [%2]	\n"
 	"	brne	%L0, %L4, 2f	# continue to add since v != u \n"
 	"	breq.d	%H0, %H4, 3f	# return since v == u \n"
 	"2:				\n"
 	"	add.f   %L1, %L0, %L3	\n"
 	"	adc     %H1, %H0, %H3	\n"
 	"	scondd  %1, [%2]	\n"
 	"	bnz     1b		\n"
 	"3:				\n"
 	: "=&r"(old), "=&r" (temp)
 	: "r"(&v->counter), "r"(a), "r"(u)
 	: "cc");	/* memory clobber comes from smp_mb() */
 	smp_mb();
 	return old;
 }
 #define arch_atomic64_fetch_add_unless arch_atomic64_fetch_add_unless
 #endif	/* !CONFIG_GENERIC_ATOMIC64 */
 #endif	/* !__ASSEMBLY__ */
--- a/arch/arc/include/asm/atomic64-arcv2.h
+++ b/arch/arc/include/asm/atomic64-arcv2.h
@ -0,0 +1,250 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
 * ARCv2 supports 64-bit exclusive load (LLOCKD) / store (SCONDD)
 *  - The address HAS to be 64-bit aligned
 */
 #ifndef _ASM_ARC_ATOMIC64_ARCV2_H
 #define _ASM_ARC_ATOMIC64_ARCV2_H
 /*
 * 64-bit atomic counter. The member is forced to 8-byte alignment because
 * the LLOCKD/SCONDD address has to be 64-bit aligned.
 */
 typedef struct {
 	s64 __aligned(8) counter;
 } atomic64_t;
 /* static initializer for an atomic64_t holding value @a */
 #define ATOMIC64_INIT(a) { (a) }
 /*
 * arch_atomic64_read - return the current value of v->counter.
 *
 * The value is fetched with a single LDD in volatile inline asm rather
 * than a C dereference.
 * NOTE(review): presumably LDD observes both 32-bit words together, which
 * is why no LLOCKD is needed here - confirm against the ARCv2 manual.
 */
 static inline s64 arch_atomic64_read(const atomic64_t *v)
 {
 	s64 val;
 	__asm__ __volatile__(
 	"	ldd   %0, [%1]	\n"
 	: "=r"(val)
 	: "r"(&v->counter));
 	return val;
 }
 /*
 * arch_atomic64_set - store @a into v->counter with a single STD.
 */
 static inline void arch_atomic64_set(atomic64_t *v, s64 a)
 {
 	/*
 	 * This could have been a simple assignment in "C" but would need
 	 * explicit volatile. Otherwise gcc optimizers could elide the store
 	 * which borked atomic64 self-test
 	 * In the inline asm version, memory clobber needed for exact same
 	 * reason, to tell gcc about the store.
 	 *
 	 * This however is not needed for sibling atomic64_add() etc since both
 	 * load/store are explicitly done in inline asm. As long as API is used
 	 * for each access, gcc has no way to optimize away any load/store
 	 */
 	__asm__ __volatile__(
 	"	std   %0, [%1]	\n"
 	:
 	: "r"(a), "r"(&v->counter)
 	: "memory");
 }
 /*
 * ATOMIC64_OP - generate arch_atomic64_<op>(a, v), which atomically
 * updates the 64-bit v->counter and returns nothing.
 *
 * Inside an LLOCKD/SCONDD loop, <op1> is applied to the low words (%L)
 * and <op2> to the high words (%H) of the loaded value and @a; for
 * add/sub the callers pass a flag-setting op1 and a carry-consuming op2.
 * The sequence branches back to label 1 (bnz 1b) to retry. No memory
 * barrier is issued here.
 *
 * The closing brace deliberately carries no line continuation, so the
 * macro ends here regardless of what follows it.
 */
 #define ATOMIC64_OP(op, op1, op2)					\
 static inline void arch_atomic64_##op(s64 a, atomic64_t *v)		\
 {									\
 	s64 val;							\
 									\
 	__asm__ __volatile__(						\
 	"1:				\n"				\
 	"	llockd  %0, [%1]	\n"				\
 	"	" #op1 " %L0, %L0, %L2	\n"				\
 	"	" #op2 " %H0, %H0, %H2	\n"				\
 	"	scondd   %0, [%1]	\n"				\
 	"	bnz     1b		\n"				\
 	: "=&r"(val)							\
 	: "r"(&v->counter), "ir"(a)					\
 	: "cc");							\
 }
 /*
 * ATOMIC64_OP_RETURN - generate arch_atomic64_<op>_return_relaxed(a, v).
 *
 * Same LLOCKD/SCONDD loop as the void variant, but the value handed to
 * scondd (the new counter value) is returned. The body contains no
 * smp_mb(), matching the _relaxed suffix.
 */
 #define ATOMIC64_OP_RETURN(op, op1, op2)		        	\
 static inline s64 arch_atomic64_##op##_return_relaxed(s64 a, atomic64_t *v)	\
 {									\
 	s64 val;							\
 									\
 	__asm__ __volatile__(						\
 	"1:				\n"				\
 	"	llockd   %0, [%1]	\n"				\
 	"	" #op1 " %L0, %L0, %L2	\n"				\
 	"	" #op2 " %H0, %H0, %H2	\n"				\
 	"	scondd   %0, [%1]	\n"				\
 	"	bnz     1b		\n"				\
 	: [val] "=&r"(val)						\
 	: "r"(&v->counter), "ir"(a)					\
 	: "cc");	/* relaxed: no smp_mb(), no memory clobber */	\
 									\
 	return val;							\
 }
 #define arch_atomic64_add_return_relaxed	arch_atomic64_add_return_relaxed
 #define arch_atomic64_sub_return_relaxed	arch_atomic64_sub_return_relaxed
 /*
 * ATOMIC64_FETCH_OP - generate arch_atomic64_fetch_<op>_relaxed(a, v).
 *
 * Loads the counter into @orig (%0), computes the updated value into a
 * separate register pair @val (%1) and stores that with scondd; the value
 * loaded by llockd (the old counter value) is returned. The body contains
 * no smp_mb(), matching the _relaxed suffix.
 */
 #define ATOMIC64_FETCH_OP(op, op1, op2)		        		\
 static inline s64 arch_atomic64_fetch_##op##_relaxed(s64 a, atomic64_t *v)	\
 {									\
 	s64 val, orig;							\
 									\
 	__asm__ __volatile__(						\
 	"1:				\n"				\
 	"	llockd   %0, [%2]	\n"				\
 	"	" #op1 " %L1, %L0, %L3	\n"				\
 	"	" #op2 " %H1, %H0, %H3	\n"				\
 	"	scondd   %1, [%2]	\n"				\
 	"	bnz     1b		\n"				\
 	: "=&r"(orig), "=&r"(val)					\
 	: "r"(&v->counter), "ir"(a)					\
 	: "cc");	/* relaxed: no smp_mb(), no memory clobber */	\
 									\
 	return orig;							\
 }
 #define arch_atomic64_fetch_add_relaxed		arch_atomic64_fetch_add_relaxed
 #define arch_atomic64_fetch_sub_relaxed		arch_atomic64_fetch_sub_relaxed
 #define arch_atomic64_fetch_and_relaxed		arch_atomic64_fetch_and_relaxed
 #define arch_atomic64_fetch_andnot_relaxed	arch_atomic64_fetch_andnot_relaxed
 #define arch_atomic64_fetch_or_relaxed		arch_atomic64_fetch_or_relaxed
 #define arch_atomic64_fetch_xor_relaxed		arch_atomic64_fetch_xor_relaxed
 /*
 * add/sub get all three flavours. op1 works on the low word and sets
 * flags (.f); op2 works on the high word and consumes the carry/borrow.
 */
 #define ATOMIC64_OPS(op, op1, op2)					\
 	ATOMIC64_OP(op, op1, op2)					\
 	ATOMIC64_OP_RETURN(op, op1, op2)				\
 	ATOMIC64_FETCH_OP(op, op1, op2)
 ATOMIC64_OPS(add, add.f, adc)
 ATOMIC64_OPS(sub, sub.f, sbc)
 #undef ATOMIC64_OPS
 /* bitwise ops: same insn on both halves, void and fetch flavours only */
 #define ATOMIC64_OPS(op, op1, op2)					\
 	ATOMIC64_OP(op, op1, op2)					\
 	ATOMIC64_FETCH_OP(op, op1, op2)
 ATOMIC64_OPS(and, and, and)
 ATOMIC64_OPS(andnot, bic, bic)
 ATOMIC64_OPS(or, or, or)
 ATOMIC64_OPS(xor, xor, xor)
 /* self-referential define: advertises that arch_atomic64_andnot exists */
 #define arch_atomic64_andnot		arch_atomic64_andnot
 /* the generator macros are private to this header */
 #undef ATOMIC64_OPS
 #undef ATOMIC64_FETCH_OP
 #undef ATOMIC64_OP_RETURN
 #undef ATOMIC64_OP
 /*
 * arch_atomic64_cmpxchg - 64-bit compare-and-exchange.
 * @ptr: atomic64_t to operate on
 * @expected: value the counter must currently hold for the store to happen
 * @new: value to store on a match
 *
 * Loads the counter with llockd; if either the low or the high word
 * differs from @expected the store is skipped (branch to 2), otherwise
 * @new is written with scondd and the sequence is retried via bnz 1b.
 * Returns the value loaded from *ptr. Fully ordered: smp_mb() is issued
 * before and after the asm.
 */
 static inline s64
 arch_atomic64_cmpxchg(atomic64_t *ptr, s64 expected, s64 new)
 {
 	s64 prev;
 	smp_mb();
 	__asm__ __volatile__(
 	"1:	llockd  %0, [%1]	\n"
 	"	brne    %L0, %L2, 2f	\n"
 	"	brne    %H0, %H2, 2f	\n"
 	"	scondd  %3, [%1]	\n"
 	"	bnz     1b		\n"
 	"2:				\n"
 	: "=&r"(prev)
 	: "r"(ptr), "ir"(expected), "r"(new)
 	: "cc");	/* memory clobber comes from smp_mb() */
 	smp_mb();
 	return prev;
 }
 /*
 * arch_atomic64_xchg - unconditionally replace the 64-bit counter.
 * @ptr: atomic64_t to operate on
 * @new: value to store
 *
 * Loads the current value with llockd, stores @new with scondd and
 * retries via bnz 1b. Returns the value that was loaded (the previous
 * counter value). Fully ordered: smp_mb() is issued before and after.
 *
 * Unlike cmpxchg there is no early-exit path, so the asm has no "2:"
 * label.
 */
 static inline s64 arch_atomic64_xchg(atomic64_t *ptr, s64 new)
 {
 	s64 prev;
 	smp_mb();
 	__asm__ __volatile__(
 	"1:	llockd  %0, [%1]	\n"
 	"	scondd  %2, [%1]	\n"
 	"	bnz     1b		\n"
 	: "=&r"(prev)
 	: "r"(ptr), "r"(new)
 	: "cc");	/* memory clobber comes from smp_mb() */
 	smp_mb();
 	return prev;
 }
 /**
 * arch_atomic64_dec_if_positive - decrement by 1 if old value positive
 * @v: pointer of type atomic64_t
 *
 * The function returns the old value of *v minus 1, even if
 * the atomic variable, v, was not decremented.
 *
 * The decrement is always computed in registers; the store (scondd) is
 * skipped when the high word of the result is negative (brlt ... 2f).
 * smp_mb() is issued before and after the asm.
 */
 static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
 {
 	s64 val;
 	smp_mb();
 	__asm__ __volatile__(
 	"1:	llockd  %0, [%1]	\n"
 	"	sub.f   %L0, %L0, 1	# w0 - 1, set C on borrow\n"
 	"	sub.c   %H0, %H0, 1	# if C set, w1 - 1\n"
 	"	brlt    %H0, 0, 2f	\n"
 	"	scondd  %0, [%1]	\n"
 	"	bnz     1b		\n"
 	"2:				\n"
 	: "=&r"(val)
 	: "r"(&v->counter)
 	: "cc");	/* memory clobber comes from smp_mb() */
 	smp_mb();
 	return val;
 }
 /* self-referential define: advertises that this arch provides the op */
 #define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive
 /**
 * arch_atomic64_fetch_add_unless - add unless the number is a given value
 * @v: pointer of type atomic64_t
 * @a: the amount to add to v...
 * @u: ...unless v is equal to u.
 *
 * Atomically adds @a to @v, if it was not @u.
 * Returns the old value of @v
 *
 * The sum is built in a separate register pair (temp) so that the loaded
 * value (old) survives to be returned. smp_mb() is issued before and
 * after the asm.
 * NOTE(review): the ".d" on breq presumably places the following add.f in
 * the branch delay slot; that looks harmless since temp is discarded on
 * the 3f path - confirm against the ARC ISA manual.
 */
 static inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
 {
 	s64 old, temp;
 	smp_mb();
 	__asm__ __volatile__(
 	"1:	llockd  %0, [%2]	\n"
 	"	brne	%L0, %L4, 2f	# continue to add since v != u \n"
 	"	breq.d	%H0, %H4, 3f	# return since v == u \n"
 	"2:				\n"
 	"	add.f   %L1, %L0, %L3	\n"
 	"	adc     %H1, %H0, %H3	\n"
 	"	scondd  %1, [%2]	\n"
 	"	bnz     1b		\n"
 	"3:				\n"
 	: "=&r"(old), "=&r" (temp)
 	: "r"(&v->counter), "r"(a), "r"(u)
 	: "cc");	/* memory clobber comes from smp_mb() */
 	smp_mb();
 	return old;
 }
 /* self-referential define: advertises that this arch provides the op */
 #define arch_atomic64_fetch_add_unless arch_atomic64_fetch_add_unless
 #endif
--- a/arch/arc/include/asm/bitops.h
+++ b/arch/arc/include/asm/bitops.h
@ -14,188 +14,6 @@
 #include <linux/types.h>
 #include <linux/compiler.h>
 #include <asm/barrier.h>
 #ifndef CONFIG_ARC_HAS_LLSC
 #include <asm/smp.h>
 #endif
 #ifdef CONFIG_ARC_HAS_LLSC
 /*
 * Hardware assisted Atomic-R-M-W
 */
 /*
 * BIT_OP - generate <op>_bit(nr, m) using LLOCK/SCOND.
 *
 * @m is advanced to the 32-bit word containing bit @nr (nr >> 5) and @nr
 * is reduced to the bit index within that word (nr & 0x1f); <asm_op> is
 * then applied to that word inside an llock/scond loop. No memory barrier
 * is issued. @c_op is not used by this LLSC flavour.
 */
 #define BIT_OP(op, c_op, asm_op)					\
 static inline void op##_bit(unsigned long nr, volatile unsigned long *m)\
 {									\
 	unsigned int temp;						\
 									\
 	m += nr >> 5;							\
 									\
 	nr &= 0x1f;							\
 									\
 	__asm__ __volatile__(						\
 	"1:	llock       %0, [%1]		\n"			\
 	"	" #asm_op " %0, %0, %2	\n"				\
 	"	scond       %0, [%1]		\n"			\
 	"	bnz         1b			\n"			\
 	: "=&r"(temp)	/* Early clobber, to prevent reg reuse */	\
 	: "r"(m),	/* Not "m": llock only supports reg direct addr mode */	\
 	  "ir"(nr)							\
 	: "cc");							\
 }
 /*
 * Semantically:
 *    Test the bit
 *    if clear
 *        set it and return 0 (old value)
 *    else
 *        return 1 (old value).
 *
 * Since ARC lacks an equivalent h/w primitive, the bit is set unconditionally
 * and the old value of bit is returned
 */
 /*
 * TEST_N_BIT_OP - generate test_and_<op>_bit(nr, m) using LLOCK/SCOND.
 *
 * @m is advanced to the 32-bit word containing bit @nr and @nr is reduced
 * to the bit index within that word. The word is loaded into @old, the
 * result of <asm_op> is built in @temp and stored with scond.
 * Returns 1 if the bit was set in the old word, 0 otherwise.
 * @c_op is not used by this LLSC flavour.
 */
 #define TEST_N_BIT_OP(op, c_op, asm_op)					\
 static inline int test_and_##op##_bit(unsigned long nr, volatile unsigned long *m)\
 {									\
 	unsigned long old, temp;					\
 									\
 	m += nr >> 5;							\
 									\
 	nr &= 0x1f;							\
 									\
 	/*								\
 	 * Explicit full memory barrier needed before/after as		\
 	 * LLOCK/SCOND themselves don't provide any such semantic	\
 	 */								\
 	smp_mb();							\
 									\
 	__asm__ __volatile__(						\
 	"1:	llock       %0, [%2]	\n"				\
 	"	" #asm_op " %1, %0, %3	\n"				\
 	"	scond       %1, [%2]	\n"				\
 	"	bnz         1b		\n"				\
 	: "=&r"(old), "=&r"(temp)					\
 	: "r"(m), "ir"(nr)						\
 	: "cc");							\
 									\
 	smp_mb();							\
 									\
 	/* 1UL, not 1: left-shifting a signed int by 31 is undefined */	\
 	return (old & (1UL << nr)) != 0;				\
 }
 #else /* !CONFIG_ARC_HAS_LLSC */
 /*
 * Non hardware assisted Atomic-R-M-W
 * Locking would change to irq-disabling only (UP) and spinlocks (SMP)
 *
 * There's "significant" micro-optimization in writing our own variants of
 * bitops (over generic variants)
 *
 * (1) The generic APIs have "signed" @nr while we have it "unsigned"
 *     This avoids extra code to be generated for pointer arithmetic, since
 *     the compiler is "not sure" that index is NOT -ve
 * (2) Utilize the fact that ARCompact bit fiddling insn (BSET/BCLR/ASL) etc
 *     only consider bottom 5 bits of @nr, so NO need to mask them off.
 *     (GCC Quirk: however for constant @nr we still need to do the masking
 *             at compile time)
 */
 /*
 * BIT_OP - generate <op>_bit(nr, m) for cores without LLOCK/SCOND.
 *
 * The 32-bit word holding bit @nr is read, combined with the single-bit
 * mask via <c_op> and written back, all between bitops_lock() and
 * bitops_unlock(); the spin lock/unlock pair provides the needed smp_mb()
 * before/after. @asm_op is not used by this lock-based flavour.
 */
 #define BIT_OP(op, c_op, asm_op)					\
 static inline void op##_bit(unsigned long nr, volatile unsigned long *m)\
 {									\
 	volatile unsigned long *word = m + (nr >> 5);			\
 	const unsigned long mask = 1UL << (nr & 0x1f);			\
 	unsigned long cur, irqflags;					\
 									\
 	bitops_lock(irqflags);						\
 	cur = *word;							\
 	*word = cur c_op mask;						\
 	bitops_unlock(irqflags);					\
 }
 /*
 * TEST_N_BIT_OP - generate test_and_<op>_bit(nr, m) for cores without
 * LLOCK/SCOND.
 *
 * Under bitops_lock(): read the word holding bit @nr, write back the word
 * combined with the single-bit mask via <c_op>. Returns 1 if the bit was
 * set in the word as read, 0 otherwise. @asm_op is not used here.
 */
 #define TEST_N_BIT_OP(op, c_op, asm_op)					\
 static inline int test_and_##op##_bit(unsigned long nr, volatile unsigned long *m)\
 {									\
 	volatile unsigned long *word = m + (nr >> 5);			\
 	const unsigned long mask = 1UL << (nr & 0x1f);			\
 	unsigned long before, irqflags;					\
 									\
 	bitops_lock(irqflags);						\
 	before = *word;							\
 	*word = before c_op mask;					\
 	bitops_unlock(irqflags);					\
 									\
 	return (before & mask) != 0;					\
 }
 #endif
 /***************************************
 * Non atomic variants
 **************************************/
 /*
 * __BIT_OP - generate the non-atomic __<op>_bit(nr, m): a plain
 * read / <c_op> with the single-bit mask / write of the 32-bit word
 * holding bit @nr, with no locking and no barriers. @asm_op is unused.
 */
 #define __BIT_OP(op, c_op, asm_op)					\
 static inline void __##op##_bit(unsigned long nr, volatile unsigned long *m)	\
 {									\
 	volatile unsigned long *word = m + (nr >> 5);			\
 	const unsigned long mask = 1UL << (nr & 0x1f);			\
 	const unsigned long cur = *word;				\
 									\
 	*word = cur c_op mask;						\
 }
 /*
 * __TEST_N_BIT_OP - generate the non-atomic __test_and_<op>_bit(nr, m):
 * updates the 32-bit word holding bit @nr via <c_op> with the single-bit
 * mask and returns 1 if that bit was set beforehand, 0 otherwise.
 * No locking, no barriers. @asm_op is unused.
 */
 #define __TEST_N_BIT_OP(op, c_op, asm_op)				\
 static inline int __test_and_##op##_bit(unsigned long nr, volatile unsigned long *m)\
 {									\
 	volatile unsigned long *word = m + (nr >> 5);			\
 	const unsigned long mask = 1UL << (nr & 0x1f);			\
 	const unsigned long before = *word;				\
 									\
 	*word = before c_op mask;					\
 									\
 	return (before & mask) != 0;					\
 }
 /*
 * BIT_OPS - instantiate all four variants (atomic, atomic test-and-,
 * non-atomic, non-atomic test-and-) for one operation. @c_op is the C
 * operator used by the lock-based and non-atomic variants, @asm_op the
 * instruction used by the LLSC variants.
 */
 #define BIT_OPS(op, c_op, asm_op)					\
 									\
 	/* set_bit(), clear_bit(), change_bit() */			\
 	BIT_OP(op, c_op, asm_op)					\
 									\
 	/* test_and_set_bit(), test_and_clear_bit(), test_and_change_bit() */\
 	TEST_N_BIT_OP(op, c_op, asm_op)					\
 									\
 	/* __set_bit(), __clear_bit(), __change_bit() */		\
 	__BIT_OP(op, c_op, asm_op)					\
 									\
 	/* __test_and_set_bit(), __test_and_clear_bit(), __test_and_change_bit() */\
 	__TEST_N_BIT_OP(op, c_op, asm_op)
 BIT_OPS(set, |, bset)
 BIT_OPS(clear, & ~, bclr)
 BIT_OPS(change, ^, bxor)
 /*
 * test_bit - report whether bit @nr is set in the bitmap at @addr.
 *
 * Selects the word at index nr >> 5 and tests bit nr & 0x1f within it.
 * Returns 1 if the bit is set, 0 otherwise. A single plain read is
 * enough, so this routine doesn't need to be atomic.
 */
 static inline int
 test_bit(unsigned int nr, const volatile unsigned long *addr)
 {
 	const volatile unsigned long *word = addr + (nr >> 5);
 	return ((*word >> (nr & 0x1f)) & 1UL) != 0;
 }
 #ifdef CONFIG_ISA_ARCOMPACT
@ -296,7 +114,7 @@ static inline __attribute__ ((const)) unsigned long __ffs(unsigned long word)
 * @result: [1-32]
 * fls(1) = 1, fls(0x80000000) = 32, fls(0) = 0
 */
-static inline __attribute__ ((const)) int fls(unsigned long x)
+static inline __attribute__ ((const)) int fls(unsigned int x)
 {
 	int n;
@ -323,7 +141,7 @@ static inline __attribute__ ((const)) int __fls(unsigned long x)
 * ffs = Find First Set in word (LSB to MSB)
 * @result: [1-32], 0 if all 0's
 */
-static inline __attribute__ ((const)) int ffs(unsigned long x)
+static inline __attribute__ ((const)) int ffs(unsigned int x)
 {
 	int n;
@ -368,6 +186,8 @@ static inline __attribute__ ((const)) unsigned long __ffs(unsigned long x)
 #include <asm-generic/bitops/fls64.h>
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/lock.h>
 #include <asm-generic/bitops/atomic.h>
 #include <asm-generic/bitops/non-atomic.h>
 #include <asm-generic/bitops/find.h>
 #include <asm-generic/bitops/le.h>
--- a/arch/arc/include/asm/cache.h
+++ b/arch/arc/include/asm/cache.h
@ -62,10 +62,6 @@
 #define ARCH_SLAB_MINALIGN	8
 #endif
 extern void arc_cache_init(void);
 extern char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len);
 extern void read_decode_cache_bcr(void);
 extern int ioc_enable;
 extern unsigned long perip_base, perip_end;
--- a/arch/arc/include/asm/cmpxchg.h
+++ b/arch/arc/include/asm/cmpxchg.h
@ -6,6 +6,7 @@
 #ifndef __ASM_ARC_CMPXCHG_H
 #define __ASM_ARC_CMPXCHG_H
 #include <linux/build_bug.h>
 #include <linux/types.h>
 #include <asm/barrier.h>
@ -13,146 +14,130 @@
 #ifdef CONFIG_ARC_HAS_LLSC
-static inline unsigned long
+/*
-__cmpxchg(volatile void *ptr, unsigned long expected, unsigned long new)
+ * if (*ptr == @old)
-{
+ *      *ptr = @new
-	unsigned long prev;
+ */
-
+#define __cmpxchg(ptr, old, new)					\
-	/*
+({									\
-	 * Explicit full memory barrier needed before/after as
+	__typeof__(*(ptr)) _prev;					\
-	 * LLOCK/SCOND themselves don't provide any such semantics
+									\
-	 */
+	__asm__ __volatile__(						\
-	smp_mb();
+	"1:	llock  %0, [%1]	\n"					\
-
+	"	brne   %0, %2, 2f	\n"				\
-	__asm__ __volatile__(
+	"	scond  %3, [%1]	\n"					\
-	"1:	llock   %0, [%1]	\n"
+	"	bnz     1b		\n"				\
-	"	brne    %0, %2, 2f	\n"
+	"2:				\n"				\
-	"	scond   %3, [%1]	\n"
+	: "=&r"(_prev)	/* Early clobber prevent reg reuse */		\
-	"	bnz     1b		\n"
+	: "r"(ptr),	/* Not "m": llock only supports reg */		\
-	"2:				\n"
+	  "ir"(old),							\
-	: "=&r"(prev)	/* Early clobber, to prevent reg reuse */
+	  "r"(new)	/* Not "ir": scond can't take LIMM */		\
-	: "r"(ptr),	/* Not "m": llock only supports reg direct addr mode */
+	: "cc",								\
-	  "ir"(expected),
+	  "memory");	/* gcc knows memory is clobbered */		\
-	  "r"(new)	/* can't be "ir". scond can't take LIMM for "b" */
+									\
-	: "cc", "memory"); /* so that gcc knows memory is being written here */
+	_prev;								\
 	smp_mb();
 	return prev;
 }
 #else /* !CONFIG_ARC_HAS_LLSC */
 /*
 * Lock-based compare-and-exchange for cores without LLOCK/SCOND.
 *
 * If the word at @ptr equals @expected it is replaced by @new.
 * Returns the value the word held before the operation, whether or not
 * the store happened.
 */
 static inline unsigned long
 __cmpxchg(volatile void *ptr, unsigned long expected, unsigned long new)
 {
 	unsigned long flags;
 	/* same type as the word and the return value: no int round-trip */
 	unsigned long prev;
 	volatile unsigned long *p = ptr;
 	/*
 	 * spin lock/unlock provide the needed smp_mb() before/after
 	 */
 	atomic_ops_lock(flags);
 	prev = *p;
 	if (prev == expected)
 		*p = new;
 	atomic_ops_unlock(flags);
 	return prev;
 }
 #endif
 /*
 * Type-preserving front end for __cmpxchg(): @o and @n are passed down as
 * unsigned long and the result is cast back to the pointee type of @ptr.
 */
 #define arch_cmpxchg(ptr, o, n) ({			\
 	(typeof(*(ptr)))__cmpxchg((ptr),		\
 				  (unsigned long)(o),	\
 				  (unsigned long)(n));	\
 })
-/*
+#define arch_cmpxchg_relaxed(ptr, old, new)				\
- * atomic_cmpxchg is same as cmpxchg
+({									\
- *   LLSC: only different in data-type, semantics are exactly same
+	__typeof__(ptr) _p_ = (ptr);					\
- *  !LLSC: cmpxchg() has to use an external lock atomic_ops_lock to guarantee
+	__typeof__(*(ptr)) _o_ = (old);					\
- *         semantics, and this lock also happens to be used by atomic_*()
+	__typeof__(*(ptr)) _n_ = (new);					\
- */
+	__typeof__(*(ptr)) _prev_;					\
-#define arch_atomic_cmpxchg(v, o, n) ((int)arch_cmpxchg(&((v)->counter), (o), (n)))
+									\
-
+	switch(sizeof((_p_))) {						\
-
+	case 4:								\
-/*
+		_prev_ = __cmpxchg(_p_, _o_, _n_);			\
- * xchg (reg with memory) based on "Native atomic" EX insn
+		break;							\
- */
+	default:							\
-static inline unsigned long __xchg(unsigned long val, volatile void *ptr,
+		BUILD_BUG();						\
-				   int size)
+	}								\
-{
+	_prev_;								\
 	extern unsigned long __xchg_bad_pointer(void);
 	switch (size) {
 	case 4:
 		smp_mb();
 		__asm__ __volatile__(
 		"	ex  %0, [%1]	\n"
 		: "+r"(val)
 		: "r"(ptr)
 		: "memory");
 		smp_mb();
 		return val;
 	}
 	return __xchg_bad_pointer();
 }
 #define _xchg(ptr, with) ((typeof(*(ptr)))__xchg((unsigned long)(with), (ptr), \
 						 sizeof(*(ptr))))
 /*
 * xchg() maps directly to ARC EX instruction which guarantees atomicity.
 * However in !LLSC config, it also needs to use the @atomic_ops_lock spinlock
 * due to a subtle reason:
 *  - For !LLSC, cmpxchg() needs to use that lock (see above) and there is lot
 *    of  kernel code which calls xchg()/cmpxchg() on same data (see llist.h)
 *    Hence xchg() needs to follow same locking rules.
 *
 * Technically the lock is also needed for UP (boils down to irq save/restore)
 * but we can cheat a bit since cmpxchg() atomic_ops_lock() would cause irqs to
 * be disabled thus can't possibly be interrupted/preempted/clobbered by xchg()
 * Other way around, xchg is one instruction anyways, so can't be interrupted
 * as such
 */
 #if !defined(CONFIG_ARC_HAS_LLSC) && defined(CONFIG_SMP)
 /*
 * Runs _xchg() with atomic_ops_lock held and evaluates to the value
 * _xchg() returned.
 */
 #define arch_xchg(ptr, with)		\
 ({					\
 	unsigned long flags;		\
 	typeof(*(ptr)) old_val;		\
 					\
 	atomic_ops_lock(flags);		\
 	old_val = _xchg(ptr, with);	\
 	atomic_ops_unlock(flags);	\
 	old_val;			\
 })
 #else
-#define arch_xchg(ptr, with)  _xchg(ptr, with)
+#define arch_cmpxchg(ptr, old, new)				        \
 ({									\
 	/*								\
 	 * Lock-based cmpxchg: if (*ptr == old) *ptr = new;		\
 	 * evaluates to the value *ptr held before the operation.	\
 	 */								\
 	volatile __typeof__(ptr) _p_ = (ptr);				\
 	__typeof__(*(ptr)) _o_ = (old);					\
 	__typeof__(*(ptr)) _n_ = (new);					\
 	__typeof__(*(ptr)) _prev_;					\
 	unsigned long __flags;						\
 									\
 	/* check the width of the object, not of the pointer to it */	\
 	BUILD_BUG_ON(sizeof(*(_p_)) != 4);				\
 									\
 	/*								\
 	 * spin lock/unlock provide the needed smp_mb() before/after	\
 	 */								\
 	atomic_ops_lock(__flags);					\
 	_prev_ = *_p_;							\
 	if (_prev_ == _o_)						\
 		*_p_ = _n_;						\
 	atomic_ops_unlock(__flags);					\
 	_prev_;								\
 })
 #endif
 /*
- * "atomic" variant of xchg()
+ * xchg
 * REQ: It needs to follow the same serialization rules as other atomic_xxx()
 * Since xchg() doesn't always do that, it would seem that following definition
 * is incorrect. But here's the rationale:
 *   SMP : Even xchg() takes the atomic_ops_lock, so OK.
 *   LLSC: atomic_ops_lock are not relevant at all (even if SMP, since LLSC
 *         is natively "SMP safe", no serialization required).
 *   UP  : other atomics disable IRQ, so no way a difft ctxt atomic_xchg()
 *         could clobber them. atomic_xchg() itself would be 1 insn, so it
 *         can't be clobbered by others. Thus no serialization required when
 *         atomic_xchg is involved.
 */
-#define arch_atomic_xchg(v, new) (arch_xchg(&((v)->counter), new))
+#ifdef CONFIG_ARC_HAS_LLSC
 /*
 * Swap @val with the word at @ptr using the EX instruction.
 * @val must be a modifiable lvalue: it is the asm's in/out operand, and the
 * statement expression evaluates to its contents after the exchange.
 * The macro refers only to its own parameters, so it does not depend on how
 * the caller names its locals.
 */
 #define __xchg(ptr, val)						\
 ({									\
 	__asm__ __volatile__(						\
 	"	ex  %0, [%1]	\n"	/* set new value */	        \
 	: "+r"(val)							\
 	: "r"(ptr)							\
 	: "memory");							\
 	val;		/* get old value */				\
 })
 /*
 * xchg() without any smp_mb() issued here: hands a 4-byte object to __xchg()
 * and evaluates to the value __xchg() produced. Any other object size is a
 * build failure.
 */
 #define arch_xchg_relaxed(ptr, val)					\
 ({									\
 	__typeof__(ptr) _p_ = (ptr);					\
 	__typeof__(*(ptr)) _val_ = (val);				\
 									\
 	if (sizeof(*(_p_)) == 4)					\
 		_val_ = __xchg(_p_, _val_);				\
 	else								\
 		BUILD_BUG();						\
 									\
 	_val_;								\
 })
 #else  /* !CONFIG_ARC_HAS_LLSC */
 /*
 * The EX instruction is baseline and present in !LLSC too. But in this
 * regime it still needs to use the @atomic_ops_lock spinlock to allow interop
 * with cmpxchg(), which uses the spinlock in !LLSC
 * (llist.h uses xchg and cmpxchg on the same data)
 */
 #define arch_xchg(ptr, val)					        \
 ({									\
 	__typeof__(ptr) _p_ = (ptr);					\
 	__typeof__(*(ptr)) _val_ = (val);				\
 									\
 	unsigned long __flags;						\
 									\
 	atomic_ops_lock(__flags);					\
 									\
 	__asm__ __volatile__(						\
 	"	ex  %0, [%1]	\n"					\
 	: "+r"(_val_)							\
 	: "r"(_p_)							\
 	: "memory");							\
 									\
 	atomic_ops_unlock(__flags);					\
 	_val_;								\
 })
 #endif
 #endif
--- a/arch/arc/include/asm/entry-compact.h
+++ b/arch/arc/include/asm/entry-compact.h
@ -126,19 +126,11 @@
 * to be saved again on kernel mode stack, as part of pt_regs.
 *-------------------------------------------------------------*/
 /*
 * Save \reg: into aux register ARC_REG_SCRATCH_DATA0 when ARC_USE_SCRATCH_REG
 * is not defined, otherwise into the memory location \mem.
 */
 .macro PROLOG_FREEUP_REG	reg, mem
 #ifndef ARC_USE_SCRATCH_REG
 	sr  \reg, [ARC_REG_SCRATCH_DATA0]
 #else
 	st  \reg, [\mem]
 #endif
 .endm
 /*
 * Reload \reg from where PROLOG_FREEUP_REG put it: aux register
 * ARC_REG_SCRATCH_DATA0 when ARC_USE_SCRATCH_REG is not defined, otherwise
 * the memory location \mem.
 */
 .macro PROLOG_RESTORE_REG	reg, mem
 #ifndef ARC_USE_SCRATCH_REG
 	lr  \reg, [ARC_REG_SCRATCH_DATA0]
 #else
 	ld  \reg, [\mem]
 #endif
 .endm
 /*--------------------------------------------------------------
--- a/arch/arc/include/asm/hugepage.h
+++ b/arch/arc/include/asm/hugepage.h
@ -58,14 +58,6 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
 				 pmd_t *pmd);
 /* Generic variants assume pgtable_t is struct page *, hence need for these */
 #define __HAVE_ARCH_PGTABLE_DEPOSIT
 extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 				       pgtable_t pgtable);
 #define __HAVE_ARCH_PGTABLE_WITHDRAW
 extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
 extern void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start,
 				unsigned long end);
--- a/arch/arc/include/asm/mmu-arcv2.h
+++ b/arch/arc/include/asm/mmu-arcv2.h
@ -0,0 +1,103 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
 * Copyright (C) 2004, 2007-2010, 2011-2012, 2019-20 Synopsys, Inc. (www.synopsys.com)
 *
 * MMUv3 (arc700) / MMUv4 (archs) are software page walked and software managed.
 * This file contains the TLB access registers and commands
 */
 #ifndef _ASM_ARC_MMU_ARCV2_H
 #define _ASM_ARC_MMU_ARCV2_H
 /*
 * TLB Management regs
 */
 #define ARC_REG_MMU_BCR		0x06f
 #ifdef CONFIG_ARC_MMU_V3
 #define ARC_REG_TLBPD0		0x405
 #define ARC_REG_TLBPD1		0x406
 #define ARC_REG_TLBPD1HI	0	/* Dummy: allows common code */
 #define ARC_REG_TLBINDEX	0x407
 #define ARC_REG_TLBCOMMAND	0x408
 #define ARC_REG_PID		0x409
 #define ARC_REG_SCRATCH_DATA0	0x418
 #else
 #define ARC_REG_TLBPD0		0x460
 #define ARC_REG_TLBPD1		0x461
 #define ARC_REG_TLBPD1HI	0x463
 #define ARC_REG_TLBINDEX	0x464
 #define ARC_REG_TLBCOMMAND	0x465
 #define ARC_REG_PID		0x468
 #define ARC_REG_SCRATCH_DATA0	0x46c
 #endif
 /* Bits in MMU PID reg */
 #define __TLB_ENABLE		(1 << 31)
 #define __PROG_ENABLE		(1 << 30)
 #define MMU_ENABLE		(__TLB_ENABLE | __PROG_ENABLE)
 /* Bits in TLB Index reg */
 #define TLB_LKUP_ERR		0x80000000
 #ifdef CONFIG_ARC_MMU_V3
 #define TLB_DUP_ERR		(TLB_LKUP_ERR | 0x00000001)
 #else
 #define TLB_DUP_ERR		(TLB_LKUP_ERR | 0x40000000)
 #endif
 /*
 * TLB Commands
 */
 #define TLBWrite    		0x1
 #define TLBRead     		0x2
 #define TLBGetIndex 		0x3
 #define TLBProbe    		0x4
 #define TLBWriteNI		0x5  /* write JTLB without inv uTLBs */
 #define TLBIVUTLB		0x6  /* explicitly inv uTLBs */
 #ifdef CONFIG_ARC_MMU_V4
 #define TLBInsertEntry		0x7
 #define TLBDeleteEntry		0x8
 #endif
 /* Masks for actual TLB "PD"s */
 #define PTE_BITS_IN_PD0		(_PAGE_GLOBAL | _PAGE_PRESENT | _PAGE_HW_SZ)
 #define PTE_BITS_RWX		(_PAGE_EXECUTE | _PAGE_WRITE | _PAGE_READ)
 #define PTE_BITS_NON_RWX_IN_PD1	(PAGE_MASK_PHYS | _PAGE_CACHEABLE)
 #ifndef __ASSEMBLY__
 struct mm_struct;
 extern int pae40_exist_but_not_enab(void);
 /* Build-time query: result of IS_ENABLED(CONFIG_ARC_HAS_PAE40) */
 static inline int is_pae40_enabled(void)
 {
 	return IS_ENABLED(CONFIG_ARC_HAS_PAE40);
 }
 /*
 * Program the MMU PID register with @asid, keeping the MMU enable bits set.
 * @mm is not used.
 */
 static inline void mmu_setup_asid(struct mm_struct *mm, unsigned long asid)
 {
 	const unsigned long pid = asid | MMU_ENABLE;
 	write_aux_reg(ARC_REG_PID, pid);
 }
 /*
 * Cache @pgd in aux register ARC_REG_SCRATCH_DATA0.
 * Only done when CONFIG_ISA_ARCV2 is set; otherwise the function body is
 * empty. @mm is not used.
 */
 static inline void mmu_setup_pgd(struct mm_struct *mm, void *pgd)
 {
 	/* PGD cached in MMU reg to avoid 3 mem lookups: task->mm->pgd */
 #ifdef CONFIG_ISA_ARCV2
 	write_aux_reg(ARC_REG_SCRATCH_DATA0, (unsigned int)pgd);
 #endif
 }
 #else
 /*
 * Read-modify-write of ARC_REG_PID that ORs in the MMU_ENABLE bits.
 * \reg is used as scratch; its previous contents are overwritten.
 */
 .macro ARC_MMU_REENABLE reg
 	lr \reg, [ARC_REG_PID]
 	or \reg, \reg, MMU_ENABLE
 	sr \reg, [ARC_REG_PID]
 .endm
 #endif /* !__ASSEMBLY__ */
 #endif
--- a/arch/arc/include/asm/mmu.h
+++ b/arch/arc/include/asm/mmu.h
@ -7,98 +7,15 @@
 #define _ASM_ARC_MMU_H
 #ifndef __ASSEMBLY__
 #include <linux/threads.h>	/* NR_CPUS */
 #endif
 #if defined(CONFIG_ARC_MMU_V1)
 #define CONFIG_ARC_MMU_VER 1
 #elif defined(CONFIG_ARC_MMU_V2)
 #define CONFIG_ARC_MMU_VER 2
 #elif defined(CONFIG_ARC_MMU_V3)
 #define CONFIG_ARC_MMU_VER 3
 #elif defined(CONFIG_ARC_MMU_V4)
 #define CONFIG_ARC_MMU_VER 4
 #endif
 /* MMU Management regs */
 #define ARC_REG_MMU_BCR		0x06f
 #if (CONFIG_ARC_MMU_VER < 4)
 #define ARC_REG_TLBPD0		0x405
 #define ARC_REG_TLBPD1		0x406
 #define ARC_REG_TLBPD1HI	0	/* Dummy: allows code sharing with ARC700 */
 #define ARC_REG_TLBINDEX	0x407
 #define ARC_REG_TLBCOMMAND	0x408
 #define ARC_REG_PID		0x409
 #define ARC_REG_SCRATCH_DATA0	0x418
 #else
 #define ARC_REG_TLBPD0		0x460
 #define ARC_REG_TLBPD1		0x461
 #define ARC_REG_TLBPD1HI	0x463
 #define ARC_REG_TLBINDEX	0x464
 #define ARC_REG_TLBCOMMAND	0x465
 #define ARC_REG_PID		0x468
 #define ARC_REG_SCRATCH_DATA0	0x46c
 #endif
 #if defined(CONFIG_ISA_ARCV2) || !defined(CONFIG_SMP)
 #define	ARC_USE_SCRATCH_REG
 #endif
 /* Bits in MMU PID register */
 #define __TLB_ENABLE		(1 << 31)
 #define __PROG_ENABLE		(1 << 30)
 #define MMU_ENABLE		(__TLB_ENABLE | __PROG_ENABLE)
 /* Error code if probe fails */
 #define TLB_LKUP_ERR		0x80000000
 #if (CONFIG_ARC_MMU_VER < 4)
 #define TLB_DUP_ERR	(TLB_LKUP_ERR | 0x00000001)
 #else
 #define TLB_DUP_ERR	(TLB_LKUP_ERR | 0x40000000)
 #endif
 /* TLB Commands */
 #define TLBWrite    0x1
 #define TLBRead     0x2
 #define TLBGetIndex 0x3
 #define TLBProbe    0x4
 #if (CONFIG_ARC_MMU_VER >= 2)
 #define TLBWriteNI  0x5		/* write JTLB without inv uTLBs */
 #define TLBIVUTLB   0x6		/* explicitly inv uTLBs */
 #else
 #define TLBWriteNI  TLBWrite	/* Not present in hardware, fallback */
 #endif
 #if (CONFIG_ARC_MMU_VER >= 4)
 #define TLBInsertEntry	0x7
 #define TLBDeleteEntry	0x8
 #endif
 #ifndef __ASSEMBLY__
 typedef struct {
 	unsigned long asid[NR_CPUS];	/* 8 bit MMU PID + Generation cycle */
 } mm_context_t;
 #ifdef CONFIG_ARC_DBG_TLB_PARANOIA
 void tlb_paranoid_check(unsigned int mm_asid, unsigned long address);
 #else
 #define tlb_paranoid_check(a, b)
 #endif
-void arc_mmu_init(void);
+#include <asm/mmu-arcv2.h>
 extern char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len);
 void read_decode_mmu_bcr(void);
 static inline int is_pae40_enabled(void)
 {
 	return IS_ENABLED(CONFIG_ARC_HAS_PAE40);
 }
 extern int pae40_exist_but_not_enab(void);
 #endif	/* !__ASSEMBLY__ */
 #endif
--- a/arch/arc/include/asm/mmu_context.h
+++ b/arch/arc/include/asm/mmu_context.h
@ -15,22 +15,23 @@
 #ifndef _ASM_ARC_MMU_CONTEXT_H
 #define _ASM_ARC_MMU_CONTEXT_H
 #include <asm/arcregs.h>
 #include <asm/tlb.h>
 #include <linux/sched/mm.h>
 #include <asm/tlb.h>
 #include <asm-generic/mm_hooks.h>
-/*		ARC700 ASID Management
+/*		ARC ASID Management
 *
- * ARC MMU provides 8-bit ASID (0..255) to TAG TLB entries, allowing entries
+ * MMU tags TLBs with an 8-bit ASID, avoiding need to flush the TLB on
- * with same vaddr (different tasks) to co-exit. This provides for
+ * context-switch.
 * "Fast Context Switch" i.e. no TLB flush on ctxt-switch
 *
- * Linux assigns each task a unique ASID. A simple round-robin allocation
+ * ASID is managed per cpu, so task threads across CPUs can have different
- * of H/w ASID is done using software tracker @asid_cpu.
+ * ASID. Global ASID management is needed if hardware supports TLB shootdown
- * When it reaches max 255, the allocation cycle starts afresh by flushing
+ * and/or shared TLB across cores, which ARC doesn't.
- * the entire TLB and wrapping ASID back to zero.
+ *
 * Each task is assigned unique ASID, with a simple round-robin allocator
 * tracked in @asid_cpu. When 8-bit value rolls over, a new cycle is started
 * over from 0, and TLB is flushed
 *
 * A new allocation cycle, post rollover, could potentially reassign an ASID
 * to a different task. Thus the rule is to refresh the ASID in a new cycle.
@ -93,7 +94,7 @@ static inline void get_new_mmu_context(struct mm_struct *mm)
 	asid_mm(mm, cpu) = asid_cpu(cpu);
 set_hw:
-	write_aux_reg(ARC_REG_PID, hw_pid(mm, cpu) | MMU_ENABLE);
+	mmu_setup_asid(mm, hw_pid(mm, cpu));
 	local_irq_restore(flags);
 }
@ -146,10 +147,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	 */
 	cpumask_set_cpu(cpu, mm_cpumask(next));
-#ifdef ARC_USE_SCRATCH_REG
+	mmu_setup_pgd(next, next->pgd);
 	/* PGD cached in MMU reg to avoid 3 mem lookups: task->mm->pgd */
 	write_aux_reg(ARC_REG_SCRATCH_DATA0, next->pgd);
 #endif
 	get_new_mmu_context(next);
 }
--- a/arch/arc/include/asm/page.h
+++ b/arch/arc/include/asm/page.h
@ -34,12 +34,35 @@ void copy_user_highpage(struct page *to, struct page *from,
 			unsigned long u_vaddr, struct vm_area_struct *vma);
 void clear_user_page(void *to, unsigned long u_vaddr, struct page *page);
-#undef STRICT_MM_TYPECHECKS
+typedef struct {
 	unsigned long pgd;
 } pgd_t;
 #define pgd_val(x)	((x).pgd)
 #define __pgd(x)	((pgd_t) { (x) })
 #if CONFIG_PGTABLE_LEVELS > 3
 typedef struct {
 	unsigned long pud;
 } pud_t;
 #define pud_val(x)      	((x).pud)
 #define __pud(x)        	((pud_t) { (x) })
 #endif
 #if CONFIG_PGTABLE_LEVELS > 2
 typedef struct {
 	unsigned long pmd;
 } pmd_t;
 #define pmd_val(x)	((x).pmd)
 #define __pmd(x)	((pmd_t) { (x) })
 #endif
 #ifdef STRICT_MM_TYPECHECKS
 /*
 * These are used to make use of C type-checking..
 */
 typedef struct {
 #ifdef CONFIG_ARC_HAS_PAE40
 	unsigned long long pte;
@ -47,44 +70,19 @@ typedef struct {
 	unsigned long pte;
 #endif
 } pte_t;
-typedef struct {
+
-	unsigned long pgd;
+#define pte_val(x)	((x).pte)
-} pgd_t;
+#define __pte(x)	((pte_t) { (x) })
 typedef struct {
 	unsigned long pgprot;
 } pgprot_t;
-#define pte_val(x)      ((x).pte)
+#define pgprot_val(x)	((x).pgprot)
-#define pgd_val(x)      ((x).pgd)
+#define __pgprot(x)	((pgprot_t) { (x) })
-#define pgprot_val(x)   ((x).pgprot)
+#define pte_pgprot(x)	__pgprot(pte_val(x))
-#define __pte(x)        ((pte_t) { (x) })
+typedef struct page *pgtable_t;
 #define __pgd(x)        ((pgd_t) { (x) })
 #define __pgprot(x)     ((pgprot_t) { (x) })
 #define pte_pgprot(x) __pgprot(pte_val(x))
 #else /* !STRICT_MM_TYPECHECKS */
 #ifdef CONFIG_ARC_HAS_PAE40
 typedef unsigned long long pte_t;
 #else
 typedef unsigned long pte_t;
 #endif
 typedef unsigned long pgd_t;
 typedef unsigned long pgprot_t;
 #define pte_val(x)	(x)
 #define pgd_val(x)	(x)
 #define pgprot_val(x)	(x)
 #define __pte(x)	(x)
 #define __pgd(x)	(x)
 #define __pgprot(x)	(x)
 #define pte_pgprot(x)	(x)
 #endif
 typedef pte_t * pgtable_t;
 /*
 * Use virt_to_pfn with caution:
@ -122,8 +120,8 @@ extern int pfn_valid(unsigned long pfn);
 * virt here means link-address/program-address as embedded in object code.
 * And for ARC, link-addr = physical address
 */
-#define __pa(vaddr)  ((unsigned long)(vaddr))
+#define __pa(vaddr)  		((unsigned long)(vaddr))
-#define __va(paddr)  ((void *)((unsigned long)(paddr)))
+#define __va(paddr)  		((void *)((unsigned long)(paddr)))
 #define virt_to_page(kaddr)	pfn_to_page(virt_to_pfn(kaddr))
 #define virt_addr_valid(kaddr)  pfn_valid(virt_to_pfn(kaddr))
--- a/arch/arc/include/asm/pgalloc.h
+++ b/arch/arc/include/asm/pgalloc.h
@ -31,30 +31,32 @@
 #include <linux/mm.h>
 #include <linux/log2.h>
 #include <asm-generic/pgalloc.h>
 static inline void
 pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
 {
-	pmd_set(pmd, pte);
+	/*
 	 * The cast to long below is OK in 32-bit PAE40 regime with long long pte
 	 * Despite "wider" pte, the pte table needs to be in non-PAE low memory
 	 * as all higher levels can only hold long pointers.
 	 *
 	 * The cast itself is needed given simplistic definition of set_pmd()
 	 */
 	set_pmd(pmd, __pmd((unsigned long)pte));
 }
-static inline void
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page)
 pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t ptep)
 {
-	pmd_set(pmd, (pte_t *) ptep);
+	set_pmd(pmd, __pmd((unsigned long)page_address(pte_page)));
 }
 /* Allocation order passed to the page allocator for one full PGD table */
 static inline int __get_order_pgd(void)
 {
 	const unsigned long pgd_bytes = PTRS_PER_PGD * sizeof(pgd_t);
 	return get_order(pgd_bytes);
 }
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	int num, num2;
+	pgd_t *ret = (pgd_t *) __get_free_page(GFP_KERNEL);
 	pgd_t *ret = (pgd_t *) __get_free_pages(GFP_KERNEL, __get_order_pgd());
 	if (ret) {
 		int num, num2;
 		num = USER_PTRS_PER_PGD + USER_KERNEL_GUTTER / PGDIR_SIZE;
 		memzero(ret, num * sizeof(pgd_t));
@ -68,64 +70,27 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 	return ret;
 }
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+#if CONFIG_PGTABLE_LEVELS > 3
 static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp)
 {
-	free_pages((unsigned long)pgd, __get_order_pgd());
+	set_p4d(p4dp, __p4d((unsigned long)pudp));
 }
 #define __pud_free_tlb(tlb, pmd, addr)  pud_free((tlb)->mm, pmd)
-/*
+#endif
 * With software-only page-tables, addr-split for traversal is tweakable and
 * that directly governs how big tables would be at each level.
 * Further, the MMU page size is configurable.
 * Thus we need to programmatically assert the size constraint
 * All of this is const math, allowing gcc to do constant folding/propagation.
 */
-static inline int __get_order_pte(void)
+#if CONFIG_PGTABLE_LEVELS > 2
 static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmdp)
 {
-	return get_order(PTRS_PER_PTE * sizeof(pte_t));
+	set_pud(pudp, __pud((unsigned long)pmdp));
 }
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
+#define __pmd_free_tlb(tlb, pmd, addr)  pmd_free((tlb)->mm, pmd)
 {
 	pte_t *pte;
-	pte = (pte_t *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+#endif
 					 __get_order_pte());
 	return pte;
 }
 /*
 * Allocate one user page table, zero it and run the page-table constructor
 * on its backing page.
 * Returns the table, or 0 if either the allocation or the constructor fails.
 */
 static inline pgtable_t
 pte_alloc_one(struct mm_struct *mm)
 {
 	pgtable_t pte_pg;
 	struct page *page;
 	pte_pg = (pgtable_t)__get_free_pages(GFP_KERNEL, __get_order_pte());
 	if (!pte_pg)
 		return 0;
 	memzero((void *)pte_pg, PTRS_PER_PTE * sizeof(pte_t));
 	page = virt_to_page(pte_pg);
 	if (!pgtable_pte_page_ctor(page)) {
 		/* give back the whole allocation, not only its first page */
 		__free_pages(page, __get_order_pte());
 		return 0;
 	}
 	return pte_pg;
 }
 /* Release the kernel page table at @pte via free_pages(); @mm is not used */
 static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	free_pages((unsigned long)pte, __get_order_pte()); /* takes phy addr */
 }
 /*
 * Release a user page table: run the page-table destructor on its backing
 * page first, then free the pages. @mm is not used.
 */
 static inline void pte_free(struct mm_struct *mm, pgtable_t ptep)
 {
 	pgtable_pte_page_dtor(virt_to_page(ptep));
 	free_pages((unsigned long)ptep, __get_order_pte());
 }
 #define __pte_free_tlb(tlb, pte, addr)  pte_free((tlb)->mm, pte)
--- a/arch/arc/include/asm/pgtable-bits-arcv2.h
+++ b/arch/arc/include/asm/pgtable-bits-arcv2.h
@ -0,0 +1,149 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
 * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
 */
 /*
 * page table flags for software walked/managed MMUv3 (ARC700) and MMUv4 (HS)
 * These correspond to the matching bits in the TLB
 */
 #ifndef _ASM_ARC_PGTABLE_BITS_ARCV2_H
 #define _ASM_ARC_PGTABLE_BITS_ARCV2_H
 #ifdef CONFIG_ARC_CACHE_PAGES
 #define _PAGE_CACHEABLE		(1 << 0)  /* Cached (H) */
 #else
 #define _PAGE_CACHEABLE		0
 #endif
 #define _PAGE_EXECUTE		(1 << 1)  /* User Execute  (H) */
 #define _PAGE_WRITE		(1 << 2)  /* User Write    (H) */
 #define _PAGE_READ		(1 << 3)  /* User Read     (H) */
 #define _PAGE_ACCESSED		(1 << 4)  /* Accessed      (s) */
 #define _PAGE_DIRTY		(1 << 5)  /* Modified      (s) */
 #define _PAGE_SPECIAL		(1 << 6)
 #define _PAGE_GLOBAL		(1 << 8)  /* ASID agnostic (H) */
 #define _PAGE_PRESENT		(1 << 9)  /* PTE/TLB Valid (H) */
 #ifdef CONFIG_ARC_MMU_V4
 #define _PAGE_HW_SZ		(1 << 10)  /* Normal/super (H) */
 #else
 #define _PAGE_HW_SZ		0
 #endif
 /* Defaults for every user page */
 #define ___DEF		(_PAGE_PRESENT | _PAGE_CACHEABLE)
 /* Set of bits not changed in pte_modify */
 #define _PAGE_CHG_MASK	(PAGE_MASK_PHYS | _PAGE_ACCESSED | _PAGE_DIRTY | \
 							   _PAGE_SPECIAL)
 /* More Abbreviated helpers */
 #define PAGE_U_NONE     __pgprot(___DEF)
 #define PAGE_U_R        __pgprot(___DEF | _PAGE_READ)
 #define PAGE_U_W_R      __pgprot(___DEF | _PAGE_READ | _PAGE_WRITE)
 #define PAGE_U_X_R      __pgprot(___DEF | _PAGE_READ | _PAGE_EXECUTE)
 #define PAGE_U_X_W_R    __pgprot(___DEF \
 				| _PAGE_READ | _PAGE_WRITE | _PAGE_EXECUTE)
 #define PAGE_KERNEL     __pgprot(___DEF | _PAGE_GLOBAL \
 				| _PAGE_READ | _PAGE_WRITE | _PAGE_EXECUTE)
 #define PAGE_SHARED	PAGE_U_W_R
 #define pgprot_noncached(prot)	(__pgprot(pgprot_val(prot) & ~_PAGE_CACHEABLE))
 /*
 * Mapping of vm_flags (Generic VM) to PTE flags (arch specific)
 *
 * Certain cases have 1:1 mapping
 *  e.g. __P101 means VM_READ, VM_EXEC and !VM_SHARED
 *       which directly corresponds to  PAGE_U_X_R
 *
 * Other rules which cause the divergence from 1:1 mapping
 *
 *  1. Although ARC700 can do exclusive execute/write protection (meaning R
 *     can be tracked independent of X/W unlike some other CPUs), still to
 *     keep things consistent with other archs:
 *      -Write implies Read:   W => R
 *      -Execute implies Read: X => R
 *
 *  2. Pvt Writable doesn't have Write Enabled initially: Pvt-W => !W
 *     This is to enable COW mechanism
 */
 	/* xwr */
 #define __P000  PAGE_U_NONE
 #define __P001  PAGE_U_R
 #define __P010  PAGE_U_R	/* Pvt-W => !W */
 #define __P011  PAGE_U_R	/* Pvt-W => !W */
 #define __P100  PAGE_U_X_R	/* X => R */
 #define __P101  PAGE_U_X_R
 #define __P110  PAGE_U_X_R	/* Pvt-W => !W and X => R */
 #define __P111  PAGE_U_X_R	/* Pvt-W => !W */
 #define __S000  PAGE_U_NONE
 #define __S001  PAGE_U_R
 #define __S010  PAGE_U_W_R	/* W => R */
 #define __S011  PAGE_U_W_R
 #define __S100  PAGE_U_X_R	/* X => R */
 #define __S101  PAGE_U_X_R
 #define __S110  PAGE_U_X_W_R	/* X => R */
 #define __S111  PAGE_U_X_W_R
 #ifndef __ASSEMBLY__
 #define pte_write(pte)		(pte_val(pte) & _PAGE_WRITE)
 #define pte_dirty(pte)		(pte_val(pte) & _PAGE_DIRTY)
 #define pte_young(pte)		(pte_val(pte) & _PAGE_ACCESSED)
 #define pte_special(pte)	(pte_val(pte) & _PAGE_SPECIAL)
 /*
 * Generate pte_<fn>(): takes a pte by value, applies the compound assignment
 * @op to its raw value and returns the modified copy.
 */
 #define PTE_BIT_FUNC(fn, op) \
 	static inline pte_t pte_##fn(pte_t pte) { pte_val(pte) op; return pte; }
 PTE_BIT_FUNC(mknotpresent,     &= ~(_PAGE_PRESENT));
 PTE_BIT_FUNC(wrprotect,	&= ~(_PAGE_WRITE));
 PTE_BIT_FUNC(mkwrite,	|= (_PAGE_WRITE));
 PTE_BIT_FUNC(mkclean,	&= ~(_PAGE_DIRTY));
 PTE_BIT_FUNC(mkdirty,	|= (_PAGE_DIRTY));
 PTE_BIT_FUNC(mkold,	&= ~(_PAGE_ACCESSED));
 PTE_BIT_FUNC(mkyoung,	|= (_PAGE_ACCESSED));
 PTE_BIT_FUNC(mkspecial,	|= (_PAGE_SPECIAL));
 PTE_BIT_FUNC(mkhuge,	|= (_PAGE_HW_SZ));
 /*
 * Build a pte that keeps the _PAGE_CHG_MASK bits of @pte and takes every
 * other bit from @newprot.
 */
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 {
 	pte_t kept = __pte(pte_val(pte) & _PAGE_CHG_MASK);
 	return __pte(pte_val(kept) | pgprot_val(newprot));
 }
 /* Store @pteval through @ptep via set_pte(); @mm and @addr are not used */
 static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep, pte_t pteval)
 {
 	set_pte(ptep, pteval);
 }
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 		      pte_t *ptep);
 /* Encode swap {type,off} tuple into PTE
 * We reserve 13 bits for 5-bit @type, keeping bits 12-5 zero, ensuring that
 * PAGE_PRESENT is zero in a PTE holding swap "identifier"
 */
 #define __swp_entry(type, off)		((swp_entry_t) \
 					{ ((type) & 0x1f) | ((off) << 13) })
 /* Decode a PTE containing swap "identifier" into constituents */
 #define __swp_type(pte_lookalike)	(((pte_lookalike).val) & 0x1f)
 #define __swp_offset(pte_lookalike)	((pte_lookalike).val >> 13)
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val })
 #define kern_addr_valid(addr)	(1)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #include <asm/hugepage.h>
 #endif
 #endif /* __ASSEMBLY__ */
 #endif
--- a/arch/arc/include/asm/pgtable-levels.h
+++ b/arch/arc/include/asm/pgtable-levels.h
@ -0,0 +1,189 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
 * Copyright (C) 2020 Synopsys, Inc. (www.synopsys.com)
 */
 /*
 * Helpers for implementing paging levels
 */
 #ifndef _ASM_ARC_PGTABLE_LEVELS_H
 #define _ASM_ARC_PGTABLE_LEVELS_H
 #if CONFIG_PGTABLE_LEVELS == 2
 /*
 * 2 level paging setup for software walked MMUv3 (ARC700) and MMUv4 (HS)
 *
 * [31]            32 bit virtual address              [0]
 * -------------------------------------------------------
 * |               | <---------- PGDIR_SHIFT ----------> |
 * |               |                | <-- PAGE_SHIFT --> |
 * -------------------------------------------------------
 *       |                  |                |
 *       |                  |                --> off in page frame
 *       |                  ---> index into Page Table
 *       ----> index into Page Directory
 *
 * Given software walk, the vaddr split is arbitrarily set to 11:8:13
 * However enabling of super page in a 2 level regime pegs PGDIR_SHIFT to
 * super page size.
 */
 #if defined(CONFIG_ARC_HUGEPAGE_16M)
 #define PGDIR_SHIFT		24
 #elif defined(CONFIG_ARC_HUGEPAGE_2M)
 #define PGDIR_SHIFT		21
 #else
 /*
 * No Super page case
 * Default value provides 11:8:13 (8K), 10:10:12 (4K)
 * Limits imposed by pgtable_t only PAGE_SIZE long
 * (so 4K page can only have 1K entries: or 10 bits)
 */
 #ifdef CONFIG_ARC_PAGE_SIZE_4K
 #define PGDIR_SHIFT		22
 #else
 #define PGDIR_SHIFT		21
 #endif
 #endif
 #else /* CONFIG_PGTABLE_LEVELS != 2 */
 /*
 * A default 3 level paging testing setup in software walked MMU
 *   MMUv4 (8K page): <4> : <7> : <8> : <13>
 * A default 4 level paging testing setup in software walked MMU
 *   MMUv4 (8K page): <4> : <3> : <4> : <8> : <13>
 */
 #define PGDIR_SHIFT		28
 #if CONFIG_PGTABLE_LEVELS > 3
 #define PUD_SHIFT		25
 #endif
 #if CONFIG_PGTABLE_LEVELS > 2
 #define PMD_SHIFT		21
 #endif
 #endif /* CONFIG_PGTABLE_LEVELS */
 #define PGDIR_SIZE		BIT(PGDIR_SHIFT)
 #define PGDIR_MASK		(~(PGDIR_SIZE - 1))
 #define PTRS_PER_PGD		BIT(32 - PGDIR_SHIFT)
 #if CONFIG_PGTABLE_LEVELS > 3
 #define PUD_SIZE		BIT(PUD_SHIFT)
 #define PUD_MASK		(~(PUD_SIZE - 1))
 #define PTRS_PER_PUD		BIT(PGDIR_SHIFT - PUD_SHIFT)
 #endif
 #if CONFIG_PGTABLE_LEVELS > 2
 #define PMD_SIZE		BIT(PMD_SHIFT)
 #define PMD_MASK		(~(PMD_SIZE - 1))
 #define PTRS_PER_PMD		BIT(PUD_SHIFT - PMD_SHIFT)
 #endif
 #define PTRS_PER_PTE		BIT(PMD_SHIFT - PAGE_SHIFT)
 #ifndef __ASSEMBLY__
 #if CONFIG_PGTABLE_LEVELS > 3
 #include <asm-generic/pgtable-nop4d.h>
 #elif CONFIG_PGTABLE_LEVELS > 2
 #include <asm-generic/pgtable-nopud.h>
 #else
 #include <asm-generic/pgtable-nopmd.h>
 #endif
 /*
 * 1st level paging: pgd
 */
 #define pgd_index(addr)		((addr) >> PGDIR_SHIFT)
 #define pgd_offset(mm, addr)	(((mm)->pgd) + pgd_index(addr))
 #define pgd_offset_k(addr)	pgd_offset(&init_mm, addr)
 #define pgd_ERROR(e) \
 	pr_crit("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
 #if CONFIG_PGTABLE_LEVELS > 3
 /* In 4 level paging, p4d_* macros work on pgd */
 #define p4d_none(x)		(!p4d_val(x))
 #define p4d_bad(x)		((p4d_val(x) & ~PAGE_MASK))
 #define p4d_present(x)		(p4d_val(x))
 #define p4d_clear(xp)		do { p4d_val(*(xp)) = 0; } while (0)
 #define p4d_pgtable(p4d)	((pud_t *)(p4d_val(p4d) & PAGE_MASK))
 #define p4d_page(p4d)		virt_to_page(p4d_pgtable(p4d))
 #define set_p4d(p4dp, p4d)	(*(p4dp) = p4d)
 /*
 * 2nd level paging: pud
 */
 #define pud_ERROR(e) \
 	pr_crit("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e))
 #endif
 #if CONFIG_PGTABLE_LEVELS > 2
 /*
 * In 3 level paging, pud_* macros work on pgd
 * In 4 level paging, pud_* macros work on pud
 */
 #define pud_none(x)		(!pud_val(x))
 #define pud_bad(x)		((pud_val(x) & ~PAGE_MASK))
 #define pud_present(x)		(pud_val(x))
 #define pud_clear(xp)		do { pud_val(*(xp)) = 0; } while (0)
 #define pud_pgtable(pud)	((pmd_t *)(pud_val(pud) & PAGE_MASK))
 #define pud_page(pud)		virt_to_page(pud_pgtable(pud))
 #define set_pud(pudp, pud)	(*(pudp) = pud)
 /*
 * 3rd level paging: pmd
 */
 #define pmd_ERROR(e) \
 	pr_crit("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e))
 #define pmd_pfn(pmd)		((pmd_val(pmd) & PMD_MASK) >> PAGE_SHIFT)
 #define pfn_pmd(pfn,prot)	__pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
 #define mk_pmd(page,prot)	pfn_pmd(page_to_pfn(page),prot)
 #endif
 /*
 * Due to the strange way generic pgtable level folding works, the pmd_* macros
 *  - are valid even for 2 levels (which supposedly only has pgd - pte)
 *  - behave differently for 2 vs. 3
 * In 2  level paging        (pgd -> pte), pmd_* macros work on pgd
 * In 3+ level paging (pgd -> pmd -> pte), pmd_* macros work on pmd
 */
 #define pmd_none(x)		(!pmd_val(x))
 #define pmd_bad(x)		((pmd_val(x) & ~PAGE_MASK))
 #define pmd_present(x)		(pmd_val(x))
 #define pmd_clear(xp)		do { pmd_val(*(xp)) = 0; } while (0)
 #define pmd_page_vaddr(pmd)	(pmd_val(pmd) & PAGE_MASK)
 #define pmd_page(pmd)		virt_to_page(pmd_page_vaddr(pmd))
 #define set_pmd(pmdp, pmd)	(*(pmdp) = pmd)
 #define pmd_pgtable(pmd)	((pgtable_t) pmd_page_vaddr(pmd))
 /*
 * 4th level paging: pte
 */
 #define pte_ERROR(e) \
 	pr_crit("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
 #define pte_none(x)		(!pte_val(x))
 #define pte_present(x)		(pte_val(x) & _PAGE_PRESENT)
 #define pte_clear(mm,addr,ptep)	set_pte_at(mm, addr, ptep, __pte(0))
 #define pte_page(pte)		pfn_to_page(pte_pfn(pte))
 #define set_pte(ptep, pte)	((*(ptep)) = (pte))
 #define pte_pfn(pte)		(pte_val(pte) >> PAGE_SHIFT)
 #define pfn_pte(pfn, prot)	__pte(__pfn_to_phys(pfn) | pgprot_val(prot))
 #define mk_pte(page, prot)	pfn_pte(page_to_pfn(page), prot)
 #ifdef CONFIG_ISA_ARCV2
 #define pmd_leaf(x)		(pmd_val(x) & _PAGE_HW_SZ)
 #endif
 #endif	/* !__ASSEMBLY__ */
 #endif
--- a/arch/arc/include/asm/pgtable.h
+++ b/arch/arc/include/asm/pgtable.h
@ -1,220 +1,17 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
 * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
 *
 * vineetg: May 2011
 *  -Folded PAGE_PRESENT (used by VM) and PAGE_VALID (used by MMU) into 1.
 *     They are semantically the same although in different contexts
 *     VALID marks a TLB entry exists and it will only happen if PRESENT
 *  - Utilise some unused free bits to confine PTE flags to 12 bits
 *     This is a must for 4k pg-sz
 *
 * vineetg: Mar 2011 - changes to accommodate MMU TLB Page Descriptor mods
 *  -TLB Locking never really existed, except for initial specs
 *  -SILENT_xxx not needed for our port
 *  -Per my request, MMU V3 changes the layout of some of the bits
 *     to avoid a few shifts in TLB Miss handlers.
 *
 * vineetg: April 2010
 *  -PGD entry no longer contains any flags. If empty it is 0, otherwise has
 *   Pg-Tbl ptr. Thus pmd_present(), pmd_valid(), pmd_set( ) become simpler
 *
 * vineetg: April 2010
 *  -Switched form 8:11:13 split for page table lookup to 11:8:13
 *  -this speeds up page table allocation itself as we now have to memset 1K
 *    instead of 8k per page table.
 * -TODO: Right now page table alloc is 8K and rest 7K is unused
 *    need to optimise it
 *
 * Amit Bhor, Sameer Dhavale: Codito Technologies 2004
 */
 #ifndef _ASM_ARC_PGTABLE_H
 #define _ASM_ARC_PGTABLE_H
 #include <linux/bits.h>
-#include <asm-generic/pgtable-nopmd.h>
+
 #include <asm/pgtable-levels.h>
 #include <asm/pgtable-bits-arcv2.h>
 #include <asm/page.h>
-#include <asm/mmu.h>	/* to propagate CONFIG_ARC_MMU_VER <n> */
+#include <asm/mmu.h>
 /**************************************************************************
 * Page Table Flags
 *
 * ARC700 MMU only deals with softare managed TLB entries.
 * Page Tables are purely for Linux VM's consumption and the bits below are
 * suited to that (uniqueness). Hence some are not implemented in the TLB and
 * some have different value in TLB.
 * e.g. MMU v2: K_READ bit is 8 and so is GLOBAL (possible because they live in
 *      seperate PD0 and PD1, which combined forms a translation entry)
 *      while for PTE perspective, they are 8 and 9 respectively
 * with MMU v3: Most bits (except SHARED) represent the exact hardware pos
 *      (saves some bit shift ops in TLB Miss hdlrs)
 */
 #if (CONFIG_ARC_MMU_VER <= 2)
 #define _PAGE_ACCESSED      (1<<1)	/* Page is accessed (S) */
 #define _PAGE_CACHEABLE     (1<<2)	/* Page is cached (H) */
 #define _PAGE_EXECUTE       (1<<3)	/* Page has user execute perm (H) */
 #define _PAGE_WRITE         (1<<4)	/* Page has user write perm (H) */
 #define _PAGE_READ          (1<<5)	/* Page has user read perm (H) */
 #define _PAGE_DIRTY         (1<<6)	/* Page modified (dirty) (S) */
 #define _PAGE_SPECIAL       (1<<7)
 #define _PAGE_GLOBAL        (1<<8)	/* Page is global (H) */
 #define _PAGE_PRESENT       (1<<10)	/* TLB entry is valid (H) */
 #else	/* MMU v3 onwards */
 #define _PAGE_CACHEABLE     (1<<0)	/* Page is cached (H) */
 #define _PAGE_EXECUTE       (1<<1)	/* Page has user execute perm (H) */
 #define _PAGE_WRITE         (1<<2)	/* Page has user write perm (H) */
 #define _PAGE_READ          (1<<3)	/* Page has user read perm (H) */
 #define _PAGE_ACCESSED      (1<<4)	/* Page is accessed (S) */
 #define _PAGE_DIRTY         (1<<5)	/* Page modified (dirty) (S) */
 #define _PAGE_SPECIAL       (1<<6)
 #if (CONFIG_ARC_MMU_VER >= 4)
 #define _PAGE_WTHRU         (1<<7)	/* Page cache mode write-thru (H) */
 #endif
 #define _PAGE_GLOBAL        (1<<8)	/* Page is global (H) */
 #define _PAGE_PRESENT       (1<<9)	/* TLB entry is valid (H) */
 #if (CONFIG_ARC_MMU_VER >= 4)
 #define _PAGE_HW_SZ         (1<<10)	/* Page Size indicator (H): 0 normal, 1 super */
 #endif
 #define _PAGE_SHARED_CODE   (1<<11)	/* Shared Code page with cmn vaddr
 					   usable for shared TLB entries (H) */
 #define _PAGE_UNUSED_BIT    (1<<12)
 #endif
 /* vmalloc permissions */
 #define _K_PAGE_PERMS  (_PAGE_EXECUTE | _PAGE_WRITE | _PAGE_READ | \
 			_PAGE_GLOBAL | _PAGE_PRESENT)
 #ifndef CONFIG_ARC_CACHE_PAGES
 #undef _PAGE_CACHEABLE
 #define _PAGE_CACHEABLE 0
 #endif
 #ifndef _PAGE_HW_SZ
 #define _PAGE_HW_SZ	0
 #endif
 /* Defaults for every user page */
 #define ___DEF (_PAGE_PRESENT | _PAGE_CACHEABLE)
 /* Set of bits not changed in pte_modify */
 #define _PAGE_CHG_MASK	(PAGE_MASK_PHYS | _PAGE_ACCESSED | _PAGE_DIRTY | \
 							   _PAGE_SPECIAL)
 /* More Abbrevaited helpers */
 #define PAGE_U_NONE     __pgprot(___DEF)
 #define PAGE_U_R        __pgprot(___DEF | _PAGE_READ)
 #define PAGE_U_W_R      __pgprot(___DEF | _PAGE_READ | _PAGE_WRITE)
 #define PAGE_U_X_R      __pgprot(___DEF | _PAGE_READ | _PAGE_EXECUTE)
 #define PAGE_U_X_W_R    __pgprot(___DEF | _PAGE_READ | _PAGE_WRITE | \
 						       _PAGE_EXECUTE)
 #define PAGE_SHARED	PAGE_U_W_R
 /* While kernel runs out of unstranslated space, vmalloc/modules use a chunk of
 * user vaddr space - visible in all addr spaces, but kernel mode only
 * Thus Global, all-kernel-access, no-user-access, cached
 */
 #define PAGE_KERNEL          __pgprot(_K_PAGE_PERMS | _PAGE_CACHEABLE)
 /* ioremap */
 #define PAGE_KERNEL_NO_CACHE __pgprot(_K_PAGE_PERMS)
 /* Masks for actual TLB "PD"s */
 #define PTE_BITS_IN_PD0		(_PAGE_GLOBAL | _PAGE_PRESENT | _PAGE_HW_SZ)
 #define PTE_BITS_RWX		(_PAGE_EXECUTE | _PAGE_WRITE | _PAGE_READ)
 #define PTE_BITS_NON_RWX_IN_PD1	(PAGE_MASK_PHYS | _PAGE_CACHEABLE)
 /**************************************************************************
 * Mapping of vm_flags (Generic VM) to PTE flags (arch specific)
 *
 * Certain cases have 1:1 mapping
 *  e.g. __P101 means VM_READ, VM_EXEC and !VM_SHARED
 *       which directly corresponds to  PAGE_U_X_R
 *
 * Other rules which cause the divergence from 1:1 mapping
 *
 *  1. Although ARC700 can do exclusive execute/write protection (meaning R
 *     can be tracked independet of X/W unlike some other CPUs), still to
 *     keep things consistent with other archs:
 *      -Write implies Read:   W => R
 *      -Execute implies Read: X => R
 *
 *  2. Pvt Writable doesn't have Write Enabled initially: Pvt-W => !W
 *     This is to enable COW mechanism
 */
 	/* xwr */
 #define __P000  PAGE_U_NONE
 #define __P001  PAGE_U_R
 #define __P010  PAGE_U_R	/* Pvt-W => !W */
 #define __P011  PAGE_U_R	/* Pvt-W => !W */
 #define __P100  PAGE_U_X_R	/* X => R */
 #define __P101  PAGE_U_X_R
 #define __P110  PAGE_U_X_R	/* Pvt-W => !W and X => R */
 #define __P111  PAGE_U_X_R	/* Pvt-W => !W */
 #define __S000  PAGE_U_NONE
 #define __S001  PAGE_U_R
 #define __S010  PAGE_U_W_R	/* W => R */
 #define __S011  PAGE_U_W_R
 #define __S100  PAGE_U_X_R	/* X => R */
 #define __S101  PAGE_U_X_R
 #define __S110  PAGE_U_X_W_R	/* X => R */
 #define __S111  PAGE_U_X_W_R
 /****************************************************************
 * 2 tier (PGD:PTE) software page walker
 *
 * [31]		    32 bit virtual address              [0]
 * -------------------------------------------------------
 * |               | <------------ PGDIR_SHIFT ----------> |
 * |		   |					 |
 * | BITS_FOR_PGD  |  BITS_FOR_PTE  | <-- PAGE_SHIFT --> |
 * -------------------------------------------------------
 *       |                  |                |
 *       |                  |                --> off in page frame
 *       |                  ---> index into Page Table
 *       ----> index into Page Directory
 *
 * In a single page size configuration, only PAGE_SHIFT is fixed
 * So both PGD and PTE sizing can be tweaked
 *  e.g. 8K page (PAGE_SHIFT 13) can have
 *  - PGDIR_SHIFT 21  -> 11:8:13 address split
 *  - PGDIR_SHIFT 24  -> 8:11:13 address split
 *
 * If Super Page is configured, PGDIR_SHIFT becomes fixed too,
 * so the sizing flexibility is gone.
 */
 #if defined(CONFIG_ARC_HUGEPAGE_16M)
 #define PGDIR_SHIFT	24
 #elif defined(CONFIG_ARC_HUGEPAGE_2M)
 #define PGDIR_SHIFT	21
 #else
 /*
 * Only Normal page support so "hackable" (see comment above)
 * Default value provides 11:8:13 (8K), 11:9:12 (4K)
 */
 #define PGDIR_SHIFT	21
 #endif
 #define BITS_FOR_PTE	(PGDIR_SHIFT - PAGE_SHIFT)
 #define BITS_FOR_PGD	(32 - PGDIR_SHIFT)
 #define PGDIR_SIZE	BIT(PGDIR_SHIFT)	/* vaddr span, not PDG sz */
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 #define	PTRS_PER_PTE	BIT(BITS_FOR_PTE)
 #define	PTRS_PER_PGD	BIT(BITS_FOR_PGD)
 /*
 * Number of entries a user land program use.
@ -222,143 +19,17 @@
 */
 #define	USER_PTRS_PER_PGD	(TASK_SIZE / PGDIR_SIZE)
 /****************************************************************
 * Bucket load of VM Helpers
 */
 #ifndef __ASSEMBLY__
 #define pte_ERROR(e) \
 	pr_crit("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
 #define pgd_ERROR(e) \
 	pr_crit("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
 /* the zero page used for uninitialized and anonymous pages */
 extern char empty_zero_page[PAGE_SIZE];
 #define ZERO_PAGE(vaddr)	(virt_to_page(empty_zero_page))
-#define set_pte(pteptr, pteval)	((*(pteptr)) = (pteval))
+extern pgd_t swapper_pg_dir[] __aligned(PAGE_SIZE);
 #define set_pmd(pmdptr, pmdval)	(*(pmdptr) = pmdval)
 /* find the page descriptor of the Page Tbl ref by PMD entry */
 #define pmd_page(pmd)		virt_to_page(pmd_val(pmd) & PAGE_MASK)
 /* find the logical addr (phy for ARC) of the Page Tbl ref by PMD entry */
 #define pmd_page_vaddr(pmd)	(pmd_val(pmd) & PAGE_MASK)
 /* In a 2 level sys, setup the PGD entry with PTE value */
 static inline void pmd_set(pmd_t *pmdp, pte_t *ptep)
 {
 	pmd_val(*pmdp) = (unsigned long)ptep;
 }
 #define pte_none(x)			(!pte_val(x))
 #define pte_present(x)			(pte_val(x) & _PAGE_PRESENT)
 #define pte_clear(mm, addr, ptep)	set_pte_at(mm, addr, ptep, __pte(0))
 #define pmd_none(x)			(!pmd_val(x))
 #define	pmd_bad(x)			((pmd_val(x) & ~PAGE_MASK))
 #define pmd_present(x)			(pmd_val(x))
 #define pmd_leaf(x)			(pmd_val(x) & _PAGE_HW_SZ)
 #define pmd_clear(xp)			do { pmd_val(*(xp)) = 0; } while (0)
 #define pte_page(pte)		pfn_to_page(pte_pfn(pte))
 #define mk_pte(page, prot)	pfn_pte(page_to_pfn(page), prot)
 #define pfn_pte(pfn, prot)	__pte(__pfn_to_phys(pfn) | pgprot_val(prot))
 /* Don't use virt_to_pfn for macros below: could cause truncations for PAE40*/
 #define pte_pfn(pte)		(pte_val(pte) >> PAGE_SHIFT)
 /* Zoo of pte_xxx function */
 #define pte_read(pte)		(pte_val(pte) & _PAGE_READ)
 #define pte_write(pte)		(pte_val(pte) & _PAGE_WRITE)
 #define pte_dirty(pte)		(pte_val(pte) & _PAGE_DIRTY)
 #define pte_young(pte)		(pte_val(pte) & _PAGE_ACCESSED)
 #define pte_special(pte)	(pte_val(pte) & _PAGE_SPECIAL)
 #define PTE_BIT_FUNC(fn, op) \
 	static inline pte_t pte_##fn(pte_t pte) { pte_val(pte) op; return pte; }
 PTE_BIT_FUNC(mknotpresent,	&= ~(_PAGE_PRESENT));
 PTE_BIT_FUNC(wrprotect,	&= ~(_PAGE_WRITE));
 PTE_BIT_FUNC(mkwrite,	|= (_PAGE_WRITE));
 PTE_BIT_FUNC(mkclean,	&= ~(_PAGE_DIRTY));
 PTE_BIT_FUNC(mkdirty,	|= (_PAGE_DIRTY));
 PTE_BIT_FUNC(mkold,	&= ~(_PAGE_ACCESSED));
 PTE_BIT_FUNC(mkyoung,	|= (_PAGE_ACCESSED));
 PTE_BIT_FUNC(exprotect,	&= ~(_PAGE_EXECUTE));
 PTE_BIT_FUNC(mkexec,	|= (_PAGE_EXECUTE));
 PTE_BIT_FUNC(mkspecial,	|= (_PAGE_SPECIAL));
 PTE_BIT_FUNC(mkhuge,	|= (_PAGE_HW_SZ));
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 {
 	return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
 }
 /* Macro to mark a page protection as uncacheable */
 #define pgprot_noncached(prot)	(__pgprot(pgprot_val(prot) & ~_PAGE_CACHEABLE))
 static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep, pte_t pteval)
 {
 	set_pte(ptep, pteval);
 }
 /*
 * Macro to quickly access the PGD entry, utlising the fact that some
 * arch may cache the pointer to Page Directory of "current" task
 * in a MMU register
 *
 * Thus task->mm->pgd (3 pointer dereferences, cache misses etc simply
 * becomes read a register
 *
 * ********CAUTION*******:
 * Kernel code might be dealing with some mm_struct of NON "current"
 * Thus use this macro only when you are certain that "current" is current
 * e.g. when dealing with signal frame setup code etc
 */
 #ifdef ARC_USE_SCRATCH_REG
 #define pgd_offset_fast(mm, addr)	\
 ({					\
 	pgd_t *pgd_base = (pgd_t *) read_aux_reg(ARC_REG_SCRATCH_DATA0);  \
 	pgd_base + pgd_index(addr);	\
 })
 #else
 #define pgd_offset_fast(mm, addr)	pgd_offset(mm, addr)
 #endif
 extern pgd_t swapper_pg_dir[] __aligned(PAGE_SIZE);
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 		      pte_t *ptep);
 /* Encode swap {type,off} tuple into PTE
 * We reserve 13 bits for 5-bit @type, keeping bits 12-5 zero, ensuring that
 * PAGE_PRESENT is zero in a PTE holding swap "identifier"
 */
 #define __swp_entry(type, off)	((swp_entry_t) { \
 					((type) & 0x1f) | ((off) << 13) })
 /* Decode a PTE containing swap "identifier "into constituents */
 #define __swp_type(pte_lookalike)	(((pte_lookalike).val) & 0x1f)
 #define __swp_offset(pte_lookalike)	((pte_lookalike).val >> 13)
 /* NOPs, to keep generic kernel happy */
 #define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)	((pte_t) { (x).val })
 #define kern_addr_valid(addr)	(1)
 #define pmd_pgtable(pmd)       ((pgtable_t) pmd_page_vaddr(pmd))
 /*
 * remap a physical page `pfn' of size `size' with page protection `prot'
 * into virtual address `from'
 */
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #include <asm/hugepage.h>
 #endif
 /* to cope with aliasing VIPT cache */
 #define HAVE_ARCH_UNMAPPED_AREA
--- a/arch/arc/include/asm/processor.h
+++ b/arch/arc/include/asm/processor.h
@ -93,7 +93,7 @@ extern unsigned int get_wchan(struct task_struct *p);
 #define VMALLOC_START	(PAGE_OFFSET - (CONFIG_ARC_KVADDR_SIZE << 20))
 /* 1 PGDIR_SIZE each for fixmap/pkmap, 2 PGDIR_SIZE gutter (see asm/highmem.h) */
-#define VMALLOC_SIZE	((CONFIG_ARC_KVADDR_SIZE << 20) - PGDIR_SIZE * 4)
+#define VMALLOC_SIZE	((CONFIG_ARC_KVADDR_SIZE << 20) - PMD_SIZE * 4)
 #define VMALLOC_END	(VMALLOC_START + VMALLOC_SIZE)
--- a/arch/arc/include/asm/setup.h
+++ b/arch/arc/include/asm/setup.h
@ -2,8 +2,8 @@
 /*
 * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
 */
-#ifndef __ASMARC_SETUP_H
+#ifndef __ASM_ARC_SETUP_H
-#define __ASMARC_SETUP_H
+#define __ASM_ARC_SETUP_H
 #include <linux/types.h>
@ -34,4 +34,12 @@ long __init arc_get_mem_sz(void);
 #define IS_AVAIL2(v, s, cfg)	IS_AVAIL1(v, s), IS_AVAIL1(v, IS_USED_CFG(cfg))
 #define IS_AVAIL3(v, v2, s)	IS_AVAIL1(v, s), IS_AVAIL1(v, IS_DISABLED_RUN(v2))
 extern void arc_mmu_init(void);
 extern char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len);
 extern void read_decode_mmu_bcr(void);
 extern void arc_cache_init(void);
 extern char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len);
 extern void read_decode_cache_bcr(void);
-#endif /* __ASMARC_SETUP_H */
+#endif /* __ASM_ARC_SETUP_H */
--- a/arch/arc/include/asm/smp.h
+++ b/arch/arc/include/asm/smp.h
@ -105,7 +105,6 @@ static inline const char *arc_platform_smp_cpuinfo(void)
 #include <asm/spinlock.h>
 extern arch_spinlock_t smp_atomic_ops_lock;
 extern arch_spinlock_t smp_bitops_lock;
 #define atomic_ops_lock(flags)	do {		\
 	local_irq_save(flags);			\
@ -117,24 +116,11 @@ extern arch_spinlock_t smp_bitops_lock;
 	local_irq_restore(flags);		\
 } while (0)
 #define bitops_lock(flags)	do {		\
 	local_irq_save(flags);			\
 	arch_spin_lock(&smp_bitops_lock);	\
 } while (0)
 #define bitops_unlock(flags) do {		\
 	arch_spin_unlock(&smp_bitops_lock);	\
 	local_irq_restore(flags);		\
 } while (0)
 #else /* !CONFIG_SMP */
 #define atomic_ops_lock(flags)		local_irq_save(flags)
 #define atomic_ops_unlock(flags)	local_irq_restore(flags)
 #define bitops_lock(flags)		local_irq_save(flags)
 #define bitops_unlock(flags)		local_irq_restore(flags)
 #endif /* !CONFIG_SMP */
 #endif	/* !CONFIG_ARC_HAS_LLSC */
--- a/arch/arc/include/asm/tlb-mmu1.h
+++ b/arch/arc/include/asm/tlb-mmu1.h
@ -1,101 +0,0 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
 * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
 */
 #ifndef __ASM_TLB_MMU_V1_H__
 #define __ASM_TLB_MMU_V1_H__
 #include <asm/mmu.h>
 #if defined(__ASSEMBLY__) && (CONFIG_ARC_MMU_VER == 1)
 .macro TLB_WRITE_HEURISTICS
 #define JH_HACK1
 #undef JH_HACK2
 #undef JH_HACK3
 #ifdef JH_HACK3
 ; Calculate set index for 2-way MMU
 ; -avoiding use of GetIndex from MMU
 ;   and its unpleasant LFSR pseudo-random sequence
 ;
 ; r1 = TLBPD0 from TLB_RELOAD above
 ;
 ; -- jh_ex_way_set not cleared on startup
 ;    didn't want to change setup.c
 ;    hence extra instruction to clean
 ;
 ; -- should be in cache since in same line
 ;    as r0/r1 saves above
 ;
 ld  r0,[jh_ex_way_sel]  ; victim pointer
 and r0,r0,1         ; clean
 xor.f   r0,r0,1         ; flip
 st  r0,[jh_ex_way_sel]  ; store back
 asr r0,r1,12        ; get set # <<1, note bit 12=R=0
 or.nz   r0,r0,1         ; set way bit
 and r0,r0,0xff      ; clean
 sr  r0,[ARC_REG_TLBINDEX]
 #endif
 #ifdef JH_HACK2
 ; JH hack #2
 ;  Faster than hack #1 in non-thrash case, but hard-coded for 2-way MMU
 ;  Slower in thrash case (where it matters) because more code is executed
 ;  Inefficient due to two-register paradigm of this miss handler
 ;
 /* r1 = data TLBPD0 at this point */
 lr      r0,[eret]               /* instruction address */
 xor     r0,r0,r1                /* compare set #       */
 and.f   r0,r0,0x000fe000        /* 2-way MMU mask      */
 bne     88f                     /* not in same set - no need to probe */
 lr      r0,[eret]               /* instruction address */
 and     r0,r0,PAGE_MASK         /* VPN of instruction address */
 ; lr  r1,[ARC_REG_TLBPD0]     /* Data VPN+ASID - already in r1 from TLB_RELOAD*/
 and     r1,r1,0xff              /* Data ASID */
 or      r0,r0,r1                /* Instruction address + Data ASID */
 lr      r1,[ARC_REG_TLBPD0]     /* save TLBPD0 containing data TLB*/
 sr      r0,[ARC_REG_TLBPD0]     /* write instruction address to TLBPD0 */
 sr      TLBProbe, [ARC_REG_TLBCOMMAND] /* Look for instruction */
 lr      r0,[ARC_REG_TLBINDEX]   /* r0 = index where instruction is, if at all */
 sr      r1,[ARC_REG_TLBPD0]     /* restore TLBPD0 */
 xor     r0,r0,1                 /* flip bottom bit of data index */
 b.d     89f
 sr      r0,[ARC_REG_TLBINDEX]   /* and put it back */
 88:
 sr  TLBGetIndex, [ARC_REG_TLBCOMMAND]
 89:
 #endif
 #ifdef JH_HACK1
 ;
 ; Always checks whether instruction will be kicked out by dtlb miss
 ;
 mov_s   r3, r1                  ; save PD0 prepared by TLB_RELOAD in r3
 lr      r0,[eret]               /* instruction address */
 and     r0,r0,PAGE_MASK         /* VPN of instruction address */
 bmsk    r1,r3,7                 /* Data ASID, bits 7-0 */
 or_s    r0,r0,r1                /* Instruction address + Data ASID */
 sr      r0,[ARC_REG_TLBPD0]     /* write instruction address to TLBPD0 */
 sr      TLBProbe, [ARC_REG_TLBCOMMAND] /* Look for instruction */
 lr      r0,[ARC_REG_TLBINDEX]   /* r0 = index where instruction is, if at all */
 sr      r3,[ARC_REG_TLBPD0]     /* restore TLBPD0 */
 sr      TLBGetIndex, [ARC_REG_TLBCOMMAND]
 lr      r1,[ARC_REG_TLBINDEX]   /* r1 = index where MMU wants to put data */
 cmp     r0,r1                   /* if no match on indices, go around */
 xor.eq  r1,r1,1                 /* flip bottom bit of data index */
 sr      r1,[ARC_REG_TLBINDEX]   /* and put it back */
 #endif
 .endm
 #endif
 #endif
--- a/arch/arc/kernel/entry-arcv2.S
+++ b/arch/arc/kernel/entry-arcv2.S
@ -10,6 +10,7 @@
 #include <asm/errno.h>
 #include <asm/arcregs.h>
 #include <asm/irqflags.h>
 #include <asm/mmu.h>
 ; A maximum number of supported interrupts in the core interrupt controller.
 ; This number is not equal to the maximum interrupt number (256) because
--- a/arch/arc/kernel/entry.S
+++ b/arch/arc/kernel/entry.S
@ -101,11 +101,8 @@ ENTRY(EV_MachineCheck)
 	lr  r0, [efa]
 	mov r1, sp
-	; hardware auto-disables MMU, re-enable it to allow kernel vaddr
-	; access for say stack unwinding of modules for crash dumps
-	lr	r3, [ARC_REG_PID]
-	or	r3, r3, MMU_ENABLE
-	sr	r3, [ARC_REG_PID]
+	; MC exceptions disable MMU
+	ARC_MMU_REENABLE r3
 	lsr  	r3, r2, 8
 	bmsk 	r3, r3, 7
--- a/arch/arc/kernel/intc-compact.c
+++ b/arch/arc/kernel/intc-compact.c
@ -142,7 +142,7 @@ IRQCHIP_DECLARE(arc_intc, "snps,arc700-intc", init_onchip_IRQ);
 *    Time hard-ISR, timer_interrupt( ) calls spin_unlock_irq several times.
 *    Here local_irq_enable( ) shd not re-enable lower priority interrupts
 * -If called from soft-ISR, it must re-enable all interrupts
- *    soft ISR are low prioity jobs which can be very slow, thus all IRQs
+ *    soft ISR are low priority jobs which can be very slow, thus all IRQs
 *    must be enabled while they run.
 *    Now hardware context wise we may still be in L2 ISR (not done rtie)
 *    still we must re-enable both L1 and L2 IRQs
--- a/arch/arc/kernel/smp.c
+++ b/arch/arc/kernel/smp.c
@ -29,10 +29,8 @@
 #ifndef CONFIG_ARC_HAS_LLSC
 arch_spinlock_t smp_atomic_ops_lock = __ARCH_SPIN_LOCK_UNLOCKED;
 arch_spinlock_t smp_bitops_lock = __ARCH_SPIN_LOCK_UNLOCKED;
 EXPORT_SYMBOL_GPL(smp_atomic_ops_lock);
 EXPORT_SYMBOL_GPL(smp_bitops_lock);
 #endif
 struct plat_smp_ops  __weak plat_smp_ops;
@ -283,7 +281,7 @@ static void ipi_send_msg_one(int cpu, enum ipi_msg_type msg)
 	/*
 	 * Call the platform specific IPI kick function, but avoid if possible:
 	 * Only do so if there's no pending msg from other concurrent sender(s).
-	 * Otherwise, recevier will see this msg as well when it takes the
+	 * Otherwise, receiver will see this msg as well when it takes the
 	 * IPI corresponding to that msg. This is true, even if it is already in
 	 * IPI handler, because !@old means it has not yet dequeued the msg(s)
 	 * so @new msg can be a free-loader
--- a/arch/arc/kernel/stacktrace.c
+++ b/arch/arc/kernel/stacktrace.c
@ -149,7 +149,7 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs,
 #else
 	/* On ARC, only Dward based unwinder works. fp based backtracing is
 	 * not possible (-fno-omit-frame-pointer) because of the way function
-	 * prelogue is setup (callee regs saved and then fp set and not other
+	 * prologue is setup (callee regs saved and then fp set and not other
 	 * way around
 	 */
 	pr_warn_once("CONFIG_ARC_DW2_UNWIND needs to be enabled\n");
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@ -205,93 +205,24 @@ slc_chk:
 #define OP_INV_IC	0x4
 /*
- *		I-Cache Aliasing in ARC700 VIPT caches (MMU v1-v3)
+ * Cache Flush programming model
 *
- * ARC VIPT I-cache uses vaddr to index into cache and paddr to match the tag.
+ * ARC700 MMUv3 I$ and D$ are both VIPT and can potentially alias.
- * The orig Cache Management Module "CDU" only required paddr to invalidate a
+ * Programming model requires both paddr and vaddr irrespective of aliasing
- * certain line since it sufficed as index in Non-Aliasing VIPT cache-geometry.
+ * considerations:
- * Infact for distinct V1,V2,P: all of {V1-P},{V2-P},{P-P} would end up fetching
+ *  - vaddr in {I,D}C_IV?L
- * the exact same line.
+ *  - paddr in {I,D}C_PTAG
 *
- * However for larger Caches (way-size > page-size) - i.e. in Aliasing config,
+ * In HS38x (MMUv4), D$ is PIPT, I$ is VIPT and can still alias.
- * paddr alone could not be used to correctly index the cache.
+ * Programming model is different for aliasing vs. non-aliasing I$
 *  - D$ / Non-aliasing I$: only paddr in {I,D}C_IV?L
 *  - Aliasing I$: same as ARC700 above (so MMUv3 routine used for MMUv4 I$)
 *
- * ------------------
+ *  - If PAE40 is enabled, independent of aliasing considerations, the higher
- * MMU v1/v2 (Fixed Page Size 8k)
+ *    bits needs to be written into PTAG_HI
 * ------------------
 * The solution was to provide CDU with these additonal vaddr bits. These
 * would be bits [x:13], x would depend on cache-geometry, 13 comes from
 * standard page size of 8k.
 * H/w folks chose [17:13] to be a future safe range, and moreso these 5 bits
 * of vaddr could easily be "stuffed" in the paddr as bits [4:0] since the
 * orig 5 bits of paddr were anyways ignored by CDU line ops, as they
 * represent the offset within cache-line. The adv of using this "clumsy"
 * interface for additional info was no new reg was needed in CDU programming
 * model.
 *
 * 17:13 represented the max num of bits passable, actual bits needed were
 * fewer, based on the num-of-aliases possible.
 * -for 2 alias possibility, only bit 13 needed (32K cache)
 * -for 4 alias possibility, bits 14:13 needed (64K cache)
 *
 * ------------------
 * MMU v3
 * ------------------
 * This ver of MMU supports variable page sizes (1k-16k): although Linux will
 * only support 8k (default), 16k and 4k.
 * However from hardware perspective, smaller page sizes aggravate aliasing
 * meaning more vaddr bits needed to disambiguate the cache-line-op ;
 * the existing scheme of piggybacking won't work for certain configurations.
 * Two new registers IC_PTAG and DC_PTAG inttoduced.
 * "tag" bits are provided in PTAG, index bits in existing IVIL/IVDL/FLDL regs
 */
 static inline
 void __cache_line_loop_v2(phys_addr_t paddr, unsigned long vaddr,
 			  unsigned long sz, const int op, const int full_page)
 {
 	unsigned int aux_cmd;
 	int num_lines;
 	if (op == OP_INV_IC) {
 		aux_cmd = ARC_REG_IC_IVIL;
 	} else {
 		/* d$ cmd: INV (discard or wback-n-discard) OR FLUSH (wback) */
 		aux_cmd = op & OP_INV ? ARC_REG_DC_IVDL : ARC_REG_DC_FLDL;
 	}
 	/* Ensure we properly floor/ceil the non-line aligned/sized requests
 	 * and have @paddr - aligned to cache line and integral @num_lines.
 	 * This however can be avoided for page sized since:
 	 *  -@paddr will be cache-line aligned already (being page aligned)
 	 *  -@sz will be integral multiple of line size (being page sized).
 	 */
 	if (!full_page) {
 		sz += paddr & ~CACHE_LINE_MASK;
 		paddr &= CACHE_LINE_MASK;
 		vaddr &= CACHE_LINE_MASK;
 	}
 	num_lines = DIV_ROUND_UP(sz, L1_CACHE_BYTES);
 	/* MMUv2 and before: paddr contains stuffed vaddrs bits */
 	paddr |= (vaddr >> PAGE_SHIFT) & 0x1F;
 	while (num_lines-- > 0) {
 		write_aux_reg(aux_cmd, paddr);
 		paddr += L1_CACHE_BYTES;
 	}
 }
 /*
 * For ARC700 MMUv3 I-cache and D-cache flushes
 *  - ARC700 programming model requires paddr and vaddr be passed in seperate
 *    AUX registers (*_IV*L and *_PTAG respectively) irrespective of whether the
 *    caches actually alias or not.
 * -  For HS38, only the aliasing I-cache configuration uses the PTAG reg
 *    (non aliasing I-cache version doesn't; while D-cache can't possibly alias)
 */
 static inline
 void __cache_line_loop_v3(phys_addr_t paddr, unsigned long vaddr,
 			  unsigned long sz, const int op, const int full_page)
 {
@ -350,17 +281,6 @@ void __cache_line_loop_v3(phys_addr_t paddr, unsigned long vaddr,
 #ifndef USE_RGN_FLSH
 /*
 * In HS38x (MMU v4), I-cache is VIPT (can alias), D-cache is PIPT
 * Here's how cache ops are implemented
 *
 *  - D-cache: only paddr needed (in DC_IVDL/DC_FLDL)
 *  - I-cache Non Aliasing: Despite VIPT, only paddr needed (in IC_IVIL)
 *  - I-cache Aliasing: Both vaddr and paddr needed (in IC_IVIL, IC_PTAG
 *    respectively, similar to MMU v3 programming model, hence
 *    __cache_line_loop_v3() is used)
 *
 * If PAE40 is enabled, independent of aliasing considerations, the higher bits
 * needs to be written into PTAG_HI
 */
 static inline
 void __cache_line_loop_v4(phys_addr_t paddr, unsigned long vaddr,
@ -460,11 +380,9 @@ void __cache_line_loop_v4(phys_addr_t paddr, unsigned long vaddr,
 #endif
-#if (CONFIG_ARC_MMU_VER < 3)
-#define __cache_line_loop	__cache_line_loop_v2
-#elif (CONFIG_ARC_MMU_VER == 3)
+#ifdef CONFIG_ARC_MMU_V3
 #define __cache_line_loop	__cache_line_loop_v3
-#elif (CONFIG_ARC_MMU_VER > 3)
+#else
 #define __cache_line_loop	__cache_line_loop_v4
 #endif
@ -1123,7 +1041,7 @@ void clear_user_page(void *to, unsigned long u_vaddr, struct page *page)
 	clear_page(to);
 	clear_bit(PG_dc_clean, &page->flags);
 }
-
+EXPORT_SYMBOL(clear_user_page);
 /**********************************************************************
 * Explicit Cache flush request from user space via syscall
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@ -33,28 +33,34 @@ noinline static int handle_kernel_vaddr_fault(unsigned long address)
 	pud_t *pud, *pud_k;
 	pmd_t *pmd, *pmd_k;
-	pgd = pgd_offset_fast(current->active_mm, address);
+	pgd = pgd_offset(current->active_mm, address);
 	pgd_k = pgd_offset_k(address);
-	if (!pgd_present(*pgd_k))
+	if (pgd_none (*pgd_k))
 		goto bad_area;
 	if (!pgd_present(*pgd))
 		set_pgd(pgd, *pgd_k);
 	p4d = p4d_offset(pgd, address);
 	p4d_k = p4d_offset(pgd_k, address);
-	if (!p4d_present(*p4d_k))
+	if (p4d_none(*p4d_k))
 		goto bad_area;
 	if (!p4d_present(*p4d))
 		set_p4d(p4d, *p4d_k);
 	pud = pud_offset(p4d, address);
 	pud_k = pud_offset(p4d_k, address);
-	if (!pud_present(*pud_k))
+	if (pud_none(*pud_k))
 		goto bad_area;
 	if (!pud_present(*pud))
 		set_pud(pud, *pud_k);
 	pmd = pmd_offset(pud, address);
 	pmd_k = pmd_offset(pud_k, address);
-	if (!pmd_present(*pmd_k))
+	if (pmd_none(*pmd_k))
 		goto bad_area;
-
+	if (!pmd_present(*pmd))
-	set_pmd(pmd, *pmd_k);
+		set_pmd(pmd, *pmd_k);
 	/* XXX: create the TLB entry here */
 	return 0;
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@ -189,6 +189,11 @@ void __init mem_init(void)
 {
 	memblock_free_all();
 	highmem_init();
 	BUILD_BUG_ON((PTRS_PER_PGD * sizeof(pgd_t)) > PAGE_SIZE);
 	BUILD_BUG_ON((PTRS_PER_PUD * sizeof(pud_t)) > PAGE_SIZE);
 	BUILD_BUG_ON((PTRS_PER_PMD * sizeof(pmd_t)) > PAGE_SIZE);
 	BUILD_BUG_ON((PTRS_PER_PTE * sizeof(pte_t)) > PAGE_SIZE);
 }
 #ifdef CONFIG_HIGHMEM
--- a/arch/arc/mm/ioremap.c
+++ b/arch/arc/mm/ioremap.c
@ -39,7 +39,8 @@ void __iomem *ioremap(phys_addr_t paddr, unsigned long size)
 	if (arc_uncached_addr_space(paddr))
 		return (void __iomem *)(u32)paddr;
-	return ioremap_prot(paddr, size, PAGE_KERNEL_NO_CACHE);
+	return ioremap_prot(paddr, size,
+			    pgprot_val(pgprot_noncached(PAGE_KERNEL)));
 }
 EXPORT_SYMBOL(ioremap);
--- a/arch/arc/mm/tlb.c
+++ b/arch/arc/mm/tlb.c
@ -1,51 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * TLB Management (flush/create/diagnostics) for ARC700
+ * TLB Management (flush/create/diagnostics) for MMUv3 and MMUv4
 *
 * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
 *
 * vineetg: Aug 2011
 *  -Reintroduce duplicate PD fixup - some customer chips still have the issue
 *
 * vineetg: May 2011
 *  -No need to flush_cache_page( ) for each call to update_mmu_cache()
 *   some of the LMBench tests improved amazingly
 *      = page-fault thrice as fast (75 usec to 28 usec)
 *      = mmap twice as fast (9.6 msec to 4.6 msec),
 *      = fork (5.3 msec to 3.7 msec)
 *
 * vineetg: April 2011 :
 *  -MMU v3: PD{0,1} bits layout changed: They don't overlap anymore,
 *      helps avoid a shift when preparing PD0 from PTE
 *
 * vineetg: April 2011 : Preparing for MMU V3
 *  -MMU v2/v3 BCRs decoded differently
 *  -Remove TLB_SIZE hardcoding as it's variable now: 256 or 512
 *  -tlb_entry_erase( ) can be void
 *  -local_flush_tlb_range( ):
 *      = need not "ceil" @end
 *      = walks MMU only if range spans < 32 entries, as opposed to 256
 *
 * Vineetg: Sept 10th 2008
 *  -Changes related to MMU v2 (Rel 4.8)
 *
 * Vineetg: Aug 29th 2008
 *  -In TLB Flush operations (Metal Fix MMU) there is a explicit command to
 *    flush Micro-TLBS. If TLB Index Reg is invalid prior to TLBIVUTLB cmd,
 *    it fails. Thus need to load it with ANY valid value before invoking
 *    TLBIVUTLB cmd
 *
 * Vineetg: Aug 21th 2008:
 *  -Reduced the duration of IRQ lockouts in TLB Flush routines
 *  -Multiple copies of TLB erase code separated into a "single" function
 *  -In TLB Flush routines, interrupt disabling moved UP to retrieve ASID
 *       in interrupt-safe region.
 *
 * Vineetg: April 23rd Bug #93131
 *    Problem: tlb_flush_kernel_range() doesn't do anything if the range to
 *              flush is more than the size of TLB itself.
 *
 * Rahul Trivedi : Codito Technologies 2004
 */
 #include <linux/module.h>
@ -57,47 +15,6 @@
 #include <asm/mmu_context.h>
 #include <asm/mmu.h>
 /*			Need for ARC MMU v2
 *
 * ARC700 MMU-v1 had a Joint-TLB for Code and Data and is 2 way set-assoc.
 * For a memcpy operation with 3 players (src/dst/code) such that all 3 pages
 * map into same set, there would be contention for the 2 ways causing severe
 * Thrashing.
 *
 * Although J-TLB is 2 way set assoc, ARC700 caches J-TLB into uTLBS which has
 * much higher associativity. u-D-TLB is 8 ways, u-I-TLB is 4 ways.
 * Given this, the thrashing problem should never happen because once the 3
 * J-TLB entries are created (even though 3rd will knock out one of the prev
 * two), the u-D-TLB and u-I-TLB will have what is required to accomplish memcpy
 *
 * Yet we still see the Thrashing because a J-TLB Write causes a flush of u-TLBs.
 * This is a simple design for keeping them in sync. So what do we do?
 * The solution which James came up was pretty neat. It utilised the assoc
 * of uTLBs by not invalidating always but only when absolutely necessary.
 *
 * - Existing TLB commands work as before
 * - New command (TLBWriteNI) for TLB write without clearing uTLBs
 * - New command (TLBIVUTLB) to invalidate uTLBs.
 *
 * The uTLBs need only be invalidated when pages are being removed from the
 * OS page table. If a 'victim' TLB entry is being overwritten in the main TLB
 * as a result of a miss, the removed entry is still allowed to exist in the
 * uTLBs as it is still valid and present in the OS page table. This allows the
 * full associativity of the uTLBs to hide the limited associativity of the main
 * TLB.
 *
 * During a miss handler, the new "TLBWriteNI" command is used to load
 * entries without clearing the uTLBs.
 *
 * When the OS page table is updated, TLB entries that may be associated with a
 * removed page are removed (flushed) from the TLB using TLBWrite. In this
 * circumstance, the uTLBs must also be cleared. This is done by using the
 * existing TLBWrite command. An explicit IVUTLB is also required for those
 * corner cases when TLBWrite was not executed at all because the corresp
 * J-TLB entry got evicted/replaced.
 */
 /* A copy of the ASID from the PID reg is kept in asid_cache */
 DEFINE_PER_CPU(unsigned int, asid_cache) = MM_CTXT_FIRST_CYCLE;
@ -120,32 +37,10 @@ static inline void __tlb_entry_erase(void)
 static void utlb_invalidate(void)
 {
 #if (CONFIG_ARC_MMU_VER >= 2)
 #if (CONFIG_ARC_MMU_VER == 2)
 	/* MMU v2 introduced the uTLB Flush command.
 	 * There was however an obscure hardware bug, where uTLB flush would
 	 * fail when a prior probe for J-TLB (both totally unrelated) would
 	 * return lkup err - because the entry didn't exist in MMU.
 	 * The Workaround was to set Index reg with some valid value, prior to
 	 * flush. This was fixed in MMU v3
 	 */
 	unsigned int idx;
 	/* make sure INDEX Reg is valid */
 	idx = read_aux_reg(ARC_REG_TLBINDEX);
 	/* If not write some dummy val */
 	if (unlikely(idx & TLB_LKUP_ERR))
 		write_aux_reg(ARC_REG_TLBINDEX, 0xa);
 #endif
 	write_aux_reg(ARC_REG_TLBCOMMAND, TLBIVUTLB);
 #endif
 }
-#if (CONFIG_ARC_MMU_VER < 4)
+#ifdef CONFIG_ARC_MMU_V3
 static inline unsigned int tlb_entry_lkup(unsigned long vaddr_n_asid)
 {
@ -176,7 +71,7 @@ static void tlb_entry_erase(unsigned int vaddr_n_asid)
 	}
 }
-static void tlb_entry_insert(unsigned int pd0, pte_t pd1)
+static void tlb_entry_insert(unsigned int pd0, phys_addr_t pd1)
 {
 	unsigned int idx;
@ -206,7 +101,7 @@ static void tlb_entry_insert(unsigned int pd0, pte_t pd1)
 	write_aux_reg(ARC_REG_TLBCOMMAND, TLBWrite);
 }
-#else	/* CONFIG_ARC_MMU_VER >= 4) */
+#else	/* MMUv4 */
 static void tlb_entry_erase(unsigned int vaddr_n_asid)
 {
@ -214,13 +109,16 @@ static void tlb_entry_erase(unsigned int vaddr_n_asid)
 	write_aux_reg(ARC_REG_TLBCOMMAND, TLBDeleteEntry);
 }
-static void tlb_entry_insert(unsigned int pd0, pte_t pd1)
+static void tlb_entry_insert(unsigned int pd0, phys_addr_t pd1)
 {
 	write_aux_reg(ARC_REG_TLBPD0, pd0);
 	write_aux_reg(ARC_REG_TLBPD1, pd1);
-	if (is_pae40_enabled())
+	if (!is_pae40_enabled()) {
 		write_aux_reg(ARC_REG_TLBPD1, pd1);
 	} else {
 		write_aux_reg(ARC_REG_TLBPD1, pd1 & 0xFFFFFFFF);
 		write_aux_reg(ARC_REG_TLBPD1HI, (u64)pd1 >> 32);
 	}
 	write_aux_reg(ARC_REG_TLBCOMMAND, TLBInsertEntry);
 }
@ -496,7 +394,7 @@ void create_tlb(struct vm_area_struct *vma, unsigned long vaddr, pte_t *ptep)
 	unsigned long flags;
 	unsigned int asid_or_sasid, rwx;
 	unsigned long pd0;
-	pte_t pd1;
+	phys_addr_t pd1;
 	/*
 	 * create_tlb() assumes that current->mm == vma->mm, since
@ -505,7 +403,6 @@ void create_tlb(struct vm_area_struct *vma, unsigned long vaddr, pte_t *ptep)
 	 *
 	 * Removing the assumption involves
 	 * -Using vma->mm->context{ASID,SASID}, as opposed to MMU reg.
 	 * -Fix the TLB paranoid debug code to not trigger false negatives.
 	 * -More importantly it makes this handler inconsistent with fast-path
 	 *  TLB Refill handler which always deals with "current"
 	 *
@ -528,8 +425,6 @@ void create_tlb(struct vm_area_struct *vma, unsigned long vaddr, pte_t *ptep)
 	local_irq_save(flags);
 	tlb_paranoid_check(asid_mm(vma->vm_mm, smp_processor_id()), vaddr);
 	vaddr &= PAGE_MASK;
 	/* update this PTE credentials */
@ -639,43 +534,6 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
 	update_mmu_cache(vma, addr, &pte);
 }
 void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 				pgtable_t pgtable)
 {
 	struct list_head *lh = (struct list_head *) pgtable;
 	assert_spin_locked(&mm->page_table_lock);
 	/* FIFO */
 	if (!pmd_huge_pte(mm, pmdp))
 		INIT_LIST_HEAD(lh);
 	else
 		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
 	pmd_huge_pte(mm, pmdp) = pgtable;
 }
 pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 {
 	struct list_head *lh;
 	pgtable_t pgtable;
 	assert_spin_locked(&mm->page_table_lock);
 	pgtable = pmd_huge_pte(mm, pmdp);
 	lh = (struct list_head *) pgtable;
 	if (list_empty(lh))
 		pmd_huge_pte(mm, pmdp) = NULL;
 	else {
 		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
 		list_del(lh);
 	}
 	pte_val(pgtable[0]) = 0;
 	pte_val(pgtable[1]) = 0;
 	return pgtable;
 }
 void local_flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start,
 			       unsigned long end)
 {
@ -706,14 +564,6 @@ void read_decode_mmu_bcr(void)
 {
 	struct cpuinfo_arc_mmu *mmu = &cpuinfo_arc700[smp_processor_id()].mmu;
 	unsigned int tmp;
 	struct bcr_mmu_1_2 {
 #ifdef CONFIG_CPU_BIG_ENDIAN
 		unsigned int ver:8, ways:4, sets:4, u_itlb:8, u_dtlb:8;
 #else
 		unsigned int u_dtlb:8, u_itlb:8, sets:4, ways:4, ver:8;
 #endif
 	} *mmu2;
 	struct bcr_mmu_3 {
 #ifdef CONFIG_CPU_BIG_ENDIAN
 	unsigned int ver:8, ways:4, sets:4, res:3, sasid:1, pg_sz:4,
@ -738,23 +588,14 @@ void read_decode_mmu_bcr(void)
 	tmp = read_aux_reg(ARC_REG_MMU_BCR);
 	mmu->ver = (tmp >> 24);
-	if (is_isa_arcompact()) {
+	if (is_isa_arcompact() && mmu->ver == 3) {
-		if (mmu->ver <= 2) {
+		mmu3 = (struct bcr_mmu_3 *)&tmp;
-			mmu2 = (struct bcr_mmu_1_2 *)&tmp;
+		mmu->pg_sz_k = 1 << (mmu3->pg_sz - 1);
-			mmu->pg_sz_k = TO_KB(0x2000);
+		mmu->sets = 1 << mmu3->sets;
-			mmu->sets = 1 << mmu2->sets;
+		mmu->ways = 1 << mmu3->ways;
-			mmu->ways = 1 << mmu2->ways;
+		mmu->u_dtlb = mmu3->u_dtlb;
-			mmu->u_dtlb = mmu2->u_dtlb;
+		mmu->u_itlb = mmu3->u_itlb;
-			mmu->u_itlb = mmu2->u_itlb;
+		mmu->sasid = mmu3->sasid;
 		} else {
 			mmu3 = (struct bcr_mmu_3 *)&tmp;
 			mmu->pg_sz_k = 1 << (mmu3->pg_sz - 1);
 			mmu->sets = 1 << mmu3->sets;
 			mmu->ways = 1 << mmu3->ways;
 			mmu->u_dtlb = mmu3->u_dtlb;
 			mmu->u_itlb = mmu3->u_itlb;
 			mmu->sasid = mmu3->sasid;
 		}
 	} else {
 		mmu4 = (struct bcr_mmu_4 *)&tmp;
 		mmu->pg_sz_k = 1 << (mmu4->sz0 - 1);
@ -780,8 +621,8 @@ char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len)
 			  IS_USED_CFG(CONFIG_TRANSPARENT_HUGEPAGE));
 	n += scnprintf(buf + n, len - n,
-		      "MMU [v%x]\t: %dk PAGE, %sJTLB %d (%dx%d), uDTLB %d, uITLB %d%s%s\n",
+		      "MMU [v%x]\t: %dk PAGE, %s, swalk %d lvl, JTLB %d (%dx%d), uDTLB %d, uITLB %d%s%s\n",
-		       p_mmu->ver, p_mmu->pg_sz_k, super_pg,
+		       p_mmu->ver, p_mmu->pg_sz_k, super_pg,  CONFIG_PGTABLE_LEVELS,
 		       p_mmu->sets * p_mmu->ways, p_mmu->sets, p_mmu->ways,
 		       p_mmu->u_dtlb, p_mmu->u_itlb,
 		       IS_AVAIL2(p_mmu->pae, ", PAE40 ", CONFIG_ARC_HAS_PAE40));
@ -815,22 +656,17 @@ void arc_mmu_init(void)
 	/*
 	 * Ensure that MMU features assumed by kernel exist in hardware.
-	 * For older ARC700 cpus, it has to be exact match, since the MMU
+	 *  - For older ARC700 cpus, only v3 supported
-	 * revisions were not backwards compatible (MMUv3 TLB layout changed
+	 *  - For HS cpus, v4 was baseline and v5 is backwards compatible
-	 * so even if kernel for v2 didn't use any new cmds of v3, it would
+	 *    (will run older software).
 	 * still not work.
 	 * For HS cpus, MMUv4 was baseline and v5 is backwards compatible
 	 * (will run older software).
 	 */
-	if (is_isa_arcompact() && mmu->ver == CONFIG_ARC_MMU_VER)
+	if (is_isa_arcompact() && mmu->ver == 3)
 		compat = 1;
-	else if (is_isa_arcv2() && mmu->ver >= CONFIG_ARC_MMU_VER)
+	else if (is_isa_arcv2() && mmu->ver >= 4)
 		compat = 1;
-	if (!compat) {
+	if (!compat)
-		panic("MMU ver %d doesn't match kernel built for %d...\n",
+		panic("MMU ver %d doesn't match kernel built for\n", mmu->ver);
 		      mmu->ver, CONFIG_ARC_MMU_VER);
 	}
 	if (mmu->pg_sz_k != TO_KB(PAGE_SIZE))
 		panic("MMU pg size != PAGE_SIZE (%luk)\n", TO_KB(PAGE_SIZE));
@ -843,14 +679,11 @@ void arc_mmu_init(void)
 	if (IS_ENABLED(CONFIG_ARC_HAS_PAE40) && !mmu->pae)
 		panic("Hardware doesn't support PAE40\n");
-	/* Enable the MMU */
+	/* Enable the MMU with ASID 0 */
-	write_aux_reg(ARC_REG_PID, MMU_ENABLE);
+	mmu_setup_asid(NULL, 0);
-	/* In smp we use this reg for interrupt 1 scratch */
+	/* cache the pgd pointer in MMU SCRATCH reg (ARCv2 only) */
-#ifdef ARC_USE_SCRATCH_REG
+	mmu_setup_pgd(NULL, swapper_pg_dir);
 	/* swapper_pg_dir is the pgd for the kernel, used by vmalloc */
 	write_aux_reg(ARC_REG_SCRATCH_DATA0, swapper_pg_dir);
 #endif
 	if (pae40_exist_but_not_enab())
 		write_aux_reg(ARC_REG_TLBPD1HI, 0);
@ -945,40 +778,3 @@ void do_tlb_overlap_fault(unsigned long cause, unsigned long address,
 	local_irq_restore(flags);
 }
 /***********************************************************************
 * Diagnostic Routines
 *  -Called from Low Level TLB Handlers if things don't look good
 **********************************************************************/
 #ifdef CONFIG_ARC_DBG_TLB_PARANOIA
 /*
 * Low Level ASM TLB handler calls this if it finds that HW and SW ASIDS
 * don't match
 */
 void print_asid_mismatch(int mm_asid, int mmu_asid, int is_fast_path)
 {
 	pr_emerg("ASID Mismatch in %s Path Handler: sw-pid=0x%x hw-pid=0x%x\n",
 	       is_fast_path ? "Fast" : "Slow", mm_asid, mmu_asid);
 	__asm__ __volatile__("flag 1");
 }
 void tlb_paranoid_check(unsigned int mm_asid, unsigned long addr)
 {
 	unsigned int mmu_asid;
 	mmu_asid = read_aux_reg(ARC_REG_PID) & 0xff;
 	/*
 	 * At the time of a TLB miss/installation
 	 *   - HW version needs to match SW version
 	 *   - SW needs to have a valid ASID
 	 */
 	if (addr < 0x70000000 &&
 	    ((mm_asid == MM_CTXT_NO_ASID) ||
 	      (mmu_asid != (mm_asid & MM_CTXT_ASID_MASK))))
 		print_asid_mismatch(mm_asid, mmu_asid, 0);
 }
 #endif
--- a/arch/arc/mm/tlbex.S
+++ b/arch/arc/mm/tlbex.S
@ -39,7 +39,6 @@
 #include <asm/arcregs.h>
 #include <asm/cache.h>
 #include <asm/processor.h>
 #include <asm/tlb-mmu1.h>
 #ifdef CONFIG_ISA_ARCOMPACT
 ;-----------------------------------------------------------------
@ -94,11 +93,6 @@ ex_saved_reg1:
 	st_s  r1, [r0, 4]
 	st_s  r2, [r0, 8]
 	st_s  r3, [r0, 12]
 	; VERIFY if the ASID in MMU-PID Reg is same as
 	; one in Linux data structures
 	tlb_paranoid_check_asm
 .endm
 .macro TLBMISS_RESTORE_REGS
@ -147,55 +141,18 @@ ex_saved_reg1:
 #endif
 ;============================================================================
 ;  Troubleshooting Stuff
 ;============================================================================
 ; Linux keeps ASID (Address Space ID) in task->active_mm->context.asid
 ; When Creating TLB Entries, instead of doing 3 dependent loads from memory,
 ; we use the MMU PID Reg to get current ASID.
 ; In bizarre scenarios SW and HW ASID can get out-of-sync which is trouble.
 ; So we try to detect this in TLB Miss handler
 .macro tlb_paranoid_check_asm
 #ifdef CONFIG_ARC_DBG_TLB_PARANOIA
 	GET_CURR_TASK_ON_CPU  r3
 	ld r0, [r3, TASK_ACT_MM]
 	ld r0, [r0, MM_CTXT+MM_CTXT_ASID]
 	breq r0, 0, 55f	; Error if no ASID allocated
 	lr r1, [ARC_REG_PID]
 	and r1, r1, 0xFF
 	and r2, r0, 0xFF	; MMU PID bits only for comparison
 	breq r1, r2, 5f
 55:
 	; Error if H/w and S/w ASID don't match, but NOT if in kernel mode
 	lr  r2, [erstatus]
 	bbit0 r2, STATUS_U_BIT, 5f
 	; We sure are in troubled waters, Flag the error, but to do so
 	; need to switch to kernel mode stack to call error routine
 	GET_TSK_STACK_BASE   r3, sp
 	; Call printk to shoutout aloud
 	mov r2, 1
 	j print_asid_mismatch
 5:	; ASIDs match so proceed normally
 	nop
 #endif
 .endm
 ;============================================================================
 ;TLB Miss handling Code
 ;============================================================================
 #ifndef PMD_SHIFT
 #define PMD_SHIFT PUD_SHIFT
 #endif
 #ifndef PUD_SHIFT
 #define PUD_SHIFT PGDIR_SHIFT
 #endif
 ;-----------------------------------------------------------------------------
 ; This macro does the page-table lookup for the faulting address.
 ; OUT: r0 = PTE faulted on, r1 = ptr to PTE, r2 = Faulting V-address
@ -203,7 +160,7 @@ ex_saved_reg1:
 	lr  r2, [efa]
-#ifdef ARC_USE_SCRATCH_REG
+#ifdef CONFIG_ISA_ARCV2
 	lr  r1, [ARC_REG_SCRATCH_DATA0] ; current pgd
 #else
 	GET_CURR_TASK_ON_CPU  r1
@ -216,6 +173,24 @@ ex_saved_reg1:
 	tst	r3, r3
 	bz	do_slow_path_pf         ; if no Page Table, do page fault
 #if CONFIG_PGTABLE_LEVELS > 3
 	lsr     r0, r2, PUD_SHIFT	; Bits for indexing into PUD
 	and	r0, r0, (PTRS_PER_PUD - 1)
 	ld.as	r1, [r3, r0]		; PMD entry
 	tst	r1, r1
 	bz	do_slow_path_pf
 	mov	r3, r1
 #endif
 #if CONFIG_PGTABLE_LEVELS > 2
 	lsr     r0, r2, PMD_SHIFT	; Bits for indexing into PMD
 	and	r0, r0, (PTRS_PER_PMD - 1)
 	ld.as	r1, [r3, r0]		; PMD entry
 	tst	r1, r1
 	bz	do_slow_path_pf
 	mov	r3, r1
 #endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	and.f	0, r3, _PAGE_HW_SZ	; Is this Huge PMD (thp)
 	add2.nz	r1, r1, r0
@ -279,7 +254,7 @@ ex_saved_reg1:
 ; Commit the TLB entry into MMU
 .macro COMMIT_ENTRY_TO_MMU
-#if (CONFIG_ARC_MMU_VER < 4)
+#ifdef CONFIG_ARC_MMU_V3
 	/* Get free TLB slot: Set = computed from vaddr, way = random */
 	sr  TLBGetIndex, [ARC_REG_TLBCOMMAND]
@ -375,13 +350,6 @@ ENTRY(EV_TLBMissD)
 	CONV_PTE_TO_TLB
 #if (CONFIG_ARC_MMU_VER == 1)
 	; MMU with 2 way set assoc J-TLB, needs some help in pathetic case of
 	; memcpy where 3 parties contend for 2 ways, ensuing a livelock.
 	; But only for old MMU or one with Metal Fix
 	TLB_WRITE_HEURISTICS
 #endif
 	COMMIT_ENTRY_TO_MMU
 	TLBMISS_RESTORE_REGS
 EV_TLBMissD_fast_ret:	; additional label for VDK OS-kit instrumentation