From a003365cab64b0f7988ac3ccb1da895ce0bece5e Mon Sep 17 00:00:00 2001
From: Gustavo Romero <gromero@linux.ibm.com>
Date: Wed, 4 Sep 2019 00:55:29 -0400
Subject: [PATCH 01/16] powerpc/tm: Add tm-poison test

Add TM selftest to check if FP or VEC register values from one process
can leak into another process when both run on the same CPU.
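
In outline, the test does the following (simplified sketch of the
selftest added below; identifiers such as 'poison' match tm-poison.c):

  cpu_set_t cpuset;

  CPU_ZERO(&cpuset);
  CPU_SET(0, &cpuset);
  sched_setaffinity(0, sizeof(cpuset), &cpuset);  /* pin to CPU 0 */

  if (fork() == 0) {
          /* Child: poison f31/vr31 forever, yielding between writes */
          while (1) {
                  sched_yield();
                  asm("mtvsrd 31, %0; mtvsrd 63, %0" : : "r" (poison));
          }
  }

  /*
   * Parent: set f31/vr31 to 1, then loop for up to 2 minutes doing
   * tbegin./read/tabort.; any value other than 1 indicates a leak of
   * the child's poison through the FP/VEC checkpointed state.
   */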

Signed-off-by: Gustavo Romero <gromero@linux.ibm.com>
Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190904045529.23002-3-gromero@linux.vnet.ibm.com
---
 tools/testing/selftests/powerpc/tm/.gitignore |   1 +
 tools/testing/selftests/powerpc/tm/Makefile   |   2 +-
 .../testing/selftests/powerpc/tm/tm-poison.c  | 179 ++++++++++++++++++
 3 files changed, 181 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/tm/tm-poison.c

diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore
index 951fe855f7cd..98f2708d86cc 100644
--- a/tools/testing/selftests/powerpc/tm/.gitignore
+++ b/tools/testing/selftests/powerpc/tm/.gitignore
@@ -17,3 +17,4 @@ tm-vmx-unavail
 tm-unavailable
 tm-trap
 tm-sigreturn
+tm-poison
diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile
index c0734ed0ef56..b15a1a325bd0 100644
--- a/tools/testing/selftests/powerpc/tm/Makefile
+++ b/tools/testing/selftests/powerpc/tm/Makefile
@@ -5,7 +5,7 @@ SIGNAL_CONTEXT_CHK_TESTS := tm-signal-context-chk-gpr tm-signal-context-chk-fpu
 TEST_GEN_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack \
 	tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail tm-unavailable tm-trap \
 	$(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn tm-signal-sigreturn-nt \
-	tm-signal-context-force-tm
+	tm-signal-context-force-tm tm-poison
 
 top_srcdir = ../../../../..
 include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/tm/tm-poison.c b/tools/testing/selftests/powerpc/tm/tm-poison.c
new file mode 100644
index 000000000000..977558497c16
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-poison.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2019, Gustavo Romero, Michael Neuling, IBM Corp.
+ *
+ * This test spawns two processes, both pinned to the same CPU (CPU 0).
+ * The child loops, writing a known value, called poison, to FP register
+ * f31 and to VMX/VEC/Altivec register vr31, then calling the sched_yield
+ * syscall to give the CPU back to the parent.
+ * The parent sets f31 and vr31 to 1 and, in a loop, checks that they
+ * remain 1 until a given timeout (2m) expires. If the issue is present,
+ * the child's poison will leak into the parent's f31 or vr31 registers;
+ * otherwise, they will keep their expected value of 1.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <sched.h>
+#include <sys/types.h>
+#include <signal.h>
+#include <inttypes.h>
+
+#include "tm.h"
+
+int tm_poison_test(void)
+{
+	int pid;
+	cpu_set_t cpuset;
+	uint64_t poison = 0xdeadbeefc0dec0fe;
+	uint64_t unknown = 0;
+	bool fail_fp = false;
+	bool fail_vr = false;
+
+	SKIP_IF(!have_htm());
+
+	/* Attach both Child and Parent to CPU 0 */
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	sched_setaffinity(0, sizeof(cpuset), &cpuset);
+
+	pid = fork();
+	if (!pid) {
+		/**
+		 * child
+		 */
+		while (1) {
+			sched_yield();
+			asm (
+				"mtvsrd 31, %[poison];" // f31 = poison
+				"mtvsrd 63, %[poison];" // vr31 = poison
+
+				: : [poison] "r" (poison) : );
+		}
+	}
+
+	/**
+	 * parent
+	 */
+	asm (
+		/*
+		 * Set r3, r4, and f31 to known value 1 before entering
+		 * in transaction. They won't be written after that.
+		 */
+		"       li      3, 0x1          ;"
+		"       li      4, 0x1          ;"
+		"       mtvsrd  31, 4           ;"
+
+		/*
+		 * The Time Base (TB) is a 64-bit counter register that is
+		 * independent of the CPU clock and which is incremented
+		 * at a frequency of 512000000 Hz, so every 1.953125ns.
+		 * So it's necessary 120s/0.000000001953125s = 61440000000
+		 * increments to get a 2 minutes timeout. Below we set that
+		 * value in r5 and then use r6 to track initial TB value,
+		 * updating TB values in r7 at every iteration and comparing it
+		 * to r6. When r7 (current) - r6 (initial) > 61440000000 we bail
+		 * out since for sure we spent already 2 minutes in the loop.
+		 * SPR 268 is the TB register.
+		 */
+		"       lis     5, 14           ;"
+		"       ori     5, 5, 19996     ;"
+		"       sldi    5, 5, 16        ;" // r5 = 61440000000
+
+		"       mfspr   6, 268          ;" // r6 (TB initial)
+		"1:     mfspr   7, 268          ;" // r7 (TB current)
+		"       subf    7, 6, 7         ;" // r7 - r6 > 61440000000 ?
+		"       cmpd    7, 5            ;"
+		"       bgt     3f              ;" // yes, exit
+
+		/*
+		 * Main loop to check f31
+		 */
+		"       tbegin.                 ;" // no, try again
+		"       beq     1b              ;" // restart if no timeout
+		"       mfvsrd  3, 31           ;" // read f31
+		"       cmpd    3, 4            ;" // f31 == 1 ?
+		"       bne     2f              ;" // broken :-(
+		"       tabort. 3               ;" // try another transaction
+		"2:     tend.                   ;" // commit transaction
+		"3:     mr    %[unknown], 3     ;" // record r3
+
+		: [unknown] "=r" (unknown)
+		:
+		: "cr0", "r3", "r4", "r5", "r6", "r7", "vs31"
+
+		);
+
+	/*
+	 * On leak 'unknown' will contain 'poison' value from child,
+	 * otherwise (no leak) 'unknown' will contain the same value
+	 * as r3 before entering in transactional mode, i.e. 0x1.
+	 */
+	fail_fp = unknown != 0x1;
+	if (fail_fp)
+		printf("Unknown value %#"PRIx64" leaked into f31!\n", unknown);
+	else
+		printf("Good, no poison or leaked value into FP registers\n");
+
+	asm (
+		/*
+		 * Set r3, r4, and vr31 to known value 1 before entering
+		 * in transaction. They won't be written after that.
+		 */
+		"       li      3, 0x1          ;"
+		"       li      4, 0x1          ;"
+		"       mtvsrd  63, 4           ;"
+
+		"       lis     5, 14           ;"
+		"       ori     5, 5, 19996     ;"
+		"       sldi    5, 5, 16        ;" // r5 = 61440000000
+
+		"       mfspr   6, 268          ;" // r6 (TB initial)
+		"1:     mfspr   7, 268          ;" // r7 (TB current)
+		"       subf    7, 6, 7         ;" // r7 - r6 > 61440000000 ?
+		"       cmpd    7, 5            ;"
+		"       bgt     3f              ;" // yes, exit
+
+		/*
+		 * Main loop to check vr31
+		 */
+		"       tbegin.                 ;" // no, try again
+		"       beq     1b              ;" // restart if no timeout
+		"       mfvsrd  3, 63           ;" // read vr31
+		"       cmpd    3, 4            ;" // vr31 == 1 ?
+		"       bne     2f              ;" // broken :-(
+		"       tabort. 3               ;" // try another transaction
+		"2:     tend.                   ;" // commit transaction
+		"3:     mr    %[unknown], 3     ;" // record r3
+
+		: [unknown] "=r" (unknown)
+		:
+		: "cr0", "r3", "r4", "r5", "r6", "r7", "vs63"
+
+		);
+
+	/*
+	 * On leak 'unknown' will contain 'poison' value from child,
+	 * otherwise (no leak) 'unknown' will contain the same value
+	 * as r3 before entering in transactional mode, i.e. 0x1.
+	 */
+	fail_vr = unknown != 0x1;
+	if (fail_vr)
+		printf("Unknown value %#"PRIx64" leaked into vr31!\n", unknown);
+	else
+		printf("Good, no poison or leaked value into VEC registers\n");
+
+	kill(pid, SIGKILL);
+
+	return (fail_fp | fail_vr);
+}
+
+int main(int argc, char *argv[])
+{
+	/* Test completes in about 4m */
+	test_harness_set_timeout(250);
+	return test_harness(tm_poison_test, "tm_poison_test");
+}

From 7aec584eaf1cc1a527dcbe7d80f2e44e3bfcfe1d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Wed, 18 Sep 2019 19:31:03 +0530
Subject: [PATCH 02/16] powerpc/book3s64/radix: Remove WARN_ON in
 destroy_context()

On failed task initialization due to memory allocation failures, we can
call into destroy_context() with process_tb entry already populated.
This patch forces the process_tb entry to zero in destroy_context().
With this patch, we lose the ability to track if we are destroying a
context without flushing the process table entry.
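
i.e. in destroy_context(), the radix case becomes (see the hunk below):

  if (radix_enabled())
          process_tb[mm->context.id].prtb0 = 0;  /* was: WARN_ON(... != 0) */
  else
          subpage_prot_free(mm);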

  WARNING: CPU: 4 PID: 6368 at arch/powerpc/mm/mmu_context_book3s64.c:246 destroy_context+0x58/0x340
  NIP [c0000000000875f8] destroy_context+0x58/0x340
  LR [c00000000013da18] __mmdrop+0x78/0x270
  Call Trace:
  [c000000f7db77c80] [c00000000013da18] __mmdrop+0x78/0x270
  [c000000f7db77cf0] [c0000000004d6a34] __do_execve_file.isra.13+0xbd4/0x1000
  [c000000f7db77e00] [c0000000004d7428] sys_execve+0x58/0x70
  [c000000f7db77e30] [c00000000000b388] system_call+0x5c/0x70

Reported-by: Priya M.A <priyama2@in.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
[mpe: Reformat/tweak comment wording]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190918140103.24395-1-aneesh.kumar@linux.ibm.com
---
 arch/powerpc/mm/book3s64/mmu_context.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
index 2d0cb5ba9a47..0ba30b8b935b 100644
--- a/arch/powerpc/mm/book3s64/mmu_context.c
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -256,8 +256,21 @@ void destroy_context(struct mm_struct *mm)
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 	WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
 #endif
+	/*
+	 * For tasks which were successfully initialized we end up calling
+	 * arch_exit_mmap() which clears the process table entry. And
+	 * arch_exit_mmap() is called before the required fullmm TLB flush
+	 * which does a RIC=2 flush. Hence for an initialized task, we do clear
+	 * any cached process table entries.
+	 *
+	 * The condition below handles the error case during task init. We have
+	 * set the process table entry early and if we fail a task
+	 * initialization, we need to ensure the process table entry is zeroed.
+	 * We need not worry about process table entry caches because the task
+	 * never ran with the PID value.
+	 */
 	if (radix_enabled())
-		WARN_ON(process_tb[mm->context.id].prtb0 != 0);
+		process_tb[mm->context.id].prtb0 = 0;
 	else
 		subpage_prot_free(mm);
 	destroy_contexts(&mm->context);

From c6fadabb2868f817299ddb338ac15885e25d12d2 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Tue, 17 Sep 2019 10:46:04 +1000
Subject: [PATCH 03/16] powerpc: Fix definition of PCR bits to work with old
 binutils

Commit 388cc6e133132 ("KVM: PPC: Book3S HV: Support POWER6
compatibility mode on POWER7") introduced new macros defining the PCR
bits. When used from assembly files, these definitions lead to build
errors with older versions of binutils that don't support the 'ul'
suffix. This fixes the build errors by updating the definitions to use
the __MASK() macro which selects the appropriate suffix.
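
For reference, __MASK() is defined along these lines in asm/reg.h
(sketch, not the exact kernel text), which is why the same header works
from both C and assembly:

  #ifdef __ASSEMBLY__
  #define __MASK(X)   (1 << (X))    /* plain constant, no 'ul' suffix */
  #else
  #define __MASK(X)   (1UL << (X))  /* full 64-bit mask in C */
  #endif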

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190917004605.22471-1-alistair@popple.id.au
---
 arch/powerpc/include/asm/reg.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index ec3714cf0989..931939af95d1 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -475,9 +475,9 @@
 #define   HMER_DEBUG_TRIG	(1ul << (63 - 17)) /* Debug trigger */
 #define	SPRN_HMEER	0x151	/* Hyp maintenance exception enable reg */
 #define SPRN_PCR	0x152	/* Processor compatibility register */
-#define   PCR_VEC_DIS	(1ul << (63-0))	/* Vec. disable (bit NA since POWER8) */
-#define   PCR_VSX_DIS	(1ul << (63-1))	/* VSX disable (bit NA since POWER8) */
-#define   PCR_TM_DIS	(1ul << (63-2))	/* Trans. memory disable (POWER8) */
+#define   PCR_VEC_DIS	(__MASK(63-0))	/* Vec. disable (bit NA since POWER8) */
+#define   PCR_VSX_DIS	(__MASK(63-1))	/* VSX disable (bit NA since POWER8) */
+#define   PCR_TM_DIS	(__MASK(63-2))	/* Trans. memory disable (POWER8) */
 /*
  * These bits are used in the function kvmppc_set_arch_compat() to specify and
  * determine both the compatibility level which we want to emulate and the

From 13c7bb3c57dcfe779ea5b4b083f6c47753cc5327 Mon Sep 17 00:00:00 2001
From: Jordan Niethe <jniethe5@gmail.com>
Date: Tue, 17 Sep 2019 10:46:05 +1000
Subject: [PATCH 04/16] powerpc/64s: Set reserved PCR bits

Currently the reserved bits of the Processor Compatibility
Register (PCR) are cleared, as per the Programming Note in Section
1.3.3 of version 3.0B of the Power ISA. Because PCR bits must be set
to disable a given feature, clearing the reserved bits makes every new
architecture feature available when running on newer processors that
define new feature bits in the PCR.

For example, to disable the new features added as part of Version 2.07
of the ISA, the corresponding bit in the PCR needs to be set.

As new processor features generally require explicit kernel support
they should be disabled until such support is implemented. Therefore
kernels should set all unknown/reserved bits in the PCR such that any
new architecture features which the kernel does not currently know
about get disabled.

An update is planned to the ISA to clarify that the PCR is an
exception to the Programming Note on reserved bits in Section 1.3.3.
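
Summarising the new convention (values as in the reg.h hunk below,
shown here only as a sketch): PCR_MASK has all architected bits clear
and all reserved bits set, so writing it to the PCR disables every
feature the kernel does not know about:

  #define PCR_HIGH_BITS  (PCR_VEC_DIS | PCR_VSX_DIS | PCR_TM_DIS)
  #define PCR_LOW_BITS   (PCR_ARCH_207 | PCR_ARCH_206 | PCR_ARCH_205)
  #define PCR_MASK       ~(PCR_HIGH_BITS | PCR_LOW_BITS)  /* reserved bits */

  /* "Reset" the PCR: known bits 0 (no known feature disabled),
   * unknown/reserved bits 1 (unknown features disabled). */
  mtspr(SPRN_PCR, PCR_MASK);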

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
Tested-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190917004605.22471-2-alistair@popple.id.au
---
 arch/powerpc/include/asm/reg.h          |  3 +++
 arch/powerpc/kernel/cpu_setup_power.S   |  6 ++++++
 arch/powerpc/kernel/dt_cpu_ftrs.c       |  3 ++-
 arch/powerpc/kvm/book3s_hv.c            | 11 +++++++----
 arch/powerpc/kvm/book3s_hv_nested.c     |  6 +++---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 10 ++++++----
 6 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 931939af95d1..b3cbb1136bce 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -478,6 +478,7 @@
 #define   PCR_VEC_DIS	(__MASK(63-0))	/* Vec. disable (bit NA since POWER8) */
 #define   PCR_VSX_DIS	(__MASK(63-1))	/* VSX disable (bit NA since POWER8) */
 #define   PCR_TM_DIS	(__MASK(63-2))	/* Trans. memory disable (POWER8) */
+#define   PCR_HIGH_BITS	(PCR_VEC_DIS | PCR_VSX_DIS | PCR_TM_DIS)
 /*
  * These bits are used in the function kvmppc_set_arch_compat() to specify and
  * determine both the compatibility level which we want to emulate and the
@@ -486,6 +487,8 @@
 #define   PCR_ARCH_207	0x8		/* Architecture 2.07 */
 #define   PCR_ARCH_206	0x4		/* Architecture 2.06 */
 #define   PCR_ARCH_205	0x2		/* Architecture 2.05 */
+#define   PCR_LOW_BITS	(PCR_ARCH_207 | PCR_ARCH_206 | PCR_ARCH_205)
+#define   PCR_MASK	~(PCR_HIGH_BITS | PCR_LOW_BITS)	/* PCR Reserved Bits */
 #define	SPRN_HEIR	0x153	/* Hypervisor Emulated Instruction Register */
 #define SPRN_TLBINDEXR	0x154	/* P7 TLB control register */
 #define SPRN_TLBVPNR	0x155	/* P7 TLB control register */
diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index 3239a9fe6c1c..a460298c7ddb 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -23,6 +23,7 @@ _GLOBAL(__setup_cpu_power7)
 	beqlr
 	li	r0,0
 	mtspr	SPRN_LPID,r0
+	LOAD_REG_IMMEDIATE(r0, PCR_MASK)
 	mtspr	SPRN_PCR,r0
 	mfspr	r3,SPRN_LPCR
 	li	r4,(LPCR_LPES1 >> LPCR_LPES_SH)
@@ -37,6 +38,7 @@ _GLOBAL(__restore_cpu_power7)
 	beqlr
 	li	r0,0
 	mtspr	SPRN_LPID,r0
+	LOAD_REG_IMMEDIATE(r0, PCR_MASK)
 	mtspr	SPRN_PCR,r0
 	mfspr	r3,SPRN_LPCR
 	li	r4,(LPCR_LPES1 >> LPCR_LPES_SH)
@@ -54,6 +56,7 @@ _GLOBAL(__setup_cpu_power8)
 	beqlr
 	li	r0,0
 	mtspr	SPRN_LPID,r0
+	LOAD_REG_IMMEDIATE(r0, PCR_MASK)
 	mtspr	SPRN_PCR,r0
 	mfspr	r3,SPRN_LPCR
 	ori	r3, r3, LPCR_PECEDH
@@ -76,6 +79,7 @@ _GLOBAL(__restore_cpu_power8)
 	beqlr
 	li	r0,0
 	mtspr	SPRN_LPID,r0
+	LOAD_REG_IMMEDIATE(r0, PCR_MASK)
 	mtspr	SPRN_PCR,r0
 	mfspr   r3,SPRN_LPCR
 	ori	r3, r3, LPCR_PECEDH
@@ -98,6 +102,7 @@ _GLOBAL(__setup_cpu_power9)
 	mtspr	SPRN_PSSCR,r0
 	mtspr	SPRN_LPID,r0
 	mtspr	SPRN_PID,r0
+	LOAD_REG_IMMEDIATE(r0, PCR_MASK)
 	mtspr	SPRN_PCR,r0
 	mfspr	r3,SPRN_LPCR
 	LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE  | LPCR_HEIC)
@@ -123,6 +128,7 @@ _GLOBAL(__restore_cpu_power9)
 	mtspr	SPRN_PSSCR,r0
 	mtspr	SPRN_LPID,r0
 	mtspr	SPRN_PID,r0
+	LOAD_REG_IMMEDIATE(r0, PCR_MASK)
 	mtspr	SPRN_PCR,r0
 	mfspr   r3,SPRN_LPCR
 	LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC)
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index bd95318d2202..bceee2fde885 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -101,7 +101,7 @@ static void __restore_cpu_cpufeatures(void)
 	if (hv_mode) {
 		mtspr(SPRN_LPID, 0);
 		mtspr(SPRN_HFSCR, system_registers.hfscr);
-		mtspr(SPRN_PCR, 0);
+		mtspr(SPRN_PCR, PCR_MASK);
 	}
 	mtspr(SPRN_FSCR, system_registers.fscr);
 
@@ -144,6 +144,7 @@ static void __init cpufeatures_setup_cpu(void)
 		mtspr(SPRN_HFSCR, 0);
 	}
 	mtspr(SPRN_FSCR, 0);
+	mtspr(SPRN_PCR, PCR_MASK);
 
 	/*
 	 * LPCR does not get cleared, to match behaviour with secondaries
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index efd8f93bc9dc..709cf1fd4cf4 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -401,8 +401,11 @@ static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
 
 	spin_lock(&vc->lock);
 	vc->arch_compat = arch_compat;
-	/* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit */
-	vc->pcr = host_pcr_bit - guest_pcr_bit;
+	/*
+	 * Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit
+	 * Also set all reserved PCR bits
+	 */
+	vc->pcr = (host_pcr_bit - guest_pcr_bit) | PCR_MASK;
 	spin_unlock(&vc->lock);
 
 	return 0;
@@ -3410,7 +3413,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
 	}
 
 	if (vc->pcr)
-		mtspr(SPRN_PCR, vc->pcr);
+		mtspr(SPRN_PCR, vc->pcr | PCR_MASK);
 	mtspr(SPRN_DPDES, vc->dpdes);
 	mtspr(SPRN_VTB, vc->vtb);
 
@@ -3490,7 +3493,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
 	vc->vtb = mfspr(SPRN_VTB);
 	mtspr(SPRN_DPDES, 0);
 	if (vc->pcr)
-		mtspr(SPRN_PCR, 0);
+		mtspr(SPRN_PCR, PCR_MASK);
 
 	if (vc->tb_offset_applied) {
 		u64 new_tb = mftb() - vc->tb_offset_applied;
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index fff90f2c3de2..cdf30c6eaf54 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -29,7 +29,7 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
 {
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
-	hr->pcr = vc->pcr;
+	hr->pcr = vc->pcr | PCR_MASK;
 	hr->dpdes = vc->dpdes;
 	hr->hfscr = vcpu->arch.hfscr;
 	hr->tb_offset = vc->tb_offset;
@@ -65,7 +65,7 @@ static void byteswap_hv_regs(struct hv_guest_state *hr)
 	hr->lpid = swab32(hr->lpid);
 	hr->vcpu_token = swab32(hr->vcpu_token);
 	hr->lpcr = swab64(hr->lpcr);
-	hr->pcr = swab64(hr->pcr);
+	hr->pcr = swab64(hr->pcr) | PCR_MASK;
 	hr->amor = swab64(hr->amor);
 	hr->dpdes = swab64(hr->dpdes);
 	hr->hfscr = swab64(hr->hfscr);
@@ -148,7 +148,7 @@ static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
 {
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
-	vc->pcr = hr->pcr;
+	vc->pcr = hr->pcr | PCR_MASK;
 	vc->dpdes = hr->dpdes;
 	vcpu->arch.hfscr = hr->hfscr;
 	vcpu->arch.dawr = hr->dawr0;
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 9a05b0d932ef..74a9cfe84aee 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -644,8 +644,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 
 	/* Load guest PCR value to select appropriate compat mode */
 37:	ld	r7, VCORE_PCR(r5)
-	cmpdi	r7, 0
+	LOAD_REG_IMMEDIATE(r6, PCR_MASK)
+	cmpld	r7, r6
 	beq	38f
+	or	r7, r7, r6
 	mtspr	SPRN_PCR, r7
 38:
 
@@ -1913,10 +1915,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 	/* Reset PCR */
 	ld	r0, VCORE_PCR(r5)
-	cmpdi	r0, 0
+	LOAD_REG_IMMEDIATE(r6, PCR_MASK)
+	cmpld	r0, r6
 	beq	18f
-	li	r0, 0
-	mtspr	SPRN_PCR, r0
+	mtspr	SPRN_PCR, r6
 18:
 	/* Signal secondary CPUs to continue */
 	stb	r0,VCORE_IN_GUEST(r5)

From 4c0f5d1eb4072871c34530358df45f05ab80edd6 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Wed, 21 Aug 2019 10:20:00 +0000
Subject: [PATCH 05/16] powerpc/mm: Add a helper to select PAGE_KERNEL_RO or
 PAGE_READONLY

In a couple of places there is a need to select whether read-only
protection of shadow pages is performed with PAGE_KERNEL_RO or with
PAGE_READONLY.

Add a helper to avoid duplicating the choice.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: stable@vger.kernel.org
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/9f33f44b9cd741c4a02b3dce7b8ef9438fe2cd2a.1566382750.git.christophe.leroy@c-s.fr
---
 arch/powerpc/mm/kasan/kasan_init_32.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index 802387b231ad..e8ab3cc5f6e4 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -12,6 +12,14 @@
 #include <asm/code-patching.h>
 #include <mm/mmu_decl.h>
 
+static pgprot_t kasan_prot_ro(void)
+{
+	if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
+		return PAGE_READONLY;
+
+	return PAGE_KERNEL_RO;
+}
+
 static void kasan_populate_pte(pte_t *ptep, pgprot_t prot)
 {
 	unsigned long va = (unsigned long)kasan_early_shadow_page;
@@ -26,6 +34,7 @@ static int __ref kasan_init_shadow_page_tables(unsigned long k_start, unsigned l
 {
 	pmd_t *pmd;
 	unsigned long k_cur, k_next;
+	pgprot_t prot = kasan_prot_ro();
 
 	pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start);
 
@@ -43,10 +52,7 @@ static int __ref kasan_init_shadow_page_tables(unsigned long k_start, unsigned l
 
 		if (!new)
 			return -ENOMEM;
-		if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
-			kasan_populate_pte(new, PAGE_READONLY);
-		else
-			kasan_populate_pte(new, PAGE_KERNEL_RO);
+		kasan_populate_pte(new, prot);
 
 		smp_wmb(); /* See comment in __pte_alloc */
 
@@ -103,10 +109,9 @@ static int __ref kasan_init_region(void *start, size_t size)
 
 static void __init kasan_remap_early_shadow_ro(void)
 {
-	if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
-		kasan_populate_pte(kasan_early_shadow_pte, PAGE_READONLY);
-	else
-		kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL_RO);
+	pgprot_t prot = kasan_prot_ro();
+
+	kasan_populate_pte(kasan_early_shadow_pte, prot);
 
 	flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END);
 }

From cbd18991e24fea2c31da3bb117c83e4a3538cd11 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Wed, 21 Aug 2019 10:20:11 +0000
Subject: [PATCH 06/16] powerpc/mm: Fix an Oops in kasan_mmu_init()

Uncompressing Kernel Image ... OK
   Loading Device Tree to 01ff7000, end 01fff74f ... OK
[    0.000000] printk: bootconsole [udbg0] enabled
[    0.000000] BUG: Unable to handle kernel data access at 0xf818c000
[    0.000000] Faulting instruction address: 0xc0013c7c
[    0.000000] Thread overran stack, or stack corrupted
[    0.000000] Oops: Kernel access of bad area, sig: 11 [#1]
[    0.000000] BE PAGE_SIZE=16K PREEMPT
[    0.000000] Modules linked in:
[    0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 5.3.0-rc4-s3k-dev-00743-g5abe4a3e8fd3-dirty #2080
[    0.000000] NIP:  c0013c7c LR: c0013310 CTR: 00000000
[    0.000000] REGS: c0c5ff38 TRAP: 0300   Not tainted  (5.3.0-rc4-s3k-dev-00743-g5abe4a3e8fd3-dirty)
[    0.000000] MSR:  00001032 <ME,IR,DR,RI>  CR: 99033955  XER: 80002100
[    0.000000] DAR: f818c000 DSISR: 82000000
[    0.000000] GPR00: c0013310 c0c5fff0 c0ad6ac0 c0c600c0 f818c031 82000000 00000000 ffffffff
[    0.000000] GPR08: 00000000 f1f1f1f1 c0013c2c c0013304 99033955 00400008 00000000 07ff9598
[    0.000000] GPR16: 00000000 07ffb94c 00000000 00000000 00000000 00000000 00000000 f818cfb2
[    0.000000] GPR24: 00000000 00000000 00001000 ffffffff 00000000 c07dbf80 00000000 f818c000
[    0.000000] NIP [c0013c7c] do_page_fault+0x50/0x904
[    0.000000] LR [c0013310] handle_page_fault+0xc/0x38
[    0.000000] Call Trace:
[    0.000000] Instruction dump:
[    0.000000] be010080 91410014 553fe8fe 3d40c001 3d20f1f1 7d800026 394a3c2c 3fffe000
[    0.000000] 6129f1f1 900100c4 9181007c 91410018 <913f0000> 3d2001f4 6129f4f4 913f0004

Don't map the early shadow page read-only yet when creating the new
page tables for the real shadow memory. Otherwise, the memblock
allocations that immediately follow, to create the real shadow pages
that are about to replace the early shadow page, trigger a page fault
if they fall into the region currently being worked on.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Fixes: 2edb16efc899 ("powerpc/32: Add KASAN support")
Cc: stable@vger.kernel.org
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/fe86886fb8db44360417cee0dc515ad47ca6ef72.1566382750.git.christophe.leroy@c-s.fr
---
 arch/powerpc/mm/kasan/kasan_init_32.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index e8ab3cc5f6e4..0e6ed4413eea 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -34,7 +34,7 @@ static int __ref kasan_init_shadow_page_tables(unsigned long k_start, unsigned l
 {
 	pmd_t *pmd;
 	unsigned long k_cur, k_next;
-	pgprot_t prot = kasan_prot_ro();
+	pgprot_t prot = slab_is_available() ? kasan_prot_ro() : PAGE_KERNEL;
 
 	pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start);
 
@@ -110,9 +110,22 @@ static int __ref kasan_init_region(void *start, size_t size)
 static void __init kasan_remap_early_shadow_ro(void)
 {
 	pgprot_t prot = kasan_prot_ro();
+	unsigned long k_start = KASAN_SHADOW_START;
+	unsigned long k_end = KASAN_SHADOW_END;
+	unsigned long k_cur;
+	phys_addr_t pa = __pa(kasan_early_shadow_page);
 
 	kasan_populate_pte(kasan_early_shadow_pte, prot);
 
+	for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
+		pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
+		pte_t *ptep = pte_offset_kernel(pmd, k_cur);
+
+		if ((pte_val(*ptep) & PTE_RPN_MASK) != pa)
+			continue;
+
+		__set_pte_at(&init_mm, k_cur, ptep, pfn_pte(PHYS_PFN(pa), prot), 0);
+	}
 	flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END);
 }
 

From 3a83f677a6eeff65751b29e3648d7c69c3be83f3 Mon Sep 17 00:00:00 2001
From: Michael Roth <mdroth@linux.vnet.ibm.com>
Date: Wed, 11 Sep 2019 17:31:55 -0500
Subject: [PATCH 07/16] KVM: PPC: Book3S HV: use smp_mb() when setting/clearing
 host_ipi flag

On a 2-socket Power9 system with 32 cores/128 threads (SMT4) and 1TB
of memory running the following guest configs:

  guest A:
    - 224GB of memory
    - 56 VCPUs (sockets=1,cores=28,threads=2), where:
      VCPUs 0-1 are pinned to CPUs 0-3,
      VCPUs 2-3 are pinned to CPUs 4-7,
      ...
      VCPUs 54-55 are pinned to CPUs 108-111

  guest B:
    - 4GB of memory
    - 4 VCPUs (sockets=1,cores=4,threads=1)

with the following workloads (with KSM and THP enabled in all):

  guest A:
    stress --cpu 40 --io 20 --vm 20 --vm-bytes 512M

  guest B:
    stress --cpu 4 --io 4 --vm 4 --vm-bytes 512M

  host:
    stress --cpu 4 --io 4 --vm 2 --vm-bytes 256M

the below soft-lockup traces were observed after an hour or so and
persisted until the host was reset (this was found to be reliably
reproducible for this configuration, for kernels 4.15, 4.18, 5.0,
and 5.3-rc5):

  [ 1253.183290] rcu: INFO: rcu_sched self-detected stall on CPU
  [ 1253.183319] rcu:     124-....: (5250 ticks this GP) idle=10a/1/0x4000000000000002 softirq=5408/5408 fqs=1941
  [ 1256.287426] watchdog: BUG: soft lockup - CPU#105 stuck for 23s! [CPU 52/KVM:19709]
  [ 1264.075773] watchdog: BUG: soft lockup - CPU#24 stuck for 23s! [worker:19913]
  [ 1264.079769] watchdog: BUG: soft lockup - CPU#31 stuck for 23s! [worker:20331]
  [ 1264.095770] watchdog: BUG: soft lockup - CPU#45 stuck for 23s! [worker:20338]
  [ 1264.131773] watchdog: BUG: soft lockup - CPU#64 stuck for 23s! [avocado:19525]
  [ 1280.408480] watchdog: BUG: soft lockup - CPU#124 stuck for 22s! [ksmd:791]
  [ 1316.198012] rcu: INFO: rcu_sched self-detected stall on CPU
  [ 1316.198032] rcu:     124-....: (21003 ticks this GP) idle=10a/1/0x4000000000000002 softirq=5408/5408 fqs=8243
  [ 1340.411024] watchdog: BUG: soft lockup - CPU#124 stuck for 22s! [ksmd:791]
  [ 1379.212609] rcu: INFO: rcu_sched self-detected stall on CPU
  [ 1379.212629] rcu:     124-....: (36756 ticks this GP) idle=10a/1/0x4000000000000002 softirq=5408/5408 fqs=14714
  [ 1404.413615] watchdog: BUG: soft lockup - CPU#124 stuck for 22s! [ksmd:791]
  [ 1442.227095] rcu: INFO: rcu_sched self-detected stall on CPU
  [ 1442.227115] rcu:     124-....: (52509 ticks this GP) idle=10a/1/0x4000000000000002 softirq=5408/5408 fqs=21403
  [ 1455.111787] INFO: task worker:19907 blocked for more than 120 seconds.
  [ 1455.111822]       Tainted: G             L    5.3.0-rc5-mdr-vanilla+ #1
  [ 1455.111833] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  [ 1455.111884] INFO: task worker:19908 blocked for more than 120 seconds.
  [ 1455.111905]       Tainted: G             L    5.3.0-rc5-mdr-vanilla+ #1
  [ 1455.111925] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  [ 1455.111966] INFO: task worker:20328 blocked for more than 120 seconds.
  [ 1455.111986]       Tainted: G             L    5.3.0-rc5-mdr-vanilla+ #1
  [ 1455.111998] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  [ 1455.112048] INFO: task worker:20330 blocked for more than 120 seconds.
  [ 1455.112068]       Tainted: G             L    5.3.0-rc5-mdr-vanilla+ #1
  [ 1455.112097] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  [ 1455.112138] INFO: task worker:20332 blocked for more than 120 seconds.
  [ 1455.112159]       Tainted: G             L    5.3.0-rc5-mdr-vanilla+ #1
  [ 1455.112179] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  [ 1455.112210] INFO: task worker:20333 blocked for more than 120 seconds.
  [ 1455.112231]       Tainted: G             L    5.3.0-rc5-mdr-vanilla+ #1
  [ 1455.112242] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  [ 1455.112282] INFO: task worker:20335 blocked for more than 120 seconds.
  [ 1455.112303]       Tainted: G             L    5.3.0-rc5-mdr-vanilla+ #1
  [ 1455.112332] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  [ 1455.112372] INFO: task worker:20336 blocked for more than 120 seconds.
  [ 1455.112392]       Tainted: G             L    5.3.0-rc5-mdr-vanilla+ #1

CPUs 45, 24, and 124 are stuck on spin locks, likely held by
CPUs 105 and 31.

CPUs 105 and 31 are stuck in smp_call_function_many(), waiting on
target CPU 42. For instance:

  # CPU 105 registers (via xmon)
  R00 = c00000000020b20c   R16 = 00007d1bcd800000
  R01 = c00000363eaa7970   R17 = 0000000000000001
  R02 = c0000000019b3a00   R18 = 000000000000006b
  R03 = 000000000000002a   R19 = 00007d537d7aecf0
  R04 = 000000000000002a   R20 = 60000000000000e0
  R05 = 000000000000002a   R21 = 0801000000000080
  R06 = c0002073fb0caa08   R22 = 0000000000000d60
  R07 = c0000000019ddd78   R23 = 0000000000000001
  R08 = 000000000000002a   R24 = c00000000147a700
  R09 = 0000000000000001   R25 = c0002073fb0ca908
  R10 = c000008ffeb4e660   R26 = 0000000000000000
  R11 = c0002073fb0ca900   R27 = c0000000019e2464
  R12 = c000000000050790   R28 = c0000000000812b0
  R13 = c000207fff623e00   R29 = c0002073fb0ca808
  R14 = 00007d1bbee00000   R30 = c0002073fb0ca800
  R15 = 00007d1bcd600000   R31 = 0000000000000800
  pc  = c00000000020b260 smp_call_function_many+0x3d0/0x460
  cfar= c00000000020b270 smp_call_function_many+0x3e0/0x460
  lr  = c00000000020b20c smp_call_function_many+0x37c/0x460
  msr = 900000010288b033   cr  = 44024824
  ctr = c000000000050790   xer = 0000000000000000   trap =  100

CPU 42 is running normally, doing VCPU work:

  # CPU 42 stack trace (via xmon)
  [link register   ] c00800001be17188 kvmppc_book3s_radix_page_fault+0x90/0x2b0 [kvm_hv]
  [c000008ed3343820] c000008ed3343850 (unreliable)
  [c000008ed33438d0] c00800001be11b6c kvmppc_book3s_hv_page_fault+0x264/0xe30 [kvm_hv]
  [c000008ed33439d0] c00800001be0d7b4 kvmppc_vcpu_run_hv+0x8dc/0xb50 [kvm_hv]
  [c000008ed3343ae0] c00800001c10891c kvmppc_vcpu_run+0x34/0x48 [kvm]
  [c000008ed3343b00] c00800001c10475c kvm_arch_vcpu_ioctl_run+0x244/0x420 [kvm]
  [c000008ed3343b90] c00800001c0f5a78 kvm_vcpu_ioctl+0x470/0x7c8 [kvm]
  [c000008ed3343d00] c000000000475450 do_vfs_ioctl+0xe0/0xc70
  [c000008ed3343db0] c0000000004760e4 ksys_ioctl+0x104/0x120
  [c000008ed3343e00] c000000000476128 sys_ioctl+0x28/0x80
  [c000008ed3343e20] c00000000000b388 system_call+0x5c/0x70
  --- Exception: c00 (System Call) at 00007d545cfd7694
  SP (7d53ff7edf50) is in userspace

It was subsequently found that ipi_message[PPC_MSG_CALL_FUNCTION]
was set for CPU 42 by at least 1 of the CPUs waiting in
smp_call_function_many(), but somehow the corresponding
call_single_queue entries were never processed by CPU 42, causing the
callers to spin in csd_lock_wait() indefinitely.

Nick Piggin suggested something similar to the following sequence as
a possible explanation (interleaving of CALL_FUNCTION/RESCHEDULE
IPI messages seems to be most common, but any mix of CALL_FUNCTION and
!CALL_FUNCTION messages could trigger it):

    CPU
      X: smp_muxed_ipi_set_message():
      X:   smp_mb()
      X:   message[RESCHEDULE] = 1
      X: doorbell_global_ipi(42):
      X:   kvmppc_set_host_ipi(42, 1)
      X:   ppc_msgsnd_sync()/smp_mb()
      X:   ppc_msgsnd() -> 42
     42: doorbell_exception(): // from CPU X
     42:   ppc_msgsync()
    105: smp_muxed_ipi_set_message():
    105:   smp_mb()
         // STORE DEFERRED DUE TO RE-ORDERING
  --105:   message[CALL_FUNCTION] = 1
  | 105: doorbell_global_ipi(42):
  | 105:   kvmppc_set_host_ipi(42, 1)
  |  42:   kvmppc_set_host_ipi(42, 0)
  |  42: smp_ipi_demux_relaxed()
  |  42: // returns to executing guest
  |      // RE-ORDERED STORE COMPLETES
  ->105:   message[CALL_FUNCTION] = 1
    105:   ppc_msgsnd_sync()/smp_mb()
    105:   ppc_msgsnd() -> 42
     42: local_paca->kvm_hstate.host_ipi == 0 // IPI ignored
    105: // hangs waiting on 42 to process messages/call_single_queue

This can be prevented with an smp_mb() at the beginning of
kvmppc_set_host_ipi(), such that stores to message[<type>] (or other
state indicated by the host_ipi flag) are ordered vs. the store to
host_ipi.

However, doing so might still allow for the following scenario (not
yet observed):

    CPU
      X: smp_muxed_ipi_set_message():
      X:   smp_mb()
      X:   message[RESCHEDULE] = 1
      X: doorbell_global_ipi(42):
      X:   kvmppc_set_host_ipi(42, 1)
      X:   ppc_msgsnd_sync()/smp_mb()
      X:   ppc_msgsnd() -> 42
     42: doorbell_exception(): // from CPU X
     42:   ppc_msgsync()
         // STORE DEFERRED DUE TO RE-ORDERING
  -- 42:   kvmppc_set_host_ipi(42, 0)
  |  42: smp_ipi_demux_relaxed()
  | 105: smp_muxed_ipi_set_message():
  | 105:   smp_mb()
  | 105:   message[CALL_FUNCTION] = 1
  | 105: doorbell_global_ipi(42):
  | 105:   kvmppc_set_host_ipi(42, 1)
  |      // RE-ORDERED STORE COMPLETES
  -> 42:   kvmppc_set_host_ipi(42, 0)
     42: // returns to executing guest
    105:   ppc_msgsnd_sync()/smp_mb()
    105:   ppc_msgsnd() -> 42
     42: local_paca->kvm_hstate.host_ipi == 0 // IPI ignored
    105: // hangs waiting on 42 to process messages/call_single_queue

Fixing this scenario would require an smp_mb() *after* clearing
host_ipi flag in kvmppc_set_host_ipi() to order the store vs.
subsequent processing of IPI messages.

To handle both cases, this patch splits kvmppc_set_host_ipi() into
separate set/clear functions, where we execute smp_mb() prior to
setting host_ipi flag, and after clearing host_ipi flag. These
functions pair with each other to synchronize the sender and receiver
sides.

With that change in place the above workload ran for 20 hours without
triggering any lock-ups.
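
Simplified, the pairing at the call sites after this change looks like
the following (sketch based on the doorbell paths modified below):

  /* Sender, e.g. doorbell_global_ipi(): */
  kvmppc_set_host_ipi(cpu);        /* smp_mb(), then host_ipi = 1 */
  ppc_msgsnd_sync();               /* order prior accesses vs. msgsnd */
  ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);

  /* Receiver, e.g. doorbell_exception(): */
  kvmppc_clear_host_ipi(smp_processor_id()); /* host_ipi = 0, then smp_mb() */
  smp_ipi_demux_relaxed();         /* message[] reads ordered after the clear */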

Fixes: 755563bc79c7 ("powerpc/powernv: Fixes for hypervisor doorbell handling") # v4.0
Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190911223155.16045-1-mdroth@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/kvm_ppc.h    | 100 +++++++++++++++++++++++++-
 arch/powerpc/kernel/dbell.c           |   6 +-
 arch/powerpc/kvm/book3s_hv_rm_xics.c  |   2 +-
 arch/powerpc/platforms/powernv/smp.c  |   2 +-
 arch/powerpc/sysdev/xics/icp-native.c |   6 +-
 arch/powerpc/sysdev/xics/icp-opal.c   |   6 +-
 6 files changed, 108 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 8e8514efb124..ee62776e5433 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -452,9 +452,100 @@ static inline u32 kvmppc_get_xics_latch(void)
 	return xirr;
 }
 
-static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
+/*
+ * To avoid the need to unnecessarily exit fully to the host kernel, an IPI to
+ * a CPU thread that's running/napping inside of a guest is by default regarded
+ * as a request to wake the CPU (if needed) and continue execution within the
+ * guest, potentially to process new state like externally-generated
+ * interrupts or IPIs sent from within the guest itself (e.g. H_PROD/H_IPI).
+ *
+ * To force an exit to the host kernel, kvmppc_set_host_ipi() must be called
+ * prior to issuing the IPI to set the corresponding 'host_ipi' flag in the
+ * target CPU's PACA. To avoid unnecessary exits to the host, this flag should
+ * be immediately cleared via kvmppc_clear_host_ipi() by the IPI handler on
+ * the receiving side prior to processing the IPI work.
+ *
+ * NOTE:
+ *
+ * We currently issue an smp_mb() at the beginning of kvmppc_set_host_ipi().
+ * This is to guard against sequences such as the following:
+ *
+ *      CPU
+ *        X: smp_muxed_ipi_set_message():
+ *        X:   smp_mb()
+ *        X:   message[RESCHEDULE] = 1
+ *        X: doorbell_global_ipi(42):
+ *        X:   kvmppc_set_host_ipi(42)
+ *        X:   ppc_msgsnd_sync()/smp_mb()
+ *        X:   ppc_msgsnd() -> 42
+ *       42: doorbell_exception(): // from CPU X
+ *       42:   ppc_msgsync()
+ *      105: smp_muxed_ipi_set_message():
+ *      105:   smp_mb()
+ *           // STORE DEFERRED DUE TO RE-ORDERING
+ *    --105:   message[CALL_FUNCTION] = 1
+ *    | 105: doorbell_global_ipi(42):
+ *    | 105:   kvmppc_set_host_ipi(42)
+ *    |  42:   kvmppc_clear_host_ipi(42)
+ *    |  42: smp_ipi_demux_relaxed()
+ *    |  42: // returns to executing guest
+ *    |      // RE-ORDERED STORE COMPLETES
+ *    ->105:   message[CALL_FUNCTION] = 1
+ *      105:   ppc_msgsnd_sync()/smp_mb()
+ *      105:   ppc_msgsnd() -> 42
+ *       42: local_paca->kvm_hstate.host_ipi == 0 // IPI ignored
+ *      105: // hangs waiting on 42 to process messages/call_single_queue
+ *
+ * We also issue an smp_mb() at the end of kvmppc_clear_host_ipi(). This is
+ * to guard against sequences such as the following (as well as to create
+ * a read-side pairing with the barrier in kvmppc_set_host_ipi()):
+ *
+ *      CPU
+ *        X: smp_muxed_ipi_set_message():
+ *        X:   smp_mb()
+ *        X:   message[RESCHEDULE] = 1
+ *        X: doorbell_global_ipi(42):
+ *        X:   kvmppc_set_host_ipi(42)
+ *        X:   ppc_msgsnd_sync()/smp_mb()
+ *        X:   ppc_msgsnd() -> 42
+ *       42: doorbell_exception(): // from CPU X
+ *       42:   ppc_msgsync()
+ *           // STORE DEFERRED DUE TO RE-ORDERING
+ *    -- 42:   kvmppc_clear_host_ipi(42)
+ *    |  42: smp_ipi_demux_relaxed()
+ *    | 105: smp_muxed_ipi_set_message():
+ *    | 105:   smp_mb()
+ *    | 105:   message[CALL_FUNCTION] = 1
+ *    | 105: doorbell_global_ipi(42):
+ *    | 105:   kvmppc_set_host_ipi(42)
+ *    |      // RE-ORDERED STORE COMPLETES
+ *    -> 42:   kvmppc_clear_host_ipi(42)
+ *       42: // returns to executing guest
+ *      105:   ppc_msgsnd_sync()/smp_mb()
+ *      105:   ppc_msgsnd() -> 42
+ *       42: local_paca->kvm_hstate.host_ipi == 0 // IPI ignored
+ *      105: // hangs waiting on 42 to process messages/call_single_queue
+ */
+static inline void kvmppc_set_host_ipi(int cpu)
 {
-	paca_ptrs[cpu]->kvm_hstate.host_ipi = host_ipi;
+	/*
+	 * order stores of IPI messages vs. setting of host_ipi flag
+	 *
+	 * pairs with the barrier in kvmppc_clear_host_ipi()
+	 */
+	smp_mb();
+	paca_ptrs[cpu]->kvm_hstate.host_ipi = 1;
+}
+
+static inline void kvmppc_clear_host_ipi(int cpu)
+{
+	paca_ptrs[cpu]->kvm_hstate.host_ipi = 0;
+	/*
+	 * order clearing of host_ipi flag vs. processing of IPI messages
+	 *
+	 * pairs with the barrier in kvmppc_set_host_ipi()
+	 */
+	smp_mb();
 }
 
 static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
@@ -486,7 +577,10 @@ static inline u32 kvmppc_get_xics_latch(void)
 	return 0;
 }
 
-static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
+static inline void kvmppc_set_host_ipi(int cpu)
+{}
+
+static inline void kvmppc_clear_host_ipi(int cpu)
 {}
 
 static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c
index 804b1a6196fa..f17ff1200eaa 100644
--- a/arch/powerpc/kernel/dbell.c
+++ b/arch/powerpc/kernel/dbell.c
@@ -33,7 +33,7 @@ void doorbell_global_ipi(int cpu)
 {
 	u32 tag = get_hard_smp_processor_id(cpu);
 
-	kvmppc_set_host_ipi(cpu, 1);
+	kvmppc_set_host_ipi(cpu);
 	/* Order previous accesses vs. msgsnd, which is treated as a store */
 	ppc_msgsnd_sync();
 	ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
@@ -48,7 +48,7 @@ void doorbell_core_ipi(int cpu)
 {
 	u32 tag = cpu_thread_in_core(cpu);
 
-	kvmppc_set_host_ipi(cpu, 1);
+	kvmppc_set_host_ipi(cpu);
 	/* Order previous accesses vs. msgsnd, which is treated as a store */
 	ppc_msgsnd_sync();
 	ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
@@ -84,7 +84,7 @@ void doorbell_exception(struct pt_regs *regs)
 
 	may_hard_irq_enable();
 
-	kvmppc_set_host_ipi(smp_processor_id(), 0);
+	kvmppc_clear_host_ipi(smp_processor_id());
 	__this_cpu_inc(irq_stat.doorbell_irqs);
 
 	smp_ipi_demux_relaxed(); /* already performed the barrier */
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 4d2ec77d806c..287d5911df0f 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -58,7 +58,7 @@ static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu)
 	hcpu = hcore << threads_shift;
 	kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu;
 	smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION);
-	kvmppc_set_host_ipi(hcpu, 1);
+	kvmppc_set_host_ipi(hcpu);
 	smp_mb();
 	kvmhv_rm_send_ipi(hcpu);
 }
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 94cd96b9b7bb..fbd6e6b7bbf2 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -193,7 +193,7 @@ static void pnv_smp_cpu_kill_self(void)
 		 * for coming online, which are handled via
 		 * generic_check_cpu_restart() calls.
 		 */
-		kvmppc_set_host_ipi(cpu, 0);
+		kvmppc_clear_host_ipi(cpu);
 
 		srr1 = pnv_cpu_offline(cpu);
 
diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
index 485569ff7ef1..7d13d2ef5a90 100644
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@@ -140,7 +140,7 @@ static unsigned int icp_native_get_irq(void)
 
 static void icp_native_cause_ipi(int cpu)
 {
-	kvmppc_set_host_ipi(cpu, 1);
+	kvmppc_set_host_ipi(cpu);
 	icp_native_set_qirr(cpu, IPI_PRIORITY);
 }
 
@@ -179,7 +179,7 @@ void icp_native_flush_interrupt(void)
 	if (vec == XICS_IPI) {
 		/* Clear pending IPI */
 		int cpu = smp_processor_id();
-		kvmppc_set_host_ipi(cpu, 0);
+		kvmppc_clear_host_ipi(cpu);
 		icp_native_set_qirr(cpu, 0xff);
 	} else {
 		pr_err("XICS: hw interrupt 0x%x to offline cpu, disabling\n",
@@ -200,7 +200,7 @@ static irqreturn_t icp_native_ipi_action(int irq, void *dev_id)
 {
 	int cpu = smp_processor_id();
 
-	kvmppc_set_host_ipi(cpu, 0);
+	kvmppc_clear_host_ipi(cpu);
 	icp_native_set_qirr(cpu, 0xff);
 
 	return smp_ipi_demux();
diff --git a/arch/powerpc/sysdev/xics/icp-opal.c b/arch/powerpc/sysdev/xics/icp-opal.c
index 8bb8dd7dd6ad..68fd2540b093 100644
--- a/arch/powerpc/sysdev/xics/icp-opal.c
+++ b/arch/powerpc/sysdev/xics/icp-opal.c
@@ -126,7 +126,7 @@ static void icp_opal_cause_ipi(int cpu)
 {
 	int hw_cpu = get_hard_smp_processor_id(cpu);
 
-	kvmppc_set_host_ipi(cpu, 1);
+	kvmppc_set_host_ipi(cpu);
 	opal_int_set_mfrr(hw_cpu, IPI_PRIORITY);
 }
 
@@ -134,7 +134,7 @@ static irqreturn_t icp_opal_ipi_action(int irq, void *dev_id)
 {
 	int cpu = smp_processor_id();
 
-	kvmppc_set_host_ipi(cpu, 0);
+	kvmppc_clear_host_ipi(cpu);
 	opal_int_set_mfrr(get_hard_smp_processor_id(cpu), 0xff);
 
 	return smp_ipi_demux();
@@ -157,7 +157,7 @@ void icp_opal_flush_interrupt(void)
 		if (vec == XICS_IPI) {
 			/* Clear pending IPI */
 			int cpu = smp_processor_id();
-			kvmppc_set_host_ipi(cpu, 0);
+			kvmppc_clear_host_ipi(cpu);
 			opal_int_set_mfrr(get_hard_smp_processor_id(cpu), 0xff);
 		} else {
 			pr_err("XICS: hw interrupt 0x%x to offline cpu, "

From 1211ee61b4a8e60d6dc77211cdcf01906915bfba Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.ibm.com>
Date: Fri, 20 Sep 2019 15:05:22 +0200
Subject: [PATCH 08/16] powerpc/pseries: Read TLB Block Invalidate
 Characteristics

The PAPR document specifies the TLB Block Invalidate Characteristics,
which tell, for each pair of segment base page size and actual page
size, the size of the block the hcall H_BLOCK_REMOVE supports.

These characteristics are loaded at boot time into a new table,
hblkrm_size. The table is separate from the mmu_psize_def because this
is specific to the pseries platform.

A new init function, pseries_lpar_read_hblkrm_characteristics(), is
added to read the characteristics. It is called from
pSeries_setup_arch().
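
The returned buffer is parsed roughly as follows (sketch of the logic
in pseries_lpar_read_hblkrm_characteristics() below; 'buf' and 'len'
are illustrative names):

  int idx = 2;                          /* skip the 2-byte length field */

  while (idx < len) {
          u8 shift = buf[idx++];        /* log2 of the invalidate block size */
          u8 npsize;

          if (!shift)                   /* a zero shift terminates the list */
                  break;

          for (npsize = buf[idx++]; npsize > 0 && idx < len; npsize--)
                  /* each byte encodes a (base psize, psize) pair */
                  check_lp_set_hblkrm(buf[idx++], 1u << shift);
  }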

Fixes: ba2dd8a26baa ("powerpc/pseries/mm: call H_BLOCK_REMOVE")
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190920130523.20441-2-ldufour@linux.ibm.com
---
 arch/powerpc/platforms/pseries/lpar.c    | 140 +++++++++++++++++++++++
 arch/powerpc/platforms/pseries/pseries.h |   1 +
 arch/powerpc/platforms/pseries/setup.c   |   1 +
 3 files changed, 142 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 36b846f6e74e..8e3ac7ae4e11 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -56,6 +56,15 @@ EXPORT_SYMBOL(plpar_hcall);
 EXPORT_SYMBOL(plpar_hcall9);
 EXPORT_SYMBOL(plpar_hcall_norets);
 
+/*
+ * H_BLOCK_REMOVE supported block size for this page size in a segment whose
+ * base page size is that page size.
+ *
+ * The first index is the segment base page size, the second one is the actual
+ * page size.
+ */
+static int hblkrm_size[MMU_PAGE_COUNT][MMU_PAGE_COUNT] __ro_after_init;
+
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 static u8 dtl_mask = DTL_LOG_PREEMPT;
 #else
@@ -1311,6 +1320,137 @@ static void do_block_remove(unsigned long number, struct ppc64_tlb_batch *batch,
 		(void)call_block_remove(pix, param, true);
 }
 
+/*
+ * TLB Block Invalidate Characteristics
+ *
+ * These characteristics define the size of the block the hcall H_BLOCK_REMOVE
+ * is able to process for each couple segment base page size, actual page size.
+ *
+ * The ibm,get-system-parameter property returns a buffer with the
+ * following layout:
+ *
+ * [ 2 bytes size of the RTAS buffer (excluding these 2 bytes) ]
+ * -----------------
+ * TLB Block Invalidate Specifiers:
+ * [ 1 byte LOG base 2 of the TLB invalidate block size being specified ]
+ * [ 1 byte Number of page sizes (N) that are supported for the specified
+ *          TLB invalidate block size ]
+ * [ 1 byte Encoded segment base page size and actual page size
+ *          MSB=0 means 4k segment base page size and actual page size
+ *          MSB=1 the penc value in mmu_psize_def ]
+ * ...
+ * -----------------
+ * Next TLB Block Invalidate Specifiers...
+ * -----------------
+ * [ 0 ]
+ */
+static inline void set_hblkrm_bloc_size(int bpsize, int psize,
+					unsigned int block_size)
+{
+	if (block_size > hblkrm_size[bpsize][psize])
+		hblkrm_size[bpsize][psize] = block_size;
+}
+
+/*
+ * Decode the Encoded segment base page size and actual page size.
+ * PAPR specifies:
+ *   - bit 7 is the L bit
+ *   - bits 0-5 are the penc value
+ * If the L bit is 0, this means 4K segment base page size and actual page size
+ * otherwise the penc value should be read.
+ */
+#define HBLKRM_L_MASK		0x80
+#define HBLKRM_PENC_MASK	0x3f
+static inline void __init check_lp_set_hblkrm(unsigned int lp,
+					      unsigned int block_size)
+{
+	unsigned int bpsize, psize;
+
+	/* First, check the L bit, if not set, this means 4K */
+	if ((lp & HBLKRM_L_MASK) == 0) {
+		set_hblkrm_bloc_size(MMU_PAGE_4K, MMU_PAGE_4K, block_size);
+		return;
+	}
+
+	lp &= HBLKRM_PENC_MASK;
+	for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) {
+		struct mmu_psize_def *def = &mmu_psize_defs[bpsize];
+
+		for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
+			if (def->penc[psize] == lp) {
+				set_hblkrm_bloc_size(bpsize, psize, block_size);
+				return;
+			}
+		}
+	}
+}
+
+#define SPLPAR_TLB_BIC_TOKEN		50
+
+/*
+ * The size of the TLB Block Invalidate Characteristics is variable. But at the
+ * maximum it will be the number of possible page sizes *2 + 10 bytes.
+ * Currently MMU_PAGE_COUNT is 16, which means 42 bytes. Use a cache line size
+ * (128 bytes) for the buffer to get plenty of space.
+ */
+#define SPLPAR_TLB_BIC_MAXLENGTH	128
+
+void __init pseries_lpar_read_hblkrm_characteristics(void)
+{
+	unsigned char local_buffer[SPLPAR_TLB_BIC_MAXLENGTH];
+	int call_status, len, idx, bpsize;
+
+	spin_lock(&rtas_data_buf_lock);
+	memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
+	call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+				NULL,
+				SPLPAR_TLB_BIC_TOKEN,
+				__pa(rtas_data_buf),
+				RTAS_DATA_BUF_SIZE);
+	memcpy(local_buffer, rtas_data_buf, SPLPAR_TLB_BIC_MAXLENGTH);
+	local_buffer[SPLPAR_TLB_BIC_MAXLENGTH - 1] = '\0';
+	spin_unlock(&rtas_data_buf_lock);
+
+	if (call_status != 0) {
+		pr_warn("%s %s Error calling get-system-parameter (0x%x)\n",
+			__FILE__, __func__, call_status);
+		return;
+	}
+
+	/*
+	 * The first two (2) bytes of the data in the buffer are the length of
+	 * the returned data, not counting these first two (2) bytes.
+	 */
+	len = be16_to_cpu(*((u16 *)local_buffer)) + 2;
+	if (len > SPLPAR_TLB_BIC_MAXLENGTH) {
+		pr_warn("%s too large returned buffer %d", __func__, len);
+		return;
+	}
+
+	idx = 2;
+	while (idx < len) {
+		u8 block_shift = local_buffer[idx++];
+		u32 block_size;
+		unsigned int npsize;
+
+		if (!block_shift)
+			break;
+
+		block_size = 1 << block_shift;
+
+		for (npsize = local_buffer[idx++];
+		     npsize > 0 && idx < len; npsize--)
+			check_lp_set_hblkrm((unsigned int) local_buffer[idx++],
+					    block_size);
+	}
+
+	for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
+		for (idx = 0; idx < MMU_PAGE_COUNT; idx++)
+			if (hblkrm_size[bpsize][idx])
+				pr_info("H_BLOCK_REMOVE supports base psize:%d psize:%d block size:%d",
+					bpsize, idx, hblkrm_size[bpsize][idx]);
+}
+
 /*
  * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
  * lock.
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index a6624d4bd9d0..13fa370a87e4 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -112,5 +112,6 @@ static inline unsigned long cmo_get_page_size(void)
 int dlpar_workqueue_init(void);
 
 void pseries_setup_rfi_flush(void);
+void pseries_lpar_read_hblkrm_characteristics(void);
 
 #endif /* _PSERIES_PSERIES_H */
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index f8adcd0e4589..0a40201f315f 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -744,6 +744,7 @@ static void __init pSeries_setup_arch(void)
 
 	pseries_setup_rfi_flush();
 	setup_stf_barrier();
+	pseries_lpar_read_hblkrm_characteristics();
 
 	/* By default, only probe PCI (can be overridden by rtas_pci) */
 	pci_add_flags(PCI_PROBE_ONLY);

From 59545ebe331917afcb1ebbd6f3e5dc3ed51beb05 Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.ibm.com>
Date: Fri, 20 Sep 2019 15:05:23 +0200
Subject: [PATCH 09/16] powerpc/pseries: Call H_BLOCK_REMOVE when supported

Depending on the hardware and the hypervisor, the hcall H_BLOCK_REMOVE
may not be able to process all the page sizes for a segment base page
size, as reported by the TLB Invalidate Characteristics.

For each pair of base segment page size and actual page size, this
characteristic tells us the size of the block the hcall supports.

In the case where the hcall does not support a given pair of base
segment page size and actual page size, it returns H_PARAM, which
leads to a panic like this:

  kernel BUG at /home/srikar/work/linux.git/arch/powerpc/platforms/pseries/lpar.c:466!
  Oops: Exception in kernel mode, sig: 5 [#1]
  BE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
  Modules linked in:
  CPU: 28 PID: 583 Comm: modprobe Not tainted 5.2.0-master #5
  NIP: c0000000000be8dc LR: c0000000000be880 CTR: 0000000000000000
  REGS: c0000007e77fb130 TRAP: 0700  Not tainted (5.2.0-master)
  MSR: 8000000000029032 <SF,EE,ME,IR,DR,RI> CR: 42224824 XER: 20000000
  CFAR: c0000000000be8fc IRQMASK: 0
  GPR00: 0000000022224828 c0000007e77fb3c0 c000000001434d00 0000000000000005
  GPR04: 9000000004fa8c00 0000000000000000 0000000000000003 0000000000000001
  GPR08: c0000007e77fb450 0000000000000000 0000000000000001 ffffffffffffffff
  GPR12: c0000007e77fb450 c00000000edfcb80 0000cd7d3ea30000 c0000000016022b0
  GPR16: 00000000000000b0 0000cd7d3ea30000 0000000000000001 c080001f04f00105
  GPR20: 0000000000000003 0000000000000004 c000000fbeb05f58 c000000001602200
  GPR24: 0000000000000000 0000000000000004 8800000000000000 c000000000c5d148
  GPR28: c000000000000000 8000000000000000 a000000000000000 c0000007e77fb580
  NIP [c0000000000be8dc] .call_block_remove+0x12c/0x220
  LR [c0000000000be880] .call_block_remove+0xd0/0x220
  Call Trace:
    0xc000000fb8c00240 (unreliable)
    .pSeries_lpar_flush_hash_range+0x578/0x670
    .flush_hash_range+0x44/0x100
    .__flush_tlb_pending+0x3c/0xc0
    .zap_pte_range+0x7ec/0x830
    .unmap_page_range+0x3f4/0x540
    .unmap_vmas+0x94/0x120
    .exit_mmap+0xac/0x1f0
    .mmput+0x9c/0x1f0
    .do_exit+0x388/0xd60
    .do_group_exit+0x54/0x100
    .__se_sys_exit_group+0x14/0x20
    system_call+0x5c/0x70
  Instruction dump:
  39400001 38a00000 4800003c 60000000 60420000 7fa9e800 38e00000 419e0014
  7d29d278 7d290074 7929d182 69270001 <0b070000> 7d495378 394a0001 7fa93040

The call to H_BLOCK_REMOVE should only be made for the supported pair
of base segment page size, actual page size and using the correct
maximum block size.

Due to the required complexity in do_block_remove() and
call_block_remove(), and the fact that currently a block size of 8 is
returned by the hypervisor, we only support a block size of 8 for the
H_BLOCK_REMOVE hcall.

In order to identify this limitation easily in the code, a local
define HBLKRM_SUPPORTED_BLOCK_SIZE, giving the currently supported
block size, and a dedicated checking helper, is_supported_hlbkrm(),
are introduced.

For regular pages and hugetlb, the assumption is made that the page
size is equal to the base page size. For THP the page size is assumed
to be 16M.
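
The check itself is trivial (as in the lpar.c hunk below):

  static inline bool is_supported_hlbkrm(int bpsize, int psize)
  {
          return (hblkrm_size[bpsize][psize] == HBLKRM_SUPPORTED_BLOCK_SIZE);
  }

  /* e.g. in pSeries_lpar_flush_hash_range(), instead of checking the
   * FW_FEATURE_BLOCK_REMOVE firmware feature: */
  if (is_supported_hlbkrm(batch->psize, batch->psize))
          do_block_remove(number, batch, param);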

Fixes: ba2dd8a26baa ("powerpc/pseries/mm: call H_BLOCK_REMOVE")
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190920130523.20441-3-ldufour@linux.ibm.com
---
 arch/powerpc/platforms/pseries/lpar.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 8e3ac7ae4e11..b53359258d99 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -65,6 +65,13 @@ EXPORT_SYMBOL(plpar_hcall_norets);
  */
 static int hblkrm_size[MMU_PAGE_COUNT][MMU_PAGE_COUNT] __ro_after_init;
 
+/*
+ * Due to the involved complexity, and since the current hypervisor only
+ * returns this value or 0, we limit the supported H_BLOCK_REMOVE buffer
+ * size to a block size of 8.
+ */
+#define HBLKRM_SUPPORTED_BLOCK_SIZE 8
+
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 static u8 dtl_mask = DTL_LOG_PREEMPT;
 #else
@@ -993,6 +1000,17 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
 #define HBLKR_CTRL_ERRNOTFOUND	0x8800000000000000UL
 #define HBLKR_CTRL_ERRBUSY	0xa000000000000000UL
 
+/*
+ * Return true if this block size is supported for the specified segment
+ * base page size and actual page size.
+ *
+ * Currently, we only support a block size of 8.
+ */
+static inline bool is_supported_hlbkrm(int bpsize, int psize)
+{
+	return (hblkrm_size[bpsize][psize] == HBLKRM_SUPPORTED_BLOCK_SIZE);
+}
+
 /**
  * H_BLOCK_REMOVE caller.
  * @idx should point to the latest @param entry set with a PTEX.
@@ -1152,7 +1170,8 @@ static inline void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
 	if (lock_tlbie)
 		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
 
-	if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE))
+	/* Assuming THP size is 16M */
+	if (is_supported_hlbkrm(psize, MMU_PAGE_16M))
 		hugepage_block_invalidate(slot, vpn, count, psize, ssize);
 	else
 		hugepage_bulk_invalidate(slot, vpn, count, psize, ssize);
@@ -1470,7 +1489,7 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
 	if (lock_tlbie)
 		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
 
-	if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) {
+	if (is_supported_hlbkrm(batch->psize, batch->psize)) {
 		do_block_remove(number, batch, param);
 		goto out;
 	}

From 677733e296b5c7a37c47da391fc70a43dc40bd67 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Tue, 24 Sep 2019 09:22:51 +0530
Subject: [PATCH 10/16] powerpc/book3s64/mm: Don't do tlbie fixup for some
 hardware revisions

The store ordering vs tlbie issue mentioned in commit
a5d4b5891c2f ("powerpc/mm: Fixup tlbie vs store ordering issue on
POWER9") is fixed for Nimbus 2.3 and Cumulus 1.3 revisions. We don't
need to apply the fixup if we are running on them.

We can only do this on PowerNV. On a pseries guest with KVM we still
don't support redoing the feature fixup after migration, so we should
enable all the needed workarounds, because we can possibly migrate
between DD 2.3 and DD 2.2.
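
As a reference only, a userspace sketch of the PVR decoding that the
new update_tlbie_feature_flag() performs. The PVR_POWER9 value (0x004e)
and the Nimbus/Cumulus masks and cut-off revisions mirror the hunk
below; the helper name and the sample PVR values are made up:

  #include <stdbool.h>
  #include <stdio.h>

  #define PVR_POWER9	0x004e

  /* Return true when the tlbie vs store ordering fixup is still needed. */
  static bool needs_tlbie_fixup(unsigned long pvr)
  {
  	if ((pvr >> 16) != PVR_POWER9)
  		return false;
  	if ((pvr & 0xe000) == 0)		/* Nimbus */
  		return (pvr & 0xfff) < 0x203;	/* fixed in DD 2.3 */
  	if ((pvr & 0xc000) == 0)		/* Cumulus */
  		return (pvr & 0xfff) < 0x103;	/* fixed in DD 1.3 */
  	return true;				/* unknown part, stay safe */
  }

  int main(void)
  {
  	/* made-up PVR values for a Nimbus DD 2.2 and a Nimbus DD 2.3 part */
  	printf("DD 2.2 needs fixup: %d\n", needs_tlbie_fixup(0x004e1202));
  	printf("DD 2.3 needs fixup: %d\n", needs_tlbie_fixup(0x004e1203));
  	return 0;
  }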

Fixes: a5d4b5891c2f ("powerpc/mm: Fixup tlbie vs store ordering issue on POWER9")
Cc: stable@vger.kernel.org # v4.16+
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190924035254.24612-1-aneesh.kumar@linux.ibm.com
---
 arch/powerpc/kernel/dt_cpu_ftrs.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index bceee2fde885..5fc1b527de46 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -692,9 +692,35 @@ static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f)
 	return true;
 }
 
+/*
+ * Handle POWER9 broadcast tlbie invalidation issue using
+ * cpu feature flag.
+ */
+static __init void update_tlbie_feature_flag(unsigned long pvr)
+{
+	if (PVR_VER(pvr) == PVR_POWER9) {
+		/*
+		 * Set the tlbie feature flag for anything below
+		 * Nimbus DD 2.3 and Cumulus DD 1.3
+		 */
+		if ((pvr & 0xe000) == 0) {
+			/* Nimbus */
+			if ((pvr & 0xfff) < 0x203)
+				cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG;
+		} else if ((pvr & 0xc000) == 0) {
+			/* Cumulus */
+			if ((pvr & 0xfff) < 0x103)
+				cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG;
+		} else {
+			WARN_ONCE(1, "Unknown PVR");
+			cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG;
+		}
+	}
+}
+
 static __init void cpufeatures_cpu_quirks(void)
 {
-	int version = mfspr(SPRN_PVR);
+	unsigned long version = mfspr(SPRN_PVR);
 
 	/*
 	 * Not all quirks can be derived from the cpufeatures device tree.
@@ -713,10 +739,10 @@ static __init void cpufeatures_cpu_quirks(void)
 
 	if ((version & 0xffff0000) == 0x004e0000) {
 		cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR);
-		cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG;
 		cur_cpu_spec->cpu_features |= CPU_FTR_P9_TIDR;
 	}
 
+	update_tlbie_feature_flag(version);
 	/*
 	 * PKEY was not in the initial base or feature node
 	 * specification, but it should become optional in the next

From 09ce98cacd51fcd0fa0af2f79d1e1d3192f4cbb0 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Tue, 24 Sep 2019 09:22:52 +0530
Subject: [PATCH 11/16] powerpc/book3s64/radix: Rename CPU_FTR_P9_TLBIE_BUG
 feature flag

Rename the #define to indicate this is related to the store vs tlbie
ordering issue. In the next patch, we will be adding another feature
flag that is used to handle the ERAT flush vs tlbie ordering issue.

Fixes: a5d4b5891c2f ("powerpc/mm: Fixup tlbie vs store ordering issue on POWER9")
Cc: stable@vger.kernel.org # v4.16+
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190924035254.24612-2-aneesh.kumar@linux.ibm.com
---
 arch/powerpc/include/asm/cputable.h    | 4 ++--
 arch/powerpc/kernel/dt_cpu_ftrs.c      | 6 +++---
 arch/powerpc/kvm/book3s_hv_rm_mmu.c    | 2 +-
 arch/powerpc/mm/book3s64/hash_native.c | 2 +-
 arch/powerpc/mm/book3s64/radix_tlb.c   | 4 ++--
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index a1ebcbc3931f..f080fba48619 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -209,7 +209,7 @@ static inline void cpu_feature_keys_init(void) { }
 #define CPU_FTR_POWER9_DD2_1		LONG_ASM_CONST(0x0000080000000000)
 #define CPU_FTR_P9_TM_HV_ASSIST		LONG_ASM_CONST(0x0000100000000000)
 #define CPU_FTR_P9_TM_XER_SO_BUG	LONG_ASM_CONST(0x0000200000000000)
-#define CPU_FTR_P9_TLBIE_BUG		LONG_ASM_CONST(0x0000400000000000)
+#define CPU_FTR_P9_TLBIE_STQ_BUG	LONG_ASM_CONST(0x0000400000000000)
 #define CPU_FTR_P9_TIDR			LONG_ASM_CONST(0x0000800000000000)
 
 #ifndef __ASSEMBLY__
@@ -457,7 +457,7 @@ static inline void cpu_feature_keys_init(void) { }
 	    CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
 	    CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \
 	    CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | CPU_FTR_PKEY | \
-	    CPU_FTR_P9_TLBIE_BUG | CPU_FTR_P9_TIDR)
+	    CPU_FTR_P9_TLBIE_STQ_BUG | CPU_FTR_P9_TIDR)
 #define CPU_FTRS_POWER9_DD2_0 CPU_FTRS_POWER9
 #define CPU_FTRS_POWER9_DD2_1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1)
 #define CPU_FTRS_POWER9_DD2_2 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1 | \
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 5fc1b527de46..a86486390c70 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -706,14 +706,14 @@ static __init void update_tlbie_feature_flag(unsigned long pvr)
 		if ((pvr & 0xe000) == 0) {
 			/* Nimbus */
 			if ((pvr & 0xfff) < 0x203)
-				cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG;
+				cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG;
 		} else if ((pvr & 0xc000) == 0) {
 			/* Cumulus */
 			if ((pvr & 0xfff) < 0x103)
-				cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG;
+				cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG;
 		} else {
 			WARN_ONCE(1, "Unknown PVR");
-			cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG;
+			cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG;
 		}
 	}
 }
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 7186c65c61c9..cfdf232aac34 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -451,7 +451,7 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
 				     "r" (rbvalues[i]), "r" (kvm->arch.lpid));
 		}
 
-		if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+		if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
 			/*
 			 * Need the extra ptesync to make sure we don't
 			 * re-order the tlbie
diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c
index 90ab4f31e2b3..02568dae4695 100644
--- a/arch/powerpc/mm/book3s64/hash_native.c
+++ b/arch/powerpc/mm/book3s64/hash_native.c
@@ -199,7 +199,7 @@ static inline unsigned long  ___tlbie(unsigned long vpn, int psize,
 
 static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize)
 {
-	if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
 		/* Need the extra ptesync to ensure we don't reorder tlbie*/
 		asm volatile("ptesync": : :"memory");
 		___tlbie(vpn, psize, apsize, ssize);
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 631be42abd33..69fdc004d83f 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -201,7 +201,7 @@ static inline void fixup_tlbie(void)
 	unsigned long pid = 0;
 	unsigned long va = ((1UL << 52) - 1);
 
-	if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
 		asm volatile("ptesync": : :"memory");
 		__tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
 	}
@@ -211,7 +211,7 @@ static inline void fixup_tlbie_lpid(unsigned long lpid)
 {
 	unsigned long va = ((1UL << 52) - 1);
 
-	if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
 		asm volatile("ptesync": : :"memory");
 		__tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
 	}

From 047e6575aec71d75b765c22111820c4776cd1c43 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Tue, 24 Sep 2019 09:22:53 +0530
Subject: [PATCH 12/16] powerpc/mm: Fixup tlbie vs mtpidr/mtlpidr ordering
 issue on POWER9

On POWER9, under some circumstances, a broadcast TLB invalidation will
fail to invalidate the ERAT cache on some threads when there are
parallel mtpidr/mtlpidr happening on other threads of the same core.
This can cause stores to continue to go to a page after it's unmapped.

The workaround is to force an ERAT flush using PID=0 or LPID=0 tlbie
flush. This additional TLB flush will cause the ERAT cache
invalidation. Since we are using PID=0 or LPID=0, we don't get
filtered out by the TLB snoop filtering logic.

We still need to follow this up with another tlbie to take care of the
store vs tlbie ordering issue explained in commit
a5d4b5891c2f ("powerpc/mm: Fixup tlbie vs store ordering issue on
POWER9"). The presence of the ERAT cache implies we can still get new
stores and they may miss the store queue marking flush.

Cc: stable@vger.kernel.org
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190924035254.24612-3-aneesh.kumar@linux.ibm.com
---
 arch/powerpc/include/asm/cputable.h    |  3 +-
 arch/powerpc/kernel/dt_cpu_ftrs.c      |  2 +
 arch/powerpc/kvm/book3s_hv_rm_mmu.c    | 42 ++++++++++----
 arch/powerpc/mm/book3s64/hash_native.c | 29 +++++++++-
 arch/powerpc/mm/book3s64/radix_tlb.c   | 80 +++++++++++++++++++++++---
 5 files changed, 134 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index f080fba48619..cf00ff0d121d 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -211,6 +211,7 @@ static inline void cpu_feature_keys_init(void) { }
 #define CPU_FTR_P9_TM_XER_SO_BUG	LONG_ASM_CONST(0x0000200000000000)
 #define CPU_FTR_P9_TLBIE_STQ_BUG	LONG_ASM_CONST(0x0000400000000000)
 #define CPU_FTR_P9_TIDR			LONG_ASM_CONST(0x0000800000000000)
+#define CPU_FTR_P9_TLBIE_ERAT_BUG	LONG_ASM_CONST(0x0001000000000000)
 
 #ifndef __ASSEMBLY__
 
@@ -457,7 +458,7 @@ static inline void cpu_feature_keys_init(void) { }
 	    CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
 	    CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \
 	    CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | CPU_FTR_PKEY | \
-	    CPU_FTR_P9_TLBIE_STQ_BUG | CPU_FTR_P9_TIDR)
+	    CPU_FTR_P9_TLBIE_STQ_BUG | CPU_FTR_P9_TLBIE_ERAT_BUG | CPU_FTR_P9_TIDR)
 #define CPU_FTRS_POWER9_DD2_0 CPU_FTRS_POWER9
 #define CPU_FTRS_POWER9_DD2_1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1)
 #define CPU_FTRS_POWER9_DD2_2 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1 | \
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index a86486390c70..180b3a5d1001 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -715,6 +715,8 @@ static __init void update_tlbie_feature_flag(unsigned long pvr)
 			WARN_ONCE(1, "Unknown PVR");
 			cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG;
 		}
+
+		cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_ERAT_BUG;
 	}
 }
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index cfdf232aac34..220305454c23 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -433,6 +433,37 @@ static inline int is_mmio_hpte(unsigned long v, unsigned long r)
 		(HPTE_R_KEY_HI | HPTE_R_KEY_LO));
 }
 
+static inline void fixup_tlbie_lpid(unsigned long rb_value, unsigned long lpid)
+{
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		/* Radix flush for a hash guest */
+
+		unsigned long rb, rs, prs, r, ric;
+
+		rb = PPC_BIT(52); /* IS = 2 */
+		rs = 0;  /* lpid = 0 */
+		prs = 0; /* partition scoped */
+		r = 1;   /* radix format */
+		ric = 0; /* RIC_FLUSH_TLB */
+
+		/*
+		 * Need the extra ptesync to make sure we don't
+		 * re-order the tlbie
+		 */
+		asm volatile("ptesync": : :"memory");
+		asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+			     : : "r"(rb), "i"(r), "i"(prs),
+			       "i"(ric), "r"(rs) : "memory");
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : :
+			     "r" (rb_value), "r" (lpid));
+	}
+}
+
 static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
 		      long npages, int global, bool need_sync)
 {
@@ -451,16 +482,7 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
 				     "r" (rbvalues[i]), "r" (kvm->arch.lpid));
 		}
 
-		if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
-			/*
-			 * Need the extra ptesync to make sure we don't
-			 * re-order the tlbie
-			 */
-			asm volatile("ptesync": : :"memory");
-			asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : :
-				     "r" (rbvalues[0]), "r" (kvm->arch.lpid));
-		}
-
+		fixup_tlbie_lpid(rbvalues[i - 1], kvm->arch.lpid);
 		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
 	} else {
 		if (need_sync)
diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c
index 02568dae4695..523e42eb11da 100644
--- a/arch/powerpc/mm/book3s64/hash_native.c
+++ b/arch/powerpc/mm/book3s64/hash_native.c
@@ -197,8 +197,31 @@ static inline unsigned long  ___tlbie(unsigned long vpn, int psize,
 	return va;
 }
 
-static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize)
+static inline void fixup_tlbie_vpn(unsigned long vpn, int psize,
+				   int apsize, int ssize)
 {
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		/* Radix flush for a hash guest */
+
+		unsigned long rb, rs, prs, r, ric;
+
+		rb = PPC_BIT(52); /* IS = 2 */
+		rs = 0;  /* lpid = 0 */
+		prs = 0; /* partition scoped */
+		r = 1;   /* radix format */
+		ric = 0; /* RIC_FLUSH_TLB */
+
+		/*
+		 * Need the extra ptesync to make sure we don't
+		 * re-order the tlbie
+		 */
+		asm volatile("ptesync": : :"memory");
+		asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+			     : : "r"(rb), "i"(r), "i"(prs),
+			       "i"(ric), "r"(rs) : "memory");
+	}
+
+
 	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
 		/* Need the extra ptesync to ensure we don't reorder tlbie*/
 		asm volatile("ptesync": : :"memory");
@@ -283,7 +306,7 @@ static inline void tlbie(unsigned long vpn, int psize, int apsize,
 		asm volatile("ptesync": : :"memory");
 	} else {
 		__tlbie(vpn, psize, apsize, ssize);
-		fixup_tlbie(vpn, psize, apsize, ssize);
+		fixup_tlbie_vpn(vpn, psize, apsize, ssize);
 		asm volatile("eieio; tlbsync; ptesync": : :"memory");
 	}
 	if (lock_tlbie && !use_local)
@@ -856,7 +879,7 @@ static void native_flush_hash_range(unsigned long number, int local)
 		/*
 		 * Just do one more with the last used values.
 		 */
-		fixup_tlbie(vpn, psize, psize, ssize);
+		fixup_tlbie_vpn(vpn, psize, psize, ssize);
 		asm volatile("eieio; tlbsync; ptesync":::"memory");
 
 		if (lock_tlbie)
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 69fdc004d83f..67af871190c6 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -196,21 +196,82 @@ static __always_inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid
 	trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
 }
 
-static inline void fixup_tlbie(void)
+
+static inline void fixup_tlbie_va(unsigned long va, unsigned long pid,
+				  unsigned long ap)
 {
-	unsigned long pid = 0;
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_va(va, 0, ap, RIC_FLUSH_TLB);
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_va(va, pid, ap, RIC_FLUSH_TLB);
+	}
+}
+
+static inline void fixup_tlbie_va_range(unsigned long va, unsigned long pid,
+					unsigned long ap)
+{
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_pid(0, RIC_FLUSH_TLB);
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_va(va, pid, ap, RIC_FLUSH_TLB);
+	}
+}
+
+static inline void fixup_tlbie_pid(unsigned long pid)
+{
+	/*
+	 * We can use any address for the invalidation, pick one which is
+	 * probably unused as an optimisation.
+	 */
 	unsigned long va = ((1UL << 52) - 1);
 
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_pid(0, RIC_FLUSH_TLB);
+	}
+
 	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
 		asm volatile("ptesync": : :"memory");
 		__tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
 	}
 }
 
+
+static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid,
+				       unsigned long ap)
+{
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_lpid_va(va, 0, ap, RIC_FLUSH_TLB);
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_lpid_va(va, lpid, ap, RIC_FLUSH_TLB);
+	}
+}
+
 static inline void fixup_tlbie_lpid(unsigned long lpid)
 {
+	/*
+	 * We can use any address for the invalidation, pick one which is
+	 * probably unused as an optimisation.
+	 */
 	unsigned long va = ((1UL << 52) - 1);
 
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_lpid(0, RIC_FLUSH_TLB);
+	}
+
 	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
 		asm volatile("ptesync": : :"memory");
 		__tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
@@ -258,6 +319,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
 	switch (ric) {
 	case RIC_FLUSH_TLB:
 		__tlbie_pid(pid, RIC_FLUSH_TLB);
+		fixup_tlbie_pid(pid);
 		break;
 	case RIC_FLUSH_PWC:
 		__tlbie_pid(pid, RIC_FLUSH_PWC);
@@ -265,8 +327,8 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
 	case RIC_FLUSH_ALL:
 	default:
 		__tlbie_pid(pid, RIC_FLUSH_ALL);
+		fixup_tlbie_pid(pid);
 	}
-	fixup_tlbie();
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
@@ -315,6 +377,7 @@ static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
 	switch (ric) {
 	case RIC_FLUSH_TLB:
 		__tlbie_lpid(lpid, RIC_FLUSH_TLB);
+		fixup_tlbie_lpid(lpid);
 		break;
 	case RIC_FLUSH_PWC:
 		__tlbie_lpid(lpid, RIC_FLUSH_PWC);
@@ -322,8 +385,8 @@ static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
 	case RIC_FLUSH_ALL:
 	default:
 		__tlbie_lpid(lpid, RIC_FLUSH_ALL);
+		fixup_tlbie_lpid(lpid);
 	}
-	fixup_tlbie_lpid(lpid);
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
@@ -390,6 +453,8 @@ static inline void __tlbie_va_range(unsigned long start, unsigned long end,
 
 	for (addr = start; addr < end; addr += page_size)
 		__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+
+	fixup_tlbie_va_range(addr - page_size, pid, ap);
 }
 
 static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
@@ -399,7 +464,7 @@ static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
 
 	asm volatile("ptesync": : :"memory");
 	__tlbie_va(va, pid, ap, ric);
-	fixup_tlbie();
+	fixup_tlbie_va(va, pid, ap);
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
@@ -457,7 +522,7 @@ static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
 
 	asm volatile("ptesync": : :"memory");
 	__tlbie_lpid_va(va, lpid, ap, ric);
-	fixup_tlbie_lpid(lpid);
+	fixup_tlbie_lpid_va(va, lpid, ap);
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
@@ -469,7 +534,6 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end,
 	if (also_pwc)
 		__tlbie_pid(pid, RIC_FLUSH_PWC);
 	__tlbie_va_range(start, end, pid, page_size, psize);
-	fixup_tlbie();
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
@@ -856,7 +920,7 @@ is_local:
 			if (gflush)
 				__tlbie_va_range(gstart, gend, pid,
 						PUD_SIZE, MMU_PAGE_1G);
-			fixup_tlbie();
+
 			asm volatile("eieio; tlbsync; ptesync": : :"memory");
 		} else {
 			_tlbiel_va_range_multicast(mm,

From 93cad5f789951eaa27c3392b15294b4e51253944 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Tue, 24 Sep 2019 09:22:54 +0530
Subject: [PATCH 13/16] selftests/powerpc: Add test case for tlbie vs mtpidr
 ordering issue

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
[mpe: Some minor fixes to make it build]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190924035254.24612-4-aneesh.kumar@linux.ibm.com
---
 tools/testing/selftests/powerpc/mm/Makefile   |   2 +
 .../testing/selftests/powerpc/mm/tlbie_test.c | 734 ++++++++++++++++++
 2 files changed, 736 insertions(+)
 create mode 100644 tools/testing/selftests/powerpc/mm/tlbie_test.c

diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile
index f1fbc15800c4..ed1565809d2b 100644
--- a/tools/testing/selftests/powerpc/mm/Makefile
+++ b/tools/testing/selftests/powerpc/mm/Makefile
@@ -4,6 +4,7 @@ noarg:
 
 TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors wild_bctr \
 		  large_vm_fork_separation
+TEST_GEN_PROGS_EXTENDED := tlbie_test
 TEST_GEN_FILES := tempfile
 
 top_srcdir = ../../../../..
@@ -19,3 +20,4 @@ $(OUTPUT)/large_vm_fork_separation: CFLAGS += -m64
 $(OUTPUT)/tempfile:
 	dd if=/dev/zero of=$@ bs=64k count=1
 
+$(OUTPUT)/tlbie_test: LDLIBS += -lpthread
diff --git a/tools/testing/selftests/powerpc/mm/tlbie_test.c b/tools/testing/selftests/powerpc/mm/tlbie_test.c
new file mode 100644
index 000000000000..9868a5ddd847
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/tlbie_test.c
@@ -0,0 +1,734 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2019, Nick Piggin, Gautham R. Shenoy, Aneesh Kumar K.V, IBM Corp.
+ */
+
+/*
+ *
+ * Test tlbie/mtpidr race. We have 4 threads doing a flush/load/compare/store
+ * sequence in a loop. The same threads also run a context switch task
+ * that does sched_yield() in a loop.
+ *
+ * The snapshot thread marks the mmap area PROT_READ in between, makes a copy
+ * and copies it back to the original area. This helps us detect if any
+ * store continued to happen after we marked the memory PROT_READ.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <linux/futex.h>
+#include <unistd.h>
+#include <asm/unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <time.h>
+#include <stdarg.h>
+#include <sched.h>
+#include <pthread.h>
+#include <signal.h>
+#include <sys/prctl.h>
+
+static inline void dcbf(volatile unsigned int *addr)
+{
+	__asm__ __volatile__ ("dcbf %y0; sync" : : "Z"(*(unsigned char *)addr) : "memory");
+}
+
+static void err_msg(char *msg)
+{
+
+	time_t now;
+	time(&now);
+	printf("=================================\n");
+	printf("    Error: %s\n", msg);
+	printf("    %s", ctime(&now));
+	printf("=================================\n");
+	exit(1);
+}
+
+static char *map1;
+static char *map2;
+static pid_t rim_process_pid;
+
+/*
+ * A "rim-sequence" is defined to be the sequence of the following
+ * operations performed on a memory word:
+ *	1) FLUSH the contents of that word.
+ *	2) LOAD the contents of that word.
+ *	3) COMPARE the contents of that word with the content that was
+ *	           previously stored at that word
+ *	4) STORE new content into that word.
+ *
+ * The threads in this test that perform the rim-sequence are termed
+ * as rim_threads.
+ */
+
+/*
+ * A "corruption" is defined to be the failed COMPARE operation in a
+ * rim-sequence.
+ *
+ * A rim_thread that detects a corruption informs about it to all the
+ * other rim_threads, and the mem_snapshot thread.
+ */
+static volatile unsigned int corruption_found;
+
+/*
+ * This defines the maximum number of rim_threads in this test.
+ *
+ * The THREAD_ID_BITS denote the number of bits required
+ * to represent the thread_ids [0..MAX_THREADS - 1].
+ * We are being a bit paranoid here and set it to 8 bits,
+ * though 6 bits suffice.
+ *
+ */
+#define MAX_THREADS 		64
+#define THREAD_ID_BITS		8
+#define THREAD_ID_MASK		((1 << THREAD_ID_BITS) - 1)
+static unsigned int rim_thread_ids[MAX_THREADS];
+static pthread_t rim_threads[MAX_THREADS];
+
+
+/*
+ * Each rim_thread works on an exclusive "chunk" of size
+ * RIM_CHUNK_SIZE.
+ *
+ * The ith rim_thread works on the ith chunk.
+ *
+ * The ith chunk begins at
+ * map1 + (i * RIM_CHUNK_SIZE)
+ */
+#define RIM_CHUNK_SIZE  	1024
+#define BITS_PER_BYTE 		8
+#define WORD_SIZE     		(sizeof(unsigned int))
+#define WORD_BITS		(WORD_SIZE * BITS_PER_BYTE)
+#define WORDS_PER_CHUNK		(RIM_CHUNK_SIZE/WORD_SIZE)
+
+static inline char *compute_chunk_start_addr(unsigned int thread_id)
+{
+	char *chunk_start;
+
+	chunk_start = (char *)((unsigned long)map1 +
+			       (thread_id * RIM_CHUNK_SIZE));
+
+	return chunk_start;
+}
+
+/*
+ * The "word-offset" of a word-aligned address inside a chunk, is
+ * defined to be the number of words that precede the address in that
+ * chunk.
+ *
+ * WORD_OFFSET_BITS denote the number of bits required to represent
+ * the word-offsets of all the word-aligned addresses of a chunk.
+ */
+#define WORD_OFFSET_BITS	(__builtin_ctz(WORDS_PER_CHUNK))
+#define WORD_OFFSET_MASK	((1 << WORD_OFFSET_BITS) - 1)
+
+static inline unsigned int compute_word_offset(char *start, unsigned int *addr)
+{
+	unsigned int delta_bytes, ret;
+	delta_bytes = (unsigned long)addr - (unsigned long)start;
+
+	ret = delta_bytes/WORD_SIZE;
+
+	return ret;
+}
+
+/*
+ * A "sweep" is defined to be the sequential execution of the
+ * rim-sequence by a rim_thread on its chunk one word at a time,
+ * starting from the first word of its chunk and ending with the last
+ * word of its chunk.
+ *
+ * Each sweep of a rim_thread is uniquely identified by a sweep_id.
+ * SWEEP_ID_BITS denote the number of bits required to represent
+ * the sweep_ids of rim_threads.
+ *
+ * As to why SWEEP_ID_BITS are computed as a function of THREAD_ID_BITS,
+ * WORD_OFFSET_BITS, and WORD_BITS, see the "store-pattern" below.
+ */
+#define SWEEP_ID_BITS		(WORD_BITS - (THREAD_ID_BITS + WORD_OFFSET_BITS))
+#define SWEEP_ID_MASK		((1 << SWEEP_ID_BITS) - 1)
+
+/*
+ * A "store-pattern" is the word-pattern that is stored into a word
+ * location in the 4)STORE step of the rim-sequence.
+ *
+ * In the store-pattern, we shall encode:
+ *
+ *      - The thread-id of the rim_thread performing the store
+ *        (The most significant THREAD_ID_BITS)
+ *
+ *      - The word-offset of the address into which the store is being
+ *        performed (The next WORD_OFFSET_BITS)
+ *
+ *      - The sweep_id of the current sweep in which the store is
+ *        being performed. (The lower SWEEP_ID_BITS)
+ *
+ * Store Pattern: 32 bits
+ * |------------------|--------------------|---------------------------------|
+ * |    Thread id     |  Word offset       |         sweep_id                |
+ * |------------------|--------------------|---------------------------------|
+ *    THREAD_ID_BITS     WORD_OFFSET_BITS          SWEEP_ID_BITS
+ *
+ * In the store pattern, the (Thread-id + Word-offset) uniquely identify the
+ * address to which the store is being performed i.e,
+ *    address == map1 +
+ *              (Thread-id * RIM_CHUNK_SIZE) + (Word-offset * WORD_SIZE)
+ *
+ * And the sweep_id in the store pattern identifies the time when the
+ * store was performed by the rim_thread.
+ *
+ * We shall use this property in the 3)COMPARE step of the
+ * rim-sequence.
+ */
+#define SWEEP_ID_SHIFT	0
+#define WORD_OFFSET_SHIFT	(SWEEP_ID_BITS)
+#define THREAD_ID_SHIFT		(WORD_OFFSET_BITS + SWEEP_ID_BITS)
+
+/*
+ * Compute the store pattern for a given thread with id @tid, at
+ * location @addr in the sweep identified by @sweep_id
+ */
+static inline unsigned int compute_store_pattern(unsigned int tid,
+						 unsigned int *addr,
+						 unsigned int sweep_id)
+{
+	unsigned int ret = 0;
+	char *start = compute_chunk_start_addr(tid);
+	unsigned int word_offset = compute_word_offset(start, addr);
+
+	ret += (tid & THREAD_ID_MASK) << THREAD_ID_SHIFT;
+	ret += (word_offset & WORD_OFFSET_MASK) << WORD_OFFSET_SHIFT;
+	ret += (sweep_id & SWEEP_ID_MASK) << SWEEP_ID_SHIFT;
+	return ret;
+}
+
+/* Extract the thread-id from the given store-pattern */
+static inline unsigned int extract_tid(unsigned int pattern)
+{
+	unsigned int ret;
+
+	ret = (pattern >> THREAD_ID_SHIFT) & THREAD_ID_MASK;
+	return ret;
+}
+
+/* Extract the word-offset from the given store-pattern */
+static inline unsigned int extract_word_offset(unsigned int pattern)
+{
+	unsigned int ret;
+
+	ret = (pattern >> WORD_OFFSET_SHIFT) & WORD_OFFSET_MASK;
+
+	return ret;
+}
+
+/* Extract the sweep-id from the given store-pattern */
+static inline unsigned int extract_sweep_id(unsigned int pattern)
+
+{
+	unsigned int ret;
+
+	ret = (pattern >> SWEEP_ID_SHIFT) & SWEEP_ID_MASK;
+
+	return ret;
+}
+
+/************************************************************
+ *                                                          *
+ *          Logging the output of the verification          *
+ *                                                          *
+ ************************************************************/
+#define LOGDIR_NAME_SIZE 100
+static char logdir[LOGDIR_NAME_SIZE];
+
+static FILE *fp[MAX_THREADS];
+static const char logfilename[] = "Thread-%02d-Chunk";
+
+static inline void start_verification_log(unsigned int tid,
+					  unsigned int *addr,
+					  unsigned int cur_sweep_id,
+					  unsigned int prev_sweep_id)
+{
+	FILE *f;
+	char logfile[30];
+	char path[LOGDIR_NAME_SIZE + 30];
+	char separator[2] = "/";
+	char *chunk_start = compute_chunk_start_addr(tid);
+	unsigned int size = RIM_CHUNK_SIZE;
+
+	sprintf(logfile, logfilename, tid);
+	strcpy(path, logdir);
+	strcat(path, separator);
+	strcat(path, logfile);
+	f = fopen(path, "w");
+
+	if (!f) {
+		err_msg("Unable to create logfile\n");
+	}
+
+	fp[tid] = f;
+
+	fprintf(f, "----------------------------------------------------------\n");
+	fprintf(f, "PID                = %d\n", rim_process_pid);
+	fprintf(f, "Thread id          = %02d\n", tid);
+	fprintf(f, "Chunk Start Addr   = 0x%016lx\n", (unsigned long)chunk_start);
+	fprintf(f, "Chunk Size         = %d\n", size);
+	fprintf(f, "Next Store Addr    = 0x%016lx\n", (unsigned long)addr);
+	fprintf(f, "Current sweep-id   = 0x%08x\n", cur_sweep_id);
+	fprintf(f, "Previous sweep-id  = 0x%08x\n", prev_sweep_id);
+	fprintf(f, "----------------------------------------------------------\n");
+}
+
+static inline void log_anamoly(unsigned int tid, unsigned int *addr,
+			       unsigned int expected, unsigned int observed)
+{
+	FILE *f = fp[tid];
+
+	fprintf(f, "Thread %02d: Addr 0x%lx: Expected 0x%x, Observed 0x%x\n",
+	        tid, (unsigned long)addr, expected, observed);
+	fprintf(f, "Thread %02d: Expected Thread id   = %02d\n", tid, extract_tid(expected));
+	fprintf(f, "Thread %02d: Observed Thread id   = %02d\n", tid, extract_tid(observed));
+	fprintf(f, "Thread %02d: Expected Word offset = %03d\n", tid, extract_word_offset(expected));
+	fprintf(f, "Thread %02d: Observed Word offset = %03d\n", tid, extract_word_offset(observed));
+	fprintf(f, "Thread %02d: Expected sweep-id    = 0x%x\n", tid, extract_sweep_id(expected));
+	fprintf(f, "Thread %02d: Observed sweep-id    = 0x%x\n", tid, extract_sweep_id(observed));
+	fprintf(f, "----------------------------------------------------------\n");
+}
+
+static inline void end_verification_log(unsigned int tid, unsigned nr_anamolies)
+{
+	FILE *f = fp[tid];
+	char logfile[30];
+	char path[LOGDIR_NAME_SIZE + 30];
+	char separator[] = "/";
+
+	fclose(f);
+
+	sprintf(logfile, logfilename, tid);
+	strcpy(path, logdir);
+	strcat(path, separator);
+	strcat(path, logfile);
+
+	if (nr_anamolies == 0) {
+		remove(path);
+		return;
+	}
+
+	printf("Thread %02d chunk has %d corrupted words. For details check %s\n",
+		tid, nr_anamolies, path);
+}
+
+/*
+ * When a COMPARE step of a rim-sequence fails, the rim_thread informs
+ * everyone else via the shared_memory pointed to by
+ * corruption_found variable. On seeing this, every thread verifies the
+ * content of its chunk as follows.
+ *
+ * Suppose a thread identified with @tid was about to store (but not
+ * yet stored) to @next_store_addr in its current sweep identified
+ * @cur_sweep_id. Let @prev_sweep_id indicate the previous sweep_id.
+ *
+ * This implies that for all the addresses @addr < @next_store_addr,
+ * Thread @tid has already performed a store as part of its current
+ * sweep. Hence we expect the content of such @addr to be:
+ *    |-------------------------------------------------|
+ *    | tid   | word_offset(addr) |    cur_sweep_id     |
+ *    |-------------------------------------------------|
+ *
+ * Since Thread @tid is yet to perform stores on address
+ * @next_store_addr and above, we expect the content of such an
+ * address @addr to be:
+ *    |-------------------------------------------------|
+ *    | tid   | word_offset(addr) |    prev_sweep_id    |
+ *    |-------------------------------------------------|
+ *
+ * The verifier function @verify_chunk does this verification and logs
+ * any anamolies that it finds.
+ */
+static void verify_chunk(unsigned int tid, unsigned int *next_store_addr,
+		  unsigned int cur_sweep_id,
+		  unsigned int prev_sweep_id)
+{
+	unsigned int *iter_ptr;
+	unsigned int size = RIM_CHUNK_SIZE;
+	unsigned int expected;
+	unsigned int observed;
+	char *chunk_start = compute_chunk_start_addr(tid);
+
+	int nr_anamolies = 0;
+
+	start_verification_log(tid, next_store_addr,
+			       cur_sweep_id, prev_sweep_id);
+
+	for (iter_ptr = (unsigned int *)chunk_start;
+	     (unsigned long)iter_ptr < (unsigned long)chunk_start + size;
+	     iter_ptr++) {
+		unsigned int expected_sweep_id;
+
+		if (iter_ptr < next_store_addr) {
+			expected_sweep_id = cur_sweep_id;
+		} else {
+			expected_sweep_id = prev_sweep_id;
+		}
+
+		expected = compute_store_pattern(tid, iter_ptr, expected_sweep_id);
+
+		dcbf((volatile unsigned int*)iter_ptr); //Flush before reading
+		observed = *iter_ptr;
+
+	        if (observed != expected) {
+			nr_anamolies++;
+			log_anamoly(tid, iter_ptr, expected, observed);
+		}
+	}
+
+	end_verification_log(tid, nr_anamolies);
+}
+
+static void set_pthread_cpu(pthread_t th, int cpu)
+{
+	cpu_set_t run_cpu_mask;
+	struct sched_param param;
+
+	CPU_ZERO(&run_cpu_mask);
+	CPU_SET(cpu, &run_cpu_mask);
+	pthread_setaffinity_np(th, sizeof(cpu_set_t), &run_cpu_mask);
+
+	param.sched_priority = 1;
+	if (0 && sched_setscheduler(0, SCHED_FIFO, &param) == -1) {
+		/* haven't reproduced with this setting, it kills random preemption which may be a factor */
+		fprintf(stderr, "could not set SCHED_FIFO, run as root?\n");
+	}
+}
+
+static void set_mycpu(int cpu)
+{
+	cpu_set_t run_cpu_mask;
+	struct sched_param param;
+
+	CPU_ZERO(&run_cpu_mask);
+	CPU_SET(cpu, &run_cpu_mask);
+	sched_setaffinity(0, sizeof(cpu_set_t), &run_cpu_mask);
+
+	param.sched_priority = 1;
+	if (0 && sched_setscheduler(0, SCHED_FIFO, &param) == -1) {
+		fprintf(stderr, "could not set SCHED_FIFO, run as root?\n");
+	}
+}
+
+static volatile int segv_wait;
+
+static void segv_handler(int signo, siginfo_t *info, void *extra)
+{
+	while (segv_wait) {
+		sched_yield();
+	}
+
+}
+
+static void set_segv_handler(void)
+{
+	struct sigaction sa;
+
+	sa.sa_flags = SA_SIGINFO;
+	sa.sa_sigaction = segv_handler;
+
+	if (sigaction(SIGSEGV, &sa, NULL) == -1) {
+		perror("sigaction");
+		exit(EXIT_FAILURE);
+	}
+}
+
+int timeout = 0;
+/*
+ * This function is executed by every rim_thread.
+ *
+ * This function performs sweeps over the exclusive chunks of the
+ * rim_threads executing the rim-sequence one word at a time.
+ */
+static void *rim_fn(void *arg)
+{
+	unsigned int tid = *((unsigned int *)arg);
+
+	int size = RIM_CHUNK_SIZE;
+	char *chunk_start = compute_chunk_start_addr(tid);
+
+	unsigned int prev_sweep_id;
+	unsigned int cur_sweep_id = 0;
+
+	/* word access */
+	unsigned int pattern = cur_sweep_id;
+	unsigned int *pattern_ptr = &pattern;
+	unsigned int *w_ptr, read_data;
+
+	set_segv_handler();
+
+	/*
+	 * Let us initialize the chunk:
+	 *
+	 * Each word-aligned address addr in the chunk,
+	 * is initialized to :
+	 *    |-------------------------------------------------|
+	 *    | tid   | word_offset(addr) |         0           |
+	 *    |-------------------------------------------------|
+	 */
+	for (w_ptr = (unsigned int *)chunk_start;
+	     (unsigned long)w_ptr < (unsigned long)(chunk_start) + size;
+	     w_ptr++) {
+
+		*pattern_ptr = compute_store_pattern(tid, w_ptr, cur_sweep_id);
+		*w_ptr = *pattern_ptr;
+	}
+
+	while (!corruption_found && !timeout) {
+		prev_sweep_id = cur_sweep_id;
+		cur_sweep_id = cur_sweep_id + 1;
+
+		for (w_ptr = (unsigned int *)chunk_start;
+		     (unsigned long)w_ptr < (unsigned long)(chunk_start) + size;
+		     w_ptr++)  {
+			unsigned int old_pattern;
+
+			/*
+			 * Compute the pattern that we would have
+			 * stored at this location in the previous
+			 * sweep.
+			 */
+			old_pattern = compute_store_pattern(tid, w_ptr, prev_sweep_id);
+
+			/*
+			 * FLUSH:Ensure that we flush the contents of
+			 *       the cache before loading
+			 */
+			dcbf((volatile unsigned int*)w_ptr); //Flush
+
+			/* LOAD: Read the value */
+			read_data = *w_ptr; //Load
+
+			/*
+			 * COMPARE: Is it the same as what we had stored
+			 *          in the previous sweep ? It better be!
+			 */
+			if (read_data != old_pattern) {
+				/* No it isn't! Tell everyone */
+				corruption_found = 1;
+			}
+
+			/*
+			 * Before performing a store, let us check if
+			 * any rim_thread has found a corruption.
+			 */
+			if (corruption_found || timeout) {
+				/*
+				 * Yes. Someone (including us!) has found
+				 * a corruption :(
+				 *
+				 * Let us verify that our chunk is
+				 * correct.
+				 */
+				/* But first, let us allow the dust to settle down! */
+				verify_chunk(tid, w_ptr, cur_sweep_id, prev_sweep_id);
+
+				return 0;
+			}
+
+			/*
+			 * Compute the new pattern that we are going
+			 * to write to this location
+			 */
+			*pattern_ptr = compute_store_pattern(tid, w_ptr, cur_sweep_id);
+
+			/*
+			 * STORE: Now let us write this pattern into
+			 *        the location
+			 */
+			*w_ptr = *pattern_ptr;
+		}
+	}
+
+	return NULL;
+}
+
+
+static unsigned long start_cpu = 0;
+static unsigned long nrthreads = 4;
+
+static pthread_t mem_snapshot_thread;
+
+static void *mem_snapshot_fn(void *arg)
+{
+	int page_size = getpagesize();
+	size_t size = page_size;
+	void *tmp = malloc(size);
+
+	while (!corruption_found && !timeout) {
+		/* Stop memory migration once corruption is found */
+		segv_wait = 1;
+
+		mprotect(map1, size, PROT_READ);
+
+		/*
+		 * Load from the working alias (map1). Loading from map2
+		 * also fails.
+		 */
+		memcpy(tmp, map1, size);
+
+		/*
+		 * Stores must go via map2 which has write permissions, but
+		 * the corrupted data tends to be seen in the snapshot buffer,
+		 * so corruption does not appear to be introduced at the
+		 * copy-back via map2 alias here.
+		 */
+		memcpy(map2, tmp, size);
+		/*
+		 * Before releasing other threads, must ensure the copy
+		 * back via map2 has completed.
+		 */
+		asm volatile("sync" ::: "memory");
+		mprotect(map1, size, PROT_READ|PROT_WRITE);
+		asm volatile("sync" ::: "memory");
+		segv_wait = 0;
+
+		usleep(1); /* This value makes a big difference */
+	}
+
+	return 0;
+}
+
+void alrm_sighandler(int sig)
+{
+	timeout = 1;
+}
+
+int main(int argc, char *argv[])
+{
+	int c;
+	int page_size = getpagesize();
+	time_t now;
+	int i, dir_error;
+	pthread_attr_t attr;
+	key_t shm_key = (key_t) getpid();
+	int shmid, run_time = 20 * 60;
+	struct sigaction sa_alrm;
+
+	snprintf(logdir, LOGDIR_NAME_SIZE,
+		 "/tmp/logdir-%u", (unsigned int)getpid());
+	while ((c = getopt(argc, argv, "r:hn:l:t:")) != -1) {
+		switch(c) {
+		case 'r':
+			start_cpu = strtoul(optarg, NULL, 10);
+			break;
+		case 'h':
+			printf("%s [-r <start_cpu>] [-n <nrthreads>] [-l <logdir>] [-t <timeout>]\n", argv[0]);
+			exit(0);
+			break;
+		case 'n':
+			nrthreads = strtoul(optarg, NULL, 10);
+			break;
+		case 'l':
+			strncpy(logdir, optarg, LOGDIR_NAME_SIZE);
+			break;
+		case 't':
+			run_time = strtoul(optarg, NULL, 10);
+			break;
+		default:
+			printf("invalid option\n");
+			exit(0);
+			break;
+		}
+	}
+
+	if (nrthreads > MAX_THREADS)
+		nrthreads = MAX_THREADS;
+
+	shmid = shmget(shm_key, page_size, IPC_CREAT|0666);
+	if (shmid < 0) {
+		err_msg("Failed shmget\n");
+	}
+
+	map1 = shmat(shmid, NULL, 0);
+	if (map1 == (void *) -1) {
+		err_msg("Failed shmat");
+	}
+
+	map2 = shmat(shmid, NULL, 0);
+	if (map2 == (void *) -1) {
+		err_msg("Failed shmat");
+	}
+
+	dir_error = mkdir(logdir, 0755);
+
+	if (dir_error) {
+		err_msg("Failed mkdir");
+	}
+
+	printf("start_cpu list:%lu\n", start_cpu);
+	printf("number of worker threads:%lu + 1 snapshot thread\n", nrthreads);
+	printf("Allocated address:0x%016lx + secondary map:0x%016lx\n", (unsigned long)map1, (unsigned long)map2);
+	printf("logdir at : %s\n", logdir);
+	printf("Timeout: %d seconds\n", run_time);
+
+	time(&now);
+	printf("=================================\n");
+	printf("     Starting Test\n");
+	printf("     %s", ctime(&now));
+	printf("=================================\n");
+
+	for (i = 0; i < nrthreads; i++) {
+		if (1 && !fork()) {
+			prctl(PR_SET_PDEATHSIG, SIGKILL);
+			set_mycpu(start_cpu + i);
+			for (;;)
+				sched_yield();
+			exit(0);
+		}
+	}
+
+
+	sa_alrm.sa_handler = &alrm_sighandler;
+	sigemptyset(&sa_alrm.sa_mask);
+	sa_alrm.sa_flags = 0;
+
+	if (sigaction(SIGALRM, &sa_alrm, 0) == -1) {
+		err_msg("Failed signal handler registration\n");
+	}
+
+	alarm(run_time);
+
+	pthread_attr_init(&attr);
+	for (i = 0; i < nrthreads; i++) {
+		rim_thread_ids[i] = i;
+		pthread_create(&rim_threads[i], &attr, rim_fn, &rim_thread_ids[i]);
+		set_pthread_cpu(rim_threads[i], start_cpu + i);
+	}
+
+	pthread_create(&mem_snapshot_thread, &attr, mem_snapshot_fn, map1);
+	set_pthread_cpu(mem_snapshot_thread, start_cpu + i);
+
+
+	pthread_join(mem_snapshot_thread, NULL);
+	for (i = 0; i < nrthreads; i++) {
+		pthread_join(rim_threads[i], NULL);
+	}
+
+	if (!timeout) {
+		time(&now);
+		printf("=================================\n");
+		printf("      Data Corruption Detected\n");
+		printf("      %s", ctime(&now));
+		printf("      See logfiles in %s\n", logdir);
+		printf("=================================\n");
+		return 1;
+	}
+	return 0;
+}

From 4111cdef0e871c0f7c8874b19ee68a3671f7d63e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Tue, 3 Sep 2019 18:04:51 +0530
Subject: [PATCH 14/16] powerpc/nvdimm: Use HCALL error as the return value

This simplifies the error handling and also enables us to switch to
the H_SCM_QUERY hcall on H_OVERLAP error in a later patch.

We also do some kernel print formatting fixups in this patch.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903123452.28620-1-aneesh.kumar@linux.ibm.com
---
 arch/powerpc/platforms/pseries/papr_scm.c | 26 ++++++++++-------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index a5ac371a3f06..3bef4d298ac6 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -66,28 +66,22 @@ static int drc_pmem_bind(struct papr_scm_priv *p)
 	} while (rc == H_BUSY);
 
 	if (rc) {
-		/* H_OVERLAP needs a separate error path */
-		if (rc == H_OVERLAP)
-			return -EBUSY;
-
 		dev_err(&p->pdev->dev, "bind err: %lld\n", rc);
-		return -ENXIO;
+		return rc;
 	}
 
 	p->bound_addr = saved;
-
-	dev_dbg(&p->pdev->dev, "bound drc %x to %pR\n", p->drc_index, &p->res);
-
-	return 0;
+	dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index, &p->res);
+	return rc;
 }
 
-static int drc_pmem_unbind(struct papr_scm_priv *p)
+static void drc_pmem_unbind(struct papr_scm_priv *p)
 {
 	unsigned long ret[PLPAR_HCALL_BUFSIZE];
 	uint64_t token = 0;
 	int64_t rc;
 
-	dev_dbg(&p->pdev->dev, "unbind drc %x\n", p->drc_index);
+	dev_dbg(&p->pdev->dev, "unbind drc 0x%x\n", p->drc_index);
 
 	/* NB: unbind has the same retry requirements as drc_pmem_bind() */
 	do {
@@ -110,10 +104,10 @@ static int drc_pmem_unbind(struct papr_scm_priv *p)
 	if (rc)
 		dev_err(&p->pdev->dev, "unbind error: %lld\n", rc);
 	else
-		dev_dbg(&p->pdev->dev, "unbind drc %x complete\n",
+		dev_dbg(&p->pdev->dev, "unbind drc 0x%x complete\n",
 			p->drc_index);
 
-	return rc == H_SUCCESS ? 0 : -ENXIO;
+	return;
 }
 
 static int papr_scm_meta_get(struct papr_scm_priv *p,
@@ -436,14 +430,16 @@ static int papr_scm_probe(struct platform_device *pdev)
 	rc = drc_pmem_bind(p);
 
 	/* If phyp says drc memory still bound then force unbound and retry */
-	if (rc == -EBUSY) {
+	if (rc == H_OVERLAP) {
 		dev_warn(&pdev->dev, "Retrying bind after unbinding\n");
 		drc_pmem_unbind(p);
 		rc = drc_pmem_bind(p);
 	}
 
-	if (rc)
+	if (rc != H_SUCCESS) {
+		rc = -ENXIO;
 		goto err;
+	}
 
 	/* setup the resource for the newly bound range */
 	p->res.start = p->bound_addr;

From faa6d21153fd11e139dd880044521389b34a24f2 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Tue, 3 Sep 2019 18:04:52 +0530
Subject: [PATCH 15/16] powerpc/nvdimm: use H_SCM_QUERY hcall on H_OVERLAP
 error

Right now we force an unbind of the SCM memory at the drc index on an
H_OVERLAP error. This really slows down operations like kexec, where we
get the H_OVERLAP error because we don't go through a full hypervisor
reinit.

An H_OVERLAP error from the H_SCM_BIND_MEM hcall indicates that the SCM
memory at the drc index is already bound. Since we don't specify a
logical memory address for the bind hcall, we can use the H_SCM_QUERY
hcall to query the already bound logical address.
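
As a rough illustration, the sanity check that the new
drc_pmem_query_n_bind() applies to the two queried addresses boils down
to the following userspace sketch; the helper name and the region
geometry used here are made up, only the check itself mirrors the hunk
below:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /*
   * The region is treated as fully bound only when the addresses of the
   * first and the last block are exactly (blocks - 1) * block_size apart.
   */
  static bool fully_bound(uint64_t start, uint64_t end,
  			uint64_t blocks, uint64_t block_size)
  {
  	return (end - start) == (blocks - 1) * block_size;
  }

  int main(void)
  {
  	/* hypothetical SCM region: 4 blocks of 256MB starting at 16GB */
  	uint64_t start = 4ULL << 32, bs = 256ULL << 20, blocks = 4;

  	printf("contiguous: %d\n", fully_bound(start, start + 3 * bs, blocks, bs));
  	printf("gap:        %d\n", fully_bound(start, start + 5 * bs, blocks, bs));
  	return 0;
  }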

Boot time difference with and without patch is:

[    5.583617] IOMMU table initialized, virtual merging enabled
[    5.603041] papr_scm ibm,persistent-memory:ibm,pmemory@44104001: Retrying bind after unbinding
[  301.514221] papr_scm ibm,persistent-memory:ibm,pmemory@44108001: Retrying bind after unbinding
[  340.057238] hv-24x7: read 1530 catalog entries, created 537 event attrs (0 failures), 275 descs

after fix

[    5.101572] IOMMU table initialized, virtual merging enabled
[    5.116984] papr_scm ibm,persistent-memory:ibm,pmemory@44104001: Querying SCM details
[    5.117223] papr_scm ibm,persistent-memory:ibm,pmemory@44108001: Querying SCM details
[    5.120530] hv-24x7: read 1530 catalog entries, created 537 event attrs (0 failures), 275 descs

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903123452.28620-2-aneesh.kumar@linux.ibm.com
---
 arch/powerpc/platforms/pseries/papr_scm.c | 48 +++++++++++++++++++----
 1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index 3bef4d298ac6..61883291defc 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -65,10 +65,8 @@ static int drc_pmem_bind(struct papr_scm_priv *p)
 		cond_resched();
 	} while (rc == H_BUSY);
 
-	if (rc) {
-		dev_err(&p->pdev->dev, "bind err: %lld\n", rc);
+	if (rc)
 		return rc;
-	}
 
 	p->bound_addr = saved;
 	dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index, &p->res);
@@ -110,6 +108,42 @@ static void drc_pmem_unbind(struct papr_scm_priv *p)
 	return;
 }
 
+static int drc_pmem_query_n_bind(struct papr_scm_priv *p)
+{
+	unsigned long start_addr;
+	unsigned long end_addr;
+	unsigned long ret[PLPAR_HCALL_BUFSIZE];
+	int64_t rc;
+
+
+	rc = plpar_hcall(H_SCM_QUERY_BLOCK_MEM_BINDING, ret,
+			 p->drc_index, 0);
+	if (rc)
+		goto err_out;
+	start_addr = ret[0];
+
+	/* Make sure the full region is bound. */
+	rc = plpar_hcall(H_SCM_QUERY_BLOCK_MEM_BINDING, ret,
+			 p->drc_index, p->blocks - 1);
+	if (rc)
+		goto err_out;
+	end_addr = ret[0];
+
+	if ((end_addr - start_addr) != ((p->blocks - 1) * p->block_size))
+		goto err_out;
+
+	p->bound_addr = start_addr;
+	dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index, &p->res);
+	return rc;
+
+err_out:
+	dev_info(&p->pdev->dev,
+		 "Failed to query, trying an unbind followed by bind");
+	drc_pmem_unbind(p);
+	return drc_pmem_bind(p);
+}
+
+
 static int papr_scm_meta_get(struct papr_scm_priv *p,
 			     struct nd_cmd_get_config_data_hdr *hdr)
 {
@@ -430,13 +464,11 @@ static int papr_scm_probe(struct platform_device *pdev)
 	rc = drc_pmem_bind(p);
 
 	/* If phyp says drc memory still bound then force unbound and retry */
-	if (rc == H_OVERLAP) {
-		dev_warn(&pdev->dev, "Retrying bind after unbinding\n");
-		drc_pmem_unbind(p);
-		rc = drc_pmem_bind(p);
-	}
+	if (rc == H_OVERLAP)
+		rc = drc_pmem_query_n_bind(p);
 
 	if (rc != H_SUCCESS) {
+		dev_err(&p->pdev->dev, "bind err: %d\n", rc);
 		rc = -ENXIO;
 		goto err;
 	}

From 253c892193ab58da6b1d94371285971b22c63260 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Thu, 26 Sep 2019 22:25:02 +1000
Subject: [PATCH 16/16] powerpc/eeh: Fix eeh eeh_debugfs_break_device() with
 SRIOV devices

s/CONFIG_IOV/CONFIG_PCI_IOV/

Whoops.

Fixes: bd6461cc7b3c ("powerpc/eeh: Add a eeh_dev_break debugfs interface")
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
[mpe: Fixup the #endif comment as well]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190926122502.14826-1-oohall@gmail.com
---
 arch/powerpc/kernel/eeh.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 0a91dee51245..bc8a551013be 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1960,7 +1960,7 @@ static int eeh_debugfs_break_device(struct pci_dev *pdev)
 	pci_err(pdev, "Going to break: %pR\n", bar);
 
 	if (pdev->is_virtfn) {
-#ifndef CONFIG_IOV
+#ifndef CONFIG_PCI_IOV
 		return -ENXIO;
 #else
 		/*
@@ -1980,7 +1980,7 @@ static int eeh_debugfs_break_device(struct pci_dev *pdev)
 		pos  = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
 		pos += PCI_SRIOV_CTRL;
 		bit  = PCI_SRIOV_CTRL_MSE;
-#endif /* !CONFIG_IOV */
+#endif /* !CONFIG_PCI_IOV */
 	} else {
 		bit = PCI_COMMAND_MEMORY;
 		pos = PCI_COMMAND;