From ae2e953fdca791270e80c08d6a830d9aa472a111 Mon Sep 17 00:00:00 2001
From: Nathan Lynch <nathanl@linux.ibm.com>
Date: Thu, 18 Jul 2019 11:22:14 -0500
Subject: [PATCH 001/264] powerpc/rtas: Unexport rtas_online_cpus_mask,
 rtas_offline_cpus_mask

These aren't used by modular code, nor should they be.

Fixes: 120496ac2d2d ("powerpc: Bring all threads online prior to migration/hibernation")
Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190718162214.5694-1-nathanl@linux.ibm.com
---
 arch/powerpc/kernel/rtas.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 5faf0a64c92b..49159bb38949 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -922,13 +922,11 @@ int rtas_online_cpus_mask(cpumask_var_t cpus)
 
 	return ret;
 }
-EXPORT_SYMBOL(rtas_online_cpus_mask);
 
 int rtas_offline_cpus_mask(cpumask_var_t cpus)
 {
 	return rtas_cpu_state_change_mask(DOWN, cpus);
 }
-EXPORT_SYMBOL(rtas_offline_cpus_mask);
 
 int rtas_ibm_suspend_me(u64 handle)
 {

From 0df3e42167caaf9f8c7b64de3da40a459979afe8 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <natechancellor@gmail.com>
Date: Mon, 3 Jun 2019 15:11:58 -0700
Subject: [PATCH 002/264] PCI: rpaphp: Avoid a sometimes-uninitialized warning

When building with -Wsometimes-uninitialized, clang warns:

drivers/pci/hotplug/rpaphp_core.c:243:14: warning: variable 'fndit' is
used uninitialized whenever 'for' loop exits because its condition is
false [-Wsometimes-uninitialized]
        for (j = 0; j < entries; j++) {
                    ^~~~~~~~~~~
drivers/pci/hotplug/rpaphp_core.c:256:6: note: uninitialized use occurs
here
        if (fndit)
            ^~~~~
drivers/pci/hotplug/rpaphp_core.c:243:14: note: remove the condition if
it is always true
        for (j = 0; j < entries; j++) {
                    ^~~~~~~~~~~
drivers/pci/hotplug/rpaphp_core.c:233:14: note: initialize the variable
'fndit' to silence this warning
        int j, fndit;
                    ^
                     = 0

fndit is only used to gate a sprintf() call, so move that call into the
loop. This simplifies the code, eliminates the local variable, and
fixes the warning.
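
As a minimal sketch of the pattern clang objects to (hypothetical
reduction, not the driver's code):

    int demo(int n)
    {
        int found;                  /* never initialized */
        int i;

        for (i = 0; i < n; i++) {
            if (i != 3)             /* stand-in for the real match test */
                continue;
            found = 1;              /* assigned only on the break path */
            break;
        }
        if (found)                  /* garbage when the loop ran dry */
            return i;
        return -1;
    }

Moving the guarded work into the loop body, under the match condition,
removes the flag entirely, which is what this patch does.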

Fixes: 2fcf3ae508c2 ("hotplug/drc-info: Add code to search ibm,drc-info property")
Suggested-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Acked-by: Tyrel Datwyler <tyreld@linux.ibm.com>
Acked-by: Joel Savitz <jsavitz@redhat.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://github.com/ClangBuiltLinux/linux/issues/504
Link: https://lore.kernel.org/r/20190603221157.58502-1-natechancellor@gmail.com
---
 drivers/pci/hotplug/rpaphp_core.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c
index bcd5d357ca23..c3899ee1db99 100644
--- a/drivers/pci/hotplug/rpaphp_core.c
+++ b/drivers/pci/hotplug/rpaphp_core.c
@@ -230,7 +230,7 @@ static int rpaphp_check_drc_props_v2(struct device_node *dn, char *drc_name,
 	struct of_drc_info drc;
 	const __be32 *value;
 	char cell_drc_name[MAX_DRC_NAME_LEN];
-	int j, fndit;
+	int j;
 
 	info = of_find_property(dn->parent, "ibm,drc-info", NULL);
 	if (info == NULL)
@@ -245,17 +245,13 @@ static int rpaphp_check_drc_props_v2(struct device_node *dn, char *drc_name,
 
 		/* Should now know end of current entry */
 
-		if (my_index > drc.last_drc_index)
-			continue;
-
-		fndit = 1;
-		break;
+		/* Found it */
+		if (my_index <= drc.last_drc_index) {
+			sprintf(cell_drc_name, "%s%d", drc.drc_name_prefix,
+				my_index);
+			break;
+		}
 	}
-	/* Found it */
-
-	if (fndit)
-		sprintf(cell_drc_name, "%s%d", drc.drc_name_prefix, 
-			my_index);
 
 	if (((drc_name == NULL) ||
 	     (drc_name && !strcmp(drc_name, cell_drc_name))) &&

From 08a456aa643776757e07adfdebe7f7681117d144 Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <ajd@linux.ibm.com>
Date: Thu, 9 May 2019 15:11:15 +1000
Subject: [PATCH 003/264] powerpc/powernv: Move SCOM access code into powernv
 platform

The powernv platform is the only one that directly accesses SCOMs.
Move the support code to platforms/powernv, and get rid of the
PPC_SCOM Kconfig option, as SCOM support is always selected when
compiling for powernv.

This also means that the Kconfig item for CONFIG_SCOM_DEBUGFS will
show up under the platform menu in menuconfig, rather than at the top
level, which is a much better location.

Signed-off-by: Andrew Donnellan <ajd@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190509051119.7694-1-ajd@linux.ibm.com
---
 arch/powerpc/platforms/powernv/Kconfig              |  5 ++++-
 arch/powerpc/platforms/powernv/Makefile             |  2 +-
 arch/powerpc/platforms/powernv/opal-xscom.c         |  3 ++-
 arch/powerpc/{sysdev => platforms/powernv}/scom.c   |  3 ++-
 .../{include/asm => platforms/powernv}/scom.h       | 13 +++----------
 arch/powerpc/sysdev/Kconfig                         |  7 -------
 arch/powerpc/sysdev/Makefile                        |  2 --
 7 files changed, 12 insertions(+), 23 deletions(-)
 rename arch/powerpc/{sysdev => platforms/powernv}/scom.c (99%)
 rename arch/powerpc/{include/asm => platforms/powernv}/scom.h (94%)

diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 850eee860cf2..938803eab0ad 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -12,7 +12,6 @@ config PPC_POWERNV
 	select EPAPR_BOOT
 	select PPC_INDIRECT_PIO
 	select PPC_UDBG_16550
-	select PPC_SCOM
 	select ARCH_RANDOM
 	select CPU_FREQ
 	select PPC_DOORBELL
@@ -47,3 +46,7 @@ config PPC_VAS
 	  VAS adapters are found in POWER9 based systems.
 
 	  If unsure, say N.
+
+config SCOM_DEBUGFS
+	bool "Expose SCOM controllers via debugfs"
+	depends on DEBUG_FS
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index da2e99efbd04..4b1644150135 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -4,12 +4,12 @@ obj-y			+= idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
 obj-y			+= rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
 obj-y			+= opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
 obj-y			+= opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
+obj-y			+= opal-xscom.o scom.o
 
 obj-$(CONFIG_SMP)	+= smp.o subcore.o subcore-asm.o
 obj-$(CONFIG_PCI)	+= pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
 obj-$(CONFIG_CXL_BASE)	+= pci-cxl.o
 obj-$(CONFIG_EEH)	+= eeh-powernv.o
-obj-$(CONFIG_PPC_SCOM)	+= opal-xscom.o
 obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o
 obj-$(CONFIG_OPAL_PRD)	+= opal-prd.o
 obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
index 66430eebe869..3f48ee69928c 100644
--- a/arch/powerpc/platforms/powernv/opal-xscom.c
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -14,7 +14,8 @@
 #include <asm/machdep.h>
 #include <asm/firmware.h>
 #include <asm/opal.h>
-#include <asm/scom.h>
+
+#include "scom.h"
 
 /*
  * We could probably fit that inside the scom_map_t
diff --git a/arch/powerpc/sysdev/scom.c b/arch/powerpc/platforms/powernv/scom.c
similarity index 99%
rename from arch/powerpc/sysdev/scom.c
rename to arch/powerpc/platforms/powernv/scom.c
index 94e885bf3aee..74664e9496b8 100644
--- a/arch/powerpc/sysdev/scom.c
+++ b/arch/powerpc/platforms/powernv/scom.c
@@ -10,9 +10,10 @@
 #include <linux/export.h>
 #include <asm/debugfs.h>
 #include <asm/prom.h>
-#include <asm/scom.h>
 #include <linux/uaccess.h>
 
+#include "scom.h"
+
 const struct scom_controller *scom_controller;
 EXPORT_SYMBOL_GPL(scom_controller);
 
diff --git a/arch/powerpc/include/asm/scom.h b/arch/powerpc/platforms/powernv/scom.h
similarity index 94%
rename from arch/powerpc/include/asm/scom.h
rename to arch/powerpc/platforms/powernv/scom.h
index 08c44396e54a..b5c33e30d39c 100644
--- a/arch/powerpc/include/asm/scom.h
+++ b/arch/powerpc/platforms/powernv/scom.h
@@ -5,12 +5,8 @@
  *     and        David Gibson, IBM Corporation.
  */
 
-#ifndef _ASM_POWERPC_SCOM_H
-#define _ASM_POWERPC_SCOM_H
-
-#ifdef __KERNEL__
-#ifndef __ASSEMBLY__
-#ifdef CONFIG_PPC_SCOM
+#ifndef _SCOM_H
+#define _SCOM_H
 
 /*
  * The SCOM bus is a sideband bus used for accessing various internal
@@ -148,7 +144,4 @@ static inline int scom_write(scom_map_t map, u64 reg, u64 value)
 }
 
 
-#endif /* CONFIG_PPC_SCOM */
-#endif /* __ASSEMBLY__ */
-#endif /* __KERNEL__ */
-#endif /* _ASM_POWERPC_SCOM_H */
+#endif /* _SCOM_H */
diff --git a/arch/powerpc/sysdev/Kconfig b/arch/powerpc/sysdev/Kconfig
index d23288c4abf6..9ebcc1337560 100644
--- a/arch/powerpc/sysdev/Kconfig
+++ b/arch/powerpc/sysdev/Kconfig
@@ -28,13 +28,6 @@ config PPC_MSI_BITMAP
 source "arch/powerpc/sysdev/xics/Kconfig"
 source "arch/powerpc/sysdev/xive/Kconfig"
 
-config PPC_SCOM
-	bool
-
-config SCOM_DEBUGFS
-	bool "Expose SCOM controllers via debugfs"
-	depends on PPC_SCOM && DEBUG_FS
-
 config GE_FPGA
 	bool
 
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index 9d73dfddf060..603b3c656d19 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -49,8 +49,6 @@ ifdef CONFIG_SUSPEND
 obj-$(CONFIG_PPC_BOOK3S_32)	+= 6xx-suspend.o
 endif
 
-obj-$(CONFIG_PPC_SCOM)		+= scom.o
-
 obj-$(CONFIG_PPC_EARLY_DEBUG_MEMCONS)	+= udbg_memcons.o
 
 obj-$(CONFIG_PPC_XICS)		+= xics/

From 9edc6b919d9fbec694dc37ea14f9587be1b354ec Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <ajd@linux.ibm.com>
Date: Thu, 9 May 2019 15:11:16 +1000
Subject: [PATCH 004/264] powerpc/powernv: Remove dead SCOM access code

Nothing is using scom_map_device() or scom_find_parent(). Remove them.

Also stop exporting scom_controller, as it has no other users.

Signed-off-by: Andrew Donnellan <ajd@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190509051119.7694-2-ajd@linux.ibm.com
---
 arch/powerpc/platforms/powernv/scom.c | 67 ---------------------------
 arch/powerpc/platforms/powernv/scom.h | 27 -----------
 2 files changed, 94 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/scom.c b/arch/powerpc/platforms/powernv/scom.c
index 74664e9496b8..acd6968d9277 100644
--- a/arch/powerpc/platforms/powernv/scom.c
+++ b/arch/powerpc/platforms/powernv/scom.c
@@ -15,73 +15,6 @@
 #include "scom.h"
 
 const struct scom_controller *scom_controller;
-EXPORT_SYMBOL_GPL(scom_controller);
-
-struct device_node *scom_find_parent(struct device_node *node)
-{
-	struct device_node *par, *tmp;
-	const u32 *p;
-
-	for (par = of_node_get(node); par;) {
-		if (of_get_property(par, "scom-controller", NULL))
-			break;
-		p = of_get_property(par, "scom-parent", NULL);
-		tmp = par;
-		if (p == NULL)
-			par = of_get_parent(par);
-		else
-			par = of_find_node_by_phandle(*p);
-		of_node_put(tmp);
-	}
-	return par;
-}
-EXPORT_SYMBOL_GPL(scom_find_parent);
-
-scom_map_t scom_map_device(struct device_node *dev, int index)
-{
-	struct device_node *parent;
-	unsigned int cells, size;
-	const __be32 *prop, *sprop;
-	u64 reg, cnt;
-	scom_map_t ret;
-
-	parent = scom_find_parent(dev);
-
-	if (parent == NULL)
-		return NULL;
-
-	/*
-	 * We support "scom-reg" properties for adding scom registers
-	 * to a random device-tree node with an explicit scom-parent
-	 *
-	 * We also support the simple "reg" property if the device is
-	 * a direct child of a scom controller.
-	 *
-	 * In case both exist, "scom-reg" takes precedence.
-	 */
-	prop = of_get_property(dev, "scom-reg", &size);
-	sprop = of_get_property(parent, "#scom-cells", NULL);
-	if (!prop && parent == dev->parent) {
-		prop = of_get_property(dev, "reg", &size);
-		sprop = of_get_property(parent, "#address-cells", NULL);
-	}
-	if (!prop)
-		return NULL;
-	cells = sprop ? be32_to_cpup(sprop) : 1;
-	size >>= 2;
-
-	if (index >= (size / (2*cells)))
-		return NULL;
-
-	reg = of_read_number(&prop[index * cells * 2], cells);
-	cnt = of_read_number(&prop[index * cells * 2 + cells], cells);
-
-	ret = scom_map(parent, reg, cnt);
-	of_node_put(parent);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(scom_map_device);
 
 #ifdef CONFIG_SCOM_DEBUGFS
 struct scom_debug_entry {
diff --git a/arch/powerpc/platforms/powernv/scom.h b/arch/powerpc/platforms/powernv/scom.h
index b5c33e30d39c..9827a66ef147 100644
--- a/arch/powerpc/platforms/powernv/scom.h
+++ b/arch/powerpc/platforms/powernv/scom.h
@@ -75,33 +75,6 @@ static inline scom_map_t scom_map(struct device_node *ctrl_dev,
 	return scom_controller->map(ctrl_dev, reg, count);
 }
 
-/**
- * scom_find_parent - Find the SCOM controller for a device
- * @dev: OF node of the device
- *
- * This is not meant for general usage, but in combination with
- * scom_map() allows to map registers not represented by the
- * device own scom-reg property. Useful for applying HW workarounds
- * on things not properly represented in the device-tree for example.
- */
-struct device_node *scom_find_parent(struct device_node *dev);
-
-
-/**
- * scom_map_device - Map a device's block of SCOM registers
- * @dev: OF node of the device
- * @index: Register bank index (index in "scom-reg" property)
- *
- * This function will use the device-tree binding for SCOM which
- * is to follow "scom-parent" properties until it finds a node with
- * a "scom-controller" property to find the controller. It will then
- * use the "scom-reg" property which is made of reg/count pairs,
- * each of them having a size defined by the controller's #scom-cells
- * property
- */
-extern scom_map_t scom_map_device(struct device_node *dev, int index);
-
-
 /**
  * scom_unmap - Unmap a block of SCOM registers
  * @map: Result of scom_map is to be unmapped

From bfd2f0d49aef8abfe6bf58f12719f39912993cc6 Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <ajd@linux.ibm.com>
Date: Thu, 9 May 2019 15:11:17 +1000
Subject: [PATCH 005/264] powerpc/powernv: Get rid of old scom_controller
 abstraction

Once upon a time, the SCOM access code was used by the WSP platform as
well as powernv. Thus it made sense to have a generic SCOM access
interface to abstract between different platforms.

Now that it's just powernv, with no other platforms currently on the
horizon, let's rip out scom_controller and make everything much
simpler and more direct.

While we're here, fix up the comment block at the top.

Signed-off-by: Andrew Donnellan <ajd@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190509051119.7694-3-ajd@linux.ibm.com
---
 arch/powerpc/platforms/powernv/Makefile     |   2 +-
 arch/powerpc/platforms/powernv/opal-xscom.c | 234 +++++++++++++-------
 arch/powerpc/platforms/powernv/scom.c       | 157 -------------
 arch/powerpc/platforms/powernv/scom.h       | 120 ----------
 4 files changed, 152 insertions(+), 361 deletions(-)
 delete mode 100644 arch/powerpc/platforms/powernv/scom.c
 delete mode 100644 arch/powerpc/platforms/powernv/scom.h

diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 4b1644150135..69a3aefa905b 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -4,7 +4,6 @@ obj-y			+= idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
 obj-y			+= rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
 obj-y			+= opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
 obj-y			+= opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
-obj-y			+= opal-xscom.o scom.o
 
 obj-$(CONFIG_SMP)	+= smp.o subcore.o subcore-asm.o
 obj-$(CONFIG_PCI)	+= pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
@@ -16,3 +15,4 @@ obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
 obj-$(CONFIG_PPC_MEMTRACE)	+= memtrace.o
 obj-$(CONFIG_PPC_VAS)	+= vas.o vas-window.o vas-debug.o
 obj-$(CONFIG_OCXL_BASE)	+= ocxl.o
+obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o
diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
index 3f48ee69928c..cb172af2f53a 100644
--- a/arch/powerpc/platforms/powernv/opal-xscom.c
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -1,7 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * PowerNV LPC bus handling.
+ * PowerNV SCOM bus debugfs interface
  *
+ * Copyright 2010 Benjamin Herrenschmidt, IBM Corp
+ *                <benh@kernel.crashing.org>
+ *     and        David Gibson, IBM Corporation.
  * Copyright 2013 IBM Corp.
  */
 
@@ -10,63 +13,13 @@
 #include <linux/bug.h>
 #include <linux/gfp.h>
 #include <linux/slab.h>
+#include <linux/uaccess.h>
 
 #include <asm/machdep.h>
 #include <asm/firmware.h>
 #include <asm/opal.h>
-
-#include "scom.h"
-
-/*
- * We could probably fit that inside the scom_map_t
- * which is a void* after all but it's really too ugly
- * so let's kmalloc it for now
- */
-struct opal_scom_map {
-	uint32_t chip;
-	uint64_t addr;
-};
-
-static scom_map_t opal_scom_map(struct device_node *dev, u64 reg, u64 count)
-{
-	struct opal_scom_map *m;
-	const __be32 *gcid;
-
-	if (!of_get_property(dev, "scom-controller", NULL)) {
-		pr_err("%s: device %pOF is not a SCOM controller\n",
-			__func__, dev);
-		return SCOM_MAP_INVALID;
-	}
-	gcid = of_get_property(dev, "ibm,chip-id", NULL);
-	if (!gcid) {
-		pr_err("%s: device %pOF has no ibm,chip-id\n",
-			__func__, dev);
-		return SCOM_MAP_INVALID;
-	}
-	m = kmalloc(sizeof(*m), GFP_KERNEL);
-	if (!m)
-		return NULL;
-	m->chip = be32_to_cpup(gcid);
-	m->addr = reg;
-
-	return (scom_map_t)m;
-}
-
-static void opal_scom_unmap(scom_map_t map)
-{
-	kfree(map);
-}
-
-static int opal_xscom_err_xlate(int64_t rc)
-{
-	switch(rc) {
-	case 0:
-		return 0;
-	/* Add more translations if necessary */
-	default:
-		return -EIO;
-	}
-}
+#include <asm/debugfs.h>
+#include <asm/prom.h>
 
 static u64 opal_scom_unmangle(u64 addr)
 {
@@ -99,39 +52,154 @@ static u64 opal_scom_unmangle(u64 addr)
 	return addr;
 }
 
-static int opal_scom_read(scom_map_t map, u64 reg, u64 *value)
+static int opal_scom_read(uint32_t chip, uint64_t addr, u64 reg, u64 *value)
 {
-	struct opal_scom_map *m = map;
 	int64_t rc;
 	__be64 v;
 
-	reg = opal_scom_unmangle(m->addr + reg);
-	rc = opal_xscom_read(m->chip, reg, (__be64 *)__pa(&v));
+	reg = opal_scom_unmangle(addr + reg);
+	rc = opal_xscom_read(chip, reg, (__be64 *)__pa(&v));
+	if (rc) {
+		*value = 0xfffffffffffffffful;
+		return -EIO;
+	}
 	*value = be64_to_cpu(v);
-	return opal_xscom_err_xlate(rc);
-}
-
-static int opal_scom_write(scom_map_t map, u64 reg, u64 value)
-{
-	struct opal_scom_map *m = map;
-	int64_t rc;
-
-	reg = opal_scom_unmangle(m->addr + reg);
-	rc = opal_xscom_write(m->chip, reg, value);
-	return opal_xscom_err_xlate(rc);
-}
-
-static const struct scom_controller opal_scom_controller = {
-	.map	= opal_scom_map,
-	.unmap	= opal_scom_unmap,
-	.read	= opal_scom_read,
-	.write	= opal_scom_write
-};
-
-static int opal_xscom_init(void)
-{
-	if (firmware_has_feature(FW_FEATURE_OPAL))
-		scom_init(&opal_scom_controller);
 	return 0;
 }
-machine_arch_initcall(powernv, opal_xscom_init);
+
+static int opal_scom_write(uint32_t chip, uint64_t addr, u64 reg, u64 value)
+{
+	int64_t rc;
+
+	reg = opal_scom_unmangle(addr + reg);
+	rc = opal_xscom_write(chip, reg, value);
+	if (rc)
+		return -EIO;
+	return 0;
+}
+
+struct scom_debug_entry {
+	u32 chip;
+	struct debugfs_blob_wrapper path;
+	char name[16];
+};
+
+static ssize_t scom_debug_read(struct file *filp, char __user *ubuf,
+			       size_t count, loff_t *ppos)
+{
+	struct scom_debug_entry *ent = filp->private_data;
+	u64 __user *ubuf64 = (u64 __user *)ubuf;
+	loff_t off = *ppos;
+	ssize_t done = 0; 
+	u64 reg, reg_base, reg_cnt, val;
+	int rc;
+
+	if (off < 0 || (off & 7) || (count & 7))
+		return -EINVAL;
+	reg_base = off >> 3;
+	reg_cnt = count >> 3;
+
+	for (reg = 0; reg < reg_cnt; reg++) {
+		rc = opal_scom_read(ent->chip, reg_base, reg, &val);
+		if (!rc)
+			rc = put_user(val, ubuf64);
+		if (rc) {
+			if (!done)
+				done = rc;
+			break;
+		}
+		ubuf64++;
+		*ppos += 8;
+		done += 8;
+	}
+	return done;
+}
+
+static ssize_t scom_debug_write(struct file* filp, const char __user *ubuf,
+				size_t count, loff_t *ppos)
+{
+	struct scom_debug_entry *ent = filp->private_data;
+	u64 __user *ubuf64 = (u64 __user *)ubuf;
+	loff_t off = *ppos;
+	ssize_t done = 0; 
+	u64 reg, reg_base, reg_cnt, val;
+	int rc;
+
+	if (off < 0 || (off & 7) || (count & 7))
+		return -EINVAL;
+	reg_base = off >> 3;
+	reg_cnt = count >> 3;
+
+	for (reg = 0; reg < reg_cnt; reg++) {
+		rc = get_user(val, ubuf64);
+		if (!rc)
+			rc = opal_scom_write(ent->chip, reg_base, reg,  val);
+		if (rc) {
+			if (!done)
+				done = rc;
+			break;
+		}
+		ubuf64++;
+		done += 8;
+	}
+	return done;
+}
+
+static const struct file_operations scom_debug_fops = {
+	.read =		scom_debug_read,
+	.write =	scom_debug_write,
+	.open =		simple_open,
+	.llseek =	default_llseek,
+};
+
+static int scom_debug_init_one(struct dentry *root, struct device_node *dn,
+			       int chip)
+{
+	struct scom_debug_entry *ent;
+	struct dentry *dir;
+
+	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
+	if (!ent)
+		return -ENOMEM;
+
+	ent->chip = chip;
+	snprintf(ent->name, 16, "%08x", chip);
+	ent->path.data = (void*)kasprintf(GFP_KERNEL, "%pOF", dn);
+	ent->path.size = strlen((char *)ent->path.data);
+
+	dir = debugfs_create_dir(ent->name, root);
+	if (!dir) {
+		kfree(ent->path.data);
+		kfree(ent);
+		return -1;
+	}
+
+	debugfs_create_blob("devspec", 0400, dir, &ent->path);
+	debugfs_create_file("access", 0600, dir, ent, &scom_debug_fops);
+
+	return 0;
+}
+
+static int scom_debug_init(void)
+{
+	struct device_node *dn;
+	struct dentry *root;
+	int chip, rc;
+
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
+		return 0;
+
+	root = debugfs_create_dir("scom", powerpc_debugfs_root);
+	if (!root)
+		return -1;
+
+	rc = 0;
+	for_each_node_with_property(dn, "scom-controller") {
+		chip = of_get_ibm_chip_id(dn);
+		WARN_ON(chip == -1);
+		rc |= scom_debug_init_one(root, dn, chip);
+	}
+
+	return rc;
+}
+device_initcall(scom_debug_init);
diff --git a/arch/powerpc/platforms/powernv/scom.c b/arch/powerpc/platforms/powernv/scom.c
deleted file mode 100644
index acd6968d9277..000000000000
--- a/arch/powerpc/platforms/powernv/scom.c
+++ /dev/null
@@ -1,157 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright 2010 Benjamin Herrenschmidt, IBM Corp
- *                <benh@kernel.crashing.org>
- *     and        David Gibson, IBM Corporation.
- */
-
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include <asm/debugfs.h>
-#include <asm/prom.h>
-#include <linux/uaccess.h>
-
-#include "scom.h"
-
-const struct scom_controller *scom_controller;
-
-#ifdef CONFIG_SCOM_DEBUGFS
-struct scom_debug_entry {
-	struct device_node *dn;
-	struct debugfs_blob_wrapper path;
-	char name[16];
-};
-
-static ssize_t scom_debug_read(struct file *filp, char __user *ubuf,
-			       size_t count, loff_t *ppos)
-{
-	struct scom_debug_entry *ent = filp->private_data;
-	u64 __user *ubuf64 = (u64 __user *)ubuf;
-	loff_t off = *ppos;
-	ssize_t done = 0; 
-	u64 reg, reg_cnt, val;
-	scom_map_t map;
-	int rc;
-
-	if (off < 0 || (off & 7) || (count & 7))
-		return -EINVAL;
-	reg = off >> 3;
-	reg_cnt = count >> 3;
-
-	map = scom_map(ent->dn, reg, reg_cnt);
-	if (!scom_map_ok(map))
-		return -ENXIO;
-
-	for (reg = 0; reg < reg_cnt; reg++) {
-		rc = scom_read(map, reg, &val);
-		if (!rc)
-			rc = put_user(val, ubuf64);
-		if (rc) {
-			if (!done)
-				done = rc;
-			break;
-		}
-		ubuf64++;
-		*ppos += 8;
-		done += 8;
-	}
-	scom_unmap(map);
-	return done;
-}
-
-static ssize_t scom_debug_write(struct file* filp, const char __user *ubuf,
-				size_t count, loff_t *ppos)
-{
-	struct scom_debug_entry *ent = filp->private_data;
-	u64 __user *ubuf64 = (u64 __user *)ubuf;
-	loff_t off = *ppos;
-	ssize_t done = 0; 
-	u64 reg, reg_cnt, val;
-	scom_map_t map;
-	int rc;
-
-	if (off < 0 || (off & 7) || (count & 7))
-		return -EINVAL;
-	reg = off >> 3;
-	reg_cnt = count >> 3;
-
-	map = scom_map(ent->dn, reg, reg_cnt);
-	if (!scom_map_ok(map))
-		return -ENXIO;
-
-	for (reg = 0; reg < reg_cnt; reg++) {
-		rc = get_user(val, ubuf64);
-		if (!rc)
-			rc = scom_write(map, reg,  val);
-		if (rc) {
-			if (!done)
-				done = rc;
-			break;
-		}
-		ubuf64++;
-		done += 8;
-	}
-	scom_unmap(map);
-	return done;
-}
-
-static const struct file_operations scom_debug_fops = {
-	.read =		scom_debug_read,
-	.write =	scom_debug_write,
-	.open =		simple_open,
-	.llseek =	default_llseek,
-};
-
-static int scom_debug_init_one(struct dentry *root, struct device_node *dn,
-			       int i)
-{
-	struct scom_debug_entry *ent;
-	struct dentry *dir;
-
-	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
-	if (!ent)
-		return -ENOMEM;
-
-	ent->dn = of_node_get(dn);
-	snprintf(ent->name, 16, "%08x", i);
-	ent->path.data = (void*)kasprintf(GFP_KERNEL, "%pOF", dn);
-	ent->path.size = strlen((char *)ent->path.data);
-
-	dir = debugfs_create_dir(ent->name, root);
-	if (!dir) {
-		of_node_put(dn);
-		kfree(ent->path.data);
-		kfree(ent);
-		return -1;
-	}
-
-	debugfs_create_blob("devspec", 0400, dir, &ent->path);
-	debugfs_create_file("access", 0600, dir, ent, &scom_debug_fops);
-
-	return 0;
-}
-
-static int scom_debug_init(void)
-{
-	struct device_node *dn;
-	struct dentry *root;
-	int i, rc;
-
-	root = debugfs_create_dir("scom", powerpc_debugfs_root);
-	if (!root)
-		return -1;
-
-	i = rc = 0;
-	for_each_node_with_property(dn, "scom-controller") {
-		int id = of_get_ibm_chip_id(dn);
-		if (id == -1)
-			id = i;
-		rc |= scom_debug_init_one(root, dn, id);
-		i++;
-	}
-
-	return rc;
-}
-device_initcall(scom_debug_init);
-#endif /* CONFIG_SCOM_DEBUGFS */
diff --git a/arch/powerpc/platforms/powernv/scom.h b/arch/powerpc/platforms/powernv/scom.h
deleted file mode 100644
index 9827a66ef147..000000000000
--- a/arch/powerpc/platforms/powernv/scom.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Copyright 2010 Benjamin Herrenschmidt, IBM Corp
- *                <benh@kernel.crashing.org>
- *     and        David Gibson, IBM Corporation.
- */
-
-#ifndef _SCOM_H
-#define _SCOM_H
-
-/*
- * The SCOM bus is a sideband bus used for accessing various internal
- * registers of the processor or the chipset. The implementation details
- * differ between processors and platforms, and the access method as
- * well.
- *
- * This API allows to "map" ranges of SCOM register numbers associated
- * with a given SCOM controller. The later must be represented by a
- * device node, though some implementations might support NULL if there
- * is no possible ambiguity
- *
- * Then, scom_read/scom_write can be used to accesses registers inside
- * that range. The argument passed is a register number relative to
- * the beginning of the range mapped.
- */
-
-typedef void *scom_map_t;
-
-/* Value for an invalid SCOM map */
-#define SCOM_MAP_INVALID	(NULL)
-
-/* The scom_controller data structure is what the platform passes
- * to the core code in scom_init, it provides the actual implementation
- * of all the SCOM functions
- */
-struct scom_controller {
-	scom_map_t (*map)(struct device_node *ctrl_dev, u64 reg, u64 count);
-	void (*unmap)(scom_map_t map);
-
-	int (*read)(scom_map_t map, u64 reg, u64 *value);
-	int (*write)(scom_map_t map, u64 reg, u64 value);
-};
-
-extern const struct scom_controller *scom_controller;
-
-/**
- * scom_init - Initialize the SCOM backend, called by the platform
- * @controller: The platform SCOM controller
- */
-static inline void scom_init(const struct scom_controller *controller)
-{
-	scom_controller = controller;
-}
-
-/**
- * scom_map_ok - Test is a SCOM mapping is successful
- * @map: The result of scom_map to test
- */
-static inline int scom_map_ok(scom_map_t map)
-{
-	return map != SCOM_MAP_INVALID;
-}
-
-/**
- * scom_map - Map a block of SCOM registers
- * @ctrl_dev: Device node of the SCOM controller
- *            some implementations allow NULL here
- * @reg: first SCOM register to map
- * @count: Number of SCOM registers to map
- */
-
-static inline scom_map_t scom_map(struct device_node *ctrl_dev,
-				  u64 reg, u64 count)
-{
-	return scom_controller->map(ctrl_dev, reg, count);
-}
-
-/**
- * scom_unmap - Unmap a block of SCOM registers
- * @map: Result of scom_map is to be unmapped
- */
-static inline void scom_unmap(scom_map_t map)
-{
-	if (scom_map_ok(map))
-		scom_controller->unmap(map);
-}
-
-/**
- * scom_read - Read a SCOM register
- * @map: Result of scom_map
- * @reg: Register index within that map
- * @value: Updated with the value read
- *
- * Returns 0 (success) or a negative error code
- */
-static inline int scom_read(scom_map_t map, u64 reg, u64 *value)
-{
-	int rc;
-
-	rc = scom_controller->read(map, reg, value);
-	if (rc)
-		*value = 0xfffffffffffffffful;
-	return rc;
-}
-
-/**
- * scom_write - Write to a SCOM register
- * @map: Result of scom_map
- * @reg: Register index within that map
- * @value: Value to write
- *
- * Returns 0 (success) or a negative error code
- */
-static inline int scom_write(scom_map_t map, u64 reg, u64 value)
-{
-	return scom_controller->write(map, reg, value);
-}
-
-
-#endif /* _SCOM_H */

From 8c98db8d5851db45c50024a29c3dd1779dc4da0b Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <ajd@linux.ibm.com>
Date: Thu, 9 May 2019 15:11:18 +1000
Subject: [PATCH 006/264] powerpc/powernv: Fix checkpatch warnings in
 opal-xscom.c

Signed-off-by: Andrew Donnellan <ajd@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190509051119.7694-4-ajd@linux.ibm.com
---
 arch/powerpc/platforms/powernv/opal-xscom.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
index cb172af2f53a..fd510d961b8c 100644
--- a/arch/powerpc/platforms/powernv/opal-xscom.c
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -90,7 +90,7 @@ static ssize_t scom_debug_read(struct file *filp, char __user *ubuf,
 	struct scom_debug_entry *ent = filp->private_data;
 	u64 __user *ubuf64 = (u64 __user *)ubuf;
 	loff_t off = *ppos;
-	ssize_t done = 0; 
+	ssize_t done = 0;
 	u64 reg, reg_base, reg_cnt, val;
 	int rc;
 
@@ -115,13 +115,13 @@ static ssize_t scom_debug_read(struct file *filp, char __user *ubuf,
 	return done;
 }
 
-static ssize_t scom_debug_write(struct file* filp, const char __user *ubuf,
+static ssize_t scom_debug_write(struct file *filp, const char __user *ubuf,
 				size_t count, loff_t *ppos)
 {
 	struct scom_debug_entry *ent = filp->private_data;
 	u64 __user *ubuf64 = (u64 __user *)ubuf;
 	loff_t off = *ppos;
-	ssize_t done = 0; 
+	ssize_t done = 0;
 	u64 reg, reg_base, reg_cnt, val;
 	int rc;
 
@@ -164,7 +164,7 @@ static int scom_debug_init_one(struct dentry *root, struct device_node *dn,
 
 	ent->chip = chip;
 	snprintf(ent->name, 16, "%08x", chip);
-	ent->path.data = (void*)kasprintf(GFP_KERNEL, "%pOF", dn);
+	ent->path.data = (void *)kasprintf(GFP_KERNEL, "%pOF", dn);
 	ent->path.size = strlen((char *)ent->path.data);
 
 	dir = debugfs_create_dir(ent->name, root);

From 8b856a0942a1b4d832966985fcdf1a455eb6ab8c Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <ajd@linux.ibm.com>
Date: Thu, 9 May 2019 15:11:19 +1000
Subject: [PATCH 007/264] powerpc/configs: Disable SCOM_DEBUGFS in
 powernv_defconfig

SCOM_DEBUGFS is really not needed for anything other than low-level
hardware debugging.

mpe: It also introduces a large and poorly documented/understood
attack surface. Although the interface is only available to root, the
kernel still aspires to restrict root to accessing hardware through
well defined interfaces, which this is not.

opal-prd uses its own interface (/dev/prd) for SCOM access, so it
doesn't need SCOM_DEBUGFS.

At some point in the future we'll introduce a debug config fragment
where this can go instead.

Signed-off-by: Andrew Donnellan <ajd@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190509051119.7694-5-ajd@linux.ibm.com
---
 arch/powerpc/configs/powernv_defconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index 34219d555e8a..6658cceb928c 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -38,7 +38,7 @@ CONFIG_MODULE_UNLOAD=y
 CONFIG_MODVERSIONS=y
 CONFIG_MODULE_SRCVERSION_ALL=y
 CONFIG_PARTITION_ADVANCED=y
-CONFIG_SCOM_DEBUGFS=y
+# CONFIG_SCOM_DEBUGFS is not set
 CONFIG_OPAL_PRD=y
 CONFIG_PPC_MEMTRACE=y
 # CONFIG_PPC_PSERIES is not set

From e7de4f7b64c23e503a8c42af98d56f2a7462bd6d Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <ajd@linux.ibm.com>
Date: Fri, 3 May 2019 17:52:53 +1000
Subject: [PATCH 008/264] powerpc/powernv: Restrict OPAL symbol map to only be
 readable by root

Currently the OPAL symbol map is globally readable, which is
undesirable as it contains physical addresses.

Restrict it to root.
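
For context: BIN_ATTR_RO() hardcodes a world-readable 0444 mode, which
is why the patch switches to an open-coded struct bin_attribute. A
sketch of the relevant sysfs macro (paraphrased from memory, not
verbatim):

    #define __BIN_ATTR_RO(_name, _size) {                             \
        .attr = { .name = __stringify(_name), .mode = 0444 },         \
        .read = _name##_read,                                         \
        .size = _size,                                                \
    }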

Fixes: c8742f85125d ("powerpc/powernv: Expose OPAL firmware symbol map")
Cc: stable@vger.kernel.org # v3.19+
Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Andrew Donnellan <ajd@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190503075253.22798-1-ajd@linux.ibm.com
---
 arch/powerpc/platforms/powernv/opal.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index aba443be7daa..d271accf224b 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -705,7 +705,10 @@ static ssize_t symbol_map_read(struct file *fp, struct kobject *kobj,
 				       bin_attr->size);
 }
 
-static BIN_ATTR_RO(symbol_map, 0);
+static struct bin_attribute symbol_map_attr = {
+	.attr = {.name = "symbol_map", .mode = 0400},
+	.read = symbol_map_read
+};
 
 static void opal_export_symmap(void)
 {
@@ -722,10 +725,10 @@ static void opal_export_symmap(void)
 		return;
 
 	/* Setup attributes */
-	bin_attr_symbol_map.private = __va(be64_to_cpu(syms[0]));
-	bin_attr_symbol_map.size = be64_to_cpu(syms[1]);
+	symbol_map_attr.private = __va(be64_to_cpu(syms[0]));
+	symbol_map_attr.size = be64_to_cpu(syms[1]);
 
-	rc = sysfs_create_bin_file(opal_kobj, &bin_attr_symbol_map);
+	rc = sysfs_create_bin_file(opal_kobj, &symbol_map_attr);
 	if (rc)
 		pr_warn("Error %d creating OPAL symbols file\n", rc);
 }

From 461cef2a676e7c578d0ef62969dbcb8237c8631d Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Wed, 31 Jul 2019 06:31:41 +0000
Subject: [PATCH 009/264] powerpc/32: activate ARCH_HAS_PMEM_API and
 ARCH_HAS_UACCESS_FLUSHCACHE

PPC32 also has flush_dcache_range(), so it can support
ARCH_HAS_PMEM_API and ARCH_HAS_UACCESS_FLUSHCACHE without further
changes.
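
For reference, the pmem helpers being enabled here are thin wrappers
around that routine; arch/powerpc/lib/pmem.c is roughly of this shape
(a sketch, not the verbatim file):

    #include <linux/export.h>
    #include <asm/cacheflush.h>

    void arch_wb_cache_pmem(void *addr, size_t size)
    {
        unsigned long start = (unsigned long)addr;

        flush_dcache_range(start, start + size);
    }
    EXPORT_SYMBOL(arch_wb_cache_pmem);

Since flush_dcache_range() exists on PPC32 as well as PPC64, the same
code builds on both, and only the Kconfig and Makefile guards change.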

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/a682a2f9db308c5cfe77e45aa3352e41bc9f4e33.1564554634.git.christophe.leroy@c-s.fr
---
 arch/powerpc/Kconfig      | 4 ++--
 arch/powerpc/lib/Makefile | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 77f6ebf97113..9bc8ac5b8c96 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -129,14 +129,14 @@ config PPC
 	select ARCH_HAS_HUGEPD			if HUGETLB_PAGE
 	select ARCH_HAS_MMIOWB			if PPC64
 	select ARCH_HAS_PHYS_TO_DMA
-	select ARCH_HAS_PMEM_API                if PPC64
+	select ARCH_HAS_PMEM_API
 	select ARCH_HAS_PTE_DEVMAP		if PPC_BOOK3S_64
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_MEMBARRIER_CALLBACKS
 	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE && PPC64
 	select ARCH_HAS_STRICT_KERNEL_RWX	if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION)
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
-	select ARCH_HAS_UACCESS_FLUSHCACHE	if PPC64
+	select ARCH_HAS_UACCESS_FLUSHCACHE
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_KEEP_MEMBLOCK
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index eebc782d89a5..f030eea2171d 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -16,7 +16,7 @@ CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING
 CFLAGS_feature-fixups.o += -DDISABLE_BRANCH_PROFILING
 endif
 
-obj-y += alloc.o code-patching.o feature-fixups.o
+obj-y += alloc.o code-patching.o feature-fixups.o pmem.o
 
 ifndef CONFIG_KASAN
 obj-y	+=	string.o memcmp_$(BITS).o
@@ -39,7 +39,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \
 			       memcpy_power7.o
 
 obj64-y	+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
-	   memcpy_64.o pmem.o
+	   memcpy_64.o
 
 obj64-$(CONFIG_SMP)	+= locks.o
 obj64-$(CONFIG_ALTIVEC)	+= vmx-helper.o

From 1ebe0dcce1750109181d666394b7dfd9af9ff645 Mon Sep 17 00:00:00 2001
From: Jordan Niethe <jniethe5@gmail.com>
Date: Fri, 2 Aug 2019 10:08:35 +1000
Subject: [PATCH 010/264] powerpc/xive: Update comment referencing magic loads
 from an ESB

The comment above xive_esb_read() references the magic loads from an
ESB as described in xive.h. This has been inaccurate since commit
12c1f339cd49 ("powerpc/xive: Move definition of ESB bits"), which moved
the description. Update the comment to reference the new location of
the description in xive-regs.h.

Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
Acked-by: Stewart Smith <stewart@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802000835.26191-1-jniethe5@gmail.com
---
 arch/powerpc/sysdev/xive/common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index 1cdb39575eae..083f657091d7 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -185,7 +185,7 @@ static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek)
 
 /*
  * This is used to perform the magic loads from an ESB
- * described in xive.h
+ * described in xive-regs.h
  */
 static notrace u8 xive_esb_read(struct xive_irq_data *xd, u32 offset)
 {

From 9616dda1aaddb2080122aaeab96ad7fc064e36b4 Mon Sep 17 00:00:00 2001
From: Leonardo Bras <leonardo@linux.ibm.com>
Date: Thu, 1 Aug 2019 19:52:51 -0300
Subject: [PATCH 011/264] powerpc/pseries/hotplug-memory.c: Replace nested ifs
 by switch-case

I noticed these nested ifs can be easily replaced by switch-cases,
which can improve readability.

Signed-off-by: Leonardo Bras <leonardo@linux.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190801225251.17864-1-leonardo@linux.ibm.com
---
 .../platforms/pseries/hotplug-memory.c        | 26 +++++++++++++------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 46d0d35b9ca4..8e700390f3d6 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -880,34 +880,44 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
 
 	switch (hp_elog->action) {
 	case PSERIES_HP_ELOG_ACTION_ADD:
-		if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) {
+		switch (hp_elog->id_type) {
+		case PSERIES_HP_ELOG_ID_DRC_COUNT:
 			count = hp_elog->_drc_u.drc_count;
 			rc = dlpar_memory_add_by_count(count);
-		} else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) {
+			break;
+		case PSERIES_HP_ELOG_ID_DRC_INDEX:
 			drc_index = hp_elog->_drc_u.drc_index;
 			rc = dlpar_memory_add_by_index(drc_index);
-		} else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) {
+			break;
+		case PSERIES_HP_ELOG_ID_DRC_IC:
 			count = hp_elog->_drc_u.ic.count;
 			drc_index = hp_elog->_drc_u.ic.index;
 			rc = dlpar_memory_add_by_ic(count, drc_index);
-		} else {
+			break;
+		default:
 			rc = -EINVAL;
+			break;
 		}
 
 		break;
 	case PSERIES_HP_ELOG_ACTION_REMOVE:
-		if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) {
+		switch (hp_elog->id_type) {
+		case PSERIES_HP_ELOG_ID_DRC_COUNT:
 			count = hp_elog->_drc_u.drc_count;
 			rc = dlpar_memory_remove_by_count(count);
-		} else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) {
+			break;
+		case PSERIES_HP_ELOG_ID_DRC_INDEX:
 			drc_index = hp_elog->_drc_u.drc_index;
 			rc = dlpar_memory_remove_by_index(drc_index);
-		} else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) {
+			break;
+		case PSERIES_HP_ELOG_ID_DRC_IC:
 			count = hp_elog->_drc_u.ic.count;
 			drc_index = hp_elog->_drc_u.ic.index;
 			rc = dlpar_memory_remove_by_ic(count, drc_index);
-		} else {
+			break;
+		default:
 			rc = -EINVAL;
+			break;
 		}
 
 		break;

From 0c9c1d56397518eb823d458b00b06bcccd956794 Mon Sep 17 00:00:00 2001
From: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Date: Tue, 6 Aug 2019 01:49:14 -0300
Subject: [PATCH 012/264] x86, s390: Move ARCH_HAS_MEM_ENCRYPT definition to
 arch/Kconfig

powerpc is also going to use this feature, so put it in a generic location.

Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190806044919.10622-2-bauerman@linux.ibm.com
---
 arch/Kconfig      | 3 +++
 arch/s390/Kconfig | 4 +---
 arch/x86/Kconfig  | 4 +---
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index a7b57dd42c26..89e2e3f64f79 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -925,6 +925,9 @@ config LOCK_EVENT_COUNTS
 	  the chance of application behavior change because of timing
 	  differences. The counts are reported via debugfs.
 
+config ARCH_HAS_MEM_ENCRYPT
+	bool
+
 source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index a4ad2733eedf..f43319c44454 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -1,7 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-config ARCH_HAS_MEM_ENCRYPT
-        def_bool y
-
 config MMU
 	def_bool y
 
@@ -68,6 +65,7 @@ config S390
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_GIGANTIC_PAGE
 	select ARCH_HAS_KCOV
+	select ARCH_HAS_MEM_ENCRYPT
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 222855cc0158..06027809c599 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -68,6 +68,7 @@ config X86
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_KCOV			if X86_64
+	select ARCH_HAS_MEM_ENCRYPT
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
 	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_PTE_DEVMAP		if X86_64
@@ -1518,9 +1519,6 @@ config X86_CPA_STATISTICS
 	  helps to determine the effectiveness of preserving large and huge
 	  page mappings when mapping protections are changed.
 
-config ARCH_HAS_MEM_ENCRYPT
-	def_bool y
-
 config AMD_MEM_ENCRYPT
 	bool "AMD Secure Memory Encryption (SME) support"
 	depends on X86_64 && CPU_SUP_AMD

From 47e5d8f9ed34a5310f45f48bbc15f1e61d83b6db Mon Sep 17 00:00:00 2001
From: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Date: Tue, 6 Aug 2019 01:49:15 -0300
Subject: [PATCH 013/264] swiotlb: Remove call to sme_active()

sme_active() is an x86-specific function, so it's better not to call it
from generic code.

There's no need to mention which memory encryption feature is active, so
just use a more generic message. Besides, other architectures will have
different names for similar technology.

Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190806044919.10622-3-bauerman@linux.ibm.com
---
 kernel/dma/swiotlb.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 9de232229063..f29caad71e13 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -461,8 +461,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
 		panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
 
 	if (mem_encrypt_active())
-		pr_warn_once("%s is active and system is using DMA bounce buffers\n",
-			     sme_active() ? "SME" : "SEV");
+		pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
 
 	mask = dma_get_seg_boundary(hwdev);
 

From e740815a97e2b6d6446792f4328378e66de166d1 Mon Sep 17 00:00:00 2001
From: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Date: Tue, 6 Aug 2019 01:49:16 -0300
Subject: [PATCH 014/264] dma-mapping: Remove dma_check_mask()

sme_active() is an x86-specific function, so it's better not to call it from
generic code. Christoph Hellwig mentioned that "There is no reason why we
should have a special debug printk just for one specific reason why there
is a requirement for a large DMA mask.", so just remove dma_check_mask().

Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190806044919.10622-4-bauerman@linux.ibm.com
---
 kernel/dma/mapping.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 1f628e7ac709..61eeefbfcb36 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -291,12 +291,6 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
 }
 EXPORT_SYMBOL(dma_free_attrs);
 
-static inline void dma_check_mask(struct device *dev, u64 mask)
-{
-	if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1)))
-		dev_warn(dev, "SME is active, device will require DMA bounce buffers\n");
-}
-
 int dma_supported(struct device *dev, u64 mask)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -327,7 +321,6 @@ int dma_set_mask(struct device *dev, u64 mask)
 		return -EIO;
 
 	arch_dma_set_mask(dev, mask);
-	dma_check_mask(dev, mask);
 	*dev->dma_mask = mask;
 	return 0;
 }
@@ -345,7 +338,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
 	if (!dma_supported(dev, mask))
 		return -EIO;
 
-	dma_check_mask(dev, mask);
 	dev->coherent_dma_mask = mask;
 	return 0;
 }

From 284e21fab2cfcf90dacce565e0b12f29e5df00c1 Mon Sep 17 00:00:00 2001
From: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Date: Tue, 6 Aug 2019 01:49:17 -0300
Subject: [PATCH 015/264] x86, s390/mm: Move sme_active() and sme_me_mask to
 x86-specific header

Now that generic code doesn't reference them, move sme_active() and
sme_me_mask to x86's <asm/mem_encrypt.h>.

Also remove the export for sme_active() since it's only used in files that
won't be built as modules. sme_me_mask, on the other hand, is used in
arch/x86/kvm/svm.c (via __sme_set() and __psp_pa()), which can be built
as a module, so its export needs to stay.
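
For reference, those users consume the mask only through helpers of
roughly this shape (a sketch from memory, not verbatim):

    #define __sme_set(x)    ((x) | sme_me_mask)     /* tag an address */
    #define __sme_clr(x)    ((x) & ~sme_me_mask)    /* strip the tag */

so the data export must remain even though the function export can go.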

Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190806044919.10622-5-bauerman@linux.ibm.com
---
 arch/s390/include/asm/mem_encrypt.h |  4 +---
 arch/x86/include/asm/mem_encrypt.h  | 10 ++++++++++
 arch/x86/mm/mem_encrypt.c           |  1 -
 include/linux/mem_encrypt.h         | 14 +-------------
 4 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h
index 3eb018508190..ff813a56bc30 100644
--- a/arch/s390/include/asm/mem_encrypt.h
+++ b/arch/s390/include/asm/mem_encrypt.h
@@ -4,9 +4,7 @@
 
 #ifndef __ASSEMBLY__
 
-#define sme_me_mask	0ULL
-
-static inline bool sme_active(void) { return false; }
+static inline bool mem_encrypt_active(void) { return false; }
 extern bool sev_active(void);
 
 int set_memory_encrypted(unsigned long addr, int numpages);
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 0c196c47d621..848ce43b9040 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -92,6 +92,16 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0;
 
 extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[];
 
+static inline bool mem_encrypt_active(void)
+{
+	return sme_me_mask;
+}
+
+static inline u64 sme_get_me_mask(void)
+{
+	return sme_me_mask;
+}
+
 #endif	/* __ASSEMBLY__ */
 
 #endif	/* __X86_MEM_ENCRYPT_H__ */
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index fece30ca8b0c..94da5a88abe6 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -344,7 +344,6 @@ bool sme_active(void)
 {
 	return sme_me_mask && !sev_enabled;
 }
-EXPORT_SYMBOL(sme_active);
 
 bool sev_active(void)
 {
diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h
index 470bd53a89df..0c5b0ff9eb29 100644
--- a/include/linux/mem_encrypt.h
+++ b/include/linux/mem_encrypt.h
@@ -18,23 +18,11 @@
 
 #else	/* !CONFIG_ARCH_HAS_MEM_ENCRYPT */
 
-#define sme_me_mask	0ULL
-
-static inline bool sme_active(void) { return false; }
+static inline bool mem_encrypt_active(void) { return false; }
 static inline bool sev_active(void) { return false; }
 
 #endif	/* CONFIG_ARCH_HAS_MEM_ENCRYPT */
 
-static inline bool mem_encrypt_active(void)
-{
-	return sme_me_mask;
-}
-
-static inline u64 sme_get_me_mask(void)
-{
-	return sme_me_mask;
-}
-
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 /*
  * The __sme_set() and __sme_clr() macros are useful for adding or removing

From ae7eb82a92fae4d255a0caa9f7c0f99e3babfec1 Mon Sep 17 00:00:00 2001
From: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Date: Tue, 6 Aug 2019 01:49:18 -0300
Subject: [PATCH 016/264] fs/core/vmcore: Move sev_active() reference to x86
 arch code

Secure Encrypted Virtualization is an x86-specific feature, so it shouldn't
appear in generic kernel code because it forces non-x86 architectures to
define the sev_active() function, which doesn't make a lot of sense.

To solve this problem, add an x86 elfcorehdr_read() function to override
the generic weak implementation. To do that, it's necessary to make
read_from_oldmem() public so that it can be used outside of vmcore.c.

Also, remove the export for sev_active() since it's only used in files that
won't be built as modules.
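
The override mechanism in play, reduced to a sketch (both definitions
appear verbatim in the hunks below):

    /* generic default in fs/proc/vmcore.c, now encryption-agnostic */
    ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
    {
        return read_from_oldmem(buf, count, ppos, 0, false);
    }

    /* strong definition in arch/x86/kernel/crash_dump_64.c; the linker
     * prefers it over the weak one, keeping sev_active() inside x86 */
    ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
    {
        return read_from_oldmem(buf, count, ppos, 0, sev_active());
    }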

Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Lianbo Jiang <lijiang@redhat.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190806044919.10622-6-bauerman@linux.ibm.com
---
 arch/x86/kernel/crash_dump_64.c |  5 +++++
 arch/x86/mm/mem_encrypt.c       |  1 -
 fs/proc/vmcore.c                |  8 ++++----
 include/linux/crash_dump.h      | 14 ++++++++++++++
 include/linux/mem_encrypt.h     |  1 -
 5 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 22369dd5de3b..045e82e8945b 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -70,3 +70,8 @@ ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
 {
 	return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true);
 }
+
+ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
+{
+	return read_from_oldmem(buf, count, ppos, 0, sev_active());
+}
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 94da5a88abe6..9268c12458c8 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -349,7 +349,6 @@ bool sev_active(void)
 {
 	return sme_me_mask && sev_enabled;
 }
-EXPORT_SYMBOL(sev_active);
 
 /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
 bool force_dma_unencrypted(struct device *dev)
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 7bcc92add72c..7b13988796e1 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -104,9 +104,9 @@ static int pfn_is_ram(unsigned long pfn)
 }
 
 /* Reads a page from the oldmem device from given offset. */
-static ssize_t read_from_oldmem(char *buf, size_t count,
-				u64 *ppos, int userbuf,
-				bool encrypted)
+ssize_t read_from_oldmem(char *buf, size_t count,
+			 u64 *ppos, int userbuf,
+			 bool encrypted)
 {
 	unsigned long pfn, offset;
 	size_t nr_bytes;
@@ -170,7 +170,7 @@ void __weak elfcorehdr_free(unsigned long long addr)
  */
 ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
 {
-	return read_from_oldmem(buf, count, ppos, 0, sev_active());
+	return read_from_oldmem(buf, count, ppos, 0, false);
 }
 
 /*
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index f774c5eb9e3c..4664fc1871de 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -115,4 +115,18 @@ static inline int vmcore_add_device_dump(struct vmcoredd_data *data)
 	return -EOPNOTSUPP;
 }
 #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+#ifdef CONFIG_PROC_VMCORE
+ssize_t read_from_oldmem(char *buf, size_t count,
+			 u64 *ppos, int userbuf,
+			 bool encrypted);
+#else
+static inline ssize_t read_from_oldmem(char *buf, size_t count,
+				       u64 *ppos, int userbuf,
+				       bool encrypted)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_PROC_VMCORE */
+
 #endif /* LINUX_CRASHDUMP_H */
diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h
index 0c5b0ff9eb29..5c4a18a91f89 100644
--- a/include/linux/mem_encrypt.h
+++ b/include/linux/mem_encrypt.h
@@ -19,7 +19,6 @@
 #else	/* !CONFIG_ARCH_HAS_MEM_ENCRYPT */
 
 static inline bool mem_encrypt_active(void) { return false; }
-static inline bool sev_active(void) { return false; }
 
 #endif	/* CONFIG_ARCH_HAS_MEM_ENCRYPT */
 

From 5cbdaeefb655072d304744812708b3f3a31c6b51 Mon Sep 17 00:00:00 2001
From: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Date: Tue, 6 Aug 2019 01:49:19 -0300
Subject: [PATCH 017/264] s390/mm: Remove sev_active() function

All references to sev_active() were moved to arch/x86, so we don't need
to define it for s390 anymore.

Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Halil Pasic <pasic@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190806044919.10622-7-bauerman@linux.ibm.com
---
 arch/s390/include/asm/mem_encrypt.h | 1 -
 arch/s390/mm/init.c                 | 7 +------
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h
index ff813a56bc30..2542cbf7e2d1 100644
--- a/arch/s390/include/asm/mem_encrypt.h
+++ b/arch/s390/include/asm/mem_encrypt.h
@@ -5,7 +5,6 @@
 #ifndef __ASSEMBLY__
 
 static inline bool mem_encrypt_active(void) { return false; }
-extern bool sev_active(void);
 
 int set_memory_encrypted(unsigned long addr, int numpages);
 int set_memory_decrypted(unsigned long addr, int numpages);
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 20340a03ad90..a124f19f7b3c 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -156,14 +156,9 @@ int set_memory_decrypted(unsigned long addr, int numpages)
 }
 
 /* are we a protected virtualization guest? */
-bool sev_active(void)
-{
-	return is_prot_virt_guest();
-}
-
 bool force_dma_unencrypted(struct device *dev)
 {
-	return sev_active();
+	return is_prot_virt_guest();
 }
 
 /* protected virtualization */

From b214a8f2ea964b5ff2cc4a1e8bd8c74f64741f20 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 1 Aug 2019 10:32:31 +0200
Subject: [PATCH 018/264] powerpc/xive: Use GFP_KERNEL instead of GFP_ATOMIC in
 'xive_irq_bitmap_add()'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no need to use GFP_ATOMIC here; GFP_KERNEL is sufficient, and
it is already used for another allocation just a few lines below.
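
As an illustrative sketch (not this driver's code), GFP_ATOMIC is only
required where sleeping is forbidden, e.g. under a spinlock:

    #include <linux/slab.h>
    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(lock);

    static void *atomic_ctx_alloc(size_t size)
    {
        void *obj;

        spin_lock(&lock);                   /* sleeping here would deadlock */
        obj = kzalloc(size, GFP_ATOMIC);    /* so the atomic pool is needed */
        spin_unlock(&lock);
        return obj;
    }

xive_irq_bitmap_add() runs during boot-time initialisation, where
sleeping is allowed, so the GFP_KERNEL allocation is safe.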

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: Greg Kurz <groug@kaod.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/85d5d247ce753befd6aa63c473f7823de6520ccd.1564647619.git.christophe.jaillet@wanadoo.fr
---
 arch/powerpc/sysdev/xive/spapr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c
index 8ef9cf4ebb1c..b4f5eb9e0f82 100644
--- a/arch/powerpc/sysdev/xive/spapr.c
+++ b/arch/powerpc/sysdev/xive/spapr.c
@@ -45,7 +45,7 @@ static int xive_irq_bitmap_add(int base, int count)
 {
 	struct xive_irq_bitmap *xibm;
 
-	xibm = kzalloc(sizeof(*xibm), GFP_ATOMIC);
+	xibm = kzalloc(sizeof(*xibm), GFP_KERNEL);
 	if (!xibm)
 		return -ENOMEM;
 

From fd3806562f450a6189c31e0d2cb9cd4b208dcf2d Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 1 Aug 2019 10:32:42 +0200
Subject: [PATCH 019/264] powerpc/xive: Add a check for memory allocation
 failure

The result of this kzalloc is not checked. Add a check and corresponding
error handling code.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Greg Kurz <groug@kaod.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/cc53462734dfeaf15b6bad0e626b483de18656b4.1564647619.git.christophe.jaillet@wanadoo.fr
---
 arch/powerpc/sysdev/xive/spapr.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c
index b4f5eb9e0f82..52198131c75e 100644
--- a/arch/powerpc/sysdev/xive/spapr.c
+++ b/arch/powerpc/sysdev/xive/spapr.c
@@ -53,6 +53,10 @@ static int xive_irq_bitmap_add(int base, int count)
 	xibm->base = base;
 	xibm->count = count;
 	xibm->bitmap = kzalloc(xibm->count, GFP_KERNEL);
+	if (!xibm->bitmap) {
+		kfree(xibm);
+		return -ENOMEM;
+	}
 	list_add(&xibm->list, &xive_irq_bitmaps);
 
 	pr_info("Using IRQ range [%x-%x]", xibm->base,

From 2b87a2553aa04105724d6c844e223415985246ed Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Thu, 16 May 2019 12:04:37 +1000
Subject: [PATCH 020/264] powerpc/64s: Make boot look nice(r)

Radix boot looks like this:

 -----------------------------------------------------
 phys_mem_size     = 0x200000000
 dcache_bsize      = 0x80
 icache_bsize      = 0x80
 cpu_features      = 0x0000c06f8f5fb1a7
   possible        = 0x0000fbffcf5fb1a7
   always          = 0x00000003800081a1
 cpu_user_features = 0xdc0065c2 0xaee00000
 mmu_features      = 0xbc006041
 firmware_features = 0x0000000010000000
 hash-mmu: ppc64_pft_size    = 0x0
 hash-mmu: kernel vmalloc start   = 0xc008000000000000
 hash-mmu: kernel IO start        = 0xc00a000000000000
 hash-mmu: kernel vmemmap start   = 0xc00c000000000000
 -----------------------------------------------------

After the fix:

 -----------------------------------------------------
 phys_mem_size     = 0x200000000
 dcache_bsize      = 0x80
 icache_bsize      = 0x80
 cpu_features      = 0x0000c06f8f5fb1a7
   possible        = 0x0000fbffcf5fb1a7
   always          = 0x00000003800081a1
 cpu_user_features = 0xdc0065c2 0xaee00000
 mmu_features      = 0xbc006041
 firmware_features = 0x0000000010000000
 vmalloc start     = 0xc008000000000000
 IO start          = 0xc00a000000000000
 vmemmap start     = 0xc00c000000000000
 -----------------------------------------------------

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190516020437.11783-1-npiggin@gmail.com
---
 arch/powerpc/kernel/setup-common.c    | 8 +++++++-
 arch/powerpc/mm/book3s64/hash_utils.c | 3 ---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 1f8db666468d..7b4c921ec73f 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -806,9 +806,15 @@ static __init void print_system_info(void)
 	pr_info("mmu_features      = 0x%08x\n", cur_cpu_spec->mmu_features);
 #ifdef CONFIG_PPC64
 	pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features);
+#ifdef CONFIG_PPC_BOOK3S
+	pr_info("vmalloc start     = 0x%lx\n", KERN_VIRT_START);
+	pr_info("IO start          = 0x%lx\n", KERN_IO_START);
+	pr_info("vmemmap start     = 0x%lx\n", (unsigned long)vmemmap);
+#endif
 #endif
 
-	print_system_hash_info();
+	if (!early_radix_enabled())
+		print_system_hash_info();
 
 	if (PHYSICAL_START > 0)
 		pr_info("physical_start    = 0x%llx\n",
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index b8ad14bb1170..e6d471058597 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1957,7 +1957,4 @@ void __init print_system_hash_info(void)
 
 	if (htab_hash_mask)
 		pr_info("htab_hash_mask    = 0x%lx\n", htab_hash_mask);
-	pr_info("kernel vmalloc start   = 0x%lx\n", KERN_VIRT_START);
-	pr_info("kernel IO start        = 0x%lx\n", KERN_IO_START);
-	pr_info("kernel vmemmap start   = 0x%lx\n", (unsigned long)vmemmap);
 }

From c784be435d5dae28d3b03db31753dd7a18733f0c Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
Date: Wed, 15 May 2019 13:15:52 +0530
Subject: [PATCH 021/264] powerpc/pseries: Fix cpu_hotplug_lock acquisition in
 resize_hpt()

The calls to arch_add_memory()/arch_remove_memory() are always made
with the read-side cpu_hotplug_lock acquired via memory_hotplug_begin().
On pSeries, arch_add_memory()/arch_remove_memory() eventually call
resize_hpt() which in turn calls stop_machine() which acquires the
read-side cpu_hotplug_lock again, thereby resulting in the recursive
acquisition of this lock.

In the absence of CONFIG_PROVE_LOCKING, we hadn't observed a system
lockup during a memory hotplug operation because cpus_read_lock() is a
per-cpu rwsem read, which, in the fast-path (in the absence of the
writer, which in our case is a CPU-hotplug operation) simply
increments the read_count on the semaphore. Thus a recursive read in
the fast-path doesn't cause any problems.

However, we can hit this problem in practice if there is a concurrent
CPU-Hotplug operation in progress which is waiting to acquire the
write-side of the lock. This will cause the second recursive read to
block until the writer finishes; meanwhile, the writer is blocked since
the first read holds the lock. Thus both the reader and the writer fail
to make any progress, thereby blocking both CPU-Hotplug and
Memory-Hotplug operations.

Memory-Hotplug				CPU-Hotplug
CPU 0					CPU 1
------                                  ------

1. down_read(cpu_hotplug_lock.rw_sem)
   [memory_hotplug_begin]
					2. down_write(cpu_hotplug_lock.rw_sem)
					[cpu_up/cpu_down]
3. down_read(cpu_hotplug_lock.rw_sem)
   [stop_machine()]

Lockdep complains as follows in these code-paths.

 swapper/0/1 is trying to acquire lock:
 (____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: stop_machine+0x2c/0x60

but task is already holding lock:
(____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: mem_hotplug_begin+0x20/0x50

 other info that might help us debug this:
  Possible unsafe locking scenario:

        CPU0
        ----
   lock(cpu_hotplug_lock.rw_sem);
   lock(cpu_hotplug_lock.rw_sem);

  *** DEADLOCK ***

  May be due to missing lock nesting notation

 3 locks held by swapper/0/1:
  #0: (____ptrval____) (&dev->mutex){....}, at: __driver_attach+0x12c/0x1b0
  #1: (____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: mem_hotplug_begin+0x20/0x50
  #2: (____ptrval____) (mem_hotplug_lock.rw_sem){++++}, at: percpu_down_write+0x54/0x1a0

stack backtrace:
 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.0.0-rc5-58373-gbc99402235f3-dirty #166
 Call Trace:
   dump_stack+0xe8/0x164 (unreliable)
   __lock_acquire+0x1110/0x1c70
   lock_acquire+0x240/0x290
   cpus_read_lock+0x64/0xf0
   stop_machine+0x2c/0x60
   pseries_lpar_resize_hpt+0x19c/0x2c0
   resize_hpt_for_hotplug+0x70/0xd0
   arch_add_memory+0x58/0xfc
   devm_memremap_pages+0x5e8/0x8f0
   pmem_attach_disk+0x764/0x830
   nvdimm_bus_probe+0x118/0x240
   really_probe+0x230/0x4b0
   driver_probe_device+0x16c/0x1e0
   __driver_attach+0x148/0x1b0
   bus_for_each_dev+0x90/0x130
   driver_attach+0x34/0x50
   bus_add_driver+0x1a8/0x360
   driver_register+0x108/0x170
   __nd_driver_register+0xd0/0xf0
   nd_pmem_driver_init+0x34/0x48
   do_one_initcall+0x1e0/0x45c
   kernel_init_freeable+0x540/0x64c
   kernel_init+0x2c/0x160
   ret_from_kernel_thread+0x5c/0x68

Fix this issue by
  1) Requiring that all calls to pseries_lpar_resize_hpt() be made
     with cpu_hotplug_lock held.

  2) Invoking stop_machine_cpuslocked() in pseries_lpar_resize_hpt(),
     as a consequence of 1).

  3) To satisfy 1), in hpt_order_set(), call mmu_hash_ops.resize_hpt()
     with cpu_hotplug_lock held.

Fixes: dbcf929c0062 ("powerpc/pseries: Add support for hash table resizing")
Cc: stable@vger.kernel.org # v4.11+
Reported-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/1557906352-29048-1-git-send-email-ego@linux.vnet.ibm.com
---
 arch/powerpc/mm/book3s64/hash_utils.c | 9 ++++++++-
 arch/powerpc/platforms/pseries/lpar.c | 8 ++++++--
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index e6d471058597..c363e850550e 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -34,6 +34,7 @@
 #include <linux/libfdt.h>
 #include <linux/pkeys.h>
 #include <linux/hugetlb.h>
+#include <linux/cpu.h>
 
 #include <asm/debugfs.h>
 #include <asm/processor.h>
@@ -1931,10 +1932,16 @@ static int hpt_order_get(void *data, u64 *val)
 
 static int hpt_order_set(void *data, u64 val)
 {
+	int ret;
+
 	if (!mmu_hash_ops.resize_hpt)
 		return -ENODEV;
 
-	return mmu_hash_ops.resize_hpt(val);
+	cpus_read_lock();
+	ret = mmu_hash_ops.resize_hpt(val);
+	cpus_read_unlock();
+
+	return ret;
 }
 
 DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 09bb878c21e0..4f76e5f30c97 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -1413,7 +1413,10 @@ static int pseries_lpar_resize_hpt_commit(void *data)
 	return 0;
 }
 
-/* Must be called in user context */
+/*
+ * Must be called in process context. The caller must hold the
+ * cpus_lock.
+ */
 static int pseries_lpar_resize_hpt(unsigned long shift)
 {
 	struct hpt_resize_state state = {
@@ -1467,7 +1470,8 @@ static int pseries_lpar_resize_hpt(unsigned long shift)
 
 	t1 = ktime_get();
 
-	rc = stop_machine(pseries_lpar_resize_hpt_commit, &state, NULL);
+	rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit,
+				     &state, NULL);
 
 	t2 = ktime_get();
 

From 56090a3902c80c296e822d11acdb6a101b322c52 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Thu, 18 Jul 2019 15:11:36 +1000
Subject: [PATCH 022/264] powerpc/powernv/ioda: Fix race in TCE level
 allocation

pnv_tce() returns a pointer to a TCE entry and originally a TCE table
would be pre-allocated. For the default case of a 2GB window the table
needs only a single level and that is fine. However, if more levels are
requested, it is possible to get a race when two threads want a pointer
to a TCE entry from the same page of TCEs.

This adds cmpxchg to handle the race. Note that once a TCE is non-zero,
it cannot become zero again.
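
To illustrate the pattern (a minimal userspace sketch using GCC atomic
builtins; the kernel code below uses cmpxchg() on big-endian TCE values
instead, and the names here are hypothetical):

  #include <stdlib.h>

  static void *slot; /* shared level pointer, initially NULL */

  static void *get_or_install_level(size_t size)
  {
      void *expected = NULL;
      void *mine = calloc(1, size);

      if (!mine)
          return NULL;
      /* Publish 'mine' only if nobody installed a level before us */
      if (!__atomic_compare_exchange_n(&slot, &expected, mine, 0,
                                       __ATOMIC_ACQ_REL,
                                       __ATOMIC_ACQUIRE)) {
          free(mine);      /* lost the race: discard our copy */
          return expected; /* use the pointer the winner installed */
      }
      return mine;
  }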

Fixes: a68bd1267b72 ("powerpc/powernv/ioda: Allocate indirect TCE levels on demand")
CC: stable@vger.kernel.org # v4.19+
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190718051139.74787-2-aik@ozlabs.ru
---
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index e28f03e1eb5e..8d6569590161 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -48,6 +48,9 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
 	return addr;
 }
 
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+		unsigned long size, unsigned int levels);
+
 static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
 {
 	__be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base;
@@ -57,9 +60,9 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
 
 	while (level) {
 		int n = (idx & mask) >> (level * shift);
-		unsigned long tce;
+		unsigned long oldtce, tce = be64_to_cpu(READ_ONCE(tmp[n]));
 
-		if (tmp[n] == 0) {
+		if (!tce) {
 			__be64 *tmp2;
 
 			if (!alloc)
@@ -70,10 +73,15 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
 			if (!tmp2)
 				return NULL;
 
-			tmp[n] = cpu_to_be64(__pa(tmp2) |
-					TCE_PCI_READ | TCE_PCI_WRITE);
+			tce = __pa(tmp2) | TCE_PCI_READ | TCE_PCI_WRITE;
+			oldtce = be64_to_cpu(cmpxchg(&tmp[n], 0,
+					cpu_to_be64(tce)));
+			if (oldtce) {
+				pnv_pci_ioda2_table_do_free_pages(tmp2,
+					ilog2(tbl->it_level_size) + 3, 1);
+				tce = oldtce;
+			}
 		}
-		tce = be64_to_cpu(tmp[n]);
 
 		tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
 		idx &= ~mask;

From 4f7e0babbc7c46fc0db4f5c14fe96bf6f4d69502 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Thu, 18 Jul 2019 15:11:37 +1000
Subject: [PATCH 023/264] powerpc/iommu: Allow bypass-only for DMA

POWER8 and newer support a bypass mode which maps all host memory to
PCI buses, so an IOMMU table is not always required. However, if we fail
to create such a table, the DMA setup fails and the kernel does not boot.

This skips the 32bit DMA setup check if bypass is selected.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190718051139.74787-3-aik@ozlabs.ru
---
 arch/powerpc/kernel/dma-iommu.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index a0879674a9c8..c963d704fa31 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -122,18 +122,17 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask)
 {
 	struct iommu_table *tbl = get_iommu_table_base(dev);
 
-	if (!tbl) {
-		dev_info(dev, "Warning: IOMMU dma not supported: mask 0x%08llx"
-			", table unavailable\n", mask);
-		return 0;
-	}
-
 	if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) {
 		dev->archdata.iommu_bypass = true;
 		dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
 		return 1;
 	}
 
+	if (!tbl) {
+		dev_err(dev, "Warning: IOMMU dma not supported: mask 0x%08llx, table unavailable\n", mask);
+		return 0;
+	}
+
 	if (tbl->it_offset > (mask >> tbl->it_page_shift)) {
 		dev_info(dev, "Warning: IOMMU offset too big for device mask\n");
 		dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n",

From c37c792dec0929dbb6360a609fb00fa20bb16fc2 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Thu, 18 Jul 2019 15:11:38 +1000
Subject: [PATCH 024/264] powerpc/powernv/ioda2: Allocate TCE table levels on
 demand for default DMA window

We already allocate only the first level of multilevel TCE tables for
KVM (alloc_userspace_copy==true), and the rest is allocated on demand.
This is not enabled for bare metal, though.

This removes the KVM limitation (implicit, via the alloc_userspace_copy
parameter) and always allocates just the first level. The on-demand
allocation of missing levels is already implemented.

Since from now on a DMA map might happen with interrupts disabled, this
allocates TCEs with GFP_ATOMIC; otherwise lockdep reports errors [1].
In practice just a single page is allocated there, so the chances of
failure are quite low.

To save time when creating a new clean table, this skips non-allocated
indirect TCE entries in pnv_tce_free just like we already do in
the VFIO IOMMU TCE driver.
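
For illustration (hypothetical numbers, not from the patch): with
it_level_size = 512, when pnv_tce() reports no page backing index i,
'i |= 511' moves i to the last index within that level's span, so the
loop's i++ lands on the first index of the next level and the remaining
indexes of the unallocated level are skipped in one step.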

This changes the default number of levels from 1 to 2 to reduce the
amount of memory required for the default 32bit DMA window at boot time.
The default window size is up to 2GB, which requires 4MB of TCEs; this
is unlikely to be used entirely, or at all, as most devices these days
are 64bit capable, so by switching to 2 levels by default we save 4032KB
of RAM per device.
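
To spell out the arithmetic (assuming 4K IOMMU pages and a 64K system
page size): a 2GB window needs 2GB / 4K = 512K TCEs, i.e.
512K * 8 bytes = 4096KB of table. With 2 levels, only the first level
(a single 64K allocation) is created up front, hence the
4096KB - 64KB = 4032KB saving.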

While at it, add __GFP_NOWARN to alloc_pages_node(), as userspace can
trigger this path via VFIO, see the failure, and try creating a table
again with different parameters which might succeed.

[1]:
===
BUG: sleeping function called from invalid context at mm/page_alloc.c:4596
in_atomic(): 1, irqs_disabled(): 1, pid: 1038, name: scsi_eh_1
2 locks held by scsi_eh_1/1038:
 #0: 000000005efd659a (&host->eh_mutex){+.+.}, at: ata_eh_acquire+0x34/0x80
 #1: 0000000006cf56a6 (&(&host->lock)->rlock){....}, at: ata_exec_internal_sg+0xb0/0x5c0
irq event stamp: 500
hardirqs last  enabled at (499): [<c000000000cb8a74>] _raw_spin_unlock_irqrestore+0x94/0xd0
hardirqs last disabled at (500): [<c000000000cb85c4>] _raw_spin_lock_irqsave+0x44/0x120
softirqs last  enabled at (0): [<c000000000101120>] copy_process.isra.4.part.5+0x640/0x1a80
softirqs last disabled at (0): [<0000000000000000>] 0x0
CPU: 73 PID: 1038 Comm: scsi_eh_1 Not tainted 5.2.0-rc6-le_nv2_aikATfstn1-p1 #634
Call Trace:
[c000003d064cef50] [c000000000c8e6c4] dump_stack+0xe8/0x164 (unreliable)
[c000003d064cefa0] [c00000000014ed78] ___might_sleep+0x2f8/0x310
[c000003d064cf020] [c0000000003ca084] __alloc_pages_nodemask+0x2a4/0x1560
[c000003d064cf220] [c0000000000c2530] pnv_alloc_tce_level.isra.0+0x90/0x130
[c000003d064cf290] [c0000000000c2888] pnv_tce+0x128/0x3b0
[c000003d064cf360] [c0000000000c2c00] pnv_tce_build+0xb0/0xf0
[c000003d064cf3c0] [c0000000000bbd9c] pnv_ioda2_tce_build+0x3c/0xb0
[c000003d064cf400] [c00000000004cfe0] ppc_iommu_map_sg+0x210/0x550
[c000003d064cf510] [c00000000004b7a4] dma_iommu_map_sg+0x74/0xb0
[c000003d064cf530] [c000000000863944] ata_qc_issue+0x134/0x470
[c000003d064cf5b0] [c000000000863ec4] ata_exec_internal_sg+0x244/0x5c0
[c000003d064cf700] [c0000000008642d0] ata_exec_internal+0x90/0xe0
[c000003d064cf780] [c0000000008650ac] ata_dev_read_id+0x2ec/0x640
[c000003d064cf8d0] [c000000000878e28] ata_eh_recover+0x948/0x16d0
[c000003d064cfa10] [c00000000087d760] sata_pmp_error_handler+0x480/0xbf0
[c000003d064cfbc0] [c000000000884624] ahci_error_handler+0x74/0xe0
[c000003d064cfbf0] [c000000000879fa8] ata_scsi_port_error_handler+0x2d8/0x7c0
[c000003d064cfca0] [c00000000087a544] ata_scsi_error+0xb4/0x100
[c000003d064cfd00] [c000000000802450] scsi_error_handler+0x120/0x510
[c000003d064cfdb0] [c000000000140c48] kthread+0x1b8/0x1c0
[c000003d064cfe20] [c00000000000bd8c] ret_from_kernel_thread+0x5c/0x70
ata1: SATA link up 6.0 Gbps (SStatus 133 SControl 300)
irq event stamp: 2305

========================================================
hardirqs last  enabled at (2305): [<c00000000000e4c8>] fast_exc_return_irq+0x28/0x34
hardirqs last disabled at (2303): [<c000000000cb9fd0>] __do_softirq+0x4a0/0x654
WARNING: possible irq lock inversion dependency detected
5.2.0-rc6-le_nv2_aikATfstn1-p1 #634 Tainted: G        W
softirqs last  enabled at (2304): [<c000000000cba054>] __do_softirq+0x524/0x654
softirqs last disabled at (2297): [<c00000000010f278>] irq_exit+0x128/0x180
--------------------------------------------------------
swapper/0/0 just changed the state of lock:
0000000006cf56a6 (&(&host->lock)->rlock){-...}, at: ahci_single_level_irq_intr+0xac/0x120
but this lock took another, HARDIRQ-unsafe lock in the past:
 (fs_reclaim){+.+.}

and interrupts could create inverse lock ordering between them.

other info that might help us debug this:
 Possible interrupt unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(fs_reclaim);
                               local_irq_disable();
                               lock(&(&host->lock)->rlock);
                               lock(fs_reclaim);
  <Interrupt>
    lock(&(&host->lock)->rlock);

 *** DEADLOCK ***

no locks held by swapper/0/0.

the shortest dependencies between 2nd lock and 1st lock:
 -> (fs_reclaim){+.+.} ops: 167579 {
    HARDIRQ-ON-W at:
                      lock_acquire+0xf8/0x2a0
                      fs_reclaim_acquire.part.23+0x44/0x60
                      kmem_cache_alloc_node_trace+0x80/0x590
                      alloc_desc+0x64/0x270
                      __irq_alloc_descs+0x2e4/0x3a0
                      irq_domain_alloc_descs+0xb0/0x150
                      irq_create_mapping+0x168/0x2c0
                      xics_smp_probe+0x2c/0x98
                      pnv_smp_probe+0x40/0x9c
                      smp_prepare_cpus+0x524/0x6c4
                      kernel_init_freeable+0x1b4/0x650
                      kernel_init+0x2c/0x148
                      ret_from_kernel_thread+0x5c/0x70
    SOFTIRQ-ON-W at:
                      lock_acquire+0xf8/0x2a0
                      fs_reclaim_acquire.part.23+0x44/0x60
                      kmem_cache_alloc_node_trace+0x80/0x590
                      alloc_desc+0x64/0x270
                      __irq_alloc_descs+0x2e4/0x3a0
                      irq_domain_alloc_descs+0xb0/0x150
                      irq_create_mapping+0x168/0x2c0
                      xics_smp_probe+0x2c/0x98
                      pnv_smp_probe+0x40/0x9c
                      smp_prepare_cpus+0x524/0x6c4
                      kernel_init_freeable+0x1b4/0x650
                      kernel_init+0x2c/0x148
                      ret_from_kernel_thread+0x5c/0x70
    INITIAL USE at:
                     lock_acquire+0xf8/0x2a0
                     fs_reclaim_acquire.part.23+0x44/0x60
                     kmem_cache_alloc_node_trace+0x80/0x590
                     alloc_desc+0x64/0x270
                     __irq_alloc_descs+0x2e4/0x3a0
                     irq_domain_alloc_descs+0xb0/0x150
                     irq_create_mapping+0x168/0x2c0
                     xics_smp_probe+0x2c/0x98
                     pnv_smp_probe+0x40/0x9c
                     smp_prepare_cpus+0x524/0x6c4
                     kernel_init_freeable+0x1b4/0x650
                     kernel_init+0x2c/0x148
                     ret_from_kernel_thread+0x5c/0x70
  }
===

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190718051139.74787-4-aik@ozlabs.ru
---
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 20 +++++++++----------
 arch/powerpc/platforms/powernv/pci.h          |  2 +-
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index 8d6569590161..a0b9c0c23ed2 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -36,7 +36,8 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
 	struct page *tce_mem = NULL;
 	__be64 *addr;
 
-	tce_mem = alloc_pages_node(nid, GFP_KERNEL, shift - PAGE_SHIFT);
+	tce_mem = alloc_pages_node(nid, GFP_ATOMIC | __GFP_NOWARN,
+			shift - PAGE_SHIFT);
 	if (!tce_mem) {
 		pr_err("Failed to allocate a TCE memory, level shift=%d\n",
 				shift);
@@ -169,6 +170,9 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 
 		if (ptce)
 			*ptce = cpu_to_be64(0);
+		else
+			/* Skip the rest of the level */
+			i |= tbl->it_level_size - 1;
 	}
 }
 
@@ -268,7 +272,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	unsigned int table_shift = max_t(unsigned int, entries_shift + 3,
 			PAGE_SHIFT);
 	const unsigned long tce_table_size = 1UL << table_shift;
-	unsigned int tmplevels = levels;
 
 	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
 		return -EINVAL;
@@ -276,9 +279,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	if (!is_power_of_2(window_size))
 		return -EINVAL;
 
-	if (alloc_userspace_copy && (window_size > (1ULL << 32)))
-		tmplevels = 1;
-
 	/* Adjust direct table size from window_size and levels */
 	entries_shift = (entries_shift + levels - 1) / levels;
 	level_shift = entries_shift + 3;
@@ -289,7 +289,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 
 	/* Allocate TCE table */
 	addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
-			tmplevels, tce_table_size, &offset, &total_allocated);
+			1, tce_table_size, &offset, &total_allocated);
 
 	/* addr==NULL means that the first level allocation failed */
 	if (!addr)
@@ -300,18 +300,18 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	 * we did not allocate as much as we wanted,
 	 * release partially allocated table.
 	 */
-	if (tmplevels == levels && offset < tce_table_size)
+	if (levels == 1 && offset < tce_table_size)
 		goto free_tces_exit;
 
 	/* Allocate userspace view of the TCE table */
 	if (alloc_userspace_copy) {
 		offset = 0;
 		uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
-				tmplevels, tce_table_size, &offset,
+				1, tce_table_size, &offset,
 				&total_allocated_uas);
 		if (!uas)
 			goto free_tces_exit;
-		if (tmplevels == levels && (offset < tce_table_size ||
+		if (levels == 1 && (offset < tce_table_size ||
 				total_allocated_uas != total_allocated))
 			goto free_uas_exit;
 	}
@@ -326,7 +326,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 
 	pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n",
 			window_size, tce_table_size, bus_offset, tbl->it_base,
-			tbl->it_userspace, tmplevels, levels);
+			tbl->it_userspace, 1, levels);
 
 	return 0;
 
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 469c24463247..f914f0b14e4e 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -219,7 +219,7 @@ extern struct iommu_table_group *pnv_npu_compound_attach(
 		struct pnv_ioda_pe *pe);
 
 /* pci-ioda-tce.c */
-#define POWERNV_IOMMU_DEFAULT_LEVELS	1
+#define POWERNV_IOMMU_DEFAULT_LEVELS	2
 #define POWERNV_IOMMU_MAX_LEVELS	5
 
 extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,

From 201ed7f327a17577debec52c33786d4b3259d0dc Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Thu, 18 Jul 2019 15:11:39 +1000
Subject: [PATCH 025/264] powerpc/powernv/ioda2: Create bigger default window
 with 64k IOMMU pages

At the moment we create a small window only for 32bit devices; the
window maps only 0..2GB of the PCI space. For other devices we use
either a sketchy bypass or the hardware bypass, but the former can only
work if the amount of RAM is no bigger than the device's DMA mask and
the latter requires devices to support at least 59bit DMA.

This extends the default DMA window to the maximum size possible to
allow a wider DMA mask than just 32bit. The default window size is now
limited by the iommu_table::it_map allocation bitmap, which is a
contiguous array with 1 bit per IOMMU page.

This increases the default IOMMU page size from the hard-coded 4K to
the system page size to allow wider DMA masks.

This increases the number of levels so that no single TCE level exceeds
the max order allocation limit. At the same time, this keeps the minimum
number of levels at 2 in order to save memory.

As the extended window now overlaps the 32bit MMIO region, this adds
an area reservation to iommu_init_table().

After this change the default window size is 0x80000000000 == 1<<43, so
devices limited to a DMA mask smaller than the amount of system RAM can
still use more than just 2GB of memory for DMA.

This is an optimization and not a bug fix for DMA API usage.

With the on-demand allocation of indirect TCE table levels enabled and
2 levels, the first TCE level size is just
1<<ceil((log2(0x7ffffffffff+1)-16)/2)=16384 TCEs or 2 system pages.
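
To spell out that arithmetic (assuming 64K system pages and
MAX_ORDER = 9, the usual value for 64K-page powerpc configs):
maxblock = 1 << (16 + 9 - 1) = 16MB, so it_map can cover
16MB * 8 = 2^27 IOMMU pages; at 64K per IOMMU page that gives the
2^(27+16) = 2^43 = 0x80000000000 window. With 2 levels the first level
then holds 1 << ceil((43 - 16) / 2) = 1 << 14 = 16384 TCEs, i.e.
16384 * 8 bytes = 128KB = 2 system pages.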

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190718051139.74787-5-aik@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h          |  7 ++-
 arch/powerpc/kernel/iommu.c               | 74 ++++++++++++++++-------
 arch/powerpc/platforms/cell/iommu.c       |  2 +-
 arch/powerpc/platforms/pasemi/iommu.c     |  2 +-
 arch/powerpc/platforms/powernv/pci-ioda.c | 42 ++++++++++---
 arch/powerpc/platforms/pseries/iommu.c    |  8 +--
 arch/powerpc/platforms/pseries/vio.c      |  2 +-
 arch/powerpc/sysdev/dart_iommu.c          |  2 +-
 8 files changed, 100 insertions(+), 39 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 18d342b815e4..d7bf1f104c15 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -111,6 +111,8 @@ struct iommu_table {
 	struct iommu_table_ops *it_ops;
 	struct kref    it_kref;
 	int it_nid;
+	unsigned long it_reserved_start; /* Start of not-DMA-able (MMIO) area */
+	unsigned long it_reserved_end;
 };
 
 #define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \
@@ -149,8 +151,9 @@ extern int iommu_tce_table_put(struct iommu_table *tbl);
 /* Initializes an iommu_table based in values set in the passed-in
  * structure
  */
-extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
-					    int nid);
+extern struct iommu_table *iommu_init_table(struct iommu_table *tbl,
+		int nid, unsigned long res_start, unsigned long res_end);
+
 #define IOMMU_TABLE_GROUP_MAX_TABLES	2
 
 struct iommu_table_group;
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 0a67ce9f827e..e7a2b160d4c6 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -633,11 +633,54 @@ static void iommu_table_clear(struct iommu_table *tbl)
 #endif
 }
 
+static void iommu_table_reserve_pages(struct iommu_table *tbl,
+		unsigned long res_start, unsigned long res_end)
+{
+	int i;
+
+	WARN_ON_ONCE(res_end < res_start);
+	/*
+	 * Reserve page 0 so it will not be used for any mappings.
+	 * This avoids buggy drivers that consider page 0 to be invalid
+	 * to crash the machine or even lose data.
+	 */
+	if (tbl->it_offset == 0)
+		set_bit(0, tbl->it_map);
+
+	tbl->it_reserved_start = res_start;
+	tbl->it_reserved_end = res_end;
+
+	/* Check if res_start..res_end isn't empty and overlaps the table */
+	if (res_start && res_end &&
+			(tbl->it_offset + tbl->it_size < res_start ||
+			 res_end < tbl->it_offset))
+		return;
+
+	for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
+		set_bit(i - tbl->it_offset, tbl->it_map);
+}
+
+static void iommu_table_release_pages(struct iommu_table *tbl)
+{
+	int i;
+
+	/*
+	 * In case we have reserved the first bit, we should not emit
+	 * the warning below.
+	 */
+	if (tbl->it_offset == 0)
+		clear_bit(0, tbl->it_map);
+
+	for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
+		clear_bit(i - tbl->it_offset, tbl->it_map);
+}
+
 /*
  * Build a iommu_table structure.  This contains a bit map which
  * is used to manage allocation of the tce space.
  */
-struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
+struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
+		unsigned long res_start, unsigned long res_end)
 {
 	unsigned long sz;
 	static int welcomed = 0;
@@ -656,13 +699,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	tbl->it_map = page_address(page);
 	memset(tbl->it_map, 0, sz);
 
-	/*
-	 * Reserve page 0 so it will not be used for any mappings.
-	 * This avoids buggy drivers that consider page 0 to be invalid
-	 * to crash the machine or even lose data.
-	 */
-	if (tbl->it_offset == 0)
-		set_bit(0, tbl->it_map);
+	iommu_table_reserve_pages(tbl, res_start, res_end);
 
 	/* We only split the IOMMU table if we have 1GB or more of space */
 	if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024))
@@ -714,12 +751,7 @@ static void iommu_table_free(struct kref *kref)
 		return;
 	}
 
-	/*
-	 * In case we have reserved the first bit, we should not emit
-	 * the warning below.
-	 */
-	if (tbl->it_offset == 0)
-		clear_bit(0, tbl->it_map);
+	iommu_table_release_pages(tbl);
 
 	/* verify that table contains no entries */
 	if (!bitmap_empty(tbl->it_map, tbl->it_size))
@@ -1024,15 +1056,14 @@ int iommu_take_ownership(struct iommu_table *tbl)
 	for (i = 0; i < tbl->nr_pools; i++)
 		spin_lock(&tbl->pools[i].lock);
 
-	if (tbl->it_offset == 0)
-		clear_bit(0, tbl->it_map);
+	iommu_table_release_pages(tbl);
 
 	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
 		pr_err("iommu_tce: it_map is not empty");
 		ret = -EBUSY;
-		/* Restore bit#0 set by iommu_init_table() */
-		if (tbl->it_offset == 0)
-			set_bit(0, tbl->it_map);
+		/* Undo iommu_table_release_pages, i.e. restore bit#0, etc */
+		iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
+				tbl->it_reserved_end);
 	} else {
 		memset(tbl->it_map, 0xff, sz);
 	}
@@ -1055,9 +1086,8 @@ void iommu_release_ownership(struct iommu_table *tbl)
 
 	memset(tbl->it_map, 0, sz);
 
-	/* Restore bit#0 set by iommu_init_table() */
-	if (tbl->it_offset == 0)
-		set_bit(0, tbl->it_map);
+	iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
+			tbl->it_reserved_end);
 
 	for (i = 0; i < tbl->nr_pools; i++)
 		spin_unlock(&tbl->pools[i].lock);
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index 16dfee29aa41..ca9ffc1c8685 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -486,7 +486,7 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
 	window->table.it_size = size >> window->table.it_page_shift;
 	window->table.it_ops = &cell_iommu_ops;
 
-	iommu_init_table(&window->table, iommu->nid);
+	iommu_init_table(&window->table, iommu->nid, 0, 0);
 
 	pr_debug("\tioid      %d\n", window->ioid);
 	pr_debug("\tblocksize %ld\n", window->table.it_blocksize);
diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
index 77fee09104f8..b500a6e47e6b 100644
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -146,7 +146,7 @@ static void iommu_table_iobmap_setup(void)
 	 */
 	iommu_table_iobmap.it_blocksize = 4;
 	iommu_table_iobmap.it_ops = &iommu_table_iobmap_ops;
-	iommu_init_table(&iommu_table_iobmap, 0);
+	iommu_init_table(&iommu_table_iobmap, 0, 0, 0);
 	pr_debug(" <- %s\n", __func__);
 }
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index d8080558d020..ec48ea25a674 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2303,7 +2303,7 @@ found:
 	tbl->it_ops = &pnv_ioda1_iommu_ops;
 	pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
 	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
-	iommu_init_table(tbl, phb->hose->node);
+	iommu_init_table(tbl, phb->hose->node, 0, 0);
 
 	if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
@@ -2420,6 +2420,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 {
 	struct iommu_table *tbl = NULL;
 	long rc;
+	unsigned long res_start, res_end;
 
 	/*
 	 * crashkernel= specifies the kdump kernel's maximum memory at
@@ -2433,19 +2434,46 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 	 * DMA window can be larger than available memory, which will
 	 * cause errors later.
 	 */
-	const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory);
+	const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1);
 
-	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
-			IOMMU_PAGE_SHIFT_4K,
-			window_size,
-			POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl);
+	/*
+	 * We create the default window as big as we can. The constraint is
+	 * the max order of allocation possible. The TCE table is likely to
+	 * end up being multilevel and with on-demand allocation in place,
+	 * the initial use is not going to be huge as the default window aims
+	 * to support crippled devices (i.e. not fully 64bit DMAble) only.
+	 */
+	/* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */
+	const u64 window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory);
+	/* Each TCE level cannot exceed maxblock so go multilevel if needed */
+	unsigned long tces_order = ilog2(window_size >> PAGE_SHIFT);
+	unsigned long tcelevel_order = ilog2(maxblock >> 3);
+	unsigned int levels = tces_order / tcelevel_order;
+
+	if (tces_order % tcelevel_order)
+		levels += 1;
+	/*
+	 * We try to stick to default levels (which is >1 at the moment) in
+	 * order to save memory by relying on on-demand TCE level allocation.
+	 */
+	levels = max_t(unsigned int, levels, POWERNV_IOMMU_DEFAULT_LEVELS);
+
+	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, PAGE_SHIFT,
+			window_size, levels, false, &tbl);
 	if (rc) {
 		pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
 				rc);
 		return rc;
 	}
 
-	iommu_init_table(tbl, pe->phb->hose->node);
+	/* We use top part of 32bit space for MMIO so exclude it from DMA */
+	res_start = 0;
+	res_end = 0;
+	if (window_size > pe->phb->ioda.m32_pci_base) {
+		res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
+		res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
+	}
+	iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end);
 
 	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
 	if (rc) {
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 889dc2e44b89..42fb03253334 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -609,7 +609,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 
 	iommu_table_setparms(pci->phb, dn, tbl);
 	tbl->it_ops = &iommu_table_pseries_ops;
-	iommu_init_table(tbl, pci->phb->node);
+	iommu_init_table(tbl, pci->phb->node, 0, 0);
 
 	/* Divide the rest (1.75GB) among the children */
 	pci->phb->dma_window_size = 0x80000000ul;
@@ -690,7 +690,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 		iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
 				ppci->table_group, dma_window);
 		tbl->it_ops = &iommu_table_lpar_multi_ops;
-		iommu_init_table(tbl, ppci->phb->node);
+		iommu_init_table(tbl, ppci->phb->node, 0, 0);
 		iommu_register_group(ppci->table_group,
 				pci_domain_nr(bus), 0);
 		pr_debug("  created table: %p\n", ppci->table_group);
@@ -719,7 +719,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		tbl = PCI_DN(dn)->table_group->tables[0];
 		iommu_table_setparms(phb, dn, tbl);
 		tbl->it_ops = &iommu_table_pseries_ops;
-		iommu_init_table(tbl, phb->node);
+		iommu_init_table(tbl, phb->node, 0, 0);
 		set_iommu_table_base(&dev->dev, tbl);
 		return;
 	}
@@ -1169,7 +1169,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 		iommu_table_setparms_lpar(pci->phb, pdn, tbl,
 				pci->table_group, dma_window);
 		tbl->it_ops = &iommu_table_lpar_multi_ops;
-		iommu_init_table(tbl, pci->phb->node);
+		iommu_init_table(tbl, pci->phb->node, 0, 0);
 		iommu_register_group(pci->table_group,
 				pci_domain_nr(pci->phb->bus), 0);
 		pr_debug("  created table: %p\n", pci->table_group);
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
index 6601b9d404dc..115934f83935 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -1191,7 +1191,7 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
 	else
 		tbl->it_ops = &iommu_table_pseries_ops;
 
-	return iommu_init_table(tbl, -1);
+	return iommu_init_table(tbl, -1, 0, 0);
 }
 
 /**
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index 21a1fae0714e..6b4a34b36d98 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -344,7 +344,7 @@ static void iommu_table_dart_setup(void)
 	iommu_table_dart.it_index = 0;
 	iommu_table_dart.it_blocksize = 1;
 	iommu_table_dart.it_ops = &iommu_dart_ops;
-	iommu_init_table(&iommu_table_dart, -1);
+	iommu_init_table(&iommu_table_dart, -1, 0, 0);
 
 	/* Reserve the last page of the DART to avoid possible prefetch
 	 * past the DART mapped area

From c3e0dbd7f780a58c4695f1cd8fc8afde80376737 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@kaod.org>
Date: Wed, 14 Aug 2019 17:47:52 +0200
Subject: [PATCH 026/264] powerpc/xmon: Check for HV mode when dumping XIVE
 info from OPAL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, the xmon 'dx' command calls OPAL to dump the XIVE state in
the OPAL logs and also outputs some of the fields of the internal XIVE
structures in Linux. The OPAL calls can only be done on baremetal
(PowerNV) and they crash a pseries machine. Fix this by checking the
hypervisor feature of the CPU.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190814154754.23682-2-clg@kaod.org
---
 arch/powerpc/xmon/xmon.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 14e56c25879f..25d4adccf750 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2534,13 +2534,16 @@ static void dump_pacas(void)
 static void dump_one_xive(int cpu)
 {
 	unsigned int hwid = get_hard_smp_processor_id(cpu);
+	bool hv = cpu_has_feature(CPU_FTR_HVMODE);
 
-	opal_xive_dump(XIVE_DUMP_TM_HYP, hwid);
-	opal_xive_dump(XIVE_DUMP_TM_POOL, hwid);
-	opal_xive_dump(XIVE_DUMP_TM_OS, hwid);
-	opal_xive_dump(XIVE_DUMP_TM_USER, hwid);
-	opal_xive_dump(XIVE_DUMP_VP, hwid);
-	opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid);
+	if (hv) {
+		opal_xive_dump(XIVE_DUMP_TM_HYP, hwid);
+		opal_xive_dump(XIVE_DUMP_TM_POOL, hwid);
+		opal_xive_dump(XIVE_DUMP_TM_OS, hwid);
+		opal_xive_dump(XIVE_DUMP_TM_USER, hwid);
+		opal_xive_dump(XIVE_DUMP_VP, hwid);
+		opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid);
+	}
 
 	if (setjmp(bus_error_jmp) != 0) {
 		catch_memory_errors = 0;

From b4868ff55d082bc66b0c287a41e4888f6d3e5f87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@kaod.org>
Date: Wed, 14 Aug 2019 17:47:53 +0200
Subject: [PATCH 027/264] powerpc/xive: Fix dump of XIVE interrupt under
 pseries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The xmon 'dxi' command calls OPAL to query the XIVE configuration of
an interrupt. This can only be done on baremetal (PowerNV) and it will
crash a pseries machine.

Introduce a new XIVE get_irq_config() operation which implements a
different query depending on the platform, PowerNV or pseries, and
modify xmon to use a top level wrapper.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190814154754.23682-3-clg@kaod.org
---
 arch/powerpc/include/asm/xive.h          |  2 +
 arch/powerpc/sysdev/xive/common.c        |  7 ++++
 arch/powerpc/sysdev/xive/native.c        | 15 +++++++
 arch/powerpc/sysdev/xive/spapr.c         | 51 ++++++++++++++++++++++++
 arch/powerpc/sysdev/xive/xive-internal.h |  2 +
 arch/powerpc/xmon/xmon.c                 | 12 +++---
 6 files changed, 83 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index efb0e597b272..967d6ab3c977 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -99,6 +99,8 @@ extern void xive_flush_interrupt(void);
 
 /* xmon hook */
 extern void xmon_xive_do_dump(int cpu);
+extern int xmon_xive_get_irq_config(u32 irq, u32 *target, u8 *prio,
+				    u32 *sw_irq);
 
 /* APIs used by KVM */
 extern u32 xive_native_default_eq_shift(void);
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index 6b973b7cdd8a..f75a660365e5 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -257,6 +257,13 @@ notrace void xmon_xive_do_dump(int cpu)
 	}
 #endif
 }
+
+int xmon_xive_get_irq_config(u32 irq, u32 *target, u8 *prio,
+			     u32 *sw_irq)
+{
+	return xive_ops->get_irq_config(irq, target, prio, sw_irq);
+}
+
 #endif /* CONFIG_XMON */
 
 static unsigned int xive_get_irq(void)
diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
index 2f26b74f6cfa..4b61e44f0171 100644
--- a/arch/powerpc/sysdev/xive/native.c
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -111,6 +111,20 @@ int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq)
 }
 EXPORT_SYMBOL_GPL(xive_native_configure_irq);
 
+static int xive_native_get_irq_config(u32 hw_irq, u32 *target, u8 *prio,
+				      u32 *sw_irq)
+{
+	s64 rc;
+	__be64 vp;
+	__be32 lirq;
+
+	rc = opal_xive_get_irq_config(hw_irq, &vp, prio, &lirq);
+
+	*target = be64_to_cpu(vp);
+	*sw_irq = be32_to_cpu(lirq);
+
+	return rc == 0 ? 0 : -ENXIO;
+}
 
 /* This can be called multiple time to change a queue configuration */
 int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
@@ -442,6 +456,7 @@ EXPORT_SYMBOL_GPL(xive_native_sync_queue);
 static const struct xive_ops xive_native_ops = {
 	.populate_irq_data	= xive_native_populate_irq_data,
 	.configure_irq		= xive_native_configure_irq,
+	.get_irq_config		= xive_native_get_irq_config,
 	.setup_queue		= xive_native_setup_queue,
 	.cleanup_queue		= xive_native_cleanup_queue,
 	.match			= xive_native_match,
diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c
index 52198131c75e..33c10749edec 100644
--- a/arch/powerpc/sysdev/xive/spapr.c
+++ b/arch/powerpc/sysdev/xive/spapr.c
@@ -215,6 +215,38 @@ static long plpar_int_set_source_config(unsigned long flags,
 	return 0;
 }
 
+static long plpar_int_get_source_config(unsigned long flags,
+					unsigned long lisn,
+					unsigned long *target,
+					unsigned long *prio,
+					unsigned long *sw_irq)
+{
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+	long rc;
+
+	pr_devel("H_INT_GET_SOURCE_CONFIG flags=%lx lisn=%lx\n", flags, lisn);
+
+	do {
+		rc = plpar_hcall(H_INT_GET_SOURCE_CONFIG, retbuf, flags, lisn,
+				 target, prio, sw_irq);
+	} while (plpar_busy_delay(rc));
+
+	if (rc) {
+		pr_err("H_INT_GET_SOURCE_CONFIG lisn=%ld failed %ld\n",
+		       lisn, rc);
+		return rc;
+	}
+
+	*target = retbuf[0];
+	*prio   = retbuf[1];
+	*sw_irq = retbuf[2];
+
+	pr_devel("H_INT_GET_SOURCE_CONFIG target=%lx prio=%lx sw_irq=%lx\n",
+		retbuf[0], retbuf[1], retbuf[2]);
+
+	return 0;
+}
+
 static long plpar_int_get_queue_info(unsigned long flags,
 				     unsigned long target,
 				     unsigned long priority,
@@ -398,6 +430,24 @@ static int xive_spapr_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq)
 	return rc == 0 ? 0 : -ENXIO;
 }
 
+static int xive_spapr_get_irq_config(u32 hw_irq, u32 *target, u8 *prio,
+				     u32 *sw_irq)
+{
+	long rc;
+	unsigned long h_target;
+	unsigned long h_prio;
+	unsigned long h_sw_irq;
+
+	rc = plpar_int_get_source_config(0, hw_irq, &h_target, &h_prio,
+					 &h_sw_irq);
+
+	*target = h_target;
+	*prio = h_prio;
+	*sw_irq = h_sw_irq;
+
+	return rc == 0 ? 0 : -ENXIO;
+}
+
 /* This can be called multiple time to change a queue configuration */
 static int xive_spapr_configure_queue(u32 target, struct xive_q *q, u8 prio,
 				   __be32 *qpage, u32 order)
@@ -590,6 +640,7 @@ static void xive_spapr_sync_source(u32 hw_irq)
 static const struct xive_ops xive_spapr_ops = {
 	.populate_irq_data	= xive_spapr_populate_irq_data,
 	.configure_irq		= xive_spapr_configure_irq,
+	.get_irq_config		= xive_spapr_get_irq_config,
 	.setup_queue		= xive_spapr_setup_queue,
 	.cleanup_queue		= xive_spapr_cleanup_queue,
 	.match			= xive_spapr_match,
diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h
index 211725dbf364..59cd366e7933 100644
--- a/arch/powerpc/sysdev/xive/xive-internal.h
+++ b/arch/powerpc/sysdev/xive/xive-internal.h
@@ -33,6 +33,8 @@ struct xive_cpu {
 struct xive_ops {
 	int	(*populate_irq_data)(u32 hw_irq, struct xive_irq_data *data);
 	int 	(*configure_irq)(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
+	int	(*get_irq_config)(u32 hw_irq, u32 *target, u8 *prio,
+				  u32 *sw_irq);
 	int	(*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
 	void	(*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
 	void	(*setup_cpu)(unsigned int cpu, struct xive_cpu *xc);
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 25d4adccf750..4ea53e05053f 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2574,14 +2574,14 @@ static void dump_all_xives(void)
 
 static void dump_one_xive_irq(u32 num)
 {
-	s64 rc;
-	__be64 vp;
+	int rc;
+	u32 target;
 	u8 prio;
-	__be32 lirq;
+	u32 lirq;
 
-	rc = opal_xive_get_irq_config(num, &vp, &prio, &lirq);
-	xmon_printf("IRQ 0x%x config: vp=0x%llx prio=%d lirq=0x%x (rc=%lld)\n",
-		    num, be64_to_cpu(vp), prio, be32_to_cpu(lirq), rc);
+	rc = xmon_xive_get_irq_config(num, &target, &prio, &lirq);
+	xmon_printf("IRQ 0x%08x : target=0x%x prio=%d lirq=0x%x (rc=%d)\n",
+		    num, target, prio, lirq, rc);
 }
 
 static void dump_xives(void)

From 39f14e79b15a40709ef177bc4c07f193b6d3bce3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@kaod.org>
Date: Wed, 14 Aug 2019 17:47:54 +0200
Subject: [PATCH 028/264] powerpc/xmon: Add a dump of all XIVE interrupts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Modify the xmon 'dxi' command to query all interrupts if no IRQ number
is specified.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190814154754.23682-4-clg@kaod.org
---
 arch/powerpc/xmon/xmon.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 4ea53e05053f..dc9832e06256 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2584,6 +2584,25 @@ static void dump_one_xive_irq(u32 num)
 		    num, target, prio, lirq, rc);
 }
 
+static void dump_all_xive_irq(void)
+{
+	unsigned int i;
+	struct irq_desc *desc;
+
+	for_each_irq_desc(i, desc) {
+		struct irq_data *d = irq_desc_get_irq_data(desc);
+		unsigned int hwirq;
+
+		if (!d)
+			continue;
+
+		hwirq = (unsigned int)irqd_to_hwirq(d);
+		/* IPIs are special (HW number 0) */
+		if (hwirq)
+			dump_one_xive_irq(hwirq);
+	}
+}
+
 static void dump_xives(void)
 {
 	unsigned long num;
@@ -2601,6 +2620,8 @@ static void dump_xives(void)
 	} else if (c == 'i') {
 		if (scanhex(&num))
 			dump_one_xive_irq(num);
+		else
+			dump_all_xive_irq();
 		return;
 	}
 

From 7c7a532ba3fc51bf9527d191fb410786c1fdc73c Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Wed, 14 Aug 2019 12:36:09 +0000
Subject: [PATCH 029/264] powerpc/ptdump: Fix addresses display on PPC32

Commit 453d87f6a8ae ("powerpc/mm: Warn if W+X pages found on boot")
wrongly changed KERN_VIRT_START from 0 to PAGE_OFFSET, leading to a
shift in the displayed addresses.

Let's revert that change to resync walk_pagetables()'s addr value and
pgd_t pointer for PPC32.

Fixes: 453d87f6a8ae ("powerpc/mm: Warn if W+X pages found on boot")
Cc: stable@vger.kernel.org # v5.2+
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/eb4d626514e22f85814830012642329018ef6af9.1565786091.git.christophe.leroy@c-s.fr
---
 arch/powerpc/mm/ptdump/ptdump.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 6a88a9f585d4..3ad64fc11419 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -27,7 +27,7 @@
 #include "ptdump.h"
 
 #ifdef CONFIG_PPC32
-#define KERN_VIRT_START	PAGE_OFFSET
+#define KERN_VIRT_START	0
 #endif
 
 /*

From e033829d2aaad85cb7cf46986c3be0bcc72f791e Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Wed, 14 Aug 2019 12:36:10 +0000
Subject: [PATCH 030/264] powerpc/ptdump: fix walk_pagetables() address
 mismatch

walk_pagetables() always walks the entire pgdir from address 0
but considers PAGE_OFFSET or KERN_VIRT_START as the starting
address of the walk, resulting in a possible mismatch in the
displayed addresses.

Ex: on PPC32, when KERN_VIRT_START was locally defined as PAGE_OFFSET,
ptdump displayed 0x80000000 instead of 0xc0000000 for the first kernel
page, because 0xc0000000 + 0xc0000000 = 0x80000000 in 32-bit arithmetic.

Start the walk at st->start_address instead of starting at 0.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/5aa2ac513295f594cce8ddb1c649f61947bd063d.1565786091.git.christophe.leroy@c-s.fr
---
 arch/powerpc/mm/ptdump/ptdump.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 3ad64fc11419..74ff2bff4ea0 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -299,17 +299,15 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
 
 static void walk_pagetables(struct pg_state *st)
 {
-	pgd_t *pgd = pgd_offset_k(0UL);
 	unsigned int i;
-	unsigned long addr;
-
-	addr = st->start_address;
+	unsigned long addr = st->start_address & PGDIR_MASK;
+	pgd_t *pgd = pgd_offset_k(addr);
 
 	/*
 	 * Traverse the linux pagetable structure and dump pages that are in
 	 * the hash pagetable.
 	 */
-	for (i = 0; i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
+	for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
 		if (!pgd_none(*pgd) && !pgd_is_leaf(*pgd))
 			/* pgd exists */
 			walk_pud(st, pgd, addr);

From 82242352728125ecd86cea4631b89a5e6be0d0fd Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Wed, 14 Aug 2019 12:36:11 +0000
Subject: [PATCH 031/264] powerpc/ptdump: drop dummy KERN_VIRT_START on PPC32

PPC32 doesn't have KERN_VIRT_START. Make PAGE_OFFSET the
default starting address for the dump, and drop the dummy
definition of KERN_VIRT_START. Only use KERN_VIRT_START for
non radix PPC64.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/334632b1df4775b0ccf3bdc8d6b201d14e3daedd.1565786091.git.christophe.leroy@c-s.fr
---
 arch/powerpc/mm/ptdump/ptdump.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 74ff2bff4ea0..9a2186c133e6 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -26,10 +26,6 @@
 
 #include "ptdump.h"
 
-#ifdef CONFIG_PPC32
-#define KERN_VIRT_START	0
-#endif
-
 /*
  * To visualise what is happening,
  *
@@ -362,12 +358,13 @@ static int ptdump_show(struct seq_file *m, void *v)
 	struct pg_state st = {
 		.seq = m,
 		.marker = address_markers,
+		.start_address = PAGE_OFFSET,
 	};
 
-	if (radix_enabled())
-		st.start_address = PAGE_OFFSET;
-	else
+#ifdef CONFIG_PPC64
+	if (!radix_enabled())
 		st.start_address = KERN_VIRT_START;
+#endif
 
 	/* Traverse kernel page tables */
 	walk_pagetables(&st);
@@ -405,12 +402,13 @@ void ptdump_check_wx(void)
 		.seq = NULL,
 		.marker = address_markers,
 		.check_wx = true,
+		.start_address = PAGE_OFFSET,
 	};
 
-	if (radix_enabled())
-		st.start_address = PAGE_OFFSET;
-	else
+#ifdef CONFIG_PPC64
+	if (!radix_enabled())
 		st.start_address = KERN_VIRT_START;
+#endif
 
 	walk_pagetables(&st);
 

From f3a2ac05894b57e78cd6cc0adba1de642034d940 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Wed, 14 Aug 2019 12:36:12 +0000
Subject: [PATCH 032/264] powerpc/ptdump: get out of note_prot_wx() when
 CONFIG_PPC_DEBUG_WX is not selected.

When CONFIG_PPC_DEBUG_WX is not selected, note_prot_wx() is useless.

Get out of it early and unconditionally in that case, so that GCC can
optimise all of its code out.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/ff6c8f631bd4ce3a10e0cc241eb569816187bc20.1565786091.git.christophe.leroy@c-s.fr
---
 arch/powerpc/mm/ptdump/ptdump.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 9a2186c133e6..ab6a572202b4 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -177,7 +177,7 @@ static void dump_addr(struct pg_state *st, unsigned long addr)
 
 static void note_prot_wx(struct pg_state *st, unsigned long addr)
 {
-	if (!st->check_wx)
+	if (!IS_ENABLED(CONFIG_PPC_DEBUG_WX) || !st->check_wx)
 		return;
 
 	if (!((st->current_flags & pgprot_val(PAGE_KERNEL_X)) == pgprot_val(PAGE_KERNEL_X)))

From 65e701b2d2a8593c44c8855aee2e087be7e11e75 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Wed, 14 Aug 2019 12:36:13 +0000
Subject: [PATCH 033/264] powerpc/ptdump: drop non-vital #ifdefs

hashpagetable.c is only compiled when CONFIG_PPC_BOOK3S_64 is
defined, so drop the test and its 'else' branch.

Use IS_ENABLED(CONFIG_PPC_PSERIES) instead of #ifdef; this allows the
code to be compile-checked in any build, while still being optimised
out by GCC.

Use IS_ENABLED(CONFIG_PPC_64K_PAGES) instead of #ifdef.

Use IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP) instead of #ifdef.
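
As an illustration of the pattern (a generic sketch; CONFIG_FOO and
do_foo() are made-up names, not from this patch), converting:

  #ifdef CONFIG_FOO
  	do_foo(x);
  #endif

into:

  if (IS_ENABLED(CONFIG_FOO))
  	do_foo(x);

keeps do_foo(x) visible to the compiler's parsing and type checking
in every configuration, while constant folding still eliminates the
call when CONFIG_FOO is not set. This is also why the patch adds a
stub plpar_pte_read_4() for non-PSERIES builds: the call in
pseries_find() must now at least compile there.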

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/c8998ed32e4e3954b56a8dacecfe43319a2a0483.1565786091.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/plpar_wrappers.h |  6 ++++++
 arch/powerpc/mm/ptdump/hashpagetable.c    | 24 +++++++++--------------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h
index cff5a411e595..4497c8afb573 100644
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@@ -340,6 +340,12 @@ static inline long plpar_set_ciabr(unsigned long ciabr)
 {
 	return 0;
 }
+
+static inline long plpar_pte_read_4(unsigned long flags, unsigned long ptex,
+				    unsigned long *ptes)
+{
+	return 0;
+}
 #endif /* CONFIG_PPC_PSERIES */
 
 #endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */
diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c
index 72f0e4a3d839..a07278027c6f 100644
--- a/arch/powerpc/mm/ptdump/hashpagetable.c
+++ b/arch/powerpc/mm/ptdump/hashpagetable.c
@@ -237,7 +237,6 @@ static int native_find(unsigned long ea, int psize, bool primary, u64 *v, u64
 	return -1;
 }
 
-#ifdef CONFIG_PPC_PSERIES
 static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *r)
 {
 	struct hash_pte ptes[4];
@@ -274,7 +273,6 @@ static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *
 	}
 	return -1;
 }
-#endif
 
 static void decode_r(int bps, unsigned long r, unsigned long *rpn, int *aps,
 		unsigned long *lp_bits)
@@ -316,10 +314,9 @@ static void decode_r(int bps, unsigned long r, unsigned long *rpn, int *aps,
 static int base_hpte_find(unsigned long ea, int psize, bool primary, u64 *v,
 			  u64 *r)
 {
-#ifdef CONFIG_PPC_PSERIES
-	if (firmware_has_feature(FW_FEATURE_LPAR))
+	if (IS_ENABLED(CONFIG_PPC_PSERIES) && firmware_has_feature(FW_FEATURE_LPAR))
 		return pseries_find(ea, psize, primary, v, r);
-#endif
+
 	return native_find(ea, psize, primary, v, r);
 }
 
@@ -386,12 +383,13 @@ static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
 			psize = mmu_vmalloc_psize;
 		else
 			psize = mmu_io_psize;
-#ifdef CONFIG_PPC_64K_PAGES
+
 		/* check for secret 4K mappings */
-		if (((pteval & H_PAGE_COMBO) == H_PAGE_COMBO) ||
-			((pteval & H_PAGE_4K_PFN) == H_PAGE_4K_PFN))
+		if (IS_ENABLED(CONFIG_PPC_64K_PAGES) &&
+		    ((pteval & H_PAGE_COMBO) == H_PAGE_COMBO ||
+		     (pteval & H_PAGE_4K_PFN) == H_PAGE_4K_PFN))
 			psize = mmu_io_psize;
-#endif
+
 		/* check for hashpte */
 		status = hpte_find(st, addr, psize);
 
@@ -469,9 +467,10 @@ static void walk_linearmapping(struct pg_state *st)
 
 static void walk_vmemmap(struct pg_state *st)
 {
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
 	struct vmemmap_backing *ptr = vmemmap_list;
 
+	if (!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
+		return;
 	/*
 	 * Traverse the vmemmaped memory and dump pages that are in the hash
 	 * pagetable.
@@ -481,7 +480,6 @@ static void walk_vmemmap(struct pg_state *st)
 		ptr = ptr->list;
 	}
 	seq_puts(st->seq, "---[ vmemmap end ]---\n");
-#endif
 }
 
 static void populate_markers(void)
@@ -495,11 +493,7 @@ static void populate_markers(void)
 	address_markers[6].start_address = PHB_IO_END;
 	address_markers[7].start_address = IOREMAP_BASE;
 	address_markers[8].start_address = IOREMAP_END;
-#ifdef CONFIG_PPC_BOOK3S_64
 	address_markers[9].start_address =  H_VMEMMAP_START;
-#else
-	address_markers[9].start_address =  VMEMMAP_BASE;
-#endif
 }
 
 static int ptdump_show(struct seq_file *m, void *v)

From 658d029df0bce6472c94b724ca54d74bc6659c2e Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Fri, 28 Jun 2019 15:55:52 +0000
Subject: [PATCH 034/264] powerpc/hw_breakpoint: move instruction stepping out
 of hw_breakpoint_handler()

On 8xx, breakpoints stop after executing the instruction, so
stepping/emulation is not needed. Move it into a sub-function and
remove the #ifdefs.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/f8cdc3f1c66ad3c43ebc568abcc6c39ed4676284.1561737231.git.christophe.leroy@c-s.fr
---
 arch/powerpc/kernel/hw_breakpoint.c | 60 ++++++++++++++++-------------
 1 file changed, 33 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index c8d1fa2e9d53..28ad3171bb82 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -198,15 +198,43 @@ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs)
 /*
  * Handle debug exception notifications.
  */
+static bool stepping_handler(struct pt_regs *regs, struct perf_event *bp,
+			     unsigned long addr)
+{
+	int stepped;
+	unsigned int instr;
+
+	/* Do not emulate user-space instructions, instead single-step them */
+	if (user_mode(regs)) {
+		current->thread.last_hit_ubp = bp;
+		regs->msr |= MSR_SE;
+		return false;
+	}
+
+	stepped = 0;
+	instr = 0;
+	if (!__get_user_inatomic(instr, (unsigned int *)regs->nip))
+		stepped = emulate_step(regs, instr);
+
+	/*
+	 * emulate_step() could not execute it. We've failed in reliably
+	 * handling the hw-breakpoint. Unregister it and throw a warning
+	 * message to let the user know about it.
+	 */
+	if (!stepped) {
+		WARN(1, "Unable to handle hardware breakpoint. Breakpoint at "
+			"0x%lx will be disabled.", addr);
+		perf_event_disable_inatomic(bp);
+		return false;
+	}
+	return true;
+}
+
 int hw_breakpoint_handler(struct die_args *args)
 {
 	int rc = NOTIFY_STOP;
 	struct perf_event *bp;
 	struct pt_regs *regs = args->regs;
-#ifndef CONFIG_PPC_8xx
-	int stepped = 1;
-	unsigned int instr;
-#endif
 	struct arch_hw_breakpoint *info;
 	unsigned long dar = regs->dar;
 
@@ -251,31 +279,9 @@ int hw_breakpoint_handler(struct die_args *args)
 	      (dar - bp->attr.bp_addr < bp->attr.bp_len)))
 		info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ;
 
-#ifndef CONFIG_PPC_8xx
-	/* Do not emulate user-space instructions, instead single-step them */
-	if (user_mode(regs)) {
-		current->thread.last_hit_ubp = bp;
-		regs->msr |= MSR_SE;
+	if (!IS_ENABLED(CONFIG_PPC_8xx) && !stepping_handler(regs, bp, info->address))
 		goto out;
-	}
 
-	stepped = 0;
-	instr = 0;
-	if (!__get_user_inatomic(instr, (unsigned int *) regs->nip))
-		stepped = emulate_step(regs, instr);
-
-	/*
-	 * emulate_step() could not execute it. We've failed in reliably
-	 * handling the hw-breakpoint. Unregister it and throw a warning
-	 * message to let the user know about it.
-	 */
-	if (!stepped) {
-		WARN(1, "Unable to handle hardware breakpoint. Breakpoint at "
-			"0x%lx will be disabled.", info->address);
-		perf_event_disable_inatomic(bp);
-		goto out;
-	}
-#endif
 	/*
 	 * As a policy, the callback is invoked in a 'trigger-after-execute'
 	 * fashion

From 45ff3c55958542c3b76075d59741297b8cb31cbb Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Fri, 9 Aug 2019 14:58:09 +0000
Subject: [PATCH 035/264] powerpc/kasan: Fix parallel loading of modules.

Parallel loading of modules may lead to bad setup of shadow page table
entries.

First, let's align modules so that two modules never share the same
shadow page.

Second, ensure that two modules cannot allocate two page tables for
the same PMD entry at the same time. This is done by using
init_mm.page_table_lock in the same way as __pte_alloc_kernel() does.
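
The allocation then follows the usual check-lock-recheck shape. A
minimal sketch of the pattern (alloc_pte_page() is a made-up helper;
the real code also handles the early-boot !slab_is_available() case):

  pte_t *new = alloc_pte_page();

  smp_wmb();	/* order PTE init before publishing it */

  spin_lock(&init_mm.page_table_lock);
  if (pmd_none(*pmd)) {	/* did another CPU beat us to it? */
  	pmd_populate_kernel(&init_mm, pmd, new);
  	new = NULL;	/* now owned by the page table */
  }
  spin_unlock(&init_mm.page_table_lock);

  if (new)	/* lost the race, drop our copy */
  	pte_free_kernel(&init_mm, new);

The patch itself compares pmd_page_vaddr(*pmd) against
kasan_early_shadow_pte instead of using pmd_none(), because KASAN
shadow PMDs initially point at the early shadow PTE rather than
being empty.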

Fixes: 2edb16efc899 ("powerpc/32: Add KASAN support")
Cc: stable@vger.kernel.org # v5.2+
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/c97284f912128cbc3f2fe09d68e90e65fb3e6026.1565361876.git.christophe.leroy@c-s.fr
---
 arch/powerpc/mm/kasan/kasan_init_32.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index 74f4555a62ba..3282f8a2aecc 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -5,6 +5,7 @@
 #include <linux/kasan.h>
 #include <linux/printk.h>
 #include <linux/memblock.h>
+#include <linux/moduleloader.h>
 #include <linux/sched/task.h>
 #include <linux/vmalloc.h>
 #include <asm/pgalloc.h>
@@ -46,7 +47,19 @@ static int __ref kasan_init_shadow_page_tables(unsigned long k_start, unsigned l
 			kasan_populate_pte(new, PAGE_READONLY);
 		else
 			kasan_populate_pte(new, PAGE_KERNEL_RO);
-		pmd_populate_kernel(&init_mm, pmd, new);
+
+		smp_wmb(); /* See comment in __pte_alloc */
+
+		spin_lock(&init_mm.page_table_lock);
+			/* Has another populated it ? */
+		if (likely((void *)pmd_page_vaddr(*pmd) == kasan_early_shadow_pte)) {
+			pmd_populate_kernel(&init_mm, pmd, new);
+			new = NULL;
+		}
+		spin_unlock(&init_mm.page_table_lock);
+
+		if (new && slab_is_available())
+			pte_free_kernel(&init_mm, new);
 	}
 	return 0;
 }
@@ -137,7 +150,11 @@ void __init kasan_init(void)
 #ifdef CONFIG_MODULES
 void *module_alloc(unsigned long size)
 {
-	void *base = vmalloc_exec(size);
+	void *base;
+
+	base = __vmalloc_node_range(size, MODULE_ALIGN, VMALLOC_START, VMALLOC_END,
+				    GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
+				    NUMA_NO_NODE, __builtin_return_address(0));
 
 	if (!base)
 		return NULL;

From 663c0c9496a69f80011205ba3194049bcafd681d Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Fri, 9 Aug 2019 14:58:10 +0000
Subject: [PATCH 036/264] powerpc/kasan: Fix shadow area set up for modules.

When loading modules, from time to time an Oops is encountered during
the init of the shadow area for globals. This is due to the last page not
always being mapped depending on the exact distance between the start
and the end of the shadow area and the alignment with the page
addresses.

Fix this by aligning the starting address with the page address.
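
A worked example with made-up addresses: with PAGE_SIZE = 0x1000,
take k_start = 0xf8001100 and k_end = 0xf8002100. The old loop

  for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE)

only visits 0xf8001100 (mapping the page at 0xf8001000) before the
next step, 0xf8002100, fails the k_cur < k_end test. The page at
0xf8002000 is never mapped, so shadow accesses in the range
[0xf8002000, 0xf8002100) Oops. Starting from k_start & PAGE_MASK
visits 0xf8001000 and 0xf8002000, covering the whole area.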

Fixes: 2edb16efc899 ("powerpc/32: Add KASAN support")
Cc: stable@vger.kernel.org # v5.2+
Reported-by: Erhard F. <erhard_f@mailbox.org>
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/4f887e9b77d0d725cbb52035c7ece485c1c5fc14.1565361881.git.christophe.leroy@c-s.fr
---
 arch/powerpc/mm/kasan/kasan_init_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index 3282f8a2aecc..802387b231ad 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -87,7 +87,7 @@ static int __ref kasan_init_region(void *start, size_t size)
 	if (!slab_is_available())
 		block = memblock_alloc(k_end - k_start, PAGE_SIZE);
 
-	for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE) {
+	for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
 		pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
 		void *va = block ? block + k_cur - k_start : kasan_get_one_page();
 		pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL);

From 38a0d0cdb46d3f91534e5b9839ec2d67be14c59d Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Wed, 14 Aug 2019 09:25:52 +0000
Subject: [PATCH 037/264] powerpc/futex: Fix warning: 'oldval' may be used
 uninitialized in this function

We see warnings such as:
  kernel/futex.c: In function 'do_futex':
  kernel/futex.c:1676:17: warning: 'oldval' may be used uninitialized in this function [-Wmaybe-uninitialized]
     return oldval == cmparg;
                   ^
  kernel/futex.c:1651:6: note: 'oldval' was declared here
    int oldval, ret;
        ^

This is because arch_futex_atomic_op_inuser() only sets *oval if ret
is 0, and GCC doesn't see that the caller only uses oldval when ret
is 0.

Anyway, the non-zero ret path is an error path that won't suffer from
setting *oval, and as *oval is a local var in futex_atomic_op_inuser()
it will have no impact.
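
The caller in kernel/futex.c looks roughly like this (a simplified
sketch, not verbatim):

  int oldval, ret;

  ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
  if (ret)
  	return ret;

  switch (cmp) {
  case FUTEX_OP_CMP_EQ:
  	return oldval == cmparg;
  /* ... other comparisons ... */
  }

oldval is dead on the early error return, so writing *oval
unconditionally in the arch helper changes no observable behaviour;
it merely lets GCC see that oldval is always initialised before use.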

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
[mpe: reword change log slightly]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/86b72f0c134367b214910b27b9a6dd3321af93bb.1565774657.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/futex.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
index 3a6aa57b9d90..eea28ca679db 100644
--- a/arch/powerpc/include/asm/futex.h
+++ b/arch/powerpc/include/asm/futex.h
@@ -60,8 +60,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
 
 	pagefault_enable();
 
-	if (!ret)
-		*oval = oldval;
+	*oval = oldval;
 
 	prevent_write_to_user(uaddr, sizeof(*uaddr));
 	return ret;

From 9d6d712fbf7766f21c838940eebcd7b4d476c5e6 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Wed, 14 Aug 2019 10:02:20 +0000
Subject: [PATCH 038/264] powerpc/32s: Fix boot failure with DEBUG_PAGEALLOC
 without KASAN.

When KASAN is selected, the definitive hash table has to be
set up later, but there is already an early temporary one.

When KASAN is not selected, there is no early hash table,
so the setup of the definitive hash table cannot be delayed.

Fixes: 72f208c6a8f7 ("powerpc/32s: move hash code patching out of MMU_init_hw()")
Cc: stable@vger.kernel.org # v5.2+
Reported-by: Jonathan Neuschafer <j.neuschaefer@gmx.net>
Tested-by: Jonathan Neuschafer <j.neuschaefer@gmx.net>
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/b7860c5e1e784d6b96ba67edf47dd6cbc2e78ab6.1565776892.git.christophe.leroy@c-s.fr
---
 arch/powerpc/kernel/head_32.S  | 2 ++
 arch/powerpc/mm/book3s32/mmu.c | 9 +++++++++
 2 files changed, 11 insertions(+)

diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index f255e22184b4..c8b4f7ed318c 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -897,9 +897,11 @@ start_here:
 	bl	machine_init
 	bl	__save_cpu_setup
 	bl	MMU_init
+#ifdef CONFIG_KASAN
 BEGIN_MMU_FTR_SECTION
 	bl	MMU_init_hw_patch
 END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
+#endif
 
 /*
  * Go back to running unmapped so we can load up new values
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index e249fbf6b9c3..8d68f03bf5a4 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -358,6 +358,15 @@ void __init MMU_init_hw(void)
 	hash_mb2 = hash_mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg;
 	if (lg_n_hpteg > 16)
 		hash_mb2 = 16 - LG_HPTEG_SIZE;
+
+	/*
+	 * When KASAN is selected, there is already an early temporary hash
+	 * table and the switch to the final hash table is done later.
+	 */
+	if (IS_ENABLED(CONFIG_KASAN))
+		return;
+
+	MMU_init_hw_patch();
 }
 
 void __init MMU_init_hw_patch(void)

From ad628a34ec4e3558bf838195f60bbaa4c2b68f2a Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Wed, 14 Aug 2019 14:36:10 +0000
Subject: [PATCH 039/264] powerpc/mm: don't display empty early ioremap area

On the 8xx, the layout displayed at boot is:

[    0.000000] Memory: 121856K/131072K available (5728K kernel code, 592K rwdata, 1248K rodata, 560K init, 448K bss, 9216K reserved, 0K cma-reserved)
[    0.000000] Kernel virtual memory layout:
[    0.000000]   * 0xffefc000..0xffffc000  : fixmap
[    0.000000]   * 0xffefc000..0xffefc000  : early ioremap
[    0.000000]   * 0xc9000000..0xffefc000  : vmalloc & ioremap
[    0.000000] SLUB: HWalign=16, Order=0-3, MinObjects=0, CPUs=1, Nodes=1

Remove display of an empty early ioremap.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/f6267226038cb25a839b567319e240576e3f8565.1565793287.git.christophe.leroy@c-s.fr
---
 arch/powerpc/mm/mem.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 9191a66b3bc5..828c535e8fd2 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -306,8 +306,9 @@ void __init mem_init(void)
 	pr_info("  * 0x%08lx..0x%08lx  : consistent mem\n",
 		IOREMAP_TOP, IOREMAP_TOP + CONFIG_CONSISTENT_SIZE);
 #endif /* CONFIG_NOT_COHERENT_CACHE */
-	pr_info("  * 0x%08lx..0x%08lx  : early ioremap\n",
-		ioremap_bot, IOREMAP_TOP);
+	if (ioremap_bot != IOREMAP_TOP)
+		pr_info("  * 0x%08lx..0x%08lx  : early ioremap\n",
+			ioremap_bot, IOREMAP_TOP);
 	pr_info("  * 0x%08lx..0x%08lx  : vmalloc & ioremap\n",
 		VMALLOC_START, VMALLOC_END);
 #endif /* CONFIG_PPC32 */

From d9642117914c9d3f800b3bacc19d7e388b04edb4 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Fri, 16 Aug 2019 05:41:40 +0000
Subject: [PATCH 040/264] powerpc/mm: define empty update_mmu_cache() as static
 inline

Only BOOK3S and FSL_BOOK3E have a useful update_mmu_cache().

For the others, just define it static inline.

While at it, simplify the FSL_BOOK3E related ifdef, as
book3e_hugetlb_preload() only exists when CONFIG_PPC_FSL_BOOK3E
is selected.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/668aba4db6b9af6d8a151174e11a4289f1a6bbcd.1565933217.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/book3s/pgtable.h | 11 +++++++++++
 arch/powerpc/include/asm/nohash/pgtable.h | 13 +++++++++++++
 arch/powerpc/include/asm/pgtable.h        | 12 ------------
 arch/powerpc/mm/mem.c                     | 11 +++++++----
 4 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h
index 6436b65ac7bc..0e1263455d73 100644
--- a/arch/powerpc/include/asm/book3s/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/pgtable.h
@@ -26,5 +26,16 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 				     unsigned long size, pgprot_t vma_prot);
 #define __HAVE_PHYS_MEM_ACCESS_PROT
 
+/*
+ * This gets called at the end of handling a page fault, when
+ * the kernel has put a new PTE into the page table for the process.
+ * We use it to ensure coherency between the i-cache and d-cache
+ * for the page which has just been mapped in.
+ * On machines which use an MMU hash table, we use this to put a
+ * corresponding HPTE into the hash table ahead of time, instead of
+ * waiting for the inevitable extra hash-table miss exception.
+ */
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep);
+
 #endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h
index 1ca1c1864b32..7fed9dc0f147 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -293,5 +293,18 @@ static inline int pgd_huge(pgd_t pgd)
 #define is_hugepd(hpd)		(hugepd_ok(hpd))
 #endif
 
+/*
+ * This gets called at the end of handling a page fault, when
+ * the kernel has put a new PTE into the page table for the process.
+ * We use it to ensure coherency between the i-cache and d-cache
+ * for the page which has just been mapped in.
+ */
+#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_HUGETLB_PAGE)
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep);
+#else
+static inline
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) {}
+#endif
+
 #endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index c58ba7963688..c70916a7865a 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -77,18 +77,6 @@ extern void paging_init(void);
 
 #include <asm-generic/pgtable.h>
 
-
-/*
- * This gets called at the end of handling a page fault, when
- * the kernel has put a new PTE into the page table for the process.
- * We use it to ensure coherency between the i-cache and d-cache
- * for the page which has just been mapped in.
- * On machines which use an MMU hash table, we use this to put a
- * corresponding HPTE into the hash table ahead of time, instead of
- * waiting for the inevitable extra hash-table miss exception.
- */
-extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *);
-
 #ifndef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_large(pmd)		0
 #endif
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 828c535e8fd2..ee8225cd2cd2 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -415,10 +415,10 @@ EXPORT_SYMBOL(flush_icache_user_range);
  * 
  * This must always be called with the pte lock held.
  */
+#ifdef CONFIG_PPC_BOOK3S
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 		      pte_t *ptep)
 {
-#ifdef CONFIG_PPC_BOOK3S
 	/*
 	 * We don't need to worry about _PAGE_PRESENT here because we are
 	 * called with either mm->page_table_lock held or ptl lock held
@@ -456,13 +456,16 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 	}
 
 	hash_preload(vma->vm_mm, address, is_exec, trap);
+}
 #endif /* CONFIG_PPC_BOOK3S */
-#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \
-	&& defined(CONFIG_HUGETLB_PAGE)
+#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_HUGETLB_PAGE)
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+		      pte_t *ptep)
+{
 	if (is_vm_hugetlb_page(vma))
 		book3e_hugetlb_preload(vma, address, *ptep);
-#endif
 }
+#endif
 
 /*
  * System memory should not be in /proc/iomem but various tools expect it

From 4c1616ef036ffaaea95a29d7b6abf9d3e8eb9d92 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Fri, 16 Aug 2019 05:41:41 +0000
Subject: [PATCH 041/264] powerpc/mm: move FSL_BOOK3E version of
 update_mmu_cache()

Move the FSL_BOOK3E version of update_mmu_cache() to the same
place as book3e_hugetlb_preload(), since update_mmu_cache() is
its only user.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/4d69fdc86df9c74adc71a60331a86f6afb8b5e9e.1565933217.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/hugetlb.h          |  3 ---
 arch/powerpc/mm/mem.c                       |  8 --------
 arch/powerpc/mm/nohash/book3e_hugetlbpage.c | 16 ++++++++++++++--
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index 20a101046cff..bd6504c28c2f 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -31,9 +31,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
 	return 0;
 }
 
-void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
-			    pte_t pte);
-
 #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
 void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
 			    unsigned long end, unsigned long floor,
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index ee8225cd2cd2..58d11362eee4 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -458,14 +458,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 	hash_preload(vma->vm_mm, address, is_exec, trap);
 }
 #endif /* CONFIG_PPC_BOOK3S */
-#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_HUGETLB_PAGE)
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
-		      pte_t *ptep)
-{
-	if (is_vm_hugetlb_page(vma))
-		book3e_hugetlb_preload(vma, address, *ptep);
-}
-#endif
 
 /*
  * System memory should not be in /proc/iomem but various tools expect it
diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c
index 61915f4d3c7f..8b88be91b622 100644
--- a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c
+++ b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c
@@ -122,8 +122,8 @@ static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid)
 	return found;
 }
 
-void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
-			    pte_t pte)
+static void
+book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte)
 {
 	unsigned long mas1, mas2;
 	u64 mas7_3;
@@ -183,6 +183,18 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
 	local_irq_restore(flags);
 }
 
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a PTE in the linux page tables.
+ *
+ * This must always be called with the pte lock held.
+ */
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
+{
+	if (is_vm_hugetlb_page(vma))
+		book3e_hugetlb_preload(vma, address, *ptep);
+}
+
 void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
 {
 	struct hstate *hstate = hstate_file(vma->vm_file);

From e5a1edb9fe4cfa07e37a59475f8f7d0a8939c73e Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Fri, 16 Aug 2019 05:41:42 +0000
Subject: [PATCH 042/264] powerpc/mm: move update_mmu_cache() into book3s hash
 utils.

update_mmu_cache() is only for BOOK3S, and can be simplified for
BOOK3S32.

Move it out of mem.c into the respective BOOK3S32 and BOOK3S64 files
containing the hash utils.

The BOOK3S64 version of hash_preload() is only used locally; declare
it static.

Remove the radix_enabled() stuff from the BOOK3S32 version.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/107aaf43583a5f5d09e0d4e84c4c4390ecfcd512.1565933217.git.christophe.leroy@c-s.fr
---
 arch/powerpc/mm/book3s32/mmu.c        | 46 ++++++++++++++++++++++
 arch/powerpc/mm/book3s64/hash_utils.c | 55 ++++++++++++++++++++++++++-
 arch/powerpc/mm/mem.c                 | 52 -------------------------
 arch/powerpc/mm/mmu_decl.h            |  7 ++--
 4 files changed, 102 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 8d68f03bf5a4..1e77dca8b497 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -309,6 +309,52 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 		add_hash_page(mm->context.id, ea, pmd_val(*pmd));
 }
 
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a PTE in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux PTE.
+ *
+ * This must always be called with the pte lock held.
+ */
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+		      pte_t *ptep)
+{
+	/*
+	 * We don't need to worry about _PAGE_PRESENT here because we are
+	 * called with either mm->page_table_lock held or ptl lock held
+	 */
+	unsigned long trap;
+	bool is_exec;
+
+	/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
+	if (!pte_young(*ptep) || address >= TASK_SIZE)
+		return;
+
+	/*
+	 * We try to figure out if we are coming from an instruction
+	 * access fault and pass that down to __hash_page so we avoid
+	 * double-faulting on execution of fresh text. We have to test
+	 * for regs NULL since init will get here first thing at boot.
+	 *
+	 * We also avoid filling the hash if not coming from a fault.
+	 */
+
+	trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL;
+	switch (trap) {
+	case 0x300:
+		is_exec = false;
+		break;
+	case 0x400:
+		is_exec = true;
+		break;
+	default:
+		return;
+	}
+
+	hash_preload(vma->vm_mm, address, is_exec, trap);
+}
+
 /*
  * Initialize the hash table and patch the instructions in hashtable.S.
  */
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index c363e850550e..c3bfef08dcf8 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1519,8 +1519,8 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
 }
 #endif
 
-void hash_preload(struct mm_struct *mm, unsigned long ea,
-		  bool is_exec, unsigned long trap)
+static void hash_preload(struct mm_struct *mm, unsigned long ea,
+			 bool is_exec, unsigned long trap)
 {
 	int hugepage_shift;
 	unsigned long vsid;
@@ -1600,6 +1600,57 @@ out_exit:
 	local_irq_restore(flags);
 }
 
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a PTE in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux PTE.
+ *
+ * This must always be called with the pte lock held.
+ */
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+		      pte_t *ptep)
+{
+	/*
+	 * We don't need to worry about _PAGE_PRESENT here because we are
+	 * called with either mm->page_table_lock held or ptl lock held
+	 */
+	unsigned long trap;
+	bool is_exec;
+
+	if (radix_enabled()) {
+		prefetch((void *)address);
+		return;
+	}
+
+	/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
+	if (!pte_young(*ptep) || address >= TASK_SIZE)
+		return;
+
+	/*
+	 * We try to figure out if we are coming from an instruction
+	 * access fault and pass that down to __hash_page so we avoid
+	 * double-faulting on execution of fresh text. We have to test
+	 * for regs NULL since init will get here first thing at boot.
+	 *
+	 * We also avoid filling the hash if not coming from a fault.
+	 */
+
+	trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL;
+	switch (trap) {
+	case 0x300:
+		is_exec = false;
+		break;
+	case 0x400:
+		is_exec = true;
+		break;
+	default:
+		return;
+	}
+
+	hash_preload(vma->vm_mm, address, is_exec, trap);
+}
+
 #ifdef CONFIG_PPC_MEM_KEYS
 /*
  * Return the protection key associated with the given address and the
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 58d11362eee4..69f99128a8d6 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -407,58 +407,6 @@ void flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
 }
 EXPORT_SYMBOL(flush_icache_user_range);
 
-/*
- * This is called at the end of handling a user page fault, when the
- * fault has been handled by updating a PTE in the linux page tables.
- * We use it to preload an HPTE into the hash table corresponding to
- * the updated linux PTE.
- * 
- * This must always be called with the pte lock held.
- */
-#ifdef CONFIG_PPC_BOOK3S
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
-		      pte_t *ptep)
-{
-	/*
-	 * We don't need to worry about _PAGE_PRESENT here because we are
-	 * called with either mm->page_table_lock held or ptl lock held
-	 */
-	unsigned long trap;
-	bool is_exec;
-
-	if (radix_enabled()) {
-		prefetch((void *)address);
-		return;
-	}
-
-	/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
-	if (!pte_young(*ptep) || address >= TASK_SIZE)
-		return;
-
-	/* We try to figure out if we are coming from an instruction
-	 * access fault and pass that down to __hash_page so we avoid
-	 * double-faulting on execution of fresh text. We have to test
-	 * for regs NULL since init will get here first thing at boot
-	 *
-	 * We also avoid filling the hash if not coming from a fault
-	 */
-
-	trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL;
-	switch (trap) {
-	case 0x300:
-		is_exec = false;
-		break;
-	case 0x400:
-		is_exec = true;
-		break;
-	default:
-		return;
-	}
-
-	hash_preload(vma->vm_mm, address, is_exec, trap);
-}
-#endif /* CONFIG_PPC_BOOK3S */
-
 /*
  * System memory should not be in /proc/iomem but various tools expect it
  * (eg kdump).
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 32c1a191c28a..9f325a7a09cb 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -82,10 +82,6 @@ static inline void print_system_hash_info(void) {}
 
 #else /* CONFIG_PPC_MMU_NOHASH */
 
-extern void hash_preload(struct mm_struct *mm, unsigned long ea,
-			 bool is_exec, unsigned long trap);
-
-
 extern void _tlbie(unsigned long address);
 extern void _tlbia(void);
 
@@ -95,6 +91,9 @@ void print_system_hash_info(void);
 
 #ifdef CONFIG_PPC32
 
+void hash_preload(struct mm_struct *mm, unsigned long ea,
+		  bool is_exec, unsigned long trap);
+
 extern void mapin_ram(void);
 extern void setbat(int index, unsigned long virt, phys_addr_t phys,
 		   unsigned int size, pgprot_t prot);

From f49f4e2b68b683491263e92c229ff344d44759a7 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Fri, 16 Aug 2019 05:41:43 +0000
Subject: [PATCH 043/264] powerpc/mm: Simplify update_mmu_cache() on BOOK3S32

On BOOK3S32, hash_preload() uses neither is_exec nor trap,
so drop those parameters and simplify update_mmu_cache().

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/35f143c6fe29f9fd25c7f3cd4448ae401029ce3c.1565933217.git.christophe.leroy@c-s.fr
---
 arch/powerpc/mm/book3s32/mmu.c | 32 ++++++++------------------------
 arch/powerpc/mm/mmu_decl.h     |  3 +--
 arch/powerpc/mm/pgtable_32.c   |  2 +-
 3 files changed, 10 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 1e77dca8b497..5d7d35eb96fb 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -297,8 +297,7 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys,
 /*
  * Preload a translation in the hash table
  */
-void hash_preload(struct mm_struct *mm, unsigned long ea,
-		  bool is_exec, unsigned long trap)
+void hash_preload(struct mm_struct *mm, unsigned long ea)
 {
 	pmd_t *pmd;
 
@@ -324,35 +323,20 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 	 * We don't need to worry about _PAGE_PRESENT here because we are
 	 * called with either mm->page_table_lock held or ptl lock held
 	 */
-	unsigned long trap;
-	bool is_exec;
 
 	/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
 	if (!pte_young(*ptep) || address >= TASK_SIZE)
 		return;
 
-	/*
-	 * We try to figure out if we are coming from an instruction
-	 * access fault and pass that down to __hash_page so we avoid
-	 * double-faulting on execution of fresh text. We have to test
-	 * for regs NULL since init will get here first thing at boot.
-	 *
-	 * We also avoid filling the hash if not coming from a fault.
-	 */
-
-	trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL;
-	switch (trap) {
-	case 0x300:
-		is_exec = false;
-		break;
-	case 0x400:
-		is_exec = true;
-		break;
-	default:
+	/* We have to test for regs NULL since init will get here first thing at boot */
+	if (!current->thread.regs)
 		return;
-	}
 
-	hash_preload(vma->vm_mm, address, is_exec, trap);
+	/* We also avoid filling the hash if not coming from a fault */
+	if (TRAP(current->thread.regs) != 0x300 && TRAP(current->thread.regs) != 0x400)
+		return;
+
+	hash_preload(vma->vm_mm, address);
 }
 
 /*
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 9f325a7a09cb..adbaf2167214 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -91,8 +91,7 @@ void print_system_hash_info(void);
 
 #ifdef CONFIG_PPC32
 
-void hash_preload(struct mm_struct *mm, unsigned long ea,
-		  bool is_exec, unsigned long trap);
+void hash_preload(struct mm_struct *mm, unsigned long ea);
 
 extern void mapin_ram(void);
 extern void setbat(int index, unsigned long virt, phys_addr_t phys,
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 35cb96cfc258..97f401a06fcc 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -252,7 +252,7 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
 		map_kernel_page(v, p, ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL);
 #ifdef CONFIG_PPC_BOOK3S_32
 		if (ktext)
-			hash_preload(&init_mm, v, false, 0x300);
+			hash_preload(&init_mm, v);
 #endif
 		v += PAGE_SIZE;
 		p += PAGE_SIZE;

From f204338f8e29777302042cba578b0da44f69695f Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Fri, 16 Aug 2019 05:41:44 +0000
Subject: [PATCH 044/264] powerpc/mm: ppc 603 doesn't need update_mmu_cache()

On powerpc 603, there is no hash table, so get out of
update_mmu_cache() early.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/6133e0f115d955fac4061536dab0fa7480a1c433.1565933217.git.christophe.leroy@c-s.fr
---
 arch/powerpc/mm/book3s32/mmu.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 5d7d35eb96fb..50a1991d257f 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -319,6 +319,8 @@ void hash_preload(struct mm_struct *mm, unsigned long ea)
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 		      pte_t *ptep)
 {
+	if (!mmu_has_feature(MMU_FTR_HPTE_TABLE))
+		return;
 	/*
 	 * We don't need to worry about _PAGE_PRESENT here because we are
 	 * called with either mm->page_table_lock held or ptl lock held

From 7ab0b7cb8951d4095d73e203759b74d41916e455 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Fri, 16 Aug 2019 07:52:20 +0000
Subject: [PATCH 045/264] powerpc/32: Add warning on misaligned copy_page() or
 clear_page()

copy_page() and clear_page() expect a page-aligned destination, and
use the dcbz instruction to clear entire cache lines based on the
assumption that the destination is cache-aligned.

As shown during the analysis of a bug in the BTRFS filesystem, a
misaligned copy_page() can create bugs that are difficult to locate
(see Link).

Add an explicit WARNING when copy_page() or clear_page() is called
with a misaligned destination.
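
To see why misalignment corrupts data rather than merely running
slowly: dcbz zeroes the entire naturally-aligned cache line that
contains its operand. With illustrative values L1_CACHE_BYTES = 32
and a destination of base + 0x10, the first iteration of the
clear_page() loop

  dcbz(addr);	/* zeroes [base, base + 0x20) */

wipes the 16 bytes *before* the intended buffer, and the last 16
bytes of the page are never cleared at all.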

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=204371
Link: https://lore.kernel.org/r/c6cea38f90480268d439ca44a645647e260fff09.1565941808.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/page_32.h | 4 ++++
 arch/powerpc/kernel/misc_32.S      | 5 +++++
 2 files changed, 9 insertions(+)

diff --git a/arch/powerpc/include/asm/page_32.h b/arch/powerpc/include/asm/page_32.h
index 683dfbc67ca8..d64dfe3ac712 100644
--- a/arch/powerpc/include/asm/page_32.h
+++ b/arch/powerpc/include/asm/page_32.h
@@ -40,6 +40,8 @@ typedef unsigned long long pte_basic_t;
 typedef unsigned long pte_basic_t;
 #endif
 
+#include <asm/bug.h>
+
 /*
  * Clear page using the dcbz instruction, which doesn't cause any
  * memory traffic (except to write out any cache lines which get
@@ -49,6 +51,8 @@ static inline void clear_page(void *addr)
 {
 	unsigned int i;
 
+	WARN_ON((unsigned long)addr & (L1_CACHE_BYTES - 1));
+
 	for (i = 0; i < PAGE_SIZE / L1_CACHE_BYTES; i++, addr += L1_CACHE_BYTES)
 		dcbz(addr);
 }
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index fe4bd321730e..02d90e1ebf65 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -452,7 +452,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 	stwu	r9,16(r3)
 
 _GLOBAL(copy_page)
+	rlwinm	r5, r3, 0, L1_CACHE_BYTES - 1
 	addi	r3,r3,-4
+
+0:	twnei	r5, 0	/* WARN if r3 is not cache aligned */
+	EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING
+
 	addi	r4,r4,-4
 
 	li	r5,4

From e354d7dc81d0e81bea33165f381aff1eda45f5d9 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Thu, 13 Jun 2019 00:03:17 +1000
Subject: [PATCH 046/264] powerpc/64: allow compiler to cache 'current'

current may be cached by the compiler, so remove the volatile asm
restriction. This results in better generated code: as well as being
smaller and having fewer dependent loads, it can avoid store-hit-load
flushes like the one that shows up in irq_exit():

    preempt_count_sub(HARDIRQ_OFFSET);
    if (!in_interrupt() && ...)

Which ends up as:

    ((struct thread_info *)current)->preempt_count -= HARDIRQ_OFFSET;
    if (((struct thread_info *)current)->preempt_count ...

Evaluating current twice presently means it has to be loaded twice, and
here gcc happens to pick a different register each time, then
preempt_count is accessed via that base register:

    1058:       ld      r10,2392(r13)     <-- current
    105c:       lwz     r9,0(r10)         <-- preempt_count
    1060:       addis   r9,r9,-1
    1064:       stw     r9,0(r10)         <-- preempt_count
    1068:       ld      r9,2392(r13)      <-- current
    106c:       lwz     r9,0(r9)          <-- preempt_count
    1070:       rlwinm. r9,r9,0,11,23
    1074:       bne     1090 <irq_exit+0x60>

This can frustrate store-hit-load detection heuristics and cause
flushes. Allowing the compiler to cache current in a register with this
patch results in the same base register being used for all accesses,
which is more likely to be detected as an alias:

    1058:       ld      r31,2392(r13)
    ...
    1070:       lwz     r9,0(r31)
    1074:       addis   r9,r9,-1
    1078:       stw     r9,0(r31)
    107c:       lwz     r9,0(r31)
    1080:       rlwinm. r9,r9,0,11,23
    1084:       bne     10a0 <irq_exit+0x60>

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190612140317.24490-1-npiggin@gmail.com
---
 arch/powerpc/include/asm/current.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/current.h b/arch/powerpc/include/asm/current.h
index 297827b76169..bbfb94800415 100644
--- a/arch/powerpc/include/asm/current.h
+++ b/arch/powerpc/include/asm/current.h
@@ -16,7 +16,8 @@ static inline struct task_struct *get_current(void)
 {
 	struct task_struct *task;
 
-	__asm__ __volatile__("ld %0,%1(13)"
+	/* get_current can be cached by the compiler, so no volatile */
+	asm ("ld %0,%1(13)"
 	: "=r" (task)
 	: "i" (offsetof(struct paca_struct, __current)));
 

From 8f51e3929470942e6a8744061254fdeef646cd36 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 24 Jul 2019 18:46:34 +1000
Subject: [PATCH 047/264] powerpc/64s/radix: Fix memory hotplug section page
 table creation

create_physical_mapping expects physical addresses, but creating and
splitting these mappings after boot is supplying virtual (effective)
addresses. This can be triggered by booting with mem= to limit memory,
then probing an unused physical memory range:

  echo <addr> > /sys/devices/system/memory/probe

This mostly works by accident, firstly because __va(__va(x)) == __va(x)
so the virtual address does not get corrupted. Secondly because pfn_pte
masks out the upper bits of the pfn beyond the physical address limit,
so a pfn constructed with a 0xc000000000000000 virtual linear address
will be masked back to the correct physical address in the pte.
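
Concretely, using the 64-bit definition __va(x) = (x | PAGE_OFFSET)
visible in a later patch of this series, with
PAGE_OFFSET = 0xc000000000000000:

  __va(0x0000000020000000) == 0xc000000020000000
  __va(0xc000000020000000) == 0xc000000020000000	/* idempotent */

and pfn_pte() masks the pfn with PTE_RPN_MASK, stripping the 0xc
high bits, so a pfn built from the linear address still yields the
correct physical address in the pte.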

Fixes: 6cc27341b21a8 ("powerpc/mm: add radix__create_section_mapping()")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190724084638.24982-1-npiggin@gmail.com
---
 arch/powerpc/mm/book3s64/radix_pgtable.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index b4ca9e95e678..c5cc16ab1954 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -902,7 +902,7 @@ int __meminit radix__create_section_mapping(unsigned long start, unsigned long e
 		return -1;
 	}
 
-	return create_physical_mapping(start, end, nid);
+	return create_physical_mapping(__pa(start), __pa(end), nid);
 }
 
 int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)

From 31f210cf42d4b308eacef89b6cb0b1459338b8de Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 24 Jul 2019 18:46:35 +1000
Subject: [PATCH 048/264] powerpc/64s/radix: Fix memory hot-unplug page table
 split

create_physical_mapping expects physical addresses, but splitting
these mappings on hot unplug is supplying virtual (effective)
addresses.

Fixes: 4dd5f8a99e791 ("powerpc/mm/radix: Split linear mapping on hot-unplug")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190724084638.24982-2-npiggin@gmail.com
---
 arch/powerpc/mm/book3s64/radix_pgtable.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index c5cc16ab1954..2204d8eeb784 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -737,8 +737,8 @@ static int __meminit stop_machine_change_mapping(void *data)
 
 	spin_unlock(&init_mm.page_table_lock);
 	pte_clear(&init_mm, params->aligned_start, params->pte);
-	create_physical_mapping(params->aligned_start, params->start, -1);
-	create_physical_mapping(params->end, params->aligned_end, -1);
+	create_physical_mapping(__pa(params->aligned_start), __pa(params->start), -1);
+	create_physical_mapping(__pa(params->end), __pa(params->aligned_end), -1);
 	spin_lock(&init_mm.page_table_lock);
 	return 0;
 }

From 10c4bd7cd28e77aeb8cfa65b23cb3c632ede2a49 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 24 Jul 2019 18:46:36 +1000
Subject: [PATCH 049/264] powerpc/perf: fix imc allocation failure handling

The alloc_pages_node return value should be tested for failure
before being passed to page_address.
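
The corrected shape, repeated in each hunk below, is:

  struct page *page = alloc_pages_node(nid, gfp_flags, get_order(size));

  if (!page)
  	return -ENOMEM;
  vbase = page_address(page);

(gfp_flags and vbase stand in for the actual arguments at each call
site.) page_address() is not NULL-safe: depending on the
configuration it either dereferences the page pointer or computes an
address from it, so the old "if (!vbase)" style test could not
reliably detect a failed allocation.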

Tested-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190724084638.24982-3-npiggin@gmail.com
---
 arch/powerpc/perf/imc-pmu.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index dea243185ea4..cb50a9e1fd2d 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -577,6 +577,7 @@ static int core_imc_mem_init(int cpu, int size)
 {
 	int nid, rc = 0, core_id = (cpu / threads_per_core);
 	struct imc_mem_info *mem_info;
+	struct page *page;
 
 	/*
 	 * alloc_pages_node() will allocate memory for core in the
@@ -587,11 +588,12 @@ static int core_imc_mem_init(int cpu, int size)
 	mem_info->id = core_id;
 
 	/* We need only vbase for core counters */
-	mem_info->vbase = page_address(alloc_pages_node(nid,
-					  GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
-					  __GFP_NOWARN, get_order(size)));
-	if (!mem_info->vbase)
+	page = alloc_pages_node(nid,
+				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
+				__GFP_NOWARN, get_order(size));
+	if (!page)
 		return -ENOMEM;
+	mem_info->vbase = page_address(page);
 
 	/* Init the mutex */
 	core_imc_refc[core_id].id = core_id;
@@ -849,15 +851,17 @@ static int thread_imc_mem_alloc(int cpu_id, int size)
 	int nid = cpu_to_node(cpu_id);
 
 	if (!local_mem) {
+		struct page *page;
 		/*
 		 * This case could happen only once at start, since we dont
 		 * free the memory in cpu offline path.
 		 */
-		local_mem = page_address(alloc_pages_node(nid,
+		page = alloc_pages_node(nid,
 				  GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
-				  __GFP_NOWARN, get_order(size)));
-		if (!local_mem)
+				  __GFP_NOWARN, get_order(size));
+		if (!page)
 			return -ENOMEM;
+		local_mem = page_address(page);
 
 		per_cpu(thread_imc_mem, cpu_id) = local_mem;
 	}
@@ -1095,11 +1099,14 @@ static int trace_imc_mem_alloc(int cpu_id, int size)
 	int core_id = (cpu_id / threads_per_core);
 
 	if (!local_mem) {
-		local_mem = page_address(alloc_pages_node(phys_id,
-					GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
-					__GFP_NOWARN, get_order(size)));
-		if (!local_mem)
+		struct page *page;
+
+		page = alloc_pages_node(phys_id,
+				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
+				__GFP_NOWARN, get_order(size));
+		if (!page)
 			return -ENOMEM;
+		local_mem = page_address(page);
 		per_cpu(trace_imc_mem, cpu_id) = local_mem;
 
 		/* Initialise the counters for trace mode */

From 4dd7554a6456d124c85e0a4ad156625b71390b5c Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 24 Jul 2019 18:46:37 +1000
Subject: [PATCH 050/264] powerpc/64: Add VIRTUAL_BUG_ON checks for __va and
 __pa addresses

Ensure __va is given a physical address below PAGE_OFFSET, and __pa is
given a virtual address above PAGE_OFFSET.
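
For context, assuming the generic definition in linux/mmdebug.h,
VIRTUAL_BUG_ON() only generates code when CONFIG_DEBUG_VIRTUAL is
enabled:

  #ifdef CONFIG_DEBUG_VIRTUAL
  #define VIRTUAL_BUG_ON(cond) BUG_ON(cond)
  #else
  #define VIRTUAL_BUG_ON(cond)
  #endif

so the added checks cost nothing in production builds.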

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190724084638.24982-4-npiggin@gmail.com
---
 arch/powerpc/include/asm/page.h | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 0d52f57fca04..c8bb14ff4713 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -215,9 +215,19 @@ static inline bool pfn_valid(unsigned long pfn)
 /*
  * gcc miscompiles (unsigned long)(&static_var) - PAGE_OFFSET
  * with -mcmodel=medium, so we use & and | instead of - and + on 64-bit.
+ * This also results in better code generation.
  */
-#define __va(x) ((void *)(unsigned long)((phys_addr_t)(x) | PAGE_OFFSET))
-#define __pa(x) ((unsigned long)(x) & 0x0fffffffffffffffUL)
+#define __va(x)								\
+({									\
+	VIRTUAL_BUG_ON((unsigned long)(x) >= PAGE_OFFSET);		\
+	(void *)(unsigned long)((phys_addr_t)(x) | PAGE_OFFSET);	\
+})
+
+#define __pa(x)								\
+({									\
+	VIRTUAL_BUG_ON((unsigned long)(x) < PAGE_OFFSET);		\
+	(unsigned long)(x) & 0x0fffffffffffffffUL;			\
+})
 
 #else /* 32-bit, non book E */
 #define __va(x) ((void *)(unsigned long)((phys_addr_t)(x) + PAGE_OFFSET - MEMORY_START))

From 6bb25170d7a44ef0ed9677814600f0785e7421d1 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 24 Jul 2019 18:46:38 +1000
Subject: [PATCH 051/264] powerpc/64s/radix: Remove redundant pfn_pte bitop,
 add VM_BUG_ON

pfn_pte() is never given a pfn above the addressable physical memory
limit, so the masking is redundant. In case of a software bug, it
is not obviously better to silently truncate the pfn than to corrupt
the pte (either one will result in memory corruption or crashes),
so there is no reason to add this to the fast path.

Add VM_BUG_ON to catch cases where the pfn is invalid. These would
catch the create_section_mapping bug fixed by a previous commit.

  [16885.256466] ------------[ cut here ]------------
  [16885.256492] kernel BUG at arch/powerpc/include/asm/book3s/64/pgtable.h:612!
  cpu 0x0: Vector: 700 (Program Check) at [c0000000ee0a36d0]
      pc: c000000000080738: __map_kernel_page+0x248/0x6f0
      lr: c000000000080ac0: __map_kernel_page+0x5d0/0x6f0
      sp: c0000000ee0a3960
     msr: 9000000000029033
    current = 0xc0000000ec63b400
    paca    = 0xc0000000017f0000   irqmask: 0x03   irq_happened: 0x01
      pid   = 85, comm = sh
  kernel BUG at arch/powerpc/include/asm/book3s/64/pgtable.h:612!
  Linux version 5.3.0-rc1-00001-g0fe93e5f3394
  enter ? for help
  [c0000000ee0a3a00] c000000000d37378 create_physical_mapping+0x260/0x360
  [c0000000ee0a3b10] c000000000d370bc create_section_mapping+0x1c/0x3c
  [c0000000ee0a3b30] c000000000071f54 arch_add_memory+0x74/0x130

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190724084638.24982-5-npiggin@gmail.com
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 8308f32e9782..8e47fb85dfa6 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -608,8 +608,10 @@ static inline bool pte_access_permitted(pte_t pte, bool write)
  */
 static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
 {
-	return __pte((((pte_basic_t)(pfn) << PAGE_SHIFT) & PTE_RPN_MASK) |
-		     pgprot_val(pgprot));
+	VM_BUG_ON(pfn >> (64 - PAGE_SHIFT));
+	VM_BUG_ON((pfn << PAGE_SHIFT) & ~PTE_RPN_MASK);
+
+	return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot));
 }
 
 static inline unsigned long pte_pfn(pte_t pte)

From 415480dce2ef03bb8335deebd2f402f475443ce0 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Mon, 19 Aug 2019 06:40:25 +0000
Subject: [PATCH 052/264] powerpc/603: Fix handling of the DIRTY flag

If a page is already mapped RW without the DIRTY flag, the DIRTY
flag is never set and a TLB store miss exception is taken forever.

This is easily reproduced with the following app:

#include <sys/mman.h>

int main(void)
{
	volatile char *ptr = mmap(0, 4096, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	*ptr = *ptr;
	return 0;
}

When the DIRTY flag is not set, bail out of the TLB miss handler and
take a minor page fault, which will set the DIRTY flag.

Fixes: f8b58c64eaef ("powerpc/603: let's handle PAGE_DIRTY directly")
Cc: stable@vger.kernel.org # v5.1+
Reported-by: Doug Crawford <doug.crawford@intelight-its.com>
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/80432f71194d7ee75b2f5043ecf1501cf1cca1f3.1566196646.git.christophe.leroy@c-s.fr
---
 arch/powerpc/kernel/head_32.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index c8b4f7ed318c..9e6f01abb31e 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -557,9 +557,9 @@ DataStoreTLBMiss:
 	cmplw	0,r1,r3
 	mfspr	r2, SPRN_SPRG_PGDIR
 #ifdef CONFIG_SWAP
-	li	r1, _PAGE_RW | _PAGE_PRESENT | _PAGE_ACCESSED
+	li	r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED
 #else
-	li	r1, _PAGE_RW | _PAGE_PRESENT
+	li	r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT
 #endif
 	bge-	112f
 	lis	r2, (swapper_pg_dir - PAGE_OFFSET)@ha	/* if kernel address, use */

From a6717c01ddc259f6f73364779df058e2c67309f8 Mon Sep 17 00:00:00 2001
From: Nathan Lynch <nathanl@linux.ibm.com>
Date: Fri, 2 Aug 2019 14:29:24 -0500
Subject: [PATCH 053/264] powerpc/rtas: use device model APIs and serialization
 during LPM

The LPAR migration implementation and userspace-initiated cpu hotplug
can interleave their executions like so:

1. Set cpu 7 offline via sysfs.

2. Begin a partition migration, whose implementation requires the OS
   to ensure all present cpus are online; cpu 7 is onlined:

     rtas_ibm_suspend_me -> rtas_online_cpus_mask -> cpu_up

   This sets cpu 7 online in all respects except for the cpu's
   corresponding struct device; dev->offline remains true.

3. Set cpu 7 online via sysfs. _cpu_up() determines that cpu 7 is
   already online and returns success. The driver core (device_online)
   sets dev->offline = false.

4. The migration completes and restores cpu 7 to offline state:

     rtas_ibm_suspend_me -> rtas_offline_cpus_mask -> cpu_down

This leaves cpu 7 in a state where the driver core considers the cpu
device online, but in all other respects it is offline and
unused. Attempts to online the cpu via sysfs appear to succeed but the
driver core actually does not pass the request to the lower-level
cpuhp support code. This makes the cpu unusable until the cpu device
is manually set offline and then online again via sysfs.

Instead of directly calling cpu_up/cpu_down, the migration code should
use the higher-level device core APIs to maintain consistent state and
serialize operations.

Fixes: 120496ac2d2d ("powerpc: Bring all threads online prior to migration/hibernation")
Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802192926.19277-2-nathanl@linux.ibm.com
---
 arch/powerpc/kernel/rtas.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 49159bb38949..ef290d4036ba 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -871,15 +871,17 @@ static int rtas_cpu_state_change_mask(enum rtas_cpu_state state,
 		return 0;
 
 	for_each_cpu(cpu, cpus) {
+		struct device *dev = get_cpu_device(cpu);
+
 		switch (state) {
 		case DOWN:
-			cpuret = cpu_down(cpu);
+			cpuret = device_offline(dev);
 			break;
 		case UP:
-			cpuret = cpu_up(cpu);
+			cpuret = device_online(dev);
 			break;
 		}
-		if (cpuret) {
+		if (cpuret < 0) {
 			pr_debug("%s: cpu_%s for cpu#%d returned %d.\n",
 					__func__,
 					((state == UP) ? "up" : "down"),
@@ -966,6 +968,8 @@ int rtas_ibm_suspend_me(u64 handle)
 	data.token = rtas_token("ibm,suspend-me");
 	data.complete = &done;
 
+	lock_device_hotplug();
+
 	/* All present CPUs must be online */
 	cpumask_andnot(offline_mask, cpu_present_mask, cpu_online_mask);
 	cpuret = rtas_online_cpus_mask(offline_mask);
@@ -1004,6 +1008,7 @@ out_hotplug_enable:
 				__func__);
 
 out:
+	unlock_device_hotplug();
 	free_cpumask_var(offline_mask);
 	return atomic_read(&data.error);
 }

From 10e4850d7c7f2af2e5c40520b8caf73bf9d7e2d1 Mon Sep 17 00:00:00 2001
From: Nathan Lynch <nathanl@linux.ibm.com>
Date: Fri, 2 Aug 2019 14:29:25 -0500
Subject: [PATCH 054/264] powerpc/rtas: allow rescheduling while changing cpu
 states

rtas_cpu_state_change_mask() potentially operates on scores of cpus,
so explicitly allow rescheduling in the loop body.
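
Illustrative shape of the change (not the exact code):

	for_each_cpu(cpu, cpus) {
		/* ... set the cpu online or offline ... */
		cond_resched();	/* the mask may cover scores of cpus */
	}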

Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802192926.19277-3-nathanl@linux.ibm.com
---
 arch/powerpc/kernel/rtas.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index ef290d4036ba..c5fa251b8950 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -16,6 +16,7 @@
 #include <linux/capability.h>
 #include <linux/delay.h>
 #include <linux/cpu.h>
+#include <linux/sched.h>
 #include <linux/smp.h>
 #include <linux/completion.h>
 #include <linux/cpumask.h>
@@ -898,6 +899,7 @@ static int rtas_cpu_state_change_mask(enum rtas_cpu_state state,
 				cpumask_clear_cpu(cpu, cpus);
 			}
 		}
+		cond_resched();
 	}
 
 	return ret;

From ccfb5bd71d3d1228090a8633800ae7cdf42a94ac Mon Sep 17 00:00:00 2001
From: Nathan Lynch <nathanl@linux.ibm.com>
Date: Fri, 2 Aug 2019 14:29:26 -0500
Subject: [PATCH 055/264] powerpc/pseries/mobility: use cond_resched when
 updating device tree

After a partition migration, pseries_devicetree_update() processes
changes to the device tree communicated from the platform to
Linux. This is a relatively heavyweight operation, with multiple
device tree searches, memory allocations, and conversations with
partition firmware.

There are a few levels of nested loops, bounded only by decisions made
by the platform, outside of Linux's control, and indeed we have seen
RCU stalls on large systems while executing this call graph. Use
cond_resched() in these loops so that the cpu is yielded when needed.

Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802192926.19277-4-nathanl@linux.ibm.com
---
 arch/powerpc/platforms/pseries/mobility.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
index fe812bebdf5e..b571285f6c14 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -9,6 +9,7 @@
 #include <linux/cpu.h>
 #include <linux/kernel.h>
 #include <linux/kobject.h>
+#include <linux/sched.h>
 #include <linux/smp.h>
 #include <linux/stat.h>
 #include <linux/completion.h>
@@ -207,7 +208,11 @@ static int update_dt_node(__be32 phandle, s32 scope)
 
 				prop_data += vd;
 			}
+
+			cond_resched();
 		}
+
+		cond_resched();
 	} while (rtas_rc == 1);
 
 	of_node_put(dn);
@@ -310,8 +315,12 @@ int pseries_devicetree_update(s32 scope)
 					add_dt_node(phandle, drc_index);
 					break;
 				}
+
+				cond_resched();
 			}
 		}
+
+		cond_resched();
 	} while (rc == 1);
 
 	kfree(rtas_buf);

From b5bda6263cad9a927e1a4edb7493d542da0c1410 Mon Sep 17 00:00:00 2001
From: Santosh Sivaraj <santosh@fossix.org>
Date: Tue, 20 Aug 2019 13:43:46 +0530
Subject: [PATCH 056/264] powerpc/mce: Schedule work from irq_work

schedule_work() cannot be called from MCE exception context, as an MCE
can interrupt even interrupt-disabled context.
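
A sketch of the resulting two-stage deferral (the my_* names are
hypothetical):

	static void my_ue_irq_fn(struct irq_work *work);

	static struct irq_work my_ue_irq_work = {
		.func = my_ue_irq_fn,
	};

	/*
	 * MCE context is NMI-like: irq_work_queue() is safe here, but
	 * schedule_work() is not.
	 */
	static void my_queue_ue_event(void)
	{
		irq_work_queue(&my_ue_irq_work);
	}

	/*
	 * The irq_work runs later with interrupts enabled, where taking
	 * scheduler locks is legal.
	 */
	static void my_ue_irq_fn(struct irq_work *work)
	{
		schedule_work(&mce_ue_event_work);
	}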

Fixes: 733e4a4c4467 ("powerpc/mce: hookup memory_failure for UE errors")
Cc: stable@vger.kernel.org # v4.15+
Reviewed-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820081352.8641-2-santosh@fossix.org
---
 arch/powerpc/kernel/mce.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index b18df633eae9..cff31d4a501f 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -33,6 +33,7 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
 					mce_ue_event_queue);
 
 static void machine_check_process_queued_event(struct irq_work *work);
+static void machine_check_ue_irq_work(struct irq_work *work);
 void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
@@ -40,6 +41,10 @@ static struct irq_work mce_event_process_work = {
         .func = machine_check_process_queued_event,
 };
 
+static struct irq_work mce_ue_event_irq_work = {
+	.func = machine_check_ue_irq_work,
+};
+
 DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
 
 static void mce_set_error_info(struct machine_check_event *mce,
@@ -199,6 +204,10 @@ void release_mce_event(void)
 	get_mce_event(NULL, true);
 }
 
+static void machine_check_ue_irq_work(struct irq_work *work)
+{
+	schedule_work(&mce_ue_event_work);
+}
 
 /*
  * Queue up the MCE event which then can be handled later.
@@ -216,7 +225,7 @@ void machine_check_ue_event(struct machine_check_event *evt)
 	memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt));
 
 	/* Queue work to process this event later. */
-	schedule_work(&mce_ue_event_work);
+	irq_work_queue(&mce_ue_event_irq_work);
 }
 
 /*

From 99ead78afd1128bfcebe7f88f3b102fb2da09aee Mon Sep 17 00:00:00 2001
From: Balbir Singh <bsingharora@gmail.com>
Date: Tue, 20 Aug 2019 13:43:47 +0530
Subject: [PATCH 057/264] powerpc/mce: Fix MCE handling for huge pages

The current code would fail on huge page addresses, since the shift
would be incorrect. Use the correct page shift value returned by
__find_linux_pte() to get the correct physical address. This makes the
code more generic, handling both regular and compound pages.
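
As a worked example (hypothetical 16MB huge page with 64K base pages,
i.e. shift == 24 and PAGE_SHIFT == 16; 'pte' and 'addr' illustrative):

	/* the low (shift - PAGE_SHIFT) pfn bits come from the address */
	unsigned long rpnmask = (1ul << 24) - (1ul << 16);  /* 0xff0000 */
	unsigned long pfn = pte_pfn(__pte(pte_val(pte) | (addr & rpnmask)));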

Fixes: ba41e1e1ccb9 ("powerpc/mce: Hookup derror (load/store) UE errors")
Signed-off-by: Balbir Singh <bsingharora@gmail.com>
[arbab@linux.ibm.com: Fixup pseries_do_memory_failure()]
Signed-off-by: Reza Arbab <arbab@linux.ibm.com>
Tested-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
Cc: stable@vger.kernel.org # v4.15+
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820081352.8641-3-santosh@fossix.org
---
 arch/powerpc/kernel/mce_power.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index a814d2dfb5b0..714a98e0927f 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -26,6 +26,7 @@
 unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
 {
 	pte_t *ptep;
+	unsigned int shift;
 	unsigned long flags;
 	struct mm_struct *mm;
 
@@ -35,13 +36,18 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
 		mm = &init_mm;
 
 	local_irq_save(flags);
-	if (mm == current->mm)
-		ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
-	else
-		ptep = find_init_mm_pte(addr, NULL);
+	ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift);
 	local_irq_restore(flags);
+
 	if (!ptep || pte_special(*ptep))
 		return ULONG_MAX;
+
+	if (shift > PAGE_SHIFT) {
+		unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
+
+		return pte_pfn(__pte(pte_val(*ptep) | (addr & rpnmask)));
+	}
+
 	return pte_pfn(*ptep);
 }
 
@@ -344,7 +350,7 @@ static const struct mce_derror_table mce_p9_derror_table[] = {
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0, false, 0, 0, 0, 0, 0 } };
 
-static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr,
+static int mce_find_instr_ea_and_phys(struct pt_regs *regs, uint64_t *addr,
 					uint64_t *phys_addr)
 {
 	/*
@@ -541,7 +547,8 @@ static int mce_handle_derror(struct pt_regs *regs,
 			 * kernel/exception-64s.h
 			 */
 			if (get_paca()->in_mce < MAX_MCE_DEPTH)
-				mce_find_instr_ea_and_pfn(regs, addr, phys_addr);
+				mce_find_instr_ea_and_phys(regs, addr,
+							   phys_addr);
 		}
 		found = 1;
 	}

From 1a1715f516fd7fcfedffee5978ec4c07c5164470 Mon Sep 17 00:00:00 2001
From: Reza Arbab <arbab@linux.ibm.com>
Date: Tue, 20 Aug 2019 13:43:48 +0530
Subject: [PATCH 058/264] powerpc/mce: Make machine_check_ue_event() static

The function doesn't get used outside this file, so make it static.

Signed-off-by: Reza Arbab <arbab@linux.ibm.com>
Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820081352.8641-4-santosh@fossix.org
---
 arch/powerpc/kernel/mce.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index cff31d4a501f..a3b122a685a5 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -34,7 +34,7 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
 
 static void machine_check_process_queued_event(struct irq_work *work);
 static void machine_check_ue_irq_work(struct irq_work *work);
-void machine_check_ue_event(struct machine_check_event *evt);
+static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
 static struct irq_work mce_event_process_work = {
@@ -212,7 +212,7 @@ static void machine_check_ue_irq_work(struct irq_work *work)
 /*
  * Queue up the MCE event which then can be handled later.
  */
-void machine_check_ue_event(struct machine_check_event *evt)
+static void machine_check_ue_event(struct machine_check_event *evt)
 {
 	int index;
 

From 49ec9177b8ec9c081850744468811e708a7c9841 Mon Sep 17 00:00:00 2001
From: Santosh Sivaraj <santosh@fossix.org>
Date: Tue, 20 Aug 2019 13:43:49 +0530
Subject: [PATCH 059/264] extable: Add function to search only kernel exception
 table

In certain architecture-specific operating modes (e.g., the powerpc
machine check handler, which is unable to access vmalloc memory),
search_exception_tables() cannot be called because it also searches
the module exception tables if an entry is not found in the kernel
exception table.
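
A sketch of the intended use from such a handler (hypothetical caller;
see the following patch for the real one):

	const struct exception_table_entry *entry;

	/* safe in real mode: only the built-in table is walked */
	entry = search_kernel_exception_table(regs->nip);
	if (entry)
		regs->nip = extable_fixup(entry);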

Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820081352.8641-5-santosh@fossix.org
---
 include/linux/extable.h |  2 ++
 kernel/extable.c        | 11 +++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/extable.h b/include/linux/extable.h
index 41c5b3a25f67..81ecfaa83ad3 100644
--- a/include/linux/extable.h
+++ b/include/linux/extable.h
@@ -19,6 +19,8 @@ void trim_init_extable(struct module *m);
 
 /* Given an address, look for it in the exception tables */
 const struct exception_table_entry *search_exception_tables(unsigned long add);
+const struct exception_table_entry *
+search_kernel_exception_table(unsigned long addr);
 
 #ifdef CONFIG_MODULES
 /* For extable.c to search modules' exception tables. */
diff --git a/kernel/extable.c b/kernel/extable.c
index e23cce6e6092..f6c9406eec7d 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -40,13 +40,20 @@ void __init sort_main_extable(void)
 	}
 }
 
+/* Given an address, look for it in the kernel exception table */
+const
+struct exception_table_entry *search_kernel_exception_table(unsigned long addr)
+{
+	return search_extable(__start___ex_table,
+			      __stop___ex_table - __start___ex_table, addr);
+}
+
 /* Given an address, look for it in the exception tables. */
 const struct exception_table_entry *search_exception_tables(unsigned long addr)
 {
 	const struct exception_table_entry *e;
 
-	e = search_extable(__start___ex_table,
-			   __stop___ex_table - __start___ex_table, addr);
+	e = search_kernel_exception_table(addr);
 	if (!e)
 		e = search_module_extables(addr);
 	return e;

From 895e3dceeb97855dc9990136cbb80a842fe581aa Mon Sep 17 00:00:00 2001
From: Balbir Singh <bsingharora@gmail.com>
Date: Tue, 20 Aug 2019 13:43:50 +0530
Subject: [PATCH 060/264] powerpc/mce: Handle UE event for memcpy_mcsafe

If we take a UE on one of the instructions with a fixup entry, set nip
to continue execution at the fixup entry. Stop processing the event
further and do not print it.

Co-developed-by: Reza Arbab <arbab@linux.ibm.com>
Signed-off-by: Reza Arbab <arbab@linux.ibm.com>
Signed-off-by: Balbir Singh <bsingharora@gmail.com>
Reviewed-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820081352.8641-6-santosh@fossix.org
---
 arch/powerpc/include/asm/mce.h  |  4 +++-
 arch/powerpc/kernel/mce.c       | 16 ++++++++++++++++
 arch/powerpc/kernel/mce_power.c | 15 +++++++++++++--
 3 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index a4c6a74ad2fb..19a33707d5ef 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -122,7 +122,8 @@ struct machine_check_event {
 			enum MCE_UeErrorType ue_error_type:8;
 			u8		effective_address_provided;
 			u8		physical_address_provided;
-			u8		reserved_1[5];
+			u8		ignore_event;
+			u8		reserved_1[4];
 			u64		effective_address;
 			u64		physical_address;
 			u8		reserved_2[8];
@@ -193,6 +194,7 @@ struct mce_error_info {
 	enum MCE_Initiator	initiator:8;
 	enum MCE_ErrorClass	error_class:8;
 	bool			sync_error;
+	bool			ignore_event;
 };
 
 #define MAX_MC_EVT	100
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index a3b122a685a5..ec4b3e1087be 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -149,6 +149,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
 		if (phys_addr != ULONG_MAX) {
 			mce->u.ue_error.physical_address_provided = true;
 			mce->u.ue_error.physical_address = phys_addr;
+			mce->u.ue_error.ignore_event = mce_err->ignore_event;
 			machine_check_ue_event(mce);
 		}
 	}
@@ -266,8 +267,17 @@ static void machine_process_ue_event(struct work_struct *work)
 		/*
 		 * This should probably queued elsewhere, but
 		 * oh! well
+		 *
+		 * Don't report this machine check because the caller has
+		 * asked us to ignore the event, it has a fixup handler which
+		 * will do the appropriate error handling and reporting.
 		 */
 		if (evt->error_type == MCE_ERROR_TYPE_UE) {
+			if (evt->u.ue_error.ignore_event) {
+				__this_cpu_dec(mce_ue_count);
+				continue;
+			}
+
 			if (evt->u.ue_error.physical_address_provided) {
 				unsigned long pfn;
 
@@ -301,6 +311,12 @@ static void machine_check_process_queued_event(struct irq_work *work)
 	while (__this_cpu_read(mce_queue_count) > 0) {
 		index = __this_cpu_read(mce_queue_count) - 1;
 		evt = this_cpu_ptr(&mce_event_queue[index]);
+
+		if (evt->error_type == MCE_ERROR_TYPE_UE &&
+		    evt->u.ue_error.ignore_event) {
+			__this_cpu_dec(mce_queue_count);
+			continue;
+		}
 		machine_check_print_event_info(evt, false, false);
 		__this_cpu_dec(mce_queue_count);
 	}
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 714a98e0927f..b6cbe3449358 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -11,6 +11,7 @@
 
 #include <linux/types.h>
 #include <linux/ptrace.h>
+#include <linux/extable.h>
 #include <asm/mmu.h>
 #include <asm/mce.h>
 #include <asm/machdep.h>
@@ -18,6 +19,7 @@
 #include <asm/pte-walk.h>
 #include <asm/sstep.h>
 #include <asm/exception-64s.h>
+#include <asm/extable.h>
 
 /*
  * Convert an address related to an mm to a PFN. NOTE: we are in real
@@ -565,9 +567,18 @@ static int mce_handle_derror(struct pt_regs *regs,
 	return 0;
 }
 
-static long mce_handle_ue_error(struct pt_regs *regs)
+static long mce_handle_ue_error(struct pt_regs *regs,
+				struct mce_error_info *mce_err)
 {
 	long handled = 0;
+	const struct exception_table_entry *entry;
+
+	entry = search_kernel_exception_table(regs->nip);
+	if (entry) {
+		mce_err->ignore_event = true;
+		regs->nip = extable_fixup(entry);
+		return 1;
+	}
 
 	/*
 	 * On specific SCOM read via MMIO we may get a machine check
@@ -600,7 +611,7 @@ static long mce_handle_error(struct pt_regs *regs,
 				&phys_addr);
 
 	if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
-		handled = mce_handle_ue_error(regs);
+		handled = mce_handle_ue_error(regs, &mce_err);
 
 	save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr);
 

From 4d4a273854c9eac1a1b163830658dc825e183ad5 Mon Sep 17 00:00:00 2001
From: Balbir Singh <bsingharora@gmail.com>
Date: Tue, 20 Aug 2019 13:43:51 +0530
Subject: [PATCH 061/264] powerpc/memcpy: Add memcpy_mcsafe for pmem

The pmem infrastructure uses memcpy_mcsafe() to convert a machine
check exception encountered during the memcpy into a return value on
failure. The return value is the number of bytes remaining to be
copied.

This patch borrows heavily from the copyuser_power7 logic and does not
add the VMX optimizations, to keep the patch simple. If needed, those
optimizations can be folded in later.
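
A sketch of a hypothetical caller ('dst', 'src' and 'sz' illustrative):

	int rem = memcpy_mcsafe(dst, src, sz);

	if (rem)	/* a UE was consumed; 'rem' tail bytes not copied */
		return sz - rem;	/* bytes copied before the fault */
	return sz;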

Signed-off-by: Balbir Singh <bsingharora@gmail.com>
[arbab@linux.ibm.com: Added symbol export]
Co-developed-by: Santosh Sivaraj <santosh@fossix.org>
Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820081352.8641-7-santosh@fossix.org
---
 arch/powerpc/include/asm/string.h   |   2 +
 arch/powerpc/lib/Makefile           |   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S | 242 ++++++++++++++++++++++++++++
 3 files changed, 245 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S

diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index 9bf6dffb4090..b72692702f35 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -53,7 +53,9 @@ void *__memmove(void *to, const void *from, __kernel_size_t n);
 #ifndef CONFIG_KASAN
 #define __HAVE_ARCH_MEMSET32
 #define __HAVE_ARCH_MEMSET64
+#define __HAVE_ARCH_MEMCPY_MCSAFE
 
+extern int memcpy_mcsafe(void *dst, const void *src, __kernel_size_t sz);
 extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t);
 extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
 extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t);
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index f030eea2171d..b8de3be10eb4 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \
 			       memcpy_power7.o
 
 obj64-y	+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
-	   memcpy_64.o
+	   memcpy_64.o memcpy_mcsafe_64.o
 
 obj64-$(CONFIG_SMP)	+= locks.o
 obj64-$(CONFIG_ALTIVEC)	+= vmx-helper.o
diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S b/arch/powerpc/lib/memcpy_mcsafe_64.S
new file mode 100644
index 000000000000..949976dc115d
--- /dev/null
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) IBM Corporation, 2011
+ * Derived from copyuser_power7.s by Anton Blanchard <anton@au.ibm.com>
+ * Author - Balbir Singh <bsingharora@gmail.com>
+ */
+#include <asm/ppc_asm.h>
+#include <asm/errno.h>
+#include <asm/export.h>
+
+	.macro err1
+100:
+	EX_TABLE(100b,.Ldo_err1)
+	.endm
+
+	.macro err2
+200:
+	EX_TABLE(200b,.Ldo_err2)
+	.endm
+
+	.macro err3
+300:	EX_TABLE(300b,.Ldone)
+	.endm
+
+.Ldo_err2:
+	ld	r22,STK_REG(R22)(r1)
+	ld	r21,STK_REG(R21)(r1)
+	ld	r20,STK_REG(R20)(r1)
+	ld	r19,STK_REG(R19)(r1)
+	ld	r18,STK_REG(R18)(r1)
+	ld	r17,STK_REG(R17)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r14,STK_REG(R14)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+.Ldo_err1:
+	/* Do a byte by byte copy to get the exact remaining size */
+	mtctr	r7
+46:
+err3;	lbz	r0,0(r4)
+	addi	r4,r4,1
+err3;	stb	r0,0(r3)
+	addi	r3,r3,1
+	bdnz	46b
+	li	r3,0
+	blr
+
+.Ldone:
+	mfctr	r3
+	blr
+
+
+_GLOBAL(memcpy_mcsafe)
+	mr	r7,r5
+	cmpldi	r5,16
+	blt	.Lshort_copy
+
+.Lcopy:
+	/* Get the source 8B aligned */
+	neg	r6,r4
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-3)
+
+	bf	cr7*4+3,1f
+err1;	lbz	r0,0(r4)
+	addi	r4,r4,1
+err1;	stb	r0,0(r3)
+	addi	r3,r3,1
+	subi	r7,r7,1
+
+1:	bf	cr7*4+2,2f
+err1;	lhz	r0,0(r4)
+	addi	r4,r4,2
+err1;	sth	r0,0(r3)
+	addi	r3,r3,2
+	subi	r7,r7,2
+
+2:	bf	cr7*4+1,3f
+err1;	lwz	r0,0(r4)
+	addi	r4,r4,4
+err1;	stw	r0,0(r3)
+	addi	r3,r3,4
+	subi	r7,r7,4
+
+3:	sub	r5,r5,r6
+	cmpldi	r5,128
+	blt	5f
+
+	mflr	r0
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+	std	r17,STK_REG(R17)(r1)
+	std	r18,STK_REG(R18)(r1)
+	std	r19,STK_REG(R19)(r1)
+	std	r20,STK_REG(R20)(r1)
+	std	r21,STK_REG(R21)(r1)
+	std	r22,STK_REG(R22)(r1)
+	std	r0,STACKFRAMESIZE+16(r1)
+
+	srdi	r6,r5,7
+	mtctr	r6
+
+	/* Now do cacheline (128B) sized loads and stores. */
+	.align	5
+4:
+err2;	ld	r0,0(r4)
+err2;	ld	r6,8(r4)
+err2;	ld	r8,16(r4)
+err2;	ld	r9,24(r4)
+err2;	ld	r10,32(r4)
+err2;	ld	r11,40(r4)
+err2;	ld	r12,48(r4)
+err2;	ld	r14,56(r4)
+err2;	ld	r15,64(r4)
+err2;	ld	r16,72(r4)
+err2;	ld	r17,80(r4)
+err2;	ld	r18,88(r4)
+err2;	ld	r19,96(r4)
+err2;	ld	r20,104(r4)
+err2;	ld	r21,112(r4)
+err2;	ld	r22,120(r4)
+	addi	r4,r4,128
+err2;	std	r0,0(r3)
+err2;	std	r6,8(r3)
+err2;	std	r8,16(r3)
+err2;	std	r9,24(r3)
+err2;	std	r10,32(r3)
+err2;	std	r11,40(r3)
+err2;	std	r12,48(r3)
+err2;	std	r14,56(r3)
+err2;	std	r15,64(r3)
+err2;	std	r16,72(r3)
+err2;	std	r17,80(r3)
+err2;	std	r18,88(r3)
+err2;	std	r19,96(r3)
+err2;	std	r20,104(r3)
+err2;	std	r21,112(r3)
+err2;	std	r22,120(r3)
+	addi	r3,r3,128
+	subi	r7,r7,128
+	bdnz	4b
+
+	clrldi	r5,r5,(64-7)
+
+	/* Up to 127B to go */
+5:	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+6:	bf	cr7*4+1,7f
+err2;	ld	r0,0(r4)
+err2;	ld	r6,8(r4)
+err2;	ld	r8,16(r4)
+err2;	ld	r9,24(r4)
+err2;	ld	r10,32(r4)
+err2;	ld	r11,40(r4)
+err2;	ld	r12,48(r4)
+err2;	ld	r14,56(r4)
+	addi	r4,r4,64
+err2;	std	r0,0(r3)
+err2;	std	r6,8(r3)
+err2;	std	r8,16(r3)
+err2;	std	r9,24(r3)
+err2;	std	r10,32(r3)
+err2;	std	r11,40(r3)
+err2;	std	r12,48(r3)
+err2;	std	r14,56(r3)
+	addi	r3,r3,64
+	subi	r7,r7,64
+
+7:	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	ld	r17,STK_REG(R17)(r1)
+	ld	r18,STK_REG(R18)(r1)
+	ld	r19,STK_REG(R19)(r1)
+	ld	r20,STK_REG(R20)(r1)
+	ld	r21,STK_REG(R21)(r1)
+	ld	r22,STK_REG(R22)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	/* Up to 63B to go */
+	bf	cr7*4+2,8f
+err1;	ld	r0,0(r4)
+err1;	ld	r6,8(r4)
+err1;	ld	r8,16(r4)
+err1;	ld	r9,24(r4)
+	addi	r4,r4,32
+err1;	std	r0,0(r3)
+err1;	std	r6,8(r3)
+err1;	std	r8,16(r3)
+err1;	std	r9,24(r3)
+	addi	r3,r3,32
+	subi	r7,r7,32
+
+	/* Up to 31B to go */
+8:	bf	cr7*4+3,9f
+err1;	ld	r0,0(r4)
+err1;	ld	r6,8(r4)
+	addi	r4,r4,16
+err1;	std	r0,0(r3)
+err1;	std	r6,8(r3)
+	addi	r3,r3,16
+	subi	r7,r7,16
+
+9:	clrldi	r5,r5,(64-4)
+
+	/* Up to 15B to go */
+.Lshort_copy:
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+err1;	lwz	r6,4(r4)
+	addi	r4,r4,8
+err1;	stw	r0,0(r3)
+err1;	stw	r6,4(r3)
+	addi	r3,r3,8
+	subi	r7,r7,8
+
+12:	bf	cr7*4+1,13f
+err1;	lwz	r0,0(r4)
+	addi	r4,r4,4
+err1;	stw	r0,0(r3)
+	addi	r3,r3,4
+	subi	r7,r7,4
+
+13:	bf	cr7*4+2,14f
+err1;	lhz	r0,0(r4)
+	addi	r4,r4,2
+err1;	sth	r0,0(r3)
+	addi	r3,r3,2
+	subi	r7,r7,2
+
+14:	bf	cr7*4+3,15f
+err1;	lbz	r0,0(r4)
+err1;	stb	r0,0(r3)
+
+15:	li	r3,0
+	blr
+
+EXPORT_SYMBOL_GPL(memcpy_mcsafe);

From 42ac26d253ebe7a5f409443f2fbd239d66b65f57 Mon Sep 17 00:00:00 2001
From: Santosh Sivaraj <santosh@fossix.org>
Date: Tue, 20 Aug 2019 13:43:52 +0530
Subject: [PATCH 062/264] powerpc: add machine check safe copy_to_user

Use the memcpy_mcsafe() implementation to define copy_to_user_mcsafe().
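
Hypothetical usage sketch (like copy_to_user(), the return value is
the number of bytes not copied):

	unsigned long left = copy_to_user_mcsafe(ubuf, kbuf, len);

	if (left)	/* faulted, or consumed a UE part way through */
		return -EFAULT;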

Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820081352.8641-8-santosh@fossix.org
---
 arch/powerpc/Kconfig               |  1 +
 arch/powerpc/include/asm/uaccess.h | 14 ++++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 9bc8ac5b8c96..78f52030e76a 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -137,6 +137,7 @@ config PPC
 	select ARCH_HAS_STRICT_KERNEL_RWX	if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION)
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UACCESS_FLUSHCACHE
+	select ARCH_HAS_UACCESS_MCSAFE		if PPC64
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_KEEP_MEMBLOCK
diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 8b03eb44e876..15002b51ff18 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -387,6 +387,20 @@ static inline unsigned long raw_copy_to_user(void __user *to,
 	return ret;
 }
 
+static __always_inline unsigned long __must_check
+copy_to_user_mcsafe(void __user *to, const void *from, unsigned long n)
+{
+	if (likely(check_copy_size(from, n, true))) {
+		if (access_ok(to, n)) {
+			allow_write_to_user(to, n);
+			n = memcpy_mcsafe((void *)to, from, n);
+			prevent_write_to_user(to, n);
+		}
+	}
+
+	return n;
+}
+
 extern unsigned long __clear_user(void __user *addr, unsigned long size);
 
 static inline unsigned long clear_user(void __user *addr, unsigned long size)

From 56347074c5307d7bca5db38eb2c764c64ae57514 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Sat, 13 Jul 2019 12:21:06 +0900
Subject: [PATCH 063/264] powerpc: remove meaningless KBUILD_ARFLAGS addition

The KBUILD_ARFLAGS addition in arch/powerpc/Makefile has never worked
in a useful way because it is always overridden by the following code
in the top Makefile:

  # use the deterministic mode of AR if available
  KBUILD_ARFLAGS := $(call ar-option,D)

The code in the top Makefile was added in 2011, by commit 40df759e2b9e
("kbuild: Fix build with binutils <= 2.19").

The KBUILD_ARFLAGS addition for ppc has been dead code from the
beginning.

Nobody has reported a problem since commit 43c9127d94d6 ("powerpc: Add
option to use thin archives"), so this code is not needed.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190713032106.8509-1-yamada.masahiro@socionext.com
---
 arch/powerpc/Makefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index c345b79414a9..46ed198a3aa3 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -112,7 +112,6 @@ ifeq ($(HAS_BIARCH),y)
 KBUILD_CFLAGS	+= -m$(BITS)
 KBUILD_AFLAGS	+= -m$(BITS) -Wl,-a$(BITS)
 KBUILD_LDFLAGS	+= -m elf$(BITS)$(LDEMULATION)
-KBUILD_ARFLAGS	+= --target=elf$(BITS)-$(GNUTARGET)
 endif
 
 cflags-$(CONFIG_STACKPROTECTOR)	+= -mstack-protector-guard=tls

From 3f068aae7a958555533847af88705b5629f31600 Mon Sep 17 00:00:00 2001
From: Sam Bobroff <sbobroff@linux.ibm.com>
Date: Fri, 16 Aug 2019 14:48:05 +1000
Subject: [PATCH 064/264] powerpc/64: Adjust order in pcibios_init()

The pcibios_init() function for PowerPC 64 currently calls
pci_bus_add_devices() before pcibios_resource_survey(). This means
that at boot time, when the pcibios_bus_add_device() hooks are called
by pci_bus_add_devices(), device resources have not been allocated and
they are unable to perform EEH setup, so a separate pass is needed.

This patch adjusts that order so that it will become possible to
consolidate the EEH setup work into a single location.

The only functional change is to execute pcibios_resource_survey()
(excepting ppc_md.pcibios_fixup(), see below) before
pci_bus_add_devices() instead of after it.

Because pcibios_scan_phb() and pci_bus_add_devices() are called
together in a loop, this must be broken into one loop for each call.
Then the call to pcibios_resource_survey() is moved up in between
them. This changes the ordering, but because pcibios_resource_survey()
also calls ppc_md.pcibios_fixup(), that call is extracted out into
pcibios_init(), at the point where pcibios_resource_survey() was, so
that it is not moved.

The only other caller of pcibios_resource_survey() is the PowerPC 32
version of pcibios_init(), and therefore, that is modified to call
ppc_md.pcibios_fixup() right after pcibios_resource_survey() so that
there is no functional change there at all.

The re-arrangement will cause very few side-effects because at this
stage in the boot, pci_bus_add_devices() does very little:
- pci_create_sysfs_dev_files() does nothing (no sysfs yet)
- pci_proc_attach_device() does nothing (no proc yet)
- device_attach() does nothing (no drivers yet)
This leaves only the pci_final_fixup calls, D3 support, and marking
the device as added. Of those, only the pci_final_fixup calls have the
potential to be affected by resource allocation.

The only pci_final_fixup handlers that touch resources seem to be one
for x86 (pci_amd_enable_64bit_bar()), and a PowerPC 32 platform driver
(quirk_final_uli1575()), neither of which uses this pcibios_init()
function. Even if they did, it would almost certainly be a bug, under
the current ordering, to rely on or make changes to resources before
they were allocated.

Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/4506b0489eabd0921a3587d90bd44c7683f3472d.1565930772.git.sbobroff@linux.ibm.com
---
 arch/powerpc/kernel/pci-common.c |  4 ----
 arch/powerpc/kernel/pci_32.c     |  4 ++++
 arch/powerpc/kernel/pci_64.c     | 12 +++++++++---
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index f627e15bb43c..1c448cf25506 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1379,10 +1379,6 @@ void __init pcibios_resource_survey(void)
 		pr_debug("PCI: Assigning unassigned resources...\n");
 		pci_assign_unassigned_resources();
 	}
-
-	/* Call machine dependent fixup */
-	if (ppc_md.pcibios_fixup)
-		ppc_md.pcibios_fixup();
 }
 
 /* This is used by the PCI hotplug driver to allocate resource
diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c
index 50942a1d1a5f..b49e1060a3bf 100644
--- a/arch/powerpc/kernel/pci_32.c
+++ b/arch/powerpc/kernel/pci_32.c
@@ -263,6 +263,10 @@ static int __init pcibios_init(void)
 	/* Call common code to handle resource allocation */
 	pcibios_resource_survey();
 
+	/* Call machine dependent fixup */
+	if (ppc_md.pcibios_fixup)
+		ppc_md.pcibios_fixup();
+
 	/* Call machine dependent post-init code */
 	if (ppc_md.pcibios_after_init)
 		ppc_md.pcibios_after_init();
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index b7030b1189d0..f83d1f69b1dd 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -54,14 +54,20 @@ static int __init pcibios_init(void)
 	pci_add_flags(PCI_ENABLE_PROC_DOMAINS | PCI_COMPAT_DOMAIN_0);
 
 	/* Scan all of the recorded PCI controllers.  */
-	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
 		pcibios_scan_phb(hose);
-		pci_bus_add_devices(hose->bus);
-	}
 
 	/* Call common code to handle resource allocation */
 	pcibios_resource_survey();
 
+	/* Add devices. */
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
+		pci_bus_add_devices(hose->bus);
+
+	/* Call machine dependent fixup */
+	if (ppc_md.pcibios_fixup)
+		ppc_md.pcibios_fixup();
+
 	printk(KERN_DEBUG "PCI: Probing PCI hardware done\n");
 
 	return 0;

From aa06e3d60e245284d1e55497eb3108828092818d Mon Sep 17 00:00:00 2001
From: Sam Bobroff <sbobroff@linux.ibm.com>
Date: Fri, 16 Aug 2019 14:48:06 +1000
Subject: [PATCH 065/264] powerpc/eeh: Clear stale EEH_DEV_NO_HANDLER flag

The EEH_DEV_NO_HANDLER flag is used by the EEH system to prevent the
use of driver callbacks in drivers that have been bound part way
through the recovery process. This is necessary to prevent later stage
handlers from being called when the earlier stage handlers haven't,
which can be confusing for drivers.

However, the flag is set for all devices that are added after boot
time and only cleared at the end of the EEH recovery process. This
results in hot plugged devices erroneously having the flag set during
the first recovery after they are added (causing their drivers'
handlers to be incorrectly ignored).

To remedy this, clear the flag at the beginning of recovery
processing. The flag is still cleared at the end of recovery
processing, although it is no longer really necessary.

Also clear the flag during eeh_handle_special_event(), for the same
reasons.

Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/b8ca5629d27de74c957d4f4b250177d1b6fc4bbd.1565930772.git.sbobroff@linux.ibm.com
---
 arch/powerpc/kernel/eeh_driver.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 89623962c727..1fbe541856f5 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -793,6 +793,10 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 		result = PCI_ERS_RESULT_DISCONNECT;
 	}
 
+	eeh_for_each_pe(pe, tmp_pe)
+		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
+			edev->mode &= ~EEH_DEV_NO_HANDLER;
+
 	/* Walk the various device drivers attached to this slot through
 	 * a reset sequence, giving each an opportunity to do what it needs
 	 * to accomplish the reset.  Each child gets a report of the
@@ -981,7 +985,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
  */
 void eeh_handle_special_event(void)
 {
-	struct eeh_pe *pe, *phb_pe;
+	struct eeh_pe *pe, *phb_pe, *tmp_pe;
+	struct eeh_dev *edev, *tmp_edev;
 	struct pci_bus *bus;
 	struct pci_controller *hose;
 	unsigned long flags;
@@ -1050,6 +1055,10 @@ void eeh_handle_special_event(void)
 				    (phb_pe->state & EEH_PE_RECOVERING))
 					continue;
 
+				eeh_for_each_pe(pe, tmp_pe)
+					eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev)
+						edev->mode &= ~EEH_DEV_NO_HANDLER;
+
 				/* Notify all devices to be down */
 				eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
 				eeh_set_channel_state(pe, pci_channel_io_perm_failure);

From 617082a4817a4354fa3de05c80b5f6088e2083b7 Mon Sep 17 00:00:00 2001
From: Sam Bobroff <sbobroff@linux.ibm.com>
Date: Fri, 16 Aug 2019 14:48:07 +1000
Subject: [PATCH 066/264] powerpc/eeh: Improve debug messages around device
 addition

Also remove a useless comment.

Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/59db84f4bf94718a12f206bc923ac797d47e4cc1.1565930772.git.sbobroff@linux.ibm.com
---
 arch/powerpc/kernel/eeh.c                    |  2 +-
 arch/powerpc/platforms/powernv/eeh-powernv.c | 14 ++++++++----
 arch/powerpc/platforms/pseries/eeh_pseries.c | 23 +++++++++++++++-----
 3 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index c0e4b73191f3..d187d2b290a8 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1198,7 +1198,7 @@ void eeh_add_device_late(struct pci_dev *dev)
 	pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
 	edev = pdn_to_eeh_dev(pdn);
 	if (edev->pdev == dev) {
-		pr_debug("EEH: Already referenced !\n");
+		pr_debug("EEH: Device %s already referenced!\n", pci_name(dev));
 		return;
 	}
 
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 620a986209f5..97461d49254e 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -44,10 +44,7 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
 	if (!pdev->is_virtfn)
 		return;
 
-	/*
-	 * The following operations will fail if VF's sysfs files
-	 * aren't created or its resources aren't finalized.
-	 */
+	pr_debug("%s: EEH: Setting up device %s.\n", __func__, pci_name(pdev));
 	eeh_add_device_early(pdn);
 	eeh_add_device_late(pdev);
 	eeh_sysfs_add_device(pdev);
@@ -364,6 +361,10 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
 	int ret;
 	int config_addr = (pdn->busno << 8) | (pdn->devfn);
 
+	pr_debug("%s: probing %04x:%02x:%02x.%01x\n",
+		__func__, hose->global_number, pdn->busno,
+		PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+
 	/*
 	 * When probing the root bridge, which doesn't have any
 	 * subordinate PCI devices. We don't have OF node for
@@ -458,6 +459,11 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
 	/* Save memory bars */
 	eeh_save_bars(edev);
 
+	pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n",
+		__func__, pdn->busno, PCI_SLOT(pdn->devfn),
+		PCI_FUNC(pdn->devfn), edev->pe->phb->global_number,
+		edev->pe->addr);
+
 	return NULL;
 }
 
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 9edae1863e2f..83ee69d22022 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -52,6 +52,8 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
 	if (!pdev->is_virtfn)
 		return;
 
+	pr_debug("%s: EEH: Setting up device %s.\n", __func__, pci_name(pdev));
+
 	pdn->device_id  =  pdev->device;
 	pdn->vendor_id  =  pdev->vendor;
 	pdn->class_code =  pdev->class;
@@ -238,6 +240,10 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
 	int enable = 0;
 	int ret;
 
+	pr_debug("%s: probing %04x:%02x:%02x.%01x\n",
+		__func__, pdn->phb->global_number, pdn->busno,
+		PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+
 	/* Retrieve OF node and eeh device */
 	edev = pdn_to_eeh_dev(pdn);
 	if (!edev || edev->pe)
@@ -281,7 +287,12 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
 
 	/* Enable EEH on the device */
 	ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE);
-	if (!ret) {
+	if (ret) {
+		pr_debug("%s: EEH failed to enable on %02x:%02x.%01x PHB#%x-PE#%x (code %d)\n",
+			__func__, pdn->busno, PCI_SLOT(pdn->devfn),
+			PCI_FUNC(pdn->devfn), pe.phb->global_number,
+			pe.addr, ret);
+	} else {
 		/* Retrieve PE address */
 		edev->pe_config_addr = eeh_ops->get_pe_addr(&pe);
 		pe.addr = edev->pe_config_addr;
@@ -297,11 +308,6 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
 		if (enable) {
 			eeh_add_flag(EEH_ENABLED);
 			eeh_add_to_parent_pe(edev);
-
-			pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n",
-				__func__, pdn->busno, PCI_SLOT(pdn->devfn),
-				PCI_FUNC(pdn->devfn), pe.phb->global_number,
-				pe.addr);
 		} else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) &&
 			   (pdn_to_eeh_dev(pdn->parent))->pe) {
 			/* This device doesn't support EEH, but it may have an
@@ -310,6 +316,11 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
 			edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr;
 			eeh_add_to_parent_pe(edev);
 		}
+		pr_debug("%s: EEH %s on %02x:%02x.%01x PHB#%x-PE#%x (code %d)\n",
+			__func__, (enable ? "enabled" : "unsupported"),
+			pdn->busno, PCI_SLOT(pdn->devfn),
+			PCI_FUNC(pdn->devfn), pe.phb->global_number,
+			pe.addr, ret);
 	}
 
 	/* Save memory bars */

From 685a0bc00abcf1d40d160eaafab9989f565ab2b5 Mon Sep 17 00:00:00 2001
From: Sam Bobroff <sbobroff@linux.ibm.com>
Date: Fri, 16 Aug 2019 14:48:08 +1000
Subject: [PATCH 067/264] powerpc/eeh: Initialize EEH address cache earlier

The EEH address cache is currently initialized and populated by a
single function: eeh_addr_cache_build().  While the initial population
of the cache can only be done once resources are allocated,
initialization (just setting up a spinlock) could be done much
earlier.

So move the initialization step into a separate function and call it
from a core_initcall (rather than a subsys initcall).

This will allow future work to make use of the cache during boot time
PCI scanning.

Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/0557206741bffee76cdfff042f65321f6f7a5b41.1565930772.git.sbobroff@linux.ibm.com
---
 arch/powerpc/include/asm/eeh.h  |  3 +++
 arch/powerpc/kernel/eeh.c       |  2 ++
 arch/powerpc/kernel/eeh_cache.c | 13 +++++++++++--
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 8aa7c76c2130..33e797eb0e6a 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -274,6 +274,7 @@ int __init eeh_ops_register(struct eeh_ops *ops);
 int __exit eeh_ops_unregister(const char *name);
 int eeh_check_failure(const volatile void __iomem *token);
 int eeh_dev_check_failure(struct eeh_dev *edev);
+void eeh_addr_cache_init(void);
 void eeh_addr_cache_build(void);
 void eeh_add_device_early(struct pci_dn *);
 void eeh_add_device_tree_early(struct pci_dn *);
@@ -332,6 +333,8 @@ static inline int eeh_check_failure(const volatile void __iomem *token)
 
 #define eeh_dev_check_failure(x) (0)
 
+static inline void eeh_addr_cache_init(void) { }
+
 static inline void eeh_addr_cache_build(void) { }
 
 static inline void eeh_add_device_early(struct pci_dn *pdn) { }
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index d187d2b290a8..22f646176abb 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1120,6 +1120,8 @@ static int eeh_init(void)
 	list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
 		eeh_dev_phb_init_dynamic(hose);
 
+	eeh_addr_cache_init();
+
 	/* Initialize EEH event */
 	return eeh_event_init();
 }
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index 05ffd32b3416..75f0d66dee4b 100644
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -257,6 +257,17 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev)
 	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
 }
 
+/**
+ * eeh_addr_cache_init - Initialize a cache of I/O addresses
+ *
+ * Initialize a cache of pci i/o addresses.  This cache will be used to
+ * find the pci device that corresponds to a given address.
+ */
+void eeh_addr_cache_init(void)
+{
+	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
+}
+
 /**
  * eeh_addr_cache_build - Build a cache of I/O addresses
  *
@@ -272,8 +283,6 @@ void eeh_addr_cache_build(void)
 	struct eeh_dev *edev;
 	struct pci_dev *dev = NULL;
 
-	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
-
 	for_each_pci_dev(dev) {
 		pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
 		if (!pdn)

From b905f8cdca7725e750a84f7188ea6821750124c3 Mon Sep 17 00:00:00 2001
From: Sam Bobroff <sbobroff@linux.ibm.com>
Date: Fri, 16 Aug 2019 14:48:09 +1000
Subject: [PATCH 068/264] powerpc/eeh: EEH for pSeries hot plug

On PowerNV and pSeries, devices currently acquire EEH support from
several different places: boot-time devices from eeh_probe_devices()
and eeh_addr_cache_build(), virtual function devices from the pcibios
bus add device hooks, and hot plugged devices from
pci_hp_add_devices() (with other platforms using other methods as
well). Unfortunately, pSeries machines currently discover hot plugged
devices using pci_rescan_bus(), not pci_hp_add_devices(), and so those
devices do not receive EEH support.

Rather than adding another case for pci_rescan_bus(), this change
widens the scope of the pcibios bus add device hooks so that they can
handle all devices. As a side effect this also supports devices
discovered after manually rescanning via /sys/bus/pci/rescan.

Note that on PowerNV, this change allows the EEH subsystem to become
enabled after boot as long as it has not been forced off, which was
not previously possible (it was already possible on pSeries).

Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/72ae8ae9c54097158894a52de23690448de38ea9.1565930772.git.sbobroff@linux.ibm.com
---
 arch/powerpc/kernel/eeh.c                    |  4 +-
 arch/powerpc/platforms/powernv/eeh-powernv.c | 39 +++++++++-----
 arch/powerpc/platforms/pseries/eeh_pseries.c | 54 ++++++++++----------
 3 files changed, 56 insertions(+), 41 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 22f646176abb..abf4c1bb1fab 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1192,7 +1192,7 @@ void eeh_add_device_late(struct pci_dev *dev)
 	struct pci_dn *pdn;
 	struct eeh_dev *edev;
 
-	if (!dev || !eeh_enabled())
+	if (!dev)
 		return;
 
 	pr_debug("EEH: Adding device %s\n", pci_name(dev));
@@ -1248,6 +1248,8 @@ void eeh_add_device_tree_late(struct pci_bus *bus)
 {
 	struct pci_dev *dev;
 
+	if (eeh_has_flag(EEH_FORCE_DISABLED))
+		return;
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		eeh_add_device_late(dev);
 		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 97461d49254e..00c0b81fd61d 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -41,7 +41,7 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
 {
 	struct pci_dn *pdn = pci_get_pdn(pdev);
 
-	if (!pdev->is_virtfn)
+	if (eeh_has_flag(EEH_FORCE_DISABLED))
 		return;
 
 	pr_debug("%s: EEH: Setting up device %s.\n", __func__, pci_name(pdev));
@@ -196,6 +196,25 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10);
 
 #endif /* CONFIG_DEBUG_FS */
 
+void pnv_eeh_enable_phbs(void)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+
+	list_for_each_entry(hose, &hose_list, list_node) {
+		phb = hose->private_data;
+		/*
+		 * If EEH is enabled, we're going to rely on that.
+		 * Otherwise, we restore to conventional mechanism
+		 * to clear frozen PE during PCI config access.
+		 */
+		if (eeh_enabled())
+			phb->flags |= PNV_PHB_FLAG_EEH;
+		else
+			phb->flags &= ~PNV_PHB_FLAG_EEH;
+	}
+}
+
 /**
  * pnv_eeh_post_init - EEH platform dependent post initialization
  *
@@ -234,19 +253,11 @@ int pnv_eeh_post_init(void)
 	if (!eeh_enabled())
 		disable_irq(eeh_event_irq);
 
+	pnv_eeh_enable_phbs();
+
 	list_for_each_entry(hose, &hose_list, list_node) {
 		phb = hose->private_data;
 
-		/*
-		 * If EEH is enabled, we're going to rely on that.
-		 * Otherwise, we restore to conventional mechanism
-		 * to clear frozen PE during PCI config access.
-		 */
-		if (eeh_enabled())
-			phb->flags |= PNV_PHB_FLAG_EEH;
-		else
-			phb->flags &= ~PNV_PHB_FLAG_EEH;
-
 		/* Create debugfs entries */
 #ifdef CONFIG_DEBUG_FS
 		if (phb->has_dbgfs || !phb->dbgfs)
@@ -454,7 +465,11 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
 	 * Enable EEH explicitly so that we will do EEH check
 	 * while accessing I/O stuff
 	 */
-	eeh_add_flag(EEH_ENABLED);
+	if (!eeh_has_flag(EEH_ENABLED)) {
+		enable_irq(eeh_event_irq);
+		pnv_eeh_enable_phbs();
+		eeh_add_flag(EEH_ENABLED);
+	}
 
 	/* Save memory bars */
 	eeh_save_bars(edev);
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 83ee69d22022..02454c737d11 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -42,44 +42,44 @@ static int ibm_get_config_addr_info;
 static int ibm_get_config_addr_info2;
 static int ibm_configure_pe;
 
-#ifdef CONFIG_PCI_IOV
 void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
 {
 	struct pci_dn *pdn = pci_get_pdn(pdev);
-	struct pci_dn *physfn_pdn;
-	struct eeh_dev *edev;
 
-	if (!pdev->is_virtfn)
+	if (eeh_has_flag(EEH_FORCE_DISABLED))
 		return;
 
 	pr_debug("%s: EEH: Setting up device %s.\n", __func__, pci_name(pdev));
+#ifdef CONFIG_PCI_IOV
+	if (pdev->is_virtfn) {
+		struct pci_dn *physfn_pdn;
 
-	pdn->device_id  =  pdev->device;
-	pdn->vendor_id  =  pdev->vendor;
-	pdn->class_code =  pdev->class;
-	/*
-	 * Last allow unfreeze return code used for retrieval
-	 * by user space in eeh-sysfs to show the last command
-	 * completion from platform.
-	 */
-	pdn->last_allow_rc =  0;
-	physfn_pdn      =  pci_get_pdn(pdev->physfn);
-	pdn->pe_number  =  physfn_pdn->pe_num_map[pdn->vf_index];
-	edev = pdn_to_eeh_dev(pdn);
-
-	/*
-	 * The following operations will fail if VF's sysfs files
-	 * aren't created or its resources aren't finalized.
-	 */
+		pdn->device_id  =  pdev->device;
+		pdn->vendor_id  =  pdev->vendor;
+		pdn->class_code =  pdev->class;
+		/*
+		 * Last allow unfreeze return code used for retrieval
+		 * by user space in eeh-sysfs to show the last command
+		 * completion from platform.
+		 */
+		pdn->last_allow_rc =  0;
+		physfn_pdn      =  pci_get_pdn(pdev->physfn);
+		pdn->pe_number  =  physfn_pdn->pe_num_map[pdn->vf_index];
+	}
+#endif
 	eeh_add_device_early(pdn);
 	eeh_add_device_late(pdev);
-	edev->pe_config_addr =  (pdn->busno << 16) | (pdn->devfn << 8);
-	eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */
-	eeh_add_to_parent_pe(edev);   /* Add as VF PE type */
-	eeh_sysfs_add_device(pdev);
+#ifdef CONFIG_PCI_IOV
+	if (pdev->is_virtfn) {
+		struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
 
-}
+		edev->pe_config_addr =  (pdn->busno << 16) | (pdn->devfn << 8);
+		eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */
+		eeh_add_to_parent_pe(edev);   /* Add as VF PE type */
+	}
 #endif
+	eeh_sysfs_add_device(pdev);
+}
 
 /*
  * Buffer for reporting slot-error-detail rtas calls. Its here
@@ -146,10 +146,8 @@ static int pseries_eeh_init(void)
 	/* Set EEH probe mode */
 	eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG);
 
-#ifdef CONFIG_PCI_IOV
 	/* Set EEH machine dependent code */
 	ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device;
-#endif
 
 	return 0;
 }

From c44e4ccadaca5884ac82b6dfffbd693bec3b583e Mon Sep 17 00:00:00 2001
From: Sam Bobroff <sbobroff@linux.ibm.com>
Date: Fri, 16 Aug 2019 14:48:10 +1000
Subject: [PATCH 069/264] powerpc/eeh: Refactor around eeh_probe_devices()

Now that EEH support for all devices (on PowerNV and pSeries) is
provided by the pcibios bus add device hooks, eeh_probe_devices() and
eeh_addr_cache_build() are redundant and can be removed.

Move the EEH enabled message into its own function so that it can be
called from multiple places.

Note that previously on pSeries, useless EEH sysfs files were created
for some devices that did not have EEH support, and this change
prevents them from being created.

Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/33b0a6339d5ac88693de092d6fba984f2a5add66.1565930772.git.sbobroff@linux.ibm.com
---
 arch/powerpc/include/asm/eeh.h               |  7 ++---
 arch/powerpc/kernel/eeh.c                    | 27 ++++++-----------
 arch/powerpc/kernel/eeh_cache.c              | 32 --------------------
 arch/powerpc/platforms/powernv/eeh-powernv.c |  4 +--
 arch/powerpc/platforms/pseries/pci.c         |  3 +-
 5 files changed, 14 insertions(+), 59 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 33e797eb0e6a..1fc2b5e40822 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -269,13 +269,12 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
 
 struct eeh_dev *eeh_dev_init(struct pci_dn *pdn);
 void eeh_dev_phb_init_dynamic(struct pci_controller *phb);
-void eeh_probe_devices(void);
+void eeh_show_enabled(void);
 int __init eeh_ops_register(struct eeh_ops *ops);
 int __exit eeh_ops_unregister(const char *name);
 int eeh_check_failure(const volatile void __iomem *token);
 int eeh_dev_check_failure(struct eeh_dev *edev);
 void eeh_addr_cache_init(void);
-void eeh_addr_cache_build(void);
 void eeh_add_device_early(struct pci_dn *);
 void eeh_add_device_tree_early(struct pci_dn *);
 void eeh_add_device_late(struct pci_dev *);
@@ -317,7 +316,7 @@ static inline bool eeh_enabled(void)
         return false;
 }
 
-static inline void eeh_probe_devices(void) { }
+static inline void eeh_show_enabled(void) { }
 
 static inline void *eeh_dev_init(struct pci_dn *pdn, void *data)
 {
@@ -335,8 +334,6 @@ static inline int eeh_check_failure(const volatile void __iomem *token)
 
 static inline void eeh_addr_cache_init(void) { }
 
-static inline void eeh_addr_cache_build(void) { }
-
 static inline void eeh_add_device_early(struct pci_dn *pdn) { }
 
 static inline void eeh_add_device_tree_early(struct pci_dn *pdn) { }
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index abf4c1bb1fab..fc975342e242 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -150,6 +150,16 @@ static int __init eeh_setup(char *str)
 }
 __setup("eeh=", eeh_setup);
 
+void eeh_show_enabled(void)
+{
+	if (eeh_has_flag(EEH_FORCE_DISABLED))
+		pr_info("EEH: Recovery disabled by kernel parameter.\n");
+	else if (eeh_has_flag(EEH_ENABLED))
+		pr_info("EEH: Capable adapter found: recovery enabled.\n");
+	else
+		pr_info("EEH: No capable adapters found: recovery disabled.\n");
+}
+
 /*
  * This routine captures assorted PCI configuration space data
  * for the indicated PCI device, and puts them into a buffer
@@ -1063,23 +1073,6 @@ static struct notifier_block eeh_reboot_nb = {
 	.notifier_call = eeh_reboot_notifier,
 };
 
-void eeh_probe_devices(void)
-{
-	struct pci_controller *hose, *tmp;
-	struct pci_dn *pdn;
-
-	/* Enable EEH for all adapters */
-	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
-		pdn = hose->pci_data;
-		traverse_pci_dn(pdn, eeh_ops->probe, NULL);
-	}
-	if (eeh_enabled())
-		pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
-	else
-		pr_info("EEH: No capable adapters found\n");
-
-}
-
 /**
  * eeh_init - EEH initialization
  *
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index 75f0d66dee4b..1e47cd04a1bd 100644
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -268,38 +268,6 @@ void eeh_addr_cache_init(void)
 	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
 }
 
-/**
- * eeh_addr_cache_build - Build a cache of I/O addresses
- *
- * Build a cache of pci i/o addresses.  This cache will be used to
- * find the pci device that corresponds to a given address.
- * This routine scans all pci busses to build the cache.
- * Must be run late in boot process, after the pci controllers
- * have been scanned for devices (after all device resources are known).
- */
-void eeh_addr_cache_build(void)
-{
-	struct pci_dn *pdn;
-	struct eeh_dev *edev;
-	struct pci_dev *dev = NULL;
-
-	for_each_pci_dev(dev) {
-		pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
-		if (!pdn)
-			continue;
-
-		edev = pdn_to_eeh_dev(pdn);
-		if (!edev)
-			continue;
-
-		dev->dev.archdata.edev = edev;
-		edev->pdev = dev;
-
-		eeh_addr_cache_insert_dev(dev);
-		eeh_sysfs_add_device(dev);
-	}
-}
-
 static int eeh_addr_cache_show(struct seq_file *s, void *v)
 {
 	struct pci_io_addr_range *piar;
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 00c0b81fd61d..6ed9b6ccafcc 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -229,9 +229,7 @@ int pnv_eeh_post_init(void)
 	struct pnv_phb *phb;
 	int ret = 0;
 
-	/* Probe devices & build address cache */
-	eeh_probe_devices();
-	eeh_addr_cache_build();
+	eeh_show_enabled();
 
 	/* Register OPAL event notifier */
 	eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR));
diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
index 1eae1d09980c..722830978639 100644
--- a/arch/powerpc/platforms/pseries/pci.c
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -229,8 +229,7 @@ void __init pSeries_final_fixup(void)
 
 	pSeries_request_regions();
 
-	eeh_probe_devices();
-	eeh_addr_cache_build();
+	eeh_show_enabled();
 
 #ifdef CONFIG_PCI_IOV
 	ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable;

From 7c33a994d32d89937b23673e7507e8ec1daad893 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Fri, 16 Aug 2019 14:48:11 +1000
Subject: [PATCH 070/264] powerpc/eeh: Add bdfn field to eeh_dev

Preparation for removing pci_dn from the powernv EEH code. The only
thing we really use pci_dn for is to get the bdfn of the device for
config space accesses, so adding that information to eeh_dev reduces
the need to carry around the pci_dn.
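
For readers unfamiliar with the encoding: bdfn is the standard PCI
bus/devfn packing, as in this minimal sketch (example values are
hypothetical; PCI_BUSNO() is the helper added by this patch):

  int bdfn = (0x01 << 8) | PCI_DEVFN(0x00, 1);	/* device 01:00.1 -> 0x0101 */
  int bus  = PCI_BUSNO(bdfn);			/* 0x01 */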

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
[SB: Re-wrapped commit message, fixed whitespace damage.]
Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/e458eb69a1f591d8a120782f23a8506b15d3c654.1565930772.git.sbobroff@linux.ibm.com
---
 arch/powerpc/include/asm/eeh.h     | 2 ++
 arch/powerpc/include/asm/ppc-pci.h | 2 ++
 arch/powerpc/kernel/eeh_dev.c      | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 1fc2b5e40822..294efeaf939e 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -121,6 +121,8 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe)
 struct eeh_dev {
 	int mode;			/* EEH mode			*/
 	int class_code;			/* Class code of the device	*/
+	int bdfn;			/* bdfn of device (for cfg ops) */
+	struct pci_controller *controller;
 	int pe_config_addr;		/* PE config address		*/
 	u32 config_space[16];		/* Saved PCI config space	*/
 	int pcix_cap;			/* Saved PCIx capability	*/
diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index cec2d6409515..72860de205a0 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -74,6 +74,8 @@ static inline const char *eeh_driver_name(struct pci_dev *pdev)
 
 #endif /* CONFIG_EEH */
 
+#define PCI_BUSNO(bdfn) ((bdfn >> 8) & 0xff)
+
 #else /* CONFIG_PCI */
 static inline void init_pci_config_tokens(void) { }
 #endif /* !CONFIG_PCI */
diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
index c4317c452d98..7370185c7a05 100644
--- a/arch/powerpc/kernel/eeh_dev.c
+++ b/arch/powerpc/kernel/eeh_dev.c
@@ -47,6 +47,8 @@ struct eeh_dev *eeh_dev_init(struct pci_dn *pdn)
 	/* Associate EEH device with OF node */
 	pdn->edev = edev;
 	edev->pdn = pdn;
+	edev->bdfn = (pdn->busno << 8) | pdn->devfn;
+	edev->controller = pdn->phb;
 
 	return edev;
 }

From b093f2cbedfbaba6bf1c520fbfcb46403f3c7802 Mon Sep 17 00:00:00 2001
From: Sam Bobroff <sbobroff@linux.ibm.com>
Date: Fri, 16 Aug 2019 14:48:12 +1000
Subject: [PATCH 071/264] powerpc/eeh: Introduce EEH edev logging macros

Now that struct eeh_dev includes the BDFN of its PCI device, make use
of it to replace eeh_edev_info() with a set of dev_dbg()-style macros
that only need a struct eeh_dev.

With the BDFN available without the struct pci_dev, eeh_pci_name() is
now unnecessary, so remove it.

While only the "info" level function is used here, the others will be
used in follow-up work.
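
For illustration, calls in the follow-up patch take the form:

  eeh_edev_dbg(edev, "Probing device\n");
  eeh_edev_err(edev, "Failed to add device to PE (code %d)\n", ret);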

Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/f90ae9a53d762be7b0ccbad79e62b5a1b4f4996e.1565930772.git.sbobroff@linux.ibm.com
---
 arch/powerpc/include/asm/eeh.h   | 11 +++++++++++
 arch/powerpc/kernel/eeh_driver.c | 17 -----------------
 2 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 294efeaf939e..56c1b4b0987a 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -138,6 +138,17 @@ struct eeh_dev {
 	struct pci_dev *physfn;		/* Associated SRIOV PF		*/
 };
 
+/* "fmt" must be a simple literal string */
+#define EEH_EDEV_PRINT(level, edev, fmt, ...) \
+	pr_##level("PCI %04x:%02x:%02x.%x#%04x: EEH: " fmt, \
+	(edev)->controller->global_number, PCI_BUSNO((edev)->bdfn), \
+	PCI_SLOT((edev)->bdfn), PCI_FUNC((edev)->bdfn), \
+	((edev)->pe ? (edev)->pe_config_addr : 0xffff), ##__VA_ARGS__)
+#define eeh_edev_dbg(edev, fmt, ...) EEH_EDEV_PRINT(debug, (edev), fmt, ##__VA_ARGS__)
+#define eeh_edev_info(edev, fmt, ...) EEH_EDEV_PRINT(info, (edev), fmt, ##__VA_ARGS__)
+#define eeh_edev_warn(edev, fmt, ...) EEH_EDEV_PRINT(warn, (edev), fmt, ##__VA_ARGS__)
+#define eeh_edev_err(edev, fmt, ...) EEH_EDEV_PRINT(err, (edev), fmt, ##__VA_ARGS__)
+
 static inline struct pci_dn *eeh_dev_to_pdn(struct eeh_dev *edev)
 {
 	return edev ? edev->pdn : NULL;
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 1fbe541856f5..ab576bcbe4dd 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -81,23 +81,6 @@ static const char *pci_ers_result_name(enum pci_ers_result result)
 	}
 };
 
-static __printf(2, 3) void eeh_edev_info(const struct eeh_dev *edev,
-					 const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-
-	va_start(args, fmt);
-
-	vaf.fmt = fmt;
-	vaf.va = &args;
-
-	printk(KERN_INFO "EEH: PE#%x (PCI %s): %pV\n", edev->pe_config_addr,
-	       edev->pdev ? dev_name(&edev->pdev->dev) : "none", &vaf);
-
-	va_end(args);
-}
-
 static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old,
 						enum pci_ers_result new)
 {

From 1ff8f36fc770dd2b3eb294312f270db8cf94cc13 Mon Sep 17 00:00:00 2001
From: Sam Bobroff <sbobroff@linux.ibm.com>
Date: Fri, 16 Aug 2019 14:48:13 +1000
Subject: [PATCH 072/264] powerpc/eeh: Convert log messages to eeh_edev_*
 macros

Convert existing messages, where appropriate, to use the eeh_edev_*
logging macros.

The only effect should be minor adjustments to the log messages, apart
from:

- A new "Probing device" message in pseries_eeh_probe(), to match the
powernv case.
- The "Probing device" message in pnv_eeh_probe() is now generated
slightly later, so it is no longer emitted for devices that the
initial checks reject.
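
With the macros, every message carries a common prefix built from the
edev; an illustrative (hypothetical) line, following the
EEH_EDEV_PRINT() format string, looks like:

  PCI 0000:01:00.0#00ff: EEH: Probing device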

Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/ce505a0a7a4a5b0367f0f40f8b26e7c0a9cf4cb7.1565930772.git.sbobroff@linux.ibm.com
---
 arch/powerpc/include/asm/ppc-pci.h           |  5 --
 arch/powerpc/kernel/eeh.c                    | 19 +++-----
 arch/powerpc/kernel/eeh_cache.c              |  8 +--
 arch/powerpc/kernel/eeh_driver.c             |  7 +--
 arch/powerpc/kernel/eeh_pe.c                 | 51 +++++---------------
 arch/powerpc/platforms/powernv/eeh-powernv.c | 17 ++-----
 arch/powerpc/platforms/pseries/eeh_pseries.c | 21 +++-----
 7 files changed, 39 insertions(+), 89 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index 72860de205a0..7f4be5a05eb3 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -62,11 +62,6 @@ void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode);
 void eeh_sysfs_add_device(struct pci_dev *pdev);
 void eeh_sysfs_remove_device(struct pci_dev *pdev);
 
-static inline const char *eeh_pci_name(struct pci_dev *pdev) 
-{ 
-	return pdev ? pci_name(pdev) : "<null>";
-} 
-
 static inline const char *eeh_driver_name(struct pci_dev *pdev)
 {
 	return (pdev && pdev->driver) ? pdev->driver->name : "<null>";
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index fc975342e242..958e03ca1db6 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -470,8 +470,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 	/* Access to IO BARs might get this far and still not want checking. */
 	if (!pe) {
 		eeh_stats.ignored_check++;
-		pr_debug("EEH: Ignored check for %s\n",
-			eeh_pci_name(dev));
+		eeh_edev_dbg(edev, "Ignored check\n");
 		return 0;
 	}
 
@@ -511,12 +510,11 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 			if (dn)
 				location = of_get_property(dn, "ibm,loc-code",
 						NULL);
-			printk(KERN_ERR "EEH: %d reads ignored for recovering device at "
-				"location=%s driver=%s pci addr=%s\n",
+			eeh_edev_err(edev, "%d reads ignored for recovering device at location=%s driver=%s\n",
 				pe->check_count,
 				location ? location : "unknown",
-				eeh_driver_name(dev), eeh_pci_name(dev));
-			printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n",
+				eeh_driver_name(dev));
+			eeh_edev_err(edev, "Might be infinite loop in %s driver\n",
 				eeh_driver_name(dev));
 			dump_stack();
 		}
@@ -1188,12 +1186,11 @@ void eeh_add_device_late(struct pci_dev *dev)
 	if (!dev)
 		return;
 
-	pr_debug("EEH: Adding device %s\n", pci_name(dev));
-
 	pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
 	edev = pdn_to_eeh_dev(pdn);
+	eeh_edev_dbg(edev, "Adding device\n");
 	if (edev->pdev == dev) {
-		pr_debug("EEH: Device %s already referenced!\n", pci_name(dev));
+		eeh_edev_dbg(edev, "Device already referenced!\n");
 		return;
 	}
 
@@ -1296,10 +1293,10 @@ void eeh_remove_device(struct pci_dev *dev)
 	edev = pci_dev_to_eeh_dev(dev);
 
 	/* Unregister the device with the EEH/PCI address search system */
-	pr_debug("EEH: Removing device %s\n", pci_name(dev));
+	dev_dbg(&dev->dev, "EEH: Removing device\n");
 
 	if (!edev || !edev->pdev || !edev->pe) {
-		pr_debug("EEH: Not referenced !\n");
+		dev_dbg(&dev->dev, "EEH: Device not referenced!\n");
 		return;
 	}
 
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index 1e47cd04a1bd..cf11277ebd02 100644
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -148,8 +148,8 @@ eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo,
 	piar->pcidev = dev;
 	piar->flags = flags;
 
-	pr_debug("PIAR: insert range=[%pap:%pap] dev=%s\n",
-		 &alo, &ahi, pci_name(dev));
+	eeh_edev_dbg(piar->edev, "PIAR: insert range=[%pap:%pap]\n",
+		 &alo, &ahi);
 
 	rb_link_node(&piar->rb_node, parent, p);
 	rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
@@ -229,8 +229,8 @@ restart:
 		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
 
 		if (piar->pcidev == dev) {
-			pr_debug("PIAR: remove range=[%pap:%pap] dev=%s\n",
-				 &piar->addr_lo, &piar->addr_hi, pci_name(dev));
+			eeh_edev_dbg(piar->edev, "PIAR: remove range=[%pap:%pap]\n",
+				 &piar->addr_lo, &piar->addr_hi);
 			rb_erase(n, &pci_io_addr_cache_root.rb_root);
 			kfree(piar);
 			goto restart;
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index ab576bcbe4dd..274075a814b6 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -456,12 +456,9 @@ static void *eeh_add_virt_device(struct eeh_dev *edev)
 {
 	struct pci_driver *driver;
 	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
-	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
 
 	if (!(edev->physfn)) {
-		pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n",
-			__func__, pdn->phb->global_number, pdn->busno,
-			PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+		eeh_edev_warn(edev, "Not for VF\n");
 		return NULL;
 	}
 
@@ -475,7 +472,7 @@ static void *eeh_add_virt_device(struct eeh_dev *edev)
 	}
 
 #ifdef CONFIG_PCI_IOV
-	pci_iov_add_virtfn(edev->physfn, pdn->vf_index);
+	pci_iov_add_virtfn(edev->physfn, eeh_dev_to_pdn(edev)->vf_index);
 #endif
 	return NULL;
 }
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 854cef7b18f4..317a31624526 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -379,8 +379,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 
 	/* Check if the PE number is valid */
 	if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) {
-		pr_err("%s: Invalid PE#0 for edev 0x%x on PHB#%x\n",
-		       __func__, config_addr, pdn->phb->global_number);
+		eeh_edev_err(edev, "PE#0 is invalid for this PHB!\n");
 		return -EINVAL;
 	}
 
@@ -398,12 +397,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 
 		/* Put the edev to PE */
 		list_add_tail(&edev->entry, &pe->edevs);
-		pr_debug("EEH: Add %04x:%02x:%02x.%01x to Bus PE#%x\n",
-			 pdn->phb->global_number,
-			 pdn->busno,
-			 PCI_SLOT(pdn->devfn),
-			 PCI_FUNC(pdn->devfn),
-			 pe->addr);
+		eeh_edev_dbg(edev, "Added to bus PE\n");
 		return 0;
 	} else if (pe && (pe->type & EEH_PE_INVALID)) {
 		list_add_tail(&edev->entry, &pe->edevs);
@@ -420,13 +414,8 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 			parent = parent->parent;
 		}
 
-		pr_debug("EEH: Add %04x:%02x:%02x.%01x to Device "
-			 "PE#%x, Parent PE#%x\n",
-			 pdn->phb->global_number,
-			 pdn->busno,
-			 PCI_SLOT(pdn->devfn),
-			 PCI_FUNC(pdn->devfn),
-			 pe->addr, pe->parent->addr);
+		eeh_edev_dbg(edev, "Added to device PE (parent: PE#%x)\n",
+			     pe->parent->addr);
 		return 0;
 	}
 
@@ -468,13 +457,8 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 	list_add_tail(&pe->child, &parent->child_list);
 	list_add_tail(&edev->entry, &pe->edevs);
 	edev->pe = pe;
-	pr_debug("EEH: Add %04x:%02x:%02x.%01x to "
-		 "Device PE#%x, Parent PE#%x\n",
-		 pdn->phb->global_number,
-		 pdn->busno,
-		 PCI_SLOT(pdn->devfn),
-		 PCI_FUNC(pdn->devfn),
-		 pe->addr, pe->parent->addr);
+	eeh_edev_dbg(edev, "Added to device PE (parent: PE#%x)\n",
+		     pe->parent->addr);
 
 	return 0;
 }
@@ -492,15 +476,10 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
 {
 	struct eeh_pe *pe, *parent, *child;
 	int cnt;
-	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
 
 	pe = eeh_dev_to_pe(edev);
 	if (!pe) {
-		pr_debug("%s: No PE found for device %04x:%02x:%02x.%01x\n",
-			 __func__,  pdn->phb->global_number,
-			 pdn->busno,
-			 PCI_SLOT(pdn->devfn),
-			 PCI_FUNC(pdn->devfn));
+		eeh_edev_dbg(edev, "No PE found for device.\n");
 		return -EEXIST;
 	}
 
@@ -717,17 +696,13 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
 	if (!(edev->mode & (EEH_DEV_ROOT_PORT | EEH_DEV_DS_PORT)))
 		return;
 
-	pr_debug("%s: Check PCIe link for %04x:%02x:%02x.%01x ...\n",
-		 __func__, pdn->phb->global_number,
-		 pdn->busno,
-		 PCI_SLOT(pdn->devfn),
-		 PCI_FUNC(pdn->devfn));
+	eeh_edev_dbg(edev, "Checking PCIe link...\n");
 
 	/* Check slot status */
 	cap = edev->pcie_cap;
 	eeh_ops->read_config(pdn, cap + PCI_EXP_SLTSTA, 2, &val);
 	if (!(val & PCI_EXP_SLTSTA_PDS)) {
-		pr_debug("  No card in the slot (0x%04x) !\n", val);
+		eeh_edev_dbg(edev, "No card in the slot (0x%04x) !\n", val);
 		return;
 	}
 
@@ -736,7 +711,7 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
 	if (val & PCI_EXP_SLTCAP_PCP) {
 		eeh_ops->read_config(pdn, cap + PCI_EXP_SLTCTL, 2, &val);
 		if (val & PCI_EXP_SLTCTL_PCC) {
-			pr_debug("  In power-off state, power it on ...\n");
+			eeh_edev_dbg(edev, "In power-off state, power it on ...\n");
 			val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC);
 			val |= (0x0100 & PCI_EXP_SLTCTL_PIC);
 			eeh_ops->write_config(pdn, cap + PCI_EXP_SLTCTL, 2, val);
@@ -752,7 +727,7 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
 	/* Check link */
 	eeh_ops->read_config(pdn, cap + PCI_EXP_LNKCAP, 4, &val);
 	if (!(val & PCI_EXP_LNKCAP_DLLLARC)) {
-		pr_debug("  No link reporting capability (0x%08x) \n", val);
+		eeh_edev_dbg(edev, "No link reporting capability (0x%08x) \n", val);
 		msleep(1000);
 		return;
 	}
@@ -769,10 +744,10 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
 	}
 
 	if (val & PCI_EXP_LNKSTA_DLLLA)
-		pr_debug("  Link up (%s)\n",
+		eeh_edev_dbg(edev, "Link up (%s)\n",
 			 (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? "2.5GB" : "5GB");
 	else
-		pr_debug("  Link not ready (0x%04x)\n", val);
+		eeh_edev_dbg(edev, "Link not ready (0x%04x)\n", val);
 }
 
 #define BYTE_SWAP(OFF)	(8*((OFF)/4)+3-(OFF))
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 6ed9b6ccafcc..e7b867912f24 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -44,7 +44,7 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
 	if (eeh_has_flag(EEH_FORCE_DISABLED))
 		return;
 
-	pr_debug("%s: EEH: Setting up device %s.\n", __func__, pci_name(pdev));
+	dev_dbg(&pdev->dev, "EEH: Setting up device\n");
 	eeh_add_device_early(pdn);
 	eeh_add_device_late(pdev);
 	eeh_sysfs_add_device(pdev);
@@ -370,10 +370,6 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
 	int ret;
 	int config_addr = (pdn->busno << 8) | (pdn->devfn);
 
-	pr_debug("%s: probing %04x:%02x:%02x.%01x\n",
-		__func__, hose->global_number, pdn->busno,
-		PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
-
 	/*
 	 * When probing the root bridge, which doesn't have any
 	 * subordinate PCI devices. We don't have OF node for
@@ -387,6 +383,8 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
 	if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
 		return NULL;
 
+	eeh_edev_dbg(edev, "Probing device\n");
+
 	/* Initialize eeh device */
 	edev->class_code = pdn->class_code;
 	edev->mode	&= 0xFFFFFF00;
@@ -412,9 +410,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
 	/* Create PE */
 	ret = eeh_add_to_parent_pe(edev);
 	if (ret) {
-		pr_warn("%s: Can't add PCI dev %04x:%02x:%02x.%01x to parent PE (%x)\n",
-			__func__, hose->global_number, pdn->busno,
-			PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn), ret);
+		eeh_edev_warn(edev, "Failed to add device to PE (code %d)\n", ret);
 		return NULL;
 	}
 
@@ -472,10 +468,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
 	/* Save memory bars */
 	eeh_save_bars(edev);
 
-	pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n",
-		__func__, pdn->busno, PCI_SLOT(pdn->devfn),
-		PCI_FUNC(pdn->devfn), edev->pe->phb->global_number,
-		edev->pe->addr);
+	eeh_edev_dbg(edev, "EEH enabled on device\n");
 
 	return NULL;
 }
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 02454c737d11..893ba3f562c4 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -49,7 +49,7 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
 	if (eeh_has_flag(EEH_FORCE_DISABLED))
 		return;
 
-	pr_debug("%s: EEH: Setting up device %s.\n", __func__, pci_name(pdev));
+	dev_dbg(&pdev->dev, "EEH: Setting up device\n");
 #ifdef CONFIG_PCI_IOV
 	if (pdev->is_virtfn) {
 		struct pci_dn *physfn_pdn;
@@ -238,10 +238,6 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
 	int enable = 0;
 	int ret;
 
-	pr_debug("%s: probing %04x:%02x:%02x.%01x\n",
-		__func__, pdn->phb->global_number, pdn->busno,
-		PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
-
 	/* Retrieve OF node and eeh device */
 	edev = pdn_to_eeh_dev(pdn);
 	if (!edev || edev->pe)
@@ -255,6 +251,8 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
         if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
 		return NULL;
 
+	eeh_edev_dbg(edev, "Probing device\n");
+
 	/*
 	 * Update class code and mode of eeh device. We need
 	 * correctly reflects that current device is root port
@@ -284,12 +282,10 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
 	pe.config_addr = (pdn->busno << 16) | (pdn->devfn << 8);
 
 	/* Enable EEH on the device */
+	eeh_edev_dbg(edev, "Enabling EEH on device\n");
 	ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE);
 	if (ret) {
-		pr_debug("%s: EEH failed to enable on %02x:%02x.%01x PHB#%x-PE#%x (code %d)\n",
-			__func__, pdn->busno, PCI_SLOT(pdn->devfn),
-			PCI_FUNC(pdn->devfn), pe.phb->global_number,
-			pe.addr, ret);
+		eeh_edev_dbg(edev, "EEH failed to enable on device (code %d)\n", ret);
 	} else {
 		/* Retrieve PE address */
 		edev->pe_config_addr = eeh_ops->get_pe_addr(&pe);
@@ -314,11 +310,8 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
 			edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr;
 			eeh_add_to_parent_pe(edev);
 		}
-		pr_debug("%s: EEH %s on %02x:%02x.%01x PHB#%x-PE#%x (code %d)\n",
-			__func__, (enable ? "enabled" : "unsupported"),
-			pdn->busno, PCI_SLOT(pdn->devfn),
-			PCI_FUNC(pdn->devfn), pe.phb->global_number,
-			pe.addr, ret);
+		eeh_edev_dbg(edev, "EEH is %s on device (code %d)\n",
+			     (enable ? "enabled" : "unsupported"), ret);
 	}
 
 	/* Save memory bars */

From 2e25505147b8acf6510b9d5d951fd4c75f2e9bf2 Mon Sep 17 00:00:00 2001
From: Sam Bobroff <sbobroff@linux.ibm.com>
Date: Fri, 16 Aug 2019 14:48:14 +1000
Subject: [PATCH 073/264] powerpc/eeh: Fix crash when edev->pdev changes

If a PCI device is removed during eeh_pe_report_edev(), between the
calls to device_lock() and device_unlock(), edev->pdev will change and
cause a crash as the wrong mutex is released.

To correct this, hold the PCI rescan/remove lock while taking a copy
of edev->pdev and performing a get_device() on it.  Use this value to
release the mutex, but also pass it through to the device driver's EEH
handlers so that they always see the same device.
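
In outline, the reporting path now pins the device first (a simplified
sketch of the change below):

  pci_lock_rescan_remove();
  pdev = edev->pdev;
  if (pdev)
  	get_device(&pdev->dev);
  pci_unlock_rescan_remove();
  if (!pdev)
  	return;

  device_lock(&pdev->dev);
  /* ... run the driver's EEH handler against pdev ... */
  device_unlock(&pdev->dev);
  put_device(&pdev->dev);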

Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/3c590579a0faa24d20c826dcd26c739eb4d454e6.1565930772.git.sbobroff@linux.ibm.com
---
 arch/powerpc/kernel/eeh_driver.c | 44 +++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 274075a814b6..e817d78fe52d 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -257,20 +257,27 @@ static void eeh_set_irq_state(struct eeh_pe *root, bool enable)
 }
 
 typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *,
+					     struct pci_dev *,
 					     struct pci_driver *);
 static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
 			       enum pci_ers_result *result)
 {
+	struct pci_dev *pdev;
 	struct pci_driver *driver;
 	enum pci_ers_result new_result;
 
-	if (!edev->pdev) {
+	pci_lock_rescan_remove();
+	pdev = edev->pdev;
+	if (pdev)
+		get_device(&pdev->dev);
+	pci_unlock_rescan_remove();
+	if (!pdev) {
 		eeh_edev_info(edev, "no device");
 		return;
 	}
-	device_lock(&edev->pdev->dev);
+	device_lock(&pdev->dev);
 	if (eeh_edev_actionable(edev)) {
-		driver = eeh_pcid_get(edev->pdev);
+		driver = eeh_pcid_get(pdev);
 
 		if (!driver)
 			eeh_edev_info(edev, "no driver");
@@ -279,7 +286,7 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
 		else if (edev->mode & EEH_DEV_NO_HANDLER)
 			eeh_edev_info(edev, "driver bound too late");
 		else {
-			new_result = fn(edev, driver);
+			new_result = fn(edev, pdev, driver);
 			eeh_edev_info(edev, "%s driver reports: '%s'",
 				      driver->name,
 				      pci_ers_result_name(new_result));
@@ -288,12 +295,15 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
 							       new_result);
 		}
 		if (driver)
-			eeh_pcid_put(edev->pdev);
+			eeh_pcid_put(pdev);
 	} else {
-		eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!edev->pdev,
+		eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!pdev,
 			      !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe));
 	}
-	device_unlock(&edev->pdev->dev);
+	device_unlock(&pdev->dev);
+	if (edev->pdev != pdev)
+		eeh_edev_warn(edev, "Device changed during processing!\n");
+	put_device(&pdev->dev);
 }
 
 static void eeh_pe_report(const char *name, struct eeh_pe *root,
@@ -320,20 +330,20 @@ static void eeh_pe_report(const char *name, struct eeh_pe *root,
  * Report an EEH error to each device driver.
  */
 static enum pci_ers_result eeh_report_error(struct eeh_dev *edev,
+					    struct pci_dev *pdev,
 					    struct pci_driver *driver)
 {
 	enum pci_ers_result rc;
-	struct pci_dev *dev = edev->pdev;
 
 	if (!driver->err_handler->error_detected)
 		return PCI_ERS_RESULT_NONE;
 
 	eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)",
 		      driver->name);
-	rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
+	rc = driver->err_handler->error_detected(pdev, pci_channel_io_frozen);
 
 	edev->in_error = true;
-	pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
+	pci_uevent_ers(pdev, PCI_ERS_RESULT_NONE);
 	return rc;
 }
 
@@ -346,12 +356,13 @@ static enum pci_ers_result eeh_report_error(struct eeh_dev *edev,
  * are now enabled.
  */
 static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev,
+						   struct pci_dev *pdev,
 						   struct pci_driver *driver)
 {
 	if (!driver->err_handler->mmio_enabled)
 		return PCI_ERS_RESULT_NONE;
 	eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name);
-	return driver->err_handler->mmio_enabled(edev->pdev);
+	return driver->err_handler->mmio_enabled(pdev);
 }
 
 /**
@@ -365,12 +376,13 @@ static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev,
  * driver can work again while the device is recovered.
  */
 static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev,
+					    struct pci_dev *pdev,
 					    struct pci_driver *driver)
 {
 	if (!driver->err_handler->slot_reset || !edev->in_error)
 		return PCI_ERS_RESULT_NONE;
 	eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name);
-	return driver->err_handler->slot_reset(edev->pdev);
+	return driver->err_handler->slot_reset(pdev);
 }
 
 static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
@@ -411,13 +423,14 @@ static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
  * to make the recovered device work again.
  */
 static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev,
+					     struct pci_dev *pdev,
 					     struct pci_driver *driver)
 {
 	if (!driver->err_handler->resume || !edev->in_error)
 		return PCI_ERS_RESULT_NONE;
 
 	eeh_edev_info(edev, "Invoking %s->resume()", driver->name);
-	driver->err_handler->resume(edev->pdev);
+	driver->err_handler->resume(pdev);
 
 	pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED);
 #ifdef CONFIG_PCI_IOV
@@ -436,6 +449,7 @@ static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev,
  * dead, and that no further recovery attempts will be made on it.
  */
 static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev,
+					      struct pci_dev *pdev,
 					      struct pci_driver *driver)
 {
 	enum pci_ers_result rc;
@@ -445,10 +459,10 @@ static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev,
 
 	eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)",
 		      driver->name);
-	rc = driver->err_handler->error_detected(edev->pdev,
+	rc = driver->err_handler->error_detected(pdev,
 						 pci_channel_io_perm_failure);
 
-	pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_DISCONNECT);
+	pci_uevent_ers(pdev, PCI_ERS_RESULT_DISCONNECT);
 	return rc;
 }
 

From cef50c67c1d511bbbc974cead2bebeb6f83730ce Mon Sep 17 00:00:00 2001
From: Sam Bobroff <sbobroff@linux.ibm.com>
Date: Fri, 16 Aug 2019 14:48:15 +1000
Subject: [PATCH 074/264] powerpc/eeh: Remove unused return path from
 eeh_pe_dev_traverse()

There are no users of the early-out return value from
eeh_pe_dev_traverse(), so remove it.
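
Traversal callbacks are now plain void functions; a minimal
(hypothetical) example of the new shape:

  static void count_edevs(struct eeh_dev *edev, void *flag)
  {
  	int *count = flag;

  	(*count)++;
  }

  /* usage */
  int count = 0;
  eeh_pe_dev_traverse(root, count_edevs, &count);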

Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/c648070f5b28fe8ca1880b48e64b267959ffd369.1565930772.git.sbobroff@linux.ibm.com
---
 arch/powerpc/include/asm/eeh.h   |  6 +++---
 arch/powerpc/kernel/eeh.c        | 16 +++++-----------
 arch/powerpc/kernel/eeh_driver.c | 26 +++++++++++---------------
 arch/powerpc/kernel/eeh_pe.c     | 25 +++++++------------------
 4 files changed, 26 insertions(+), 47 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 56c1b4b0987a..c13119a5e69b 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -260,7 +260,7 @@ static inline bool eeh_state_active(int state)
 	== (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
 }
 
-typedef void *(*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag);
+typedef void (*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag);
 typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag);
 void eeh_set_pe_aux_size(int size);
 int eeh_phb_pe_create(struct pci_controller *phb);
@@ -274,8 +274,8 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev);
 void eeh_pe_update_time_stamp(struct eeh_pe *pe);
 void *eeh_pe_traverse(struct eeh_pe *root,
 		      eeh_pe_traverse_func fn, void *flag);
-void *eeh_pe_dev_traverse(struct eeh_pe *root,
-			  eeh_edev_traverse_func fn, void *flag);
+void eeh_pe_dev_traverse(struct eeh_pe *root,
+			 eeh_edev_traverse_func fn, void *flag);
 void eeh_pe_restore_bars(struct eeh_pe *pe);
 const char *eeh_pe_loc_get(struct eeh_pe *pe);
 struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 958e03ca1db6..7b2755f5c6fd 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -705,7 +705,7 @@ int eeh_pci_enable(struct eeh_pe *pe, int function)
 	return rc;
 }
 
-static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev,
+static void eeh_disable_and_save_dev_state(struct eeh_dev *edev,
 					    void *userdata)
 {
 	struct pci_dev *pdev = eeh_dev_to_pci_dev(edev);
@@ -716,7 +716,7 @@ static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev,
 	 * state for the specified device
 	 */
 	if (!pdev || pdev == dev)
-		return NULL;
+		return;
 
 	/* Ensure we have D0 power state */
 	pci_set_power_state(pdev, PCI_D0);
@@ -729,18 +729,16 @@ static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev,
 	 * interrupt from the device
 	 */
 	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
-
-	return NULL;
 }
 
-static void *eeh_restore_dev_state(struct eeh_dev *edev, void *userdata)
+static void eeh_restore_dev_state(struct eeh_dev *edev, void *userdata)
 {
 	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
 	struct pci_dev *pdev = eeh_dev_to_pci_dev(edev);
 	struct pci_dev *dev = userdata;
 
 	if (!pdev)
-		return NULL;
+		return;
 
 	/* Apply customization from firmware */
 	if (pdn && eeh_ops->restore_config)
@@ -749,8 +747,6 @@ static void *eeh_restore_dev_state(struct eeh_dev *edev, void *userdata)
 	/* The caller should restore state for the specified device */
 	if (pdev != dev)
 		pci_restore_state(pdev);
-
-	return NULL;
 }
 
 int eeh_restore_vf_config(struct pci_dn *pdn)
@@ -876,7 +872,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
  * the indicated device and its children so that the bunch of the
  * devices could be reset properly.
  */
-static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
+static void eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
 {
 	struct pci_dev *dev;
 	unsigned int *freset = (unsigned int *)flag;
@@ -884,8 +880,6 @@ static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
 	dev = eeh_dev_to_pci_dev(edev);
 	if (dev)
 		*freset |= dev->needs_freset;
-
-	return NULL;
 }
 
 static void eeh_pe_refreeze_passed(struct eeh_pe *root)
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index e817d78fe52d..a31cd32c4ce9 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -197,12 +197,12 @@ static void eeh_enable_irq(struct eeh_dev *edev)
 	}
 }
 
-static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
+static void eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
 {
 	struct pci_dev *pdev;
 
 	if (!edev)
-		return NULL;
+		return;
 
 	/*
 	 * We cannot access the config space on some adapters.
@@ -212,14 +212,13 @@ static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
 	 * device is created.
 	 */
 	if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED))
-		return NULL;
+		return;
 
 	pdev = eeh_dev_to_pci_dev(edev);
 	if (!pdev)
-		return NULL;
+		return;
 
 	pci_save_state(pdev);
-	return NULL;
 }
 
 static void eeh_set_channel_state(struct eeh_pe *root, enum pci_channel_state s)
@@ -385,12 +384,12 @@ static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev,
 	return driver->err_handler->slot_reset(pdev);
 }
 
-static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
+static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
 {
 	struct pci_dev *pdev;
 
 	if (!edev)
-		return NULL;
+		return;
 
 	/*
 	 * The content in the config space isn't saved because
@@ -402,15 +401,14 @@ static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
 		if (list_is_last(&edev->entry, &edev->pe->edevs))
 			eeh_pe_restore_bars(edev->pe);
 
-		return NULL;
+		return;
 	}
 
 	pdev = eeh_dev_to_pci_dev(edev);
 	if (!pdev)
-		return NULL;
+		return;
 
 	pci_restore_state(pdev);
-	return NULL;
 }
 
 /**
@@ -491,7 +489,7 @@ static void *eeh_add_virt_device(struct eeh_dev *edev)
 	return NULL;
 }
 
-static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
+static void eeh_rmv_device(struct eeh_dev *edev, void *userdata)
 {
 	struct pci_driver *driver;
 	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
@@ -506,7 +504,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
 	 */
 	if (!eeh_edev_actionable(edev) ||
 	    (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE))
-		return NULL;
+		return;
 
 	if (rmv_data) {
 		driver = eeh_pcid_get(dev);
@@ -515,7 +513,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
 			    driver->err_handler->error_detected &&
 			    driver->err_handler->slot_reset) {
 				eeh_pcid_put(dev);
-				return NULL;
+				return;
 			}
 			eeh_pcid_put(dev);
 		}
@@ -548,8 +546,6 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
 		pci_stop_and_remove_bus_device(dev);
 		pci_unlock_rescan_remove();
 	}
-
-	return NULL;
 }
 
 static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata)
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 317a31624526..9c38fa7c33aa 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -231,29 +231,22 @@ void *eeh_pe_traverse(struct eeh_pe *root,
  * The function is used to traverse the devices of the specified
  * PE and its child PEs.
  */
-void *eeh_pe_dev_traverse(struct eeh_pe *root,
+void eeh_pe_dev_traverse(struct eeh_pe *root,
 			  eeh_edev_traverse_func fn, void *flag)
 {
 	struct eeh_pe *pe;
 	struct eeh_dev *edev, *tmp;
-	void *ret;
 
 	if (!root) {
 		pr_warn("%s: Invalid PE %p\n",
 			__func__, root);
-		return NULL;
+		return;
 	}
 
 	/* Traverse root PE */
-	eeh_for_each_pe(root, pe) {
-		eeh_pe_for_each_dev(pe, edev, tmp) {
-			ret = fn(edev, flag);
-			if (ret)
-				return ret;
-		}
-	}
-
-	return NULL;
+	eeh_for_each_pe(root, pe)
+		eeh_pe_for_each_dev(pe, edev, tmp)
+			fn(edev, flag);
 }
 
 /**
@@ -602,13 +595,11 @@ void eeh_pe_mark_isolated(struct eeh_pe *root)
 }
 EXPORT_SYMBOL_GPL(eeh_pe_mark_isolated);
 
-static void *__eeh_pe_dev_mode_mark(struct eeh_dev *edev, void *flag)
+static void __eeh_pe_dev_mode_mark(struct eeh_dev *edev, void *flag)
 {
 	int mode = *((int *)flag);
 
 	edev->mode |= mode;
-
-	return NULL;
 }
 
 /**
@@ -827,7 +818,7 @@ static void eeh_restore_device_bars(struct eeh_dev *edev)
  * the expansion ROM base address, the latency timer, and etc.
  * from the saved values in the device node.
  */
-static void *eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag)
+static void eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag)
 {
 	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
 
@@ -839,8 +830,6 @@ static void *eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag)
 
 	if (eeh_ops->restore_config && pdn)
 		eeh_ops->restore_config(pdn);
-
-	return NULL;
 }
 
 /**

From 27d4396ed5a1c70f894557fd5732725eb1d94542 Mon Sep 17 00:00:00 2001
From: Sam Bobroff <sbobroff@linux.ibm.com>
Date: Fri, 16 Aug 2019 14:48:16 +1000
Subject: [PATCH 075/264] powerpc/eeh: Slightly simplify eeh_add_to_parent_pe()

Simplify some needlessly complicated boolean logic in
eeh_add_to_parent_pe().
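
Schematically (A and B stand for the existing branch bodies), the
change is:

  /* before */
  if (pe && !(pe->type & EEH_PE_INVALID)) {
  	/* A: reuse as a bus PE */
  	return 0;
  } else if (pe && (pe->type & EEH_PE_INVALID)) {
  	/* B: revive an invalidated PE */
  	return 0;
  }

  /* after */
  if (pe) {
  	if (pe->type & EEH_PE_INVALID) {
  		/* B */
  	} else {
  		/* A */
  	}
  	return 0;
  }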

Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/09259a50308f10aa764695912bc87dc1d1cf654c.1565930772.git.sbobroff@linux.ibm.com
---
 arch/powerpc/kernel/eeh_pe.c | 50 +++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 9c38fa7c33aa..1a6254bcf056 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -383,32 +383,34 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 	 * components.
 	 */
 	pe = eeh_pe_get(pdn->phb, edev->pe_config_addr, config_addr);
-	if (pe && !(pe->type & EEH_PE_INVALID)) {
-		/* Mark the PE as type of PCI bus */
-		pe->type = EEH_PE_BUS;
-		edev->pe = pe;
+	if (pe) {
+		if (pe->type & EEH_PE_INVALID) {
+			list_add_tail(&edev->entry, &pe->edevs);
+			edev->pe = pe;
+			/*
+			 * We're here because of a PCI hotplug caused by EEH recovery,
+			 * so we need to clear EEH_PE_INVALID all the way up to the top.
+			 */
+			parent = pe;
+			while (parent) {
+				if (!(parent->type & EEH_PE_INVALID))
+					break;
+				parent->type &= ~EEH_PE_INVALID;
+				parent = parent->parent;
+			}
 
-		/* Put the edev to PE */
-		list_add_tail(&edev->entry, &pe->edevs);
-		eeh_edev_dbg(edev, "Added to bus PE\n");
-		return 0;
-	} else if (pe && (pe->type & EEH_PE_INVALID)) {
-		list_add_tail(&edev->entry, &pe->edevs);
-		edev->pe = pe;
-		/*
-		 * We're running to here because of PCI hotplug caused by
-		 * EEH recovery. We need clear EEH_PE_INVALID until the top.
-		 */
-		parent = pe;
-		while (parent) {
-			if (!(parent->type & EEH_PE_INVALID))
-				break;
-			parent->type &= ~EEH_PE_INVALID;
-			parent = parent->parent;
+			eeh_edev_dbg(edev,
+				     "Added to device PE (parent: PE#%x)\n",
+				     pe->parent->addr);
+		} else {
+			/* Mark the PE as type of PCI bus */
+			pe->type = EEH_PE_BUS;
+			edev->pe = pe;
+
+			/* Put the edev to PE */
+			list_add_tail(&edev->entry, &pe->edevs);
+			eeh_edev_dbg(edev, "Added to bus PE\n");
 		}
-
-		eeh_edev_dbg(edev, "Added to device PE (parent: PE#%x)\n",
-			     pe->parent->addr);
 		return 0;
 	}
 

From 6278f55ba5ed20f486ea2049da0cf9a9c1d9a5d5 Mon Sep 17 00:00:00 2001
From: Gustavo Romero <gromero@linux.ibm.com>
Date: Wed, 14 Aug 2019 15:56:37 -0500
Subject: [PATCH 076/264] powerpc: Document xmon options

Document all options currently supported by the xmon debugger.
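
For example, per the documentation added below, booting with:

  xmon=on

on the kernel command line installs the debugger hooks so xmon is only
entered on a kernel crash; passing a bare "xmon" is equivalent to
"xmon=early".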

Signed-off-by: Gustavo Romero <gromero@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190814205638.25322-1-gromero@linux.ibm.com
---
 .../admin-guide/kernel-parameters.txt         | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 7ccd158b3894..6d495aab4d0b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5305,3 +5305,22 @@
 			A hex value specifying bitmask with supplemental xhci
 			host controller quirks. Meaning of each bit can be
 			consulted in header drivers/usb/host/xhci.h.
+
+	xmon		[PPC]
+			Format: { early | on | rw | ro | off }
+			Controls if xmon debugger is enabled. Default is off.
+			Passing only "xmon" is equivalent to "xmon=early".
+			early	Call xmon as early as possible on boot; xmon
+				debugger is called from setup_arch().
+			on	xmon debugger hooks will be installed so xmon
+				is only called on a kernel crash. Default mode,
+				i.e. either "ro" or "rw" mode, is controlled
+				with CONFIG_XMON_DEFAULT_RO_MODE.
+			rw	xmon debugger hooks will be installed so xmon
+				is called only on a kernel crash, mode is write,
+				meaning SPR registers, memory, and other data
+				can be written using xmon commands.
+			ro	same as "rw" option above, but SPR registers,
+				memory, and other data can't be written using
+				xmon commands.
+			off	xmon is disabled.

From b8baa05a0e505f8311fb03f5c72e75fa0e13fd95 Mon Sep 17 00:00:00 2001
From: Gustavo Romero <gromero@linux.ibm.com>
Date: Wed, 14 Aug 2019 15:56:38 -0500
Subject: [PATCH 077/264] selftests/powerpc: Ignore generated files

Currently some binary files generated when the tests are compiled are
not ignored by git, so 'git status' catches them.

For the copyloops test, fix wrong binary names already in .gitignore.
For the ptrace, security, and stringloops tests, add missing binary
names to the .gitignore file.
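
As an illustrative check (commands are examples, not from the patch):

  $ make -C tools/testing/selftests/powerpc
  $ git status --short    # should list no untracked test binaries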

Signed-off-by: Gustavo Romero <gromero@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190814205638.25322-2-gromero@linux.ibm.com
---
 tools/testing/selftests/powerpc/copyloops/.gitignore   | 8 ++++----
 tools/testing/selftests/powerpc/ptrace/.gitignore      | 3 +++
 tools/testing/selftests/powerpc/security/.gitignore    | 1 +
 tools/testing/selftests/powerpc/stringloops/.gitignore | 5 ++++-
 4 files changed, 12 insertions(+), 5 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/security/.gitignore

diff --git a/tools/testing/selftests/powerpc/copyloops/.gitignore b/tools/testing/selftests/powerpc/copyloops/.gitignore
index ce12cd0e2967..de158104912a 100644
--- a/tools/testing/selftests/powerpc/copyloops/.gitignore
+++ b/tools/testing/selftests/powerpc/copyloops/.gitignore
@@ -1,13 +1,13 @@
 copyuser_64_t0
 copyuser_64_t1
 copyuser_64_t2
-copyuser_power7_t0
-copyuser_power7_t1
+copyuser_p7_t0
+copyuser_p7_t1
 memcpy_64_t0
 memcpy_64_t1
 memcpy_64_t2
-memcpy_power7_t0
-memcpy_power7_t1
+memcpy_p7_t0
+memcpy_p7_t1
 copyuser_64_exc_t0
 copyuser_64_exc_t1
 copyuser_64_exc_t2
diff --git a/tools/testing/selftests/powerpc/ptrace/.gitignore b/tools/testing/selftests/powerpc/ptrace/.gitignore
index 07ec449a2767..dce19f221c46 100644
--- a/tools/testing/selftests/powerpc/ptrace/.gitignore
+++ b/tools/testing/selftests/powerpc/ptrace/.gitignore
@@ -10,3 +10,6 @@ ptrace-tm-spd-vsx
 ptrace-tm-spr
 ptrace-hwbreak
 perf-hwbreak
+core-pkey
+ptrace-pkey
+ptrace-syscall
diff --git a/tools/testing/selftests/powerpc/security/.gitignore b/tools/testing/selftests/powerpc/security/.gitignore
new file mode 100644
index 000000000000..0b969fba3beb
--- /dev/null
+++ b/tools/testing/selftests/powerpc/security/.gitignore
@@ -0,0 +1 @@
+rfi_flush
diff --git a/tools/testing/selftests/powerpc/stringloops/.gitignore b/tools/testing/selftests/powerpc/stringloops/.gitignore
index 0b43da74ee46..31a17e0ba884 100644
--- a/tools/testing/selftests/powerpc/stringloops/.gitignore
+++ b/tools/testing/selftests/powerpc/stringloops/.gitignore
@@ -1 +1,4 @@
-memcmp
+memcmp_64
+memcmp_32
+strlen
+strlen_32

From 7e04a46d84f73ef0f21d2257d6ba2a194c0f1511 Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Thu, 27 Jun 2019 15:30:08 +1000
Subject: [PATCH 078/264] powerpc/configs: Disable /dev/port in skiroot
 defconfig

While reviewing lockdown patches, I discovered that we still enable
/dev/port (CONFIG_DEVPORT) in skiroot.

/dev/port is used for old x86 style IO accesses. It's set up in
drivers/char/mem.c, and is only created if arch_has_dev_port() returns
true. Per arch/powerpc/include/asm/io.h, on PPC64 with PCI, this is
only true if there's a legacy ISA bridge.
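
In rough pseudocode (a paraphrase of the gating described above, not
the verbatim mem.c source; is_devport_minor() is a hypothetical
stand-in for the actual minor-number check):

  /* while creating the /dev/mem family of character devices */
  if (is_devport_minor(minor) && !arch_has_dev_port())
  	continue;	/* skip creating /dev/port */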

Even if a system has a legacy ISA bridge installed, we have no
business accessing it in skiroot.

Deselect CONFIG_DEVPORT for skiroot.

Signed-off-by: Daniel Axtens <dja@axtens.net>
[mpe: Incorporate emailed comments into the change log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190627053008.29315-1-dja@axtens.net
---
 arch/powerpc/configs/skiroot_defconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig
index 557b530b2f70..1253482a67c0 100644
--- a/arch/powerpc/configs/skiroot_defconfig
+++ b/arch/powerpc/configs/skiroot_defconfig
@@ -213,6 +213,7 @@ CONFIG_IPMI_WATCHDOG=y
 CONFIG_HW_RANDOM=y
 CONFIG_TCG_TPM=y
 CONFIG_TCG_TIS_I2C_NUVOTON=y
+# CONFIG_DEVPORT is not set
 CONFIG_I2C=y
 # CONFIG_I2C_COMPAT is not set
 CONFIG_I2C_CHARDEV=y

From 9d535e200f09ce347afc38c81ec7f2901187e5f0 Mon Sep 17 00:00:00 2001
From: Gustavo Romero <gromero@linux.ibm.com>
Date: Wed, 14 Aug 2019 15:52:11 -0500
Subject: [PATCH 079/264] selftests/powerpc: Fix and enhance TM signal context
 tests

Currently TM signal context tests for GPR, FPR, VMX, and VSX registers
print wrong register numbers (wrongly starting from register 0 instead
of the first register in the non-volatile subset). Moreover, the
output when a mismatch happens is poor, giving little information
about which context and which register mismatched: both contexts are
printed at once instead of comparing the mismatching value with the
expected value, and printing stops at the first mismatch even though
it's important to know whether further mismatches occur beyond the
first one.

For instance, this is the current output when a mismatch happens:

  test: tm_signal_context_chk_gpr
  tags: git_version:v5.2-8249-g02e970fae465-dirty
  Failed on 0 GPR 1 or 18446744073709551615
  failure: tm_signal_context_chk_gpr

  test: tm_signal_context_chk_fpu
  tags: git_version:v5.2-8248-g09c289e3ef80
  Failed on 0 FP -1 or -1
  failure: tm_signal_context_chk_fpu

  test: tm_signal_context_chk_vmx
  tags: git_version:v5.2-8248-g09c289e3ef80
  Failed on 0 vmx 0xfffffffffffffffefffffffdfffffffc vs 0xfffffffffffffffefffffffdfffffffc
  failure: tm_signal_context_chk_vmx

  test: tm_signal_context_chk_vsx
  tags: git_version:v5.2-8248-g09c289e3ef80
  Failed on 0 vsx 0xfffffffffefffffffdfffffffcffffff vs 0xfffffffffefffffffdfffffffcffffff
  failure: tm_signal_context_chk_vsx

This commit fixes the register numbers printed and enhances the error
output by providing a full list of mismatching registers separated by
the context (non-speculative or speculative context), for example:

  test: tm_signal_context_chk_gpr
  tags: git_version:v5.2-8249-g02e970fae465-dirty
  GPR14 (1st context) == 1 instead of -1 (expected)
  GPR15 (1st context) == 2 instead of -2 (expected)
  GPR14 (2nd context) == 0 instead of 18446744073709551615 (expected)
  GPR15 (2nd context) == 0 instead of 18446744073709551614 (expected)
  failure: tm_signal_context_chk_gpr

  test: tm_signal_context_chk_fpu
  tags: git_version:v5.2-8249-g02e970fae465-dirty
  FPR14 (1st context) == -1 instead of 1 (expected)
  FPR15 (1st context) == -2 instead of 2 (expected)
  failure: tm_signal_context_chk_fpu

  test: tm_signal_context_chk_vmx
  tags: git_version:v5.2-8249-g02e970fae465-dirty
  VMX20 (1st context) == 0xfffffffffffffffefffffffdfffffffc instead of 0x00000001000000020000000300000004 (expected)
  VMX21 (1st context) == 0xfffffffbfffffffafffffff9fffffff8 instead of 0x00000005000000060000000700000008 (expected)
  failure: tm_signal_context_chk_vmx

  test: tm_signal_context_chk_vsx
  tags: git_version:v5.2-8249-g02e970fae465-dirty
  VSX20 (1st context) == 0xfffffffffefffffffdfffffffcffffff instead of 0x00000001000000020000000300000004 (expected)
  VSX21 (1st context) == 0xfbfffffffafffffff9fffffff8ffffff instead of 0x00000005000000060000000700000008 (expected)
  failure: tm_signal_context_chk_vsx

Finally, this commit adds comments to the tests in the hope that they
will help people less familiar with TM understand the tests.
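
All four tests share the same handler structure: the checkpointed
state arrives in the ucontext passed to the handler, and the
speculative state hangs off uc_link, e.g.:

  static void signal_usr1(int signum, siginfo_t *info, void *uc)
  {
  	ucontext_t *ucp = uc;			/* 1st (checkpointed) context */
  	ucontext_t *tm_ucp = ucp->uc_link;	/* 2nd (speculative) context */

  	/* compare the registers in each context against the expected values */
  }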

Signed-off-by: Gustavo Romero <gromero@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190814205211.24840-1-gromero@linux.ibm.com
---
 .../powerpc/tm/tm-signal-context-chk-fpu.c    |  49 +++++--
 .../powerpc/tm/tm-signal-context-chk-gpr.c    |  59 +++++---
 .../powerpc/tm/tm-signal-context-chk-vmx.c    |  76 ++++++----
 .../powerpc/tm/tm-signal-context-chk-vsx.c    | 130 +++++++++++++-----
 4 files changed, 229 insertions(+), 85 deletions(-)

diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c
index d57c2d2ab6ec..254f912ad611 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c
@@ -5,10 +5,11 @@
  * Test the kernel's signal frame code.
  *
  * The kernel sets up two sets of ucontexts if the signal was to be
- * delivered while the thread was in a transaction.
+ * delivered while the thread was in a transaction (referred to as
+ * first and second contexts).
  * Expected behaviour is that the checkpointed state is in the user
- * context passed to the signal handler. The speculated state can be
- * accessed with the uc_link pointer.
+ * context passed to the signal handler (first context). The speculated
+ * state can be accessed with the uc_link pointer (second context).
  *
  * The rationale for this is that if TM unaware code (which linked
  * against TM libs) installs a signal handler it will not know of the
@@ -28,17 +29,20 @@
 
 #define MAX_ATTEMPT 500000
 
-#define NV_FPU_REGS 18
+#define NV_FPU_REGS 18 /* Number of non-volatile FP registers */
+#define FPR14 14 /* First non-volatile FP register to check in f14-31 subset */
 
 long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss);
 
-/* Be sure there are 2x as many as there are NV FPU regs (2x18) */
+/* Test only non-volatile registers, i.e. 18 fpr registers from f14 to f31 */
 static double fps[] = {
+	/* First context will be set with these values, i.e. non-speculative */
 	 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+	/* Second context will be set with these values, i.e. speculative */
 	-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18
 };
 
-static sig_atomic_t fail;
+static sig_atomic_t fail, broken;
 
 static void signal_usr1(int signum, siginfo_t *info, void *uc)
 {
@@ -46,11 +50,24 @@ static void signal_usr1(int signum, siginfo_t *info, void *uc)
 	ucontext_t *ucp = uc;
 	ucontext_t *tm_ucp = ucp->uc_link;
 
-	for (i = 0; i < NV_FPU_REGS && !fail; i++) {
-		fail = (ucp->uc_mcontext.fp_regs[i + 14] != fps[i]);
-		fail |= (tm_ucp->uc_mcontext.fp_regs[i + 14] != fps[i + NV_FPU_REGS]);
-		if (fail)
-			printf("Failed on %d FP %g or %g\n", i, ucp->uc_mcontext.fp_regs[i + 14], tm_ucp->uc_mcontext.fp_regs[i + 14]);
+	for (i = 0; i < NV_FPU_REGS; i++) {
+		/* Check first context. Print all mismatches. */
+		fail = (ucp->uc_mcontext.fp_regs[FPR14 + i] != fps[i]);
+		if (fail) {
+			broken = 1;
+			printf("FPR%d (1st context) == %g instead of %g (expected)\n",
+				FPR14 + i, ucp->uc_mcontext.fp_regs[FPR14 + i], fps[i]);
+		}
+	}
+
+	for (i = 0; i < NV_FPU_REGS; i++) {
+		/* Check second context. Print all mismatches. */
+		fail = (tm_ucp->uc_mcontext.fp_regs[FPR14 + i] != fps[NV_FPU_REGS + i]);
+		if (fail) {
+			broken = 1;
+			printf("FPR%d (2nd context) == %g instead of %g (expected)\n",
+				FPR14 + i, tm_ucp->uc_mcontext.fp_regs[FPR14 + i], fps[NV_FPU_REGS + i]);
+		}
 	}
 }
 
@@ -72,13 +89,19 @@ static int tm_signal_context_chk_fpu()
 	}
 
 	i = 0;
-	while (i < MAX_ATTEMPT && !fail) {
+	while (i < MAX_ATTEMPT && !broken) {
+		/*
+		 * tm_signal_self_context_load will set both first and second
+		 * contexts according to the values passed through non-NULL
+		 * array pointers to it, in this case 'fps', and invoke the
+		 * signal handler installed for SIGUSR1.
+		 */
 		rc = tm_signal_self_context_load(pid, NULL, fps, NULL, NULL);
 		FAIL_IF(rc != pid);
 		i++;
 	}
 
-	return fail;
+	return broken;
 }
 
 int main(void)
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c
index 4d05f8b0254c..0cc680f61828 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c
@@ -5,10 +5,11 @@
  * Test the kernel's signal frame code.
  *
  * The kernel sets up two sets of ucontexts if the signal was to be
- * delivered while the thread was in a transaction.
+ * delivered while the thread was in a transaction (referred to as
+ * first and second contexts).
  * Expected behaviour is that the checkpointed state is in the user
- * context passed to the signal handler. The speculated state can be
- * accessed with the uc_link pointer.
+ * context passed to the signal handler (first context). The speculated
+ * state can be accessed with the uc_link pointer (second context).
  *
  * The rationale for this is that if TM unaware code (which linked
  * against TM libs) installs a signal handler it will not know of the
@@ -28,14 +29,22 @@
 
 #define MAX_ATTEMPT 500000
 
-#define NV_GPR_REGS 18
+#define NV_GPR_REGS 18 /* Number of non-volatile GPR registers */
+#define R14 14 /* First non-volatile register to check in r14-r31 subset */
 
 long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss);
 
-static sig_atomic_t fail;
+static sig_atomic_t fail, broken;
 
-static long gps[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
-					 -1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18};
+/* Test only non-volatile general purpose registers, i.e. r14-r31 */
+static long gprs[] = {
+	/* First context will be set with these values, i.e. non-speculative */
+	/* R14, R15, ... */
+	 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+	/* Second context will be set with these values, i.e. speculative */
+	/* R14, R15, ... */
+	-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18
+};
 
 static void signal_usr1(int signum, siginfo_t *info, void *uc)
 {
@@ -43,12 +52,24 @@ static void signal_usr1(int signum, siginfo_t *info, void *uc)
 	ucontext_t *ucp = uc;
 	ucontext_t *tm_ucp = ucp->uc_link;
 
-	for (i = 0; i < NV_GPR_REGS && !fail; i++) {
-		fail = (ucp->uc_mcontext.gp_regs[i + 14] != gps[i]);
-		fail |= (tm_ucp->uc_mcontext.gp_regs[i + 14] != gps[i + NV_GPR_REGS]);
-		if (fail)
-			printf("Failed on %d GPR %lu or %lu\n", i,
-					ucp->uc_mcontext.gp_regs[i + 14], tm_ucp->uc_mcontext.gp_regs[i + 14]);
+	/* Check first context. Print all mismatches. */
+	for (i = 0; i < NV_GPR_REGS; i++) {
+		fail = (ucp->uc_mcontext.gp_regs[R14 + i] != gprs[i]);
+		if (fail) {
+			broken = 1;
+			printf("GPR%d (1st context) == %lu instead of %lu (expected)\n",
+				R14 + i, ucp->uc_mcontext.gp_regs[R14 + i], gprs[i]);
+		}
+	}
+
+	/* Check second context. Print all mismatches. */
+	for (i = 0; i < NV_GPR_REGS; i++) {
+		fail = (tm_ucp->uc_mcontext.gp_regs[R14 + i] != gprs[NV_GPR_REGS + i]);
+		if (fail) {
+			broken = 1;
+			printf("GPR%d (2nd context) == %lu instead of %lu (expected)\n",
+				R14 + i, tm_ucp->uc_mcontext.gp_regs[R14 + i], gprs[NV_GPR_REGS + i]);
+		}
 	}
 }
 
@@ -70,13 +91,19 @@ static int tm_signal_context_chk_gpr()
 	}
 
 	i = 0;
-	while (i < MAX_ATTEMPT && !fail) {
-		rc = tm_signal_self_context_load(pid, gps, NULL, NULL, NULL);
+	while (i < MAX_ATTEMPT && !broken) {
+		/*
+		 * tm_signal_self_context_load will set both first and second
+		 * contexts according to the values passed through non-NULL
+		 * array pointers to it, in this case 'gprs', and invoke the
+		 * signal handler installed for SIGUSR1.
+		 */
+		rc = tm_signal_self_context_load(pid, gprs, NULL, NULL, NULL);
 		FAIL_IF(rc != pid);
 		i++;
 	}
 
-	return fail;
+	return broken;
 }
 
 int main(void)
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c
index 48ad01499b1a..b6d52730a0d8 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c
@@ -5,10 +5,11 @@
  * Test the kernel's signal frame code.
  *
  * The kernel sets up two sets of ucontexts if the signal was to be
- * delivered while the thread was in a transaction.
+ * delivered while the thread was in a transaction (referred to as
+ * first and second contexts).
  * Expected behaviour is that the checkpointed state is in the user
- * context passed to the signal handler. The speculated state can be
- * accessed with the uc_link pointer.
+ * context passed to the signal handler (first context). The speculated
+ * state can be accessed with the uc_link pointer (second context).
  *
  * The rationale for this is that if TM unaware code (which linked
  * against TM libs) installs a signal handler it will not know of the
@@ -29,18 +30,24 @@
 
 #define MAX_ATTEMPT 500000
 
-#define NV_VMX_REGS 12
+#define NV_VMX_REGS 12 /* Number of non-volatile VMX registers */
+#define VMX20 20 /* First non-volatile register to check in vr20-31 subset */
 
 long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss);
 
-static sig_atomic_t fail;
+static sig_atomic_t fail, broken;
 
+/* Test only non-volatile registers, i.e. 12 vmx registers from vr20 to vr31 */
 vector int vms[] = {
-	{1, 2, 3, 4 },{5, 6, 7, 8 },{9, 10,11,12},
+	/* First context will be set with these values, i.e. non-speculative */
+	/* VMX20     ,  VMX21      , ... */
+	{ 1, 2, 3, 4},{ 5, 6, 7, 8},{ 9,10,11,12},
 	{13,14,15,16},{17,18,19,20},{21,22,23,24},
 	{25,26,27,28},{29,30,31,32},{33,34,35,36},
 	{37,38,39,40},{41,42,43,44},{45,46,47,48},
-	{-1, -2, -3, -4}, {-5, -6, -7, -8}, {-9, -10,-11,-12},
+	/* Second context will be set with these values, i.e. speculative */
+	/* VMX20        , VMX21            , ... */
+	{ -1, -2, -3, -4},{ -5, -6, -7, -8},{ -9,-10,-11,-12},
 	{-13,-14,-15,-16},{-17,-18,-19,-20},{-21,-22,-23,-24},
 	{-25,-26,-27,-28},{-29,-30,-31,-32},{-33,-34,-35,-36},
 	{-37,-38,-39,-40},{-41,-42,-43,-44},{-45,-46,-47,-48}
@@ -48,26 +55,43 @@ vector int vms[] = {
 
 static void signal_usr1(int signum, siginfo_t *info, void *uc)
 {
-	int i;
+	int i, j;
 	ucontext_t *ucp = uc;
 	ucontext_t *tm_ucp = ucp->uc_link;
 
-	for (i = 0; i < NV_VMX_REGS && !fail; i++) {
-		fail = memcmp(ucp->uc_mcontext.v_regs->vrregs[i + 20],
+	for (i = 0; i < NV_VMX_REGS; i++) {
+		/* Check first context. Print all mismatches. */
+		fail = memcmp(ucp->uc_mcontext.v_regs->vrregs[VMX20 + i],
 				&vms[i], sizeof(vector int));
-		fail |= memcmp(tm_ucp->uc_mcontext.v_regs->vrregs[i + 20],
-				&vms[i + NV_VMX_REGS], sizeof (vector int));
-
 		if (fail) {
-			int j;
-
-			fprintf(stderr, "Failed on %d vmx 0x", i);
+			broken = 1;
+			printf("VMX%d (1st context) == 0x", VMX20 + i);
+			/* Print actual value in first context. */
 			for (j = 0; j < 4; j++)
-				fprintf(stderr, "%04x", ucp->uc_mcontext.v_regs->vrregs[i + 20][j]);
-			fprintf(stderr, " vs 0x");
-			for (j = 0 ; j < 4; j++)
-				fprintf(stderr, "%04x", tm_ucp->uc_mcontext.v_regs->vrregs[i + 20][j]);
-			fprintf(stderr, "\n");
+				printf("%08x", ucp->uc_mcontext.v_regs->vrregs[VMX20 + i][j]);
+			printf(" instead of 0x");
+			/* Print expected value. */
+			for (j = 0; j < 4; j++)
+				printf("%08x", vms[i][j]);
+			printf(" (expected)\n");
+		}
+	}
+
+	for (i = 0; i < NV_VMX_REGS; i++)  {
+		/* Check second context. Print all mismatches. */
+		fail = memcmp(tm_ucp->uc_mcontext.v_regs->vrregs[VMX20 + i],
+				&vms[NV_VMX_REGS + i], sizeof(vector int));
+		if (fail) {
+			broken = 1;
+			printf("VMX%d (2nd context) == 0x", VMX20 + i);
+			/* Print actual value in second context. */
+			for (j = 0; j < 4; j++)
+				printf("%08x", tm_ucp->uc_mcontext.v_regs->vrregs[VMX20 + i][j]);
+			printf(" instead of 0x");
+			/* Print expected value. */
+			for (j = 0; j < 4; j++)
+				printf("%08x", vms[NV_VMX_REGS + i][j]);
+			printf(" (expected)\n");
 		}
 	}
 }
@@ -90,13 +114,19 @@ static int tm_signal_context_chk()
 	}
 
 	i = 0;
-	while (i < MAX_ATTEMPT && !fail) {
+	while (i < MAX_ATTEMPT && !broken) {
+		/*
+		 * tm_signal_self_context_load will set both the first and
+		 * second contexts according to the values passed through
+		 * non-NULL array pointers (in this case 'vms'), and invoke
+		 * the signal handler installed for SIGUSR1.
+		 */
 		rc = tm_signal_self_context_load(pid, NULL, NULL, vms, NULL);
 		FAIL_IF(rc != pid);
 		i++;
 	}
 
-	return fail;
+	return broken;
 }
 
 int main(void)
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c
index 8c8677a408bb..8e25e2072ecd 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c
@@ -5,10 +5,11 @@
  * Test the kernel's signal frame code.
  *
  * The kernel sets up two sets of ucontexts if the signal was to be
- * delivered while the thread was in a transaction.
+ * delivered while the thread was in a transaction (referred to as
+ * first and second contexts).
  * Expected behaviour is that the checkpointed state is in the user
- * context passed to the signal handler. The speculated state can be
- * accessed with the uc_link pointer.
+ * context passed to the signal handler (first context). The speculated
+ * state can be accessed with the uc_link pointer (second context).
  *
  * The rationale for this is that if TM unaware code (which linked
  * against TM libs) installs a signal handler it will not know of the
@@ -29,17 +30,24 @@
 
 #define MAX_ATTEMPT 500000
 
-#define NV_VSX_REGS 12
+#define NV_VSX_REGS 12 /* Number of VSX registers to check. */
+#define VSX20 20 /* First VSX register to check in vsr20-vsr31 subset */
+#define FPR20 20 /* FPR20 overlaps VSX20 most significant doubleword */
 
 long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss);
 
-static sig_atomic_t fail;
+static sig_atomic_t fail, broken;
 
-vector int vss[] = {
-	{1, 2, 3, 4 },{5, 6, 7, 8 },{9, 10,11,12},
+/* Test only 12 vsx registers from vsr20 to vsr31 */
+vector int vsxs[] = {
+	/* First context will be set with these values, i.e. non-speculative */
+	/* VSX20     ,  VSX21      , ... */
+	{ 1, 2, 3, 4},{ 5, 6, 7, 8},{ 9,10,11,12},
 	{13,14,15,16},{17,18,19,20},{21,22,23,24},
 	{25,26,27,28},{29,30,31,32},{33,34,35,36},
 	{37,38,39,40},{41,42,43,44},{45,46,47,48},
+	/* Second context will be set with these values, i.e. speculative */
+	/* VSX20         ,  VSX21          , ... */
 	{-1, -2, -3, -4 },{-5, -6, -7, -8 },{-9, -10,-11,-12},
 	{-13,-14,-15,-16},{-17,-18,-19,-20},{-21,-22,-23,-24},
 	{-25,-26,-27,-28},{-29,-30,-31,-32},{-33,-34,-35,-36},
@@ -48,41 +56,91 @@ vector int vss[] = {
 
 static void signal_usr1(int signum, siginfo_t *info, void *uc)
 {
-	int i;
-	uint8_t vsc[sizeof(vector int)];
-	uint8_t vst[sizeof(vector int)];
+	int i, j;
+	uint8_t vsx[sizeof(vector int)];
+	uint8_t vsx_tm[sizeof(vector int)];
 	ucontext_t *ucp = uc;
 	ucontext_t *tm_ucp = ucp->uc_link;
 
 	/*
-	 * The other half of the VSX regs will be after v_regs.
+	 * FP registers and VMX registers overlap the VSX registers.
+	 *
+	 * FP registers (f0-31) overlap the most significant 64 bits of VSX
+	 * registers vsr0-31, whilst VMX registers vr0-31, being 128-bit like
+	 * the VSX registers, fully overlap the other half of the VSX
+	 * registers, i.e. vr0-31 fully overlaps vsr32-63.
+	 *
+	 * Due to compatibility and historical reasons (VMX/Altivec support
+	 * appeared first on the architecture), VMX registers vr0-31 (so VSX
+	 * half vsr32-63 too) are stored right after the v_regs pointer, in an
+	 * area allocated for the 'vmx_reserve' array (please see
+	 * arch/powerpc/include/uapi/asm/sigcontext.h for details about the
+	 * mcontext_t structure on Power).
+	 *
+	 * The other VSX half (vsr0-31) is hence stored below vr0-31/vsr32-63
+	 * registers, but only the least significant 64 bits of vsr0-31. The
+	 * most significant 64 bits of vsr0-31 (f0-31), as they overlap the FP
+	 * registers, are kept in fp_regs.
+	 *
+	 * v_regs is a 16-byte aligned pointer at the start of vmx_reserve
+	 * (vmx_reserve may or may not be 16-byte aligned) where the v_regs structure
+	 * exists, so v_regs points to where vr0-31 / vsr32-63 registers are
+	 * fully stored. Since v_regs type is elf_vrregset_t, v_regs + 1
+	 * skips all the slots used to store vr0-31 / vsr32-63 and points to
+	 * part of one VSX half, i.e. v_regs + 1 points to the least significant
+	 * 64 bits of vsr0-31. The other part of this half (the most significant
+	 * part of vsr0-31) is stored in fp_regs.
 	 *
-	 * In short, vmx_reserve array holds everything. v_regs is a 16
-	 * byte aligned pointer at the start of vmx_reserve (vmx_reserve
-	 * may or may not be 16 aligned) where the v_regs structure exists.
-	 * (half of) The VSX regsters are directly after v_regs so the
-	 * easiest way to find them below.
 	 */
+	/* Get pointer to least significant doubleword of vsr0-31 */
 	long *vsx_ptr = (long *)(ucp->uc_mcontext.v_regs + 1);
 	long *tm_vsx_ptr = (long *)(tm_ucp->uc_mcontext.v_regs + 1);
-	for (i = 0; i < NV_VSX_REGS && !fail; i++) {
-		memcpy(vsc, &ucp->uc_mcontext.fp_regs[i + 20], 8);
-		memcpy(vsc + 8, &vsx_ptr[20 + i], 8);
-		fail = memcmp(vsc, &vss[i], sizeof(vector int));
-		memcpy(vst, &tm_ucp->uc_mcontext.fp_regs[i + 20], 8);
-		memcpy(vst + 8, &tm_vsx_ptr[20 + i], 8);
-		fail |= memcmp(vst, &vss[i + NV_VSX_REGS], sizeof(vector int));
+
+	/* Check first context. Print all mismatches. */
+	for (i = 0; i < NV_VSX_REGS; i++) {
+		/*
+		 * Copy VSX most significant doubleword from fp_regs and
+		 * copy VSX least significant one from 64-bit slots below
+		 * saved VMX registers.
+		 */
+		memcpy(vsx, &ucp->uc_mcontext.fp_regs[FPR20 + i], 8);
+		memcpy(vsx + 8, &vsx_ptr[VSX20 + i], 8);
+
+		fail = memcmp(vsx, &vsxs[i], sizeof(vector int));
 
 		if (fail) {
-			int j;
+			broken = 1;
+			printf("VSX%d (1st context) == 0x", VSX20 + i);
+			for (j = 0; j < 16; j++)
+				printf("%02x", vsx[j]);
+			printf(" instead of 0x");
+			for (j = 0; j < 4; j++)
+				printf("%08x", vsxs[i][j]);
+			printf(" (expected)\n");
+		}
+	}
 
-			fprintf(stderr, "Failed on %d vsx 0x", i);
+	/* Check second context. Print all mismatches. */
+	for (i = 0; i < NV_VSX_REGS; i++) {
+		/*
+		 * Copy VSX most significant doubleword from fp_regs and
+		 * copy VSX least significant one from 64-bit slots below
+		 * saved VMX registers.
+		 */
+		memcpy(vsx_tm, &tm_ucp->uc_mcontext.fp_regs[FPR20 + i], 8);
+		memcpy(vsx_tm + 8, &tm_vsx_ptr[VSX20 + i], 8);
+
+		fail = memcmp(vsx_tm, &vsxs[NV_VSX_REGS + i], sizeof(vector int));
+
+		if (fail) {
+			broken = 1;
+			printf("VSX%d (2nd context) == 0x", VSX20 + i);
 			for (j = 0; j < 16; j++)
-				fprintf(stderr, "%02x", vsc[j]);
-			fprintf(stderr, " vs 0x");
-			for (j = 0; j < 16; j++)
-				fprintf(stderr, "%02x", vst[j]);
-			fprintf(stderr, "\n");
+				printf("%02x", vsx_tm[j]);
+			printf(" instead of 0x");
+			for (j = 0; j < 4; j++)
+				printf("%08x", vsxs[NV_VSX_REGS + i][j]);
+			printf(" (expected)\n");
 		}
 	}
 }
@@ -105,13 +163,19 @@ static int tm_signal_context_chk()
 	}
 
 	i = 0;
-	while (i < MAX_ATTEMPT && !fail) {
-		rc = tm_signal_self_context_load(pid, NULL, NULL, NULL, vss);
+	while (i < MAX_ATTEMPT && !broken) {
+		/*
+		 * tm_signal_self_context_load will set both the first and
+		 * second contexts according to the values passed through
+		 * non-NULL array pointers (in this case 'vsxs'), and invoke
+		 * the signal handler installed for SIGUSR1.
+		 */
+		rc = tm_signal_self_context_load(pid, NULL, NULL, NULL, vsxs);
 		FAIL_IF(rc != pid);
 		i++;
 	}
 
-	return fail;
+	return broken;
 }
 
 int main(void)

From 6652bf6408895b09d31fd4128a1589a1a0672823 Mon Sep 17 00:00:00 2001
From: Gustavo Romero <gromero@linux.vnet.ibm.com>
Date: Tue, 20 Aug 2019 18:54:11 -0400
Subject: [PATCH 080/264] selftests/powerpc: Retry on host facility unavailable

The TM test tm-unavailable must take into account aborts caused by the
host aborting a transaction because of a facility unavailable exception,
just as it already does for aborts on reschedules (TM_CAUSE_KVM_RESCHED).
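
For reference, a sketch of the retry pattern such a test relies on
(illustrative only: the HTM builtins require -mhtm and stand in for the
tests' asm helpers; only failure_is_reschedule() comes from this patch):

  do {
          if (__builtin_tbegin(0)) {
                  /* transactional body under test */
                  __builtin_tend(0);
                  break;
          }
          /*
           * Transient aborts -- reschedules and, with this change, a
           * host facility unavailable exception -- are retried rather
           * than reported as test failures.
           */
  } while (failure_is_reschedule());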

Reported-by: Desnes A. Nunes do Rosario <desnesn@linux.ibm.com>
Tested-by: Desnes A. Nunes do Rosario <desnesn@linux.ibm.com>
Signed-off-by: Gustavo Romero <gromero@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/1566341651-19747-1-git-send-email-gromero@linux.vnet.ibm.com
---
 tools/testing/selftests/powerpc/tm/tm.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/powerpc/tm/tm.h b/tools/testing/selftests/powerpc/tm/tm.h
index 97f9f491c541..c402464b038f 100644
--- a/tools/testing/selftests/powerpc/tm/tm.h
+++ b/tools/testing/selftests/powerpc/tm/tm.h
@@ -55,7 +55,8 @@ static inline bool failure_is_unavailable(void)
 static inline bool failure_is_reschedule(void)
 {
 	if ((failure_code() & TM_CAUSE_RESCHED) == TM_CAUSE_RESCHED ||
-	    (failure_code() & TM_CAUSE_KVM_RESCHED) == TM_CAUSE_KVM_RESCHED)
+	    (failure_code() & TM_CAUSE_KVM_RESCHED) == TM_CAUSE_KVM_RESCHED ||
+	    (failure_code() & TM_CAUSE_KVM_FAC_UNAV) == TM_CAUSE_KVM_FAC_UNAV)
 		return true;
 
 	return false;

From d8f0e0b073e1ec52a05f0c2a56318b47387d2f10 Mon Sep 17 00:00:00 2001
From: "Christopher M. Riedl" <cmr@informatik.wtf>
Date: Thu, 23 May 2019 21:46:48 -0500
Subject: [PATCH 081/264] powerpc/64s: support nospectre_v2 cmdline option

Add support for disabling the kernel-implemented Spectre v2 mitigation
(count cache flush on context switch) via the nospectre_v2 and
mitigations=off cmdline options.
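
For reference, a minimal sketch of the generic early_param pattern this
patch hooks nospectre_v2 into (the names below are illustrative, not
from the kernel):

  static bool no_myfeature;

  static int __init handle_no_myfeature(char *p)
  {
          no_myfeature = true;
          return 0;
  }
  early_param("no_myfeature", handle_no_myfeature);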

Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190524024647.381-1-cmr@informatik.wtf
---
 arch/powerpc/kernel/security.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
index e1c9cf079503..7cfcb294b11c 100644
--- a/arch/powerpc/kernel/security.c
+++ b/arch/powerpc/kernel/security.c
@@ -28,7 +28,7 @@ static enum count_cache_flush_type count_cache_flush_type = COUNT_CACHE_FLUSH_NO
 bool barrier_nospec_enabled;
 static bool no_nospec;
 static bool btb_flush_enabled;
-#ifdef CONFIG_PPC_FSL_BOOK3E
+#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64)
 static bool no_spectrev2;
 #endif
 
@@ -114,7 +114,7 @@ static __init int security_feature_debugfs_init(void)
 device_initcall(security_feature_debugfs_init);
 #endif /* CONFIG_DEBUG_FS */
 
-#ifdef CONFIG_PPC_FSL_BOOK3E
+#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64)
 static int __init handle_nospectre_v2(char *p)
 {
 	no_spectrev2 = true;
@@ -122,6 +122,9 @@ static int __init handle_nospectre_v2(char *p)
 	return 0;
 }
 early_param("nospectre_v2", handle_nospectre_v2);
+#endif /* CONFIG_PPC_FSL_BOOK3E || CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
 void setup_spectre_v2(void)
 {
 	if (no_spectrev2 || cpu_mitigations_off())
@@ -399,7 +402,17 @@ static void toggle_count_cache_flush(bool enable)
 
 void setup_count_cache_flush(void)
 {
-	toggle_count_cache_flush(true);
+	bool enable = true;
+
+	if (no_spectrev2 || cpu_mitigations_off()) {
+		if (security_ftr_enabled(SEC_FTR_BCCTRL_SERIALISED) ||
+		    security_ftr_enabled(SEC_FTR_COUNT_CACHE_DISABLED))
+			pr_warn("Spectre v2 mitigations not under software control, can't disable\n");
+
+		enable = false;
+	}
+
+	toggle_count_cache_flush(enable);
 }
 
 #ifdef CONFIG_DEBUG_FS

From b4645ffc49cfe34f67feda20c34bd7a859c78312 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Thu, 22 Aug 2019 16:44:05 +0000
Subject: [PATCH 082/264] powerpc/64: don't select ARCH_HAS_SCALED_CPUTIME on
 book3E

Book3E doesn't have SPRN_SPURR/SPRN_PURR.

Activating ARCH_HAS_SCALED_CPUTIME is just wasting CPU time.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://github.com/linuxppc/issues/issues/171
Link: https://lore.kernel.org/r/a8b567c569aa521a7cf1beb061d43d79070e850c.1566492229.git.christophe.leroy@c-s.fr
---
 arch/powerpc/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 78f52030e76a..3d65084f4066 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -133,7 +133,7 @@ config PPC
 	select ARCH_HAS_PTE_DEVMAP		if PPC_BOOK3S_64
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_MEMBARRIER_CALLBACKS
-	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE && PPC64
+	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64
 	select ARCH_HAS_STRICT_KERNEL_RWX	if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION)
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UACCESS_FLUSHCACHE

From f0f8d7ae3924ed93453e30123e4aaf6f888ca555 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 20 Aug 2019 14:07:09 +0000
Subject: [PATCH 083/264] powerpc: remove the ppc44x ocm.c file

The on-chip memory allocator is entirely unused in the kernel tree.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/7b1668941ad1041d08b19167030868de5840b153.1566309262.git.christophe.leroy@c-s.fr
---
 arch/powerpc/configs/ppc40x_defconfig |   1 -
 arch/powerpc/include/asm/ppc4xx_ocm.h |  31 --
 arch/powerpc/platforms/44x/Kconfig    |   8 -
 arch/powerpc/platforms/4xx/Makefile   |   1 -
 arch/powerpc/platforms/4xx/ocm.c      | 390 --------------------------
 5 files changed, 431 deletions(-)
 delete mode 100644 arch/powerpc/include/asm/ppc4xx_ocm.h
 delete mode 100644 arch/powerpc/platforms/4xx/ocm.c

diff --git a/arch/powerpc/configs/ppc40x_defconfig b/arch/powerpc/configs/ppc40x_defconfig
index 8f136b52198b..a5f683aed328 100644
--- a/arch/powerpc/configs/ppc40x_defconfig
+++ b/arch/powerpc/configs/ppc40x_defconfig
@@ -84,4 +84,3 @@ CONFIG_CRYPTO_ECB=y
 CONFIG_CRYPTO_PCBC=y
 CONFIG_CRYPTO_MD5=y
 CONFIG_CRYPTO_DES=y
-CONFIG_PPC4xx_OCM=y
diff --git a/arch/powerpc/include/asm/ppc4xx_ocm.h b/arch/powerpc/include/asm/ppc4xx_ocm.h
deleted file mode 100644
index fc4db6dcde84..000000000000
--- a/arch/powerpc/include/asm/ppc4xx_ocm.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * PowerPC 4xx OCM memory allocation support
- *
- * (C) Copyright 2009, Applied Micro Circuits Corporation
- * Victor Gallardo (vgallardo@amcc.com)
- *
- * See file CREDITS for list of people who contributed to this
- * project.
- */
-
-#ifndef __ASM_POWERPC_PPC4XX_OCM_H__
-#define __ASM_POWERPC_PPC4XX_OCM_H__
-
-#define PPC4XX_OCM_NON_CACHED 0
-#define PPC4XX_OCM_CACHED     1
-
-#if defined(CONFIG_PPC4xx_OCM)
-
-void *ppc4xx_ocm_alloc(phys_addr_t *phys, int size, int align,
-		  int flags, const char *owner);
-void ppc4xx_ocm_free(const void *virt);
-
-#else
-
-#define ppc4xx_ocm_alloc(phys, size, align, flags, owner)	NULL
-#define ppc4xx_ocm_free(addr)	((void)0)
-
-#endif /* CONFIG_PPC4xx_OCM */
-
-#endif  /* __ASM_POWERPC_PPC4XX_OCM_H__ */
diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig
index b369ed4e3675..25ebe634a661 100644
--- a/arch/powerpc/platforms/44x/Kconfig
+++ b/arch/powerpc/platforms/44x/Kconfig
@@ -272,14 +272,6 @@ config PPC4xx_GPIO
 	help
 	  Enable gpiolib support for ppc440 based boards
 
-config PPC4xx_OCM
-	bool "PPC4xx On Chip Memory (OCM) support"
-	depends on 4xx
-	select PPC_LIB_RHEAP
-	help
-	  Enable OCM support for PowerPC 4xx platforms with on chip memory,
-	  OCM provides the fast place for memory access to improve performance.
-
 # 44x specific CPU modules, selected based on the board above.
 config 440EP
 	bool
diff --git a/arch/powerpc/platforms/4xx/Makefile b/arch/powerpc/platforms/4xx/Makefile
index f5ae27ca131b..d009d2e0b9e8 100644
--- a/arch/powerpc/platforms/4xx/Makefile
+++ b/arch/powerpc/platforms/4xx/Makefile
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y				+= uic.o machine_check.o
-obj-$(CONFIG_PPC4xx_OCM)	+= ocm.o
 obj-$(CONFIG_4xx_SOC)		+= soc.o
 obj-$(CONFIG_PCI)		+= pci.o
 obj-$(CONFIG_PPC4xx_HSTA_MSI)	+= hsta_msi.o
diff --git a/arch/powerpc/platforms/4xx/ocm.c b/arch/powerpc/platforms/4xx/ocm.c
deleted file mode 100644
index ba3257406ced..000000000000
--- a/arch/powerpc/platforms/4xx/ocm.c
+++ /dev/null
@@ -1,390 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * PowerPC 4xx OCM memory allocation support
- *
- * (C) Copyright 2009, Applied Micro Circuits Corporation
- * Victor Gallardo (vgallardo@amcc.com)
- *
- * See file CREDITS for list of people who contributed to this
- * project.
- */
-
-#include <linux/kernel.h>
-#include <linux/dma-mapping.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-#include <asm/rheap.h>
-#include <asm/ppc4xx_ocm.h>
-#include <linux/slab.h>
-#include <linux/debugfs.h>
-
-#define OCM_DISABLED	0
-#define OCM_ENABLED		1
-
-struct ocm_block {
-	struct list_head	list;
-	void __iomem		*addr;
-	int					size;
-	const char			*owner;
-};
-
-/* non-cached or cached region */
-struct ocm_region {
-	phys_addr_t			phys;
-	void __iomem		*virt;
-
-	int					memtotal;
-	int					memfree;
-
-	rh_info_t			*rh;
-	struct list_head	list;
-};
-
-struct ocm_info {
-	int					index;
-	int					status;
-	int					ready;
-
-	phys_addr_t			phys;
-
-	int					alignment;
-	int					memtotal;
-	int					cache_size;
-
-	struct ocm_region	nc;	/* non-cached region */
-	struct ocm_region	c;	/* cached region */
-};
-
-static struct ocm_info *ocm_nodes;
-static int ocm_count;
-
-static struct ocm_info *ocm_get_node(unsigned int index)
-{
-	if (index >= ocm_count) {
-		printk(KERN_ERR "PPC4XX OCM: invalid index");
-		return NULL;
-	}
-
-	return &ocm_nodes[index];
-}
-
-static int ocm_free_region(struct ocm_region *ocm_reg, const void *addr)
-{
-	struct ocm_block *blk, *tmp;
-	unsigned long offset;
-
-	if (!ocm_reg->virt)
-		return 0;
-
-	list_for_each_entry_safe(blk, tmp, &ocm_reg->list, list) {
-		if (blk->addr == addr) {
-			offset = addr - ocm_reg->virt;
-			ocm_reg->memfree += blk->size;
-			rh_free(ocm_reg->rh, offset);
-			list_del(&blk->list);
-			kfree(blk);
-			return 1;
-		}
-	}
-
-	return 0;
-}
-
-static void __init ocm_init_node(int count, struct device_node *node)
-{
-	struct ocm_info *ocm;
-
-	const unsigned int *cell_index;
-	const unsigned int *cache_size;
-	int len;
-
-	struct resource rsrc;
-
-	ocm = ocm_get_node(count);
-
-	cell_index = of_get_property(node, "cell-index", &len);
-	if (!cell_index) {
-		printk(KERN_ERR "PPC4XX OCM: missing cell-index property");
-		return;
-	}
-	ocm->index = *cell_index;
-
-	if (of_device_is_available(node))
-		ocm->status = OCM_ENABLED;
-
-	cache_size = of_get_property(node, "cached-region-size", &len);
-	if (cache_size)
-		ocm->cache_size = *cache_size;
-
-	if (of_address_to_resource(node, 0, &rsrc)) {
-		printk(KERN_ERR "PPC4XX OCM%d: could not get resource address\n",
-			ocm->index);
-		return;
-	}
-
-	ocm->phys = rsrc.start;
-	ocm->memtotal = (rsrc.end - rsrc.start + 1);
-
-	printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (%s)\n",
-		ocm->index, ocm->memtotal,
-		(ocm->status == OCM_DISABLED) ? "disabled" : "enabled");
-
-	if (ocm->status == OCM_DISABLED)
-		return;
-
-	/* request region */
-
-	if (!request_mem_region(ocm->phys, ocm->memtotal, "ppc4xx_ocm")) {
-		printk(KERN_ERR "PPC4XX OCM%d: could not request region\n",
-			ocm->index);
-		return;
-	}
-
-	/* Configure non-cached and cached regions */
-
-	ocm->nc.phys = ocm->phys;
-	ocm->nc.memtotal = ocm->memtotal - ocm->cache_size;
-	ocm->nc.memfree = ocm->nc.memtotal;
-
-	ocm->c.phys = ocm->phys + ocm->nc.memtotal;
-	ocm->c.memtotal = ocm->cache_size;
-	ocm->c.memfree = ocm->c.memtotal;
-
-	if (ocm->nc.memtotal == 0)
-		ocm->nc.phys = 0;
-
-	if (ocm->c.memtotal == 0)
-		ocm->c.phys = 0;
-
-	printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (non-cached)\n",
-		ocm->index, ocm->nc.memtotal);
-
-	printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (cached)\n",
-		ocm->index, ocm->c.memtotal);
-
-	/* ioremap the non-cached region */
-	if (ocm->nc.memtotal) {
-		ocm->nc.virt = __ioremap(ocm->nc.phys, ocm->nc.memtotal,
-			_PAGE_EXEC | pgprot_val(PAGE_KERNEL_NCG));
-
-		if (!ocm->nc.virt) {
-			printk(KERN_ERR
-			       "PPC4XX OCM%d: failed to ioremap non-cached memory\n",
-			       ocm->index);
-			ocm->nc.memfree = 0;
-			return;
-		}
-	}
-
-	/* ioremap the cached region */
-
-	if (ocm->c.memtotal) {
-		ocm->c.virt = __ioremap(ocm->c.phys, ocm->c.memtotal,
-					_PAGE_EXEC | pgprot_val(PAGE_KERNEL));
-
-		if (!ocm->c.virt) {
-			printk(KERN_ERR
-			       "PPC4XX OCM%d: failed to ioremap cached memory\n",
-			       ocm->index);
-			ocm->c.memfree = 0;
-			return;
-		}
-	}
-
-	/* Create Remote Heaps */
-
-	ocm->alignment = 4; /* default 4 byte alignment */
-
-	if (ocm->nc.virt) {
-		ocm->nc.rh = rh_create(ocm->alignment);
-		rh_attach_region(ocm->nc.rh, 0, ocm->nc.memtotal);
-	}
-
-	if (ocm->c.virt) {
-		ocm->c.rh = rh_create(ocm->alignment);
-		rh_attach_region(ocm->c.rh, 0, ocm->c.memtotal);
-	}
-
-	INIT_LIST_HEAD(&ocm->nc.list);
-	INIT_LIST_HEAD(&ocm->c.list);
-
-	ocm->ready = 1;
-}
-
-static int ocm_debugfs_show(struct seq_file *m, void *v)
-{
-	struct ocm_block *blk, *tmp;
-	unsigned int i;
-
-	for (i = 0; i < ocm_count; i++) {
-		struct ocm_info *ocm = ocm_get_node(i);
-
-		if (!ocm || !ocm->ready)
-			continue;
-
-		seq_printf(m, "PPC4XX OCM   : %d\n", ocm->index);
-		seq_printf(m, "PhysAddr     : %pa\n", &(ocm->phys));
-		seq_printf(m, "MemTotal     : %d Bytes\n", ocm->memtotal);
-		seq_printf(m, "MemTotal(NC) : %d Bytes\n", ocm->nc.memtotal);
-		seq_printf(m, "MemTotal(C)  : %d Bytes\n\n", ocm->c.memtotal);
-
-		seq_printf(m, "NC.PhysAddr  : %pa\n", &(ocm->nc.phys));
-		seq_printf(m, "NC.VirtAddr  : 0x%p\n", ocm->nc.virt);
-		seq_printf(m, "NC.MemTotal  : %d Bytes\n", ocm->nc.memtotal);
-		seq_printf(m, "NC.MemFree   : %d Bytes\n", ocm->nc.memfree);
-
-		list_for_each_entry_safe(blk, tmp, &ocm->nc.list, list) {
-			seq_printf(m, "NC.MemUsed   : %d Bytes (%s)\n",
-							blk->size, blk->owner);
-		}
-
-		seq_printf(m, "\nC.PhysAddr   : %pa\n", &(ocm->c.phys));
-		seq_printf(m, "C.VirtAddr   : 0x%p\n", ocm->c.virt);
-		seq_printf(m, "C.MemTotal   : %d Bytes\n", ocm->c.memtotal);
-		seq_printf(m, "C.MemFree    : %d Bytes\n", ocm->c.memfree);
-
-		list_for_each_entry_safe(blk, tmp, &ocm->c.list, list) {
-			seq_printf(m, "C.MemUsed    : %d Bytes (%s)\n",
-						blk->size, blk->owner);
-		}
-
-		seq_putc(m, '\n');
-	}
-
-	return 0;
-}
-
-static int ocm_debugfs_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ocm_debugfs_show, NULL);
-}
-
-static const struct file_operations ocm_debugfs_fops = {
-	.open = ocm_debugfs_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static int ocm_debugfs_init(void)
-{
-	struct dentry *junk;
-
-	junk = debugfs_create_dir("ppc4xx_ocm", 0);
-	if (!junk) {
-		printk(KERN_ALERT "debugfs ppc4xx ocm: failed to create dir\n");
-		return -1;
-	}
-
-	if (debugfs_create_file("info", 0644, junk, NULL, &ocm_debugfs_fops)) {
-		printk(KERN_ALERT "debugfs ppc4xx ocm: failed to create file\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-void *ppc4xx_ocm_alloc(phys_addr_t *phys, int size, int align,
-			int flags, const char *owner)
-{
-	void __iomem *addr = NULL;
-	unsigned long offset;
-	struct ocm_info *ocm;
-	struct ocm_region *ocm_reg;
-	struct ocm_block *ocm_blk;
-	int i;
-
-	for (i = 0; i < ocm_count; i++) {
-		ocm = ocm_get_node(i);
-
-		if (!ocm || !ocm->ready)
-			continue;
-
-		if (flags == PPC4XX_OCM_NON_CACHED)
-			ocm_reg = &ocm->nc;
-		else
-			ocm_reg = &ocm->c;
-
-		if (!ocm_reg->virt)
-			continue;
-
-		if (align < ocm->alignment)
-			align = ocm->alignment;
-
-		offset = rh_alloc_align(ocm_reg->rh, size, align, NULL);
-
-		if (IS_ERR_VALUE(offset))
-			continue;
-
-		ocm_blk = kzalloc(sizeof(*ocm_blk), GFP_KERNEL);
-		if (!ocm_blk) {
-			rh_free(ocm_reg->rh, offset);
-			break;
-		}
-
-		*phys = ocm_reg->phys + offset;
-		addr = ocm_reg->virt + offset;
-		size = ALIGN(size, align);
-
-		ocm_blk->addr = addr;
-		ocm_blk->size = size;
-		ocm_blk->owner = owner;
-		list_add_tail(&ocm_blk->list, &ocm_reg->list);
-
-		ocm_reg->memfree -= size;
-
-		break;
-	}
-
-	return addr;
-}
-
-void ppc4xx_ocm_free(const void *addr)
-{
-	int i;
-
-	if (!addr)
-		return;
-
-	for (i = 0; i < ocm_count; i++) {
-		struct ocm_info *ocm = ocm_get_node(i);
-
-		if (!ocm || !ocm->ready)
-			continue;
-
-		if (ocm_free_region(&ocm->nc, addr) ||
-			ocm_free_region(&ocm->c, addr))
-			return;
-	}
-}
-
-static int __init ppc4xx_ocm_init(void)
-{
-	struct device_node *np;
-	int count;
-
-	count = 0;
-	for_each_compatible_node(np, NULL, "ibm,ocm")
-		count++;
-
-	if (!count)
-		return 0;
-
-	ocm_nodes = kzalloc((count * sizeof(struct ocm_info)), GFP_KERNEL);
-	if (!ocm_nodes)
-		return -ENOMEM;
-
-	ocm_count = count;
-	count = 0;
-
-	for_each_compatible_node(np, NULL, "ibm,ocm") {
-		ocm_init_node(count, np);
-		count++;
-	}
-
-	ocm_debugfs_init();
-
-	return 0;
-}
-
-arch_initcall(ppc4xx_ocm_init);

From 6f57e6631d7c6ae34a5e811f7f49cb63647a9cab Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 20 Aug 2019 14:07:10 +0000
Subject: [PATCH 084/264] powerpc/ps3: replace __ioremap() by ioremap_prot()

__ioremap() is similar to ioremap_prot(), except that ioremap_prot()
performs a few additional sanity adjustments.

The flags used by PS3 are not affected by those adjustments, so for
PS3 the two functions are equivalent.

At the same time, drop parts of the comment that have been invalid
since commit e58e87adc8bf ("powerpc/mm: Update _PAGE_KERNEL_RO").
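
For reference, a condensed sketch of the sanity adjustments mentioned
above, mirroring the ioremap_prot() helpers visible later in this
series (not the exact PS3 code path):

  pte_t pte = __pte(flags);

  if (pte_write(pte))             /* writeable implies dirty */
          pte = pte_mkdirty(pte);
  pte = pte_exprotect(pte);       /* don't let _PAGE_EXEC leak out */
  pte = pte_mkprivileged(pte);    /* force a kernel mapping */

  return __ioremap_caller(addr, size, pte_pgprot(pte), caller);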

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/36bff5d875ff562889c5e12dab63e5d7c5d1fbd8.1566309262.git.christophe.leroy@c-s.fr
---
 arch/powerpc/platforms/ps3/spu.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c
index bdaeaecdc06b..1193c294b8d0 100644
--- a/arch/powerpc/platforms/ps3/spu.c
+++ b/arch/powerpc/platforms/ps3/spu.c
@@ -184,10 +184,7 @@ static void spu_unmap(struct spu *spu)
  * setup_areas - Map the spu regions into the address space.
  *
  * The current HV requires the spu shadow regs to be mapped with the
- * PTE page protection bits set as read-only (PP=3).  This implementation
- * uses the low level __ioremap() to bypass the page protection settings
- * inforced by ioremap_prot() to get the needed PTE bits set for the
- * shadow regs.
+ * PTE page protection bits set as read-only.
  */
 
 static int __init setup_areas(struct spu *spu)
@@ -195,9 +192,8 @@ static int __init setup_areas(struct spu *spu)
 	struct table {char* name; unsigned long addr; unsigned long size;};
 	unsigned long shadow_flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL_RO));
 
-	spu_pdata(spu)->shadow = __ioremap(spu_pdata(spu)->shadow_addr,
-					   sizeof(struct spe_shadow),
-					   shadow_flags);
+	spu_pdata(spu)->shadow = ioremap_prot(spu_pdata(spu)->shadow_addr,
+					      sizeof(struct spe_shadow), shadow_flags);
 	if (!spu_pdata(spu)->shadow) {
 		pr_debug("%s:%d: ioremap shadow failed\n", __func__, __LINE__);
 		goto fail_ioremap;

From 8aee077292a34eaca717445290f12d23f8bd7732 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 20 Aug 2019 14:07:11 +0000
Subject: [PATCH 085/264] powerpc/mm: drop ppc_md.iounmap() and __iounmap()

ppc_md.iounmap() is never set, so drop it.

Once ppc_md.iounmap() is gone, iounmap() remains the only user of
__iounmap(), and iounmap() does nothing but call __iounmap().
So drop iounmap() and make __iounmap() the new iounmap().

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/d73ba92bb7a387cc58cc34666d7f5158a45851b0.1566309262.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/io.h      |  5 -----
 arch/powerpc/include/asm/machdep.h |  2 --
 arch/powerpc/mm/pgtable_64.c       | 11 +----------
 3 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 23e5d5d16c7e..02d6256fe1ea 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -712,9 +712,6 @@ static inline void iosync(void)
  * * __ioremap_caller is the same as above but takes an explicit caller
  *   reference rather than using __builtin_return_address(0)
  *
- * * __iounmap, is the low level implementation used by iounmap and cannot
- *   be hooked (but can be used by a hook on iounmap)
- *
  */
 extern void __iomem *ioremap(phys_addr_t address, unsigned long size);
 extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size,
@@ -734,8 +731,6 @@ extern void __iomem *__ioremap(phys_addr_t, unsigned long size,
 extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size,
 				      pgprot_t prot, void *caller);
 
-extern void __iounmap(volatile void __iomem *addr);
-
 extern void __iomem * __ioremap_at(phys_addr_t pa, void *ea,
 				   unsigned long size, pgprot_t prot);
 extern void __iounmap_at(void *ea, unsigned long size);
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index c43d6eca9edd..3370df4bdaa0 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -33,8 +33,6 @@ struct machdep_calls {
 #ifdef CONFIG_PPC64
 	void __iomem *	(*ioremap)(phys_addr_t addr, unsigned long size,
 				   pgprot_t prot, void *caller);
-	void		(*iounmap)(volatile void __iomem *token);
-
 #ifdef CONFIG_PM
 	void		(*iommu_save)(void);
 	void		(*iommu_restore)(void);
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 9ad59b733984..57cdd6182932 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -266,7 +266,7 @@ void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
  * Unmap an IO region and remove it from imalloc'd list.
  * Access to IO memory should be serialized by driver.
  */
-void __iounmap(volatile void __iomem *token)
+void iounmap(volatile void __iomem *token)
 {
 	void *addr;
 
@@ -283,21 +283,12 @@ void __iounmap(volatile void __iomem *token)
 	vunmap(addr);
 }
 
-void iounmap(volatile void __iomem *token)
-{
-	if (ppc_md.iounmap)
-		ppc_md.iounmap(token);
-	else
-		__iounmap(token);
-}
-
 EXPORT_SYMBOL(ioremap);
 EXPORT_SYMBOL(ioremap_wc);
 EXPORT_SYMBOL(ioremap_prot);
 EXPORT_SYMBOL(__ioremap);
 EXPORT_SYMBOL(__ioremap_at);
 EXPORT_SYMBOL(iounmap);
-EXPORT_SYMBOL(__iounmap);
 EXPORT_SYMBOL(__iounmap_at);
 
 #ifndef __PAGETABLE_PUD_FOLDED

From 492643e81e58e7585cccb22c37814060e3f59268 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 20 Aug 2019 14:07:12 +0000
Subject: [PATCH 086/264] powerpc/mm: drop function __ioremap()

__ioremap() is not used anymore, so drop it.

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/ccc439f481a0884e00a6be1bab44bab2a4477fea.1566309262.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/io.h |  6 ------
 arch/powerpc/mm/pgtable_32.c  | 11 ++---------
 arch/powerpc/mm/pgtable_64.c  |  7 -------
 3 files changed, 2 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 02d6256fe1ea..8e65ba59f06a 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -705,10 +705,6 @@ static inline void iosync(void)
  *   create hand-made mappings for use only by the PCI code and cannot
  *   currently be hooked. Must be page aligned.
  *
- * * __ioremap is the low level implementation used by ioremap and
- *   ioremap_prot and cannot be hooked (but can be used by a hook on one
- *   of the previous ones)
- *
  * * __ioremap_caller is the same as above but takes an explicit caller
  *   reference rather than using __builtin_return_address(0)
  *
@@ -726,8 +722,6 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size);
 
 extern void iounmap(volatile void __iomem *addr);
 
-extern void __iomem *__ioremap(phys_addr_t, unsigned long size,
-			       unsigned long flags);
 extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size,
 				      pgprot_t prot, void *caller);
 
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 97f401a06fcc..8cc5e9e83fc3 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -91,12 +91,6 @@ ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags)
 }
 EXPORT_SYMBOL(ioremap_prot);
 
-void __iomem *
-__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
-{
-	return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0));
-}
-
 void __iomem *
 __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller)
 {
@@ -127,8 +121,8 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call
 	 */
 	if (slab_is_available() && p <= virt_to_phys(high_memory - 1) &&
 	    page_is_ram(__phys_to_pfn(p))) {
-		printk("__ioremap(): phys addr 0x%llx is RAM lr %ps\n",
-		       (unsigned long long)p, __builtin_return_address(0));
+		pr_warn("%s(): phys addr 0x%llx is RAM lr %ps\n", __func__,
+			(unsigned long long)p, __builtin_return_address(0));
 		return NULL;
 	}
 #endif
@@ -171,7 +165,6 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call
 out:
 	return (void __iomem *) (v + ((unsigned long)addr & ~PAGE_MASK));
 }
-EXPORT_SYMBOL(__ioremap);
 
 void iounmap(volatile void __iomem *addr)
 {
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 57cdd6182932..2882419737b9 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -203,12 +203,6 @@ void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
 	return ret;
 }
 
-void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
-			 unsigned long flags)
-{
-	return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0));
-}
-
 void __iomem * ioremap(phys_addr_t addr, unsigned long size)
 {
 	pgprot_t prot = pgprot_noncached(PAGE_KERNEL);
@@ -286,7 +280,6 @@ void iounmap(volatile void __iomem *token)
 EXPORT_SYMBOL(ioremap);
 EXPORT_SYMBOL(ioremap_wc);
 EXPORT_SYMBOL(ioremap_prot);
-EXPORT_SYMBOL(__ioremap);
 EXPORT_SYMBOL(__ioremap_at);
 EXPORT_SYMBOL(iounmap);
 EXPORT_SYMBOL(__iounmap_at);

From 14b4d97669b79d1ac83e64d6795098394e15ab1b Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 20 Aug 2019 14:07:13 +0000
Subject: [PATCH 087/264] powerpc/mm: rework io-workaround invocation.

ppc_md.ioremap() is only used for the I/O workaround on the CELL
platform, so the indirect function call can be avoided.

This patch reworks the io-workaround and ioremap() functions to use
the global 'io_workaround_inited' flag to activate the io-workaround.

When neither CONFIG_PPC_IO_WORKAROUNDS nor CONFIG_PPC_INDIRECT_MMIO is
selected, the I/O workaround ioremap() path is compiled out and the
global flag is not used.
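
Every ioremap variant then follows the same shape (a sketch mirroring
the hunks below):

  if (iowa_is_active())
          return iowa_ioremap(addr, size, prot, caller);
  return __ioremap_caller(addr, size, prot, caller);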

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/5fa3ef069fbd0f152512afaae19e7a60161454cf.1566309262.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/io-workarounds.h | 20 ++++++++++++++++++++
 arch/powerpc/include/asm/machdep.h        |  2 --
 arch/powerpc/kernel/io-workarounds.c      | 13 +++++--------
 arch/powerpc/mm/pgtable_64.c              | 17 +++++++++--------
 4 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/io-workarounds.h b/arch/powerpc/include/asm/io-workarounds.h
index 01567ea4ceaf..3cce499fbe27 100644
--- a/arch/powerpc/include/asm/io-workarounds.h
+++ b/arch/powerpc/include/asm/io-workarounds.h
@@ -8,6 +8,7 @@
 #ifndef _IO_WORKAROUNDS_H
 #define _IO_WORKAROUNDS_H
 
+#ifdef CONFIG_PPC_IO_WORKAROUNDS
 #include <linux/io.h>
 #include <asm/pci-bridge.h>
 
@@ -32,4 +33,23 @@ extern int spiderpci_iowa_init(struct iowa_bus *, void *);
 #define SPIDER_PCI_DUMMY_READ		0x0810
 #define SPIDER_PCI_DUMMY_READ_BASE	0x0814
 
+#endif
+
+#if defined(CONFIG_PPC_IO_WORKAROUNDS) && defined(CONFIG_PPC_INDIRECT_MMIO)
+extern bool io_workaround_inited;
+
+static inline bool iowa_is_active(void)
+{
+	return unlikely(io_workaround_inited);
+}
+#else
+static inline bool iowa_is_active(void)
+{
+	return false;
+}
+#endif
+
+void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size,
+			   pgprot_t prot, void *caller);
+
 #endif /* _IO_WORKAROUNDS_H */
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 3370df4bdaa0..657ec893bdcb 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -31,8 +31,6 @@ struct pci_host_bridge;
 struct machdep_calls {
 	char		*name;
 #ifdef CONFIG_PPC64
-	void __iomem *	(*ioremap)(phys_addr_t addr, unsigned long size,
-				   pgprot_t prot, void *caller);
 #ifdef CONFIG_PM
 	void		(*iommu_save)(void);
 	void		(*iommu_restore)(void);
diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
index fbd2d0007c52..0276bc8c8969 100644
--- a/arch/powerpc/kernel/io-workarounds.c
+++ b/arch/powerpc/kernel/io-workarounds.c
@@ -149,8 +149,8 @@ static const struct ppc_pci_io iowa_pci_io = {
 };
 
 #ifdef CONFIG_PPC_INDIRECT_MMIO
-static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size,
-				  pgprot_t prot, void *caller)
+void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size,
+			   pgprot_t prot, void *caller)
 {
 	struct iowa_bus *bus;
 	void __iomem *res = __ioremap_caller(addr, size, prot, caller);
@@ -163,20 +163,17 @@ static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size,
 	}
 	return res;
 }
-#else /* CONFIG_PPC_INDIRECT_MMIO */
-#define iowa_ioremap NULL
 #endif /* !CONFIG_PPC_INDIRECT_MMIO */
 
+bool io_workaround_inited;
+
 /* Enable IO workaround */
 static void io_workaround_init(void)
 {
-	static int io_workaround_inited;
-
 	if (io_workaround_inited)
 		return;
 	ppc_pci_io = iowa_pci_io;
-	ppc_md.ioremap = iowa_ioremap;
-	io_workaround_inited = 1;
+	io_workaround_inited = true;
 }
 
 /* Register new bus to support workaround */
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 2882419737b9..0a147daeb0f2 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -35,6 +35,7 @@
 #include <asm/page.h>
 #include <asm/prom.h>
 #include <asm/io.h>
+#include <asm/io-workarounds.h>
 #include <asm/mmu_context.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
@@ -208,8 +209,8 @@ void __iomem * ioremap(phys_addr_t addr, unsigned long size)
 	pgprot_t prot = pgprot_noncached(PAGE_KERNEL);
 	void *caller = __builtin_return_address(0);
 
-	if (ppc_md.ioremap)
-		return ppc_md.ioremap(addr, size, prot, caller);
+	if (iowa_is_active())
+		return iowa_ioremap(addr, size, prot, caller);
 	return __ioremap_caller(addr, size, prot, caller);
 }
 
@@ -218,8 +219,8 @@ void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
 	pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL);
 	void *caller = __builtin_return_address(0);
 
-	if (ppc_md.ioremap)
-		return ppc_md.ioremap(addr, size, prot, caller);
+	if (iowa_is_active())
+		return iowa_ioremap(addr, size, prot, caller);
 	return __ioremap_caller(addr, size, prot, caller);
 }
 
@@ -228,8 +229,8 @@ void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size)
 	pgprot_t prot = pgprot_cached(PAGE_KERNEL);
 	void *caller = __builtin_return_address(0);
 
-	if (ppc_md.ioremap)
-		return ppc_md.ioremap(addr, size, prot, caller);
+	if (iowa_is_active())
+		return iowa_ioremap(addr, size, prot, caller);
 	return __ioremap_caller(addr, size, prot, caller);
 }
 
@@ -250,8 +251,8 @@ void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
 	 */
 	pte = pte_mkprivileged(pte);
 
-	if (ppc_md.ioremap)
-		return ppc_md.ioremap(addr, size, pte_pgprot(pte), caller);
+	if (iowa_is_active())
+		return iowa_ioremap(addr, size, pte_pgprot(pte), caller);
 	return __ioremap_caller(addr, size, pte_pgprot(pte), caller);
 }
 

From 4634c375db7a082b2522621519a5fb6eba977584 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 20 Aug 2019 14:07:14 +0000
Subject: [PATCH 088/264] powerpc/mm: move common 32/64 bits ioremap functions
 into ioremap.c

ioremap(), ioremap_wc() and ioremap_coherent() are now identical on
PPC32 and PPC64, as iowa_is_active() always returns false on PPC32.
Move them into a new common location called ioremap.c.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/6223803ce024d6ab4dfaa919f44098aed5b4bc33.1566309262.git.christophe.leroy@c-s.fr
---
 arch/powerpc/mm/Makefile     |  2 +-
 arch/powerpc/mm/ioremap.c    | 36 ++++++++++++++++++++++++++++++++++++
 arch/powerpc/mm/pgtable_32.c | 27 ---------------------------
 arch/powerpc/mm/pgtable_64.c | 33 ---------------------------------
 4 files changed, 37 insertions(+), 61 deletions(-)
 create mode 100644 arch/powerpc/mm/ioremap.c

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 0f499db315d6..29c682fe9144 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -7,7 +7,7 @@ ccflags-$(CONFIG_PPC64)	:= $(NO_MINIMAL_TOC)
 
 obj-y				:= fault.o mem.o pgtable.o mmap.o \
 				   init_$(BITS).o pgtable_$(BITS).o \
-				   pgtable-frag.o \
+				   pgtable-frag.o ioremap.o \
 				   init-common.o mmu_context.o drmem.o
 obj-$(CONFIG_PPC_MMU_NOHASH)	+= nohash/
 obj-$(CONFIG_PPC_BOOK3S_32)	+= book3s32/
diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
new file mode 100644
index 000000000000..7f1d462e7745
--- /dev/null
+++ b/arch/powerpc/mm/ioremap.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/io.h>
+#include <asm/io-workarounds.h>
+
+void __iomem *ioremap(phys_addr_t addr, unsigned long size)
+{
+	pgprot_t prot = pgprot_noncached(PAGE_KERNEL);
+	void *caller = __builtin_return_address(0);
+
+	if (iowa_is_active())
+		return iowa_ioremap(addr, size, prot, caller);
+	return __ioremap_caller(addr, size, prot, caller);
+}
+EXPORT_SYMBOL(ioremap);
+
+void __iomem *ioremap_wc(phys_addr_t addr, unsigned long size)
+{
+	pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL);
+	void *caller = __builtin_return_address(0);
+
+	if (iowa_is_active())
+		return iowa_ioremap(addr, size, prot, caller);
+	return __ioremap_caller(addr, size, prot, caller);
+}
+EXPORT_SYMBOL(ioremap_wc);
+
+void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size)
+{
+	pgprot_t prot = pgprot_cached(PAGE_KERNEL);
+	void *caller = __builtin_return_address(0);
+
+	if (iowa_is_active())
+		return iowa_ioremap(addr, size, prot, caller);
+	return __ioremap_caller(addr, size, prot, caller);
+}
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 8cc5e9e83fc3..a363d87d0a9d 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -38,24 +38,6 @@ EXPORT_SYMBOL(ioremap_bot);	/* aka VMALLOC_END */
 
 extern char etext[], _stext[], _sinittext[], _einittext[];
 
-void __iomem *
-ioremap(phys_addr_t addr, unsigned long size)
-{
-	pgprot_t prot = pgprot_noncached(PAGE_KERNEL);
-
-	return __ioremap_caller(addr, size, prot, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(ioremap);
-
-void __iomem *
-ioremap_wc(phys_addr_t addr, unsigned long size)
-{
-	pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL);
-
-	return __ioremap_caller(addr, size, prot, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(ioremap_wc);
-
 void __iomem *
 ioremap_wt(phys_addr_t addr, unsigned long size)
 {
@@ -65,15 +47,6 @@ ioremap_wt(phys_addr_t addr, unsigned long size)
 }
 EXPORT_SYMBOL(ioremap_wt);
 
-void __iomem *
-ioremap_coherent(phys_addr_t addr, unsigned long size)
-{
-	pgprot_t prot = pgprot_cached(PAGE_KERNEL);
-
-	return __ioremap_caller(addr, size, prot, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(ioremap_coherent);
-
 void __iomem *
 ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags)
 {
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 0a147daeb0f2..358233ea8d85 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -35,7 +35,6 @@
 #include <asm/page.h>
 #include <asm/prom.h>
 #include <asm/io.h>
-#include <asm/io-workarounds.h>
 #include <asm/mmu_context.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
@@ -204,36 +203,6 @@ void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
 	return ret;
 }
 
-void __iomem * ioremap(phys_addr_t addr, unsigned long size)
-{
-	pgprot_t prot = pgprot_noncached(PAGE_KERNEL);
-	void *caller = __builtin_return_address(0);
-
-	if (iowa_is_active())
-		return iowa_ioremap(addr, size, prot, caller);
-	return __ioremap_caller(addr, size, prot, caller);
-}
-
-void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
-{
-	pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL);
-	void *caller = __builtin_return_address(0);
-
-	if (iowa_is_active())
-		return iowa_ioremap(addr, size, prot, caller);
-	return __ioremap_caller(addr, size, prot, caller);
-}
-
-void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size)
-{
-	pgprot_t prot = pgprot_cached(PAGE_KERNEL);
-	void *caller = __builtin_return_address(0);
-
-	if (iowa_is_active())
-		return iowa_ioremap(addr, size, prot, caller);
-	return __ioremap_caller(addr, size, prot, caller);
-}
-
 void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
 			     unsigned long flags)
 {
@@ -278,8 +247,6 @@ void iounmap(volatile void __iomem *token)
 	vunmap(addr);
 }
 
-EXPORT_SYMBOL(ioremap);
-EXPORT_SYMBOL(ioremap_wc);
 EXPORT_SYMBOL(ioremap_prot);
 EXPORT_SYMBOL(__ioremap_at);
 EXPORT_SYMBOL(iounmap);

From edfe1a5679263827ff94eb478c6ee0d65d467adf Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 20 Aug 2019 14:07:15 +0000
Subject: [PATCH 089/264] powerpc/mm: move ioremap_prot() into ioremap.c

Both versions of ioremap_prot() are identical; move them into ioremap.c.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/0b3eb0e0f1490a99fd6c983e166fb8946233f151.1566309262.git.christophe.leroy@c-s.fr
---
 arch/powerpc/mm/ioremap.c    | 19 +++++++++++++++++++
 arch/powerpc/mm/pgtable_32.c | 17 -----------------
 arch/powerpc/mm/pgtable_64.c | 24 ------------------------
 3 files changed, 19 insertions(+), 41 deletions(-)

diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
index 7f1d462e7745..3d388aaa3ee6 100644
--- a/arch/powerpc/mm/ioremap.c
+++ b/arch/powerpc/mm/ioremap.c
@@ -34,3 +34,22 @@ void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size)
 		return iowa_ioremap(addr, size, prot, caller);
 	return __ioremap_caller(addr, size, prot, caller);
 }
+
+void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags)
+{
+	pte_t pte = __pte(flags);
+	void *caller = __builtin_return_address(0);
+
+	/* writeable implies dirty for kernel addresses */
+	if (pte_write(pte))
+		pte = pte_mkdirty(pte);
+
+	/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
+	pte = pte_exprotect(pte);
+	pte = pte_mkprivileged(pte);
+
+	if (iowa_is_active())
+		return iowa_ioremap(addr, size, pte_pgprot(pte), caller);
+	return __ioremap_caller(addr, size, pte_pgprot(pte), caller);
+}
+EXPORT_SYMBOL(ioremap_prot);
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index a363d87d0a9d..ac5a267467fa 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -47,23 +47,6 @@ ioremap_wt(phys_addr_t addr, unsigned long size)
 }
 EXPORT_SYMBOL(ioremap_wt);
 
-void __iomem *
-ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags)
-{
-	pte_t pte = __pte(flags);
-
-	/* writeable implies dirty for kernel addresses */
-	if (pte_write(pte))
-		pte = pte_mkdirty(pte);
-
-	/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
-	pte = pte_exprotect(pte);
-	pte = pte_mkprivileged(pte);
-
-	return __ioremap_caller(addr, size, pte_pgprot(pte), __builtin_return_address(0));
-}
-EXPORT_SYMBOL(ioremap_prot);
-
 void __iomem *
 __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller)
 {
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 358233ea8d85..2b9078e1bc43 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -203,29 +203,6 @@ void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
 	return ret;
 }
 
-void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
-			     unsigned long flags)
-{
-	pte_t pte = __pte(flags);
-	void *caller = __builtin_return_address(0);
-
-	/* writeable implies dirty for kernel addresses */
-	if (pte_write(pte))
-		pte = pte_mkdirty(pte);
-
-	/* we don't want to let _PAGE_EXEC leak out */
-	pte = pte_exprotect(pte);
-	/*
-	 * Force kernel mapping.
-	 */
-	pte = pte_mkprivileged(pte);
-
-	if (iowa_is_active())
-		return iowa_ioremap(addr, size, pte_pgprot(pte), caller);
-	return __ioremap_caller(addr, size, pte_pgprot(pte), caller);
-}
-
-
 /*  
  * Unmap an IO region and remove it from imalloc'd list.
  * Access to IO memory should be serialized by driver.
@@ -247,7 +224,6 @@ void iounmap(volatile void __iomem *token)
 	vunmap(addr);
 }
 
-EXPORT_SYMBOL(ioremap_prot);
 EXPORT_SYMBOL(__ioremap_at);
 EXPORT_SYMBOL(iounmap);
 EXPORT_SYMBOL(__iounmap_at);

From 7cd9b317b630683b0c8eb2dfcfb046003ad6b97b Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 20 Aug 2019 14:07:16 +0000
Subject: [PATCH 090/264] powerpc/mm: make ioremap_bot common to all

Drop multiple definitions of ioremap_bot and make one common to
all subarches.

Only CONFIG_PPC_BOOK3E_64 had a global static init value for
ioremap_bot. Now ioremap_bot is set in early_init_mmu_global().

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/920eebfd9f36f14c79d1755847f5bf7c83703bdd.1566309262.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/book3s/32/pgtable.h | 2 --
 arch/powerpc/include/asm/book3s/64/pgtable.h | 1 -
 arch/powerpc/include/asm/nohash/32/pgtable.h | 2 --
 arch/powerpc/include/asm/pgtable.h           | 2 ++
 arch/powerpc/mm/ioremap.c                    | 3 +++
 arch/powerpc/mm/mmu_decl.h                   | 1 -
 arch/powerpc/mm/nohash/tlb.c                 | 2 ++
 arch/powerpc/mm/pgtable_32.c                 | 3 ---
 arch/powerpc/mm/pgtable_64.c                 | 3 ---
 9 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 838de59f6754..aa1bc5f8da90 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -201,8 +201,6 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
 #include <linux/sched.h>
 #include <linux/threads.h>
 
-extern unsigned long ioremap_bot;
-
 /* Bits to mask out from a PGD to get to the PUD page */
 #define PGD_MASKED_BITS		0
 
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 8e47fb85dfa6..03c9a14dd902 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -289,7 +289,6 @@ extern unsigned long __kernel_io_end;
 #define KERN_IO_END __kernel_io_end
 
 extern struct page *vmemmap;
-extern unsigned long ioremap_bot;
 extern unsigned long pci_io_base;
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 0284f8f5305f..7ce2a7c9fade 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -11,8 +11,6 @@
 #include <asm/mmu.h>			/* For sub-arch specific PPC_PIN_SIZE */
 #include <asm/asm-405.h>
 
-extern unsigned long ioremap_bot;
-
 #ifdef CONFIG_44x
 extern int icache_44x_need_flush;
 #endif
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index c70916a7865a..8b7865a2d576 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -68,6 +68,8 @@ extern pgd_t swapper_pg_dir[];
 
 extern void paging_init(void);
 
+extern unsigned long ioremap_bot;
+
 /*
  * kern_addr_valid is intended to indicate whether an address is a valid
  * kernel address.  Most 32-bit archs define it as always true (like this)
diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
index 3d388aaa3ee6..eaf5f8a4a63f 100644
--- a/arch/powerpc/mm/ioremap.c
+++ b/arch/powerpc/mm/ioremap.c
@@ -3,6 +3,9 @@
 #include <linux/io.h>
 #include <asm/io-workarounds.h>
 
+unsigned long ioremap_bot;
+EXPORT_SYMBOL(ioremap_bot);
+
 void __iomem *ioremap(phys_addr_t addr, unsigned long size)
 {
 	pgprot_t prot = pgprot_noncached(PAGE_KERNEL);
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index adbaf2167214..c750ac9ec713 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -106,7 +106,6 @@ extern u8 early_hash[];
 
 #endif /* CONFIG_PPC32 */
 
-extern unsigned long ioremap_bot;
 extern unsigned long __max_low_memory;
 extern phys_addr_t __initial_memory_limit_addr;
 extern phys_addr_t total_memory;
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index bf60983a58c7..696f568253a0 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -703,6 +703,8 @@ static void __init early_init_mmu_global(void)
 	 * for use by the TLB miss code
 	 */
 	linear_map_top = memblock_end_of_DRAM();
+
+	ioremap_bot = IOREMAP_BASE;
 }
 
 static void __init early_mmu_set_memory_limit(void)
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index ac5a267467fa..102901a19f3c 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -33,9 +33,6 @@
 
 #include <mm/mmu_decl.h>
 
-unsigned long ioremap_bot;
-EXPORT_SYMBOL(ioremap_bot);	/* aka VMALLOC_END */
-
 extern char etext[], _stext[], _sinittext[], _einittext[];
 
 void __iomem *
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 2b9078e1bc43..d865e053052d 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -98,9 +98,6 @@ unsigned long __pte_frag_nr;
 EXPORT_SYMBOL(__pte_frag_nr);
 unsigned long __pte_frag_size_shift;
 EXPORT_SYMBOL(__pte_frag_size_shift);
-unsigned long ioremap_bot;
-#else /* !CONFIG_PPC_BOOK3S_64 */
-unsigned long ioremap_bot = IOREMAP_BASE;
 #endif
 
 int __weak ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot, int nid)

From f381d5711f091facd8847a54a2377cc0d1519df2 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 20 Aug 2019 14:07:17 +0000
Subject: [PATCH 091/264] powerpc/mm: Move ioremap functions out of
 pgtable_32/64.c

Create ioremap_32.c and ioremap_64.c and move the respective ioremap
functions out of pgtable_32.c and pgtable_64.c.

While at it, fix a few comments, change a printk() to pr_warn(), and
rejoin a few needlessly split lines.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/b5c8b02ccefd4ede64c61b53cf64fb5dacb35740.1566309263.git.christophe.leroy@c-s.fr
---
 arch/powerpc/mm/Makefile     |   2 +-
 arch/powerpc/mm/ioremap_32.c | 104 ++++++++++++++++++++++++++++
 arch/powerpc/mm/ioremap_64.c | 123 +++++++++++++++++++++++++++++++++
 arch/powerpc/mm/pgtable_32.c |  99 ---------------------------
 arch/powerpc/mm/pgtable_64.c | 128 +----------------------------------
 5 files changed, 229 insertions(+), 227 deletions(-)
 create mode 100644 arch/powerpc/mm/ioremap_32.c
 create mode 100644 arch/powerpc/mm/ioremap_64.c

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 29c682fe9144..5e147986400d 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -7,7 +7,7 @@ ccflags-$(CONFIG_PPC64)	:= $(NO_MINIMAL_TOC)
 
 obj-y				:= fault.o mem.o pgtable.o mmap.o \
 				   init_$(BITS).o pgtable_$(BITS).o \
-				   pgtable-frag.o ioremap.o \
+				   pgtable-frag.o ioremap.o ioremap_$(BITS).o \
 				   init-common.o mmu_context.o drmem.o
 obj-$(CONFIG_PPC_MMU_NOHASH)	+= nohash/
 obj-$(CONFIG_PPC_BOOK3S_32)	+= book3s32/
diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c
new file mode 100644
index 000000000000..fb43ba71aa54
--- /dev/null
+++ b/arch/powerpc/mm/ioremap_32.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <mm/mmu_decl.h>
+
+void __iomem *ioremap_wt(phys_addr_t addr, unsigned long size)
+{
+	pgprot_t prot = pgprot_cached_wthru(PAGE_KERNEL);
+
+	return __ioremap_caller(addr, size, prot, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_wt);
+
+void __iomem *
+__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller)
+{
+	unsigned long v, i;
+	phys_addr_t p;
+	int err;
+
+	/*
+	 * Choose an address to map it to.
+	 * Once the vmalloc system is running, we use it.
+	 * Before then, we use space going down from IOREMAP_TOP
+	 * (ioremap_bot records where we're up to).
+	 */
+	p = addr & PAGE_MASK;
+	size = PAGE_ALIGN(addr + size) - p;
+
+	/*
+	 * If the address lies within the first 16 MB, assume it's in ISA
+	 * memory space
+	 */
+	if (p < 16 * 1024 * 1024)
+		p += _ISA_MEM_BASE;
+
+#ifndef CONFIG_CRASH_DUMP
+	/*
+	 * Don't allow anybody to remap normal RAM that we're using.
+	 * mem_init() sets high_memory so only do the check after that.
+	 */
+	if (slab_is_available() && p <= virt_to_phys(high_memory - 1) &&
+	    page_is_ram(__phys_to_pfn(p))) {
+		pr_warn("%s(): phys addr 0x%llx is RAM lr %ps\n", __func__,
+			(unsigned long long)p, __builtin_return_address(0));
+		return NULL;
+	}
+#endif
+
+	if (size == 0)
+		return NULL;
+
+	/*
+	 * Is it already mapped?  Perhaps overlapped by a previous
+	 * mapping.
+	 */
+	v = p_block_mapped(p);
+	if (v)
+		goto out;
+
+	if (slab_is_available()) {
+		struct vm_struct *area;
+		area = get_vm_area_caller(size, VM_IOREMAP, caller);
+		if (area == 0)
+			return NULL;
+		area->phys_addr = p;
+		v = (unsigned long)area->addr;
+	} else {
+		v = (ioremap_bot -= size);
+	}
+
+	/*
+	 * Should check if it is a candidate for a BAT mapping
+	 */
+
+	err = 0;
+	for (i = 0; i < size && err == 0; i += PAGE_SIZE)
+		err = map_kernel_page(v + i, p + i, prot);
+	if (err) {
+		if (slab_is_available())
+			vunmap((void *)v);
+		return NULL;
+	}
+
+out:
+	return (void __iomem *)(v + ((unsigned long)addr & ~PAGE_MASK));
+}
+
+void iounmap(volatile void __iomem *addr)
+{
+	/*
+	 * If mapped by BATs then there is nothing to do.
+	 * Calling vfree() generates a benign warning.
+	 */
+	if (v_block_mapped((unsigned long)addr))
+		return;
+
+	if (addr > high_memory && (unsigned long)addr < ioremap_bot)
+		vunmap((void *)(PAGE_MASK & (unsigned long)addr));
+}
+EXPORT_SYMBOL(iounmap);
diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c
new file mode 100644
index 000000000000..57f3b096143c
--- /dev/null
+++ b/arch/powerpc/mm/ioremap_64.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+int __weak ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size,
+			 pgprot_t prot, int nid)
+{
+	unsigned long i;
+
+	for (i = 0; i < size; i += PAGE_SIZE) {
+		int err = map_kernel_page(ea + i, pa + i, prot);
+		if (err) {
+			if (slab_is_available())
+				unmap_kernel_range(ea, size);
+			else
+				WARN_ON_ONCE(1); /* Should clean up */
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * Low level function to establish the page tables for an IO mapping
+ */
+void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot)
+{
+	/* We don't support the 4K PFN hack with ioremap */
+	if (pgprot_val(prot) & H_PAGE_4K_PFN)
+		return NULL;
+
+	if ((ea + size) >= (void *)IOREMAP_END) {
+		pr_warn("Outside the supported range\n");
+		return NULL;
+	}
+
+	WARN_ON(pa & ~PAGE_MASK);
+	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
+	WARN_ON(size & ~PAGE_MASK);
+
+	if (ioremap_range((unsigned long)ea, pa, size, prot, NUMA_NO_NODE))
+		return NULL;
+
+	return (void __iomem *)ea;
+}
+EXPORT_SYMBOL(__ioremap_at);
+
+/**
+ * Low level function to tear down the page tables for an IO mapping. This is
+ * used for mappings that are manipulated manually, like partial unmapping of
+ * PCI IOs or ISA space.
+ */
+void __iounmap_at(void *ea, unsigned long size)
+{
+	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
+	WARN_ON(size & ~PAGE_MASK);
+
+	unmap_kernel_range((unsigned long)ea, size);
+}
+EXPORT_SYMBOL(__iounmap_at);
+
+void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size,
+			       pgprot_t prot, void *caller)
+{
+	phys_addr_t paligned;
+	void __iomem *ret;
+
+	/*
+	 * Choose an address to map it to. Once the vmalloc system is running,
+	 * we use it. Before that, we map using addresses going up from
+	 * ioremap_bot.  vmalloc will use the addresses from IOREMAP_BASE
+	 * through ioremap_bot.
+	 */
+	paligned = addr & PAGE_MASK;
+	size = PAGE_ALIGN(addr + size) - paligned;
+
+	if (size == 0 || paligned == 0)
+		return NULL;
+
+	if (slab_is_available()) {
+		struct vm_struct *area;
+
+		area = __get_vm_area_caller(size, VM_IOREMAP, ioremap_bot,
+					    IOREMAP_END, caller);
+		if (area == NULL)
+			return NULL;
+
+		area->phys_addr = paligned;
+		ret = __ioremap_at(paligned, area->addr, size, prot);
+	} else {
+		ret = __ioremap_at(paligned, (void *)ioremap_bot, size, prot);
+		if (ret)
+			ioremap_bot += size;
+	}
+
+	if (ret)
+		ret += addr & ~PAGE_MASK;
+	return ret;
+}
+
+/*
+ * Unmap an IO region and remove it from vmalloc'd list.
+ * Access to IO memory should be serialized by driver.
+ */
+void iounmap(volatile void __iomem *token)
+{
+	void *addr;
+
+	if (!slab_is_available())
+		return;
+
+	addr = (void *)((unsigned long __force)PCI_FIX_ADDR(token) & PAGE_MASK);
+
+	if ((unsigned long)addr < ioremap_bot) {
+		pr_warn("Attempt to iounmap early bolted mapping at 0x%p\n", addr);
+		return;
+	}
+	vunmap(addr);
+}
+EXPORT_SYMBOL(iounmap);
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 102901a19f3c..8ec5dfb65b2e 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -27,7 +27,6 @@
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/fixmap.h>
-#include <asm/io.h>
 #include <asm/setup.h>
 #include <asm/sections.h>
 
@@ -35,104 +34,6 @@
 
 extern char etext[], _stext[], _sinittext[], _einittext[];
 
-void __iomem *
-ioremap_wt(phys_addr_t addr, unsigned long size)
-{
-	pgprot_t prot = pgprot_cached_wthru(PAGE_KERNEL);
-
-	return __ioremap_caller(addr, size, prot, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(ioremap_wt);
-
-void __iomem *
-__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller)
-{
-	unsigned long v, i;
-	phys_addr_t p;
-	int err;
-
-	/*
-	 * Choose an address to map it to.
-	 * Once the vmalloc system is running, we use it.
-	 * Before then, we use space going down from IOREMAP_TOP
-	 * (ioremap_bot records where we're up to).
-	 */
-	p = addr & PAGE_MASK;
-	size = PAGE_ALIGN(addr + size) - p;
-
-	/*
-	 * If the address lies within the first 16 MB, assume it's in ISA
-	 * memory space
-	 */
-	if (p < 16*1024*1024)
-		p += _ISA_MEM_BASE;
-
-#ifndef CONFIG_CRASH_DUMP
-	/*
-	 * Don't allow anybody to remap normal RAM that we're using.
-	 * mem_init() sets high_memory so only do the check after that.
-	 */
-	if (slab_is_available() && p <= virt_to_phys(high_memory - 1) &&
-	    page_is_ram(__phys_to_pfn(p))) {
-		pr_warn("%s(): phys addr 0x%llx is RAM lr %ps\n", __func__,
-			(unsigned long long)p, __builtin_return_address(0));
-		return NULL;
-	}
-#endif
-
-	if (size == 0)
-		return NULL;
-
-	/*
-	 * Is it already mapped?  Perhaps overlapped by a previous
-	 * mapping.
-	 */
-	v = p_block_mapped(p);
-	if (v)
-		goto out;
-
-	if (slab_is_available()) {
-		struct vm_struct *area;
-		area = get_vm_area_caller(size, VM_IOREMAP, caller);
-		if (area == 0)
-			return NULL;
-		area->phys_addr = p;
-		v = (unsigned long) area->addr;
-	} else {
-		v = (ioremap_bot -= size);
-	}
-
-	/*
-	 * Should check if it is a candidate for a BAT mapping
-	 */
-
-	err = 0;
-	for (i = 0; i < size && err == 0; i += PAGE_SIZE)
-		err = map_kernel_page(v + i, p + i, prot);
-	if (err) {
-		if (slab_is_available())
-			vunmap((void *)v);
-		return NULL;
-	}
-
-out:
-	return (void __iomem *) (v + ((unsigned long)addr & ~PAGE_MASK));
-}
-
-void iounmap(volatile void __iomem *addr)
-{
-	/*
-	 * If mapped by BATs then there is nothing to do.
-	 * Calling vfree() generates a benign warning.
-	 */
-	if (v_block_mapped((unsigned long)addr))
-		return;
-
-	if (addr > high_memory && (unsigned long) addr < ioremap_bot)
-		vunmap((void *) (PAGE_MASK & (unsigned long)addr));
-}
-EXPORT_SYMBOL(iounmap);
-
 static void __init *early_alloc_pgtable(unsigned long size)
 {
 	void *ptr = memblock_alloc(size, size);
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index d865e053052d..e78832dce7bb 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
- *  This file contains ioremap and related functions for 64-bit machines.
+ *  This file contains pgtable related functions for 64-bit machines.
  *
  *  Derived from arch/ppc64/mm/init.c
  *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -34,7 +34,6 @@
 #include <asm/pgalloc.h>
 #include <asm/page.h>
 #include <asm/prom.h>
-#include <asm/io.h>
 #include <asm/mmu_context.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
@@ -100,131 +99,6 @@ unsigned long __pte_frag_size_shift;
 EXPORT_SYMBOL(__pte_frag_size_shift);
 #endif
 
-int __weak ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot, int nid)
-{
-	unsigned long i;
-
-	for (i = 0; i < size; i += PAGE_SIZE) {
-		int err = map_kernel_page(ea + i, pa + i, prot);
-		if (err) {
-			if (slab_is_available())
-				unmap_kernel_range(ea, size);
-			else
-				WARN_ON_ONCE(1); /* Should clean up */
-			return err;
-		}
-	}
-
-	return 0;
-}
-
-/**
- * __ioremap_at - Low level function to establish the page tables
- *                for an IO mapping
- */
-void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot)
-{
-	/* We don't support the 4K PFN hack with ioremap */
-	if (pgprot_val(prot) & H_PAGE_4K_PFN)
-		return NULL;
-
-	if ((ea + size) >= (void *)IOREMAP_END) {
-		pr_warn("Outside the supported range\n");
-		return NULL;
-	}
-
-	WARN_ON(pa & ~PAGE_MASK);
-	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
-	WARN_ON(size & ~PAGE_MASK);
-
-	if (ioremap_range((unsigned long)ea, pa, size, prot, NUMA_NO_NODE))
-		return NULL;
-
-	return (void __iomem *)ea;
-}
-
-/**
- * __iounmap_from - Low level function to tear down the page tables
- *                  for an IO mapping. This is used for mappings that
- *                  are manipulated manually, like partial unmapping of
- *                  PCI IOs or ISA space.
- */
-void __iounmap_at(void *ea, unsigned long size)
-{
-	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
-	WARN_ON(size & ~PAGE_MASK);
-
-	unmap_kernel_range((unsigned long)ea, size);
-}
-
-void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
-				pgprot_t prot, void *caller)
-{
-	phys_addr_t paligned;
-	void __iomem *ret;
-
-	/*
-	 * Choose an address to map it to.
-	 * Once the imalloc system is running, we use it.
-	 * Before that, we map using addresses going
-	 * up from ioremap_bot.  imalloc will use
-	 * the addresses from ioremap_bot through
-	 * IMALLOC_END
-	 * 
-	 */
-	paligned = addr & PAGE_MASK;
-	size = PAGE_ALIGN(addr + size) - paligned;
-
-	if ((size == 0) || (paligned == 0))
-		return NULL;
-
-	if (slab_is_available()) {
-		struct vm_struct *area;
-
-		area = __get_vm_area_caller(size, VM_IOREMAP,
-					    ioremap_bot, IOREMAP_END,
-					    caller);
-		if (area == NULL)
-			return NULL;
-
-		area->phys_addr = paligned;
-		ret = __ioremap_at(paligned, area->addr, size, prot);
-	} else {
-		ret = __ioremap_at(paligned, (void *)ioremap_bot, size, prot);
-		if (ret)
-			ioremap_bot += size;
-	}
-
-	if (ret)
-		ret += addr & ~PAGE_MASK;
-	return ret;
-}
-
-/*  
- * Unmap an IO region and remove it from imalloc'd list.
- * Access to IO memory should be serialized by driver.
- */
-void iounmap(volatile void __iomem *token)
-{
-	void *addr;
-
-	if (!slab_is_available())
-		return;
-	
-	addr = (void *) ((unsigned long __force)
-			 PCI_FIX_ADDR(token) & PAGE_MASK);
-	if ((unsigned long)addr < ioremap_bot) {
-		printk(KERN_WARNING "Attempt to iounmap early bolted mapping"
-		       " at 0x%p\n", addr);
-		return;
-	}
-	vunmap(addr);
-}
-
-EXPORT_SYMBOL(__ioremap_at);
-EXPORT_SYMBOL(iounmap);
-EXPORT_SYMBOL(__iounmap_at);
-
 #ifndef __PAGETABLE_PUD_FOLDED
 /* 4 level page table */
 struct page *pgd_page(pgd_t pgd)

From 191e42063a7241e5c3a1d1f36896a20b147517e9 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 20 Aug 2019 14:07:18 +0000
Subject: [PATCH 092/264] powerpc/mm: refactor ioremap_range() and use
 ioremap_page_range()

book3s64's ioremap_range() is almost the same as the fallback
ioremap_range(), except that it calls radix__ioremap_range() when radix
is enabled.

radix__ioremap_range() is also very similar to the others, except that
it calls ioremap_page_range() when slab is available.

PPC32's __ioremap_caller() has a loop doing the same thing as
ioremap_range(), so use it on PPC32 as well.

Let's keep only one version of ioremap_range(), which calls
ioremap_page_range() on all platforms when slab is available.

At the same time, drop the nid parameter, which is unused.
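
As an illustration (this snippet is not part of the patch; the MMIO
address is made up), a plain driver-level ioremap() now funnels into
this single helper once slab is up:

	void __iomem *regs = ioremap(0xfe000000, 0x1000);
	u32 val;

	if (regs) {
		val = readl(regs);	/* goes through the new mapping */
		iounmap(regs);
	}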

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/4b1dca7096b01823b101be7338983578641547f1.1566309263.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/book3s/64/radix.h |  3 ---
 arch/powerpc/include/asm/io.h              |  2 ++
 arch/powerpc/mm/book3s64/pgtable.c         | 21 -------------------
 arch/powerpc/mm/book3s64/radix_pgtable.c   | 20 ------------------
 arch/powerpc/mm/ioremap.c                  | 24 ++++++++++++++++++++++
 arch/powerpc/mm/ioremap_32.c               |  6 ++----
 arch/powerpc/mm/ioremap_64.c               | 21 +------------------
 7 files changed, 29 insertions(+), 68 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index e04a839cb5b9..574eca33f893 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -266,9 +266,6 @@ extern void radix__vmemmap_remove_mapping(unsigned long start,
 extern int radix__map_kernel_page(unsigned long ea, unsigned long pa,
 				 pgprot_t flags, unsigned int psz);
 
-extern int radix__ioremap_range(unsigned long ea, phys_addr_t pa,
-				unsigned long size, pgprot_t prot, int nid);
-
 static inline unsigned long radix__get_tree_size(void)
 {
 	unsigned long rts_field;
diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 8e65ba59f06a..8e00d95f9600 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -722,6 +722,8 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size);
 
 extern void iounmap(volatile void __iomem *addr);
 
+int ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot);
+
 extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size,
 				      pgprot_t prot, void *caller);
 
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 7d0e0d0d22c4..4c8bed856533 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -446,24 +446,3 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
 
 	return true;
 }
-
-int ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot, int nid)
-{
-	unsigned long i;
-
-	if (radix_enabled())
-		return radix__ioremap_range(ea, pa, size, prot, nid);
-
-	for (i = 0; i < size; i += PAGE_SIZE) {
-		int err = map_kernel_page(ea + i, pa + i, prot);
-		if (err) {
-			if (slab_is_available())
-				unmap_kernel_range(ea, size);
-			else
-				WARN_ON_ONCE(1); /* Should clean up */
-			return err;
-		}
-	}
-
-	return 0;
-}
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 2204d8eeb784..5bab67fe96e0 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1218,26 +1218,6 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 	return 1;
 }
 
-int radix__ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size,
-			pgprot_t prot, int nid)
-{
-	if (likely(slab_is_available())) {
-		int err = ioremap_page_range(ea, ea + size, pa, prot);
-		if (err)
-			unmap_kernel_range(ea, size);
-		return err;
-	} else {
-		unsigned long i;
-
-		for (i = 0; i < size; i += PAGE_SIZE) {
-			int err = map_kernel_page(ea + i, pa + i, prot);
-			if (WARN_ON_ONCE(err)) /* Should clean up */
-				return err;
-		}
-		return 0;
-	}
-}
-
 int __init arch_ioremap_p4d_supported(void)
 {
 	return 0;
diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
index eaf5f8a4a63f..50ee6544d0b7 100644
--- a/arch/powerpc/mm/ioremap.c
+++ b/arch/powerpc/mm/ioremap.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
 #include <asm/io-workarounds.h>
 
 unsigned long ioremap_bot;
@@ -56,3 +58,25 @@ void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long f
 	return __ioremap_caller(addr, size, pte_pgprot(pte), caller);
 }
 EXPORT_SYMBOL(ioremap_prot);
+
+int ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot)
+{
+	unsigned long i;
+
+	if (slab_is_available()) {
+		int err = ioremap_page_range(ea, ea + size, pa, prot);
+
+		if (err)
+			unmap_kernel_range(ea, size);
+		return err;
+	}
+
+	for (i = 0; i < size; i += PAGE_SIZE) {
+		int err = map_kernel_page(ea + i, pa + i, prot);
+
+		if (WARN_ON_ONCE(err))  /* Should clean up */
+			return err;
+	}
+
+	return 0;
+}
diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c
index fb43ba71aa54..85b90a62e084 100644
--- a/arch/powerpc/mm/ioremap_32.c
+++ b/arch/powerpc/mm/ioremap_32.c
@@ -17,7 +17,7 @@ EXPORT_SYMBOL(ioremap_wt);
 void __iomem *
 __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller)
 {
-	unsigned long v, i;
+	unsigned long v;
 	phys_addr_t p;
 	int err;
 
@@ -76,9 +76,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call
 	 * Should check if it is a candidate for a BAT mapping
 	 */
 
-	err = 0;
-	for (i = 0; i < size && err == 0; i += PAGE_SIZE)
-		err = map_kernel_page(v + i, p + i, prot);
+	err = ioremap_range((unsigned long)v, p, size, prot);
 	if (err) {
 		if (slab_is_available())
 			vunmap((void *)v);
diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c
index 57f3b096143c..d132ce1e538d 100644
--- a/arch/powerpc/mm/ioremap_64.c
+++ b/arch/powerpc/mm/ioremap_64.c
@@ -4,25 +4,6 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 
-int __weak ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size,
-			 pgprot_t prot, int nid)
-{
-	unsigned long i;
-
-	for (i = 0; i < size; i += PAGE_SIZE) {
-		int err = map_kernel_page(ea + i, pa + i, prot);
-		if (err) {
-			if (slab_is_available())
-				unmap_kernel_range(ea, size);
-			else
-				WARN_ON_ONCE(1); /* Should clean up */
-			return err;
-		}
-	}
-
-	return 0;
-}
-
 /**
  * Low level function to establish the page tables for an IO mapping
  */
@@ -41,7 +22,7 @@ void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_
 	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
 	WARN_ON(size & ~PAGE_MASK);
 
-	if (ioremap_range((unsigned long)ea, pa, size, prot, NUMA_NO_NODE))
+	if (ioremap_range((unsigned long)ea, pa, size, prot))
 		return NULL;
 
 	return (void __iomem *)ea;

From 4a45b7460cf458012a6930f675e141256b81dcf4 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 20 Aug 2019 14:07:19 +0000
Subject: [PATCH 093/264] powerpc/mm: refactor ioremap vm area setup.

PPC32 and PPC64 do the same thing once SLAB is available.
Create a do_ioremap() function that calls get_vm_area and
does the mapping.

For PPC64, we add the 4K PFN hack sanity check to __ioremap_caller()
in order to avoid using __ioremap_at(). Other checks in __ioremap_at()
are irrelevant for __ioremap_caller().

On PPC64, the VM area is allocated in the range [ioremap_bot; IOREMAP_END].
On PPC32, the VM area is allocated in the range [VMALLOC_START; VMALLOC_END].

Let's define IOREMAP_START as ioremap_bot on PPC64, and alias
IOREMAP_START/END to VMALLOC_START/END on PPC32.
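
For reference, both subarches now share the same slab-time sequence
(condensed from the do_ioremap() hunk below, not new code):

	area = __get_vm_area_caller(size, VM_IOREMAP,
				    IOREMAP_START, IOREMAP_END, caller);
	area->phys_addr = pa;
	ret = ioremap_range((unsigned long)area->addr, pa, size, prot);
	/* on failure: free_vm_area(area) and return NULL */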

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/42e7e36ad32e0fdf76692426cc642799c9f689b8.1566309263.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/book3s/32/pgtable.h |  4 ++++
 arch/powerpc/include/asm/book3s/64/pgtable.h |  1 +
 arch/powerpc/include/asm/io.h                |  2 ++
 arch/powerpc/include/asm/nohash/32/pgtable.h |  4 ++++
 arch/powerpc/include/asm/nohash/64/pgtable.h |  1 +
 arch/powerpc/mm/ioremap.c                    | 20 ++++++++++++++++++++
 arch/powerpc/mm/ioremap_32.c                 | 15 ++++-----------
 arch/powerpc/mm/ioremap_64.c                 | 17 +++++++----------
 8 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index aa1bc5f8da90..331a29a501a1 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -165,6 +165,10 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
 #define IOREMAP_TOP	KVIRT_TOP
 #endif
 
+/* PPC32 shares vmalloc area with ioremap */
+#define IOREMAP_START	VMALLOC_START
+#define IOREMAP_END	VMALLOC_END
+
 /*
  * Just any arbitrary offset to the start of the vmalloc VM area: the
  * current 16MB value just means that there will be a 64MB "hole" after the
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 03c9a14dd902..b01624e5c467 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -316,6 +316,7 @@ extern unsigned long pci_io_base;
 #define  PHB_IO_BASE	(ISA_IO_END)
 #define  PHB_IO_END	(KERN_IO_START + FULL_IO_SIZE)
 #define IOREMAP_BASE	(PHB_IO_END)
+#define IOREMAP_START	(ioremap_bot)
 #define IOREMAP_END	(KERN_IO_END)
 
 /* Advertise special mapping type for AGP */
diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 8e00d95f9600..dc529ea0fffa 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -723,6 +723,8 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size);
 extern void iounmap(volatile void __iomem *addr);
 
 int ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot);
+void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
+			 pgprot_t prot, void *caller);
 
 extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size,
 				      pgprot_t prot, void *caller);
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 7ce2a7c9fade..3e1a4c1e40f0 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -93,6 +93,10 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
 #define IOREMAP_TOP	KVIRT_TOP
 #endif
 
+/* PPC32 shares vmalloc area with ioremap */
+#define IOREMAP_START	VMALLOC_START
+#define IOREMAP_END	VMALLOC_END
+
 /*
  * Just any arbitrary offset to the start of the vmalloc VM area: the
  * current 16MB value just means that there will be a 64MB "hole" after the
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index b9f66cf15c31..9a33b8bd842d 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -53,6 +53,7 @@
 #define  PHB_IO_BASE	(ISA_IO_END)
 #define  PHB_IO_END	(KERN_IO_START + FULL_IO_SIZE)
 #define IOREMAP_BASE	(PHB_IO_END)
+#define IOREMAP_START	(ioremap_bot)
 #define IOREMAP_END	(KERN_VIRT_START + KERN_VIRT_SIZE)
 
 
diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
index 50ee6544d0b7..57630325846c 100644
--- a/arch/powerpc/mm/ioremap.c
+++ b/arch/powerpc/mm/ioremap.c
@@ -80,3 +80,23 @@ int ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t
 
 	return 0;
 }
+
+void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
+			 pgprot_t prot, void *caller)
+{
+	struct vm_struct *area;
+	int ret;
+
+	area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, IOREMAP_END, caller);
+	if (area == NULL)
+		return NULL;
+
+	area->phys_addr = pa;
+	ret = ioremap_range((unsigned long)area->addr, pa, size, prot);
+	if (!ret)
+		return (void __iomem *)area->addr + offset;
+
+	free_vm_area(area);
+
+	return NULL;
+}
diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c
index 85b90a62e084..fcf343dbf2bf 100644
--- a/arch/powerpc/mm/ioremap_32.c
+++ b/arch/powerpc/mm/ioremap_32.c
@@ -18,7 +18,7 @@ void __iomem *
 __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller)
 {
 	unsigned long v;
-	phys_addr_t p;
+	phys_addr_t p, offset;
 	int err;
 
 	/*
@@ -28,6 +28,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call
 	 * (ioremap_bot records where we're up to).
 	 */
 	p = addr & PAGE_MASK;
+	offset = addr & ~PAGE_MASK;
 	size = PAGE_ALIGN(addr + size) - p;
 
 	/*
@@ -62,12 +63,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call
 		goto out;
 
 	if (slab_is_available()) {
-		struct vm_struct *area;
-		area = get_vm_area_caller(size, VM_IOREMAP, caller);
-		if (area == 0)
-			return NULL;
-		area->phys_addr = p;
-		v = (unsigned long)area->addr;
+		return do_ioremap(p, offset, size, prot, caller);
 	} else {
 		v = (ioremap_bot -= size);
 	}
@@ -77,11 +73,8 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call
 	 */
 
 	err = ioremap_range((unsigned long)v, p, size, prot);
-	if (err) {
-		if (slab_is_available())
-			vunmap((void *)v);
+	if (err)
 		return NULL;
-	}
 
 out:
 	return (void __iomem *)(v + ((unsigned long)addr & ~PAGE_MASK));
diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c
index d132ce1e538d..e37b68b7f0e8 100644
--- a/arch/powerpc/mm/ioremap_64.c
+++ b/arch/powerpc/mm/ioremap_64.c
@@ -46,9 +46,13 @@ EXPORT_SYMBOL(__iounmap_at);
 void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size,
 			       pgprot_t prot, void *caller)
 {
-	phys_addr_t paligned;
+	phys_addr_t paligned, offset;
 	void __iomem *ret;
 
+	/* We don't support the 4K PFN hack with ioremap */
+	if (pgprot_val(prot) & H_PAGE_4K_PFN)
+		return NULL;
+
 	/*
 	 * Choose an address to map it to. Once the vmalloc system is running,
 	 * we use it. Before that, we map using addresses going up from
@@ -56,21 +60,14 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size,
 	 * through ioremap_bot.
 	 */
 	paligned = addr & PAGE_MASK;
+	offset = addr & ~PAGE_MASK;
 	size = PAGE_ALIGN(addr + size) - paligned;
 
 	if (size == 0 || paligned == 0)
 		return NULL;
 
 	if (slab_is_available()) {
-		struct vm_struct *area;
-
-		area = __get_vm_area_caller(size, VM_IOREMAP, ioremap_bot,
-					    IOREMAP_END, caller);
-		if (area == NULL)
-			return NULL;
-
-		area->phys_addr = paligned;
-		ret = __ioremap_at(paligned, area->addr, size, prot);
+		return do_ioremap(paligned, offset, size, prot, caller);
 	} else {
 		ret = __ioremap_at(paligned, (void *)ioremap_bot, size, prot);
 		if (ret)

From 163918fc5741d755cf9d477ebfcb761f09b82982 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 20 Aug 2019 14:07:20 +0000
Subject: [PATCH 094/264] powerpc/mm: split out early ioremap path.

ioremap does things differently at several levels depending on
whether SLAB is available or not.

Separate out the early (pre-SLAB) path right from the start.
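
The resulting 32-bit shape (condensed from the ioremap_32.c hunk
below, not new code) makes the split explicit:

	if (slab_is_available())
		return do_ioremap(p, offset, size, prot, caller);

	/* early path: carve space going down from IOREMAP_TOP */
	err = early_ioremap_range(ioremap_bot - size, p, size, prot);
	if (err)
		return NULL;
	ioremap_bot -= size;

	return (void __iomem *)ioremap_bot + offset;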

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/3acd2dbe04b04f111475e7a59f2b6f2ab9b95ab6.1566309263.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/io.h |  3 ++-
 arch/powerpc/mm/ioremap.c     | 17 +++++++----------
 arch/powerpc/mm/ioremap_32.c  | 13 +++++--------
 arch/powerpc/mm/ioremap_64.c  | 30 +++++++++++++++++++++---------
 4 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index dc529ea0fffa..a63ec938636d 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -722,7 +722,8 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size);
 
 extern void iounmap(volatile void __iomem *addr);
 
-int ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot);
+int early_ioremap_range(unsigned long ea, phys_addr_t pa,
+			unsigned long size, pgprot_t prot);
 void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
 			 pgprot_t prot, void *caller);
 
diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
index 57630325846c..fc669643ce6a 100644
--- a/arch/powerpc/mm/ioremap.c
+++ b/arch/powerpc/mm/ioremap.c
@@ -59,18 +59,11 @@ void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long f
 }
 EXPORT_SYMBOL(ioremap_prot);
 
-int ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot)
+int early_ioremap_range(unsigned long ea, phys_addr_t pa,
+			unsigned long size, pgprot_t prot)
 {
 	unsigned long i;
 
-	if (slab_is_available()) {
-		int err = ioremap_page_range(ea, ea + size, pa, prot);
-
-		if (err)
-			unmap_kernel_range(ea, size);
-		return err;
-	}
-
 	for (i = 0; i < size; i += PAGE_SIZE) {
 		int err = map_kernel_page(ea + i, pa + i, prot);
 
@@ -86,16 +79,20 @@ void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
 {
 	struct vm_struct *area;
 	int ret;
+	unsigned long va;
 
 	area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, IOREMAP_END, caller);
 	if (area == NULL)
 		return NULL;
 
 	area->phys_addr = pa;
-	ret = ioremap_range((unsigned long)area->addr, pa, size, prot);
+	va = (unsigned long)area->addr;
+
+	ret = ioremap_page_range(va, va + size, pa, prot);
 	if (!ret)
 		return (void __iomem *)area->addr + offset;
 
+	unmap_kernel_range(va, size);
 	free_vm_area(area);
 
 	return NULL;
diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c
index fcf343dbf2bf..f36121f25243 100644
--- a/arch/powerpc/mm/ioremap_32.c
+++ b/arch/powerpc/mm/ioremap_32.c
@@ -60,24 +60,21 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call
 	 */
 	v = p_block_mapped(p);
 	if (v)
-		goto out;
+		return (void __iomem *)v + offset;
 
-	if (slab_is_available()) {
+	if (slab_is_available())
 		return do_ioremap(p, offset, size, prot, caller);
-	} else {
-		v = (ioremap_bot -= size);
-	}
 
 	/*
 	 * Should check if it is a candidate for a BAT mapping
 	 */
 
-	err = ioremap_range((unsigned long)v, p, size, prot);
+	err = early_ioremap_range(ioremap_bot - size, p, size, prot);
 	if (err)
 		return NULL;
+	ioremap_bot -= size;
 
-out:
-	return (void __iomem *)(v + ((unsigned long)addr & ~PAGE_MASK));
+	return (void __iomem *)ioremap_bot + offset;
 }
 
 void iounmap(volatile void __iomem *addr)
diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c
index e37b68b7f0e8..fd29e51700cd 100644
--- a/arch/powerpc/mm/ioremap_64.c
+++ b/arch/powerpc/mm/ioremap_64.c
@@ -9,6 +9,9 @@
  */
 void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot)
 {
+	int ret;
+	unsigned long va = (unsigned long)ea;
+
 	/* We don't support the 4K PFN hack with ioremap */
 	if (pgprot_val(prot) & H_PAGE_4K_PFN)
 		return NULL;
@@ -22,7 +25,15 @@ void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_
 	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
 	WARN_ON(size & ~PAGE_MASK);
 
-	if (ioremap_range((unsigned long)ea, pa, size, prot))
+	if (slab_is_available()) {
+		ret = ioremap_page_range(va, va + size, pa, prot);
+		if (ret)
+			unmap_kernel_range(va, size);
+	} else {
+		ret = early_ioremap_range(va, pa, size, prot);
+	}
+
+	if (ret)
 		return NULL;
 
 	return (void __iomem *)ea;
@@ -48,6 +59,7 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size,
 {
 	phys_addr_t paligned, offset;
 	void __iomem *ret;
+	int err;
 
 	/* We don't support the 4K PFN hack with ioremap */
 	if (pgprot_val(prot) & H_PAGE_4K_PFN)
@@ -66,16 +78,16 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size,
 	if (size == 0 || paligned == 0)
 		return NULL;
 
-	if (slab_is_available()) {
+	if (slab_is_available())
 		return do_ioremap(paligned, offset, size, prot, caller);
-	} else {
-		ret = __ioremap_at(paligned, (void *)ioremap_bot, size, prot);
-		if (ret)
-			ioremap_bot += size;
-	}
 
-	if (ret)
-		ret += addr & ~PAGE_MASK;
+	err = early_ioremap_range(ioremap_bot, paligned, size, prot);
+	if (err)
+		return NULL;
+
+	ret = (void __iomem *)ioremap_bot + offset;
+	ioremap_bot += size;
+
 	return ret;
 }
 

From c691b4b83b6a348f7b9d13c36916e73c2e1d85e4 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 20 Aug 2019 14:34:12 +0000
Subject: [PATCH 095/264] powerpc: rewrite LOAD_REG_IMMEDIATE() as an
 intelligent macro

Today LOAD_REG_IMMEDIATE() is a basic #define which loads all
parts of a value into a register, including the parts that are zero.

This means always 2 instructions on PPC32 and always 5 instructions
on PPC64. And those instructions cannot run in parallel, as they all
update the same register.

Ex: LOAD_REG_IMMEDIATE(r1,THREAD_SIZE) in head_64.S results in:

3c 20 00 00     lis     r1,0
60 21 00 00     ori     r1,r1,0
78 21 07 c6     rldicr  r1,r1,32,31
64 21 00 00     oris    r1,r1,0
60 21 40 00     ori     r1,r1,16384

Rewrite LOAD_REG_IMMEDIATE() as a GAS macro in order to skip
the parts that are zero.

Rename the existing LOAD_REG_IMMEDIATE() to LOAD_REG_IMMEDIATE_SYM()
and use that one for loading the value of symbols which are not known
at compile time.

Now LOAD_REG_IMMEDIATE(r1,THREAD_SIZE) in head_64.S results in:

38 20 40 00     li      r1,16384
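
As a further worked example (added here for illustration, not part of
the original change description), a 32-bit constant with both halves
set still collapses to the minimal two-instruction sequence.
LOAD_REG_IMMEDIATE(r3, 0x12345678) results in:

3c 60 12 34     lis     r3,4660
60 63 56 78     ori     r3,r3,22136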

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/d60ce8dd3a383c7adbfc322bf1d53d81724a6000.1566311636.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/ppc_asm.h   | 42 ++++++++++++++++++++++++----
 arch/powerpc/kernel/exceptions-64e.S | 10 +++----
 arch/powerpc/kernel/head_64.S        |  2 +-
 3 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index e0637730a8e7..20a00209c965 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -311,13 +311,43 @@ n:
 	addis	reg,reg,(name - 0b)@ha;		\
 	addi	reg,reg,(name - 0b)@l;
 
-#ifdef __powerpc64__
-#ifdef HAVE_AS_ATHIGH
+#if defined(__powerpc64__) && defined(HAVE_AS_ATHIGH)
 #define __AS_ATHIGH high
 #else
 #define __AS_ATHIGH h
 #endif
-#define LOAD_REG_IMMEDIATE(reg,expr)		\
+
+.macro __LOAD_REG_IMMEDIATE_32 r, x
+	.if (\x) >= 0x8000 || (\x) < -0x8000
+		lis \r, (\x)@__AS_ATHIGH
+		.if (\x) & 0xffff != 0
+			ori \r, \r, (\x)@l
+		.endif
+	.else
+		li \r, (\x)@l
+	.endif
+.endm
+
+.macro __LOAD_REG_IMMEDIATE r, x
+	.if (\x) >= 0x80000000 || (\x) < -0x80000000
+		__LOAD_REG_IMMEDIATE_32 \r, (\x) >> 32
+		sldi	\r, \r, 32
+		.if (\x) & 0xffff0000 != 0
+			oris \r, \r, (\x)@__AS_ATHIGH
+		.endif
+		.if (\x) & 0xffff != 0
+			ori \r, \r, (\x)@l
+		.endif
+	.else
+		__LOAD_REG_IMMEDIATE_32 \r, \x
+	.endif
+.endm
+
+#ifdef __powerpc64__
+
+#define LOAD_REG_IMMEDIATE(reg, expr) __LOAD_REG_IMMEDIATE reg, expr
+
+#define LOAD_REG_IMMEDIATE_SYM(reg,expr)	\
 	lis     reg,(expr)@highest;		\
 	ori     reg,reg,(expr)@higher;	\
 	rldicr  reg,reg,32,31;		\
@@ -335,11 +365,13 @@ n:
 
 #else /* 32-bit */
 
-#define LOAD_REG_IMMEDIATE(reg,expr)		\
+#define LOAD_REG_IMMEDIATE(reg, expr) __LOAD_REG_IMMEDIATE_32 reg, expr
+
+#define LOAD_REG_IMMEDIATE_SYM(reg,expr)		\
 	lis	reg,(expr)@ha;		\
 	addi	reg,reg,(expr)@l;
 
-#define LOAD_REG_ADDR(reg,name)		LOAD_REG_IMMEDIATE(reg, name)
+#define LOAD_REG_ADDR(reg,name)		LOAD_REG_IMMEDIATE_SYM(reg, name)
 
 #define LOAD_REG_ADDRBASE(reg, name)	lis	reg,name@ha
 #define ADDROFF(name)			name@l
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index 1cfb3da4a84a..898aae6da167 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -751,8 +751,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	ld	r14,interrupt_base_book3e@got(r15)
 	ld	r15,__end_interrupts@got(r15)
 #else
-	LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
-	LOAD_REG_IMMEDIATE(r15,__end_interrupts)
+	LOAD_REG_IMMEDIATE_SYM(r14,interrupt_base_book3e)
+	LOAD_REG_IMMEDIATE_SYM(r15,__end_interrupts)
 #endif
 	cmpld	cr0,r10,r14
 	cmpld	cr1,r10,r15
@@ -821,8 +821,8 @@ kernel_dbg_exc:
 	ld	r14,interrupt_base_book3e@got(r15)
 	ld	r15,__end_interrupts@got(r15)
 #else
-	LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
-	LOAD_REG_IMMEDIATE(r15,__end_interrupts)
+	LOAD_REG_IMMEDIATE_SYM(r14,interrupt_base_book3e)
+	LOAD_REG_IMMEDIATE_SYM(r15,__end_interrupts)
 #endif
 	cmpld	cr0,r10,r14
 	cmpld	cr1,r10,r15
@@ -1449,7 +1449,7 @@ a2_tlbinit_code_start:
 a2_tlbinit_after_linear_map:
 
 	/* Now we branch the new virtual address mapped by this entry */
-	LOAD_REG_IMMEDIATE(r3,1f)
+	LOAD_REG_IMMEDIATE_SYM(r3,1f)
 	mtctr	r3
 	bctr
 
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 91d297e696dd..1fd44761e997 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -635,7 +635,7 @@ __after_prom_start:
 	sub	r5,r5,r11
 #else
 	/* just copy interrupts */
-	LOAD_REG_IMMEDIATE(r5, FIXED_SYMBOL_ABS_ADDR(__end_interrupts))
+	LOAD_REG_IMMEDIATE_SYM(r5, FIXED_SYMBOL_ABS_ADDR(__end_interrupts))
 #endif
 	b	5f
 3:

From ba18025fb03306ccdf3557a1e7b8a5b39b474872 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 20 Aug 2019 14:34:13 +0000
Subject: [PATCH 096/264] powerpc/32: replace LOAD_MSR_KERNEL() by
 LOAD_REG_IMMEDIATE()

LOAD_MSR_KERNEL() and LOAD_REG_IMMEDIATE() do the same thing
in the same way. Drop LOAD_MSR_KERNEL().

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/8f04a6df0bc8949517fd8236d50c15008ccf9231.1566311636.git.christophe.leroy@c-s.fr
---
 arch/powerpc/kernel/entry_32.S | 18 +++++++++---------
 arch/powerpc/kernel/head_32.h  | 21 ++++-----------------
 2 files changed, 13 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 54fab22c9a43..972b05504a0a 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -230,7 +230,7 @@ transfer_to_handler_cont:
 	 */
 	lis	r12,reenable_mmu@h
 	ori	r12,r12,reenable_mmu@l
-	LOAD_MSR_KERNEL(r0, MSR_KERNEL)
+	LOAD_REG_IMMEDIATE(r0, MSR_KERNEL)
 	mtspr	SPRN_SRR0,r12
 	mtspr	SPRN_SRR1,r0
 	SYNC
@@ -304,7 +304,7 @@ stack_ovf:
 	addi	r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
 	lis	r9,StackOverflow@ha
 	addi	r9,r9,StackOverflow@l
-	LOAD_MSR_KERNEL(r10,MSR_KERNEL)
+	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL)
 #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
 	mtspr	SPRN_NRI, r0
 #endif
@@ -324,7 +324,7 @@ trace_syscall_entry_irq_off:
 	bl	trace_hardirqs_on
 
 	/* Now enable for real */
-	LOAD_MSR_KERNEL(r10, MSR_KERNEL | MSR_EE)
+	LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE)
 	mtmsr	r10
 
 	REST_GPR(0, r1)
@@ -394,7 +394,7 @@ ret_from_syscall:
 #endif
 	mr	r6,r3
 	/* disable interrupts so current_thread_info()->flags can't change */
-	LOAD_MSR_KERNEL(r10,MSR_KERNEL)	/* doesn't include MSR_EE */
+	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL)	/* doesn't include MSR_EE */
 	/* Note: We don't bother telling lockdep about it */
 	SYNC
 	MTMSRD(r10)
@@ -824,7 +824,7 @@ ret_from_except:
 	 * can't change between when we test it and when we return
 	 * from the interrupt. */
 	/* Note: We don't bother telling lockdep about it */
-	LOAD_MSR_KERNEL(r10,MSR_KERNEL)
+	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL)
 	SYNC			/* Some chip revs have problems here... */
 	MTMSRD(r10)		/* disable interrupts */
 
@@ -991,7 +991,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
 	 * can restart the exception exit path at the label
 	 * exc_exit_restart below.  -- paulus
 	 */
-	LOAD_MSR_KERNEL(r10,MSR_KERNEL & ~MSR_RI)
+	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL & ~MSR_RI)
 	SYNC
 	MTMSRD(r10)		/* clear the RI bit */
 	.globl exc_exit_restart
@@ -1066,7 +1066,7 @@ exc_exit_restart_end:
 	REST_NVGPRS(r1);						\
 	lwz	r3,_MSR(r1);						\
 	andi.	r3,r3,MSR_PR;						\
-	LOAD_MSR_KERNEL(r10,MSR_KERNEL);				\
+	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL);				\
 	bne	user_exc_return;					\
 	lwz	r0,GPR0(r1);						\
 	lwz	r2,GPR2(r1);						\
@@ -1236,7 +1236,7 @@ recheck:
 	 * neither. Those disable/enable cycles used to peek at
 	 * TI_FLAGS aren't advertised.
 	 */
-	LOAD_MSR_KERNEL(r10,MSR_KERNEL)
+	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL)
 	SYNC
 	MTMSRD(r10)		/* disable interrupts */
 	lwz	r9,TI_FLAGS(r2)
@@ -1329,7 +1329,7 @@ _GLOBAL(enter_rtas)
 	lwz	r4,RTASBASE(r4)
 	mfmsr	r9
 	stw	r9,8(r1)
-	LOAD_MSR_KERNEL(r0,MSR_KERNEL)
+	LOAD_REG_IMMEDIATE(r0,MSR_KERNEL)
 	SYNC			/* disable interrupts so SRR0/1 */
 	MTMSRD(r0)		/* don't get trashed */
 	li	r9,MSR_KERNEL & ~(MSR_IR|MSR_DR)
diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h
index 4a692553651f..8abc7783dbe5 100644
--- a/arch/powerpc/kernel/head_32.h
+++ b/arch/powerpc/kernel/head_32.h
@@ -4,19 +4,6 @@
 
 #include <asm/ptrace.h>	/* for STACK_FRAME_REGS_MARKER */
 
-/*
- * MSR_KERNEL is > 0x8000 on 4xx/Book-E since it include MSR_CE.
- */
-.macro __LOAD_MSR_KERNEL r, x
-.if \x >= 0x8000
-	lis \r, (\x)@h
-	ori \r, \r, (\x)@l
-.else
-	li \r, (\x)
-.endif
-.endm
-#define LOAD_MSR_KERNEL(r, x) __LOAD_MSR_KERNEL r, x
-
 /*
  * Exception entry code.  This code runs with address translation
  * turned off, i.e. using physical addresses.
@@ -92,7 +79,7 @@
 #ifdef CONFIG_40x
 	rlwinm	r9,r9,0,14,12		/* clear MSR_WE (necessary?) */
 #else
-	LOAD_MSR_KERNEL(r10, MSR_KERNEL & ~(MSR_IR|MSR_DR)) /* can take exceptions */
+	LOAD_REG_IMMEDIATE(r10, MSR_KERNEL & ~(MSR_IR|MSR_DR)) /* can take exceptions */
 	MTMSRD(r10)			/* (except for mach check in rtas) */
 #endif
 	lis	r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */
@@ -140,10 +127,10 @@
 	 * otherwise we might risk taking an interrupt before we tell lockdep
 	 * they are enabled.
 	 */
-	LOAD_MSR_KERNEL(r10, MSR_KERNEL)
+	LOAD_REG_IMMEDIATE(r10, MSR_KERNEL)
 	rlwimi	r10, r9, 0, MSR_EE
 #else
-	LOAD_MSR_KERNEL(r10, MSR_KERNEL | MSR_EE)
+	LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE)
 #endif
 #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
 	mtspr	SPRN_NRI, r0
@@ -187,7 +174,7 @@ label:
 #define EXC_XFER_TEMPLATE(hdlr, trap, msr, tfer, ret)		\
 	li	r10,trap;					\
 	stw	r10,_TRAP(r11);					\
-	LOAD_MSR_KERNEL(r10, msr);				\
+	LOAD_REG_IMMEDIATE(r10, msr);				\
 	bl	tfer;						\
 	.long	hdlr;						\
 	.long	ret

From d7fb5b18a540efaf05da2b980fc11d50ba775677 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 20 Aug 2019 14:34:14 +0000
Subject: [PATCH 097/264] powerpc/64: optimise LOAD_REG_IMMEDIATE_SYM()

Optimise LOAD_REG_IMMEDIATE_SYM() by using a temporary register to
parallelise operations.

This reduces the dependent path from 5 to 3 instructions.
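
Spelled out (this expansion just restates the new macro from the hunk
below, with reg = r3 and tmp = r5):

	lis	r5, sym@highest
	lis	r3, sym@__AS_ATHIGH
	ori	r5, r5, sym@higher
	ori	r3, r3, sym@l
	rldimi	r3, r5, 32, 0

The r3 and r5 halves are built independently, so the longest dependent
chain is lis/ori/rldimi, i.e. 3 instructions.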

Suggested-by: Segher Boessenkool <segher@kernel.crashing.org>
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/bad41ed02531bb0382420cbab50a0d7153b71767.1566311636.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/ppc_asm.h   | 12 ++++++------
 arch/powerpc/kernel/exceptions-64e.S | 22 +++++++++++++---------
 arch/powerpc/kernel/head_64.S        |  2 +-
 3 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index 20a00209c965..dd3b191bdcea 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -347,12 +347,12 @@ n:
 
 #define LOAD_REG_IMMEDIATE(reg, expr) __LOAD_REG_IMMEDIATE reg, expr
 
-#define LOAD_REG_IMMEDIATE_SYM(reg,expr)	\
-	lis     reg,(expr)@highest;		\
-	ori     reg,reg,(expr)@higher;	\
-	rldicr  reg,reg,32,31;		\
-	oris    reg,reg,(expr)@__AS_ATHIGH;	\
-	ori     reg,reg,(expr)@l;
+#define LOAD_REG_IMMEDIATE_SYM(reg, tmp, expr)	\
+	lis	tmp, (expr)@highest;		\
+	lis	reg, (expr)@__AS_ATHIGH;	\
+	ori	tmp, tmp, (expr)@higher;	\
+	ori	reg, reg, (expr)@l;		\
+	rldimi	reg, tmp, 32, 0
 
 #define LOAD_REG_ADDR(reg,name)			\
 	ld	reg,name@got(r2)
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index 898aae6da167..829950b96d29 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -750,12 +750,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	ld	r15,PACATOC(r13)
 	ld	r14,interrupt_base_book3e@got(r15)
 	ld	r15,__end_interrupts@got(r15)
-#else
-	LOAD_REG_IMMEDIATE_SYM(r14,interrupt_base_book3e)
-	LOAD_REG_IMMEDIATE_SYM(r15,__end_interrupts)
-#endif
 	cmpld	cr0,r10,r14
 	cmpld	cr1,r10,r15
+#else
+	LOAD_REG_IMMEDIATE_SYM(r14, r15, interrupt_base_book3e)
+	cmpld	cr0, r10, r14
+	LOAD_REG_IMMEDIATE_SYM(r14, r15, __end_interrupts)
+	cmpld	cr1, r10, r14
+#endif
 	blt+	cr0,1f
 	bge+	cr1,1f
 
@@ -820,12 +822,14 @@ kernel_dbg_exc:
 	ld	r15,PACATOC(r13)
 	ld	r14,interrupt_base_book3e@got(r15)
 	ld	r15,__end_interrupts@got(r15)
-#else
-	LOAD_REG_IMMEDIATE_SYM(r14,interrupt_base_book3e)
-	LOAD_REG_IMMEDIATE_SYM(r15,__end_interrupts)
-#endif
 	cmpld	cr0,r10,r14
 	cmpld	cr1,r10,r15
+#else
+	LOAD_REG_IMMEDIATE_SYM(r14, r15, interrupt_base_book3e)
+	cmpld	cr0, r10, r14
+	LOAD_REG_IMMEDIATE_SYM(r14, r15,__end_interrupts)
+	cmpld	cr1, r10, r14
+#endif
 	blt+	cr0,1f
 	bge+	cr1,1f
 
@@ -1449,7 +1453,7 @@ a2_tlbinit_code_start:
 a2_tlbinit_after_linear_map:
 
 	/* Now we branch the new virtual address mapped by this entry */
-	LOAD_REG_IMMEDIATE_SYM(r3,1f)
+	LOAD_REG_IMMEDIATE_SYM(r3, r5, 1f)
 	mtctr	r3
 	bctr
 
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 1fd44761e997..0f2d61af47cc 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -635,7 +635,7 @@ __after_prom_start:
 	sub	r5,r5,r11
 #else
 	/* just copy interrupts */
-	LOAD_REG_IMMEDIATE_SYM(r5, FIXED_SYMBOL_ABS_ADDR(__end_interrupts))
+	LOAD_REG_IMMEDIATE_SYM(r5, r11, FIXED_SYMBOL_ABS_ADDR(__end_interrupts))
 #endif
 	b	5f
 3:

From d57b78353a99f5d813248954dd7d0527a01751ac Mon Sep 17 00:00:00 2001
From: "Christopher M. Riedl" <cmr@informatik.wtf>
Date: Mon, 12 Aug 2019 22:13:12 -0500
Subject: [PATCH 098/264] powerpc/spinlocks: Refactor SHARED_PROCESSOR

Determining whether a processor is in shared processor mode is not a
compile-time constant, so don't hide the check behind a #define.
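
The replacement helper (restated from the hunk below) leans on
IS_ENABLED(), which evaluates to a constant 0 or 1, so the lppaca
access still compiles away entirely when CONFIG_PPC_SPLPAR is not set:

	static inline bool is_shared_processor(void)
	{
	#ifdef CONFIG_PPC_PSERIES
		return (IS_ENABLED(CONFIG_PPC_SPLPAR) &&
			lppaca_shared_proc(local_paca->lppaca_ptr));
	#else
		return false;
	#endif
	}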

Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190813031314.1828-2-cmr@informatik.wtf
---
 arch/powerpc/include/asm/spinlock.h | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index a47f827bc5f1..e9c60fbcc8fe 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -101,15 +101,27 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
 
 #if defined(CONFIG_PPC_SPLPAR)
 /* We only yield to the hypervisor if we are in shared processor mode */
-#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
 extern void __spin_yield(arch_spinlock_t *lock);
 extern void __rw_yield(arch_rwlock_t *lock);
 #else /* SPLPAR */
 #define __spin_yield(x)	barrier()
 #define __rw_yield(x)	barrier()
-#define SHARED_PROCESSOR	0
 #endif
 
+static inline bool is_shared_processor(void)
+{
+/*
+ * LPPACA is only available on Pseries so guard anything LPPACA related to
+ * allow other platforms (which include this common header) to compile.
+ */
+#ifdef CONFIG_PPC_PSERIES
+	return (IS_ENABLED(CONFIG_PPC_SPLPAR) &&
+		lppaca_shared_proc(local_paca->lppaca_ptr));
+#else
+	return false;
+#endif
+}
+
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
 	while (1) {
@@ -117,7 +129,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
 			break;
 		do {
 			HMT_low();
-			if (SHARED_PROCESSOR)
+			if (is_shared_processor())
 				__spin_yield(lock);
 		} while (unlikely(lock->slock != 0));
 		HMT_medium();
@@ -136,7 +148,7 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
 		local_irq_restore(flags);
 		do {
 			HMT_low();
-			if (SHARED_PROCESSOR)
+			if (is_shared_processor())
 				__spin_yield(lock);
 		} while (unlikely(lock->slock != 0));
 		HMT_medium();
@@ -226,7 +238,7 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
 			break;
 		do {
 			HMT_low();
-			if (SHARED_PROCESSOR)
+			if (is_shared_processor())
 				__rw_yield(rw);
 		} while (unlikely(rw->lock < 0));
 		HMT_medium();
@@ -240,7 +252,7 @@ static inline void arch_write_lock(arch_rwlock_t *rw)
 			break;
 		do {
 			HMT_low();
-			if (SHARED_PROCESSOR)
+			if (is_shared_processor())
 				__rw_yield(rw);
 		} while (unlikely(rw->lock != 0));
 		HMT_medium();

From 31391ff7ea1ef557a804475436501f33ff0ead95 Mon Sep 17 00:00:00 2001
From: "Christopher M. Riedl" <cmr@informatik.wtf>
Date: Mon, 12 Aug 2019 22:13:13 -0500
Subject: [PATCH 099/264] powerpc/spinlocks: Rename SPLPAR-only spinlocks

The __rw_yield and __spin_yield functions only pertain to SPLPAR mode.
Rename them to make this relationship obvious.

Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190813031314.1828-3-cmr@informatik.wtf
---
 arch/powerpc/include/asm/spinlock.h | 6 ++++--
 arch/powerpc/lib/locks.c            | 6 +++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index e9c60fbcc8fe..0d04d468f660 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -101,8 +101,10 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
 
 #if defined(CONFIG_PPC_SPLPAR)
 /* We only yield to the hypervisor if we are in shared processor mode */
-extern void __spin_yield(arch_spinlock_t *lock);
-extern void __rw_yield(arch_rwlock_t *lock);
+void splpar_spin_yield(arch_spinlock_t *lock);
+void splpar_rw_yield(arch_rwlock_t *lock);
+#define __spin_yield(x) splpar_spin_yield(x)
+#define __rw_yield(x) splpar_rw_yield(x)
 #else /* SPLPAR */
 #define __spin_yield(x)	barrier()
 #define __rw_yield(x)	barrier()
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index 6550b9e5ce5f..6440d5943c00 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -18,7 +18,7 @@
 #include <asm/hvcall.h>
 #include <asm/smp.h>
 
-void __spin_yield(arch_spinlock_t *lock)
+void splpar_spin_yield(arch_spinlock_t *lock)
 {
 	unsigned int lock_value, holder_cpu, yield_count;
 
@@ -36,14 +36,14 @@ void __spin_yield(arch_spinlock_t *lock)
 	plpar_hcall_norets(H_CONFER,
 		get_hard_smp_processor_id(holder_cpu), yield_count);
 }
-EXPORT_SYMBOL_GPL(__spin_yield);
+EXPORT_SYMBOL_GPL(splpar_spin_yield);
 
 /*
  * Waiting for a read lock or a write lock on a rwlock...
  * This turns out to be the same for read and write locks, since
  * we only know the holder if it is write-locked.
  */
-void __rw_yield(arch_rwlock_t *rw)
+void splpar_rw_yield(arch_rwlock_t *rw)
 {
 	int lock_value;
 	unsigned int holder_cpu, yield_count;

From 405efc5980f2590a9520dc66cfe295456b8c9818 Mon Sep 17 00:00:00 2001
From: "Christopher M. Riedl" <cmr@informatik.wtf>
Date: Mon, 12 Aug 2019 22:13:14 -0500
Subject: [PATCH 100/264] powerpc/spinlocks: Fix oops in __spin_yield() on bare
 metal

Booting w/ppc64le_defconfig + CONFIG_PREEMPT on bare metal results in
the oops below due to calling into __spin_yield() when not running in
an SPLPAR, which means lppaca pointers are NULL.

We fixed a similar case previously in commit a6201da34ff9 ("powerpc:
Fix oops due to bad access of lppaca on bare metal"), by adding SPLPAR
checks in lppaca_shared_proc(). However when PREEMPT is enabled we can
call __spin_yield() directly from arch_spin_relax().

To fix it add spin_yield() and rw_yield() which check that
shared-processor LPAR is enabled before calling the SPLPAR-only
implementation of each.
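
The fix is a simple guard pattern (a minimal sketch of the idea; the
real definitions are in the hunks below):

  static inline void spin_yield(arch_spinlock_t *lock)
  {
  	if (is_shared_processor())	/* shared-processor LPAR? */
  		splpar_spin_yield(lock);	/* confer to lock holder */
  	else
  		barrier();	/* bare metal: compiler barrier only */
  }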

  BUG: Kernel NULL pointer dereference at 0x00000100
  Faulting instruction address: 0xc000000000097f88
  Oops: Kernel access of bad area, sig: 7 [#1]
  LE PAGE_SIZE=64K MMU=Radix MMU=Hash PREEMPT SMP NR_CPUS=2048 NUMA PowerNV
  Modules linked in:
  CPU: 0 PID: 2 Comm: kthreadd Not tainted 5.2.0-rc6-00491-g249155c20f9b #28
  NIP:  c000000000097f88 LR: c000000000c07a88 CTR: c00000000015ca10
  REGS: c0000000727079f0 TRAP: 0300   Not tainted  (5.2.0-rc6-00491-g249155c20f9b)
  MSR:  9000000002009033 <SF,HV,VEC,EE,ME,IR,DR,RI,LE>  CR: 84000424  XER: 20040000
  CFAR: c000000000c07a84 DAR: 0000000000000100 DSISR: 00080000 IRQMASK: 1
  GPR00: c000000000c07a88 c000000072707c80 c000000001546300 c00000007be38a80
  GPR04: c0000000726f0c00 0000000000000002 c00000007279c980 0000000000000100
  GPR08: c000000001581b78 0000000080000001 0000000000000008 c00000007279c9b0
  GPR12: 0000000000000000 c000000001730000 c000000000142558 0000000000000000
  GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
  GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
  GPR24: c00000007be38a80 c000000000c002f4 0000000000000000 0000000000000000
  GPR28: c000000072221a00 c0000000726c2600 c00000007be38a80 c00000007be38a80
  NIP [c000000000097f88] __spin_yield+0x48/0xa0
  LR [c000000000c07a88] __raw_spin_lock+0xb8/0xc0
  Call Trace:
  [c000000072707c80] [c000000072221a00] 0xc000000072221a00 (unreliable)
  [c000000072707cb0] [c000000000bffb0c] __schedule+0xbc/0x850
  [c000000072707d70] [c000000000c002f4] schedule+0x54/0x130
  [c000000072707da0] [c0000000001427dc] kthreadd+0x28c/0x2b0
  [c000000072707e20] [c00000000000c1cc] ret_from_kernel_thread+0x5c/0x70
  Instruction dump:
  4d9e0020 552a043e 210a07ff 79080fe0 0b080000 3d020004 3908b878 794a1f24
  e8e80000 7ce7502a e8e70000 38e70100 <7ca03c2c> 70a70001 78a50020 4d820020
  ---[ end trace 474d6b2b8fc5cb7e ]---

Fixes: 499dcd41378e ("powerpc/64s: Allocate LPPACAs individually")
Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
[mpe: Reword change log a bit]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190813031314.1828-4-cmr@informatik.wtf
---
 arch/powerpc/include/asm/spinlock.h | 36 ++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 0d04d468f660..e9a960e28f3c 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -103,11 +103,9 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
 /* We only yield to the hypervisor if we are in shared processor mode */
 void splpar_spin_yield(arch_spinlock_t *lock);
 void splpar_rw_yield(arch_rwlock_t *lock);
-#define __spin_yield(x) splpar_spin_yield(x)
-#define __rw_yield(x) splpar_rw_yield(x)
 #else /* SPLPAR */
-#define __spin_yield(x)	barrier()
-#define __rw_yield(x)	barrier()
+static inline void splpar_spin_yield(arch_spinlock_t *lock) {};
+static inline void splpar_rw_yield(arch_rwlock_t *lock) {};
 #endif
 
 static inline bool is_shared_processor(void)
@@ -124,6 +122,22 @@ static inline bool is_shared_processor(void)
 #endif
 }
 
+static inline void spin_yield(arch_spinlock_t *lock)
+{
+	if (is_shared_processor())
+		splpar_spin_yield(lock);
+	else
+		barrier();
+}
+
+static inline void rw_yield(arch_rwlock_t *lock)
+{
+	if (is_shared_processor())
+		splpar_rw_yield(lock);
+	else
+		barrier();
+}
+
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
 	while (1) {
@@ -132,7 +146,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
 		do {
 			HMT_low();
 			if (is_shared_processor())
-				__spin_yield(lock);
+				splpar_spin_yield(lock);
 		} while (unlikely(lock->slock != 0));
 		HMT_medium();
 	}
@@ -151,7 +165,7 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
 		do {
 			HMT_low();
 			if (is_shared_processor())
-				__spin_yield(lock);
+				splpar_spin_yield(lock);
 		} while (unlikely(lock->slock != 0));
 		HMT_medium();
 		local_irq_restore(flags_dis);
@@ -241,7 +255,7 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
 		do {
 			HMT_low();
 			if (is_shared_processor())
-				__rw_yield(rw);
+				splpar_rw_yield(rw);
 		} while (unlikely(rw->lock < 0));
 		HMT_medium();
 	}
@@ -255,7 +269,7 @@ static inline void arch_write_lock(arch_rwlock_t *rw)
 		do {
 			HMT_low();
 			if (is_shared_processor())
-				__rw_yield(rw);
+				splpar_rw_yield(rw);
 		} while (unlikely(rw->lock != 0));
 		HMT_medium();
 	}
@@ -295,9 +309,9 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
 	rw->lock = 0;
 }
 
-#define arch_spin_relax(lock)	__spin_yield(lock)
-#define arch_read_relax(lock)	__rw_yield(lock)
-#define arch_write_relax(lock)	__rw_yield(lock)
+#define arch_spin_relax(lock)	spin_yield(lock)
+#define arch_read_relax(lock)	rw_yield(lock)
+#define arch_write_relax(lock)	rw_yield(lock)
 
 /* See include/linux/spinlock.h */
 #define smp_mb__after_spinlock()   smp_mb()

From 63ce271b5e377deaddace4bac6dafb6e79d2bee4 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Mon, 26 Aug 2019 11:10:23 +0000
Subject: [PATCH 101/264] powerpc/prom: convert PROM_BUG() to standard trap

Prior to commit 1bd98d7fbaf5 ("ppc64: Update BUG handling based on
ppc32"), the BUG() family used BUG_ILLEGAL_INSTR, an invalid
instruction opcode, to trap into the program check exception.

That commit converted them to using standard trap instructions, but
prom/prom_init and their PROM_BUG() macro were left over. head_64.S
and exceptions-64s.S were left aside as well.

Convert them to using the standard BUG infrastructure.
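
For reference, the standard pattern is an unconditional trap followed
by a bug-table entry pointing back at it, which lets the program check
handler report the file and line (as in the hunks below):

  0:	trap
  	EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0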

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/cdaf4bbbb64c288a077845846f04b12683f8875a.1566817807.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/bug.h       | 8 --------
 arch/powerpc/kernel/exceptions-64s.S | 3 ++-
 arch/powerpc/kernel/head_64.S        | 6 ++++--
 arch/powerpc/kernel/prom_init.c      | 2 +-
 4 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index fed7e6241349..f47e6ff6554d 100644
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@@ -5,14 +5,6 @@
 
 #include <asm/asm-compat.h>
 
-/*
- * Define an illegal instr to trap on the bug.
- * We don't use 0 because that marks the end of a function
- * in the ELF ABI.  That's "Boo Boo" in case you wonder...
- */
-#define BUG_OPCODE .long 0x00b00b00  /* For asm */
-#define BUG_ILLEGAL_INSTR "0x00b00b00" /* For BUG macro */
-
 #ifdef CONFIG_BUG
 
 #ifdef __ASSEMBLY__
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 6ba3cc2ef8ab..dded4672579d 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1467,7 +1467,8 @@ EXC_COMMON_BEGIN(fp_unavailable_common)
 	RECONCILE_IRQ_STATE(r10, r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	kernel_fp_unavailable_exception
-	BUG_OPCODE
+0:	trap
+	EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0
 1:
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 0f2d61af47cc..ad79fddb974d 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -182,7 +182,8 @@ __secondary_hold:
 	isync
 	bctr
 #else
-	BUG_OPCODE
+0:	trap
+	EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0
 #endif
 CLOSE_FIXED_SECTION(first_256B)
 
@@ -998,7 +999,8 @@ start_here_common:
 	bl	start_kernel
 
 	/* Not reached */
-	BUG_OPCODE
+0:	trap
+	EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0
 
 /*
  * We put a few things here that have to be page-aligned.
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 514707ef6779..f2b63b4e1943 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -94,7 +94,7 @@ static int of_workarounds __prombss;
 #define PROM_BUG() do {						\
         prom_printf("kernel BUG at %s line 0x%x!\n",		\
 		    __FILE__, __LINE__);			\
-        __asm__ __volatile__(".long " BUG_ILLEGAL_INSTR);	\
+	__builtin_trap();					\
 } while (0)
 
 #ifdef DEBUG_PROM

From a04565741284f695db4cfe5a3e61940d2259cb8f Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Wed, 21 Aug 2019 20:00:34 +0000
Subject: [PATCH 102/264] powerpc/8xx: drop unused self-modifying code
 alternative to FixupDAR.

The code which fixes up the DAR on TLB errors for dcbX instructions
has a self-modifying code alternative that has never been used.

Drop it.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Reviewed-by: Joakim Tjernlund <joakim.tjernlund@infinera.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/b095e12c82fcba1ac4c09fc3b85d969f36614746.1566417610.git.christophe.leroy@c-s.fr
---
 arch/powerpc/kernel/head_8xx.S | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 5ab9178c2347..5db461db63cc 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -574,8 +574,6 @@ InstructionBreakpoint:
  * by decoding the registers used by the dcbx instruction and adding them.
  * DAR is set to the calculated address.
  */
- /* define if you don't want to use self modifying code */
-#define NO_SELF_MODIFYING_CODE
 FixupDAR:/* Entry point for dcbx workaround. */
 	mtspr	SPRN_M_TW, r10
 	/* fetch instruction from memory. */
@@ -639,27 +637,6 @@ FixupDAR:/* Entry point for dcbx workaround. */
 	rlwinm	r10, r10,0,7,5	/* Clear store bit for buggy dcbst insn */
 	mtspr	SPRN_DSISR, r10
 142:	/* continue, it was a dcbx, dcbi instruction. */
-#ifndef NO_SELF_MODIFYING_CODE
-	andis.	r10,r11,0x1f	/* test if reg RA is r0 */
-	li	r10,modified_instr@l
-	dcbtst	r0,r10		/* touch for store */
-	rlwinm	r11,r11,0,0,20	/* Zero lower 10 bits */
-	oris	r11,r11,640	/* Transform instr. to a "add r10,RA,RB" */
-	ori	r11,r11,532
-	stw	r11,0(r10)	/* store add/and instruction */
-	dcbf	0,r10		/* flush new instr. to memory. */
-	icbi	0,r10		/* invalidate instr. cache line */
-	mfspr	r11, SPRN_SPRG_SCRATCH1	/* restore r11 */
-	mfspr	r10, SPRN_SPRG_SCRATCH0	/* restore r10 */
-	isync			/* Wait until new instr is loaded from memory */
-modified_instr:
-	.space	4		/* this is where the add instr. is stored */
-	bne+	143f
-	subf	r10,r0,r10	/* r10=r10-r0, only if reg RA is r0 */
-143:	mtdar	r10		/* store faulting EA in DAR */
-	mfspr	r10,SPRN_M_TW
-	b	DARFixed	/* Go back to normal TLB handling */
-#else
 	mfctr	r10
 	mtdar	r10			/* save ctr reg in DAR */
 	rlwinm	r10, r11, 24, 24, 28	/* offset into jump table for reg RB */
@@ -723,7 +700,6 @@ modified_instr:
 	add	r10, r10, r11	/* add it */
 	mfctr	r11		/* restore r11 */
 	b	151b
-#endif
 
 /*
  * This is where the main kernel code starts.

From 3bbd2343734e44de92238eea1a5cd3ad32a6baf0 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Wed, 21 Aug 2019 10:20:51 +0000
Subject: [PATCH 103/264] powerpc/8xx: set STACK_END_MAGIC earlier on the
 init_stack

Today, STACK_END_MAGIC is set on init_stack in start_kernel().

To avoid a false 'Thread overran stack, or stack corrupted' message
on early Oopses, set up STACK_END_MAGIC as soon as possible.
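
The message in question is emitted when a check along these lines
fires (sketch of the generic helper from <linux/sched/task_stack.h>):

  #define task_stack_end_corrupted(task) \
  	(*(end_of_stack(task)) != STACK_END_MAGIC)

If an Oops is taken before start_kernel() has written the magic word
to the bottom of init_stack, the check fires spuriously; storing it
from the early assembly avoids that.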

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/54f67bb7ac486c1350f2fa8905cd279f94b9dfb1.1566382841.git.christophe.leroy@c-s.fr
---
 arch/powerpc/kernel/head_8xx.S | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 5db461db63cc..19f583e18402 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -15,6 +15,7 @@
  */
 
 #include <linux/init.h>
+#include <linux/magic.h>
 #include <asm/processor.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
@@ -717,6 +718,9 @@ start_here:
 	/* stack */
 	lis	r1,init_thread_union@ha
 	addi	r1,r1,init_thread_union@l
+	lis	r0, STACK_END_MAGIC@h
+	ori	r0, r0, STACK_END_MAGIC@l
+	stw	r0, 0(r1)
 	li	r0,0
 	stwu	r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1)
 

From f7a0bf7d904e902e13024986b7b357181ee30849 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Mon, 26 Aug 2019 15:52:13 +0000
Subject: [PATCH 104/264] powerpc/32s: add an option to exclusively select
 powerpc 601

The powerpc 601 is a rather old powerpc core which has some important
limitations compared to other book3s/32 powerpcs:
- No Timebase.
- Common BATs for instruction and data.
- No execution protection in segment registers.
- No RI bit in MSR
- ...

It is becoming difficult and cumbersome to maintain kernels that are
compatible with both the 601 and the other 6xx cores.

Create a compile-time option to exclusively select either the powerpc
601 or the other 6xx cores.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/d644eaf7dff8cc149260066802af230bdf34fded.1566834712.git.christophe.leroy@c-s.fr
---
 arch/powerpc/platforms/Kconfig.cputype | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 56a7c814160d..68c5cc075bfc 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -6,6 +6,9 @@ config PPC64
 	  This option selects whether a 32-bit or a 64-bit kernel
 	  will be built.
 
+config PPC_BOOK3S_32
+	bool
+
 menu "Processor support"
 choice
 	prompt "Processor Type"
@@ -21,13 +24,20 @@ choice
 
 	  If unsure, select 52xx/6xx/7xx/74xx/82xx/83xx/86xx.
 
-config PPC_BOOK3S_32
-	bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx"
+config PPC_BOOK3S_6xx
+	bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx except 601"
+	select PPC_BOOK3S_32
 	select PPC_FPU
 	select PPC_HAVE_PMU_SUPPORT
 	select PPC_HAVE_KUEP
 	select PPC_HAVE_KUAP
 
+config PPC_BOOK3S_601
+	bool "PowerPC 601"
+	select PPC_BOOK3S_32
+	select PPC_FPU
+	select PPC_HAVE_KUAP
+
 config PPC_85xx
 	bool "Freescale 85xx"
 	select E500

From 12c3f1fd87bf4e55f06d079a45d6f15e2f6f9750 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Mon, 26 Aug 2019 15:52:14 +0000
Subject: [PATCH 105/264] powerpc/32s: get rid of CPU_FTR_601 feature

Now that the 601 is mutually exclusive with the other 6xx cores,
CPU_FTR_601 and the associated feature fixups are useless.

Drop the feature and use #ifdefs instead.
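
The typical conversion looks like this (sketch, mirroring the mmu.c
hunks below):

  -	if (cpu_has_feature(CPU_FTR_601))
  +	if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))

which lets the compiler discard the dead branch at build time instead
of patching it out with runtime feature fixups.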

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/ecdb7194a17dbfa01865df6a82979533adc2c70b.1566834712.git.christophe.leroy@c-s.fr
---
 arch/powerpc/configs/pmac32_defconfig |  1 -
 arch/powerpc/include/asm/cputable.h   | 14 ++++++-----
 arch/powerpc/include/asm/ppc_asm.h    | 28 ++++++----------------
 arch/powerpc/include/asm/ptrace.h     |  6 ++++-
 arch/powerpc/include/asm/timex.h      | 34 +++------------------------
 arch/powerpc/kernel/cputable.c        |  6 +++--
 arch/powerpc/kernel/entry_32.S        | 22 ++++++++++++-----
 arch/powerpc/mm/book3s32/mmu.c        | 15 +++++-------
 arch/powerpc/mm/ptdump/bats.c         |  2 +-
 arch/powerpc/platforms/Kconfig        |  3 ++-
 10 files changed, 52 insertions(+), 79 deletions(-)

diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig
index 7e6654848531..4e6e95f92646 100644
--- a/arch/powerpc/configs/pmac32_defconfig
+++ b/arch/powerpc/configs/pmac32_defconfig
@@ -20,7 +20,6 @@ CONFIG_CPU_FREQ=y
 CONFIG_CPU_FREQ_GOV_POWERSAVE=y
 CONFIG_CPU_FREQ_GOV_USERSPACE=y
 CONFIG_CPU_FREQ_PMAC=y
-CONFIG_PPC601_SYNC_FIX=y
 CONFIG_GEN_RTC=y
 CONFIG_HIGHMEM=y
 CONFIG_BINFMT_MISC=m
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index d05f0c28e515..0aad095896d6 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -145,7 +145,6 @@ static inline void cpu_feature_keys_init(void) { }
 
 /* Definitions for features that only exist on 32-bit chips */
 #ifdef CONFIG_PPC32
-#define CPU_FTR_601			ASM_CONST(0x00001000)
 #define CPU_FTR_L2CR			ASM_CONST(0x00002000)
 #define CPU_FTR_SPEC7450		ASM_CONST(0x00004000)
 #define CPU_FTR_TAU			ASM_CONST(0x00008000)
@@ -167,7 +166,6 @@ static inline void cpu_feature_keys_init(void) { }
 
 #else	/* CONFIG_PPC32 */
 /* Define these to 0 for the sake of tests in common code */
-#define CPU_FTR_601			(0)
 #define CPU_FTR_PPC_LE			(0)
 #endif
 
@@ -294,7 +292,7 @@ static inline void cpu_feature_keys_init(void) { }
 #define CPU_FTR_MAYBE_CAN_NAP	0
 #endif
 
-#define CPU_FTRS_PPC601	(CPU_FTR_COMMON | CPU_FTR_601 | \
+#define CPU_FTRS_PPC601	(CPU_FTR_COMMON | \
 	CPU_FTR_COHERENT_ICACHE | CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_USE_RTC)
 #define CPU_FTRS_603	(CPU_FTR_COMMON | CPU_FTR_MAYBE_CAN_DOZE | \
 	    CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE | CPU_FTR_NOEXECUTE)
@@ -498,7 +496,9 @@ static inline void cpu_feature_keys_init(void) { }
 #else
 enum {
 	CPU_FTRS_POSSIBLE =
-#ifdef CONFIG_PPC_BOOK3S_32
+#ifdef CONFIG_PPC_BOOK3S_601
+	    CPU_FTRS_PPC601 |
+#elif defined(CONFIG_PPC_BOOK3S_32)
 	    CPU_FTRS_PPC601 | CPU_FTRS_603 | CPU_FTRS_604 | CPU_FTRS_740_NOTAU |
 	    CPU_FTRS_740 | CPU_FTRS_750 | CPU_FTRS_750FX1 |
 	    CPU_FTRS_750FX2 | CPU_FTRS_750FX | CPU_FTRS_750GX |
@@ -574,8 +574,10 @@ enum {
 #else
 enum {
 	CPU_FTRS_ALWAYS =
-#ifdef CONFIG_PPC_BOOK3S_32
-	    CPU_FTRS_PPC601 & CPU_FTRS_603 & CPU_FTRS_604 & CPU_FTRS_740_NOTAU &
+#ifdef CONFIG_PPC_BOOK3S_601
+	    CPU_FTRS_PPC601 &
+#elif defined(CONFIG_PPC_BOOK3S_32)
+	    CPU_FTRS_603 & CPU_FTRS_604 & CPU_FTRS_740_NOTAU &
 	    CPU_FTRS_740 & CPU_FTRS_750 & CPU_FTRS_750FX1 &
 	    CPU_FTRS_750FX2 & CPU_FTRS_750FX & CPU_FTRS_750GX &
 	    CPU_FTRS_7400_NOTAU & CPU_FTRS_7400 & CPU_FTRS_7450_20 &
diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index dd3b191bdcea..6b03dff61a05 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -383,19 +383,9 @@ n:
 
 /* various errata or part fixups */
 #ifdef CONFIG_PPC601_SYNC_FIX
-#define SYNC				\
-BEGIN_FTR_SECTION			\
-	sync;				\
-	isync;				\
-END_FTR_SECTION_IFSET(CPU_FTR_601)
-#define SYNC_601			\
-BEGIN_FTR_SECTION			\
-	sync;				\
-END_FTR_SECTION_IFSET(CPU_FTR_601)
-#define ISYNC_601			\
-BEGIN_FTR_SECTION			\
-	isync;				\
-END_FTR_SECTION_IFSET(CPU_FTR_601)
+#define SYNC		sync; isync
+#define SYNC_601	sync
+#define ISYNC_601	isync
 #else
 #define	SYNC
 #define SYNC_601
@@ -421,15 +411,11 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96)
 #define MFTBU(dest)			mfspr dest, SPRN_TBRU
 #endif
 
-#ifndef CONFIG_SMP
-#define TLBSYNC
-#else /* CONFIG_SMP */
 /* tlbsync is not implemented on 601 */
-#define TLBSYNC				\
-BEGIN_FTR_SECTION			\
-	tlbsync;			\
-	sync;				\
-END_FTR_SECTION_IFCLR(CPU_FTR_601)
+#if !defined(CONFIG_SMP) || defined(CONFIG_PPC_BOOK3S_601)
+#define TLBSYNC
+#else
+#define TLBSYNC		tlbsync; sync
 #endif
 
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h
index feee1b21bbd5..ee3ada66deb5 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -203,7 +203,11 @@ do {									      \
 #endif /* __powerpc64__ */
 
 #define arch_has_single_step()	(1)
-#define arch_has_block_step()	(!cpu_has_feature(CPU_FTR_601))
+#ifndef CONFIG_PPC_BOOK3S_601
+#define arch_has_block_step()	(true)
+#else
+#define arch_has_block_step()	(false)
+#endif
 #define ARCH_HAS_USER_SINGLE_STEP_REPORT
 
 /*
diff --git a/arch/powerpc/include/asm/timex.h b/arch/powerpc/include/asm/timex.h
index 926b9f91a3ef..d2d2c4bd8435 100644
--- a/arch/powerpc/include/asm/timex.h
+++ b/arch/powerpc/include/asm/timex.h
@@ -17,38 +17,10 @@ typedef unsigned long cycles_t;
 
 static inline cycles_t get_cycles(void)
 {
-#ifdef __powerpc64__
+	if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
+		return 0;
+
 	return mftb();
-#else
-	cycles_t ret;
-
-	/*
-	 * For the "cycle" counter we use the timebase lower half.
-	 * Currently only used on SMP.
-	 */
-
-	ret = 0;
-
-	__asm__ __volatile__(
-#ifdef CONFIG_PPC_8xx
-		"97:	mftb %0\n"
-#else
-		"97:	mfspr %0, %2\n"
-#endif
-		"99:\n"
-		".section __ftr_fixup,\"a\"\n"
-		".align 2\n"
-		"98:\n"
-		"	.long %1\n"
-		"	.long 0\n"
-		"	.long 97b-98b\n"
-		"	.long 99b-98b\n"
-		"	.long 0\n"
-		"	.long 0\n"
-		".previous"
-		: "=r" (ret) : "i" (CPU_FTR_601), "i" (SPRN_TBRL));
-	return ret;
-#endif
 }
 
 #endif	/* __KERNEL__ */
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index bfe5f4a2886b..e745abc5457a 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -569,7 +569,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 #endif	/* CONFIG_PPC_BOOK3S_64 */
 
 #ifdef CONFIG_PPC32
-#ifdef CONFIG_PPC_BOOK3S_32
+#ifdef CONFIG_PPC_BOOK3S_601
 	{	/* 601 */
 		.pvr_mask		= 0xffff0000,
 		.pvr_value		= 0x00010000,
@@ -583,6 +583,8 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.machine_check		= machine_check_generic,
 		.platform		= "ppc601",
 	},
+#endif /* CONFIG_PPC_BOOK3S_601 */
+#ifdef CONFIG_PPC_BOOK3S_6xx
 	{	/* 603 */
 		.pvr_mask		= 0xffff0000,
 		.pvr_value		= 0x00030000,
@@ -1212,7 +1214,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.machine_check		= machine_check_generic,
 		.platform		= "ppc603",
 	},
-#endif /* CONFIG_PPC_BOOK3S_32 */
+#endif /* CONFIG_PPC_BOOK3S_6xx */
 #ifdef CONFIG_PPC_8xx
 	{	/* 8xx */
 		.pvr_mask		= 0xffff0000,
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 972b05504a0a..d60908ea37fb 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -777,11 +777,19 @@ fast_exception_return:
 1:	lis	r3,exc_exit_restart_end@ha
 	addi	r3,r3,exc_exit_restart_end@l
 	cmplw	r12,r3
+#ifdef CONFIG_PPC_BOOK3S_601
+	bge	2b
+#else
 	bge	3f
+#endif
 	lis	r4,exc_exit_restart@ha
 	addi	r4,r4,exc_exit_restart@l
 	cmplw	r12,r4
+#ifdef CONFIG_PPC_BOOK3S_601
+	blt	2b
+#else
 	blt	3f
+#endif
 	lis	r3,fee_restarts@ha
 	tophys(r3,r3)
 	lwz	r5,fee_restarts@l(r3)
@@ -800,9 +808,6 @@ fee_restarts:
 /* aargh, we don't know which trap this is */
 /* but the 601 doesn't implement the RI bit, so assume it's OK */
 3:
-BEGIN_FTR_SECTION
-	b	2b
-END_FTR_SECTION_IFSET(CPU_FTR_601)
 	li	r10,-1
 	stw	r10,_TRAP(r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
@@ -1270,11 +1275,19 @@ nonrecoverable:
 	lis	r10,exc_exit_restart_end@ha
 	addi	r10,r10,exc_exit_restart_end@l
 	cmplw	r12,r10
+#ifdef CONFIG_PPC_BOOK3S_601
+	bgelr
+#else
 	bge	3f
+#endif
 	lis	r11,exc_exit_restart@ha
 	addi	r11,r11,exc_exit_restart@l
 	cmplw	r12,r11
+#ifdef CONFIG_PPC_BOOK3S_601
+	bltlr
+#else
 	blt	3f
+#endif
 	lis	r10,ee_restarts@ha
 	lwz	r12,ee_restarts@l(r10)
 	addi	r12,r12,1
@@ -1283,9 +1296,6 @@ nonrecoverable:
 	blr
 3:	/* OK, we can't recover, kill this process */
 	/* but the 601 doesn't implement the RI bit, so assume it's OK */
-BEGIN_FTR_SECTION
-	blr
-END_FTR_SECTION_IFSET(CPU_FTR_601)
 	lwz	r3,_TRAP(r1)
 	andi.	r0,r3,1
 	beq	5f
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 50a1991d257f..84d5fab94f8f 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -74,7 +74,7 @@ static int find_free_bat(void)
 {
 	int b;
 
-	if (cpu_has_feature(CPU_FTR_601)) {
+	if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) {
 		for (b = 0; b < 4; b++) {
 			struct ppc_bat *bat = BATS[b];
 
@@ -106,7 +106,7 @@ static int find_free_bat(void)
  */
 static unsigned int block_size(unsigned long base, unsigned long top)
 {
-	unsigned int max_size = (cpu_has_feature(CPU_FTR_601) ? 8 : 256) << 20;
+	unsigned int max_size = IS_ENABLED(CONFIG_PPC_BOOK3S_601) ? SZ_8M : SZ_256M;
 	unsigned int base_shift = (ffs(base) - 1) & 31;
 	unsigned int block_shift = (fls(top - base) - 1) & 31;
 
@@ -189,7 +189,7 @@ void mmu_mark_initmem_nx(void)
 	unsigned long top = (unsigned long)_etext - PAGE_OFFSET;
 	unsigned long size;
 
-	if (cpu_has_feature(CPU_FTR_601))
+	if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
 		return;
 
 	for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) {
@@ -227,7 +227,7 @@ void mmu_mark_rodata_ro(void)
 	int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
 	int i;
 
-	if (cpu_has_feature(CPU_FTR_601))
+	if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
 		return;
 
 	for (i = 0; i < nb; i++) {
@@ -259,7 +259,7 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys,
 		flags &= ~_PAGE_COHERENT;
 
 	bl = (size >> 17) - 1;
-	if (PVR_VER(mfspr(SPRN_PVR)) != 1) {
+	if (!IS_ENABLED(CONFIG_PPC_BOOK3S_601)) {
 		/* 603, 604, etc. */
 		/* Do DBAT first */
 		wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
@@ -441,7 +441,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 	BUG_ON(first_memblock_base != 0);
 
 	/* 601 can only access 16MB at the moment */
-	if (PVR_VER(mfspr(SPRN_PVR)) == 1)
+	if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
 		memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000));
 	else /* Anything else has 256M mapped */
 		memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000));
@@ -459,9 +459,6 @@ void __init setup_kuep(bool disabled)
 {
 	pr_info("Activating Kernel Userspace Execution Prevention\n");
 
-	if (cpu_has_feature(CPU_FTR_601))
-		pr_warn("KUEP is not working on powerpc 601 (No NX bit in Seg Regs)\n");
-
 	if (disabled)
 		pr_warn("KUEP cannot be disabled yet on 6xx when compiled in\n");
 }
diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c
index a0d23e96e841..4154feac1da3 100644
--- a/arch/powerpc/mm/ptdump/bats.c
+++ b/arch/powerpc/mm/ptdump/bats.c
@@ -149,7 +149,7 @@ static int bats_show_603(struct seq_file *m, void *v)
 
 static int bats_open(struct inode *inode, struct file *file)
 {
-	if (cpu_has_feature(CPU_FTR_601))
+	if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
 		return single_open(file, bats_show_601, NULL);
 
 	return single_open(file, bats_show_603, NULL);
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index f3fb79fccc72..d82e3664ffdf 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -197,7 +197,8 @@ endmenu
 
 config PPC601_SYNC_FIX
 	bool "Workarounds for PPC601 bugs"
-	depends on PPC_BOOK3S_32 && PPC_PMAC
+	depends on PPC_BOOK3S_601 && PPC_PMAC
+	default y
 	help
 	  Some versions of the PPC601 (the first PowerPC chip) have bugs which
 	  mean that extra synchronization instructions are required near

From 88fb309409ab454b497a6abb0f931ce3b6d9969c Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Mon, 26 Aug 2019 15:52:16 +0000
Subject: [PATCH 106/264] powerpc/32s: drop CPU_FTR_USE_RTC feature

The CPU_FTR_USE_RTC feature only applies to the powerpc 601.

Drop the feature and replace it with tests on CONFIG_PPC_BOOK3S_601.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/170411e2360861f4a95c21faad43519a08bc4040.1566834712.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/cputable.h     |  3 +--
 arch/powerpc/include/asm/time.h         |  6 +-----
 arch/powerpc/kernel/vdso.c              | 22 ----------------------
 arch/powerpc/kernel/vdso32/datapage.S   |  2 ++
 arch/powerpc/kernel/vdso32/vdso32.lds.S |  4 +++-
 5 files changed, 7 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 0aad095896d6..fac1b81bbbd9 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -149,7 +149,6 @@ static inline void cpu_feature_keys_init(void) { }
 #define CPU_FTR_SPEC7450		ASM_CONST(0x00004000)
 #define CPU_FTR_TAU			ASM_CONST(0x00008000)
 #define CPU_FTR_CAN_DOZE		ASM_CONST(0x00010000)
-#define CPU_FTR_USE_RTC			ASM_CONST(0x00020000)
 #define CPU_FTR_L3CR			ASM_CONST(0x00040000)
 #define CPU_FTR_L3_DISABLE_NAP		ASM_CONST(0x00080000)
 #define CPU_FTR_NAP_DISABLE_L2_PR	ASM_CONST(0x00100000)
@@ -293,7 +292,7 @@ static inline void cpu_feature_keys_init(void) { }
 #endif
 
 #define CPU_FTRS_PPC601	(CPU_FTR_COMMON | \
-	CPU_FTR_COHERENT_ICACHE | CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_USE_RTC)
+	CPU_FTR_COHERENT_ICACHE | CPU_FTR_UNIFIED_ID_CACHE)
 #define CPU_FTRS_603	(CPU_FTR_COMMON | CPU_FTR_MAYBE_CAN_DOZE | \
 	    CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE | CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_604	(CPU_FTR_COMMON | CPU_FTR_PPC_LE)
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 54f4ec1f9fab..08dbe3e6831c 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -41,11 +41,7 @@ struct div_result {
 
 /* Accessor functions for the timebase (RTC on 601) registers. */
 /* If one day CONFIG_POWER is added just define __USE_RTC as 1 */
-#ifdef CONFIG_PPC_BOOK3S_32
-#define __USE_RTC()	(cpu_has_feature(CPU_FTR_USE_RTC))
-#else
-#define __USE_RTC()	0
-#endif
+#define __USE_RTC()	(IS_ENABLED(CONFIG_PPC_BOOK3S_601))
 
 #ifdef CONFIG_PPC64
 
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index d60598113a9f..eae9ddaecbcf 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -94,28 +94,6 @@ static struct vdso_patch_def vdso_patches[] = {
 		CPU_FTR_COHERENT_ICACHE, CPU_FTR_COHERENT_ICACHE,
 		"__kernel_sync_dicache", "__kernel_sync_dicache_p5"
 	},
-#ifdef CONFIG_PPC32
-	{
-		CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
-		"__kernel_gettimeofday", NULL
-	},
-	{
-		CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
-		"__kernel_clock_gettime", NULL
-	},
-	{
-		CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
-		"__kernel_clock_getres", NULL
-	},
-	{
-		CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
-		"__kernel_get_tbfreq", NULL
-	},
-	{
-		CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
-		"__kernel_time", NULL
-	},
-#endif
 };
 
 /*
diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S
index 6984125b9fc0..6c7401bd284e 100644
--- a/arch/powerpc/kernel/vdso32/datapage.S
+++ b/arch/powerpc/kernel/vdso32/datapage.S
@@ -70,6 +70,7 @@ V_FUNCTION_END(__kernel_get_syscall_map)
  *
  * returns the timebase frequency in HZ
  */
+#ifndef CONFIG_PPC_BOOK3S_601
 V_FUNCTION_BEGIN(__kernel_get_tbfreq)
   .cfi_startproc
 	mflr	r12
@@ -82,3 +83,4 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq)
 	blr
   .cfi_endproc
 V_FUNCTION_END(__kernel_get_tbfreq)
+#endif
diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S
index 099a6db14e67..00c025ba4a92 100644
--- a/arch/powerpc/kernel/vdso32/vdso32.lds.S
+++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S
@@ -144,10 +144,13 @@ VERSION
 		__kernel_datapage_offset;
 
 		__kernel_get_syscall_map;
+#ifndef CONFIG_PPC_BOOK3S_601
 		__kernel_gettimeofday;
 		__kernel_clock_gettime;
 		__kernel_clock_getres;
+		__kernel_time;
 		__kernel_get_tbfreq;
+#endif
 		__kernel_sync_dicache;
 		__kernel_sync_dicache_p5;
 		__kernel_sigtramp32;
@@ -155,7 +158,6 @@ VERSION
 #ifdef CONFIG_PPC64
 		__kernel_getcpu;
 #endif
-		__kernel_time;
 
 	local: *;
 	};

From 39097b9c6d762d3fcd6f753e05ee3e34ec250ff3 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Mon, 26 Aug 2019 15:52:17 +0000
Subject: [PATCH 107/264] powerpc/32s: use CONFIG_PPC_BOOK3S_601 instead of
 reading PVR

Use CONFIG_PPC_BOOK3S_601 instead of reading PVR to know if
it is a 601 or not.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/909c26db9facd7fe454695b303f952e019dd9eda.1566834712.git.christophe.leroy@c-s.fr
---
 arch/powerpc/kernel/head_32.S | 49 ++++++++++++++++-------------------
 arch/powerpc/kernel/misc_32.S |  6 ++---
 2 files changed, 24 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 9e6f01abb31e..4a24f8f026c7 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -34,7 +34,16 @@
 
 #include "head_32.h"
 
-/* 601 only have IBAT; cr0.eq is set on 601 when using this macro */
+/* 601 only have IBAT */
+#ifdef CONFIG_PPC_BOOK3S_601
+#define LOAD_BAT(n, reg, RA, RB)	\
+	li	RA,0;			\
+	mtspr	SPRN_IBAT##n##U,RA;	\
+	lwz	RA,(n*16)+0(reg);	\
+	lwz	RB,(n*16)+4(reg);	\
+	mtspr	SPRN_IBAT##n##U,RA;	\
+	mtspr	SPRN_IBAT##n##L,RB
+#else
 #define LOAD_BAT(n, reg, RA, RB)	\
 	/* see the comment for clear_bats() -- Cort */ \
 	li	RA,0;			\
@@ -44,12 +53,11 @@
 	lwz	RB,(n*16)+4(reg);	\
 	mtspr	SPRN_IBAT##n##U,RA;	\
 	mtspr	SPRN_IBAT##n##L,RB;	\
-	beq	1f;			\
 	lwz	RA,(n*16)+8(reg);	\
 	lwz	RB,(n*16)+12(reg);	\
 	mtspr	SPRN_DBAT##n##U,RA;	\
-	mtspr	SPRN_DBAT##n##L,RB;	\
-1:
+	mtspr	SPRN_DBAT##n##L,RB
+#endif
 
 	__HEAD
 	.stabs	"arch/powerpc/kernel/",N_SO,0,0,0f
@@ -820,9 +828,6 @@ load_up_mmu:
 
 /* Load the BAT registers with the values set up by MMU_init.
    MMU_init takes care of whether we're on a 601 or not. */
-	mfpvr	r3
-	srwi	r3,r3,16
-	cmpwi	r3,1
 	lis	r3,BATS@ha
 	addi	r3,r3,BATS@l
 	tophys(r3,r3)
@@ -998,11 +1003,8 @@ EXPORT_SYMBOL(switch_mmu_context)
  */
 clear_bats:
 	li	r10,0
-	mfspr	r9,SPRN_PVR
-	rlwinm	r9,r9,16,16,31		/* r9 = 1 for 601, 4 for 604 */
-	cmpwi	r9, 1
-	beq	1f
 
+#ifndef CONFIG_PPC_BOOK3S_601
 	mtspr	SPRN_DBAT0U,r10
 	mtspr	SPRN_DBAT0L,r10
 	mtspr	SPRN_DBAT1U,r10
@@ -1011,7 +1013,7 @@ clear_bats:
 	mtspr	SPRN_DBAT2L,r10
 	mtspr	SPRN_DBAT3U,r10
 	mtspr	SPRN_DBAT3L,r10
-1:
+#endif
 	mtspr	SPRN_IBAT0U,r10
 	mtspr	SPRN_IBAT0L,r10
 	mtspr	SPRN_IBAT1U,r10
@@ -1106,10 +1108,7 @@ mmu_off:
  */
 initial_bats:
 	lis	r11,PAGE_OFFSET@h
-	mfspr	r9,SPRN_PVR
-	rlwinm	r9,r9,16,16,31		/* r9 = 1 for 601, 4 for 604 */
-	cmpwi	0,r9,1
-	bne	4f
+#ifdef CONFIG_PPC_BOOK3S_601
 	ori	r11,r11,4		/* set up BAT registers for 601 */
 	li	r8,0x7f			/* valid, block length = 8MB */
 	mtspr	SPRN_IBAT0U,r11		/* N.B. 601 has valid bit in */
@@ -1122,10 +1121,8 @@ initial_bats:
 	addis	r8,r8,0x800000@h
 	mtspr	SPRN_IBAT2U,r11
 	mtspr	SPRN_IBAT2L,r8
-	isync
-	blr
-
-4:	tophys(r8,r11)
+#else
+	tophys(r8,r11)
 #ifdef CONFIG_SMP
 	ori	r8,r8,0x12		/* R/W access, M=1 */
 #else
@@ -1137,10 +1134,10 @@ initial_bats:
 	mtspr	SPRN_DBAT0U,r11		/* bit in upper BAT register */
 	mtspr	SPRN_IBAT0L,r8
 	mtspr	SPRN_IBAT0U,r11
+#endif
 	isync
 	blr
 
-
 #ifdef CONFIG_BOOTX_TEXT
 setup_disp_bat:
 	/*
@@ -1155,15 +1152,13 @@ setup_disp_bat:
 	beqlr
 	lwz	r11,0(r8)
 	lwz	r8,4(r8)
-	mfspr	r9,SPRN_PVR
-	rlwinm	r9,r9,16,16,31		/* r9 = 1 for 601, 4 for 604 */
-	cmpwi	0,r9,1
-	beq	1f
+#ifndef CONFIG_PPC_BOOK3S_601
 	mtspr	SPRN_DBAT3L,r8
 	mtspr	SPRN_DBAT3U,r11
-	blr
-1:	mtspr	SPRN_IBAT3L,r8
+#else
+	mtspr	SPRN_IBAT3L,r8
 	mtspr	SPRN_IBAT3U,r11
+#endif
 	blr
 #endif /* CONFIG_BOOTX_TEXT */
 
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index 02d90e1ebf65..b917641cdaa6 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -303,11 +303,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_UNIFIED_ID_CACHE)
 	mfspr	r3,SPRN_L1CSR1
 	ori	r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR
 	mtspr	SPRN_L1CSR1,r3
+#elif defined(CONFIG_PPC_BOOK3S_601)
+	blr			/* for 601, do nothing */
 #else
-	mfspr	r3,SPRN_PVR
-	rlwinm	r3,r3,16,16,31
-	cmpwi	0,r3,1
-	beqlr			/* for 601, do nothing */
 	/* 603/604 processor - use invalidate-all bit in HID0 */
 	mfspr	r3,SPRN_HID0
 	ori	r3,r3,HID0_ICFI

From e0291f1decd6e8d447067f7d2cf01b1091b7cb3f Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Mon, 26 Aug 2019 15:52:18 +0000
Subject: [PATCH 108/264] powerpc/32: drop CPU_FTR_UNIFIED_ID_CACHE

Only the 601 and e200 have a unified I/D cache.

Drop the feature and use CONFIG_PPC_BOOK3S_601 and CONFIG_E200.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/b5902144266d2f4eed1ffea53915bd0245841e02.1566834712.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/cputable.h | 5 ++---
 arch/powerpc/kernel/misc_32.S       | 4 ++--
 arch/powerpc/kernel/setup_32.c      | 2 +-
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index fac1b81bbbd9..a1ebcbc3931f 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -158,7 +158,6 @@ static inline void cpu_feature_keys_init(void) { }
 #define CPU_FTR_NEED_COHERENT		ASM_CONST(0x01000000)
 #define CPU_FTR_NO_BTIC			ASM_CONST(0x02000000)
 #define CPU_FTR_PPC_LE			ASM_CONST(0x04000000)
-#define CPU_FTR_UNIFIED_ID_CACHE	ASM_CONST(0x08000000)
 #define CPU_FTR_SPE			ASM_CONST(0x10000000)
 #define CPU_FTR_NEED_PAIRED_STWCX	ASM_CONST(0x20000000)
 #define CPU_FTR_INDEXED_DCR		ASM_CONST(0x40000000)
@@ -292,7 +291,7 @@ static inline void cpu_feature_keys_init(void) { }
 #endif
 
 #define CPU_FTRS_PPC601	(CPU_FTR_COMMON | \
-	CPU_FTR_COHERENT_ICACHE | CPU_FTR_UNIFIED_ID_CACHE)
+	CPU_FTR_COHERENT_ICACHE)
 #define CPU_FTRS_603	(CPU_FTR_COMMON | CPU_FTR_MAYBE_CAN_DOZE | \
 	    CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE | CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_604	(CPU_FTR_COMMON | CPU_FTR_PPC_LE)
@@ -383,7 +382,7 @@ static inline void cpu_feature_keys_init(void) { }
 #define CPU_FTRS_47X	(CPU_FTRS_440x6)
 #define CPU_FTRS_E200	(CPU_FTR_SPE_COMP | \
 	    CPU_FTR_NODSISRALIGN | CPU_FTR_COHERENT_ICACHE | \
-	    CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_NOEXECUTE | \
+	    CPU_FTR_NOEXECUTE | \
 	    CPU_FTR_DEBUG_LVL_EXC)
 #define CPU_FTRS_E500	(CPU_FTR_MAYBE_CAN_DOZE | \
 	    CPU_FTR_SPE_COMP | CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_NODSISRALIGN | \
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index b917641cdaa6..3d21fb110797 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -292,14 +292,14 @@ _GLOBAL(flush_instruction_cache)
 	iccci	0,r3
 #endif
 #elif defined(CONFIG_FSL_BOOKE)
-BEGIN_FTR_SECTION
+#ifdef CONFIG_E200
 	mfspr   r3,SPRN_L1CSR0
 	ori     r3,r3,L1CSR0_CFI|L1CSR0_CLFC
 	/* msync; isync recommended here */
 	mtspr   SPRN_L1CSR0,r3
 	isync
 	blr
-END_FTR_SECTION_IFSET(CPU_FTR_UNIFIED_ID_CACHE)
+#endif
 	mfspr	r3,SPRN_L1CSR1
 	ori	r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR
 	mtspr	SPRN_L1CSR1,r3
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 94517e4a2723..a7541edf0cdb 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -206,6 +206,6 @@ __init void initialize_cache_info(void)
 	dcache_bsize = cur_cpu_spec->dcache_bsize;
 	icache_bsize = cur_cpu_spec->icache_bsize;
 	ucache_bsize = 0;
-	if (cpu_has_feature(CPU_FTR_UNIFIED_ID_CACHE))
+	if (IS_ENABLED(CONFIG_PPC_BOOK3S_601) || IS_ENABLED(CONFIG_E200))
 		ucache_bsize = icache_bsize = dcache_bsize;
 }

From c7bf1252d5b3891b4ab7072b240a8422fb9da793 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Mon, 26 Aug 2019 15:52:19 +0000
Subject: [PATCH 109/264] powerpc/32: don't use CPU_FTR_COHERENT_ICACHE

Only the 601 and e200 have CPU_FTR_COHERENT_ICACHE.

Just use #ifdefs instead of feature fixups.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/5f3e92ccd64d06477b27626f6007a9da3b8da157.1566834712.git.christophe.leroy@c-s.fr
---
 arch/powerpc/kernel/misc_32.S | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index 3d21fb110797..82df4b09e79f 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -324,10 +324,10 @@ EXPORT_SYMBOL(flush_instruction_cache)
  * flush_icache_range(unsigned long start, unsigned long stop)
  */
 _GLOBAL(flush_icache_range)
-BEGIN_FTR_SECTION
+#if defined(CONFIG_PPC_BOOK3S_601) || defined(CONFIG_E200)
 	PURGE_PREFETCHED_INS
-	blr				/* for 601, do nothing */
-END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
+	blr				/* for 601 and e200, do nothing */
+#else
 	rlwinm	r3,r3,0,0,31 - L1_CACHE_SHIFT
 	subf	r4,r3,r4
 	addi	r4,r4,L1_CACHE_BYTES - 1
@@ -353,6 +353,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 	sync				/* additional sync needed on g4 */
 	isync
 	blr
+#endif
 _ASM_NOKPROBE_SYMBOL(flush_icache_range)
 EXPORT_SYMBOL(flush_icache_range)
 
@@ -360,15 +361,15 @@ EXPORT_SYMBOL(flush_icache_range)
  * Flush a particular page from the data cache to RAM.
  * Note: this is necessary because the instruction cache does *not*
  * snoop from the data cache.
- * This is a no-op on the 601 which has a unified cache.
+ * This is a no-op on the 601 and e200 which have a unified cache.
  *
  *	void __flush_dcache_icache(void *page)
  */
 _GLOBAL(__flush_dcache_icache)
-BEGIN_FTR_SECTION
+#if defined(CONFIG_PPC_BOOK3S_601) || defined(CONFIG_E200)
 	PURGE_PREFETCHED_INS
 	blr
-END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
+#else
 	rlwinm	r3,r3,0,0,31-PAGE_SHIFT		/* Get page base address */
 	li	r4,PAGE_SIZE/L1_CACHE_BYTES	/* Number of lines in a page */
 	mtctr	r4
@@ -396,6 +397,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x)
 	sync
 	isync
 	blr
+#endif
 
 #ifndef CONFIG_BOOKE
 /*
@@ -407,10 +409,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x)
  *	void __flush_dcache_icache_phys(unsigned long physaddr)
  */
 _GLOBAL(__flush_dcache_icache_phys)
-BEGIN_FTR_SECTION
+#if defined(CONFIG_PPC_BOOK3S_601) || defined(CONFIG_E200)
 	PURGE_PREFETCHED_INS
-	blr					/* for 601, do nothing */
-END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
+	blr					/* for 601 and e200, do nothing */
+#else
 	mfmsr	r10
 	rlwinm	r0,r10,0,28,26			/* clear DR */
 	mtmsr	r0
@@ -431,6 +433,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 	mtmsr	r10				/* restore DR */
 	isync
 	blr
+#endif
 #endif /* CONFIG_BOOKE */
 
 /*

From facd04a904ff6cdc6ee85d6e85d500f478a1bec4 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 27 Aug 2019 13:30:06 +1000
Subject: [PATCH 110/264] powerpc: convert to copy_thread_tls

Commit 3033f14ab78c3 ("clone: support passing tls argument via C rather
than pt_regs magic") introduced the HAVE_COPY_THREAD_TLS option. Use it
to avoid a subtle assumption about the argument ordering of clone type
syscalls.
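
The difference is visible in how the TLS pointer reaches the thread
copy code (sketch based on the hunk below):

  /* Before: tls read back out of the child's syscall register frame,
   * relying on the clone argument order putting it in r6. */
  childregs->gpr[2] = childregs->gpr[6];

  /* After: the core kernel passes tls explicitly. */
  childregs->gpr[2] = tls;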

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190827033010.28090-2-npiggin@gmail.com
---
 arch/powerpc/Kconfig          | 1 +
 arch/powerpc/kernel/process.c | 9 +++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 3d65084f4066..cfb21856559b 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -184,6 +184,7 @@ config PPC
 	select HAVE_STACKPROTECTOR		if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13)
 	select HAVE_STACKPROTECTOR		if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2)
 	select HAVE_CONTEXT_TRACKING		if PPC64
+	select HAVE_COPY_THREAD_TLS
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DEBUG_STACKOVERFLOW
 	select HAVE_DYNAMIC_FTRACE
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 8fc4de0d22b4..24621e7e5033 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1600,8 +1600,9 @@ static void setup_ksp_vsid(struct task_struct *p, unsigned long sp)
 /*
  * Copy architecture-specific thread state
  */
-int copy_thread(unsigned long clone_flags, unsigned long usp,
-		unsigned long kthread_arg, struct task_struct *p)
+int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
+		unsigned long kthread_arg, struct task_struct *p,
+		unsigned long tls)
 {
 	struct pt_regs *childregs, *kregs;
 	extern void ret_from_fork(void);
@@ -1642,10 +1643,10 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
 		if (clone_flags & CLONE_SETTLS) {
 #ifdef CONFIG_PPC64
 			if (!is_32bit_task())
-				childregs->gpr[13] = childregs->gpr[6];
+				childregs->gpr[13] = tls;
 			else
 #endif
-				childregs->gpr[2] = childregs->gpr[6];
+				childregs->gpr[2] = tls;
 		}
 
 		f = ret_from_fork;

From 555e28179d37e431e97d78d106fc917bec2c6f93 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 27 Aug 2019 13:30:07 +1000
Subject: [PATCH 111/264] powerpc/64: remove support for kernel-mode syscalls

There is support for the kernel to execute the 'sc 0' instruction and
make a system call to itself. This is a relic that is unused in the
tree and therefore untested. It's also highly questionable for modules
to be doing this.
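
The removed path keyed off MSR[PR] at syscall entry to decide whether
a stack switch was needed (simplified sketch of the deleted logic; see
the entry_64.S hunk below):

  	andi.	r10,r12,MSR_PR		/* 'sc' issued from user mode? */
  	beq-	1f			/* no: stay on the current stack */
  	ld	r1,PACAKSAVE(r13)	/* yes: switch to the kernel stack */
  1:	...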

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190827033010.28090-3-npiggin@gmail.com
---
 arch/powerpc/kernel/entry_64.S       | 21 ++++++---------------
 arch/powerpc/kernel/exceptions-64s.S |  2 --
 2 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 0a0b5310f54a..6467bdab8d40 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -69,24 +69,20 @@ BEGIN_FTR_SECTION
 	bne	.Ltabort_syscall
 END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
-	andi.	r10,r12,MSR_PR
 	mr	r10,r1
-	addi	r1,r1,-INT_FRAME_SIZE
-	beq-	1f
 	ld	r1,PACAKSAVE(r13)
-1:	std	r10,0(r1)
+	std	r10,0(r1)
 	std	r11,_NIP(r1)
 	std	r12,_MSR(r1)
 	std	r0,GPR0(r1)
 	std	r10,GPR1(r1)
-	beq	2f			/* if from kernel mode */
 #ifdef CONFIG_PPC_FSL_BOOK3E
 START_BTB_FLUSH_SECTION
 	BTB_FLUSH(r10)
 END_BTB_FLUSH_SECTION
 #endif
 	ACCOUNT_CPU_USER_ENTRY(r13, r10, r11)
-2:	std	r2,GPR2(r1)
+	std	r2,GPR2(r1)
 	std	r3,GPR3(r1)
 	mfcr	r2
 	std	r4,GPR4(r1)
@@ -122,14 +118,13 @@ END_BTB_FLUSH_SECTION
 
 #if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR)
 BEGIN_FW_FTR_SECTION
-	beq	33f
-	/* if from user, see if there are any DTL entries to process */
+	/* see if there are any DTL entries to process */
 	ld	r10,PACALPPACAPTR(r13)	/* get ptr to VPA */
 	ld	r11,PACA_DTL_RIDX(r13)	/* get log read index */
 	addi	r10,r10,LPPACA_DTLIDX
 	LDX_BE	r10,0,r10		/* get log write index */
-	cmpd	cr1,r11,r10
-	beq+	cr1,33f
+	cmpd	r11,r10
+	beq+	33f
 	bl	accumulate_stolen_time
 	REST_GPR(0,r1)
 	REST_4GPRS(3,r1)
@@ -203,6 +198,7 @@ system_call:			/* label this so stack traces look sane */
 	mtctr   r12
 	bctrl			/* Call handler */
 
+	/* syscall_exit can exit to kernel mode, via ret_from_kernel_thread */
 .Lsyscall_exit:
 	std	r3,RESULT(r1)
 
@@ -216,11 +212,6 @@ system_call:			/* label this so stack traces look sane */
 	ld	r12, PACA_THREAD_INFO(r13)
 
 	ld	r8,_MSR(r1)
-#ifdef CONFIG_PPC_BOOK3S
-	/* No MSR:RI on BookE */
-	andi.	r10,r8,MSR_RI
-	beq-	.Lunrecov_restore
-#endif
 
 /*
  * This is a few instructions into the actual syscall exit path (which actually
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index dded4672579d..520804351601 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1522,8 +1522,6 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
  * system call / hypercall (0xc00, 0x4c00)
  *
  * The system call exception is invoked with "sc 0" and does not alter HV bit.
- * There is support for kernel code to invoke system calls but there are no
- * in-tree users.
  *
  * The hypercall is invoked with "sc 1" and sets HV=1.
  *

From f2902a2fb40c589b886d21518ef8a1ee87f76b0c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Aug 2019 15:22:30 +0200
Subject: [PATCH 112/264] powerpc: use the generic dma coherent remap allocator

This switches to using common code for the DMA allocations, including
potential use of the CMA allocator if configured.

Switching to the generic code enables DMA allocations from atomic
context, which is required by the DMA API documentation, and also
adds various other minor features drivers start relying upon.  It
also makes sure we have one tested code base for all architectures
that require uncached pte bits for coherent DMA allocations.

Another advantage is that consistent memory allocations now share
the general vmalloc pool instead of needing an explicit carveout
from it.
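
One driver-visible consequence (illustrative; behaviour still depends
on the platform and GFP flags):

  /* Now legal from atomic context on non-coherent platforms, served
   * from the generic atomic pool rather than failing or sleeping. */
  buf = dma_alloc_coherent(dev, size, &dma_handle, GFP_ATOMIC);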

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Christophe Leroy <christophe.leroy@c-s.fr> # tested on 8xx
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190814132230.31874-2-hch@lst.de
---
 arch/powerpc/Kconfig                         |  12 -
 arch/powerpc/include/asm/book3s/32/pgtable.h |  12 +-
 arch/powerpc/include/asm/nohash/32/pgtable.h |  12 +-
 arch/powerpc/mm/dma-noncoherent.c            | 318 +------------------
 arch/powerpc/mm/mem.c                        |   4 -
 arch/powerpc/mm/ptdump/ptdump.c              |   9 -
 arch/powerpc/platforms/Kconfig.cputype       |   2 +
 7 files changed, 17 insertions(+), 352 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index cfb21856559b..edc19363a729 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -1141,18 +1141,6 @@ config TASK_SIZE
 	default "0x80000000" if PPC_8xx
 	default "0xc0000000"
 
-config CONSISTENT_SIZE_BOOL
-	bool "Set custom consistent memory pool size"
-	depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE
-	help
-	  This option allows you to set the size of the
-	  consistent memory pool.  This pool of virtual memory
-	  is used to make consistent memory allocations.
-
-config CONSISTENT_SIZE
-	hex "Size of consistent memory pool" if CONSISTENT_SIZE_BOOL
-	default "0x00200000" if NOT_COHERENT_CACHE
-
 config PIN_TLB
 	bool "Pinned Kernel TLBs (860 ONLY)"
 	depends on ADVANCED_OPTIONS && PPC_8xx && \
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 331a29a501a1..0796533d37dd 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -148,21 +148,15 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
  */
 #include <asm/fixmap.h>
 
-#ifdef CONFIG_HIGHMEM
-#define KVIRT_TOP	PKMAP_BASE
-#else
-#define KVIRT_TOP	FIXADDR_START
-#endif
-
 /*
  * ioremap_bot starts at that address. Early ioremaps move down from there,
  * until mem_init() at which point this becomes the top of the vmalloc
  * and ioremap space
  */
-#ifdef CONFIG_NOT_COHERENT_CACHE
-#define IOREMAP_TOP	((KVIRT_TOP - CONFIG_CONSISTENT_SIZE) & PAGE_MASK)
+#ifdef CONFIG_HIGHMEM
+#define IOREMAP_TOP	PKMAP_BASE
 #else
-#define IOREMAP_TOP	KVIRT_TOP
+#define IOREMAP_TOP	FIXADDR_START
 #endif
 
 /* PPC32 shares vmalloc area with ioremap */
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 3e1a4c1e40f0..552b96eef0c8 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -76,21 +76,15 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
  */
 #include <asm/fixmap.h>
 
-#ifdef CONFIG_HIGHMEM
-#define KVIRT_TOP	PKMAP_BASE
-#else
-#define KVIRT_TOP	FIXADDR_START
-#endif
-
 /*
  * ioremap_bot starts at that address. Early ioremaps move down from there,
  * until mem_init() at which point this becomes the top of the vmalloc
  * and ioremap space
  */
-#ifdef CONFIG_NOT_COHERENT_CACHE
-#define IOREMAP_TOP	((KVIRT_TOP - CONFIG_CONSISTENT_SIZE) & PAGE_MASK)
+#ifdef CONFIG_HIGHMEM
+#define IOREMAP_TOP	PKMAP_BASE
 #else
-#define IOREMAP_TOP	KVIRT_TOP
+#define IOREMAP_TOP	FIXADDR_START
 #endif
 
 /* PPC32 shares vmalloc area with ioremap */
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
index c617282d5b2a..4272ca5e8159 100644
--- a/arch/powerpc/mm/dma-noncoherent.c
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -4,310 +4,18 @@
  *    Copyright (C) 2001 Dan Malek (dmalek@jlc.net)
  *
  *  Copyright (C) 2000 Russell King
- *
- * Consistent memory allocators.  Used for DMA devices that want to
- * share uncached memory with the processor core.  The function return
- * is the virtual address and 'dma_handle' is the physical address.
- * Mostly stolen from the ARM port, with some changes for PowerPC.
- *						-- Dan
- *
- * Reorganized to get rid of the arch-specific consistent_* functions
- * and provide non-coherent implementations for the DMA API. -Matt
- *
- * Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent()
- * implementation. This is pulled straight from ARM and barely
- * modified. -Matt
  */
 
-#include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
-#include <linux/string.h>
 #include <linux/types.h>
 #include <linux/highmem.h>
 #include <linux/dma-direct.h>
 #include <linux/dma-noncoherent.h>
-#include <linux/export.h>
 
 #include <asm/tlbflush.h>
 #include <asm/dma.h>
 
-#include <mm/mmu_decl.h>
-
-/*
- * This address range defaults to a value that is safe for all
- * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It
- * can be further configured for specific applications under
- * the "Advanced Setup" menu. -Matt
- */
-#define CONSISTENT_BASE		(IOREMAP_TOP)
-#define CONSISTENT_END 		(CONSISTENT_BASE + CONFIG_CONSISTENT_SIZE)
-#define CONSISTENT_OFFSET(x)	(((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT)
-
-/*
- * This is the page table (2MB) covering uncached, DMA consistent allocations
- */
-static DEFINE_SPINLOCK(consistent_lock);
-
-/*
- * VM region handling support.
- *
- * This should become something generic, handling VM region allocations for
- * vmalloc and similar (ioremap, module space, etc).
- *
- * I envisage vmalloc()'s supporting vm_struct becoming:
- *
- *  struct vm_struct {
- *    struct vm_region	region;
- *    unsigned long	flags;
- *    struct page	**pages;
- *    unsigned int	nr_pages;
- *    unsigned long	phys_addr;
- *  };
- *
- * get_vm_area() would then call vm_region_alloc with an appropriate
- * struct vm_region head (eg):
- *
- *  struct vm_region vmalloc_head = {
- *	.vm_list	= LIST_HEAD_INIT(vmalloc_head.vm_list),
- *	.vm_start	= VMALLOC_START,
- *	.vm_end		= VMALLOC_END,
- *  };
- *
- * However, vmalloc_head.vm_start is variable (typically, it is dependent on
- * the amount of RAM found at boot time.)  I would imagine that get_vm_area()
- * would have to initialise this each time prior to calling vm_region_alloc().
- */
-struct ppc_vm_region {
-	struct list_head	vm_list;
-	unsigned long		vm_start;
-	unsigned long		vm_end;
-};
-
-static struct ppc_vm_region consistent_head = {
-	.vm_list	= LIST_HEAD_INIT(consistent_head.vm_list),
-	.vm_start	= CONSISTENT_BASE,
-	.vm_end		= CONSISTENT_END,
-};
-
-static struct ppc_vm_region *
-ppc_vm_region_alloc(struct ppc_vm_region *head, size_t size, gfp_t gfp)
-{
-	unsigned long addr = head->vm_start, end = head->vm_end - size;
-	unsigned long flags;
-	struct ppc_vm_region *c, *new;
-
-	new = kmalloc(sizeof(struct ppc_vm_region), gfp);
-	if (!new)
-		goto out;
-
-	spin_lock_irqsave(&consistent_lock, flags);
-
-	list_for_each_entry(c, &head->vm_list, vm_list) {
-		if ((addr + size) < addr)
-			goto nospc;
-		if ((addr + size) <= c->vm_start)
-			goto found;
-		addr = c->vm_end;
-		if (addr > end)
-			goto nospc;
-	}
-
- found:
-	/*
-	 * Insert this entry _before_ the one we found.
-	 */
-	list_add_tail(&new->vm_list, &c->vm_list);
-	new->vm_start = addr;
-	new->vm_end = addr + size;
-
-	spin_unlock_irqrestore(&consistent_lock, flags);
-	return new;
-
- nospc:
-	spin_unlock_irqrestore(&consistent_lock, flags);
-	kfree(new);
- out:
-	return NULL;
-}
-
-static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsigned long addr)
-{
-	struct ppc_vm_region *c;
-
-	list_for_each_entry(c, &head->vm_list, vm_list) {
-		if (c->vm_start == addr)
-			goto out;
-	}
-	c = NULL;
- out:
-	return c;
-}
-
-/*
- * Allocate DMA-coherent memory space and return both the kernel remapped
- * virtual and bus address for that space.
- */
-void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		gfp_t gfp, unsigned long attrs)
-{
-	struct page *page;
-	struct ppc_vm_region *c;
-	unsigned long order;
-	u64 mask = ISA_DMA_THRESHOLD, limit;
-
-	if (dev) {
-		mask = dev->coherent_dma_mask;
-
-		/*
-		 * Sanity check the DMA mask - it must be non-zero, and
-		 * must be able to be satisfied by a DMA allocation.
-		 */
-		if (mask == 0) {
-			dev_warn(dev, "coherent DMA mask is unset\n");
-			goto no_page;
-		}
-
-		if ((~mask) & ISA_DMA_THRESHOLD) {
-			dev_warn(dev, "coherent DMA mask %#llx is smaller "
-				 "than system GFP_DMA mask %#llx\n",
-				 mask, (unsigned long long)ISA_DMA_THRESHOLD);
-			goto no_page;
-		}
-	}
-
-
-	size = PAGE_ALIGN(size);
-	limit = (mask + 1) & ~mask;
-	if ((limit && size >= limit) ||
-	    size >= (CONSISTENT_END - CONSISTENT_BASE)) {
-		printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n",
-		       size, mask);
-		return NULL;
-	}
-
-	order = get_order(size);
-
-	/* Might be useful if we ever have a real legacy DMA zone... */
-	if (mask != 0xffffffff)
-		gfp |= GFP_DMA;
-
-	page = alloc_pages(gfp, order);
-	if (!page)
-		goto no_page;
-
-	/*
-	 * Invalidate any data that might be lurking in the
-	 * kernel direct-mapped region for device DMA.
-	 */
-	{
-		unsigned long kaddr = (unsigned long)page_address(page);
-		memset(page_address(page), 0, size);
-		flush_dcache_range(kaddr, kaddr + size);
-	}
-
-	/*
-	 * Allocate a virtual address in the consistent mapping region.
-	 */
-	c = ppc_vm_region_alloc(&consistent_head, size,
-			    gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
-	if (c) {
-		unsigned long vaddr = c->vm_start;
-		struct page *end = page + (1 << order);
-
-		split_page(page, order);
-
-		/*
-		 * Set the "dma handle"
-		 */
-		*dma_handle = phys_to_dma(dev, page_to_phys(page));
-
-		do {
-			SetPageReserved(page);
-			map_kernel_page(vaddr, page_to_phys(page),
-					pgprot_noncached(PAGE_KERNEL));
-			page++;
-			vaddr += PAGE_SIZE;
-		} while (size -= PAGE_SIZE);
-
-		/*
-		 * Free the otherwise unused pages.
-		 */
-		while (page < end) {
-			__free_page(page);
-			page++;
-		}
-
-		return (void *)c->vm_start;
-	}
-
-	if (page)
-		__free_pages(page, order);
- no_page:
-	return NULL;
-}
-
-/*
- * free a page as defined by the above mapping.
- */
-void arch_dma_free(struct device *dev, size_t size, void *vaddr,
-		dma_addr_t dma_handle, unsigned long attrs)
-{
-	struct ppc_vm_region *c;
-	unsigned long flags, addr;
-	
-	size = PAGE_ALIGN(size);
-
-	spin_lock_irqsave(&consistent_lock, flags);
-
-	c = ppc_vm_region_find(&consistent_head, (unsigned long)vaddr);
-	if (!c)
-		goto no_area;
-
-	if ((c->vm_end - c->vm_start) != size) {
-		printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n",
-		       __func__, c->vm_end - c->vm_start, size);
-		dump_stack();
-		size = c->vm_end - c->vm_start;
-	}
-
-	addr = c->vm_start;
-	do {
-		pte_t *ptep;
-		unsigned long pfn;
-
-		ptep = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(addr),
-							       addr),
-						    addr),
-					 addr);
-		if (!pte_none(*ptep) && pte_present(*ptep)) {
-			pfn = pte_pfn(*ptep);
-			pte_clear(&init_mm, addr, ptep);
-			if (pfn_valid(pfn)) {
-				struct page *page = pfn_to_page(pfn);
-				__free_reserved_page(page);
-			}
-		}
-		addr += PAGE_SIZE;
-	} while (size -= PAGE_SIZE);
-
-	flush_tlb_kernel_range(c->vm_start, c->vm_end);
-
-	list_del(&c->vm_list);
-
-	spin_unlock_irqrestore(&consistent_lock, flags);
-
-	kfree(c);
-	return;
-
- no_area:
-	spin_unlock_irqrestore(&consistent_lock, flags);
-	printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n",
-	       __func__, vaddr);
-	dump_stack();
-}
-
 /*
  * make an area consistent.
  */
@@ -408,23 +116,15 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
 	__dma_sync_page(paddr, size, dir);
 }
 
-/*
- * Return the PFN for a given cpu virtual address returned by arch_dma_alloc.
- */
-long arch_dma_coherent_to_pfn(struct device *dev, void *vaddr,
-		dma_addr_t dma_addr)
+void arch_dma_prep_coherent(struct page *page, size_t size)
 {
-	/* This should always be populated, so we don't test every
-	 * level. If that fails, we'll have a nice crash which
-	 * will be as good as a BUG_ON()
-	 */
-	unsigned long cpu_addr = (unsigned long)vaddr;
-	pgd_t *pgd = pgd_offset_k(cpu_addr);
-	pud_t *pud = pud_offset(pgd, cpu_addr);
-	pmd_t *pmd = pmd_offset(pud, cpu_addr);
-	pte_t *ptep = pte_offset_kernel(pmd, cpu_addr);
+	unsigned long kaddr = (unsigned long)page_address(page);
 
-	if (pte_none(*ptep) || !pte_present(*ptep))
-		return 0;
-	return pte_pfn(*ptep);
+	flush_dcache_range(kaddr, kaddr + size);
 }
+
+static int __init atomic_pool_init(void)
+{
+	return dma_atomic_pool_init(GFP_KERNEL, pgprot_noncached(PAGE_KERNEL));
+}
+postcore_initcall(atomic_pool_init);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 69f99128a8d6..be941d382c8d 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -302,10 +302,6 @@ void __init mem_init(void)
 	pr_info("  * 0x%08lx..0x%08lx  : highmem PTEs\n",
 		PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP));
 #endif /* CONFIG_HIGHMEM */
-#ifdef CONFIG_NOT_COHERENT_CACHE
-	pr_info("  * 0x%08lx..0x%08lx  : consistent mem\n",
-		IOREMAP_TOP, IOREMAP_TOP + CONFIG_CONSISTENT_SIZE);
-#endif /* CONFIG_NOT_COHERENT_CACHE */
 	if (ioremap_bot != IOREMAP_TOP)
 		pr_info("  * 0x%08lx..0x%08lx  : early ioremap\n",
 			ioremap_bot, IOREMAP_TOP);
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index ab6a572202b4..2f9ddc29c535 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -84,10 +84,6 @@ static struct addr_marker address_markers[] = {
 #else
 	{ 0,	"Early I/O remap start" },
 	{ 0,	"Early I/O remap end" },
-#ifdef CONFIG_NOT_COHERENT_CACHE
-	{ 0,	"Consistent mem start" },
-	{ 0,	"Consistent mem end" },
-#endif
 #ifdef CONFIG_HIGHMEM
 	{ 0,	"Highmem PTEs start" },
 	{ 0,	"Highmem PTEs end" },
@@ -335,11 +331,6 @@ static void populate_markers(void)
 #else /* !CONFIG_PPC64 */
 	address_markers[i++].start_address = ioremap_bot;
 	address_markers[i++].start_address = IOREMAP_TOP;
-#ifdef CONFIG_NOT_COHERENT_CACHE
-	address_markers[i++].start_address = IOREMAP_TOP;
-	address_markers[i++].start_address = IOREMAP_TOP +
-					     CONFIG_CONSISTENT_SIZE;
-#endif
 #ifdef CONFIG_HIGHMEM
 	address_markers[i++].start_address = PKMAP_BASE;
 	address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP);
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 68c5cc075bfc..12543e53fa96 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -460,8 +460,10 @@ config NOT_COHERENT_CACHE
 	depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || \
 		GAMECUBE_COMMON || AMIGAONE
 	select ARCH_HAS_DMA_COHERENT_TO_PFN
+	select ARCH_HAS_DMA_PREP_COHERENT
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
+	select DMA_DIRECT_REMAP
 	default n if PPC_47x
 	default y
 

From bc605cd79edb68131d3be5b00b949aa312277d39 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Thu, 29 Aug 2019 18:44:17 +1000
Subject: [PATCH 113/264] powerpc/of/pci: Rewrite pci_parse_of_flags

The existing code uses a bunch of hardcoded values from the PCI Bus
Binding to IEEE Std 1275 spec, and it does so in a quite non-obvious
way.

This defines the fields of cell #0 of a PCI device's "reg" property
and uses them for parsing.

This should cause no behavioral change.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[mpe: Unsplit some 80/81 char lines, space the code with some newlines]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190829084417.71873-1-aik@ozlabs.ru
---
 arch/powerpc/kernel/pci_of_scan.c | 66 +++++++++++++++++++++++++------
 1 file changed, 55 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c
index 409c6c1beabf..f91d7e94872e 100644
--- a/arch/powerpc/kernel/pci_of_scan.c
+++ b/arch/powerpc/kernel/pci_of_scan.c
@@ -34,31 +34,75 @@ static u32 get_int_prop(struct device_node *np, const char *name, u32 def)
  * pci_parse_of_flags - Parse the flags cell of a device tree PCI address
  * @addr0: value of 1st cell of a device tree PCI address.
  * @bridge: Set this flag if the address is from a bridge 'ranges' property
+ *
+ * PCI Bus Binding to IEEE Std 1275-1994
+ *
+ * Bit#            33222222 22221111 11111100 00000000
+ *                 10987654 32109876 54321098 76543210
+ * phys.hi cell:   npt000ss bbbbbbbb dddddfff rrrrrrrr
+ * phys.mid cell:  hhhhhhhh hhhhhhhh hhhhhhhh hhhhhhhh
+ * phys.lo cell:   llllllll llllllll llllllll llllllll
+ *
+ * where:
+ * n        is 0 if the address is relocatable, 1 otherwise
+ * p        is 1 if the addressable region is "prefetchable", 0 otherwise
+ * t        is 1 if the address is aliased (for non-relocatable I/O),
+ *          below 1 MB (for Memory), or below 64 KB (for relocatable I/O).
+ * ss       is the space code, denoting the address space:
+ *              00 denotes Configuration Space
+ *              01 denotes I/O Space
+ *              10 denotes 32-bit-address Memory Space
+ *              11 denotes 64-bit-address Memory Space
+ * bbbbbbbb is the 8-bit Bus Number
+ * ddddd    is the 5-bit Device Number
+ * fff      is the 3-bit Function Number
+ * rrrrrrrr is the 8-bit Register Number
  */
+#define OF_PCI_ADDR0_SPACE(ss)		(((ss)&3)<<24)
+#define OF_PCI_ADDR0_SPACE_CFG		OF_PCI_ADDR0_SPACE(0)
+#define OF_PCI_ADDR0_SPACE_IO		OF_PCI_ADDR0_SPACE(1)
+#define OF_PCI_ADDR0_SPACE_MMIO32	OF_PCI_ADDR0_SPACE(2)
+#define OF_PCI_ADDR0_SPACE_MMIO64	OF_PCI_ADDR0_SPACE(3)
+#define OF_PCI_ADDR0_SPACE_MASK		OF_PCI_ADDR0_SPACE(3)
+#define OF_PCI_ADDR0_RELOC		(1UL<<31)
+#define OF_PCI_ADDR0_PREFETCH		(1UL<<30)
+#define OF_PCI_ADDR0_ALIAS		(1UL<<29)
+#define OF_PCI_ADDR0_BUS		0x00FF0000UL
+#define OF_PCI_ADDR0_DEV		0x0000F800UL
+#define OF_PCI_ADDR0_FN			0x00000700UL
+#define OF_PCI_ADDR0_BARREG		0x000000FFUL
+
 unsigned int pci_parse_of_flags(u32 addr0, int bridge)
 {
-	unsigned int flags = 0;
+	unsigned int flags = 0, as = addr0 & OF_PCI_ADDR0_SPACE_MASK;
 
-	if (addr0 & 0x02000000) {
+	if (as == OF_PCI_ADDR0_SPACE_MMIO32 || as == OF_PCI_ADDR0_SPACE_MMIO64) {
 		flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY;
-		flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64;
-		if (flags & PCI_BASE_ADDRESS_MEM_TYPE_64)
-			flags |= IORESOURCE_MEM_64;
-		flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M;
-		if (addr0 & 0x40000000)
-			flags |= IORESOURCE_PREFETCH
-				 | PCI_BASE_ADDRESS_MEM_PREFETCH;
+
+		if (as == OF_PCI_ADDR0_SPACE_MMIO64)
+			flags |= PCI_BASE_ADDRESS_MEM_TYPE_64 | IORESOURCE_MEM_64;
+
+		if (addr0 & OF_PCI_ADDR0_ALIAS)
+			flags |= PCI_BASE_ADDRESS_MEM_TYPE_1M;
+
+		if (addr0 & OF_PCI_ADDR0_PREFETCH)
+			flags |= IORESOURCE_PREFETCH |
+				 PCI_BASE_ADDRESS_MEM_PREFETCH;
+
 		/* Note: We don't know whether the ROM has been left enabled
 		 * by the firmware or not. We mark it as disabled (ie, we do
 		 * not set the IORESOURCE_ROM_ENABLE flag) for now rather than
 		 * do a config space read, it will be force-enabled if needed
 		 */
-		if (!bridge && (addr0 & 0xff) == 0x30)
+		if (!bridge && (addr0 & OF_PCI_ADDR0_BARREG) == PCI_ROM_ADDRESS)
 			flags |= IORESOURCE_READONLY;
-	} else if (addr0 & 0x01000000)
+
+	} else if (as == OF_PCI_ADDR0_SPACE_IO)
 		flags = IORESOURCE_IO | PCI_BASE_ADDRESS_SPACE_IO;
+
 	if (flags)
 		flags |= IORESOURCE_SIZEALIGN;
+
 	return flags;
 }
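
As a standalone illustration of the new field macros (a hypothetical
debugging helper, not part of the patch), decoding a phys.hi cell could
look like this:

	/* sketch: dump the fields of the first "reg" cell of a PCI node */
	static void dump_of_pci_addr0(u32 addr0)
	{
		unsigned long space = (addr0 & OF_PCI_ADDR0_SPACE_MASK) >> 24;
		unsigned long bus = (addr0 & OF_PCI_ADDR0_BUS) >> 16;
		unsigned long dev = (addr0 & OF_PCI_ADDR0_DEV) >> 11;
		unsigned long fn = (addr0 & OF_PCI_ADDR0_FN) >> 8;
		unsigned long reg = addr0 & OF_PCI_ADDR0_BARREG;

		pr_info("space=%lx bus=%lu dev=%lu fn=%lu reg=%lx\n",
			space, bus, dev, fn, reg);
	}

For example, addr0 = 0x82000810 decodes as 32-bit memory space, bus 0,
device 1, function 0, BAR register 0x10.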
 

From 4f916593be9da38c5cf0d3a5c386b57beb70f422 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Mon, 26 Aug 2019 14:55:20 +1000
Subject: [PATCH 114/264] KVM: PPC: Book3S: Fix incorrect
 guest-to-user-translation error handling

H_PUT_TCE_INDIRECT handlers receive a page with up to 512 TCEs from
a guest. Although we verify correctness of TCEs before we do anything
with the existing tables, there is a small window in which a check in
kvmppc_tce_validate might pass and right after that the guest alters
the page of TCEs, which can cause an early exit from the handler and
leave srcu_read_lock(&vcpu->kvm->srcu) (virtual mode) or lock_rmap(rmap)
(real mode) held.

This fixes the bug by jumping to the common exit code with an appropriate
unlock.

Fixes: 121f80ba68f1 ("KVM: PPC: VFIO: Add in-kernel acceleration for VFIO")
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190826045520.92153-1-aik@ozlabs.ru
---
 arch/powerpc/kvm/book3s_64_vio.c    | 6 ++++--
 arch/powerpc/kvm/book3s_64_vio_hv.c | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index e99a14798ab0..c4b606fe73eb 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -660,8 +660,10 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 		}
 		tce = be64_to_cpu(tce);
 
-		if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua))
-			return H_PARAMETER;
+		if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) {
+			ret = H_PARAMETER;
+			goto unlock_exit;
+		}
 
 		list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
 			ret = kvmppc_tce_iommu_map(vcpu->kvm, stt,
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index f50bbeedfc66..b4f20f13b860 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -556,8 +556,10 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 		unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
 
 		ua = 0;
-		if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
-			return H_PARAMETER;
+		if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) {
+			ret = H_PARAMETER;
+			goto unlock_exit;
+		}
 
 		list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
 			ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt,

From 35872480da47ec714fd9c4f2f3d2d83daf304851 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Thu, 29 Aug 2019 18:52:48 +1000
Subject: [PATCH 115/264] powerpc/powernv/ioda: Split out TCE invalidation from
 TCE updates

At the moment updates in a TCE table are made by iommu_table_ops::exchange,
which updates one TCE and invalidates an entry in the PHB/NPU TCE cache
via a set of registers called "TCE Kill" (hence the naming).
Writing a TCE is a simple xchg(), but invalidating the TCE cache is
a relatively expensive OPAL call. Mapping a 100GB guest with PCI+NPU
passed-through devices takes about 20s.

Thankfully we can do better. Since such big mappings happen at boot
time and when memory is plugged/onlined (i.e. not often), these requests
come in batches of 512 pages, so we can call OPAL 512 times less often,
which brings the 20s above down to less than 10s. Also, since TCE caches
can be flushed entirely, calling OPAL for 512 TCEs helps skiboot [1]
decide whether to flush the entire cache or not.

This implements 2 new iommu_table_ops callbacks:
- xchg_no_kill() to update a single TCE with no TCE invalidation;
- tce_kill() to invalidate multiple TCEs.
This uses the same xchg_no_kill() callback for IODA1/2.

This implements 2 new wrappers on top of the new callbacks similar to
the existing iommu_tce_xchg().

This does not use the new callbacks yet; the next patches will, so
this should not cause any behavioral change.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190829085252.72370-2-aik@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h          | 17 ++++++++++++++
 arch/powerpc/kernel/iommu.c               | 27 +++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci-ioda.c | 12 ++++++++++
 3 files changed, 56 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 18d342b815e4..babddede9245 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -58,6 +58,17 @@ struct iommu_table_ops {
 			unsigned long *hpa,
 			enum dma_data_direction *direction);
 
+	int (*xchg_no_kill)(struct iommu_table *tbl,
+			long index,
+			unsigned long *hpa,
+			enum dma_data_direction *direction,
+			bool realmode);
+
+	void (*tce_kill)(struct iommu_table *tbl,
+			unsigned long index,
+			unsigned long pages,
+			bool realmode);
+
 	__be64 *(*useraddrptr)(struct iommu_table *tbl, long index, bool alloc);
 #endif
 	void (*clear)(struct iommu_table *tbl,
@@ -206,6 +217,12 @@ extern void iommu_del_device(struct device *dev);
 extern long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl,
 		unsigned long entry, unsigned long *hpa,
 		enum dma_data_direction *direction);
+extern long iommu_tce_xchg_no_kill(struct mm_struct *mm,
+		struct iommu_table *tbl,
+		unsigned long entry, unsigned long *hpa,
+		enum dma_data_direction *direction);
+extern void iommu_tce_kill(struct iommu_table *tbl,
+		unsigned long entry, unsigned long pages);
 #else
 static inline void iommu_register_group(struct iommu_table_group *table_group,
 					int pci_domain_number,
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 0a67ce9f827e..145f29cf7e4c 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1005,6 +1005,33 @@ long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_xchg);
 
+extern long iommu_tce_xchg_no_kill(struct mm_struct *mm,
+		struct iommu_table *tbl,
+		unsigned long entry, unsigned long *hpa,
+		enum dma_data_direction *direction)
+{
+	long ret;
+	unsigned long size = 0;
+
+	ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, false);
+	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
+			(*direction == DMA_BIDIRECTIONAL)) &&
+			!mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift,
+					&size))
+		SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_xchg_no_kill);
+
+void iommu_tce_kill(struct iommu_table *tbl,
+		unsigned long entry, unsigned long pages)
+{
+	if (tbl->it_ops->tce_kill)
+		tbl->it_ops->tce_kill(tbl, entry, pages, false);
+}
+EXPORT_SYMBOL_GPL(iommu_tce_kill);
+
 int iommu_take_ownership(struct iommu_table *tbl)
 {
 	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index d8080558d020..911f96abed89 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1939,6 +1939,14 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
 }
 
 #ifdef CONFIG_IOMMU_API
+/* Common for IODA1 and IODA2 */
+static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction,
+		bool realmode)
+{
+	return pnv_tce_xchg(tbl, index, hpa, direction, !realmode);
+}
+
 static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
 		unsigned long *hpa, enum dma_data_direction *direction)
 {
@@ -1975,6 +1983,8 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
 #ifdef CONFIG_IOMMU_API
 	.exchange = pnv_ioda1_tce_xchg,
 	.exchange_rm = pnv_ioda1_tce_xchg_rm,
+	.xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
+	.tce_kill = pnv_pci_p7ioc_tce_invalidate,
 	.useraddrptr = pnv_tce_useraddrptr,
 #endif
 	.clear = pnv_ioda1_tce_free,
@@ -2140,6 +2150,8 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = {
 #ifdef CONFIG_IOMMU_API
 	.exchange = pnv_ioda2_tce_xchg,
 	.exchange_rm = pnv_ioda2_tce_xchg_rm,
+	.xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
+	.tce_kill = pnv_pci_ioda2_tce_invalidate,
 	.useraddrptr = pnv_tce_useraddrptr,
 #endif
 	.clear = pnv_ioda2_tce_free,
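
A minimal sketch of the intended usage pattern (hypothetical caller,
assuming an initialized table and valid entries): update a run of TCEs
with the no-kill primitive, then invalidate the cache once for the whole
range.

	static long map_window(struct mm_struct *mm, struct iommu_table *tbl,
			       unsigned long entry, unsigned long *hpas,
			       enum dma_data_direction dir,
			       unsigned long npages)
	{
		unsigned long i;
		long ret = 0;

		for (i = 0; i < npages; ++i) {
			enum dma_data_direction d = dir;

			/* hpas[i] is in/out: it returns the old HPA */
			ret = iommu_tce_xchg_no_kill(mm, tbl, entry + i,
						     &hpas[i], &d);
			if (ret)
				break;
		}

		/* one TCE cache invalidation instead of npages of them */
		iommu_tce_kill(tbl, entry, i);

		return ret;
	}

This is the shape the KVM and VFIO callers adopt in the following patches.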

From 01b7d128b5a7f0d09626c090093ff44155f347c9 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Thu, 29 Aug 2019 18:52:49 +1000
Subject: [PATCH 116/264] KVM: PPC: Book3S: Invalidate multiple TCEs at once

Invalidating a TCE cache entry for each updated TCE is quite expensive.
This makes use of the new iommu_table_ops::xchg_no_kill()/tce_kill()
callbacks to bring down the time spent mapping a huge guest DMA window,
from roughly 20s to 10s for each guest's 100GB of DMA space.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Acked-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190829085252.72370-3-aik@ozlabs.ru
---
 arch/powerpc/kvm/book3s_64_vio.c    | 29 +++++++++++++++-------
 arch/powerpc/kvm/book3s_64_vio_hv.c | 38 +++++++++++++++++++++--------
 2 files changed, 48 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index c4b606fe73eb..5834db0a54c6 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -416,7 +416,7 @@ static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl,
 	unsigned long hpa = 0;
 	enum dma_data_direction dir = DMA_NONE;
 
-	iommu_tce_xchg(mm, tbl, entry, &hpa, &dir);
+	iommu_tce_xchg_no_kill(mm, tbl, entry, &hpa, &dir);
 }
 
 static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
@@ -447,7 +447,8 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
 	unsigned long hpa = 0;
 	long ret;
 
-	if (WARN_ON_ONCE(iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir)))
+	if (WARN_ON_ONCE(iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa,
+					&dir)))
 		return H_TOO_HARD;
 
 	if (dir == DMA_NONE)
@@ -455,7 +456,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
 
 	ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);
 	if (ret != H_SUCCESS)
-		iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir);
+		iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, &dir);
 
 	return ret;
 }
@@ -501,7 +502,7 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
 	if (mm_iommu_mapped_inc(mem))
 		return H_TOO_HARD;
 
-	ret = iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir);
+	ret = iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, &dir);
 	if (WARN_ON_ONCE(ret)) {
 		mm_iommu_mapped_dec(mem);
 		return H_TOO_HARD;
@@ -579,6 +580,8 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 			ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl,
 					entry, ua, dir);
 
+		iommu_tce_kill(stit->tbl, entry, 1);
+
 		if (ret != H_SUCCESS) {
 			kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
 			goto unlock_exit;
@@ -656,13 +659,13 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 		 */
 		if (get_user(tce, tces + i)) {
 			ret = H_TOO_HARD;
-			goto unlock_exit;
+			goto invalidate_exit;
 		}
 		tce = be64_to_cpu(tce);
 
 		if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) {
 			ret = H_PARAMETER;
-			goto unlock_exit;
+			goto invalidate_exit;
 		}
 
 		list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -673,13 +676,17 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 			if (ret != H_SUCCESS) {
 				kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl,
 						entry);
-				goto unlock_exit;
+				goto invalidate_exit;
 			}
 		}
 
 		kvmppc_tce_put(stt, entry + i, tce);
 	}
 
+invalidate_exit:
+	list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
+		iommu_tce_kill(stit->tbl, entry, npages);
+
 unlock_exit:
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 
@@ -718,7 +725,7 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
 				continue;
 
 			if (ret == H_TOO_HARD)
-				return ret;
+				goto invalidate_exit;
 
 			WARN_ON_ONCE(1);
 			kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
@@ -728,6 +735,10 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
 	for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
 		kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
 
-	return H_SUCCESS;
+invalidate_exit:
+	list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
+		iommu_tce_kill(stit->tbl, ioba >> stt->page_shift, npages);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce);
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index b4f20f13b860..ab6eeb8e753e 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -218,13 +218,14 @@ static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt,
 	return H_SUCCESS;
 }
 
-static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl,
+static long iommu_tce_xchg_no_kill_rm(struct mm_struct *mm,
+		struct iommu_table *tbl,
 		unsigned long entry, unsigned long *hpa,
 		enum dma_data_direction *direction)
 {
 	long ret;
 
-	ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
+	ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, true);
 
 	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
 				(*direction == DMA_BIDIRECTIONAL))) {
@@ -240,13 +241,20 @@ static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl,
 	return ret;
 }
 
+extern void iommu_tce_kill_rm(struct iommu_table *tbl,
+		unsigned long entry, unsigned long pages)
+{
+	if (tbl->it_ops->tce_kill)
+		tbl->it_ops->tce_kill(tbl, entry, pages, true);
+}
+
 static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl,
 		unsigned long entry)
 {
 	unsigned long hpa = 0;
 	enum dma_data_direction dir = DMA_NONE;
 
-	iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
+	iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir);
 }
 
 static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
@@ -278,7 +286,7 @@ static long kvmppc_rm_tce_iommu_do_unmap(struct kvm *kvm,
 	unsigned long hpa = 0;
 	long ret;
 
-	if (iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir))
+	if (iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir))
 		/*
 		 * real mode xchg can fail if struct page crosses
 		 * a page boundary
@@ -290,7 +298,7 @@ static long kvmppc_rm_tce_iommu_do_unmap(struct kvm *kvm,
 
 	ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry);
 	if (ret)
-		iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
+		iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir);
 
 	return ret;
 }
@@ -336,7 +344,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
 	if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem)))
 		return H_TOO_HARD;
 
-	ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
+	ret = iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir);
 	if (ret) {
 		mm_iommu_mapped_dec(mem);
 		/*
@@ -417,6 +425,8 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 			ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt,
 					stit->tbl, entry, ua, dir);
 
+		iommu_tce_kill_rm(stit->tbl, entry, 1);
+
 		if (ret != H_SUCCESS) {
 			kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
 			return ret;
@@ -558,7 +568,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 		ua = 0;
 		if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) {
 			ret = H_PARAMETER;
-			goto unlock_exit;
+			goto invalidate_exit;
 		}
 
 		list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -569,13 +579,17 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 			if (ret != H_SUCCESS) {
 				kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl,
 						entry);
-				goto unlock_exit;
+				goto invalidate_exit;
 			}
 		}
 
 		kvmppc_rm_tce_put(stt, entry + i, tce);
 	}
 
+invalidate_exit:
+	list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
+		iommu_tce_kill_rm(stit->tbl, entry, npages);
+
 unlock_exit:
 	if (rmap)
 		unlock_rmap(rmap);
@@ -618,7 +632,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
 				continue;
 
 			if (ret == H_TOO_HARD)
-				return ret;
+				goto invalidate_exit;
 
 			WARN_ON_ONCE_RM(1);
 			kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
@@ -628,7 +642,11 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
 	for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
 		kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value);
 
-	return H_SUCCESS;
+invalidate_exit:
+	list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
+		iommu_tce_kill_rm(stit->tbl, ioba >> stt->page_shift, npages);
+
+	return ret;
 }
 
 /* This can be called in either virtual mode or real mode */
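
For context (a hypothetical implementation shape, not part of the patch;
invalidate_cache() and invalidate_cache_rm() are made-up names): the
realmode flag threaded through the new hooks only selects which
invalidation path the platform takes, since real-mode code must avoid
anything that can fault or sleep:

	static void my_tce_kill(struct iommu_table *tbl, unsigned long entry,
				unsigned long pages, bool realmode)
	{
		if (realmode)
			/* no locks, no access to paged/vmalloc'd data */
			invalidate_cache_rm(tbl, entry, pages);
		else
			invalidate_cache(tbl, entry, pages);
	}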

From 650ab1e370cdb61ba5c7495006f4376e02374da0 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Thu, 29 Aug 2019 18:52:50 +1000
Subject: [PATCH 117/264] vfio/spapr_tce: Invalidate multiple TCEs at once

Invalidating a TCE cache entry for each updated TCE is quite expensive.
This makes use of the new iommu_table_ops::xchg_no_kill()/tce_kill()
callbacks to bring down the time spent in mapping a huge guest DMA window.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190829085252.72370-4-aik@ozlabs.ru
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 8ce9ad21129f..9809369e0ed3 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -435,7 +435,7 @@ static int tce_iommu_clear(struct tce_container *container,
 	unsigned long oldhpa;
 	long ret;
 	enum dma_data_direction direction;
-	unsigned long lastentry = entry + pages;
+	unsigned long lastentry = entry + pages, firstentry = entry;
 
 	for ( ; entry < lastentry; ++entry) {
 		if (tbl->it_indirect_levels && tbl->it_userspace) {
@@ -460,7 +460,7 @@ static int tce_iommu_clear(struct tce_container *container,
 
 		direction = DMA_NONE;
 		oldhpa = 0;
-		ret = iommu_tce_xchg(container->mm, tbl, entry, &oldhpa,
+		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
 				&direction);
 		if (ret)
 			continue;
@@ -476,6 +476,8 @@ static int tce_iommu_clear(struct tce_container *container,
 		tce_iommu_unuse_page(container, oldhpa);
 	}
 
+	iommu_tce_kill(tbl, firstentry, pages);
+
 	return 0;
 }
 
@@ -518,8 +520,8 @@ static long tce_iommu_build(struct tce_container *container,
 
 		hpa |= offset;
 		dirtmp = direction;
-		ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa,
-				&dirtmp);
+		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
+				&hpa, &dirtmp);
 		if (ret) {
 			tce_iommu_unuse_page(container, hpa);
 			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
@@ -536,6 +538,8 @@ static long tce_iommu_build(struct tce_container *container,
 
 	if (ret)
 		tce_iommu_clear(container, tbl, entry, i);
+	else
+		iommu_tce_kill(tbl, entry, pages);
 
 	return ret;
 }
@@ -572,8 +576,8 @@ static long tce_iommu_build_v2(struct tce_container *container,
 		if (mm_iommu_mapped_inc(mem))
 			break;
 
-		ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa,
-				&dirtmp);
+		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
+				&hpa, &dirtmp);
 		if (ret) {
 			/* dirtmp cannot be DMA_NONE here */
 			tce_iommu_unuse_page_v2(container, tbl, entry + i);
@@ -593,6 +597,8 @@ static long tce_iommu_build_v2(struct tce_container *container,
 
 	if (ret)
 		tce_iommu_clear(container, tbl, entry, i);
+	else
+		iommu_tce_kill(tbl, entry, pages);
 
 	return ret;
 }

From 021b7868113cdca56297f2356d0f6fe89985c285 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Thu, 29 Aug 2019 18:52:51 +1000
Subject: [PATCH 118/264] powerpc/pseries/iommu: Switch to xchg_no_kill

This is the last implementation of iommu_table_ops::exchange() which
we are about to remove.

This implements xchg_no_kill() for pseries. Since it is a paravirtualized
platform, the hypervisor does the TCE invalidations and we do not have
to deal with them here, hence no tce_kill() hook.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190829085252.72370-5-aik@ozlabs.ru
---
 arch/powerpc/platforms/pseries/iommu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 889dc2e44b89..a22ef9c7de2e 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -621,7 +621,8 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 
 #ifdef CONFIG_IOMMU_API
 static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
-				long *tce, enum dma_data_direction *direction)
+				long *tce, enum dma_data_direction *direction,
+				bool realmode)
 {
 	long rc;
 	unsigned long ioba = (unsigned long) index << tbl->it_page_shift;
@@ -649,7 +650,7 @@ static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
 struct iommu_table_ops iommu_table_lpar_multi_ops = {
 	.set = tce_buildmulti_pSeriesLP,
 #ifdef CONFIG_IOMMU_API
-	.exchange = tce_exchange_pseries,
+	.xchg_no_kill = tce_exchange_pseries,
 #endif
 	.clear = tce_freemulti_pSeriesLP,
 	.get = tce_get_pSeriesLP
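
For reference, no pseries-specific casing is needed in callers for the
missing hook: the common wrapper added earlier in the series already
guards it:

	void iommu_tce_kill(struct iommu_table *tbl,
			unsigned long entry, unsigned long pages)
	{
		if (tbl->it_ops->tce_kill)	/* NULL on pseries */
			tbl->it_ops->tce_kill(tbl, entry, pages, false);
	}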

From a102f139aac54689eeb05883952742ae780159f3 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Thu, 29 Aug 2019 18:52:52 +1000
Subject: [PATCH 119/264] powerpc/powernv/ioda: Remove obsolete
 iommu_table_ops::exchange callbacks

Now that we have xchg_no_kill()/tce_kill(), these are no longer used,
so remove them.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190829085252.72370-6-aik@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h          | 10 -----
 arch/powerpc/kernel/iommu.c               | 26 +-----------
 arch/powerpc/platforms/powernv/pci-ioda.c | 50 -----------------------
 3 files changed, 1 insertion(+), 85 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index babddede9245..9ed0206cd98c 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -48,16 +48,6 @@ struct iommu_table_ops {
 	 * returns old TCE and DMA direction mask.
 	 * @tce is a physical address.
 	 */
-	int (*exchange)(struct iommu_table *tbl,
-			long index,
-			unsigned long *hpa,
-			enum dma_data_direction *direction);
-	/* Real mode */
-	int (*exchange_rm)(struct iommu_table *tbl,
-			long index,
-			unsigned long *hpa,
-			enum dma_data_direction *direction);
-
 	int (*xchg_no_kill)(struct iommu_table *tbl,
 			long index,
 			unsigned long *hpa,
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 145f29cf7e4c..bf803000e4b3 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -981,30 +981,6 @@ int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa)
 }
 EXPORT_SYMBOL_GPL(iommu_tce_check_gpa);
 
-long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl,
-		unsigned long entry, unsigned long *hpa,
-		enum dma_data_direction *direction)
-{
-	long ret;
-	unsigned long size = 0;
-
-	ret = tbl->it_ops->exchange(tbl, entry, hpa, direction);
-
-	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
-			(*direction == DMA_BIDIRECTIONAL)) &&
-			!mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift,
-					&size))
-		SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));
-
-	/* if (unlikely(ret))
-		pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
-			__func__, hwaddr, entry << tbl->it_page_shift,
-				hwaddr, ret); */
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(iommu_tce_xchg);
-
 extern long iommu_tce_xchg_no_kill(struct mm_struct *mm,
 		struct iommu_table *tbl,
 		unsigned long entry, unsigned long *hpa,
@@ -1044,7 +1020,7 @@ int iommu_take_ownership(struct iommu_table *tbl)
 	 * requires exchange() callback defined so if it is not
 	 * implemented, we disallow taking ownership over the table.
 	 */
-	if (!tbl->it_ops->exchange)
+	if (!tbl->it_ops->xchg_no_kill)
 		return -EINVAL;
 
 	spin_lock_irqsave(&tbl->large_pool.lock, flags);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 911f96abed89..22a51aa75796 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1946,28 +1946,6 @@ static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index,
 {
 	return pnv_tce_xchg(tbl, index, hpa, direction, !realmode);
 }
-
-static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
-		unsigned long *hpa, enum dma_data_direction *direction)
-{
-	long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
-
-	if (!ret)
-		pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false);
-
-	return ret;
-}
-
-static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
-		unsigned long *hpa, enum dma_data_direction *direction)
-{
-	long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
-
-	if (!ret)
-		pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
-
-	return ret;
-}
 #endif
 
 static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
@@ -1981,8 +1959,6 @@ static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
 static struct iommu_table_ops pnv_ioda1_iommu_ops = {
 	.set = pnv_ioda1_tce_build,
 #ifdef CONFIG_IOMMU_API
-	.exchange = pnv_ioda1_tce_xchg,
-	.exchange_rm = pnv_ioda1_tce_xchg_rm,
 	.xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
 	.tce_kill = pnv_pci_p7ioc_tce_invalidate,
 	.useraddrptr = pnv_tce_useraddrptr,
@@ -2113,30 +2089,6 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
 	return ret;
 }
 
-#ifdef CONFIG_IOMMU_API
-static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
-		unsigned long *hpa, enum dma_data_direction *direction)
-{
-	long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
-
-	if (!ret)
-		pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
-
-	return ret;
-}
-
-static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
-		unsigned long *hpa, enum dma_data_direction *direction)
-{
-	long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
-
-	if (!ret)
-		pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);
-
-	return ret;
-}
-#endif
-
 static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
 		long npages)
 {
@@ -2148,8 +2100,6 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
 	.set = pnv_ioda2_tce_build,
 #ifdef CONFIG_IOMMU_API
-	.exchange = pnv_ioda2_tce_xchg,
-	.exchange_rm = pnv_ioda2_tce_xchg_rm,
 	.xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
 	.tce_kill = pnv_pci_ioda2_tce_invalidate,
 	.useraddrptr = pnv_tce_useraddrptr,

From 70ed86f4de5bd74dd2d884dcd2f3275c4cfe665f Mon Sep 17 00:00:00 2001
From: Claudio Carvalho <cclaudio@linux.ibm.com>
Date: Thu, 29 Aug 2019 12:50:20 -0300
Subject: [PATCH 120/264] powerpc: Add PowerPC Capabilities ELF note

Add the PowerPC name and the PPC_ELFNOTE_CAPABILITIES type to the
kernel binary ELF note. This type is a bitmap that can be used to
advertise kernel capabilities to userland.

This patch also defines PPCCAP_ULTRAVISOR_BIT as bit zero.

Suggested-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Claudio Carvalho <cclaudio@linux.ibm.com>
[ maxiwell: Define the 'PowerPC' type in the elfnote.h ]
Signed-off-by: Maxiwell S. Garcia <maxiwell@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190829155021.2915-2-maxiwell@linux.ibm.com
---
 arch/powerpc/include/asm/elfnote.h | 24 ++++++++++++++++++
 arch/powerpc/kernel/Makefile       |  2 +-
 arch/powerpc/kernel/note.S         | 40 ++++++++++++++++++++++++++++++
 3 files changed, 65 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/include/asm/elfnote.h
 create mode 100644 arch/powerpc/kernel/note.S

diff --git a/arch/powerpc/include/asm/elfnote.h b/arch/powerpc/include/asm/elfnote.h
new file mode 100644
index 000000000000..a201b6e9ae44
--- /dev/null
+++ b/arch/powerpc/include/asm/elfnote.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * PowerPC ELF notes.
+ *
+ * Copyright 2019, IBM Corporation
+ */
+
+#ifndef __ASM_POWERPC_ELFNOTE_H__
+#define __ASM_POWERPC_ELFNOTE_H__
+
+/*
+ * These note types should live in a SHT_NOTE segment and have
+ * "PowerPC" in the name field.
+ */
+
+/*
+ * The capabilities supported/required by this kernel (bitmap).
+ *
+ * This type uses a bitmap as "desc" field. Each bit is described
+ * in arch/powerpc/kernel/note.S
+ */
+#define PPC_ELFNOTE_CAPABILITIES 1
+
+#endif /* __ASM_POWERPC_ELFNOTE_H__ */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index ea0c69236789..d4eb50de13b1 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -53,7 +53,7 @@ obj-y				:= cputable.o ptrace.o syscalls.o \
 				   dma-common.o
 obj-$(CONFIG_PPC64)		+= setup_64.o sys_ppc32.o \
 				   signal_64.o ptrace32.o \
-				   paca.o nvram_64.o firmware.o
+				   paca.o nvram_64.o firmware.o note.o
 obj-$(CONFIG_VDSO32)		+= vdso32/
 obj-$(CONFIG_PPC_WATCHDOG)	+= watchdog.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
diff --git a/arch/powerpc/kernel/note.S b/arch/powerpc/kernel/note.S
new file mode 100644
index 000000000000..bcdad15395dd
--- /dev/null
+++ b/arch/powerpc/kernel/note.S
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * PowerPC ELF notes.
+ *
+ * Copyright 2019, IBM Corporation
+ */
+
+#include <linux/elfnote.h>
+#include <asm/elfnote.h>
+
+/*
+ * Ultravisor-capable bit (PowerNV only).
+ *
+ * Bit 0 indicates that the powerpc kernel binary knows how to run in an
+ * ultravisor-enabled system.
+ *
+ * In an ultravisor-enabled system, some machine resources are now controlled
+ * by the ultravisor. If the kernel is not ultravisor-capable, but it ends up
+ * being run on a machine with ultravisor, the kernel will probably crash
+ * trying to access ultravisor resources. For instance, it may crash in early
+ * boot trying to set the partition table entry 0.
+ *
+ * In an ultravisor-enabled system, a bootloader could warn the user or prevent
+ * the kernel from being run if the PowerPC ultravisor capability doesn't exist
+ * or the Ultravisor-capable bit is not set.
+ */
+#ifdef CONFIG_PPC_POWERNV
+#define PPCCAP_ULTRAVISOR_BIT		(1 << 0)
+#else
+#define PPCCAP_ULTRAVISOR_BIT		0
+#endif
+
+/*
+ * Add the PowerPC Capabilities in the binary ELF note. It is a bitmap that
+ * can be used to advertise kernel capabilities to userland.
+ */
+#define PPC_CAPABILITIES_BITMAP (PPCCAP_ULTRAVISOR_BIT)
+
+ELFNOTE(PowerPC, PPC_ELFNOTE_CAPABILITIES,
+	.long PPC_CAPABILITIES_BITMAP)
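
As an illustrative check (not part of the patch), the note should show up
in a built kernel image via:

	$ readelf -n vmlinux

with "PowerPC" as the owner, PPC_ELFNOTE_CAPABILITIES (1) as the type and
the capability bitmap as the descriptor.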

From 134cb3ab71c0736da0ec66d2cfd7f6ebba96c8cf Mon Sep 17 00:00:00 2001
From: "Maxiwell S. Garcia" <maxiwell@linux.ibm.com>
Date: Thu, 29 Aug 2019 12:50:21 -0300
Subject: [PATCH 121/264] Documentation/powerpc: Add ELF note documentation

The ELF note documentation describes the types and descriptors to be
used with the PowerPC namespace.

Signed-off-by: Maxiwell S. Garcia <maxiwell@linux.ibm.com>
Signed-off-by: Claudio Carvalho <cclaudio@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190829155021.2915-3-maxiwell@linux.ibm.com
---
 Documentation/powerpc/elfnote.rst | 42 +++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 Documentation/powerpc/elfnote.rst

diff --git a/Documentation/powerpc/elfnote.rst b/Documentation/powerpc/elfnote.rst
new file mode 100644
index 000000000000..2a5c4beeb809
--- /dev/null
+++ b/Documentation/powerpc/elfnote.rst
@@ -0,0 +1,42 @@
+==========================
+ELF Note PowerPC Namespace
+==========================
+
+The PowerPC namespace in an ELF Note of the kernel binary is used to store
+capabilities and information which can be used by a bootloader or userland.
+
+Types and Descriptors
+---------------------
+
+The types to be used with the "PowerPC" namespace are defined in
+include/uapi/asm/elfnote.h.
+
+	1) PPC_ELFNOTE_CAPABILITIES
+
+Defines the capabilities supported/required by the kernel. This type uses a
+bitmap as the "descriptor" field. Each bit is described below:
+
+- Ultravisor-capable bit (PowerNV only).
+
+	#define PPCCAP_ULTRAVISOR_BIT (1 << 0)
+
+	Indicates that the powerpc kernel binary knows how to run in an
+	ultravisor-enabled system.
+
+	In an ultravisor-enabled system, some machine resources are now controlled
+	by the ultravisor. If the kernel is not ultravisor-capable, but it ends up
+	being run on a machine with ultravisor, the kernel will probably crash
+	trying to access ultravisor resources. For instance, it may crash in early
+	boot trying to set the partition table entry 0.
+
+	In an ultravisor-enabled system, a bootloader could warn the user or prevent
+	the kernel from being run if the PowerPC ultravisor capability doesn't exist
+	or the Ultravisor-capable bit is not set.
+
+References
+----------
+
+arch/powerpc/include/asm/elfnote.h
+arch/powerpc/kernel/note.S
+
+

From 250c6c31228d49f3b96855ec387cf37bbe7cb6a7 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.ibm.com>
Date: Thu, 22 Aug 2019 00:48:32 -0300
Subject: [PATCH 122/264] Documentation/powerpc: Ultravisor API

Protected Execution Facility (PEF) is an architectural change for
POWER 9 that enables Secure Virtual Machines (SVMs). When enabled,
PEF adds a new higher privileged mode, called Ultravisor mode, to POWER
architecture. Along with the new mode there is new firmware called the
Protected Execution Ultravisor (or Ultravisor for short).

POWER 9 DD2.3 chips (PVR=0x004e1203) or greater will be PEF-capable.

The attached documentation provides an overview of PEF and defines the API
for various interfaces that must be implemented in the Ultravisor
firmware as well as in the KVM Hypervisor.

Based on input from Mike Anderson, Thiago Bauermann, Claudio Carvalho,
Ben Herrenschmidt, Guerney Hunt, Paul Mackerras.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.ibm.com>
Signed-off-by: Ram Pai <linuxram@linux.ibm.com>
Signed-off-by: Guerney Hunt <gdhh@linux.ibm.com>
Reviewed-by: Claudio Carvalho <cclaudio@linux.ibm.com>
Reviewed-by: Michael Anderson <andmike@linux.ibm.com>
Reviewed-by: Thiago Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Claudio Carvalho <cclaudio@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190822034838.27876-2-cclaudio@linux.ibm.com
---
 Documentation/powerpc/ultravisor.rst | 1057 ++++++++++++++++++++++++++
 1 file changed, 1057 insertions(+)
 create mode 100644 Documentation/powerpc/ultravisor.rst

diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst
new file mode 100644
index 000000000000..94a149f34ec3
--- /dev/null
+++ b/Documentation/powerpc/ultravisor.rst
@@ -0,0 +1,1057 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. _ultravisor:
+
+============================
+Protected Execution Facility
+============================
+
+.. contents::
+    :depth: 3
+
+.. sectnum::
+    :depth: 3
+
+Protected Execution Facility
+############################
+
+    Protected Execution Facility (PEF) is an architectural change for
+    POWER 9 that enables Secure Virtual Machines (SVMs). DD2.3 chips
+    (PVR=0x004e1203) or greater will be PEF-capable. A new ISA release
+    will include the PEF RFC02487 changes.
+
+    When enabled, PEF adds a new higher privileged mode, called Ultravisor
+    mode, to POWER architecture. Along with the new mode there is new
+    firmware called the Protected Execution Ultravisor (or Ultravisor
+    for short). Ultravisor mode is the highest privileged mode in POWER
+    architecture.
+
+	+------------------+
+	| Privilege States |
+	+==================+
+	|  Problem         |
+	+------------------+
+	|  Supervisor      |
+	+------------------+
+	|  Hypervisor      |
+	+------------------+
+	|  Ultravisor      |
+	+------------------+
+
+    PEF protects SVMs from the hypervisor, privileged users, and other
+    VMs in the system. SVMs are protected while at rest and can only be
+    executed by an authorized machine. All virtual machines utilize
+    hypervisor services. The Ultravisor filters calls between the SVMs
+    and the hypervisor to assure that information does not accidentally
+    leak. All hypercalls except H_RANDOM are reflected to the hypervisor.
+    H_RANDOM is not reflected to prevent the hypervisor from influencing
+    random values in the SVM.
+
+    To support this there is a refactoring of the ownership of resources
+    in the CPU. Some of the resources which were previously hypervisor
+    privileged are now ultravisor privileged.
+
+Hardware
+========
+
+    The hardware changes include the following:
+
+    * There is a new bit in the MSR that determines whether the current
+      process is running in secure mode, MSR(S) bit 41. MSR(S)=1 means the
+      process is in secure mode, MSR(S)=0 means it is in normal mode.
+
+    * The MSR(S) bit can only be set by the Ultravisor.
+
+    * HRFID cannot be used to set the MSR(S) bit. If the hypervisor needs
+      to return to an SVM it must use an ultracall. It can determine if
+      the VM it is returning to is secure.
+
+    * There is a new Ultravisor privileged register, SMFCTRL, which has an
+      enable/disable bit SMFCTRL(E).
+
+    * The privilege of a process is now determined by three MSR bits,
+      MSR(S, HV, PR). In each of the tables below the modes are listed
+      from least privilege to highest privilege. The higher privilege
+      modes can access all the resources of the lower privilege modes.
+
+      **Secure Mode MSR Settings**
+
+      +---+---+---+---------------+
+      | S | HV| PR|Privilege      |
+      +===+===+===+===============+
+      | 1 | 0 | 1 | Problem       |
+      +---+---+---+---------------+
+      | 1 | 0 | 0 | Privileged(OS)|
+      +---+---+---+---------------+
+      | 1 | 1 | 0 | Ultravisor    |
+      +---+---+---+---------------+
+      | 1 | 1 | 1 | Reserved      |
+      +---+---+---+---------------+
+
+      **Normal Mode MSR Settings**
+
+      +---+---+---+---------------+
+      | S | HV| PR|Privilege      |
+      +===+===+===+===============+
+      | 0 | 0 | 1 | Problem       |
+      +---+---+---+---------------+
+      | 0 | 0 | 0 | Privileged(OS)|
+      +---+---+---+---------------+
+      | 0 | 1 | 0 | Hypervisor    |
+      +---+---+---+---------------+
+      | 0 | 1 | 1 | Problem (Host)|
+      +---+---+---+---------------+
+
+    * Memory is partitioned into secure and normal memory. Only processes
+      that are running in secure mode can access secure memory.
+
+    * The hardware does not allow anything that is not running secure to
+      access secure memory. This means that the Hypervisor cannot access
+      the memory of the SVM without using an ultracall (asking the
+      Ultravisor). The Ultravisor will only allow the hypervisor to see
+      the SVM memory encrypted.
+
+    * I/O systems are not allowed to directly address secure memory. This
+      limits the SVMs to virtual I/O only.
+
+    * The architecture allows the SVM to share pages of memory with the
+      hypervisor that are not protected with encryption. However, this
+      sharing must be initiated by the SVM.
+
+    * When a process is running in secure mode all hypercalls
+      (syscall lev=1) go to the Ultravisor.
+
+    * When a process is in secure mode all interrupts go to the
+      Ultravisor.
+
+    * The following resources have become Ultravisor privileged and
+      require an Ultravisor interface to manipulate:
+
+      * Processor configurations registers (SCOMs).
+
+      * Stop state information.
+
+      * The debug registers CIABR, DAWR, and DAWRX when SMFCTRL(D) is set.
+        If SMFCTRL(D) is not set they do not work in secure mode. When set,
+        reading and writing requires an Ultravisor call, otherwise that
+        will cause a Hypervisor Emulation Assistance interrupt.
+
+      * PTCR and partition table entries (partition table is in secure
+        memory). An attempt to write to PTCR will cause a Hypervisor
+        Emulation Assistance interrupt.
+
+      * LDBAR (LD Base Address Register) and IMC (In-Memory Collection)
+        non-architected registers. An attempt to write to them will cause a
+        Hypervisor Emulation Assistance interrupt.
+
+      * Paging for an SVM, sharing of memory with Hypervisor for an SVM.
+        (Including Virtual Processor Area (VPA) and virtual I/O).
+
+
+Software/Microcode
+==================
+
+    The software changes include:
+
+    * SVMs are created from a normal VM using (open source) tooling supplied
+      by IBM.
+
+    * All SVMs start as normal VMs and utilize an ultracall, UV_ESM
+      (Enter Secure Mode), to make the transition.
+
+    * When the UV_ESM ultracall is made the Ultravisor copies the VM into
+      secure memory, decrypts the verification information, and checks the
+      integrity of the SVM. If the integrity check passes the Ultravisor
+      passes control in secure mode.
+
+    * The verification information includes the pass phrase for the
+      encrypted disk associated with the SVM. This pass phrase is given
+      to the SVM when requested.
+
+    * The Ultravisor is not involved in protecting the encrypted disk of
+      the SVM while at rest.
+
+    * For external interrupts the Ultravisor saves the state of the SVM,
+      and reflects the interrupt to the hypervisor for processing.
+      For hypercalls, the Ultravisor inserts neutral state into all
+      registers not needed for the hypercall then reflects the call to
+      the hypervisor for processing. The H_RANDOM hypercall is performed
+      by the Ultravisor and not reflected.
+
+    * For virtual I/O to work bounce buffering must be done.
+
+    * The Ultravisor uses AES (IAPM) for protection of SVM memory. IAPM
+      is a mode of AES that provides integrity and secrecy concurrently.
+
+    * The movement of data between normal and secure pages is coordinated
+      with the Ultravisor by a new HMM plug-in in the Hypervisor.
+
+    The Ultravisor offers new services to the hypervisor and SVMs. These
+    are accessed through ultracalls.
+
+Terminology
+===========
+
+    * Hypercalls: special system calls used to request services from
+      Hypervisor.
+
+    * Normal memory: Memory that is accessible to Hypervisor.
+
+    * Normal page: Page backed by normal memory and available to
+      Hypervisor.
+
+    * Shared page: A page backed by normal memory and available to both
+      the Hypervisor/QEMU and the SVM (i.e. the page has mappings in SVM and
+      Hypervisor/QEMU).
+
+    * Secure memory: Memory that is accessible only to Ultravisor and
+      SVMs.
+
+    * Secure page: Page backed by secure memory and only available to
+      Ultravisor and SVM.
+
+    * SVM: Secure Virtual Machine.
+
+    * Ultracalls: special system calls used to request services from
+      Ultravisor.
+
+
+Ultravisor calls API
+####################
+
+    This section describes Ultravisor calls (ultracalls) needed to
+    support Secure Virtual Machines (SVM)s and Paravirtualized KVM. The
+    ultracalls allow the SVMs and Hypervisor to request services from the
+    Ultravisor such as accessing a register or memory region that can only
+    be accessed when running in Ultravisor-privileged mode.
+
+    The specific service needed from an ultracall is specified in register
+    R3 (the first parameter to the ultracall). Other parameters to the
+    ultracall, if any, are specified in registers R4 through R12.
+
+    Return value of all ultracalls is in register R3. Other output values
+    from the ultracall, if any, are returned in registers R4 through R12.
+    The only exception to this register usage is the ``UV_RETURN``
+    ultracall described below.
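+
+    As an illustration only (not part of the specification), these
+    register conventions can be packaged behind a variadic C helper in
+    the spirit of the kernel's ``ucall_norets()`` handler added later in
+    this series; the wrapper name ``uv_example`` and opcode
+    ``UV_EXAMPLE`` below are hypothetical:
+
+.. code-block:: c
+
+	/* Opcode goes in R3, arguments in R4 onwards; status returns in R3. */
+	long ucall_norets(unsigned long opcode, ...);
+
+	static inline long uv_example(u64 arg1, u64 arg2)
+	{
+		return ucall_norets(UV_EXAMPLE, arg1, arg2);
+	}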
+
+    Each ultracall returns specific error codes, applicable in the context
+    of the ultracall. However, as with the Power Architecture Platform
+    Reference (PAPR), if no specific error code is defined for a
+    particular situation, the ultracall falls back to an error code based
+    on the position of the erroneous parameter, i.e. U_PARAMETER, U_P2,
+    U_P3, etc., depending on the ultracall parameter that may have caused
+    the error.
+
+    Some ultracalls involve transferring a page of data between Ultravisor
+    and Hypervisor.  Secure pages that are transferred from secure memory
+    to normal memory may be encrypted using dynamically generated keys.
+    When the secure pages are transferred back to secure memory, they may
+    be decrypted using the same dynamically generated keys. Generation and
+    management of these keys will be covered in a separate document.
+
+    For now this only covers ultracalls currently implemented and being
+    used by Hypervisor and SVMs, but others can be added here when it
+    makes sense.
+
+    The full specification for all hypercalls/ultracalls will eventually
+    be made available in the public/OpenPower version of the PAPR
+    specification.
+
+    **Note**
+
+    If PEF is not enabled, the ultracalls will be redirected to the
+    Hypervisor which must handle/fail the calls.
+
+Ultracalls used by Hypervisor
+=============================
+
+    This section describes the virtual memory management ultracalls used
+    by the Hypervisor to manage SVMs.
+
+UV_PAGE_OUT
+-----------
+
+    Encrypt and move the contents of a page from secure memory to normal
+    memory.
+
+Syntax
+~~~~~~
+
+.. code-block:: c
+
+	uint64_t ultracall(const uint64_t UV_PAGE_OUT,
+		uint16_t lpid,		/* LPAR ID */
+		uint64_t dest_ra,	/* real address of destination page */
+		uint64_t src_gpa,	/* source guest-physical-address */
+		uint8_t  flags,		/* flags */
+		uint64_t order)		/* page size order */
+
+Return values
+~~~~~~~~~~~~~
+
+    One of the following values:
+
+	* U_SUCCESS	on success.
+	* U_PARAMETER	if ``lpid`` is invalid.
+	* U_P2 		if ``dest_ra`` is invalid.
+	* U_P3		if the ``src_gpa`` address is invalid.
+	* U_P4		if any bit in the ``flags`` is unrecognized.
+	* U_P5		if the ``order`` parameter is unsupported.
+	* U_FUNCTION	if functionality is not supported.
+	* U_BUSY	if page cannot be currently paged-out.
+
+Description
+~~~~~~~~~~~
+
+    Encrypt the contents of a secure-page and make it available to
+    Hypervisor in a normal page.
+
+    By default, the source page is unmapped from the SVM's partition-
+    scoped page table. But the Hypervisor can provide a hint to the
+    Ultravisor to retain the page mapping by setting the ``UV_SNAPSHOT``
+    flag in the ``flags`` parameter.
+
+    If the source page is already a shared page the call returns
+    U_SUCCESS, without doing anything.
+
+Use cases
+~~~~~~~~~
+
+    #. QEMU attempts to access an address belonging to the SVM but the
+       page frame for that address is not mapped into QEMU's address
+       space. In this case, the Hypervisor will allocate a page frame,
+       map it into QEMU's address space and issue the ``UV_PAGE_OUT``
+       call to retrieve the encrypted contents of the page.
+
+    #. When Ultravisor runs low on secure memory and it needs to page-out
+       an LRU page. In this case, Ultravisor will issue the
+       ``H_SVM_PAGE_OUT`` hypercall to the Hypervisor. The Hypervisor will
+       then allocate a normal page and issue the ``UV_PAGE_OUT`` ultracall
+       and the Ultravisor will encrypt and move the contents of the secure
+       page into the normal page.
+
+    #. When Hypervisor accesses SVM data, the Hypervisor requests the
+       Ultravisor to transfer the corresponding page into an insecure page,
+       which the Hypervisor can access. The data in the normal page will
+       be encrypted though.
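+
+    A minimal sketch of the Hypervisor-side wrapper for this ultracall,
+    assuming the ``ucall_norets()`` helper added later in this series
+    (the wrapper name ``uv_page_out`` is illustrative):
+
+.. code-block:: c
+
+	static inline int uv_page_out(u64 lpid, u64 dest_ra, u64 src_gpa,
+				      u64 flags, u64 order)
+	{
+		/* R3 = opcode, R4..R8 = parameters; status returned in R3. */
+		return ucall_norets(UV_PAGE_OUT, lpid, dest_ra, src_gpa,
+				    flags, order);
+	}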
+
+UV_PAGE_IN
+----------
+
+    Move the contents of a page from normal memory to secure memory.
+
+Syntax
+~~~~~~
+
+.. code-block:: c
+
+	uint64_t ultracall(const uint64_t UV_PAGE_IN,
+		uint16_t lpid,		/* the LPAR ID */
+		uint64_t src_ra,	/* source real address of page */
+		uint64_t dest_gpa,	/* destination guest physical address */
+		uint64_t flags,		/* flags */
+		uint64_t order)		/* page size order */
+
+Return values
+~~~~~~~~~~~~~
+
+    One of the following values:
+
+	* U_SUCCESS	on success.
+	* U_BUSY	if page cannot be currently paged-in.
+	* U_FUNCTION	if functionality is not supported.
+	* U_PARAMETER	if ``lpid`` is invalid.
+	* U_P2 		if ``src_ra`` is invalid.
+	* U_P3		if the ``dest_gpa`` address is invalid.
+	* U_P4		if any bit in the ``flags`` is unrecognized.
+	* U_P5		if the ``order`` parameter is unsupported.
+
+Description
+~~~~~~~~~~~
+
+    Move the contents of the page identified by ``src_ra`` from normal
+    memory to secure memory and map it to the guest physical address
+    ``dest_gpa``.
+
+    If ``dest_gpa`` refers to a shared address, map the page into the
+    partition-scoped page-table of the SVM.  If ``dest_gpa`` is not shared,
+    copy the contents of the page into the corresponding secure page.
+    Depending on the context, decrypt the page before it is copied.
+
+    The caller provides the attributes of the page through the ``flags``
+    parameter. Valid values for ``flags`` are:
+
+	* CACHE_INHIBITED
+	* CACHE_ENABLED
+	* WRITE_PROTECTION
+
+    The Hypervisor must pin the page in memory before making
+    the ``UV_PAGE_IN`` ultracall.
+
+Use cases
+~~~~~~~~~
+
+    #. When a normal VM switches to secure mode, all its pages residing
+       in normal memory are moved into secure memory.
+
+    #. When an SVM requests to share a page with the Hypervisor, the
+       Hypervisor allocates a page and informs the Ultravisor.
+
+    #. When an SVM accesses a secure page that has been paged-out,
+       Ultravisor invokes the Hypervisor to locate the page. After
+       locating the page, the Hypervisor uses UV_PAGE_IN to make the
+       page available to Ultravisor.
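+
+    A corresponding Hypervisor-side wrapper might look like the
+    following sketch (again assuming the ``ucall_norets()`` helper; the
+    name ``uv_page_in`` is illustrative):
+
+.. code-block:: c
+
+	static inline int uv_page_in(u64 lpid, u64 src_ra, u64 dest_gpa,
+				     u64 flags, u64 order)
+	{
+		return ucall_norets(UV_PAGE_IN, lpid, src_ra, dest_gpa,
+				    flags, order);
+	}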
+
+UV_PAGE_INVAL
+-------------
+
+    Invalidate the Ultravisor mapping of a page.
+
+Syntax
+~~~~~~
+
+.. code-block:: c
+
+	uint64_t ultracall(const uint64_t UV_PAGE_INVAL,
+		uint16_t lpid,		/* the LPAR ID */
+		uint64_t guest_pa,	/* destination guest-physical-address */
+		uint64_t order)		/* page size order */
+
+Return values
+~~~~~~~~~~~~~
+
+    One of the following values:
+
+	* U_SUCCESS	on success.
+	* U_PARAMETER	if ``lpid`` is invalid.
+	* U_P2 		if ``guest_pa`` is invalid (or corresponds to a secure
+                        page mapping).
+	* U_P3		if the ``order`` is invalid.
+	* U_FUNCTION	if functionality is not supported.
+	* U_BUSY	if page cannot be currently invalidated.
+
+Description
+~~~~~~~~~~~
+
+    This ultracall informs Ultravisor that the page mapping in Hypervisor
+    corresponding to the given guest physical address has been invalidated
+    and that the Ultravisor should not access the page. If the specified
+    ``guest_pa`` corresponds to a secure page, Ultravisor will ignore the
+    attempt to invalidate the page and return U_P2.
+
+Use cases
+~~~~~~~~~
+
+    #. When a shared page is unmapped from QEMU's page table, possibly
+       because it is paged-out to disk, the Ultravisor needs to know that
+       the page should not be accessed from its side either.
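+
+    A sketch of the Hypervisor-side wrapper, under the same
+    ``ucall_norets()`` assumption (the name ``uv_page_inval`` is
+    illustrative):
+
+.. code-block:: c
+
+	static inline int uv_page_inval(u64 lpid, u64 guest_pa, u64 order)
+	{
+		return ucall_norets(UV_PAGE_INVAL, lpid, guest_pa, order);
+	}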
+
+
+UV_WRITE_PATE
+-------------
+
+    Validate and write the partition table entry (PATE) for a given
+    partition.
+
+Syntax
+~~~~~~
+
+.. code-block:: c
+
+	uint64_t ultracall(const uint64_t UV_WRITE_PATE,
+		uint32_t lpid,		/* the LPAR ID */
+		uint32_t lpid,		/* the LPAR ID */
+		uint64_t dw0,		/* the first double word to write */
+		uint64_t dw1)		/* the second double word to write */
+
+Return values
+~~~~~~~~~~~~~
+
+    One of the following values:
+
+	* U_SUCCESS	on success.
+	* U_BUSY	if PATE cannot be currently written to.
+	* U_FUNCTION	if functionality is not supported.
+	* U_PARAMETER	if ``lpid`` is invalid.
+	* U_P2 		if ``dw0`` is invalid.
+	* U_P3		if the ``dw1`` address is invalid.
+	* U_PERMISSION	if the Hypervisor is attempting to change the PATE
+			of a secure virtual machine or if called from a
+			context other than Hypervisor.
+
+Description
+~~~~~~~~~~~
+
+    Validate and write the partition table entry (PATE) for the given
+    LPID.  If the LPID is already allocated and initialized, this call
+    results in changing the partition table entry.
+
+Use cases
+~~~~~~~~~
+
+    #. The partition table resides in secure memory and its entries,
+       called PATE (Partition Table Entries), point to the partition-
+       scoped page tables for the Hypervisor as well as each of the
+       virtual machines (both secure and normal). The Hypervisor
+       operates in partition 0 and its partition-scoped page tables
+       reside in normal memory.
+
+    #. This ultracall allows the Hypervisor to register the partition-
+       scoped and process-scoped page table entries for the Hypervisor
+       and other partitions (virtual machines) with the Ultravisor.
+
+    #. If the value of the PATE for an existing partition (VM) changes,
+       the TLB cache for the partition is flushed.
+
+    #. The Hypervisor is responsible for allocating LPIDs. An LPID and
+       its PATE entry are registered together.  The Hypervisor manages
+       the PATE entries for a normal VM and can change the PATE entry
+       anytime. Ultravisor manages the PATE entries for an SVM and
+       Hypervisor is not allowed to modify them.
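+
+    The Hypervisor-side wrapper for this ultracall, as added later in
+    this patch series, is a one-line ``ucall_norets()`` call:
+
+.. code-block:: c
+
+	static inline int uv_register_pate(u64 lpid, u64 dw0, u64 dw1)
+	{
+		return ucall_norets(UV_WRITE_PATE, lpid, dw0, dw1);
+	}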
+
+UV_RETURN
+---------
+
+    Return control from the Hypervisor back to the Ultravisor after
+    processing a hypercall or interrupt that was forwarded (a.k.a.
+    *reflected*) to the Hypervisor.
+
+Syntax
+~~~~~~
+
+.. code-block:: c
+
+	uint64_t ultracall(const uint64_t UV_RETURN)
+
+Return values
+~~~~~~~~~~~~~
+
+     On success, this call does not return to the Hypervisor.  It returns
+     U_INVALID if the ultracall is not made from a Hypervisor context.
+
+Description
+~~~~~~~~~~~
+
+    When an SVM makes a hypercall or incurs some other exception, the
+    Ultravisor usually forwards (a.k.a. *reflects*) the exception to the
+    Hypervisor.  After processing the exception, the Hypervisor uses the
+    ``UV_RETURN`` ultracall to return control back to the SVM.
+
+    The expected register state on entry to this ultracall is:
+
+    * Non-volatile registers are restored to their original values.
+    * If returning from a hypercall, register R0 contains the return
+      value (**unlike other ultracalls**) and, registers R4 through R12
+      contain any output values of the hypercall.
+    * R3 contains the ultracall number, i.e UV_RETURN.
+    * If returning with a synthesized interrupt, R2 contains the
+      synthesized interrupt number.
+
+Use cases
+~~~~~~~~~
+
+    #. Ultravisor relies on the Hypervisor to provide several services to
+       the SVM, such as processing hypercalls and other exceptions. After
+       processing the exception, Hypervisor uses UV_RETURN to return
+       control back to the Ultravisor.
+
+    #. Hypervisor has to use this ultracall to return control to the SVM.
+
+
+UV_REGISTER_MEM_SLOT
+--------------------
+
+    Register an SVM address-range with specified properties.
+
+Syntax
+~~~~~~
+
+.. code-block:: c
+
+	uint64_t ultracall(const uint64_t UV_REGISTER_MEM_SLOT,
+		uint64_t lpid,		/* LPAR ID of the SVM */
+		uint64_t start_gpa,	/* start guest physical address */
+		uint64_t size,		/* size of address range in bytes */
+		uint64_t flags,		/* reserved for future expansion */
+		uint16_t slotid)	/* slot identifier */
+
+Return values
+~~~~~~~~~~~~~
+
+    One of the following values:
+
+	* U_SUCCESS	on success.
+	* U_PARAMETER	if ``lpid`` is invalid.
+	* U_P2 		if ``start_gpa`` is invalid.
+	* U_P3		if ``size`` is invalid.
+	* U_P4		if any bit in the ``flags`` is unrecognized.
+	* U_P5		if the ``slotid`` parameter is unsupported.
+	* U_PERMISSION	if called from context other than Hypervisor.
+	* U_FUNCTION	if functionality is not supported.
+
+
+Description
+~~~~~~~~~~~
+
+    Register a memory range for an SVM.  The memory range starts at the
+    guest physical address ``start_gpa`` and is ``size`` bytes long.
+
+Use cases
+~~~~~~~~~
+
+
+    #. When a virtual machine goes secure, all the memory slots managed by
+       the Hypervisor move into secure memory. The Hypervisor iterates
+       through each of the memory slots and registers each slot with the
+       Ultravisor.  The Hypervisor may discard some slots, such as those
+       used for firmware (SLOF).
+
+    #. When new memory is hot-plugged, a new memory slot gets registered.
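+
+    A sketch of the Hypervisor-side wrappers for registering and
+    unregistering a slot (names and signatures are assumptions derived
+    from the syntax above, using the ``ucall_norets()`` helper):
+
+.. code-block:: c
+
+	static inline int uv_register_mem_slot(u64 lpid, u64 start_gpa,
+					       u64 size, u64 flags,
+					       u64 slotid)
+	{
+		return ucall_norets(UV_REGISTER_MEM_SLOT, lpid, start_gpa,
+				    size, flags, slotid);
+	}
+
+	static inline int uv_unregister_mem_slot(u64 lpid, u64 slotid)
+	{
+		return ucall_norets(UV_UNREGISTER_MEM_SLOT, lpid, slotid);
+	}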
+
+
+UV_UNREGISTER_MEM_SLOT
+----------------------
+
+    Unregister an SVM address-range that was previously registered using
+    UV_REGISTER_MEM_SLOT.
+
+Syntax
+~~~~~~
+
+.. code-block:: c
+
+	uint64_t ultracall(const uint64_t UV_UNREGISTER_MEM_SLOT,
+		uint64_t lpid,		/* LPAR ID of the SVM */
+		uint64_t slotid)	/* reservation slotid */
+
+Return values
+~~~~~~~~~~~~~
+
+    One of the following values:
+
+	* U_SUCCESS	on success.
+	* U_FUNCTION	if functionality is not supported.
+	* U_PARAMETER	if ``lpid`` is invalid.
+	* U_P2 		if ``slotid`` is invalid.
+	* U_PERMISSION	if called from context other than Hypervisor.
+
+Description
+~~~~~~~~~~~
+
+    Release the memory slot identified by ``slotid`` and free any
+    resources allocated towards the reservation.
+
+Use cases
+~~~~~~~~~
+
+    #. Memory hot-remove.
+
+
+UV_SVM_TERMINATE
+----------------
+
+    Terminate an SVM and release its resources.
+
+Syntax
+~~~~~~
+
+.. code-block:: c
+
+	uint64_t ultracall(const uint64_t UV_SVM_TERMINATE,
+		uint64_t lpid)		/* LPAR ID of the SVM */
+
+Return values
+~~~~~~~~~~~~~
+
+    One of the following values:
+
+	* U_SUCCESS	on success.
+	* U_FUNCTION	if functionality is not supported.
+	* U_PARAMETER	if ``lpid`` is invalid.
+	* U_INVALID	if VM is not secure.
+	* U_PERMISSION  if not called from a Hypervisor context.
+
+Description
+~~~~~~~~~~~
+
+    Terminate an SVM and release all its resources.
+
+Use cases
+~~~~~~~~~
+
+    #. Called by Hypervisor when terminating an SVM.
+
+
+Ultracalls used by SVM
+======================
+
+UV_SHARE_PAGE
+-------------
+
+    Share a set of guest physical pages with the Hypervisor.
+
+Syntax
+~~~~~~
+
+.. code-block:: c
+
+	uint64_t ultracall(const uint64_t UV_SHARE_PAGE,
+		uint64_t gfn,	/* guest page frame number */
+		uint64_t num)	/* number of pages of size PAGE_SIZE */
+
+Return values
+~~~~~~~~~~~~~
+
+    One of the following values:
+
+	* U_SUCCESS	on success.
+	* U_FUNCTION	if functionality is not supported.
+	* U_INVALID	if the VM is not secure.
+	* U_PARAMETER	if ``gfn`` is invalid.
+	* U_P2 		if ``num`` is invalid.
+
+Description
+~~~~~~~~~~~
+
+    Share the ``num`` pages starting at guest physical frame number ``gfn``
+    with the Hypervisor. Assume page size is PAGE_SIZE bytes. Zero the
+    pages before returning.
+
+    If the address is already backed by a secure page, unmap the page and
+    back it with an insecure page, with the help of the Hypervisor. If it
+    is not backed by any page yet, mark the PTE as insecure and back it
+    with an insecure page when the address is accessed. If it is already
+    backed by an insecure page, zero the page and return.
+
+Use cases
+~~~~~~~~~
+
+    #. The Hypervisor cannot access the SVM pages since they are backed by
+       secure pages. Hence an SVM must explicitly request Ultravisor for
+       pages it can share with Hypervisor.
+
+    #. Shared pages are needed to support virtio and Virtual Processor Area
+       (VPA) in SVMs.
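+
+    A guest-side sketch of the share and unshare wrappers (names and
+    signatures assumed from the syntax above, with ``ucall_norets()`` as
+    elsewhere in this document):
+
+.. code-block:: c
+
+	static inline int uv_share_page(u64 gfn, u64 num)
+	{
+		return ucall_norets(UV_SHARE_PAGE, gfn, num);
+	}
+
+	static inline int uv_unshare_page(u64 gfn, u64 num)
+	{
+		return ucall_norets(UV_UNSHARE_PAGE, gfn, num);
+	}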
+
+
+UV_UNSHARE_PAGE
+---------------
+
+    Restore a shared SVM page to its initial state.
+
+Syntax
+~~~~~~
+
+.. code-block:: c
+
+	uint64_t ultracall(const uint64_t UV_UNSHARE_PAGE,
+		uint64_t gfn,	/* guest page frame number */
+		uint64_t num)	/* number of pages of size PAGE_SIZE */
+
+Return values
+~~~~~~~~~~~~~
+
+    One of the following values:
+
+	* U_SUCCESS	on success.
+	* U_FUNCTION	if functionality is not supported.
+	* U_INVALID	if VM is not secure.
+	* U_PARAMETER	if ``gfn`` is invalid.
+	* U_P2 		if ``num`` is invalid.
+
+Description
+~~~~~~~~~~~
+
+    Stop sharing ``num`` pages starting at ``gfn`` with the Hypervisor.
+    Assume that the page size is PAGE_SIZE. Zero the pages before
+    returning.
+
+    If the address is already backed by an insecure page, unmap the page
+    and back it with a secure page. Inform the Hypervisor to release its
+    reference to the shared page. If the address is not backed by a page
+    yet, mark the PTE as secure and back it with a secure page when that
+    address is accessed. If it is already backed by a secure page, zero
+    the page and return.
+
+Use cases
+~~~~~~~~~
+
+    #. The SVM may decide to unshare a page from the Hypervisor.
+
+
+UV_UNSHARE_ALL_PAGES
+--------------------
+
+    Unshare all pages the SVM has shared with Hypervisor.
+
+Syntax
+~~~~~~
+
+.. code-block:: c
+
+	uint64_t ultracall(const uint64_t UV_UNSHARE_ALL_PAGES)
+
+Return values
+~~~~~~~~~~~~~
+
+    One of the following values:
+
+	* U_SUCCESS	on success.
+	* U_FUNCTION	if functionality is not supported.
+	* U_INVALID	if VM is not secure.
+
+Description
+~~~~~~~~~~~
+
+    Unshare all shared pages from the Hypervisor. All unshared pages are
+    zeroed on return. Only pages explicitly shared by the SVM with the
+    Hypervisor (using UV_SHARE_PAGE ultracall) are unshared. Ultravisor
+    may internally share some pages with the Hypervisor without explicit
+    request from the SVM.  These pages will not be unshared by this
+    ultracall.
+
+Use cases
+~~~~~~~~~
+
+    #. This call is needed when ``kexec`` is used to boot a different
+       kernel. It may also be needed during SVM reset.
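+
+    A sketch of the corresponding guest-side call, e.g. on the ``kexec``
+    path (the wrapper name ``uv_unshare_all_pages`` is illustrative):
+
+.. code-block:: c
+
+	static inline int uv_unshare_all_pages(void)
+	{
+		return ucall_norets(UV_UNSHARE_ALL_PAGES);
+	}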
+
+UV_ESM
+------
+
+    Secure the virtual machine (*enter secure mode*).
+
+Syntax
+~~~~~~
+
+.. code-block:: c
+
+	uint64_t ultracall(const uint64_t UV_ESM,
+		uint64_t esm_blob_addr,	/* location of the ESM blob */
+		uint64_t fdt)		/* Flattened device tree */
+
+Return values
+~~~~~~~~~~~~~
+
+    One of the following values:
+
+	* U_SUCCESS	on success (including if VM is already secure).
+	* U_FUNCTION	if functionality is not supported.
+	* U_INVALID	if VM is not secure.
+	* U_PARAMETER	if ``esm_blob_addr`` is invalid.
+	* U_P2 		if ``fdt`` is invalid.
+	* U_PERMISSION	if any integrity checks fail.
+	* U_RETRY	if there is insufficient memory to create the SVM.
+	* U_NO_KEY	if the symmetric key is unavailable.
+
+Description
+~~~~~~~~~~~
+
+    Secure the virtual machine. On successful completion, return
+    control to the virtual machine at the address specified in the
+    ESM blob.
+
+Use cases
+~~~~~~~~~
+
+    #. A normal virtual machine can choose to switch to a secure mode.
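+
+    A sketch of how early guest boot code might request the transition
+    (illustrative only, assuming a ``ucall_norets()``-style helper; the
+    real entry point must run before the guest depends on secure state):
+
+.. code-block:: c
+
+	static inline long uv_esm(u64 esm_blob_addr, u64 fdt)
+	{
+		/* On success, control resumes at the address in the ESM blob. */
+		return ucall_norets(UV_ESM, esm_blob_addr, fdt);
+	}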
+
+Hypervisor Calls API
+####################
+
+    This section describes the Hypervisor calls (hypercalls) that are
+    needed to support the Ultravisor. Hypercalls are services provided by
+    the Hypervisor to virtual machines and Ultravisor.
+
+    Register usage for these hypercalls is identical to that of the other
+    hypercalls defined in the Power Architecture Platform Reference (PAPR)
+    document, i.e. on input, register R3 identifies the specific service
+    that is being requested and registers R4 through R11 contain
+    additional parameters to the hypercall, if any. On output, register
+    R3 contains the return value and registers R4 through R9 contain any
+    other output values from the hypercall.
+
+    This document only covers hypercalls currently implemented/planned
+    for Ultravisor usage but others can be added here when it makes sense.
+
+    The full specification for all hypercalls/ultracalls will eventually
+    be made available in the public/OpenPower version of the PAPR
+    specification.
+
+Hypervisor calls to support Ultravisor
+======================================
+
+    The following hypercalls are needed to support the Ultravisor.
+
+H_SVM_INIT_START
+----------------
+
+    Begin the process of converting a normal virtual machine into an SVM.
+
+Syntax
+~~~~~~
+
+.. code-block:: c
+
+	uint64_t hypercall(const uint64_t H_SVM_INIT_START)
+
+Return values
+~~~~~~~~~~~~~
+
+    One of the following values:
+
+	* H_SUCCESS	 on success.
+
+Description
+~~~~~~~~~~~
+
+    Initiate the process of securing a virtual machine. This involves
+    coordinating with the Ultravisor, using ultracalls, to allocate
+    resources in the Ultravisor for the new SVM, transferring the VM's
+    pages from normal to secure memory, etc. When the process is
+    completed, Ultravisor issues the H_SVM_INIT_DONE hypercall.
+
+Use cases
+~~~~~~~~~
+
+     #. Ultravisor uses this hypercall to inform Hypervisor that a VM
+        has initiated the process of switching to secure mode.
+
+
+H_SVM_INIT_DONE
+---------------
+
+    Complete the process of securing an SVM.
+
+Syntax
+~~~~~~
+
+.. code-block:: c
+
+	uint64_t hypercall(const uint64_t H_SVM_INIT_DONE)
+
+Return values
+~~~~~~~~~~~~~
+
+    One of the following values:
+
+	* H_SUCCESS 		on success.
+	* H_UNSUPPORTED		if called from the wrong context (e.g.
+				from an SVM or before an H_SVM_INIT_START
+				hypercall).
+
+Description
+~~~~~~~~~~~
+
+    Complete the process of securing a virtual machine. This call must
+    be made after a prior call to the ``H_SVM_INIT_START`` hypercall.
+
+Use cases
+~~~~~~~~~
+
+    #. On successfully securing a virtual machine, the Ultravisor informs
+       the Hypervisor about it. The Hypervisor can use this call to finish
+       setting up its internal state for this virtual machine.
+
+
+H_SVM_PAGE_IN
+-------------
+
+    Move the contents of a page from normal memory to secure memory.
+
+Syntax
+~~~~~~
+
+.. code-block:: c
+
+	uint64_t hypercall(const uint64_t H_SVM_PAGE_IN,
+		uint64_t guest_pa,	/* guest-physical-address */
+		uint64_t flags,		/* flags */
+		uint64_t order)		/* page size order */
+
+Return values
+~~~~~~~~~~~~~
+
+    One of the following values:
+
+	* H_SUCCESS	on success.
+	* H_PARAMETER	if ``guest_pa`` is invalid.
+	* H_P2		if ``flags`` is invalid.
+	* H_P3		if ``order`` of page is invalid.
+
+Description
+~~~~~~~~~~~
+
+    Retrieve the contents of the page belonging to the VM at the
+    specified guest physical address.
+
+    The only valid values for ``flags`` are:
+
+	* H_PAGE_IN_SHARED which indicates that the page is to be shared
+	  with the Ultravisor.
+
+	* H_PAGE_IN_NONSHARED which indicates that the Ultravisor is no
+	  longer interested in the page. Applicable only if the page is
+	  a shared page.
+
+    The ``order`` parameter must correspond to the configured page size.
+
+Use cases
+~~~~~~~~~
+
+    #. When a normal VM becomes a secure VM (using the UV_ESM ultracall),
+       the Ultravisor uses this hypercall to move contents of each page of
+       the VM from normal memory to secure memory.
+
+    #. Ultravisor uses this hypercall to ask Hypervisor to provide a page
+       in normal memory that can be shared between the SVM and Hypervisor.
+
+    #. Ultravisor uses this hypercall to page-in a paged-out page. This
+       can happen when the SVM touches a paged-out page.
+
+    #. If SVM wants to disable sharing of pages with Hypervisor, it can
+       inform Ultravisor to do so. Ultravisor will then use this hypercall
+       and inform Hypervisor that it has released access to the normal
+       page.
+
+H_SVM_PAGE_OUT
+---------------
+
+    Move the contents of the page to normal memory.
+
+Syntax
+~~~~~~
+
+.. code-block:: c
+
+	uint64_t hypercall(const uint64_t H_SVM_PAGE_OUT,
+		uint64_t guest_pa,	/* guest-physical-address */
+		uint64_t flags,		/* flags (currently none) */
+		uint64_t order)		/* page size order */
+
+Return values
+~~~~~~~~~~~~~
+
+    One of the following values:
+
+	* H_SUCCESS	on success.
+	* H_PARAMETER	if ``guest_pa`` is invalid.
+	* H_P2		if ``flags`` is invalid.
+	* H_P3		if ``order`` is invalid.
+
+Description
+~~~~~~~~~~~
+
+    Move the contents of the page identified by ``guest_pa`` to normal
+    memory.
+
+    Currently ``flags`` is unused and must be set to 0. The ``order``
+    parameter must correspond to the configured page size.
+
+Use cases
+~~~~~~~~~
+
+    #. If the Ultravisor is running low on secure pages, it can move the
+       contents of some secure pages into normal pages using this
+       hypercall. The contents will be encrypted.
+
+References
+##########
+
+.. [1] `Supporting Protected Computing on IBM Power Architecture <https://developer.ibm.com/articles/l-support-protected-computing/>`_

From a49dddbdb0cca1d00fc9251e543a0aac09a6a65b Mon Sep 17 00:00:00 2001
From: Claudio Carvalho <cclaudio@linux.ibm.com>
Date: Thu, 22 Aug 2019 00:48:33 -0300
Subject: [PATCH 123/264] powerpc/kernel: Add ucall_norets() ultravisor call
 handler

The ultracalls (ucalls for short) allow the Secure Virtual Machines
(SVM)s and hypervisor to request services from the ultravisor such as
accessing a register or memory region that can only be accessed when
running in ultravisor-privileged mode.

This patch adds the ucall_norets() ultravisor call handler.

The specific service needed from an ucall is specified in register
R3 (the first parameter to the ucall). Other parameters to the
ucall, if any, are specified in registers R4 through R12.

Return value of all ucalls is in register R3. Other output values
from the ucall, if any, are returned in registers R4 through R12.

Each ucall returns specific error codes, applicable in the context
of the ucall. However, like with the PowerPC Architecture Platform
Reference (PAPR), if no specific error code is defined for a particular
situation, then the ucall will fallback to an erroneous
parameter-position based code. i.e U_PARAMETER, U_P2, U_P3 etc depending
on the ucall parameter that may have caused the error.

Every host kernel (powernv) needs to be able to do ucalls in case it
ends up being run in a machine with ultravisor enabled. Otherwise, the
kernel may crash early in boot trying to access ultravisor resources,
for instance, trying to set the partition table entry 0. Secure guests
also need to be able to do ucalls and their kernel may not have
CONFIG_PPC_POWERNV=y. For that reason, the ucall.S file is placed under
arch/powerpc/kernel.

If ultravisor is not enabled, the ucalls will be redirected to the
hypervisor which must handle/fail the call.

Thanks to inputs from Ram Pai and Michael Anderson.

Signed-off-by: Claudio Carvalho <cclaudio@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190822034838.27876-3-cclaudio@linux.ibm.com
---
 arch/powerpc/include/asm/asm-prototypes.h | 11 +++++++++++
 arch/powerpc/include/asm/ultravisor-api.h | 23 +++++++++++++++++++++++
 arch/powerpc/kernel/Makefile              |  1 +
 arch/powerpc/kernel/ucall.S               | 14 ++++++++++++++
 4 files changed, 49 insertions(+)
 create mode 100644 arch/powerpc/include/asm/ultravisor-api.h
 create mode 100644 arch/powerpc/kernel/ucall.S

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index ec1c97a8e8cb..e698f48cbc6d 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -15,6 +15,7 @@
 #include <asm/epapr_hcalls.h>
 #include <asm/dcr.h>
 #include <asm/mmu_context.h>
+#include <asm/ultravisor-api.h>
 
 #include <uapi/asm/ucontext.h>
 
@@ -34,6 +35,16 @@ extern struct static_key hcall_tracepoint_key;
 void __trace_hcall_entry(unsigned long opcode, unsigned long *args);
 void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf);
 
+/* Ultravisor */
+#ifdef CONFIG_PPC_POWERNV
+long ucall_norets(unsigned long opcode, ...);
+#else
+static inline long ucall_norets(unsigned long opcode, ...)
+{
+	return U_NOT_AVAILABLE;
+}
+#endif
+
 /* OPAL */
 int64_t __opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
 		    int64_t a4, int64_t a5, int64_t a6, int64_t a7,
diff --git a/arch/powerpc/include/asm/ultravisor-api.h b/arch/powerpc/include/asm/ultravisor-api.h
new file mode 100644
index 000000000000..88ffa78f9d61
--- /dev/null
+++ b/arch/powerpc/include/asm/ultravisor-api.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Ultravisor API.
+ *
+ * Copyright 2019, IBM Corporation.
+ *
+ */
+#ifndef _ASM_POWERPC_ULTRAVISOR_API_H
+#define _ASM_POWERPC_ULTRAVISOR_API_H
+
+#include <asm/hvcall.h>
+
+/* Return codes */
+#define U_FUNCTION		H_FUNCTION
+#define U_NOT_AVAILABLE		H_NOT_AVAILABLE
+#define U_P2			H_P2
+#define U_P3			H_P3
+#define U_P4			H_P4
+#define U_P5			H_P5
+#define U_PARAMETER		H_PARAMETER
+#define U_SUCCESS		H_SUCCESS
+
+#endif /* _ASM_POWERPC_ULTRAVISOR_API_H */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index d4eb50de13b1..934e64b28894 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -156,6 +156,7 @@ endif
 
 obj-$(CONFIG_EPAPR_PARAVIRT)	+= epapr_paravirt.o epapr_hcalls.o
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvm_emul.o
+obj-$(CONFIG_PPC_POWERNV)	+= ucall.o
 
 # Disable GCOV, KCOV & sanitizers in odd or sensitive code
 GCOV_PROFILE_prom_init.o := n
diff --git a/arch/powerpc/kernel/ucall.S b/arch/powerpc/kernel/ucall.S
new file mode 100644
index 000000000000..07296bc39166
--- /dev/null
+++ b/arch/powerpc/kernel/ucall.S
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Generic code to perform an ultravisor call.
+ *
+ * Copyright 2019, IBM Corporation.
+ *
+ */
+#include <asm/ppc_asm.h>
+#include <asm/export.h>
+
+_GLOBAL(ucall_norets)
+EXPORT_SYMBOL_GPL(ucall_norets)
+	sc	2	/* Invoke the ultravisor */
+	blr		/* Return r3 = status */

From bb04ffe85eebebd64d5e673a9434d968e80f3aa1 Mon Sep 17 00:00:00 2001
From: Claudio Carvalho <cclaudio@linux.ibm.com>
Date: Thu, 22 Aug 2019 00:48:34 -0300
Subject: [PATCH 124/264] powerpc/powernv: Introduce FW_FEATURE_ULTRAVISOR

In PEF enabled systems, some of the resources which were previously
hypervisor privileged are now ultravisor privileged and controlled by
the ultravisor firmware.

This adds FW_FEATURE_ULTRAVISOR to indicate if PEF is enabled.

The host kernel can use FW_FEATURE_ULTRAVISOR, for instance, to skip
accessing resources (e.g. PTCR and LDBAR) in case PEF is enabled.

Signed-off-by: Claudio Carvalho <cclaudio@linux.ibm.com>
[ andmike: Device node name to "ibm,ultravisor" ]
Signed-off-by: Michael Anderson <andmike@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190822034838.27876-4-cclaudio@linux.ibm.com
---
 arch/powerpc/include/asm/firmware.h         |  5 +++--
 arch/powerpc/include/asm/ultravisor.h       | 14 ++++++++++++
 arch/powerpc/kernel/prom.c                  |  4 ++++
 arch/powerpc/platforms/powernv/Makefile     |  1 +
 arch/powerpc/platforms/powernv/ultravisor.c | 24 +++++++++++++++++++++
 5 files changed, 46 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/include/asm/ultravisor.h
 create mode 100644 arch/powerpc/platforms/powernv/ultravisor.c

diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
index faeca8b76c8c..b3e214a97f3a 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -50,6 +50,7 @@
 #define FW_FEATURE_DRC_INFO	ASM_CONST(0x0000000800000000)
 #define FW_FEATURE_BLOCK_REMOVE ASM_CONST(0x0000001000000000)
 #define FW_FEATURE_PAPR_SCM 	ASM_CONST(0x0000002000000000)
+#define FW_FEATURE_ULTRAVISOR	ASM_CONST(0x0000004000000000)
 
 #ifndef __ASSEMBLY__
 
@@ -68,9 +69,9 @@ enum {
 		FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
 		FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
 		FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE |
-		FW_FEATURE_PAPR_SCM,
+		FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR,
 	FW_FEATURE_PSERIES_ALWAYS = 0,
-	FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL,
+	FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_ULTRAVISOR,
 	FW_FEATURE_POWERNV_ALWAYS = 0,
 	FW_FEATURE_PS3_POSSIBLE = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1,
 	FW_FEATURE_PS3_ALWAYS = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1,
diff --git a/arch/powerpc/include/asm/ultravisor.h b/arch/powerpc/include/asm/ultravisor.h
new file mode 100644
index 000000000000..dc6e1ea198f2
--- /dev/null
+++ b/arch/powerpc/include/asm/ultravisor.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Ultravisor definitions
+ *
+ * Copyright 2019, IBM Corporation.
+ *
+ */
+#ifndef _ASM_POWERPC_ULTRAVISOR_H
+#define _ASM_POWERPC_ULTRAVISOR_H
+
+int early_init_dt_scan_ultravisor(unsigned long node, const char *uname,
+				  int depth, void *data);
+
+#endif	/* _ASM_POWERPC_ULTRAVISOR_H */
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 7159e791a70d..5828f1c81dc9 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -55,6 +55,7 @@
 #include <asm/firmware.h>
 #include <asm/dt_cpu_ftrs.h>
 #include <asm/drmem.h>
+#include <asm/ultravisor.h>
 
 #include <mm/mmu_decl.h>
 
@@ -702,6 +703,9 @@ void __init early_init_devtree(void *params)
 #ifdef CONFIG_PPC_POWERNV
 	/* Some machines might need OPAL info for debugging, grab it now. */
 	of_scan_flat_dt(early_init_dt_scan_opal, NULL);
+
+	/* Scan tree for ultravisor feature */
+	of_scan_flat_dt(early_init_dt_scan_ultravisor, NULL);
 #endif
 
 #ifdef CONFIG_FA_DUMP
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index da2e99efbd04..2c27c8ac00c8 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -4,6 +4,7 @@ obj-y			+= idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
 obj-y			+= rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
 obj-y			+= opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
 obj-y			+= opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
+obj-y			+= ultravisor.o
 
 obj-$(CONFIG_SMP)	+= smp.o subcore.o subcore-asm.o
 obj-$(CONFIG_PCI)	+= pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
diff --git a/arch/powerpc/platforms/powernv/ultravisor.c b/arch/powerpc/platforms/powernv/ultravisor.c
new file mode 100644
index 000000000000..02ac57b4bded
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/ultravisor.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ultravisor high level interfaces
+ *
+ * Copyright 2019, IBM Corporation.
+ *
+ */
+#include <linux/init.h>
+#include <linux/printk.h>
+#include <linux/of_fdt.h>
+
+#include <asm/ultravisor.h>
+#include <asm/firmware.h>
+
+int __init early_init_dt_scan_ultravisor(unsigned long node, const char *uname,
+					 int depth, void *data)
+{
+	if (!of_flat_dt_is_compatible(node, "ibm,ultravisor"))
+		return 0;
+
+	powerpc_firmware_features |= FW_FEATURE_ULTRAVISOR;
+	pr_debug("Ultravisor detected!\n");
+	return 1;
+}

From 139a1d2842ec181cf017502a46bb8d947682a960 Mon Sep 17 00:00:00 2001
From: Michael Anderson <andmike@linux.ibm.com>
Date: Thu, 22 Aug 2019 00:48:35 -0300
Subject: [PATCH 125/264] powerpc/mm: Use UV_WRITE_PATE ucall to register a
 PATE

When Ultravisor (UV) is enabled, the partition table is stored in secure
memory and can only be accessed via the UV. The Hypervisor (HV) however
maintains a copy of the partition table in normal memory to allow Nest MMU
translations to occur (for normal VMs). The HV copy includes partition
table entries (PATE)s for secure VMs which would currently be unused
(Nest MMU translations cannot access secure memory) but they would be
needed as we add functionality.

This patch adds the UV_WRITE_PATE ucall which is used to update the PATE
for a VM (both normal and secure) when Ultravisor is enabled.

Signed-off-by: Michael Anderson <andmike@linux.ibm.com>
Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
[ cclaudio: Write the PATE in HV's table before doing that in UV's ]
Signed-off-by: Claudio Carvalho <cclaudio@linux.ibm.com>
Reviewed-by: Ryan Grimm <grimm@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190822034838.27876-5-cclaudio@linux.ibm.com
---
 arch/powerpc/include/asm/ultravisor-api.h |  5 +++
 arch/powerpc/include/asm/ultravisor.h     |  8 ++++
 arch/powerpc/mm/book3s64/pgtable.c        | 50 +++++++++++++++++------
 3 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/ultravisor-api.h b/arch/powerpc/include/asm/ultravisor-api.h
index 88ffa78f9d61..8cd49abff4f3 100644
--- a/arch/powerpc/include/asm/ultravisor-api.h
+++ b/arch/powerpc/include/asm/ultravisor-api.h
@@ -11,6 +11,7 @@
 #include <asm/hvcall.h>
 
 /* Return codes */
+#define U_BUSY			H_BUSY
 #define U_FUNCTION		H_FUNCTION
 #define U_NOT_AVAILABLE		H_NOT_AVAILABLE
 #define U_P2			H_P2
@@ -18,6 +19,10 @@
 #define U_P4			H_P4
 #define U_P5			H_P5
 #define U_PARAMETER		H_PARAMETER
+#define U_PERMISSION		H_PERMISSION
 #define U_SUCCESS		H_SUCCESS
 
+/* opcodes */
+#define UV_WRITE_PATE			0xF104
+
 #endif /* _ASM_POWERPC_ULTRAVISOR_API_H */
diff --git a/arch/powerpc/include/asm/ultravisor.h b/arch/powerpc/include/asm/ultravisor.h
index dc6e1ea198f2..6fe1f365dec8 100644
--- a/arch/powerpc/include/asm/ultravisor.h
+++ b/arch/powerpc/include/asm/ultravisor.h
@@ -8,7 +8,15 @@
 #ifndef _ASM_POWERPC_ULTRAVISOR_H
 #define _ASM_POWERPC_ULTRAVISOR_H
 
+#include <asm/asm-prototypes.h>
+#include <asm/ultravisor-api.h>
+
 int early_init_dt_scan_ultravisor(unsigned long node, const char *uname,
 				  int depth, void *data);
 
+static inline int uv_register_pate(u64 lpid, u64 dw0, u64 dw1)
+{
+	return ucall_norets(UV_WRITE_PATE, lpid, dw0, dw1);
+}
+
 #endif	/* _ASM_POWERPC_ULTRAVISOR_H */
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 7d0e0d0d22c4..4173f6931009 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -12,6 +12,8 @@
 #include <asm/tlb.h>
 #include <asm/trace.h>
 #include <asm/powernv.h>
+#include <asm/firmware.h>
+#include <asm/ultravisor.h>
 
 #include <mm/mmu_decl.h>
 #include <trace/events/thp.h>
@@ -209,21 +211,10 @@ void __init mmu_partition_table_init(void)
 	powernv_set_nmmu_ptcr(ptcr);
 }
 
-void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
-				   unsigned long dw1)
+static void flush_partition(unsigned int lpid, bool radix)
 {
-	unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
-
-	partition_tb[lpid].patb0 = cpu_to_be64(dw0);
-	partition_tb[lpid].patb1 = cpu_to_be64(dw1);
-
-	/*
-	 * Global flush of TLBs and partition table caches for this lpid.
-	 * The type of flush (hash or radix) depends on what the previous
-	 * use of this partition ID was, not the new use.
-	 */
 	asm volatile("ptesync" : : : "memory");
-	if (old & PATB_HR) {
+	if (radix) {
 		asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
 			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
 		asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
@@ -237,6 +228,39 @@ void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
 	/* do we need fixup here ?*/
 	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
 }
+
+void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
+				  unsigned long dw1)
+{
+	unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
+
+	/*
+	 * When ultravisor is enabled, the partition table is stored in secure
+	 * memory and can only be accessed doing an ultravisor call. However, we
+	 * maintain a copy of the partition table in normal memory to allow Nest
+	 * MMU translations to occur (for normal VMs).
+	 *
+	 * Therefore, here we always update partition_tb, regardless of whether
+	 * we are running under an ultravisor or not.
+	 */
+	partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+	partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+
+	/*
+	 * If ultravisor is enabled, we do an ultravisor call to register the
+	 * partition table entry (PATE), which also does a global flush of TLBs
+	 * and partition table caches for the lpid. Otherwise, just do the
+	 * flush. The type of flush (hash or radix) depends on what the previous
+	 * use of the partition ID was, not the new use.
+	 */
+	if (firmware_has_feature(FW_FEATURE_ULTRAVISOR)) {
+		uv_register_pate(lpid, dw0, dw1);
+		pr_info("PATE registered by ultravisor: dw0 = 0x%lx, dw1 = 0x%lx\n",
+			dw0, dw1);
+	} else {
+		flush_partition(lpid, (old & PATB_HR));
+	}
+}
 EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
 
 static pmd_t *get_pmd_from_cache(struct mm_struct *mm)

From 5223134029a87db925ecc9449f9501bad391a52e Mon Sep 17 00:00:00 2001
From: Claudio Carvalho <cclaudio@linux.ibm.com>
Date: Thu, 22 Aug 2019 00:48:36 -0300
Subject: [PATCH 126/264] powerpc/mm: Write to PTCR only if ultravisor disabled

In ultravisor enabled systems, PTCR becomes ultravisor privileged only
for writing and an attempt to write to it will cause a Hypervisor
Emulation Assistance interrupt.

This patch uses the set_ptcr_when_no_uv() function to restrict PTCR
writing to only when ultravisor is disabled.

Signed-off-by: Claudio Carvalho <cclaudio@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190822034838.27876-6-cclaudio@linux.ibm.com
---
 arch/powerpc/include/asm/ultravisor.h    | 12 ++++++++++++
 arch/powerpc/mm/book3s64/hash_utils.c    |  5 +++--
 arch/powerpc/mm/book3s64/pgtable.c       |  2 +-
 arch/powerpc/mm/book3s64/radix_pgtable.c |  8 +++++---
 4 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/ultravisor.h b/arch/powerpc/include/asm/ultravisor.h
index 6fe1f365dec8..d7aa97aa7834 100644
--- a/arch/powerpc/include/asm/ultravisor.h
+++ b/arch/powerpc/include/asm/ultravisor.h
@@ -10,10 +10,22 @@
 
 #include <asm/asm-prototypes.h>
 #include <asm/ultravisor-api.h>
+#include <asm/firmware.h>
 
 int early_init_dt_scan_ultravisor(unsigned long node, const char *uname,
 				  int depth, void *data);
 
+/*
+ * In ultravisor enabled systems, PTCR becomes ultravisor privileged only for
+ * writing and an attempt to write to it will cause a Hypervisor Emulation
+ * Assistance interrupt.
+ */
+static inline void set_ptcr_when_no_uv(u64 val)
+{
+	if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+		mtspr(SPRN_PTCR, val);
+}
+
 static inline int uv_register_pate(u64 lpid, u64 dw0, u64 dw1)
 {
 	return ucall_norets(UV_WRITE_PATE, lpid, dw0, dw1);
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index b8ad14bb1170..88aab920caca 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -61,6 +61,7 @@
 #include <asm/ps3.h>
 #include <asm/pte-walk.h>
 #include <asm/asm-prototypes.h>
+#include <asm/ultravisor.h>
 
 #include <mm/mmu_decl.h>
 
@@ -1075,8 +1076,8 @@ void hash__early_init_mmu_secondary(void)
 		if (!cpu_has_feature(CPU_FTR_ARCH_300))
 			mtspr(SPRN_SDR1, _SDR1);
 		else
-			mtspr(SPRN_PTCR,
-			      __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+			set_ptcr_when_no_uv(__pa(partition_tb) |
+					    (PATB_SIZE_SHIFT - 12));
 	}
 	/* Initialize SLB */
 	slb_initialize();
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 4173f6931009..01a7570c10e0 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -207,7 +207,7 @@ void __init mmu_partition_table_init(void)
 	 * 64 K size.
 	 */
 	ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
-	mtspr(SPRN_PTCR, ptcr);
+	set_ptcr_when_no_uv(ptcr);
 	powernv_set_nmmu_ptcr(ptcr);
 }
 
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index b4ca9e95e678..5bc118b4a634 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -27,6 +27,7 @@
 #include <asm/sections.h>
 #include <asm/trace.h>
 #include <asm/uaccess.h>
+#include <asm/ultravisor.h>
 
 #include <trace/events/thp.h>
 
@@ -650,8 +651,9 @@ void radix__early_init_mmu_secondary(void)
 		lpcr = mfspr(SPRN_LPCR);
 		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
 
-		mtspr(SPRN_PTCR,
-		      __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+		set_ptcr_when_no_uv(__pa(partition_tb) |
+				    (PATB_SIZE_SHIFT - 12));
+
 		radix_init_amor();
 	}
 
@@ -667,7 +669,7 @@ void radix__mmu_cleanup_all(void)
 	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
 		lpcr = mfspr(SPRN_LPCR);
 		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
-		mtspr(SPRN_PTCR, 0);
+		set_ptcr_when_no_uv(0);
 		powernv_set_nmmu_ptcr(0);
 		radix__flush_tlb_all();
 	}

From 512a5a6452b6d742b6f713184414d28cb6413080 Mon Sep 17 00:00:00 2001
From: Claudio Carvalho <cclaudio@linux.ibm.com>
Date: Thu, 22 Aug 2019 00:48:37 -0300
Subject: [PATCH 127/264] powerpc/powernv: Access LDBAR only if ultravisor
 disabled

LDBAR is a per-thread SPR populated and used by the thread-imc pmu
driver to dump the data counter into memory. It contains a memory address
along with a few other configuration bits. LDBAR is populated and enabled
only when any of the thread imc pmu events are monitored.

In ultravisor enabled systems, LDBAR becomes ultravisor privileged and
an attempt to write to it will cause a Hypervisor Emulation Assistance
interrupt.

In ultravisor enabled systems, the ultravisor is responsible for
maintaining the LDBAR (e.g. saving and restoring it).

This restricts LDBAR access to only when ultravisor is disabled.

Signed-off-by: Claudio Carvalho <cclaudio@linux.ibm.com>
Reviewed-by: Ram Pai <linuxram@us.ibm.com>
Reviewed-by: Ryan Grimm <grimm@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190822034838.27876-7-cclaudio@linux.ibm.com
---
 arch/powerpc/platforms/powernv/idle.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 09f49eed7fb8..78599bca66c2 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -675,7 +675,8 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
 		sprs.ptcr	= mfspr(SPRN_PTCR);
 		sprs.rpr	= mfspr(SPRN_RPR);
 		sprs.tscr	= mfspr(SPRN_TSCR);
-		sprs.ldbar	= mfspr(SPRN_LDBAR);
+		if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+			sprs.ldbar = mfspr(SPRN_LDBAR);
 
 		sprs_saved = true;
 
@@ -789,7 +790,8 @@ core_woken:
 	mtspr(SPRN_MMCR0,	sprs.mmcr0);
 	mtspr(SPRN_MMCR1,	sprs.mmcr1);
 	mtspr(SPRN_MMCR2,	sprs.mmcr2);
-	mtspr(SPRN_LDBAR,	sprs.ldbar);
+	if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+		mtspr(SPRN_LDBAR, sprs.ldbar);
 
 	mtspr(SPRN_SPRG3,	local_paca->sprg_vdso);
 

From 6c85b7bc637b64e681760f62c0eafba2f56745c6 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Thu, 22 Aug 2019 00:48:38 -0300
Subject: [PATCH 128/264] powerpc/kvm: Use UV_RETURN ucall to return to
 ultravisor

When an SVM makes an hypercall or incurs some other exception, the
Ultravisor usually forwards (a.k.a. reflects) the exceptions to the
Hypervisor. After processing the exception, Hypervisor uses the
UV_RETURN ultracall to return control back to the SVM.

The expected register state on entry to this ultracall is:

* Non-volatile registers are restored to their original values.
* If returning from an hypercall, register R0 contains the return value
  (unlike other ultracalls) and, registers R4 through R12 contain any
  output values of the hypercall.
* R3 contains the ultracall number, i.e UV_RETURN.
* If returning with a synthesized interrupt, R2 contains the
  synthesized interrupt number.

Thanks to input from Paul Mackerras, Ram Pai and Mike Anderson.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Claudio Carvalho <cclaudio@linux.ibm.com>
Acked-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190822034838.27876-8-cclaudio@linux.ibm.com
---
 arch/powerpc/include/asm/kvm_host.h       |  1 +
 arch/powerpc/include/asm/ultravisor-api.h |  1 +
 arch/powerpc/kernel/asm-offsets.c         |  1 +
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 39 +++++++++++++++++++----
 4 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e6e5f59aaa97..4bb552d639b8 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -283,6 +283,7 @@ struct kvm_arch {
 	cpumask_t cpu_in_guest;
 	u8 radix;
 	u8 fwnmi_enabled;
+	u8 secure_guest;
 	bool threads_indep;
 	bool nested_enable;
 	pgd_t *pgtable;
diff --git a/arch/powerpc/include/asm/ultravisor-api.h b/arch/powerpc/include/asm/ultravisor-api.h
index 8cd49abff4f3..6a0f9c74f959 100644
--- a/arch/powerpc/include/asm/ultravisor-api.h
+++ b/arch/powerpc/include/asm/ultravisor-api.h
@@ -24,5 +24,6 @@
 
 /* opcodes */
 #define UV_WRITE_PATE			0xF104
+#define UV_RETURN			0xF11C
 
 #endif /* _ASM_POWERPC_ULTRAVISOR_API_H */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 4ccb6b3a7fbd..484f54dab247 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -506,6 +506,7 @@ int main(void)
 	OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v);
 	OFFSET(KVM_RADIX, kvm, arch.radix);
 	OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled);
+	OFFSET(KVM_SECURE_GUEST, kvm, arch.secure_guest);
 	OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr);
 	OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar);
 	OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 07181d0dfcb7..9a05b0d932ef 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -29,6 +29,7 @@
 #include <asm/asm-compat.h>
 #include <asm/feature-fixups.h>
 #include <asm/cpuidle.h>
+#include <asm/ultravisor-api.h>
 
 /* Sign-extend HDEC if not on POWER9 */
 #define EXTEND_HDEC(reg)			\
@@ -1085,16 +1086,10 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
 	ld	r5, VCPU_LR(r4)
-	ld	r6, VCPU_CR(r4)
 	mtlr	r5
-	mtcr	r6
 
 	ld	r1, VCPU_GPR(R1)(r4)
-	ld	r2, VCPU_GPR(R2)(r4)
-	ld	r3, VCPU_GPR(R3)(r4)
 	ld	r5, VCPU_GPR(R5)(r4)
-	ld	r6, VCPU_GPR(R6)(r4)
-	ld	r7, VCPU_GPR(R7)(r4)
 	ld	r8, VCPU_GPR(R8)(r4)
 	ld	r9, VCPU_GPR(R9)(r4)
 	ld	r10, VCPU_GPR(R10)(r4)
@@ -1112,10 +1107,42 @@ BEGIN_FTR_SECTION
 	mtspr	SPRN_HDSISR, r0
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 
+	ld	r6, VCPU_KVM(r4)
+	lbz	r7, KVM_SECURE_GUEST(r6)
+	cmpdi	r7, 0
+	ld	r6, VCPU_GPR(R6)(r4)
+	ld	r7, VCPU_GPR(R7)(r4)
+	bne	ret_to_ultra
+
+	lwz	r0, VCPU_CR(r4)
+	mtcr	r0
+
 	ld	r0, VCPU_GPR(R0)(r4)
+	ld	r2, VCPU_GPR(R2)(r4)
+	ld	r3, VCPU_GPR(R3)(r4)
 	ld	r4, VCPU_GPR(R4)(r4)
 	HRFI_TO_GUEST
 	b	.
+/*
+ * Use UV_RETURN ultracall to return control back to the Ultravisor after
+ * processing an hypercall or interrupt that was forwarded (a.k.a. reflected)
+ * to the Hypervisor.
+ *
+ * All registers have already been loaded, except:
+ *   R0 = hcall result
+ *   R2 = SRR1, so UV can detect a synthesized interrupt (if any)
+ *   R3 = UV_RETURN
+ */
+ret_to_ultra:
+	lwz	r0, VCPU_CR(r4)
+	mtcr	r0
+
+	ld	r0, VCPU_GPR(R3)(r4)
+	mfspr	r2, SPRN_SRR1
+	li	r3, 0
+	ori	r3, r3, UV_RETURN
+	ld	r4, VCPU_GPR(R4)(r4)
+	sc	2
 
 /*
  * Enter the guest on a P9 or later system where we have exactly

From dea45ea7775240047f70e2631e468f6b65d09493 Mon Sep 17 00:00:00 2001
From: Claudio Carvalho <cclaudio@linux.ibm.com>
Date: Wed, 28 Aug 2019 23:05:20 +1000
Subject: [PATCH 129/264] powerpc/powernv/opal-msglog: Refactor memcons code

This patch refactors the code in opal-msglog that operates on the OPAL
memory console in order to make it cleaner and also allow the reuse of
the new memcons_* functions.

Signed-off-by: Claudio Carvalho <cclaudio@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Tested-by: Claudio Carvalho <cclaudio@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190828130521.26764-1-mpe@ellerman.id.au
---
 arch/powerpc/platforms/powernv/opal-msglog.c | 57 +++++++++++++-------
 1 file changed, 39 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c
index dc51d03c6370..d26da19a611f 100644
--- a/arch/powerpc/platforms/powernv/opal-msglog.c
+++ b/arch/powerpc/platforms/powernv/opal-msglog.c
@@ -29,23 +29,23 @@ struct memcons {
 
 static struct memcons *opal_memcons = NULL;
 
-ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
+ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count)
 {
 	const char *conbuf;
 	ssize_t ret;
 	size_t first_read = 0;
 	uint32_t out_pos, avail;
 
-	if (!opal_memcons)
+	if (!mc)
 		return -ENODEV;
 
-	out_pos = be32_to_cpu(READ_ONCE(opal_memcons->out_pos));
+	out_pos = be32_to_cpu(READ_ONCE(mc->out_pos));
 
 	/* Now we've read out_pos, put a barrier in before reading the new
 	 * data it points to in conbuf. */
 	smp_rmb();
 
-	conbuf = phys_to_virt(be64_to_cpu(opal_memcons->obuf_phys));
+	conbuf = phys_to_virt(be64_to_cpu(mc->obuf_phys));
 
 	/* When the buffer has wrapped, read from the out_pos marker to the end
 	 * of the buffer, and then read the remaining data as in the un-wrapped
@@ -53,7 +53,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
 	if (out_pos & MEMCONS_OUT_POS_WRAP) {
 
 		out_pos &= MEMCONS_OUT_POS_MASK;
-		avail = be32_to_cpu(opal_memcons->obuf_size) - out_pos;
+		avail = be32_to_cpu(mc->obuf_size) - out_pos;
 
 		ret = memory_read_from_buffer(to, count, &pos,
 				conbuf + out_pos, avail);
@@ -71,7 +71,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
 	}
 
 	/* Sanity check. The firmware should not do this to us. */
-	if (out_pos > be32_to_cpu(opal_memcons->obuf_size)) {
+	if (out_pos > be32_to_cpu(mc->obuf_size)) {
 		pr_err("OPAL: memory console corruption. Aborting read.\n");
 		return -EINVAL;
 	}
@@ -86,6 +86,11 @@ out:
 	return ret;
 }
 
+ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
+{
+	return memcons_copy(opal_memcons, to, pos, count);
+}
+
 static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj,
 				struct bin_attribute *bin_attr, char *to,
 				loff_t pos, size_t count)
@@ -98,32 +103,48 @@ static struct bin_attribute opal_msglog_attr = {
 	.read = opal_msglog_read
 };
 
-void __init opal_msglog_init(void)
+struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name)
 {
 	u64 mcaddr;
 	struct memcons *mc;
 
-	if (of_property_read_u64(opal_node, "ibm,opal-memcons", &mcaddr)) {
-		pr_warn("OPAL: Property ibm,opal-memcons not found, no message log\n");
-		return;
+	if (of_property_read_u64(node, mc_prop_name, &mcaddr)) {
+		pr_warn("%s property not found, no message log\n",
+			mc_prop_name);
+		goto out_err;
 	}
 
 	mc = phys_to_virt(mcaddr);
 	if (!mc) {
-		pr_warn("OPAL: memory console address is invalid\n");
-		return;
+		pr_warn("memory console address is invalid\n");
+		goto out_err;
 	}
 
 	if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) {
-		pr_warn("OPAL: memory console version is invalid\n");
+		pr_warn("memory console version is invalid\n");
+		goto out_err;
+	}
+
+	return mc;
+
+out_err:
+	return NULL;
+}
+
+u32 memcons_get_size(struct memcons *mc)
+{
+	return be32_to_cpu(mc->ibuf_size) + be32_to_cpu(mc->obuf_size);
+}
+
+void __init opal_msglog_init(void)
+{
+	opal_memcons = memcons_init(opal_node, "ibm,opal-memcons");
+	if (!opal_memcons) {
+		pr_warn("OPAL: memcons failed to load from ibm,opal-memcons\n");
 		return;
 	}
 
-	/* Report maximum size */
-	opal_msglog_attr.size =  be32_to_cpu(mc->ibuf_size) +
-		be32_to_cpu(mc->obuf_size);
-
-	opal_memcons = mc;
+	opal_msglog_attr.size = memcons_get_size(opal_memcons);
 }
 
 void __init opal_msglog_sysfs_init(void)

From 68e0aa8ec5cedec48dd5b11df84afc956c8f85be Mon Sep 17 00:00:00 2001
From: Claudio Carvalho <cclaudio@linux.ibm.com>
Date: Wed, 28 Aug 2019 23:05:21 +1000
Subject: [PATCH 130/264] powerpc/powernv: Add ultravisor message log interface

The ultravisor (UV) provides an in-memory console which follows the
OPAL in-memory console structure.

This patch extends the OPAL msglog code to initialize the UV memory
console and provide the "/sys/firmware/ultravisor/msglog" interface
for userspace to view the UV message log.

Signed-off-by: Claudio Carvalho <cclaudio@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Tested-by: Claudio Carvalho <cclaudio@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190828130521.26764-2-mpe@ellerman.id.au
---
 arch/powerpc/platforms/powernv/powernv.h    |  5 +++
 arch/powerpc/platforms/powernv/ultravisor.c | 45 +++++++++++++++++++++
 2 files changed, 50 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index fd4a1c5a6369..1aa51c4fa904 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -30,4 +30,9 @@ extern void opal_event_shutdown(void);
 
 bool cpu_core_split_required(void);
 
+struct memcons;
+ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count);
+u32 memcons_get_size(struct memcons *mc);
+struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name);
+
 #endif /* _POWERNV_H */
diff --git a/arch/powerpc/platforms/powernv/ultravisor.c b/arch/powerpc/platforms/powernv/ultravisor.c
index 02ac57b4bded..e4a00ad06f9d 100644
--- a/arch/powerpc/platforms/powernv/ultravisor.c
+++ b/arch/powerpc/platforms/powernv/ultravisor.c
@@ -8,9 +8,15 @@
 #include <linux/init.h>
 #include <linux/printk.h>
 #include <linux/of_fdt.h>
+#include <linux/of.h>
 
 #include <asm/ultravisor.h>
 #include <asm/firmware.h>
+#include <asm/machdep.h>
+
+#include "powernv.h"
+
+static struct kobject *ultravisor_kobj;
 
 int __init early_init_dt_scan_ultravisor(unsigned long node, const char *uname,
 					 int depth, void *data)
@@ -22,3 +28,42 @@ int __init early_init_dt_scan_ultravisor(unsigned long node, const char *uname,
 	pr_debug("Ultravisor detected!\n");
 	return 1;
 }
+
+static struct memcons *uv_memcons;
+
+static ssize_t uv_msglog_read(struct file *file, struct kobject *kobj,
+			      struct bin_attribute *bin_attr, char *to,
+			      loff_t pos, size_t count)
+{
+	return memcons_copy(uv_memcons, to, pos, count);
+}
+
+static struct bin_attribute uv_msglog_attr = {
+	.attr = {.name = "msglog", .mode = 0400},
+	.read = uv_msglog_read
+};
+
+static int __init uv_init(void)
+{
+	struct device_node *node;
+
+	if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+		return 0;
+
+	node = of_find_compatible_node(NULL, NULL, "ibm,uv-firmware");
+	if (!node)
+		return -ENODEV;
+
+	uv_memcons = memcons_init(node, "memcons");
+	if (!uv_memcons)
+		return -ENOENT;
+
+	uv_msglog_attr.size = memcons_get_size(uv_memcons);
+
+	ultravisor_kobj = kobject_create_and_add("ultravisor", firmware_kobj);
+	if (!ultravisor_kobj)
+		return -ENOMEM;
+
+	return sysfs_create_bin_file(ultravisor_kobj, &uv_msglog_attr);
+}
+machine_subsys_initcall(powernv, uv_init);

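With the attribute registered, the UV log is readable like any other
binary sysfs file. A small userspace sketch (it assumes the file exists
and the reader has sufficient privileges, since the mode is 0400):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/firmware/ultravisor/msglog", "r");
		char buf[4096];
		size_t n;

		if (!f) {
			perror("msglog");
			return 1;
		}
		/* Binary attribute reads stream until EOF. */
		while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
			fwrite(buf, 1, n, stdout);
		fclose(f);
		return 0;
	}
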
From 136bc0397ae21dbf63ca02e5775ad353a479cd2f Mon Sep 17 00:00:00 2001
From: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Date: Mon, 19 Aug 2019 23:13:12 -0300
Subject: [PATCH 131/264] powerpc/pseries: Introduce option to build secure
 virtual machines

Introduce CONFIG_PPC_SVM to control support for secure guests and include
Ultravisor-related helpers when it is selected.

Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820021326.6884-3-bauerman@linux.ibm.com
---
 arch/powerpc/include/asm/asm-prototypes.h |  2 +-
 arch/powerpc/kernel/Makefile              |  4 +++-
 arch/powerpc/platforms/pseries/Kconfig    | 11 +++++++++++
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index e698f48cbc6d..49196d35e3bb 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -36,7 +36,7 @@ void __trace_hcall_entry(unsigned long opcode, unsigned long *args);
 void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf);
 
 /* Ultravisor */
-#ifdef CONFIG_PPC_POWERNV
+#if defined(CONFIG_PPC_POWERNV) || defined(CONFIG_PPC_SVM)
 long ucall_norets(unsigned long opcode, ...);
 #else
 static inline long ucall_norets(unsigned long opcode, ...)
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 934e64b28894..c6ae0e7914bc 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -156,7 +156,9 @@ endif
 
 obj-$(CONFIG_EPAPR_PARAVIRT)	+= epapr_paravirt.o epapr_hcalls.o
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvm_emul.o
-obj-$(CONFIG_PPC_POWERNV)	+= ucall.o
+ifneq ($(CONFIG_PPC_POWERNV)$(CONFIG_PPC_SVM),)
+obj-y				+= ucall.o
+endif
 
 # Disable GCOV, KCOV & sanitizers in odd or sensitive code
 GCOV_PROFILE_prom_init.o := n
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index f7b484f55553..d09deb05bb66 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -145,3 +145,14 @@ config PAPR_SCM
 	tristate "Support for the PAPR Storage Class Memory interface"
 	help
 	  Enable access to hypervisor provided storage class memory.
+
+config PPC_SVM
+	bool "Secure virtual machine (SVM) support for POWER"
+	depends on PPC_PSERIES
+	help
+	 There are certain POWER platforms which support secure guests using
+	 the Protected Execution Facility, with the help of an Ultravisor
+	 executing below the hypervisor layer. This enables support for
+	 those guests.
+
+	 If unsure, say "N".

From 528229d210781b2da66c6d257a326c21099982b5 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Mon, 19 Aug 2019 23:13:13 -0300
Subject: [PATCH 132/264] powerpc: Add support for adding an ESM blob to the
 zImage wrapper

For secure VMs, the signing tool will create a ticket called the "ESM blob"
for the Enter Secure Mode ultravisor call, containing among other things
the signatures of the kernel and initrd.

This adds support to the wrapper script for adding that blob via the "-e"
option to the zImage.pseries.

It also adds code to the zImage wrapper itself to retrieve and, if
necessary, relocate the blob, and to pass its address to Linux via the
device-tree, to be later consumed by prom_init.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
[ bauerman: Minor adjustments to some comments. ]
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820021326.6884-4-bauerman@linux.ibm.com
---
 arch/powerpc/boot/main.c       | 41 ++++++++++++++++++++++++++++++++++
 arch/powerpc/boot/ops.h        |  2 ++
 arch/powerpc/boot/wrapper      | 24 +++++++++++++++++---
 arch/powerpc/boot/zImage.lds.S |  8 +++++++
 4 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/boot/main.c b/arch/powerpc/boot/main.c
index 102cc546444d..a9d209135975 100644
--- a/arch/powerpc/boot/main.c
+++ b/arch/powerpc/boot/main.c
@@ -146,6 +146,46 @@ static struct addr_range prep_initrd(struct addr_range vmlinux, void *chosen,
 	return (struct addr_range){(void *)initrd_addr, initrd_size};
 }
 
+#ifdef __powerpc64__
+static void prep_esm_blob(struct addr_range vmlinux, void *chosen)
+{
+	unsigned long esm_blob_addr, esm_blob_size;
+
+	/* Do we have an ESM (Enter Secure Mode) blob? */
+	if (_esm_blob_end <= _esm_blob_start)
+		return;
+
+	printf("Attached ESM blob at 0x%p-0x%p\n\r",
+	       _esm_blob_start, _esm_blob_end);
+	esm_blob_addr = (unsigned long)_esm_blob_start;
+	esm_blob_size = _esm_blob_end - _esm_blob_start;
+
+	/*
+	 * If the ESM blob is too low it will be clobbered when the
+	 * kernel relocates to its final location.  In this case,
+	 * allocate a safer place and move it.
+	 */
+	if (esm_blob_addr < vmlinux.size) {
+		void *old_addr = (void *)esm_blob_addr;
+
+		printf("Allocating 0x%lx bytes for esm_blob ...\n\r",
+		       esm_blob_size);
+		esm_blob_addr = (unsigned long)malloc(esm_blob_size);
+		if (!esm_blob_addr)
+			fatal("Can't allocate memory for ESM blob !\n\r");
+		printf("Relocating ESM blob 0x%lx <- 0x%p (0x%lx bytes)\n\r",
+		       esm_blob_addr, old_addr, esm_blob_size);
+		memmove((void *)esm_blob_addr, old_addr, esm_blob_size);
+	}
+
+	/* Tell the kernel ESM blob address via device tree. */
+	setprop_val(chosen, "linux,esm-blob-start", (u32)(esm_blob_addr));
+	setprop_val(chosen, "linux,esm-blob-end", (u32)(esm_blob_addr + esm_blob_size));
+}
+#else
+static inline void prep_esm_blob(struct addr_range vmlinux, void *chosen) { }
+#endif
+
 /* A buffer that may be edited by tools operating on a zImage binary so as to
  * edit the command line passed to vmlinux (by setting /chosen/bootargs).
  * The buffer is put in its own section so that tools may locate it more easily.
@@ -214,6 +254,7 @@ void start(void)
 	vmlinux = prep_kernel();
 	initrd = prep_initrd(vmlinux, chosen,
 			     loader_info.initrd_addr, loader_info.initrd_size);
+	prep_esm_blob(vmlinux, chosen);
 	prep_cmdline(chosen);
 
 	printf("Finalizing device tree...");
diff --git a/arch/powerpc/boot/ops.h b/arch/powerpc/boot/ops.h
index cd043726ed88..e0606766480f 100644
--- a/arch/powerpc/boot/ops.h
+++ b/arch/powerpc/boot/ops.h
@@ -251,6 +251,8 @@ extern char _initrd_start[];
 extern char _initrd_end[];
 extern char _dtb_start[];
 extern char _dtb_end[];
+extern char _esm_blob_start[];
+extern char _esm_blob_end[];
 
 static inline __attribute__((const))
 int __ilog2_u32(u32 n)
diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper
index 5148ac271f28..ed6266367bc0 100755
--- a/arch/powerpc/boot/wrapper
+++ b/arch/powerpc/boot/wrapper
@@ -13,6 +13,7 @@
 # -i initrd	specify initrd file
 # -d devtree	specify device-tree blob
 # -s tree.dts	specify device-tree source file (needs dtc installed)
+# -e esm_blob   specify ESM blob for secure images
 # -c		cache $kernel.strip.gz (use if present & newer, else make)
 # -C prefix	specify command prefix for cross-building tools
 #		(strip, objcopy, ld)
@@ -37,6 +38,7 @@ platform=of
 initrd=
 dtb=
 dts=
+esm_blob=
 cacheit=
 binary=
 compression=.gz
@@ -60,9 +62,9 @@ tmpdir=.
 
 usage() {
     echo 'Usage: wrapper [-o output] [-p platform] [-i initrd]' >&2
-    echo '       [-d devtree] [-s tree.dts] [-c] [-C cross-prefix]' >&2
-    echo '       [-D datadir] [-W workingdir] [-Z (gz|xz|none)]' >&2
-    echo '       [--no-compression] [vmlinux]' >&2
+    echo '       [-d devtree] [-s tree.dts] [-e esm_blob]' >&2
+    echo '       [-c] [-C cross-prefix] [-D datadir] [-W workingdir]' >&2
+    echo '       [-Z (gz|xz|none)] [--no-compression] [vmlinux]' >&2
     exit 1
 }
 
@@ -105,6 +107,11 @@ while [ "$#" -gt 0 ]; do
 	[ "$#" -gt 0 ] || usage
 	dtb="$1"
 	;;
+    -e)
+	shift
+	[ "$#" -gt 0 ] || usage
+	esm_blob="$1"
+	;;
     -s)
 	shift
 	[ "$#" -gt 0 ] || usage
@@ -218,9 +225,16 @@ objflags=-S
 tmp=$tmpdir/zImage.$$.o
 ksection=.kernel:vmlinux.strip
 isection=.kernel:initrd
+esection=.kernel:esm_blob
 link_address='0x400000'
 make_space=y
 
+
+if [ -n "$esm_blob" -a "$platform" != "pseries" ]; then
+    echo "ESM blob not supported on non-pseries platforms" >&2
+    exit 1
+fi
+
 case "$platform" in
 of)
     platformo="$object/of.o $object/epapr.o"
@@ -477,6 +491,10 @@ if [ -n "$dtb" ]; then
     fi
 fi
 
+if [ -n "$esm_blob" ]; then
+    addsec $tmp "$esm_blob" $esection
+fi
+
 if [ "$platform" != "miboot" ]; then
     if [ -n "$link_address" ] ; then
         text_start="-Ttext $link_address"
diff --git a/arch/powerpc/boot/zImage.lds.S b/arch/powerpc/boot/zImage.lds.S
index 4ac1e36edfe7..a21f3a76e06f 100644
--- a/arch/powerpc/boot/zImage.lds.S
+++ b/arch/powerpc/boot/zImage.lds.S
@@ -68,6 +68,14 @@ SECTIONS
     _initrd_end =  .;
   }
 
+  . = ALIGN(4096);
+  .kernel:esm_blob :
+  {
+    _esm_blob_start =  .;
+    *(.kernel:esm_blob)
+    _esm_blob_end =  .;
+  }
+
 #ifdef CONFIG_PPC64_BOOT_WRAPPER
   . = ALIGN(256);
   .got :

From 6a9c930bd7751bf0630d8b9b73b07af5c6842da6 Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram@us.ibm.com>
Date: Mon, 19 Aug 2019 23:13:14 -0300
Subject: [PATCH 133/264] powerpc/prom_init: Add the ESM call to prom_init

Make the Enter-Secure-Mode (ESM) ultravisor call to switch the VM to secure
mode. Pass kernel base address and FDT address so that the Ultravisor is
able to verify the integrity of the VM using information from the ESM blob.

Add "svm=" command line option to turn on switching to secure mode.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
[ andmike: Generate an RTAS os-term hcall when the ESM ucall fails. ]
Signed-off-by: Michael Anderson <andmike@linux.ibm.com>
[ bauerman: Cleaned up the code a bit. ]
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820021326.6884-5-bauerman@linux.ibm.com
---
 .../admin-guide/kernel-parameters.txt         |  5 +
 arch/powerpc/include/asm/ultravisor-api.h     |  1 +
 arch/powerpc/kernel/prom_init.c               | 96 +++++++++++++++++++
 3 files changed, 102 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6d495aab4d0b..4923d8f726e8 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4620,6 +4620,11 @@
 			/sys/power/pm_test). Only available when CONFIG_PM_DEBUG
 			is set. Default value is 5.
 
+	svm=		[PPC]
+			Format: { on | off | y | n | 1 | 0 }
+			This parameter controls use of the Protected
+			Execution Facility on pSeries.
+
 	swapaccount=[0|1]
 			[KNL] Enable accounting of swap in memory resource
 			controller if no parameter or 1 is given or disable
diff --git a/arch/powerpc/include/asm/ultravisor-api.h b/arch/powerpc/include/asm/ultravisor-api.h
index 6a0f9c74f959..34c8711334cb 100644
--- a/arch/powerpc/include/asm/ultravisor-api.h
+++ b/arch/powerpc/include/asm/ultravisor-api.h
@@ -25,5 +25,6 @@
 /* opcodes */
 #define UV_WRITE_PATE			0xF104
 #define UV_RETURN			0xF11C
+#define UV_ESM				0xF110
 
 #endif /* _ASM_POWERPC_ULTRAVISOR_API_H */
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index f2b63b4e1943..a4e7762dd286 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -40,6 +40,7 @@
 #include <asm/sections.h>
 #include <asm/machdep.h>
 #include <asm/asm-prototypes.h>
+#include <asm/ultravisor-api.h>
 
 #include <linux/linux_logo.h>
 
@@ -171,6 +172,10 @@ static bool __prombss prom_radix_disable;
 static bool __prombss prom_xive_disable;
 #endif
 
+#ifdef CONFIG_PPC_SVM
+static bool __prombss prom_svm_enable;
+#endif
+
 struct platform_support {
 	bool hash_mmu;
 	bool radix_mmu;
@@ -812,6 +817,17 @@ static void __init early_cmdline_parse(void)
 		prom_debug("XIVE disabled from cmdline\n");
 	}
 #endif /* CONFIG_PPC_PSERIES */
+
+#ifdef CONFIG_PPC_SVM
+	opt = prom_strstr(prom_cmd_line, "svm=");
+	if (opt) {
+		bool val;
+
+		opt += sizeof("svm=") - 1;
+		if (!prom_strtobool(opt, &val))
+			prom_svm_enable = val;
+	}
+#endif /* CONFIG_PPC_SVM */
 }
 
 #ifdef CONFIG_PPC_PSERIES
@@ -1712,6 +1728,43 @@ static void __init prom_close_stdin(void)
 	}
 }
 
+#ifdef CONFIG_PPC_SVM
+static int prom_rtas_hcall(uint64_t args)
+{
+	register uint64_t arg1 asm("r3") = H_RTAS;
+	register uint64_t arg2 asm("r4") = args;
+
+	asm volatile("sc 1\n" : "=r" (arg1) :
+			"r" (arg1),
+			"r" (arg2) :);
+	return arg1;
+}
+
+static struct rtas_args __prombss os_term_args;
+
+static void __init prom_rtas_os_term(char *str)
+{
+	phandle rtas_node;
+	__be32 val;
+	u32 token;
+
+	prom_debug("%s: start...\n", __func__);
+	rtas_node = call_prom("finddevice", 1, 1, ADDR("/rtas"));
+	prom_debug("rtas_node: %x\n", rtas_node);
+	if (!PHANDLE_VALID(rtas_node))
+		return;
+
+	val = 0;
+	prom_getprop(rtas_node, "ibm,os-term", &val, sizeof(val));
+	token = be32_to_cpu(val);
+	prom_debug("ibm,os-term: %x\n", token);
+	if (token == 0)
+		prom_panic("Could not get token for ibm,os-term\n");
+	os_term_args.token = cpu_to_be32(token);
+	prom_rtas_hcall((uint64_t)&os_term_args);
+}
+#endif /* CONFIG_PPC_SVM */
+
 /*
  * Allocate room for and instantiate RTAS
  */
@@ -3168,6 +3221,46 @@ static void unreloc_toc(void)
 #endif
 #endif
 
+#ifdef CONFIG_PPC_SVM
+/*
+ * Perform the Enter Secure Mode ultracall.
+ */
+static int enter_secure_mode(unsigned long kbase, unsigned long fdt)
+{
+	register unsigned long r3 asm("r3") = UV_ESM;
+	register unsigned long r4 asm("r4") = kbase;
+	register unsigned long r5 asm("r5") = fdt;
+
+	asm volatile("sc 2" : "+r"(r3) : "r"(r4), "r"(r5));
+
+	return r3;
+}
+
+/*
+ * Call the Ultravisor to transfer us to secure memory if we have an ESM blob.
+ */
+static void setup_secure_guest(unsigned long kbase, unsigned long fdt)
+{
+	int ret;
+
+	if (!prom_svm_enable)
+		return;
+
+	/* Switch to secure mode. */
+	prom_printf("Switching to secure mode.\n");
+
+	ret = enter_secure_mode(kbase, fdt);
+	if (ret != U_SUCCESS) {
+		prom_printf("Returned %d from switching to secure mode.\n", ret);
+		prom_rtas_os_term("Switch to secure mode failed.\n");
+	}
+}
+#else
+static void setup_secure_guest(unsigned long kbase, unsigned long fdt)
+{
+}
+#endif /* CONFIG_PPC_SVM */
+
 /*
  * We enter here early on, when the Open Firmware prom is still
  * handling exceptions and the MMU hash table for us.
@@ -3366,6 +3459,9 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4,
 	unreloc_toc();
 #endif
 
+	/* Move to secure memory if we're supposed to be secure guests. */
+	setup_secure_guest(kbase, hdr);
+
 	__start(hdr, kbase, 0, 0, 0, 0, 0);
 
 	return 0;

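prom_strtobool() follows the generic kstrtobool() semantics, which is why
the documented format is { on | off | y | n | 1 | 0 }. A hedged sketch of
how the parsing in early_cmdline_parse() behaves:

	bool val;

	/* "svm=on", "svm=y", "svm=1"  -> returns 0, val == true  */
	/* "svm=off", "svm=n", "svm=0" -> returns 0, val == false */
	/* Anything else -> returns non-zero and prom_svm_enable
	 * keeps its default (false), i.e. stay in normal mode.   */
	if (!prom_strtobool(opt, &val))
		prom_svm_enable = val;
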
From f7777e008cad17bf757ca256709070c07efd8283 Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram@us.ibm.com>
Date: Mon, 19 Aug 2019 23:13:15 -0300
Subject: [PATCH 134/264] powerpc/pseries/svm: Add helpers for UV_SHARE_PAGE
 and UV_UNSHARE_PAGE

These functions are used when the guest wants to grant the hypervisor
access to certain pages.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820021326.6884-6-bauerman@linux.ibm.com
---
 arch/powerpc/include/asm/ultravisor-api.h |  2 ++
 arch/powerpc/include/asm/ultravisor.h     | 10 ++++++++++
 2 files changed, 12 insertions(+)

diff --git a/arch/powerpc/include/asm/ultravisor-api.h b/arch/powerpc/include/asm/ultravisor-api.h
index 34c8711334cb..0f5b2d718bfc 100644
--- a/arch/powerpc/include/asm/ultravisor-api.h
+++ b/arch/powerpc/include/asm/ultravisor-api.h
@@ -26,5 +26,7 @@
 #define UV_WRITE_PATE			0xF104
 #define UV_RETURN			0xF11C
 #define UV_ESM				0xF110
+#define UV_SHARE_PAGE			0xF130
+#define UV_UNSHARE_PAGE			0xF134
 
 #endif /* _ASM_POWERPC_ULTRAVISOR_API_H */
diff --git a/arch/powerpc/include/asm/ultravisor.h b/arch/powerpc/include/asm/ultravisor.h
index d7aa97aa7834..e5c8413de06f 100644
--- a/arch/powerpc/include/asm/ultravisor.h
+++ b/arch/powerpc/include/asm/ultravisor.h
@@ -31,4 +31,14 @@ static inline int uv_register_pate(u64 lpid, u64 dw0, u64 dw1)
 	return ucall_norets(UV_WRITE_PATE, lpid, dw0, dw1);
 }
 
+static inline int uv_share_page(u64 pfn, u64 npages)
+{
+	return ucall_norets(UV_SHARE_PAGE, pfn, npages);
+}
+
+static inline int uv_unshare_page(u64 pfn, u64 npages)
+{
+	return ucall_norets(UV_UNSHARE_PAGE, pfn, npages);
+}
+
 #endif	/* _ASM_POWERPC_ULTRAVISOR_H */

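Both ucalls take a page frame number plus a page count, so sharing a
page-aligned buffer reduces to a PFN conversion. A hedged sketch
(demo_buf is hypothetical; later patches in this series apply the same
pattern to the LPPACA and DTL buffers):

	#include <asm/ultravisor.h>

	/* Grant the hypervisor access to one page at demo_buf. */
	uv_share_page(PHYS_PFN(__pa(demo_buf)), 1);

	/* ... and pull it back into secure memory when done. */
	uv_unshare_page(PHYS_PFN(__pa(demo_buf)), 1);
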
From 7f70c3815a352f76d69fc5642ae9685033a428c0 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Mon, 19 Aug 2019 23:13:16 -0300
Subject: [PATCH 135/264] powerpc: Introduce the MSR_S bit

Protected Execution Facility (PEF) is an architectural change for
POWER 9 that enables Secure Virtual Machines (SVMs). When enabled,
PEF adds a new higher privileged mode, called Ultravisor mode, to
POWER architecture.

The hardware changes include the following:

  * There is a new bit in the MSR that determines whether the current
    process is running in secure mode, MSR(S) bit 41. MSR(S)=1 means the
    process is in secure mode; MSR(S)=0 means it is in normal mode.

  * The MSR(S) bit can only be set by the Ultravisor.

  * HRFID cannot be used to set the MSR(S) bit. If the hypervisor needs
    to return to an SVM it must use an ultracall. It can determine if
    the VM it is returning to is secure.

  * The privilege of a process is now determined by three MSR bits,
    MSR(S, HV, PR). In each of the tables below the modes are listed
    from least privilege to highest privilege. The higher privilege
    modes can access all the resources of the lower privilege modes.

    **Secure Mode MSR Settings**

       +---+---+---+---------------+
       | S | HV| PR|Privilege      |
       +===+===+===+===============+
       | 1 | 0 | 1 | Problem       |
       +---+---+---+---------------+
       | 1 | 0 | 0 | Privileged(OS)|
       +---+---+---+---------------+
       | 1 | 1 | 0 | Ultravisor    |
       +---+---+---+---------------+
       | 1 | 1 | 1 | Reserved      |
       +---+---+---+---------------+

    **Normal Mode MSR Settings**

       +---+---+---+---------------+
       | S | HV| PR|Privilege      |
       +===+===+===+===============+
       | 0 | 0 | 1 | Problem       |
       +---+---+---+---------------+
       | 0 | 0 | 0 | Privileged(OS)|
       +---+---+---+---------------+
       | 0 | 1 | 0 | Hypervisor    |
       +---+---+---+---------------+
       | 0 | 1 | 1 | Problem (HV)  |
       +---+---+---+---------------+

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
[ cclaudio: Update the commit message ]
Signed-off-by: Claudio Carvalho <cclaudio@linux.ibm.com>
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820021326.6884-7-bauerman@linux.ibm.com
---
 arch/powerpc/include/asm/reg.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 10caa145f98b..ec3714cf0989 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -38,6 +38,7 @@
 #define MSR_TM_LG	32		/* Trans Mem Available */
 #define MSR_VEC_LG	25	        /* Enable AltiVec */
 #define MSR_VSX_LG	23		/* Enable VSX */
+#define MSR_S_LG	22		/* Secure state */
 #define MSR_POW_LG	18		/* Enable Power Management */
 #define MSR_WE_LG	18		/* Wait State Enable */
 #define MSR_TGPR_LG	17		/* TLB Update registers in use */
@@ -71,11 +72,13 @@
 #define MSR_SF		__MASK(MSR_SF_LG)	/* Enable 64 bit mode */
 #define MSR_ISF		__MASK(MSR_ISF_LG)	/* Interrupt 64b mode valid on 630 */
 #define MSR_HV 		__MASK(MSR_HV_LG)	/* Hypervisor state */
+#define MSR_S		__MASK(MSR_S_LG)	/* Secure state */
 #else
 /* so tests for these bits fail on 32-bit */
 #define MSR_SF		0
 #define MSR_ISF		0
 #define MSR_HV		0
+#define MSR_S		0
 #endif
 
 /*

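The _LG value can look odd next to "bit 41" in the commit message: the
ISA numbers MSR bits from the most-significant end of the 64-bit
register, while __MASK() shifts from the least-significant end, so
architected bit 41 becomes 63 - 41 = 22. A one-line sanity check of the
arithmetic (a sketch, not part of the patch):

	/* __MASK(MSR_S_LG) == 1UL << 22 == ISA bit 41 */
	BUILD_BUG_ON(MSR_S != 1UL << (63 - 41));
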
From e311a92da18cbdd4972dab0cda88b1b8484b8fef Mon Sep 17 00:00:00 2001
From: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Date: Mon, 19 Aug 2019 23:13:17 -0300
Subject: [PATCH 136/264] powerpc/pseries: Add and use LPPACA_SIZE constant

Helps document what the hard-coded number means.

Also take the opportunity to fix an #endif comment.

Suggested-by: Alexey Kardashevskiy <aik@linux.ibm.com>
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820021326.6884-8-bauerman@linux.ibm.com
---
 arch/powerpc/kernel/paca.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index e3ad8aa4730d..612fc87ef785 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -52,6 +52,8 @@ static void *__init alloc_paca_data(unsigned long size, unsigned long align,
 
 #ifdef CONFIG_PPC_PSERIES
 
+#define LPPACA_SIZE 0x400
+
 /*
  * See asm/lppaca.h for more detail.
  *
@@ -65,7 +67,7 @@ static inline void init_lppaca(struct lppaca *lppaca)
 
 	*lppaca = (struct lppaca) {
 		.desc = cpu_to_be32(0xd397d781),	/* "LpPa" */
-		.size = cpu_to_be16(0x400),
+		.size = cpu_to_be16(LPPACA_SIZE),
 		.fpregs_in_use = 1,
 		.slb_count = cpu_to_be16(64),
 		.vmxregs_in_use = 0,
@@ -75,19 +77,18 @@ static inline void init_lppaca(struct lppaca *lppaca)
 static struct lppaca * __init new_lppaca(int cpu, unsigned long limit)
 {
 	struct lppaca *lp;
-	size_t size = 0x400;
 
-	BUILD_BUG_ON(size < sizeof(struct lppaca));
+	BUILD_BUG_ON(sizeof(struct lppaca) > LPPACA_SIZE);
 
 	if (early_cpu_has_feature(CPU_FTR_HVMODE))
 		return NULL;
 
-	lp = alloc_paca_data(size, 0x400, limit, cpu);
+	lp = alloc_paca_data(LPPACA_SIZE, 0x400, limit, cpu);
 	init_lppaca(lp);
 
 	return lp;
 }
-#endif /* CONFIG_PPC_BOOK3S */
+#endif /* CONFIG_PPC_PSERIES */
 
 #ifdef CONFIG_PPC_BOOK3S_64
 

From bd104e6db6f0ad124e507a9ecf1a468efe5697db Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Mon, 19 Aug 2019 23:13:18 -0300
Subject: [PATCH 137/264] powerpc/pseries/svm: Use shared memory for LPPACA
 structures

LPPACA structures need to be shared with the host. Hence they need to be in
shared memory. Instead of allocating individual chunks of memory for a
given structure from memblock, a contiguous chunk of memory is allocated
and then converted into shared memory. Subsequent allocation requests will
come from the contiguous chunk which will be always shared memory for all
structures.

While we are able to use a kmem_cache constructor for the Debug Trace Log,
LPPACAs are allocated very early in the boot process (before SLUB is
available) so we need to use a simpler scheme here.

Introduce helper is_secure_guest() which uses the S bit of the MSR to tell
whether we're running as a secure guest.

Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820021326.6884-9-bauerman@linux.ibm.com
---
 arch/powerpc/include/asm/svm.h | 26 ++++++++++++++++++++
 arch/powerpc/kernel/paca.c     | 43 +++++++++++++++++++++++++++++++++-
 2 files changed, 68 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/include/asm/svm.h

diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h
new file mode 100644
index 000000000000..2689d8d841f8
--- /dev/null
+++ b/arch/powerpc/include/asm/svm.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * SVM helper functions
+ *
+ * Copyright 2018 Anshuman Khandual, IBM Corporation.
+ */
+
+#ifndef _ASM_POWERPC_SVM_H
+#define _ASM_POWERPC_SVM_H
+
+#ifdef CONFIG_PPC_SVM
+
+static inline bool is_secure_guest(void)
+{
+	return mfmsr() & MSR_S;
+}
+
+#else /* CONFIG_PPC_SVM */
+
+static inline bool is_secure_guest(void)
+{
+	return false;
+}
+
+#endif /* CONFIG_PPC_SVM */
+#endif /* _ASM_POWERPC_SVM_H */
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 612fc87ef785..949eceb254d8 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -14,6 +14,8 @@
 #include <asm/sections.h>
 #include <asm/pgtable.h>
 #include <asm/kexec.h>
+#include <asm/svm.h>
+#include <asm/ultravisor.h>
 
 #include "setup.h"
 
@@ -54,6 +56,41 @@ static void *__init alloc_paca_data(unsigned long size, unsigned long align,
 
 #define LPPACA_SIZE 0x400
 
+static void *__init alloc_shared_lppaca(unsigned long size, unsigned long align,
+					unsigned long limit, int cpu)
+{
+	size_t shared_lppaca_total_size = PAGE_ALIGN(nr_cpu_ids * LPPACA_SIZE);
+	static unsigned long shared_lppaca_size;
+	static void *shared_lppaca;
+	void *ptr;
+
+	if (!shared_lppaca) {
+		memblock_set_bottom_up(true);
+
+		shared_lppaca =
+			memblock_alloc_try_nid(shared_lppaca_total_size,
+					       PAGE_SIZE, MEMBLOCK_LOW_LIMIT,
+					       limit, NUMA_NO_NODE);
+		if (!shared_lppaca)
+			panic("cannot allocate shared data");
+
+		memblock_set_bottom_up(false);
+		uv_share_page(PHYS_PFN(__pa(shared_lppaca)),
+			      shared_lppaca_total_size >> PAGE_SHIFT);
+	}
+
+	ptr = shared_lppaca + shared_lppaca_size;
+	shared_lppaca_size += size;
+
+	/*
+	 * This is very early in boot, so no harm done if the kernel crashes at
+	 * this point.
+	 */
+	BUG_ON(shared_lppaca_size >= shared_lppaca_total_size);
+
+	return ptr;
+}
+
 /*
  * See asm/lppaca.h for more detail.
  *
@@ -83,7 +120,11 @@ static struct lppaca * __init new_lppaca(int cpu, unsigned long limit)
 	if (early_cpu_has_feature(CPU_FTR_HVMODE))
 		return NULL;
 
-	lp = alloc_paca_data(LPPACA_SIZE, 0x400, limit, cpu);
+	if (is_secure_guest())
+		lp = alloc_shared_lppaca(LPPACA_SIZE, 0x400, limit, cpu);
+	else
+		lp = alloc_paca_data(LPPACA_SIZE, 0x400, limit, cpu);
+
 	init_lppaca(lp);
 
 	return lp;

From d5394c059da9786043934bed474562cde124e7d3 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Mon, 19 Aug 2019 23:13:19 -0300
Subject: [PATCH 138/264] powerpc/pseries/svm: Use shared memory for Debug
 Trace Log (DTL)

Secure guests need to share the DTL buffers with the hypervisor. To that
end, use a kmem_cache constructor which converts the underlying buddy
allocated SLUB cache pages into shared memory.

Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820021326.6884-10-bauerman@linux.ibm.com
---
 arch/powerpc/include/asm/svm.h          |  5 ++++
 arch/powerpc/platforms/pseries/Makefile |  1 +
 arch/powerpc/platforms/pseries/setup.c  |  5 +++-
 arch/powerpc/platforms/pseries/svm.c    | 40 +++++++++++++++++++++++++
 4 files changed, 50 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/platforms/pseries/svm.c

diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h
index 2689d8d841f8..85580b30aba4 100644
--- a/arch/powerpc/include/asm/svm.h
+++ b/arch/powerpc/include/asm/svm.h
@@ -15,6 +15,9 @@ static inline bool is_secure_guest(void)
 	return mfmsr() & MSR_S;
 }
 
+void dtl_cache_ctor(void *addr);
+#define get_dtl_cache_ctor()	(is_secure_guest() ? dtl_cache_ctor : NULL)
+
 #else /* CONFIG_PPC_SVM */
 
 static inline bool is_secure_guest(void)
@@ -22,5 +25,7 @@ static inline bool is_secure_guest(void)
 	return false;
 }
 
+#define get_dtl_cache_ctor() NULL
+
 #endif /* CONFIG_PPC_SVM */
 #endif /* _ASM_POWERPC_SVM_H */
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index ab3d59aeacca..a420ef4c9d8e 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_IBMVIO)		+= vio.o
 obj-$(CONFIG_IBMEBUS)		+= ibmebus.o
 obj-$(CONFIG_PAPR_SCM)		+= papr_scm.o
 obj-$(CONFIG_PPC_SPLPAR)	+= vphn.o
+obj-$(CONFIG_PPC_SVM)		+= svm.o
 
 ifdef CONFIG_PPC_PSERIES
 obj-$(CONFIG_SUSPEND)		+= suspend.o
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index f5940cc71c37..d8930c3a8a11 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -69,6 +69,7 @@
 #include <asm/security_features.h>
 #include <asm/asm-const.h>
 #include <asm/swiotlb.h>
+#include <asm/svm.h>
 
 #include "pseries.h"
 #include "../../../../drivers/pci/pci.h"
@@ -297,8 +298,10 @@ static inline int alloc_dispatch_logs(void)
 
 static int alloc_dispatch_log_kmem_cache(void)
 {
+	void (*ctor)(void *) = get_dtl_cache_ctor();
+
 	dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES,
-						DISPATCH_LOG_BYTES, 0, NULL);
+						DISPATCH_LOG_BYTES, 0, ctor);
 	if (!dtl_cache) {
 		pr_warn("Failed to create dispatch trace log buffer cache\n");
 		pr_warn("Stolen time statistics will be unreliable\n");
diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c
new file mode 100644
index 000000000000..2b2b1a77ca1e
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Secure VM platform
+ *
+ * Copyright 2018 IBM Corporation
+ * Author: Anshuman Khandual <khandual@linux.vnet.ibm.com>
+ */
+
+#include <linux/mm.h>
+#include <asm/ultravisor.h>
+
+/* There's one dispatch log per CPU. */
+#define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE)
+
+static struct page *dtl_page_store[NR_DTL_PAGE];
+static long dtl_nr_pages;
+
+static bool is_dtl_page_shared(struct page *page)
+{
+	long i;
+
+	for (i = 0; i < dtl_nr_pages; i++)
+		if (dtl_page_store[i] == page)
+			return true;
+
+	return false;
+}
+
+void dtl_cache_ctor(void *addr)
+{
+	unsigned long pfn = PHYS_PFN(__pa(addr));
+	struct page *page = pfn_to_page(pfn);
+
+	if (!is_dtl_page_shared(page)) {
+		dtl_page_store[dtl_nr_pages] = page;
+		dtl_nr_pages++;
+		WARN_ON(dtl_nr_pages >= NR_DTL_PAGE);
+		uv_share_page(pfn, 1);
+	}
+}

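The hook being exploited here is that a kmem_cache constructor runs once
per object when the backing slab page is first populated, which makes it
a natural place for a one-time page conversion. A minimal sketch of the
pattern outside the DTL context (the demo_* names are hypothetical):

	#include <linux/slab.h>
	#include <linux/string.h>

	static struct kmem_cache *demo_cache;

	static void demo_ctor(void *addr)
	{
		/* Runs once per object on a freshly allocated slab page;
		 * dtl_cache_ctor() uses the same hook to call
		 * uv_share_page() exactly once per page. */
		memset(addr, 0, 64);
	}

	static int __init demo_init(void)
	{
		demo_cache = kmem_cache_create("demo", 64, 64, 0, demo_ctor);
		return demo_cache ? 0 : -ENOMEM;
	}
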
From 256ba2c1689efd4f5383cf7ebe2f9970c198b79d Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram@us.ibm.com>
Date: Mon, 19 Aug 2019 23:13:20 -0300
Subject: [PATCH 139/264] powerpc/pseries/svm: Unshare all pages before
 kexecing a new kernel

A new kernel deserves a clean slate. Any pages shared with the hypervisor
are unshared before invoking the new kernel. However, there are exceptions:
if the new kernel is invoked to dump the current kernel, or if there is an
explicit request to preserve the state of the current kernel, unsharing
of pages is skipped.

NOTE: While testing crashkernel, make sure at least 256M is reserved for
the crash kernel. Otherwise the SWIOTLB allocation will fail and the crash
kernel will fail to boot.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820021326.6884-11-bauerman@linux.ibm.com
---
 arch/powerpc/include/asm/ultravisor-api.h | 1 +
 arch/powerpc/include/asm/ultravisor.h     | 5 +++++
 arch/powerpc/kernel/machine_kexec_64.c    | 9 +++++++++
 3 files changed, 15 insertions(+)

diff --git a/arch/powerpc/include/asm/ultravisor-api.h b/arch/powerpc/include/asm/ultravisor-api.h
index 0f5b2d718bfc..4fcda1d5793d 100644
--- a/arch/powerpc/include/asm/ultravisor-api.h
+++ b/arch/powerpc/include/asm/ultravisor-api.h
@@ -28,5 +28,6 @@
 #define UV_ESM				0xF110
 #define UV_SHARE_PAGE			0xF130
 #define UV_UNSHARE_PAGE			0xF134
+#define UV_UNSHARE_ALL_PAGES		0xF140
 
 #endif /* _ASM_POWERPC_ULTRAVISOR_API_H */
diff --git a/arch/powerpc/include/asm/ultravisor.h b/arch/powerpc/include/asm/ultravisor.h
index e5c8413de06f..b1bc2e043ed4 100644
--- a/arch/powerpc/include/asm/ultravisor.h
+++ b/arch/powerpc/include/asm/ultravisor.h
@@ -41,4 +41,9 @@ static inline int uv_unshare_page(u64 pfn, u64 npages)
 	return ucall_norets(UV_UNSHARE_PAGE, pfn, npages);
 }
 
+static inline int uv_unshare_all_pages(void)
+{
+	return ucall_norets(UV_UNSHARE_ALL_PAGES);
+}
+
 #endif	/* _ASM_POWERPC_ULTRAVISOR_H */
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index 18481b0e2788..04a7cba58eff 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -29,6 +29,8 @@
 #include <asm/smp.h>
 #include <asm/hw_breakpoint.h>
 #include <asm/asm-prototypes.h>
+#include <asm/svm.h>
+#include <asm/ultravisor.h>
 
 int default_machine_kexec_prepare(struct kimage *image)
 {
@@ -327,6 +329,13 @@ void default_machine_kexec(struct kimage *image)
 #ifdef CONFIG_PPC_PSERIES
 	kexec_paca.lppaca_ptr = NULL;
 #endif
+
+	if (is_secure_guest() && !(image->preserve_context ||
+				   image->type == KEXEC_TYPE_CRASH)) {
+		uv_unshare_all_pages();
+		printk("kexec: Unshared all shared pages.\n");
+	}
+
 	paca_ptrs[kexec_paca.paca_index] = &kexec_paca;
 
 	setup_paca(&kexec_paca);

From 734560ac39aeb2516419c5878856df011f794a74 Mon Sep 17 00:00:00 2001
From: Ryan Grimm <grimm@linux.vnet.ibm.com>
Date: Mon, 19 Aug 2019 23:13:21 -0300
Subject: [PATCH 140/264] powerpc/pseries/svm: Export guest SVM status to user
 space via sysfs

User space might want to know it is running in a secure VM. It can't
simply execute mfmsr because mfmsr is a privileged instruction.

The solution here is to create a cpu attribute:

/sys/devices/system/cpu/svm

which will read 0 or 1 based on the S bit of the current CPU.

Signed-off-by: Ryan Grimm <grimm@linux.vnet.ibm.com>
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820021326.6884-12-bauerman@linux.ibm.com
---
 .../ABI/testing/sysfs-devices-system-cpu      | 10 ++++++++++
 arch/powerpc/kernel/sysfs.c                   | 20 +++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 5f7d7b14fa44..06d0931119cc 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -562,3 +562,13 @@ Description:	Umwait control
 			  or C0.2 state. The time is an unsigned 32-bit number.
 			  Note that a value of zero means there is no limit.
 			  Low order two bits must be zero.
+
+What:		/sys/devices/system/cpu/svm
+Date:		August 2019
+Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
+		Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
+Description:	Secure Virtual Machine
+
+		If 1, it means the system is using the Protected Execution
+		Facility in POWER9 and newer processors, i.e., it is a Secure
+		Virtual Machine.
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index e2147d7c9e72..80a676da11cb 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -19,6 +19,7 @@
 #include <asm/smp.h>
 #include <asm/pmc.h>
 #include <asm/firmware.h>
+#include <asm/svm.h>
 
 #include "cacheinfo.h"
 #include "setup.h"
@@ -715,6 +716,23 @@ static struct device_attribute pa6t_attrs[] = {
 #endif /* HAS_PPC_PMC_PA6T */
 #endif /* HAS_PPC_PMC_CLASSIC */
 
+#ifdef CONFIG_PPC_SVM
+static ssize_t show_svm(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", is_secure_guest());
+}
+static DEVICE_ATTR(svm, 0444, show_svm, NULL);
+
+static void create_svm_file(void)
+{
+	device_create_file(cpu_subsys.dev_root, &dev_attr_svm);
+}
+#else
+static void create_svm_file(void)
+{
+}
+#endif /* CONFIG_PPC_SVM */
+
 static int register_cpu_online(unsigned int cpu)
 {
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
@@ -1058,6 +1076,8 @@ static int __init topology_init(void)
 	sysfs_create_dscr_default();
 #endif /* CONFIG_PPC64 */
 
+	create_svm_file();
+
 	return 0;
 }
 subsys_initcall(topology_init);

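From user space the check then reduces to reading one sysfs file,
equivalent to "cat /sys/devices/system/cpu/svm". A sketch:

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/devices/system/cpu/svm", "r");
		int svm = 0;

		if (f && fscanf(f, "%d", &svm) == 1)
			printf("secure VM: %s\n", svm ? "yes" : "no");
		if (f)
			fclose(f);
		return 0;
	}
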
From 4edaac512c8a31b0aa715b58ee00ee06df642fa2 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.ibm.com>
Date: Mon, 19 Aug 2019 23:13:22 -0300
Subject: [PATCH 141/264] powerpc/pseries/svm: Disable doorbells in SVM guests

Normally, the HV emulates some instructions like MSGSNDP, MSGCLRP
from a KVM guest. To emulate the instructions, it must first read
the instruction from the guest's memory and decode its parameters.

However, for a secure guest (aka SVM), the page containing the
instruction is in secure memory, which the HV cannot access directly.
It would need the Ultravisor (UV) to facilitate accessing the
instruction and its parameters, but the UV does not currently
support such accesses.

Until the UV has such support, disable doorbells in SVMs. This might
incur a performance hit but that is yet to be quantified.

With this patch applied (needed only in SVMs, not in the HV) we
are able to launch SVM guests with multi-core support, e.g.:

	qemu -smp sockets=2,cores=2,threads=2.

Fix suggested by Benjamin Herrenschmidt. Thanks to input from
Paul Mackerras, Ram Pai and Michael Anderson.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.ibm.com>
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820021326.6884-13-bauerman@linux.ibm.com
---
 arch/powerpc/platforms/pseries/smp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index 4b3ef8d9c63f..ad61e90032da 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -41,6 +41,7 @@
 #include <asm/dbell.h>
 #include <asm/plpar_wrappers.h>
 #include <asm/code-patching.h>
+#include <asm/svm.h>
 
 #include "pseries.h"
 #include "offline_states.h"
@@ -221,7 +222,7 @@ static __init void pSeries_smp_probe_xics(void)
 {
 	xics_smp_probe();
 
-	if (cpu_has_feature(CPU_FTR_DBELL))
+	if (cpu_has_feature(CPU_FTR_DBELL) && !is_secure_guest())
 		smp_ops->cause_ipi = smp_pseries_cause_ipi;
 	else
 		smp_ops->cause_ipi = icp_ops->cause_ipi;

From edea902c1c1efb855f77e041f9daf1abe7a9768a Mon Sep 17 00:00:00 2001
From: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Date: Mon, 19 Aug 2019 23:13:23 -0300
Subject: [PATCH 142/264] powerpc/pseries/iommu: Don't use dma_iommu_ops on
 secure guests

Secure guest memory is inaccessible to devices so regular DMA isn't
possible.

In that case set devices' dma_map_ops to NULL so that the generic
DMA code path will use SWIOTLB to bounce buffers for DMA.

Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820021326.6884-14-bauerman@linux.ibm.com
---
 arch/powerpc/platforms/pseries/iommu.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index df7db33ca93b..6ba081dd61c9 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -36,6 +36,7 @@
 #include <asm/udbg.h>
 #include <asm/mmzone.h>
 #include <asm/plpar_wrappers.h>
+#include <asm/svm.h>
 
 #include "pseries.h"
 
@@ -1319,7 +1320,15 @@ void iommu_init_early_pSeries(void)
 	of_reconfig_notifier_register(&iommu_reconfig_nb);
 	register_memory_notifier(&iommu_mem_nb);
 
-	set_pci_dma_ops(&dma_iommu_ops);
+	/*
+	 * Secure guest memory is inaccessible to devices so regular DMA isn't
+	 * possible.
+	 *
+	 * In that case keep devices' dma_map_ops as NULL so that the generic
+	 * DMA code path will use SWIOTLB to bounce buffers for DMA.
+	 */
+	if (!is_secure_guest())
+		set_pci_dma_ops(&dma_iommu_ops);
 }
 
 static int __init disable_multitce(char *str)

From 2efbc58f157a39ad9e9199b92d9c47736023a2fe Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Mon, 19 Aug 2019 23:13:24 -0300
Subject: [PATCH 143/264] powerpc/pseries/svm: Force SWIOTLB for secure guests

SWIOTLB checks the range of incoming CPU addresses to be bounced and sees if
the device can access them through its DMA window without requiring bouncing.
In such cases it just chooses to skip bouncing. But for cases like secure
guests on the powerpc platform, all addresses need to be bounced into the
shared pool of memory because the host cannot access them otherwise. Hence
the need to bounce is not related to the device's DMA window, and the use of
bounce buffers is forced by setting swiotlb_force.

Also, connect the shared memory conversion functions into the
ARCH_HAS_MEM_ENCRYPT hooks and call swiotlb_update_mem_attributes() to
convert SWIOTLB's memory pool to shared memory.

Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
[ bauerman: Use ARCH_HAS_MEM_ENCRYPT hooks to share swiotlb memory pool. ]
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820021326.6884-15-bauerman@linux.ibm.com
---
 arch/powerpc/include/asm/mem_encrypt.h | 26 +++++++++++++++
 arch/powerpc/platforms/pseries/Kconfig |  3 ++
 arch/powerpc/platforms/pseries/svm.c   | 45 ++++++++++++++++++++++++++
 3 files changed, 74 insertions(+)
 create mode 100644 arch/powerpc/include/asm/mem_encrypt.h

diff --git a/arch/powerpc/include/asm/mem_encrypt.h b/arch/powerpc/include/asm/mem_encrypt.h
new file mode 100644
index 000000000000..ba9dab07c1be
--- /dev/null
+++ b/arch/powerpc/include/asm/mem_encrypt.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * SVM helper functions
+ *
+ * Copyright 2018 IBM Corporation
+ */
+
+#ifndef _ASM_POWERPC_MEM_ENCRYPT_H
+#define _ASM_POWERPC_MEM_ENCRYPT_H
+
+#include <asm/svm.h>
+
+static inline bool mem_encrypt_active(void)
+{
+	return is_secure_guest();
+}
+
+static inline bool force_dma_unencrypted(struct device *dev)
+{
+	return is_secure_guest();
+}
+
+int set_memory_encrypted(unsigned long addr, int numpages);
+int set_memory_decrypted(unsigned long addr, int numpages);
+
+#endif /* _ASM_POWERPC_MEM_ENCRYPT_H */
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index d09deb05bb66..9e35cddddf73 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -149,6 +149,9 @@ config PAPR_SCM
 config PPC_SVM
 	bool "Secure virtual machine (SVM) support for POWER"
 	depends on PPC_PSERIES
+	select SWIOTLB
+	select ARCH_HAS_MEM_ENCRYPT
+	select ARCH_HAS_FORCE_DMA_UNENCRYPTED
 	help
 	 There are certain POWER platforms which support secure guests using
 	 the Protected Execution Facility, with the help of an Ultravisor
diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c
index 2b2b1a77ca1e..40c0637203d5 100644
--- a/arch/powerpc/platforms/pseries/svm.c
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -7,8 +7,53 @@
  */
 
 #include <linux/mm.h>
+#include <asm/machdep.h>
+#include <asm/svm.h>
+#include <asm/swiotlb.h>
 #include <asm/ultravisor.h>
 
+static int __init init_svm(void)
+{
+	if (!is_secure_guest())
+		return 0;
+
+	/* Don't release the SWIOTLB buffer. */
+	ppc_swiotlb_enable = 1;
+
+	/*
+	 * Since the guest memory is inaccessible to the host, devices always
+	 * need to use the SWIOTLB buffer for DMA even if dma_capable() says
+	 * otherwise.
+	 */
+	swiotlb_force = SWIOTLB_FORCE;
+
+	/* Share the SWIOTLB buffer with the host. */
+	swiotlb_update_mem_attributes();
+
+	return 0;
+}
+machine_early_initcall(pseries, init_svm);
+
+int set_memory_encrypted(unsigned long addr, int numpages)
+{
+	if (!PAGE_ALIGNED(addr))
+		return -EINVAL;
+
+	uv_unshare_page(PHYS_PFN(__pa(addr)), numpages);
+
+	return 0;
+}
+
+int set_memory_decrypted(unsigned long addr, int numpages)
+{
+	if (!PAGE_ALIGNED(addr))
+		return -EINVAL;
+
+	uv_share_page(PHYS_PFN(__pa(addr)), numpages);
+
+	return 0;
+}
+
 /* There's one dispatch log per CPU. */
 #define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE)
 

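With the hooks above wired up, the generic code does the heavy lifting:
swiotlb_update_mem_attributes() calls set_memory_decrypted() on the
bounce pool, which on a pseries SVM resolves to uv_share_page(). The
same pair of calls can be used directly on any page-aligned range; a
hedged sketch (addr and npages are hypothetical):

	/* Share npages pages at addr (page-aligned) with the host. */
	if (set_memory_decrypted(addr, npages))
		pr_err("could not share buffer with the hypervisor\n");

	/* Later, pull the range back into secure memory. */
	if (set_memory_encrypted(addr, npages))
		pr_err("could not unshare buffer\n");
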
From bf75a8db72e9454aadeca0276f6e6b0e9161861c Mon Sep 17 00:00:00 2001
From: Ryan Grimm <grimm@linux.vnet.ibm.com>
Date: Mon, 19 Aug 2019 23:13:26 -0300
Subject: [PATCH 144/264] powerpc/configs: Enable secure guest support in
 pseries and ppc64 defconfigs

Enables running as a secure guest on platforms with an Ultravisor.

Signed-off-by: Ryan Grimm <grimm@linux.vnet.ibm.com>
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190820021326.6884-17-bauerman@linux.ibm.com
---
 arch/powerpc/configs/ppc64_defconfig   | 1 +
 arch/powerpc/configs/pseries_defconfig | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index dc83fefa04f7..b250e6f5a7ca 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -29,6 +29,7 @@ CONFIG_DTL=y
 CONFIG_SCANLOG=m
 CONFIG_PPC_SMLPAR=y
 CONFIG_IBMEBUS=y
+CONFIG_PPC_SVM=y
 CONFIG_PPC_MAPLE=y
 CONFIG_PPC_PASEMI=y
 CONFIG_PPC_PASEMI_IOMMU=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index 38abc9c1770a..26126b4d4de3 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -42,6 +42,7 @@ CONFIG_DTL=y
 CONFIG_SCANLOG=m
 CONFIG_PPC_SMLPAR=y
 CONFIG_IBMEBUS=y
+CONFIG_PPC_SVM=y
 # CONFIG_PPC_PMAC is not set
 CONFIG_RTAS_FLASH=m
 CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y

From 0be9f7fd5d8fd984b34ad98838ef7cfd0079ddae Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:26 +1000
Subject: [PATCH 145/264] powerpc/64s/exception: machine check fwnmi remove HV
 case

fwnmi does not trigger in HV mode, so remove the always-true feature test.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-2-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 520804351601..515ea0243ff2 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1026,9 +1026,8 @@ TRAMP_REAL_BEGIN(machine_check_pSeries)
 	.globl machine_check_fwnmi
 machine_check_fwnmi:
 	EXCEPTION_PROLOG_0 PACA_EXMC
-BEGIN_FTR_SECTION
 	b	machine_check_common_early
-END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
+
 machine_check_pSeries_0:
 	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
 	/*

From 1039f62431e2aa16487cd6d64bc841d71f6465b8 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:27 +1000
Subject: [PATCH 146/264] powerpc/64s/exception: machine check remove bitrotted
 comment

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-3-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 515ea0243ff2..8cf4e44d2d76 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -933,10 +933,6 @@ EXC_COMMON_BEGIN(system_reset_common)
 
 
 EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
-	/* This is moved out of line as it can be patched by FW, but
-	 * some code path might still want to branch into the original
-	 * vector
-	 */
 	EXCEPTION_PROLOG_0 PACA_EXMC
 BEGIN_FTR_SECTION
 	b	machine_check_common_early

From 19dbe673e62b076f00ae2841fcf5898b4728d5ab Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:28 +1000
Subject: [PATCH 147/264] powerpc/64s/exception: machine check fix KVM guest
 test

The machine_check_handle_early hypervisor guest test is skipped if
!HVMODE or MSR[HV]=0, which is wrong for PR or nested hypervisors
that could be running a guest in this state.

Test HSTATE_IN_GUEST up front and use that to branch out to the KVM
handler, then MSR[PR] alone can test for this kernel's userspace.
This matches all other interrupt handling.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-4-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 33 +++++++++++-----------------
 1 file changed, 13 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 8cf4e44d2d76..7c28c22fc6a6 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1108,11 +1108,8 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
 	bl	machine_check_early
 	std	r3,RESULT(r1)	/* Save result */
 	ld	r12,_MSR(r1)
-BEGIN_FTR_SECTION
-	b	4f
-END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
 
-#ifdef	CONFIG_PPC_P7_NAP
+#ifdef CONFIG_PPC_P7_NAP
 	/*
 	 * Check if thread was in power saving mode. We come here when any
 	 * of the following is true:
@@ -1128,30 +1125,26 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #endif
 
-	/*
-	 * Check if we are coming from hypervisor userspace. If yes then we
-	 * continue in host kernel in V mode to deliver the MC event.
-	 */
-	rldicl.	r11,r12,4,63		/* See if MC hit while in HV mode. */
-	beq	5f
-4:	andi.	r11,r12,MSR_PR		/* See if coming from user. */
-	bne	9f			/* continue in V mode if we are. */
-
-5:
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-BEGIN_FTR_SECTION
 	/*
-	 * We are coming from kernel context. Check if we are coming from
-	 * guest. if yes, then we can continue. We will fall through
-	 * do_kvm_200->kvmppc_interrupt to deliver the MC event to guest.
+	 * Check if we are coming from guest. If yes, then run the normal
+	 * exception handler which will take the do_kvm_200->kvmppc_interrupt
+	 * branch to deliver the MC event to guest.
 	 */
 	lbz	r11,HSTATE_IN_GUEST(r13)
 	cmpwi	r11,0			/* Check if coming from guest */
 	bne	9f			/* continue if we are. */
-END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 #endif
+
 	/*
-	 * At this point we are not sure about what context we come from.
+	 * Check if we are coming from userspace. If yes, then run the normal
+	 * exception handler which will deliver the MC event to this kernel.
+	 */
+	andi.	r11,r12,MSR_PR		/* See if coming from user. */
+	bne	9f			/* continue in V mode if we are. */
+
+	/*
+	 * At this point we are coming from kernel context.
 	 * Queue up the MCE event and return from the interrupt.
 	 * But before that, check if this is an un-recoverable exception.
 	 * If yes, then stay on emergency stack and panic.

From fe9d482b1d87c76441492e51d866cee652eee4d5 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:29 +1000
Subject: [PATCH 148/264] powerpc/64s/exception: machine check adjust RFI
 target

The host kernel delivery case for powernv does RFI_TO_USER_OR_KERNEL,
but should just use RFI_TO_KERNEL which makes it clear this is not a
user case.

This is not a bug because RFI_TO_USER_OR_KERNEL deals with kernel
returns just fine.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-5-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 7c28c22fc6a6..9a7cc3edc721 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1184,7 +1184,7 @@ BEGIN_FTR_SECTION
 	 */
 	bl	machine_check_queue_event
 	MACHINE_CHECK_HANDLER_WINDUP
-	RFI_TO_USER_OR_KERNEL
+	RFI_TO_KERNEL
 FTR_SECTION_ELSE
 	/*
 	 * pSeries: Return from MC interrupt. Before that stay on emergency

From b5c27f7c5679c3726148fd25ad220b4560d210cf Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:30 +1000
Subject: [PATCH 149/264] powerpc/64s/exception: machine check pseries should
 always run the early handler

Now that pseries with fwnmi registered runs the early machine check
handler, there is no good reason to special case the non-fwnmi case
and skip the early handler. Reducing the code and number of paths is
a top priority for asm code; it's better to handle this in C where
possible (and the pseries early handler is a no-op if fwnmi is not
registered).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-6-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 9a7cc3edc721..73be4f9027de 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -934,11 +934,7 @@ EXC_COMMON_BEGIN(system_reset_common)
 
 EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
 	EXCEPTION_PROLOG_0 PACA_EXMC
-BEGIN_FTR_SECTION
 	b	machine_check_common_early
-FTR_SECTION_ELSE
-	b	machine_check_pSeries_0
-ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
 EXC_REAL_END(machine_check, 0x200, 0x100)
 EXC_VIRT_NONE(0x4200, 0x100)
 TRAMP_REAL_BEGIN(machine_check_common_early)

From fa2760eca504f554a5adb6cd2f576828933c4c7b Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:31 +1000
Subject: [PATCH 150/264] powerpc/64s/exception: machine check remove
 machine_check_pSeries_0 branch

This label has only one caller, so unwind the branch and move it
inline. The location of the comment is adjusted to match similar
one in system reset.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-7-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 73be4f9027de..577d93a83d05 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1014,20 +1014,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 	b	1b
 	b	.	/* prevent speculative execution */
 
-TRAMP_REAL_BEGIN(machine_check_pSeries)
-	.globl machine_check_fwnmi
-machine_check_fwnmi:
+#ifdef CONFIG_PPC_PSERIES
+TRAMP_REAL_BEGIN(machine_check_fwnmi)
 	EXCEPTION_PROLOG_0 PACA_EXMC
 	b	machine_check_common_early
-
-machine_check_pSeries_0:
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
-	/*
-	 * MSR_RI is not enabled, because PACA_EXMC is being used, so a
-	 * nested machine check corrupts it. machine_check_common enables
-	 * MSR_RI.
-	 */
-	EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0
+#endif
 
 TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
 
@@ -1197,7 +1188,13 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
 	/* Deliver the machine check to host kernel in V mode. */
 	MACHINE_CHECK_HANDLER_WINDUP
 	EXCEPTION_PROLOG_0 PACA_EXMC
-	b	machine_check_pSeries_0
+	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
+	EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0
+	/*
+	 * MSR_RI is not enabled, because PACA_EXMC is being used, so a
+	 * nested machine check corrupts it. machine_check_common enables
+	 * MSR_RI.
+	 */
 
 EXC_COMMON_BEGIN(unrecover_mce)
 	/* Invoke machine_check_exception to print MCE event and panic. */

From 0b66370c61fcf5fcc1d6901013e110284da6e2bb Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:32 +1000
Subject: [PATCH 151/264] powerpc/64s/exception: machine check use correct cfar
 for late handler

Bare metal machine checks run an "early" handler in real mode before
running the main handler, which reports the event.

The main handler runs exactly like a normal interrupt handler, after
the "windup", which sets registers back as they were at interrupt
entry. The CFAR does not get restored by the windup code, so its value
will be wrong when the handler is run.

Restore the CFAR to the saved value before running the late handler.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-8-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 577d93a83d05..b7c4149cb91c 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1186,6 +1186,10 @@ FTR_SECTION_ELSE
 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
 9:
 	/* Deliver the machine check to host kernel in V mode. */
+BEGIN_FTR_SECTION
+	ld	r10,ORIG_GPR3(r1)
+	mtspr	SPRN_CFAR,r10
+END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 	MACHINE_CHECK_HANDLER_WINDUP
 	EXCEPTION_PROLOG_0 PACA_EXMC
 	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0

From 7290f3b3d3e66b54720f23079ffc60e0b7bbb0cc Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:33 +1000
Subject: [PATCH 152/264] powerpc/64s/powernv: machine check dump SLB contents

Re-use the code introduced in pseries to save and dump the contents
of the SLB in the case of an SLB-involved machine check exception.

This patch also avoids allocating the SLB save array on pseries when
radix is enabled.
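
A condensed sketch of the control flow the hunks below wire up
(paraphrased from the hunks, not literal code):

	/* At machine check time, in real mode, first nesting level only: */
	if (local_paca->in_mce == 1)
		slb_save_contents(local_paca->mce_faulty_slbs);

	/* Later, when the queued event is printed: */
	if (evt->error_type == MCE_ERROR_TYPE_SLB)
		slb_dump_contents(local_paca->mce_faulty_slbs);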

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-9-npiggin@gmail.com
---
 arch/powerpc/kernel/mce.c              |  6 ++++++
 arch/powerpc/kernel/mce_power.c        |  4 ++++
 arch/powerpc/platforms/powernv/setup.c |  9 +++++++++
 arch/powerpc/platforms/pseries/setup.c | 22 ++++++++++++----------
 4 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index ec4b3e1087be..04280a5871fc 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -511,6 +511,12 @@ void machine_check_print_event_info(struct machine_check_event *evt,
 	subtype = evt->error_class < ARRAY_SIZE(mc_error_class) ?
 		mc_error_class[evt->error_class] : "Unknown";
 	printk("%sMCE: CPU%d: %s\n", level, evt->cpu, subtype);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* Display faulty slb contents for SLB errors. */
+	if (evt->error_type == MCE_ERROR_TYPE_SLB)
+		slb_dump_contents(local_paca->mce_faulty_slbs);
+#endif
 }
 EXPORT_SYMBOL_GPL(machine_check_print_event_info);
 
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index b6cbe3449358..356e7b99f661 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -405,6 +405,8 @@ static int mce_handle_ierror(struct pt_regs *regs,
 		/* attempt to correct the error */
 		switch (table[i].error_type) {
 		case MCE_ERROR_TYPE_SLB:
+			if (local_paca->in_mce == 1)
+				slb_save_contents(local_paca->mce_faulty_slbs);
 			handled = mce_flush(MCE_FLUSH_SLB);
 			break;
 		case MCE_ERROR_TYPE_ERAT:
@@ -490,6 +492,8 @@ static int mce_handle_derror(struct pt_regs *regs,
 		/* attempt to correct the error */
 		switch (table[i].error_type) {
 		case MCE_ERROR_TYPE_SLB:
+			if (local_paca->in_mce == 1)
+				slb_save_contents(local_paca->mce_faulty_slbs);
 			if (mce_flush(MCE_FLUSH_SLB))
 				handled = 1;
 			break;
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index a5e52f9eed3c..83498604d322 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -24,6 +24,7 @@
 #include <linux/bug.h>
 #include <linux/pci.h>
 #include <linux/cpufreq.h>
+#include <linux/memblock.h>
 
 #include <asm/machdep.h>
 #include <asm/firmware.h>
@@ -166,6 +167,14 @@ static void __init pnv_init(void)
 	else
 #endif
 		add_preferred_console("hvc", 0, NULL);
+
+	if (!radix_enabled()) {
+		int i;
+
+		/* Allocate per cpu area to save old slb contents during MCE */
+		for_each_possible_cpu(i)
+			paca_ptrs[i]->mce_faulty_slbs = memblock_alloc_node(sizeof(struct slb_entry) * mmu_slb_size, __alignof__(*paca_ptrs[i]->mce_faulty_slbs), cpu_to_node(i));
+	}
 }
 
 static void __init pnv_init_IRQ(void)
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index d8930c3a8a11..b955d54628ff 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -142,17 +142,19 @@ static void __init fwnmi_init(void)
 	}
 
 #ifdef CONFIG_PPC_BOOK3S_64
-	/* Allocate per cpu slb area to save old slb contents during MCE */
-	size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus;
-	slb_ptr = memblock_alloc_try_nid_raw(size, sizeof(struct slb_entry),
-					MEMBLOCK_LOW_LIMIT, ppc64_rma_size,
-					NUMA_NO_NODE);
-	if (!slb_ptr)
-		panic("Failed to allocate %zu bytes below %pa for slb area\n",
-		      size, &ppc64_rma_size);
+	if (!radix_enabled()) {
+		/* Allocate per cpu area to save old slb contents during MCE */
+		size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus;
+		slb_ptr = memblock_alloc_try_nid_raw(size,
+				sizeof(struct slb_entry), MEMBLOCK_LOW_LIMIT,
+				ppc64_rma_size, NUMA_NO_NODE);
+		if (!slb_ptr)
+			panic("Failed to allocate %zu bytes below %pa for slb area\n",
+			      size, &ppc64_rma_size);
 
-	for_each_possible_cpu(i)
-		paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i);
+		for_each_possible_cpu(i)
+			paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i);
+	}
 #endif
 }
 

From 9ca766f9891d23743b4e1a7b1cafdc63723cd6a7 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:34 +1000
Subject: [PATCH 153/264] powerpc/64s/pseries: machine check convert to use
 common event code

The common machine_check_event data structures and queues are mostly
platform independent, with powernv decoding SRR1/DSISR/etc. into
machine_check_event objects.

This patch converts pseries to use this infrastructure by decoding
fwnmi/rtas data into machine_check_event objects.

This allows a subsequent change to use the queueing to delay virtual
mode handling of machine checks that occur in kernel space, where it
is unsafe to switch immediately to virtual mode, similarly to
powernv.
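
As one example of the decode, the severity mapping added below is an
if/else chain; assuming (as asm/rtas.h defines them) that the
RTAS_SEVERITY_* constants are the consecutive values 0x0-0x5, the same
mapping could be written as a lookup table. A hedged sketch, not part
of the patch:

	static const unsigned int rtas_to_mce_sev[] = {
		[RTAS_SEVERITY_NO_ERROR]   = MCE_SEV_NO_ERROR,
		[RTAS_SEVERITY_EVENT]      = MCE_SEV_WARNING,
		[RTAS_SEVERITY_WARNING]    = MCE_SEV_WARNING,
		[RTAS_SEVERITY_ERROR_SYNC] = MCE_SEV_SEVERE,
		[RTAS_SEVERITY_ERROR]      = MCE_SEV_SEVERE,
		[RTAS_SEVERITY_FATAL]      = MCE_SEV_FATAL,
	};

	mce_err.severity = (severity <= RTAS_SEVERITY_FATAL) ?
			rtas_to_mce_sev[severity] : MCE_SEV_FATAL;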

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
[mpe: Fix implicit fallthrough warnings in mce_handle_error()]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-10-npiggin@gmail.com
---
 arch/powerpc/include/asm/mce.h       |   6 +
 arch/powerpc/kernel/mce.c            |  34 +-
 arch/powerpc/platforms/pseries/ras.c | 478 ++++++++++++---------------
 3 files changed, 242 insertions(+), 276 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 19a33707d5ef..6a6ddaabdb34 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -30,6 +30,10 @@ enum MCE_Disposition {
 enum MCE_Initiator {
 	MCE_INITIATOR_UNKNOWN = 0,
 	MCE_INITIATOR_CPU = 1,
+	MCE_INITIATOR_PCI = 2,
+	MCE_INITIATOR_ISA = 3,
+	MCE_INITIATOR_MEMORY = 4,
+	MCE_INITIATOR_POWERMGM = 5,
 };
 
 enum MCE_ErrorType {
@@ -41,6 +45,8 @@ enum MCE_ErrorType {
 	MCE_ERROR_TYPE_USER = 5,
 	MCE_ERROR_TYPE_RA = 6,
 	MCE_ERROR_TYPE_LINK = 7,
+	MCE_ERROR_TYPE_DCACHE = 8,
+	MCE_ERROR_TYPE_ICACHE = 9,
 };
 
 enum MCE_ErrorClass {
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 04280a5871fc..34c1001e9e8b 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -325,7 +325,7 @@ static void machine_check_process_queued_event(struct irq_work *work)
 void machine_check_print_event_info(struct machine_check_event *evt,
 				    bool user_mode, bool in_guest)
 {
-	const char *level, *sevstr, *subtype, *err_type;
+	const char *level, *sevstr, *subtype, *err_type, *initiator;
 	uint64_t ea = 0, pa = 0;
 	int n = 0;
 	char dar_str[50];
@@ -410,6 +410,28 @@ void machine_check_print_event_info(struct machine_check_event *evt,
 		break;
 	}
 
+	switch (evt->initiator) {
+	case MCE_INITIATOR_CPU:
+		initiator = "CPU";
+		break;
+	case MCE_INITIATOR_PCI:
+		initiator = "PCI";
+		break;
+	case MCE_INITIATOR_ISA:
+		initiator = "ISA";
+		break;
+	case MCE_INITIATOR_MEMORY:
+		initiator = "Memory";
+		break;
+	case MCE_INITIATOR_POWERMGM:
+		initiator = "Power Management";
+		break;
+	case MCE_INITIATOR_UNKNOWN:
+	default:
+		initiator = "Unknown";
+		break;
+	}
+
 	switch (evt->error_type) {
 	case MCE_ERROR_TYPE_UE:
 		err_type = "UE";
@@ -476,6 +498,14 @@ void machine_check_print_event_info(struct machine_check_event *evt,
 		if (evt->u.link_error.effective_address_provided)
 			ea = evt->u.link_error.effective_address;
 		break;
+	case MCE_ERROR_TYPE_DCACHE:
+		err_type = "D-Cache";
+		subtype = "Unknown";
+		break;
+	case MCE_ERROR_TYPE_ICACHE:
+		err_type = "I-Cache";
+		subtype = "Unknown";
+		break;
 	default:
 	case MCE_ERROR_TYPE_UNKNOWN:
 		err_type = "Unknown";
@@ -508,6 +538,8 @@ void machine_check_print_event_info(struct machine_check_event *evt,
 			level, evt->cpu, evt->srr0, (void *)evt->srr0, pa_str);
 	}
 
+	printk("%sMCE: CPU%d: Initiator %s\n", level, evt->cpu, initiator);
+
 	subtype = evt->error_class < ARRAY_SIZE(mc_error_class) ?
 		mc_error_class[evt->error_class] : "Unknown";
 	printk("%sMCE: CPU%d: %s\n", level, evt->cpu, subtype);
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index f16fdd0f71f7..3acdcc3bb908 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -76,6 +76,7 @@ struct pseries_mc_errorlog {
 #define MC_ERROR_TYPE_UE		0x00
 #define MC_ERROR_TYPE_SLB		0x01
 #define MC_ERROR_TYPE_ERAT		0x02
+#define MC_ERROR_TYPE_UNKNOWN		0x03
 #define MC_ERROR_TYPE_TLB		0x04
 #define MC_ERROR_TYPE_D_CACHE		0x05
 #define MC_ERROR_TYPE_I_CACHE		0x07
@@ -87,6 +88,9 @@ struct pseries_mc_errorlog {
 #define MC_ERROR_UE_LOAD_STORE			3
 #define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE	4
 
+#define UE_EFFECTIVE_ADDR_PROVIDED		0x40
+#define UE_LOGICAL_ADDR_PROVIDED		0x20
+
 #define MC_ERROR_SLB_PARITY		0
 #define MC_ERROR_SLB_MULTIHIT		1
 #define MC_ERROR_SLB_INDETERMINATE	2
@@ -113,27 +117,6 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog)
 	}
 }
 
-static
-inline u64 rtas_mc_get_effective_addr(const struct pseries_mc_errorlog *mlog)
-{
-	__be64 addr = 0;
-
-	switch (mlog->error_type) {
-	case	MC_ERROR_TYPE_UE:
-		if (mlog->sub_err_type & 0x40)
-			addr = mlog->effective_address;
-		break;
-	case	MC_ERROR_TYPE_SLB:
-	case	MC_ERROR_TYPE_ERAT:
-	case	MC_ERROR_TYPE_TLB:
-		if (mlog->sub_err_type & 0x80)
-			addr = mlog->effective_address;
-	default:
-		break;
-	}
-	return be64_to_cpu(addr);
-}
-
 /*
  * Enable the hotplug interrupt late because processing them may touch other
  * devices or systems (e.g. hugepages) that have not been initialized at the
@@ -511,149 +494,55 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
 	return 0; /* need to perform reset */
 }
 
-#define VAL_TO_STRING(ar, val)	\
-	(((val) < ARRAY_SIZE(ar)) ? ar[(val)] : "Unknown")
 
-static void pseries_print_mce_info(struct pt_regs *regs,
-				   struct rtas_error_log *errp)
+static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
 {
-	const char *level, *sevstr;
+	struct mce_error_info mce_err = { 0 };
+	unsigned long eaddr = 0, paddr = 0;
 	struct pseries_errorlog *pseries_log;
 	struct pseries_mc_errorlog *mce_log;
+	int disposition = rtas_error_disposition(errp);
+	int initiator = rtas_error_initiator(errp);
+	int severity = rtas_error_severity(errp);
 	u8 error_type, err_sub_type;
-	u64 addr;
-	u8 initiator = rtas_error_initiator(errp);
-	int disposition = rtas_error_disposition(errp);
 
-	static const char * const initiators[] = {
-		[0] = "Unknown",
-		[1] = "CPU",
-		[2] = "PCI",
-		[3] = "ISA",
-		[4] = "Memory",
-		[5] = "Power Mgmt",
-	};
-	static const char * const mc_err_types[] = {
-		[0] = "UE",
-		[1] = "SLB",
-		[2] = "ERAT",
-		[3] = "Unknown",
-		[4] = "TLB",
-		[5] = "D-Cache",
-		[6] = "Unknown",
-		[7] = "I-Cache",
-	};
-	static const char * const mc_ue_types[] = {
-		[0] = "Indeterminate",
-		[1] = "Instruction fetch",
-		[2] = "Page table walk ifetch",
-		[3] = "Load/Store",
-		[4] = "Page table walk Load/Store",
-	};
+	if (initiator == RTAS_INITIATOR_UNKNOWN)
+		mce_err.initiator = MCE_INITIATOR_UNKNOWN;
+	else if (initiator == RTAS_INITIATOR_CPU)
+		mce_err.initiator = MCE_INITIATOR_CPU;
+	else if (initiator == RTAS_INITIATOR_PCI)
+		mce_err.initiator = MCE_INITIATOR_PCI;
+	else if (initiator == RTAS_INITIATOR_ISA)
+		mce_err.initiator = MCE_INITIATOR_ISA;
+	else if (initiator == RTAS_INITIATOR_MEMORY)
+		mce_err.initiator = MCE_INITIATOR_MEMORY;
+	else if (initiator == RTAS_INITIATOR_POWERMGM)
+		mce_err.initiator = MCE_INITIATOR_POWERMGM;
+	else
+		mce_err.initiator = MCE_INITIATOR_UNKNOWN;
 
-	/* SLB sub errors valid values are 0x0, 0x1, 0x2 */
-	static const char * const mc_slb_types[] = {
-		[0] = "Parity",
-		[1] = "Multihit",
-		[2] = "Indeterminate",
-	};
+	if (severity == RTAS_SEVERITY_NO_ERROR)
+		mce_err.severity = MCE_SEV_NO_ERROR;
+	else if (severity == RTAS_SEVERITY_EVENT)
+		mce_err.severity = MCE_SEV_WARNING;
+	else if (severity == RTAS_SEVERITY_WARNING)
+		mce_err.severity = MCE_SEV_WARNING;
+	else if (severity == RTAS_SEVERITY_ERROR_SYNC)
+		mce_err.severity = MCE_SEV_SEVERE;
+	else if (severity == RTAS_SEVERITY_ERROR)
+		mce_err.severity = MCE_SEV_SEVERE;
+	else if (severity == RTAS_SEVERITY_FATAL)
+		mce_err.severity = MCE_SEV_FATAL;
+	else
+		mce_err.severity = MCE_SEV_FATAL;
 
-	/* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */
-	static const char * const mc_soft_types[] = {
-		[0] = "Unknown",
-		[1] = "Parity",
-		[2] = "Multihit",
-		[3] = "Indeterminate",
-	};
+	if (severity <= RTAS_SEVERITY_ERROR_SYNC)
+		mce_err.sync_error = true;
+	else
+		mce_err.sync_error = false;
 
-	if (!rtas_error_extended(errp)) {
-		pr_err("Machine check interrupt: Missing extended error log\n");
-		return;
-	}
-
-	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
-	if (pseries_log == NULL)
-		return;
-
-	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
-
-	error_type = mce_log->error_type;
-	err_sub_type = rtas_mc_error_sub_type(mce_log);
-
-	switch (rtas_error_severity(errp)) {
-	case RTAS_SEVERITY_NO_ERROR:
-		level = KERN_INFO;
-		sevstr = "Harmless";
-		break;
-	case RTAS_SEVERITY_WARNING:
-		level = KERN_WARNING;
-		sevstr = "";
-		break;
-	case RTAS_SEVERITY_ERROR:
-	case RTAS_SEVERITY_ERROR_SYNC:
-		level = KERN_ERR;
-		sevstr = "Severe";
-		break;
-	case RTAS_SEVERITY_FATAL:
-	default:
-		level = KERN_ERR;
-		sevstr = "Fatal";
-		break;
-	}
-
-#ifdef CONFIG_PPC_BOOK3S_64
-	/* Display faulty slb contents for SLB errors. */
-	if (error_type == MC_ERROR_TYPE_SLB)
-		slb_dump_contents(local_paca->mce_faulty_slbs);
-#endif
-
-	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
-	       disposition == RTAS_DISP_FULLY_RECOVERED ?
-	       "Recovered" : "Not recovered");
-	if (user_mode(regs)) {
-		printk("%s  NIP: [%016lx] PID: %d Comm: %s\n", level,
-		       regs->nip, current->pid, current->comm);
-	} else {
-		printk("%s  NIP [%016lx]: %pS\n", level, regs->nip,
-		       (void *)regs->nip);
-	}
-	printk("%s  Initiator: %s\n", level,
-	       VAL_TO_STRING(initiators, initiator));
-
-	switch (error_type) {
-	case MC_ERROR_TYPE_UE:
-		printk("%s  Error type: %s [%s]\n", level,
-		       VAL_TO_STRING(mc_err_types, error_type),
-		       VAL_TO_STRING(mc_ue_types, err_sub_type));
-		break;
-	case MC_ERROR_TYPE_SLB:
-		printk("%s  Error type: %s [%s]\n", level,
-		       VAL_TO_STRING(mc_err_types, error_type),
-		       VAL_TO_STRING(mc_slb_types, err_sub_type));
-		break;
-	case MC_ERROR_TYPE_ERAT:
-	case MC_ERROR_TYPE_TLB:
-		printk("%s  Error type: %s [%s]\n", level,
-		       VAL_TO_STRING(mc_err_types, error_type),
-		       VAL_TO_STRING(mc_soft_types, err_sub_type));
-		break;
-	default:
-		printk("%s  Error type: %s\n", level,
-		       VAL_TO_STRING(mc_err_types, error_type));
-		break;
-	}
-
-	addr = rtas_mc_get_effective_addr(mce_log);
-	if (addr)
-		printk("%s    Effective address: %016llx\n", level, addr);
-}
-
-static int mce_handle_error(struct rtas_error_log *errp)
-{
-	struct pseries_errorlog *pseries_log;
-	struct pseries_mc_errorlog *mce_log;
-	int disposition = rtas_error_disposition(errp);
-	u8 error_type;
+	mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
+	mce_err.error_class = MCE_ECLASS_UNKNOWN;
 
 	if (!rtas_error_extended(errp))
 		goto out;
@@ -664,6 +553,105 @@ static int mce_handle_error(struct rtas_error_log *errp)
 
 	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
 	error_type = mce_log->error_type;
+	err_sub_type = rtas_mc_error_sub_type(mce_log);
+
+	switch (mce_log->error_type) {
+	case MC_ERROR_TYPE_UE:
+		mce_err.error_type = MCE_ERROR_TYPE_UE;
+		switch (err_sub_type) {
+		case MC_ERROR_UE_IFETCH:
+			mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH;
+			break;
+		case MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH:
+			mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
+			break;
+		case MC_ERROR_UE_LOAD_STORE:
+			mce_err.u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
+			break;
+		case MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE:
+			mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
+			break;
+		case MC_ERROR_UE_INDETERMINATE:
+		default:
+			mce_err.u.ue_error_type = MCE_UE_ERROR_INDETERMINATE;
+			break;
+		}
+		if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED)
+			eaddr = be64_to_cpu(mce_log->effective_address);
+
+		if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) {
+			paddr = be64_to_cpu(mce_log->logical_address);
+		} else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) {
+			unsigned long pfn;
+
+			pfn = addr_to_pfn(regs, eaddr);
+			if (pfn != ULONG_MAX)
+				paddr = pfn << PAGE_SHIFT;
+		}
+
+		break;
+	case MC_ERROR_TYPE_SLB:
+		mce_err.error_type = MCE_ERROR_TYPE_SLB;
+		switch (err_sub_type) {
+		case MC_ERROR_SLB_PARITY:
+			mce_err.u.slb_error_type = MCE_SLB_ERROR_PARITY;
+			break;
+		case MC_ERROR_SLB_MULTIHIT:
+			mce_err.u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
+			break;
+		case MC_ERROR_SLB_INDETERMINATE:
+		default:
+			mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
+			break;
+		}
+		if (mce_log->sub_err_type & 0x80)
+			eaddr = be64_to_cpu(mce_log->effective_address);
+		break;
+	case MC_ERROR_TYPE_ERAT:
+		mce_err.error_type = MCE_ERROR_TYPE_ERAT;
+		switch (err_sub_type) {
+		case MC_ERROR_ERAT_PARITY:
+			mce_err.u.erat_error_type = MCE_ERAT_ERROR_PARITY;
+			break;
+		case MC_ERROR_ERAT_MULTIHIT:
+			mce_err.u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
+			break;
+		case MC_ERROR_ERAT_INDETERMINATE:
+		default:
+			mce_err.u.erat_error_type = MCE_ERAT_ERROR_INDETERMINATE;
+			break;
+		}
+		if (mce_log->sub_err_type & 0x80)
+			eaddr = be64_to_cpu(mce_log->effective_address);
+		break;
+	case MC_ERROR_TYPE_TLB:
+		mce_err.error_type = MCE_ERROR_TYPE_TLB;
+		switch (err_sub_type) {
+		case MC_ERROR_TLB_PARITY:
+			mce_err.u.tlb_error_type = MCE_TLB_ERROR_PARITY;
+			break;
+		case MC_ERROR_TLB_MULTIHIT:
+			mce_err.u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
+			break;
+		case MC_ERROR_TLB_INDETERMINATE:
+		default:
+			mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE;
+			break;
+		}
+		if (mce_log->sub_err_type & 0x80)
+			eaddr = be64_to_cpu(mce_log->effective_address);
+		break;
+	case MC_ERROR_TYPE_D_CACHE:
+		mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
+		break;
+	case MC_ERROR_TYPE_I_CACHE:
+		mce_err.error_type = MCE_ERROR_TYPE_ICACHE;
+		break;
+	case MC_ERROR_TYPE_UNKNOWN:
+	default:
+		mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
+		break;
+	}
 
 #ifdef CONFIG_PPC_BOOK3S_64
 	if (disposition == RTAS_DISP_NOT_RECOVERED) {
@@ -682,99 +670,25 @@ static int mce_handle_error(struct rtas_error_log *errp)
 				slb_save_contents(local_paca->mce_faulty_slbs);
 			flush_and_reload_slb();
 			disposition = RTAS_DISP_FULLY_RECOVERED;
-			rtas_set_disposition_recovered(errp);
 			break;
 		default:
 			break;
 		}
+	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
+		/* Platform corrected itself but could be degraded */
+		printk(KERN_ERR "MCE: limited recovery, system may "
+		       "be degraded\n");
+		disposition = RTAS_DISP_FULLY_RECOVERED;
 	}
 #endif
 
 out:
+	save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED,
+			&mce_err, regs->nip, eaddr, paddr);
+
 	return disposition;
 }
 
-#ifdef CONFIG_MEMORY_FAILURE
-
-static DEFINE_PER_CPU(int, rtas_ue_count);
-static DEFINE_PER_CPU(unsigned long, rtas_ue_paddr[MAX_MC_EVT]);
-
-#define UE_EFFECTIVE_ADDR_PROVIDED	0x40
-#define UE_LOGICAL_ADDR_PROVIDED	0x20
-
-
-static void pseries_hwpoison_work_fn(struct work_struct *work)
-{
-	unsigned long paddr;
-	int index;
-
-	while (__this_cpu_read(rtas_ue_count) > 0) {
-		index = __this_cpu_read(rtas_ue_count) - 1;
-		paddr = __this_cpu_read(rtas_ue_paddr[index]);
-		memory_failure(paddr >> PAGE_SHIFT, 0);
-		__this_cpu_dec(rtas_ue_count);
-	}
-}
-
-static DECLARE_WORK(hwpoison_work, pseries_hwpoison_work_fn);
-
-static void queue_ue_paddr(unsigned long paddr)
-{
-	int index;
-
-	index = __this_cpu_inc_return(rtas_ue_count) - 1;
-	if (index >= MAX_MC_EVT) {
-		__this_cpu_dec(rtas_ue_count);
-		return;
-	}
-	this_cpu_write(rtas_ue_paddr[index], paddr);
-	schedule_work(&hwpoison_work);
-}
-
-static void pseries_do_memory_failure(struct pt_regs *regs,
-				      struct pseries_mc_errorlog *mce_log)
-{
-	unsigned long paddr;
-
-	if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) {
-		paddr = be64_to_cpu(mce_log->logical_address);
-	} else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) {
-		unsigned long pfn;
-
-		pfn = addr_to_pfn(regs,
-				  be64_to_cpu(mce_log->effective_address));
-		if (pfn == ULONG_MAX)
-			return;
-		paddr = pfn << PAGE_SHIFT;
-	} else {
-		return;
-	}
-	queue_ue_paddr(paddr);
-}
-
-static void pseries_process_ue(struct pt_regs *regs,
-			       struct rtas_error_log *errp)
-{
-	struct pseries_errorlog *pseries_log;
-	struct pseries_mc_errorlog *mce_log;
-
-	if (!rtas_error_extended(errp))
-		return;
-
-	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
-	if (!pseries_log)
-		return;
-
-	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
-
-	if (mce_log->error_type == MC_ERROR_TYPE_UE)
-		pseries_do_memory_failure(regs, mce_log);
-}
-#else
-static inline void pseries_process_ue(struct pt_regs *regs,
-				      struct rtas_error_log *errp) { }
-#endif /*CONFIG_MEMORY_FAILURE */
-
 /*
  * Process MCE rtas errlog event.
  */
@@ -795,48 +709,50 @@ static void mce_process_errlog_event(struct irq_work *work)
  * Return 1 if corrected (or delivered a signal).
  * Return 0 if there is nothing we can do.
  */
-static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
+static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt)
 {
 	int recovered = 0;
-	int disposition = rtas_error_disposition(err);
-
-	pseries_print_mce_info(regs, err);
 
 	if (!(regs->msr & MSR_RI)) {
 		/* If MSR_RI isn't set, we cannot recover */
 		pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
 		recovered = 0;
-
-	} else if (disposition == RTAS_DISP_FULLY_RECOVERED) {
+	} else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
 		/* Platform corrected itself */
 		recovered = 1;
-
-	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
-		/* Platform corrected itself but could be degraded */
-		printk(KERN_ERR "MCE: limited recovery, system may "
-		       "be degraded\n");
-		recovered = 1;
-
-	} else if (user_mode(regs) && !is_global_init(current) &&
-		   rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) {
-
-		/*
-		 * If we received a synchronous error when in userspace
-		 * kill the task. Firmware may report details of the fail
-		 * asynchronously, so we can't rely on the target and type
-		 * fields being valid here.
-		 */
-		printk(KERN_ERR "MCE: uncorrectable error, killing task "
-		       "%s:%d\n", current->comm, current->pid);
-
-		_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
-		recovered = 1;
+	} else if (evt->severity == MCE_SEV_FATAL) {
+		/* Fatal machine check */
+		pr_err("Machine check interrupt is fatal\n");
+		recovered = 0;
 	}
 
-	pseries_process_ue(regs, err);
-
-	/* Queue irq work to log this rtas event later. */
-	irq_work_queue(&mce_errlog_process_work);
+	if (!recovered && evt->sync_error) {
+		/*
+		 * Try to kill processes if we get a synchronous machine check
+		 * (e.g., one caused by execution of this instruction). This
+		 * will devolve into a panic if we try to kill init or are in
+		 * an interrupt etc.
+		 *
+		 * TODO: Queue up this address for hwpoisoning later.
+		 * TODO: This is not quite right for d-side machine
+		 *       checks ->nip is not necessarily the important
+		 *       address.
+		 */
+		if ((user_mode(regs))) {
+			_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+			recovered = 1;
+		} else if (die_will_crash()) {
+			/*
+			 * die() would kill the kernel, so better to go via
+			 * the platform reboot code that will log the
+			 * machine check.
+			 */
+			recovered = 0;
+		} else {
+			die("Machine check", regs, SIGBUS);
+			recovered = 1;
+		}
+	}
 
 	return recovered;
 }
@@ -853,14 +769,21 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
  */
 int pSeries_machine_check_exception(struct pt_regs *regs)
 {
-	struct rtas_error_log *errp;
+	struct machine_check_event evt;
 
-	if (fwnmi_active) {
-		fwnmi_release_errinfo();
-		errp = fwnmi_get_errlog();
-		if (errp && recover_mce(regs, errp))
-			return 1;
+	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
+		return 0;
+
+	/* Print things out */
+	if (evt.version != MCE_V1) {
+		pr_err("Machine Check Exception, Unknown event version %d !\n",
+		       evt.version);
+		return 0;
 	}
+	machine_check_print_event_info(&evt, user_mode(regs), false);
+
+	if (recover_mce(regs, &evt))
+		return 1;
 
 	return 0;
 }
@@ -877,7 +800,12 @@ long pseries_machine_check_realmode(struct pt_regs *regs)
 		 * to panic. Hence we will call it as soon as we go into
 		 * virtual mode.
 		 */
-		disposition = mce_handle_error(errp);
+		disposition = mce_handle_error(regs, errp);
+		fwnmi_release_errinfo();
+
+		/* Queue irq work to log this rtas event later. */
+		irq_work_queue(&mce_errlog_process_work);
+
 		if (disposition == RTAS_DISP_FULLY_RECOVERED)
 			return 1;
 	}

From 272f636445cf556498c8840dc63ad1218e94391b Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:35 +1000
Subject: [PATCH 154/264] powerpc/64s/exception: machine check pseries should
 skip the late handler for kernel MCEs

The powernv machine check handler copes with taking an MCE from one
of three contexts: guest, kernel, and user. In each case the early
handler runs first on a special stack, then:

- The guest case branches to the KVM interrupt handler (via standard
  interrupt macros).
- The user case will run the "late" handler which is like a normal
  interrupt that runs in virtual mode and uses the regular kernel
  stack.
- The kernel case queues the event and schedules it for processing
  with irq work.

The last case is important: it must not enable virtual memory,
because the MMU state may not be set up to deal with that (e.g., the
SLB might be clear); it must not use the regular kernel stack for
similar reasons (e.g., it might be in OPAL with the OPAL stack in r1);
and the kernel does not expect anything to touch its stack if
interrupts are disabled.
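
The queueing is the usual NMI-safe deferral pattern: record the event
in pre-allocated storage while still in real mode, then let irq_work
deliver it once normal interrupt context is available. A minimal
sketch of the pattern (illustrative names; irq_work_queue() is the
real kernel primitive):

	#include <linux/irq_work.h>

	static void mce_process_deferred(struct irq_work *work)
	{
		/* Runs later, with the MMU on and a normal stack, where
		 * printk and locking are safe. */
	}

	static struct irq_work mce_deferred_work = {
		.func = mce_process_deferred,
	};

	/* Called from the real-mode MCE handler: no VM, no kernel stack. */
	static void queue_kernel_mce(void)
	{
		/* Save the event into a per-CPU buffer first, then: */
		irq_work_queue(&mce_deferred_work);
	}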

The pseries handler does not do this queueing, but instead it always
runs the late handler for host MCEs, which has some of the same
problems.

Now that pseries is using machine_check_events, change it to do the
same as powernv and queue events for kernel MCEs.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-11-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 22 ++--------------------
 1 file changed, 2 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index b7c4149cb91c..aa3720c0b8fe 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1163,7 +1163,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 	cmpdi	r3,0		/* see if we handled MCE successfully */
 
 	beq	1b		/* if !handled then panic */
-BEGIN_FTR_SECTION
+
 	/*
 	 * Return from MC interrupt.
 	 * Queue up the MCE event so that we can log it later, while
@@ -1172,18 +1172,7 @@ BEGIN_FTR_SECTION
 	bl	machine_check_queue_event
 	MACHINE_CHECK_HANDLER_WINDUP
 	RFI_TO_KERNEL
-FTR_SECTION_ELSE
-	/*
-	 * pSeries: Return from MC interrupt. Before that stay on emergency
-	 * stack and call machine_check_exception to log the MCE event.
-	 */
-	LOAD_HANDLER(r10,mce_return)
-	mtspr	SPRN_SRR0,r10
-	ld	r10,PACAKMSR(r13)
-	mtspr	SPRN_SRR1,r10
-	RFI_TO_KERNEL
-	b	.
-ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
+
 9:
 	/* Deliver the machine check to host kernel in V mode. */
 BEGIN_FTR_SECTION
@@ -1212,13 +1201,6 @@ EXC_COMMON_BEGIN(unrecover_mce)
 	bl	unrecoverable_exception
 	b	1b
 
-EXC_COMMON_BEGIN(mce_return)
-	/* Invoke machine_check_exception to print MCE event and return. */
-	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	machine_check_exception
-	MACHINE_CHECK_HANDLER_WINDUP
-	RFI_TO_KERNEL
-	b	.
 
 EXC_REAL_BEGIN(data_access, 0x300, 0x80)
 	EXCEPTION_PROLOG_0 PACA_EXGEN

From c8eb54dbc8087c3d114cb583925395e211bfffa4 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:36 +1000
Subject: [PATCH 155/264] powerpc/64s/exception: machine check restructure to
 reuse common macros

Follow the pattern of the sreset and HMI handlers more closely: use
EXCEPTION_PROLOG_COMMON_1 rather than open-coding it, and run the
handler at the relocated location.

This helps later simplification and code sharing.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-12-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 71 ++++++++++++++--------------
 1 file changed, 36 insertions(+), 35 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index aa3720c0b8fe..a1f0a88d39a5 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -934,17 +934,23 @@ EXC_COMMON_BEGIN(system_reset_common)
 
 EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
 	EXCEPTION_PROLOG_0 PACA_EXMC
-	b	machine_check_common_early
+	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 1, 1, 0
+	mfctr	r10			/* save ctr, even for !RELOCATABLE */
+	BRANCH_TO_C000(r11, machine_check_early_common)
+	/*
+	 * MSR_RI is not enabled, because PACA_EXMC is being used, so a
+	 * nested machine check corrupts it. machine_check_common enables
+	 * MSR_RI.
+	 */
 EXC_REAL_END(machine_check, 0x200, 0x100)
 EXC_VIRT_NONE(0x4200, 0x100)
-TRAMP_REAL_BEGIN(machine_check_common_early)
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 0, 0, 0
+
+EXC_COMMON_BEGIN(machine_check_early_common)
+	mtctr	r10			/* Restore ctr */
+	mfspr	r11,SPRN_SRR0
+	mfspr	r12,SPRN_SRR1
+
 	/*
-	 * Register contents:
-	 * R13		= PACA
-	 * R9		= CR
-	 * Original R9 to R13 is saved on PACA_EXMC
-	 *
 	 * Switch to mc_emergency stack and handle re-entrancy (we limit
 	 * the nested MCE upto level 4 to avoid stack overflow).
 	 * Save MCE registers srr1, srr0, dar and dsisr and then set ME=1
@@ -965,32 +971,30 @@ TRAMP_REAL_BEGIN(machine_check_common_early)
 	 * the machine check is handled then the idle wakeup code is called
 	 * to restore state.
 	 */
-	mr	r11,r1			/* Save r1 */
 	lhz	r10,PACA_IN_MCE(r13)
 	cmpwi	r10,0			/* Are we in nested machine check */
-	bne	0f			/* Yes, we are. */
-	/* First machine check entry */
-	ld	r1,PACAMCEMERGSP(r13)	/* Use MC emergency stack */
-0:	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame */
+	cmpwi	cr1,r10,MAX_MCE_DEPTH	/* Are we at maximum nesting */
 	addi	r10,r10,1		/* increment paca->in_mce */
 	sth	r10,PACA_IN_MCE(r13)
+
+	mr	r10,r1			/* Save r1 */
+	bne	1f
+	/* First machine check entry */
+	ld	r1,PACAMCEMERGSP(r13)	/* Use MC emergency stack */
+1:	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame */
 	/* Limit nested MCE to level 4 to avoid stack overflow */
-	cmpwi	r10,MAX_MCE_DEPTH
-	bgt	2f			/* Check if we hit limit of 4 */
-	std	r11,GPR1(r1)		/* Save r1 on the stack. */
-	std	r11,0(r1)		/* make stack chain pointer */
-	mfspr	r11,SPRN_SRR0		/* Save SRR0 */
-	std	r11,_NIP(r1)
-	mfspr	r11,SPRN_SRR1		/* Save SRR1 */
-	std	r11,_MSR(r1)
-	mfspr	r11,SPRN_DAR		/* Save DAR */
-	std	r11,_DAR(r1)
-	mfspr	r11,SPRN_DSISR		/* Save DSISR */
-	std	r11,_DSISR(r1)
-	std	r9,_CCR(r1)		/* Save CR in stackframe */
+	bge	cr1,2f			/* Check if we hit limit of 4 */
+
+	EXCEPTION_PROLOG_COMMON_1()
 	/* We don't touch AMR here, we never go to virtual mode */
-	/* Save r9 through r13 from EXMC save area to stack frame. */
 	EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
+	EXCEPTION_PROLOG_COMMON_3(0x200)
+
+	ld	r3,PACA_EXMC+EX_DAR(r13)
+	lwz	r4,PACA_EXMC+EX_DSISR(r13)
+	std	r3,_DAR(r1)
+	std	r4,_DSISR(r1)
+
 	mfmsr	r11			/* get MSR value */
 BEGIN_FTR_SECTION
 	ori	r11,r11,MSR_ME		/* turn on ME bit */
@@ -1016,8 +1020,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 
 #ifdef CONFIG_PPC_PSERIES
 TRAMP_REAL_BEGIN(machine_check_fwnmi)
+	/* See comment at machine_check exception, don't turn on RI */
 	EXCEPTION_PROLOG_0 PACA_EXMC
-	b	machine_check_common_early
+	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 1, 1, 0
+	mfctr	r10		/* save ctr */
+	BRANCH_TO_C000(r11, machine_check_early_common)
 #endif
 
 TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
@@ -1088,8 +1095,6 @@ EXC_COMMON_BEGIN(machine_check_idle_common)
 	 * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack.
 	 */
 EXC_COMMON_BEGIN(machine_check_handle_early)
-	std	r0,GPR0(r1)	/* Save r0 */
-	EXCEPTION_PROLOG_COMMON_3(0x200)
 	bl	save_nvgprs
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	machine_check_early
@@ -1180,14 +1185,10 @@ BEGIN_FTR_SECTION
 	mtspr	SPRN_CFAR,r10
 END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 	MACHINE_CHECK_HANDLER_WINDUP
+	/* See comment at machine_check exception, don't turn on RI */
 	EXCEPTION_PROLOG_0 PACA_EXMC
 	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
 	EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0
-	/*
-	 * MSR_RI is not enabled, because PACA_EXMC is being used, so a
-	 * nested machine check corrupts it. machine_check_common enables
-	 * MSR_RI.
-	 */
 
 EXC_COMMON_BEGIN(unrecover_mce)
 	/* Invoke machine_check_exception to print MCE event and panic. */

From abd1f4ca2b41ffba768c3baadc006a95d178fbf1 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:37 +1000
Subject: [PATCH 156/264] powerpc/64s/exception: machine check move tramp code

Following convention, move the tramp code (unrelocated) above the
common handlers (relocated).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-13-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index a1f0a88d39a5..06821b199511 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -945,6 +945,17 @@ EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
 EXC_REAL_END(machine_check, 0x200, 0x100)
 EXC_VIRT_NONE(0x4200, 0x100)
 
+#ifdef CONFIG_PPC_PSERIES
+TRAMP_REAL_BEGIN(machine_check_fwnmi)
+	/* See comment at machine_check exception, don't turn on RI */
+	EXCEPTION_PROLOG_0 PACA_EXMC
+	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 1, 1, 0
+	mfctr	r10		/* save ctr */
+	BRANCH_TO_C000(r11, machine_check_early_common)
+#endif
+
+TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
+
 EXC_COMMON_BEGIN(machine_check_early_common)
 	mtctr	r10			/* Restore ctr */
 	mfspr	r11,SPRN_SRR0
@@ -1018,17 +1029,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 	b	1b
 	b	.	/* prevent speculative execution */
 
-#ifdef CONFIG_PPC_PSERIES
-TRAMP_REAL_BEGIN(machine_check_fwnmi)
-	/* See comment at machine_check exception, don't turn on RI */
-	EXCEPTION_PROLOG_0 PACA_EXMC
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 1, 1, 0
-	mfctr	r10		/* save ctr */
-	BRANCH_TO_C000(r11, machine_check_early_common)
-#endif
-
-TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
-
 EXC_COMMON_BEGIN(machine_check_common)
 	/*
 	 * Machine check is different because we use a different

From 296e753fb447eb14c8c9fd6a7c48e7ffab269343 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:38 +1000
Subject: [PATCH 157/264] powerpc/64s/exception: simplify machine check early
 path

machine_check_early_common can reach machine_check_handle_early
directly now that it runs at the relocated address, so just branch
directly.

The rfi sequence is required to enable MSR[ME], but that step is moved
into a helper function, making the code easier to follow.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-14-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 31 ++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 06821b199511..bbbcab88cf78 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1006,16 +1006,13 @@ EXC_COMMON_BEGIN(machine_check_early_common)
 	std	r3,_DAR(r1)
 	std	r4,_DSISR(r1)
 
-	mfmsr	r11			/* get MSR value */
 BEGIN_FTR_SECTION
-	ori	r11,r11,MSR_ME		/* turn on ME bit */
+	bl	enable_machine_check
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
-	ori	r11,r11,MSR_RI		/* turn on RI bit */
-	LOAD_HANDLER(r12, machine_check_handle_early)
-1:	mtspr	SPRN_SRR0,r12
-	mtspr	SPRN_SRR1,r11
-	RFI_TO_KERNEL
-	b	.	/* prevent speculative execution */
+	li	r10,MSR_RI
+	mtmsrd	r10,1
+	b	machine_check_handle_early
+
 2:
 	/* Stack overflow. Stay on emergency stack and panic.
 	 * Keep the ME bit off while panic-ing, so that if we hit
@@ -1026,7 +1023,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 	LOAD_HANDLER(r12, unrecover_mce)
 	li	r10,MSR_ME
 	andc	r11,r11,r10		/* Turn off MSR_ME */
-	b	1b
+	mtspr	SPRN_SRR0,r12
+	mtspr	SPRN_SRR1,r11
+	RFI_TO_KERNEL
 	b	.	/* prevent speculative execution */
 
 EXC_COMMON_BEGIN(machine_check_common)
@@ -2269,6 +2268,20 @@ CLOSE_FIXED_SECTION(virt_trampolines);
 
 USE_TEXT_SECTION()
 
+/* MSR[RI] should be clear because this uses SRR[01] */
+enable_machine_check:
+	mflr	r0
+	bcl	20,31,$+4
+0:	mflr	r3
+	addi	r3,r3,(1f - 0b)
+	mtspr	SPRN_SRR0,r3
+	mfmsr	r3
+	ori	r3,r3,MSR_ME
+	mtspr	SPRN_SRR1,r3
+	RFI_TO_KERNEL
+1:	mtlr	r0
+	blr
+
 /*
  * Hash table stuff
  */

From b7d9ccec3056913528690c5fae7cc86a5ea3dffc Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:39 +1000
Subject: [PATCH 158/264] powerpc/64s/exception: machine check move
 unrecoverable handling out of line

Similarly to the previous change, all callers of the unrecoverable
handler run relocated, so they can reach it with a direct branch. This
makes it easy to move out of line, which leaves the "normal" path less
cluttered and easier to follow.

MSR[ME] manipulation still requires the rfi, so that is moved out of
line to its own function.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-15-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 87 ++++++++++++++--------------
 1 file changed, 44 insertions(+), 43 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index bbbcab88cf78..af18d0f1d4ab 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -992,9 +992,9 @@ EXC_COMMON_BEGIN(machine_check_early_common)
 	bne	1f
 	/* First machine check entry */
 	ld	r1,PACAMCEMERGSP(r13)	/* Use MC emergency stack */
-1:	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame */
-	/* Limit nested MCE to level 4 to avoid stack overflow */
-	bge	cr1,2f			/* Check if we hit limit of 4 */
+1:	/* Limit nested MCE to level 4 to avoid stack overflow */
+	bgt	cr1,unrecoverable_mce	/* Check if we hit limit of 4 */
+	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame */
 
 	EXCEPTION_PROLOG_COMMON_1()
 	/* We don't touch AMR here, we never go to virtual mode */
@@ -1013,21 +1013,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 	mtmsrd	r10,1
 	b	machine_check_handle_early
 
-2:
-	/* Stack overflow. Stay on emergency stack and panic.
-	 * Keep the ME bit off while panic-ing, so that if we hit
-	 * another machine check we checkstop.
-	 */
-	addi	r1,r1,INT_FRAME_SIZE	/* go back to previous stack frame */
-	ld	r11,PACAKMSR(r13)
-	LOAD_HANDLER(r12, unrecover_mce)
-	li	r10,MSR_ME
-	andc	r11,r11,r10		/* Turn off MSR_ME */
-	mtspr	SPRN_SRR0,r12
-	mtspr	SPRN_SRR1,r11
-	RFI_TO_KERNEL
-	b	.	/* prevent speculative execution */
-
 EXC_COMMON_BEGIN(machine_check_common)
 	/*
 	 * Machine check is different because we use a different
@@ -1141,32 +1126,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 	 * If yes, then stay on emergency stack and panic.
 	 */
 	andi.	r11,r12,MSR_RI
-	bne	2f
-1:	mfspr	r11,SPRN_SRR0
-	LOAD_HANDLER(r10,unrecover_mce)
-	mtspr	SPRN_SRR0,r10
-	ld	r10,PACAKMSR(r13)
-	/*
-	 * We are going down. But there are chances that we might get hit by
-	 * another MCE during panic path and we may run into unstable state
-	 * with no way out. Hence, turn ME bit off while going down, so that
-	 * when another MCE is hit during panic path, system will checkstop
-	 * and hypervisor will get restarted cleanly by SP.
-	 */
-	li	r3,MSR_ME
-	andc	r10,r10,r3		/* Turn off MSR_ME */
-	mtspr	SPRN_SRR1,r10
-	RFI_TO_KERNEL
-	b	.
-2:
+	beq	unrecoverable_mce
+
 	/*
 	 * Check if we have successfully handled/recovered from error, if not
 	 * then stay on emergency stack and panic.
 	 */
 	ld	r3,RESULT(r1)	/* Load result */
 	cmpdi	r3,0		/* see if we handled MCE successfully */
-
-	beq	1b		/* if !handled then panic */
+	beq	unrecoverable_mce /* if !handled then panic */
 
 	/*
 	 * Return from MC interrupt.
@@ -1189,17 +1157,35 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
 	EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0
 
-EXC_COMMON_BEGIN(unrecover_mce)
+EXC_COMMON_BEGIN(unrecoverable_mce)
+	/*
+	 * We are going down. But there are chances that we might get hit by
+	 * another MCE during panic path and we may run into unstable state
+	 * with no way out. Hence, turn ME bit off while going down, so that
+	 * when another MCE is hit during panic path, system will checkstop
+	 * and hypervisor will get restarted cleanly by SP.
+	 */
+BEGIN_FTR_SECTION
+	li	r10,0 /* clear MSR_RI */
+	mtmsrd	r10,1
+	bl	disable_machine_check
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+	ld	r10,PACAKMSR(r13)
+	li	r3,MSR_ME
+	andc	r10,r10,r3
+	mtmsrd	r10
+
 	/* Invoke machine_check_exception to print MCE event and panic. */
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	machine_check_exception
+
 	/*
-	 * We will not reach here. Even if we did, there is no way out. Call
-	 * unrecoverable_exception and die.
+	 * We will not reach here. Even if we did, there is no way out.
+	 * Call unrecoverable_exception and die.
 	 */
-1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	unrecoverable_exception
-	b	1b
+	b	.
 
 
 EXC_REAL_BEGIN(data_access, 0x300, 0x80)
@@ -2282,6 +2268,21 @@ enable_machine_check:
 1:	mtlr	r0
 	blr
 
+/* MSR[RI] should be clear because this uses SRR[01] */
+disable_machine_check:
+	mflr	r0
+	bcl	20,31,$+4
+0:	mflr	r3
+	addi	r3,r3,(1f - 0b)
+	mtspr	SPRN_SRR0,r3
+	mfmsr	r3
+	li	r4,MSR_ME
+	andc	r3,r3,r4
+	mtspr	SPRN_SRR1,r3
+	RFI_TO_KERNEL
+1:	mtlr	r0
+	blr
+
 /*
  * Hash table stuff
  */

From fce16d482276f059b08368c833a1188ac1f25e86 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:40 +1000
Subject: [PATCH 159/264] powerpc/64s/exception: untangle early machine check
 handler branch

machine_check_early_common now branches to machine_check_handle_early,
of which it is the only caller.

Move interleaving code out of the way, and remove the branch.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-16-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 129 +++++++++++++--------------
 1 file changed, 62 insertions(+), 67 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index af18d0f1d4ab..3a7f18021365 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -956,6 +956,16 @@ TRAMP_REAL_BEGIN(machine_check_fwnmi)
 
 TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
 
+#define MACHINE_CHECK_HANDLER_WINDUP			\
+	/* Clear MSR_RI before setting SRR0 and SRR1. */\
+	li	r9,0;					\
+	mtmsrd	r9,1;		/* Clear MSR_RI */	\
+	/* Decrement paca->in_mce now RI is clear. */	\
+	lhz	r12,PACA_IN_MCE(r13);			\
+	subi	r12,r12,1;				\
+	sth	r12,PACA_IN_MCE(r13);			\
+	EXCEPTION_RESTORE_REGS EXC_STD
+
 EXC_COMMON_BEGIN(machine_check_early_common)
 	mtctr	r10			/* Restore ctr */
 	mfspr	r11,SPRN_SRR0
@@ -1011,74 +1021,7 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 	li	r10,MSR_RI
 	mtmsrd	r10,1
-	b	machine_check_handle_early
 
-EXC_COMMON_BEGIN(machine_check_common)
-	/*
-	 * Machine check is different because we use a different
-	 * save area: PACA_EXMC instead of PACA_EXGEN.
-	 */
-	EXCEPTION_COMMON(PACA_EXMC, 0x200)
-	FINISH_NAP
-	RECONCILE_IRQ_STATE(r10, r11)
-	ld	r3,PACA_EXMC+EX_DAR(r13)
-	lwz	r4,PACA_EXMC+EX_DSISR(r13)
-	/* Enable MSR_RI when finished with PACA_EXMC */
-	li	r10,MSR_RI
-	mtmsrd 	r10,1
-	std	r3,_DAR(r1)
-	std	r4,_DSISR(r1)
-	bl	save_nvgprs
-	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	machine_check_exception
-	b	ret_from_except
-
-#define MACHINE_CHECK_HANDLER_WINDUP			\
-	/* Clear MSR_RI before setting SRR0 and SRR1. */\
-	li	r9,0;					\
-	mtmsrd	r9,1;		/* Clear MSR_RI */	\
-	/* Decrement paca->in_mce now RI is clear. */	\
-	lhz	r12,PACA_IN_MCE(r13);			\
-	subi	r12,r12,1;				\
-	sth	r12,PACA_IN_MCE(r13);			\
-	EXCEPTION_RESTORE_REGS EXC_STD
-
-#ifdef CONFIG_PPC_P7_NAP
-/*
- * This is an idle wakeup. Low level machine check has already been
- * done. Queue the event then call the idle code to do the wake up.
- */
-EXC_COMMON_BEGIN(machine_check_idle_common)
-	bl	machine_check_queue_event
-
-	/*
-	 * We have not used any non-volatile GPRs here, and as a rule
-	 * most exception code including machine check does not.
-	 * Therefore PACA_NAPSTATELOST does not need to be set. Idle
-	 * wakeup will restore volatile registers.
-	 *
-	 * Load the original SRR1 into r3 for pnv_powersave_wakeup_mce.
-	 *
-	 * Then decrement MCE nesting after finishing with the stack.
-	 */
-	ld	r3,_MSR(r1)
-	ld	r4,_LINK(r1)
-
-	lhz	r11,PACA_IN_MCE(r13)
-	subi	r11,r11,1
-	sth	r11,PACA_IN_MCE(r13)
-
-	mtlr	r4
-	rlwinm	r10,r3,47-31,30,31
-	cmpwi	cr1,r10,2
-	bltlr	cr1	/* no state loss, return to idle caller */
-	b	idle_return_gpr_loss
-#endif
-	/*
-	 * Handle machine check early in real mode. We come here with
-	 * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack.
-	 */
-EXC_COMMON_BEGIN(machine_check_handle_early)
 	bl	save_nvgprs
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	machine_check_early
@@ -1157,6 +1100,58 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
 	EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0
 
+EXC_COMMON_BEGIN(machine_check_common)
+	/*
+	 * Machine check is different because we use a different
+	 * save area: PACA_EXMC instead of PACA_EXGEN.
+	 */
+	EXCEPTION_COMMON(PACA_EXMC, 0x200)
+	FINISH_NAP
+	RECONCILE_IRQ_STATE(r10, r11)
+	ld	r3,PACA_EXMC+EX_DAR(r13)
+	lwz	r4,PACA_EXMC+EX_DSISR(r13)
+	/* Enable MSR_RI when finished with PACA_EXMC */
+	li	r10,MSR_RI
+	mtmsrd 	r10,1
+	std	r3,_DAR(r1)
+	std	r4,_DSISR(r1)
+	bl	save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	machine_check_exception
+	b	ret_from_except
+
+#ifdef CONFIG_PPC_P7_NAP
+/*
+ * This is an idle wakeup. Low level machine check has already been
+ * done. Queue the event then call the idle code to do the wake up.
+ */
+EXC_COMMON_BEGIN(machine_check_idle_common)
+	bl	machine_check_queue_event
+
+	/*
+	 * We have not used any non-volatile GPRs here, and as a rule
+	 * most exception code including machine check does not.
+	 * Therefore PACA_NAPSTATELOST does not need to be set. Idle
+	 * wakeup will restore volatile registers.
+	 *
+	 * Load the original SRR1 into r3 for pnv_powersave_wakeup_mce.
+	 *
+	 * Then decrement MCE nesting after finishing with the stack.
+	 */
+	ld	r3,_MSR(r1)
+	ld	r4,_LINK(r1)
+
+	lhz	r11,PACA_IN_MCE(r13)
+	subi	r11,r11,1
+	sth	r11,PACA_IN_MCE(r13)
+
+	mtlr	r4
+	rlwinm	r10,r3,47-31,30,31
+	cmpwi	cr1,r10,2
+	bltlr	cr1	/* no state loss, return to idle caller */
+	b	idle_return_gpr_loss
+#endif
+
 EXC_COMMON_BEGIN(unrecoverable_mce)
 	/*
 	 * We are going down. But there are chances that we might get hit by

From b3fe35261e329e15736bc95630fd865df9896c66 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:41 +1000
Subject: [PATCH 160/264] powerpc/64s/exception: machine check improve labels
 and comments

Short forward and backward branches can be given number labels,
but larger significant divergences in code path a more readable
if they're given descriptive names.

Also adjusts a comment to account for guest delivery.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-17-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 3a7f18021365..c2474c9c8d41 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1052,7 +1052,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 	 */
 	lbz	r11,HSTATE_IN_GUEST(r13)
 	cmpwi	r11,0			/* Check if coming from guest */
-	bne	9f			/* continue if we are. */
+	bne	mce_deliver		/* continue if we are. */
 #endif
 
 	/*
@@ -1060,7 +1060,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 	 * exception handler which will deliver the MC event to this kernel.
 	 */
 	andi.	r11,r12,MSR_PR		/* See if coming from user. */
-	bne	9f			/* continue in V mode if we are. */
+	bne	mce_deliver		/* continue in V mode if we are. */
 
 	/*
 	 * At this point we are coming from kernel context.
@@ -1088,8 +1088,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 	MACHINE_CHECK_HANDLER_WINDUP
 	RFI_TO_KERNEL
 
-9:
-	/* Deliver the machine check to host kernel in V mode. */
+mce_deliver:
+	/*
+	 * This is a host user or guest MCE. Restore all registers, then
+	 * run the "late" handler. For host user, this will run the
+	 * machine_check_exception handler in virtual mode like a normal
+	 * interrupt handler. For guest, this will trigger the KVM test
+	 * and branch to the KVM interrupt similarly to other interrupts.
+	 */
 BEGIN_FTR_SECTION
 	ld	r10,ORIG_GPR3(r1)
 	mtspr	SPRN_CFAR,r10

From c31f7134dc53f7020b3d49e846d1b950a761e324 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:42 +1000
Subject: [PATCH 161/264] powerpc/64s/exception: Fix DAR load for
 handle_page_fault error case

This buglet goes back to before the 64/32-bit arch merge, but it does
not seem to have had practical consequences, because bad_page_fault()
does not use the second argument, relying on regs->dar/nip instead.
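
To make the lwz/ld distinction concrete: a 32-bit load of the 64-bit
DAR truncates the address (and on big-endian it would even pick up the
wrong half of the doubleword). A standalone illustration in plain C,
with a made-up address value:

	#include <inttypes.h>
	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical 64-bit fault address with high bits set. */
		uint64_t dar = 0xc000000012345678ULL;
		uint32_t lwz_result = (uint32_t)dar; /* what a 32-bit load keeps */

		printf("ld  -> %#018" PRIx64 "\n", dar);
		printf("lwz -> %#010" PRIx32 "\n", lwz_result);
		return 0;
	}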

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-18-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index c2474c9c8d41..d44f6d103014 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -2335,7 +2335,7 @@ handle_page_fault:
 	bl	save_nvgprs
 	mr	r5,r3
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	lwz	r4,_DAR(r1)
+	ld	r4,_DAR(r1)
 	bl	bad_page_fault
 	b	ret_from_except
 

From a243281195c338489ec5088380821f64340cb82f Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:43 +1000
Subject: [PATCH 162/264] powerpc/64s/exception: move head-64.h exception code
 to exception-64s.S

The head-64.h code should deal only with the head code sections
and offset calculations.

No generated code change except BUG line number constants.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-19-npiggin@gmail.com
---
 arch/powerpc/include/asm/head-64.h   | 41 ----------------------------
 arch/powerpc/kernel/exceptions-64s.S | 41 ++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h
index a466765709a9..2dabcf668292 100644
--- a/arch/powerpc/include/asm/head-64.h
+++ b/arch/powerpc/include/asm/head-64.h
@@ -169,47 +169,6 @@ name:
 
 #define ABS_ADDR(label) (label - fs_label + fs_start)
 
-#define EXC_REAL_BEGIN(name, start, size)			\
-	FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##name, start, size)
-
-#define EXC_REAL_END(name, start, size)				\
-	FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##name, start, size)
-
-#define EXC_VIRT_BEGIN(name, start, size)			\
-	FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size)
-
-#define EXC_VIRT_END(name, start, size)				\
-	FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size)
-
-#define EXC_COMMON_BEGIN(name)					\
-	USE_TEXT_SECTION();					\
-	.balign IFETCH_ALIGN_BYTES;				\
-	.global name;						\
-	_ASM_NOKPROBE_SYMBOL(name);				\
-	DEFINE_FIXED_SYMBOL(name);				\
-name:
-
-#define TRAMP_REAL_BEGIN(name)					\
-	FIXED_SECTION_ENTRY_BEGIN(real_trampolines, name)
-
-#define TRAMP_VIRT_BEGIN(name)					\
-	FIXED_SECTION_ENTRY_BEGIN(virt_trampolines, name)
-
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-#define TRAMP_KVM_BEGIN(name)					\
-	TRAMP_VIRT_BEGIN(name)
-#else
-#define TRAMP_KVM_BEGIN(name)
-#endif
-
-#define EXC_REAL_NONE(start, size)				\
-	FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##unused, start, size); \
-	FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##unused, start, size)
-
-#define EXC_VIRT_NONE(start, size)				\
-	FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size); \
-	FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size)
-
 #endif /* __ASSEMBLY__ */
 
 #endif	/* _ASM_POWERPC_HEAD_64_H */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index d44f6d103014..febeea021939 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -43,6 +43,47 @@
 .endif
 #endif
 
+#define EXC_REAL_BEGIN(name, start, size)			\
+	FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##name, start, size)
+
+#define EXC_REAL_END(name, start, size)				\
+	FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##name, start, size)
+
+#define EXC_VIRT_BEGIN(name, start, size)			\
+	FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size)
+
+#define EXC_VIRT_END(name, start, size)				\
+	FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size)
+
+#define EXC_COMMON_BEGIN(name)					\
+	USE_TEXT_SECTION();					\
+	.balign IFETCH_ALIGN_BYTES;				\
+	.global name;						\
+	_ASM_NOKPROBE_SYMBOL(name);				\
+	DEFINE_FIXED_SYMBOL(name);				\
+name:
+
+#define TRAMP_REAL_BEGIN(name)					\
+	FIXED_SECTION_ENTRY_BEGIN(real_trampolines, name)
+
+#define TRAMP_VIRT_BEGIN(name)					\
+	FIXED_SECTION_ENTRY_BEGIN(virt_trampolines, name)
+
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#define TRAMP_KVM_BEGIN(name)					\
+	TRAMP_VIRT_BEGIN(name)
+#else
+#define TRAMP_KVM_BEGIN(name)
+#endif
+
+#define EXC_REAL_NONE(start, size)				\
+	FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##unused, start, size); \
+	FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##unused, start, size)
+
+#define EXC_VIRT_NONE(start, size)				\
+	FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size); \
+	FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size)
+
 /*
  * We're short on space and time in the exception prolog, so we can't
  * use the normal LOAD_REG_IMMEDIATE macro to load the address of label.

From def0db4f9ddc24ad7b37735c34a78e9c3c7978ef Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:44 +1000
Subject: [PATCH 163/264] powerpc/64s/exception: Add EXC_HV_OR_STD, which
 selects HSRR if HVMODE

Add EXC_HV_OR_STD and use it to consolidate the 0x500 external
interrupt.

Executed code is unchanged.
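
A C sketch of the selection EXC_HV_OR_STD encodes, assuming a boolean
stand-in for CPU_FTR_HVMODE and a fake SPR file; in the real code the
choice is patched into the instruction stream at boot by the feature
section machinery rather than tested at run time.

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  static bool cpu_has_hvmode = true;  /* stands in for CPU_FTR_HVMODE */
  static uint64_t spr_srr0, spr_srr1, spr_hsrr0, spr_hsrr1;

  struct srr_pair { uint64_t srr0, srr1; };

  static struct srr_pair read_exception_srrs(void)
  {
      struct srr_pair p;

      if (cpu_has_hvmode) {       /* BEGIN_FTR_SECTION branch */
          p.srr0 = spr_hsrr0;     /* mfspr SPRN_HSRR0/HSRR1 */
          p.srr1 = spr_hsrr1;
      } else {                    /* FTR_SECTION_ELSE branch */
          p.srr0 = spr_srr0;      /* mfspr SPRN_SRR0/SRR1 */
          p.srr1 = spr_srr1;
      }
      return p;
  }

  int main(void)
  {
      spr_hsrr0 = 0x500;          /* pretend-vector, for the demo */
      struct srr_pair p = read_exception_srrs();
      printf("srr0=%#llx srr1=%#llx\n",
             (unsigned long long)p.srr0, (unsigned long long)p.srr1);
      return 0;
  }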

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-20-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 102 +++++++++++++++++++++------
 1 file changed, 79 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index febeea021939..33bd99116ae4 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -109,6 +109,7 @@ name:
 	addis	reg,reg,(ABS_ADDR(label))@h
 
 /* Exception register prefixes */
+#define EXC_HV_OR_STD	2 /* depends on HVMODE */
 #define EXC_HV		1
 #define EXC_STD		0
 
@@ -205,7 +206,13 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 		.abort "Bad maskable vector"
 		.endif
 
-		.if \hsrr
+		.if \hsrr == EXC_HV_OR_STD
+		BEGIN_FTR_SECTION
+		bne	masked_Hinterrupt
+		FTR_SECTION_ELSE
+		bne	masked_interrupt
+		ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+		.elseif \hsrr
 		bne	masked_Hinterrupt
 		.else
 		bne	masked_interrupt
@@ -237,7 +244,17 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	.if ! \set_ri
 	xori	r10,r10,MSR_RI		/* Clear MSR_RI */
 	.endif
-	.if \hsrr
+	.if \hsrr == EXC_HV_OR_STD
+	BEGIN_FTR_SECTION
+	mfspr	r11,SPRN_HSRR0		/* save HSRR0 */
+	mfspr	r12,SPRN_HSRR1		/* and HSRR1 */
+	mtspr	SPRN_HSRR1,r10
+	FTR_SECTION_ELSE
+	mfspr	r11,SPRN_SRR0		/* save SRR0 */
+	mfspr	r12,SPRN_SRR1		/* and SRR1 */
+	mtspr	SPRN_SRR1,r10
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+	.elseif \hsrr
 	mfspr	r11,SPRN_HSRR0		/* save HSRR0 */
 	mfspr	r12,SPRN_HSRR1		/* and HSRR1 */
 	mtspr	SPRN_HSRR1,r10
@@ -247,7 +264,15 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	mtspr	SPRN_SRR1,r10
 	.endif
 	LOAD_HANDLER(r10, \label\())
-	.if \hsrr
+	.if \hsrr == EXC_HV_OR_STD
+	BEGIN_FTR_SECTION
+	mtspr	SPRN_HSRR0,r10
+	HRFI_TO_KERNEL
+	FTR_SECTION_ELSE
+	mtspr	SPRN_SRR0,r10
+	RFI_TO_KERNEL
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+	.elseif \hsrr
 	mtspr	SPRN_HSRR0,r10
 	HRFI_TO_KERNEL
 	.else
@@ -259,14 +284,26 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 
 .macro EXCEPTION_PROLOG_2_VIRT label, hsrr
 #ifdef CONFIG_RELOCATABLE
-	.if \hsrr
+	.if \hsrr == EXC_HV_OR_STD
+	BEGIN_FTR_SECTION
+	mfspr	r11,SPRN_HSRR0	/* save HSRR0 */
+	FTR_SECTION_ELSE
+	mfspr	r11,SPRN_SRR0	/* save SRR0 */
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+	.elseif \hsrr
 	mfspr	r11,SPRN_HSRR0	/* save HSRR0 */
 	.else
 	mfspr	r11,SPRN_SRR0	/* save SRR0 */
 	.endif
 	LOAD_HANDLER(r12, \label\())
 	mtctr	r12
-	.if \hsrr
+	.if \hsrr == EXC_HV_OR_STD
+	BEGIN_FTR_SECTION
+	mfspr	r12,SPRN_HSRR1	/* and HSRR1 */
+	FTR_SECTION_ELSE
+	mfspr	r12,SPRN_SRR1	/* and SRR1 */
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+	.elseif \hsrr
 	mfspr	r12,SPRN_HSRR1	/* and HSRR1 */
 	.else
 	mfspr	r12,SPRN_SRR1	/* and SRR1 */
@@ -275,7 +312,15 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	mtmsrd 	r10,1		/* Set RI (EE=0) */
 	bctr
 #else
-	.if \hsrr
+	.if \hsrr == EXC_HV_OR_STD
+	BEGIN_FTR_SECTION
+	mfspr	r11,SPRN_HSRR0		/* save HSRR0 */
+	mfspr	r12,SPRN_HSRR1		/* and HSRR1 */
+	FTR_SECTION_ELSE
+	mfspr	r11,SPRN_SRR0		/* save SRR0 */
+	mfspr	r12,SPRN_SRR1		/* and SRR1 */
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+	.elseif \hsrr
 	mfspr	r11,SPRN_HSRR0		/* save HSRR0 */
 	mfspr	r12,SPRN_HSRR1		/* and HSRR1 */
 	.else
@@ -316,7 +361,13 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 .macro KVMTEST hsrr, n
 	lbz	r10,HSTATE_IN_GUEST(r13)
 	cmpwi	r10,0
-	.if \hsrr
+	.if \hsrr == EXC_HV_OR_STD
+	BEGIN_FTR_SECTION
+	bne	do_kvm_H\n
+	FTR_SECTION_ELSE
+	bne	do_kvm_\n
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+	.elseif \hsrr
 	bne	do_kvm_H\n
 	.else
 	bne	do_kvm_\n
@@ -342,7 +393,13 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
 	std	r12,HSTATE_SCRATCH0(r13)
 	sldi	r12,r9,32
 	/* HSRR variants have the 0x2 bit added to their trap number */
-	.if \hsrr
+	.if \hsrr == EXC_HV_OR_STD
+	BEGIN_FTR_SECTION
+	ori	r12,r12,(\n + 0x2)
+	FTR_SECTION_ELSE
+	ori	r12,r12,(\n)
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+	.elseif \hsrr
 	ori	r12,r12,(\n + 0x2)
 	.else
 	ori	r12,r12,(\n)
@@ -370,7 +427,13 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
 89:	mtocrf	0x80,r9
 	ld	r9,\area+EX_R9(r13)
 	ld	r10,\area+EX_R10(r13)
-	.if \hsrr
+	.if \hsrr == EXC_HV_OR_STD
+	BEGIN_FTR_SECTION
+	b	kvmppc_skip_Hinterrupt
+	FTR_SECTION_ELSE
+	b	kvmppc_skip_interrupt
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+	.elseif \hsrr
 	b	kvmppc_skip_Hinterrupt
 	.else
 	b	kvmppc_skip_interrupt
@@ -469,6 +532,9 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);			   \
 .macro EXCEPTION_RESTORE_REGS hsrr
 	/* Move original SRR0 and SRR1 into the respective regs */
 	ld	r9,_MSR(r1)
+	.if \hsrr == EXC_HV_OR_STD
+	.error "EXC_HV_OR_STD Not implemented for EXCEPTION_RESTORE_REGS"
+	.endif
 	.if \hsrr
 	mtspr	SPRN_HSRR1,r9
 	.else
@@ -1363,24 +1429,14 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 
 EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100)
 	EXCEPTION_PROLOG_0 PACA_EXGEN
-BEGIN_FTR_SECTION
-	EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED
-	EXCEPTION_PROLOG_2_REAL hardware_interrupt_common, EXC_HV, 1
-FTR_SECTION_ELSE
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED
-	EXCEPTION_PROLOG_2_REAL hardware_interrupt_common, EXC_STD, 1
-ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+	EXCEPTION_PROLOG_1 EXC_HV_OR_STD, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED
+	EXCEPTION_PROLOG_2_REAL hardware_interrupt_common, EXC_HV_OR_STD, 1
 EXC_REAL_END(hardware_interrupt, 0x500, 0x100)
 
 EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100)
 	EXCEPTION_PROLOG_0 PACA_EXGEN
-BEGIN_FTR_SECTION
-	EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED
-	EXCEPTION_PROLOG_2_VIRT hardware_interrupt_common, EXC_HV
-FTR_SECTION_ELSE
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED
-	EXCEPTION_PROLOG_2_VIRT hardware_interrupt_common, EXC_STD
-ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
+	EXCEPTION_PROLOG_1 EXC_HV_OR_STD, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED
+	EXCEPTION_PROLOG_2_VIRT hardware_interrupt_common, EXC_HV_OR_STD
 EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100)
 
 TRAMP_KVM(PACA_EXGEN, 0x500)

From 9a7a0773d7d2fd1e4c581f42ad75de7872a174dc Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:45 +1000
Subject: [PATCH 164/264] powerpc/64s/exception: Fix performance monitor virt
 handler

The perf virt handler uses EXCEPTION_PROLOG_2_REAL rather than _VIRT.
In practice this is okay because the _REAL variant is usable by virt
mode interrupts, but it should be fixed, and doing so is a small
performance win: the _VIRT path reaches the handler with a plain
branch rather than the more expensive [h]rfid used by the _REAL path.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-21-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 33bd99116ae4..4b685d894ba1 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -750,7 +750,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 #define __TRAMP_VIRT_OOL_MASKABLE(name, realvec, bitmask)		\
 	TRAMP_VIRT_BEGIN(tramp_virt_##name);				\
 	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, realvec, 0, 0, bitmask ; \
-	EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1
+	EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD
 
 #define EXC_VIRT_OOL_MASKABLE(name, start, size, realvec, bitmask)	\
 	__EXC_VIRT_OOL_MASKABLE(name, start, size);			\

From 5ff79a5ea69f8f5d1131064af9f8c9a8b1bc8266 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:46 +1000
Subject: [PATCH 165/264] powerpc/64s/exception: remove 0xb00 handler

This vector is not used by any supported processor and has been
implemented as an unknown exception as far back as 2.6. There is
nothing special about 0xb00, so remove it like the other unused
vectors.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-22-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 4b685d894ba1..d9c953e419bf 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1564,10 +1564,8 @@ EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, unknown_exception)
 #endif
 
 
-EXC_REAL(trap_0b, 0xb00, 0x100)
-EXC_VIRT(trap_0b, 0x4b00, 0x100, 0xb00)
-TRAMP_KVM(PACA_EXGEN, 0xb00)
-EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
+EXC_REAL_NONE(0xb00, 0x100)
+EXC_VIRT_NONE(0x4b00, 0x100)
 
 /*
  * system call / hypercall (0xc00, 0x4c00)

From 7299417c8214ce09c0da01d7719c7b1c11503578 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:47 +1000
Subject: [PATCH 166/264] powerpc/64s/exception: Replace PROLOG macros and EXC
 helpers with a gas macro

This creates a single macro that generates the exception prolog code,
with the variant selected by arguments, replacing the assortment of
nested macros previously used for each combination.

The increasing length of the macro argument list is not nice to read
or modify, but this is a temporary state that will be improved in
later changes.

No generated code change except BUG line number constants and label
names.
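
The control flow the macro encodes can be sketched as a C function
with printf stand-ins for the emitted assembly; the structure mirrors
the .if/.elseif chain in the macro body below, while the names are
purely illustrative.

  #include <stdio.h>

  struct int_args {
      const char *name;
      unsigned int vec;
      int ool, early, virt;
  };

  static void emit(const struct int_args *a, const char *what)
  {
      printf("%s (0x%x): %s\n", a->name, a->vec, what);
  }

  static void int_handler(const struct int_args *a)
  {
      emit(a, "EXCEPTION_PROLOG_0");
      if (a->ool)
          emit(a, "branch to out-of-line trampoline");
      emit(a, "EXCEPTION_PROLOG_1");
      if (a->early == 2) {
          /* nothing more: continue in real, unrelocated mode */
      } else if (a->early) {
          emit(a, "branch to <name>_early_common (real mode)");
      } else if (!a->virt) {
          emit(a, "EXCEPTION_PROLOG_2_REAL -> <name>_common");
      } else {
          emit(a, "EXCEPTION_PROLOG_2_VIRT -> <name>_common");
      }
  }

  int main(void)
  {
      struct int_args mce = { "machine_check", 0x200, 0, 1, 0 };
      struct int_args dec = { "decrementer",   0x900, 1, 0, 0 };

      int_handler(&mce);
      int_handler(&dec);
      return 0;
  }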

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-23-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 502 +++++++++++----------------
 1 file changed, 206 insertions(+), 296 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index d9c953e419bf..a476bb2f80f3 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -43,6 +43,17 @@
 .endif
 #endif
 
+/*
+ * Following are fixed section helper macros.
+ *
+ * EXC_REAL_BEGIN/END  - real, unrelocated exception vectors
+ * EXC_VIRT_BEGIN/END  - virt (AIL), unrelocated exception vectors
+ * TRAMP_REAL_BEGIN    - real, unrelocated helpers (virt may call these)
+ * TRAMP_VIRT_BEGIN    - virt, unreloc helpers (in practice, real can use)
+ * TRAMP_KVM_BEGIN     - KVM handlers, these are put into real, unrelocated
+ * EXC_COMMON          - After switching to virtual, relocated mode.
+ */
+
 #define EXC_REAL_BEGIN(name, start, size)			\
 	FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##name, start, size)
 
@@ -589,196 +600,54 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 #endif
 
 /*
- * Following are the BOOK3S exception handler helper macros.
- * Handlers come in a number of types, and each type has a number of varieties.
+ * This is the BOOK3S interrupt entry code macro.
  *
- * EXC_REAL_*     - real, unrelocated exception vectors
- * EXC_VIRT_*     - virt (AIL), unrelocated exception vectors
- * TRAMP_REAL_*   - real, unrelocated helpers (virt can call these)
- * TRAMP_VIRT_*   - virt, unreloc helpers (in practice, real can use)
- * TRAMP_KVM      - KVM handlers that get put into real, unrelocated
- * EXC_COMMON     - virt, relocated common handlers
- *
- * The EXC handlers are given a name, and branch to name_common, or the
- * appropriate KVM or masking function. Vector handler verieties are as
- * follows:
- *
- * EXC_{REAL|VIRT}_BEGIN/END - used to open-code the exception
- *
- * EXC_{REAL|VIRT}  - standard exception
- *
- * EXC_{REAL|VIRT}_suffix
- *     where _suffix is:
- *   - _MASKABLE               - maskable exception
- *   - _OOL                    - out of line with trampoline to common handler
- *   - _HV                     - HV exception
- *
- * There can be combinations, e.g., EXC_VIRT_OOL_MASKABLE_HV
- *
- * KVM handlers come in the following verieties:
- * TRAMP_KVM
- * TRAMP_KVM_SKIP
- * TRAMP_KVM_HV
- * TRAMP_KVM_HV_SKIP
- *
- * COMMON handlers come in the following verieties:
- * EXC_COMMON_BEGIN/END - used to open-code the handler
- * EXC_COMMON
- * EXC_COMMON_ASYNC
- *
- * TRAMP_REAL and TRAMP_VIRT can be used with BEGIN/END. KVM
- * and OOL handlers are implemented as types of TRAMP and TRAMP_VIRT handlers.
+ * This can result in one of several things happening:
+ * - Branch to the _common handler, relocated, in virtual mode.
+ *   These are normal interrupts (synchronous and asynchronous) handled by
+ *   the kernel.
+ * - Branch to KVM, relocated but real mode interrupts remain in real mode.
+ *   These occur when HSTATE_IN_GUEST is set. The interrupt may be caused by
+ *   / intended for host or guest kernel, but KVM must always be involved
+ *   because the machine state is set for guest execution.
+ * - Branch to the masked handler, unrelocated.
+ *   These occur when maskable asynchronous interrupts are taken with the
+ *   irq_soft_mask set.
+ * - Branch to an "early" handler in real mode but relocated.
+ *   This is done if early=1. MCE and HMI use these to handle errors in real
+ *   mode.
+ * - Fall through and continue executing in real, unrelocated mode.
+ *   This is done if early=2.
  */
+.macro INT_HANDLER name, vec, ool, early, virt, hsrr, area, ri, dar, dsisr, bitmask, kvm
+	EXCEPTION_PROLOG_0 \area
+	.if \ool
+	.if !\virt
+	b	tramp_real_\name
+	.pushsection .text
+	TRAMP_REAL_BEGIN(tramp_real_\name)
+	.else
+	b	tramp_virt_\name
+	.pushsection .text
+	TRAMP_VIRT_BEGIN(tramp_virt_\name)
+	.endif
+	.endif
+	EXCEPTION_PROLOG_1 \hsrr, \area, \kvm, \vec, \dar, \dsisr, \bitmask
+	.if \early == 2
+	/* nothing more */
+	.elseif \early
+	mfctr	r10			/* save ctr, even for !RELOCATABLE */
+	BRANCH_TO_C000(r11, \name\()_early_common)
+	.elseif !\virt
+	EXCEPTION_PROLOG_2_REAL \name\()_common, \hsrr, \ri
+	.else
+	EXCEPTION_PROLOG_2_VIRT \name\()_common, \hsrr
+	.endif
+	.if \ool
+	.popsection
+	.endif
+.endm
 
-#define __EXC_REAL(name, start, size, area)				\
-	EXC_REAL_BEGIN(name, start, size);				\
-	EXCEPTION_PROLOG_0 area ;					\
-	EXCEPTION_PROLOG_1 EXC_STD, area, 1, start, 0, 0, 0 ;		\
-	EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 ;		\
-	EXC_REAL_END(name, start, size)
-
-#define EXC_REAL(name, start, size)					\
-	__EXC_REAL(name, start, size, PACA_EXGEN)
-
-#define __EXC_VIRT(name, start, size, realvec, area)			\
-	EXC_VIRT_BEGIN(name, start, size);				\
-	EXCEPTION_PROLOG_0 area ;					\
-	EXCEPTION_PROLOG_1 EXC_STD, area, 0, realvec, 0, 0, 0;		\
-	EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD ;		\
-	EXC_VIRT_END(name, start, size)
-
-#define EXC_VIRT(name, start, size, realvec)				\
-	__EXC_VIRT(name, start, size, realvec, PACA_EXGEN)
-
-#define EXC_REAL_MASKABLE(name, start, size, bitmask)			\
-	EXC_REAL_BEGIN(name, start, size);				\
-	EXCEPTION_PROLOG_0 PACA_EXGEN ;					\
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, start, 0, 0, bitmask ; \
-	EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 ;		\
-	EXC_REAL_END(name, start, size)
-
-#define EXC_VIRT_MASKABLE(name, start, size, realvec, bitmask)		\
-	EXC_VIRT_BEGIN(name, start, size);				\
-	EXCEPTION_PROLOG_0 PACA_EXGEN ;					\
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, realvec, 0, 0, bitmask ; \
-	EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD ;		\
-	EXC_VIRT_END(name, start, size)
-
-#define EXC_REAL_HV(name, start, size)					\
-	EXC_REAL_BEGIN(name, start, size);				\
-	EXCEPTION_PROLOG_0 PACA_EXGEN;					\
-	EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, start, 0, 0, 0 ;	\
-	EXCEPTION_PROLOG_2_REAL name##_common, EXC_HV, 1 ;		\
-	EXC_REAL_END(name, start, size)
-
-#define EXC_VIRT_HV(name, start, size, realvec)				\
-	EXC_VIRT_BEGIN(name, start, size);				\
-	EXCEPTION_PROLOG_0 PACA_EXGEN;					\
-	EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, realvec, 0, 0, 0 ;	\
-	EXCEPTION_PROLOG_2_VIRT name##_common, EXC_HV ;			\
-	EXC_VIRT_END(name, start, size)
-
-#define __EXC_REAL_OOL(name, start, size)				\
-	EXC_REAL_BEGIN(name, start, size);				\
-	EXCEPTION_PROLOG_0 PACA_EXGEN ;					\
-	b	tramp_real_##name ;					\
-	EXC_REAL_END(name, start, size)
-
-#define __TRAMP_REAL_OOL(name, vec)					\
-	TRAMP_REAL_BEGIN(tramp_real_##name);				\
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, vec, 0, 0, 0 ;	\
-	EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1
-
-#define EXC_REAL_OOL(name, start, size)					\
-	__EXC_REAL_OOL(name, start, size);				\
-	__TRAMP_REAL_OOL(name, start)
-
-#define __EXC_REAL_OOL_MASKABLE(name, start, size)			\
-	__EXC_REAL_OOL(name, start, size)
-
-#define __TRAMP_REAL_OOL_MASKABLE(name, vec, bitmask)			\
-	TRAMP_REAL_BEGIN(tramp_real_##name);				\
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, vec, 0, 0, bitmask ;	\
-	EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1
-
-#define EXC_REAL_OOL_MASKABLE(name, start, size, bitmask)		\
-	__EXC_REAL_OOL_MASKABLE(name, start, size);			\
-	__TRAMP_REAL_OOL_MASKABLE(name, start, bitmask)
-
-#define __EXC_REAL_OOL_HV(name, start, size)				\
-	__EXC_REAL_OOL(name, start, size)
-
-#define __TRAMP_REAL_OOL_HV(name, vec)					\
-	TRAMP_REAL_BEGIN(tramp_real_##name);				\
-	EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, vec, 0, 0, 0 ;	\
-	EXCEPTION_PROLOG_2_REAL name##_common, EXC_HV, 1
-
-#define EXC_REAL_OOL_HV(name, start, size)				\
-	__EXC_REAL_OOL_HV(name, start, size);				\
-	__TRAMP_REAL_OOL_HV(name, start)
-
-#define __EXC_REAL_OOL_MASKABLE_HV(name, start, size)			\
-	__EXC_REAL_OOL(name, start, size)
-
-#define __TRAMP_REAL_OOL_MASKABLE_HV(name, vec, bitmask)		\
-	TRAMP_REAL_BEGIN(tramp_real_##name);				\
-	EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, vec, 0, 0, bitmask ;	\
-	EXCEPTION_PROLOG_2_REAL name##_common, EXC_HV, 1
-
-#define EXC_REAL_OOL_MASKABLE_HV(name, start, size, bitmask)		\
-	__EXC_REAL_OOL_MASKABLE_HV(name, start, size);			\
-	__TRAMP_REAL_OOL_MASKABLE_HV(name, start, bitmask)
-
-#define __EXC_VIRT_OOL(name, start, size)				\
-	EXC_VIRT_BEGIN(name, start, size);				\
-	EXCEPTION_PROLOG_0 PACA_EXGEN ;					\
-	b	tramp_virt_##name;					\
-	EXC_VIRT_END(name, start, size)
-
-#define __TRAMP_VIRT_OOL(name, realvec)					\
-	TRAMP_VIRT_BEGIN(tramp_virt_##name);				\
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, vec, 0, 0, 0 ;	\
-	EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD
-
-#define EXC_VIRT_OOL(name, start, size, realvec)			\
-	__EXC_VIRT_OOL(name, start, size);				\
-	__TRAMP_VIRT_OOL(name, realvec)
-
-#define __EXC_VIRT_OOL_MASKABLE(name, start, size)			\
-	__EXC_VIRT_OOL(name, start, size)
-
-#define __TRAMP_VIRT_OOL_MASKABLE(name, realvec, bitmask)		\
-	TRAMP_VIRT_BEGIN(tramp_virt_##name);				\
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, realvec, 0, 0, bitmask ; \
-	EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD
-
-#define EXC_VIRT_OOL_MASKABLE(name, start, size, realvec, bitmask)	\
-	__EXC_VIRT_OOL_MASKABLE(name, start, size);			\
-	__TRAMP_VIRT_OOL_MASKABLE(name, realvec, bitmask)
-
-#define __EXC_VIRT_OOL_HV(name, start, size)				\
-	__EXC_VIRT_OOL(name, start, size)
-
-#define __TRAMP_VIRT_OOL_HV(name, realvec)				\
-	TRAMP_VIRT_BEGIN(tramp_virt_##name);				\
-	EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, realvec, 0, 0, 0 ;	\
-	EXCEPTION_PROLOG_2_VIRT name##_common, EXC_HV
-
-#define EXC_VIRT_OOL_HV(name, start, size, realvec)			\
-	__EXC_VIRT_OOL_HV(name, start, size);				\
-	__TRAMP_VIRT_OOL_HV(name, realvec)
-
-#define __EXC_VIRT_OOL_MASKABLE_HV(name, start, size)			\
-	__EXC_VIRT_OOL(name, start, size)
-
-#define __TRAMP_VIRT_OOL_MASKABLE_HV(name, realvec, bitmask)		\
-	TRAMP_VIRT_BEGIN(tramp_virt_##name);				\
-	EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, realvec, 0, 0, bitmask ; \
-	EXCEPTION_PROLOG_2_VIRT name##_common, EXC_HV
-
-#define EXC_VIRT_OOL_MASKABLE_HV(name, start, size, realvec, bitmask)	\
-	__EXC_VIRT_OOL_MASKABLE_HV(name, start, size);			\
-	__TRAMP_VIRT_OOL_MASKABLE_HV(name, realvec, bitmask)
 
 #define TRAMP_KVM(area, n)						\
 	TRAMP_KVM_BEGIN(do_kvm_##n);					\
@@ -943,9 +812,7 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #endif
 
-	EXCEPTION_PROLOG_0 PACA_EXNMI
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXNMI, 1, 0x100, 0, 0, 0
-	EXCEPTION_PROLOG_2_REAL system_reset_common, EXC_STD, 0
+	INT_HANDLER system_reset, 0x100, 0, 0, 0, EXC_STD, PACA_EXNMI, 0, 0, 0, 0, 1
 	/*
 	 * MSR_RI is not enabled, because PACA_EXNMI and nmi stack is
 	 * being used, so a nested NMI exception would corrupt it.
@@ -975,9 +842,7 @@ TRAMP_REAL_BEGIN(system_reset_idle_wake)
  */
 TRAMP_REAL_BEGIN(system_reset_fwnmi)
 	/* See comment at system_reset exception, don't turn on RI */
-	EXCEPTION_PROLOG_0 PACA_EXNMI
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXNMI, 0, 0x100, 0, 0, 0
-	EXCEPTION_PROLOG_2_REAL system_reset_common, EXC_STD, 0
+	INT_HANDLER system_reset, 0x100, 0, 0, 0, EXC_STD, PACA_EXNMI, 0, 0, 0, 0, 0
 
 #endif /* CONFIG_PPC_PSERIES */
 
@@ -1040,10 +905,7 @@ EXC_COMMON_BEGIN(system_reset_common)
 
 
 EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
-	EXCEPTION_PROLOG_0 PACA_EXMC
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 1, 1, 0
-	mfctr	r10			/* save ctr, even for !RELOCATABLE */
-	BRANCH_TO_C000(r11, machine_check_early_common)
+	INT_HANDLER machine_check, 0x200, 0, 1, 0, EXC_STD, PACA_EXMC, 0, 1, 1, 0, 0
 	/*
 	 * MSR_RI is not enabled, because PACA_EXMC is being used, so a
 	 * nested machine check corrupts it. machine_check_common enables
@@ -1055,10 +917,7 @@ EXC_VIRT_NONE(0x4200, 0x100)
 #ifdef CONFIG_PPC_PSERIES
 TRAMP_REAL_BEGIN(machine_check_fwnmi)
 	/* See comment at machine_check exception, don't turn on RI */
-	EXCEPTION_PROLOG_0 PACA_EXMC
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 1, 1, 0
-	mfctr	r10		/* save ctr */
-	BRANCH_TO_C000(r11, machine_check_early_common)
+	INT_HANDLER machine_check, 0x200, 0, 1, 0, EXC_STD, PACA_EXMC, 0, 1, 1, 0, 0
 #endif
 
 TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
@@ -1209,9 +1068,7 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 	MACHINE_CHECK_HANDLER_WINDUP
 	/* See comment at machine_check exception, don't turn on RI */
-	EXCEPTION_PROLOG_0 PACA_EXMC
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
-	EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0
+	INT_HANDLER machine_check, 0x200, 0, 0, 0, EXC_STD, PACA_EXMC, 0, 1, 1, 0, 1
 
 EXC_COMMON_BEGIN(machine_check_common)
 	/*
@@ -1297,18 +1154,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 
 
 EXC_REAL_BEGIN(data_access, 0x300, 0x80)
-	EXCEPTION_PROLOG_0 PACA_EXGEN
-	b	tramp_real_data_access
+	INT_HANDLER data_access, 0x300, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 1, 1, 0, 1
 EXC_REAL_END(data_access, 0x300, 0x80)
-
-TRAMP_REAL_BEGIN(tramp_real_data_access)
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x300, 1, 1, 0
-	EXCEPTION_PROLOG_2_REAL data_access_common, EXC_STD, 1
-
 EXC_VIRT_BEGIN(data_access, 0x4300, 0x80)
-	EXCEPTION_PROLOG_0 PACA_EXGEN
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, 0x300, 1, 1, 0
-EXCEPTION_PROLOG_2_VIRT data_access_common, EXC_STD
+	INT_HANDLER data_access, 0x300, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 1, 1, 0, 0
 EXC_VIRT_END(data_access, 0x4300, 0x80)
 
 TRAMP_KVM_SKIP(PACA_EXGEN, 0x300)
@@ -1336,18 +1185,10 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 
 
 EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80)
-	EXCEPTION_PROLOG_0 PACA_EXSLB
-	b	tramp_real_data_access_slb
+	INT_HANDLER data_access_slb, 0x380, 1, 0, 0, 0, PACA_EXSLB, 1, 1, 0, 0, 1
 EXC_REAL_END(data_access_slb, 0x380, 0x80)
-
-TRAMP_REAL_BEGIN(tramp_real_data_access_slb)
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXSLB, 1, 0x380, 1, 0, 0
-	EXCEPTION_PROLOG_2_REAL data_access_slb_common, EXC_STD, 1
-
 EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80)
-	EXCEPTION_PROLOG_0 PACA_EXSLB
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXSLB, 0, 0x380, 1, 0, 0
-	EXCEPTION_PROLOG_2_VIRT data_access_slb_common, EXC_STD
+	INT_HANDLER data_access_slb, 0x380, 0, 0, 1, 0, PACA_EXSLB, 1, 1, 0, 0, 0
 EXC_VIRT_END(data_access_slb, 0x4380, 0x80)
 
 TRAMP_KVM_SKIP(PACA_EXSLB, 0x380)
@@ -1378,8 +1219,13 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 	b	ret_from_except
 
 
-EXC_REAL(instruction_access, 0x400, 0x80)
-EXC_VIRT(instruction_access, 0x4400, 0x80, 0x400)
+EXC_REAL_BEGIN(instruction_access, 0x400, 0x80)
+	INT_HANDLER instruction_access, 0x400, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_REAL_END(instruction_access, 0x400, 0x80)
+EXC_VIRT_BEGIN(instruction_access, 0x4400, 0x80)
+	INT_HANDLER instruction_access, 0x400, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0
+EXC_VIRT_END(instruction_access, 0x4400, 0x80)
+
 TRAMP_KVM(PACA_EXGEN, 0x400)
 
 EXC_COMMON_BEGIN(instruction_access_common)
@@ -1398,8 +1244,12 @@ MMU_FTR_SECTION_ELSE
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 
 
-__EXC_REAL(instruction_access_slb, 0x480, 0x80, PACA_EXSLB)
-__EXC_VIRT(instruction_access_slb, 0x4480, 0x80, 0x480, PACA_EXSLB)
+EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80)
+	INT_HANDLER instruction_access_slb, 0x480, 0, 0, 0, EXC_STD, PACA_EXSLB, 1, 0, 0, 0, 1
+EXC_REAL_END(instruction_access_slb, 0x480, 0x80)
+EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80)
+	INT_HANDLER instruction_access_slb, 0x480, 0, 0, 1, EXC_STD, PACA_EXSLB, 1, 0, 0, 0, 0
+EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80)
 TRAMP_KVM(PACA_EXSLB, 0x480)
 
 EXC_COMMON_BEGIN(instruction_access_slb_common)
@@ -1426,17 +1276,11 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 	bl	do_bad_slb_fault
 	b	ret_from_except
 
-
 EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100)
-	EXCEPTION_PROLOG_0 PACA_EXGEN
-	EXCEPTION_PROLOG_1 EXC_HV_OR_STD, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED
-	EXCEPTION_PROLOG_2_REAL hardware_interrupt_common, EXC_HV_OR_STD, 1
+	INT_HANDLER hardware_interrupt, 0x500, 0, 0, 0, EXC_HV_OR_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1
 EXC_REAL_END(hardware_interrupt, 0x500, 0x100)
-
 EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100)
-	EXCEPTION_PROLOG_0 PACA_EXGEN
-	EXCEPTION_PROLOG_1 EXC_HV_OR_STD, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED
-	EXCEPTION_PROLOG_2_VIRT hardware_interrupt_common, EXC_HV_OR_STD
+	INT_HANDLER hardware_interrupt, 0x500, 0, 0, 1, EXC_HV_OR_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1
 EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100)
 
 TRAMP_KVM(PACA_EXGEN, 0x500)
@@ -1445,15 +1289,10 @@ EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ)
 
 
 EXC_REAL_BEGIN(alignment, 0x600, 0x100)
-	EXCEPTION_PROLOG_0 PACA_EXGEN
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x600, 1, 1, 0
-	EXCEPTION_PROLOG_2_REAL alignment_common, EXC_STD, 1
+	INT_HANDLER alignment, 0x600, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 1, 1, 0, 1
 EXC_REAL_END(alignment, 0x600, 0x100)
-
 EXC_VIRT_BEGIN(alignment, 0x4600, 0x100)
-	EXCEPTION_PROLOG_0 PACA_EXGEN
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, 0x600, 1, 1, 0
-	EXCEPTION_PROLOG_2_VIRT alignment_common, EXC_STD
+	INT_HANDLER alignment, 0x600, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 1, 1, 0, 0
 EXC_VIRT_END(alignment, 0x4600, 0x100)
 
 TRAMP_KVM(PACA_EXGEN, 0x600)
@@ -1470,8 +1309,12 @@ EXC_COMMON_BEGIN(alignment_common)
 	b	ret_from_except
 
 
-EXC_REAL(program_check, 0x700, 0x100)
-EXC_VIRT(program_check, 0x4700, 0x100, 0x700)
+EXC_REAL_BEGIN(program_check, 0x700, 0x100)
+	INT_HANDLER program_check, 0x700, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_REAL_END(program_check, 0x700, 0x100)
+EXC_VIRT_BEGIN(program_check, 0x4700, 0x100)
+	INT_HANDLER program_check, 0x700, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0
+EXC_VIRT_END(program_check, 0x4700, 0x100)
 TRAMP_KVM(PACA_EXGEN, 0x700)
 EXC_COMMON_BEGIN(program_check_common)
 	/*
@@ -1508,8 +1351,12 @@ EXC_COMMON_BEGIN(program_check_common)
 	b	ret_from_except
 
 
-EXC_REAL(fp_unavailable, 0x800, 0x100)
-EXC_VIRT(fp_unavailable, 0x4800, 0x100, 0x800)
+EXC_REAL_BEGIN(fp_unavailable, 0x800, 0x100)
+	INT_HANDLER fp_unavailable, 0x800, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_REAL_END(fp_unavailable, 0x800, 0x100)
+EXC_VIRT_BEGIN(fp_unavailable, 0x4800, 0x100)
+	INT_HANDLER fp_unavailable, 0x800, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0
+EXC_VIRT_END(fp_unavailable, 0x4800, 0x100)
 TRAMP_KVM(PACA_EXGEN, 0x800)
 EXC_COMMON_BEGIN(fp_unavailable_common)
 	EXCEPTION_COMMON(PACA_EXGEN, 0x800)
@@ -1542,20 +1389,32 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
 
 
-EXC_REAL_OOL_MASKABLE(decrementer, 0x900, 0x80, IRQS_DISABLED)
-EXC_VIRT_MASKABLE(decrementer, 0x4900, 0x80, 0x900, IRQS_DISABLED)
+EXC_REAL_BEGIN(decrementer, 0x900, 0x80)
+	INT_HANDLER decrementer, 0x900, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1
+EXC_REAL_END(decrementer, 0x900, 0x80)
+EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80)
+	INT_HANDLER decrementer, 0x900, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 0
+EXC_VIRT_END(decrementer, 0x4900, 0x80)
 TRAMP_KVM(PACA_EXGEN, 0x900)
 EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt)
 
 
-EXC_REAL_HV(hdecrementer, 0x980, 0x80)
-EXC_VIRT_HV(hdecrementer, 0x4980, 0x80, 0x980)
+EXC_REAL_BEGIN(hdecrementer, 0x980, 0x80)
+	INT_HANDLER hdecrementer, 0x980, 0, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_REAL_END(hdecrementer, 0x980, 0x80)
+EXC_VIRT_BEGIN(hdecrementer, 0x4980, 0x80)
+	INT_HANDLER hdecrementer, 0x980, 0, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_VIRT_END(hdecrementer, 0x4980, 0x80)
 TRAMP_KVM_HV(PACA_EXGEN, 0x980)
 EXC_COMMON(hdecrementer_common, 0x980, hdec_interrupt)
 
 
-EXC_REAL_MASKABLE(doorbell_super, 0xa00, 0x100, IRQS_DISABLED)
-EXC_VIRT_MASKABLE(doorbell_super, 0x4a00, 0x100, 0xa00, IRQS_DISABLED)
+EXC_REAL_BEGIN(doorbell_super, 0xa00, 0x100)
+	INT_HANDLER doorbell_super, 0xa00, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1
+EXC_REAL_END(doorbell_super, 0xa00, 0x100)
+EXC_VIRT_BEGIN(doorbell_super, 0x4a00, 0x100)
+	INT_HANDLER doorbell_super, 0xa00, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 0
+EXC_VIRT_END(doorbell_super, 0x4a00, 0x100)
 TRAMP_KVM(PACA_EXGEN, 0xa00)
 #ifdef CONFIG_PPC_DOORBELL
 EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, doorbell_exception)
@@ -1669,7 +1528,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
 EXC_REAL_BEGIN(system_call, 0xc00, 0x100)
 	SYSTEM_CALL 0
 EXC_REAL_END(system_call, 0xc00, 0x100)
-
 EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100)
 	SYSTEM_CALL 1
 EXC_VIRT_END(system_call, 0x4c00, 0x100)
@@ -1699,13 +1557,22 @@ TRAMP_KVM_BEGIN(do_kvm_0xc00)
 #endif
 
 
-EXC_REAL(single_step, 0xd00, 0x100)
-EXC_VIRT(single_step, 0x4d00, 0x100, 0xd00)
+EXC_REAL_BEGIN(single_step, 0xd00, 0x100)
+	INT_HANDLER single_step, 0xd00, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_REAL_END(single_step, 0xd00, 0x100)
+EXC_VIRT_BEGIN(single_step, 0x4d00, 0x100)
+	INT_HANDLER single_step, 0xd00, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0
+EXC_VIRT_END(single_step, 0x4d00, 0x100)
 TRAMP_KVM(PACA_EXGEN, 0xd00)
 EXC_COMMON(single_step_common, 0xd00, single_step_exception)
 
-EXC_REAL_OOL_HV(h_data_storage, 0xe00, 0x20)
-EXC_VIRT_OOL_HV(h_data_storage, 0x4e00, 0x20, 0xe00)
+
+EXC_REAL_BEGIN(h_data_storage, 0xe00, 0x20)
+	INT_HANDLER h_data_storage, 0xe00, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_REAL_END(h_data_storage, 0xe00, 0x20)
+EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20)
+	INT_HANDLER h_data_storage, 0xe00, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_VIRT_END(h_data_storage, 0x4e00, 0x20)
 TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00)
 EXC_COMMON_BEGIN(h_data_storage_common)
 	mfspr   r10,SPRN_HDAR
@@ -1729,14 +1596,22 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_TYPE_RADIX)
 	b       ret_from_except
 
 
-EXC_REAL_OOL_HV(h_instr_storage, 0xe20, 0x20)
-EXC_VIRT_OOL_HV(h_instr_storage, 0x4e20, 0x20, 0xe20)
+EXC_REAL_BEGIN(h_instr_storage, 0xe20, 0x20)
+	INT_HANDLER h_instr_storage, 0xe20, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_REAL_END(h_instr_storage, 0xe20, 0x20)
+EXC_VIRT_BEGIN(h_instr_storage, 0x4e20, 0x20)
+	INT_HANDLER h_instr_storage, 0xe20, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_VIRT_END(h_instr_storage, 0x4e20, 0x20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xe20)
 EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception)
 
 
-EXC_REAL_OOL_HV(emulation_assist, 0xe40, 0x20)
-EXC_VIRT_OOL_HV(emulation_assist, 0x4e40, 0x20, 0xe40)
+EXC_REAL_BEGIN(emulation_assist, 0xe40, 0x20)
+	INT_HANDLER emulation_assist, 0xe40, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_REAL_END(emulation_assist, 0xe40, 0x20)
+EXC_VIRT_BEGIN(emulation_assist, 0x4e40, 0x20)
+	INT_HANDLER emulation_assist, 0xe40, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_VIRT_END(emulation_assist, 0x4e40, 0x20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xe40)
 EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt)
 
@@ -1747,15 +1622,10 @@ EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt)
  * mode.
  */
 EXC_REAL_BEGIN(hmi_exception, 0xe60, 0x20)
-	EXCEPTION_PROLOG_0 PACA_EXGEN
-	b	hmi_exception_early
+	INT_HANDLER hmi_exception, 0xe60, 1, 1, 0, EXC_HV, PACA_EXGEN, 0, 0, 0, 0, 1
 EXC_REAL_END(hmi_exception, 0xe60, 0x20)
 EXC_VIRT_NONE(0x4e60, 0x20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xe60)
-TRAMP_REAL_BEGIN(hmi_exception_early)
-	EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0xe60, 0, 0, 0
-	mfctr	r10			/* save ctr, even for !RELOCATABLE */
-	BRANCH_TO_C000(r11, hmi_exception_early_common)
 
 EXC_COMMON_BEGIN(hmi_exception_early_common)
 	mtctr	r10			/* Restore ctr */
@@ -1782,9 +1652,7 @@ EXC_COMMON_BEGIN(hmi_exception_early_common)
 	 * firmware.
 	 */
 	EXCEPTION_RESTORE_REGS EXC_HV
-	EXCEPTION_PROLOG_0 PACA_EXGEN
-	EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0xe60, 0, 0, IRQS_DISABLED
-	EXCEPTION_PROLOG_2_REAL hmi_exception_common, EXC_HV, 1
+	INT_HANDLER hmi_exception, 0xe60, 0, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1
 
 EXC_COMMON_BEGIN(hmi_exception_common)
 	EXCEPTION_COMMON(PACA_EXGEN, 0xe60)
@@ -1796,8 +1664,13 @@ EXC_COMMON_BEGIN(hmi_exception_common)
 	bl	handle_hmi_exception
 	b	ret_from_except
 
-EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0x20, IRQS_DISABLED)
-EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x20, 0xe80, IRQS_DISABLED)
+
+EXC_REAL_BEGIN(h_doorbell, 0xe80, 0x20)
+	INT_HANDLER h_doorbell, 0xe80, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1
+EXC_REAL_END(h_doorbell, 0xe80, 0x20)
+EXC_VIRT_BEGIN(h_doorbell, 0x4e80, 0x20)
+	INT_HANDLER h_doorbell, 0xe80, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1
+EXC_VIRT_END(h_doorbell, 0x4e80, 0x20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xe80)
 #ifdef CONFIG_PPC_DOORBELL
 EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, doorbell_exception)
@@ -1806,8 +1679,12 @@ EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, unknown_exception)
 #endif
 
 
-EXC_REAL_OOL_MASKABLE_HV(h_virt_irq, 0xea0, 0x20, IRQS_DISABLED)
-EXC_VIRT_OOL_MASKABLE_HV(h_virt_irq, 0x4ea0, 0x20, 0xea0, IRQS_DISABLED)
+EXC_REAL_BEGIN(h_virt_irq, 0xea0, 0x20)
+	INT_HANDLER h_virt_irq, 0xea0, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1
+EXC_REAL_END(h_virt_irq, 0xea0, 0x20)
+EXC_VIRT_BEGIN(h_virt_irq, 0x4ea0, 0x20)
+	INT_HANDLER h_virt_irq, 0xea0, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1
+EXC_VIRT_END(h_virt_irq, 0x4ea0, 0x20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xea0)
 EXC_COMMON_ASYNC(h_virt_irq_common, 0xea0, do_IRQ)
 
@@ -1818,14 +1695,22 @@ EXC_REAL_NONE(0xee0, 0x20)
 EXC_VIRT_NONE(0x4ee0, 0x20)
 
 
-EXC_REAL_OOL_MASKABLE(performance_monitor, 0xf00, 0x20, IRQS_PMI_DISABLED)
-EXC_VIRT_OOL_MASKABLE(performance_monitor, 0x4f00, 0x20, 0xf00, IRQS_PMI_DISABLED)
+EXC_REAL_BEGIN(performance_monitor, 0xf00, 0x20)
+	INT_HANDLER performance_monitor, 0xf00, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_PMI_DISABLED, 1
+EXC_REAL_END(performance_monitor, 0xf00, 0x20)
+EXC_VIRT_BEGIN(performance_monitor, 0x4f00, 0x20)
+	INT_HANDLER performance_monitor, 0xf00, 1, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_PMI_DISABLED, 0
+EXC_VIRT_END(performance_monitor, 0x4f00, 0x20)
 TRAMP_KVM(PACA_EXGEN, 0xf00)
 EXC_COMMON_ASYNC(performance_monitor_common, 0xf00, performance_monitor_exception)
 
 
-EXC_REAL_OOL(altivec_unavailable, 0xf20, 0x20)
-EXC_VIRT_OOL(altivec_unavailable, 0x4f20, 0x20, 0xf20)
+EXC_REAL_BEGIN(altivec_unavailable, 0xf20, 0x20)
+	INT_HANDLER altivec_unavailable, 0xf20, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_REAL_END(altivec_unavailable, 0xf20, 0x20)
+EXC_VIRT_BEGIN(altivec_unavailable, 0x4f20, 0x20)
+	INT_HANDLER altivec_unavailable, 0xf20, 1, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0
+EXC_VIRT_END(altivec_unavailable, 0x4f20, 0x20)
 TRAMP_KVM(PACA_EXGEN, 0xf20)
 EXC_COMMON_BEGIN(altivec_unavailable_common)
 	EXCEPTION_COMMON(PACA_EXGEN, 0xf20)
@@ -1861,8 +1746,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	b	ret_from_except
 
 
-EXC_REAL_OOL(vsx_unavailable, 0xf40, 0x20)
-EXC_VIRT_OOL(vsx_unavailable, 0x4f40, 0x20, 0xf40)
+EXC_REAL_BEGIN(vsx_unavailable, 0xf40, 0x20)
+	INT_HANDLER vsx_unavailable, 0xf40, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_REAL_END(vsx_unavailable, 0xf40, 0x20)
+EXC_VIRT_BEGIN(vsx_unavailable, 0x4f40, 0x20)
+	INT_HANDLER vsx_unavailable, 0xf40, 1, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0
+EXC_VIRT_END(vsx_unavailable, 0x4f40, 0x20)
 TRAMP_KVM(PACA_EXGEN, 0xf40)
 EXC_COMMON_BEGIN(vsx_unavailable_common)
 	EXCEPTION_COMMON(PACA_EXGEN, 0xf40)
@@ -1897,14 +1786,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	b	ret_from_except
 
 
-EXC_REAL_OOL(facility_unavailable, 0xf60, 0x20)
-EXC_VIRT_OOL(facility_unavailable, 0x4f60, 0x20, 0xf60)
+EXC_REAL_BEGIN(facility_unavailable, 0xf60, 0x20)
+	INT_HANDLER facility_unavailable, 0xf60, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_REAL_END(facility_unavailable, 0xf60, 0x20)
+EXC_VIRT_BEGIN(facility_unavailable, 0x4f60, 0x20)
+	INT_HANDLER facility_unavailable, 0xf60, 1, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0
+EXC_VIRT_END(facility_unavailable, 0x4f60, 0x20)
 TRAMP_KVM(PACA_EXGEN, 0xf60)
 EXC_COMMON(facility_unavailable_common, 0xf60, facility_unavailable_exception)
 
 
-EXC_REAL_OOL_HV(h_facility_unavailable, 0xf80, 0x20)
-EXC_VIRT_OOL_HV(h_facility_unavailable, 0x4f80, 0x20, 0xf80)
+EXC_REAL_BEGIN(h_facility_unavailable, 0xf80, 0x20)
+	INT_HANDLER h_facility_unavailable, 0xf80, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_REAL_END(h_facility_unavailable, 0xf80, 0x20)
+EXC_VIRT_BEGIN(h_facility_unavailable, 0x4f80, 0x20)
+	INT_HANDLER h_facility_unavailable, 0xf80, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_VIRT_END(h_facility_unavailable, 0x4f80, 0x20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xf80)
 EXC_COMMON(h_facility_unavailable_common, 0xf80, facility_unavailable_exception)
 
@@ -1922,7 +1819,9 @@ EXC_REAL_NONE(0x1100, 0x100)
 EXC_VIRT_NONE(0x5100, 0x100)
 
 #ifdef CONFIG_CBE_RAS
-EXC_REAL_HV(cbe_system_error, 0x1200, 0x100)
+EXC_REAL_BEGIN(cbe_system_error, 0x1200, 0x100)
+	INT_HANDLER cbe_system_error, 0x1200, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_REAL_END(cbe_system_error, 0x1200, 0x100)
 EXC_VIRT_NONE(0x5200, 0x100)
 TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1200)
 EXC_COMMON(cbe_system_error_common, 0x1200, cbe_system_error_exception)
@@ -1932,24 +1831,26 @@ EXC_VIRT_NONE(0x5200, 0x100)
 #endif
 
 
-EXC_REAL(instruction_breakpoint, 0x1300, 0x100)
-EXC_VIRT(instruction_breakpoint, 0x5300, 0x100, 0x1300)
+EXC_REAL_BEGIN(instruction_breakpoint, 0x1300, 0x100)
+	INT_HANDLER instruction_breakpoint, 0x1300, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_REAL_END(instruction_breakpoint, 0x1300, 0x100)
+EXC_VIRT_BEGIN(instruction_breakpoint, 0x5300, 0x100)
+	INT_HANDLER instruction_breakpoint, 0x1300, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0
+EXC_VIRT_END(instruction_breakpoint, 0x5300, 0x100)
 TRAMP_KVM_SKIP(PACA_EXGEN, 0x1300)
 EXC_COMMON(instruction_breakpoint_common, 0x1300, instruction_breakpoint_exception)
 
+
 EXC_REAL_NONE(0x1400, 0x100)
 EXC_VIRT_NONE(0x5400, 0x100)
 
 EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x100)
-	EXCEPTION_PROLOG_0 PACA_EXGEN
-	EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 0, 0x1500, 0, 0, 0
-
+	INT_HANDLER denorm_exception_hv, 0x1500, 0, 2, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 0
 #ifdef CONFIG_PPC_DENORMALISATION
 	mfspr	r10,SPRN_HSRR1
 	andis.	r10,r10,(HSRR1_DENORM)@h /* denorm? */
 	bne+	denorm_assist
 #endif
-
 	KVMTEST EXC_HV 0x1500
 	EXCEPTION_PROLOG_2_REAL denorm_common, EXC_HV, 1
 EXC_REAL_END(denorm_exception_hv, 0x1500, 0x100)
@@ -2037,7 +1938,9 @@ EXC_COMMON(denorm_common, 0x1500, unknown_exception)
 
 
 #ifdef CONFIG_CBE_RAS
-EXC_REAL_HV(cbe_maintenance, 0x1600, 0x100)
+EXC_REAL_BEGIN(cbe_maintenance, 0x1600, 0x100)
+	INT_HANDLER cbe_maintenance, 0x1600, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_REAL_END(cbe_maintenance, 0x1600, 0x100)
 EXC_VIRT_NONE(0x5600, 0x100)
 TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1600)
 EXC_COMMON(cbe_maintenance_common, 0x1600, cbe_maintenance_exception)
@@ -2047,8 +1950,12 @@ EXC_VIRT_NONE(0x5600, 0x100)
 #endif
 
 
-EXC_REAL(altivec_assist, 0x1700, 0x100)
-EXC_VIRT(altivec_assist, 0x5700, 0x100, 0x1700)
+EXC_REAL_BEGIN(altivec_assist, 0x1700, 0x100)
+	INT_HANDLER altivec_assist, 0x1700, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_REAL_END(altivec_assist, 0x1700, 0x100)
+EXC_VIRT_BEGIN(altivec_assist, 0x5700, 0x100)
+	INT_HANDLER altivec_assist, 0x1700, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0
+EXC_VIRT_END(altivec_assist, 0x5700, 0x100)
 TRAMP_KVM(PACA_EXGEN, 0x1700)
 #ifdef CONFIG_ALTIVEC
 EXC_COMMON(altivec_assist_common, 0x1700, altivec_assist_exception)
@@ -2058,7 +1965,9 @@ EXC_COMMON(altivec_assist_common, 0x1700, unknown_exception)
 
 
 #ifdef CONFIG_CBE_RAS
-EXC_REAL_HV(cbe_thermal, 0x1800, 0x100)
+EXC_REAL_BEGIN(cbe_thermal, 0x1800, 0x100)
+	INT_HANDLER cbe_thermal, 0x1800, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+EXC_REAL_END(cbe_thermal, 0x1800, 0x100)
 EXC_VIRT_NONE(0x5800, 0x100)
 TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1800)
 EXC_COMMON(cbe_thermal_common, 0x1800, cbe_thermal_exception)
@@ -2067,6 +1976,7 @@ EXC_REAL_NONE(0x1800, 0x100)
 EXC_VIRT_NONE(0x5800, 0x100)
 #endif
 
+
 #ifdef CONFIG_PPC_WATCHDOG
 
 #define MASKED_DEC_HANDLER_LABEL 3f

From 9b40f62b8a49f0c922eb4b6cf6502f1307745c98 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 29 Aug 2019 23:36:08 +1000
Subject: [PATCH 167/264] powerpc/64s/exceptions: Use keyword params to shorten
 arg lists

The argument lists for the INT_HANDLER macro are getting a bit
unwieldy. Use keyword parameters with default values to shorten them.
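
C has no keyword arguments, but the nearest idiom, designated
initializers over a defaults-carrying compound literal, shows the
same readability win: call sites name only what differs from the
defaults. A sketch, not kernel code.

  #include <stdio.h>

  struct int_opts {
      int ool, early, virt, ri, kvm;
  };

  /* Defaults first; later designators at the call site override
   * earlier ones, which is well-defined in C99. */
  #define INT_OPTS(...)  ((struct int_opts){ .ri = 1, __VA_ARGS__ })

  static void describe(const char *name, struct int_opts o)
  {
      printf("%s: ool=%d early=%d virt=%d ri=%d kvm=%d\n",
             name, o.ool, o.early, o.virt, o.ri, o.kvm);
  }

  int main(void)
  {
      /* Compare the positional form "0x100, 0, 0, 0, EXC_STD,
       * PACA_EXNMI, 0, 0, 0, 0, 1" with naming only non-defaults. */
      describe("system_reset", INT_OPTS(.ri = 0, .kvm = 1));
      describe("data_access",  INT_OPTS(.ool = 1, .kvm = 1));
      return 0;
  }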

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190830011426.16810-1-mpe@ellerman.id.au
---
 arch/powerpc/kernel/exceptions-64s.S | 120 +++++++++++++--------------
 1 file changed, 60 insertions(+), 60 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index a476bb2f80f3..10037981ff2a 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -619,7 +619,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
  * - Fall through and continue executing in real, unrelocated mode.
  *   This is done if early=2.
  */
-.macro INT_HANDLER name, vec, ool, early, virt, hsrr, area, ri, dar, dsisr, bitmask, kvm
+.macro INT_HANDLER name, vec, ool=0, early=0, virt=0, hsrr=0, area=PACA_EXGEN, ri=1, dar=0, dsisr=0, bitmask=0, kvm=0
 	EXCEPTION_PROLOG_0 \area
 	.if \ool
 	.if !\virt
@@ -812,7 +812,7 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #endif
 
-	INT_HANDLER system_reset, 0x100, 0, 0, 0, EXC_STD, PACA_EXNMI, 0, 0, 0, 0, 1
+	INT_HANDLER system_reset, 0x100, area=PACA_EXNMI, ri=0, kvm=1
 	/*
 	 * MSR_RI is not enabled, because PACA_EXNMI and nmi stack is
 	 * being used, so a nested NMI exception would corrupt it.
@@ -842,7 +842,7 @@ TRAMP_REAL_BEGIN(system_reset_idle_wake)
  */
 TRAMP_REAL_BEGIN(system_reset_fwnmi)
 	/* See comment at system_reset exception, don't turn on RI */
-	INT_HANDLER system_reset, 0x100, 0, 0, 0, EXC_STD, PACA_EXNMI, 0, 0, 0, 0, 0
+	INT_HANDLER system_reset, 0x100, area=PACA_EXNMI, ri=0
 
 #endif /* CONFIG_PPC_PSERIES */
 
@@ -905,7 +905,7 @@ EXC_COMMON_BEGIN(system_reset_common)
 
 
 EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
-	INT_HANDLER machine_check, 0x200, 0, 1, 0, EXC_STD, PACA_EXMC, 0, 1, 1, 0, 0
+	INT_HANDLER machine_check, 0x200, early=1, area=PACA_EXMC, dar=1, dsisr=1
 	/*
 	 * MSR_RI is not enabled, because PACA_EXMC is being used, so a
 	 * nested machine check corrupts it. machine_check_common enables
@@ -917,7 +917,7 @@ EXC_VIRT_NONE(0x4200, 0x100)
 #ifdef CONFIG_PPC_PSERIES
 TRAMP_REAL_BEGIN(machine_check_fwnmi)
 	/* See comment at machine_check exception, don't turn on RI */
-	INT_HANDLER machine_check, 0x200, 0, 1, 0, EXC_STD, PACA_EXMC, 0, 1, 1, 0, 0
+	INT_HANDLER machine_check, 0x200, early=1, area=PACA_EXMC, dar=1, dsisr=1
 #endif
 
 TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
@@ -1068,7 +1068,7 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 	MACHINE_CHECK_HANDLER_WINDUP
 	/* See comment at machine_check exception, don't turn on RI */
-	INT_HANDLER machine_check, 0x200, 0, 0, 0, EXC_STD, PACA_EXMC, 0, 1, 1, 0, 1
+	INT_HANDLER machine_check, 0x200, area=PACA_EXMC, ri=0, dar=1, dsisr=1, kvm=1
 
 EXC_COMMON_BEGIN(machine_check_common)
 	/*
@@ -1154,10 +1154,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 
 
 EXC_REAL_BEGIN(data_access, 0x300, 0x80)
-	INT_HANDLER data_access, 0x300, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 1, 1, 0, 1
+	INT_HANDLER data_access, 0x300, ool=1, dar=1, dsisr=1, kvm=1
 EXC_REAL_END(data_access, 0x300, 0x80)
 EXC_VIRT_BEGIN(data_access, 0x4300, 0x80)
-	INT_HANDLER data_access, 0x300, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 1, 1, 0, 0
+	INT_HANDLER data_access, 0x300, virt=1, dar=1, dsisr=1
 EXC_VIRT_END(data_access, 0x4300, 0x80)
 
 TRAMP_KVM_SKIP(PACA_EXGEN, 0x300)
@@ -1185,10 +1185,10 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 
 
 EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80)
-	INT_HANDLER data_access_slb, 0x380, 1, 0, 0, 0, PACA_EXSLB, 1, 1, 0, 0, 1
+	INT_HANDLER data_access_slb, 0x380, ool=1, area=PACA_EXSLB, dar=1, kvm=1
 EXC_REAL_END(data_access_slb, 0x380, 0x80)
 EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80)
-	INT_HANDLER data_access_slb, 0x380, 0, 0, 1, 0, PACA_EXSLB, 1, 1, 0, 0, 0
+	INT_HANDLER data_access_slb, 0x380, virt=1, area=PACA_EXSLB, dar=1
 EXC_VIRT_END(data_access_slb, 0x4380, 0x80)
 
 TRAMP_KVM_SKIP(PACA_EXSLB, 0x380)
@@ -1220,10 +1220,10 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 
 
 EXC_REAL_BEGIN(instruction_access, 0x400, 0x80)
-	INT_HANDLER instruction_access, 0x400, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER instruction_access, 0x400, kvm=1
 EXC_REAL_END(instruction_access, 0x400, 0x80)
 EXC_VIRT_BEGIN(instruction_access, 0x4400, 0x80)
-	INT_HANDLER instruction_access, 0x400, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0
+	INT_HANDLER instruction_access, 0x400, virt=1
 EXC_VIRT_END(instruction_access, 0x4400, 0x80)
 
 TRAMP_KVM(PACA_EXGEN, 0x400)
@@ -1245,10 +1245,10 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 
 
 EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80)
-	INT_HANDLER instruction_access_slb, 0x480, 0, 0, 0, EXC_STD, PACA_EXSLB, 1, 0, 0, 0, 1
+	INT_HANDLER instruction_access_slb, 0x480, area=PACA_EXSLB, kvm=1
 EXC_REAL_END(instruction_access_slb, 0x480, 0x80)
 EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80)
-	INT_HANDLER instruction_access_slb, 0x480, 0, 0, 1, EXC_STD, PACA_EXSLB, 1, 0, 0, 0, 0
+	INT_HANDLER instruction_access_slb, 0x480, virt=1, area=PACA_EXSLB
 EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80)
 TRAMP_KVM(PACA_EXSLB, 0x480)
 
@@ -1277,10 +1277,10 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 	b	ret_from_except
 
 EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100)
-	INT_HANDLER hardware_interrupt, 0x500, 0, 0, 0, EXC_HV_OR_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1
+	INT_HANDLER hardware_interrupt, 0x500, hsrr=EXC_HV_OR_STD, bitmask=IRQS_DISABLED, kvm=1
 EXC_REAL_END(hardware_interrupt, 0x500, 0x100)
 EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100)
-	INT_HANDLER hardware_interrupt, 0x500, 0, 0, 1, EXC_HV_OR_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1
+	INT_HANDLER hardware_interrupt, 0x500, virt=1, hsrr=EXC_HV_OR_STD, bitmask=IRQS_DISABLED, kvm=1
 EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100)
 
 TRAMP_KVM(PACA_EXGEN, 0x500)
@@ -1289,10 +1289,10 @@ EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ)
 
 
 EXC_REAL_BEGIN(alignment, 0x600, 0x100)
-	INT_HANDLER alignment, 0x600, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 1, 1, 0, 1
+	INT_HANDLER alignment, 0x600, dar=1, dsisr=1, kvm=1
 EXC_REAL_END(alignment, 0x600, 0x100)
 EXC_VIRT_BEGIN(alignment, 0x4600, 0x100)
-	INT_HANDLER alignment, 0x600, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 1, 1, 0, 0
+	INT_HANDLER alignment, 0x600, virt=1, dar=1, dsisr=1
 EXC_VIRT_END(alignment, 0x4600, 0x100)
 
 TRAMP_KVM(PACA_EXGEN, 0x600)
@@ -1310,10 +1310,10 @@ EXC_COMMON_BEGIN(alignment_common)
 
 
 EXC_REAL_BEGIN(program_check, 0x700, 0x100)
-	INT_HANDLER program_check, 0x700, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER program_check, 0x700, kvm=1
 EXC_REAL_END(program_check, 0x700, 0x100)
 EXC_VIRT_BEGIN(program_check, 0x4700, 0x100)
-	INT_HANDLER program_check, 0x700, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0
+	INT_HANDLER program_check, 0x700, virt=1
 EXC_VIRT_END(program_check, 0x4700, 0x100)
 TRAMP_KVM(PACA_EXGEN, 0x700)
 EXC_COMMON_BEGIN(program_check_common)
@@ -1352,10 +1352,10 @@ EXC_COMMON_BEGIN(program_check_common)
 
 
 EXC_REAL_BEGIN(fp_unavailable, 0x800, 0x100)
-	INT_HANDLER fp_unavailable, 0x800, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER fp_unavailable, 0x800, kvm=1
 EXC_REAL_END(fp_unavailable, 0x800, 0x100)
 EXC_VIRT_BEGIN(fp_unavailable, 0x4800, 0x100)
-	INT_HANDLER fp_unavailable, 0x800, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0
+	INT_HANDLER fp_unavailable, 0x800, virt=1
 EXC_VIRT_END(fp_unavailable, 0x4800, 0x100)
 TRAMP_KVM(PACA_EXGEN, 0x800)
 EXC_COMMON_BEGIN(fp_unavailable_common)
@@ -1390,30 +1390,30 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
 
 
 EXC_REAL_BEGIN(decrementer, 0x900, 0x80)
-	INT_HANDLER decrementer, 0x900, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1
+	INT_HANDLER decrementer, 0x900, ool=1, bitmask=IRQS_DISABLED, kvm=1
 EXC_REAL_END(decrementer, 0x900, 0x80)
 EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80)
-	INT_HANDLER decrementer, 0x900, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 0
+	INT_HANDLER decrementer, 0x900, virt=1, bitmask=IRQS_DISABLED
 EXC_VIRT_END(decrementer, 0x4900, 0x80)
 TRAMP_KVM(PACA_EXGEN, 0x900)
 EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt)
 
 
 EXC_REAL_BEGIN(hdecrementer, 0x980, 0x80)
-	INT_HANDLER hdecrementer, 0x980, 0, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER hdecrementer, 0x980, hsrr=EXC_HV, kvm=1
 EXC_REAL_END(hdecrementer, 0x980, 0x80)
 EXC_VIRT_BEGIN(hdecrementer, 0x4980, 0x80)
-	INT_HANDLER hdecrementer, 0x980, 0, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER hdecrementer, 0x980, virt=1, hsrr=EXC_HV, kvm=1
 EXC_VIRT_END(hdecrementer, 0x4980, 0x80)
 TRAMP_KVM_HV(PACA_EXGEN, 0x980)
 EXC_COMMON(hdecrementer_common, 0x980, hdec_interrupt)
 
 
 EXC_REAL_BEGIN(doorbell_super, 0xa00, 0x100)
-	INT_HANDLER doorbell_super, 0xa00, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1
+	INT_HANDLER doorbell_super, 0xa00, bitmask=IRQS_DISABLED, kvm=1
 EXC_REAL_END(doorbell_super, 0xa00, 0x100)
 EXC_VIRT_BEGIN(doorbell_super, 0x4a00, 0x100)
-	INT_HANDLER doorbell_super, 0xa00, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 0
+	INT_HANDLER doorbell_super, 0xa00, virt=1, bitmask=IRQS_DISABLED
 EXC_VIRT_END(doorbell_super, 0x4a00, 0x100)
 TRAMP_KVM(PACA_EXGEN, 0xa00)
 #ifdef CONFIG_PPC_DOORBELL
@@ -1558,20 +1558,20 @@ TRAMP_KVM_BEGIN(do_kvm_0xc00)
 
 
 EXC_REAL_BEGIN(single_step, 0xd00, 0x100)
-	INT_HANDLER single_step, 0xd00, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER single_step, 0xd00, kvm=1
 EXC_REAL_END(single_step, 0xd00, 0x100)
 EXC_VIRT_BEGIN(single_step, 0x4d00, 0x100)
-	INT_HANDLER single_step, 0xd00, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0
+	INT_HANDLER single_step, 0xd00, virt=1
 EXC_VIRT_END(single_step, 0x4d00, 0x100)
 TRAMP_KVM(PACA_EXGEN, 0xd00)
 EXC_COMMON(single_step_common, 0xd00, single_step_exception)
 
 
 EXC_REAL_BEGIN(h_data_storage, 0xe00, 0x20)
-	INT_HANDLER h_data_storage, 0xe00, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER h_data_storage, 0xe00, ool=1, hsrr=EXC_HV, kvm=1
 EXC_REAL_END(h_data_storage, 0xe00, 0x20)
 EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20)
-	INT_HANDLER h_data_storage, 0xe00, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER h_data_storage, 0xe00, ool=1, virt=1, hsrr=EXC_HV, kvm=1
 EXC_VIRT_END(h_data_storage, 0x4e00, 0x20)
 TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00)
 EXC_COMMON_BEGIN(h_data_storage_common)
@@ -1597,20 +1597,20 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_TYPE_RADIX)
 
 
 EXC_REAL_BEGIN(h_instr_storage, 0xe20, 0x20)
-	INT_HANDLER h_instr_storage, 0xe20, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER h_instr_storage, 0xe20, ool=1, hsrr=EXC_HV, kvm=1
 EXC_REAL_END(h_instr_storage, 0xe20, 0x20)
 EXC_VIRT_BEGIN(h_instr_storage, 0x4e20, 0x20)
-	INT_HANDLER h_instr_storage, 0xe20, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER h_instr_storage, 0xe20, ool=1, virt=1, hsrr=EXC_HV, kvm=1
 EXC_VIRT_END(h_instr_storage, 0x4e20, 0x20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xe20)
 EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception)
 
 
 EXC_REAL_BEGIN(emulation_assist, 0xe40, 0x20)
-	INT_HANDLER emulation_assist, 0xe40, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER emulation_assist, 0xe40, ool=1, hsrr=EXC_HV, kvm=1
 EXC_REAL_END(emulation_assist, 0xe40, 0x20)
 EXC_VIRT_BEGIN(emulation_assist, 0x4e40, 0x20)
-	INT_HANDLER emulation_assist, 0xe40, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER emulation_assist, 0xe40, ool=1, virt=1, hsrr=EXC_HV, kvm=1
 EXC_VIRT_END(emulation_assist, 0x4e40, 0x20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xe40)
 EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt)
@@ -1622,7 +1622,7 @@ EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt)
  * mode.
  */
 EXC_REAL_BEGIN(hmi_exception, 0xe60, 0x20)
-	INT_HANDLER hmi_exception, 0xe60, 1, 1, 0, EXC_HV, PACA_EXGEN, 0, 0, 0, 0, 1
+	INT_HANDLER hmi_exception, 0xe60, ool=1, early=1, hsrr=EXC_HV, ri=0, kvm=1
 EXC_REAL_END(hmi_exception, 0xe60, 0x20)
 EXC_VIRT_NONE(0x4e60, 0x20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xe60)
@@ -1652,7 +1652,7 @@ EXC_COMMON_BEGIN(hmi_exception_early_common)
 	 * firmware.
 	 */
 	EXCEPTION_RESTORE_REGS EXC_HV
-	INT_HANDLER hmi_exception, 0xe60, 0, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1
+	INT_HANDLER hmi_exception, 0xe60, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1
 
 EXC_COMMON_BEGIN(hmi_exception_common)
 	EXCEPTION_COMMON(PACA_EXGEN, 0xe60)
@@ -1666,10 +1666,10 @@ EXC_COMMON_BEGIN(hmi_exception_common)
 
 
 EXC_REAL_BEGIN(h_doorbell, 0xe80, 0x20)
-	INT_HANDLER h_doorbell, 0xe80, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1
+	INT_HANDLER h_doorbell, 0xe80, ool=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1
 EXC_REAL_END(h_doorbell, 0xe80, 0x20)
 EXC_VIRT_BEGIN(h_doorbell, 0x4e80, 0x20)
-	INT_HANDLER h_doorbell, 0xe80, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1
+	INT_HANDLER h_doorbell, 0xe80, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1
 EXC_VIRT_END(h_doorbell, 0x4e80, 0x20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xe80)
 #ifdef CONFIG_PPC_DOORBELL
@@ -1680,10 +1680,10 @@ EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, unknown_exception)
 
 
 EXC_REAL_BEGIN(h_virt_irq, 0xea0, 0x20)
-	INT_HANDLER h_virt_irq, 0xea0, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1
+	INT_HANDLER h_virt_irq, 0xea0, ool=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1
 EXC_REAL_END(h_virt_irq, 0xea0, 0x20)
 EXC_VIRT_BEGIN(h_virt_irq, 0x4ea0, 0x20)
-	INT_HANDLER h_virt_irq, 0xea0, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1
+	INT_HANDLER h_virt_irq, 0xea0, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1
 EXC_VIRT_END(h_virt_irq, 0x4ea0, 0x20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xea0)
 EXC_COMMON_ASYNC(h_virt_irq_common, 0xea0, do_IRQ)
@@ -1696,20 +1696,20 @@ EXC_VIRT_NONE(0x4ee0, 0x20)
 
 
 EXC_REAL_BEGIN(performance_monitor, 0xf00, 0x20)
-	INT_HANDLER performance_monitor, 0xf00, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_PMI_DISABLED, 1
+	INT_HANDLER performance_monitor, 0xf00, ool=1, bitmask=IRQS_PMI_DISABLED, kvm=1
 EXC_REAL_END(performance_monitor, 0xf00, 0x20)
 EXC_VIRT_BEGIN(performance_monitor, 0x4f00, 0x20)
-	INT_HANDLER performance_monitor, 0xf00, 1, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_PMI_DISABLED, 0
+	INT_HANDLER performance_monitor, 0xf00, ool=1, virt=1, bitmask=IRQS_PMI_DISABLED
 EXC_VIRT_END(performance_monitor, 0x4f00, 0x20)
 TRAMP_KVM(PACA_EXGEN, 0xf00)
 EXC_COMMON_ASYNC(performance_monitor_common, 0xf00, performance_monitor_exception)
 
 
 EXC_REAL_BEGIN(altivec_unavailable, 0xf20, 0x20)
-	INT_HANDLER altivec_unavailable, 0xf20, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER altivec_unavailable, 0xf20, ool=1, kvm=1
 EXC_REAL_END(altivec_unavailable, 0xf20, 0x20)
 EXC_VIRT_BEGIN(altivec_unavailable, 0x4f20, 0x20)
-	INT_HANDLER altivec_unavailable, 0xf20, 1, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0
+	INT_HANDLER altivec_unavailable, 0xf20, ool=1, virt=1
 EXC_VIRT_END(altivec_unavailable, 0x4f20, 0x20)
 TRAMP_KVM(PACA_EXGEN, 0xf20)
 EXC_COMMON_BEGIN(altivec_unavailable_common)
@@ -1747,10 +1747,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 
 
 EXC_REAL_BEGIN(vsx_unavailable, 0xf40, 0x20)
-	INT_HANDLER vsx_unavailable, 0xf40, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER vsx_unavailable, 0xf40, ool=1, kvm=1
 EXC_REAL_END(vsx_unavailable, 0xf40, 0x20)
 EXC_VIRT_BEGIN(vsx_unavailable, 0x4f40, 0x20)
-	INT_HANDLER vsx_unavailable, 0xf40, 1, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0
+	INT_HANDLER vsx_unavailable, 0xf40, ool=1, virt=1
 EXC_VIRT_END(vsx_unavailable, 0x4f40, 0x20)
 TRAMP_KVM(PACA_EXGEN, 0xf40)
 EXC_COMMON_BEGIN(vsx_unavailable_common)
@@ -1787,20 +1787,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 
 
 EXC_REAL_BEGIN(facility_unavailable, 0xf60, 0x20)
-	INT_HANDLER facility_unavailable, 0xf60, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER facility_unavailable, 0xf60, ool=1, kvm=1
 EXC_REAL_END(facility_unavailable, 0xf60, 0x20)
 EXC_VIRT_BEGIN(facility_unavailable, 0x4f60, 0x20)
-	INT_HANDLER facility_unavailable, 0xf60, 1, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0
+	INT_HANDLER facility_unavailable, 0xf60, ool=1, virt=1
 EXC_VIRT_END(facility_unavailable, 0x4f60, 0x20)
 TRAMP_KVM(PACA_EXGEN, 0xf60)
 EXC_COMMON(facility_unavailable_common, 0xf60, facility_unavailable_exception)
 
 
 EXC_REAL_BEGIN(h_facility_unavailable, 0xf80, 0x20)
-	INT_HANDLER h_facility_unavailable, 0xf80, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER h_facility_unavailable, 0xf80, ool=1, hsrr=EXC_HV, kvm=1
 EXC_REAL_END(h_facility_unavailable, 0xf80, 0x20)
 EXC_VIRT_BEGIN(h_facility_unavailable, 0x4f80, 0x20)
-	INT_HANDLER h_facility_unavailable, 0xf80, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER h_facility_unavailable, 0xf80, ool=1, virt=1, hsrr=EXC_HV, kvm=1
 EXC_VIRT_END(h_facility_unavailable, 0x4f80, 0x20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xf80)
 EXC_COMMON(h_facility_unavailable_common, 0xf80, facility_unavailable_exception)
@@ -1820,7 +1820,7 @@ EXC_VIRT_NONE(0x5100, 0x100)
 
 #ifdef CONFIG_CBE_RAS
 EXC_REAL_BEGIN(cbe_system_error, 0x1200, 0x100)
-	INT_HANDLER cbe_system_error, 0x1200, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER cbe_system_error, 0x1200, ool=1, hsrr=EXC_HV, kvm=1
 EXC_REAL_END(cbe_system_error, 0x1200, 0x100)
 EXC_VIRT_NONE(0x5200, 0x100)
 TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1200)
@@ -1832,10 +1832,10 @@ EXC_VIRT_NONE(0x5200, 0x100)
 
 
 EXC_REAL_BEGIN(instruction_breakpoint, 0x1300, 0x100)
-	INT_HANDLER instruction_breakpoint, 0x1300, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER instruction_breakpoint, 0x1300, kvm=1
 EXC_REAL_END(instruction_breakpoint, 0x1300, 0x100)
 EXC_VIRT_BEGIN(instruction_breakpoint, 0x5300, 0x100)
-	INT_HANDLER instruction_breakpoint, 0x1300, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0
+	INT_HANDLER instruction_breakpoint, 0x1300, virt=1
 EXC_VIRT_END(instruction_breakpoint, 0x5300, 0x100)
 TRAMP_KVM_SKIP(PACA_EXGEN, 0x1300)
 EXC_COMMON(instruction_breakpoint_common, 0x1300, instruction_breakpoint_exception)
@@ -1845,7 +1845,7 @@ EXC_REAL_NONE(0x1400, 0x100)
 EXC_VIRT_NONE(0x5400, 0x100)
 
 EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x100)
-	INT_HANDLER denorm_exception_hv, 0x1500, 0, 2, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 0
+	INT_HANDLER denorm_exception_hv, 0x1500, early=2, hsrr=EXC_HV
 #ifdef CONFIG_PPC_DENORMALISATION
 	mfspr	r10,SPRN_HSRR1
 	andis.	r10,r10,(HSRR1_DENORM)@h /* denorm? */
@@ -1939,7 +1939,7 @@ EXC_COMMON(denorm_common, 0x1500, unknown_exception)
 
 #ifdef CONFIG_CBE_RAS
 EXC_REAL_BEGIN(cbe_maintenance, 0x1600, 0x100)
-	INT_HANDLER cbe_maintenance, 0x1600, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER cbe_maintenance, 0x1600, ool=1, hsrr=EXC_HV, kvm=1
 EXC_REAL_END(cbe_maintenance, 0x1600, 0x100)
 EXC_VIRT_NONE(0x5600, 0x100)
 TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1600)
@@ -1951,10 +1951,10 @@ EXC_VIRT_NONE(0x5600, 0x100)
 
 
 EXC_REAL_BEGIN(altivec_assist, 0x1700, 0x100)
-	INT_HANDLER altivec_assist, 0x1700, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER altivec_assist, 0x1700, kvm=1
 EXC_REAL_END(altivec_assist, 0x1700, 0x100)
 EXC_VIRT_BEGIN(altivec_assist, 0x5700, 0x100)
-	INT_HANDLER altivec_assist, 0x1700, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0
+	INT_HANDLER altivec_assist, 0x1700, virt=1
 EXC_VIRT_END(altivec_assist, 0x5700, 0x100)
 TRAMP_KVM(PACA_EXGEN, 0x1700)
 #ifdef CONFIG_ALTIVEC
@@ -1966,7 +1966,7 @@ EXC_COMMON(altivec_assist_common, 0x1700, unknown_exception)
 
 #ifdef CONFIG_CBE_RAS
 EXC_REAL_BEGIN(cbe_thermal, 0x1800, 0x100)
-	INT_HANDLER cbe_thermal, 0x1800, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1
+	INT_HANDLER cbe_thermal, 0x1800, ool=1, hsrr=EXC_HV, kvm=1
 EXC_REAL_END(cbe_thermal, 0x1800, 0x100)
 EXC_VIRT_NONE(0x5800, 0x100)
 TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1800)

From d29768e13cf6b50bd54a690e2ac52ab71465e2eb Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:48 +1000
Subject: [PATCH 168/264] powerpc/64s/exception: remove EXCEPTION_PROLOG_0/1,
 rename _2

EXCEPTION_PROLOG_0 and _1 each have only a single caller, so expand them
into it.

Rename EXCEPTION_PROLOG_2_REAL to INT_SAVE_SRR_AND_JUMP and
EXCEPTION_PROLOG_2_VIRT to INT_VIRT_SAVE_SRR_AND_JUMP, which are
more descriptive.
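
For example, the denorm call site (taken from the diff below) changes
from:

	EXCEPTION_PROLOG_2_REAL denorm_common, EXC_HV, 1

to:

	INT_SAVE_SRR_AND_JUMP denorm_common, EXC_HV, 1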

No generated code change except BUG line number constants.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-24-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 151 +++++++++++++--------------
 1 file changed, 73 insertions(+), 78 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 10037981ff2a..1ae2a8d59aa0 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -180,77 +180,7 @@ BEGIN_FTR_SECTION_NESTED(943)						\
 	std	ra,offset(r13);						\
 END_FTR_SECTION_NESTED(ftr,ftr,943)
 
-.macro EXCEPTION_PROLOG_0 area
-	SET_SCRATCH0(r13)			/* save r13 */
-	GET_PACA(r13)
-	std	r9,\area\()+EX_R9(r13)		/* save r9 */
-	OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR)
-	HMT_MEDIUM
-	std	r10,\area\()+EX_R10(r13)	/* save r10 - r12 */
-	OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR)
-.endm
-
-.macro EXCEPTION_PROLOG_1 hsrr, area, kvm, vec, dar, dsisr, bitmask
-	OPT_SAVE_REG_TO_PACA(\area\()+EX_PPR, r9, CPU_FTR_HAS_PPR)
-	OPT_SAVE_REG_TO_PACA(\area\()+EX_CFAR, r10, CPU_FTR_CFAR)
-	INTERRUPT_TO_KERNEL
-	SAVE_CTR(r10, \area\())
-	mfcr	r9
-	.if \kvm
-		KVMTEST \hsrr \vec
-	.endif
-	.if \bitmask
-		lbz	r10,PACAIRQSOFTMASK(r13)
-		andi.	r10,r10,\bitmask
-		/* Associate vector numbers with bits in paca->irq_happened */
-		.if \vec == 0x500 || \vec == 0xea0
-		li	r10,PACA_IRQ_EE
-		.elseif \vec == 0x900
-		li	r10,PACA_IRQ_DEC
-		.elseif \vec == 0xa00 || \vec == 0xe80
-		li	r10,PACA_IRQ_DBELL
-		.elseif \vec == 0xe60
-		li	r10,PACA_IRQ_HMI
-		.elseif \vec == 0xf00
-		li	r10,PACA_IRQ_PMI
-		.else
-		.abort "Bad maskable vector"
-		.endif
-
-		.if \hsrr == EXC_HV_OR_STD
-		BEGIN_FTR_SECTION
-		bne	masked_Hinterrupt
-		FTR_SECTION_ELSE
-		bne	masked_interrupt
-		ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
-		.elseif \hsrr
-		bne	masked_Hinterrupt
-		.else
-		bne	masked_interrupt
-		.endif
-	.endif
-
-	std	r11,\area\()+EX_R11(r13)
-	std	r12,\area\()+EX_R12(r13)
-
-	/*
-	 * DAR/DSISR, SCRATCH0 must be read before setting MSR[RI],
-	 * because a d-side MCE will clobber those registers so is
-	 * not recoverable if they are live.
-	 */
-	GET_SCRATCH0(r10)
-	std	r10,\area\()+EX_R13(r13)
-	.if \dar
-	mfspr	r10,SPRN_DAR
-	std	r10,\area\()+EX_DAR(r13)
-	.endif
-	.if \dsisr
-	mfspr	r10,SPRN_DSISR
-	stw	r10,\area\()+EX_DSISR(r13)
-	.endif
-.endm
-
-.macro EXCEPTION_PROLOG_2_REAL label, hsrr, set_ri
+.macro INT_SAVE_SRR_AND_JUMP label, hsrr, set_ri
 	ld	r10,PACAKMSR(r13)	/* get MSR value for kernel */
 	.if ! \set_ri
 	xori	r10,r10,MSR_RI		/* Clear MSR_RI */
@@ -293,7 +223,8 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	b	.	/* prevent speculative execution */
 .endm
 
-.macro EXCEPTION_PROLOG_2_VIRT label, hsrr
+/* INT_SAVE_SRR_AND_JUMP works for real or virt, this is faster but virt only */
+.macro INT_VIRT_SAVE_SRR_AND_JUMP label, hsrr
 #ifdef CONFIG_RELOCATABLE
 	.if \hsrr == EXC_HV_OR_STD
 	BEGIN_FTR_SECTION
@@ -620,7 +551,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
  *   This is done if early=2.
  */
 .macro INT_HANDLER name, vec, ool=0, early=0, virt=0, hsrr=0, area=PACA_EXGEN, ri=1, dar=0, dsisr=0, bitmask=0, kvm=0
-	EXCEPTION_PROLOG_0 \area
+	SET_SCRATCH0(r13)			/* save r13 */
+	GET_PACA(r13)
+	std	r9,\area\()+EX_R9(r13)		/* save r9 */
+	OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR)
+	HMT_MEDIUM
+	std	r10,\area\()+EX_R10(r13)	/* save r10 - r12 */
+	OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR)
 	.if \ool
 	.if !\virt
 	b	tramp_real_\name
@@ -632,16 +569,74 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 	TRAMP_VIRT_BEGIN(tramp_virt_\name)
 	.endif
 	.endif
-	EXCEPTION_PROLOG_1 \hsrr, \area, \kvm, \vec, \dar, \dsisr, \bitmask
+
+	OPT_SAVE_REG_TO_PACA(\area\()+EX_PPR, r9, CPU_FTR_HAS_PPR)
+	OPT_SAVE_REG_TO_PACA(\area\()+EX_CFAR, r10, CPU_FTR_CFAR)
+	INTERRUPT_TO_KERNEL
+	SAVE_CTR(r10, \area\())
+	mfcr	r9
+	.if \kvm
+		KVMTEST \hsrr \vec
+	.endif
+	.if \bitmask
+		lbz	r10,PACAIRQSOFTMASK(r13)
+		andi.	r10,r10,\bitmask
+		/* Associate vector numbers with bits in paca->irq_happened */
+		.if \vec == 0x500 || \vec == 0xea0
+		li	r10,PACA_IRQ_EE
+		.elseif \vec == 0x900
+		li	r10,PACA_IRQ_DEC
+		.elseif \vec == 0xa00 || \vec == 0xe80
+		li	r10,PACA_IRQ_DBELL
+		.elseif \vec == 0xe60
+		li	r10,PACA_IRQ_HMI
+		.elseif \vec == 0xf00
+		li	r10,PACA_IRQ_PMI
+		.else
+		.abort "Bad maskable vector"
+		.endif
+
+		.if \hsrr == EXC_HV_OR_STD
+		BEGIN_FTR_SECTION
+		bne	masked_Hinterrupt
+		FTR_SECTION_ELSE
+		bne	masked_interrupt
+		ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+		.elseif \hsrr
+		bne	masked_Hinterrupt
+		.else
+		bne	masked_interrupt
+		.endif
+	.endif
+
+	std	r11,\area\()+EX_R11(r13)
+	std	r12,\area\()+EX_R12(r13)
+
+	/*
+	 * DAR/DSISR, SCRATCH0 must be read before setting MSR[RI],
+	 * because a d-side MCE will clobber those registers so is
+	 * not recoverable if they are live.
+	 */
+	GET_SCRATCH0(r10)
+	std	r10,\area\()+EX_R13(r13)
+	.if \dar
+	mfspr	r10,SPRN_DAR
+	std	r10,\area\()+EX_DAR(r13)
+	.endif
+	.if \dsisr
+	mfspr	r10,SPRN_DSISR
+	stw	r10,\area\()+EX_DSISR(r13)
+	.endif
+
 	.if \early == 2
 	/* nothing more */
 	.elseif \early
 	mfctr	r10			/* save ctr, even for !RELOCATABLE */
 	BRANCH_TO_C000(r11, \name\()_early_common)
 	.elseif !\virt
-	EXCEPTION_PROLOG_2_REAL \name\()_common, \hsrr, \ri
+	INT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr, \ri
 	.else
-	EXCEPTION_PROLOG_2_VIRT \name\()_common, \hsrr
+	INT_VIRT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr
 	.endif
 	.if \ool
 	.popsection
@@ -1852,7 +1847,7 @@ EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x100)
 	bne+	denorm_assist
 #endif
 	KVMTEST EXC_HV 0x1500
-	EXCEPTION_PROLOG_2_REAL denorm_common, EXC_HV, 1
+	INT_SAVE_SRR_AND_JUMP denorm_common, EXC_HV, 1
 EXC_REAL_END(denorm_exception_hv, 0x1500, 0x100)
 
 #ifdef CONFIG_PPC_DENORMALISATION
@@ -1986,7 +1981,7 @@ EXC_VIRT_NONE(0x5800, 0x100)
 	std	r12,PACA_EXGEN+EX_R12(r13);		\
 	GET_SCRATCH0(r10);				\
 	std	r10,PACA_EXGEN+EX_R13(r13);		\
-	EXCEPTION_PROLOG_2_REAL soft_nmi_common, _H, 1
+	INT_SAVE_SRR_AND_JUMP soft_nmi_common, _H, 1
 
 /*
  * Branch to soft_nmi_interrupt using the emergency stack. The emergency

From 52b989231c6fad42dff57b69a53f38756db48e06 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:49 +1000
Subject: [PATCH 169/264] powerpc/64s/exception: Add the virt variant of the
 denorm interrupt handler

All other virt handlers have the prolog code in the virt vector rather
than branching to the real vector. Follow this pattern in the denorm
virt handler.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-25-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 1ae2a8d59aa0..d2aa63b6a8a8 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1852,7 +1852,11 @@ EXC_REAL_END(denorm_exception_hv, 0x1500, 0x100)
 
 #ifdef CONFIG_PPC_DENORMALISATION
 EXC_VIRT_BEGIN(denorm_exception, 0x5500, 0x100)
-	b	exc_real_0x1500_denorm_exception_hv
+	INT_HANDLER denorm_exception, 0x1500, 0, 2, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 0
+	mfspr	r10,SPRN_HSRR1
+	andis.	r10,r10,(HSRR1_DENORM)@h /* denorm? */
+	bne+	denorm_assist
+	INT_VIRT_SAVE_SRR_AND_JUMP denorm_common, EXC_HV
 EXC_VIRT_END(denorm_exception, 0x5500, 0x100)
 #else
 EXC_VIRT_NONE(0x5500, 0x100)

From 4515c5fa41936088a57efe0b64d1bb46a4943582 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:50 +1000
Subject: [PATCH 170/264] powerpc/64s/exception: INT_HANDLER support
 HDAR/HDSISR and use it in HDSI
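
INT_HANDLER's dar= and dsisr= options now honour hsrr=EXC_HV by reading
SPRN_HDAR/SPRN_HDSISR instead of SPRN_DAR/SPRN_DSISR. This lets the HDSI
handler drop the open-coded saves in h_data_storage_common; its call
sites (taken from the diff below) become:

	INT_HANDLER h_data_storage, 0xe00, ool=1, hsrr=EXC_HV, dar=1, dsisr=1, kvm=1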

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-26-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index d2aa63b6a8a8..476e4bbf1bf5 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -620,11 +620,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 	GET_SCRATCH0(r10)
 	std	r10,\area\()+EX_R13(r13)
 	.if \dar
+	.if \hsrr
+	mfspr	r10,SPRN_HDAR
+	.else
 	mfspr	r10,SPRN_DAR
+	.endif
 	std	r10,\area\()+EX_DAR(r13)
 	.endif
 	.if \dsisr
+	.if \hsrr
+	mfspr	r10,SPRN_HDSISR
+	.else
 	mfspr	r10,SPRN_DSISR
+	.endif
 	stw	r10,\area\()+EX_DSISR(r13)
 	.endif
 
@@ -1563,17 +1571,13 @@ EXC_COMMON(single_step_common, 0xd00, single_step_exception)
 
 
 EXC_REAL_BEGIN(h_data_storage, 0xe00, 0x20)
-	INT_HANDLER h_data_storage, 0xe00, ool=1, hsrr=EXC_HV, kvm=1
+	INT_HANDLER h_data_storage, 0xe00, ool=1, hsrr=EXC_HV, dar=1, dsisr=1, kvm=1
 EXC_REAL_END(h_data_storage, 0xe00, 0x20)
 EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20)
-	INT_HANDLER h_data_storage, 0xe00, ool=1, virt=1, hsrr=EXC_HV, kvm=1
+	INT_HANDLER h_data_storage, 0xe00, ool=1, virt=1, hsrr=EXC_HV, dar=1, dsisr=1, kvm=1
 EXC_VIRT_END(h_data_storage, 0x4e00, 0x20)
 TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00)
 EXC_COMMON_BEGIN(h_data_storage_common)
-	mfspr   r10,SPRN_HDAR
-	std     r10,PACA_EXGEN+EX_DAR(r13)
-	mfspr   r10,SPRN_HDSISR
-	stw     r10,PACA_EXGEN+EX_DSISR(r13)
 	EXCEPTION_COMMON(PACA_EXGEN, 0xe00)
 	bl      save_nvgprs
 	RECONCILE_IRQ_STATE(r10, r11)

From 141fed2669a93604bd5ce8b793d85f4798626ef5 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:51 +1000
Subject: [PATCH 171/264] powerpc/64s/exception: Add INT_KVM_HANDLER gas macro

Replace the four TRAMP_KVM cpp macro variants with a single gas macro.
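
For example, a skipping hypervisor trampoline (taken from the diff
below) changes from:

	TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00)

to:

	INT_KVM_HANDLER 0xe00, EXC_HV, PACA_EXGEN, 1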

No generated code change except BUG line number constants.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-27-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 99 +++++++++++-----------------
 1 file changed, 40 insertions(+), 59 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 476e4bbf1bf5..cff48d212011 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -651,22 +651,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 	.endif
 .endm
 
-
-#define TRAMP_KVM(area, n)						\
-	TRAMP_KVM_BEGIN(do_kvm_##n);					\
-	KVM_HANDLER area, EXC_STD, n, 0
-
-#define TRAMP_KVM_SKIP(area, n)						\
-	TRAMP_KVM_BEGIN(do_kvm_##n);					\
-	KVM_HANDLER area, EXC_STD, n, 1
-
-#define TRAMP_KVM_HV(area, n)						\
-	TRAMP_KVM_BEGIN(do_kvm_H##n);					\
-	KVM_HANDLER area, EXC_HV, n, 0
-
-#define TRAMP_KVM_HV_SKIP(area, n)					\
-	TRAMP_KVM_BEGIN(do_kvm_H##n);					\
-	KVM_HANDLER area, EXC_HV, n, 1
+.macro INT_KVM_HANDLER vec, hsrr, area, skip
+	.if \hsrr
+	TRAMP_KVM_BEGIN(do_kvm_H\vec\())
+	.else
+	TRAMP_KVM_BEGIN(do_kvm_\vec\())
+	.endif
+	KVM_HANDLER \area, \hsrr, \vec, \skip
+.endm
 
 #define EXC_COMMON(name, realvec, hdlr)					\
 	EXC_COMMON_BEGIN(name);						\
@@ -827,9 +819,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 	 * be dangerous anyway.
 	 */
 EXC_REAL_END(system_reset, 0x100, 0x100)
-
 EXC_VIRT_NONE(0x4100, 0x100)
-TRAMP_KVM(PACA_EXNMI, 0x100)
+INT_KVM_HANDLER 0x100, EXC_STD, PACA_EXNMI, 0
 
 #ifdef CONFIG_PPC_P7_NAP
 TRAMP_REAL_BEGIN(system_reset_idle_wake)
@@ -923,7 +914,7 @@ TRAMP_REAL_BEGIN(machine_check_fwnmi)
 	INT_HANDLER machine_check, 0x200, early=1, area=PACA_EXMC, dar=1, dsisr=1
 #endif
 
-TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
+INT_KVM_HANDLER 0x200, EXC_STD, PACA_EXMC, 1
 
 #define MACHINE_CHECK_HANDLER_WINDUP			\
 	/* Clear MSR_RI before setting SRR0 and SRR1. */\
@@ -1162,9 +1153,7 @@ EXC_REAL_END(data_access, 0x300, 0x80)
 EXC_VIRT_BEGIN(data_access, 0x4300, 0x80)
 	INT_HANDLER data_access, 0x300, virt=1, dar=1, dsisr=1
 EXC_VIRT_END(data_access, 0x4300, 0x80)
-
-TRAMP_KVM_SKIP(PACA_EXGEN, 0x300)
-
+INT_KVM_HANDLER 0x300, EXC_STD, PACA_EXGEN, 1
 EXC_COMMON_BEGIN(data_access_common)
 	/*
 	 * Here r13 points to the paca, r9 contains the saved CR,
@@ -1193,9 +1182,7 @@ EXC_REAL_END(data_access_slb, 0x380, 0x80)
 EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80)
 	INT_HANDLER data_access_slb, 0x380, virt=1, area=PACA_EXSLB, dar=1
 EXC_VIRT_END(data_access_slb, 0x4380, 0x80)
-
-TRAMP_KVM_SKIP(PACA_EXSLB, 0x380)
-
+INT_KVM_HANDLER 0x380, EXC_STD, PACA_EXSLB, 1
 EXC_COMMON_BEGIN(data_access_slb_common)
 	EXCEPTION_COMMON(PACA_EXSLB, 0x380)
 	ld	r4,PACA_EXSLB+EX_DAR(r13)
@@ -1228,9 +1215,7 @@ EXC_REAL_END(instruction_access, 0x400, 0x80)
 EXC_VIRT_BEGIN(instruction_access, 0x4400, 0x80)
 	INT_HANDLER instruction_access, 0x400, virt=1
 EXC_VIRT_END(instruction_access, 0x4400, 0x80)
-
-TRAMP_KVM(PACA_EXGEN, 0x400)
-
+INT_KVM_HANDLER 0x400, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(instruction_access_common)
 	EXCEPTION_COMMON(PACA_EXGEN, 0x400)
 	RECONCILE_IRQ_STATE(r10, r11)
@@ -1253,8 +1238,7 @@ EXC_REAL_END(instruction_access_slb, 0x480, 0x80)
 EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80)
 	INT_HANDLER instruction_access_slb, 0x480, virt=1, area=PACA_EXSLB
 EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80)
-TRAMP_KVM(PACA_EXSLB, 0x480)
-
+INT_KVM_HANDLER 0x480, EXC_STD, PACA_EXSLB, 0
 EXC_COMMON_BEGIN(instruction_access_slb_common)
 	EXCEPTION_COMMON(PACA_EXSLB, 0x480)
 	ld	r4,_NIP(r1)
@@ -1285,9 +1269,8 @@ EXC_REAL_END(hardware_interrupt, 0x500, 0x100)
 EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100)
 	INT_HANDLER hardware_interrupt, 0x500, virt=1, hsrr=EXC_HV_OR_STD, bitmask=IRQS_DISABLED, kvm=1
 EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100)
-
-TRAMP_KVM(PACA_EXGEN, 0x500)
-TRAMP_KVM_HV(PACA_EXGEN, 0x500)
+INT_KVM_HANDLER 0x500, EXC_STD, PACA_EXGEN, 0
+INT_KVM_HANDLER 0x500, EXC_HV, PACA_EXGEN, 0
 EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ)
 
 
@@ -1297,8 +1280,7 @@ EXC_REAL_END(alignment, 0x600, 0x100)
 EXC_VIRT_BEGIN(alignment, 0x4600, 0x100)
 	INT_HANDLER alignment, 0x600, virt=1, dar=1, dsisr=1
 EXC_VIRT_END(alignment, 0x4600, 0x100)
-
-TRAMP_KVM(PACA_EXGEN, 0x600)
+INT_KVM_HANDLER 0x600, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(alignment_common)
 	EXCEPTION_COMMON(PACA_EXGEN, 0x600)
 	ld	r3,PACA_EXGEN+EX_DAR(r13)
@@ -1318,7 +1300,7 @@ EXC_REAL_END(program_check, 0x700, 0x100)
 EXC_VIRT_BEGIN(program_check, 0x4700, 0x100)
 	INT_HANDLER program_check, 0x700, virt=1
 EXC_VIRT_END(program_check, 0x4700, 0x100)
-TRAMP_KVM(PACA_EXGEN, 0x700)
+INT_KVM_HANDLER 0x700, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(program_check_common)
 	/*
 	 * It's possible to receive a TM Bad Thing type program check with
@@ -1360,7 +1342,7 @@ EXC_REAL_END(fp_unavailable, 0x800, 0x100)
 EXC_VIRT_BEGIN(fp_unavailable, 0x4800, 0x100)
 	INT_HANDLER fp_unavailable, 0x800, virt=1
 EXC_VIRT_END(fp_unavailable, 0x4800, 0x100)
-TRAMP_KVM(PACA_EXGEN, 0x800)
+INT_KVM_HANDLER 0x800, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(fp_unavailable_common)
 	EXCEPTION_COMMON(PACA_EXGEN, 0x800)
 	bne	1f			/* if from user, just load it up */
@@ -1398,7 +1380,7 @@ EXC_REAL_END(decrementer, 0x900, 0x80)
 EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80)
 	INT_HANDLER decrementer, 0x900, virt=1, bitmask=IRQS_DISABLED
 EXC_VIRT_END(decrementer, 0x4900, 0x80)
-TRAMP_KVM(PACA_EXGEN, 0x900)
+INT_KVM_HANDLER 0x900, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt)
 
 
@@ -1408,7 +1390,7 @@ EXC_REAL_END(hdecrementer, 0x980, 0x80)
 EXC_VIRT_BEGIN(hdecrementer, 0x4980, 0x80)
 	INT_HANDLER hdecrementer, 0x980, virt=1, hsrr=EXC_HV, kvm=1
 EXC_VIRT_END(hdecrementer, 0x4980, 0x80)
-TRAMP_KVM_HV(PACA_EXGEN, 0x980)
+INT_KVM_HANDLER 0x980, EXC_HV, PACA_EXGEN, 0
 EXC_COMMON(hdecrementer_common, 0x980, hdec_interrupt)
 
 
@@ -1418,7 +1400,7 @@ EXC_REAL_END(doorbell_super, 0xa00, 0x100)
 EXC_VIRT_BEGIN(doorbell_super, 0x4a00, 0x100)
 	INT_HANDLER doorbell_super, 0xa00, virt=1, bitmask=IRQS_DISABLED
 EXC_VIRT_END(doorbell_super, 0x4a00, 0x100)
-TRAMP_KVM(PACA_EXGEN, 0xa00)
+INT_KVM_HANDLER 0xa00, EXC_STD, PACA_EXGEN, 0
 #ifdef CONFIG_PPC_DOORBELL
 EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, doorbell_exception)
 #else
@@ -1566,7 +1548,7 @@ EXC_REAL_END(single_step, 0xd00, 0x100)
 EXC_VIRT_BEGIN(single_step, 0x4d00, 0x100)
 	INT_HANDLER single_step, 0xd00, virt=1
 EXC_VIRT_END(single_step, 0x4d00, 0x100)
-TRAMP_KVM(PACA_EXGEN, 0xd00)
+INT_KVM_HANDLER 0xd00, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON(single_step_common, 0xd00, single_step_exception)
 
 
@@ -1576,7 +1558,7 @@ EXC_REAL_END(h_data_storage, 0xe00, 0x20)
 EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20)
 	INT_HANDLER h_data_storage, 0xe00, ool=1, virt=1, hsrr=EXC_HV, dar=1, dsisr=1, kvm=1
 EXC_VIRT_END(h_data_storage, 0x4e00, 0x20)
-TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00)
+INT_KVM_HANDLER 0xe00, EXC_HV, PACA_EXGEN, 1
 EXC_COMMON_BEGIN(h_data_storage_common)
 	EXCEPTION_COMMON(PACA_EXGEN, 0xe00)
 	bl      save_nvgprs
@@ -1601,7 +1583,7 @@ EXC_REAL_END(h_instr_storage, 0xe20, 0x20)
 EXC_VIRT_BEGIN(h_instr_storage, 0x4e20, 0x20)
 	INT_HANDLER h_instr_storage, 0xe20, ool=1, virt=1, hsrr=EXC_HV, kvm=1
 EXC_VIRT_END(h_instr_storage, 0x4e20, 0x20)
-TRAMP_KVM_HV(PACA_EXGEN, 0xe20)
+INT_KVM_HANDLER 0xe20, EXC_HV, PACA_EXGEN, 0
 EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception)
 
 
@@ -1611,7 +1593,7 @@ EXC_REAL_END(emulation_assist, 0xe40, 0x20)
 EXC_VIRT_BEGIN(emulation_assist, 0x4e40, 0x20)
 	INT_HANDLER emulation_assist, 0xe40, ool=1, virt=1, hsrr=EXC_HV, kvm=1
 EXC_VIRT_END(emulation_assist, 0x4e40, 0x20)
-TRAMP_KVM_HV(PACA_EXGEN, 0xe40)
+INT_KVM_HANDLER 0xe40, EXC_HV, PACA_EXGEN, 0
 EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt)
 
 
@@ -1624,8 +1606,7 @@ EXC_REAL_BEGIN(hmi_exception, 0xe60, 0x20)
 	INT_HANDLER hmi_exception, 0xe60, ool=1, early=1, hsrr=EXC_HV, ri=0, kvm=1
 EXC_REAL_END(hmi_exception, 0xe60, 0x20)
 EXC_VIRT_NONE(0x4e60, 0x20)
-TRAMP_KVM_HV(PACA_EXGEN, 0xe60)
-
+INT_KVM_HANDLER 0xe60, EXC_HV, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(hmi_exception_early_common)
 	mtctr	r10			/* Restore ctr */
 	mfspr	r11,SPRN_HSRR0		/* Save HSRR0 */
@@ -1670,7 +1651,7 @@ EXC_REAL_END(h_doorbell, 0xe80, 0x20)
 EXC_VIRT_BEGIN(h_doorbell, 0x4e80, 0x20)
 	INT_HANDLER h_doorbell, 0xe80, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1
 EXC_VIRT_END(h_doorbell, 0x4e80, 0x20)
-TRAMP_KVM_HV(PACA_EXGEN, 0xe80)
+INT_KVM_HANDLER 0xe80, EXC_HV, PACA_EXGEN, 0
 #ifdef CONFIG_PPC_DOORBELL
 EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, doorbell_exception)
 #else
@@ -1684,7 +1665,7 @@ EXC_REAL_END(h_virt_irq, 0xea0, 0x20)
 EXC_VIRT_BEGIN(h_virt_irq, 0x4ea0, 0x20)
 	INT_HANDLER h_virt_irq, 0xea0, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1
 EXC_VIRT_END(h_virt_irq, 0x4ea0, 0x20)
-TRAMP_KVM_HV(PACA_EXGEN, 0xea0)
+INT_KVM_HANDLER 0xea0, EXC_HV, PACA_EXGEN, 0
 EXC_COMMON_ASYNC(h_virt_irq_common, 0xea0, do_IRQ)
 
 
@@ -1700,7 +1681,7 @@ EXC_REAL_END(performance_monitor, 0xf00, 0x20)
 EXC_VIRT_BEGIN(performance_monitor, 0x4f00, 0x20)
 	INT_HANDLER performance_monitor, 0xf00, ool=1, virt=1, bitmask=IRQS_PMI_DISABLED
 EXC_VIRT_END(performance_monitor, 0x4f00, 0x20)
-TRAMP_KVM(PACA_EXGEN, 0xf00)
+INT_KVM_HANDLER 0xf00, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_ASYNC(performance_monitor_common, 0xf00, performance_monitor_exception)
 
 
@@ -1710,7 +1691,7 @@ EXC_REAL_END(altivec_unavailable, 0xf20, 0x20)
 EXC_VIRT_BEGIN(altivec_unavailable, 0x4f20, 0x20)
 	INT_HANDLER altivec_unavailable, 0xf20, ool=1, virt=1
 EXC_VIRT_END(altivec_unavailable, 0x4f20, 0x20)
-TRAMP_KVM(PACA_EXGEN, 0xf20)
+INT_KVM_HANDLER 0xf20, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(altivec_unavailable_common)
 	EXCEPTION_COMMON(PACA_EXGEN, 0xf20)
 #ifdef CONFIG_ALTIVEC
@@ -1751,7 +1732,7 @@ EXC_REAL_END(vsx_unavailable, 0xf40, 0x20)
 EXC_VIRT_BEGIN(vsx_unavailable, 0x4f40, 0x20)
 	INT_HANDLER vsx_unavailable, 0xf40, ool=1, virt=1
 EXC_VIRT_END(vsx_unavailable, 0x4f40, 0x20)
-TRAMP_KVM(PACA_EXGEN, 0xf40)
+INT_KVM_HANDLER 0xf40, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(vsx_unavailable_common)
 	EXCEPTION_COMMON(PACA_EXGEN, 0xf40)
 #ifdef CONFIG_VSX
@@ -1791,7 +1772,7 @@ EXC_REAL_END(facility_unavailable, 0xf60, 0x20)
 EXC_VIRT_BEGIN(facility_unavailable, 0x4f60, 0x20)
 	INT_HANDLER facility_unavailable, 0xf60, ool=1, virt=1
 EXC_VIRT_END(facility_unavailable, 0x4f60, 0x20)
-TRAMP_KVM(PACA_EXGEN, 0xf60)
+INT_KVM_HANDLER 0xf60, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON(facility_unavailable_common, 0xf60, facility_unavailable_exception)
 
 
@@ -1801,7 +1782,7 @@ EXC_REAL_END(h_facility_unavailable, 0xf80, 0x20)
 EXC_VIRT_BEGIN(h_facility_unavailable, 0x4f80, 0x20)
 	INT_HANDLER h_facility_unavailable, 0xf80, ool=1, virt=1, hsrr=EXC_HV, kvm=1
 EXC_VIRT_END(h_facility_unavailable, 0x4f80, 0x20)
-TRAMP_KVM_HV(PACA_EXGEN, 0xf80)
+INT_KVM_HANDLER 0xf80, EXC_HV, PACA_EXGEN, 0
 EXC_COMMON(h_facility_unavailable_common, 0xf80, facility_unavailable_exception)
 
 
@@ -1822,7 +1803,7 @@ EXC_REAL_BEGIN(cbe_system_error, 0x1200, 0x100)
 	INT_HANDLER cbe_system_error, 0x1200, ool=1, hsrr=EXC_HV, kvm=1
 EXC_REAL_END(cbe_system_error, 0x1200, 0x100)
 EXC_VIRT_NONE(0x5200, 0x100)
-TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1200)
+INT_KVM_HANDLER 0x1200, EXC_HV, PACA_EXGEN, 1
 EXC_COMMON(cbe_system_error_common, 0x1200, cbe_system_error_exception)
 #else /* CONFIG_CBE_RAS */
 EXC_REAL_NONE(0x1200, 0x100)
@@ -1836,7 +1817,7 @@ EXC_REAL_END(instruction_breakpoint, 0x1300, 0x100)
 EXC_VIRT_BEGIN(instruction_breakpoint, 0x5300, 0x100)
 	INT_HANDLER instruction_breakpoint, 0x1300, virt=1
 EXC_VIRT_END(instruction_breakpoint, 0x5300, 0x100)
-TRAMP_KVM_SKIP(PACA_EXGEN, 0x1300)
+INT_KVM_HANDLER 0x1300, EXC_STD, PACA_EXGEN, 1
 EXC_COMMON(instruction_breakpoint_common, 0x1300, instruction_breakpoint_exception)
 
 
@@ -1866,7 +1847,7 @@ EXC_VIRT_END(denorm_exception, 0x5500, 0x100)
 EXC_VIRT_NONE(0x5500, 0x100)
 #endif
 
-TRAMP_KVM_HV(PACA_EXGEN, 0x1500)
+INT_KVM_HANDLER 0x1500, EXC_HV, PACA_EXGEN, 0
 
 #ifdef CONFIG_PPC_DENORMALISATION
 TRAMP_REAL_BEGIN(denorm_assist)
@@ -1945,7 +1926,7 @@ EXC_REAL_BEGIN(cbe_maintenance, 0x1600, 0x100)
 	INT_HANDLER cbe_maintenance, 0x1600, ool=1, hsrr=EXC_HV, kvm=1
 EXC_REAL_END(cbe_maintenance, 0x1600, 0x100)
 EXC_VIRT_NONE(0x5600, 0x100)
-TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1600)
+INT_KVM_HANDLER 0x1600, EXC_HV, PACA_EXGEN, 1
 EXC_COMMON(cbe_maintenance_common, 0x1600, cbe_maintenance_exception)
 #else /* CONFIG_CBE_RAS */
 EXC_REAL_NONE(0x1600, 0x100)
@@ -1959,7 +1940,7 @@ EXC_REAL_END(altivec_assist, 0x1700, 0x100)
 EXC_VIRT_BEGIN(altivec_assist, 0x5700, 0x100)
 	INT_HANDLER altivec_assist, 0x1700, virt=1
 EXC_VIRT_END(altivec_assist, 0x5700, 0x100)
-TRAMP_KVM(PACA_EXGEN, 0x1700)
+INT_KVM_HANDLER 0x1700, EXC_STD, PACA_EXGEN, 0
 #ifdef CONFIG_ALTIVEC
 EXC_COMMON(altivec_assist_common, 0x1700, altivec_assist_exception)
 #else
@@ -1972,7 +1953,7 @@ EXC_REAL_BEGIN(cbe_thermal, 0x1800, 0x100)
 	INT_HANDLER cbe_thermal, 0x1800, ool=1, hsrr=EXC_HV, kvm=1
 EXC_REAL_END(cbe_thermal, 0x1800, 0x100)
 EXC_VIRT_NONE(0x5800, 0x100)
-TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1800)
+INT_KVM_HANDLER 0x1800, EXC_HV, PACA_EXGEN, 1
 EXC_COMMON(cbe_thermal_common, 0x1800, cbe_thermal_exception)
 #else /* CONFIG_CBE_RAS */
 EXC_REAL_NONE(0x1800, 0x100)

From 7027d53d1ab17d28b65913148585d5a331446b8b Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:52 +1000
Subject: [PATCH 172/264] powerpc/64s/exception: KVM_HANDLER reorder arguments
 to match other macros

Also rename the argument (n -> vec) to match the other macros.
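
The 0xc00 trampoline (taken from the diff below) shows the new order:

	KVM_HANDLER PACA_EXGEN, EXC_STD, 0xc00, 0

becomes:

	KVM_HANDLER 0xc00, EXC_STD, PACA_EXGEN, 0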

No generated code change.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-28-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index cff48d212011..d37420dee447 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -316,7 +316,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	.endif
 .endm
 
-.macro KVM_HANDLER area, hsrr, n, skip
+.macro KVM_HANDLER vec, hsrr, area, skip
 	.if \skip
 	cmpwi	r10,KVM_GUEST_MODE_SKIP
 	beq	89f
@@ -337,14 +337,14 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
 	/* HSRR variants have the 0x2 bit added to their trap number */
 	.if \hsrr == EXC_HV_OR_STD
 	BEGIN_FTR_SECTION
-	ori	r12,r12,(\n + 0x2)
+	ori	r12,r12,(\vec + 0x2)
 	FTR_SECTION_ELSE
-	ori	r12,r12,(\n)
+	ori	r12,r12,(\vec)
 	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 	.elseif \hsrr
-	ori	r12,r12,(\n + 0x2)
+	ori	r12,r12,(\vec + 0x2)
 	.else
-	ori	r12,r12,(\n)
+	ori	r12,r12,(\vec)
 	.endif
 
 #ifdef CONFIG_RELOCATABLE
@@ -386,7 +386,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
 #else
 .macro KVMTEST hsrr, n
 .endm
-.macro KVM_HANDLER area, hsrr, n, skip
+.macro KVM_HANDLER vec, hsrr, area, skip
 .endm
 #endif
 
@@ -657,7 +657,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 	.else
 	TRAMP_KVM_BEGIN(do_kvm_\vec\())
 	.endif
-	KVM_HANDLER \area, \hsrr, \vec, \skip
+	KVM_HANDLER \vec, \hsrr, \area, \skip
 .endm
 
 #define EXC_COMMON(name, realvec, hdlr)					\
@@ -1538,7 +1538,7 @@ TRAMP_KVM_BEGIN(do_kvm_0xc00)
 	SET_SCRATCH0(r10)
 	std	r9,PACA_EXGEN+EX_R9(r13)
 	mfcr	r9
-	KVM_HANDLER PACA_EXGEN, EXC_STD, 0xc00, 0
+	KVM_HANDLER 0xc00, EXC_STD, PACA_EXGEN, 0
 #endif
 
 

From 9a9c739aa83d031da7468028de8a65608146eccc Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:53 +1000
Subject: [PATCH 173/264] powerpc/64s/exception: Merge
 EXCEPTION_PROLOG_COMMON_2/3

Merge EXCEPTION_PROLOG_COMMON_3 into EXCEPTION_PROLOG_COMMON_2.
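
Callers that previously invoked the pair, such as the machine check
early path (taken from the diff below), change from:

	EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
	EXCEPTION_PROLOG_COMMON_3(0x200)

to a single call:

	EXCEPTION_PROLOG_COMMON_2(PACA_EXMC, 0x200)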

No generated code change except BUG line number constants.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-29-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index d37420dee447..0643ae57badc 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -399,7 +399,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
 	std	r10,GPR1(r1);		/* save r1 in stackframe	*/ \
 
 /* Save original regs values from save area to stack frame. */
-#define EXCEPTION_PROLOG_COMMON_2(area)					   \
+#define EXCEPTION_PROLOG_COMMON_2(area, trap)				   \
 	ld	r9,area+EX_R9(r13);	/* move r9, r10 to stackframe	*/ \
 	ld	r10,area+EX_R10(r13);					   \
 	std	r9,GPR9(r1);						   \
@@ -415,9 +415,7 @@ BEGIN_FTR_SECTION_NESTED(66);						   \
 	std	r10,ORIG_GPR3(r1);					   \
 END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);			   \
 	GET_CTR(r10, area);						   \
-	std	r10,_CTR(r1);
-
-#define EXCEPTION_PROLOG_COMMON_3(trap)					   \
+	std	r10,_CTR(r1);						   \
 	std	r2,GPR2(r1);		/* save r2 in stackframe	*/ \
 	SAVE_4GPRS(3, r1);		/* save r3 - r6 in stackframe   */ \
 	SAVE_2GPRS(7, r1);		/* save r7, r8 in stackframe	*/ \
@@ -453,8 +451,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);			   \
 	beq	4f;			/* if from kernel mode		*/ \
 	ACCOUNT_CPU_USER_ENTRY(r13, r9, r10);				   \
 	SAVE_PPR(area, r9);						   \
-4:	EXCEPTION_PROLOG_COMMON_2(area);				   \
-	EXCEPTION_PROLOG_COMMON_3(trap);				   \
+4:	EXCEPTION_PROLOG_COMMON_2(area, trap);				   \
 	ACCOUNT_STOLEN_TIME
 
 /*
@@ -464,8 +461,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);			   \
 #define EXCEPTION_COMMON_STACK(area, trap)			\
 	EXCEPTION_PROLOG_COMMON_1();				\
 	kuap_save_amr_and_lock r9, r10, cr1;			\
-	EXCEPTION_PROLOG_COMMON_2(area);			\
-	EXCEPTION_PROLOG_COMMON_3(trap)
+	EXCEPTION_PROLOG_COMMON_2(area, trap)
 
 /*
  * Restore all registers including H/SRR0/1 saved in a stack frame of a
@@ -968,8 +964,7 @@ EXC_COMMON_BEGIN(machine_check_early_common)
 
 	EXCEPTION_PROLOG_COMMON_1()
 	/* We don't touch AMR here, we never go to virtual mode */
-	EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
-	EXCEPTION_PROLOG_COMMON_3(0x200)
+	EXCEPTION_PROLOG_COMMON_2(PACA_EXMC, 0x200)
 
 	ld	r3,PACA_EXMC+EX_DAR(r13)
 	lwz	r4,PACA_EXMC+EX_DSISR(r13)
@@ -1616,8 +1611,7 @@ EXC_COMMON_BEGIN(hmi_exception_early_common)
 	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
 	EXCEPTION_PROLOG_COMMON_1()
 	/* We don't touch AMR here, we never go to virtual mode */
-	EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN)
-	EXCEPTION_PROLOG_COMMON_3(0xe60)
+	EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN, 0xe60)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	hmi_exception_realmode
 	cmpdi	cr0,r3,0

From bcbceed40a8c355b48678d90c5c407dfca811f0e Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:54 +1000
Subject: [PATCH 174/264] powerpc/64s/exception: Add INT_COMMON gas macro to
 generate common exception code

No generated code change except BUG line number constants.
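
INT_COMMON takes the vector and save area plus flags selecting stack
setup and KUAP save/lock, subsuming both EXCEPTION_COMMON and
EXCEPTION_COMMON_STACK at the call sites. For example, the system reset
path (taken from the diff below) changes from:

	EXCEPTION_COMMON_STACK(PACA_EXNMI, 0x100)

to:

	INT_COMMON 0x100, PACA_EXNMI, 0, 1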

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-30-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 52 +++++++++++++++++-----------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 0643ae57badc..7829a6ad99aa 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -463,6 +463,18 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);			   \
 	kuap_save_amr_and_lock r9, r10, cr1;			\
 	EXCEPTION_PROLOG_COMMON_2(area, trap)
 
+.macro INT_COMMON vec, area, stack, kaup
+	.if \stack
+	EXCEPTION_COMMON(\area, \vec)
+	.else
+	EXCEPTION_PROLOG_COMMON_1()
+	.if \kaup
+	kuap_save_amr_and_lock r9, r10, cr1
+	.endif
+	EXCEPTION_PROLOG_COMMON_2(\area, \vec)
+	.endif
+.endm
+
 /*
  * Restore all registers including H/SRR0/1 saved in a stack frame of a
  * standard exception.
@@ -658,7 +670,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 
 #define EXC_COMMON(name, realvec, hdlr)					\
 	EXC_COMMON_BEGIN(name);						\
-	EXCEPTION_COMMON(PACA_EXGEN, realvec);				\
+	INT_COMMON realvec, PACA_EXGEN, 1, 1 ;				\
 	bl	save_nvgprs;						\
 	RECONCILE_IRQ_STATE(r10, r11);					\
 	addi	r3,r1,STACK_FRAME_OVERHEAD;				\
@@ -671,7 +683,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
  */
 #define EXC_COMMON_ASYNC(name, realvec, hdlr)				\
 	EXC_COMMON_BEGIN(name);						\
-	EXCEPTION_COMMON(PACA_EXGEN, realvec);				\
+	INT_COMMON realvec, PACA_EXGEN, 1, 1 ;				\
 	FINISH_NAP;							\
 	RECONCILE_IRQ_STATE(r10, r11);					\
 	RUNLATCH_ON;							\
@@ -852,7 +864,7 @@ EXC_COMMON_BEGIN(system_reset_common)
 	mr	r10,r1
 	ld	r1,PACA_NMI_EMERG_SP(r13)
 	subi	r1,r1,INT_FRAME_SIZE
-	EXCEPTION_COMMON_STACK(PACA_EXNMI, 0x100)
+	INT_COMMON 0x100, PACA_EXNMI, 0, 1
 	bl	save_nvgprs
 	/*
 	 * Set IRQS_ALL_DISABLED unconditionally so arch_irqs_disabled does
@@ -962,9 +974,8 @@ EXC_COMMON_BEGIN(machine_check_early_common)
 	bgt	cr1,unrecoverable_mce	/* Check if we hit limit of 4 */
 	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame */
 
-	EXCEPTION_PROLOG_COMMON_1()
 	/* We don't touch AMR here, we never go to virtual mode */
-	EXCEPTION_PROLOG_COMMON_2(PACA_EXMC, 0x200)
+	INT_COMMON 0x200, PACA_EXMC, 0, 0
 
 	ld	r3,PACA_EXMC+EX_DAR(r13)
 	lwz	r4,PACA_EXMC+EX_DSISR(r13)
@@ -1064,7 +1075,7 @@ EXC_COMMON_BEGIN(machine_check_common)
 	 * Machine check is different because we use a different
 	 * save area: PACA_EXMC instead of PACA_EXGEN.
 	 */
-	EXCEPTION_COMMON(PACA_EXMC, 0x200)
+	INT_COMMON 0x200, PACA_EXMC, 1, 1
 	FINISH_NAP
 	RECONCILE_IRQ_STATE(r10, r11)
 	ld	r3,PACA_EXMC+EX_DAR(r13)
@@ -1156,7 +1167,7 @@ EXC_COMMON_BEGIN(data_access_common)
 	 * r9 - r13 are saved in paca->exgen.
 	 * EX_DAR and EX_DSISR have saved DAR/DSISR
 	 */
-	EXCEPTION_COMMON(PACA_EXGEN, 0x300)
+	INT_COMMON 0x300, PACA_EXGEN, 1, 1
 	RECONCILE_IRQ_STATE(r10, r11)
 	ld	r12,_MSR(r1)
 	ld	r3,PACA_EXGEN+EX_DAR(r13)
@@ -1179,7 +1190,7 @@ EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80)
 EXC_VIRT_END(data_access_slb, 0x4380, 0x80)
 INT_KVM_HANDLER 0x380, EXC_STD, PACA_EXSLB, 1
 EXC_COMMON_BEGIN(data_access_slb_common)
-	EXCEPTION_COMMON(PACA_EXSLB, 0x380)
+	INT_COMMON 0x380, PACA_EXSLB, 1, 1
 	ld	r4,PACA_EXSLB+EX_DAR(r13)
 	std	r4,_DAR(r1)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
@@ -1212,7 +1223,7 @@ EXC_VIRT_BEGIN(instruction_access, 0x4400, 0x80)
 EXC_VIRT_END(instruction_access, 0x4400, 0x80)
 INT_KVM_HANDLER 0x400, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(instruction_access_common)
-	EXCEPTION_COMMON(PACA_EXGEN, 0x400)
+	INT_COMMON 0x400, PACA_EXGEN, 1, 1
 	RECONCILE_IRQ_STATE(r10, r11)
 	ld	r12,_MSR(r1)
 	ld	r3,_NIP(r1)
@@ -1235,7 +1246,7 @@ EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80)
 EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80)
 INT_KVM_HANDLER 0x480, EXC_STD, PACA_EXSLB, 0
 EXC_COMMON_BEGIN(instruction_access_slb_common)
-	EXCEPTION_COMMON(PACA_EXSLB, 0x480)
+	INT_COMMON 0x480, PACA_EXSLB, 1, 1
 	ld	r4,_NIP(r1)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 BEGIN_MMU_FTR_SECTION
@@ -1277,7 +1288,7 @@ EXC_VIRT_BEGIN(alignment, 0x4600, 0x100)
 EXC_VIRT_END(alignment, 0x4600, 0x100)
 INT_KVM_HANDLER 0x600, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(alignment_common)
-	EXCEPTION_COMMON(PACA_EXGEN, 0x600)
+	INT_COMMON 0x600, PACA_EXGEN, 1, 1
 	ld	r3,PACA_EXGEN+EX_DAR(r13)
 	lwz	r4,PACA_EXGEN+EX_DSISR(r13)
 	std	r3,_DAR(r1)
@@ -1323,7 +1334,7 @@ EXC_COMMON_BEGIN(program_check_common)
 	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
 	b 3f				/* Jump into the macro !!	*/
 2:
-	EXCEPTION_COMMON(PACA_EXGEN, 0x700)
+	INT_COMMON 0x700, PACA_EXGEN, 1, 1
 	bl	save_nvgprs
 	RECONCILE_IRQ_STATE(r10, r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
@@ -1339,7 +1350,7 @@ EXC_VIRT_BEGIN(fp_unavailable, 0x4800, 0x100)
 EXC_VIRT_END(fp_unavailable, 0x4800, 0x100)
 INT_KVM_HANDLER 0x800, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(fp_unavailable_common)
-	EXCEPTION_COMMON(PACA_EXGEN, 0x800)
+	INT_COMMON 0x800, PACA_EXGEN, 1, 1
 	bne	1f			/* if from user, just load it up */
 	bl	save_nvgprs
 	RECONCILE_IRQ_STATE(r10, r11)
@@ -1555,7 +1566,7 @@ EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20)
 EXC_VIRT_END(h_data_storage, 0x4e00, 0x20)
 INT_KVM_HANDLER 0xe00, EXC_HV, PACA_EXGEN, 1
 EXC_COMMON_BEGIN(h_data_storage_common)
-	EXCEPTION_COMMON(PACA_EXGEN, 0xe00)
+	INT_COMMON 0xe00, PACA_EXGEN, 1, 1
 	bl      save_nvgprs
 	RECONCILE_IRQ_STATE(r10, r11)
 	addi    r3,r1,STACK_FRAME_OVERHEAD
@@ -1609,9 +1620,10 @@ EXC_COMMON_BEGIN(hmi_exception_early_common)
 	mr	r10,r1			/* Save r1 */
 	ld	r1,PACAEMERGSP(r13)	/* Use emergency stack for realmode */
 	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
-	EXCEPTION_PROLOG_COMMON_1()
+
 	/* We don't touch AMR here, we never go to virtual mode */
-	EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN, 0xe60)
+	INT_COMMON 0xe60, PACA_EXGEN, 0, 0
+
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	hmi_exception_realmode
 	cmpdi	cr0,r3,0
@@ -1629,7 +1641,7 @@ EXC_COMMON_BEGIN(hmi_exception_early_common)
 	INT_HANDLER hmi_exception, 0xe60, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1
 
 EXC_COMMON_BEGIN(hmi_exception_common)
-	EXCEPTION_COMMON(PACA_EXGEN, 0xe60)
+	INT_COMMON 0xe60, PACA_EXGEN, 1, 1
 	FINISH_NAP
 	bl	save_nvgprs
 	RECONCILE_IRQ_STATE(r10, r11)
@@ -1687,7 +1699,7 @@ EXC_VIRT_BEGIN(altivec_unavailable, 0x4f20, 0x20)
 EXC_VIRT_END(altivec_unavailable, 0x4f20, 0x20)
 INT_KVM_HANDLER 0xf20, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(altivec_unavailable_common)
-	EXCEPTION_COMMON(PACA_EXGEN, 0xf20)
+	INT_COMMON 0xf20, PACA_EXGEN, 1, 1
 #ifdef CONFIG_ALTIVEC
 BEGIN_FTR_SECTION
 	beq	1f
@@ -1728,7 +1740,7 @@ EXC_VIRT_BEGIN(vsx_unavailable, 0x4f40, 0x20)
 EXC_VIRT_END(vsx_unavailable, 0x4f40, 0x20)
 INT_KVM_HANDLER 0xf40, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(vsx_unavailable_common)
-	EXCEPTION_COMMON(PACA_EXGEN, 0xf40)
+	INT_COMMON 0xf40, PACA_EXGEN, 1, 1
 #ifdef CONFIG_VSX
 BEGIN_FTR_SECTION
 	beq	1f
@@ -1979,7 +1991,7 @@ EXC_COMMON_BEGIN(soft_nmi_common)
 	mr	r10,r1
 	ld	r1,PACAEMERGSP(r13)
 	subi	r1,r1,INT_FRAME_SIZE
-	EXCEPTION_COMMON_STACK(PACA_EXGEN, 0x900)
+	INT_COMMON 0x900, PACA_EXGEN, 0, 1
 	bl	save_nvgprs
 	RECONCILE_IRQ_STATE(r10, r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD

From 5d5e0edfd5fa2a60d0f9b8d6ece1a5ce51aae3b5 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:55 +1000
Subject: [PATCH 175/264] powerpc/64s/exception: Expand EXCEPTION_COMMON macro
 into caller

No generated code change except BUG line number constants.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-31-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 54 ++++++++++++++--------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 7829a6ad99aa..492786604b10 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -437,41 +437,41 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);			   \
  * On entry r13 points to the paca, r9-r13 are saved in the paca,
  * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and
  * SRR1, and relocation is on.
+ *
+ * If stack=0, then the stack is already set in r1, and r1 is saved in r10.
+ * PPR save and CPU accounting is not done for the !stack case (XXX why not?)
  */
-#define EXCEPTION_COMMON(area, trap)					   \
-	andi.	r10,r12,MSR_PR;		/* See if coming from user	*/ \
-	mr	r10,r1;			/* Save r1			*/ \
-	subi	r1,r1,INT_FRAME_SIZE;	/* alloc frame on kernel stack	*/ \
-	beq-	1f;							   \
-	ld	r1,PACAKSAVE(r13);	/* kernel stack to use		*/ \
-1:	tdgei	r1,-INT_FRAME_SIZE;	/* trap if r1 is in userspace	*/ \
-	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0;				   \
-3:	EXCEPTION_PROLOG_COMMON_1();					   \
-	kuap_save_amr_and_lock r9, r10, cr1, cr0;			   \
-	beq	4f;			/* if from kernel mode		*/ \
-	ACCOUNT_CPU_USER_ENTRY(r13, r9, r10);				   \
-	SAVE_PPR(area, r9);						   \
-4:	EXCEPTION_PROLOG_COMMON_2(area, trap);				   \
-	ACCOUNT_STOLEN_TIME
-
-/*
- * Exception where stack is already set in r1, r1 is saved in r10.
- * PPR save and CPU accounting is not done (for some reason).
- */
-#define EXCEPTION_COMMON_STACK(area, trap)			\
-	EXCEPTION_PROLOG_COMMON_1();				\
-	kuap_save_amr_and_lock r9, r10, cr1;			\
-	EXCEPTION_PROLOG_COMMON_2(area, trap)
-
 .macro INT_COMMON vec, area, stack, kaup
 	.if \stack
-	EXCEPTION_COMMON(\area, \vec)
-	.else
+	andi.	r10,r12,MSR_PR		/* See if coming from user	*/
+	mr	r10,r1			/* Save r1			*/
+	subi	r1,r1,INT_FRAME_SIZE	/* alloc frame on kernel stack	*/
+	beq-	1f
+	ld	r1,PACAKSAVE(r13)	/* kernel stack to use		*/
+1:	tdgei	r1,-INT_FRAME_SIZE	/* trap if r1 is in userspace	*/
+	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0
+3:
+	.endif
 	EXCEPTION_PROLOG_COMMON_1()
+
+	.if \stack
+	.if \kaup
+	kuap_save_amr_and_lock r9, r10, cr1, cr0
+	.endif
+	beq	4f			/* if from kernel mode		*/
+	ACCOUNT_CPU_USER_ENTRY(r13, r9, r10)
+	SAVE_PPR(\area, r9)
+4:
+	.else
 	.if \kaup
 	kuap_save_amr_and_lock r9, r10, cr1
 	.endif
+	.endif
+
 	EXCEPTION_PROLOG_COMMON_2(\area, \vec)
+
+	.if \stack
+	ACCOUNT_STOLEN_TIME
 	.endif
 .endm
 

From 8c9fb5d4f3ddf02fb0fa3dec2dffd6b007ac894c Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:56 +1000
Subject: [PATCH 176/264] powerpc/64s/exception: Expand
 EXCEPTION_PROLOG_COMMON_1 and 2 into caller

No generated code change except BUG line number constants.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-32-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 85 +++++++++++++---------------
 1 file changed, 40 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 492786604b10..7a4f215c4c5b 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -390,49 +390,6 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
 .endm
 #endif
 
-#define EXCEPTION_PROLOG_COMMON_1()					   \
-	std	r9,_CCR(r1);		/* save CR in stackframe	*/ \
-	std	r11,_NIP(r1);		/* save SRR0 in stackframe	*/ \
-	std	r12,_MSR(r1);		/* save SRR1 in stackframe	*/ \
-	std	r10,0(r1);		/* make stack chain pointer	*/ \
-	std	r0,GPR0(r1);		/* save r0 in stackframe	*/ \
-	std	r10,GPR1(r1);		/* save r1 in stackframe	*/ \
-
-/* Save original regs values from save area to stack frame. */
-#define EXCEPTION_PROLOG_COMMON_2(area, trap)				   \
-	ld	r9,area+EX_R9(r13);	/* move r9, r10 to stackframe	*/ \
-	ld	r10,area+EX_R10(r13);					   \
-	std	r9,GPR9(r1);						   \
-	std	r10,GPR10(r1);						   \
-	ld	r9,area+EX_R11(r13);	/* move r11 - r13 to stackframe	*/ \
-	ld	r10,area+EX_R12(r13);					   \
-	ld	r11,area+EX_R13(r13);					   \
-	std	r9,GPR11(r1);						   \
-	std	r10,GPR12(r1);						   \
-	std	r11,GPR13(r1);						   \
-BEGIN_FTR_SECTION_NESTED(66);						   \
-	ld	r10,area+EX_CFAR(r13);					   \
-	std	r10,ORIG_GPR3(r1);					   \
-END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);			   \
-	GET_CTR(r10, area);						   \
-	std	r10,_CTR(r1);						   \
-	std	r2,GPR2(r1);		/* save r2 in stackframe	*/ \
-	SAVE_4GPRS(3, r1);		/* save r3 - r6 in stackframe   */ \
-	SAVE_2GPRS(7, r1);		/* save r7, r8 in stackframe	*/ \
-	mflr	r9;			/* Get LR, later save to stack	*/ \
-	ld	r2,PACATOC(r13);	/* get kernel TOC into r2	*/ \
-	std	r9,_LINK(r1);						   \
-	lbz	r10,PACAIRQSOFTMASK(r13);				   \
-	mfspr	r11,SPRN_XER;		/* save XER in stackframe	*/ \
-	std	r10,SOFTE(r1);						   \
-	std	r11,_XER(r1);						   \
-	li	r9,(trap)+1;						   \
-	std	r9,_TRAP(r1);		/* set trap number		*/ \
-	li	r10,0;							   \
-	ld	r11,exception_marker@toc(r2);				   \
-	std	r10,RESULT(r1);		/* clear regs->result		*/ \
-	std	r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame	*/
-
 /*
  * On entry r13 points to the paca, r9-r13 are saved in the paca,
  * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and
@@ -452,7 +409,13 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);			   \
 	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0
 3:
 	.endif
-	EXCEPTION_PROLOG_COMMON_1()
+
+	std	r9,_CCR(r1)		/* save CR in stackframe	*/
+	std	r11,_NIP(r1)		/* save SRR0 in stackframe	*/
+	std	r12,_MSR(r1)		/* save SRR1 in stackframe	*/
+	std	r10,0(r1)		/* make stack chain pointer	*/
+	std	r0,GPR0(r1)		/* save r0 in stackframe	*/
+	std	r10,GPR1(r1)		/* save r1 in stackframe	*/
 
 	.if \stack
 	.if \kaup
@@ -468,7 +431,39 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);			   \
 	.endif
 	.endif
 
-	EXCEPTION_PROLOG_COMMON_2(\area, \vec)
+	/* Save original regs values from save area to stack frame. */
+	ld	r9,\area+EX_R9(r13)	/* move r9, r10 to stackframe	*/
+	ld	r10,\area+EX_R10(r13)
+	std	r9,GPR9(r1)
+	std	r10,GPR10(r1)
+	ld	r9,\area+EX_R11(r13)	/* move r11 - r13 to stackframe	*/
+	ld	r10,\area+EX_R12(r13)
+	ld	r11,\area+EX_R13(r13)
+	std	r9,GPR11(r1)
+	std	r10,GPR12(r1)
+	std	r11,GPR13(r1)
+BEGIN_FTR_SECTION_NESTED(66)
+	ld	r10,\area+EX_CFAR(r13)
+	std	r10,ORIG_GPR3(r1)
+END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66)
+	GET_CTR(r10, \area)
+	std	r10,_CTR(r1)
+	std	r2,GPR2(r1)		/* save r2 in stackframe	*/
+	SAVE_4GPRS(3, r1)		/* save r3 - r6 in stackframe   */
+	SAVE_2GPRS(7, r1)		/* save r7, r8 in stackframe	*/
+	mflr	r9			/* Get LR, later save to stack	*/
+	ld	r2,PACATOC(r13)		/* get kernel TOC into r2	*/
+	std	r9,_LINK(r1)
+	lbz	r10,PACAIRQSOFTMASK(r13)
+	mfspr	r11,SPRN_XER		/* save XER in stackframe	*/
+	std	r10,SOFTE(r1)
+	std	r11,_XER(r1)
+	li	r9,(\vec)+1
+	std	r9,_TRAP(r1)		/* set trap number		*/
+	li	r10,0
+	ld	r11,exception_marker@toc(r2)
+	std	r10,RESULT(r1)		/* clear regs->result		*/
+	std	r11,STACK_FRAME_OVERHEAD-16(r1) /* mark the frame	*/
 
 	.if \stack
 	ACCOUNT_STOLEN_TIME

From d1a84718888e768296557b83f4c56cb1caef8fdd Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:57 +1000
Subject: [PATCH 177/264] powerpc/64s/exception: INT_COMMON add DAR, DSISR,
 reconcile options

Move the saving of DAR and DSISR to pt_regs out of the individual
handlers and into INT_COMMON. Also add an option to expand
RECONCILE_IRQ_STATE.
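
With the new options, a handler such as alignment (taken from the diff
below) collapses from:

	INT_COMMON 0x600, PACA_EXGEN, 1, 1
	ld	r3,PACA_EXGEN+EX_DAR(r13)
	lwz	r4,PACA_EXGEN+EX_DSISR(r13)
	std	r3,_DAR(r1)
	std	r4,_DSISR(r1)
	bl	save_nvgprs
	RECONCILE_IRQ_STATE(r10, r11)

to:

	INT_COMMON 0x600, PACA_EXGEN, 1, 1, 1, 1, 1
	bl	save_nvgprs

where the three new trailing arguments are reconcile, dar and dsisr.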

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-33-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 111 ++++++++++++---------------
 1 file changed, 51 insertions(+), 60 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 7a4f215c4c5b..c2235876e397 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -398,7 +398,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
  * If stack=0, then the stack is already set in r1, and r1 is saved in r10.
  * PPR save and CPU accounting is not done for the !stack case (XXX why not?)
  */
-.macro INT_COMMON vec, area, stack, kaup
+.macro INT_COMMON vec, area, stack, kaup, reconcile, dar, dsisr
 	.if \stack
 	andi.	r10,r12,MSR_PR		/* See if coming from user	*/
 	mr	r10,r1			/* Save r1			*/
@@ -442,6 +442,24 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
 	std	r9,GPR11(r1)
 	std	r10,GPR12(r1)
 	std	r11,GPR13(r1)
+	.if \dar
+	.if \dar == 2
+	ld	r10,_NIP(r1)
+	.else
+	ld	r10,\area+EX_DAR(r13)
+	.endif
+	std	r10,_DAR(r1)
+	.endif
+	.if \dsisr
+	.if \dsisr == 2
+	ld	r10,_MSR(r1)
+	lis	r11,DSISR_SRR1_MATCH_64S@h
+	and	r10,r10,r11
+	.else
+	lwz	r10,\area+EX_DSISR(r13)
+	.endif
+	std	r10,_DSISR(r1)
+	.endif
 BEGIN_FTR_SECTION_NESTED(66)
 	ld	r10,\area+EX_CFAR(r13)
 	std	r10,ORIG_GPR3(r1)
@@ -468,6 +486,10 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66)
 	.if \stack
 	ACCOUNT_STOLEN_TIME
 	.endif
+
+	.if \reconcile
+	RECONCILE_IRQ_STATE(r10, r11)
+	.endif
 .endm
 
 /*
@@ -665,9 +687,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 
 #define EXC_COMMON(name, realvec, hdlr)					\
 	EXC_COMMON_BEGIN(name);						\
-	INT_COMMON realvec, PACA_EXGEN, 1, 1 ;				\
+	INT_COMMON realvec, PACA_EXGEN, 1, 1, 1, 0, 0 ;			\
 	bl	save_nvgprs;						\
-	RECONCILE_IRQ_STATE(r10, r11);					\
 	addi	r3,r1,STACK_FRAME_OVERHEAD;				\
 	bl	hdlr;							\
 	b	ret_from_except
@@ -678,9 +699,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
  */
 #define EXC_COMMON_ASYNC(name, realvec, hdlr)				\
 	EXC_COMMON_BEGIN(name);						\
-	INT_COMMON realvec, PACA_EXGEN, 1, 1 ;				\
+	INT_COMMON realvec, PACA_EXGEN, 1, 1, 1, 0, 0 ;			\
 	FINISH_NAP;							\
-	RECONCILE_IRQ_STATE(r10, r11);					\
 	RUNLATCH_ON;							\
 	addi	r3,r1,STACK_FRAME_OVERHEAD;				\
 	bl	hdlr;							\
@@ -859,7 +879,7 @@ EXC_COMMON_BEGIN(system_reset_common)
 	mr	r10,r1
 	ld	r1,PACA_NMI_EMERG_SP(r13)
 	subi	r1,r1,INT_FRAME_SIZE
-	INT_COMMON 0x100, PACA_EXNMI, 0, 1
+	INT_COMMON 0x100, PACA_EXNMI, 0, 1, 0, 0, 0
 	bl	save_nvgprs
 	/*
 	 * Set IRQS_ALL_DISABLED unconditionally so arch_irqs_disabled does
@@ -970,12 +990,7 @@ EXC_COMMON_BEGIN(machine_check_early_common)
 	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame */
 
 	/* We don't touch AMR here, we never go to virtual mode */
-	INT_COMMON 0x200, PACA_EXMC, 0, 0
-
-	ld	r3,PACA_EXMC+EX_DAR(r13)
-	lwz	r4,PACA_EXMC+EX_DSISR(r13)
-	std	r3,_DAR(r1)
-	std	r4,_DSISR(r1)
+	INT_COMMON 0x200, PACA_EXMC, 0, 0, 0, 1, 1
 
 BEGIN_FTR_SECTION
 	bl	enable_machine_check
@@ -1070,16 +1085,11 @@ EXC_COMMON_BEGIN(machine_check_common)
 	 * Machine check is different because we use a different
 	 * save area: PACA_EXMC instead of PACA_EXGEN.
 	 */
-	INT_COMMON 0x200, PACA_EXMC, 1, 1
+	INT_COMMON 0x200, PACA_EXMC, 1, 1, 1, 1, 1
 	FINISH_NAP
-	RECONCILE_IRQ_STATE(r10, r11)
-	ld	r3,PACA_EXMC+EX_DAR(r13)
-	lwz	r4,PACA_EXMC+EX_DSISR(r13)
 	/* Enable MSR_RI when finished with PACA_EXMC */
 	li	r10,MSR_RI
 	mtmsrd 	r10,1
-	std	r3,_DAR(r1)
-	std	r4,_DSISR(r1)
 	bl	save_nvgprs
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	machine_check_exception
@@ -1162,14 +1172,11 @@ EXC_COMMON_BEGIN(data_access_common)
 	 * r9 - r13 are saved in paca->exgen.
 	 * EX_DAR and EX_DSISR have saved DAR/DSISR
 	 */
-	INT_COMMON 0x300, PACA_EXGEN, 1, 1
-	RECONCILE_IRQ_STATE(r10, r11)
+	INT_COMMON 0x300, PACA_EXGEN, 1, 1, 1, 1, 1
 	ld	r12,_MSR(r1)
-	ld	r3,PACA_EXGEN+EX_DAR(r13)
-	lwz	r4,PACA_EXGEN+EX_DSISR(r13)
+	ld	r3,_DAR(r1)
+	ld	r4,_DSISR(r1)
 	li	r5,0x300
-	std	r3,_DAR(r1)
-	std	r4,_DSISR(r1)
 BEGIN_MMU_FTR_SECTION
 	b	do_hash_page		/* Try to handle as hpte fault */
 MMU_FTR_SECTION_ELSE
@@ -1185,9 +1192,8 @@ EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80)
 EXC_VIRT_END(data_access_slb, 0x4380, 0x80)
 INT_KVM_HANDLER 0x380, EXC_STD, PACA_EXSLB, 1
 EXC_COMMON_BEGIN(data_access_slb_common)
-	INT_COMMON 0x380, PACA_EXSLB, 1, 1
-	ld	r4,PACA_EXSLB+EX_DAR(r13)
-	std	r4,_DAR(r1)
+	INT_COMMON 0x380, PACA_EXSLB, 1, 1, 0, 1, 0
+	ld	r4,_DAR(r1)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 BEGIN_MMU_FTR_SECTION
 	/* HPT case, do SLB fault */
@@ -1218,14 +1224,11 @@ EXC_VIRT_BEGIN(instruction_access, 0x4400, 0x80)
 EXC_VIRT_END(instruction_access, 0x4400, 0x80)
 INT_KVM_HANDLER 0x400, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(instruction_access_common)
-	INT_COMMON 0x400, PACA_EXGEN, 1, 1
-	RECONCILE_IRQ_STATE(r10, r11)
-	ld	r12,_MSR(r1)
-	ld	r3,_NIP(r1)
-	andis.	r4,r12,DSISR_SRR1_MATCH_64S@h
+	INT_COMMON 0x400, PACA_EXGEN, 1, 1, 1, 2, 2
+	ld      r12,_MSR(r1)
+	ld	r3,_DAR(r1)
+	ld	r4,_DSISR(r1)
 	li	r5,0x400
-	std	r3,_DAR(r1)
-	std	r4,_DSISR(r1)
 BEGIN_MMU_FTR_SECTION
 	b	do_hash_page		/* Try to handle as hpte fault */
 MMU_FTR_SECTION_ELSE
@@ -1241,8 +1244,8 @@ EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80)
 EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80)
 INT_KVM_HANDLER 0x480, EXC_STD, PACA_EXSLB, 0
 EXC_COMMON_BEGIN(instruction_access_slb_common)
-	INT_COMMON 0x480, PACA_EXSLB, 1, 1
-	ld	r4,_NIP(r1)
+	INT_COMMON 0x480, PACA_EXSLB, 1, 1, 0, 2, 0
+	ld	r4,_DAR(r1)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 BEGIN_MMU_FTR_SECTION
 	/* HPT case, do SLB fault */
@@ -1258,7 +1261,7 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 	std	r3,RESULT(r1)
 	bl	save_nvgprs
 	RECONCILE_IRQ_STATE(r10, r11)
-	ld	r4,_NIP(r1)
+	ld	r4,_DAR(r1)
 	ld	r5,RESULT(r1)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	do_bad_slb_fault
@@ -1283,13 +1286,8 @@ EXC_VIRT_BEGIN(alignment, 0x4600, 0x100)
 EXC_VIRT_END(alignment, 0x4600, 0x100)
 INT_KVM_HANDLER 0x600, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(alignment_common)
-	INT_COMMON 0x600, PACA_EXGEN, 1, 1
-	ld	r3,PACA_EXGEN+EX_DAR(r13)
-	lwz	r4,PACA_EXGEN+EX_DSISR(r13)
-	std	r3,_DAR(r1)
-	std	r4,_DSISR(r1)
+	INT_COMMON 0x600, PACA_EXGEN, 1, 1, 1, 1, 1
 	bl	save_nvgprs
-	RECONCILE_IRQ_STATE(r10, r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	alignment_exception
 	b	ret_from_except
@@ -1329,9 +1327,8 @@ EXC_COMMON_BEGIN(program_check_common)
 	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
 	b 3f				/* Jump into the macro !!	*/
 2:
-	INT_COMMON 0x700, PACA_EXGEN, 1, 1
+	INT_COMMON 0x700, PACA_EXGEN, 1, 1, 1, 0, 0
 	bl	save_nvgprs
-	RECONCILE_IRQ_STATE(r10, r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	program_check_exception
 	b	ret_from_except
@@ -1345,7 +1342,7 @@ EXC_VIRT_BEGIN(fp_unavailable, 0x4800, 0x100)
 EXC_VIRT_END(fp_unavailable, 0x4800, 0x100)
 INT_KVM_HANDLER 0x800, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(fp_unavailable_common)
-	INT_COMMON 0x800, PACA_EXGEN, 1, 1
+	INT_COMMON 0x800, PACA_EXGEN, 1, 1, 0, 0, 0
 	bne	1f			/* if from user, just load it up */
 	bl	save_nvgprs
 	RECONCILE_IRQ_STATE(r10, r11)
@@ -1561,15 +1558,11 @@ EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20)
 EXC_VIRT_END(h_data_storage, 0x4e00, 0x20)
 INT_KVM_HANDLER 0xe00, EXC_HV, PACA_EXGEN, 1
 EXC_COMMON_BEGIN(h_data_storage_common)
-	INT_COMMON 0xe00, PACA_EXGEN, 1, 1
+	INT_COMMON 0xe00, PACA_EXGEN, 1, 1, 1, 1, 1
 	bl      save_nvgprs
-	RECONCILE_IRQ_STATE(r10, r11)
 	addi    r3,r1,STACK_FRAME_OVERHEAD
 BEGIN_MMU_FTR_SECTION
-	ld	r4,PACA_EXGEN+EX_DAR(r13)
-	lwz	r5,PACA_EXGEN+EX_DSISR(r13)
-	std	r4,_DAR(r1)
-	std	r5,_DSISR(r1)
+	ld	r4,_DAR(r1)
 	li	r5,SIGSEGV
 	bl      bad_page_fault
 MMU_FTR_SECTION_ELSE
@@ -1617,7 +1610,7 @@ EXC_COMMON_BEGIN(hmi_exception_early_common)
 	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
 
 	/* We don't touch AMR here, we never go to virtual mode */
-	INT_COMMON 0xe60, PACA_EXGEN, 0, 0
+	INT_COMMON 0xe60, PACA_EXGEN, 0, 0, 0, 0, 0
 
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	hmi_exception_realmode
@@ -1636,11 +1629,10 @@ EXC_COMMON_BEGIN(hmi_exception_early_common)
 	INT_HANDLER hmi_exception, 0xe60, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1
 
 EXC_COMMON_BEGIN(hmi_exception_common)
-	INT_COMMON 0xe60, PACA_EXGEN, 1, 1
+	INT_COMMON 0xe60, PACA_EXGEN, 1, 1, 1, 0, 0
 	FINISH_NAP
-	bl	save_nvgprs
-	RECONCILE_IRQ_STATE(r10, r11)
 	RUNLATCH_ON
+	bl	save_nvgprs
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	handle_hmi_exception
 	b	ret_from_except
@@ -1694,7 +1686,7 @@ EXC_VIRT_BEGIN(altivec_unavailable, 0x4f20, 0x20)
 EXC_VIRT_END(altivec_unavailable, 0x4f20, 0x20)
 INT_KVM_HANDLER 0xf20, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(altivec_unavailable_common)
-	INT_COMMON 0xf20, PACA_EXGEN, 1, 1
+	INT_COMMON 0xf20, PACA_EXGEN, 1, 1, 0, 0, 0
 #ifdef CONFIG_ALTIVEC
 BEGIN_FTR_SECTION
 	beq	1f
@@ -1735,7 +1727,7 @@ EXC_VIRT_BEGIN(vsx_unavailable, 0x4f40, 0x20)
 EXC_VIRT_END(vsx_unavailable, 0x4f40, 0x20)
 INT_KVM_HANDLER 0xf40, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(vsx_unavailable_common)
-	INT_COMMON 0xf40, PACA_EXGEN, 1, 1
+	INT_COMMON 0xf40, PACA_EXGEN, 1, 1, 0, 0, 0
 #ifdef CONFIG_VSX
 BEGIN_FTR_SECTION
 	beq	1f
@@ -1986,9 +1978,8 @@ EXC_COMMON_BEGIN(soft_nmi_common)
 	mr	r10,r1
 	ld	r1,PACAEMERGSP(r13)
 	subi	r1,r1,INT_FRAME_SIZE
-	INT_COMMON 0x900, PACA_EXGEN, 0, 1
+	INT_COMMON 0x900, PACA_EXGEN, 0, 1, 1, 0, 0
 	bl	save_nvgprs
-	RECONCILE_IRQ_STATE(r10, r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	soft_nmi_interrupt
 	b	ret_from_except

From c7c5cbb42d6e207a059c64740b1654376619345e Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:58 +1000
Subject: [PATCH 178/264] powerpc/64s/exception: move interrupt entry code
 above the common handler

This better reflects the order in which the code is executed.

No generated code change except BUG line number constants.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-34-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 450 +++++++++++++--------------
 1 file changed, 225 insertions(+), 225 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index c2235876e397..aabd84e83615 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -180,101 +180,6 @@ BEGIN_FTR_SECTION_NESTED(943)						\
 	std	ra,offset(r13);						\
 END_FTR_SECTION_NESTED(ftr,ftr,943)
 
-.macro INT_SAVE_SRR_AND_JUMP label, hsrr, set_ri
-	ld	r10,PACAKMSR(r13)	/* get MSR value for kernel */
-	.if ! \set_ri
-	xori	r10,r10,MSR_RI		/* Clear MSR_RI */
-	.endif
-	.if \hsrr == EXC_HV_OR_STD
-	BEGIN_FTR_SECTION
-	mfspr	r11,SPRN_HSRR0		/* save HSRR0 */
-	mfspr	r12,SPRN_HSRR1		/* and HSRR1 */
-	mtspr	SPRN_HSRR1,r10
-	FTR_SECTION_ELSE
-	mfspr	r11,SPRN_SRR0		/* save SRR0 */
-	mfspr	r12,SPRN_SRR1		/* and SRR1 */
-	mtspr	SPRN_SRR1,r10
-	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
-	.elseif \hsrr
-	mfspr	r11,SPRN_HSRR0		/* save HSRR0 */
-	mfspr	r12,SPRN_HSRR1		/* and HSRR1 */
-	mtspr	SPRN_HSRR1,r10
-	.else
-	mfspr	r11,SPRN_SRR0		/* save SRR0 */
-	mfspr	r12,SPRN_SRR1		/* and SRR1 */
-	mtspr	SPRN_SRR1,r10
-	.endif
-	LOAD_HANDLER(r10, \label\())
-	.if \hsrr == EXC_HV_OR_STD
-	BEGIN_FTR_SECTION
-	mtspr	SPRN_HSRR0,r10
-	HRFI_TO_KERNEL
-	FTR_SECTION_ELSE
-	mtspr	SPRN_SRR0,r10
-	RFI_TO_KERNEL
-	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
-	.elseif \hsrr
-	mtspr	SPRN_HSRR0,r10
-	HRFI_TO_KERNEL
-	.else
-	mtspr	SPRN_SRR0,r10
-	RFI_TO_KERNEL
-	.endif
-	b	.	/* prevent speculative execution */
-.endm
-
-/* INT_SAVE_SRR_AND_JUMP works for real or virt, this is faster but virt only */
-.macro INT_VIRT_SAVE_SRR_AND_JUMP label, hsrr
-#ifdef CONFIG_RELOCATABLE
-	.if \hsrr == EXC_HV_OR_STD
-	BEGIN_FTR_SECTION
-	mfspr	r11,SPRN_HSRR0	/* save HSRR0 */
-	FTR_SECTION_ELSE
-	mfspr	r11,SPRN_SRR0	/* save SRR0 */
-	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
-	.elseif \hsrr
-	mfspr	r11,SPRN_HSRR0	/* save HSRR0 */
-	.else
-	mfspr	r11,SPRN_SRR0	/* save SRR0 */
-	.endif
-	LOAD_HANDLER(r12, \label\())
-	mtctr	r12
-	.if \hsrr == EXC_HV_OR_STD
-	BEGIN_FTR_SECTION
-	mfspr	r12,SPRN_HSRR1	/* and HSRR1 */
-	FTR_SECTION_ELSE
-	mfspr	r12,SPRN_SRR1	/* and HSRR1 */
-	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
-	.elseif \hsrr
-	mfspr	r12,SPRN_HSRR1	/* and HSRR1 */
-	.else
-	mfspr	r12,SPRN_SRR1	/* and HSRR1 */
-	.endif
-	li	r10,MSR_RI
-	mtmsrd 	r10,1		/* Set RI (EE=0) */
-	bctr
-#else
-	.if \hsrr == EXC_HV_OR_STD
-	BEGIN_FTR_SECTION
-	mfspr	r11,SPRN_HSRR0		/* save HSRR0 */
-	mfspr	r12,SPRN_HSRR1		/* and HSRR1 */
-	FTR_SECTION_ELSE
-	mfspr	r11,SPRN_SRR0		/* save SRR0 */
-	mfspr	r12,SPRN_SRR1		/* and SRR1 */
-	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
-	.elseif \hsrr
-	mfspr	r11,SPRN_HSRR0		/* save HSRR0 */
-	mfspr	r12,SPRN_HSRR1		/* and HSRR1 */
-	.else
-	mfspr	r11,SPRN_SRR0		/* save SRR0 */
-	mfspr	r12,SPRN_SRR1		/* and SRR1 */
-	.endif
-	li	r10,MSR_RI
-	mtmsrd 	r10,1			/* Set RI (EE=0) */
-	b	\label
-#endif
-.endm
-
 /*
  * Branch to label using its 0xC000 address. This results in instruction
  * address suitable for MSR[IR]=0 or 1, which allows relocation to be turned
@@ -288,6 +193,15 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	mtctr	reg;							\
 	bctr
 
+.macro INT_KVM_HANDLER vec, hsrr, area, skip
+	.if \hsrr
+	TRAMP_KVM_BEGIN(do_kvm_H\vec\())
+	.else
+	TRAMP_KVM_BEGIN(do_kvm_\vec\())
+	.endif
+	KVM_HANDLER \vec, \hsrr, \area, \skip
+.endm
+
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 /*
@@ -390,6 +304,222 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
 .endm
 #endif
 
+.macro INT_SAVE_SRR_AND_JUMP label, hsrr, set_ri
+	ld	r10,PACAKMSR(r13)	/* get MSR value for kernel */
+	.if ! \set_ri
+	xori	r10,r10,MSR_RI		/* Clear MSR_RI */
+	.endif
+	.if \hsrr == EXC_HV_OR_STD
+	BEGIN_FTR_SECTION
+	mfspr	r11,SPRN_HSRR0		/* save HSRR0 */
+	mfspr	r12,SPRN_HSRR1		/* and HSRR1 */
+	mtspr	SPRN_HSRR1,r10
+	FTR_SECTION_ELSE
+	mfspr	r11,SPRN_SRR0		/* save SRR0 */
+	mfspr	r12,SPRN_SRR1		/* and SRR1 */
+	mtspr	SPRN_SRR1,r10
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+	.elseif \hsrr
+	mfspr	r11,SPRN_HSRR0		/* save HSRR0 */
+	mfspr	r12,SPRN_HSRR1		/* and HSRR1 */
+	mtspr	SPRN_HSRR1,r10
+	.else
+	mfspr	r11,SPRN_SRR0		/* save SRR0 */
+	mfspr	r12,SPRN_SRR1		/* and SRR1 */
+	mtspr	SPRN_SRR1,r10
+	.endif
+	LOAD_HANDLER(r10, \label\())
+	.if \hsrr == EXC_HV_OR_STD
+	BEGIN_FTR_SECTION
+	mtspr	SPRN_HSRR0,r10
+	HRFI_TO_KERNEL
+	FTR_SECTION_ELSE
+	mtspr	SPRN_SRR0,r10
+	RFI_TO_KERNEL
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+	.elseif \hsrr
+	mtspr	SPRN_HSRR0,r10
+	HRFI_TO_KERNEL
+	.else
+	mtspr	SPRN_SRR0,r10
+	RFI_TO_KERNEL
+	.endif
+	b	.	/* prevent speculative execution */
+.endm
+
+/* INT_SAVE_SRR_AND_JUMP works for real or virt, this is faster but virt only */
+.macro INT_VIRT_SAVE_SRR_AND_JUMP label, hsrr
+#ifdef CONFIG_RELOCATABLE
+	.if \hsrr == EXC_HV_OR_STD
+	BEGIN_FTR_SECTION
+	mfspr	r11,SPRN_HSRR0	/* save HSRR0 */
+	FTR_SECTION_ELSE
+	mfspr	r11,SPRN_SRR0	/* save SRR0 */
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+	.elseif \hsrr
+	mfspr	r11,SPRN_HSRR0	/* save HSRR0 */
+	.else
+	mfspr	r11,SPRN_SRR0	/* save SRR0 */
+	.endif
+	LOAD_HANDLER(r12, \label\())
+	mtctr	r12
+	.if \hsrr == EXC_HV_OR_STD
+	BEGIN_FTR_SECTION
+	mfspr	r12,SPRN_HSRR1	/* and HSRR1 */
+	FTR_SECTION_ELSE
+	mfspr	r12,SPRN_SRR1	/* and HSRR1 */
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+	.elseif \hsrr
+	mfspr	r12,SPRN_HSRR1	/* and HSRR1 */
+	.else
+	mfspr	r12,SPRN_SRR1	/* and HSRR1 */
+	.endif
+	li	r10,MSR_RI
+	mtmsrd 	r10,1		/* Set RI (EE=0) */
+	bctr
+#else
+	.if \hsrr == EXC_HV_OR_STD
+	BEGIN_FTR_SECTION
+	mfspr	r11,SPRN_HSRR0		/* save HSRR0 */
+	mfspr	r12,SPRN_HSRR1		/* and HSRR1 */
+	FTR_SECTION_ELSE
+	mfspr	r11,SPRN_SRR0		/* save SRR0 */
+	mfspr	r12,SPRN_SRR1		/* and SRR1 */
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+	.elseif \hsrr
+	mfspr	r11,SPRN_HSRR0		/* save HSRR0 */
+	mfspr	r12,SPRN_HSRR1		/* and HSRR1 */
+	.else
+	mfspr	r11,SPRN_SRR0		/* save SRR0 */
+	mfspr	r12,SPRN_SRR1		/* and SRR1 */
+	.endif
+	li	r10,MSR_RI
+	mtmsrd 	r10,1			/* Set RI (EE=0) */
+	b	\label
+#endif
+.endm
+
+/*
+ * This is the BOOK3S interrupt entry code macro.
+ *
+ * This can result in one of several things happening:
+ * - Branch to the _common handler, relocated, in virtual mode.
+ *   These are normal interrupts (synchronous and asynchronous) handled by
+ *   the kernel.
+ * - Branch to KVM, relocated but real mode interrupts remain in real mode.
+ *   These occur when HSTATE_IN_GUEST is set. The interrupt may be caused by
+ *   / intended for host or guest kernel, but KVM must always be involved
+ *   because the machine state is set for guest execution.
+ * - Branch to the masked handler, unrelocated.
+ *   These occur when maskable asynchronous interrupts are taken with the
+ *   irq_soft_mask set.
+ * - Branch to an "early" handler in real mode but relocated.
+ *   This is done if early=1. MCE and HMI use these to handle errors in real
+ *   mode.
+ * - Fall through and continue executing in real, unrelocated mode.
+ *   This is done if early=2.
+ */
+.macro INT_HANDLER name, vec, ool=0, early=0, virt=0, hsrr=0, area=PACA_EXGEN, ri=1, dar=0, dsisr=0, bitmask=0, kvm=0
+	SET_SCRATCH0(r13)			/* save r13 */
+	GET_PACA(r13)
+	std	r9,\area\()+EX_R9(r13)		/* save r9 */
+	OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR)
+	HMT_MEDIUM
+	std	r10,\area\()+EX_R10(r13)	/* save r10 - r12 */
+	OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR)
+	.if \ool
+	.if !\virt
+	b	tramp_real_\name
+	.pushsection .text
+	TRAMP_REAL_BEGIN(tramp_real_\name)
+	.else
+	b	tramp_virt_\name
+	.pushsection .text
+	TRAMP_VIRT_BEGIN(tramp_virt_\name)
+	.endif
+	.endif
+
+	OPT_SAVE_REG_TO_PACA(\area\()+EX_PPR, r9, CPU_FTR_HAS_PPR)
+	OPT_SAVE_REG_TO_PACA(\area\()+EX_CFAR, r10, CPU_FTR_CFAR)
+	INTERRUPT_TO_KERNEL
+	SAVE_CTR(r10, \area\())
+	mfcr	r9
+	.if \kvm
+		KVMTEST \hsrr \vec
+	.endif
+	.if \bitmask
+		lbz	r10,PACAIRQSOFTMASK(r13)
+		andi.	r10,r10,\bitmask
+		/* Associate vector numbers with bits in paca->irq_happened */
+		.if \vec == 0x500 || \vec == 0xea0
+		li	r10,PACA_IRQ_EE
+		.elseif \vec == 0x900
+		li	r10,PACA_IRQ_DEC
+		.elseif \vec == 0xa00 || \vec == 0xe80
+		li	r10,PACA_IRQ_DBELL
+		.elseif \vec == 0xe60
+		li	r10,PACA_IRQ_HMI
+		.elseif \vec == 0xf00
+		li	r10,PACA_IRQ_PMI
+		.else
+		.abort "Bad maskable vector"
+		.endif
+
+		.if \hsrr == EXC_HV_OR_STD
+		BEGIN_FTR_SECTION
+		bne	masked_Hinterrupt
+		FTR_SECTION_ELSE
+		bne	masked_interrupt
+		ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+		.elseif \hsrr
+		bne	masked_Hinterrupt
+		.else
+		bne	masked_interrupt
+		.endif
+	.endif
+
+	std	r11,\area\()+EX_R11(r13)
+	std	r12,\area\()+EX_R12(r13)
+
+	/*
+	 * DAR/DSISR, SCRATCH0 must be read before setting MSR[RI],
+	 * because a d-side MCE will clobber those registers so is
+	 * not recoverable if they are live.
+	 */
+	GET_SCRATCH0(r10)
+	std	r10,\area\()+EX_R13(r13)
+	.if \dar
+	.if \hsrr
+	mfspr	r10,SPRN_HDAR
+	.else
+	mfspr	r10,SPRN_DAR
+	.endif
+	std	r10,\area\()+EX_DAR(r13)
+	.endif
+	.if \dsisr
+	.if \hsrr
+	mfspr	r10,SPRN_HDSISR
+	.else
+	mfspr	r10,SPRN_DSISR
+	.endif
+	stw	r10,\area\()+EX_DSISR(r13)
+	.endif
+
+	.if \early == 2
+	/* nothing more */
+	.elseif \early
+	mfctr	r10			/* save ctr, even for !RELOCATABLE */
+	BRANCH_TO_C000(r11, \name\()_early_common)
+	.elseif !\virt
+	INT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr, \ri
+	.else
+	INT_VIRT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr
+	.endif
+	.if \ool
+	.popsection
+	.endif
+.endm
+
 /*
  * On entry r13 points to the paca, r9-r13 are saved in the paca,
  * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and
@@ -555,136 +685,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 #define FINISH_NAP
 #endif
 
-/*
- * This is the BOOK3S interrupt entry code macro.
- *
- * This can result in one of several things happening:
- * - Branch to the _common handler, relocated, in virtual mode.
- *   These are normal interrupts (synchronous and asynchronous) handled by
- *   the kernel.
- * - Branch to KVM, relocated but real mode interrupts remain in real mode.
- *   These occur when HSTATE_IN_GUEST is set. The interrupt may be caused by
- *   / intended for host or guest kernel, but KVM must always be involved
- *   because the machine state is set for guest execution.
- * - Branch to the masked handler, unrelocated.
- *   These occur when maskable asynchronous interrupts are taken with the
- *   irq_soft_mask set.
- * - Branch to an "early" handler in real mode but relocated.
- *   This is done if early=1. MCE and HMI use these to handle errors in real
- *   mode.
- * - Fall through and continue executing in real, unrelocated mode.
- *   This is done if early=2.
- */
-.macro INT_HANDLER name, vec, ool=0, early=0, virt=0, hsrr=0, area=PACA_EXGEN, ri=1, dar=0, dsisr=0, bitmask=0, kvm=0
-	SET_SCRATCH0(r13)			/* save r13 */
-	GET_PACA(r13)
-	std	r9,\area\()+EX_R9(r13)		/* save r9 */
-	OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR)
-	HMT_MEDIUM
-	std	r10,\area\()+EX_R10(r13)	/* save r10 - r12 */
-	OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR)
-	.if \ool
-	.if !\virt
-	b	tramp_real_\name
-	.pushsection .text
-	TRAMP_REAL_BEGIN(tramp_real_\name)
-	.else
-	b	tramp_virt_\name
-	.pushsection .text
-	TRAMP_VIRT_BEGIN(tramp_virt_\name)
-	.endif
-	.endif
-
-	OPT_SAVE_REG_TO_PACA(\area\()+EX_PPR, r9, CPU_FTR_HAS_PPR)
-	OPT_SAVE_REG_TO_PACA(\area\()+EX_CFAR, r10, CPU_FTR_CFAR)
-	INTERRUPT_TO_KERNEL
-	SAVE_CTR(r10, \area\())
-	mfcr	r9
-	.if \kvm
-		KVMTEST \hsrr \vec
-	.endif
-	.if \bitmask
-		lbz	r10,PACAIRQSOFTMASK(r13)
-		andi.	r10,r10,\bitmask
-		/* Associate vector numbers with bits in paca->irq_happened */
-		.if \vec == 0x500 || \vec == 0xea0
-		li	r10,PACA_IRQ_EE
-		.elseif \vec == 0x900
-		li	r10,PACA_IRQ_DEC
-		.elseif \vec == 0xa00 || \vec == 0xe80
-		li	r10,PACA_IRQ_DBELL
-		.elseif \vec == 0xe60
-		li	r10,PACA_IRQ_HMI
-		.elseif \vec == 0xf00
-		li	r10,PACA_IRQ_PMI
-		.else
-		.abort "Bad maskable vector"
-		.endif
-
-		.if \hsrr == EXC_HV_OR_STD
-		BEGIN_FTR_SECTION
-		bne	masked_Hinterrupt
-		FTR_SECTION_ELSE
-		bne	masked_interrupt
-		ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
-		.elseif \hsrr
-		bne	masked_Hinterrupt
-		.else
-		bne	masked_interrupt
-		.endif
-	.endif
-
-	std	r11,\area\()+EX_R11(r13)
-	std	r12,\area\()+EX_R12(r13)
-
-	/*
-	 * DAR/DSISR, SCRATCH0 must be read before setting MSR[RI],
-	 * because a d-side MCE will clobber those registers so is
-	 * not recoverable if they are live.
-	 */
-	GET_SCRATCH0(r10)
-	std	r10,\area\()+EX_R13(r13)
-	.if \dar
-	.if \hsrr
-	mfspr	r10,SPRN_HDAR
-	.else
-	mfspr	r10,SPRN_DAR
-	.endif
-	std	r10,\area\()+EX_DAR(r13)
-	.endif
-	.if \dsisr
-	.if \hsrr
-	mfspr	r10,SPRN_HDSISR
-	.else
-	mfspr	r10,SPRN_DSISR
-	.endif
-	stw	r10,\area\()+EX_DSISR(r13)
-	.endif
-
-	.if \early == 2
-	/* nothing more */
-	.elseif \early
-	mfctr	r10			/* save ctr, even for !RELOCATABLE */
-	BRANCH_TO_C000(r11, \name\()_early_common)
-	.elseif !\virt
-	INT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr, \ri
-	.else
-	INT_VIRT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr
-	.endif
-	.if \ool
-	.popsection
-	.endif
-.endm
-
-.macro INT_KVM_HANDLER vec, hsrr, area, skip
-	.if \hsrr
-	TRAMP_KVM_BEGIN(do_kvm_H\vec\())
-	.else
-	TRAMP_KVM_BEGIN(do_kvm_\vec\())
-	.endif
-	KVM_HANDLER \vec, \hsrr, \area, \skip
-.endm
-
 #define EXC_COMMON(name, realvec, hdlr)					\
 	EXC_COMMON_BEGIN(name);						\
 	INT_COMMON realvec, PACA_EXGEN, 1, 1, 1, 0, 0 ;			\

From 1b3599829a2560fe6c9a5a4a6bb6b2bc4e5bdaee Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:56:59 +1000
Subject: [PATCH 179/264] powerpc/64s/exception: program check handler do not
 branch into a macro

It is clever, but the small code saving is not worth the spaghetti of
jumping to a label in an expanded macro, particularly when the label
is just a number rather than a descriptive name.

So expand the INT_COMMON macro twice, once each for the stack and
no-stack cases, and branch to those. The slight code size increase is
worth the improved clarity of the branches in this non-performance
critical code.
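
The shape of the change, condensed from the hunk below (the third
INT_COMMON argument is the "stack" flag):

	/* before: branch to a numeric label hidden inside the expansion */
	b	3f		/* "3:" only exists inside INT_COMMON */
2:
	INT_COMMON 0x700, PACA_EXGEN, 1, 1, 1, 0, 0	/* emits "3:" itself */

	/* after: one expansion per case, an ordinary branch over the second */
	INT_COMMON 0x700, PACA_EXGEN, 0, 1, 1, 0, 0	/* r1 already set up */
	b	3f
2:
	INT_COMMON 0x700, PACA_EXGEN, 1, 1, 1, 0, 0	/* macro sets up r1 */
3: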

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-35-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index aabd84e83615..b0649d56bb15 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -533,11 +533,10 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
 	andi.	r10,r12,MSR_PR		/* See if coming from user	*/
 	mr	r10,r1			/* Save r1			*/
 	subi	r1,r1,INT_FRAME_SIZE	/* alloc frame on kernel stack	*/
-	beq-	1f
+	beq-	100f
 	ld	r1,PACAKSAVE(r13)	/* kernel stack to use		*/
-1:	tdgei	r1,-INT_FRAME_SIZE	/* trap if r1 is in userspace	*/
-	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0
-3:
+100:	tdgei	r1,-INT_FRAME_SIZE	/* trap if r1 is in userspace	*/
+	EMIT_BUG_ENTRY 100b,__FILE__,__LINE__,0
 	.endif
 
 	std	r9,_CCR(r1)		/* save CR in stackframe	*/
@@ -551,10 +550,10 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
 	.if \kaup
 	kuap_save_amr_and_lock r9, r10, cr1, cr0
 	.endif
-	beq	4f			/* if from kernel mode		*/
+	beq	101f			/* if from kernel mode		*/
 	ACCOUNT_CPU_USER_ENTRY(r13, r9, r10)
 	SAVE_PPR(\area, r9)
-4:
+101:
 	.else
 	.if \kaup
 	kuap_save_amr_and_lock r9, r10, cr1
@@ -1325,9 +1324,11 @@ EXC_COMMON_BEGIN(program_check_common)
 	mr	r10,r1			/* Save r1			*/
 	ld	r1,PACAEMERGSP(r13)	/* Use emergency stack		*/
 	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
-	b 3f				/* Jump into the macro !!	*/
+	INT_COMMON 0x700, PACA_EXGEN, 0, 1, 1, 0, 0
+	b 3f
 2:
 	INT_COMMON 0x700, PACA_EXGEN, 1, 1, 1, 0, 0
+3:
 	bl	save_nvgprs
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	program_check_exception

From 05f97d94dd0e2883bc7b2e6b7b5e4c088e0d1437 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:57:00 +1000
Subject: [PATCH 180/264] powerpc/64s/exception: Remove pointless KVM handler
 name bifurcation

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-36-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S | 102 ++++++++++++---------------
 1 file changed, 44 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index b0649d56bb15..f54a38417904 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -193,12 +193,8 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	mtctr	reg;							\
 	bctr
 
-.macro INT_KVM_HANDLER vec, hsrr, area, skip
-	.if \hsrr
-	TRAMP_KVM_BEGIN(do_kvm_H\vec\())
-	.else
-	TRAMP_KVM_BEGIN(do_kvm_\vec\())
-	.endif
+.macro INT_KVM_HANDLER name, vec, hsrr, area, skip
+	TRAMP_KVM_BEGIN(\name\()_kvm)
 	KVM_HANDLER \vec, \hsrr, \area, \skip
 .endm
 
@@ -214,20 +210,10 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 #define kvmppc_interrupt kvmppc_interrupt_pr
 #endif
 
-.macro KVMTEST hsrr, n
+.macro KVMTEST name, hsrr, n
 	lbz	r10,HSTATE_IN_GUEST(r13)
 	cmpwi	r10,0
-	.if \hsrr == EXC_HV_OR_STD
-	BEGIN_FTR_SECTION
-	bne	do_kvm_H\n
-	FTR_SECTION_ELSE
-	bne	do_kvm_\n
-	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
-	.elseif \hsrr
-	bne	do_kvm_H\n
-	.else
-	bne	do_kvm_\n
-	.endif
+	bne	\name\()_kvm
 .endm
 
 .macro KVM_HANDLER vec, hsrr, area, skip
@@ -298,9 +284,9 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
 .endm
 
 #else
-.macro KVMTEST hsrr, n
+.macro KVMTEST name, hsrr, n
 .endm
-.macro KVM_HANDLER vec, hsrr, area, skip
+.macro KVM_HANDLER name, vec, hsrr, area, skip
 .endm
 #endif
 
@@ -445,7 +431,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
 	SAVE_CTR(r10, \area\())
 	mfcr	r9
 	.if \kvm
-		KVMTEST \hsrr \vec
+		KVMTEST \name \hsrr \vec
 	.endif
 	.if \bitmask
 		lbz	r10,PACAIRQSOFTMASK(r13)
@@ -842,7 +828,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 	 */
 EXC_REAL_END(system_reset, 0x100, 0x100)
 EXC_VIRT_NONE(0x4100, 0x100)
-INT_KVM_HANDLER 0x100, EXC_STD, PACA_EXNMI, 0
+INT_KVM_HANDLER system_reset 0x100, EXC_STD, PACA_EXNMI, 0
 
 #ifdef CONFIG_PPC_P7_NAP
 TRAMP_REAL_BEGIN(system_reset_idle_wake)
@@ -936,7 +922,7 @@ TRAMP_REAL_BEGIN(machine_check_fwnmi)
 	INT_HANDLER machine_check, 0x200, early=1, area=PACA_EXMC, dar=1, dsisr=1
 #endif
 
-INT_KVM_HANDLER 0x200, EXC_STD, PACA_EXMC, 1
+INT_KVM_HANDLER machine_check 0x200, EXC_STD, PACA_EXMC, 1
 
 #define MACHINE_CHECK_HANDLER_WINDUP			\
 	/* Clear MSR_RI before setting SRR0 and SRR1. */\
@@ -1022,8 +1008,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
 	/*
 	 * Check if we are coming from guest. If yes, then run the normal
-	 * exception handler which will take the do_kvm_200->kvmppc_interrupt
-	 * branch to deliver the MC event to guest.
+	 * exception handler which will take the
+	 * machine_check_kvm->kvmppc_interrupt branch to deliver the MC event
+	 * to guest.
 	 */
 	lbz	r11,HSTATE_IN_GUEST(r13)
 	cmpwi	r11,0			/* Check if coming from guest */
@@ -1163,7 +1150,7 @@ EXC_REAL_END(data_access, 0x300, 0x80)
 EXC_VIRT_BEGIN(data_access, 0x4300, 0x80)
 	INT_HANDLER data_access, 0x300, virt=1, dar=1, dsisr=1
 EXC_VIRT_END(data_access, 0x4300, 0x80)
-INT_KVM_HANDLER 0x300, EXC_STD, PACA_EXGEN, 1
+INT_KVM_HANDLER data_access, 0x300, EXC_STD, PACA_EXGEN, 1
 EXC_COMMON_BEGIN(data_access_common)
 	/*
 	 * Here r13 points to the paca, r9 contains the saved CR,
@@ -1189,7 +1176,7 @@ EXC_REAL_END(data_access_slb, 0x380, 0x80)
 EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80)
 	INT_HANDLER data_access_slb, 0x380, virt=1, area=PACA_EXSLB, dar=1
 EXC_VIRT_END(data_access_slb, 0x4380, 0x80)
-INT_KVM_HANDLER 0x380, EXC_STD, PACA_EXSLB, 1
+INT_KVM_HANDLER data_access_slb, 0x380, EXC_STD, PACA_EXSLB, 1
 EXC_COMMON_BEGIN(data_access_slb_common)
 	INT_COMMON 0x380, PACA_EXSLB, 1, 1, 0, 1, 0
 	ld	r4,_DAR(r1)
@@ -1221,7 +1208,7 @@ EXC_REAL_END(instruction_access, 0x400, 0x80)
 EXC_VIRT_BEGIN(instruction_access, 0x4400, 0x80)
 	INT_HANDLER instruction_access, 0x400, virt=1
 EXC_VIRT_END(instruction_access, 0x4400, 0x80)
-INT_KVM_HANDLER 0x400, EXC_STD, PACA_EXGEN, 0
+INT_KVM_HANDLER instruction_access, 0x400, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(instruction_access_common)
 	INT_COMMON 0x400, PACA_EXGEN, 1, 1, 1, 2, 2
 	ld      r12,_MSR(r1)
@@ -1241,7 +1228,7 @@ EXC_REAL_END(instruction_access_slb, 0x480, 0x80)
 EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80)
 	INT_HANDLER instruction_access_slb, 0x480, virt=1, area=PACA_EXSLB
 EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80)
-INT_KVM_HANDLER 0x480, EXC_STD, PACA_EXSLB, 0
+INT_KVM_HANDLER instruction_access_slb, 0x480, EXC_STD, PACA_EXSLB, 0
 EXC_COMMON_BEGIN(instruction_access_slb_common)
 	INT_COMMON 0x480, PACA_EXSLB, 1, 1, 0, 2, 0
 	ld	r4,_DAR(r1)
@@ -1272,8 +1259,7 @@ EXC_REAL_END(hardware_interrupt, 0x500, 0x100)
 EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100)
 	INT_HANDLER hardware_interrupt, 0x500, virt=1, hsrr=EXC_HV_OR_STD, bitmask=IRQS_DISABLED, kvm=1
 EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100)
-INT_KVM_HANDLER 0x500, EXC_STD, PACA_EXGEN, 0
-INT_KVM_HANDLER 0x500, EXC_HV, PACA_EXGEN, 0
+INT_KVM_HANDLER hardware_interrupt, 0x500, EXC_HV_OR_STD, PACA_EXGEN, 0
 EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ)
 
 
@@ -1283,7 +1269,7 @@ EXC_REAL_END(alignment, 0x600, 0x100)
 EXC_VIRT_BEGIN(alignment, 0x4600, 0x100)
 	INT_HANDLER alignment, 0x600, virt=1, dar=1, dsisr=1
 EXC_VIRT_END(alignment, 0x4600, 0x100)
-INT_KVM_HANDLER 0x600, EXC_STD, PACA_EXGEN, 0
+INT_KVM_HANDLER alignment, 0x600, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(alignment_common)
 	INT_COMMON 0x600, PACA_EXGEN, 1, 1, 1, 1, 1
 	bl	save_nvgprs
@@ -1298,7 +1284,7 @@ EXC_REAL_END(program_check, 0x700, 0x100)
 EXC_VIRT_BEGIN(program_check, 0x4700, 0x100)
 	INT_HANDLER program_check, 0x700, virt=1
 EXC_VIRT_END(program_check, 0x4700, 0x100)
-INT_KVM_HANDLER 0x700, EXC_STD, PACA_EXGEN, 0
+INT_KVM_HANDLER program_check, 0x700, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(program_check_common)
 	/*
 	 * It's possible to receive a TM Bad Thing type program check with
@@ -1341,7 +1327,7 @@ EXC_REAL_END(fp_unavailable, 0x800, 0x100)
 EXC_VIRT_BEGIN(fp_unavailable, 0x4800, 0x100)
 	INT_HANDLER fp_unavailable, 0x800, virt=1
 EXC_VIRT_END(fp_unavailable, 0x4800, 0x100)
-INT_KVM_HANDLER 0x800, EXC_STD, PACA_EXGEN, 0
+INT_KVM_HANDLER fp_unavailable, 0x800, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(fp_unavailable_common)
 	INT_COMMON 0x800, PACA_EXGEN, 1, 1, 0, 0, 0
 	bne	1f			/* if from user, just load it up */
@@ -1379,7 +1365,7 @@ EXC_REAL_END(decrementer, 0x900, 0x80)
 EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80)
 	INT_HANDLER decrementer, 0x900, virt=1, bitmask=IRQS_DISABLED
 EXC_VIRT_END(decrementer, 0x4900, 0x80)
-INT_KVM_HANDLER 0x900, EXC_STD, PACA_EXGEN, 0
+INT_KVM_HANDLER decrementer, 0x900, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt)
 
 
@@ -1389,7 +1375,7 @@ EXC_REAL_END(hdecrementer, 0x980, 0x80)
 EXC_VIRT_BEGIN(hdecrementer, 0x4980, 0x80)
 	INT_HANDLER hdecrementer, 0x980, virt=1, hsrr=EXC_HV, kvm=1
 EXC_VIRT_END(hdecrementer, 0x4980, 0x80)
-INT_KVM_HANDLER 0x980, EXC_HV, PACA_EXGEN, 0
+INT_KVM_HANDLER hdecrementer, 0x980, EXC_HV, PACA_EXGEN, 0
 EXC_COMMON(hdecrementer_common, 0x980, hdec_interrupt)
 
 
@@ -1399,7 +1385,7 @@ EXC_REAL_END(doorbell_super, 0xa00, 0x100)
 EXC_VIRT_BEGIN(doorbell_super, 0x4a00, 0x100)
 	INT_HANDLER doorbell_super, 0xa00, virt=1, bitmask=IRQS_DISABLED
 EXC_VIRT_END(doorbell_super, 0x4a00, 0x100)
-INT_KVM_HANDLER 0xa00, EXC_STD, PACA_EXGEN, 0
+INT_KVM_HANDLER doorbell_super, 0xa00, EXC_STD, PACA_EXGEN, 0
 #ifdef CONFIG_PPC_DOORBELL
 EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, doorbell_exception)
 #else
@@ -1458,7 +1444,7 @@ EXC_VIRT_NONE(0x4b00, 0x100)
 	GET_PACA(r13)
 	std	r10,PACA_EXGEN+EX_R10(r13)
 	INTERRUPT_TO_KERNEL
-	KVMTEST EXC_STD 0xc00 /* uses r10, branch to do_kvm_0xc00_system_call */
+	KVMTEST system_call EXC_STD 0xc00 /* uses r10, branch to system_call_kvm */
 	mfctr	r9
 #else
 	mr	r9,r13
@@ -1524,7 +1510,7 @@ EXC_VIRT_END(system_call, 0x4c00, 0x100)
 	 * ctr = orig r13
 	 * orig r10 saved in PACA
 	 */
-TRAMP_KVM_BEGIN(do_kvm_0xc00)
+TRAMP_KVM_BEGIN(system_call_kvm)
 	 /*
 	  * Save the PPR (on systems that support it) before changing to
 	  * HMT_MEDIUM. That allows the KVM code to save that value into the
@@ -1547,7 +1533,7 @@ EXC_REAL_END(single_step, 0xd00, 0x100)
 EXC_VIRT_BEGIN(single_step, 0x4d00, 0x100)
 	INT_HANDLER single_step, 0xd00, virt=1
 EXC_VIRT_END(single_step, 0x4d00, 0x100)
-INT_KVM_HANDLER 0xd00, EXC_STD, PACA_EXGEN, 0
+INT_KVM_HANDLER single_step, 0xd00, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON(single_step_common, 0xd00, single_step_exception)
 
 
@@ -1557,7 +1543,7 @@ EXC_REAL_END(h_data_storage, 0xe00, 0x20)
 EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20)
 	INT_HANDLER h_data_storage, 0xe00, ool=1, virt=1, hsrr=EXC_HV, dar=1, dsisr=1, kvm=1
 EXC_VIRT_END(h_data_storage, 0x4e00, 0x20)
-INT_KVM_HANDLER 0xe00, EXC_HV, PACA_EXGEN, 1
+INT_KVM_HANDLER h_data_storage, 0xe00, EXC_HV, PACA_EXGEN, 1
 EXC_COMMON_BEGIN(h_data_storage_common)
 	INT_COMMON 0xe00, PACA_EXGEN, 1, 1, 1, 1, 1
 	bl      save_nvgprs
@@ -1578,7 +1564,7 @@ EXC_REAL_END(h_instr_storage, 0xe20, 0x20)
 EXC_VIRT_BEGIN(h_instr_storage, 0x4e20, 0x20)
 	INT_HANDLER h_instr_storage, 0xe20, ool=1, virt=1, hsrr=EXC_HV, kvm=1
 EXC_VIRT_END(h_instr_storage, 0x4e20, 0x20)
-INT_KVM_HANDLER 0xe20, EXC_HV, PACA_EXGEN, 0
+INT_KVM_HANDLER h_instr_storage, 0xe20, EXC_HV, PACA_EXGEN, 0
 EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception)
 
 
@@ -1588,7 +1574,7 @@ EXC_REAL_END(emulation_assist, 0xe40, 0x20)
 EXC_VIRT_BEGIN(emulation_assist, 0x4e40, 0x20)
 	INT_HANDLER emulation_assist, 0xe40, ool=1, virt=1, hsrr=EXC_HV, kvm=1
 EXC_VIRT_END(emulation_assist, 0x4e40, 0x20)
-INT_KVM_HANDLER 0xe40, EXC_HV, PACA_EXGEN, 0
+INT_KVM_HANDLER emulation_assist, 0xe40, EXC_HV, PACA_EXGEN, 0
 EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt)
 
 
@@ -1601,7 +1587,7 @@ EXC_REAL_BEGIN(hmi_exception, 0xe60, 0x20)
 	INT_HANDLER hmi_exception, 0xe60, ool=1, early=1, hsrr=EXC_HV, ri=0, kvm=1
 EXC_REAL_END(hmi_exception, 0xe60, 0x20)
 EXC_VIRT_NONE(0x4e60, 0x20)
-INT_KVM_HANDLER 0xe60, EXC_HV, PACA_EXGEN, 0
+INT_KVM_HANDLER hmi_exception, 0xe60, EXC_HV, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(hmi_exception_early_common)
 	mtctr	r10			/* Restore ctr */
 	mfspr	r11,SPRN_HSRR0		/* Save HSRR0 */
@@ -1645,7 +1631,7 @@ EXC_REAL_END(h_doorbell, 0xe80, 0x20)
 EXC_VIRT_BEGIN(h_doorbell, 0x4e80, 0x20)
 	INT_HANDLER h_doorbell, 0xe80, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1
 EXC_VIRT_END(h_doorbell, 0x4e80, 0x20)
-INT_KVM_HANDLER 0xe80, EXC_HV, PACA_EXGEN, 0
+INT_KVM_HANDLER h_doorbell, 0xe80, EXC_HV, PACA_EXGEN, 0
 #ifdef CONFIG_PPC_DOORBELL
 EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, doorbell_exception)
 #else
@@ -1659,7 +1645,7 @@ EXC_REAL_END(h_virt_irq, 0xea0, 0x20)
 EXC_VIRT_BEGIN(h_virt_irq, 0x4ea0, 0x20)
 	INT_HANDLER h_virt_irq, 0xea0, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1
 EXC_VIRT_END(h_virt_irq, 0x4ea0, 0x20)
-INT_KVM_HANDLER 0xea0, EXC_HV, PACA_EXGEN, 0
+INT_KVM_HANDLER h_virt_irq, 0xea0, EXC_HV, PACA_EXGEN, 0
 EXC_COMMON_ASYNC(h_virt_irq_common, 0xea0, do_IRQ)
 
 
@@ -1675,7 +1661,7 @@ EXC_REAL_END(performance_monitor, 0xf00, 0x20)
 EXC_VIRT_BEGIN(performance_monitor, 0x4f00, 0x20)
 	INT_HANDLER performance_monitor, 0xf00, ool=1, virt=1, bitmask=IRQS_PMI_DISABLED
 EXC_VIRT_END(performance_monitor, 0x4f00, 0x20)
-INT_KVM_HANDLER 0xf00, EXC_STD, PACA_EXGEN, 0
+INT_KVM_HANDLER performance_monitor, 0xf00, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_ASYNC(performance_monitor_common, 0xf00, performance_monitor_exception)
 
 
@@ -1685,7 +1671,7 @@ EXC_REAL_END(altivec_unavailable, 0xf20, 0x20)
 EXC_VIRT_BEGIN(altivec_unavailable, 0x4f20, 0x20)
 	INT_HANDLER altivec_unavailable, 0xf20, ool=1, virt=1
 EXC_VIRT_END(altivec_unavailable, 0x4f20, 0x20)
-INT_KVM_HANDLER 0xf20, EXC_STD, PACA_EXGEN, 0
+INT_KVM_HANDLER altivec_unavailable, 0xf20, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(altivec_unavailable_common)
 	INT_COMMON 0xf20, PACA_EXGEN, 1, 1, 0, 0, 0
 #ifdef CONFIG_ALTIVEC
@@ -1726,7 +1712,7 @@ EXC_REAL_END(vsx_unavailable, 0xf40, 0x20)
 EXC_VIRT_BEGIN(vsx_unavailable, 0x4f40, 0x20)
 	INT_HANDLER vsx_unavailable, 0xf40, ool=1, virt=1
 EXC_VIRT_END(vsx_unavailable, 0x4f40, 0x20)
-INT_KVM_HANDLER 0xf40, EXC_STD, PACA_EXGEN, 0
+INT_KVM_HANDLER vsx_unavailable, 0xf40, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(vsx_unavailable_common)
 	INT_COMMON 0xf40, PACA_EXGEN, 1, 1, 0, 0, 0
 #ifdef CONFIG_VSX
@@ -1766,7 +1752,7 @@ EXC_REAL_END(facility_unavailable, 0xf60, 0x20)
 EXC_VIRT_BEGIN(facility_unavailable, 0x4f60, 0x20)
 	INT_HANDLER facility_unavailable, 0xf60, ool=1, virt=1
 EXC_VIRT_END(facility_unavailable, 0x4f60, 0x20)
-INT_KVM_HANDLER 0xf60, EXC_STD, PACA_EXGEN, 0
+INT_KVM_HANDLER facility_unavailable, 0xf60, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON(facility_unavailable_common, 0xf60, facility_unavailable_exception)
 
 
@@ -1776,7 +1762,7 @@ EXC_REAL_END(h_facility_unavailable, 0xf80, 0x20)
 EXC_VIRT_BEGIN(h_facility_unavailable, 0x4f80, 0x20)
 	INT_HANDLER h_facility_unavailable, 0xf80, ool=1, virt=1, hsrr=EXC_HV, kvm=1
 EXC_VIRT_END(h_facility_unavailable, 0x4f80, 0x20)
-INT_KVM_HANDLER 0xf80, EXC_HV, PACA_EXGEN, 0
+INT_KVM_HANDLER h_facility_unavailable, 0xf80, EXC_HV, PACA_EXGEN, 0
 EXC_COMMON(h_facility_unavailable_common, 0xf80, facility_unavailable_exception)
 
 
@@ -1797,7 +1783,7 @@ EXC_REAL_BEGIN(cbe_system_error, 0x1200, 0x100)
 	INT_HANDLER cbe_system_error, 0x1200, ool=1, hsrr=EXC_HV, kvm=1
 EXC_REAL_END(cbe_system_error, 0x1200, 0x100)
 EXC_VIRT_NONE(0x5200, 0x100)
-INT_KVM_HANDLER 0x1200, EXC_HV, PACA_EXGEN, 1
+INT_KVM_HANDLER cbe_system_error, 0x1200, EXC_HV, PACA_EXGEN, 1
 EXC_COMMON(cbe_system_error_common, 0x1200, cbe_system_error_exception)
 #else /* CONFIG_CBE_RAS */
 EXC_REAL_NONE(0x1200, 0x100)
@@ -1811,7 +1797,7 @@ EXC_REAL_END(instruction_breakpoint, 0x1300, 0x100)
 EXC_VIRT_BEGIN(instruction_breakpoint, 0x5300, 0x100)
 	INT_HANDLER instruction_breakpoint, 0x1300, virt=1
 EXC_VIRT_END(instruction_breakpoint, 0x5300, 0x100)
-INT_KVM_HANDLER 0x1300, EXC_STD, PACA_EXGEN, 1
+INT_KVM_HANDLER instruction_breakpoint, 0x1300, EXC_STD, PACA_EXGEN, 1
 EXC_COMMON(instruction_breakpoint_common, 0x1300, instruction_breakpoint_exception)
 
 
@@ -1825,7 +1811,7 @@ EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x100)
 	andis.	r10,r10,(HSRR1_DENORM)@h /* denorm? */
 	bne+	denorm_assist
 #endif
-	KVMTEST EXC_HV 0x1500
+	KVMTEST denorm_exception_hv, EXC_HV 0x1500
 	INT_SAVE_SRR_AND_JUMP denorm_common, EXC_HV, 1
 EXC_REAL_END(denorm_exception_hv, 0x1500, 0x100)
 
@@ -1841,7 +1827,7 @@ EXC_VIRT_END(denorm_exception, 0x5500, 0x100)
 EXC_VIRT_NONE(0x5500, 0x100)
 #endif
 
-INT_KVM_HANDLER 0x1500, EXC_HV, PACA_EXGEN, 0
+INT_KVM_HANDLER denorm_exception_hv, 0x1500, EXC_HV, PACA_EXGEN, 0
 
 #ifdef CONFIG_PPC_DENORMALISATION
 TRAMP_REAL_BEGIN(denorm_assist)
@@ -1920,7 +1906,7 @@ EXC_REAL_BEGIN(cbe_maintenance, 0x1600, 0x100)
 	INT_HANDLER cbe_maintenance, 0x1600, ool=1, hsrr=EXC_HV, kvm=1
 EXC_REAL_END(cbe_maintenance, 0x1600, 0x100)
 EXC_VIRT_NONE(0x5600, 0x100)
-INT_KVM_HANDLER 0x1600, EXC_HV, PACA_EXGEN, 1
+INT_KVM_HANDLER cbe_maintenance, 0x1600, EXC_HV, PACA_EXGEN, 1
 EXC_COMMON(cbe_maintenance_common, 0x1600, cbe_maintenance_exception)
 #else /* CONFIG_CBE_RAS */
 EXC_REAL_NONE(0x1600, 0x100)
@@ -1934,7 +1920,7 @@ EXC_REAL_END(altivec_assist, 0x1700, 0x100)
 EXC_VIRT_BEGIN(altivec_assist, 0x5700, 0x100)
 	INT_HANDLER altivec_assist, 0x1700, virt=1
 EXC_VIRT_END(altivec_assist, 0x5700, 0x100)
-INT_KVM_HANDLER 0x1700, EXC_STD, PACA_EXGEN, 0
+INT_KVM_HANDLER altivec_assist, 0x1700, EXC_STD, PACA_EXGEN, 0
 #ifdef CONFIG_ALTIVEC
 EXC_COMMON(altivec_assist_common, 0x1700, altivec_assist_exception)
 #else
@@ -1947,7 +1933,7 @@ EXC_REAL_BEGIN(cbe_thermal, 0x1800, 0x100)
 	INT_HANDLER cbe_thermal, 0x1800, ool=1, hsrr=EXC_HV, kvm=1
 EXC_REAL_END(cbe_thermal, 0x1800, 0x100)
 EXC_VIRT_NONE(0x5800, 0x100)
-INT_KVM_HANDLER 0x1800, EXC_HV, PACA_EXGEN, 1
+INT_KVM_HANDLER cbe_thermal, 0x1800, EXC_HV, PACA_EXGEN, 1
 EXC_COMMON(cbe_thermal_common, 0x1800, cbe_thermal_exception)
 #else /* CONFIG_CBE_RAS */
 EXC_REAL_NONE(0x1800, 0x100)

From 9b123d1ea23701bc00ebf712f1e03b25b8195eeb Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 2 Aug 2019 20:57:01 +1000
Subject: [PATCH 181/264] powerpc/64s/exception: reduce unnecessary page
 fault loads

This avoids 3 loads in the radix page fault case, 1 load in the
hash fault case, and 2 loads in the hash miss page fault case.
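
The win comes from reordering __hash_page()'s parameters so that the
values the fault path already has loaded land directly in the ELF ABI
argument registers. A sketch of the resulting mapping (the register
comments are illustrative):

	/* ppc64 ELF ABI: the first four arguments arrive in r3..r6 */
	int __hash_page(unsigned long trap,	/* r3: 0x300 or 0x400	*/
			unsigned long ea,	/* r4: faulting address	*/
			unsigned long dsisr,	/* r5: fault status	*/
			unsigned long msr);	/* r6: MSR at interrupt	*/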

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190802105709.27696-37-npiggin@gmail.com
---
 arch/powerpc/kernel/exceptions-64s.S  | 38 ++++++++++++---------------
 arch/powerpc/mm/book3s64/hash_utils.c |  4 +--
 2 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index f54a38417904..d0018dd17e0a 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1159,11 +1159,11 @@ EXC_COMMON_BEGIN(data_access_common)
 	 * EX_DAR and EX_DSISR have saved DAR/DSISR
 	 */
 	INT_COMMON 0x300, PACA_EXGEN, 1, 1, 1, 1, 1
-	ld	r12,_MSR(r1)
-	ld	r3,_DAR(r1)
-	ld	r4,_DSISR(r1)
-	li	r5,0x300
+	ld	r4,_DAR(r1)
+	ld	r5,_DSISR(r1)
 BEGIN_MMU_FTR_SECTION
+	ld	r6,_MSR(r1)
+	li	r3,0x300
 	b	do_hash_page		/* Try to handle as hpte fault */
 MMU_FTR_SECTION_ELSE
 	b	handle_page_fault
@@ -1211,11 +1211,11 @@ EXC_VIRT_END(instruction_access, 0x4400, 0x80)
 INT_KVM_HANDLER instruction_access, 0x400, EXC_STD, PACA_EXGEN, 0
 EXC_COMMON_BEGIN(instruction_access_common)
 	INT_COMMON 0x400, PACA_EXGEN, 1, 1, 1, 2, 2
-	ld      r12,_MSR(r1)
-	ld	r3,_DAR(r1)
-	ld	r4,_DSISR(r1)
-	li	r5,0x400
+	ld	r4,_DAR(r1)
+	ld	r5,_DSISR(r1)
 BEGIN_MMU_FTR_SECTION
+	ld      r6,_MSR(r1)
+	li	r3,0x400
 	b	do_hash_page		/* Try to handle as hpte fault */
 MMU_FTR_SECTION_ELSE
 	b	handle_page_fault
@@ -2260,7 +2260,7 @@ do_hash_page:
 #ifdef CONFIG_PPC_BOOK3S_64
 	lis	r0,(DSISR_BAD_FAULT_64S | DSISR_DABRMATCH | DSISR_KEYFAULT)@h
 	ori	r0,r0,DSISR_BAD_FAULT_64S@l
-	and.	r0,r4,r0		/* weird error? */
+	and.	r0,r5,r0		/* weird error? */
 	bne-	handle_page_fault	/* if not, try to insert a HPTE */
 	ld	r11, PACA_THREAD_INFO(r13)
 	lwz	r0,TI_PREEMPT(r11)	/* If we're in an "NMI" */
@@ -2268,15 +2268,13 @@ do_hash_page:
 	bne	77f			/* then don't call hash_page now */
 
 	/*
-	 * r3 contains the faulting address
-	 * r4 msr
-	 * r5 contains the trap number
-	 * r6 contains dsisr
+	 * r3 contains the trap number
+	 * r4 contains the faulting address
+	 * r5 contains dsisr
+	 * r6 msr
 	 *
 	 * at return r3 = 0 for success, 1 for page fault, negative for error
 	 */
-        mr 	r4,r12
-	ld      r6,_DSISR(r1)
 	bl	__hash_page		/* build HPTE if possible */
         cmpdi	r3,0			/* see if __hash_page succeeded */
 
@@ -2286,16 +2284,15 @@ do_hash_page:
 	/* Error */
 	blt-	13f
 
-	/* Reload DSISR into r4 for the DABR check below */
-	ld      r4,_DSISR(r1)
+	/* Reload DAR/DSISR into r4/r5 for the DABR check below */
+	ld	r4,_DAR(r1)
+	ld      r5,_DSISR(r1)
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 /* Here we have a page fault that hash_page can't handle. */
 handle_page_fault:
-11:	andis.  r0,r4,DSISR_DABRMATCH@h
+11:	andis.  r0,r5,DSISR_DABRMATCH@h
 	bne-    handle_dabr_fault
-	ld	r4,_DAR(r1)
-	ld	r5,_DSISR(r1)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	do_page_fault
 	cmpdi	r3,0
@@ -2342,7 +2339,6 @@ handle_dabr_fault:
  * the access, or panic if there isn't a handler.
  */
 77:	bl	save_nvgprs
-	mr	r4,r3
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	li	r5,SIGSEGV
 	bl	bad_page_fault
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 4d0bb194ed70..fe99bba39b69 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1462,8 +1462,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap,
 }
 EXPORT_SYMBOL_GPL(hash_page);
 
-int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap,
-		unsigned long dsisr)
+int __hash_page(unsigned long trap, unsigned long ea, unsigned long dsisr,
+		unsigned long msr)
 {
 	unsigned long access = _PAGE_PRESENT | _PAGE_READ;
 	unsigned long flags = 0;

From 799abe283e5103d48e079149579b4f167c95ea0e Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Tue, 3 Sep 2019 20:15:52 +1000
Subject: [PATCH 182/264] powerpc/eeh: Clean up EEH PEs after recovery finishes

When the last device in an eeh_pe is removed, the eeh_pe structure itself
is freed, along with any empty parents, since they are no longer needed.
This results in a crash when a hotplug driver is involved, since the
following may occur:

1. Device is surprise removed.
2. Driver performs an MMIO, which fails and queues an eeh_event.
3. Hotplug driver receives a hotplug interrupt and removes any
   pci_devs that were under the slot.
4. pci_dev is torn down and the eeh_pe is freed.
5. The EEH event handler thread processes the eeh_event and crashes
   since the eeh_pe pointer in the eeh_event structure is no
   longer valid.

Crashing is generally considered poor form. Instead of doing that, use
the fact that PEs are marked as EEH_PE_INVALID to keep them around until
the end of the recovery cycle, at which point we can safely prune any
empty PEs.
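
The pruning rule, pulled out of eeh_pe_cleanup() below as a standalone
sketch (the helper name is made up for illustration; the real code does
this inline):

	/* Only free a PE once recovery is completely done with it. */
	static bool eeh_pe_is_stale(struct eeh_pe *pe)
	{
		if (pe->state & EEH_PE_KEEP)		/* reset in progress */
			return false;
		if (!(pe->state & EEH_PE_INVALID))	/* still a live PE */
			return false;
		/* invalid, no devices, no children: safe to prune */
		return list_empty(&pe->edevs) && list_empty(&pe->child_list);
	}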

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903101605.2890-2-oohall@gmail.com
---
 arch/powerpc/kernel/eeh_driver.c | 36 ++++++++++++++++++++++++++++++--
 arch/powerpc/kernel/eeh_event.c  |  8 +++++++
 arch/powerpc/kernel/eeh_pe.c     | 23 +++++++++++++++++++-
 3 files changed, 64 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index a31cd32c4ce9..75266156943f 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -734,6 +734,33 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
  */
 #define MAX_WAIT_FOR_RECOVERY 300
 
+
+/* Walks the PE tree after processing an event to remove any stale PEs.
+ *
+ * NB: This needs to be recursive to ensure the leaf PEs get removed
+ * before their parents do. Although this is possible to do iteratively,
+ * recursion is easier to read and we need to guarantee that the leaf
+ * nodes will be handled first.
+ */
+static void eeh_pe_cleanup(struct eeh_pe *pe)
+{
+	struct eeh_pe *child_pe, *tmp;
+
+	list_for_each_entry_safe(child_pe, tmp, &pe->child_list, child)
+		eeh_pe_cleanup(child_pe);
+
+	if (pe->state & EEH_PE_KEEP)
+		return;
+
+	if (!(pe->state & EEH_PE_INVALID))
+		return;
+
+	if (list_empty(&pe->edevs) && list_empty(&pe->child_list)) {
+		list_del(&pe->child);
+		kfree(pe);
+	}
+}
+
 /**
  * eeh_handle_normal_event - Handle EEH events on a specific PE
  * @pe: EEH PE - which should not be used after we return, as it may
@@ -772,8 +799,6 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 		return;
 	}
 
-	eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
-
 	eeh_pe_update_time_stamp(pe);
 	pe->freeze_count++;
 	if (pe->freeze_count > eeh_max_freezes) {
@@ -963,6 +988,12 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 			return;
 		}
 	}
+
+	/*
+	 * Clean up any PEs without devices. While marked as EEH_PE_RECOVERING
+	 * we don't want to modify the PE tree structure so we do it here.
+	 */
+	eeh_pe_cleanup(pe);
 	eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
 }
 
@@ -1035,6 +1066,7 @@ void eeh_handle_special_event(void)
 		 */
 		if (rc == EEH_NEXT_ERR_FROZEN_PE ||
 		    rc == EEH_NEXT_ERR_FENCED_PHB) {
+			eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
 			eeh_handle_normal_event(pe);
 		} else {
 			pci_lock_rescan_remove();
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
index 64cfbe41174b..e36653e5f76b 100644
--- a/arch/powerpc/kernel/eeh_event.c
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -121,6 +121,14 @@ int __eeh_send_failure_event(struct eeh_pe *pe)
 	}
 	event->pe = pe;
 
+	/*
+	 * Mark the PE as recovering before inserting it in the queue.
+	 * This prevents the PE from being free()ed by a hotplug driver
+	 * while the PE is sitting in the event queue.
+	 */
+	if (pe)
+		eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
+
 	/* We may or may not be called in an interrupt context */
 	spin_lock_irqsave(&eeh_eventlist_lock, flags);
 	list_add(&event->list, &eeh_eventlist);
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 1a6254bcf056..177852e39a25 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -470,6 +470,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
 {
 	struct eeh_pe *pe, *parent, *child;
+	bool keep, recover;
 	int cnt;
 
 	pe = eeh_dev_to_pe(edev);
@@ -490,10 +491,21 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
 	 */
 	while (1) {
 		parent = pe->parent;
+
+		/* PHB PEs should never be removed */
 		if (pe->type & EEH_PE_PHB)
 			break;
 
-		if (!(pe->state & EEH_PE_KEEP)) {
+		/*
+		 * XXX: KEEP is set while resetting a PE. I don't think it's
+		 * ever set without RECOVERING also being set. I could
+		 * be wrong though so catch that with a WARN.
+		 */
+		keep = !!(pe->state & EEH_PE_KEEP);
+		recover = !!(pe->state & EEH_PE_RECOVERING);
+		WARN_ON(keep && !recover);
+
+		if (!keep && !recover) {
 			if (list_empty(&pe->edevs) &&
 			    list_empty(&pe->child_list)) {
 				list_del(&pe->child);
@@ -502,6 +514,15 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
 				break;
 			}
 		} else {
+			/*
+			 * Mark the PE as invalid. At the end of the recovery
+			 * process any invalid PEs will be garbage collected.
+			 *
+			 * We need to delay the free()ing of them since we can
+			 * remove edev's while traversing the PE tree which
+			 * might trigger the removal of a PE and we can't
+			 * deal with that (yet).
+			 */
 			if (list_empty(&pe->edevs)) {
 				cnt = 0;
 				list_for_each_entry(child, &pe->child_list, child) {

From 5ef753ae435a5cea8af5c84a65fc5dd30b773040 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Tue, 3 Sep 2019 20:15:53 +1000
Subject: [PATCH 183/264] powerpc/eeh: Fix race when freeing PDNs

When hot-adding devices we rely on the hotplug driver to create pci_dn's
for the devices under the hotplug slot. Conversely, when hot-removing, the
driver will remove the pci_dn's that it created. This is a problem because
the pci_dev is still live until its refcount drops to zero, which can take
a while if the driver is slow to tear down its internal state. Ideally, the
driver would not attempt any config accesses to the device once it has been
marked as removed, but sometimes that happens. As a result, we might
attempt to access the pci_dn for a device that has already been torn down,
and the kernel may crash.

To fix this, don't free the pci_dn unless the corresponding pci_dev has
been released.  If the pci_dev is still live, then we mark the pci_dn with
a flag that indicates the pci_dev's release function should free it.
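
The ownership handoff in miniature (condensed from the
pci_remove_device_node_info() hunk below):

	/* device-node removal path: defer the free if a pci_dev remains */
	if (pdev) {
		pdn->flags |= PCI_DN_FLAG_DEAD;	/* release function frees it */
	} else {
		dn->data = NULL;
		kfree(pdn);			/* no pci_dev left, free now */
	}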

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903101605.2890-3-oohall@gmail.com
---
 arch/powerpc/include/asm/pci-bridge.h |  1 +
 arch/powerpc/kernel/pci-hotplug.c     |  7 +++++++
 arch/powerpc/kernel/pci_dn.c          | 21 +++++++++++++++++++--
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 8dad1fdf4bd2..ea6ec65970ef 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -183,6 +183,7 @@ struct iommu_table;
 struct pci_dn {
 	int     flags;
 #define PCI_DN_FLAG_IOV_VF	0x01
+#define PCI_DN_FLAG_DEAD	0x02    /* Device has been hot-removed */
 
 	int	busno;			/* pci bus number */
 	int	devfn;			/* pci device and function number */
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
index 0b0cf8168b47..fc62c4bc47b1 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -55,11 +55,18 @@ EXPORT_SYMBOL_GPL(pci_find_bus_by_node);
 void pcibios_release_device(struct pci_dev *dev)
 {
 	struct pci_controller *phb = pci_bus_to_host(dev->bus);
+	struct pci_dn *pdn = pci_get_pdn(dev);
 
 	eeh_remove_device(dev);
 
 	if (phb->controller_ops.release_device)
 		phb->controller_ops.release_device(dev);
+
+	/* free()ing the pci_dn has been deferred to us, do it now */
+	if (pdn && (pdn->flags & PCI_DN_FLAG_DEAD)) {
+		pci_dbg(dev, "freeing dead pdn\n");
+		kfree(pdn);
+	}
 }
 
 /**
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index c4c8c237a106..9524009ca1ae 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -323,6 +323,7 @@ void pci_remove_device_node_info(struct device_node *dn)
 {
 	struct pci_dn *pdn = dn ? PCI_DN(dn) : NULL;
 	struct device_node *parent;
+	struct pci_dev *pdev;
 #ifdef CONFIG_EEH
 	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
 
@@ -336,12 +337,28 @@ void pci_remove_device_node_info(struct device_node *dn)
 	WARN_ON(!list_empty(&pdn->child_list));
 	list_del(&pdn->list);
 
+	/* Drop the parent pci_dn's ref to our backing dt node */
 	parent = of_get_parent(dn);
 	if (parent)
 		of_node_put(parent);
 
-	dn->data = NULL;
-	kfree(pdn);
+	/*
+	 * At this point we *might* still have a pci_dev that was
+	 * instantiated from this pci_dn. So defer free()ing it until
+	 * the pci_dev's release function is called.
+	 */
+	pdev = pci_get_domain_bus_and_slot(pdn->phb->global_number,
+			pdn->busno, pdn->devfn);
+	if (pdev) {
+		/* NB: pdev has a ref to dn */
+		pci_dbg(pdev, "marked pdn (from %pOF) as dead\n", dn);
+		pdn->flags |= PCI_DN_FLAG_DEAD;
+	} else {
+		dn->data = NULL;
+		kfree(pdn);
+	}
+
+	pci_dev_put(pdev);
 }
 EXPORT_SYMBOL_GPL(pci_remove_device_node_info);
 

From 38ddc011478e573c9ab4e3e9bc54cc5bfc542351 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Tue, 3 Sep 2019 20:15:54 +1000
Subject: [PATCH 184/264] powerpc/eeh: Make permanently failed devices
 non-actionable

If a device is torn down by a hotplug slot driver it's marked as removed
and as permanently failed. There's no point in trying to recover a
permanently failed device, so it should be considered un-actionable.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903101605.2890-4-oohall@gmail.com
---
 arch/powerpc/kernel/eeh_driver.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 75266156943f..18a69fac4d80 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -96,8 +96,16 @@ static bool eeh_dev_removed(struct eeh_dev *edev)
 
 static bool eeh_edev_actionable(struct eeh_dev *edev)
 {
-	return (edev->pdev && !eeh_dev_removed(edev) &&
-		!eeh_pe_passed(edev->pe));
+	if (!edev->pdev)
+		return false;
+	if (edev->pdev->error_state == pci_channel_io_perm_failure)
+		return false;
+	if (eeh_dev_removed(edev))
+		return false;
+	if (eeh_pe_passed(edev->pe))
+		return false;
+
+	return true;
 }
 
 /**

From b104af5a7687060792ca398bb86b033057afce75 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Tue, 3 Sep 2019 20:15:55 +1000
Subject: [PATCH 185/264] powerpc/eeh: Check slot presence state in
 eeh_handle_normal_event()

When a device is surprise removed while undergoing IO we will probably
get an EEH PE freeze due to MMIO timeouts and other errors. When a freeze
is detected we send a recovery event to the EEH worker thread which will
notify drivers, and perform recovery as needed.

In the event of a hot-remove we don't want recovery to occur since there
isn't a device to recover. The recovery process is fairly long due to
the number of wait states (required by PCIe) which causes problems when
devices are removed and replaced (e.g. hot swapping of U.2 NVMe drives).

To determine if we need to skip the recovery process we can use the
get_adapter_status() operation of the hotplug_slot to check whether the
slot contains a device, and if the slot is empty we can skip
recovery entirely.

One thing to note is that the slot being EEH frozen does not prevent the
hotplug driver from working. We don't have the EEH recovery thread
remove any of the devices since it's assumed that the hotplug driver
will handle tearing down the slot state.
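
The presence query itself is a couple of hotplug-core calls (a sketch;
note that the patch deliberately treats "can't tell" as "present"):

	u8 state;
	const struct hotplug_slot_ops *ops = slot->hotplug->ops;

	/* get_adapter_status() reports 1 = present, 0 = slot empty */
	if (ops && ops->get_adapter_status &&
	    !ops->get_adapter_status(slot->hotplug, &state))
		present = !!state;
	else
		present = true;	/* can't determine, assume a device is there */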

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903101605.2890-5-oohall@gmail.com
---
 arch/powerpc/kernel/eeh_driver.c | 60 ++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 18a69fac4d80..52ce7584af43 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -27,6 +27,7 @@
 #include <linux/irq.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/pci_hotplug.h>
 #include <asm/eeh.h>
 #include <asm/eeh_event.h>
 #include <asm/ppc-pci.h>
@@ -769,6 +770,46 @@ static void eeh_pe_cleanup(struct eeh_pe *pe)
 	}
 }
 
+/**
+ * eeh_check_slot_presence - Check if a device is still present in a slot
+ * @pdev: pci_dev to check
+ *
+ * This function may return a false positive if we can't determine the slot's
+ * presence state. This might happen for PCIe slots if the PE containing
+ * the upstream bridge is also frozen, or the bridge is part of the same PE
+ * as the device.
+ *
+ * This shouldn't happen often, but you might see it if you hotplug a PCIe
+ * switch.
+ */
+static bool eeh_slot_presence_check(struct pci_dev *pdev)
+{
+	const struct hotplug_slot_ops *ops;
+	struct pci_slot *slot;
+	u8 state;
+	int rc;
+
+	if (!pdev)
+		return false;
+
+	if (pdev->error_state == pci_channel_io_perm_failure)
+		return false;
+
+	slot = pdev->slot;
+	if (!slot || !slot->hotplug)
+		return true;
+
+	ops = slot->hotplug->ops;
+	if (!ops || !ops->get_adapter_status)
+		return true;
+
+	rc = ops->get_adapter_status(slot->hotplug, &state);
+	if (rc)
+		return true;
+
+	return !!state;
+}
+
 /**
  * eeh_handle_normal_event - Handle EEH events on a specific PE
  * @pe: EEH PE - which should not be used after we return, as it may
@@ -799,6 +840,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
 	struct eeh_rmv_data rmv_data =
 		{LIST_HEAD_INIT(rmv_data.removed_vf_list), 0};
+	int devices = 0;
 
 	bus = eeh_pe_bus_get(pe);
 	if (!bus) {
@@ -807,6 +849,23 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 		return;
 	}
 
+	/*
+	 * When devices are hot-removed we might get an EEH due to
+	 * a driver attempting to touch the MMIO space of a removed
+	 * device. In this case we don't have a device to recover
+	 * so suppress the event if we can't find any present devices.
+	 *
+	 * The hotplug driver should take care of tearing down the
+	 * device itself.
+	 */
+	eeh_for_each_pe(pe, tmp_pe)
+		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
+			if (eeh_slot_presence_check(edev->pdev))
+				devices++;
+
+	if (!devices)
+		goto out; /* nothing to recover */
+
 	eeh_pe_update_time_stamp(pe);
 	pe->freeze_count++;
 	if (pe->freeze_count > eeh_max_freezes) {
@@ -997,6 +1056,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 		}
 	}
 
+out:
 	/*
 	 * Clean up any PEs without devices. While marked as EEH_PE_RECOVERYING
 	 * we don't want to modify the PE tree structure so we do it here.

From 25baf3d81614b0b8ca8958f4d6f111ccaaaad578 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Tue, 3 Sep 2019 20:15:56 +1000
Subject: [PATCH 186/264] powerpc/eeh: Defer printing stack trace

Currently we print a stack trace in the event handler to help with
debugging EEH issues. In the case of surprise hot-unplug this is unneeded,
so we want to prevent printing the stack trace unless we know it's due to
an actual device error. To accomplish this, we can save a stack trace at
the point of detection and only print it once the EEH recovery handler has
determined the freeze was due to an actual error.

Since the whole point of this is to prevent spurious EEH output we also
move a few prints out of the detection thread, or mark them as pr_debug
so anyone interested can get output from eeh_dev_check_failure()
if they want.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903101605.2890-6-oohall@gmail.com
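
The save-now/print-later pattern boils down to the sketch below
(stack_trace_save() and the %pS printk format are standard kernel
facilities; pe->stack_trace and pe->trace_entries are the fields added by
this patch):

	/* at detection time, while the offending call chain is still live */
	pe->trace_entries = stack_trace_save(pe->stack_trace,
					     ARRAY_SIZE(pe->stack_trace), 0);

	/* later, in the recovery thread, once the freeze is known to be real */
	for (i = 0; i < pe->trace_entries; i++)
		pr_err("EEH: [%pK] %pS\n", (void *)pe->stack_trace[i],
		       (void *)pe->stack_trace[i]);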
---
 arch/powerpc/include/asm/eeh.h   | 11 +++++++++
 arch/powerpc/kernel/eeh.c        | 15 ++++---------
 arch/powerpc/kernel/eeh_driver.c | 38 +++++++++++++++++++++++++++++++-
 arch/powerpc/kernel/eeh_event.c  | 26 ++++++++++------------
 4 files changed, 64 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index c13119a5e69b..9d0e1694a94d 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -88,6 +88,17 @@ struct eeh_pe {
 	struct list_head child_list;	/* List of PEs below this PE	*/
 	struct list_head child;		/* Memb. child_list/eeh_phb_pe	*/
 	struct list_head edevs;		/* List of eeh_dev in this PE	*/
+
+	/*
+	 * Saved stack trace. When we find a PE freeze in eeh_dev_check_failure
+	 * the stack trace is saved here so we can print it in the recovery
+	 * thread if it turns out to be due to a real problem rather than
+	 * a hot-remove.
+	 *
+	 * A max of 64 entries might be overkill, but it also might not be.
+	 */
+	unsigned long stack_trace[64];
+	int trace_entries;
 };
 
 #define eeh_pe_for_each_dev(pe, edev, tmp) \
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 7b2755f5c6fd..398def61f8a6 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -420,11 +420,9 @@ static int eeh_phb_check_failure(struct eeh_pe *pe)
 	eeh_pe_mark_isolated(phb_pe);
 	eeh_serialize_unlock(flags);
 
-	pr_err("EEH: PHB#%x failure detected, location: %s\n",
+	pr_debug("EEH: PHB#%x failure detected, location: %s\n",
 		phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe));
-	dump_stack();
 	eeh_send_failure_event(phb_pe);
-
 	return 1;
 out:
 	eeh_serialize_unlock(flags);
@@ -451,7 +449,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 	unsigned long flags;
 	struct device_node *dn;
 	struct pci_dev *dev;
-	struct eeh_pe *pe, *parent_pe, *phb_pe;
+	struct eeh_pe *pe, *parent_pe;
 	int rc = 0;
 	const char *location = NULL;
 
@@ -581,13 +579,8 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 	 * a stack trace will help the device-driver authors figure
 	 * out what happened.  So print that out.
 	 */
-	phb_pe = eeh_phb_pe_get(pe->phb);
-	pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
-	       pe->phb->global_number, pe->addr);
-	pr_err("EEH: PE location: %s, PHB location: %s\n",
-	       eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
-	dump_stack();
-
+	pr_debug("EEH: %s: Frozen PHB#%x-PE#%x detected\n",
+		__func__, pe->phb->global_number, pe->addr);
 	eeh_send_failure_event(pe);
 
 	return 1;
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 52ce7584af43..0d34cc12c529 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -863,8 +863,44 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 			if (eeh_slot_presence_check(edev->pdev))
 				devices++;
 
-	if (!devices)
+	if (!devices) {
+		pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n",
+			pe->phb->global_number, pe->addr);
 		goto out; /* nothing to recover */
+	}
+
+	/* Log the event */
+	if (pe->type & EEH_PE_PHB) {
+		pr_err("EEH: PHB#%x failure detected, location: %s\n",
+			pe->phb->global_number, eeh_pe_loc_get(pe));
+	} else {
+		struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb);
+
+		pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
+		       pe->phb->global_number, pe->addr);
+		pr_err("EEH: PE location: %s, PHB location: %s\n",
+		       eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
+	}
+
+	/*
+	 * Print the saved stack trace now that we've verified there's
+	 * something to recover.
+	 */
+	if (pe->trace_entries) {
+		void **ptrs = (void **) pe->stack_trace;
+		int i;
+
+		pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
+		       pe->phb->global_number, pe->addr);
+
+		/* FIXME: Use the same format as dump_stack() */
+		pr_err("EEH: Call Trace:\n");
+		for (i = 0; i < pe->trace_entries; i++)
+			pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]);
+
+		pe->trace_entries = 0;
+	}
+
 
 	eeh_pe_update_time_stamp(pe);
 	pe->freeze_count++;
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
index e36653e5f76b..1d55486adb0f 100644
--- a/arch/powerpc/kernel/eeh_event.c
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -40,7 +40,6 @@ static int eeh_event_handler(void * dummy)
 {
 	unsigned long flags;
 	struct eeh_event *event;
-	struct eeh_pe *pe;
 
 	while (!kthread_should_stop()) {
 		if (wait_for_completion_interruptible(&eeh_eventlist_event))
@@ -59,19 +58,10 @@ static int eeh_event_handler(void * dummy)
 			continue;
 
 		/* We might have event without binding PE */
-		pe = event->pe;
-		if (pe) {
-			if (pe->type & EEH_PE_PHB)
-				pr_info("EEH: Detected error on PHB#%x\n",
-					 pe->phb->global_number);
-			else
-				pr_info("EEH: Detected PCI bus error on "
-					"PHB#%x-PE#%x\n",
-					pe->phb->global_number, pe->addr);
-			eeh_handle_normal_event(pe);
-		} else {
+		if (event->pe)
+			eeh_handle_normal_event(event->pe);
+		else
 			eeh_handle_special_event();
-		}
 
 		kfree(event);
 	}
@@ -126,8 +116,16 @@ int __eeh_send_failure_event(struct eeh_pe *pe)
 	 * This prevents the PE from being free()ed by a hotplug driver
 	 * while the PE is sitting in the event queue.
 	 */
-	if (pe)
+	if (pe) {
+		/*
+		 * Save the current stack trace so we can dump it from the
+		 * event handler thread.
+		 */
+		pe->trace_entries = stack_trace_save(pe->stack_trace,
+					 ARRAY_SIZE(pe->stack_trace), 0);
+
 		eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
+	}
 
 	/* We may or may not be called in an interrupt context */
 	spin_lock_irqsave(&eeh_eventlist_lock, flags);

From 50554533358571d0c23a88fa1fe3d0b44c6fc7b5 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Tue, 3 Sep 2019 20:15:57 +1000
Subject: [PATCH 187/264] powerpc/eeh: Remove stale CAPI comment

Support for switching CAPI cards into and out of CAPI mode was removed a
while ago. Drop the comment since it's no longer relevant.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903101605.2890-7-oohall@gmail.com
---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index e7b867912f24..94e26d56ecd2 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1125,13 +1125,6 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
 		return -EIO;
 	}
 
-	/*
-	 * If dealing with the root bus (or the bus underneath the
-	 * root port), we reset the bus underneath the root port.
-	 *
-	 * The cxl driver depends on this behaviour for bi-modal card
-	 * switching.
-	 */
 	if (pci_is_root_bus(bus) ||
 	    pci_is_root_bus(bus->parent))
 		return pnv_eeh_root_reset(hose, option);

From 98fd32cde59ed71c2c9a6da4101e85f50c9425f3 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Tue, 3 Sep 2019 20:15:58 +1000
Subject: [PATCH 188/264] powernv/eeh: Use generic code to handle hot resets

When we reset PCI devices managed by a hotplug driver the reset may
generate spurious hotplug events that cause the PCI device we're resetting
to be torn down accidentally. This is a problem for EEH (when the driver is
EEH aware) since we want to leave the OS PCI device state intact so that
the device can be re-set without losing any resources (network, disks,
etc) provided by the driver.

Generic PCI code provides the pci_bus_error_reset() function to handle
resetting a PCI device (or bus) by using the reset method provided by the
hotplug slot driver. We can use this function when the EEH core has
requested a hot reset (the common case) without tripping over the hotplug
driver.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903101605.2890-8-oohall@gmail.com
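
The resulting selection logic can be summarised as follows (a simplified
sketch of the real function, not the literal code; pci_bus_error_reset()
is the generic helper from drivers/pci):

	if (option != EEH_RESET_FUNDAMENTAL) {
		/* hot reset: let the PCI core drive the slot's reset method */
		if (!pci_bus_error_reset(bus->self))
			return 0;
	}

	/* fundamental reset requested, or no usable slot method */
	return pnv_eeh_bridge_reset(bus->self, option);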
---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 38 ++++++++++++++++++--
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 94e26d56ecd2..6bc24a47e9ef 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -34,6 +34,7 @@
 
 #include "powernv.h"
 #include "pci.h"
+#include "../../../../drivers/pci/pci.h"
 
 static int eeh_event_irq = -EINVAL;
 
@@ -849,7 +850,7 @@ static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int option)
 	int aer = edev ? edev->aer_cap : 0;
 	u32 ctrl;
 
-	pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n",
+	pr_debug("%s: Secondary Reset PCI bus %04x:%02x with option %d\n",
 		 __func__, pci_domain_nr(dev->bus),
 		 dev->bus->number, option);
 
@@ -907,6 +908,10 @@ static int pnv_eeh_bridge_reset(struct pci_dev *pdev, int option)
 	if (!dn || !of_get_property(dn, "ibm,reset-by-firmware", NULL))
 		return __pnv_eeh_bridge_reset(pdev, option);
 
+	pr_debug("%s: FW reset PCI bus %04x:%02x with option %d\n",
+		 __func__, pci_domain_nr(pdev->bus),
+		 pdev->bus->number, option);
+
 	switch (option) {
 	case EEH_RESET_FUNDAMENTAL:
 		scope = OPAL_RESET_PCI_FUNDAMENTAL;
@@ -1125,10 +1130,37 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
 		return -EIO;
 	}
 
-	if (pci_is_root_bus(bus) ||
-	    pci_is_root_bus(bus->parent))
+	if (pci_is_root_bus(bus))
 		return pnv_eeh_root_reset(hose, option);
 
+	/*
+	 * For hot resets try to use the generic PCI error recovery reset
+	 * functions. These correctly handle the case where the secondary
+	 * bus is behind a hotplug slot and it will use the slot provided
+	 * reset methods to prevent spurious hotplug events during the reset.
+	 *
+	 * Fundamental resets need to be handled internally to EEH since the
+	 * PCI core doesn't really have a concept of a fundamental reset,
+	 * mainly because there's no standard way to generate one. Only a
+	 * few devices require an FRESET so it should be fine.
+	 */
+	if (option != EEH_RESET_FUNDAMENTAL) {
+		/*
+		 * NB: Skiboot and pnv_eeh_bridge_reset() also no-op the
+		 *     de-assert step. It's like the OPAL reset API was
+		 *     poorly designed or something...
+		 */
+		if (option == EEH_RESET_DEACTIVATE)
+			return 0;
+
+		rc = pci_bus_error_reset(bus->self);
+		if (!rc)
+			return 0;
+	}
+
+	/* otherwise, use the generic bridge reset. this might call into FW */
+	if (pci_is_root_bus(bus->parent))
+		return pnv_eeh_root_reset(hose, option);
 	return pnv_eeh_bridge_reset(bus->self, option);
 }
 

From 7fd1fe4e4811f52b854647b8ff324745135224a5 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Tue, 3 Sep 2019 20:15:59 +1000
Subject: [PATCH 189/264] pci-hotplug/pnv_php: Add a reset_slot() callback

When performing EEH recovery of devices in a hotplug slot we need to use
the slot driver's ->reset_slot() callback to prevent spurious hotplug
events caused by DLActive and PresDet change interrupts. Add a
reset_slot() callback to pnv_php so we can handle recovery of devices
in pnv_php managed slots.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903101605.2890-9-oohall@gmail.com
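
For context, ->reset_slot() follows the pci_probe_reset_slot() convention:
when called with a non-zero probe argument it only reports whether a reset
is possible (zero meaning "supported"), otherwise it performs the reset. A
minimal skeleton of that contract (the name and empty body are
illustrative):

	static int example_reset_slot(struct hotplug_slot *slot, int probe)
	{
		if (probe)
			return 0;	/* 0 == this slot can be reset */

		/* mask the slot interrupt, pulse the secondary bus reset,
		 * then ack the PDC/DLLSC status bits the reset latched */
		return 0;
	}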
---
 drivers/pci/hotplug/pnv_php.c | 39 +++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c
index 6758fd7c382e..b0e243dabf77 100644
--- a/drivers/pci/hotplug/pnv_php.c
+++ b/drivers/pci/hotplug/pnv_php.c
@@ -511,6 +511,37 @@ scan:
 	return 0;
 }
 
+static int pnv_php_reset_slot(struct hotplug_slot *slot, int probe)
+{
+	struct pnv_php_slot *php_slot = to_pnv_php_slot(slot);
+	struct pci_dev *bridge = php_slot->pdev;
+	uint16_t sts;
+
+	/*
+	 * The CAPI folks want pnv_php to drive OpenCAPI slots
+	 * which don't have a bridge. Only claim to support
+	 * reset_slot() if we have a bridge device (for now...)
+	 */
+	if (probe)
+		return !bridge;
+
+	/* mask our interrupt while resetting the bridge */
+	if (php_slot->irq > 0)
+		disable_irq(php_slot->irq);
+
+	pci_bridge_secondary_bus_reset(bridge);
+
+	/* clear any state changes that happened due to the reset */
+	pcie_capability_read_word(php_slot->pdev, PCI_EXP_SLTSTA, &sts);
+	sts &= (PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_DLLSC);
+	pcie_capability_write_word(php_slot->pdev, PCI_EXP_SLTSTA, sts);
+
+	if (php_slot->irq > 0)
+		enable_irq(php_slot->irq);
+
+	return 0;
+}
+
 static int pnv_php_enable_slot(struct hotplug_slot *slot)
 {
 	struct pnv_php_slot *php_slot = to_pnv_php_slot(slot);
@@ -548,6 +579,7 @@ static const struct hotplug_slot_ops php_slot_ops = {
 	.set_attention_status	= pnv_php_set_attention_state,
 	.enable_slot		= pnv_php_enable_slot,
 	.disable_slot		= pnv_php_disable_slot,
+	.reset_slot		= pnv_php_reset_slot,
 };
 
 static void pnv_php_release(struct pnv_php_slot *php_slot)
@@ -721,6 +753,12 @@ static irqreturn_t pnv_php_interrupt(int irq, void *data)
 	pcie_capability_read_word(pdev, PCI_EXP_SLTSTA, &sts);
 	sts &= (PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_DLLSC);
 	pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, sts);
+
+	pci_dbg(pdev, "PCI slot [%s]: HP int! DLAct: %d, PresDet: %d\n",
+			php_slot->name,
+			!!(sts & PCI_EXP_SLTSTA_DLLSC),
+			!!(sts & PCI_EXP_SLTSTA_PDC));
+
 	if (sts & PCI_EXP_SLTSTA_DLLSC) {
 		pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lsts);
 		added = !!(lsts & PCI_EXP_LNKSTA_DLLLA);
@@ -735,6 +773,7 @@ static irqreturn_t pnv_php_interrupt(int irq, void *data)
 
 		added = !!(presence == OPAL_PCI_SLOT_PRESENT);
 	} else {
+		pci_dbg(pdev, "PCI slot [%s]: Spurious IRQ?\n", php_slot->name);
 		return IRQ_NONE;
 	}
 

From a839bd87a250068521393ca3804f4a5274e103ff Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Tue, 3 Sep 2019 20:16:00 +1000
Subject: [PATCH 190/264] pci-hotplug/pnv_php: Add support for IODA3 Power9
 PHBs

Currently we check that an IODA2 compatible PHB is upstream of this slot.
This is mainly to avoid pnv_php creating slots for the various "virtual
PHBs" that we create for NVLink. There's no real need for this restriction
so allow it on IODA3.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903101605.2890-10-oohall@gmail.com
---
 arch/powerpc/platforms/powernv/pci.c | 3 ++-
 drivers/pci/hotplug/pnv_php.c        | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 6104418c9ad5..2825d004dece 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -54,7 +54,8 @@ int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id)
 			break;
 		}
 
-		if (!of_device_is_compatible(parent, "ibm,ioda2-phb")) {
+		if (!of_device_is_compatible(parent, "ibm,ioda2-phb") &&
+		    !of_device_is_compatible(parent, "ibm,ioda3-phb")) {
 			of_node_put(parent);
 			continue;
 		}
diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c
index b0e243dabf77..6fdf8b74cb0a 100644
--- a/drivers/pci/hotplug/pnv_php.c
+++ b/drivers/pci/hotplug/pnv_php.c
@@ -994,6 +994,9 @@ static int __init pnv_php_init(void)
 	for_each_compatible_node(dn, NULL, "ibm,ioda2-phb")
 		pnv_php_register(dn);
 
+	for_each_compatible_node(dn, NULL, "ibm,ioda3-phb")
+		pnv_php_register(dn);
+
 	return 0;
 }
 
@@ -1003,6 +1006,9 @@ static void __exit pnv_php_exit(void)
 
 	for_each_compatible_node(dn, NULL, "ibm,ioda2-phb")
 		pnv_php_unregister(dn);
+
+	for_each_compatible_node(dn, NULL, "ibm,ioda3-phb")
+		pnv_php_unregister(dn);
 }
 
 module_init(pnv_php_init);

From 018c49e999ac4680e59de236a4a8cde209e8cc98 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Tue, 3 Sep 2019 20:16:01 +1000
Subject: [PATCH 191/264] pci-hotplug/pnv_php: Add attention indicator support

pnv_php is generally used with PCIe bridges which provide a native
interface for setting the attention and power indicator LEDs. Wire up
those interfaces even if firmware does not have support for them (yet...)

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903101605.2890-11-oohall@gmail.com
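
On a PCIe bridge the attention indicator is driven through the Slot
Control register, so the wiring amounts to a read-modify-write of the
Attention Indicator Control field; roughly (the macros and accessor are
the standard PCIe ones):

	u16 field = state ? PCI_EXP_SLTCTL_ATTN_IND_ON
			  : PCI_EXP_SLTCTL_ATTN_IND_OFF;

	pcie_capability_clear_and_set_word(bridge, PCI_EXP_SLTCTL,
					   PCI_EXP_SLTCTL_AIC, field);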
---
 drivers/pci/hotplug/pnv_php.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c
index 6fdf8b74cb0a..d7b2b47bc33e 100644
--- a/drivers/pci/hotplug/pnv_php.c
+++ b/drivers/pci/hotplug/pnv_php.c
@@ -419,9 +419,21 @@ static int pnv_php_get_attention_state(struct hotplug_slot *slot, u8 *state)
 static int pnv_php_set_attention_state(struct hotplug_slot *slot, u8 state)
 {
 	struct pnv_php_slot *php_slot = to_pnv_php_slot(slot);
+	struct pci_dev *bridge = php_slot->pdev;
+	u16 new, mask;
 
-	/* FIXME: Make it real once firmware supports it */
 	php_slot->attention_state = state;
+	if (!bridge)
+		return 0;
+
+	mask = PCI_EXP_SLTCTL_AIC;
+
+	if (state)
+		new = PCI_EXP_SLTCTL_ATTN_IND_ON;
+	else
+		new = PCI_EXP_SLTCTL_ATTN_IND_OFF;
+
+	pcie_capability_clear_and_set_word(bridge, PCI_EXP_SLTCTL, mask, new);
 
 	return 0;
 }

From aeff27c121ba7397c21a47c749e2b5be07f48c17 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Tue, 3 Sep 2019 20:16:02 +1000
Subject: [PATCH 192/264] powerpc/eeh: Set attention indicator while recovering

I am the RAS team. Hear me roar.

Roar.

On a more serious note, being able to locate failed devices can be helpful.
Set the attention indicator if the slot supports it once we've determined
the device is present and only clear it if the device is fully recovered.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903101605.2890-12-oohall@gmail.com
---
 arch/powerpc/kernel/eeh_driver.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 0d34cc12c529..80bd157fcb45 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -803,6 +803,10 @@ static bool eeh_slot_presence_check(struct pci_dev *pdev)
 	if (!ops || !ops->get_adapter_status)
 		return true;
 
+	/* set the attention indicator while we've got the slot ops */
+	if (ops->set_attention_status)
+		ops->set_attention_status(slot->hotplug, 1);
+
 	rc = ops->get_adapter_status(slot->hotplug, &state);
 	if (rc)
 		return true;
@@ -810,6 +814,28 @@ static bool eeh_slot_presence_check(struct pci_dev *pdev)
 	return !!state;
 }
 
+static void eeh_clear_slot_attention(struct pci_dev *pdev)
+{
+	const struct hotplug_slot_ops *ops;
+	struct pci_slot *slot;
+
+	if (!pdev)
+		return;
+
+	if (pdev->error_state == pci_channel_io_perm_failure)
+		return;
+
+	slot = pdev->slot;
+	if (!slot || !slot->hotplug)
+		return;
+
+	ops = slot->hotplug->ops;
+	if (!ops || !ops->set_attention_status)
+		return;
+
+	ops->set_attention_status(slot->hotplug, 0);
+}
+
 /**
  * eeh_handle_normal_event - Handle EEH events on a specific PE
  * @pe: EEH PE - which should not be used after we return, as it may
@@ -1098,6 +1124,12 @@ out:
 	 * we don't want to modify the PE tree structure so we do it here.
 	 */
 	eeh_pe_cleanup(pe);
+
+	/* clear the slot attention LED for all recovered devices */
+	eeh_for_each_pe(pe, tmp_pe)
+		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
+			eeh_clear_slot_attention(edev->pdev);
+
 	eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
 }
 

From 22cda7c1680c1ddfe941adae45e7e7ef52d0e411 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Tue, 3 Sep 2019 20:16:03 +1000
Subject: [PATCH 193/264] powerpc/eeh: Add debugfs interface to run an EEH
 check

Detecting a frozen EEH PE usually occurs when an MMIO load returns an all-0xFFs
response. When performing EEH testing using the EEH error injection feature
available on some platforms there is no simple way to kick off the kernel's
recovery process since any accesses from userspace (usually /dev/mem) will
bypass the MMIO helpers in the kernel which check if a 0xFF response is due
to an EEH freeze or not.

If a device contains a 0xFF byte in its config space it's possible to
trigger the recovery process via a config space read from userspace, but this
is not a reliable method. If a driver is bound to the device and in use it
will frequently trigger the MMIO check, but this is also inconsistent.

To solve these problems this patch adds a debugfs file called
"eeh_dev_check" which accepts a <domain>:<bus>:<dev>.<fn> string and runs
eeh_dev_check_failure() on it. This is the same check that's done when the
kernel gets a 0xFF result from a config or MMIO read, with the added
benefit that it can be reliably triggered from userspace.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903101605.2890-13-oohall@gmail.com
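
Usage from the shell is then a one-liner, e.g. against a hypothetical
adapter at 0002:01:00.0:

	# assumes debugfs is mounted at /sys/kernel/debug
	echo 0002:01:00.0 > /sys/kernel/debug/powerpc/eeh_dev_check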
---
 arch/powerpc/kernel/eeh.c | 61 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 398def61f8a6..2b3c03215a95 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1871,6 +1871,64 @@ static const struct file_operations eeh_force_recover_fops = {
 	.llseek	= no_llseek,
 	.write	= eeh_force_recover_write,
 };
+
+static ssize_t eeh_debugfs_dev_usage(struct file *filp,
+				char __user *user_buf,
+				size_t count, loff_t *ppos)
+{
+	static const char usage[] = "input format: <domain>:<bus>:<dev>.<fn>\n";
+
+	return simple_read_from_buffer(user_buf, count, ppos,
+				       usage, sizeof(usage) - 1);
+}
+
+static ssize_t eeh_dev_check_write(struct file *filp,
+				const char __user *user_buf,
+				size_t count, loff_t *ppos)
+{
+	uint32_t domain, bus, dev, fn;
+	struct pci_dev *pdev;
+	struct eeh_dev *edev;
+	char buf[20];
+	int ret;
+
+	ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count);
+	if (!ret)
+		return -EFAULT;
+
+	ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn);
+	if (ret != 4) {
+		pr_err("%s: expected 4 args, got %d\n", __func__, ret);
+		return -EINVAL;
+	}
+
+	pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn);
+	if (!pdev)
+		return -ENODEV;
+
+	edev = pci_dev_to_eeh_dev(pdev);
+	if (!edev) {
+		pci_err(pdev, "No eeh_dev for this device!\n");
+		pci_dev_put(pdev);
+		return -ENODEV;
+	}
+
+	ret = eeh_dev_check_failure(edev);
+	pci_info(pdev, "eeh_dev_check_failure(%04x:%02x:%02x.%01x) = %d\n",
+			domain, bus, dev, fn, ret);
+
+	pci_dev_put(pdev);
+
+	return count;
+}
+
+static const struct file_operations eeh_dev_check_fops = {
+	.open	= simple_open,
+	.llseek	= no_llseek,
+	.write	= eeh_dev_check_write,
+	.read   = eeh_debugfs_dev_usage,
+};
+
 #endif
 
 static int __init eeh_init_proc(void)
@@ -1886,6 +1944,9 @@ static int __init eeh_init_proc(void)
 		debugfs_create_bool("eeh_disable_recovery", 0600,
 				powerpc_debugfs_root,
 				&eeh_debugfs_no_recover);
+		debugfs_create_file_unsafe("eeh_dev_check", 0600,
+				powerpc_debugfs_root, NULL,
+				&eeh_dev_check_fops);
 		debugfs_create_file_unsafe("eeh_force_recover", 0600,
 				powerpc_debugfs_root, NULL,
 				&eeh_force_recover_fops);

From bd6461cc7b3c4fd12dcba4b0e95dfc612df872fd Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Tue, 3 Sep 2019 20:16:04 +1000
Subject: [PATCH 194/264] powerpc/eeh: Add a eeh_dev_break debugfs interface

Add an interface to debugfs for generating an EEH event on a given device.
This works by disabling memory accesses to and from the device by setting
the PCI_COMMAND register (or the VF Memory Space Enable on the parent PF).

This is a somewhat portable alternative to using the platform specific
error injection mechanisms since those tend to be either hard to use, or
straight up broken. For pseries the interface also requires the use of
/dev/mem which is probably going to go away in a post-LOCKDOWN world
(and it's a horrific hack to begin with) so moving to a kernel-provided
interface makes sense and provides a sane, cross-platform way for
userspace to write more generic testing scripts.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903101605.2890-14-oohall@gmail.com
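
Combined with eeh_dev_check this gives a self-contained way to exercise
recovery from a script, e.g. for a hypothetical device at 0002:01:00.0:

	# trigger a PE freeze, then force the kernel to notice it
	echo 0002:01:00.0 > /sys/kernel/debug/powerpc/eeh_dev_break
	echo 0002:01:00.0 > /sys/kernel/debug/powerpc/eeh_dev_check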
---
 arch/powerpc/kernel/eeh.c | 139 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 138 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 2b3c03215a95..0a91dee51245 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1892,7 +1892,8 @@ static ssize_t eeh_dev_check_write(struct file *filp,
 	char buf[20];
 	int ret;
 
-	ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count);
+	memset(buf, 0, sizeof(buf));
+	ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count);
 	if (!ret)
 		return -EFAULT;
 
@@ -1929,6 +1930,139 @@ static const struct file_operations eeh_dev_check_fops = {
 	.read   = eeh_debugfs_dev_usage,
 };
 
+static int eeh_debugfs_break_device(struct pci_dev *pdev)
+{
+	struct resource *bar = NULL;
+	void __iomem *mapped;
+	u16 old, bit;
+	int i, pos;
+
+	/* Do we have an MMIO BAR to disable? */
+	for (i = 0; i <= PCI_STD_RESOURCE_END; i++) {
+		struct resource *r = &pdev->resource[i];
+
+		if (!r->flags || !r->start)
+			continue;
+		if (r->flags & IORESOURCE_IO)
+			continue;
+		if (r->flags & IORESOURCE_UNSET)
+			continue;
+
+		bar = r;
+		break;
+	}
+
+	if (!bar) {
+		pci_err(pdev, "Unable to find Memory BAR to cause EEH with\n");
+		return -ENXIO;
+	}
+
+	pci_err(pdev, "Going to break: %pR\n", bar);
+
+	if (pdev->is_virtfn) {
+#ifndef CONFIG_PCI_IOV
+		return -ENXIO;
+#else
+		/*
+		 * VFs don't have a per-function COMMAND register, so the best
+		 * we can do is clear the Memory Space Enable bit in the PF's
+		 * SRIOV control reg.
+		 *
+		 * Unfortunately, this requires that we have a PF (i.e. doesn't
+		 * work for a passed-through VF) and it has the potential side
+		 * effect of also causing an EEH on every other VF under the
+		 * PF. Oh well.
+		 */
+		pdev = pdev->physfn;
+		if (!pdev)
+			return -ENXIO; /* passed through VFs have no PF */
+
+		pos  = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
+		pos += PCI_SRIOV_CTRL;
+		bit  = PCI_SRIOV_CTRL_MSE;
+#endif /* !CONFIG_PCI_IOV */
+	} else {
+		bit = PCI_COMMAND_MEMORY;
+		pos = PCI_COMMAND;
+	}
+
+	/*
+	 * Process here is:
+	 *
+	 * 1. Disable Memory space.
+	 *
+	 * 2. Perform an MMIO to the device. This should result in an error
+	 *    (CA  / UR) being raised by the device which results in an EEH
+	 *    PE freeze. Using the in_8() accessor skips the EEH detection
+	 *    hook, so the EEH detection machinery won't be triggered here.
+	 *    This is to match the usual behaviour of EEH, where the HW
+	 *    asynchronously freezes a PE and it's up to the kernel to
+	 *    notice and deal with it.
+	 *
+	 * 3. Turn Memory space back on. This is more important for VFs
+	 *    since recovery will probably fail if we don't. For normal
+	 *    devices the COMMAND register is reset as a part of
+	 *    re-initialising the device.
+	 *
+	 * Breaking stuff is the point so who cares if it's racy ;)
+	 */
+	pci_read_config_word(pdev, pos, &old);
+
+	mapped = ioremap(bar->start, PAGE_SIZE);
+	if (!mapped) {
+		pci_err(pdev, "Unable to map MMIO BAR %pR\n", bar);
+		return -ENXIO;
+	}
+
+	pci_write_config_word(pdev, pos, old & ~bit);
+	in_8(mapped);
+	pci_write_config_word(pdev, pos, old);
+
+	iounmap(mapped);
+
+	return 0;
+}
+
+static ssize_t eeh_dev_break_write(struct file *filp,
+				const char __user *user_buf,
+				size_t count, loff_t *ppos)
+{
+	uint32_t domain, bus, dev, fn;
+	struct pci_dev *pdev;
+	char buf[20];
+	int ret;
+
+	memset(buf, 0, sizeof(buf));
+	ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count);
+	if (!ret)
+		return -EFAULT;
+
+	ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn);
+	if (ret != 4) {
+		pr_err("%s: expected 4 args, got %d\n", __func__, ret);
+		return -EINVAL;
+	}
+
+	pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn);
+	if (!pdev)
+		return -ENODEV;
+
+	ret = eeh_debugfs_break_device(pdev);
+	pci_dev_put(pdev);
+
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+
+static const struct file_operations eeh_dev_break_fops = {
+	.open	= simple_open,
+	.llseek	= no_llseek,
+	.write	= eeh_dev_break_write,
+	.read   = eeh_debugfs_dev_usage,
+};
+
 #endif
 
 static int __init eeh_init_proc(void)
@@ -1947,6 +2081,9 @@ static int __init eeh_init_proc(void)
 		debugfs_create_file_unsafe("eeh_dev_check", 0600,
 				powerpc_debugfs_root, NULL,
 				&eeh_dev_check_fops);
+		debugfs_create_file_unsafe("eeh_dev_break", 0600,
+				powerpc_debugfs_root, NULL,
+				&eeh_dev_break_fops);
 		debugfs_create_file_unsafe("eeh_force_recover", 0600,
 				powerpc_debugfs_root, NULL,
 				&eeh_force_recover_fops);

From 85d86c8aa52eb5b3539eebe3adcc2f077118b412 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Tue, 3 Sep 2019 20:16:05 +1000
Subject: [PATCH 195/264] selftests/powerpc: Add basic EEH selftest

Use the new eeh_dev_check and eeh_dev_break interfaces to test EEH
recovery.  Historically this has been done manually using platform specific
EEH error injection facilities (e.g. via RTAS). However, documentation on
how to use these facilities is haphazard at best and non-existent at worst
so it's hard to develop a cross-platform test.

The new debugfs interfaces allow the kernel to handle the platform specific
details so we can write a more generic set of tests. This patch adds the
most basic of recovery tests where:

a) Errors are injected and recovered from sequentially.
b) Errors are not injected into PCI-PCI bridges, such as PCIe switches.
c) Errors are only injected into device function zero.
d) No errors are injected into Virtual Functions.

a), b) and c) are largely due to limitations of Linux's EEH support.  EEH
recovery is serialised in the EEH recovery thread which forces a).
Similarly, multi-function PCI devices are almost always grouped into the
same PE so injecting an error on one function exercises the same code
paths. c) is because we currently more or less ignore PCI bridges during
recovery and assume that the recovered topology will be the same as the
original.

d) is due to the limits of the eeh_dev_break interface. With the current
implementation we can't inject an error into a specific VF without
potentially causing additional errors on other VFs. Due to the serialised
recovery process we might end up timing out waiting for another function to
recover before the function of interest is recovered. The platform specific
error injection facilities are finer-grained and allow this capability, but
doing that requires working out how to use those facilities first.

Basically, it's better than nothing and it's a base to build on.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903101605.2890-15-oohall@gmail.com
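
One way to run it (from a kernel source tree, as root, with debugfs
mounted):

	make -C tools/testing/selftests/powerpc/eeh run_tests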
---
 tools/testing/selftests/powerpc/Makefile      |  1 +
 tools/testing/selftests/powerpc/eeh/Makefile  |  9 ++
 .../selftests/powerpc/eeh/eeh-basic.sh        | 82 +++++++++++++++++++
 .../selftests/powerpc/eeh/eeh-functions.sh    | 76 +++++++++++++++++
 4 files changed, 168 insertions(+)
 create mode 100644 tools/testing/selftests/powerpc/eeh/Makefile
 create mode 100755 tools/testing/selftests/powerpc/eeh/eeh-basic.sh
 create mode 100755 tools/testing/selftests/powerpc/eeh/eeh-functions.sh

diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile
index b3ad909aefbc..644770c3b754 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -26,6 +26,7 @@ SUB_DIRS = alignment		\
 	   switch_endian	\
 	   syscalls		\
 	   tm			\
+	   eeh			\
 	   vphn         \
 	   math		\
 	   ptrace	\
diff --git a/tools/testing/selftests/powerpc/eeh/Makefile b/tools/testing/selftests/powerpc/eeh/Makefile
new file mode 100644
index 000000000000..b397babd569b
--- /dev/null
+++ b/tools/testing/selftests/powerpc/eeh/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+noarg:
+	$(MAKE) -C ../
+
+TEST_PROGS := eeh-basic.sh
+TEST_FILES := eeh-functions.sh
+
+top_srcdir = ../../../../..
+include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/eeh/eeh-basic.sh b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh
new file mode 100755
index 000000000000..f988d2f42e8f
--- /dev/null
+++ b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh
@@ -0,0 +1,82 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+
+. ./eeh-functions.sh
+
+if ! eeh_supported ; then
+	echo "EEH not supported on this system, skipping"
+	exit 0;
+fi
+
+if [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] && \
+   [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; then
+	echo "debugfs EEH testing files are missing. Is debugfs mounted?"
+	exit 1;
+fi
+
+pre_lspci=`mktemp`
+lspci > $pre_lspci
+
+# Bump the max freeze count to something absurd so we don't
+# trip over it while breaking things.
+echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes
+
+# Record the devices that we break in here. Assuming everything
+# goes to plan we should get them back once the recovery process
+# is finished.
+devices=""
+
+# Build up a list of candidate devices.
+for dev in `ls -1 /sys/bus/pci/devices/ | grep '\.0$'` ; do
+	# skip bridges since we can't recover them (yet...)
+	if [ -e "/sys/bus/pci/devices/$dev/pci_bus" ] ; then
+		echo "$dev, Skipped: bridge"
+		continue;
+	fi
+
+	# Skip VFs for now since we don't have a reliable way
+	# to break them.
+	if [ -e "/sys/bus/pci/devices/$dev/physfn" ] ; then
+		echo "$dev, Skipped: virtfn"
+		continue;
+	fi
+
+	# Don't inject errors into an already-frozen PE. This happens with
+	# PEs that contain multiple PCI devices (e.g. multi-function cards)
+	# and injecting new errors during the recovery process will probably
+	# result in the recovery failing and the device being marked as
+	# failed.
+	if ! pe_ok $dev ; then
+		echo "$dev, Skipped: Bad initial PE state"
+		continue;
+	fi
+
+	echo "$dev, Added"
+
+	# Add to the list of devices to check
+	devices="$devices $dev"
+done
+
+dev_count="$(echo $devices | wc -w)"
+echo "Found ${dev_count} breakable devices..."
+
+failed=0
+for dev in $devices ; do
+	echo "Breaking $dev..."
+
+	if ! pe_ok $dev ; then
+		echo "Skipping $dev, Initial PE state is not ok"
+		failed="$((failed + 1))"
+		continue;
+	fi
+
+	if ! eeh_one_dev $dev ; then
+		failed="$((failed + 1))"
+	fi
+done
+
+echo "$failed devices failed to recover ($dev_count tested)"
+lspci | diff -u $pre_lspci -
+rm -f $pre_lspci
+
+exit $failed
diff --git a/tools/testing/selftests/powerpc/eeh/eeh-functions.sh b/tools/testing/selftests/powerpc/eeh/eeh-functions.sh
new file mode 100755
index 000000000000..26112ab5cdf4
--- /dev/null
+++ b/tools/testing/selftests/powerpc/eeh/eeh-functions.sh
@@ -0,0 +1,76 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+
+pe_ok() {
+	local dev="$1"
+	local path="/sys/bus/pci/devices/$dev/eeh_pe_state"
+
+	if ! [ -e "$path" ] ; then
+		return 1;
+	fi
+
+	local fw_state="$(cut -d' ' -f1 < $path)"
+	local sw_state="$(cut -d' ' -f2 < $path)"
+
+	# If EEH_PE_ISOLATED or EEH_PE_RECOVERING are set then the PE is in an
+	# error state or being recovered. Either way, not ok.
+	if [ "$((sw_state & 0x3))" -ne 0 ] ; then
+		return 1
+	fi
+
+	# A functioning PE should have the EEH_STATE_MMIO_ACTIVE and
+	# EEH_STATE_DMA_ACTIVE flags set. For some goddamn stupid reason
+	# the platform backends set these when the PE is in reset. The
+	# RECOVERING check above should stop any false positives though.
+	if [ "$((fw_state & 0x18))" -ne "$((0x18))" ] ; then
+		return 1
+	fi
+
+	return 0;
+}
+
+eeh_supported() {
+	test -e /proc/powerpc/eeh && \
+	grep -q 'EEH Subsystem is enabled' /proc/powerpc/eeh
+}
+
+eeh_one_dev() {
+	local dev="$1"
+
+	# Using this function from the command line is sometimes useful for
+	# testing so check that the argument is a well-formed sysfs device
+	# name.
+	if ! test -e /sys/bus/pci/devices/$dev/ ; then
+		echo "Error: '$dev' must be a sysfs device name (DDDD:BB:DD.F)"
+		return 1;
+	fi
+
+	# Break it
+	echo $dev >/sys/kernel/debug/powerpc/eeh_dev_break
+
+	# Force an EEH device check. If the kernel has already
+	# noticed the EEH (due to a driver poll or whatever), this
+	# is a no-op.
+	echo $dev >/sys/kernel/debug/powerpc/eeh_dev_check
+
+	# Enforce a 30s timeout for recovery. Even the IPR, which is infamously
+	# slow to reset, should recover within 30s.
+	max_wait=30
+
+	for i in `seq 0 ${max_wait}` ; do
+		if pe_ok $dev ; then
+			break;
+		fi
+		echo "$dev, waited $i/${max_wait}"
+		sleep 1
+	done
+
+	if ! pe_ok $dev ; then
+		echo "$dev, Failed to recover!"
+		return 1;
+	fi
+
+	echo "$dev, Recovered after $i seconds"
+	return 0;
+}
+

From ed6546bdc61b7c4bd926cebd82ba52d056fcefa1 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 3 Sep 2019 01:29:26 +1000
Subject: [PATCH 196/264] powerpc/64s: remove register_process_table callback

This callback is only required because the partition table init comes
before process table allocation on powernv (aka bare metal aka native).

Change the order to allocate the process table first, and remove the
callback.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190902152931.17840-2-npiggin@gmail.com
---
 arch/powerpc/include/asm/book3s/64/mmu.h |  4 ---
 arch/powerpc/mm/book3s64/hash_utils.c    |  6 ----
 arch/powerpc/mm/book3s64/pgtable.c       |  3 --
 arch/powerpc/mm/book3s64/radix_pgtable.c | 45 +++++++-----------------
 arch/powerpc/platforms/pseries/lpar.c    | 17 +++++++--
 5 files changed, 27 insertions(+), 48 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 23b83d3593e2..bb3deb76c951 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -206,7 +206,6 @@ extern int mmu_io_psize;
 void mmu_early_init_devtree(void);
 void hash__early_init_devtree(void);
 void radix__early_init_devtree(void);
-extern void radix_init_native(void);
 extern void hash__early_init_mmu(void);
 extern void radix__early_init_mmu(void);
 static inline void early_init_mmu(void)
@@ -238,9 +237,6 @@ static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 					   first_memblock_size);
 }
 
-extern int (*register_process_table)(unsigned long base, unsigned long page_size,
-				     unsigned long tbl_size);
-
 #ifdef CONFIG_PPC_PSERIES
 extern void radix_init_pseries(void);
 #else
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index fe99bba39b69..7aed27ea5361 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -859,12 +859,6 @@ static void __init htab_initialize(void)
 		/* Using a hypervisor which owns the htab */
 		htab_address = NULL;
 		_SDR1 = 0; 
-		/*
-		 * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall
-		 * to inform the hypervisor that we wish to use the HPT.
-		 */
-		if (cpu_has_feature(CPU_FTR_ARCH_300))
-			register_process_table(0, 0, 0);
 #ifdef CONFIG_FA_DUMP
 		/*
 		 * If firmware assisted dump is active firmware preserves
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 206b43ae4000..97f3be778c79 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -23,9 +23,6 @@ EXPORT_SYMBOL(__pmd_frag_nr);
 unsigned long __pmd_frag_size_shift;
 EXPORT_SYMBOL(__pmd_frag_size_shift);
 
-int (*register_process_table)(unsigned long base, unsigned long page_size,
-			      unsigned long tbl_size);
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
  * This is called when relaxing access to a hugepage. It's also called in the page
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 71b649473045..83fa7864e8f4 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -34,19 +34,6 @@
 unsigned int mmu_pid_bits;
 unsigned int mmu_base_pid;
 
-static int native_register_process_table(unsigned long base, unsigned long pg_sz,
-					 unsigned long table_size)
-{
-	unsigned long patb0, patb1;
-
-	patb0 = be64_to_cpu(partition_tb[0].patb0);
-	patb1 = base | table_size | PATB_GR;
-
-	mmu_partition_table_set_entry(0, patb0, patb1);
-
-	return 0;
-}
-
 static __ref void *early_alloc_pgtable(unsigned long size, int nid,
 			unsigned long region_start, unsigned long region_end)
 {
@@ -381,18 +368,8 @@ static void __init radix_init_pgtable(void)
 	 */
 	rts_field = radix__get_tree_size();
 	process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
-	/*
-	 * Fill in the partition table. We are suppose to use effective address
-	 * of process table here. But our linear mapping also enable us to use
-	 * physical address here.
-	 */
-	register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
+
 	pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
-	asm volatile("ptesync" : : : "memory");
-	asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
-		     "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
-	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-	trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
 
 	/*
 	 * The init_mm context is given the first available (non-zero) PID,
@@ -413,22 +390,24 @@ static void __init radix_init_pgtable(void)
 
 static void __init radix_init_partition_table(void)
 {
-	unsigned long rts_field, dw0;
+	unsigned long rts_field, dw0, dw1;
 
 	mmu_partition_table_init();
 	rts_field = radix__get_tree_size();
 	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
-	mmu_partition_table_set_entry(0, dw0, 0);
+	dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
+	mmu_partition_table_set_entry(0, dw0, dw1);
+
+	asm volatile("ptesync" : : : "memory");
+	asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
+		     "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
+	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+	trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
 
 	pr_info("Initializing Radix MMU\n");
 	pr_info("Partition table %p\n", partition_tb);
 }
 
-void __init radix_init_native(void)
-{
-	register_process_table = native_register_process_table;
-}
-
 static int __init get_idx_from_shift(unsigned int shift)
 {
 	int idx = -1;
@@ -622,8 +601,9 @@ void __init radix__early_init_mmu(void)
 	__pmd_frag_nr = RADIX_PMD_FRAG_NR;
 	__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
 
+	radix_init_pgtable();
+
 	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
-		radix_init_native();
 		lpcr = mfspr(SPRN_LPCR);
 		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
 		radix_init_partition_table();
@@ -634,7 +614,6 @@ void __init radix__early_init_mmu(void)
 
 	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
 
-	radix_init_pgtable();
 	/* Switch to the guard PID before turning on MMU */
 	radix__switch_mmu_context(NULL, &init_mm);
 	if (cpu_has_feature(CPU_FTR_HVMODE))
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 4f76e5f30c97..b3205a6c950c 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -1531,16 +1531,29 @@ void __init hpte_init_pseries(void)
 	mmu_hash_ops.flush_hash_range	 = pSeries_lpar_flush_hash_range;
 	mmu_hash_ops.hpte_clear_all      = pseries_hpte_clear_all;
 	mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
-	register_process_table		 = pseries_lpar_register_process_table;
 
 	if (firmware_has_feature(FW_FEATURE_HPT_RESIZE))
 		mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt;
+
+	/*
+	 * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall
+	 * to inform the hypervisor that we wish to use the HPT.
+	 */
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		pseries_lpar_register_process_table(0, 0, 0);
 }
 
 void radix_init_pseries(void)
 {
 	pr_info("Using radix MMU under hypervisor\n");
-	register_process_table = pseries_lpar_register_process_table;
+
+	pseries_lpar_register_process_table(__pa(process_tb),
+						0, PRTB_SIZE_SHIFT - 12);
+	asm volatile("ptesync" : : : "memory");
+	asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
+		     "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
+	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+	trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
 }
 
 #ifdef CONFIG_PPC_SMLPAR

From 99161de3a283af59f2813da6cbdccc1d2784c7de Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 3 Sep 2019 01:29:27 +1000
Subject: [PATCH 197/264] powerpc/64s/radix: tidy up TLB flushing code

There should be no functional changes.

- Use calls to existing radix_tlb.c functions in flush_partition.

- Rename radix__flush_tlb_lpid to radix__flush_all_lpid and similar,
  because they flush everything, matching flush_all_mm rather than
  flush_tlb_mm for the lpid.

- Remove some unused radix_tlb.c flush primitives.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190902152931.17840-3-npiggin@gmail.com
---
 .../include/asm/book3s/64/tlbflush-radix.h    |  12 +-
 arch/powerpc/kvm/book3s_hv_nested.c           |   2 +-
 arch/powerpc/mm/book3s64/pgtable.c            |  13 +-
 arch/powerpc/mm/book3s64/radix_tlb.c          | 117 ++++--------------
 4 files changed, 34 insertions(+), 110 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index 05147cecb8df..4ce795d30377 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -17,8 +17,8 @@ extern void radix__flush_tlb_lpid_page(unsigned int lpid,
 					unsigned long addr,
 					unsigned long page_size);
 extern void radix__flush_pwc_lpid(unsigned int lpid);
-extern void radix__flush_tlb_lpid(unsigned int lpid);
-extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid);
+extern void radix__flush_all_lpid(unsigned int lpid);
+extern void radix__flush_all_lpid_guest(unsigned int lpid);
 #else
 static inline void radix__tlbiel_all(unsigned int action) { WARN_ON(1); };
 static inline void radix__flush_tlb_lpid_page(unsigned int lpid,
@@ -31,11 +31,7 @@ static inline void radix__flush_pwc_lpid(unsigned int lpid)
 {
 	WARN_ON(1);
 }
-static inline void radix__flush_tlb_lpid(unsigned int lpid)
-{
-	WARN_ON(1);
-}
-static inline void radix__local_flush_tlb_lpid_guest(unsigned int lpid)
+static inline void radix__flush_all_lpid(unsigned int lpid)
 {
 	WARN_ON(1);
 }
@@ -73,6 +69,4 @@ extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr);
 extern void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr);
 extern void radix__flush_tlb_all(void);
 
-extern void radix__local_flush_tlb_lpid(unsigned int lpid);
-
 #endif
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index 735e0ac6f5b2..b3316da2f13e 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -398,7 +398,7 @@ static void kvmhv_flush_lpid(unsigned int lpid)
 	long rc;
 
 	if (!kvmhv_on_pseries()) {
-		radix__flush_tlb_lpid(lpid);
+		radix__flush_all_lpid(lpid);
 		return;
 	}
 
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 97f3be778c79..c2b87c5ba50b 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -210,20 +210,17 @@ void __init mmu_partition_table_init(void)
 
 static void flush_partition(unsigned int lpid, bool radix)
 {
-	asm volatile("ptesync" : : : "memory");
 	if (radix) {
-		asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
-			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
-		asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
-			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
-		trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1);
+		radix__flush_all_lpid(lpid);
+		radix__flush_all_lpid_guest(lpid);
 	} else {
+		asm volatile("ptesync" : : : "memory");
 		asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
 			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+		/* do we need fixup here ?*/
+		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
 		trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0);
 	}
-	/* do we need fixup here ?*/
-	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
 }
 
 void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 71f7fede2fa4..082f90d068ee 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -116,22 +116,6 @@ static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric)
 	trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
-static __always_inline void __tlbiel_lpid(unsigned long lpid, int set,
-				unsigned long ric)
-{
-	unsigned long rb,rs,prs,r;
-
-	rb = PPC_BIT(52); /* IS = 2 */
-	rb |= set << PPC_BITLSHIFT(51);
-	rs = 0;  /* LPID comes from LPIDR */
-	prs = 0; /* partition scoped */
-	r = 1;   /* radix format */
-
-	asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
-		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-	trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
-}
-
 static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
@@ -146,23 +130,20 @@ static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
 	trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
 }
 
-static __always_inline void __tlbiel_lpid_guest(unsigned long lpid, int set,
-						unsigned long ric)
+static __always_inline void __tlbie_lpid_guest(unsigned long lpid, unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
 
 	rb = PPC_BIT(52); /* IS = 2 */
-	rb |= set << PPC_BITLSHIFT(51);
-	rs = 0;  /* LPID comes from LPIDR */
+	rs = lpid;
 	prs = 1; /* process scoped */
 	r = 1;   /* radix format */
 
-	asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
 		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-	trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
+	trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
 }
 
-
 static __always_inline void __tlbiel_va(unsigned long va, unsigned long pid,
 					unsigned long ap, unsigned long ric)
 {
@@ -285,34 +266,6 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
-static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric)
-{
-	int set;
-
-	VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
-
-	asm volatile("ptesync": : :"memory");
-
-	/*
-	 * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
-	 * also flush the entire Page Walk Cache.
-	 */
-	__tlbiel_lpid(lpid, 0, ric);
-
-	/* For PWC, only one flush is needed */
-	if (ric == RIC_FLUSH_PWC) {
-		asm volatile("ptesync": : :"memory");
-		return;
-	}
-
-	/* For the remaining sets, just flush the TLB */
-	for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
-		__tlbiel_lpid(lpid, set, RIC_FLUSH_TLB);
-
-	asm volatile("ptesync": : :"memory");
-	asm volatile(PPC_RADIX_INVALIDATE_ERAT_GUEST "; isync" : : :"memory");
-}
-
 static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
 {
 	asm volatile("ptesync": : :"memory");
@@ -337,35 +290,28 @@ static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
-static __always_inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric)
+static __always_inline void _tlbie_lpid_guest(unsigned long lpid, unsigned long ric)
 {
-	int set;
-
-	VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
-
-	asm volatile("ptesync": : :"memory");
-
 	/*
-	 * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
-	 * also flush the entire Page Walk Cache.
+	 * Workaround the fact that the "ric" argument to __tlbie_lpid_guest
+	 * must be a compile-time constant to match the "i" constraint
+	 * in the asm statement.
 	 */
-	__tlbiel_lpid_guest(lpid, 0, ric);
-
-	/* For PWC, only one flush is needed */
-	if (ric == RIC_FLUSH_PWC) {
-		asm volatile("ptesync": : :"memory");
-		return;
+	switch (ric) {
+	case RIC_FLUSH_TLB:
+		__tlbie_lpid_guest(lpid, RIC_FLUSH_TLB);
+		break;
+	case RIC_FLUSH_PWC:
+		__tlbie_lpid_guest(lpid, RIC_FLUSH_PWC);
+		break;
+	case RIC_FLUSH_ALL:
+	default:
+		__tlbie_lpid_guest(lpid, RIC_FLUSH_ALL);
 	}
-
-	/* For the remaining sets, just flush the TLB */
-	for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
-		__tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB);
-
-	asm volatile("ptesync": : :"memory");
-	asm volatile(PPC_RADIX_INVALIDATE_ERAT_GUEST : : :"memory");
+	fixup_tlbie_lpid(lpid);
+	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
-
 static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
 				    unsigned long pid, unsigned long page_size,
 				    unsigned long psize)
@@ -835,32 +781,19 @@ EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
 /*
  * Flush partition scoped translations from LPID (=LPIDR)
  */
-void radix__flush_tlb_lpid(unsigned int lpid)
+void radix__flush_all_lpid(unsigned int lpid)
 {
 	_tlbie_lpid(lpid, RIC_FLUSH_ALL);
 }
-EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid);
+EXPORT_SYMBOL_GPL(radix__flush_all_lpid);
 
 /*
- * Flush partition scoped translations from LPID (=LPIDR)
+ * Flush process scoped translations from LPID (=LPIDR)
  */
-void radix__local_flush_tlb_lpid(unsigned int lpid)
+void radix__flush_all_lpid_guest(unsigned int lpid)
 {
-	_tlbiel_lpid(lpid, RIC_FLUSH_ALL);
+	_tlbie_lpid_guest(lpid, RIC_FLUSH_ALL);
 }
-EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid);
-
-/*
- * Flush process scoped translations from LPID (=LPIDR).
- * Important difference, the guest normally manages its own translations,
- * but some cases e.g., vCPU CPU migration require KVM to flush.
- */
-void radix__local_flush_tlb_lpid_guest(unsigned int lpid)
-{
-	_tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL);
-}
-EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest);
-
 
 static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
 				  unsigned long end, int psize);

From fd13daea5f72605a0a7386ebedbb5ff2b2a48da4 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 3 Sep 2019 01:29:28 +1000
Subject: [PATCH 198/264] powerpc/64s: make mmu_partition_table_set_entry TLB
 flush optional

No functional change.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190902152931.17840-4-npiggin@gmail.com
---
 arch/powerpc/include/asm/mmu.h           | 2 +-
 arch/powerpc/kvm/book3s_hv_nested.c      | 2 +-
 arch/powerpc/mm/book3s64/hash_utils.c    | 2 +-
 arch/powerpc/mm/book3s64/pgtable.c       | 4 ++--
 arch/powerpc/mm/book3s64/radix_pgtable.c | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index ba94ce8c22d7..0699cfeeb8c9 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -257,7 +257,7 @@ extern void radix__mmu_cleanup_all(void);
 /* Functions for creating and updating partition table on POWER9 */
 extern void mmu_partition_table_init(void);
 extern void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
-					  unsigned long dw1);
+					  unsigned long dw1, bool flush);
 #endif /* CONFIG_PPC64 */
 
 struct mm_struct;
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index b3316da2f13e..fff90f2c3de2 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -411,7 +411,7 @@ static void kvmhv_flush_lpid(unsigned int lpid)
 void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
 {
 	if (!kvmhv_on_pseries()) {
-		mmu_partition_table_set_entry(lpid, dw0, dw1);
+		mmu_partition_table_set_entry(lpid, dw0, dw1, true);
 		return;
 	}
 
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 7aed27ea5361..b73d08b54d12 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -825,7 +825,7 @@ static void __init hash_init_partition_table(phys_addr_t hash_table,
 	 * For now, UPRT is 0 and we have no segment table.
 	 */
 	htab_size =  __ilog2(htab_size) - 18;
-	mmu_partition_table_set_entry(0, hash_table | htab_size, 0);
+	mmu_partition_table_set_entry(0, hash_table | htab_size, 0, true);
 	pr_info("Partition table %p\n", partition_tb);
 }
 
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index c2b87c5ba50b..6fab9c0bbbaf 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -224,7 +224,7 @@ static void flush_partition(unsigned int lpid, bool radix)
 }
 
 void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
-				  unsigned long dw1)
+				  unsigned long dw1, bool flush)
 {
 	unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
 
@@ -251,7 +251,7 @@ void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
 		uv_register_pate(lpid, dw0, dw1);
 		pr_info("PATE registered by ultravisor: dw0 = 0x%lx, dw1 = 0x%lx\n",
 			dw0, dw1);
-	} else {
+	} else if (flush) {
 		flush_partition(lpid, (old & PATB_HR));
 	}
 }
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 83fa7864e8f4..078a7eeec1f5 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -396,7 +396,7 @@ static void __init radix_init_partition_table(void)
 	rts_field = radix__get_tree_size();
 	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
 	dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
-	mmu_partition_table_set_entry(0, dw0, dw1);
+	mmu_partition_table_set_entry(0, dw0, dw1, true);
 
 	asm volatile("ptesync" : : : "memory");
 	asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :

From 7e71c428a60e2029585be7d7cc22775f442e5b2c Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 3 Sep 2019 01:29:29 +1000
Subject: [PATCH 199/264] powerpc/64s/pseries: radix flush translations before
 MMU is enabled at boot

Radix guests are responsible for managing their own translation caches,
so make them match bare metal radix and hash, and make each CPU flush
all its translations right before enabling its MMU.

Radix guests may not flush partition scope translations, so in
tlbiel_all, make these flushes conditional on CPU_FTR_HVMODE. Process
scope translations are the only type visible to the guest.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190902152931.17840-5-npiggin@gmail.com
---
 arch/powerpc/mm/book3s64/radix_pgtable.c |  6 ++----
 arch/powerpc/mm/book3s64/radix_tlb.c     | 12 ++++++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 078a7eeec1f5..e1e711c4704a 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -616,8 +616,7 @@ void __init radix__early_init_mmu(void)
 
 	/* Switch to the guard PID before turning on MMU */
 	radix__switch_mmu_context(NULL, &init_mm);
-	if (cpu_has_feature(CPU_FTR_HVMODE))
-		tlbiel_all();
+	tlbiel_all();
 }
 
 void radix__early_init_mmu_secondary(void)
@@ -637,8 +636,7 @@ void radix__early_init_mmu_secondary(void)
 	}
 
 	radix__switch_mmu_context(NULL, &init_mm);
-	if (cpu_has_feature(CPU_FTR_HVMODE))
-		tlbiel_all();
+	tlbiel_all();
 }
 
 void radix__mmu_cleanup_all(void)
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 082f90d068ee..f9cf8ae59831 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -51,11 +51,15 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
 	 * and partition table entries. Then flush the remaining sets of the
 	 * TLB.
 	 */
-	tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
-	for (set = 1; set < num_sets; set++)
-		tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
 
-	/* Do the same for process scoped entries. */
+	if (early_cpu_has_feature(CPU_FTR_HVMODE)) {
+		/* MSR[HV] should flush partition scope translations first. */
+		tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
+		for (set = 1; set < num_sets; set++)
+			tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
+	}
+
+	/* Flush process scoped entries. */
 	tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
 	for (set = 1; set < num_sets; set++)
 		tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);

From 7d805accbec57a151bd0dd305a1109feebdfd4a4 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 3 Sep 2019 01:29:30 +1000
Subject: [PATCH 200/264] powerpc/64s: remove unnecessary translation cache
 flushes at boot

The various translation structure invalidations performed in early boot
when the MMU is off are not required, because everything is invalidated
immediately before a CPU first enables its MMU (see early_init_mmu
and early_init_mmu_secondary).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190902152931.17840-6-npiggin@gmail.com
---
 arch/powerpc/mm/book3s64/hash_utils.c    | 2 +-
 arch/powerpc/mm/book3s64/pgtable.c       | 5 +++++
 arch/powerpc/mm/book3s64/radix_pgtable.c | 8 +-------
 arch/powerpc/platforms/pseries/lpar.c    | 5 -----
 4 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index b73d08b54d12..7684a596158b 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -825,7 +825,7 @@ static void __init hash_init_partition_table(phys_addr_t hash_table,
 	 * For now, UPRT is 0 and we have no segment table.
 	 */
 	htab_size =  __ilog2(htab_size) - 18;
-	mmu_partition_table_set_entry(0, hash_table | htab_size, 0, true);
+	mmu_partition_table_set_entry(0, hash_table | htab_size, 0, false);
 	pr_info("Partition table %p\n", partition_tb);
 }
 
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 6fab9c0bbbaf..351eb78eed55 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -252,6 +252,11 @@ void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
 		pr_info("PATE registered by ultravisor: dw0 = 0x%lx, dw1 = 0x%lx\n",
 			dw0, dw1);
 	} else if (flush) {
+		/*
+		 * Boot does not need to flush, because MMU is off and each
+		 * CPU does a tlbiel_all() before switching its MMU on, which
+		 * flushes everything.
+		 */
 		flush_partition(lpid, (old & PATB_HR));
 	}
 }
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index e1e711c4704a..0d1107fb34c1 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -396,13 +396,7 @@ static void __init radix_init_partition_table(void)
 	rts_field = radix__get_tree_size();
 	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
 	dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
-	mmu_partition_table_set_entry(0, dw0, dw1, true);
-
-	asm volatile("ptesync" : : : "memory");
-	asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
-		     "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
-	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-	trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
+	mmu_partition_table_set_entry(0, dw0, dw1, false);
 
 	pr_info("Initializing Radix MMU\n");
 	pr_info("Partition table %p\n", partition_tb);
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index b3205a6c950c..36b846f6e74e 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -1549,11 +1549,6 @@ void radix_init_pseries(void)
 
 	pseries_lpar_register_process_table(__pa(process_tb),
 						0, PRTB_SIZE_SHIFT - 12);
-	asm volatile("ptesync" : : : "memory");
-	asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
-		     "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
-	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-	trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
 }
 
 #ifdef CONFIG_PPC_SMLPAR

From 2275d7b5754a573ffb2ca9e40bd0546eeb986696 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 3 Sep 2019 01:29:31 +1000
Subject: [PATCH 201/264] powerpc/64s/radix: introduce options to disable use
 of the tlbie instruction

Introduce two options to control the use of the tlbie instruction. The
first is a boot time option which completely disables the kernel using
the instruction; this is currently incompatible with the hash MMU, KVM,
and coherent accelerators.

The second is a debugfs option which can be switched at runtime and
avoids using tlbie for invalidating CPU TLBs for normal process and
kernel address mappings. Coherent accelerators are still managed with
tlbie, as are KVM partition scope translations.

Cross-CPU TLB flushing is implemented with IPIs and tlbiel. This is a
basic implementation which does not attempt to make any optimisation
beyond the tlbie implementation.

This is useful for performance testing among other things. For example
in certain situations on large systems, using IPIs may be faster than
tlbie as they can be directed rather than broadcast. Later we may also
take advantage of the IPIs to do more interesting things such as trim
the mm cpumask more aggressively.
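
For illustration, here is a sketch of how the two options might be
exercised (the debugfs path assumes the usual powerpc debugfs root at
/sys/kernel/debug/powerpc):

  # boot time: append to the kernel command line (radix MMU only)
  disable_tlbie

  # runtime: toggle use of tlbie for CPU TLB management via debugfs
  cat /sys/kernel/debug/powerpc/tlbie_enabled
  echo 0 > /sys/kernel/debug/powerpc/tlbie_enabled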

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190902152931.17840-7-npiggin@gmail.com
---
 .../admin-guide/kernel-parameters.txt         |   4 +
 arch/powerpc/include/asm/book3s/64/tlbflush.h |   9 +
 arch/powerpc/kvm/book3s_hv.c                  |   6 +
 arch/powerpc/mm/book3s64/pgtable.c            |  47 +++++
 arch/powerpc/mm/book3s64/radix_tlb.c          | 190 ++++++++++++++++--
 drivers/misc/cxl/main.c                       |   4 +
 drivers/misc/ocxl/main.c                      |   4 +
 7 files changed, 246 insertions(+), 18 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 4923d8f726e8..3cd757f9feaa 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -860,6 +860,10 @@
 	disable_radix	[PPC]
 			Disable RADIX MMU mode on POWER9
 
+	disable_tlbie	[PPC]
+			Disable TLBIE instruction. Currently does not work
+			with KVM, with HASH MMU, or with coherent accelerators.
+
 	disable_cpu_apicid= [X86,APIC,SMP]
 			Format: <int>
 			The number of initial APIC ID for the
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index ebf572ea621e..7aa8195b6cff 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -162,4 +162,13 @@ static inline void flush_tlb_pgtable(struct mmu_gather *tlb, unsigned long addre
 
 	radix__flush_tlb_pwc(tlb, address);
 }
+
+extern bool tlbie_capable;
+extern bool tlbie_enabled;
+
+static inline bool cputlb_use_tlbie(void)
+{
+	return tlbie_enabled;
+}
+
 #endif /*  _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index cde3f5a4b3e4..3cdaa2a09a19 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5462,6 +5462,12 @@ static int kvmppc_radix_possible(void)
 static int kvmppc_book3s_init_hv(void)
 {
 	int r;
+
+	if (!tlbie_capable) {
+		pr_err("KVM-HV: Host does not support TLBIE\n");
+		return -ENODEV;
+	}
+
 	/*
 	 * FIXME!! Do we need to check on all cpus ?
 	 */
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 351eb78eed55..75483b40fcb1 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -8,6 +8,7 @@
 #include <linux/memblock.h>
 #include <misc/cxl-base.h>
 
+#include <asm/debugfs.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
 #include <asm/trace.h>
@@ -469,3 +470,49 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
 
 	return true;
 }
+
+/*
+ * Does the CPU support tlbie?
+ */
+bool tlbie_capable __read_mostly = true;
+EXPORT_SYMBOL(tlbie_capable);
+
+/*
+ * Should tlbie be used for management of CPU TLBs, for kernel and process
+ * address spaces? tlbie may still be used for nMMU accelerators, and for KVM
+ * guest address spaces.
+ */
+bool tlbie_enabled __read_mostly = true;
+
+static int __init setup_disable_tlbie(char *str)
+{
+	if (!radix_enabled()) {
+		pr_err("disable_tlbie: Unable to disable TLBIE with Hash MMU.\n");
+		return 1;
+	}
+
+	tlbie_capable = false;
+	tlbie_enabled = false;
+
+        return 1;
+}
+__setup("disable_tlbie", setup_disable_tlbie);
+
+static int __init pgtable_debugfs_setup(void)
+{
+	if (!tlbie_capable)
+		return 0;
+
+	/*
+	 * There is no locking vs tlb flushing when changing this value.
+	 * The tlb flushers will see one value or another, and use either
+	 * tlbie or tlbiel with IPIs. In both cases the TLBs will be
+	 * invalidated as expected.
+	 */
+	debugfs_create_bool("tlbie_enabled", 0600,
+			powerpc_debugfs_root,
+			&tlbie_enabled);
+
+	return 0;
+}
+arch_initcall(pgtable_debugfs_setup);
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index f9cf8ae59831..631be42abd33 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -270,6 +270,39 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
+struct tlbiel_pid {
+	unsigned long pid;
+	unsigned long ric;
+};
+
+static void do_tlbiel_pid(void *info)
+{
+	struct tlbiel_pid *t = info;
+
+	if (t->ric == RIC_FLUSH_TLB)
+		_tlbiel_pid(t->pid, RIC_FLUSH_TLB);
+	else if (t->ric == RIC_FLUSH_PWC)
+		_tlbiel_pid(t->pid, RIC_FLUSH_PWC);
+	else
+		_tlbiel_pid(t->pid, RIC_FLUSH_ALL);
+}
+
+static inline void _tlbiel_pid_multicast(struct mm_struct *mm,
+				unsigned long pid, unsigned long ric)
+{
+	struct cpumask *cpus = mm_cpumask(mm);
+	struct tlbiel_pid t = { .pid = pid, .ric = ric };
+
+	on_each_cpu_mask(cpus, do_tlbiel_pid, &t, 1);
+	/*
+	 * Always want the CPU translations to be invalidated with tlbiel in
+	 * these paths, so while coprocessors must use tlbie, we can not
+	 * optimise away the tlbiel component.
+	 */
+	if (atomic_read(&mm->context.copros) > 0)
+		_tlbie_pid(pid, RIC_FLUSH_ALL);
+}
+
 static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
 {
 	asm volatile("ptesync": : :"memory");
@@ -370,6 +403,53 @@ static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
+struct tlbiel_va {
+	unsigned long pid;
+	unsigned long va;
+	unsigned long psize;
+	unsigned long ric;
+};
+
+static void do_tlbiel_va(void *info)
+{
+	struct tlbiel_va *t = info;
+
+	if (t->ric == RIC_FLUSH_TLB)
+		_tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_TLB);
+	else if (t->ric == RIC_FLUSH_PWC)
+		_tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_PWC);
+	else
+		_tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_ALL);
+}
+
+static inline void _tlbiel_va_multicast(struct mm_struct *mm,
+				unsigned long va, unsigned long pid,
+				unsigned long psize, unsigned long ric)
+{
+	struct cpumask *cpus = mm_cpumask(mm);
+	struct tlbiel_va t = { .va = va, .pid = pid, .psize = psize, .ric = ric };
+	on_each_cpu_mask(cpus, do_tlbiel_va, &t, 1);
+	if (atomic_read(&mm->context.copros) > 0)
+		_tlbie_va(va, pid, psize, RIC_FLUSH_TLB);
+}
+
+struct tlbiel_va_range {
+	unsigned long pid;
+	unsigned long start;
+	unsigned long end;
+	unsigned long page_size;
+	unsigned long psize;
+	bool also_pwc;
+};
+
+static void do_tlbiel_va_range(void *info)
+{
+	struct tlbiel_va_range *t = info;
+
+	_tlbiel_va_range(t->start, t->end, t->pid, t->page_size,
+				    t->psize, t->also_pwc);
+}
+
 static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
 			      unsigned long psize, unsigned long ric)
 {
@@ -393,6 +473,21 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end,
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
+static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
+				unsigned long start, unsigned long end,
+				unsigned long pid, unsigned long page_size,
+				unsigned long psize, bool also_pwc)
+{
+	struct cpumask *cpus = mm_cpumask(mm);
+	struct tlbiel_va_range t = { .start = start, .end = end,
+				.pid = pid, .page_size = page_size,
+				.psize = psize, .also_pwc = also_pwc };
+
+	on_each_cpu_mask(cpus, do_tlbiel_va_range, &t, 1);
+	if (atomic_read(&mm->context.copros) > 0)
+		_tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
+}
+
 /*
  * Base TLB flushing operations:
  *
@@ -530,10 +625,14 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
 			goto local;
 		}
 
-		if (mm_needs_flush_escalation(mm))
-			_tlbie_pid(pid, RIC_FLUSH_ALL);
-		else
-			_tlbie_pid(pid, RIC_FLUSH_TLB);
+		if (cputlb_use_tlbie()) {
+			if (mm_needs_flush_escalation(mm))
+				_tlbie_pid(pid, RIC_FLUSH_ALL);
+			else
+				_tlbie_pid(pid, RIC_FLUSH_TLB);
+		} else {
+			_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB);
+		}
 	} else {
 local:
 		_tlbiel_pid(pid, RIC_FLUSH_TLB);
@@ -559,7 +658,10 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
 				goto local;
 			}
 		}
-		_tlbie_pid(pid, RIC_FLUSH_ALL);
+		if (cputlb_use_tlbie())
+			_tlbie_pid(pid, RIC_FLUSH_ALL);
+		else
+			_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
 	} else {
 local:
 		_tlbiel_pid(pid, RIC_FLUSH_ALL);
@@ -594,7 +696,10 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 			exit_flush_lazy_tlbs(mm);
 			goto local;
 		}
-		_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+		if (cputlb_use_tlbie())
+			_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+		else
+			_tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB);
 	} else {
 local:
 		_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
@@ -616,6 +721,24 @@ EXPORT_SYMBOL(radix__flush_tlb_page);
 #define radix__flush_all_mm radix__local_flush_all_mm
 #endif /* CONFIG_SMP */
 
+static void do_tlbiel_kernel(void *info)
+{
+	_tlbiel_pid(0, RIC_FLUSH_ALL);
+}
+
+static inline void _tlbiel_kernel_broadcast(void)
+{
+	on_each_cpu(do_tlbiel_kernel, NULL, 1);
+	if (tlbie_capable) {
+		/*
+		 * Coherent accelerators don't refcount kernel memory mappings,
+		 * so have to always issue a tlbie for them. This is quite a
+		 * slow path anyway.
+		 */
+		_tlbie_pid(0, RIC_FLUSH_ALL);
+	}
+}
+
 /*
  * If kernel TLBIs ever become local rather than global, then
  * drivers/misc/ocxl/link.c:ocxl_link_add_pe will need some work, as it
@@ -623,7 +746,10 @@ EXPORT_SYMBOL(radix__flush_tlb_page);
  */
 void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	_tlbie_pid(0, RIC_FLUSH_ALL);
+	if (cputlb_use_tlbie())
+		_tlbie_pid(0, RIC_FLUSH_ALL);
+	else
+		_tlbiel_kernel_broadcast();
 }
 EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
 
@@ -679,10 +805,14 @@ is_local:
 		if (local) {
 			_tlbiel_pid(pid, RIC_FLUSH_TLB);
 		} else {
-			if (mm_needs_flush_escalation(mm))
-				_tlbie_pid(pid, RIC_FLUSH_ALL);
-			else
-				_tlbie_pid(pid, RIC_FLUSH_TLB);
+			if (cputlb_use_tlbie()) {
+				if (mm_needs_flush_escalation(mm))
+					_tlbie_pid(pid, RIC_FLUSH_ALL);
+				else
+					_tlbie_pid(pid, RIC_FLUSH_TLB);
+			} else {
+				_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB);
+			}
 		}
 	} else {
 		bool hflush = flush_all_sizes;
@@ -707,8 +837,8 @@ is_local:
 				gflush = false;
 		}
 
-		asm volatile("ptesync": : :"memory");
 		if (local) {
+			asm volatile("ptesync": : :"memory");
 			__tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
 			if (hflush)
 				__tlbiel_va_range(hstart, hend, pid,
@@ -717,7 +847,8 @@ is_local:
 				__tlbiel_va_range(gstart, gend, pid,
 						PUD_SIZE, MMU_PAGE_1G);
 			asm volatile("ptesync": : :"memory");
-		} else {
+		} else if (cputlb_use_tlbie()) {
+			asm volatile("ptesync": : :"memory");
 			__tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
 			if (hflush)
 				__tlbie_va_range(hstart, hend, pid,
@@ -727,6 +858,15 @@ is_local:
 						PUD_SIZE, MMU_PAGE_1G);
 			fixup_tlbie();
 			asm volatile("eieio; tlbsync; ptesync": : :"memory");
+		} else {
+			_tlbiel_va_range_multicast(mm,
+					start, end, pid, page_size, mmu_virtual_psize, false);
+			if (hflush)
+				_tlbiel_va_range_multicast(mm,
+					hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, false);
+			if (gflush)
+				_tlbiel_va_range_multicast(mm,
+					gstart, gend, pid, PUD_SIZE, MMU_PAGE_1G, false);
 		}
 	}
 	preempt_enable();
@@ -903,16 +1043,26 @@ is_local:
 		if (local) {
 			_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
 		} else {
-			if (mm_needs_flush_escalation(mm))
-				also_pwc = true;
+			if (cputlb_use_tlbie()) {
+				if (mm_needs_flush_escalation(mm))
+					also_pwc = true;
+
+				_tlbie_pid(pid,
+					also_pwc ?  RIC_FLUSH_ALL : RIC_FLUSH_TLB);
+			} else {
+				_tlbiel_pid_multicast(mm, pid,
+					also_pwc ?  RIC_FLUSH_ALL : RIC_FLUSH_TLB);
+			}
 
-			_tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
 		}
 	} else {
 		if (local)
 			_tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
-		else
+		else if (cputlb_use_tlbie())
 			_tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
+		else
+			_tlbiel_va_range_multicast(mm,
+					start, end, pid, page_size, psize, also_pwc);
 	}
 	preempt_enable();
 }
@@ -954,7 +1104,11 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 			exit_flush_lazy_tlbs(mm);
 			goto local;
 		}
-		_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+		if (cputlb_use_tlbie())
+			_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+		else
+			_tlbiel_va_range_multicast(mm,
+					addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
 	} else {
 local:
 		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
diff --git a/drivers/misc/cxl/main.c b/drivers/misc/cxl/main.c
index 482a2c1b340a..43b312d06e3e 100644
--- a/drivers/misc/cxl/main.c
+++ b/drivers/misc/cxl/main.c
@@ -18,6 +18,7 @@
 #include <linux/sched/task.h>
 
 #include <asm/cputable.h>
+#include <asm/mmu.h>
 #include <misc/cxl-base.h>
 
 #include "cxl.h"
@@ -315,6 +316,9 @@ static int __init init_cxl(void)
 {
 	int rc = 0;
 
+	if (!tlbie_capable)
+		return -EINVAL;
+
 	if ((rc = cxl_file_init()))
 		return rc;
 
diff --git a/drivers/misc/ocxl/main.c b/drivers/misc/ocxl/main.c
index 7210d9e059be..ef73cf35dda2 100644
--- a/drivers/misc/ocxl/main.c
+++ b/drivers/misc/ocxl/main.c
@@ -2,12 +2,16 @@
 // Copyright 2017 IBM Corp.
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <asm/mmu.h>
 #include "ocxl_internal.h"
 
 static int __init init_ocxl(void)
 {
 	int rc = 0;
 
+	if (!tlbie_capable)
+		return -EINVAL;
+
 	rc = ocxl_file_init();
 	if (rc)
 		return rc;

From 41ba17f20ea835c489e77bd54e2da73184e22060 Mon Sep 17 00:00:00 2001
From: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Date: Tue, 27 Aug 2019 15:46:35 +0530
Subject: [PATCH 202/264] powerpc/imc: Don't create debugfs files for cpu-less
 nodes

Commit <684d984038aa> ('powerpc/powernv: Add debugfs interface for
imc-mode and imc') added a debugfs interface for the nest imc pmu
devices to support changing of different ucode modes, primarily for
debugging. But when doing so, the code did not consider the case of
cpu-less nodes, so reading the _cmd_ or _mode_ file of a cpu-less node
creates this crash.

  Faulting instruction address: 0xc0000000000d0d58
  Oops: Kernel access of bad area, sig: 11 [#1]
  ...
  CPU: 67 PID: 5301 Comm: cat Not tainted 5.2.0-rc6-next-20190627+ #19
  NIP:  c0000000000d0d58 LR: c00000000049aa18 CTR:c0000000000d0d50
  REGS: c00020194548f9e0 TRAP: 0300   Not tainted  (5.2.0-rc6-next-20190627+)
  MSR:  9000000000009033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR:28022822  XER: 00000000
  CFAR: c00000000049aa14 DAR: 000000000003fc08 DSISR:40000000 IRQMASK: 0
  ...
  NIP imc_mem_get+0x8/0x20
  LR  simple_attr_read+0x118/0x170
  Call Trace:
    simple_attr_read+0x70/0x170 (unreliable)
    debugfs_attr_read+0x6c/0xb0
    __vfs_read+0x3c/0x70
     vfs_read+0xbc/0x1a0
    ksys_read+0x7c/0x140
    system_call+0x5c/0x70

This patch fixes the issue with a more robust check of vbase against NULL.

Before patch, ls output for the debugfs imc directory

  # ls /sys/kernel/debug/powerpc/imc/
  imc_cmd_0    imc_cmd_251  imc_cmd_253  imc_cmd_255  imc_mode_0    imc_mode_251  imc_mode_253  imc_mode_255
  imc_cmd_250  imc_cmd_252  imc_cmd_254  imc_cmd_8    imc_mode_250  imc_mode_252  imc_mode_254  imc_mode_8

After patch, ls output for the debugfs imc directory

  # ls /sys/kernel/debug/powerpc/imc/
  imc_cmd_0  imc_cmd_8  imc_mode_0  imc_mode_8

The actual bug here is that we have two loops with potentially
different loop counts. That is, in imc_get_mem_addr_nest() the loop
count is obtained from the device tree entries, but in
export_imc_mode_and_cmd() the loop was based on the for_each_node()
count. This patch bases the loop count in the latter on
struct imc_mem_info. Ideally the array size would be kept in
struct imc_pmu.
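
As a hedged sketch of the fixed iteration (simplified, with
export_debugfs_files() as a hypothetical helper), the mem_info array
is terminated by an entry whose vbase is NULL:

  struct imc_mem_info *ptr = pmu_ptr->mem_info;

  /* one entry per chip with counters; terminated by vbase == NULL */
  while (ptr->vbase != NULL) {
          export_debugfs_files(ptr->id, (u64)ptr->vbase + cb_offset);
          ptr++;
  }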

Fixes: 684d984038aa ('powerpc/powernv: Add debugfs interface for imc-mode and imc')
Reported-by: Qian Cai <cai@lca.pw>
Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190827101635.6942-1-maddy@linux.vnet.ibm.com
---
 arch/powerpc/platforms/powernv/opal-imc.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index 186109bdd41b..e04b20625cb9 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -53,9 +53,9 @@ static void export_imc_mode_and_cmd(struct device_node *node,
 				    struct imc_pmu *pmu_ptr)
 {
 	static u64 loc, *imc_mode_addr, *imc_cmd_addr;
-	int chip = 0, nid;
 	char mode[16], cmd[16];
 	u32 cb_offset;
+	struct imc_mem_info *ptr = pmu_ptr->mem_info;
 
 	imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
 
@@ -69,20 +69,20 @@ static void export_imc_mode_and_cmd(struct device_node *node,
 	if (of_property_read_u32(node, "cb_offset", &cb_offset))
 		cb_offset = IMC_CNTL_BLK_OFFSET;
 
-	for_each_node(nid) {
-		loc = (u64)(pmu_ptr->mem_info[chip].vbase) + cb_offset;
+	while (ptr->vbase != NULL) {
+		loc = (u64)(ptr->vbase) + cb_offset;
 		imc_mode_addr = (u64 *)(loc + IMC_CNTL_BLK_MODE_OFFSET);
-		sprintf(mode, "imc_mode_%d", nid);
+		sprintf(mode, "imc_mode_%d", (u32)(ptr->id));
 		if (!imc_debugfs_create_x64(mode, 0600, imc_debugfs_parent,
 					    imc_mode_addr))
 			goto err;
 
 		imc_cmd_addr = (u64 *)(loc + IMC_CNTL_BLK_CMD_OFFSET);
-		sprintf(cmd, "imc_cmd_%d", nid);
+		sprintf(cmd, "imc_cmd_%d", (u32)(ptr->id));
 		if (!imc_debugfs_create_x64(cmd, 0600, imc_debugfs_parent,
 					    imc_cmd_addr))
 			goto err;
-		chip++;
+		ptr++;
 	}
 	return;
 

From 67c87892e2e17b7083cb8b4289ed8ff69ad9ac1e Mon Sep 17 00:00:00 2001
From: Jordan Niethe <jniethe5@gmail.com>
Date: Tue, 13 Aug 2019 15:12:12 +1000
Subject: [PATCH 203/264] powerpc: Remove empty comment

Commit 2874c5fd2842 ("treewide: Replace GPLv2 boilerplate/reference with
SPDX - rule 152") left an empty comment in machdep.h, as the boilerplate
was the only text in the comment. Remove the empty comment.

Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190813051212.6387-1-jniethe5@gmail.com
---
 arch/powerpc/include/asm/machdep.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 657ec893bdcb..7bcb64444a39 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -3,9 +3,6 @@
 #define _ASM_POWERPC_MACHDEP_H
 #ifdef __KERNEL__
 
-/*
- */
-
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/dma-mapping.h>

From aa497d4352414aad22e792b35d0aaaa12bbc37c5 Mon Sep 17 00:00:00 2001
From: Segher Boessenkool <segher@kernel.crashing.org>
Date: Wed, 4 Sep 2019 14:11:07 +0000
Subject: [PATCH 204/264] powerpc: Add attributes for setjmp/longjmp

The setjmp function should be declared as "returns_twice", or bad
things can happen[1]. This does not actually change generated code in
my testing.

The longjmp function should be declared as "noreturn", so that the
compiler can optimise calls to it better. This makes the generated
code a little shorter.

1: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-returns_005ftwice-function-attribute
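
For illustration (generic C, not part of the patch), returns_twice
tells the compiler that control can come back through the call a
second time, so it must not cache values that change between the two
returns in registers:

  long buf[JMP_BUF_LEN];
  volatile int phase = 0;   /* volatile: must survive the second return */

  if (setjmp(buf) == 0) {   /* first return, directly from setjmp */
          phase = 1;
          longjmp(buf, 1);  /* noreturn: control re-enters via setjmp */
  }
  /* second return: setjmp yielded 1, and phase is reliably 1 here */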

Signed-off-by: Segher Boessenkool <segher@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/c02ce4a573f3bac907e2c70957a2d1275f910013.1567605586.git.segher@kernel.crashing.org
---
 arch/powerpc/include/asm/setjmp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/setjmp.h b/arch/powerpc/include/asm/setjmp.h
index d995061f5f86..e9f81bb3f83b 100644
--- a/arch/powerpc/include/asm/setjmp.h
+++ b/arch/powerpc/include/asm/setjmp.h
@@ -7,7 +7,7 @@
 
 #define JMP_BUF_LEN    23
 
-extern long setjmp(long *);
-extern void longjmp(long *, long);
+extern long setjmp(long *) __attribute__((returns_twice));
+extern void longjmp(long *, long) __attribute__((noreturn));
 
 #endif /* _ASM_POWERPC_SETJMP_H */

From 20055a8bfaaa75f2cb9c23ecc8ab12b4abd8dc84 Mon Sep 17 00:00:00 2001
From: Santosh Sivaraj <santosh@fossix.org>
Date: Wed, 4 Sep 2019 03:13:58 +0530
Subject: [PATCH 205/264] powerpc/memcpy: Fix stack corruption for smaller
 sizes

For sizes smaller than 128 bytes, the code branches out early without
saving the stack frame, so when the frame is restored later it drops
the frame of the caller.
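
Schematically (an illustrative fragment, not the exact routine), the
early branch skipped the frame setup, but the branch target still exits
through the common epilogue, which pops a frame that was never pushed:

  cmpldi  r5,128
  blt     5f                      # old: taken before the stdu below
  stdu    r1,-STACKFRAMESIZE(r1)  # frame only created on this path
  ...
  5: ...                          # exits via the common epilogue,
  addi    r1,r1,STACKFRAMESIZE    # popping the caller's frame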

Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903214359.23887-1-santosh@fossix.org
---
 arch/powerpc/lib/memcpy_mcsafe_64.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S b/arch/powerpc/lib/memcpy_mcsafe_64.S
index 949976dc115d..cb882d9a6d8a 100644
--- a/arch/powerpc/lib/memcpy_mcsafe_64.S
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -84,7 +84,6 @@ err1;	stw	r0,0(r3)
 
 3:	sub	r5,r5,r6
 	cmpldi	r5,128
-	blt	5f
 
 	mflr	r0
 	stdu	r1,-STACKFRAMESIZE(r1)
@@ -99,6 +98,7 @@ err1;	stw	r0,0(r3)
 	std	r22,STK_REG(R22)(r1)
 	std	r0,STACKFRAMESIZE+16(r1)
 
+	blt	5f
 	srdi	r6,r5,7
 	mtctr	r6
 

From 6f62a8223e65c0571e48225d5d7e56de95225bae Mon Sep 17 00:00:00 2001
From: Santosh Sivaraj <santosh@fossix.org>
Date: Wed, 4 Sep 2019 03:13:59 +0530
Subject: [PATCH 206/264] selftests/powerpc: Add a selftest for memcpy_mcsafe

Add appropriate selftests for memcpy_mcsafe.

Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903214359.23887-2-santosh@fossix.org
---
 tools/testing/selftests/powerpc/copyloops/.gitignore       | 1 +
 tools/testing/selftests/powerpc/copyloops/Makefile         | 7 ++++++-
 tools/testing/selftests/powerpc/copyloops/asm/export.h     | 1 +
 .../testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S | 1 +
 4 files changed, 9 insertions(+), 1 deletion(-)
 create mode 120000 tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S

diff --git a/tools/testing/selftests/powerpc/copyloops/.gitignore b/tools/testing/selftests/powerpc/copyloops/.gitignore
index de158104912a..12ef5b031974 100644
--- a/tools/testing/selftests/powerpc/copyloops/.gitignore
+++ b/tools/testing/selftests/powerpc/copyloops/.gitignore
@@ -11,3 +11,4 @@ memcpy_p7_t1
 copyuser_64_exc_t0
 copyuser_64_exc_t1
 copyuser_64_exc_t2
+memcpy_mcsafe_64
diff --git a/tools/testing/selftests/powerpc/copyloops/Makefile b/tools/testing/selftests/powerpc/copyloops/Makefile
index 44574f3818b3..0917983a1c78 100644
--- a/tools/testing/selftests/powerpc/copyloops/Makefile
+++ b/tools/testing/selftests/powerpc/copyloops/Makefile
@@ -12,7 +12,7 @@ ASFLAGS = $(CFLAGS) -Wa,-mpower4
 TEST_GEN_PROGS := copyuser_64_t0 copyuser_64_t1 copyuser_64_t2 \
 		copyuser_p7_t0 copyuser_p7_t1 \
 		memcpy_64_t0 memcpy_64_t1 memcpy_64_t2 \
-		memcpy_p7_t0 memcpy_p7_t1 \
+		memcpy_p7_t0 memcpy_p7_t1 memcpy_mcsafe_64 \
 		copyuser_64_exc_t0 copyuser_64_exc_t1 copyuser_64_exc_t2
 
 EXTRA_SOURCES := validate.c ../harness.c stubs.S
@@ -45,6 +45,11 @@ $(OUTPUT)/memcpy_p7_t%:	memcpy_power7.S $(EXTRA_SOURCES)
 		-D SELFTEST_CASE=$(subst memcpy_p7_t,,$(notdir $@)) \
 		-o $@ $^
 
+$(OUTPUT)/memcpy_mcsafe_64: memcpy_mcsafe_64.S $(EXTRA_SOURCES)
+	$(CC) $(CPPFLAGS) $(CFLAGS) \
+		-D COPY_LOOP=test_memcpy_mcsafe \
+		-o $@ $^
+
 $(OUTPUT)/copyuser_64_exc_t%: copyuser_64.S exc_validate.c ../harness.c \
 		copy_tofrom_user_reference.S stubs.S
 	$(CC) $(CPPFLAGS) $(CFLAGS) \
diff --git a/tools/testing/selftests/powerpc/copyloops/asm/export.h b/tools/testing/selftests/powerpc/copyloops/asm/export.h
index 05c1663c89b0..e6b80d5fbd14 100644
--- a/tools/testing/selftests/powerpc/copyloops/asm/export.h
+++ b/tools/testing/selftests/powerpc/copyloops/asm/export.h
@@ -1,3 +1,4 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #define EXPORT_SYMBOL(x)
+#define EXPORT_SYMBOL_GPL(x)
 #define EXPORT_SYMBOL_KASAN(x)
diff --git a/tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S b/tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S
new file mode 120000
index 000000000000..f0feef3062f6
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S
@@ -0,0 +1 @@
+../../../../../arch/powerpc/lib/memcpy_mcsafe_64.S
\ No newline at end of file

From b4d37a7b6934c0c16930c73f43c08e9a6af22f1a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 3 Sep 2019 18:51:47 +0200
Subject: [PATCH 207/264] powerpc/powernv: Remove unused
 pnv_npu_try_dma_set_bypass() function

Neither pnv_npu_try_dma_set_bypass() nor the pnv_npu_dma_set_32() and
pnv_npu_dma_set_bypass() helpers called by it are used anywhere in the
kernel tree, so remove them.

mpe: They're unused since 2d6ad41b2c21 ("powerpc/powernv: use the
generic iommu bypass code") removed the last usage.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190903165147.11099-1-hch@lst.de
---
 arch/powerpc/platforms/powernv/npu-dma.c | 99 ------------------------
 1 file changed, 99 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index c16249d251f1..a570a249edc3 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -192,105 +192,6 @@ static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num)
 	return 0;
 }
 
-/*
- * Enables 32 bit DMA on NPU.
- */
-static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
-{
-	struct pci_dev *gpdev;
-	struct pnv_ioda_pe *gpe;
-	int64_t rc;
-
-	/*
-	 * Find the assoicated PCI devices and get the dma window
-	 * information from there.
-	 */
-	if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
-		return;
-
-	gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
-	if (!gpe)
-		return;
-
-	rc = pnv_npu_set_window(&npe->table_group, 0,
-			gpe->table_group.tables[0]);
-
-	/*
-	 * NVLink devices use the same TCE table configuration as
-	 * their parent device so drivers shouldn't be doing DMA
-	 * operations directly on these devices.
-	 */
-	set_dma_ops(&npe->pdev->dev, &dma_dummy_ops);
-}
-
-/*
- * Enables bypass mode on the NPU. The NPU only supports one
- * window per link, so bypass needs to be explicitly enabled or
- * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be
- * active at the same time.
- */
-static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
-{
-	struct pnv_phb *phb = npe->phb;
-	int64_t rc = 0;
-	phys_addr_t top = memblock_end_of_DRAM();
-
-	if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev)
-		return -EINVAL;
-
-	rc = pnv_npu_unset_window(&npe->table_group, 0);
-	if (rc != OPAL_SUCCESS)
-		return rc;
-
-	/* Enable the bypass window */
-
-	top = roundup_pow_of_two(top);
-	dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n",
-			npe->pe_number);
-	rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
-			npe->pe_number, npe->pe_number,
-			0 /* bypass base */, top);
-
-	if (rc == OPAL_SUCCESS)
-		pnv_pci_ioda2_tce_invalidate_entire(phb, false);
-
-	return rc;
-}
-
-void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
-{
-	int i;
-	struct pnv_phb *phb;
-	struct pci_dn *pdn;
-	struct pnv_ioda_pe *npe;
-	struct pci_dev *npdev;
-
-	for (i = 0; ; ++i) {
-		npdev = pnv_pci_get_npu_dev(gpdev, i);
-
-		if (!npdev)
-			break;
-
-		pdn = pci_get_pdn(npdev);
-		if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
-			return;
-
-		phb = pci_bus_to_host(npdev->bus)->private_data;
-
-		/* We only do bypass if it's enabled on the linked device */
-		npe = &phb->ioda.pe_array[pdn->pe_number];
-
-		if (bypass) {
-			dev_info(&npdev->dev,
-					"Using 64-bit DMA iommu bypass\n");
-			pnv_npu_dma_set_bypass(npe);
-		} else {
-			dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
-			pnv_npu_dma_set_32(npe);
-		}
-	}
-}
-
 #ifdef CONFIG_IOMMU_API
 /* Switch ownership from platform code to external user (e.g. VFIO) */
 static void pnv_npu_take_ownership(struct iommu_table_group *table_group)

From 2be1d5d147955e6aea273dc73a9f0ae4510fd225 Mon Sep 17 00:00:00 2001
From: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
Date: Mon, 26 Aug 2019 12:27:00 +0530
Subject: [PATCH 208/264] powerpc/powernv: Enhance opal message read interface

Use "opal-msg-size" device tree property to allocate memory for
"opal_msg".

Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
[mpe: s/uint32_t/u32/ and mark opal_msg_size as __ro_after_init]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190826065701.8853-1-hegdevasant@linux.vnet.ibm.com
---
 arch/powerpc/platforms/powernv/opal.c | 31 ++++++++++++++++++---------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index d271accf224b..38e90270280b 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -58,6 +58,8 @@ static DEFINE_SPINLOCK(opal_write_lock);
 static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX];
 static uint32_t opal_heartbeat;
 static struct task_struct *kopald_tsk;
+static struct opal_msg *opal_msg;
+static u32 opal_msg_size __ro_after_init;
 
 void opal_configure_cores(void)
 {
@@ -271,14 +273,9 @@ static void opal_message_do_notify(uint32_t msg_type, void *msg)
 static void opal_handle_message(void)
 {
 	s64 ret;
-	/*
-	 * TODO: pre-allocate a message buffer depending on opal-msg-size
-	 * value in /proc/device-tree.
-	 */
-	static struct opal_msg msg;
 	u32 type;
 
-	ret = opal_get_msg(__pa(&msg), sizeof(msg));
+	ret = opal_get_msg(__pa(opal_msg), opal_msg_size);
 	/* No opal message pending. */
 	if (ret == OPAL_RESOURCE)
 		return;
@@ -290,14 +287,14 @@ static void opal_handle_message(void)
 		return;
 	}
 
-	type = be32_to_cpu(msg.msg_type);
+	type = be32_to_cpu(opal_msg->msg_type);
 
 	/* Sanity check */
 	if (type >= OPAL_MSG_TYPE_MAX) {
 		pr_warn_once("%s: Unknown message type: %u\n", __func__, type);
 		return;
 	}
-	opal_message_do_notify(type, (void *)&msg);
+	opal_message_do_notify(type, (void *)opal_msg);
 }
 
 static irqreturn_t opal_message_notify(int irq, void *data)
@@ -306,10 +303,24 @@ static irqreturn_t opal_message_notify(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static int __init opal_message_init(void)
+static int __init opal_message_init(struct device_node *opal_node)
 {
 	int ret, i, irq;
 
+	ret = of_property_read_u32(opal_node, "opal-msg-size", &opal_msg_size);
+	if (ret) {
+		pr_notice("Failed to read opal-msg-size property\n");
+		opal_msg_size = sizeof(struct opal_msg);
+	}
+
+	opal_msg = kmalloc(opal_msg_size, GFP_KERNEL);
+	if (!opal_msg) {
+		opal_msg_size = sizeof(struct opal_msg);
+		/* Try to allocate fixed message size */
+		opal_msg = kmalloc(opal_msg_size, GFP_KERNEL);
+		BUG_ON(opal_msg == NULL);
+	}
+
 	for (i = 0; i < OPAL_MSG_TYPE_MAX; i++)
 		ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]);
 
@@ -913,7 +924,7 @@ static int __init opal_init(void)
 	}
 
 	/* Initialise OPAL messaging system */
-	opal_message_init();
+	opal_message_init(opal_node);
 
 	/* Initialise OPAL asynchronous completion interface */
 	opal_async_comp_init();

From 587164cd593c2cbdad376179f85159e31221989e Mon Sep 17 00:00:00 2001
From: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
Date: Mon, 26 Aug 2019 12:27:01 +0530
Subject: [PATCH 209/264] powerpc/powernv: Add new opal message type

We have the OPAL_MSG_PRD message type to pass PRD-related messages from
OPAL to `opal-prd`, but it can only handle messages up to 64 bytes. We
now have a requirement to send more than 64 bytes of data from OPAL to
`opal-prd`, so add a new message type (OPAL_MSG_PRD2) to pass the
bigger data.

Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
[mpe: Make the error string clear that it's the PRD2 event that failed]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190826065701.8853-2-hegdevasant@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/opal-api.h       | 1 +
 arch/powerpc/platforms/powernv/opal-prd.c | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index 383242eb0dea..1cad413e1e0e 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -453,6 +453,7 @@ enum opal_msg_type {
 	OPAL_MSG_DPO		= 5,
 	OPAL_MSG_PRD		= 6,
 	OPAL_MSG_OCC		= 7,
+	OPAL_MSG_PRD2		= 8,
 	OPAL_MSG_TYPE_MAX,
 };
 
diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c
index e072bf157d62..45f4223a790f 100644
--- a/arch/powerpc/platforms/powernv/opal-prd.c
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -342,7 +342,7 @@ static int opal_prd_msg_notifier(struct notifier_block *nb,
 	int msg_size, item_size;
 	unsigned long flags;
 
-	if (msg_type != OPAL_MSG_PRD)
+	if (msg_type != OPAL_MSG_PRD && msg_type != OPAL_MSG_PRD2)
 		return 0;
 
 	/* Calculate total size of the message and item we need to store. The
@@ -393,6 +393,12 @@ static int opal_prd_probe(struct platform_device *pdev)
 		return rc;
 	}
 
+	rc = opal_message_notifier_register(OPAL_MSG_PRD2, &opal_prd_event_nb);
+	if (rc) {
+		pr_err("Couldn't register PRD2 event notifier\n");
+		return rc;
+	}
+
 	rc = misc_register(&opal_prd_dev);
 	if (rc) {
 		pr_err("failed to register miscdev\n");

From bc01bdf6c5df5023272a7399962cf64f8fedc93e Mon Sep 17 00:00:00 2001
From: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Date: Tue, 10 Sep 2019 18:45:13 +0530
Subject: [PATCH 210/264] powerpc/watchpoint: Disable watchpoint hit by
 larx/stcx instructions

If a watchpoint exception is generated by larx/stcx instructions, the
reservation created by larx gets lost while handling the exception,
and thus the stcx instruction always fails. Generally these
instructions are used in a while(1) loop, for example in spinlocks,
and because stcx never succeeds, the loop runs forever and ultimately
hangs the system.
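
For illustration, a typical larx/stcx retry loop looks like this (a
generic sketch, not code from the patch):

  1: lwarx   r10,0,r3    # load word and establish a reservation
     addi    r10,r10,1   # compute the updated value
     stwcx.  r10,0,r3    # store only if the reservation still holds
     bne-    1b          # reservation lost: retry

If the watchpoint fires between the lwarx and the stwcx., taking the
exception clears the reservation, so the stwcx. always fails and the
loop never terminates.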

Note that ptrace anyway works in one-shot mode and thus for ptrace
we don't change the behaviour. It's up to ptrace user to take care
of this.

Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Acked-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190910131513.30499-1-ravi.bangoria@linux.ibm.com
---
 arch/powerpc/kernel/hw_breakpoint.c | 51 +++++++++++++++++++----------
 1 file changed, 34 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index 28ad3171bb82..1007ec36b4cb 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -195,14 +195,32 @@ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs)
 	tsk->thread.last_hit_ubp = NULL;
 }
 
+static bool is_larx_stcx_instr(struct pt_regs *regs, unsigned int instr)
+{
+	int ret, type;
+	struct instruction_op op;
+
+	ret = analyse_instr(&op, regs, instr);
+	type = GETTYPE(op.type);
+	return (!ret && (type == LARX || type == STCX));
+}
+
 /*
  * Handle debug exception notifications.
  */
 static bool stepping_handler(struct pt_regs *regs, struct perf_event *bp,
 			     unsigned long addr)
 {
-	int stepped;
-	unsigned int instr;
+	unsigned int instr = 0;
+
+	if (__get_user_inatomic(instr, (unsigned int *)regs->nip))
+		goto fail;
+
+	if (is_larx_stcx_instr(regs, instr)) {
+		printk_ratelimited("Breakpoint hit on instruction that can't be emulated."
+				   " Breakpoint at 0x%lx will be disabled.\n", addr);
+		goto disable;
+	}
 
 	/* Do not emulate user-space instructions, instead single-step them */
 	if (user_mode(regs)) {
@@ -211,23 +229,22 @@ static bool stepping_handler(struct pt_regs *regs, struct perf_event *bp,
 		return false;
 	}
 
-	stepped = 0;
-	instr = 0;
-	if (!__get_user_inatomic(instr, (unsigned int *)regs->nip))
-		stepped = emulate_step(regs, instr);
+	if (!emulate_step(regs, instr))
+		goto fail;
 
-	/*
-	 * emulate_step() could not execute it. We've failed in reliably
-	 * handling the hw-breakpoint. Unregister it and throw a warning
-	 * message to let the user know about it.
-	 */
-	if (!stepped) {
-		WARN(1, "Unable to handle hardware breakpoint. Breakpoint at "
-			"0x%lx will be disabled.", addr);
-		perf_event_disable_inatomic(bp);
-		return false;
-	}
 	return true;
+
+fail:
+	/*
+	 * We've failed in reliably handling the hw-breakpoint. Unregister
+	 * it and throw a warning message to let the user know about it.
+	 */
+	WARN(1, "Unable to handle hardware breakpoint. Breakpoint at "
+		"0x%lx will be disabled.", addr);
+
+disable:
+	perf_event_disable_inatomic(bp);
+	return false;
 }
 
 int hw_breakpoint_handler(struct die_args *args)

From 92c94dfb69e350471473fd3075c74bc68150879e Mon Sep 17 00:00:00 2001
From: Nathan Lynch <nathanl@linux.ibm.com>
Date: Tue, 10 Sep 2019 17:52:44 -0500
Subject: [PATCH 211/264] powerpc/pseries: correctly track irq state in default
 idle

prep_irq_for_idle() is intended to be called before entering
H_CEDE (and it is used by the pseries cpuidle driver). However the
default pseries idle routine does not call it, leading to mismanaged
lazy irq state when the cpuidle driver isn't in use. Manifestations of
this include:

* Dropped IPIs in the time immediately after a cpu comes
  online (before it has installed the cpuidle handler), making the
  online operation block indefinitely waiting for the new cpu to
  respond.

* Hitting this WARN_ON in arch_local_irq_restore():
	/*
	 * We should already be hard disabled here. We had bugs
	 * where that wasn't the case so let's dbl check it and
	 * warn if we are wrong. Only do that when IRQ tracing
	 * is enabled as mfmsr() can be costly.
	 */
	if (WARN_ON_ONCE(mfmsr() & MSR_EE))
		__hard_irq_disable();

Call prep_irq_for_idle() from pseries_lpar_idle() and honor its
result.

Fixes: 363edbe2614a ("powerpc: Default arch idle could cede processor on pseries")
Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190910225244.25056-1-nathanl@linux.ibm.com
---
 arch/powerpc/platforms/pseries/setup.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index b955d54628ff..f8adcd0e4589 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -321,6 +321,9 @@ static void pseries_lpar_idle(void)
 	 * low power mode by ceding processor to hypervisor
 	 */
 
+	if (!prep_irq_for_idle())
+		return;
+
 	/* Indicate to hypervisor that we are idle. */
 	get_lppaca()->idle = 1;
 

From 6ccb4ac2bf8a35c694ead92f8ac5530a16e8f2c8 Mon Sep 17 00:00:00 2001
From: Greg Kurz <groug@kaod.org>
Date: Wed, 11 Sep 2019 17:52:18 +0200
Subject: [PATCH 212/264] powerpc/xive: Fix bogus error code returned by OPAL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There's a bug in skiboot that causes the OPAL_XIVE_ALLOCATE_IRQ call
to return the 32-bit value 0xffffffff when OPAL has run out of IRQs.
Unfortunately, OPAL return values are signed 64-bit entities and
errors are supposed to be negative. If that happens, the Linux code
confusingly treats 0xffffffff as a valid IRQ number and panics at some
point.
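
A minimal sketch of why the bogus value slips through (illustrative C,
not from the kernel):

  int64_t irq = opal_xive_allocate_irq_raw(chip_id);
  /* buggy skiboot returns 0xffffffff, i.e. +4294967295 as an s64 */
  if (irq < 0)            /* never true for 0xffffffff */
          return irq;     /* error path is not taken */
  /* 0xffffffff then flows onward as a "valid" hw IRQ number */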

A fix was recently merged in skiboot:

e97391ae2bb5 ("xive: fix return value of opal_xive_allocate_irq()")

but we need a workaround anyway to support older skiboots already
in the field.

Internally convert 0xffffffff to OPAL_RESOURCE which is the usual error
returned upon resource exhaustion.

Cc: stable@vger.kernel.org # v4.12+
Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821713818.1985334.14123187368108582810.stgit@bahia.lan
---
 arch/powerpc/include/asm/opal.h            |  2 +-
 arch/powerpc/platforms/powernv/opal-call.c |  2 +-
 arch/powerpc/sysdev/xive/native.c          | 11 +++++++++++
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 57bd029c715e..d5a0807d21db 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -272,7 +272,7 @@ int64_t opal_xive_get_vp_info(uint64_t vp,
 int64_t opal_xive_set_vp_info(uint64_t vp,
 			      uint64_t flags,
 			      uint64_t report_cl_pair);
-int64_t opal_xive_allocate_irq(uint32_t chip_id);
+int64_t opal_xive_allocate_irq_raw(uint32_t chip_id);
 int64_t opal_xive_free_irq(uint32_t girq);
 int64_t opal_xive_sync(uint32_t type, uint32_t id);
 int64_t opal_xive_dump(uint32_t type, uint32_t id);
diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c
index 29ca523c1c79..dccdc9df5213 100644
--- a/arch/powerpc/platforms/powernv/opal-call.c
+++ b/arch/powerpc/platforms/powernv/opal-call.c
@@ -257,7 +257,7 @@ OPAL_CALL(opal_xive_set_queue_info,		OPAL_XIVE_SET_QUEUE_INFO);
 OPAL_CALL(opal_xive_donate_page,		OPAL_XIVE_DONATE_PAGE);
 OPAL_CALL(opal_xive_alloc_vp_block,		OPAL_XIVE_ALLOCATE_VP_BLOCK);
 OPAL_CALL(opal_xive_free_vp_block,		OPAL_XIVE_FREE_VP_BLOCK);
-OPAL_CALL(opal_xive_allocate_irq,		OPAL_XIVE_ALLOCATE_IRQ);
+OPAL_CALL(opal_xive_allocate_irq_raw,		OPAL_XIVE_ALLOCATE_IRQ);
 OPAL_CALL(opal_xive_free_irq,			OPAL_XIVE_FREE_IRQ);
 OPAL_CALL(opal_xive_get_vp_info,		OPAL_XIVE_GET_VP_INFO);
 OPAL_CALL(opal_xive_set_vp_info,		OPAL_XIVE_SET_VP_INFO);
diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
index 4b61e44f0171..e9481468ebd8 100644
--- a/arch/powerpc/sysdev/xive/native.c
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -245,6 +245,17 @@ static bool xive_native_match(struct device_node *node)
 	return of_device_is_compatible(node, "ibm,opal-xive-vc");
 }
 
+static s64 opal_xive_allocate_irq(u32 chip_id)
+{
+	s64 irq = opal_xive_allocate_irq_raw(chip_id);
+
+	/*
+	 * Old versions of skiboot can incorrectly return 0xffffffff to
+	 * indicate no space, fix it up here.
+	 */
+	return irq == 0xffffffff ? OPAL_RESOURCE : irq;
+}
+
 #ifdef CONFIG_SMP
 static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc)
 {

From 1b7f3b6c43675ef2cfb9d8f48bde057794820f7c Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Fri, 13 Sep 2019 23:32:13 +1000
Subject: [PATCH 213/264] powerpc/eeh: Fix build with STACKTRACE=n

The build breaks when STACKTRACE=n, e.g. skiroot_defconfig:

  arch/powerpc/kernel/eeh_event.c:124:23: error: implicit declaration of function 'stack_trace_save'

Fix it with some ifdefs for now.

Fixes: 25baf3d81614 ("powerpc/eeh: Defer printing stack trace")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/eeh.h   | 2 ++
 arch/powerpc/kernel/eeh_driver.c | 3 ++-
 arch/powerpc/kernel/eeh_event.c  | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 9d0e1694a94d..6f9b2a12540a 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -89,6 +89,7 @@ struct eeh_pe {
 	struct list_head child;		/* Memb. child_list/eeh_phb_pe	*/
 	struct list_head edevs;		/* List of eeh_dev in this PE	*/
 
+#ifdef CONFIG_STACKTRACE
 	/*
 	 * Saved stack trace. When we find a PE freeze in eeh_dev_check_failure
 	 * the stack trace is saved here so we can print it in the recovery
@@ -99,6 +100,7 @@ struct eeh_pe {
 	 */
 	unsigned long stack_trace[64];
 	int trace_entries;
+#endif /* CONFIG_STACKTRACE */
 };
 
 #define eeh_pe_for_each_dev(pe, edev, tmp) \
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 80bd157fcb45..d9279d0ee9f5 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -908,6 +908,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 		       eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
 	}
 
+#ifdef CONFIG_STACKTRACE
 	/*
 	 * Print the saved stack trace now that we've verified there's
 	 * something to recover.
@@ -926,7 +927,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 
 		pe->trace_entries = 0;
 	}
-
+#endif /* CONFIG_STACKTRACE */
 
 	eeh_pe_update_time_stamp(pe);
 	pe->freeze_count++;
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
index 1d55486adb0f..a7a8dc182efb 100644
--- a/arch/powerpc/kernel/eeh_event.c
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -117,12 +117,14 @@ int __eeh_send_failure_event(struct eeh_pe *pe)
 	 * while the PE is sitting in the event queue.
 	 */
 	if (pe) {
+#ifdef CONFIG_STACKTRACE
 		/*
 		 * Save the current stack trace so we can dump it from the
 		 * event handler thread.
 		 */
 		pe->trace_entries = stack_trace_save(pe->stack_trace,
 					 ARRAY_SIZE(pe->stack_trace), 0);
+#endif /* CONFIG_STACKTRACE */
 
 		eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
 	}

From 79cb6879135b7441f0955ca3f86be8c8d21c7199 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Fri, 13 Sep 2019 23:50:06 +1000
Subject: [PATCH 214/264] powerpc/powernv: Fix build with IOMMU_API=n

The build breaks when IOMMU_API=n, eg. skiroot_defconfig:

  arch/powerpc/platforms/powernv/npu-dma.c:96:28: error: 'get_gpu_pci_dev_and_pe' defined but not used
  arch/powerpc/platforms/powernv/npu-dma.c:126:13: error: 'pnv_npu_set_window' defined but not used

Fixes: b4d37a7b6934 ("powerpc/powernv: Remove unused pnv_npu_try_dma_set_bypass() function")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/npu-dma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index a570a249edc3..b95b9e3c4c98 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -89,6 +89,7 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
 }
 EXPORT_SYMBOL(pnv_pci_get_npu_dev);
 
+#ifdef CONFIG_IOMMU_API
 /*
  * Returns the PE assoicated with the PCI device of the given
  * NPU. Returns the linked pci device if pci_dev != NULL.
@@ -192,7 +193,6 @@ static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num)
 	return 0;
 }
 
-#ifdef CONFIG_IOMMU_API
 /* Switch ownership from platform code to external user (e.g. VFIO) */
 static void pnv_npu_take_ownership(struct iommu_table_group *table_group)
 {

From 0cb0837f9db1a6ed5b764ef61dd5f1a314b8231a Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 11 Sep 2019 21:57:43 +1000
Subject: [PATCH 215/264] powerpc/kvm: Move kvm_tmp into .text, shrink to 64K

In some configurations of KVM, guests binary patch themselves to
avoid/reduce trapping into the hypervisor. For some instructions this
requires replacing one instruction with a sequence of instructions.

For those cases we need to write the sequence of instructions
somewhere and then patch the location of the original instruction to
branch to the sequence. That requires that the location of the
sequence be within 32MB of the original instruction.

The current solution is to create a 1MB array in BSS, write sequences
into it, and then free the remainder of the array.

This has a few problems:
 - it confuses kmemleak.
 - it confuses lockdep.
 - it requires mapping kvm_tmp executable, which can cause adjacent
   areas to also be mapped executable if we're using 16M pages for the
   linear mapping.
 - the 32MB limit can be exceeded if the kernel is big enough,
   especially with STRICT_KERNEL_RWX enabled, which then prevents the
   patching from working at all.

We can fix all those problems by making kvm_tmp just a region of
regular .text. However, currently it's 1MB in size, and we don't want
to waste 1MB of text. In practice, however, I only see ~30KB of kvm_tmp
being used even for an allyesconfig. So shrink kvm_tmp to 64K, which
ought to be enough for everyone, and move it into .text.
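
As background, the 32MB limit comes from the reach of the PowerPC
relative branch instruction: the LI field of "b" is a signed 24-bit
word offset, i.e. +/- 32MB in bytes. An illustrative range check
(invented helper, not the kernel's code):

  static bool branch_reachable(unsigned long insn, unsigned long target)
  {
  	long offset = (long)(target - insn);

  	/* LI is 24 bits, scaled by 4: +/- 0x2000000 bytes, word aligned */
  	return (offset & 3) == 0 &&
  	       offset >= -0x2000000 && offset < 0x2000000;
  }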

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190911115746.12433-1-mpe@ellerman.id.au
---
 arch/powerpc/kernel/kvm.c      | 24 +++++-------------------
 arch/powerpc/kernel/kvm_emul.S |  8 ++++++++
 2 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index b7b3a5e4e224..e3b5aa583319 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -64,7 +64,8 @@
 #define KVM_INST_MTSRIN		0x7c0001e4
 
 static bool kvm_patching_worked = true;
-char kvm_tmp[1024 * 1024];
+extern char kvm_tmp[];
+extern char kvm_tmp_end[];
 static int kvm_tmp_index;
 
 static inline void kvm_patch_ins(u32 *inst, u32 new_inst)
@@ -132,7 +133,7 @@ static u32 *kvm_alloc(int len)
 {
 	u32 *p;
 
-	if ((kvm_tmp_index + len) > ARRAY_SIZE(kvm_tmp)) {
+	if ((kvm_tmp_index + len) > (kvm_tmp_end - kvm_tmp)) {
 		printk(KERN_ERR "KVM: No more space (%d + %d)\n",
 				kvm_tmp_index, len);
 		kvm_patching_worked = false;
@@ -699,25 +700,13 @@ static void kvm_use_magic_page(void)
 			 kvm_patching_worked ? "worked" : "failed");
 }
 
-static __init void kvm_free_tmp(void)
-{
-	/*
-	 * Inform kmemleak about the hole in the .bss section since the
-	 * corresponding pages will be unmapped with DEBUG_PAGEALLOC=y.
-	 */
-	kmemleak_free_part(&kvm_tmp[kvm_tmp_index],
-			   ARRAY_SIZE(kvm_tmp) - kvm_tmp_index);
-	free_reserved_area(&kvm_tmp[kvm_tmp_index],
-			   &kvm_tmp[ARRAY_SIZE(kvm_tmp)], -1, NULL);
-}
-
 static int __init kvm_guest_init(void)
 {
 	if (!kvm_para_available())
-		goto free_tmp;
+		return 0;
 
 	if (!epapr_paravirt_enabled)
-		goto free_tmp;
+		return 0;
 
 	if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE))
 		kvm_use_magic_page();
@@ -727,9 +716,6 @@ static int __init kvm_guest_init(void)
 	powersave_nap = 1;
 #endif
 
-free_tmp:
-	kvm_free_tmp();
-
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index eb2568f583ae..9dd17dce10a1 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -334,5 +334,13 @@ kvm_emulate_mtsrin_orig_ins_offs:
 kvm_emulate_mtsrin_len:
 	.long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4
 
+	.balign 4
+	.global kvm_tmp
+kvm_tmp:
+	.space	(64 * 1024)
+
+.global kvm_tmp_end
+kvm_tmp_end:
+
 .global kvm_template_end
 kvm_template_end:

From dac39f788546e2eb9838eca551ccd6fd413e75c7 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 11 Sep 2019 21:57:44 +1000
Subject: [PATCH 216/264] powerpc/64s: Remove overlaps_kvm_tmp()

kvm_tmp is now in .text and so doesn't need a special overlap check.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190911115746.12433-2-mpe@ellerman.id.au
---
 arch/powerpc/include/asm/sections.h   | 11 -----------
 arch/powerpc/mm/book3s64/hash_utils.c |  4 ----
 2 files changed, 15 deletions(-)

diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h
index 4a1664a8658d..5a9b6eb651b6 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -61,17 +61,6 @@ static inline int overlaps_kernel_text(unsigned long start, unsigned long end)
 		(unsigned long)_stext < end;
 }
 
-static inline int overlaps_kvm_tmp(unsigned long start, unsigned long end)
-{
-#ifdef CONFIG_KVM_GUEST
-	extern char kvm_tmp[];
-	return start < (unsigned long)kvm_tmp &&
-		(unsigned long)&kvm_tmp[1024 * 1024] < end;
-#else
-	return 0;
-#endif
-}
-
 #ifdef PPC64_ELF_ABI_v1
 
 #define HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR 1
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 7684a596158b..3410ea9f4de1 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -273,10 +273,6 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
 		if (overlaps_kernel_text(vaddr, vaddr + step))
 			tprot &= ~HPTE_R_N;
 
-		/* Make kvm guest trampolines executable */
-		if (overlaps_kvm_tmp(vaddr, vaddr + step))
-			tprot &= ~HPTE_R_N;
-
 		/*
 		 * If relocatable, check if it overlaps interrupt vectors that
 		 * are copied down to real 0. For relocatable kernel

From 731dade128ebc35044e7f9b9d396e4c1bed6ecbc Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 11 Sep 2019 21:57:45 +1000
Subject: [PATCH 217/264] powerpc/kvm: Explicitly mark kvm guest code as __init

All the code in kvm.c can be marked __init. Most of it is already
inlined into the initcall, but not all. So instead of relying on the
inlining, mark it all as __init. This saves ~280 bytes of text for my
configuration.
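
For background, __init places a function in .init.text, which the
kernel frees after boot ("Freeing unused kernel memory"), so one-time
setup code stops costing resident text. A minimal illustrative shape
(function name invented):

  static int __init my_one_time_setup(void)
  {
  	/* runs once at boot; the text is reclaimed afterwards */
  	return 0;
  }
  arch_initcall(my_one_time_setup);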

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190911115746.12433-3-mpe@ellerman.id.au
---
 arch/powerpc/kernel/kvm.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index e3b5aa583319..617eba82531c 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -68,13 +68,13 @@ extern char kvm_tmp[];
 extern char kvm_tmp_end[];
 static int kvm_tmp_index;
 
-static inline void kvm_patch_ins(u32 *inst, u32 new_inst)
+static void __init kvm_patch_ins(u32 *inst, u32 new_inst)
 {
 	*inst = new_inst;
 	flush_icache_range((ulong)inst, (ulong)inst + 4);
 }
 
-static void kvm_patch_ins_ll(u32 *inst, long addr, u32 rt)
+static void __init kvm_patch_ins_ll(u32 *inst, long addr, u32 rt)
 {
 #ifdef CONFIG_64BIT
 	kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc));
@@ -83,7 +83,7 @@ static void kvm_patch_ins_ll(u32 *inst, long addr, u32 rt)
 #endif
 }
 
-static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt)
+static void __init kvm_patch_ins_ld(u32 *inst, long addr, u32 rt)
 {
 #ifdef CONFIG_64BIT
 	kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc));
@@ -92,12 +92,12 @@ static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt)
 #endif
 }
 
-static void kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt)
+static void __init kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt)
 {
 	kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000ffff));
 }
 
-static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt)
+static void __init kvm_patch_ins_std(u32 *inst, long addr, u32 rt)
 {
 #ifdef CONFIG_64BIT
 	kvm_patch_ins(inst, KVM_INST_STD | rt | (addr & 0x0000fffc));
@@ -106,17 +106,17 @@ static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt)
 #endif
 }
 
-static void kvm_patch_ins_stw(u32 *inst, long addr, u32 rt)
+static void __init kvm_patch_ins_stw(u32 *inst, long addr, u32 rt)
 {
 	kvm_patch_ins(inst, KVM_INST_STW | rt | (addr & 0x0000fffc));
 }
 
-static void kvm_patch_ins_nop(u32 *inst)
+static void __init kvm_patch_ins_nop(u32 *inst)
 {
 	kvm_patch_ins(inst, KVM_INST_NOP);
 }
 
-static void kvm_patch_ins_b(u32 *inst, int addr)
+static void __init kvm_patch_ins_b(u32 *inst, int addr)
 {
 #if defined(CONFIG_RELOCATABLE) && defined(CONFIG_PPC_BOOK3S)
 	/* On relocatable kernels interrupts handlers and our code
@@ -129,7 +129,7 @@ static void kvm_patch_ins_b(u32 *inst, int addr)
 	kvm_patch_ins(inst, KVM_INST_B | (addr & KVM_INST_B_MASK));
 }
 
-static u32 *kvm_alloc(int len)
+static u32 * __init kvm_alloc(int len)
 {
 	u32 *p;
 
@@ -152,7 +152,7 @@ extern u32 kvm_emulate_mtmsrd_orig_ins_offs;
 extern u32 kvm_emulate_mtmsrd_len;
 extern u32 kvm_emulate_mtmsrd[];
 
-static void kvm_patch_ins_mtmsrd(u32 *inst, u32 rt)
+static void __init kvm_patch_ins_mtmsrd(u32 *inst, u32 rt)
 {
 	u32 *p;
 	int distance_start;
@@ -205,7 +205,7 @@ extern u32 kvm_emulate_mtmsr_orig_ins_offs;
 extern u32 kvm_emulate_mtmsr_len;
 extern u32 kvm_emulate_mtmsr[];
 
-static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt)
+static void __init kvm_patch_ins_mtmsr(u32 *inst, u32 rt)
 {
 	u32 *p;
 	int distance_start;
@@ -266,7 +266,7 @@ extern u32 kvm_emulate_wrtee_orig_ins_offs;
 extern u32 kvm_emulate_wrtee_len;
 extern u32 kvm_emulate_wrtee[];
 
-static void kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one)
+static void __init kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one)
 {
 	u32 *p;
 	int distance_start;
@@ -323,7 +323,7 @@ extern u32 kvm_emulate_wrteei_0_branch_offs;
 extern u32 kvm_emulate_wrteei_0_len;
 extern u32 kvm_emulate_wrteei_0[];
 
-static void kvm_patch_ins_wrteei_0(u32 *inst)
+static void __init kvm_patch_ins_wrteei_0(u32 *inst)
 {
 	u32 *p;
 	int distance_start;
@@ -364,7 +364,7 @@ extern u32 kvm_emulate_mtsrin_orig_ins_offs;
 extern u32 kvm_emulate_mtsrin_len;
 extern u32 kvm_emulate_mtsrin[];
 
-static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb)
+static void __init kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb)
 {
 	u32 *p;
 	int distance_start;
@@ -400,7 +400,7 @@ static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb)
 
 #endif
 
-static void kvm_map_magic_page(void *data)
+static void __init kvm_map_magic_page(void *data)
 {
 	u32 *features = data;
 
@@ -415,7 +415,7 @@ static void kvm_map_magic_page(void *data)
 	*features = out[0];
 }
 
-static void kvm_check_ins(u32 *inst, u32 features)
+static void __init kvm_check_ins(u32 *inst, u32 features)
 {
 	u32 _inst = *inst;
 	u32 inst_no_rt = _inst & ~KVM_MASK_RT;
@@ -659,7 +659,7 @@ static void kvm_check_ins(u32 *inst, u32 features)
 extern u32 kvm_template_start[];
 extern u32 kvm_template_end[];
 
-static void kvm_use_magic_page(void)
+static void __init kvm_use_magic_page(void)
 {
 	u32 *p;
 	u32 *start, *end;

From caff52dc0b71538cff43c06ede3621fbbf359978 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 11 Sep 2019 21:57:46 +1000
Subject: [PATCH 218/264] powerpc/kvm: Add ifdefs around template code

Some of the templates used for KVM patching are only needed on certain
platforms, but currently they are always built in. Fix that.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190911115746.12433-4-mpe@ellerman.id.au
---
 arch/powerpc/kernel/kvm_emul.S | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index 9dd17dce10a1..7af6f8b50c5d 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -192,6 +192,8 @@ kvm_emulate_mtmsr_orig_ins_offs:
 kvm_emulate_mtmsr_len:
 	.long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4
 
+#ifdef CONFIG_BOOKE
+
 /* also used for wrteei 1 */
 .global kvm_emulate_wrtee
 kvm_emulate_wrtee:
@@ -285,6 +287,10 @@ kvm_emulate_wrteei_0_branch_offs:
 kvm_emulate_wrteei_0_len:
 	.long (kvm_emulate_wrteei_0_end - kvm_emulate_wrteei_0) / 4
 
+#endif /* CONFIG_BOOKE */
+
+#ifdef CONFIG_PPC_BOOK3S_32
+
 .global kvm_emulate_mtsrin
 kvm_emulate_mtsrin:
 
@@ -334,6 +340,8 @@ kvm_emulate_mtsrin_orig_ins_offs:
 kvm_emulate_mtsrin_len:
 	.long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4
 
+#endif /* CONFIG_PPC_BOOK3S_32 */
+
 	.balign 4
 	.global kvm_tmp
 kvm_tmp:

From 1fdfa4c6af0cc1854b017f308af6bece94568bb6 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 12 Sep 2019 16:40:37 +0900
Subject: [PATCH 219/264] powerpc: improve prom_init_check rule

This slightly improves the prom_init_check rule.

[1] Avoid needless check

Currently, prom_init_check.sh is invoked every time you run 'make'
even if you have changed nothing in prom_init.c. With this commit,
the script is re-run only when prom_init.o is recompiled.

[2] Beautify the build log

Currently, the O= build shows the absolute path to the script:

  CALL    /abs/path/to/source/of/linux/arch/powerpc/kernel/prom_init_check.sh

With this commit, it is always a relative path to the timestamp file:

  PROMCHK arch/powerpc/kernel/prom_init_check

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190912074037.13813-1-yamada.masahiro@socionext.com
---
 arch/powerpc/kernel/.gitignore |  1 +
 arch/powerpc/kernel/Makefile   | 14 ++++++--------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/.gitignore b/arch/powerpc/kernel/.gitignore
index c5f676c3c224..67ebd3003c05 100644
--- a/arch/powerpc/kernel/.gitignore
+++ b/arch/powerpc/kernel/.gitignore
@@ -1 +1,2 @@
+prom_init_check
 vmlinux.lds
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index c6ae0e7914bc..66c54443187d 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -188,15 +188,13 @@ extra-$(CONFIG_ALTIVEC)		+= vector.o
 extra-$(CONFIG_PPC64)		+= entry_64.o
 extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE)	+= prom_init.o
 
-ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE
-$(obj)/built-in.a:		prom_init_check
+extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE)	+= prom_init_check
 
-quiet_cmd_prom_init_check = CALL    $<
-      cmd_prom_init_check = $(CONFIG_SHELL) $< "$(NM)" "$(obj)/prom_init.o"
+quiet_cmd_prom_init_check = PROMCHK $@
+      cmd_prom_init_check = $(CONFIG_SHELL) $< "$(NM)" $(obj)/prom_init.o; touch $@
 
-PHONY += prom_init_check
-prom_init_check: $(src)/prom_init_check.sh $(obj)/prom_init.o
-	$(call cmd,prom_init_check)
-endif
+$(obj)/prom_init_check: $(src)/prom_init_check.sh $(obj)/prom_init.o FORCE
+	$(call if_changed,prom_init_check)
+targets += prom_init_check
 
 clean-files := vmlinux.lds

From ca986d7fa7e7f7b3f018f227b999f35e654fbb79 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:16:21 +0530
Subject: [PATCH 220/264] powerpc/fadump: move internal macros/definitions to a
 new header

Though asm/fadump.h is meant to be used by other components dealing
with FADump, it also has macros/definitions internal to FADump code.
Move them to a new header file used within FADump code. This also
makes way for refactoring platform-specific FADump code.
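
For reference, a stand-alone recomputation of the policy encoded by the
moved MIN_BOOT_MEM macro (illustrative userspace code, not the kernel's):

  #include <stdio.h>

  int main(void)
  {
  	unsigned long rma_end = 128UL << 20;	/* e.g. a 128MB RMO system */
  	unsigned long min_boot_mem =
  		(rma_end < (1UL << 28) ? (1UL << 28) : rma_end) + (1UL << 26);

  	/* max(RMA_END, 256MB) + 64MB => 320 MB here */
  	printf("MIN_BOOT_MEM = %lu MB\n", min_boot_mem >> 20);
  	return 0;
  }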

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821313134.5656.6597770626574392140.stgit@hbathini.in.ibm.com
---
 arch/powerpc/include/asm/fadump-internal.h | 86 ++++++++++++++++++++++
 arch/powerpc/include/asm/fadump.h          | 71 ------------------
 arch/powerpc/kernel/fadump.c               |  1 +
 3 files changed, 87 insertions(+), 71 deletions(-)
 create mode 100644 arch/powerpc/include/asm/fadump-internal.h

diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h
new file mode 100644
index 000000000000..071f377d352f
--- /dev/null
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Firmware-Assisted Dump internal code.
+ *
+ * Copyright 2011, Mahesh Salgaonkar, IBM Corporation.
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#ifndef _ASM_POWERPC_FADUMP_INTERNAL_H
+#define _ASM_POWERPC_FADUMP_INTERNAL_H
+
+/*
+ * The RMA region will be saved for later dumping when the kernel crashes.
+ * RMA is the Real Mode Area, the first block of logical memory owned by
+ * the logical partition, containing the storage that may be accessed with
+ * translate off.
+ */
+#define RMA_START	0x0
+#define RMA_END		(ppc64_rma_size)
+
+/*
+ * On some Power systems where the RMO is 128MB, a minimum of 256MB is
+ * still required for the kernel to boot successfully. When the kdump
+ * infrastructure is configured to save the vmcore over the network, we
+ * run into OOM issues while loading modules related to network setup.
+ * Hence we need an additional 64MB of memory to avoid OOM issues.
+ */
+#define MIN_BOOT_MEM	(((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \
+			+ (0x1UL << 26))
+
+/* The upper limit percentage for user specified boot memory size (25%) */
+#define MAX_BOOT_MEM_RATIO			4
+
+#define memblock_num_regions(memblock_type)	(memblock.memblock_type.cnt)
+
+/* Alignment per CMA requirement. */
+#define FADUMP_CMA_ALIGNMENT	(PAGE_SIZE <<				\
+				 max_t(unsigned long, MAX_ORDER - 1,	\
+				 pageblock_order))
+
+/* FAD commands */
+#define FADUMP_REGISTER			1
+#define FADUMP_UNREGISTER		2
+#define FADUMP_INVALIDATE		3
+
+#define FADUMP_CRASH_INFO_MAGIC		str_to_u64("FADMPINF")
+
+/* fadump crash info structure */
+struct fadump_crash_info_header {
+	u64		magic_number;
+	u64		elfcorehdr_addr;
+	u32		crashing_cpu;
+	struct pt_regs	regs;
+	struct cpumask	online_mask;
+};
+
+struct fad_crash_memory_ranges {
+	unsigned long long	base;
+	unsigned long long	size;
+};
+
+/* Firmware-assisted dump configuration details. */
+struct fw_dump {
+	unsigned long	reserve_dump_area_start;
+	unsigned long	reserve_dump_area_size;
+	/* cmd line option during boot */
+	unsigned long	reserve_bootvar;
+
+	unsigned long	cpu_state_data_size;
+	unsigned long	hpte_region_size;
+	unsigned long	boot_memory_size;
+
+	unsigned long	fadumphdr_addr;
+	unsigned long	cpu_notes_buf;
+	unsigned long	cpu_notes_buf_size;
+
+	int		ibm_configure_kernel_dump;
+
+	unsigned long	fadump_enabled:1;
+	unsigned long	fadump_supported:1;
+	unsigned long	dump_active:1;
+	unsigned long	dump_registered:1;
+	unsigned long	nocma:1;
+};
+
+#endif /* _ASM_POWERPC_FADUMP_INTERNAL_H */
diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
index 17d9b6acaf63..751794978427 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -11,34 +11,6 @@
 
 #ifdef CONFIG_FA_DUMP
 
-/*
- * The RMA region will be saved for later dumping when kernel crashes.
- * RMA is Real Mode Area, the first block of logical memory address owned
- * by logical partition, containing the storage that may be accessed with
- * translate off.
- */
-#define RMA_START	0x0
-#define RMA_END		(ppc64_rma_size)
-
-/*
- * On some Power systems where RMO is 128MB, it still requires minimum of
- * 256MB for kernel to boot successfully. When kdump infrastructure is
- * configured to save vmcore over network, we run into OOM issue while
- * loading modules related to network setup. Hence we need aditional 64M
- * of memory to avoid OOM issue.
- */
-#define MIN_BOOT_MEM	(((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \
-			+ (0x1UL << 26))
-
-/* The upper limit percentage for user specified boot memory size (25%) */
-#define MAX_BOOT_MEM_RATIO			4
-
-#define memblock_num_regions(memblock_type)	(memblock.memblock_type.cnt)
-
-/* Alignement per CMA requirement. */
-#define FADUMP_CMA_ALIGNMENT	(PAGE_SIZE <<				\
-			max_t(unsigned long, MAX_ORDER - 1, pageblock_order))
-
 /* Firmware provided dump sections */
 #define FADUMP_CPU_STATE_DATA	0x0001
 #define FADUMP_HPTE_REGION	0x0002
@@ -47,11 +19,6 @@
 /* Dump request flag */
 #define FADUMP_REQUEST_FLAG	0x00000001
 
-/* FAD commands */
-#define FADUMP_REGISTER		1
-#define FADUMP_UNREGISTER	2
-#define FADUMP_INVALIDATE	3
-
 /* Dump status flag */
 #define FADUMP_ERROR_FLAG	0x2000
 
@@ -112,29 +79,6 @@ struct fadump_mem_struct {
 	struct fadump_section		rmr_region;
 };
 
-/* Firmware-assisted dump configuration details. */
-struct fw_dump {
-	unsigned long	cpu_state_data_size;
-	unsigned long	hpte_region_size;
-	unsigned long	boot_memory_size;
-	unsigned long	reserve_dump_area_start;
-	unsigned long	reserve_dump_area_size;
-	/* cmd line option during boot */
-	unsigned long	reserve_bootvar;
-
-	unsigned long	fadumphdr_addr;
-	unsigned long	cpu_notes_buf;
-	unsigned long	cpu_notes_buf_size;
-
-	int		ibm_configure_kernel_dump;
-
-	unsigned long	fadump_enabled:1;
-	unsigned long	fadump_supported:1;
-	unsigned long	dump_active:1;
-	unsigned long	dump_registered:1;
-	unsigned long	nocma:1;
-};
-
 /*
  * Copy the ascii values for first 8 characters from a string into u64
  * variable at their respective indexes.
@@ -153,7 +97,6 @@ static inline u64 str_to_u64(const char *str)
 #define STR_TO_HEX(x)	str_to_u64(x)
 #define REG_ID(x)	str_to_u64(x)
 
-#define FADUMP_CRASH_INFO_MAGIC		STR_TO_HEX("FADMPINF")
 #define REGSAVE_AREA_MAGIC		STR_TO_HEX("REGSAVE")
 
 /* The firmware-assisted dump format.
@@ -178,20 +121,6 @@ struct fadump_reg_entry {
 	__be64		reg_value;
 };
 
-/* fadump crash info structure */
-struct fadump_crash_info_header {
-	u64		magic_number;
-	u64		elfcorehdr_addr;
-	u32		crashing_cpu;
-	struct pt_regs	regs;
-	struct cpumask	online_mask;
-};
-
-struct fad_crash_memory_ranges {
-	unsigned long long	base;
-	unsigned long long	size;
-};
-
 extern int is_fadump_memory_area(u64 addr, ulong size);
 extern int early_init_dt_scan_fw_dump(unsigned long node,
 		const char *uname, int depth, void *data);
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 4eab97292cc2..7c55044cf9d4 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -30,6 +30,7 @@
 #include <asm/prom.h>
 #include <asm/rtas.h>
 #include <asm/fadump.h>
+#include <asm/fadump-internal.h>
 #include <asm/setup.h>
 
 static struct fw_dump fw_dump;

From 961cf26a98648a294de45ea6f806dc84dfc91197 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:16:36 +0530
Subject: [PATCH 221/264] powerpc/fadump: add helper functions

Add helper functions to setup & free CPU notes buffer and to find if a
given memory area is contiguous. Also, use boolean as return type for
the function that finds if boot memory area is contiguous. While at
it, save the virtual address of CPU notes buffer instead of physical
address as virtual address is used often.
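
The contiguity check itself is a walk over the (sorted) memblock
regions; a stand-alone illustrative version (types and names invented,
not the kernel code):

  #include <stdbool.h>
  #include <stdint.h>

  struct region { uint64_t base, size; };

  /* true if [d_start, d_end) is covered by the regions with no holes */
  static bool area_contiguous(const struct region *r, int n,
  			      uint64_t d_start, uint64_t d_end)
  {
  	int i;

  	for (i = 0; i < n; i++) {
  		uint64_t start = r[i].base > d_start ? r[i].base : d_start;
  		uint64_t end = r[i].base + r[i].size < d_end ?
  			       r[i].base + r[i].size : d_end;

  		if (d_start < end) {
  			if (start > d_start)
  				return false;	/* hole before this region */
  			if (end == d_end)
  				return true;	/* covered through the end */
  			d_start = end;		/* continue past this region */
  		}
  	}
  	return false;
  }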

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821318971.5656.9281936950510635858.stgit@hbathini.in.ibm.com
---
 arch/powerpc/include/asm/fadump-internal.h |   2 +-
 arch/powerpc/kernel/fadump.c               | 127 +++++++++++----------
 2 files changed, 67 insertions(+), 62 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h
index 071f377d352f..16e2781a3e48 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -71,7 +71,7 @@ struct fw_dump {
 	unsigned long	boot_memory_size;
 
 	unsigned long	fadumphdr_addr;
-	unsigned long	cpu_notes_buf;
+	unsigned long	cpu_notes_buf_vaddr;
 	unsigned long	cpu_notes_buf_size;
 
 	int		ibm_configure_kernel_dump;
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 7c55044cf9d4..e2b83a991303 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -201,64 +201,55 @@ int is_fadump_active(void)
 }
 
 /*
- * Returns 1, if there are no holes in boot memory area,
- * 0 otherwise.
+ * Returns true, if there are no holes in memory area between d_start and d_end,
+ * false otherwise.
  */
-static int is_boot_memory_area_contiguous(void)
+static bool is_fadump_mem_area_contiguous(u64 d_start, u64 d_end)
 {
 	struct memblock_region *reg;
-	unsigned long tstart, tend;
-	unsigned long start_pfn = PHYS_PFN(RMA_START);
-	unsigned long end_pfn = PHYS_PFN(RMA_START + fw_dump.boot_memory_size);
-	unsigned int ret = 0;
+	bool ret = false;
+	u64 start, end;
 
 	for_each_memblock(memory, reg) {
-		tstart = max(start_pfn, memblock_region_memory_base_pfn(reg));
-		tend = min(end_pfn, memblock_region_memory_end_pfn(reg));
-		if (tstart < tend) {
-			/* Memory hole from start_pfn to tstart */
-			if (tstart > start_pfn)
+		start = max_t(u64, d_start, reg->base);
+		end = min_t(u64, d_end, (reg->base + reg->size));
+		if (d_start < end) {
+			/* Memory hole from d_start to start */
+			if (start > d_start)
 				break;
 
-			if (tend == end_pfn) {
-				ret = 1;
+			if (end == d_end) {
+				ret = true;
 				break;
 			}
 
-			start_pfn = tend + 1;
+			d_start = end + 1;
 		}
 	}
 
 	return ret;
 }
 
+/*
+ * Returns true, if there are no holes in boot memory area,
+ * false otherwise.
+ */
+static bool is_boot_memory_area_contiguous(void)
+{
+	return is_fadump_mem_area_contiguous(0, fw_dump.boot_memory_size);
+}
+
 /*
  * Returns true, if there are no holes in reserved memory area,
  * false otherwise.
  */
 static bool is_reserved_memory_area_contiguous(void)
 {
-	struct memblock_region *reg;
-	unsigned long start, end;
-	unsigned long d_start = fw_dump.reserve_dump_area_start;
-	unsigned long d_end = d_start + fw_dump.reserve_dump_area_size;
+	u64 d_start, d_end;
 
-	for_each_memblock(memory, reg) {
-		start = max(d_start, (unsigned long)reg->base);
-		end = min(d_end, (unsigned long)(reg->base + reg->size));
-		if (d_start < end) {
-			/* Memory hole from d_start to start */
-			if (start > d_start)
-				break;
-
-			if (end == d_end)
-				return true;
-
-			d_start = end + 1;
-		}
-	}
-
-	return false;
+	d_start	= fw_dump.reserve_dump_area_start;
+	d_end	= d_start + fw_dump.reserve_dump_area_size;
+	return is_fadump_mem_area_contiguous(d_start, d_end);
 }
 
 /* Print firmware assisted dump configurations for debugging purpose. */
@@ -785,7 +776,7 @@ static void fadump_update_elfcore_header(char *bufp)
 	phdr = (struct elf_phdr *)bufp;
 
 	if (phdr->p_type == PT_NOTE) {
-		phdr->p_paddr = fw_dump.cpu_notes_buf;
+		phdr->p_paddr	= __pa(fw_dump.cpu_notes_buf_vaddr);
 		phdr->p_offset	= phdr->p_paddr;
 		phdr->p_filesz	= fw_dump.cpu_notes_buf_size;
 		phdr->p_memsz = fw_dump.cpu_notes_buf_size;
@@ -793,7 +784,7 @@ static void fadump_update_elfcore_header(char *bufp)
 	return;
 }
 
-static void *fadump_cpu_notes_buf_alloc(unsigned long size)
+static void *fadump_alloc_buffer(unsigned long size)
 {
 	void *vaddr;
 	struct page *page;
@@ -811,7 +802,7 @@ static void *fadump_cpu_notes_buf_alloc(unsigned long size)
 	return vaddr;
 }
 
-static void fadump_cpu_notes_buf_free(unsigned long vaddr, unsigned long size)
+static void fadump_free_buffer(unsigned long vaddr, unsigned long size)
 {
 	struct page *page;
 	unsigned long order, count, i;
@@ -824,6 +815,36 @@ static void fadump_cpu_notes_buf_free(unsigned long vaddr, unsigned long size)
 	__free_pages(page, order);
 }
 
+static s32 fadump_setup_cpu_notes_buf(u32 num_cpus)
+{
+	/* Allocate buffer to hold cpu crash notes. */
+	fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t);
+	fw_dump.cpu_notes_buf_size = PAGE_ALIGN(fw_dump.cpu_notes_buf_size);
+	fw_dump.cpu_notes_buf_vaddr =
+		(unsigned long)fadump_alloc_buffer(fw_dump.cpu_notes_buf_size);
+	if (!fw_dump.cpu_notes_buf_vaddr) {
+		pr_err("Failed to allocate %ld bytes for CPU notes buffer\n",
+		       fw_dump.cpu_notes_buf_size);
+		return -ENOMEM;
+	}
+
+	pr_debug("Allocated buffer for cpu notes of size %ld at 0x%lx\n",
+		 fw_dump.cpu_notes_buf_size,
+		 fw_dump.cpu_notes_buf_vaddr);
+	return 0;
+}
+
+static void fadump_free_cpu_notes_buf(void)
+{
+	if (!fw_dump.cpu_notes_buf_vaddr)
+		return;
+
+	fadump_free_buffer(fw_dump.cpu_notes_buf_vaddr,
+			   fw_dump.cpu_notes_buf_size);
+	fw_dump.cpu_notes_buf_vaddr = 0;
+	fw_dump.cpu_notes_buf_size = 0;
+}
+
 /*
  * Read CPU state dump data and convert it into ELF notes.
  * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be
@@ -870,19 +891,11 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm)
 	vaddr += sizeof(u32);
 	reg_entry = (struct fadump_reg_entry *)vaddr;
 
-	/* Allocate buffer to hold cpu crash notes. */
-	fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t);
-	fw_dump.cpu_notes_buf_size = PAGE_ALIGN(fw_dump.cpu_notes_buf_size);
-	note_buf = fadump_cpu_notes_buf_alloc(fw_dump.cpu_notes_buf_size);
-	if (!note_buf) {
-		printk(KERN_ERR "Failed to allocate 0x%lx bytes for "
-			"cpu notes buffer\n", fw_dump.cpu_notes_buf_size);
-		return -ENOMEM;
-	}
-	fw_dump.cpu_notes_buf = __pa(note_buf);
+	rc = fadump_setup_cpu_notes_buf(num_cpus);
+	if (rc != 0)
+		return rc;
 
-	pr_debug("Allocated buffer for cpu notes of size %ld at %p\n",
-			(num_cpus * sizeof(note_buf_t)), note_buf);
+	note_buf = (u32 *)fw_dump.cpu_notes_buf_vaddr;
 
 	if (fw_dump.fadumphdr_addr)
 		fdh = __va(fw_dump.fadumphdr_addr);
@@ -920,10 +933,7 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm)
 	return 0;
 
 error_out:
-	fadump_cpu_notes_buf_free((unsigned long)__va(fw_dump.cpu_notes_buf),
-					fw_dump.cpu_notes_buf_size);
-	fw_dump.cpu_notes_buf = 0;
-	fw_dump.cpu_notes_buf_size = 0;
+	fadump_free_cpu_notes_buf();
 	return rc;
 
 }
@@ -1470,13 +1480,8 @@ static void fadump_invalidate_release_mem(void)
 	fw_dump.reserve_dump_area_size = get_fadump_area_size();
 
 	fadump_release_memory(reserved_area_start, reserved_area_end);
-	if (fw_dump.cpu_notes_buf) {
-		fadump_cpu_notes_buf_free(
-				(unsigned long)__va(fw_dump.cpu_notes_buf),
-				fw_dump.cpu_notes_buf_size);
-		fw_dump.cpu_notes_buf = 0;
-		fw_dump.cpu_notes_buf_size = 0;
-	}
+	fadump_free_cpu_notes_buf();
+
 	/* Initialize the kernel dump memory structure for FAD registration. */
 	init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);
 }

From 7f0ad11d3fb948a0d7770bd38ae17a51413c3dac Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:16:52 +0530
Subject: [PATCH 222/264] powerpc/fadump: declare helper functions in internal
 header file

Declare the helper functions that can be reused by multiple platforms
in the internal header file.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821320487.5656.2660730464212209984.stgit@hbathini.in.ibm.com
---
 arch/powerpc/include/asm/fadump-internal.h |  8 ++++++++
 arch/powerpc/kernel/fadump.c               | 16 ++++++++--------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h
index 16e2781a3e48..e004c89570b0 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -83,4 +83,12 @@ struct fw_dump {
 	unsigned long	nocma:1;
 };
 
+/* Helper functions */
+s32 fadump_setup_cpu_notes_buf(u32 num_cpus);
+void fadump_free_cpu_notes_buf(void);
+u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs);
+void fadump_update_elfcore_header(char *bufp);
+bool is_fadump_boot_mem_contiguous(void);
+bool is_fadump_reserved_mem_contiguous(void);
+
 #endif /* _ASM_POWERPC_FADUMP_INTERNAL_H */
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index e2b83a991303..eb0745e418db 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -234,7 +234,7 @@ static bool is_fadump_mem_area_contiguous(u64 d_start, u64 d_end)
  * Returns true, if there are no holes in boot memory area,
  * false otherwise.
  */
-static bool is_boot_memory_area_contiguous(void)
+bool is_fadump_boot_mem_contiguous(void)
 {
 	return is_fadump_mem_area_contiguous(0, fw_dump.boot_memory_size);
 }
@@ -243,7 +243,7 @@ static bool is_boot_memory_area_contiguous(void)
  * Returns true, if there are no holes in reserved memory area,
  * false otherwise.
  */
-static bool is_reserved_memory_area_contiguous(void)
+bool is_fadump_reserved_mem_contiguous(void)
 {
 	u64 d_start, d_end;
 
@@ -617,9 +617,9 @@ static int register_fw_dump(struct fadump_mem_struct *fdm)
 			" dump. Hardware Error(%d).\n", rc);
 		break;
 	case -3:
-		if (!is_boot_memory_area_contiguous())
+		if (!is_fadump_boot_mem_contiguous())
 			pr_err("Can't have holes in boot memory area while registering fadump\n");
-		else if (!is_reserved_memory_area_contiguous())
+		else if (!is_fadump_reserved_mem_contiguous())
 			pr_err("Can't have holes in reserved memory area while"
 			       " registering fadump\n");
 
@@ -749,7 +749,7 @@ fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs)
 	return reg_entry;
 }
 
-static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs)
+u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs)
 {
 	struct elf_prstatus prstatus;
 
@@ -764,7 +764,7 @@ static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs)
 	return buf;
 }
 
-static void fadump_update_elfcore_header(char *bufp)
+void fadump_update_elfcore_header(char *bufp)
 {
 	struct elfhdr *elf;
 	struct elf_phdr *phdr;
@@ -815,7 +815,7 @@ static void fadump_free_buffer(unsigned long vaddr, unsigned long size)
 	__free_pages(page, order);
 }
 
-static s32 fadump_setup_cpu_notes_buf(u32 num_cpus)
+s32 fadump_setup_cpu_notes_buf(u32 num_cpus)
 {
 	/* Allocate buffer to hold cpu crash notes. */
 	fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t);
@@ -834,7 +834,7 @@ static s32 fadump_setup_cpu_notes_buf(u32 num_cpus)
 	return 0;
 }
 
-static void fadump_free_cpu_notes_buf(void)
+void fadump_free_cpu_notes_buf(void)
 {
 	if (!fw_dump.cpu_notes_buf_vaddr)
 		return;

From 8468d155450c89adbb400cdae42bd4095f3045a9 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:17:07 +0530
Subject: [PATCH 223/264] powerpc/fadump: Improve fadump documentation

The figures depicting the FADump (Firmware-Assisted Dump) memory layout
are missing some finer details, like the different memory regions and
what they represent. Improve the documentation by updating those details.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821322070.5656.8194734198500730487.stgit@hbathini.in.ibm.com
---
 .../powerpc/firmware-assisted-dump.rst        | 61 ++++++++++---------
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/Documentation/powerpc/firmware-assisted-dump.rst b/Documentation/powerpc/firmware-assisted-dump.rst
index 9ca12830a48e..563c021df539 100644
--- a/Documentation/powerpc/firmware-assisted-dump.rst
+++ b/Documentation/powerpc/firmware-assisted-dump.rst
@@ -76,8 +76,9 @@ as follows:
    there is crash data available from a previous boot. During
    the early boot OS will reserve rest of the memory above
    boot memory size effectively booting with restricted memory
-   size. This will make sure that the second kernel will not
-   touch any of the dump memory area.
+   size. This will make sure that this kernel (also referred
+   to as the second kernel or capture kernel) will not touch any
+   of the dump memory area.
 
 -  User-space tools will read /proc/vmcore to obtain the contents
    of memory, which holds the previous crashed kernel dump in ELF
@@ -128,42 +129,46 @@ space memory except the user pages that were present in CMA region::
 
   o Memory Reservation during first kernel
 
-  Low memory                                         Top of memory
-  0      boot memory size                                       |
-  |           |                |<--Reserved dump area -->|      |
-  V           V                |   Permanent Reservation |      V
-  +-----------+----------/ /---+---+----+-----------+----+------+
-  |           |                |CPU|HPTE|  DUMP     |ELF |      |
-  +-----------+----------/ /---+---+----+-----------+----+------+
-        |                                           ^
-        |                                           |
-        \                                           /
-         -------------------------------------------
-          Boot memory content gets transferred to
-          reserved area by firmware at the time of
-          crash
+  Low memory                                                Top of memory
+  0      boot memory size      |<--Reserved dump area --->|      |
+  |           |                | (Permanent Reservation)  |      |
+  V           V                |                          |      V
+  +-----------+----------/ /---+---+----+--------+---+----+------+
+  |           |                |CPU|HPTE|  DUMP  |HDR|ELF |      |
+  +-----------+----------/ /---+---+----+--------+---+----+------+
+        |                                   ^      ^
+        |                                   |      |
+        \                                   /      |
+         -----------------------------------     FADump Header
+          Boot memory content gets transferred   (meta area)
+          to reserved area by firmware at the
+          time of crash
+
                    Fig. 1
 
+
   o Memory Reservation during second kernel after crash
 
-  Low memory                                        Top of memory
-  0      boot memory size                                       |
-  |           |<------------- Reserved dump area ----------- -->|
-  V           V                                                 V
-  +-----------+----------/ /---+---+----+-----------+----+------+
-  |           |                |CPU|HPTE|  DUMP     |ELF |      |
-  +-----------+----------/ /---+---+----+-----------+----+------+
+  Low memory                                                Top of memory
+  0      boot memory size                                        |
+  |           |<----------- Crash preserved area --------------->|
+  V           V                |<-- Reserved dump area -->|      V
+  +-----------+----------/ /---+---+----+--------+---+----+------+
+  |           |                |CPU|HPTE|  DUMP  |HDR|ELF |      |
+  +-----------+----------/ /---+---+----+--------+---+----+------+
         |                                              |
         V                                              V
    Used by second                                /proc/vmcore
    kernel to boot
                    Fig. 2
 
-Currently the dump will be copied from /proc/vmcore to a
-a new file upon user intervention. The dump data available through
-/proc/vmcore will be in ELF format. Hence the existing kdump
-infrastructure (kdump scripts) to save the dump works fine with
-minor modifications.
+Currently the dump will be copied from /proc/vmcore to a new file upon
+user intervention. The dump data available through /proc/vmcore will be
+in ELF format. Hence the existing kdump infrastructure (kdump scripts)
+to save the dump works fine with minor modifications. KDump scripts on
+major Distro releases have already been modified to work seamlessly (no
+user intervention in saving the dump) when FADump is used, instead of
+KDump, as the dump mechanism.
 
 The tools to examine the dump will be same as the ones
 used for kdump.

From 72aa651795f0e9f48bfdb2b2dd0b3e6900351d2a Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:17:56 +0530
Subject: [PATCH 224/264] powerpc/fadump: use helper functions to
 reserve/release cpu notes buffer

Use helper functions to simplify allocating, pinning down and freeing
the memory used for the CPU notes buffer.
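
The allocation switch also avoids power-of-two rounding:
alloc_pages_exact() returns only as many pages as the size needs, where
__get_free_pages() with get_order() rounds up. An illustrative userspace
demo of the difference (not kernel code):

  #include <stdio.h>

  #define PAGE_SIZE 4096UL

  static unsigned int get_order(unsigned long size)
  {
  	unsigned long pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
  	unsigned int order = 0;

  	while ((1UL << order) < pages)
  		order++;
  	return order;
  }

  int main(void)
  {
  	unsigned long size = 18 * PAGE_SIZE;	/* e.g. 18 pages of CPU notes */

  	printf("requested:   %lu KB\n", size >> 10);		/* 72 KB */
  	printf("order alloc: %lu KB\n",
  	       (PAGE_SIZE << get_order(size)) >> 10);		/* 128 KB */
  	printf("exact alloc: %lu KB\n", size >> 10);		/* 72 KB */
  	return 0;
  }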

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821323555.5656.2486038022572739622.stgit@hbathini.in.ibm.com
---
 arch/powerpc/kernel/fadump.c | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index eb0745e418db..994fc09e9cbf 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -786,33 +786,24 @@ void fadump_update_elfcore_header(char *bufp)
 
 static void *fadump_alloc_buffer(unsigned long size)
 {
-	void *vaddr;
+	unsigned long count, i;
 	struct page *page;
-	unsigned long order, count, i;
+	void *vaddr;
 
-	order = get_order(size);
-	vaddr = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
+	vaddr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
 	if (!vaddr)
 		return NULL;
 
-	count = 1 << order;
+	count = PAGE_ALIGN(size) / PAGE_SIZE;
 	page = virt_to_page(vaddr);
 	for (i = 0; i < count; i++)
-		SetPageReserved(page + i);
+		mark_page_reserved(page + i);
 	return vaddr;
 }
 
 static void fadump_free_buffer(unsigned long vaddr, unsigned long size)
 {
-	struct page *page;
-	unsigned long order, count, i;
-
-	order = get_order(size);
-	count = 1 << order;
-	page = virt_to_page(vaddr);
-	for (i = 0; i < count; i++)
-		ClearPageReserved(page + i);
-	__free_pages(page, order);
+	free_reserved_area((void *)vaddr, (void *)(vaddr + size), -1, NULL);
 }
 
 s32 fadump_setup_cpu_notes_buf(u32 num_cpus)

From 0226e55275e569126882a7befe0b1a1c9bd270aa Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:18:14 +0530
Subject: [PATCH 225/264] powerpc/fadump: move rtas specific definitions to
 platform code

Currently, FADump is only supported on pSeries, but that is going to
change soon with FADump support being added on the PowerNV platform.
So, move RTAS-specific definitions to platform code to allow FADump
to support multiple platforms.
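
For reference, the 8-character magic packing used here can be checked
stand-alone (illustrative userspace code mirroring fadump_str_to_u64()):

  #include <assert.h>
  #include <stdint.h>

  static uint64_t str_to_u64(const char *str)
  {
  	uint64_t val = 0;
  	int i;

  	for (i = 0; i < 8; i++)
  		val = (*str) ? (val << 8) | (uint8_t)*str++ : val << 8;
  	return val;
  }

  int main(void)
  {
  	/* "FADMPINF" => 'F','A','D','M','P','I','N','F' big-endian bytes */
  	assert(str_to_u64("FADMPINF") == 0x4641444d50494e46ULL);
  	return 0;
  }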

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821328494.5656.16219929140866195511.stgit@hbathini.in.ibm.com
---
 arch/powerpc/include/asm/fadump-internal.h   |  20 +++-
 arch/powerpc/include/asm/fadump.h            | 116 +------------------
 arch/powerpc/kernel/fadump.c                 |  96 +++++++--------
 arch/powerpc/platforms/pseries/rtas-fadump.h | 100 ++++++++++++++++
 4 files changed, 172 insertions(+), 160 deletions(-)
 create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.h

diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h
index e004c89570b0..f8097510e03f 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -43,7 +43,25 @@
 #define FADUMP_UNREGISTER		2
 #define FADUMP_INVALIDATE		3
 
-#define FADUMP_CRASH_INFO_MAGIC		str_to_u64("FADMPINF")
+/*
+ * Copy the ascii values for first 8 characters from a string into u64
+ * variable at their respective indexes.
+ * e.g.
+ *  The string "FADMPINF" will be converted into 0x4641444d50494e46
+ */
+static inline u64 fadump_str_to_u64(const char *str)
+{
+	u64 val = 0;
+	int i;
+
+	for (i = 0; i < sizeof(val); i++)
+		val = (*str) ? (val << 8) | *str++ : val << 8;
+	return val;
+}
+
+#define FADUMP_CPU_UNKNOWN		(~((u32)0))
+
+#define FADUMP_CRASH_INFO_MAGIC		fadump_str_to_u64("FADMPINF")
 
 /* fadump crash info structure */
 struct fadump_crash_info_header {
diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
index 751794978427..fd5002b4c18c 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -6,121 +6,13 @@
  * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
  */
 
-#ifndef __PPC64_FA_DUMP_H__
-#define __PPC64_FA_DUMP_H__
+#ifndef _ASM_POWERPC_FADUMP_H
+#define _ASM_POWERPC_FADUMP_H
 
 #ifdef CONFIG_FA_DUMP
 
-/* Firmware provided dump sections */
-#define FADUMP_CPU_STATE_DATA	0x0001
-#define FADUMP_HPTE_REGION	0x0002
-#define FADUMP_REAL_MODE_REGION	0x0011
-
-/* Dump request flag */
-#define FADUMP_REQUEST_FLAG	0x00000001
-
-/* Dump status flag */
-#define FADUMP_ERROR_FLAG	0x2000
-
-#define FADUMP_CPU_ID_MASK	((1UL << 32) - 1)
-
-#define CPU_UNKNOWN		(~((u32)0))
-
-/* Utility macros */
-#define SKIP_TO_NEXT_CPU(reg_entry)					\
-({									\
-	while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND"))	\
-		reg_entry++;						\
-	reg_entry++;							\
-})
-
 extern int crashing_cpu;
 
-/* Kernel Dump section info */
-struct fadump_section {
-	__be32	request_flag;
-	__be16	source_data_type;
-	__be16	error_flags;
-	__be64	source_address;
-	__be64	source_len;
-	__be64	bytes_dumped;
-	__be64	destination_address;
-};
-
-/* ibm,configure-kernel-dump header. */
-struct fadump_section_header {
-	__be32	dump_format_version;
-	__be16	dump_num_sections;
-	__be16	dump_status_flag;
-	__be32	offset_first_dump_section;
-
-	/* Fields for disk dump option. */
-	__be32	dd_block_size;
-	__be64	dd_block_offset;
-	__be64	dd_num_blocks;
-	__be32	dd_offset_disk_path;
-
-	/* Maximum time allowed to prevent an automatic dump-reboot. */
-	__be32	max_time_auto;
-};
-
-/*
- * Firmware Assisted dump memory structure. This structure is required for
- * registering future kernel dump with power firmware through rtas call.
- *
- * No disk dump option. Hence disk dump path string section is not included.
- */
-struct fadump_mem_struct {
-	struct fadump_section_header	header;
-
-	/* Kernel dump sections */
-	struct fadump_section		cpu_state_data;
-	struct fadump_section		hpte_region;
-	struct fadump_section		rmr_region;
-};
-
-/*
- * Copy the ascii values for first 8 characters from a string into u64
- * variable at their respective indexes.
- * e.g.
- *  The string "FADMPINF" will be converted into 0x4641444d50494e46
- */
-static inline u64 str_to_u64(const char *str)
-{
-	u64 val = 0;
-	int i;
-
-	for (i = 0; i < sizeof(val); i++)
-		val = (*str) ? (val << 8) | *str++ : val << 8;
-	return val;
-}
-#define STR_TO_HEX(x)	str_to_u64(x)
-#define REG_ID(x)	str_to_u64(x)
-
-#define REGSAVE_AREA_MAGIC		STR_TO_HEX("REGSAVE")
-
-/* The firmware-assisted dump format.
- *
- * The register save area is an area in the partition's memory used to preserve
- * the register contents (CPU state data) for the active CPUs during a firmware
- * assisted dump. The dump format contains register save area header followed
- * by register entries. Each list of registers for a CPU starts with
- * "CPUSTRT" and ends with "CPUEND".
- */
-
-/* Register save area header. */
-struct fadump_reg_save_area_header {
-	__be64		magic_number;
-	__be32		version;
-	__be32		num_cpu_offset;
-};
-
-/* Register entry. */
-struct fadump_reg_entry {
-	__be64		reg_id;
-	__be64		reg_value;
-};
-
 extern int is_fadump_memory_area(u64 addr, ulong size);
 extern int early_init_dt_scan_fw_dump(unsigned long node,
 		const char *uname, int depth, void *data);
@@ -136,5 +28,5 @@ static inline int is_fadump_active(void) { return 0; }
 static inline int should_fadump_crash(void) { return 0; }
 static inline void crash_fadump(struct pt_regs *regs, const char *str) { }
 static inline void fadump_cleanup(void) { }
-#endif
-#endif
+#endif /* !CONFIG_FA_DUMP */
+#endif /* _ASM_POWERPC_FADUMP_H */
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 994fc09e9cbf..03f2708cd954 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -33,12 +33,11 @@
 #include <asm/fadump-internal.h>
 #include <asm/setup.h>
 
+#include "../platforms/pseries/rtas-fadump.h"
+
 static struct fw_dump fw_dump;
-static struct fadump_mem_struct fdm;
-static const struct fadump_mem_struct *fdm_active;
-#ifdef CONFIG_CMA
-static struct cma *fadump_cma;
-#endif
+static struct rtas_fadump_mem_struct fdm;
+static const struct rtas_fadump_mem_struct *fdm_active;
 
 static DEFINE_MUTEX(fadump_mutex);
 struct fad_crash_memory_ranges *crash_memory_ranges;
@@ -47,6 +46,8 @@ int crash_mem_ranges;
 int max_crash_mem_ranges;
 
 #ifdef CONFIG_CMA
+static struct cma *fadump_cma;
+
 /*
  * fadump_cma_init() - Initialize CMA area from a fadump reserved memory
  *
@@ -156,11 +157,11 @@ int __init early_init_dt_scan_fw_dump(unsigned long node,
 		u32 type = (u32)of_read_number(sections, 1);
 
 		switch (type) {
-		case FADUMP_CPU_STATE_DATA:
+		case RTAS_FADUMP_CPU_STATE_DATA:
 			fw_dump.cpu_state_data_size =
 					of_read_ulong(&sections[1], 2);
 			break;
-		case FADUMP_HPTE_REGION:
+		case RTAS_FADUMP_HPTE_REGION:
 			fw_dump.hpte_region_size =
 					of_read_ulong(&sections[1], 2);
 			break;
@@ -271,20 +272,20 @@ static void fadump_show_config(void)
 	pr_debug("Boot memory size  : %lx\n", fw_dump.boot_memory_size);
 }
 
-static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm,
+static unsigned long init_fadump_mem_struct(struct rtas_fadump_mem_struct *fdm,
 				unsigned long addr)
 {
 	if (!fdm)
 		return 0;
 
-	memset(fdm, 0, sizeof(struct fadump_mem_struct));
+	memset(fdm, 0, sizeof(struct rtas_fadump_mem_struct));
 	addr = addr & PAGE_MASK;
 
 	fdm->header.dump_format_version = cpu_to_be32(0x00000001);
 	fdm->header.dump_num_sections = cpu_to_be16(3);
 	fdm->header.dump_status_flag = 0;
 	fdm->header.offset_first_dump_section =
-		cpu_to_be32((u32)offsetof(struct fadump_mem_struct, cpu_state_data));
+		cpu_to_be32((u32)offsetof(struct rtas_fadump_mem_struct, cpu_state_data));
 
 	/*
 	 * Fields for disk dump option.
@@ -300,24 +301,24 @@ static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm,
 
 	/* Kernel dump sections */
 	/* cpu state data section. */
-	fdm->cpu_state_data.request_flag = cpu_to_be32(FADUMP_REQUEST_FLAG);
-	fdm->cpu_state_data.source_data_type = cpu_to_be16(FADUMP_CPU_STATE_DATA);
+	fdm->cpu_state_data.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
+	fdm->cpu_state_data.source_data_type = cpu_to_be16(RTAS_FADUMP_CPU_STATE_DATA);
 	fdm->cpu_state_data.source_address = 0;
 	fdm->cpu_state_data.source_len = cpu_to_be64(fw_dump.cpu_state_data_size);
 	fdm->cpu_state_data.destination_address = cpu_to_be64(addr);
 	addr += fw_dump.cpu_state_data_size;
 
 	/* hpte region section */
-	fdm->hpte_region.request_flag = cpu_to_be32(FADUMP_REQUEST_FLAG);
-	fdm->hpte_region.source_data_type = cpu_to_be16(FADUMP_HPTE_REGION);
+	fdm->hpte_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
+	fdm->hpte_region.source_data_type = cpu_to_be16(RTAS_FADUMP_HPTE_REGION);
 	fdm->hpte_region.source_address = 0;
 	fdm->hpte_region.source_len = cpu_to_be64(fw_dump.hpte_region_size);
 	fdm->hpte_region.destination_address = cpu_to_be64(addr);
 	addr += fw_dump.hpte_region_size;
 
 	/* RMA region section */
-	fdm->rmr_region.request_flag = cpu_to_be32(FADUMP_REQUEST_FLAG);
-	fdm->rmr_region.source_data_type = cpu_to_be16(FADUMP_REAL_MODE_REGION);
+	fdm->rmr_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
+	fdm->rmr_region.source_data_type = cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION);
 	fdm->rmr_region.source_address = cpu_to_be64(RMA_START);
 	fdm->rmr_region.source_len = cpu_to_be64(fw_dump.boot_memory_size);
 	fdm->rmr_region.destination_address = cpu_to_be64(addr);
@@ -588,7 +589,7 @@ static int __init early_fadump_reserve_mem(char *p)
 }
 early_param("fadump_reserve_mem", early_fadump_reserve_mem);
 
-static int register_fw_dump(struct fadump_mem_struct *fdm)
+static int register_fw_dump(struct rtas_fadump_mem_struct *fdm)
 {
 	int rc, err;
 	unsigned int wait_time;
@@ -599,7 +600,7 @@ static int register_fw_dump(struct fadump_mem_struct *fdm)
 	do {
 		rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
 			FADUMP_REGISTER, fdm,
-			sizeof(struct fadump_mem_struct));
+			sizeof(struct rtas_fadump_mem_struct));
 
 		wait_time = rtas_busy_delay_time(rc);
 		if (wait_time)
@@ -695,7 +696,7 @@ static inline int fadump_gpr_index(u64 id)
 	int i = -1;
 	char str[3];
 
-	if ((id & GPR_MASK) == REG_ID("GPR")) {
+	if ((id & GPR_MASK) == fadump_str_to_u64("GPR")) {
 		/* get the digits at the end */
 		id &= ~GPR_MASK;
 		id >>= 24;
@@ -717,30 +718,30 @@ static inline void fadump_set_regval(struct pt_regs *regs, u64 reg_id,
 	i = fadump_gpr_index(reg_id);
 	if (i >= 0)
 		regs->gpr[i] = (unsigned long)reg_val;
-	else if (reg_id == REG_ID("NIA"))
+	else if (reg_id == fadump_str_to_u64("NIA"))
 		regs->nip = (unsigned long)reg_val;
-	else if (reg_id == REG_ID("MSR"))
+	else if (reg_id == fadump_str_to_u64("MSR"))
 		regs->msr = (unsigned long)reg_val;
-	else if (reg_id == REG_ID("CTR"))
+	else if (reg_id == fadump_str_to_u64("CTR"))
 		regs->ctr = (unsigned long)reg_val;
-	else if (reg_id == REG_ID("LR"))
+	else if (reg_id == fadump_str_to_u64("LR"))
 		regs->link = (unsigned long)reg_val;
-	else if (reg_id == REG_ID("XER"))
+	else if (reg_id == fadump_str_to_u64("XER"))
 		regs->xer = (unsigned long)reg_val;
-	else if (reg_id == REG_ID("CR"))
+	else if (reg_id == fadump_str_to_u64("CR"))
 		regs->ccr = (unsigned long)reg_val;
-	else if (reg_id == REG_ID("DAR"))
+	else if (reg_id == fadump_str_to_u64("DAR"))
 		regs->dar = (unsigned long)reg_val;
-	else if (reg_id == REG_ID("DSISR"))
+	else if (reg_id == fadump_str_to_u64("DSISR"))
 		regs->dsisr = (unsigned long)reg_val;
 }
 
-static struct fadump_reg_entry*
-fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs)
+static struct rtas_fadump_reg_entry*
+fadump_read_registers(struct rtas_fadump_reg_entry *reg_entry, struct pt_regs *regs)
 {
 	memset(regs, 0, sizeof(struct pt_regs));
 
-	while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND")) {
+	while (be64_to_cpu(reg_entry->reg_id) != fadump_str_to_u64("CPUEND")) {
 		fadump_set_regval(regs, be64_to_cpu(reg_entry->reg_id),
 					be64_to_cpu(reg_entry->reg_value));
 		reg_entry++;
@@ -850,10 +851,10 @@ void fadump_free_cpu_notes_buf(void)
  * state from fadump crash info structure populated by first kernel at the
  * time of crash.
  */
-static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm)
+static int __init fadump_build_cpu_notes(const struct rtas_fadump_mem_struct *fdm)
 {
-	struct fadump_reg_save_area_header *reg_header;
-	struct fadump_reg_entry *reg_entry;
+	struct rtas_fadump_reg_save_area_header *reg_header;
+	struct rtas_fadump_reg_entry *reg_entry;
 	struct fadump_crash_info_header *fdh = NULL;
 	void *vaddr;
 	unsigned long addr;
@@ -868,7 +869,8 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm)
 	vaddr = __va(addr);
 
 	reg_header = vaddr;
-	if (be64_to_cpu(reg_header->magic_number) != REGSAVE_AREA_MAGIC) {
+	if (be64_to_cpu(reg_header->magic_number) !=
+	    fadump_str_to_u64("REGSAVE")) {
 		printk(KERN_ERR "Unable to read register save area.\n");
 		return -ENOENT;
 	}
@@ -880,7 +882,7 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm)
 	num_cpus = be32_to_cpu(*((__be32 *)(vaddr)));
 	pr_debug("NumCpus     : %u\n", num_cpus);
 	vaddr += sizeof(u32);
-	reg_entry = (struct fadump_reg_entry *)vaddr;
+	reg_entry = (struct rtas_fadump_reg_entry *)vaddr;
 
 	rc = fadump_setup_cpu_notes_buf(num_cpus);
 	if (rc != 0)
@@ -892,22 +894,22 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm)
 		fdh = __va(fw_dump.fadumphdr_addr);
 
 	for (i = 0; i < num_cpus; i++) {
-		if (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUSTRT")) {
+		if (be64_to_cpu(reg_entry->reg_id) != fadump_str_to_u64("CPUSTRT")) {
 			printk(KERN_ERR "Unable to read CPU state data\n");
 			rc = -ENOENT;
 			goto error_out;
 		}
 		/* Lower 4 bytes of reg_value contains logical cpu id */
-		cpu = be64_to_cpu(reg_entry->reg_value) & FADUMP_CPU_ID_MASK;
+		cpu = be64_to_cpu(reg_entry->reg_value) & RTAS_FADUMP_CPU_ID_MASK;
 		if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) {
-			SKIP_TO_NEXT_CPU(reg_entry);
+			RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry);
 			continue;
 		}
 		pr_debug("Reading register data for cpu %d...\n", cpu);
 		if (fdh && fdh->crashing_cpu == cpu) {
 			regs = fdh->regs;
 			note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
-			SKIP_TO_NEXT_CPU(reg_entry);
+			RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry);
 		} else {
 			reg_entry++;
 			reg_entry = fadump_read_registers(reg_entry, &regs);
@@ -933,7 +935,7 @@ error_out:
  * Validate and process the dump data stored by firmware before exporting
  * it through '/proc/vmcore'.
  */
-static int __init process_fadump(const struct fadump_mem_struct *fdm_active)
+static int __init process_fadump(const struct rtas_fadump_mem_struct *fdm_active)
 {
 	struct fadump_crash_info_header *fdh;
 	int rc = 0;
@@ -942,7 +944,7 @@ static int __init process_fadump(const struct fadump_mem_struct *fdm_active)
 		return -EINVAL;
 
 	/* Check if the dump data is valid. */
-	if ((be16_to_cpu(fdm_active->header.dump_status_flag) == FADUMP_ERROR_FLAG) ||
+	if ((be16_to_cpu(fdm_active->header.dump_status_flag) == RTAS_FADUMP_ERROR_FLAG) ||
 			(fdm_active->cpu_state_data.error_flags != 0) ||
 			(fdm_active->rmr_region.error_flags != 0)) {
 		printk(KERN_ERR "Dump taken by platform is not valid\n");
@@ -1273,7 +1275,7 @@ static unsigned long init_fadump_header(unsigned long addr)
 	fdh->magic_number = FADUMP_CRASH_INFO_MAGIC;
 	fdh->elfcorehdr_addr = addr;
 	/* We will set the crashing cpu id in crash_fadump() during crash. */
-	fdh->crashing_cpu = CPU_UNKNOWN;
+	fdh->crashing_cpu = FADUMP_CPU_UNKNOWN;
 
 	return addr;
 }
@@ -1307,7 +1309,7 @@ static int register_fadump(void)
 	return register_fw_dump(&fdm);
 }
 
-static int fadump_unregister_dump(struct fadump_mem_struct *fdm)
+static int fadump_unregister_dump(struct rtas_fadump_mem_struct *fdm)
 {
 	int rc = 0;
 	unsigned int wait_time;
@@ -1318,7 +1320,7 @@ static int fadump_unregister_dump(struct fadump_mem_struct *fdm)
 	do {
 		rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
 			FADUMP_UNREGISTER, fdm,
-			sizeof(struct fadump_mem_struct));
+			sizeof(struct rtas_fadump_mem_struct));
 
 		wait_time = rtas_busy_delay_time(rc);
 		if (wait_time)
@@ -1334,7 +1336,7 @@ static int fadump_unregister_dump(struct fadump_mem_struct *fdm)
 	return 0;
 }
 
-static int fadump_invalidate_dump(const struct fadump_mem_struct *fdm)
+static int fadump_invalidate_dump(const struct rtas_fadump_mem_struct *fdm)
 {
 	int rc = 0;
 	unsigned int wait_time;
@@ -1345,7 +1347,7 @@ static int fadump_invalidate_dump(const struct fadump_mem_struct *fdm)
 	do {
 		rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
 			FADUMP_INVALIDATE, fdm,
-			sizeof(struct fadump_mem_struct));
+			sizeof(struct rtas_fadump_mem_struct));
 
 		wait_time = rtas_busy_delay_time(rc);
 		if (wait_time)
@@ -1561,7 +1563,7 @@ unlock_out:
 
 static int fadump_region_show(struct seq_file *m, void *private)
 {
-	const struct fadump_mem_struct *fdm_ptr;
+	const struct rtas_fadump_mem_struct *fdm_ptr;
 
 	if (!fw_dump.fadump_enabled)
 		return 0;
diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.h b/arch/powerpc/platforms/pseries/rtas-fadump.h
new file mode 100644
index 000000000000..531f3f3e42b3
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.h
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Firmware-Assisted Dump support on POWERVM platform.
+ *
+ * Copyright 2011, Mahesh Salgaonkar, IBM Corporation.
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#ifndef _PSERIES_RTAS_FADUMP_H
+#define _PSERIES_RTAS_FADUMP_H
+
+/* Firmware provided dump sections */
+#define RTAS_FADUMP_CPU_STATE_DATA	0x0001
+#define RTAS_FADUMP_HPTE_REGION		0x0002
+#define RTAS_FADUMP_REAL_MODE_REGION	0x0011
+
+/* Dump request flag */
+#define RTAS_FADUMP_REQUEST_FLAG	0x00000001
+
+/* Dump status flag */
+#define RTAS_FADUMP_ERROR_FLAG		0x2000
+
+/* Kernel Dump section info */
+struct rtas_fadump_section {
+	__be32	request_flag;
+	__be16	source_data_type;
+	__be16	error_flags;
+	__be64	source_address;
+	__be64	source_len;
+	__be64	bytes_dumped;
+	__be64	destination_address;
+};
+
+/* ibm,configure-kernel-dump header. */
+struct rtas_fadump_section_header {
+	__be32	dump_format_version;
+	__be16	dump_num_sections;
+	__be16	dump_status_flag;
+	__be32	offset_first_dump_section;
+
+	/* Fields for disk dump option. */
+	__be32	dd_block_size;
+	__be64	dd_block_offset;
+	__be64	dd_num_blocks;
+	__be32	dd_offset_disk_path;
+
+	/* Maximum time allowed to prevent an automatic dump-reboot. */
+	__be32	max_time_auto;
+};
+
+/*
+ * Firmware Assisted dump memory structure. This structure is required for
+ * registering a future kernel dump with power firmware through an RTAS call.
+ *
+ * No disk dump option. Hence disk dump path string section is not included.
+ */
+struct rtas_fadump_mem_struct {
+	struct rtas_fadump_section_header	header;
+
+	/* Kernel dump sections */
+	struct rtas_fadump_section		cpu_state_data;
+	struct rtas_fadump_section		hpte_region;
+	struct rtas_fadump_section		rmr_region;
+};
+
+/*
+ * The firmware-assisted dump format.
+ *
+ * The register save area is an area in the partition's memory used to preserve
+ * the register contents (CPU state data) for the active CPUs during a firmware
+ * assisted dump. The dump format contains register save area header followed
+ * by register entries. Each list of registers for a CPU starts with "CPUSTRT"
+ * and ends with "CPUEND".
+ */
+
+/* Register save area header. */
+struct rtas_fadump_reg_save_area_header {
+	__be64		magic_number;
+	__be32		version;
+	__be32		num_cpu_offset;
+};
+
+/* Register entry. */
+struct rtas_fadump_reg_entry {
+	__be64		reg_id;
+	__be64		reg_value;
+};
+
+/* Utility macros */
+#define RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry)				\
+({									\
+	while (be64_to_cpu(reg_entry->reg_id) !=			\
+	       fadump_str_to_u64("CPUEND"))				\
+		reg_entry++;						\
+	reg_entry++;							\
+})
+
+#define RTAS_FADUMP_CPU_ID_MASK			((1UL << 32) - 1)
+
+#endif /* _PSERIES_RTAS_FADUMP_H */

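A note on the identifier encoding used throughout the patch above: register
entries in the CPU state data are tagged with up to eight ASCII characters
packed into a u64, which is why comparisons such as
fadump_str_to_u64("CPUEND") replace the old REG_ID() macro. A minimal sketch
of such a helper, assuming left-aligned, zero-padded packing (the in-tree
implementation may differ in detail):

	/* Pack up to 8 ASCII characters into a u64, left-aligned and
	 * zero-padded, so e.g. "CPUEND" compares equal to the 8-byte
	 * identifier firmware stores with each register entry.
	 */
	static inline u64 fadump_str_to_u64(const char *str)
	{
		u64 val = 0;
		int i;

		for (i = 0; i < sizeof(val); i++)
			val = (*str) ? (val << 8) | *str++ : val << 8;
		return val;
	}

With this encoding, fadump_str_to_u64("GPR") occupies the top three bytes of
the GPRnn identifiers, which is what the GPR_MASK comparison above relies on.
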
From d3833a7010817f82bff373e26d146e6401c695f4 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:18:40 +0530
Subject: [PATCH 226/264] powerpc/fadump: introduce callbacks for platform
 specific operations

Introduce callback functions for platform-specific operations such as
register, unregister and invalidate. Also, define placeholders for these
callbacks on the pSeries platform.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821330286.5656.15538934400074110770.stgit@hbathini.in.ibm.com
---
 arch/powerpc/include/asm/fadump-internal.h   |  24 ++++
 arch/powerpc/kernel/fadump.c                 |  45 +------
 arch/powerpc/platforms/pseries/Makefile      |   1 +
 arch/powerpc/platforms/pseries/rtas-fadump.c | 122 +++++++++++++++++++
 4 files changed, 148 insertions(+), 44 deletions(-)
 create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.c

diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h
index f8097510e03f..8ecd23421b6c 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -77,6 +77,9 @@ struct fad_crash_memory_ranges {
 	unsigned long long	size;
 };
 
+/* Platform specific callback functions */
+struct fadump_ops;
+
 /* Firmware-assisted dump configuration details. */
 struct fw_dump {
 	unsigned long	reserve_dump_area_start;
@@ -99,6 +102,20 @@ struct fw_dump {
 	unsigned long	dump_active:1;
 	unsigned long	dump_registered:1;
 	unsigned long	nocma:1;
+
+	struct fadump_ops	*ops;
+};
+
+struct fadump_ops {
+	u64	(*fadump_init_mem_struct)(struct fw_dump *fadump_conf);
+	int	(*fadump_register)(struct fw_dump *fadump_conf);
+	int	(*fadump_unregister)(struct fw_dump *fadump_conf);
+	int	(*fadump_invalidate)(struct fw_dump *fadump_conf);
+	int	(*fadump_process)(struct fw_dump *fadump_conf);
+	void	(*fadump_region_show)(struct fw_dump *fadump_conf,
+				      struct seq_file *m);
+	void	(*fadump_trigger)(struct fadump_crash_info_header *fdh,
+				  const char *msg);
 };
 
 /* Helper functions */
@@ -109,4 +126,11 @@ void fadump_update_elfcore_header(char *bufp);
 bool is_fadump_boot_mem_contiguous(void);
 bool is_fadump_reserved_mem_contiguous(void);
 
+#ifdef CONFIG_PPC_PSERIES
+extern void rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node);
+#else
+static inline void
+rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) { }
+#endif
+
 #endif /* _ASM_POWERPC_FADUMP_INTERNAL_H */
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 03f2708cd954..aa342ee53acb 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -112,24 +112,10 @@ static int __init fadump_cma_init(void) { return 1; }
 int __init early_init_dt_scan_fw_dump(unsigned long node,
 			const char *uname, int depth, void *data)
 {
-	const __be32 *sections;
-	int i, num_sections;
-	int size;
-	const __be32 *token;
-
 	if (depth != 1 || strcmp(uname, "rtas") != 0)
 		return 0;
 
-	/*
-	 * Check if Firmware Assisted dump is supported. if yes, check
-	 * if dump has been initiated on last reboot.
-	 */
-	token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL);
-	if (!token)
-		return 1;
-
-	fw_dump.fadump_supported = 1;
-	fw_dump.ibm_configure_kernel_dump = be32_to_cpu(*token);
+	rtas_fadump_dt_scan(&fw_dump, node);
 
 	/*
 	 * The 'ibm,kernel-dump' rtas node is present only if there is
@@ -139,35 +125,6 @@ int __init early_init_dt_scan_fw_dump(unsigned long node,
 	if (fdm_active)
 		fw_dump.dump_active = 1;
 
-	/* Get the sizes required to store dump data for the firmware provided
-	 * dump sections.
-	 * For each dump section type supported, a 32bit cell which defines
-	 * the ID of a supported section followed by two 32 bit cells which
-	 * gives teh size of the section in bytes.
-	 */
-	sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
-					&size);
-
-	if (!sections)
-		return 1;
-
-	num_sections = size / (3 * sizeof(u32));
-
-	for (i = 0; i < num_sections; i++, sections += 3) {
-		u32 type = (u32)of_read_number(sections, 1);
-
-		switch (type) {
-		case RTAS_FADUMP_CPU_STATE_DATA:
-			fw_dump.cpu_state_data_size =
-					of_read_ulong(&sections[1], 2);
-			break;
-		case RTAS_FADUMP_HPTE_REGION:
-			fw_dump.hpte_region_size =
-					of_read_ulong(&sections[1], 2);
-			break;
-		}
-	}
-
 	return 1;
 }
 
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index a420ef4c9d8e..a3c74a5cf20d 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_IBMEBUS)		+= ibmebus.o
 obj-$(CONFIG_PAPR_SCM)		+= papr_scm.o
 obj-$(CONFIG_PPC_SPLPAR)	+= vphn.o
 obj-$(CONFIG_PPC_SVM)		+= svm.o
+obj-$(CONFIG_FA_DUMP)		+= rtas-fadump.o
 
 ifdef CONFIG_PPC_PSERIES
 obj-$(CONFIG_SUSPEND)		+= suspend.o
diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c
new file mode 100644
index 000000000000..9efe7b1ac9cc
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Firmware-Assisted Dump support on POWERVM platform.
+ *
+ * Copyright 2011, Mahesh Salgaonkar, IBM Corporation.
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "rtas fadump: " fmt
+
+#include <linux/string.h>
+#include <linux/memblock.h>
+#include <linux/delay.h>
+#include <linux/seq_file.h>
+#include <linux/crash_dump.h>
+
+#include <asm/page.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/fadump.h>
+#include <asm/fadump-internal.h>
+
+#include "rtas-fadump.h"
+
+static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf)
+{
+	return fadump_conf->reserve_dump_area_start;
+}
+
+static int rtas_fadump_register(struct fw_dump *fadump_conf)
+{
+	return -EIO;
+}
+
+static int rtas_fadump_unregister(struct fw_dump *fadump_conf)
+{
+	return -EIO;
+}
+
+static int rtas_fadump_invalidate(struct fw_dump *fadump_conf)
+{
+	return -EIO;
+}
+
+/*
+ * Validate and process the dump data stored by firmware before exporting
+ * it through '/proc/vmcore'.
+ */
+static int __init rtas_fadump_process(struct fw_dump *fadump_conf)
+{
+	return -EINVAL;
+}
+
+static void rtas_fadump_region_show(struct fw_dump *fadump_conf,
+				    struct seq_file *m)
+{
+}
+
+static void rtas_fadump_trigger(struct fadump_crash_info_header *fdh,
+				const char *msg)
+{
+	/* Call ibm,os-term rtas call to trigger firmware assisted dump */
+	rtas_os_term((char *)msg);
+}
+
+static struct fadump_ops rtas_fadump_ops = {
+	.fadump_init_mem_struct		= rtas_fadump_init_mem_struct,
+	.fadump_register		= rtas_fadump_register,
+	.fadump_unregister		= rtas_fadump_unregister,
+	.fadump_invalidate		= rtas_fadump_invalidate,
+	.fadump_process			= rtas_fadump_process,
+	.fadump_region_show		= rtas_fadump_region_show,
+	.fadump_trigger			= rtas_fadump_trigger,
+};
+
+void __init rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
+{
+	int i, size, num_sections;
+	const __be32 *sections;
+	const __be32 *token;
+
+	/*
+	 * Check if Firmware Assisted dump is supported. If yes, check
+	 * if a dump has been initiated on the last reboot.
+	 */
+	token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL);
+	if (!token)
+		return;
+
+	fadump_conf->ibm_configure_kernel_dump = be32_to_cpu(*token);
+	fadump_conf->ops		= &rtas_fadump_ops;
+	fadump_conf->fadump_supported	= 1;
+
+	/* Get the sizes required to store dump data for the firmware provided
+	 * dump sections.
+	 * For each dump section type supported, a 32bit cell which defines
+	 * the ID of a supported section followed by two 32 bit cells which
+	 * gives the size of the section in bytes.
+	 */
+	sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
+					&size);
+
+	if (!sections)
+		return;
+
+	num_sections = size / (3 * sizeof(u32));
+
+	for (i = 0; i < num_sections; i++, sections += 3) {
+		u32 type = (u32)of_read_number(sections, 1);
+
+		switch (type) {
+		case RTAS_FADUMP_CPU_STATE_DATA:
+			fadump_conf->cpu_state_data_size =
+					of_read_ulong(&sections[1], 2);
+			break;
+		case RTAS_FADUMP_HPTE_REGION:
+			fadump_conf->hpte_region_size =
+					of_read_ulong(&sections[1], 2);
+			break;
+		}
+	}
+}

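The shape of the abstraction this patch introduces: generic fadump code holds
a struct fadump_ops pointer and dispatches through it, while each platform
binds its own table during device-tree scan. A condensed sketch of the
dispatch side (fadump_do_register() is a hypothetical wrapper shown only for
illustration; the kernel calls the ops members inline):

	/* Generic, platform-agnostic entry point. */
	static int fadump_do_register(struct fw_dump *fadump_conf)
	{
		/* ops is bound in rtas_fadump_dt_scan() on pSeries */
		if (!fadump_conf->ops || !fadump_conf->ops->fadump_register)
			return -EOPNOTSUPP;

		return fadump_conf->ops->fadump_register(fadump_conf);
	}

Another platform backend can later slot in its own fadump_ops table the same
way, without touching the generic code.
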
From 41a65d1618238e63be1439871eaf44dc3c6a737c Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:18:57 +0530
Subject: [PATCH 227/264] pseries/fadump: define RTAS register/un-register
 callback functions

Move the platform-specific register/un-register code (the RTAS calls)
into register/un-register callback functions. This also means moving the
code that initializes and prints the platform-specific FADump data.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Reviewed-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821332856.5656.16380417702046411631.stgit@hbathini.in.ibm.com
---
 arch/powerpc/include/asm/fadump-internal.h   |   2 +
 arch/powerpc/kernel/fadump.c                 | 165 ++-----------------
 arch/powerpc/platforms/pseries/rtas-fadump.c | 162 +++++++++++++++++-
 3 files changed, 176 insertions(+), 153 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h
index 8ecd23421b6c..8c0eb0fc09d0 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -89,7 +89,9 @@ struct fw_dump {
 
 	unsigned long	cpu_state_data_size;
 	unsigned long	hpte_region_size;
+
 	unsigned long	boot_memory_size;
+	u64		boot_mem_dest_addr;
 
 	unsigned long	fadumphdr_addr;
 	unsigned long	cpu_notes_buf_vaddr;
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index aa342ee53acb..3cf621080427 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -36,7 +36,6 @@
 #include "../platforms/pseries/rtas-fadump.h"
 
 static struct fw_dump fw_dump;
-static struct rtas_fadump_mem_struct fdm;
 static const struct rtas_fadump_mem_struct *fdm_active;
 
 static DEFINE_MUTEX(fadump_mutex);
@@ -229,61 +228,6 @@ static void fadump_show_config(void)
 	pr_debug("Boot memory size  : %lx\n", fw_dump.boot_memory_size);
 }
 
-static unsigned long init_fadump_mem_struct(struct rtas_fadump_mem_struct *fdm,
-				unsigned long addr)
-{
-	if (!fdm)
-		return 0;
-
-	memset(fdm, 0, sizeof(struct rtas_fadump_mem_struct));
-	addr = addr & PAGE_MASK;
-
-	fdm->header.dump_format_version = cpu_to_be32(0x00000001);
-	fdm->header.dump_num_sections = cpu_to_be16(3);
-	fdm->header.dump_status_flag = 0;
-	fdm->header.offset_first_dump_section =
-		cpu_to_be32((u32)offsetof(struct rtas_fadump_mem_struct, cpu_state_data));
-
-	/*
-	 * Fields for disk dump option.
-	 * We are not using disk dump option, hence set these fields to 0.
-	 */
-	fdm->header.dd_block_size = 0;
-	fdm->header.dd_block_offset = 0;
-	fdm->header.dd_num_blocks = 0;
-	fdm->header.dd_offset_disk_path = 0;
-
-	/* set 0 to disable an automatic dump-reboot. */
-	fdm->header.max_time_auto = 0;
-
-	/* Kernel dump sections */
-	/* cpu state data section. */
-	fdm->cpu_state_data.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
-	fdm->cpu_state_data.source_data_type = cpu_to_be16(RTAS_FADUMP_CPU_STATE_DATA);
-	fdm->cpu_state_data.source_address = 0;
-	fdm->cpu_state_data.source_len = cpu_to_be64(fw_dump.cpu_state_data_size);
-	fdm->cpu_state_data.destination_address = cpu_to_be64(addr);
-	addr += fw_dump.cpu_state_data_size;
-
-	/* hpte region section */
-	fdm->hpte_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
-	fdm->hpte_region.source_data_type = cpu_to_be16(RTAS_FADUMP_HPTE_REGION);
-	fdm->hpte_region.source_address = 0;
-	fdm->hpte_region.source_len = cpu_to_be64(fw_dump.hpte_region_size);
-	fdm->hpte_region.destination_address = cpu_to_be64(addr);
-	addr += fw_dump.hpte_region_size;
-
-	/* RMA region section */
-	fdm->rmr_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
-	fdm->rmr_region.source_data_type = cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION);
-	fdm->rmr_region.source_address = cpu_to_be64(RMA_START);
-	fdm->rmr_region.source_len = cpu_to_be64(fw_dump.boot_memory_size);
-	fdm->rmr_region.destination_address = cpu_to_be64(addr);
-	addr += fw_dump.boot_memory_size;
-
-	return addr;
-}
-
 /**
  * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM
  *
@@ -546,61 +490,6 @@ static int __init early_fadump_reserve_mem(char *p)
 }
 early_param("fadump_reserve_mem", early_fadump_reserve_mem);
 
-static int register_fw_dump(struct rtas_fadump_mem_struct *fdm)
-{
-	int rc, err;
-	unsigned int wait_time;
-
-	pr_debug("Registering for firmware-assisted kernel dump...\n");
-
-	/* TODO: Add upper time limit for the delay */
-	do {
-		rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
-			FADUMP_REGISTER, fdm,
-			sizeof(struct rtas_fadump_mem_struct));
-
-		wait_time = rtas_busy_delay_time(rc);
-		if (wait_time)
-			mdelay(wait_time);
-
-	} while (wait_time);
-
-	err = -EIO;
-	switch (rc) {
-	default:
-		pr_err("Failed to register. Unknown Error(%d).\n", rc);
-		break;
-	case -1:
-		printk(KERN_ERR "Failed to register firmware-assisted kernel"
-			" dump. Hardware Error(%d).\n", rc);
-		break;
-	case -3:
-		if (!is_fadump_boot_mem_contiguous())
-			pr_err("Can't have holes in boot memory area while registering fadump\n");
-		else if (!is_fadump_reserved_mem_contiguous())
-			pr_err("Can't have holes in reserved memory area while"
-			       " registering fadump\n");
-
-		printk(KERN_ERR "Failed to register firmware-assisted kernel"
-			" dump. Parameter Error(%d).\n", rc);
-		err = -EINVAL;
-		break;
-	case -9:
-		printk(KERN_ERR "firmware-assisted kernel dump is already "
-			" registered.");
-		fw_dump.dump_registered = 1;
-		err = -EEXIST;
-		break;
-	case 0:
-		printk(KERN_INFO "firmware-assisted kernel dump registration"
-			" is successful\n");
-		fw_dump.dump_registered = 1;
-		err = 0;
-		break;
-	}
-	return err;
-}
-
 void crash_fadump(struct pt_regs *regs, const char *str)
 {
 	struct fadump_crash_info_header *fdh = NULL;
@@ -643,8 +532,7 @@ void crash_fadump(struct pt_regs *regs, const char *str)
 
 	fdh->online_mask = *cpu_online_mask;
 
-	/* Call ibm,os-term rtas call to trigger firmware assisted dump */
-	rtas_os_term((char *)str);
+	fw_dump.ops->fadump_trigger(fdh, str);
 }
 
 #define GPR_MASK	0xffffff0000000000
@@ -1129,7 +1017,7 @@ static int fadump_setup_crash_memory_ranges(void)
 static inline unsigned long fadump_relocate(unsigned long paddr)
 {
 	if (paddr > RMA_START && paddr < fw_dump.boot_memory_size)
-		return be64_to_cpu(fdm.rmr_region.destination_address) + paddr;
+		return fw_dump.boot_mem_dest_addr + paddr;
 	else
 		return paddr;
 }
@@ -1202,7 +1090,7 @@ static int fadump_create_elfcore_headers(char *bufp)
 			 * to the specified destination_address. Hence set
 			 * the correct offset.
 			 */
-			phdr->p_offset = be64_to_cpu(fdm.rmr_region.destination_address);
+			phdr->p_offset = fw_dump.boot_mem_dest_addr;
 		}
 
 		phdr->p_paddr = mbase;
@@ -1254,7 +1142,8 @@ static int register_fadump(void)
 	if (ret)
 		return ret;
 
-	addr = be64_to_cpu(fdm.rmr_region.destination_address) + be64_to_cpu(fdm.rmr_region.source_len);
+	addr = fw_dump.fadumphdr_addr;
+
 	/* Initialize fadump crash info header. */
 	addr = init_fadump_header(addr);
 	vaddr = __va(addr);
@@ -1263,34 +1152,8 @@ static int register_fadump(void)
 	fadump_create_elfcore_headers(vaddr);
 
 	/* register the future kernel dump with firmware. */
-	return register_fw_dump(&fdm);
-}
-
-static int fadump_unregister_dump(struct rtas_fadump_mem_struct *fdm)
-{
-	int rc = 0;
-	unsigned int wait_time;
-
-	pr_debug("Un-register firmware-assisted dump\n");
-
-	/* TODO: Add upper time limit for the delay */
-	do {
-		rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
-			FADUMP_UNREGISTER, fdm,
-			sizeof(struct rtas_fadump_mem_struct));
-
-		wait_time = rtas_busy_delay_time(rc);
-		if (wait_time)
-			mdelay(wait_time);
-	} while (wait_time);
-
-	if (rc) {
-		printk(KERN_ERR "Failed to un-register firmware-assisted dump."
-			" unexpected error(%d).\n", rc);
-		return rc;
-	}
-	fw_dump.dump_registered = 0;
-	return 0;
+	pr_debug("Registering for firmware-assisted kernel dump...\n");
+	return fw_dump.ops->fadump_register(&fw_dump);
 }
 
 static int fadump_invalidate_dump(const struct rtas_fadump_mem_struct *fdm)
@@ -1328,7 +1191,7 @@ void fadump_cleanup(void)
 		fadump_invalidate_dump(fdm_active);
 	} else if (fw_dump.dump_registered) {
 		/* Un-register Firmware-assisted dump if it was registered. */
-		fadump_unregister_dump(&fdm);
+		fw_dump.ops->fadump_unregister(&fw_dump);
 		free_crash_memory_ranges();
 	}
 }
@@ -1433,7 +1296,7 @@ static void fadump_invalidate_release_mem(void)
 	fadump_free_cpu_notes_buf();
 
 	/* Initialize the kernel dump memory structure for FAD registration. */
-	init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);
+	fw_dump.ops->fadump_init_mem_struct(&fw_dump);
 }
 
 static ssize_t fadump_release_memory_store(struct kobject *kobj,
@@ -1498,12 +1361,13 @@ static ssize_t fadump_register_store(struct kobject *kobj,
 			goto unlock_out;
 		}
 		/* Un-register Firmware-assisted dump */
-		fadump_unregister_dump(&fdm);
+		pr_debug("Un-register firmware-assisted dump\n");
+		fw_dump.ops->fadump_unregister(&fw_dump);
 		break;
 	case 1:
 		if (fw_dump.dump_registered == 1) {
 			/* Un-register Firmware-assisted dump */
-			fadump_unregister_dump(&fdm);
+			fw_dump.ops->fadump_unregister(&fw_dump);
 		}
 		/* Register Firmware-assisted dump */
 		ret = register_fadump();
@@ -1530,7 +1394,8 @@ static int fadump_region_show(struct seq_file *m, void *private)
 		fdm_ptr = fdm_active;
 	else {
 		mutex_unlock(&fadump_mutex);
-		fdm_ptr = &fdm;
+		fw_dump.ops->fadump_region_show(&fw_dump, m);
+		return 0;
 	}
 
 	seq_printf(m,
@@ -1651,7 +1516,7 @@ int __init setup_fadump(void)
 	}
 	/* Initialize the kernel dump memory structure for FAD registration. */
 	else if (fw_dump.reserve_dump_area_size)
-		init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);
+		fw_dump.ops->fadump_init_mem_struct(&fw_dump);
 	fadump_init_files();
 
 	return 1;
diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c
index 9efe7b1ac9cc..1daa29fddcaa 100644
--- a/arch/powerpc/platforms/pseries/rtas-fadump.c
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.c
@@ -22,19 +22,151 @@
 
 #include "rtas-fadump.h"
 
+static struct rtas_fadump_mem_struct fdm;
+
+static void rtas_fadump_update_config(struct fw_dump *fadump_conf,
+				      const struct rtas_fadump_mem_struct *fdm)
+{
+	fadump_conf->boot_mem_dest_addr =
+		be64_to_cpu(fdm->rmr_region.destination_address);
+
+	fadump_conf->fadumphdr_addr = (fadump_conf->boot_mem_dest_addr +
+				       fadump_conf->boot_memory_size);
+}
+
 static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf)
 {
-	return fadump_conf->reserve_dump_area_start;
+	u64 addr = fadump_conf->reserve_dump_area_start;
+
+	memset(&fdm, 0, sizeof(struct rtas_fadump_mem_struct));
+	addr = addr & PAGE_MASK;
+
+	fdm.header.dump_format_version = cpu_to_be32(0x00000001);
+	fdm.header.dump_num_sections = cpu_to_be16(3);
+	fdm.header.dump_status_flag = 0;
+	fdm.header.offset_first_dump_section =
+		cpu_to_be32((u32)offsetof(struct rtas_fadump_mem_struct,
+					  cpu_state_data));
+
+	/*
+	 * Fields for disk dump option.
+	 * We are not using disk dump option, hence set these fields to 0.
+	 */
+	fdm.header.dd_block_size = 0;
+	fdm.header.dd_block_offset = 0;
+	fdm.header.dd_num_blocks = 0;
+	fdm.header.dd_offset_disk_path = 0;
+
+	/* set 0 to disable an automatic dump-reboot. */
+	fdm.header.max_time_auto = 0;
+
+	/* Kernel dump sections */
+	/* cpu state data section. */
+	fdm.cpu_state_data.request_flag =
+		cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
+	fdm.cpu_state_data.source_data_type =
+		cpu_to_be16(RTAS_FADUMP_CPU_STATE_DATA);
+	fdm.cpu_state_data.source_address = 0;
+	fdm.cpu_state_data.source_len =
+		cpu_to_be64(fadump_conf->cpu_state_data_size);
+	fdm.cpu_state_data.destination_address = cpu_to_be64(addr);
+	addr += fadump_conf->cpu_state_data_size;
+
+	/* hpte region section */
+	fdm.hpte_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
+	fdm.hpte_region.source_data_type =
+		cpu_to_be16(RTAS_FADUMP_HPTE_REGION);
+	fdm.hpte_region.source_address = 0;
+	fdm.hpte_region.source_len =
+		cpu_to_be64(fadump_conf->hpte_region_size);
+	fdm.hpte_region.destination_address = cpu_to_be64(addr);
+	addr += fadump_conf->hpte_region_size;
+
+	/* RMA region section */
+	fdm.rmr_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
+	fdm.rmr_region.source_data_type =
+		cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION);
+	fdm.rmr_region.source_address = cpu_to_be64(RMA_START);
+	fdm.rmr_region.source_len = cpu_to_be64(fadump_conf->boot_memory_size);
+	fdm.rmr_region.destination_address = cpu_to_be64(addr);
+	addr += fadump_conf->boot_memory_size;
+
+	rtas_fadump_update_config(fadump_conf, &fdm);
+
+	return addr;
 }
 
 static int rtas_fadump_register(struct fw_dump *fadump_conf)
 {
-	return -EIO;
+	unsigned int wait_time;
+	int rc, err = -EIO;
+
+	/* TODO: Add upper time limit for the delay */
+	do {
+		rc =  rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1,
+				NULL, FADUMP_REGISTER, &fdm,
+				sizeof(struct rtas_fadump_mem_struct));
+
+		wait_time = rtas_busy_delay_time(rc);
+		if (wait_time)
+			mdelay(wait_time);
+
+	} while (wait_time);
+
+	switch (rc) {
+	case 0:
+		pr_info("Registration is successful!\n");
+		fadump_conf->dump_registered = 1;
+		err = 0;
+		break;
+	case -1:
+		pr_err("Failed to register. Hardware Error(%d).\n", rc);
+		break;
+	case -3:
+		if (!is_fadump_boot_mem_contiguous())
+			pr_err("Can't have holes in boot memory area.\n");
+		else if (!is_fadump_reserved_mem_contiguous())
+			pr_err("Can't have holes in reserved memory area.\n");
+
+		pr_err("Failed to register. Parameter Error(%d).\n", rc);
+		err = -EINVAL;
+		break;
+	case -9:
+		pr_err("Already registered!\n");
+		fadump_conf->dump_registered = 1;
+		err = -EEXIST;
+		break;
+	default:
+		pr_err("Failed to register. Unknown Error(%d).\n", rc);
+		break;
+	}
+
+	return err;
 }
 
 static int rtas_fadump_unregister(struct fw_dump *fadump_conf)
 {
-	return -EIO;
+	unsigned int wait_time;
+	int rc;
+
+	/* TODO: Add upper time limit for the delay */
+	do {
+		rc =  rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1,
+				NULL, FADUMP_UNREGISTER, &fdm,
+				sizeof(struct rtas_fadump_mem_struct));
+
+		wait_time = rtas_busy_delay_time(rc);
+		if (wait_time)
+			mdelay(wait_time);
+	} while (wait_time);
+
+	if (rc) {
+		pr_err("Failed to un-register - unexpected error(%d).\n", rc);
+		return -EIO;
+	}
+
+	fadump_conf->dump_registered = 0;
+	return 0;
 }
 
 static int rtas_fadump_invalidate(struct fw_dump *fadump_conf)
@@ -54,6 +186,30 @@ static int __init rtas_fadump_process(struct fw_dump *fadump_conf)
 static void rtas_fadump_region_show(struct fw_dump *fadump_conf,
 				    struct seq_file *m)
 {
+	const struct rtas_fadump_mem_struct *fdm_ptr = &fdm;
+	const struct rtas_fadump_section *cpu_data_section;
+
+	cpu_data_section = &(fdm_ptr->cpu_state_data);
+	seq_printf(m, "CPU :[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n",
+		   be64_to_cpu(cpu_data_section->destination_address),
+		   be64_to_cpu(cpu_data_section->destination_address) +
+		   be64_to_cpu(cpu_data_section->source_len) - 1,
+		   be64_to_cpu(cpu_data_section->source_len),
+		   be64_to_cpu(cpu_data_section->bytes_dumped));
+
+	seq_printf(m, "HPTE:[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n",
+		   be64_to_cpu(fdm_ptr->hpte_region.destination_address),
+		   be64_to_cpu(fdm_ptr->hpte_region.destination_address) +
+		   be64_to_cpu(fdm_ptr->hpte_region.source_len) - 1,
+		   be64_to_cpu(fdm_ptr->hpte_region.source_len),
+		   be64_to_cpu(fdm_ptr->hpte_region.bytes_dumped));
+
+	seq_printf(m, "DUMP: [%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n",
+		   be64_to_cpu(fdm_ptr->rmr_region.destination_address),
+		   be64_to_cpu(fdm_ptr->rmr_region.destination_address) +
+		   be64_to_cpu(fdm_ptr->rmr_region.source_len) - 1,
+		   be64_to_cpu(fdm_ptr->rmr_region.source_len),
+		   be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped));
 }
 
 static void rtas_fadump_trigger(struct fadump_crash_info_header *fdh,

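Both RTAS callbacks above follow the standard busy-wait idiom:
ibm,configure-kernel-dump can return a busy or extended-delay status, and
rtas_busy_delay_time() translates that status into the number of milliseconds
to wait before retrying (zero means the call completed). A condensed sketch
of the idiom, with a retry cap added purely for illustration since the TODO
about an upper time limit is still open:

	unsigned int wait_time;
	int retries = 100;	/* hypothetical cap; the kernel loops unbounded */
	int rc;

	do {
		rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1,
			       NULL, FADUMP_REGISTER, &fdm,
			       sizeof(struct rtas_fadump_mem_struct));

		wait_time = rtas_busy_delay_time(rc);	/* 0 when done */
		if (wait_time)
			mdelay(wait_time);
	} while (wait_time && --retries > 0);
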
From 109f25cc5fae5f8fb403d925f7b3731f6f18218a Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:19:12 +0530
Subject: [PATCH 228/264] powerpc/fadump: add source info while displaying
 region contents

Improve how fadump_region contents are displayed by adding source
information for the memory regions that are to be dumped by f/w.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821334740.5656.5897097884010195405.stgit@hbathini.in.ibm.com
---
 arch/powerpc/platforms/pseries/rtas-fadump.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c
index 1daa29fddcaa..a11e3c2458d2 100644
--- a/arch/powerpc/platforms/pseries/rtas-fadump.c
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.c
@@ -204,10 +204,10 @@ static void rtas_fadump_region_show(struct fw_dump *fadump_conf,
 		   be64_to_cpu(fdm_ptr->hpte_region.source_len),
 		   be64_to_cpu(fdm_ptr->hpte_region.bytes_dumped));
 
-	seq_printf(m, "DUMP: [%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n",
-		   be64_to_cpu(fdm_ptr->rmr_region.destination_address),
-		   be64_to_cpu(fdm_ptr->rmr_region.destination_address) +
-		   be64_to_cpu(fdm_ptr->rmr_region.source_len) - 1,
+	seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ",
+		   be64_to_cpu(fdm_ptr->rmr_region.source_address),
+		   be64_to_cpu(fdm_ptr->rmr_region.destination_address));
+	seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n",
 		   be64_to_cpu(fdm_ptr->rmr_region.source_len),
 		   be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped));
 }

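For reference, after this change the DUMP line in the fadump_region debugfs
file reports the source address alongside the destination, while the CPU and
HPTE lines keep the destination-range format. Illustrative output (all
values invented):

	CPU :[0x00000020000000-0x0000002001ffff] 0x20000 bytes, Dumped: 0x20000
	HPTE:[0x00000020020000-0x0000002011ffff] 0x100000 bytes, Dumped: 0x100000
	DUMP: Src: 0x00000000001000, Dest: 0x00000020120000, Size: 0x10000000, Dumped: 0x10000000 bytes
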
From 8255da95e54519bb74638c2448ac17f4b34fe6f5 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:19:28 +0530
Subject: [PATCH 229/264] powerpc/fadump: release all the memory above boot
 memory size

Except for the reserved dump area (see Documentation/powerpc/firmware-
assisted-dump.rst), which is permanently reserved, all memory above the
boot memory size is released when the dump is invalidated. Here, boot
memory size is the memory required for the kernel to boot successfully
when booted with restricted memory (that is, the memory set aside for
the capture kernel). Make this a bit more explicit in the code.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821336092.5656.1079046285366041687.stgit@hbathini.in.ibm.com
---
 arch/powerpc/kernel/fadump.c | 34 ++++++++++------------------------
 1 file changed, 10 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 3cf621080427..bff909151b44 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -391,6 +391,8 @@ int __init fadump_reserve_mem(void)
 	else
 		memory_boundary = memblock_end_of_DRAM();
 
+	size = get_fadump_area_size();
+	fw_dump.reserve_dump_area_size = size;
 	if (fw_dump.dump_active) {
 		pr_info("Firmware-assisted dump is active.\n");
 
@@ -416,11 +418,14 @@ int __init fadump_reserve_mem(void)
 				be64_to_cpu(fdm_active->rmr_region.destination_address) +
 				be64_to_cpu(fdm_active->rmr_region.source_len);
 		pr_debug("fadumphdr_addr = %pa\n", &fw_dump.fadumphdr_addr);
-		fw_dump.reserve_dump_area_start = base;
-		fw_dump.reserve_dump_area_size = size;
-	} else {
-		size = get_fadump_area_size();
 
+		/*
+		 * Start address of reserve dump area (permanent reservation)
+		 * for re-registering FADump after dump capture.
+		 */
+		fw_dump.reserve_dump_area_start =
+			be64_to_cpu(fdm_active->cpu_state_data.destination_address);
+	} else {
 		/*
 		 * Reserve memory at an offset closer to bottom of the RAM to
 		 * minimize the impact of memory hot-remove operation. We can't
@@ -447,7 +452,6 @@ int __init fadump_reserve_mem(void)
 			(unsigned long)(memblock_phys_mem_size() >> 20));
 
 		fw_dump.reserve_dump_area_start = base;
-		fw_dump.reserve_dump_area_size = size;
 		return fadump_cma_init();
 	}
 	return 1;
@@ -1265,34 +1269,16 @@ static void fadump_release_memory(unsigned long begin, unsigned long end)
 
 static void fadump_invalidate_release_mem(void)
 {
-	unsigned long reserved_area_start, reserved_area_end;
-	unsigned long destination_address;
-
 	mutex_lock(&fadump_mutex);
 	if (!fw_dump.dump_active) {
 		mutex_unlock(&fadump_mutex);
 		return;
 	}
 
-	destination_address = be64_to_cpu(fdm_active->cpu_state_data.destination_address);
 	fadump_cleanup();
 	mutex_unlock(&fadump_mutex);
 
-	/*
-	 * Save the current reserved memory bounds we will require them
-	 * later for releasing the memory for general use.
-	 */
-	reserved_area_start = fw_dump.reserve_dump_area_start;
-	reserved_area_end = reserved_area_start +
-			fw_dump.reserve_dump_area_size;
-	/*
-	 * Setup reserve_dump_area_start and its size so that we can
-	 * reuse this reserved memory for Re-registration.
-	 */
-	fw_dump.reserve_dump_area_start = destination_address;
-	fw_dump.reserve_dump_area_size = get_fadump_area_size();
-
-	fadump_release_memory(reserved_area_start, reserved_area_end);
+	fadump_release_memory(fw_dump.boot_memory_size, memblock_end_of_DRAM());
 	fadump_free_cpu_notes_buf();
 
 	/* Initialize the kernel dump memory structure for FAD registration. */

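The layout assumption behind the simplification above: everything from
boot_memory_size up to memblock_end_of_DRAM() is handed back, and
fadump_release_memory() is expected to skip the permanently reserved dump
area that lies within that range. A rough sketch of that skip logic (assumed
for illustration, not lifted verbatim from the kernel):

	/* Free [begin, end) back to the page allocator, leaving the
	 * permanent reserved dump area untouched so FADump can be
	 * re-registered after the memory is released.
	 */
	static void fadump_release_memory(unsigned long begin, unsigned long end)
	{
		unsigned long ra_start = fw_dump.reserve_dump_area_start;
		unsigned long ra_end = ra_start + fw_dump.reserve_dump_area_size;
		unsigned long addr;

		for (addr = begin; addr < end; addr += PAGE_SIZE) {
			if (addr >= ra_start && addr < ra_end)
				continue;	/* keep the reserved dump area */
			free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
		}
	}
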
From f35120115b767c49ad8de56dd78c86540a14df5b Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:19:44 +0530
Subject: [PATCH 230/264] pseries/fadump: move out platform specific support
 from generic code

Move the code that processes the crashed kernel's memory, as preserved
by firmware, into platform-specific callback functions.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821337690.5656.13050665924800177744.stgit@hbathini.in.ibm.com
---
 arch/powerpc/kernel/fadump.c                 | 329 +------------------
 arch/powerpc/platforms/pseries/rtas-fadump.c | 263 ++++++++++++++-
 2 files changed, 274 insertions(+), 318 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index bff909151b44..9d9f7c384a71 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -28,15 +28,11 @@
 #include <asm/debugfs.h>
 #include <asm/page.h>
 #include <asm/prom.h>
-#include <asm/rtas.h>
 #include <asm/fadump.h>
 #include <asm/fadump-internal.h>
 #include <asm/setup.h>
 
-#include "../platforms/pseries/rtas-fadump.h"
-
 static struct fw_dump fw_dump;
-static const struct rtas_fadump_mem_struct *fdm_active;
 
 static DEFINE_MUTEX(fadump_mutex);
 struct fad_crash_memory_ranges *crash_memory_ranges;
@@ -108,22 +104,13 @@ static int __init fadump_cma_init(void) { return 1; }
 #endif /* CONFIG_CMA */
 
 /* Scan the Firmware Assisted dump configuration details. */
-int __init early_init_dt_scan_fw_dump(unsigned long node,
-			const char *uname, int depth, void *data)
+int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
+				      int depth, void *data)
 {
 	if (depth != 1 || strcmp(uname, "rtas") != 0)
 		return 0;
 
 	rtas_fadump_dt_scan(&fw_dump, node);
-
-	/*
-	 * The 'ibm,kernel-dump' rtas node is present only if there is
-	 * dump data waiting for us.
-	 */
-	fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL);
-	if (fdm_active)
-		fw_dump.dump_active = 1;
-
 	return 1;
 }
 
@@ -358,9 +345,7 @@ int __init fadump_reserve_mem(void)
 	 * If dump is active then we have already calculated the size during
 	 * first kernel.
 	 */
-	if (fdm_active)
-		fw_dump.boot_memory_size = be64_to_cpu(fdm_active->rmr_region.source_len);
-	else {
+	if (!fw_dump.dump_active) {
 		fw_dump.boot_memory_size = fadump_calculate_reserve_size();
 #ifdef CONFIG_CMA
 		if (!fw_dump.nocma)
@@ -414,17 +399,9 @@ int __init fadump_reserve_mem(void)
 		size = memory_boundary - base;
 		fadump_reserve_crash_area(base, size);
 
-		fw_dump.fadumphdr_addr =
-				be64_to_cpu(fdm_active->rmr_region.destination_address) +
-				be64_to_cpu(fdm_active->rmr_region.source_len);
-		pr_debug("fadumphdr_addr = %pa\n", &fw_dump.fadumphdr_addr);
-
-		/*
-		 * Start address of reserve dump area (permanent reservation)
-		 * for re-registering FADump after dump capture.
-		 */
-		fw_dump.reserve_dump_area_start =
-			be64_to_cpu(fdm_active->cpu_state_data.destination_address);
+		pr_debug("fadumphdr_addr = %#016lx\n", fw_dump.fadumphdr_addr);
+		pr_debug("Reserve dump area start address: 0x%lx\n",
+			 fw_dump.reserve_dump_area_start);
 	} else {
 		/*
 		 * Reserve memory at an offset closer to bottom of the RAM to
@@ -539,66 +516,6 @@ void crash_fadump(struct pt_regs *regs, const char *str)
 	fw_dump.ops->fadump_trigger(fdh, str);
 }
 
-#define GPR_MASK	0xffffff0000000000
-static inline int fadump_gpr_index(u64 id)
-{
-	int i = -1;
-	char str[3];
-
-	if ((id & GPR_MASK) == fadump_str_to_u64("GPR")) {
-		/* get the digits at the end */
-		id &= ~GPR_MASK;
-		id >>= 24;
-		str[2] = '\0';
-		str[1] = id & 0xff;
-		str[0] = (id >> 8) & 0xff;
-		sscanf(str, "%d", &i);
-		if (i > 31)
-			i = -1;
-	}
-	return i;
-}
-
-static inline void fadump_set_regval(struct pt_regs *regs, u64 reg_id,
-								u64 reg_val)
-{
-	int i;
-
-	i = fadump_gpr_index(reg_id);
-	if (i >= 0)
-		regs->gpr[i] = (unsigned long)reg_val;
-	else if (reg_id == fadump_str_to_u64("NIA"))
-		regs->nip = (unsigned long)reg_val;
-	else if (reg_id == fadump_str_to_u64("MSR"))
-		regs->msr = (unsigned long)reg_val;
-	else if (reg_id == fadump_str_to_u64("CTR"))
-		regs->ctr = (unsigned long)reg_val;
-	else if (reg_id == fadump_str_to_u64("LR"))
-		regs->link = (unsigned long)reg_val;
-	else if (reg_id == fadump_str_to_u64("XER"))
-		regs->xer = (unsigned long)reg_val;
-	else if (reg_id == fadump_str_to_u64("CR"))
-		regs->ccr = (unsigned long)reg_val;
-	else if (reg_id == fadump_str_to_u64("DAR"))
-		regs->dar = (unsigned long)reg_val;
-	else if (reg_id == fadump_str_to_u64("DSISR"))
-		regs->dsisr = (unsigned long)reg_val;
-}
-
-static struct rtas_fadump_reg_entry*
-fadump_read_registers(struct rtas_fadump_reg_entry *reg_entry, struct pt_regs *regs)
-{
-	memset(regs, 0, sizeof(struct pt_regs));
-
-	while (be64_to_cpu(reg_entry->reg_id) != fadump_str_to_u64("CPUEND")) {
-		fadump_set_regval(regs, be64_to_cpu(reg_entry->reg_id),
-					be64_to_cpu(reg_entry->reg_value));
-		reg_entry++;
-	}
-	reg_entry++;
-	return reg_entry;
-}
-
 u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs)
 {
 	struct elf_prstatus prstatus;
@@ -686,147 +603,6 @@ void fadump_free_cpu_notes_buf(void)
 	fw_dump.cpu_notes_buf_size = 0;
 }
 
-/*
- * Read CPU state dump data and convert it into ELF notes.
- * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be
- * used to access the data to allow for additional fields to be added without
- * affecting compatibility. Each list of registers for a CPU starts with
- * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes,
- * 8 Byte ASCII identifier and 8 Byte register value. The register entry
- * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part
- * of register value. For more details refer to PAPR document.
- *
- * Only for the crashing cpu we ignore the CPU dump data and get exact
- * state from fadump crash info structure populated by first kernel at the
- * time of crash.
- */
-static int __init fadump_build_cpu_notes(const struct rtas_fadump_mem_struct *fdm)
-{
-	struct rtas_fadump_reg_save_area_header *reg_header;
-	struct rtas_fadump_reg_entry *reg_entry;
-	struct fadump_crash_info_header *fdh = NULL;
-	void *vaddr;
-	unsigned long addr;
-	u32 num_cpus, *note_buf;
-	struct pt_regs regs;
-	int i, rc = 0, cpu = 0;
-
-	if (!fdm->cpu_state_data.bytes_dumped)
-		return -EINVAL;
-
-	addr = be64_to_cpu(fdm->cpu_state_data.destination_address);
-	vaddr = __va(addr);
-
-	reg_header = vaddr;
-	if (be64_to_cpu(reg_header->magic_number) !=
-	    fadump_str_to_u64("REGSAVE")) {
-		printk(KERN_ERR "Unable to read register save area.\n");
-		return -ENOENT;
-	}
-	pr_debug("--------CPU State Data------------\n");
-	pr_debug("Magic Number: %llx\n", be64_to_cpu(reg_header->magic_number));
-	pr_debug("NumCpuOffset: %x\n", be32_to_cpu(reg_header->num_cpu_offset));
-
-	vaddr += be32_to_cpu(reg_header->num_cpu_offset);
-	num_cpus = be32_to_cpu(*((__be32 *)(vaddr)));
-	pr_debug("NumCpus     : %u\n", num_cpus);
-	vaddr += sizeof(u32);
-	reg_entry = (struct rtas_fadump_reg_entry *)vaddr;
-
-	rc = fadump_setup_cpu_notes_buf(num_cpus);
-	if (rc != 0)
-		return rc;
-
-	note_buf = (u32 *)fw_dump.cpu_notes_buf_vaddr;
-
-	if (fw_dump.fadumphdr_addr)
-		fdh = __va(fw_dump.fadumphdr_addr);
-
-	for (i = 0; i < num_cpus; i++) {
-		if (be64_to_cpu(reg_entry->reg_id) != fadump_str_to_u64("CPUSTRT")) {
-			printk(KERN_ERR "Unable to read CPU state data\n");
-			rc = -ENOENT;
-			goto error_out;
-		}
-		/* Lower 4 bytes of reg_value contains logical cpu id */
-		cpu = be64_to_cpu(reg_entry->reg_value) & RTAS_FADUMP_CPU_ID_MASK;
-		if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) {
-			RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry);
-			continue;
-		}
-		pr_debug("Reading register data for cpu %d...\n", cpu);
-		if (fdh && fdh->crashing_cpu == cpu) {
-			regs = fdh->regs;
-			note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
-			RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry);
-		} else {
-			reg_entry++;
-			reg_entry = fadump_read_registers(reg_entry, &regs);
-			note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
-		}
-	}
-	final_note(note_buf);
-
-	if (fdh) {
-		pr_debug("Updating elfcore header (%llx) with cpu notes\n",
-							fdh->elfcorehdr_addr);
-		fadump_update_elfcore_header((char *)__va(fdh->elfcorehdr_addr));
-	}
-	return 0;
-
-error_out:
-	fadump_free_cpu_notes_buf();
-	return rc;
-
-}
-
-/*
- * Validate and process the dump data stored by firmware before exporting
- * it through '/proc/vmcore'.
- */
-static int __init process_fadump(const struct rtas_fadump_mem_struct *fdm_active)
-{
-	struct fadump_crash_info_header *fdh;
-	int rc = 0;
-
-	if (!fdm_active || !fw_dump.fadumphdr_addr)
-		return -EINVAL;
-
-	/* Check if the dump data is valid. */
-	if ((be16_to_cpu(fdm_active->header.dump_status_flag) == RTAS_FADUMP_ERROR_FLAG) ||
-			(fdm_active->cpu_state_data.error_flags != 0) ||
-			(fdm_active->rmr_region.error_flags != 0)) {
-		printk(KERN_ERR "Dump taken by platform is not valid\n");
-		return -EINVAL;
-	}
-	if ((fdm_active->rmr_region.bytes_dumped !=
-			fdm_active->rmr_region.source_len) ||
-			!fdm_active->cpu_state_data.bytes_dumped) {
-		printk(KERN_ERR "Dump taken by platform is incomplete\n");
-		return -EINVAL;
-	}
-
-	/* Validate the fadump crash info header */
-	fdh = __va(fw_dump.fadumphdr_addr);
-	if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
-		printk(KERN_ERR "Crash info header is not valid.\n");
-		return -EINVAL;
-	}
-
-	rc = fadump_build_cpu_notes(fdm_active);
-	if (rc)
-		return rc;
-
-	/*
-	 * We are done validating dump info and elfcore header is now ready
-	 * to be exported. set elfcorehdr_addr so that vmcore module will
-	 * export the elfcore header through '/proc/vmcore'.
-	 */
-	elfcorehdr_addr = fdh->elfcorehdr_addr;
-
-	return 0;
-}
-
 static void free_crash_memory_ranges(void)
 {
 	kfree(crash_memory_ranges);
@@ -1116,7 +892,6 @@ static unsigned long init_fadump_header(unsigned long addr)
 	if (!addr)
 		return 0;
 
-	fw_dump.fadumphdr_addr = addr;
 	fdh = __va(addr);
 	addr += sizeof(struct fadump_crash_info_header);
 
@@ -1160,39 +935,12 @@ static int register_fadump(void)
 	return fw_dump.ops->fadump_register(&fw_dump);
 }
 
-static int fadump_invalidate_dump(const struct rtas_fadump_mem_struct *fdm)
-{
-	int rc = 0;
-	unsigned int wait_time;
-
-	pr_debug("Invalidating firmware-assisted dump registration\n");
-
-	/* TODO: Add upper time limit for the delay */
-	do {
-		rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
-			FADUMP_INVALIDATE, fdm,
-			sizeof(struct rtas_fadump_mem_struct));
-
-		wait_time = rtas_busy_delay_time(rc);
-		if (wait_time)
-			mdelay(wait_time);
-	} while (wait_time);
-
-	if (rc) {
-		pr_err("Failed to invalidate firmware-assisted dump registration. Unexpected error (%d).\n", rc);
-		return rc;
-	}
-	fw_dump.dump_active = 0;
-	fdm_active = NULL;
-	return 0;
-}
-
 void fadump_cleanup(void)
 {
 	/* Invalidate the registration only if dump is active. */
 	if (fw_dump.dump_active) {
-		/* pass the same memory dump structure provided by platform */
-		fadump_invalidate_dump(fdm_active);
+		pr_debug("Invalidating firmware-assisted dump registration\n");
+		fw_dump.ops->fadump_invalidate(&fw_dump);
 	} else if (fw_dump.dump_registered) {
 		/* Un-register Firmware-assisted dump if it was registered. */
 		fw_dump.ops->fadump_unregister(&fw_dump);
@@ -1333,7 +1081,7 @@ static ssize_t fadump_register_store(struct kobject *kobj,
 	int ret = 0;
 	int input = -1;
 
-	if (!fw_dump.fadump_enabled || fdm_active)
+	if (!fw_dump.fadump_enabled || fw_dump.dump_active)
 		return -EPERM;
 
 	if (kstrtoint(buf, 0, &input))
@@ -1346,6 +1094,7 @@ static ssize_t fadump_register_store(struct kobject *kobj,
 		if (fw_dump.dump_registered == 0) {
 			goto unlock_out;
 		}
+
 		/* Un-register Firmware-assisted dump */
 		pr_debug("Un-register firmware-assisted dump\n");
 		fw_dump.ops->fadump_unregister(&fw_dump);
@@ -1370,63 +1119,12 @@ unlock_out:
 
 static int fadump_region_show(struct seq_file *m, void *private)
 {
-	const struct rtas_fadump_mem_struct *fdm_ptr;
-
 	if (!fw_dump.fadump_enabled)
 		return 0;
 
 	mutex_lock(&fadump_mutex);
-	if (fdm_active)
-		fdm_ptr = fdm_active;
-	else {
-		mutex_unlock(&fadump_mutex);
-		fw_dump.ops->fadump_region_show(&fw_dump, m);
-		return 0;
-	}
-
-	seq_printf(m,
-			"CPU : [%#016llx-%#016llx] %#llx bytes, "
-			"Dumped: %#llx\n",
-			be64_to_cpu(fdm_ptr->cpu_state_data.destination_address),
-			be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) +
-			be64_to_cpu(fdm_ptr->cpu_state_data.source_len) - 1,
-			be64_to_cpu(fdm_ptr->cpu_state_data.source_len),
-			be64_to_cpu(fdm_ptr->cpu_state_data.bytes_dumped));
-	seq_printf(m,
-			"HPTE: [%#016llx-%#016llx] %#llx bytes, "
-			"Dumped: %#llx\n",
-			be64_to_cpu(fdm_ptr->hpte_region.destination_address),
-			be64_to_cpu(fdm_ptr->hpte_region.destination_address) +
-			be64_to_cpu(fdm_ptr->hpte_region.source_len) - 1,
-			be64_to_cpu(fdm_ptr->hpte_region.source_len),
-			be64_to_cpu(fdm_ptr->hpte_region.bytes_dumped));
-	seq_printf(m,
-			"DUMP: [%#016llx-%#016llx] %#llx bytes, "
-			"Dumped: %#llx\n",
-			be64_to_cpu(fdm_ptr->rmr_region.destination_address),
-			be64_to_cpu(fdm_ptr->rmr_region.destination_address) +
-			be64_to_cpu(fdm_ptr->rmr_region.source_len) - 1,
-			be64_to_cpu(fdm_ptr->rmr_region.source_len),
-			be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped));
-
-	if (!fdm_active ||
-		(fw_dump.reserve_dump_area_start ==
-		be64_to_cpu(fdm_ptr->cpu_state_data.destination_address)))
-		goto out;
-
-	/* Dump is active. Show reserved memory region. */
-	seq_printf(m,
-			"    : [%#016llx-%#016llx] %#llx bytes, "
-			"Dumped: %#llx\n",
-			(unsigned long long)fw_dump.reserve_dump_area_start,
-			be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) - 1,
-			be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) -
-			fw_dump.reserve_dump_area_start,
-			be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) -
-			fw_dump.reserve_dump_area_start);
-out:
-	if (fdm_active)
-		mutex_unlock(&fadump_mutex);
+	fw_dump.ops->fadump_region_show(&fw_dump, m);
+	mutex_unlock(&fadump_mutex);
 	return 0;
 }
 
@@ -1497,12 +1195,13 @@ int __init setup_fadump(void)
 		 * if dump process fails then invalidate the registration
 		 * and release memory before proceeding for re-registration.
 		 */
-		if (process_fadump(fdm_active) < 0)
+		if (fw_dump.ops->fadump_process(&fw_dump) < 0)
 			fadump_invalidate_release_mem();
 	}
 	/* Initialize the kernel dump memory structure for FAD registration. */
 	else if (fw_dump.reserve_dump_area_size)
 		fw_dump.ops->fadump_init_mem_struct(&fw_dump);
+
 	fadump_init_files();
 
 	return 1;
diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c
index a11e3c2458d2..bf1e8fc549b5 100644
--- a/arch/powerpc/platforms/pseries/rtas-fadump.c
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.c
@@ -23,6 +23,7 @@
 #include "rtas-fadump.h"
 
 static struct rtas_fadump_mem_struct fdm;
+static const struct rtas_fadump_mem_struct *fdm_active;
 
 static void rtas_fadump_update_config(struct fw_dump *fadump_conf,
 				      const struct rtas_fadump_mem_struct *fdm)
@@ -34,6 +35,25 @@ static void rtas_fadump_update_config(struct fw_dump *fadump_conf,
 				       fadump_conf->boot_memory_size);
 }
 
+/*
+ * This function is called in the capture kernel to get configuration details
+ * set up in the first kernel and passed to the f/w.
+ */
+static void rtas_fadump_get_config(struct fw_dump *fadump_conf,
+				   const struct rtas_fadump_mem_struct *fdm)
+{
+	fadump_conf->boot_memory_size = be64_to_cpu(fdm->rmr_region.source_len);
+
+	/*
+	 * Start address of reserve dump area (permanent reservation) for
+	 * re-registering FADump after dump capture.
+	 */
+	fadump_conf->reserve_dump_area_start =
+		be64_to_cpu(fdm->cpu_state_data.destination_address);
+
+	rtas_fadump_update_config(fadump_conf, fdm);
+}
+
 static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf)
 {
 	u64 addr = fadump_conf->reserve_dump_area_start;
@@ -171,7 +191,183 @@ static int rtas_fadump_unregister(struct fw_dump *fadump_conf)
 
 static int rtas_fadump_invalidate(struct fw_dump *fadump_conf)
 {
-	return -EIO;
+	unsigned int wait_time;
+	int rc;
+
+	/* TODO: Add upper time limit for the delay */
+	do {
+		rc =  rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1,
+				NULL, FADUMP_INVALIDATE, fdm_active,
+				sizeof(struct rtas_fadump_mem_struct));
+
+		wait_time = rtas_busy_delay_time(rc);
+		if (wait_time)
+			mdelay(wait_time);
+	} while (wait_time);
+
+	if (rc) {
+		pr_err("Failed to invalidate - unexpected error (%d).\n", rc);
+		return -EIO;
+	}
+
+	fadump_conf->dump_active = 0;
+	fdm_active = NULL;
+	return 0;
+}
+
+#define RTAS_FADUMP_GPR_MASK		0xffffff0000000000
+static inline int rtas_fadump_gpr_index(u64 id)
+{
+	char str[3];
+	int i = -1;
+
+	if ((id & RTAS_FADUMP_GPR_MASK) == fadump_str_to_u64("GPR")) {
+		/* get the digits at the end */
+		id &= ~RTAS_FADUMP_GPR_MASK;
+		id >>= 24;
+		str[2] = '\0';
+		str[1] = id & 0xff;
+		str[0] = (id >> 8) & 0xff;
+		if (kstrtoint(str, 10, &i))
+			i = -EINVAL;
+		if (i > 31)
+			i = -1;
+	}
+	return i;
+}
+
+void rtas_fadump_set_regval(struct pt_regs *regs, u64 reg_id, u64 reg_val)
+{
+	int i;
+
+	i = rtas_fadump_gpr_index(reg_id);
+	if (i >= 0)
+		regs->gpr[i] = (unsigned long)reg_val;
+	else if (reg_id == fadump_str_to_u64("NIA"))
+		regs->nip = (unsigned long)reg_val;
+	else if (reg_id == fadump_str_to_u64("MSR"))
+		regs->msr = (unsigned long)reg_val;
+	else if (reg_id == fadump_str_to_u64("CTR"))
+		regs->ctr = (unsigned long)reg_val;
+	else if (reg_id == fadump_str_to_u64("LR"))
+		regs->link = (unsigned long)reg_val;
+	else if (reg_id == fadump_str_to_u64("XER"))
+		regs->xer = (unsigned long)reg_val;
+	else if (reg_id == fadump_str_to_u64("CR"))
+		regs->ccr = (unsigned long)reg_val;
+	else if (reg_id == fadump_str_to_u64("DAR"))
+		regs->dar = (unsigned long)reg_val;
+	else if (reg_id == fadump_str_to_u64("DSISR"))
+		regs->dsisr = (unsigned long)reg_val;
+}
+
+static struct rtas_fadump_reg_entry*
+rtas_fadump_read_regs(struct rtas_fadump_reg_entry *reg_entry,
+		      struct pt_regs *regs)
+{
+	memset(regs, 0, sizeof(struct pt_regs));
+
+	while (be64_to_cpu(reg_entry->reg_id) != fadump_str_to_u64("CPUEND")) {
+		rtas_fadump_set_regval(regs, be64_to_cpu(reg_entry->reg_id),
+				       be64_to_cpu(reg_entry->reg_value));
+		reg_entry++;
+	}
+	reg_entry++;
+	return reg_entry;
+}
+
+/*
+ * Read CPU state dump data and convert it into ELF notes.
+ * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be
+ * used to access the data to allow for additional fields to be added without
+ * affecting compatibility. Each list of registers for a CPU starts with
+ * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes,
+ * 8 Byte ASCII identifier and 8 Byte register value. The register entry
+ * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part
+ * of register value. For more details refer to PAPR document.
+ *
+ * Only for the crashing cpu we ignore the CPU dump data and get exact
+ * state from fadump crash info structure populated by first kernel at the
+ * time of crash.
+ */
+static int __init rtas_fadump_build_cpu_notes(struct fw_dump *fadump_conf)
+{
+	struct rtas_fadump_reg_save_area_header *reg_header;
+	struct fadump_crash_info_header *fdh = NULL;
+	struct rtas_fadump_reg_entry *reg_entry;
+	u32 num_cpus, *note_buf;
+	int i, rc = 0, cpu = 0;
+	struct pt_regs regs;
+	unsigned long addr;
+	void *vaddr;
+
+	addr = be64_to_cpu(fdm_active->cpu_state_data.destination_address);
+	vaddr = __va(addr);
+
+	reg_header = vaddr;
+	if (be64_to_cpu(reg_header->magic_number) !=
+	    fadump_str_to_u64("REGSAVE")) {
+		pr_err("Unable to read register save area.\n");
+		return -ENOENT;
+	}
+
+	pr_debug("--------CPU State Data------------\n");
+	pr_debug("Magic Number: %llx\n", be64_to_cpu(reg_header->magic_number));
+	pr_debug("NumCpuOffset: %x\n", be32_to_cpu(reg_header->num_cpu_offset));
+
+	vaddr += be32_to_cpu(reg_header->num_cpu_offset);
+	num_cpus = be32_to_cpu(*((__be32 *)(vaddr)));
+	pr_debug("NumCpus     : %u\n", num_cpus);
+	vaddr += sizeof(u32);
+	reg_entry = (struct rtas_fadump_reg_entry *)vaddr;
+
+	rc = fadump_setup_cpu_notes_buf(num_cpus);
+	if (rc != 0)
+		return rc;
+
+	note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr;
+
+	if (fadump_conf->fadumphdr_addr)
+		fdh = __va(fadump_conf->fadumphdr_addr);
+
+	for (i = 0; i < num_cpus; i++) {
+		if (be64_to_cpu(reg_entry->reg_id) !=
+		    fadump_str_to_u64("CPUSTRT")) {
+			pr_err("Unable to read CPU state data\n");
+			rc = -ENOENT;
+			goto error_out;
+		}
+		/* Lower 4 bytes of reg_value contains logical cpu id */
+		cpu = (be64_to_cpu(reg_entry->reg_value) &
+		       RTAS_FADUMP_CPU_ID_MASK);
+		if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) {
+			RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry);
+			continue;
+		}
+		pr_debug("Reading register data for cpu %d...\n", cpu);
+		if (fdh && fdh->crashing_cpu == cpu) {
+			regs = fdh->regs;
+			note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+			RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry);
+		} else {
+			reg_entry++;
+			reg_entry = rtas_fadump_read_regs(reg_entry, &regs);
+			note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+		}
+	}
+	final_note(note_buf);
+
+	if (fdh) {
+		pr_debug("Updating elfcore header (%llx) with cpu notes\n",
+			 fdh->elfcorehdr_addr);
+		fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr));
+	}
+	return 0;
+
+error_out:
+	fadump_free_cpu_notes_buf();
+	return rc;
 }
 
 /*
@@ -180,14 +376,58 @@ static int rtas_fadump_invalidate(struct fw_dump *fadump_conf)
  */
 static int __init rtas_fadump_process(struct fw_dump *fadump_conf)
 {
-	return -EINVAL;
+	struct fadump_crash_info_header *fdh;
+	int rc = 0;
+
+	if (!fdm_active || !fadump_conf->fadumphdr_addr)
+		return -EINVAL;
+
+	/* Check if the dump data is valid. */
+	if ((be16_to_cpu(fdm_active->header.dump_status_flag) ==
+			RTAS_FADUMP_ERROR_FLAG) ||
+			(fdm_active->cpu_state_data.error_flags != 0) ||
+			(fdm_active->rmr_region.error_flags != 0)) {
+		pr_err("Dump taken by platform is not valid\n");
+		return -EINVAL;
+	}
+	if ((fdm_active->rmr_region.bytes_dumped !=
+			fdm_active->rmr_region.source_len) ||
+			!fdm_active->cpu_state_data.bytes_dumped) {
+		pr_err("Dump taken by platform is incomplete\n");
+		return -EINVAL;
+	}
+
+	/* Validate the fadump crash info header */
+	fdh = __va(fadump_conf->fadumphdr_addr);
+	if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
+		pr_err("Crash info header is not valid.\n");
+		return -EINVAL;
+	}
+
+	rc = rtas_fadump_build_cpu_notes(fadump_conf);
+	if (rc)
+		return rc;
+
+	/*
+	 * We are done validating dump info and the elfcore header is now
+	 * ready to be exported. Set elfcorehdr_addr so that the vmcore
+	 * module will export the elfcore header through '/proc/vmcore'.
+	 */
+	elfcorehdr_addr = fdh->elfcorehdr_addr;
+
+	return 0;
 }
 
 static void rtas_fadump_region_show(struct fw_dump *fadump_conf,
 				    struct seq_file *m)
 {
-	const struct rtas_fadump_mem_struct *fdm_ptr = &fdm;
 	const struct rtas_fadump_section *cpu_data_section;
+	const struct rtas_fadump_mem_struct *fdm_ptr;
+
+	if (fdm_active)
+		fdm_ptr = fdm_active;
+	else
+		fdm_ptr = &fdm;
 
 	cpu_data_section = &(fdm_ptr->cpu_state_data);
 	seq_printf(m, "CPU :[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n",
@@ -210,6 +450,12 @@ static void rtas_fadump_region_show(struct fw_dump *fadump_conf,
 	seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n",
 		   be64_to_cpu(fdm_ptr->rmr_region.source_len),
 		   be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped));
+
+	/* Dump is active. Show reserved area start address. */
+	if (fdm_active) {
+		seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n",
+			   fadump_conf->reserve_dump_area_start);
+	}
 }
 
 static void rtas_fadump_trigger(struct fadump_crash_info_header *fdh,
@@ -247,6 +493,17 @@ void __init rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
 	fadump_conf->ops		= &rtas_fadump_ops;
 	fadump_conf->fadump_supported	= 1;
 
+	/*
+	 * The 'ibm,kernel-dump' rtas property is present only if there is
+	 * dump data waiting for us.
+	 */
+	fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL);
+	if (fdm_active) {
+		pr_info("Firmware-assisted dump is active.\n");
+		fadump_conf->dump_active = 1;
+		rtas_fadump_get_config(fadump_conf, (void *)__pa(fdm_active));
+	}
+
 	/* Get the sizes required to store dump data for the firmware provided
 	 * dump sections.
 	 * For each dump section type supported, a 32bit cell which defines
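
For reference, the register-identifier scheme used by rtas_fadump_gpr_index()
above can be exercised standalone. Below is a minimal userspace sketch, not
the kernel code itself: str_to_u64() and gpr_index() are simplified stand-ins
for fadump_str_to_u64() and rtas_fadump_gpr_index(), with atoi() standing in
for kstrtoint()::

  #include <stdio.h>
  #include <stdint.h>
  #include <stdlib.h>

  #define GPR_MASK 0xffffff0000000000ULL

  /* Pack up to 8 ASCII chars into a u64, left-justified. */
  static uint64_t str_to_u64(const char *str)
  {
          uint64_t val = 0;
          size_t i;

          for (i = 0; i < sizeof(val); i++)
                  val = (*str) ? (val << 8) | (uint8_t)*str++ : val << 8;
          return val;
  }

  /* Recover the GPR number from an identifier such as "GPR05". */
  static int gpr_index(uint64_t id)
  {
          char str[3];
          int i = -1;

          if ((id & GPR_MASK) == str_to_u64("GPR")) {
                  id &= ~GPR_MASK;        /* keep the digit bytes */
                  id >>= 24;
                  str[0] = (id >> 8) & 0xff;
                  str[1] = id & 0xff;
                  str[2] = '\0';
                  i = atoi(str);
                  if (i > 31)
                          i = -1;
          }
          return i;
  }

  int main(void)
  {
          printf("GPR05 -> %d\n", gpr_index(str_to_u64("GPR05"))); /* 5 */
          printf("GPR31 -> %d\n", gpr_index(str_to_u64("GPR31"))); /* 31 */
          printf("NIA   -> %d\n", gpr_index(str_to_u64("NIA")));   /* -1 */
          return 0;
  }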

From 1679b96e69ec2e10ccd9d82b691ade0b16cb64d1 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:19:58 +0530
Subject: [PATCH 231/264] powerpc/fadump: use FADump instead of fadump for how
 it is pronounced

FADump is pronounced f-a-dump, so update the documentation to use that
capitalization. Also, update the example fadump_region contents to
reflect recent changes.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821339317.5656.15852294223821732082.stgit@hbathini.in.ibm.com
---
 .../powerpc/firmware-assisted-dump.rst        | 55 ++++++++++---------
 1 file changed, 30 insertions(+), 25 deletions(-)

diff --git a/Documentation/powerpc/firmware-assisted-dump.rst b/Documentation/powerpc/firmware-assisted-dump.rst
index 563c021df539..d912755067d2 100644
--- a/Documentation/powerpc/firmware-assisted-dump.rst
+++ b/Documentation/powerpc/firmware-assisted-dump.rst
@@ -9,18 +9,18 @@ a crashed system, and to do so from a fully-reset system, and
 to minimize the total elapsed time until the system is back
 in production use.
 
-- Firmware assisted dump (fadump) infrastructure is intended to replace
+- Firmware-Assisted Dump (FADump) infrastructure is intended to replace
   the existing phyp assisted dump.
 - Fadump uses the same firmware interfaces and memory reservation model
   as phyp assisted dump.
-- Unlike phyp dump, fadump exports the memory dump through /proc/vmcore
+- Unlike phyp dump, FADump exports the memory dump through /proc/vmcore
   in the ELF format in the same way as kdump. This helps us reuse the
   kdump infrastructure for dump capture and filtering.
 - Unlike phyp dump, userspace tool does not need to refer any sysfs
   interface while reading /proc/vmcore.
-- Unlike phyp dump, fadump allows user to release all the memory reserved
+- Unlike phyp dump, FADump allows user to release all the memory reserved
   for dump, with a single operation of echo 1 > /sys/kernel/fadump_release_mem.
-- Once enabled through kernel boot parameter, fadump can be
+- Once enabled through kernel boot parameter, FADump can be
   started/stopped through /sys/kernel/fadump_registered interface (see
   sysfs files section below) and can be easily integrated with kdump
   service start/stop init scripts.
@@ -34,7 +34,7 @@ dump offers several strong, practical advantages:
    in a clean, consistent state.
 -  Once the dump is copied out, the memory that held the dump
    is immediately available to the running kernel. And therefore,
-   unlike kdump, fadump doesn't need a 2nd reboot to get back
+   unlike kdump, FADump doesn't need a 2nd reboot to get back
    the system to the production configuration.
 
 The above can only be accomplished by coordination with,
@@ -63,7 +63,7 @@ as follows:
          boot successfully. For syntax of crashkernel= parameter,
          refer to Documentation/admin-guide/kdump/kdump.rst. If any offset is
          provided in crashkernel= parameter, it will be ignored
-         as fadump uses a predefined offset to reserve memory
+         as FADump uses a predefined offset to reserve memory
          for boot memory dump preservation in case of a crash.
 
 -  After the low memory (boot memory) area has been saved, the
@@ -123,7 +123,7 @@ blocking this significant chunk of memory from production kernel.
 Hence, the implementation uses the Linux kernel's Contiguous Memory
 Allocator (CMA) for memory reservation if CMA is configured for kernel.
 With CMA reservation this memory will be available for applications to
-use it, while kernel is prevented from using it. With this fadump will
+use, while the kernel is prevented from using it. With this, FADump will
 still be able to capture all of the kernel memory and most of the user
 space memory except the user pages that were present in CMA region::
 
@@ -173,14 +173,14 @@ KDump, as dump mechanism.
 The tools to examine the dump will be same as the ones
 used for kdump.
 
-How to enable firmware-assisted dump (fadump):
+How to enable firmware-assisted dump (FADump):
 ----------------------------------------------
 
 1. Set config option CONFIG_FA_DUMP=y and build kernel.
 2. Boot into linux kernel with 'fadump=on' kernel cmdline option.
-   By default, fadump reserved memory will be initialized as CMA area.
+   By default, FADump reserved memory will be initialized as CMA area.
    Alternatively, user can boot linux kernel with 'fadump=nocma' to
-   prevent fadump to use CMA.
+   prevent FADump from using CMA.
 3. Optionally, user can also set 'crashkernel=' kernel cmdline
    to specify size of the memory to reserve for boot memory dump
    preservation.
@@ -206,29 +206,29 @@ the control files and debugfs file to display memory reserved region.
 Here is the list of files under kernel sysfs:
 
  /sys/kernel/fadump_enabled
-    This is used to display the fadump status.
+    This is used to display the FADump status.
 
-    - 0 = fadump is disabled
-    - 1 = fadump is enabled
+    - 0 = FADump is disabled
+    - 1 = FADump is enabled
 
     This interface can be used by kdump init scripts to identify if
-    fadump is enabled in the kernel and act accordingly.
+    FADump is enabled in the kernel and act accordingly.
 
  /sys/kernel/fadump_registered
-    This is used to display the fadump registration status as well
-    as to control (start/stop) the fadump registration.
+    This is used to display the FADump registration status as well
+    as to control (start/stop) the FADump registration.
 
-    - 0 = fadump is not registered.
-    - 1 = fadump is registered and ready to handle system crash.
+    - 0 = FADump is not registered.
+    - 1 = FADump is registered and ready to handle system crash.
 
-    To register fadump echo 1 > /sys/kernel/fadump_registered and
+    To register FADump, echo 1 > /sys/kernel/fadump_registered and
     echo 0 > /sys/kernel/fadump_registered for un-register and stop the
-    fadump. Once the fadump is un-registered, the system crash will not
+    FADump. Once FADump is un-registered, a system crash will not
     be handled and vmcore will not be captured. This interface can be
     easily integrated with kdump service start/stop.
 
  /sys/kernel/fadump_release_mem
-    This file is available only when fadump is active during
+    This file is available only when FADump is active during
     second kernel. This is used to release the reserved memory
     region that are held for saving crash dump. To release the
     reserved memory echo 1 to it::
@@ -246,21 +246,25 @@ Here is the list of files under powerpc debugfs:
 (Assuming debugfs is mounted on /sys/kernel/debug directory.)
 
  /sys/kernel/debug/powerpc/fadump_region
-    This file shows the reserved memory regions if fadump is
+    This file shows the reserved memory regions if FADump is
     enabled otherwise this file is empty. The output format
     is::
 
       <region>: [<start>-<end>] <reserved-size> bytes, Dumped: <dump-size>
 
+    and for the kernel DUMP region it is::
+
+      DUMP: Src: <src-addr>, Dest: <dest-addr>, Size: <size>, Dumped: # bytes
+
     e.g.
-    Contents when fadump is registered during first kernel::
+    Contents when FADump is registered during first kernel::
 
       # cat /sys/kernel/debug/powerpc/fadump_region
       CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x0
       HPTE: [0x0000006fff0020-0x0000006fff101f] 0x1000 bytes, Dumped: 0x0
       DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x0
 
-    Contents when fadump is active during second kernel::
+    Contents when FADump is active during second kernel::
 
       # cat /sys/kernel/debug/powerpc/fadump_region
       CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x40020
@@ -268,6 +272,7 @@ Here is the list of files under powerpc debugfs:
       DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x10000000
           : [0x00000010000000-0x0000006ffaffff] 0x5ffb0000 bytes, Dumped: 0x5ffb0000
 
+
 NOTE:
       Please refer to Documentation/filesystems/debugfs.txt on
       how to mount the debugfs filesystem.
@@ -278,7 +283,7 @@ TODO:
  - Need to come up with the better approach to find out more
    accurate boot memory size that is required for a kernel to
    boot successfully when booted with restricted memory.
- - The fadump implementation introduces a fadump crash info structure
+ - The FADump implementation introduces a FADump crash info structure
    in the scratch area before the ELF core header. The idea of introducing
    this structure is to pass some important crash info data to the second
    kernel which will help second kernel to populate ELF core header with

From 6f5f193e84d3d7b55d406ecc7ac4ea1ef1d1876f Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:20:12 +0530
Subject: [PATCH 232/264] powerpc/opal: add MPIPL interface definitions

MPIPL (Memory Preserving IPL) is supported from POWER9 onwards. It
enables the kernel to reset the system with memory 'preserved'. Also,
it supports copying memory from a source address to a destination
address during MPIPL boot. Add MPIPL interface definitions here to
leverage these f/w features while adding FADump support for the
PowerNV platform.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821340710.5656.10071829040515662624.stgit@hbathini.in.ibm.com
---
 arch/powerpc/include/asm/opal-api.h        | 44 +++++++++++++++++++++-
 arch/powerpc/include/asm/opal.h            |  5 +++
 arch/powerpc/platforms/powernv/opal-call.c |  3 ++
 3 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index 1cad413e1e0e..378e3997845a 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -208,7 +208,10 @@
 #define OPAL_HANDLE_HMI2			166
 #define	OPAL_NX_COPROC_INIT			167
 #define OPAL_XIVE_GET_VP_STATE			170
-#define OPAL_LAST				170
+#define OPAL_MPIPL_UPDATE			173
+#define OPAL_MPIPL_REGISTER_TAG			174
+#define OPAL_MPIPL_QUERY_TAG			175
+#define OPAL_LAST				175
 
 #define QUIESCE_HOLD			1 /* Spin all calls at entry */
 #define QUIESCE_REJECT			2 /* Fail all calls with OPAL_BUSY */
@@ -1060,6 +1063,7 @@ enum {
 	OPAL_REBOOT_NORMAL		= 0,
 	OPAL_REBOOT_PLATFORM_ERROR	= 1,
 	OPAL_REBOOT_FULL_IPL		= 2,
+	OPAL_REBOOT_MPIPL		= 3,
 };
 
 /* Argument to OPAL_PCI_TCE_KILL */
@@ -1136,6 +1140,44 @@ enum {
 #define OPAL_PCI_P2P_LOAD		0x2
 #define OPAL_PCI_P2P_STORE		0x4
 
+/* MPIPL update operations */
+enum opal_mpipl_ops {
+	OPAL_MPIPL_ADD_RANGE			= 0,
+	OPAL_MPIPL_REMOVE_RANGE			= 1,
+	OPAL_MPIPL_REMOVE_ALL			= 2,
+	OPAL_MPIPL_FREE_PRESERVED_MEMORY	= 3,
+};
+
+/* Each tag points to a metadata area. The kernel
+ * uses the tag to retrieve the metadata address.
+ */
+enum opal_mpipl_tags {
+	OPAL_MPIPL_TAG_CPU	= 0,
+	OPAL_MPIPL_TAG_OPAL	= 1,
+	OPAL_MPIPL_TAG_KERNEL	= 2,
+	OPAL_MPIPL_TAG_BOOT_MEM	= 3,
+};
+
+/* Preserved memory details */
+struct opal_mpipl_region {
+	__be64	src;
+	__be64	dest;
+	__be64	size;
+};
+
+/* Structure version */
+#define OPAL_MPIPL_VERSION		0x01
+
+struct opal_mpipl_fadump {
+	u8	version;
+	u8	reserved[7];
+	__be32	crashing_pir;	/* OPAL crashing CPU PIR */
+	__be32	cpu_data_version;
+	__be32	cpu_data_size;
+	__be32	region_cnt;
+	struct	opal_mpipl_region region[];
+} __packed;
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __OPAL_API_H */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index d5a0807d21db..a0cf8fba4d12 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -39,6 +39,7 @@ int64_t opal_npu_spa_clear_cache(uint64_t phb_id, uint32_t bdfn,
 				uint64_t PE_handle);
 int64_t opal_npu_tl_set(uint64_t phb_id, uint32_t bdfn, long cap,
 			uint64_t rate_phys, uint32_t size);
+
 int64_t opal_console_write(int64_t term_number, __be64 *length,
 			   const uint8_t *buffer);
 int64_t opal_console_read(int64_t term_number, __be64 *length,
@@ -297,6 +298,10 @@ int opal_sensor_group_clear(u32 group_hndl, int token);
 int opal_sensor_group_enable(u32 group_hndl, int token, bool enable);
 int opal_nx_coproc_init(uint32_t chip_id, uint32_t ct);
 
+s64 opal_mpipl_update(enum opal_mpipl_ops op, u64 src, u64 dest, u64 size);
+s64 opal_mpipl_register_tag(enum opal_mpipl_tags tag, u64 addr);
+s64 opal_mpipl_query_tag(enum opal_mpipl_tags tag, u64 *addr);
+
 s64 opal_signal_system_reset(s32 cpu);
 s64 opal_quiesce(u64 shutdown_type, s32 cpu);
 
diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c
index dccdc9df5213..a2aa5e433ac8 100644
--- a/arch/powerpc/platforms/powernv/opal-call.c
+++ b/arch/powerpc/platforms/powernv/opal-call.c
@@ -287,3 +287,6 @@ OPAL_CALL(opal_pci_set_pbcq_tunnel_bar,		OPAL_PCI_SET_PBCQ_TUNNEL_BAR);
 OPAL_CALL(opal_sensor_read_u64,			OPAL_SENSOR_READ_U64);
 OPAL_CALL(opal_sensor_group_enable,		OPAL_SENSOR_GROUP_ENABLE);
 OPAL_CALL(opal_nx_coproc_init,			OPAL_NX_COPROC_INIT);
+OPAL_CALL(opal_mpipl_update,			OPAL_MPIPL_UPDATE);
+OPAL_CALL(opal_mpipl_register_tag,		OPAL_MPIPL_REGISTER_TAG);
+OPAL_CALL(opal_mpipl_query_tag,			OPAL_MPIPL_QUERY_TAG);
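
The metadata handed around via these calls is self-describing: a fixed
header followed by region_cnt copies of struct opal_mpipl_region. A
host-side sketch of that layout (with host-endian stand-ins for the
__be fields, so the values are illustrative only)::

  #include <stdio.h>
  #include <stdint.h>
  #include <stdlib.h>

  struct mpipl_region {
          uint64_t src;
          uint64_t dest;
          uint64_t size;
  };

  struct mpipl_fadump {
          uint8_t  version;
          uint8_t  reserved[7];
          uint32_t crashing_pir;          /* OPAL crashing CPU PIR */
          uint32_t cpu_data_version;
          uint32_t cpu_data_size;
          uint32_t region_cnt;
          struct mpipl_region region[];   /* region_cnt entries follow */
  };

  int main(void)
  {
          uint32_t cnt = 3;
          size_t sz = sizeof(struct mpipl_fadump) +
                      cnt * sizeof(struct mpipl_region);
          struct mpipl_fadump *f = calloc(1, sz);

          if (!f)
                  return 1;
          f->version = 0x01;
          f->region_cnt = cnt;
          for (uint32_t i = 0; i < cnt; i++) {
                  f->region[i].src  = i * 0x10000000ULL;
                  f->region[i].dest = 0x30000000ULL + i * 0x10000000ULL;
                  f->region[i].size = 0x10000000ULL;
          }
          printf("header %zu bytes + %u regions = %zu bytes total\n",
                 sizeof(struct mpipl_fadump), cnt, sz);
          free(f);
          return 0;
  }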

From 41df5928721ff4b5f83767cd5e8b77862fc62bb3 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:20:26 +0530
Subject: [PATCH 233/264] powerpc/fadump: add fadump support on powernv

Add basic callback functions for FADump on PowerNV platform.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821342072.5656.4346362203141486452.stgit@hbathini.in.ibm.com
---
 arch/powerpc/Kconfig                         |  5 +-
 arch/powerpc/include/asm/fadump-internal.h   |  7 ++
 arch/powerpc/kernel/fadump.c                 | 15 +++-
 arch/powerpc/platforms/powernv/Makefile      |  1 +
 arch/powerpc/platforms/powernv/opal-fadump.c | 92 ++++++++++++++++++++
 5 files changed, 115 insertions(+), 5 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index edc19363a729..375ff893fa60 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -569,7 +569,7 @@ config CRASH_DUMP
 
 config FA_DUMP
 	bool "Firmware-assisted dump"
-	depends on PPC64 && PPC_RTAS
+	depends on PPC64 && (PPC_RTAS || PPC_POWERNV)
 	select CRASH_CORE
 	select CRASH_DUMP
 	help
@@ -580,7 +580,8 @@ config FA_DUMP
 	  is meant to be a kdump replacement offering robustness and
 	  speed not possible without system firmware assistance.
 
-	  If unsure, say "N"
+	  If unsure, say "y". Only special kernels like petitboot may
+	  need to say "N" here.
 
 config IRQ_ALL_CPUS
 	bool "Distribute interrupts on all CPUs by default"
diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h
index 8c0eb0fc09d0..8046fe0b742e 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -135,4 +135,11 @@ static inline void
 rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) { }
 #endif
 
+#ifdef CONFIG_PPC_POWERNV
+extern void opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node);
+#else
+static inline void
+opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) { }
+#endif
+
 #endif /* _ASM_POWERPC_FADUMP_INTERNAL_H */
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 9d9f7c384a71..b17673d8d50b 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -107,11 +107,20 @@ static int __init fadump_cma_init(void) { return 1; }
 int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
 				      int depth, void *data)
 {
-	if (depth != 1 || strcmp(uname, "rtas") != 0)
+	if (depth != 1)
 		return 0;
 
-	rtas_fadump_dt_scan(&fw_dump, node);
-	return 1;
+	if (strcmp(uname, "rtas") == 0) {
+		rtas_fadump_dt_scan(&fw_dump, node);
+		return 1;
+	}
+
+	if (strcmp(uname, "ibm,opal") == 0) {
+		opal_fadump_dt_scan(&fw_dump, node);
+		return 1;
+	}
+
+	return 0;
 }
 
 /*
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 49186f0985a4..3e6d6d6d1ec1 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -7,6 +7,7 @@ obj-y			+= opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
 obj-y			+= ultravisor.o
 
 obj-$(CONFIG_SMP)	+= smp.o subcore.o subcore-asm.o
+obj-$(CONFIG_FA_DUMP)	+= opal-fadump.o
 obj-$(CONFIG_PCI)	+= pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
 obj-$(CONFIG_CXL_BASE)	+= pci-cxl.o
 obj-$(CONFIG_EEH)	+= eeh-powernv.o
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
new file mode 100644
index 000000000000..bbc5356bdc77
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Firmware-Assisted Dump support on POWER platform (OPAL).
+ *
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "opal fadump: " fmt
+
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
+
+#include <asm/opal.h>
+#include <asm/fadump-internal.h>
+
+static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf)
+{
+	return fadump_conf->reserve_dump_area_start;
+}
+
+static int opal_fadump_register(struct fw_dump *fadump_conf)
+{
+	return -EIO;
+}
+
+static int opal_fadump_unregister(struct fw_dump *fadump_conf)
+{
+	return -EIO;
+}
+
+static int opal_fadump_invalidate(struct fw_dump *fadump_conf)
+{
+	return -EIO;
+}
+
+static int __init opal_fadump_process(struct fw_dump *fadump_conf)
+{
+	return -EINVAL;
+}
+
+static void opal_fadump_region_show(struct fw_dump *fadump_conf,
+				    struct seq_file *m)
+{
+}
+
+static void opal_fadump_trigger(struct fadump_crash_info_header *fdh,
+				const char *msg)
+{
+	int rc;
+
+	rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, msg);
+	if (rc == OPAL_UNSUPPORTED) {
+		pr_emerg("Reboot type %d not supported.\n",
+			 OPAL_REBOOT_MPIPL);
+	} else if (rc == OPAL_HARDWARE) {
+		pr_emerg("No backend support for MPIPL!\n");
+	}
+}
+
+static struct fadump_ops opal_fadump_ops = {
+	.fadump_init_mem_struct		= opal_fadump_init_mem_struct,
+	.fadump_register		= opal_fadump_register,
+	.fadump_unregister		= opal_fadump_unregister,
+	.fadump_invalidate		= opal_fadump_invalidate,
+	.fadump_process			= opal_fadump_process,
+	.fadump_region_show		= opal_fadump_region_show,
+	.fadump_trigger			= opal_fadump_trigger,
+};
+
+void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
+{
+	unsigned long dn;
+
+	/*
+	 * Check if Firmware-Assisted Dump is supported. If yes, check
+	 * if a dump was initiated on the last reboot.
+	 */
+	dn = of_get_flat_dt_subnode_by_name(node, "dump");
+	if (dn == -FDT_ERR_NOTFOUND) {
+		pr_debug("FADump support is missing!\n");
+		return;
+	}
+
+	if (!of_flat_dt_is_compatible(dn, "ibm,opal-dump")) {
+		pr_err("Support missing for this f/w version!\n");
+		return;
+	}
+
+	fadump_conf->ops		= &opal_fadump_ops;
+	fadump_conf->fadump_supported	= 1;
+}
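
The platform split above hinges on the fadump_ops callback table: generic
fadump code only ever calls through fw_dump.ops, and each platform installs
its own table during the early device-tree scan. A trimmed-down sketch of
that dispatch (names shortened, callbacks reduced to two for brevity)::

  #include <stdio.h>

  struct fw_dump;

  struct fadump_ops {
          int (*fadump_register)(struct fw_dump *conf);
          int (*fadump_unregister)(struct fw_dump *conf);
  };

  struct fw_dump {
          const struct fadump_ops *ops;
          int fadump_supported;
  };

  static int opal_register(struct fw_dump *conf)
  {
          (void)conf;
          puts("opal: register for MPIPL");
          return 0;
  }

  static int opal_unregister(struct fw_dump *conf)
  {
          (void)conf;
          puts("opal: unregister");
          return 0;
  }

  static const struct fadump_ops opal_ops = {
          .fadump_register   = opal_register,
          .fadump_unregister = opal_unregister,
  };

  int main(void)
  {
          /* What opal_fadump_dt_scan() effectively does on a match. */
          struct fw_dump fw = { .ops = &opal_ops, .fadump_supported = 1 };

          /* Generic code calls through the table, never the platform. */
          if (fw.fadump_supported)
                  fw.ops->fadump_register(&fw);
          return 0;
  }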

From 6abec12c65e8870d8cafe154a86240fe0bcdd4f7 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:20:41 +0530
Subject: [PATCH 234/264] powerpc/fadump: improve fadump_reserve_mem()

Some code clean-up: use minimal assignments and update printk
messages. Also, add an 'error_out' label so error cleanup is handled
in one place.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821343485.5656.10202857091553646948.stgit@hbathini.in.ibm.com
---
 arch/powerpc/kernel/fadump.c | 52 +++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index b17673d8d50b..7d47d4bb7d6e 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -338,16 +338,15 @@ static void __init fadump_reserve_crash_area(unsigned long base,
 
 int __init fadump_reserve_mem(void)
 {
-	unsigned long base, size, memory_boundary;
+	u64 base, size, mem_boundary;
+	int ret = 1;
 
 	if (!fw_dump.fadump_enabled)
 		return 0;
 
 	if (!fw_dump.fadump_supported) {
-		printk(KERN_INFO "Firmware-assisted dump is not supported on"
-				" this hardware\n");
-		fw_dump.fadump_enabled = 0;
-		return 0;
+		pr_info("Firmware-Assisted Dump is not supported on this hardware\n");
+		goto error_out;
 	}
 	/*
 	 * Initialize boot memory size
@@ -355,7 +354,8 @@ int __init fadump_reserve_mem(void)
 	 * first kernel.
 	 */
 	if (!fw_dump.dump_active) {
-		fw_dump.boot_memory_size = fadump_calculate_reserve_size();
+		fw_dump.boot_memory_size =
+			PAGE_ALIGN(fadump_calculate_reserve_size());
 #ifdef CONFIG_CMA
 		if (!fw_dump.nocma)
 			fw_dump.boot_memory_size =
@@ -381,10 +381,11 @@ int __init fadump_reserve_mem(void)
 				" dump, now %#016llx\n", memory_limit);
 	}
 	if (memory_limit)
-		memory_boundary = memory_limit;
+		mem_boundary = memory_limit;
 	else
-		memory_boundary = memblock_end_of_DRAM();
+		mem_boundary = memblock_end_of_DRAM();
 
+	base = fw_dump.boot_memory_size;
 	size = get_fadump_area_size();
 	fw_dump.reserve_dump_area_size = size;
 	if (fw_dump.dump_active) {
@@ -404,8 +405,7 @@ int __init fadump_reserve_mem(void)
 		 * dump is written to disk by userspace tool. This memory
 		 * will be released for general use once the dump is saved.
 		 */
-		base = fw_dump.boot_memory_size;
-		size = memory_boundary - base;
+		size = mem_boundary - base;
 		fadump_reserve_crash_area(base, size);
 
 		pr_debug("fadumphdr_addr = %#016lx\n", fw_dump.fadumphdr_addr);
@@ -418,29 +418,31 @@ int __init fadump_reserve_mem(void)
 		 * use memblock_find_in_range() here since it doesn't allocate
 		 * from bottom to top.
 		 */
-		for (base = fw_dump.boot_memory_size;
-		     base <= (memory_boundary - size);
-		     base += size) {
+		while (base <= (mem_boundary - size)) {
 			if (memblock_is_region_memory(base, size) &&
 			    !memblock_is_region_reserved(base, size))
 				break;
-		}
-		if ((base > (memory_boundary - size)) ||
-		    memblock_reserve(base, size)) {
-			pr_err("Failed to reserve memory\n");
-			return 0;
+
+			base += size;
 		}
 
-		pr_info("Reserved %ldMB of memory at %ldMB for firmware-"
-			"assisted dump (System RAM: %ldMB)\n",
-			(unsigned long)(size >> 20),
-			(unsigned long)(base >> 20),
-			(unsigned long)(memblock_phys_mem_size() >> 20));
+		if ((base > (mem_boundary - size)) ||
+		    memblock_reserve(base, size)) {
+			pr_err("Failed to reserve memory!\n");
+			goto error_out;
+		}
+
+		pr_info("Reserved %lldMB of memory at %#016llx (System RAM: %lldMB)\n",
+			(size >> 20), base, (memblock_phys_mem_size() >> 20));
 
 		fw_dump.reserve_dump_area_start = base;
-		return fadump_cma_init();
+		ret = fadump_cma_init();
 	}
-	return 1;
+
+	return ret;
+error_out:
+	fw_dump.fadump_enabled = 0;
+	return 0;
 }
 
 unsigned long __init arch_reserved_kernel_pages(void)
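
The reworked search loop is worth seeing in isolation: walk upward from the
end of boot memory in steps of the wanted size and take the first window
that is plain, unreserved memory. In this sketch, region_free() is a
hypothetical stand-in for the memblock_is_region_memory()/
memblock_is_region_reserved() pair::

  #include <stdio.h>
  #include <stdint.h>

  /* One reserved range stands in for memblock state in this demo. */
  static const uint64_t rsv_start = 0x20000000, rsv_end = 0x30000000;

  static int region_free(uint64_t base, uint64_t size)
  {
          return base + size <= rsv_start || base >= rsv_end;
  }

  static uint64_t find_chunk(uint64_t base, uint64_t boundary, uint64_t size)
  {
          while (base <= boundary - size) {
                  if (region_free(base, size))
                          return base;
                  base += size;
          }
          return (uint64_t)-1;    /* no suitable window found */
  }

  int main(void)
  {
          /* base = boot memory size, boundary = memory limit/end of DRAM */
          uint64_t base = find_chunk(0x10000000, 0x80000000, 0x20000000);

          printf("reserve dump area at %#llx\n", (unsigned long long)base);
          return 0;
  }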

From 742a265accd3e3afcc8e7b17f409c93c1de8be85 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:20:57 +0530
Subject: [PATCH 235/264] powerpc/fadump: register kernel metadata address with
 opal

OPAL allows the first kernel to register an address with it and to
retrieve that address after MPIPL. Set up kernel metadata and register
its address with OPAL so it can be used to process the crash dump.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821345011.5656.13567765019032928471.stgit@hbathini.in.ibm.com
---
 arch/powerpc/include/asm/fadump-internal.h   |  4 +
 arch/powerpc/kernel/fadump.c                 | 23 +++++-
 arch/powerpc/platforms/powernv/opal-fadump.c | 84 +++++++++++++++++++-
 arch/powerpc/platforms/powernv/opal-fadump.h | 39 +++++++++
 4 files changed, 146 insertions(+), 4 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.h

diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h
index 8046fe0b742e..5262c764c5fc 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -97,6 +97,8 @@ struct fw_dump {
 	unsigned long	cpu_notes_buf_vaddr;
 	unsigned long	cpu_notes_buf_size;
 
+	u64		kernel_metadata;
+
 	int		ibm_configure_kernel_dump;
 
 	unsigned long	fadump_enabled:1;
@@ -110,6 +112,8 @@ struct fw_dump {
 
 struct fadump_ops {
 	u64	(*fadump_init_mem_struct)(struct fw_dump *fadump_conf);
+	u64	(*fadump_get_metadata_size)(void);
+	int	(*fadump_setup_metadata)(struct fw_dump *fadump_conf);
 	int	(*fadump_register)(struct fw_dump *fadump_conf);
 	int	(*fadump_unregister)(struct fw_dump *fadump_conf);
 	int	(*fadump_invalidate)(struct fw_dump *fadump_conf);
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 7d47d4bb7d6e..7e7056382d98 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -313,6 +313,10 @@ static unsigned long get_fadump_area_size(void)
 	size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2);
 
 	size = PAGE_ALIGN(size);
+
+	/* This is to hold kernel metadata on platforms that support it */
+	size += (fw_dump.ops->fadump_get_metadata_size ?
+		 fw_dump.ops->fadump_get_metadata_size() : 0);
 	return size;
 }
 
@@ -348,6 +352,7 @@ int __init fadump_reserve_mem(void)
 		pr_info("Firmware-Assisted Dump is not supported on this hardware\n");
 		goto error_out;
 	}
+
 	/*
 	 * Initialize boot memory size
 	 * If dump is active then we have already calculated the size during
@@ -426,8 +431,21 @@ int __init fadump_reserve_mem(void)
 			base += size;
 		}
 
-		if ((base > (mem_boundary - size)) ||
-		    memblock_reserve(base, size)) {
+		if (base > (mem_boundary - size)) {
+			pr_err("Failed to find memory chunk for reservation!\n");
+			goto error_out;
+		}
+		fw_dump.reserve_dump_area_start = base;
+
+		/*
+		 * Calculate the kernel metadata address and register it with
+		 * f/w if the platform supports.
+		 */
+		if (fw_dump.ops->fadump_setup_metadata &&
+		    (fw_dump.ops->fadump_setup_metadata(&fw_dump) < 0))
+			goto error_out;
+
+		if (memblock_reserve(base, size)) {
 			pr_err("Failed to reserve memory!\n");
 			goto error_out;
 		}
@@ -435,7 +453,6 @@ int __init fadump_reserve_mem(void)
 		pr_info("Reserved %lldMB of memory at %#016llx (System RAM: %lldMB)\n",
 			(size >> 20), base, (memblock_phys_mem_size() >> 20));
 
-		fw_dump.reserve_dump_area_start = base;
 		ret = fadump_cma_init();
 	}
 
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
index bbc5356bdc77..21de832f2ba4 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -11,13 +11,83 @@
 #include <linux/seq_file.h>
 #include <linux/of_fdt.h>
 #include <linux/libfdt.h>
+#include <linux/mm.h>
 
+#include <asm/page.h>
 #include <asm/opal.h>
 #include <asm/fadump-internal.h>
 
+#include "opal-fadump.h"
+
+static struct opal_fadump_mem_struct *opal_fdm;
+
+/* Initialize kernel metadata */
+static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm)
+{
+	fdm->version = OPAL_FADUMP_VERSION;
+	fdm->region_cnt = 0;
+	fdm->registered_regions = 0;
+	fdm->fadumphdr_addr = 0;
+}
+
 static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf)
 {
-	return fadump_conf->reserve_dump_area_start;
+	u64 addr = fadump_conf->reserve_dump_area_start;
+
+	opal_fdm = __va(fadump_conf->kernel_metadata);
+	opal_fadump_init_metadata(opal_fdm);
+
+	opal_fdm->region_cnt = 1;
+	opal_fdm->rgn[0].src	= 0;
+	opal_fdm->rgn[0].dest	= addr;
+	opal_fdm->rgn[0].size	= fadump_conf->boot_memory_size;
+	addr += fadump_conf->boot_memory_size;
+
+	/*
+	 * Kernel metadata is passed to f/w and retrieved in the capture kernel.
+	 * So, use it to save fadump header address instead of calculating it.
+	 */
+	opal_fdm->fadumphdr_addr = (opal_fdm->rgn[0].dest +
+				    fadump_conf->boot_memory_size);
+
+	return addr;
+}
+
+static u64 opal_fadump_get_metadata_size(void)
+{
+	return PAGE_ALIGN(sizeof(struct opal_fadump_mem_struct));
+}
+
+static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf)
+{
+	int err = 0;
+	s64 ret;
+
+	/*
+	 * Use the last page(s) in FADump memory reservation for
+	 * kernel metadata.
+	 */
+	fadump_conf->kernel_metadata = (fadump_conf->reserve_dump_area_start +
+					fadump_conf->reserve_dump_area_size -
+					opal_fadump_get_metadata_size());
+	pr_info("Kernel metadata addr: %llx\n", fadump_conf->kernel_metadata);
+
+	/* Initialize kernel metadata before registering the address with f/w */
+	opal_fdm = __va(fadump_conf->kernel_metadata);
+	opal_fadump_init_metadata(opal_fdm);
+
+	/*
+	 * Register metadata address with f/w. Can be retrieved in
+	 * the capture kernel.
+	 */
+	ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL,
+				      fadump_conf->kernel_metadata);
+	if (ret != OPAL_SUCCESS) {
+		pr_err("Failed to set kernel metadata tag!\n");
+		err = -EPERM;
+	}
+
+	return err;
 }
 
 static int opal_fadump_register(struct fw_dump *fadump_conf)
@@ -43,6 +113,16 @@ static int __init opal_fadump_process(struct fw_dump *fadump_conf)
 static void opal_fadump_region_show(struct fw_dump *fadump_conf,
 				    struct seq_file *m)
 {
+	const struct opal_fadump_mem_struct *fdm_ptr = opal_fdm;
+	u64 dumped_bytes = 0;
+	int i;
+
+	for (i = 0; i < fdm_ptr->region_cnt; i++) {
+		seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ",
+			   fdm_ptr->rgn[i].src, fdm_ptr->rgn[i].dest);
+		seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n",
+			   fdm_ptr->rgn[i].size, dumped_bytes);
+	}
 }
 
 static void opal_fadump_trigger(struct fadump_crash_info_header *fdh,
@@ -60,6 +140,8 @@ static void opal_fadump_trigger(struct fadump_crash_info_header *fdh,
 
 static struct fadump_ops opal_fadump_ops = {
 	.fadump_init_mem_struct		= opal_fadump_init_mem_struct,
+	.fadump_get_metadata_size	= opal_fadump_get_metadata_size,
+	.fadump_setup_metadata		= opal_fadump_setup_metadata,
 	.fadump_register		= opal_fadump_register,
 	.fadump_unregister		= opal_fadump_unregister,
 	.fadump_invalidate		= opal_fadump_invalidate,
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h
new file mode 100644
index 000000000000..0b83d895485c
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-fadump.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Firmware-Assisted Dump support on POWER platform (OPAL).
+ *
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#ifndef _POWERNV_OPAL_FADUMP_H
+#define _POWERNV_OPAL_FADUMP_H
+
+/*
+ * OPAL FADump metadata structure format version
+ *
+ * OPAL FADump kernel metadata structure stores kernel metadata needed to
+ * register-for/process crash dump. Format version is used to keep a tab on
+ * the changes in the structure format. The changes, if any, to the format
+ * are expected to be minimal and backward compatible.
+ */
+#define OPAL_FADUMP_VERSION			0x1
+
+/* Maximum number of memory regions kernel supports */
+#define OPAL_FADUMP_MAX_MEM_REGS		128
+
+/*
+ * OPAL FADump kernel metadata
+ *
+ * The address of this structure will be registered with f/w for retrieving
+ * and processing during crash dump.
+ */
+struct opal_fadump_mem_struct {
+	u8	version;
+	u8	reserved[3];
+	u16	region_cnt;		/* number of regions */
+	u16	registered_regions;	/* Regions registered for MPIPL */
+	u64	fadumphdr_addr;
+	struct opal_mpipl_region	rgn[OPAL_FADUMP_MAX_MEM_REGS];
+} __packed;
+
+#endif /* _POWERNV_OPAL_FADUMP_H */
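
The placement arithmetic in opal_fadump_setup_metadata() is simple enough
to check standalone: the metadata occupies the last page-aligned chunk of
the FADump reservation. A sketch assuming 64K pages and a made-up structure
size (the real one is sizeof(struct opal_fadump_mem_struct))::

  #include <stdio.h>
  #include <stdint.h>

  #define PAGE_SIZE 0x10000ULL    /* 64K pages, typical for ppc64 */
  #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

  int main(void)
  {
          uint64_t area_start = 0x30000000, area_size = 0x10000000;
          uint64_t struct_size = 3112;    /* assumed for the demo */
          uint64_t md_size = PAGE_ALIGN(struct_size);
          uint64_t md_addr = area_start + area_size - md_size;

          printf("metadata at %#llx (%llu bytes, page-aligned)\n",
                 (unsigned long long)md_addr, (unsigned long long)md_size);
          return 0;
  }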

From 2790d01d1e1d22735d848eec55668f7d44417e22 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:21:16 +0530
Subject: [PATCH 236/264] powerpc/fadump: reset metadata address during clean
 up

During kexec boot, the metadata address needs to be reset to avoid
errors from interpreting a stale metadata address, in case the
kexec'ed kernel crashes before the metadata address can be set up
again.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821346629.5656.10783321582005237813.stgit@hbathini.in.ibm.com
---
 arch/powerpc/include/asm/fadump-internal.h   |  1 +
 arch/powerpc/kernel/fadump.c                 |  6 ++++++
 arch/powerpc/platforms/powernv/opal-fadump.c | 10 ++++++++++
 3 files changed, 17 insertions(+)

diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h
index 5262c764c5fc..0c2c4f58d365 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -117,6 +117,7 @@ struct fadump_ops {
 	int	(*fadump_register)(struct fw_dump *fadump_conf);
 	int	(*fadump_unregister)(struct fw_dump *fadump_conf);
 	int	(*fadump_invalidate)(struct fw_dump *fadump_conf);
+	void	(*fadump_cleanup)(struct fw_dump *fadump_conf);
 	int	(*fadump_process)(struct fw_dump *fadump_conf);
 	void	(*fadump_region_show)(struct fw_dump *fadump_conf,
 				      struct seq_file *m);
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 7e7056382d98..aab9b4db0363 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -965,6 +965,9 @@ static int register_fadump(void)
 
 void fadump_cleanup(void)
 {
+	if (!fw_dump.fadump_supported)
+		return;
+
 	/* Invalidate the registration only if dump is active. */
 	if (fw_dump.dump_active) {
 		pr_debug("Invalidating firmware-assisted dump registration\n");
@@ -974,6 +977,9 @@ void fadump_cleanup(void)
 		fw_dump.ops->fadump_unregister(&fw_dump);
 		free_crash_memory_ranges();
 	}
+
+	if (fw_dump.ops->fadump_cleanup)
+		fw_dump.ops->fadump_cleanup(&fw_dump);
 }
 
 static void fadump_free_reserved_memory(unsigned long start_pfn,
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
index 21de832f2ba4..871a129c73d3 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -105,6 +105,15 @@ static int opal_fadump_invalidate(struct fw_dump *fadump_conf)
 	return -EIO;
 }
 
+static void opal_fadump_cleanup(struct fw_dump *fadump_conf)
+{
+	s64 ret;
+
+	ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, 0);
+	if (ret != OPAL_SUCCESS)
+		pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret);
+}
+
 static int __init opal_fadump_process(struct fw_dump *fadump_conf)
 {
 	return -EINVAL;
@@ -145,6 +154,7 @@ static struct fadump_ops opal_fadump_ops = {
 	.fadump_register		= opal_fadump_register,
 	.fadump_unregister		= opal_fadump_unregister,
 	.fadump_invalidate		= opal_fadump_invalidate,
+	.fadump_cleanup			= opal_fadump_cleanup,
 	.fadump_process			= opal_fadump_process,
 	.fadump_region_show		= opal_fadump_region_show,
 	.fadump_trigger			= opal_fadump_trigger,
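
The reset itself is just a tag registration with address 0. A toy model of
the firmware-side bookkeeping this relies on, where fw_tag_kernel is a
hypothetical stand-in for OPAL's tag store::

  #include <stdio.h>
  #include <stdint.h>

  static uint64_t fw_tag_kernel;  /* pretend OPAL-side tag store */

  static int mpipl_register_tag(uint64_t addr)
  {
          fw_tag_kernel = addr;
          return 0;
  }

  int main(void)
  {
          /* First kernel publishes the metadata address... */
          mpipl_register_tag(0x3fff0000);
          printf("tag = %#llx\n", (unsigned long long)fw_tag_kernel);

          /* ...and clears it on cleanup (e.g. before kexec), so a
           * capture kernel never chases a stale address. */
          mpipl_register_tag(0);
          printf("tag = %#llx\n", (unsigned long long)fw_tag_kernel);
          return 0;
  }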

From a20a8fa42def548f46c7e0401a94f62b8e595883 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:21:33 +0530
Subject: [PATCH 237/264] powerpc/fadump: define OPAL register/un-register
 callback functions

Make OPAL calls to register and un-register with firmware for MPIPL.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821348482.5656.13646250851483648241.stgit@hbathini.in.ibm.com
---
 arch/powerpc/platforms/powernv/opal-fadump.c | 79 +++++++++++++++++++-
 1 file changed, 77 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
index 871a129c73d3..b53007e82190 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -21,6 +21,22 @@
 
 static struct opal_fadump_mem_struct *opal_fdm;
 
+static int opal_fadump_unregister(struct fw_dump *fadump_conf);
+
+static void opal_fadump_update_config(struct fw_dump *fadump_conf,
+				      const struct opal_fadump_mem_struct *fdm)
+{
+	/*
+	 * The destination address of the first boot memory region is the
+	 * destination address of boot memory regions.
+	 */
+	fadump_conf->boot_mem_dest_addr = fdm->rgn[0].dest;
+	pr_debug("Destination address of boot memory regions: %#016llx\n",
+		 fadump_conf->boot_mem_dest_addr);
+
+	fadump_conf->fadumphdr_addr = fdm->fadumphdr_addr;
+}
+
 /* Initialize kernel metadata */
 static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm)
 {
@@ -50,6 +66,8 @@ static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf)
 	opal_fdm->fadumphdr_addr = (opal_fdm->rgn[0].dest +
 				    fadump_conf->boot_memory_size);
 
+	opal_fadump_update_config(fadump_conf, opal_fdm);
+
 	return addr;
 }
 
@@ -92,12 +110,69 @@ static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf)
 
 static int opal_fadump_register(struct fw_dump *fadump_conf)
 {
-	return -EIO;
+	s64 rc = OPAL_PARAMETER;
+	int i, err = -EIO;
+
+	for (i = 0; i < opal_fdm->region_cnt; i++) {
+		rc = opal_mpipl_update(OPAL_MPIPL_ADD_RANGE,
+				       opal_fdm->rgn[i].src,
+				       opal_fdm->rgn[i].dest,
+				       opal_fdm->rgn[i].size);
+		if (rc != OPAL_SUCCESS)
+			break;
+
+		opal_fdm->registered_regions++;
+	}
+
+	switch (rc) {
+	case OPAL_SUCCESS:
+		pr_info("Registration is successful!\n");
+		fadump_conf->dump_registered = 1;
+		err = 0;
+		break;
+	case OPAL_RESOURCE:
+		/* If MAX regions limit in f/w is hit, warn and proceed. */
+		pr_warn("%d regions could not be registered for MPIPL as MAX limit is reached!\n",
+			(opal_fdm->region_cnt - opal_fdm->registered_regions));
+		fadump_conf->dump_registered = 1;
+		err = 0;
+		break;
+	case OPAL_PARAMETER:
+		pr_err("Failed to register. Parameter Error(%lld).\n", rc);
+		break;
+	case OPAL_HARDWARE:
+		pr_err("Support not available.\n");
+		fadump_conf->fadump_supported = 0;
+		fadump_conf->fadump_enabled = 0;
+		break;
+	default:
+		pr_err("Failed to register. Unknown Error(%lld).\n", rc);
+		break;
+	}
+
+	/*
+	 * If some regions were registered before OPAL_MPIPL_ADD_RANGE
+	 * OPAL call failed, unregister all regions.
+	 */
+	if ((err < 0) && (opal_fdm->registered_regions > 0))
+		opal_fadump_unregister(fadump_conf);
+
+	return err;
 }
 
 static int opal_fadump_unregister(struct fw_dump *fadump_conf)
 {
-	return -EIO;
+	s64 rc;
+
+	rc = opal_mpipl_update(OPAL_MPIPL_REMOVE_ALL, 0, 0, 0);
+	if (rc) {
+		pr_err("Failed to un-register - unexpected Error(%lld).\n", rc);
+		return -EIO;
+	}
+
+	opal_fdm->registered_regions = 0;
+	fadump_conf->dump_registered = 0;
+	return 0;
 }
 
 static int opal_fadump_invalidate(struct fw_dump *fadump_conf)
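
The shape of opal_fadump_register() above is the classic all-or-nothing
registration loop. A simplified sketch: fw_add_range() is a hypothetical
stand-in for opal_mpipl_update(OPAL_MPIPL_ADD_RANGE, ...), and unlike the
real code (which treats the out-of-slots OPAL_RESOURCE case as a partial
success) this version rolls back on any error::

  #include <stdio.h>

  #define NR_REGIONS 4

  /* Pretend firmware runs out of slots after two ranges. */
  static int fw_add_range(int i)
  {
          return i < 2 ? 0 : -1;
  }

  static int registered;

  static void unregister_all(void)
  {
          registered = 0;
          puts("rolled back: all ranges removed");
  }

  static int register_regions(void)
  {
          int i, rc = 0;

          for (i = 0; i < NR_REGIONS; i++) {
                  rc = fw_add_range(i);
                  if (rc)
                          break;
                  registered++;
          }
          /* Never leave the state half-registered. */
          if (rc && registered > 0)
                  unregister_all();
          return rc;
  }

  int main(void)
  {
          register_regions();
          return 0;
  }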

From 51bba8edef90cf579dba16de912d3ef809fe1d77 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:21:46 +0530
Subject: [PATCH 238/264] powerpc/fadump: support copying multiple kernel boot
 memory regions

Firmware uses a 32-bit field for the size while copying/backing-up
memory during MPIPL. So, the maximum copy size for a region is the
largest PAGE_SIZE-aligned value that fits in a 32-bit field. But the
FADump capture kernel usually needs more memory than that to be
preserved in order to avoid running into out-of-memory errors.

So, request firmware to copy multiple kernel boot memory regions
instead of just one (which worked fine for pseries, where a 64-bit
field was used for the size).

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821350193.5656.3664853158523582019.stgit@hbathini.in.ibm.com
---
 arch/powerpc/platforms/powernv/opal-fadump.c | 34 ++++++++++++++++----
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
index b53007e82190..a4b96a7a5d6c 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -26,6 +26,8 @@ static int opal_fadump_unregister(struct fw_dump *fadump_conf);
 static void opal_fadump_update_config(struct fw_dump *fadump_conf,
 				      const struct opal_fadump_mem_struct *fdm)
 {
+	pr_debug("Boot memory regions count: %d\n", fdm->region_cnt);
+
 	/*
 	 * The destination address of the first boot memory region is the
 	 * destination address of boot memory regions.
@@ -48,16 +50,34 @@ static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm)
 
 static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf)
 {
-	u64 addr = fadump_conf->reserve_dump_area_start;
+	u64 max_copy_size, cur_size, size;
+	u64 src_addr, dest_addr;
 
 	opal_fdm = __va(fadump_conf->kernel_metadata);
 	opal_fadump_init_metadata(opal_fdm);
 
-	opal_fdm->region_cnt = 1;
-	opal_fdm->rgn[0].src	= 0;
-	opal_fdm->rgn[0].dest	= addr;
-	opal_fdm->rgn[0].size	= fadump_conf->boot_memory_size;
-	addr += fadump_conf->boot_memory_size;
+	/*
+	 * Firmware supports 32-bit field for size. Align it to PAGE_SIZE
+	 * and request firmware to copy multiple kernel boot memory regions.
+	 */
+	max_copy_size = _ALIGN_DOWN(U32_MAX, PAGE_SIZE);
+
+	/* Boot memory regions */
+	src_addr = 0;
+	dest_addr = fadump_conf->reserve_dump_area_start;
+	size = fadump_conf->boot_memory_size;
+	while (size) {
+		cur_size = size > max_copy_size ? max_copy_size : size;
+
+		opal_fdm->rgn[opal_fdm->region_cnt].src  = src_addr;
+		opal_fdm->rgn[opal_fdm->region_cnt].dest = dest_addr;
+		opal_fdm->rgn[opal_fdm->region_cnt].size = cur_size;
+
+		opal_fdm->region_cnt++;
+		dest_addr	+= cur_size;
+		src_addr	+= cur_size;
+		size		-= cur_size;
+	}
 
 	/*
 	 * Kernel metadata is passed to f/w and retrieved in the capture kernel.
@@ -68,7 +88,7 @@ static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf)
 
 	opal_fadump_update_config(fadump_conf, opal_fdm);
 
-	return addr;
+	return dest_addr;
 }
 
 static u64 opal_fadump_get_metadata_size(void)
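
The splitting loop above is easy to sanity-check on the side. A sketch
assuming 64K pages and 6GB of boot memory to preserve: each chunk is capped
at the largest page-aligned value that fits the 32-bit size field::

  #include <stdio.h>
  #include <stdint.h>

  #define PAGE_SIZE 0x10000ULL
  #define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

  struct region { uint64_t src, dest, size; };

  int main(void)
  {
          uint64_t max_copy = ALIGN_DOWN(0xffffffffULL, PAGE_SIZE);
          uint64_t src = 0, dest = 0x300000000ULL;
          uint64_t left = 0x180000000ULL;  /* 6GB to preserve */
          struct region rgn[8];
          int cnt = 0;

          while (left) {
                  uint64_t cur = left > max_copy ? max_copy : left;

                  rgn[cnt].src = src;
                  rgn[cnt].dest = dest;
                  rgn[cnt].size = cur;
                  cnt++;
                  src += cur;
                  dest += cur;
                  left -= cur;
          }

          for (int i = 0; i < cnt; i++)
                  printf("rgn[%d]: src=%#llx dest=%#llx size=%#llx\n", i,
                         (unsigned long long)rgn[i].src,
                         (unsigned long long)rgn[i].dest,
                         (unsigned long long)rgn[i].size);
          return 0;
  }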

From 2a1b06dd3a17ac278494da3c15cac84684346d22 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:21:59 +0530
Subject: [PATCH 239/264] powerpc/fadump: process the crashdump by exporting it
 as /proc/vmcore

Add support in the kernel to process the crashed kernel's memory
preserved during MPIPL and export it as a /proc/vmcore file for
userland scripts to filter and analyze later.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821351482.5656.6255805804744333073.stgit@hbathini.in.ibm.com
---
 arch/powerpc/platforms/powernv/opal-fadump.c | 147 ++++++++++++++++++-
 1 file changed, 145 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
index a4b96a7a5d6c..5ace7c4d6bc1 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -12,6 +12,7 @@
 #include <linux/of_fdt.h>
 #include <linux/libfdt.h>
 #include <linux/mm.h>
+#include <linux/crash_dump.h>
 
 #include <asm/page.h>
 #include <asm/opal.h>
@@ -19,6 +20,7 @@
 
 #include "opal-fadump.h"
 
+static const struct opal_fadump_mem_struct *opal_fdm_active;
 static struct opal_fadump_mem_struct *opal_fdm;
 
 static int opal_fadump_unregister(struct fw_dump *fadump_conf);
@@ -39,6 +41,37 @@ static void opal_fadump_update_config(struct fw_dump *fadump_conf,
 	fadump_conf->fadumphdr_addr = fdm->fadumphdr_addr;
 }
 
+/*
+ * This function is called in the capture kernel to get configuration details
+ * from the metadata set up by the first kernel.
+ */
+static void opal_fadump_get_config(struct fw_dump *fadump_conf,
+				   const struct opal_fadump_mem_struct *fdm)
+{
+	int i;
+
+	if (!fadump_conf->dump_active)
+		return;
+
+	fadump_conf->boot_memory_size = 0;
+
+	pr_debug("Boot memory regions:\n");
+	for (i = 0; i < fdm->region_cnt; i++) {
+		pr_debug("\t%d. base: 0x%llx, size: 0x%llx\n",
+			 (i + 1), fdm->rgn[i].src, fdm->rgn[i].size);
+
+		fadump_conf->boot_memory_size += fdm->rgn[i].size;
+	}
+
+	/*
+	 * Start address of reserve dump area (permanent reservation) for
+	 * re-registering FADump after dump capture.
+	 */
+	fadump_conf->reserve_dump_area_start = fdm->rgn[0].dest;
+
+	opal_fadump_update_config(fadump_conf, fdm);
+}
+
 /* Initialize kernel metadata */
 static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm)
 {
@@ -209,24 +242,97 @@ static void opal_fadump_cleanup(struct fw_dump *fadump_conf)
 		pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret);
 }
 
+/*
+ * Convert CPU state data saved at the time of crash into ELF notes.
+ *
+ * Append crashing CPU's register data saved by the kernel in the PT_NOTE.
+ */
+static int __init
+opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf,
+			    struct fadump_crash_info_header *fdh)
+{
+	u32 num_cpus = 1, *note_buf;
+	int rc;
+
+	if (fdh->crashing_cpu == FADUMP_CPU_UNKNOWN)
+		return -ENODEV;
+
+	/* Allocate CPU notes buffer to hold crashing cpu notes. */
+	rc = fadump_setup_cpu_notes_buf(num_cpus);
+	if (rc != 0)
+		return rc;
+
+	note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr;
+	note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs));
+	final_note(note_buf);
+
+	pr_debug("Updating elfcore header (%llx) with cpu notes\n",
+		 fdh->elfcorehdr_addr);
+	fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr));
+	return 0;
+}
+
 static int __init opal_fadump_process(struct fw_dump *fadump_conf)
 {
-	return -EINVAL;
+	struct fadump_crash_info_header *fdh;
+	int rc = -EINVAL;
+
+	if (!opal_fdm_active || !fadump_conf->fadumphdr_addr)
+		return rc;
+
+	/* Validate the fadump crash info header */
+	fdh = __va(fadump_conf->fadumphdr_addr);
+	if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
+		pr_err("Crash info header is not valid.\n");
+		return rc;
+	}
+
+	rc = opal_fadump_build_cpu_notes(fadump_conf, fdh);
+	if (rc)
+		return rc;
+
+	/*
+	 * We are done validating dump info and the elfcore header is now
+	 * ready to be exported. Set elfcorehdr_addr so that the vmcore
+	 * module will export the elfcore header through '/proc/vmcore'.
+	 */
+	elfcorehdr_addr = fdh->elfcorehdr_addr;
+
+	return rc;
 }
 
 static void opal_fadump_region_show(struct fw_dump *fadump_conf,
 				    struct seq_file *m)
 {
-	const struct opal_fadump_mem_struct *fdm_ptr = opal_fdm;
+	const struct opal_fadump_mem_struct *fdm_ptr;
 	u64 dumped_bytes = 0;
 	int i;
 
+	if (fadump_conf->dump_active)
+		fdm_ptr = opal_fdm_active;
+	else
+		fdm_ptr = opal_fdm;
+
 	for (i = 0; i < fdm_ptr->region_cnt; i++) {
+		/*
+		 * Only regions that are registered for MPIPL
+		 * would have dump data.
+		 */
+		if ((fadump_conf->dump_active) &&
+		    (i < fdm_ptr->registered_regions))
+			dumped_bytes = fdm_ptr->rgn[i].size;
+
 		seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ",
 			   fdm_ptr->rgn[i].src, fdm_ptr->rgn[i].dest);
 		seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n",
 			   fdm_ptr->rgn[i].size, dumped_bytes);
 	}
+
+	/* Dump is active. Show reserved area start address. */
+	if (fadump_conf->dump_active) {
+		seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n",
+			   fadump_conf->reserve_dump_area_start);
+	}
 }
 
 static void opal_fadump_trigger(struct fadump_crash_info_header *fdh,
@@ -257,7 +363,11 @@ static struct fadump_ops opal_fadump_ops = {
 
 void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
 {
+	const __be32 *prop;
 	unsigned long dn;
+	u64 addr = 0;
+	s64 ret;
 
 	/*
 	 * Check if Firmware-Assisted Dump is supported. If yes, check
@@ -276,4 +386,37 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
 
 	fadump_conf->ops		= &opal_fadump_ops;
 	fadump_conf->fadump_supported	= 1;
+
+	/*
+	 * Check if dump has been initiated on last reboot.
+	 */
+	prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL);
+	if (!prop)
+		return;
+
+	ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr);
+	if ((ret != OPAL_SUCCESS) || !addr) {
+		pr_err("Failed to get Kernel metadata (%lld)\n", ret);
+		return;
+	}
+
+	addr = be64_to_cpu(addr);
+	pr_debug("Kernel metadata addr: %llx\n", addr);
+
+	opal_fdm_active = __va(addr);
+	if (opal_fdm_active->version != OPAL_FADUMP_VERSION) {
+		pr_warn("Supported kernel metadata version: %u, found: %d!\n",
+			OPAL_FADUMP_VERSION, opal_fdm_active->version);
+		pr_warn("WARNING: Kernel metadata format mismatch identified! Core file maybe corrupted..\n");
+	}
+
+	/* Kernel regions not registered with f/w for MPIPL */
+	if (opal_fdm_active->registered_regions == 0) {
+		opal_fdm_active = NULL;
+		return;
+	}
+
+	pr_info("Firmware-assisted dump is active.\n");
+	fadump_conf->dump_active = 1;
+	opal_fadump_get_config(fadump_conf, opal_fdm_active);
 }
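
The CPU notes buffer built by fadump_regs_to_elf_notes()/final_note() is a
flat sequence of ELF notes terminated by an empty one. A simplified
host-side sketch of that layout (the name, sizes and pt_regs stand-in are
illustrative, not the kernel's exact encoding)::

  #include <stdio.h>
  #include <stdint.h>
  #include <string.h>

  struct elf_note {               /* Elf64_Nhdr layout */
          uint32_t n_namesz;
          uint32_t n_descsz;
          uint32_t n_type;
  };

  #define NT_PRSTATUS 1
  #define ALIGN4(x) (((x) + 3u) & ~3u)

  static uint32_t *append_note(uint32_t *buf, const char *name,
                               uint32_t type, const void *desc,
                               uint32_t descsz)
  {
          struct elf_note *note = (struct elf_note *)buf;
          char *p = (char *)(note + 1);

          note->n_namesz = name[0] ? (uint32_t)strlen(name) + 1 : 0;
          note->n_descsz = descsz;
          note->n_type = type;
          memcpy(p, name, note->n_namesz);
          p += ALIGN4(note->n_namesz);
          if (descsz)
                  memcpy(p, desc, descsz);
          p += ALIGN4(descsz);
          return (uint32_t *)p;
  }

  int main(void)
  {
          uint32_t buf[256] = { 0 };
          uint64_t fake_regs[34] = { 0xdeadbeef };  /* pt_regs stand-in */
          uint32_t *p = buf;

          p = append_note(p, "CORE", NT_PRSTATUS,
                          fake_regs, sizeof(fake_regs));
          p = append_note(p, "", 0, NULL, 0);  /* terminating empty note */
          printf("notes occupy %zu bytes\n",
                 (size_t)((char *)p - (char *)buf));
          return 0;
  }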

From 6071e8f9d5ac960f0d35495f070d4d0b7ae5fc76 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:22:32 +0530
Subject: [PATCH 240/264] powerpc/fadump: Warn before processing partial
 crashdump

If not all kernel boot memory regions are registered for MPIPL before
the system crashes, try processing the partial crashdump but warn the
user before proceeding.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821352793.5656.1734051341024721407.stgit@hbathini.in.ibm.com
---
 arch/powerpc/platforms/powernv/opal-fadump.c | 24 ++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
index 5ace7c4d6bc1..533adae545d8 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -69,6 +69,30 @@ static void opal_fadump_get_config(struct fw_dump *fadump_conf,
 	 */
 	fadump_conf->reserve_dump_area_start = fdm->rgn[0].dest;
 
+	/*
+	 * Rarely, the system can crash before all boot memory regions
+	 * are registered for MPIPL. In such cases, warn that the vmcore
+	 * may not be accurate and proceed anyway, as that is the best
+	 * bet considering free pages, cache pages, user pages, etc. are
+	 * usually filtered out.
+	 *
+	 * Hopefully, the memory that could not be preserved only has
+	 * pages that are usually filtered out while saving the vmcore.
+	 */
+	if (fdm->region_cnt > fdm->registered_regions) {
+		pr_warn("Not all memory regions were saved!!!\n");
+		pr_warn("  Unsaved memory regions:\n");
+		i = fdm->registered_regions;
+		while (i < fdm->region_cnt) {
+			pr_warn("\t[%03d] base: 0x%llx, size: 0x%llx\n",
+				i, fdm->rgn[i].src, fdm->rgn[i].size);
+			i++;
+		}
+
+		pr_warn("If the unsaved regions only contain pages that are filtered out (eg. free/user pages), the vmcore should still be usable.\n");
+		pr_warn("WARNING: If the unsaved regions contain kernel pages, the vmcore will be corrupted.\n");
+	}
+
 	opal_fadump_update_config(fadump_conf, fdm);
 }
 

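The partial-dump check above is plain index arithmetic: regions indexed at or
above registered_regions were never handed to firmware, so their contents were
not preserved. A small standalone model of that loop, with made-up region
values:

  #include <stdio.h>
  #include <stdint.h>

  struct region { uint64_t src, dest, size; };

  int main(void)
  {
      /* Made-up dump regions: two configured, only one registered. */
      struct region rgn[] = {
          { 0x00000000, 0x10000000, 0x10000000 },
          { 0x10000000, 0x20000000, 0x10000000 },
      };
      int region_cnt = 2, registered_regions = 1;

      /* Everything from registered_regions onwards was not preserved. */
      for (int i = registered_regions; i < region_cnt; i++)
          printf("unsaved [%03d] base: 0x%llx, size: 0x%llx\n", i,
                 (unsigned long long)rgn[i].src,
                 (unsigned long long)rgn[i].size);
      return 0;
  }
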
From a4e2e2ca2f7bddf6d5d788033cc56f40af6e9c5a Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:23:28 +0530
Subject: [PATCH 241/264] powerpc/fadump: handle invalidation of crashdump and
 re-registration

Make an OPAL call to indicate that the dump has been processed and the
metadata area in OPAL can be cleared/released. Also, set up/initialize
FADump for re-registration.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821356046.5656.12270927048195494911.stgit@hbathini.in.ibm.com
---
 arch/powerpc/kernel/fadump.c                 |  8 +++++++-
 arch/powerpc/platforms/powernv/opal-fadump.c | 12 +++++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index aab9b4db0363..852ac4761e90 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -1063,7 +1063,13 @@ static void fadump_invalidate_release_mem(void)
 	fadump_release_memory(fw_dump.boot_memory_size, memblock_end_of_DRAM());
 	fadump_free_cpu_notes_buf();
 
-	/* Initialize the kernel dump memory structure for FAD registration. */
+	/*
+	 * Setup kernel metadata and initialize the kernel dump
+	 * memory structure for FADump re-registration.
+	 */
+	if (fw_dump.ops->fadump_setup_metadata &&
+	    (fw_dump.ops->fadump_setup_metadata(&fw_dump) < 0))
+		pr_warn("Failed to setup kernel metadata!\n");
 	fw_dump.ops->fadump_init_mem_struct(&fw_dump);
 }
 
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
index 533adae545d8..ef6fb1fc66cb 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -254,7 +254,17 @@ static int opal_fadump_unregister(struct fw_dump *fadump_conf)
 
 static int opal_fadump_invalidate(struct fw_dump *fadump_conf)
 {
-	return -EIO;
+	s64 rc;
+
+	rc = opal_mpipl_update(OPAL_MPIPL_FREE_PRESERVED_MEMORY, 0, 0, 0);
+	if (rc) {
+		pr_err("Failed to invalidate - unexpected Error(%lld).\n", rc);
+		return -EIO;
+	}
+
+	fadump_conf->dump_active = 0;
+	opal_fdm_active = NULL;
+	return 0;
 }
 
 static void opal_fadump_cleanup(struct fw_dump *fadump_conf)

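A sketch of the invalidation contract introduced above: on a successful
firmware update call, the stale dump state is cleared so a fresh registration
can follow. The OPAL call is stubbed out here; only the state handling mirrors
the patch.

  #include <stdio.h>

  #define EIO 5

  /* Stubbed opal_mpipl_update(FREE_PRESERVED_MEMORY, ...): 0 == success. */
  static long opal_mpipl_update_stub(void)
  {
      return 0;
  }

  struct dump_state {
      int dump_active;
      const void *fdm_active;
  };

  static int invalidate(struct dump_state *s)
  {
      if (opal_mpipl_update_stub() != 0)
          return -EIO;          /* leave state untouched on failure */

      s->dump_active = 0;       /* old dump is gone ...             */
      s->fdm_active = NULL;     /* ... so drop its metadata too     */
      return 0;                 /* caller may now re-register       */
  }

  int main(void)
  {
      struct dump_state s = { 1, &s };

      printf("invalidate: %d, active: %d\n", invalidate(&s), s.dump_active);
      return 0;
  }
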
From fbcafdaea2e234d3c6d79e7f5605a2e8373f6678 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:23:53 +0530
Subject: [PATCH 242/264] powerpc/fadump: Update documentation about OPAL
 platform support

With FADump support now available on both pseries and OPAL platforms,
update FADump documentation with these details.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821361692.5656.11377757995827253404.stgit@hbathini.in.ibm.com
---
 .../powerpc/firmware-assisted-dump.rst        | 122 +++++++++++-------
 1 file changed, 72 insertions(+), 50 deletions(-)

diff --git a/Documentation/powerpc/firmware-assisted-dump.rst b/Documentation/powerpc/firmware-assisted-dump.rst
index d912755067d2..ab578ac64c09 100644
--- a/Documentation/powerpc/firmware-assisted-dump.rst
+++ b/Documentation/powerpc/firmware-assisted-dump.rst
@@ -46,10 +46,9 @@ as follows:
    These registered sections of memory are reserved by the first
    kernel during early boot.
 
--  When a system crashes, the Power firmware will save
-   the low memory (boot memory of size larger of 5% of system RAM
-   or 256MB) of RAM to the previous registered region. It will
-   also save system registers, and hardware PTE's.
+-  When the system crashes, the Power firmware will copy the registered
+   low memory regions (boot memory) from the source to the destination
+   area. It will also save hardware PTEs.
 
    NOTE:
          The term 'boot memory' means size of the low memory chunk
@@ -61,9 +60,9 @@ as follows:
          the default calculated size. Use this option if default
          boot memory size is not sufficient for second kernel to
          boot successfully. For syntax of crashkernel= parameter,
-         refer to Documentation/admin-guide/kdump/kdump.rst. If any offset is
-         provided in crashkernel= parameter, it will be ignored
-         as FADump uses a predefined offset to reserve memory
+         refer to Documentation/admin-guide/kdump/kdump.rst. If any
+         offset is provided in crashkernel= parameter, it will be
+         ignored as FADump uses a predefined offset to reserve memory
          for boot memory dump preservation in case of a crash.
 
 -  After the low memory (boot memory) area has been saved, the
@@ -71,8 +70,9 @@ as follows:
    *not* clear the RAM. It will then launch the bootloader, as
    normal.
 
--  The freshly booted kernel will notice that there is a new
-   node (ibm,dump-kernel) in the device tree, indicating that
+-  The freshly booted kernel will notice that there is a new node
+   (rtas/ibm,kernel-dump on pSeries or ibm,opal/dump/mpipl-boot
+   on OPAL platform) in the device tree, indicating that
    there is crash data available from a previous boot. During
    the early boot OS will reserve rest of the memory above
    boot memory size effectively booting with restricted memory
@@ -95,8 +95,11 @@ as follows:
      # echo 1 > /sys/kernel/fadump_release_mem
 
 Please note that the firmware-assisted dump feature
-is only available on Power6 and above systems with recent
-firmware versions.
+is only available on POWER6 and above systems on the pSeries
+(PowerVM) platform, and on POWER9 and above systems with OP940
+or later firmware versions on the PowerNV (OPAL) platform.
+Note that OPAL firmware exports the ibm,opal/dump node when
+FADump is supported on the PowerNV platform.
 
 Implementation details:
 -----------------------
@@ -111,57 +114,76 @@ that are run. If there is dump data, then the
 /sys/kernel/fadump_release_mem file is created, and the reserved
 memory is held.
 
-If there is no waiting dump data, then only the memory required
-to hold CPU state, HPTE region, boot memory dump and elfcore
-header, is usually reserved at an offset greater than boot memory
-size (see Fig. 1). This area is *not* released: this region will
-be kept permanently reserved, so that it can act as a receptacle
-for a copy of the boot memory content in addition to CPU state
-and HPTE region, in the case a crash does occur. Since this reserved
-memory area is used only after the system crash, there is no point in
-blocking this significant chunk of memory from production kernel.
-Hence, the implementation uses the Linux kernel's Contiguous Memory
-Allocator (CMA) for memory reservation if CMA is configured for kernel.
-With CMA reservation this memory will be available for applications to
-use it, while kernel is prevented from using it. With this FADump will
-still be able to capture all of the kernel memory and most of the user
-space memory except the user pages that were present in CMA region::
+If there is no waiting dump data, then only the memory required to
+hold the CPU state, HPTE region, boot memory dump, FADump header and
+elfcore header is usually reserved at an offset greater than the boot
+memory size (see Fig. 1). This area is *not* released: this region
+will be kept permanently reserved, so that it can act as a receptacle
+for a copy of the boot memory content in addition to the CPU state
+and HPTE region, in case a crash does occur.
+
+Since this reserved memory area is used only after the system crash,
+there is no point in blocking this significant chunk of memory from
+the production kernel. Hence, the implementation uses the Linux
+kernel's Contiguous Memory Allocator (CMA) for memory reservation if
+CMA is configured for the kernel. With CMA reservation this memory
+will be available for applications to use, while the kernel is
+prevented from using it. With this, FADump will still be able to
+capture all of the kernel memory and most of the user space memory,
+except the user pages that were present in the CMA region::
 
   o Memory Reservation during first kernel
 
-  Low memory                                                Top of memory
-  0      boot memory size      |<--Reserved dump area --->|      |
-  |           |                | (Permanent Reservation)  |      |
-  V           V                |                          |      V
-  +-----------+----------/ /---+---+----+--------+---+----+------+
-  |           |                |CPU|HPTE|  DUMP  |HDR|ELF |      |
-  +-----------+----------/ /---+---+----+--------+---+----+------+
-        |                                   ^      ^
-        |                                   |      |
-        \                                   /      |
-         -----------------------------------     FADump Header
-          Boot memory content gets transferred   (meta area)
-          to reserved area by firmware at the
-          time of crash
+  Low memory                                                 Top of memory
+  0    boot memory size   |<--- Reserved dump area --->|       |
+  |           |           |    Permanent Reservation   |       |
+  V           V           |                            |       V
+  +-----------+-----/ /---+---+----+-------+-----+-----+----+--+
+  |           |           |///|////|  DUMP | HDR | ELF |////|  |
+  +-----------+-----/ /---+---+----+-------+-----+-----+----+--+
+        |                   ^    ^     ^      ^           ^
+        |                   |    |     |      |           |
+        \                  CPU  HPTE   /      |           |
+         ------------------------------       |           |
+      Boot memory content gets transferred    |           |
+      to reserved area by firmware at the     |           |
+      time of crash.                          |           |
+                                          FADump Header   |
+                                           (meta area)    |
+                                                          |
+                                                          |
+                      Metadata: This area holds a metadata structure whose
+                      address is registered with f/w and retrieved in the
+                      second kernel after crash, on platforms that support
+                      tags (OPAL). Having such a structure with the info
+                      needed to process the crashdump eases dump capture.
 
                    Fig. 1
 
 
   o Memory Reservation during second kernel after crash
 
-  Low memory                                                Top of memory
-  0      boot memory size                                        |
-  |           |<----------- Crash preserved area --------------->|
-  V           V                |<-- Reserved dump area -->|      V
-  +-----------+----------/ /---+---+----+--------+---+----+------+
-  |           |                |CPU|HPTE|  DUMP  |HDR|ELF |      |
-  +-----------+----------/ /---+---+----+--------+---+----+------+
-        |                                              |
-        V                                              V
-   Used by second                                /proc/vmcore
+  Low memory                                              Top of memory
+  0      boot memory size                                      |
+  |           |<------------ Crash preserved area ------------>|
+  V           V           |<--- Reserved dump area --->|       |
+  +-----------+-----/ /---+---+----+-------+-----+-----+----+--+
+  |           |           |///|////|  DUMP | HDR | ELF |////|  |
+  +-----------+-----/ /---+---+----+-------+-----+-----+----+--+
+        |                                           |
+        V                                           V
+   Used by second                             /proc/vmcore
    kernel to boot
+
+        +---+
+        |///| -> Regions (CPU, HPTE & Metadata) marked like this in the above
+        +---+    figures are not always present. For example, OPAL platform
+                 does not have CPU & HPTE regions while Metadata region is
+                 not supported on pSeries currently.
+
                    Fig. 2
 
+
 Currently the dump will be copied from /proc/vmcore to a new file upon
 user intervention. The dump data available through /proc/vmcore will be
 in ELF format. Hence the existing kdump infrastructure (kdump scripts)

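The permanent reservation in Fig. 1 is just the per-component sizes laid out
contiguously above boot memory. A toy computation with made-up sizes (on OPAL,
the CPU-state and HPTE components are absent, as the figures' legend notes):

  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
      /* Illustrative sizes only; real values come from the platform. */
      uint64_t cpu_state = 1ULL << 20;     /* CPU state (pSeries only)   */
      uint64_t hpte      = 16ULL << 20;    /* HPTE region (pSeries only) */
      uint64_t boot_mem  = 1024ULL << 20;  /* boot memory dump           */
      uint64_t hdr_elf   = 1ULL << 20;     /* FADump + elfcore headers   */

      uint64_t reserve = cpu_state + hpte + boot_mem + hdr_elf;

      printf("permanent reservation: %llu MB, placed above boot memory\n",
             (unsigned long long)(reserve >> 20));
      return 0;
  }
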
From 579ca1a27675485a99da50cd7fedc14232f817c3 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:24:28 +0530
Subject: [PATCH 243/264] powerpc/fadump: make use of memblock's bottom up
 allocation mode

Earlier, memblock_find_in_range() could not be used to find the memory
to be reserved for FADump as bottom-up allocation mode was not
supported. But since commit 79442ed189acb8b ("mm/memblock.c: introduce
bottom-up allocation mode"), memblock supports bottom-up allocation.
So, use it to find the memory to be reserved for FADump.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821364211.5656.14336025460336135194.stgit@hbathini.in.ibm.com
---
 arch/powerpc/kernel/fadump.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 852ac4761e90..da751402c649 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -342,7 +342,8 @@ static void __init fadump_reserve_crash_area(unsigned long base,
 
 int __init fadump_reserve_mem(void)
 {
-	u64 base, size, mem_boundary;
+	bool is_memblock_bottom_up = memblock_bottom_up();
+	u64 base, size, mem_boundary, align = PAGE_SIZE;
 	int ret = 1;
 
 	if (!fw_dump.fadump_enabled)
@@ -362,10 +363,11 @@ int __init fadump_reserve_mem(void)
 		fw_dump.boot_memory_size =
 			PAGE_ALIGN(fadump_calculate_reserve_size());
 #ifdef CONFIG_CMA
-		if (!fw_dump.nocma)
+		if (!fw_dump.nocma) {
+			align = FADUMP_CMA_ALIGNMENT;
 			fw_dump.boot_memory_size =
-				ALIGN(fw_dump.boot_memory_size,
-							FADUMP_CMA_ALIGNMENT);
+				ALIGN(fw_dump.boot_memory_size, align);
+		}
 #endif
 	}
 
@@ -419,19 +421,15 @@ int __init fadump_reserve_mem(void)
 	} else {
 		/*
 		 * Reserve memory at an offset closer to bottom of the RAM to
-		 * minimize the impact of memory hot-remove operation. We can't
-		 * use memblock_find_in_range() here since it doesn't allocate
-		 * from bottom to top.
+		 * minimize the impact of memory hot-remove operation.
 		 */
-		while (base <= (mem_boundary - size)) {
-			if (memblock_is_region_memory(base, size) &&
-			    !memblock_is_region_reserved(base, size))
-				break;
+		memblock_set_bottom_up(true);
+		base = memblock_find_in_range(base, mem_boundary, size, align);
 
-			base += size;
-		}
+		/* Restore the previous allocation mode */
+		memblock_set_bottom_up(is_memblock_bottom_up);
 
-		if (base > (mem_boundary - size)) {
+		if (!base) {
 			pr_err("Failed to find memory chunk for reservation!\n");
 			goto error_out;
 		}

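The switch to memblock_find_in_range() relies on bottom-up mode returning the
lowest suitable address. Here is a simplified userspace analogue of that
search over a made-up free list; real memblock also handles reserved regions
and overlaps, which this sketch omits.

  #include <stdio.h>
  #include <stdint.h>

  struct range { uint64_t base, size; };

  /* Free ranges, sorted by base; made-up values for illustration. */
  static struct range free_ranges[] = {
      { 0x01000000, 0x00400000 },
      { 0x10000000, 0x40000000 },
  };

  /* Bottom-up analogue of memblock_find_in_range(): first (lowest) fit. */
  static uint64_t find_bottom_up(uint64_t start, uint64_t end,
                                 uint64_t size, uint64_t align)
  {
      for (unsigned int i = 0; i < sizeof(free_ranges)/sizeof(*free_ranges); i++) {
          uint64_t base = free_ranges[i].base;

          if (base < start)
              base = start;
          base = (base + align - 1) & ~(align - 1);    /* align up */
          if (base + size <= free_ranges[i].base + free_ranges[i].size &&
              base + size <= end)
              return base;    /* lowest suitable address */
      }
      return 0;    /* like memblock, 0 means "not found" */
  }

  int main(void)
  {
      printf("base: 0x%llx\n",
             (unsigned long long)find_bottom_up(0x02000000, 0x80000000,
                                                0x10000000, 1 << 24));
      return 0;
  }
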
From 5000a17afbd56b4e58b7f72ac77eabb92c418412 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.vnet.ibm.com>
Date: Wed, 11 Sep 2019 20:24:50 +0530
Subject: [PATCH 244/264] powerpc/fadump: process architected register state
 data provided by firmware

Firmware provides architected register state data at the time of crash.
Process this data and build CPU notes to append to the ELF core. In
case this data is missing or in an unsupported format, at least append
the crashing CPU's register data, to have something to work with in the
vmcore file.

Signed-off-by: Hari Bathini <hbathini@linux.vnet.ibm.com>
Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821367702.5656.5546683836236508389.stgit@hbathini.in.ibm.com
---
 arch/powerpc/include/asm/fadump-internal.h   |   4 +
 arch/powerpc/platforms/powernv/opal-fadump.c | 207 ++++++++++++++++++-
 arch/powerpc/platforms/powernv/opal-fadump.h |  41 ++++
 3 files changed, 246 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h
index 0c2c4f58d365..2c72685dfff5 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -88,6 +88,10 @@ struct fw_dump {
 	unsigned long	reserve_bootvar;
 
 	unsigned long	cpu_state_data_size;
+	u64		cpu_state_dest_vaddr;
+	u32		cpu_state_data_version;
+	u32		cpu_state_entry_size;
+
 	unsigned long	hpte_region_size;
 
 	unsigned long	boot_memory_size;
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
index ef6fb1fc66cb..ad4c18bd8809 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -21,6 +21,7 @@
 #include "opal-fadump.h"
 
 static const struct opal_fadump_mem_struct *opal_fdm_active;
+static const struct opal_mpipl_fadump *opal_cpu_metadata;
 static struct opal_fadump_mem_struct *opal_fdm;
 
 static int opal_fadump_unregister(struct fw_dump *fadump_conf);
@@ -276,28 +277,207 @@ static void opal_fadump_cleanup(struct fw_dump *fadump_conf)
 		pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret);
 }
 
+static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs,
+						 u32 reg_type, u32 reg_num,
+						 u64 reg_val)
+{
+	if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) {
+		if (reg_num < 32)
+			regs->gpr[reg_num] = reg_val;
+		return;
+	}
+
+	switch (reg_num) {
+	case SPRN_CTR:
+		regs->ctr = reg_val;
+		break;
+	case SPRN_LR:
+		regs->link = reg_val;
+		break;
+	case SPRN_XER:
+		regs->xer = reg_val;
+		break;
+	case SPRN_DAR:
+		regs->dar = reg_val;
+		break;
+	case SPRN_DSISR:
+		regs->dsisr = reg_val;
+		break;
+	case HDAT_FADUMP_REG_ID_NIP:
+		regs->nip = reg_val;
+		break;
+	case HDAT_FADUMP_REG_ID_MSR:
+		regs->msr = reg_val;
+		break;
+	case HDAT_FADUMP_REG_ID_CCR:
+		regs->ccr = reg_val;
+		break;
+	}
+}
+
+static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt,
+					 unsigned int reg_entry_size,
+					 struct pt_regs *regs)
+{
+	struct hdat_fadump_reg_entry *reg_entry;
+	int i;
+
+	memset(regs, 0, sizeof(struct pt_regs));
+
+	for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) {
+		reg_entry = (struct hdat_fadump_reg_entry *)bufp;
+		opal_fadump_set_regval_regnum(regs,
+					      be32_to_cpu(reg_entry->reg_type),
+					      be32_to_cpu(reg_entry->reg_num),
+					      be64_to_cpu(reg_entry->reg_val));
+	}
+}
+
+/*
+ * Verify if CPU state data is available. If available, do a bit of sanity
+ * checking before processing this data.
+ */
+static bool __init is_opal_fadump_cpu_data_valid(struct fw_dump *fadump_conf)
+{
+	if (!opal_cpu_metadata)
+		return false;
+
+	fadump_conf->cpu_state_data_version =
+		be32_to_cpu(opal_cpu_metadata->cpu_data_version);
+	fadump_conf->cpu_state_entry_size =
+		be32_to_cpu(opal_cpu_metadata->cpu_data_size);
+	fadump_conf->cpu_state_dest_vaddr =
+		(u64)__va(be64_to_cpu(opal_cpu_metadata->region[0].dest));
+	fadump_conf->cpu_state_data_size =
+		be64_to_cpu(opal_cpu_metadata->region[0].size);
+
+	if (fadump_conf->cpu_state_data_version != HDAT_FADUMP_CPU_DATA_VER) {
+		pr_warn("Supported CPU state data version: %u, found: %d!\n",
+			HDAT_FADUMP_CPU_DATA_VER,
+			fadump_conf->cpu_state_data_version);
+		pr_warn("WARNING: F/W using newer CPU state data format!!\n");
+	}
+
+	if ((fadump_conf->cpu_state_dest_vaddr == 0) ||
+	    (fadump_conf->cpu_state_entry_size == 0) ||
+	    (fadump_conf->cpu_state_entry_size >
+	     fadump_conf->cpu_state_data_size)) {
+		pr_err("CPU state data is invalid. Ignoring!\n");
+		return false;
+	}
+
+	return true;
+}
+
 /*
  * Convert CPU state data saved at the time of crash into ELF notes.
  *
- * Append crashing CPU's register data saved by the kernel in the PT_NOTE.
+ * While the crashing CPU's register data is saved by the kernel, CPU state
+ * data for all CPUs is saved by f/w. In the CPU state data provided by
+ * f/w, each register entry is 16 bytes: a numerical identifier along with
+ * a GPR/SPR flag in the first 8 bytes, and the register value in the next
+ * 8 bytes. For more details, refer to the F/W documentation. If this data
+ * is missing or in an unsupported format, append the crashing CPU's
+ * register data saved by the kernel in the PT_NOTE, to have something to
+ * work with in the vmcore file.
  */
 static int __init
 opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf,
 			    struct fadump_crash_info_header *fdh)
 {
+	u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize;
+	struct hdat_fadump_thread_hdr *thdr;
+	bool is_cpu_data_valid = false;
 	u32 num_cpus = 1, *note_buf;
-	int rc;
+	struct pt_regs regs;
+	char *bufp;
+	int rc, i;
 
-	if (fdh->crashing_cpu == FADUMP_CPU_UNKNOWN)
-		return -ENODEV;
+	if (is_opal_fadump_cpu_data_valid(fadump_conf)) {
+		size_per_thread = fadump_conf->cpu_state_entry_size;
+		num_cpus = (fadump_conf->cpu_state_data_size / size_per_thread);
+		bufp = __va(fadump_conf->cpu_state_dest_vaddr);
+		is_cpu_data_valid = true;
+	}
 
-	/* Allocate CPU notes buffer to hold crashing cpu notes. */
 	rc = fadump_setup_cpu_notes_buf(num_cpus);
 	if (rc != 0)
 		return rc;
 
 	note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr;
-	note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs));
+	if (!is_cpu_data_valid)
+		goto out;
+
+	/*
+	 * Offset for register entries, entry size and registers count is
+	 * duplicated in every thread header in keeping with HDAT format.
+	 * Use these values from the first thread header.
+	 */
+	thdr = (struct hdat_fadump_thread_hdr *)bufp;
+	regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) +
+		       be32_to_cpu(thdr->offset));
+	reg_esize = be32_to_cpu(thdr->esize);
+	regs_cnt  = be32_to_cpu(thdr->ecnt);
+
+	pr_debug("--------CPU State Data------------\n");
+	pr_debug("NumCpus     : %u\n", num_cpus);
+	pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n",
+		 regs_offset, reg_esize, regs_cnt);
+
+	for (i = 0; i < num_cpus; i++, bufp += size_per_thread) {
+		thdr = (struct hdat_fadump_thread_hdr *)bufp;
+
+		thread_pir = be32_to_cpu(thdr->pir);
+		pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n",
+			 i, thread_pir, thdr->core_state);
+
+		/*
+		 * If this is a kernel initiated crash, crashing_cpu would be
+		 * set appropriately and the register data of the crashing CPU
+		 * saved by the crashing kernel. Add that saved register data
+		 * to the elf notes and populate pt_regs for the remaining
+		 * CPUs from the register state data provided by firmware.
+		 */
+		if (fdh->crashing_cpu == thread_pir) {
+			note_buf = fadump_regs_to_elf_notes(note_buf,
+							    &fdh->regs);
+			pr_debug("Crashing CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n",
+				 fdh->crashing_cpu, fdh->regs.gpr[1],
+				 fdh->regs.nip);
+			continue;
+		}
+
+		/*
+		 * Register state data of MAX cores is provided by firmware,
+		 * but some of these cores may not be active. So, while
+		 * processing register state data, check the core state and
+		 * skip threads that belong to inactive cores.
+		 */
+		if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE)
+			continue;
+
+		opal_fadump_read_regs((bufp + regs_offset), regs_cnt,
+				      reg_esize, &regs);
+		note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+		pr_debug("CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n",
+			 thread_pir, regs.gpr[1], regs.nip);
+	}
+
+out:
+	/*
+	 * CPU state data is invalid/unsupported. Try appending crashing CPU's
+	 * register data, if it is saved by the kernel.
+	 */
+	if (fadump_conf->cpu_notes_buf_vaddr == (u64)note_buf) {
+		if (fdh->crashing_cpu == FADUMP_CPU_UNKNOWN) {
+			fadump_free_cpu_notes_buf();
+			return -ENODEV;
+		}
+
+		pr_warn("WARNING: appending only crashing CPU's register data\n");
+		note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs));
+	}
+
 	final_note(note_buf);
 
 	pr_debug("Updating elfcore header (%llx) with cpu notes\n",
@@ -374,6 +554,14 @@ static void opal_fadump_trigger(struct fadump_crash_info_header *fdh,
 {
 	int rc;
 
+	/*
+	 * Unlike on pSeries platform, logical CPU number is not provided
+	 * with architected register state data. So, store the crashing
+	 * CPU's PIR instead to plug the appropriate register data for
+	 * crashing CPU in the vmcore file.
+	 */
+	fdh->crashing_cpu = (u32)mfspr(SPRN_PIR);
+
 	rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, msg);
 	if (rc == OPAL_UNSUPPORTED) {
 		pr_emerg("Reboot type %d not supported.\n",
@@ -450,6 +638,13 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
 		return;
 	}
 
+	ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr);
+	if (addr) {
+		addr = be64_to_cpu(addr);
+		pr_debug("CPU metadata addr: %llx\n", addr);
+		opal_cpu_metadata = __va(addr);
+	}
+
 	pr_info("Firmware-assisted dump is active.\n");
 	fadump_conf->dump_active = 1;
 	opal_fadump_get_config(fadump_conf, opal_fdm_active);
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h
index 0b83d895485c..c0b5d6e42ba6 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.h
+++ b/arch/powerpc/platforms/powernv/opal-fadump.h
@@ -36,4 +36,45 @@ struct opal_fadump_mem_struct {
 	struct opal_mpipl_region	rgn[OPAL_FADUMP_MAX_MEM_REGS];
 } __packed;
 
+/*
+ * CPU state data
+ *
+ * CPU state data information is provided by f/w. The format for this data
+ * is defined in the HDAT spec. Version is used to keep a tab on the changes
+ * in this CPU state data format. Changes to this format are unlikely, but
+ * if there are any changes, please refer to latest HDAT specification.
+ */
+#define HDAT_FADUMP_CPU_DATA_VER		1
+
+#define HDAT_FADUMP_CORE_INACTIVE		(0x0F)
+
+/* HDAT thread header for register entries */
+struct hdat_fadump_thread_hdr {
+	__be32  pir;
+	/* 0x00 - 0x0F - The corresponding stop state of the core */
+	u8      core_state;
+	u8      reserved[3];
+
+	__be32	offset;	/* Offset to Register Entries array */
+	__be32	ecnt;	/* Number of entries */
+	__be32	esize;	/* Alloc size of each array entry in bytes */
+	__be32	eactsz;	/* Actual size of each array entry in bytes */
+} __packed;
+
+/* Register types populated by f/w */
+#define HDAT_FADUMP_REG_TYPE_GPR		0x01
+#define HDAT_FADUMP_REG_TYPE_SPR		0x02
+
+/* ID numbers used by f/w while populating certain registers */
+#define HDAT_FADUMP_REG_ID_NIP			0x7D0
+#define HDAT_FADUMP_REG_ID_MSR			0x7D1
+#define HDAT_FADUMP_REG_ID_CCR			0x7D2
+
+/* HDAT register entry. */
+struct hdat_fadump_reg_entry {
+	__be32		reg_type;
+	__be32		reg_num;
+	__be64		reg_val;
+} __packed;
+
 #endif /* _POWERNV_OPAL_FADUMP_H */

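To make the buffer walk concrete, here is a reduced userspace model of the
thread/register-entry parsing: advance by the firmware-reported entry size
(which may exceed the struct size, hence walking by esize) and dispatch each
entry by type. Endianness conversion and most register IDs are dropped for
brevity; the structures and values are illustrative stand-ins.

  #include <stdio.h>
  #include <stdint.h>
  #include <string.h>

  #define REG_TYPE_GPR 0x01
  #define REG_ID_NIP   0x7D0

  struct reg_entry { uint32_t reg_type, reg_num; uint64_t reg_val; };
  struct regs { uint64_t gpr[32], nip; };

  static void set_reg(struct regs *r, uint32_t type, uint32_t num, uint64_t val)
  {
      if (type == REG_TYPE_GPR) {
          if (num < 32)
              r->gpr[num] = val;
          return;
      }
      if (num == REG_ID_NIP)    /* SPRs and other f/w IDs handled here */
          r->nip = val;
  }

  static void read_regs(const char *bufp, unsigned int cnt, unsigned int esize,
                        struct regs *r)
  {
      memset(r, 0, sizeof(*r));
      /* Walk by esize, not sizeof(entry): f/w may pad each slot. */
      for (unsigned int i = 0; i < cnt; i++, bufp += esize) {
          const struct reg_entry *e = (const void *)bufp;

          set_reg(r, e->reg_type, e->reg_num, e->reg_val);
      }
  }

  int main(void)
  {
      struct reg_entry buf[2] = { { 0x01, 1, 0xdeadbeef }, { 0x02, 0x7D0, 0x100 } };
      struct regs r;

      read_regs((const char *)buf, 2, sizeof(buf[0]), &r);
      printf("r1=0x%llx nip=0x%llx\n",
             (unsigned long long)r.gpr[1], (unsigned long long)r.nip);
      return 0;
  }
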
From e4fc48fb4d34f7e7d42eb980a9c130bb93aba3b9 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:25:05 +0530
Subject: [PATCH 245/264] powerpc/fadump: make crash memory ranges array
 allocation generic

Make the allocate_crash_memory_ranges() and free_crash_memory_ranges()
functions generic so they can be reused for memory management of all
types of dynamic memory range arrays. This change helps with memory
management of the reserved ranges array to be added later.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821369863.5656.4375667005352155892.stgit@hbathini.in.ibm.com
---
 arch/powerpc/include/asm/fadump-internal.h |  15 ++-
 arch/powerpc/kernel/fadump.c               | 113 +++++++++++----------
 2 files changed, 72 insertions(+), 56 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h
index 2c72685dfff5..0f2ce85ee7b9 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -72,9 +72,18 @@ struct fadump_crash_info_header {
 	struct cpumask	online_mask;
 };
 
-struct fad_crash_memory_ranges {
-	unsigned long long	base;
-	unsigned long long	size;
+struct fadump_memory_range {
+	u64	base;
+	u64	size;
+};
+
+/* fadump memory ranges info */
+struct fadump_mrange_info {
+	char				name[16];
+	struct fadump_memory_range	*mem_ranges;
+	u32				mem_ranges_sz;
+	u32				mem_range_cnt;
+	u32				max_mem_ranges;
 };
 
 /* Platform specific callback functions */
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index da751402c649..f95ec1fd797a 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -35,10 +35,7 @@
 static struct fw_dump fw_dump;
 
 static DEFINE_MUTEX(fadump_mutex);
-struct fad_crash_memory_ranges *crash_memory_ranges;
-int crash_memory_ranges_size;
-int crash_mem_ranges;
-int max_crash_mem_ranges;
+struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 };
 
 #ifdef CONFIG_CMA
 static struct cma *fadump_cma;
@@ -629,46 +626,48 @@ void fadump_free_cpu_notes_buf(void)
 	fw_dump.cpu_notes_buf_size = 0;
 }
 
-static void free_crash_memory_ranges(void)
+static void fadump_free_mem_ranges(struct fadump_mrange_info *mrange_info)
 {
-	kfree(crash_memory_ranges);
-	crash_memory_ranges = NULL;
-	crash_memory_ranges_size = 0;
-	max_crash_mem_ranges = 0;
+	kfree(mrange_info->mem_ranges);
+	mrange_info->mem_ranges = NULL;
+	mrange_info->mem_ranges_sz = 0;
+	mrange_info->max_mem_ranges = 0;
 }
 
 /*
- * Allocate or reallocate crash memory ranges array in incremental units
+ * Allocate or reallocate mem_ranges array in incremental units
  * of PAGE_SIZE.
  */
-static int allocate_crash_memory_ranges(void)
+static int fadump_alloc_mem_ranges(struct fadump_mrange_info *mrange_info)
 {
-	struct fad_crash_memory_ranges *new_array;
+	struct fadump_memory_range *new_array;
 	u64 new_size;
 
-	new_size = crash_memory_ranges_size + PAGE_SIZE;
-	pr_debug("Allocating %llu bytes of memory for crash memory ranges\n",
-		 new_size);
+	new_size = mrange_info->mem_ranges_sz + PAGE_SIZE;
+	pr_debug("Allocating %llu bytes of memory for %s memory ranges\n",
+		 new_size, mrange_info->name);
 
-	new_array = krealloc(crash_memory_ranges, new_size, GFP_KERNEL);
+	new_array = krealloc(mrange_info->mem_ranges, new_size, GFP_KERNEL);
 	if (new_array == NULL) {
-		pr_err("Insufficient memory for setting up crash memory ranges\n");
-		free_crash_memory_ranges();
+		pr_err("Insufficient memory for setting up %s memory ranges\n",
+		       mrange_info->name);
+		fadump_free_mem_ranges(mrange_info);
 		return -ENOMEM;
 	}
 
-	crash_memory_ranges = new_array;
-	crash_memory_ranges_size = new_size;
-	max_crash_mem_ranges = (new_size /
-				sizeof(struct fad_crash_memory_ranges));
+	mrange_info->mem_ranges = new_array;
+	mrange_info->mem_ranges_sz = new_size;
+	mrange_info->max_mem_ranges = (new_size /
+				       sizeof(struct fadump_memory_range));
 	return 0;
 }
 
-static inline int fadump_add_crash_memory(unsigned long long base,
-					  unsigned long long end)
+static inline int fadump_add_mem_range(struct fadump_mrange_info *mrange_info,
+				       u64 base, u64 end)
 {
-	u64  start, size;
+	struct fadump_memory_range *mem_ranges = mrange_info->mem_ranges;
 	bool is_adjacent = false;
+	u64 start, size;
 
 	if (base == end)
 		return 0;
@@ -677,38 +676,41 @@ static inline int fadump_add_crash_memory(unsigned long long base,
 	 * Fold adjacent memory ranges to bring down the memory ranges/
 	 * PT_LOAD segments count.
 	 */
-	if (crash_mem_ranges) {
-		start = crash_memory_ranges[crash_mem_ranges - 1].base;
-		size = crash_memory_ranges[crash_mem_ranges - 1].size;
+	if (mrange_info->mem_range_cnt) {
+		start = mem_ranges[mrange_info->mem_range_cnt - 1].base;
+		size  = mem_ranges[mrange_info->mem_range_cnt - 1].size;
 
 		if ((start + size) == base)
 			is_adjacent = true;
 	}
 	if (!is_adjacent) {
 		/* resize the array on reaching the limit */
-		if (crash_mem_ranges == max_crash_mem_ranges) {
+		if (mrange_info->mem_range_cnt == mrange_info->max_mem_ranges) {
 			int ret;
 
-			ret = allocate_crash_memory_ranges();
+			ret = fadump_alloc_mem_ranges(mrange_info);
 			if (ret)
 				return ret;
+
+			/* Update to the new resized array */
+			mem_ranges = mrange_info->mem_ranges;
 		}
 
 		start = base;
-		crash_memory_ranges[crash_mem_ranges].base = start;
-		crash_mem_ranges++;
+		mem_ranges[mrange_info->mem_range_cnt].base = start;
+		mrange_info->mem_range_cnt++;
 	}
 
-	crash_memory_ranges[crash_mem_ranges - 1].size = (end - start);
-	pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n",
-		(crash_mem_ranges - 1), start, end - 1, (end - start));
+	mem_ranges[mrange_info->mem_range_cnt - 1].size = (end - start);
+	pr_debug("%s_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n",
+		 mrange_info->name, (mrange_info->mem_range_cnt - 1),
+		 start, end - 1, (end - start));
 	return 0;
 }
 
-static int fadump_exclude_reserved_area(unsigned long long start,
-					unsigned long long end)
+static int fadump_exclude_reserved_area(u64 start, u64 end)
 {
-	unsigned long long ra_start, ra_end;
+	u64 ra_start, ra_end;
 	int ret = 0;
 
 	ra_start = fw_dump.reserve_dump_area_start;
@@ -716,18 +718,22 @@ static int fadump_exclude_reserved_area(unsigned long long start,
 
 	if ((ra_start < end) && (ra_end > start)) {
 		if ((start < ra_start) && (end > ra_end)) {
-			ret = fadump_add_crash_memory(start, ra_start);
+			ret = fadump_add_mem_range(&crash_mrange_info,
+						   start, ra_start);
 			if (ret)
 				return ret;
 
-			ret = fadump_add_crash_memory(ra_end, end);
+			ret = fadump_add_mem_range(&crash_mrange_info,
+						   ra_end, end);
 		} else if (start < ra_start) {
-			ret = fadump_add_crash_memory(start, ra_start);
+			ret = fadump_add_mem_range(&crash_mrange_info,
+						   start, ra_start);
 		} else if (ra_end < end) {
-			ret = fadump_add_crash_memory(ra_end, end);
+			ret = fadump_add_mem_range(&crash_mrange_info,
+						   ra_end, end);
 		}
 	} else
-		ret = fadump_add_crash_memory(start, end);
+		ret = fadump_add_mem_range(&crash_mrange_info, start, end);
 
 	return ret;
 }
@@ -772,11 +778,11 @@ static int fadump_init_elfcore_header(char *bufp)
 static int fadump_setup_crash_memory_ranges(void)
 {
 	struct memblock_region *reg;
-	unsigned long long start, end;
+	u64 start, end;
 	int ret;
 
 	pr_debug("Setup crash memory ranges.\n");
-	crash_mem_ranges = 0;
+	crash_mrange_info.mem_range_cnt = 0;
 
 	/*
 	 * add the first memory chunk (RMA_START through boot_memory_size) as
@@ -785,13 +791,14 @@ static int fadump_setup_crash_memory_ranges(void)
 	 * specified during fadump registration. We need to create a separate
 	 * program header for this chunk with the correct offset.
 	 */
-	ret = fadump_add_crash_memory(RMA_START, fw_dump.boot_memory_size);
+	ret = fadump_add_mem_range(&crash_mrange_info,
+				   RMA_START, fw_dump.boot_memory_size);
 	if (ret)
 		return ret;
 
 	for_each_memblock(memory, reg) {
-		start = (unsigned long long)reg->base;
-		end = start + (unsigned long long)reg->size;
+		start = (u64)reg->base;
+		end = start + (u64)reg->size;
 
 		/*
 		 * skip the first memory chunk that is already added (RMA_START
@@ -876,11 +883,11 @@ static int fadump_create_elfcore_headers(char *bufp)
 
 	/* setup PT_LOAD sections. */
 
-	for (i = 0; i < crash_mem_ranges; i++) {
-		unsigned long long mbase, msize;
-		mbase = crash_memory_ranges[i].base;
-		msize = crash_memory_ranges[i].size;
+	for (i = 0; i < crash_mrange_info.mem_range_cnt; i++) {
+		u64 mbase, msize;
 
+		mbase = crash_mrange_info.mem_ranges[i].base;
+		msize = crash_mrange_info.mem_ranges[i].size;
 		if (!msize)
 			continue;
 
@@ -973,7 +980,7 @@ void fadump_cleanup(void)
 	} else if (fw_dump.dump_registered) {
 		/* Un-register Firmware-assisted dump if it was registered. */
 		fw_dump.ops->fadump_unregister(&fw_dump);
-		free_crash_memory_ranges();
+		fadump_free_mem_ranges(&crash_mrange_info);
 	}
 
 	if (fw_dump.ops->fadump_cleanup)

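The generic range array boils down to two operations: grow the backing store
in fixed increments and fold a new range into the previous one when adjacent.
A compact userspace equivalent follows; realloc stands in for krealloc, and
the 64-entry growth step is arbitrary (the patch grows by PAGE_SIZE).

  #include <stdio.h>
  #include <stdlib.h>
  #include <stdint.h>

  struct mem_range { uint64_t base, size; };
  struct mrange_info {
      struct mem_range *ranges;
      unsigned int cnt, max;
  };

  /* Grow the array in fixed increments, like the krealloc() in the patch. */
  static int grow(struct mrange_info *mi)
  {
      unsigned int newmax = mi->max + 64;
      void *p = realloc(mi->ranges, newmax * sizeof(*mi->ranges));

      if (!p)
          return -1;
      mi->ranges = p;
      mi->max = newmax;
      return 0;
  }

  static int add_range(struct mrange_info *mi, uint64_t base, uint64_t end)
  {
      if (base == end)
          return 0;
      /* Fold into the previous range if adjacent. */
      if (mi->cnt && mi->ranges[mi->cnt - 1].base +
                     mi->ranges[mi->cnt - 1].size == base) {
          mi->ranges[mi->cnt - 1].size += end - base;
          return 0;
      }
      if (mi->cnt == mi->max && grow(mi))
          return -1;
      mi->ranges[mi->cnt].base = base;
      mi->ranges[mi->cnt].size = end - base;
      mi->cnt++;
      return 0;
  }

  int main(void)
  {
      struct mrange_info mi = { NULL, 0, 0 };

      add_range(&mi, 0x0, 0x1000);
      add_range(&mi, 0x1000, 0x3000);    /* folds into [0x0, 0x3000) */
      printf("cnt=%u size=0x%llx\n", mi.cnt,
             (unsigned long long)mi.ranges[0].size);
      free(mi.ranges);
      return 0;
  }
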
From dda9dbfeeb7a855a75965b8ba7269f4edb35cde7 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:25:36 +0530
Subject: [PATCH 246/264] powerpc/fadump: consider reserved ranges while
 releasing memory

Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for
memory reservations") enabled support to parse the 'reserved-ranges'
DT node to reserve kernel memory falling in these ranges for firmware
purposes. Along with the preserved area memory, ensure memory in
reserved ranges is not overlapped with memory released by the capture
kernel after saving the vmcore. Also, fix the off-by-one error in the
fadump_release_reserved_area() function while releasing memory.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821371358.5656.6061214942558818661.stgit@hbathini.in.ibm.com
---
 arch/powerpc/kernel/fadump.c | 167 ++++++++++++++++++++++++++++++-----
 1 file changed, 146 insertions(+), 21 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index f95ec1fd797a..502e49ab4b98 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -36,6 +36,7 @@ static struct fw_dump fw_dump;
 
 static DEFINE_MUTEX(fadump_mutex);
 struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 };
+struct fadump_mrange_info reserved_mrange_info = { "reserved", NULL, 0, 0, 0 };
 
 #ifdef CONFIG_CMA
 static struct cma *fadump_cma;
@@ -1009,49 +1010,173 @@ static void fadump_free_reserved_memory(unsigned long start_pfn,
 /*
  * Skip memory holes and free memory that was actually reserved.
  */
-static void fadump_release_reserved_area(unsigned long start, unsigned long end)
+static void fadump_release_reserved_area(u64 start, u64 end)
 {
+	u64 tstart, tend, spfn, epfn;
 	struct memblock_region *reg;
-	unsigned long tstart, tend;
-	unsigned long start_pfn = PHYS_PFN(start);
-	unsigned long end_pfn = PHYS_PFN(end);
 
+	spfn = PHYS_PFN(start);
+	epfn = PHYS_PFN(end);
 	for_each_memblock(memory, reg) {
-		tstart = max(start_pfn, memblock_region_memory_base_pfn(reg));
-		tend = min(end_pfn, memblock_region_memory_end_pfn(reg));
+		tstart = max_t(u64, spfn, memblock_region_memory_base_pfn(reg));
+		tend   = min_t(u64, epfn, memblock_region_memory_end_pfn(reg));
 		if (tstart < tend) {
 			fadump_free_reserved_memory(tstart, tend);
 
-			if (tend == end_pfn)
+			if (tend == epfn)
 				break;
 
-			start_pfn = tend + 1;
+			spfn = tend;
 		}
 	}
 }
 
 /*
- * Release the memory that was reserved in early boot to preserve the memory
- * contents. The released memory will be available for general use.
+ * Sort the mem ranges in-place and merge adjacent ranges
+ * to minimize the memory ranges count.
  */
-static void fadump_release_memory(unsigned long begin, unsigned long end)
+static void sort_and_merge_mem_ranges(struct fadump_mrange_info *mrange_info)
 {
-	unsigned long ra_start, ra_end;
+	struct fadump_memory_range *mem_ranges;
+	struct fadump_memory_range tmp_range;
+	u64 base, size;
+	int i, j, idx;
+
+	if (!mrange_info->mem_range_cnt)
+		return;
+
+	/* Sort the memory ranges */
+	mem_ranges = mrange_info->mem_ranges;
+	for (i = 0; i < mrange_info->mem_range_cnt; i++) {
+		idx = i;
+		for (j = (i + 1); j < mrange_info->mem_range_cnt; j++) {
+			if (mem_ranges[idx].base > mem_ranges[j].base)
+				idx = j;
+		}
+		if (idx != i) {
+			tmp_range = mem_ranges[idx];
+			mem_ranges[idx] = mem_ranges[i];
+			mem_ranges[i] = tmp_range;
+		}
+	}
+
+	/* Merge adjacent reserved ranges */
+	idx = 0;
+	for (i = 1; i < mrange_info->mem_range_cnt; i++) {
+		base = mem_ranges[i-1].base;
+		size = mem_ranges[i-1].size;
+		if (mem_ranges[i].base == (base + size))
+			mem_ranges[idx].size += mem_ranges[i].size;
+		else {
+			idx++;
+			if (i == idx)
+				continue;
+
+			mem_ranges[idx] = mem_ranges[i];
+		}
+	}
+	mrange_info->mem_range_cnt = idx + 1;
+}
+
+/*
+ * Scan reserved-ranges to consider them while reserving/releasing
+ * memory for FADump.
+ */
+static inline int fadump_scan_reserved_mem_ranges(void)
+{
+	struct device_node *root;
+	const __be32 *prop;
+	int len, ret = -1;
+	unsigned long i;
+
+	root = of_find_node_by_path("/");
+	if (!root)
+		return ret;
+
+	prop = of_get_property(root, "reserved-ranges", &len);
+	if (!prop)
+		return ret;
+
+	/*
+	 * Each reserved range is an (address,size) pair, 2 cells each,
+	 * totalling 4 cells per range.
+	 */
+	for (i = 0; i < len / (sizeof(*prop) * 4); i++) {
+		u64 base, size;
+
+		base = of_read_number(prop + (i * 4) + 0, 2);
+		size = of_read_number(prop + (i * 4) + 2, 2);
+
+		if (size) {
+			ret = fadump_add_mem_range(&reserved_mrange_info,
+						   base, base + size);
+			if (ret < 0) {
+				pr_warn("some reserved ranges are ignored!\n");
+				break;
+			}
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Release the memory that was reserved during early boot to preserve the
+ * crash'ed kernel's memory contents, except the reserved dump area
+ * (permanent reservation) and the reserved ranges used by F/W. The
+ * released memory will be available for general use.
+ */
+static void fadump_release_memory(u64 begin, u64 end)
+{
+	u64 ra_start, ra_end, tstart;
+	int i, ret;
+
+	fadump_scan_reserved_mem_ranges();
 
 	ra_start = fw_dump.reserve_dump_area_start;
 	ra_end = ra_start + fw_dump.reserve_dump_area_size;
 
 	/*
-	 * exclude the dump reserve area. Will reuse it for next
-	 * fadump registration.
+	 * Add reserved dump area to reserved ranges list
+	 * and exclude all these ranges while releasing memory.
 	 */
-	if (begin < ra_end && end > ra_start) {
-		if (begin < ra_start)
-			fadump_release_reserved_area(begin, ra_start);
-		if (end > ra_end)
-			fadump_release_reserved_area(ra_end, end);
-	} else
-		fadump_release_reserved_area(begin, end);
+	ret = fadump_add_mem_range(&reserved_mrange_info, ra_start, ra_end);
+	if (ret != 0) {
+		/*
+		 * Not enough memory to set up reserved ranges when the system
+		 * is already short of memory. So, release all the memory except
+		 * the reserved dump area (reused for next fadump registration).
+		 */
+		if (begin < ra_end && end > ra_start) {
+			if (begin < ra_start)
+				fadump_release_reserved_area(begin, ra_start);
+			if (end > ra_end)
+				fadump_release_reserved_area(ra_end, end);
+		} else
+			fadump_release_reserved_area(begin, end);
+
+		return;
+	}
+
+	/* Get the reserved ranges list in order first. */
+	sort_and_merge_mem_ranges(&reserved_mrange_info);
+
+	/* Exclude reserved ranges and release remaining memory */
+	tstart = begin;
+	for (i = 0; i < reserved_mrange_info.mem_range_cnt; i++) {
+		ra_start = reserved_mrange_info.mem_ranges[i].base;
+		ra_end = ra_start + reserved_mrange_info.mem_ranges[i].size;
+
+		if (tstart >= ra_end)
+			continue;
+
+		if (tstart < ra_start)
+			fadump_release_reserved_area(tstart, ra_start);
+		tstart = ra_end;
+	}
+
+	if (tstart < end)
+		fadump_release_reserved_area(tstart, end);
 }
 
 static void fadump_invalidate_release_mem(void)

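A simplified equivalent of sort_and_merge_mem_ranges(): selection-sort the
ranges by base address, then merge adjacent ranges in place. This userspace
version accumulates into the merged slot rather than peeking at the previous
raw entry, which yields the same result.

  #include <stdio.h>
  #include <stdint.h>

  struct range { uint64_t base, size; };

  static void sort_and_merge(struct range *r, unsigned int *cnt)
  {
      unsigned int i, j, idx;

      if (!*cnt)
          return;
      /* Selection sort by base, as in the patch. */
      for (i = 0; i < *cnt; i++) {
          idx = i;
          for (j = i + 1; j < *cnt; j++)
              if (r[idx].base > r[j].base)
                  idx = j;
          if (idx != i) {
              struct range tmp = r[idx];

              r[idx] = r[i];
              r[i] = tmp;
          }
      }
      /* Merge runs of adjacent ranges in place. */
      idx = 0;
      for (i = 1; i < *cnt; i++) {
          if (r[i].base == r[idx].base + r[idx].size)
              r[idx].size += r[i].size;
          else
              r[++idx] = r[i];
      }
      *cnt = idx + 1;
  }

  int main(void)
  {
      struct range r[] = { { 0x2000, 0x1000 }, { 0x0, 0x1000 }, { 0x1000, 0x1000 } };
      unsigned int cnt = 3;

      sort_and_merge(r, &cnt);
      printf("cnt=%u [0x%llx, +0x%llx)\n", cnt,
             (unsigned long long)r[0].base, (unsigned long long)r[0].size);
      return 0;
  }
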
From b2a815a554a34f0e6fab4526ae762d5528783600 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:25:49 +0530
Subject: [PATCH 247/264] powerpc/fadump: improve how crashed kernel's memory
 is reserved

The size parameter to the fadump_reserve_crash_area() function is not
needed, as all the memory above the boot memory size must be preserved
anyway. Update the function by dropping this redundant parameter.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821374440.5656.2945512543806951766.stgit@hbathini.in.ibm.com
---
 arch/powerpc/kernel/fadump.c | 53 ++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 502e49ab4b98..6f52a60bc212 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -34,6 +34,8 @@
 
 static struct fw_dump fw_dump;
 
+static void __init fadump_reserve_crash_area(u64 base);
+
 static DEFINE_MUTEX(fadump_mutex);
 struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 };
 struct fadump_mrange_info reserved_mrange_info = { "reserved", NULL, 0, 0, 0 };
@@ -318,26 +320,6 @@ static unsigned long get_fadump_area_size(void)
 	return size;
 }
 
-static void __init fadump_reserve_crash_area(unsigned long base,
-					     unsigned long size)
-{
-	struct memblock_region *reg;
-	unsigned long mstart, mend, msize;
-
-	for_each_memblock(memory, reg) {
-		mstart = max_t(unsigned long, base, reg->base);
-		mend = reg->base + reg->size;
-		mend = min(base + size, mend);
-
-		if (mstart < mend) {
-			msize = mend - mstart;
-			memblock_reserve(mstart, msize);
-			pr_info("Reserved %ldMB of memory at %#016lx for saving crash dump\n",
-				(msize >> 20), mstart);
-		}
-	}
-}
-
 int __init fadump_reserve_mem(void)
 {
 	bool is_memblock_bottom_up = memblock_bottom_up();
@@ -406,12 +388,11 @@ int __init fadump_reserve_mem(void)
 #endif
 		/*
 		 * If last boot has crashed then reserve all the memory
-		 * above boot_memory_size so that we don't touch it until
+		 * above boot memory size so that we don't touch it until
 		 * dump is written to disk by userspace tool. This memory
-		 * will be released for general use once the dump is saved.
+		 * can be released for general use by invalidating fadump.
 		 */
-		size = mem_boundary - base;
-		fadump_reserve_crash_area(base, size);
+		fadump_reserve_crash_area(base);
 
 		pr_debug("fadumphdr_addr = %#016lx\n", fw_dump.fadumphdr_addr);
 		pr_debug("Reserve dump area start address: 0x%lx\n",
@@ -1377,3 +1358,27 @@ int __init setup_fadump(void)
 	return 1;
 }
 subsys_initcall(setup_fadump);
+
+/* Preserve everything above the base address */
+static void __init fadump_reserve_crash_area(u64 base)
+{
+	struct memblock_region *reg;
+	u64 mstart, msize;
+
+	for_each_memblock(memory, reg) {
+		mstart = reg->base;
+		msize  = reg->size;
+
+		if ((mstart + msize) < base)
+			continue;
+
+		if (mstart < base) {
+			msize -= (base - mstart);
+			mstart = base;
+		}
+
+		pr_info("Reserving %lluMB of memory at %#016llx for preserving crash data",
+			(msize >> 20), mstart);
+		memblock_reserve(mstart, msize);
+	}
+}

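The reworked reservation walks the memory map once, skipping regions wholly
below the base address and clipping the one region that straddles it. A
standalone model with a made-up two-region memory map standing in for
for_each_memblock(memory, reg):

  #include <stdio.h>
  #include <stdint.h>

  struct range { uint64_t base, size; };

  /* Memory map stand-in; values are illustrative. */
  static struct range mem[] = { { 0x0, 0x40000000 }, { 0x100000000, 0x40000000 } };

  static void reserve_above(uint64_t base)
  {
      for (unsigned int i = 0; i < sizeof(mem)/sizeof(*mem); i++) {
          uint64_t mstart = mem[i].base, msize = mem[i].size;

          if (mstart + msize < base)
              continue;                /* wholly below base: keep  */
          if (mstart < base) {         /* straddles base: clip     */
              msize -= base - mstart;
              mstart = base;
          }
          printf("reserve [0x%llx, +0x%llx)\n",
                 (unsigned long long)mstart, (unsigned long long)msize);
      }
  }

  int main(void)
  {
      reserve_above(0x10000000);    /* preserve everything above 256MB */
      return 0;
  }
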
From bec53196adf4791d466adf0e339b61186c7b5283 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:26:03 +0530
Subject: [PATCH 248/264] powerpc/fadump: add support to preserve crash data on
 FADUMP disabled kernel

Add a new kernel config option, CONFIG_PRESERVE_FA_DUMP, that ensures
crash data from a previously crash'ed kernel is preserved. This
helps in cases where FADump is not enabled but the subsequent memory
preserving kernel boot is likely to process this crash data. One
typical use case for this config option is the petitboot kernel.

As OPAL allows registering an address with it in the first kernel and
retrieving it after MPIPL, use it to store the top of boot memory.
A kernel that intends to preserve crash data retrieves it and avoids
using memory beyond this address.

Move arch_reserved_kernel_pages() function as it is needed for both
FA_DUMP and PRESERVE_FA_DUMP configurations.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821375751.5656.11459483669542541602.stgit@hbathini.in.ibm.com
---
 arch/powerpc/Kconfig                         |  9 +++
 arch/powerpc/include/asm/fadump-internal.h   | 11 +++
 arch/powerpc/include/asm/fadump.h            |  9 ++-
 arch/powerpc/kernel/Makefile                 |  4 +-
 arch/powerpc/kernel/fadump.c                 | 44 ++++++++++--
 arch/powerpc/kernel/prom.c                   |  4 +-
 arch/powerpc/platforms/powernv/Makefile      |  1 +
 arch/powerpc/platforms/powernv/opal-fadump.c | 73 ++++++++++++++++++++
 8 files changed, 144 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 375ff893fa60..62c483133953 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -583,6 +583,15 @@ config FA_DUMP
 	  If unsure, say "y". Only special kernels like petitboot may
 	  need to say "N" here.
 
+config PRESERVE_FA_DUMP
+	bool "Preserve Firmware-assisted dump"
+	depends on PPC64 && PPC_POWERNV && !FA_DUMP
+	help
+	  On a kernel with FA_DUMP disabled, this option helps to preserve
+	  crash data from a previously crash'ed kernel. Useful when the next
+	  memory preserving kernel boot would process this crash data.
+	  The petitboot kernel is the typical use case for this option.
+
 config IRQ_ALL_CPUS
 	bool "Distribute interrupts on all CPUs by default"
 	depends on SMP
diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h
index 0f2ce85ee7b9..6c2868d38bd3 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -9,6 +9,7 @@
 #ifndef _ASM_POWERPC_FADUMP_INTERNAL_H
 #define _ASM_POWERPC_FADUMP_INTERNAL_H
 
+#ifndef CONFIG_PRESERVE_FA_DUMP
 /*
  * The RMA region will be saved for later dumping when kernel crashes.
  * RMA is Real Mode Area, the first block of logical memory address owned
@@ -146,6 +147,16 @@ void fadump_update_elfcore_header(char *bufp);
 bool is_fadump_boot_mem_contiguous(void);
 bool is_fadump_reserved_mem_contiguous(void);
 
+#else /* !CONFIG_PRESERVE_FA_DUMP */
+
+/* Firmware-assisted dump configuration details. */
+struct fw_dump {
+	u64	boot_mem_top;
+	u64	dump_active;
+};
+
+#endif /* CONFIG_PRESERVE_FA_DUMP */
+
 #ifdef CONFIG_PPC_PSERIES
 extern void rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node);
 #else
diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
index fd5002b4c18c..526a6a647312 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -14,9 +14,6 @@
 extern int crashing_cpu;
 
 extern int is_fadump_memory_area(u64 addr, ulong size);
-extern int early_init_dt_scan_fw_dump(unsigned long node,
-		const char *uname, int depth, void *data);
-extern int fadump_reserve_mem(void);
 extern int setup_fadump(void);
 extern int is_fadump_active(void);
 extern int should_fadump_crash(void);
@@ -29,4 +26,10 @@ static inline int should_fadump_crash(void) { return 0; }
 static inline void crash_fadump(struct pt_regs *regs, const char *str) { }
 static inline void fadump_cleanup(void) { }
 #endif /* !CONFIG_FA_DUMP */
+
+#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP)
+extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
+				      int depth, void *data);
+extern int fadump_reserve_mem(void);
+#endif
 #endif /* _ASM_POWERPC_FADUMP_H */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 66c54443187d..21ab769e8530 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -79,7 +79,9 @@ obj-$(CONFIG_EEH)              += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \
 				  eeh_driver.o eeh_event.o eeh_sysfs.o
 obj-$(CONFIG_GENERIC_TBSYNC)	+= smp-tbsync.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
-obj-$(CONFIG_FA_DUMP)		+= fadump.o
+ifneq ($(CONFIG_FA_DUMP)$(CONFIG_PRESERVE_FA_DUMP),)
+obj-y				+= fadump.o
+endif
 ifdef CONFIG_PPC32
 obj-$(CONFIG_E500)		+= idle_e500.o
 endif
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 6f52a60bc212..645d9d4d9332 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -36,6 +36,7 @@ static struct fw_dump fw_dump;
 
 static void __init fadump_reserve_crash_area(u64 base);
 
+#ifndef CONFIG_PRESERVE_FA_DUMP
 static DEFINE_MUTEX(fadump_mutex);
 struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 };
 struct fadump_mrange_info reserved_mrange_info = { "reserved", NULL, 0, 0, 0 };
@@ -439,11 +440,6 @@ error_out:
 	return 0;
 }
 
-unsigned long __init arch_reserved_kernel_pages(void)
-{
-	return memblock_reserved_size() / PAGE_SIZE;
-}
-
 /* Look for fadump= cmdline option. */
 static int __init early_fadump_param(char *p)
 {
@@ -1358,6 +1354,39 @@ int __init setup_fadump(void)
 	return 1;
 }
 subsys_initcall(setup_fadump);
+#else /* !CONFIG_PRESERVE_FA_DUMP */
+
+/* Scan the Firmware Assisted dump configuration details. */
+int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
+				      int depth, void *data)
+{
+	if ((depth != 1) || (strcmp(uname, "ibm,opal") != 0))
+		return 0;
+
+	opal_fadump_dt_scan(&fw_dump, node);
+	return 1;
+}
+
+/*
+ * When a dump is active but only PRESERVE_FA_DUMP is enabled on the
+ * kernel, preserve the crash data. The subsequent memory preserving
+ * kernel boot is likely to process this crash data.
+ */
+int __init fadump_reserve_mem(void)
+{
+	if (fw_dump.dump_active) {
+		/*
+		 * If last boot has crashed then reserve all the memory
+		 * above boot memory to preserve crash data.
+		 */
+		pr_info("Preserving crash data for processing in next boot.\n");
+		fadump_reserve_crash_area(fw_dump.boot_mem_top);
+	} else
+		pr_debug("FADump-aware kernel..\n");
+
+	return 1;
+}
+#endif /* CONFIG_PRESERVE_FA_DUMP */
 
 /* Preserve everything above the base address */
 static void __init fadump_reserve_crash_area(u64 base)
@@ -1382,3 +1411,8 @@ static void __init fadump_reserve_crash_area(u64 base)
 		memblock_reserve(mstart, msize);
 	}
 }
+
+unsigned long __init arch_reserved_kernel_pages(void)
+{
+	return memblock_reserved_size() / PAGE_SIZE;
+}
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 5828f1c81dc9..6620f37abe73 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -708,7 +708,7 @@ void __init early_init_devtree(void *params)
 	of_scan_flat_dt(early_init_dt_scan_ultravisor, NULL);
 #endif
 
-#ifdef CONFIG_FA_DUMP
+#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP)
 	/* scan tree to see if dump is active during last boot */
 	of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL);
 #endif
@@ -735,7 +735,7 @@ void __init early_init_devtree(void *params)
 	if (PHYSICAL_START > MEMORY_START)
 		memblock_reserve(MEMORY_START, 0x8000);
 	reserve_kdump_trampoline();
-#ifdef CONFIG_FA_DUMP
+#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP)
 	/*
 	 * If we fail to reserve memory for firmware-assisted dump then
 	 * fallback to kexec based kdump.
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 3e6d6d6d1ec1..2d13e6462339 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -8,6 +8,7 @@ obj-y			+= ultravisor.o
 
 obj-$(CONFIG_SMP)	+= smp.o subcore.o subcore-asm.o
 obj-$(CONFIG_FA_DUMP)	+= opal-fadump.o
+obj-$(CONFIG_PRESERVE_FA_DUMP)	+= opal-fadump.o
 obj-$(CONFIG_PCI)	+= pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
 obj-$(CONFIG_CXL_BASE)	+= pci-cxl.o
 obj-$(CONFIG_EEH)	+= eeh-powernv.o
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
index ad4c18bd8809..c66fa96915ae 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -20,6 +20,67 @@
 
 #include "opal-fadump.h"
 
+
+#ifdef CONFIG_PRESERVE_FA_DUMP
+/*
+ * When a dump is active but only PRESERVE_FA_DUMP is enabled on the
+ * kernel, ensure the crash data is preserved in the hope that the
+ * subsequent memory preserving kernel boot will process it.
+ */
+void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
+{
+	const struct opal_fadump_mem_struct *opal_fdm_active;
+	const __be32 *prop;
+	unsigned long dn;
+	u64 addr = 0;
+	s64 ret;
+
+	dn = of_get_flat_dt_subnode_by_name(node, "dump");
+	if (dn == -FDT_ERR_NOTFOUND)
+		return;
+
+	/*
+	 * Check if dump has been initiated on last reboot.
+	 */
+	prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL);
+	if (!prop)
+		return;
+
+	ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr);
+	if ((ret != OPAL_SUCCESS) || !addr) {
+		pr_debug("Could not get Kernel metadata (%lld)\n", ret);
+		return;
+	}
+
+	/*
+	 * Preserve memory only if kernel memory regions are registered
+	 * with f/w for MPIPL.
+	 */
+	addr = be64_to_cpu(addr);
+	pr_debug("Kernel metadata addr: %llx\n", addr);
+	opal_fdm_active = (void *)addr;
+	if (opal_fdm_active->registered_regions == 0)
+		return;
+
+	ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_BOOT_MEM, &addr);
+	if ((ret != OPAL_SUCCESS) || !addr) {
+		pr_err("Failed to get boot memory tag (%lld)\n", ret);
+		return;
+	}
+
+	/*
+	 * Memory below this address can be used for booting a
+	 * capture kernel or petitboot kernel. Preserve everything
+	 * above this address for processing crashdump.
+	 */
+	fadump_conf->boot_mem_top = be64_to_cpu(addr);
+	pr_debug("Preserve everything above %llx\n", fadump_conf->boot_mem_top);
+
+	pr_info("Firmware-assisted dump is active.\n");
+	fadump_conf->dump_active = 1;
+}
+
+#else /* CONFIG_PRESERVE_FA_DUMP */
 static const struct opal_fadump_mem_struct *opal_fdm_active;
 static const struct opal_mpipl_fadump *opal_cpu_metadata;
 static struct opal_fadump_mem_struct *opal_fdm;
@@ -183,6 +244,17 @@ static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf)
 		err = -EPERM;
 	}
 
+	/*
+	 * Register the boot memory top address with f/w. It is retrieved
+	 * by a kernel that intends to preserve the crashed kernel's memory.
+	 */
+	ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_BOOT_MEM,
+				      fadump_conf->boot_memory_size);
+	if (ret != OPAL_SUCCESS) {
+		pr_err("Failed to set boot memory tag!\n");
+		err = -EPERM;
+	}
+
 	return err;
 }
 
@@ -649,3 +721,4 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
 	fadump_conf->dump_active = 1;
 	opal_fadump_get_config(fadump_conf, opal_fdm_active);
 }
+#endif /* !CONFIG_PRESERVE_FA_DUMP */

From 58cf055df47bb115e7c1533eec2a8f66639b80e3 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:26:16 +0530
Subject: [PATCH 249/264] powerpc/fadump: update documentation about
 CONFIG_PRESERVE_FA_DUMP

Kernel config option CONFIG_PRESERVE_FA_DUMP is introduced to ensure
that crash data from the previously crashed kernel is preserved. Update
the documentation with these details.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821377195.5656.15840065629705958324.stgit@hbathini.in.ibm.com
---
 Documentation/powerpc/firmware-assisted-dump.rst | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Documentation/powerpc/firmware-assisted-dump.rst b/Documentation/powerpc/firmware-assisted-dump.rst
index ab578ac64c09..08a9b5efcdc8 100644
--- a/Documentation/powerpc/firmware-assisted-dump.rst
+++ b/Documentation/powerpc/firmware-assisted-dump.rst
@@ -101,6 +101,15 @@ or later firmware versions on PowerNV (OPAL) platform.
 Note that, OPAL firmware exports ibm,opal/dump node when
 FADump is supported on PowerNV platform.
 
+On OPAL based machines, the system first boots into an intermediate
+kernel (referred to as the petitboot kernel) before booting into the
+capture kernel. This kernel has only minimal kernel and/or
+userspace support to process crash data. Such a kernel needs to
+preserve the previously crashed kernel's memory for the subsequent
+capture kernel boot to process this crash data. The kernel config
+option CONFIG_PRESERVE_FA_DUMP has to be enabled on such a kernel
+to ensure that the crash data is preserved for later processing.
+
 Implementation details:
 -----------------------
 

From 6f713d18144ce86c9f01cdf64222d6339e26129e Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.vnet.ibm.com>
Date: Wed, 11 Sep 2019 20:26:33 +0530
Subject: [PATCH 250/264] powerpc/opalcore: export /sys/firmware/opal/core for
 analysing opal crashes

Export the /sys/firmware/opal/core file to analyze OPAL crashes. Since
an OPAL core can be generated independently of CONFIG_FA_DUMP support
in the kernel, add this support under a new kernel config option,
CONFIG_OPAL_CORE. Also, avoid code duplication by moving the common
code used while exporting the /proc/vmcore and/or
/sys/firmware/opal/core file(s).

Signed-off-by: Hari Bathini <hbathini@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821378503.5656.3693769384945087756.stgit@hbathini.in.ibm.com
---
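A note on the ELF note stream built here: each note is an Elf64_Nhdr
followed by the name and the descriptor, each padded to a 4-byte
(Elf64_Word) boundary. Below is a minimal user-space sketch of the same
padding logic, for illustration only -- the in-kernel
append_elf64_note() additionally converts the header fields to
big-endian for the core:

	#include <elf.h>
	#include <stdint.h>
	#include <string.h>

	/* Advance in Elf64_Word units, rounding the byte length up. */
	static size_t word_pad(size_t len)
	{
		return (len + sizeof(Elf64_Word) - 1) / sizeof(Elf64_Word);
	}

	/* Append one ELF64 note at 'buf' and return the next free word. */
	Elf64_Word *append_note(Elf64_Word *buf, const char *name,
				uint32_t type, const void *desc, size_t len)
	{
		Elf64_Nhdr *note = (Elf64_Nhdr *)buf;
		size_t namesz = strlen(name) + 1;

		note->n_namesz = namesz;
		note->n_descsz = len;
		note->n_type   = type;
		buf += word_pad(sizeof(*note));
		memcpy(buf, name, namesz);
		buf += word_pad(namesz);
		memcpy(buf, desc, len);
		buf += word_pad(len);
		return buf;
	}

Once exported, the core can be examined with GDB against a skiboot
binary carrying symbols, along the lines of
'gdb ./skiboot /sys/firmware/opal/core' (paths illustrative).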
 arch/powerpc/Kconfig                         |   9 +
 arch/powerpc/platforms/powernv/Makefile      |   1 +
 arch/powerpc/platforms/powernv/opal-core.c   | 598 +++++++++++++++++++
 arch/powerpc/platforms/powernv/opal-fadump.c |  74 +--
 arch/powerpc/platforms/powernv/opal-fadump.h |  62 ++
 5 files changed, 687 insertions(+), 57 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/opal-core.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 62c483133953..eea6d5095f71 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -592,6 +592,15 @@ config PRESERVE_FA_DUMP
 	  memory preserving kernel boot would process this crash data.
 	  Petitboot kernel is the typical usecase for this option.
 
+config OPAL_CORE
+	bool "Export OPAL memory as /sys/firmware/opal/core"
+	depends on PPC64 && PPC_POWERNV
+	help
+	  This option uses the MPIPL support in firmware to provide an
+	  ELF core of OPAL memory after a crash. The ELF core is exported
+	  as the /sys/firmware/opal/core file, which is helpful in debugging
+	  OPAL crashes using GDB.
+
 config IRQ_ALL_CPUS
 	bool "Distribute interrupts on all CPUs by default"
 	depends on SMP
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 2d13e6462339..a3ac9646119d 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -9,6 +9,7 @@ obj-y			+= ultravisor.o
 obj-$(CONFIG_SMP)	+= smp.o subcore.o subcore-asm.o
 obj-$(CONFIG_FA_DUMP)	+= opal-fadump.o
 obj-$(CONFIG_PRESERVE_FA_DUMP)	+= opal-fadump.o
+obj-$(CONFIG_OPAL_CORE)	+= opal-core.o
 obj-$(CONFIG_PCI)	+= pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
 obj-$(CONFIG_CXL_BASE)	+= pci-cxl.o
 obj-$(CONFIG_EEH)	+= eeh-powernv.o
diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c
new file mode 100644
index 000000000000..6fa6b8a6d510
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-core.c
@@ -0,0 +1,598 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Interface for exporting the OPAL ELF core.
+ * Heavily inspired from fs/proc/vmcore.c
+ *
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "opal core: " fmt
+
+#include <linux/memblock.h>
+#include <linux/uaccess.h>
+#include <linux/proc_fs.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/slab.h>
+#include <linux/crash_core.h>
+#include <linux/of.h>
+
+#include <asm/page.h>
+#include <asm/opal.h>
+#include <asm/fadump-internal.h>
+
+#include "opal-fadump.h"
+
+#define MAX_PT_LOAD_CNT		8
+
+/* NT_AUXV note related info */
+#define AUXV_CNT		1
+#define AUXV_DESC_SZ		(((2 * AUXV_CNT) + 1) * sizeof(Elf64_Off))
+
+struct opalcore_config {
+	u32			num_cpus;
+	/* PIR value of crashing CPU */
+	u32			crashing_cpu;
+
+	/* CPU state data info from F/W */
+	u64			cpu_state_destination_vaddr;
+	u64			cpu_state_data_size;
+	u64			cpu_state_entry_size;
+
+	/* OPAL memory to be exported as PT_LOAD segments */
+	u64			ptload_addr[MAX_PT_LOAD_CNT];
+	u64			ptload_size[MAX_PT_LOAD_CNT];
+	u64			ptload_cnt;
+
+	/* Pointer to the first PT_LOAD in the ELF core file */
+	Elf64_Phdr		*ptload_phdr;
+
+	/* Total size of opalcore file. */
+	size_t			opalcore_size;
+
+	/* Buffer for all the ELF core headers and the PT_NOTE */
+	size_t			opalcorebuf_sz;
+	char			*opalcorebuf;
+
+	/* NT_AUXV buffer */
+	char			auxv_buf[AUXV_DESC_SZ];
+};
+
+struct opalcore {
+	struct list_head	list;
+	u64			paddr;
+	size_t			size;
+	loff_t			offset;
+};
+
+static LIST_HEAD(opalcore_list);
+static struct opalcore_config *oc_conf;
+static const struct opal_mpipl_fadump *opalc_metadata;
+static const struct opal_mpipl_fadump *opalc_cpu_metadata;
+
+/*
+ * Set the crashing CPU's signal to SIGUSR1 if the crash was triggered
+ * by the kernel, SIGTERM otherwise.
+ */
+bool kernel_initiated;
+
+static struct opalcore * __init get_new_element(void)
+{
+	return kzalloc(sizeof(struct opalcore), GFP_KERNEL);
+}
+
+static inline int is_opalcore_usable(void)
+{
+	return (oc_conf && oc_conf->opalcorebuf != NULL) ? 1 : 0;
+}
+
+static Elf64_Word *append_elf64_note(Elf64_Word *buf, char *name,
+				     u32 type, void *data,
+				     size_t data_len)
+{
+	Elf64_Nhdr *note = (Elf64_Nhdr *)buf;
+	Elf64_Word namesz = strlen(name) + 1;
+
+	note->n_namesz = cpu_to_be32(namesz);
+	note->n_descsz = cpu_to_be32(data_len);
+	note->n_type   = cpu_to_be32(type);
+	buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf64_Word));
+	memcpy(buf, name, namesz);
+	buf += DIV_ROUND_UP(namesz, sizeof(Elf64_Word));
+	memcpy(buf, data, data_len);
+	buf += DIV_ROUND_UP(data_len, sizeof(Elf64_Word));
+
+	return buf;
+}
+
+static void fill_prstatus(struct elf_prstatus *prstatus, int pir,
+			  struct pt_regs *regs)
+{
+	memset(prstatus, 0, sizeof(struct elf_prstatus));
+	elf_core_copy_kernel_regs(&(prstatus->pr_reg), regs);
+
+	/*
+	 * Overload PID with PIR value.
+	 * As a PIR value could also be '0', add an offset of '100'
+	 * to every PIR to avoid misinterpretations in GDB.
+	 */
+	prstatus->pr_pid  = cpu_to_be32(100 + pir);
+	prstatus->pr_ppid = cpu_to_be32(1);
+
+	/*
+	 * Indicate SIGUSR1 for crash initiated from kernel.
+	 * SIGTERM otherwise.
+	 */
+	if (pir == oc_conf->crashing_cpu) {
+		short sig;
+
+		sig = kernel_initiated ? SIGUSR1 : SIGTERM;
+		prstatus->pr_cursig = cpu_to_be16(sig);
+	}
+}
+
+static Elf64_Word *auxv_to_elf64_notes(Elf64_Word *buf,
+				       u64 opal_boot_entry)
+{
+	Elf64_Off *bufp = (Elf64_Off *)oc_conf->auxv_buf;
+	int idx = 0;
+
+	memset(bufp, 0, AUXV_DESC_SZ);
+
+	/* Entry point of OPAL */
+	bufp[idx++] = cpu_to_be64(AT_ENTRY);
+	bufp[idx++] = cpu_to_be64(opal_boot_entry);
+
+	/* end of vector */
+	bufp[idx++] = cpu_to_be64(AT_NULL);
+
+	buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_AUXV,
+				oc_conf->auxv_buf, AUXV_DESC_SZ);
+	return buf;
+}
+
+/*
+ * Read from the ELF header and then the crash dump.
+ * Returns number of bytes read on success, -errno on failure.
+ */
+static ssize_t read_opalcore(struct file *file, struct kobject *kobj,
+			     struct bin_attribute *bin_attr, char *to,
+			     loff_t pos, size_t count)
+{
+	struct opalcore *m;
+	ssize_t tsz, avail;
+	loff_t tpos = pos;
+
+	if (pos >= oc_conf->opalcore_size)
+		return 0;
+
+	/* Adjust count if it goes beyond opalcore size */
+	avail = oc_conf->opalcore_size - pos;
+	if (count > avail)
+		count = avail;
+
+	if (count == 0)
+		return 0;
+
+	/* Read ELF core header and/or PT_NOTE segment */
+	if (tpos < oc_conf->opalcorebuf_sz) {
+		tsz = min_t(size_t, oc_conf->opalcorebuf_sz - tpos, count);
+		memcpy(to, oc_conf->opalcorebuf + tpos, tsz);
+		to += tsz;
+		tpos += tsz;
+		count -= tsz;
+	}
+
+	list_for_each_entry(m, &opalcore_list, list) {
+		/* nothing more to read here */
+		if (count == 0)
+			break;
+
+		if (tpos < m->offset + m->size) {
+			void *addr;
+
+			tsz = min_t(size_t, m->offset + m->size - tpos, count);
+			addr = (void *)(m->paddr + tpos - m->offset);
+			memcpy(to, __va(addr), tsz);
+			to += tsz;
+			tpos += tsz;
+			count -= tsz;
+		}
+	}
+
+	return (tpos - pos);
+}
+
+static struct bin_attribute opal_core_attr = {
+	.attr = {.name = "core", .mode = 0400},
+	.read = read_opalcore
+};
+
+/*
+ * Read CPU state dump data and convert it into ELF notes.
+ *
+ * Each register entry is 16 bytes: a numerical identifier along with
+ * a GPR/SPR flag in the first 8 bytes, and the register value in the
+ * next 8 bytes. For more details refer to the F/W documentation.
+ */
+static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf)
+{
+	u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize;
+	struct hdat_fadump_thread_hdr *thdr;
+	struct elf_prstatus prstatus;
+	Elf64_Word *first_cpu_note;
+	struct pt_regs regs;
+	char *bufp;
+	int i;
+
+	size_per_thread = oc_conf->cpu_state_entry_size;
+	bufp = __va(oc_conf->cpu_state_destination_vaddr);
+
+	/*
+	 * The offset for register entries, the entry size and the register
+	 * count are duplicated in every thread header, in keeping with the
+	 * HDAT format. Use these values from the first thread header.
+	 */
+	thdr = (struct hdat_fadump_thread_hdr *)bufp;
+	regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) +
+		       be32_to_cpu(thdr->offset));
+	reg_esize = be32_to_cpu(thdr->esize);
+	regs_cnt  = be32_to_cpu(thdr->ecnt);
+
+	pr_debug("--------CPU State Data------------\n");
+	pr_debug("NumCpus     : %u\n", oc_conf->num_cpus);
+	pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n",
+		 regs_offset, reg_esize, regs_cnt);
+
+	/*
+	 * Write a placeholder first CPU note and skip past it; it is
+	 * filled with the crashing CPU's prstatus once that is known.
+	 */
+	first_cpu_note = buf;
+	buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS,
+				&prstatus, sizeof(prstatus));
+
+	for (i = 0; i < oc_conf->num_cpus; i++, bufp += size_per_thread) {
+		thdr = (struct hdat_fadump_thread_hdr *)bufp;
+		thread_pir = be32_to_cpu(thdr->pir);
+
+		pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n",
+			 i, thread_pir, thdr->core_state);
+
+		/*
+		 * Register state data of MAX cores is provided by firmware,
+		 * but some of these cores may not be active. So, while
+		 * processing register state data, check core state and
+		 * skip threads that belong to inactive cores.
+		 */
+		if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE)
+			continue;
+
+		opal_fadump_read_regs((bufp + regs_offset), regs_cnt,
+				      reg_esize, false, &regs);
+
+		pr_debug("PIR 0x%x - R1 : 0x%llx, NIP : 0x%llx\n", thread_pir,
+			 be64_to_cpu(regs.gpr[1]), be64_to_cpu(regs.nip));
+		fill_prstatus(&prstatus, thread_pir, &regs);
+
+		if (thread_pir != oc_conf->crashing_cpu) {
+			buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME,
+						NT_PRSTATUS, &prstatus,
+						sizeof(prstatus));
+		} else {
+			/*
+			 * Add crashing CPU as the first NT_PRSTATUS note for
+			 * GDB to process the core file appropriately.
+			 */
+			append_elf64_note(first_cpu_note, CRASH_CORE_NOTE_NAME,
+					  NT_PRSTATUS, &prstatus,
+					  sizeof(prstatus));
+		}
+	}
+
+	return buf;
+}
+
+static int __init create_opalcore(void)
+{
+	u64 opal_boot_entry, opal_base_addr, paddr;
+	u32 hdr_size, cpu_notes_size, count;
+	struct device_node *dn;
+	struct opalcore *new;
+	loff_t opalcore_off;
+	struct page *page;
+	Elf64_Phdr *phdr;
+	Elf64_Ehdr *elf;
+	int i, ret;
+	char *bufp;
+
+	/* Get size of header & CPU notes for OPAL core */
+	hdr_size = (sizeof(Elf64_Ehdr) +
+		    ((oc_conf->ptload_cnt + 1) * sizeof(Elf64_Phdr)));
+	cpu_notes_size = ((oc_conf->num_cpus * (CRASH_CORE_NOTE_HEAD_BYTES +
+			  CRASH_CORE_NOTE_NAME_BYTES +
+			  CRASH_CORE_NOTE_DESC_BYTES)) +
+			  (CRASH_CORE_NOTE_HEAD_BYTES +
+			  CRASH_CORE_NOTE_NAME_BYTES + AUXV_DESC_SZ));
+
+	/* Allocate buffer to setup OPAL core */
+	oc_conf->opalcorebuf_sz = PAGE_ALIGN(hdr_size + cpu_notes_size);
+	oc_conf->opalcorebuf = alloc_pages_exact(oc_conf->opalcorebuf_sz,
+						 GFP_KERNEL | __GFP_ZERO);
+	if (!oc_conf->opalcorebuf) {
+		pr_err("Not enough memory to set up OPAL core (size: %lu)\n",
+		       oc_conf->opalcorebuf_sz);
+		oc_conf->opalcorebuf_sz = 0;
+		return -ENOMEM;
+	}
+	count = oc_conf->opalcorebuf_sz / PAGE_SIZE;
+	page = virt_to_page(oc_conf->opalcorebuf);
+	for (i = 0; i < count; i++)
+		mark_page_reserved(page + i);
+
+	pr_debug("opalcorebuf = 0x%llx\n", (u64)oc_conf->opalcorebuf);
+
+	/* Read OPAL related device-tree entries */
+	dn = of_find_node_by_name(NULL, "ibm,opal");
+	if (dn) {
+		ret = of_property_read_u64(dn, "opal-base-address",
+					   &opal_base_addr);
+		pr_debug("opal-base-address: %llx\n", opal_base_addr);
+		ret |= of_property_read_u64(dn, "opal-boot-address",
+					    &opal_boot_entry);
+		pr_debug("opal-boot-address: %llx\n", opal_boot_entry);
+	}
+	if (!dn || ret)
+		pr_warn("WARNING: Failed to read OPAL base & entry values\n");
+
+	/* Use count to keep track of the program headers */
+	count = 0;
+
+	bufp = oc_conf->opalcorebuf;
+	elf = (Elf64_Ehdr *)bufp;
+	bufp += sizeof(Elf64_Ehdr);
+	memcpy(elf->e_ident, ELFMAG, SELFMAG);
+	elf->e_ident[EI_CLASS] = ELF_CLASS;
+	elf->e_ident[EI_DATA] = ELFDATA2MSB;
+	elf->e_ident[EI_VERSION] = EV_CURRENT;
+	elf->e_ident[EI_OSABI] = ELF_OSABI;
+	memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
+	elf->e_type = cpu_to_be16(ET_CORE);
+	elf->e_machine = cpu_to_be16(ELF_ARCH);
+	elf->e_version = cpu_to_be32(EV_CURRENT);
+	elf->e_entry = 0;
+	elf->e_phoff = cpu_to_be64(sizeof(Elf64_Ehdr));
+	elf->e_shoff = 0;
+	elf->e_flags = 0;
+
+	elf->e_ehsize = cpu_to_be16(sizeof(Elf64_Ehdr));
+	elf->e_phentsize = cpu_to_be16(sizeof(Elf64_Phdr));
+	elf->e_phnum = 0;
+	elf->e_shentsize = 0;
+	elf->e_shnum = 0;
+	elf->e_shstrndx = 0;
+
+	phdr = (Elf64_Phdr *)bufp;
+	bufp += sizeof(Elf64_Phdr);
+	phdr->p_type	= cpu_to_be32(PT_NOTE);
+	phdr->p_flags	= 0;
+	phdr->p_align	= 0;
+	phdr->p_paddr	= phdr->p_vaddr = 0;
+	phdr->p_offset	= cpu_to_be64(hdr_size);
+	phdr->p_filesz	= phdr->p_memsz = cpu_to_be64(cpu_notes_size);
+	count++;
+
+	opalcore_off = oc_conf->opalcorebuf_sz;
+	oc_conf->ptload_phdr  = (Elf64_Phdr *)bufp;
+	paddr = 0;
+	for (i = 0; i < oc_conf->ptload_cnt; i++) {
+		phdr = (Elf64_Phdr *)bufp;
+		bufp += sizeof(Elf64_Phdr);
+		phdr->p_type	= cpu_to_be32(PT_LOAD);
+		phdr->p_flags	= cpu_to_be32(PF_R|PF_W|PF_X);
+		phdr->p_align	= 0;
+
+		new = get_new_element();
+		if (!new)
+			return -ENOMEM;
+		new->paddr  = oc_conf->ptload_addr[i];
+		new->size   = oc_conf->ptload_size[i];
+		new->offset = opalcore_off;
+		list_add_tail(&new->list, &opalcore_list);
+
+		phdr->p_paddr	= cpu_to_be64(paddr);
+		phdr->p_vaddr	= cpu_to_be64(opal_base_addr + paddr);
+		phdr->p_filesz	= phdr->p_memsz  =
+			cpu_to_be64(oc_conf->ptload_size[i]);
+		phdr->p_offset	= cpu_to_be64(opalcore_off);
+
+		count++;
+		opalcore_off += oc_conf->ptload_size[i];
+		paddr += oc_conf->ptload_size[i];
+	}
+
+	elf->e_phnum = cpu_to_be16(count);
+
+	bufp = (char *)opalcore_append_cpu_notes((Elf64_Word *)bufp);
+	bufp = (char *)auxv_to_elf64_notes((Elf64_Word *)bufp, opal_boot_entry);
+
+	oc_conf->opalcore_size = opalcore_off;
+	return 0;
+}
+
+static void opalcore_cleanup(void)
+{
+	if (oc_conf == NULL)
+		return;
+
+	/* Remove OPAL core sysfs file */
+	sysfs_remove_bin_file(opal_kobj, &opal_core_attr);
+	oc_conf->ptload_phdr = NULL;
+	oc_conf->ptload_cnt = 0;
+
+	/* free the buffer used for setting up OPAL core */
+	if (oc_conf->opalcorebuf) {
+		void *end = (void *)((u64)oc_conf->opalcorebuf +
+				     oc_conf->opalcorebuf_sz);
+
+		free_reserved_area(oc_conf->opalcorebuf, end, -1, NULL);
+		oc_conf->opalcorebuf = NULL;
+		oc_conf->opalcorebuf_sz = 0;
+	}
+
+	kfree(oc_conf);
+	oc_conf = NULL;
+}
+__exitcall(opalcore_cleanup);
+
+static void __init opalcore_config_init(void)
+{
+	u32 idx, cpu_data_version;
+	struct device_node *np;
+	const __be32 *prop;
+	u64 addr = 0;
+	int i, ret;
+
+	np = of_find_node_by_path("/ibm,opal/dump");
+	if (np == NULL)
+		return;
+
+	if (!of_device_is_compatible(np, "ibm,opal-dump")) {
+		pr_warn("Support missing for this f/w version!\n");
+		return;
+	}
+
+	/* Check if dump has been initiated on last reboot */
+	prop = of_get_property(np, "mpipl-boot", NULL);
+	if (!prop) {
+		of_node_put(np);
+		return;
+	}
+
+	/* Get OPAL metadata */
+	ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_OPAL, &addr);
+	if ((ret != OPAL_SUCCESS) || !addr) {
+		pr_err("Failed to get OPAL metadata (%d)\n", ret);
+		goto error_out;
+	}
+
+	addr = be64_to_cpu(addr);
+	pr_debug("OPAL metadata addr: %llx\n", addr);
+	opalc_metadata = __va(addr);
+
+	/* Get OPAL CPU metadata */
+	ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr);
+	if ((ret != OPAL_SUCCESS) || !addr) {
+		pr_err("Failed to get OPAL CPU metadata (%d)\n", ret);
+		goto error_out;
+	}
+
+	addr = be64_to_cpu(addr);
+	pr_debug("CPU metadata addr: %llx\n", addr);
+	opalc_cpu_metadata = __va(addr);
+
+	/* Allocate memory for config buffer */
+	oc_conf = kzalloc(sizeof(struct opalcore_config), GFP_KERNEL);
+	if (oc_conf == NULL)
+		goto error_out;
+
+	/* Parse OPAL metadata */
+	if (opalc_metadata->version != OPAL_MPIPL_VERSION) {
+		pr_warn("Supported OPAL metadata version: %u, found: %u!\n",
+			OPAL_MPIPL_VERSION, opalc_metadata->version);
+		pr_warn("WARNING: F/W using newer OPAL metadata format!!\n");
+	}
+
+	oc_conf->ptload_cnt = 0;
+	idx = be32_to_cpu(opalc_metadata->region_cnt);
+	if (idx > MAX_PT_LOAD_CNT) {
+		pr_warn("WARNING: OPAL regions count (%d) adjusted to limit (%d)\n",
+			idx, MAX_PT_LOAD_CNT);
+		idx = MAX_PT_LOAD_CNT;
+	}
+	for (i = 0; i < idx; i++) {
+		oc_conf->ptload_addr[oc_conf->ptload_cnt] =
+				be64_to_cpu(opalc_metadata->region[i].dest);
+		oc_conf->ptload_size[oc_conf->ptload_cnt++] =
+				be64_to_cpu(opalc_metadata->region[i].size);
+	}
+	oc_conf->ptload_cnt = i;
+	oc_conf->crashing_cpu = be32_to_cpu(opalc_metadata->crashing_pir);
+
+	if (!oc_conf->ptload_cnt) {
+		pr_err("OPAL memory regions not found\n");
+		goto error_out;
+	}
+
+	/* Parse OPAL CPU metadata */
+	cpu_data_version = be32_to_cpu(opalc_cpu_metadata->cpu_data_version);
+	if (cpu_data_version != HDAT_FADUMP_CPU_DATA_VER) {
+		pr_warn("Supported CPU data version: %u, found: %u!\n",
+			HDAT_FADUMP_CPU_DATA_VER, cpu_data_version);
+		pr_warn("WARNING: F/W using newer CPU state data format!!\n");
+	}
+
+	addr = be64_to_cpu(opalc_cpu_metadata->region[0].dest);
+	if (!addr) {
+		pr_err("CPU state data not found!\n");
+		goto error_out;
+	}
+	oc_conf->cpu_state_destination_vaddr = (u64)__va(addr);
+
+	oc_conf->cpu_state_data_size =
+			be64_to_cpu(opalc_cpu_metadata->region[0].size);
+	oc_conf->cpu_state_entry_size =
+			be32_to_cpu(opalc_cpu_metadata->cpu_data_size);
+
+	if ((oc_conf->cpu_state_entry_size == 0) ||
+	    (oc_conf->cpu_state_entry_size > oc_conf->cpu_state_data_size)) {
+		pr_err("CPU state data is invalid.\n");
+		goto error_out;
+	}
+	oc_conf->num_cpus = (oc_conf->cpu_state_data_size /
+			     oc_conf->cpu_state_entry_size);
+
+	of_node_put(np);
+	return;
+
+error_out:
+	pr_err("Could not export /sys/firmware/opal/core\n");
+	opalcore_cleanup();
+	of_node_put(np);
+}
+
+static int __init opalcore_init(void)
+{
+	int rc = -1;
+
+	opalcore_config_init();
+
+	if (oc_conf == NULL)
+		return rc;
+
+	create_opalcore();
+
+	/*
+	 * Bail out if the core buffer could not be set up, as there
+	 * is nothing to export in that case.
+	 */
+	if (!(is_opalcore_usable())) {
+		pr_err("Failed to export /sys/firmware/opal/core\n");
+		opalcore_cleanup();
+		return rc;
+	}
+
+	/* Set OPAL core file size */
+	opal_core_attr.size = oc_conf->opalcore_size;
+
+	/* Export OPAL core sysfs file */
+	rc = sysfs_create_bin_file(opal_kobj, &opal_core_attr);
+	if (rc != 0) {
+		pr_err("Failed to export /sys/firmware/opal/core\n");
+		opalcore_cleanup();
+		return rc;
+	}
+
+	return 0;
+}
+fs_initcall(opalcore_init);
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
index c66fa96915ae..a89d3a0cccae 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -85,6 +85,10 @@ static const struct opal_fadump_mem_struct *opal_fdm_active;
 static const struct opal_mpipl_fadump *opal_cpu_metadata;
 static struct opal_fadump_mem_struct *opal_fdm;
 
+#ifdef CONFIG_OPAL_CORE
+extern bool kernel_initiated;
+#endif
+
 static int opal_fadump_unregister(struct fw_dump *fadump_conf);
 
 static void opal_fadump_update_config(struct fw_dump *fadump_conf,
@@ -349,62 +353,6 @@ static void opal_fadump_cleanup(struct fw_dump *fadump_conf)
 		pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret);
 }
 
-static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs,
-						 u32 reg_type, u32 reg_num,
-						 u64 reg_val)
-{
-	if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) {
-		if (reg_num < 32)
-			regs->gpr[reg_num] = reg_val;
-		return;
-	}
-
-	switch (reg_num) {
-	case SPRN_CTR:
-		regs->ctr = reg_val;
-		break;
-	case SPRN_LR:
-		regs->link = reg_val;
-		break;
-	case SPRN_XER:
-		regs->xer = reg_val;
-		break;
-	case SPRN_DAR:
-		regs->dar = reg_val;
-		break;
-	case SPRN_DSISR:
-		regs->dsisr = reg_val;
-		break;
-	case HDAT_FADUMP_REG_ID_NIP:
-		regs->nip = reg_val;
-		break;
-	case HDAT_FADUMP_REG_ID_MSR:
-		regs->msr = reg_val;
-		break;
-	case HDAT_FADUMP_REG_ID_CCR:
-		regs->ccr = reg_val;
-		break;
-	}
-}
-
-static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt,
-					 unsigned int reg_entry_size,
-					 struct pt_regs *regs)
-{
-	struct hdat_fadump_reg_entry *reg_entry;
-	int i;
-
-	memset(regs, 0, sizeof(struct pt_regs));
-
-	for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) {
-		reg_entry = (struct hdat_fadump_reg_entry *)bufp;
-		opal_fadump_set_regval_regnum(regs,
-					      be32_to_cpu(reg_entry->reg_type),
-					      be32_to_cpu(reg_entry->reg_num),
-					      be64_to_cpu(reg_entry->reg_val));
-	}
-}
-
 /*
  * Verify if CPU state data is available. If available, do a bit of sanity
  * checking before processing this data.
@@ -529,7 +477,7 @@ opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf,
 			continue;
 
 		opal_fadump_read_regs((bufp + regs_offset), regs_cnt,
-				      reg_esize, &regs);
+				      reg_esize, true, &regs);
 		note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
 		pr_debug("CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n",
 			 thread_pir, regs.gpr[1], regs.nip);
@@ -573,6 +521,18 @@ static int __init opal_fadump_process(struct fw_dump *fadump_conf)
 		return rc;
 	}
 
+#ifdef CONFIG_OPAL_CORE
+	/*
+	 * If this is a kernel initiated crash, crashing_cpu would be set
+	 * appropriately and the register data of the crashing CPU saved
+	 * by the crashing kernel. Add this saved register data to the
+	 * ELF notes, and populate the pt_regs for the remaining CPUs
+	 * from the register state data provided by firmware.
+	 */
+	if (fdh->crashing_cpu != FADUMP_CPU_UNKNOWN)
+		kernel_initiated = true;
+#endif
+
 	rc = opal_fadump_build_cpu_notes(fadump_conf, fdh);
 	if (rc)
 		return rc;
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h
index c0b5d6e42ba6..d0e17d6a2b5c 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.h
+++ b/arch/powerpc/platforms/powernv/opal-fadump.h
@@ -8,6 +8,8 @@
 #ifndef _POWERNV_OPAL_FADUMP_H
 #define _POWERNV_OPAL_FADUMP_H
 
+#include <asm/reg.h>
+
 /*
  * OPAL FADump metadata structure format version
  *
@@ -77,4 +79,64 @@ struct hdat_fadump_reg_entry {
 	__be64		reg_val;
 } __packed;
 
+static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs,
+						 u32 reg_type, u32 reg_num,
+						 u64 reg_val)
+{
+	if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) {
+		if (reg_num < 32)
+			regs->gpr[reg_num] = reg_val;
+		return;
+	}
+
+	switch (reg_num) {
+	case SPRN_CTR:
+		regs->ctr = reg_val;
+		break;
+	case SPRN_LR:
+		regs->link = reg_val;
+		break;
+	case SPRN_XER:
+		regs->xer = reg_val;
+		break;
+	case SPRN_DAR:
+		regs->dar = reg_val;
+		break;
+	case SPRN_DSISR:
+		regs->dsisr = reg_val;
+		break;
+	case HDAT_FADUMP_REG_ID_NIP:
+		regs->nip = reg_val;
+		break;
+	case HDAT_FADUMP_REG_ID_MSR:
+		regs->msr = reg_val;
+		break;
+	case HDAT_FADUMP_REG_ID_CCR:
+		regs->ccr = reg_val;
+		break;
+	}
+}
+
+static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt,
+					 unsigned int reg_entry_size,
+					 bool cpu_endian,
+					 struct pt_regs *regs)
+{
+	struct hdat_fadump_reg_entry *reg_entry;
+	u64 val;
+	int i;
+
+	memset(regs, 0, sizeof(struct pt_regs));
+
+	for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) {
+		reg_entry = (struct hdat_fadump_reg_entry *)bufp;
+		val = (cpu_endian ? be64_to_cpu(reg_entry->reg_val) :
+		       reg_entry->reg_val);
+		opal_fadump_set_regval_regnum(regs,
+					      be32_to_cpu(reg_entry->reg_type),
+					      be32_to_cpu(reg_entry->reg_num),
+					      val);
+	}
+}
+
 #endif /* _POWERNV_OPAL_FADUMP_H */

From 845426f3f3443c3f60737751b7af0142e3020f08 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:26:45 +0530
Subject: [PATCH 251/264] powerpc/opalcore: provide an option to invalidate
 /sys/firmware/opal/core file

Writing '1' to /sys/kernel/fadump_release_opalcore releases the memory
held by the kernel to export the /sys/firmware/opal/core file.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821380161.5656.17827032108471421830.stgit@hbathini.in.ibm.com
---
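For reference, releasing the memory needs nothing more than writing
'1' to the new node; a minimal user-space sketch, equivalent to
'echo 1 > /sys/kernel/fadump_release_opalcore':

	#include <stdio.h>

	int main(void)
	{
		/* Writing '1' tears down /sys/firmware/opal/core and
		 * frees the memory backing it. */
		FILE *f = fopen("/sys/kernel/fadump_release_opalcore", "w");

		if (!f) {
			perror("fadump_release_opalcore");
			return 1;
		}
		fputs("1\n", f);
		return fclose(f) ? 1 : 0;
	}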
 arch/powerpc/platforms/powernv/opal-core.c | 38 ++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c
index 6fa6b8a6d510..ed895d82c048 100644
--- a/arch/powerpc/platforms/powernv/opal-core.c
+++ b/arch/powerpc/platforms/powernv/opal-core.c
@@ -13,6 +13,8 @@
 #include <linux/proc_fs.h>
 #include <linux/elf.h>
 #include <linux/elfcore.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
 #include <linux/slab.h>
 #include <linux/crash_core.h>
 #include <linux/of.h>
@@ -561,6 +563,36 @@ error_out:
 	of_node_put(np);
 }
 
+static ssize_t fadump_release_opalcore_store(struct kobject *kobj,
+					     struct kobj_attribute *attr,
+					     const char *buf, size_t count)
+{
+	int input = -1;
+
+	if (kstrtoint(buf, 0, &input))
+		return -EINVAL;
+
+	if (input == 1) {
+		if (oc_conf == NULL) {
+			pr_err("'/sys/firmware/opal/core' file not accessible!\n");
+			return -EPERM;
+		}
+
+		/*
+		 * Take away '/sys/firmware/opal/core' and release all memory
+		 * used for exporting this file.
+		 */
+		opalcore_cleanup();
+	} else
+		return -EINVAL;
+
+	return count;
+}
+
+static struct kobj_attribute opalcore_rel_attr = __ATTR(fadump_release_opalcore,
+						0200, NULL,
+						fadump_release_opalcore_store);
+
 static int __init opalcore_init(void)
 {
 	int rc = -1;
@@ -593,6 +625,12 @@ static int __init opalcore_init(void)
 		return rc;
 	}
 
+	rc = sysfs_create_file(kernel_kobj, &opalcore_rel_attr.attr);
+	if (rc) {
+		pr_warn("unable to create sysfs file fadump_release_opalcore (%d)\n",
+			rc);
+	}
+
 	return 0;
 }
 fs_initcall(opalcore_init);

From 7b1b3b48250acbfd7f15ba950d4654b7f02a8300 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:26:59 +0530
Subject: [PATCH 252/264] powerpc/fadump: consider f/w load area

OPAL loads kernel & initrd at 512MB offset (256MB size), also exported
as ibm,opal/dump/fw-load-area. So, if boot memory size of FADump is
less than 768MB, kernel memory to be exported as '/proc/vmcore' would
be overwritten by f/w while loading kernel & initrd. To avoid such a
scenario, enforce a minimum boot memory size of 768MB on OPAL platform
and skip using FADump if a newer F/W version loads kernel & initrd
above 768MB.

Also, irrespective of RMA size, set the minimum boot memory size
expected on pseries platform at 320MB. This is to avoid inflating the
minimum memory requirements on systems with 512M/1024M RMA size.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821381414.5656.1592867278535469652.stgit@hbathini.in.ibm.com
---
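The 768MB figure is simply the end of the f/w load area described
above; a quick sketch of the arithmetic behind the new
OPAL_FADUMP_MIN_BOOT_MEM constant, with the values taken from this
description:

	#include <assert.h>

	int main(void)
	{
		unsigned long load_base = 512UL << 20; /* kernel & initrd offset */
		unsigned long load_size = 256UL << 20; /* f/w load area size */

		/* Boot memory must cover the whole f/w load area, hence
		 * the 768MB (0x30000000) minimum on the OPAL platform. */
		assert(load_base + load_size == 0x30000000UL);
		return 0;
	}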
 arch/powerpc/include/asm/fadump-internal.h   | 11 +------
 arch/powerpc/kernel/fadump.c                 | 16 ++++++++---
 arch/powerpc/platforms/powernv/opal-fadump.c | 30 +++++++++++++++++++-
 arch/powerpc/platforms/powernv/opal-fadump.h |  7 +++++
 arch/powerpc/platforms/pseries/rtas-fadump.c |  6 ++++
 arch/powerpc/platforms/pseries/rtas-fadump.h |  9 ++++++
 6 files changed, 64 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h
index 6c2868d38bd3..a669aaa7c610 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -19,16 +19,6 @@
 #define RMA_START	0x0
 #define RMA_END		(ppc64_rma_size)
 
-/*
- * On some Power systems where RMO is 128MB, it still requires minimum of
- * 256MB for kernel to boot successfully. When kdump infrastructure is
- * configured to save vmcore over network, we run into OOM issue while
- * loading modules related to network setup. Hence we need additional 64M
- * of memory to avoid OOM issue.
- */
-#define MIN_BOOT_MEM	(((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \
-			+ (0x1UL << 26))
-
 /* The upper limit percentage for user specified boot memory size (25%) */
 #define MAX_BOOT_MEM_RATIO			4
 
@@ -128,6 +118,7 @@ struct fadump_ops {
 	u64	(*fadump_init_mem_struct)(struct fw_dump *fadump_conf);
 	u64	(*fadump_get_metadata_size)(void);
 	int	(*fadump_setup_metadata)(struct fw_dump *fadump_conf);
+	u64	(*fadump_get_bootmem_min)(void);
 	int	(*fadump_register)(struct fw_dump *fadump_conf);
 	int	(*fadump_unregister)(struct fw_dump *fadump_conf);
 	int	(*fadump_invalidate)(struct fw_dump *fadump_conf);
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 645d9d4d9332..bd49b1f200bf 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -240,10 +240,10 @@ static void fadump_show_config(void)
  * that is required for a kernel to boot successfully.
  *
  */
-static inline unsigned long fadump_calculate_reserve_size(void)
+static inline u64 fadump_calculate_reserve_size(void)
 {
+	u64 base, size, bootmem_min;
 	int ret;
-	unsigned long long base, size;
 
 	if (fw_dump.reserve_bootvar)
 		pr_warn("'fadump_reserve_mem=' parameter is deprecated in favor of 'crashkernel=' parameter.\n");
@@ -293,7 +293,8 @@ static inline unsigned long fadump_calculate_reserve_size(void)
 	if (memory_limit && size > memory_limit)
 		size = memory_limit;
 
-	return (size > MIN_BOOT_MEM ? size : MIN_BOOT_MEM);
+	bootmem_min = fw_dump.ops->fadump_get_bootmem_min();
+	return (size > bootmem_min ? size : bootmem_min);
 }
 
 /*
@@ -323,8 +324,8 @@ static unsigned long get_fadump_area_size(void)
 
 int __init fadump_reserve_mem(void)
 {
+	u64 base, size, mem_boundary, bootmem_min, align = PAGE_SIZE;
 	bool is_memblock_bottom_up = memblock_bottom_up();
-	u64 base, size, mem_boundary, align = PAGE_SIZE;
 	int ret = 1;
 
 	if (!fw_dump.fadump_enabled)
@@ -350,6 +351,13 @@ int __init fadump_reserve_mem(void)
 				ALIGN(fw_dump.boot_memory_size, align);
 		}
 #endif
+
+		bootmem_min = fw_dump.ops->fadump_get_bootmem_min();
+		if (fw_dump.boot_memory_size < bootmem_min) {
+			pr_err("Can't enable fadump with boot memory size (0x%lx) less than 0x%llx\n",
+			       fw_dump.boot_memory_size, bootmem_min);
+			goto error_out;
+		}
 	}
 
 	/*
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
index a89d3a0cccae..006648e4d5e6 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -9,6 +9,7 @@
 
 #include <linux/string.h>
 #include <linux/seq_file.h>
+#include <linux/of.h>
 #include <linux/of_fdt.h>
 #include <linux/libfdt.h>
 #include <linux/mm.h>
@@ -262,6 +263,11 @@ static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf)
 	return err;
 }
 
+static u64 opal_fadump_get_bootmem_min(void)
+{
+	return OPAL_FADUMP_MIN_BOOT_MEM;
+}
+
 static int opal_fadump_register(struct fw_dump *fadump_conf)
 {
 	s64 rc = OPAL_PARAMETER;
@@ -606,6 +612,7 @@ static struct fadump_ops opal_fadump_ops = {
 	.fadump_init_mem_struct		= opal_fadump_init_mem_struct,
 	.fadump_get_metadata_size	= opal_fadump_get_metadata_size,
 	.fadump_setup_metadata		= opal_fadump_setup_metadata,
+	.fadump_get_bootmem_min		= opal_fadump_get_bootmem_min,
 	.fadump_register		= opal_fadump_register,
 	.fadump_unregister		= opal_fadump_unregister,
 	.fadump_invalidate		= opal_fadump_invalidate,
@@ -620,9 +627,9 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
 	const __be32 *prop;
 	unsigned long dn;
 	u64 addr = 0;
+	int i, len;
 	s64 ret;
 
-
 	/*
 	 * Check if Firmware-Assisted Dump is supported. if yes, check
 	 * if dump has been initiated on last reboot.
@@ -638,6 +645,27 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
 		return;
 	}
 
+	prop = of_get_flat_dt_prop(dn, "fw-load-area", &len);
+	if (prop) {
+		/*
+		 * Each f/w load area is an (address,size) pair,
+		 * 2 cells each, totalling 4 cells per range.
+		 */
+		for (i = 0; i < len / (sizeof(*prop) * 4); i++) {
+			u64 base, end;
+
+			base = of_read_number(prop + (i * 4) + 0, 2);
+			end = base;
+			end += of_read_number(prop + (i * 4) + 2, 2);
+			if (end > OPAL_FADUMP_MIN_BOOT_MEM) {
+				pr_err("F/W load area: 0x%llx-0x%llx\n",
+				       base, end);
+				pr_err("F/W version not supported!\n");
+				return;
+			}
+		}
+	}
+
 	fadump_conf->ops		= &opal_fadump_ops;
 	fadump_conf->fadump_supported	= 1;
 
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h
index d0e17d6a2b5c..e630cb0f108f 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.h
+++ b/arch/powerpc/platforms/powernv/opal-fadump.h
@@ -10,6 +10,13 @@
 
 #include <asm/reg.h>
 
+/*
+ * With the kernel & initrd loaded at 512MB (with 256MB size), enforce a
+ * minimum boot memory size of 768MB so that f/w loading the kernel and
+ * initrd doesn't step on the crashed kernel's memory during MPIPL.
+ */
+#define OPAL_FADUMP_MIN_BOOT_MEM		(0x30000000UL)
+
 /*
  * OPAL FADump metadata structure format version
  *
diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c
index bf1e8fc549b5..1d8f2871fa34 100644
--- a/arch/powerpc/platforms/pseries/rtas-fadump.c
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.c
@@ -116,6 +116,11 @@ static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf)
 	return addr;
 }
 
+static u64 rtas_fadump_get_bootmem_min(void)
+{
+	return RTAS_FADUMP_MIN_BOOT_MEM;
+}
+
 static int rtas_fadump_register(struct fw_dump *fadump_conf)
 {
 	unsigned int wait_time;
@@ -467,6 +472,7 @@ static void rtas_fadump_trigger(struct fadump_crash_info_header *fdh,
 
 static struct fadump_ops rtas_fadump_ops = {
 	.fadump_init_mem_struct		= rtas_fadump_init_mem_struct,
+	.fadump_get_bootmem_min		= rtas_fadump_get_bootmem_min,
 	.fadump_register		= rtas_fadump_register,
 	.fadump_unregister		= rtas_fadump_unregister,
 	.fadump_invalidate		= rtas_fadump_invalidate,
diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.h b/arch/powerpc/platforms/pseries/rtas-fadump.h
index 531f3f3e42b3..6602ff69e10d 100644
--- a/arch/powerpc/platforms/pseries/rtas-fadump.h
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.h
@@ -9,6 +9,15 @@
 #ifndef _PSERIES_RTAS_FADUMP_H
 #define _PSERIES_RTAS_FADUMP_H
 
+/*
+ * On some Power systems where the RMO is 128MB, a minimum of 256MB is
+ * still required for the kernel to boot successfully. When the kdump
+ * infrastructure is configured to save the vmcore over the network, we
+ * run into OOM issues while loading modules related to network setup.
+ * Hence we need an additional 64MB of memory to avoid OOM issues.
+ */
+#define RTAS_FADUMP_MIN_BOOT_MEM	((0x1UL << 28) + (0x1UL << 26))
+
 /* Firmware provided dump sections */
 #define RTAS_FADUMP_CPU_STATE_DATA	0x0001
 #define RTAS_FADUMP_HPTE_REGION		0x0002

From b3bba79d5a7ba27bf7e31e124afbcb386128f2c6 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:27:12 +0530
Subject: [PATCH 253/264] powerpc/fadump: update documentation about option to
 release opalcore

With /sys/firmware/opal/core support available on OPAL based machines,
and an option to release the memory used by the kernel in exporting
this core file, update the FADump documentation with these details.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821382786.5656.13173494907671241231.stgit@hbathini.in.ibm.com
---
 .../powerpc/firmware-assisted-dump.rst        | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/Documentation/powerpc/firmware-assisted-dump.rst b/Documentation/powerpc/firmware-assisted-dump.rst
index 08a9b5efcdc8..0455a78486d5 100644
--- a/Documentation/powerpc/firmware-assisted-dump.rst
+++ b/Documentation/powerpc/firmware-assisted-dump.rst
@@ -110,6 +110,16 @@ capture kernel boot to process this crash data. Kernel config
 option CONFIG_PRESERVE_FA_DUMP has to be enabled on such kernel
 to ensure that crash data is preserved to process later.
 
+-- On OPAL based machines (PowerNV), if the kernel is built with
+   CONFIG_OPAL_CORE=y, OPAL memory at the time of crash is also
+   exported as the /sys/firmware/opal/core file. This sysfs file is
+   helpful in debugging OPAL crashes with GDB. The kernel memory
+   used for exporting this sysfs file can be released by echoing
+   '1' to the /sys/kernel/fadump_release_opalcore node.
+
+   e.g.
+     # echo 1 > /sys/kernel/fadump_release_opalcore
+
 Implementation details:
 -----------------------
 
@@ -273,6 +283,15 @@ Here is the list of files under kernel sysfs:
     enhanced to use this interface to release the memory reserved for
     dump and continue without 2nd reboot.
 
+ /sys/kernel/fadump_release_opalcore
+
+    This file is available only on OPAL based machines when FADump is
+    active in the capture kernel. It is used to release the memory
+    used by the kernel to export the /sys/firmware/opal/core file. To
+    release this memory, echo '1' to it:
+
+    echo 1 > /sys/kernel/fadump_release_opalcore
+
 Here is the list of files under powerpc debugfs:
 (Assuming debugfs is mounted on /sys/kernel/debug directory.)
 

From becd91d9c5467160984a0380df72fdf71fee82f6 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:27:26 +0530
Subject: [PATCH 254/264] powerpc/fadump: remove RMA_START and RMA_END macros

RMA_START is defined as '0' and there is even a BUILD_BUG_ON() to
make sure it is never anything else. Remove this macro and use '0'
directly, as a code change is needed anyway if it ever has to be
something else. Also, remove the unused RMA_END macro.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821384096.5656.15026984053970204652.stgit@hbathini.in.ibm.com
---
 arch/powerpc/include/asm/fadump-internal.h   |  8 -----
 arch/powerpc/kernel/fadump.c                 | 32 +++++++++++---------
 arch/powerpc/platforms/pseries/rtas-fadump.c |  2 +-
 3 files changed, 18 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h
index a669aaa7c610..aec1515ac8f8 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -10,14 +10,6 @@
 #define _ASM_POWERPC_FADUMP_INTERNAL_H
 
 #ifndef CONFIG_PRESERVE_FA_DUMP
-/*
- * The RMA region will be saved for later dumping when kernel crashes.
- * RMA is Real Mode Area, the first block of logical memory address owned
- * by logical partition, containing the storage that may be accessed with
- * translate off.
- */
-#define RMA_START	0x0
-#define RMA_END		(ppc64_rma_size)
 
 /* The upper limit percentage for user specified boot memory size (25%) */
 #define MAX_BOOT_MEM_RATIO			4
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index bd49b1f200bf..2e139259474d 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -128,18 +128,22 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
  * If fadump is registered, check if the memory provided
  * falls within boot memory area and reserved memory area.
  */
-int is_fadump_memory_area(u64 addr, ulong size)
+int is_fadump_memory_area(u64 addr, unsigned long size)
 {
-	u64 d_start = fw_dump.reserve_dump_area_start;
-	u64 d_end = d_start + fw_dump.reserve_dump_area_size;
+	u64 d_start, d_end;
 
 	if (!fw_dump.dump_registered)
 		return 0;
 
+	if (!size)
+		return 0;
+
+	d_start = fw_dump.reserve_dump_area_start;
+	d_end = d_start + fw_dump.reserve_dump_area_size;
 	if (((addr + size) > d_start) && (addr <= d_end))
 		return 1;
 
-	return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size;
+	return (addr <= fw_dump.boot_memory_size);
 }
 
 int should_fadump_crash(void)
@@ -771,14 +775,14 @@ static int fadump_setup_crash_memory_ranges(void)
 	crash_mrange_info.mem_range_cnt = 0;
 
 	/*
-	 * add the first memory chunk (RMA_START through boot_memory_size) as
+	 * add the first memory chunk (0 through boot_memory_size) as
 	 * a separate memory chunk. The reason is, at the time crash firmware
 	 * will move the content of this memory chunk to different location
 	 * specified during fadump registration. We need to create a separate
 	 * program header for this chunk with the correct offset.
 	 */
 	ret = fadump_add_mem_range(&crash_mrange_info,
-				   RMA_START, fw_dump.boot_memory_size);
+				   0, fw_dump.boot_memory_size);
 	if (ret)
 		return ret;
 
@@ -787,11 +791,9 @@ static int fadump_setup_crash_memory_ranges(void)
 		end = start + (u64)reg->size;
 
 		/*
-		 * skip the first memory chunk that is already added (RMA_START
-		 * through boot_memory_size). This logic needs a relook if and
-		 * when RMA_START changes to a non-zero value.
+		 * skip the first memory chunk that is already added
+		 * (0 through boot_memory_size).
 		 */
-		BUILD_BUG_ON(RMA_START != 0);
 		if (start < fw_dump.boot_memory_size) {
 			if (end > fw_dump.boot_memory_size)
 				start = fw_dump.boot_memory_size;
@@ -815,7 +817,7 @@ static int fadump_setup_crash_memory_ranges(void)
  */
 static inline unsigned long fadump_relocate(unsigned long paddr)
 {
-	if (paddr > RMA_START && paddr < fw_dump.boot_memory_size)
+	if ((paddr > 0) && (paddr < fw_dump.boot_memory_size))
 		return fw_dump.boot_mem_dest_addr + paddr;
 	else
 		return paddr;
@@ -883,11 +885,11 @@ static int fadump_create_elfcore_headers(char *bufp)
 		phdr->p_flags	= PF_R|PF_W|PF_X;
 		phdr->p_offset	= mbase;
 
-		if (mbase == RMA_START) {
+		if (mbase == 0) {
 			/*
-			 * The entire RMA region will be moved by firmware
-			 * to the specified destination_address. Hence set
-			 * the correct offset.
+			 * The entire real memory region will be moved by
+			 * firmware to the specified destination_address.
+			 * Hence set the correct offset.
 			 */
 			phdr->p_offset = fw_dump.boot_mem_dest_addr;
 		}
diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c
index 1d8f2871fa34..a525180c1f0f 100644
--- a/arch/powerpc/platforms/pseries/rtas-fadump.c
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.c
@@ -106,7 +106,7 @@ static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf)
 	fdm.rmr_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
 	fdm.rmr_region.source_data_type =
 		cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION);
-	fdm.rmr_region.source_address = cpu_to_be64(RMA_START);
+	fdm.rmr_region.source_address = cpu_to_be64(0);
 	fdm.rmr_region.source_len = cpu_to_be64(fadump_conf->boot_memory_size);
 	fdm.rmr_region.destination_address = cpu_to_be64(addr);
 	addr += fadump_conf->boot_memory_size;

From 7dee93a9a8808b3d8595e1cc79ccb8b1a7bc7a77 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 11 Sep 2019 20:27:39 +0530
Subject: [PATCH 255/264] powerpc/fadump: support holes in kernel boot memory
 area

With support for copying multiple kernel boot memory regions (owing to
the firmware's copy size limitation), also handle holes in the memory
area to be preserved. Support as many as 128 kernel boot memory
regions. This allows having an adequate FADump capture kernel size for
different scenarios.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/156821385448.5656.6124791213910877759.stgit@hbathini.in.ibm.com
---
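The hole accounting introduced in fadump_relocate() can be seen with a
small worked sketch; the region layout and destination address below
are made up purely for illustration:

	#include <stdio.h>

	#define NREGS 2

	/* Two boot memory regions with a 256MB hole between them. */
	static unsigned long mem_addr[NREGS] = { 0x00000000UL, 0x50000000UL };
	static unsigned long mem_sz[NREGS]   = { 0x40000000UL, 0x10000000UL };
	static unsigned long dest = 0x80000000UL; /* hypothetical destination */

	/* Same scheme as the patch: subtract the accumulated hole size
	 * so the regions map contiguously at the destination address. */
	static unsigned long relocate(unsigned long paddr)
	{
		unsigned long hole = 0, last = 0;
		int i;

		for (i = 0; i < NREGS; i++) {
			hole += mem_addr[i] - last;
			if (paddr >= mem_addr[i] &&
			    paddr < mem_addr[i] + mem_sz[i])
				return paddr + dest - hole;
			last = mem_addr[i] + mem_sz[i];
		}
		return paddr; /* outside the preserved area: unchanged */
	}

	int main(void)
	{
		/* Region 1 starts at 0x50000000; the 0x10000000 hole is
		 * skipped, so it maps to dest + 0x40000000 = 0xC0000000. */
		printf("0x50000000 -> 0x%lx\n", relocate(0x50000000UL));
		return 0;
	}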
 arch/powerpc/include/asm/fadump-internal.h   |  12 ++
 arch/powerpc/kernel/fadump.c                 | 191 ++++++++++++++++---
 arch/powerpc/platforms/powernv/opal-fadump.c |  54 +++---
 arch/powerpc/platforms/powernv/opal-fadump.h |   5 +-
 arch/powerpc/platforms/pseries/rtas-fadump.c |  11 +-
 arch/powerpc/platforms/pseries/rtas-fadump.h |   5 +
 6 files changed, 219 insertions(+), 59 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h
index aec1515ac8f8..c814a2b55389 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -9,6 +9,9 @@
 #ifndef _ASM_POWERPC_FADUMP_INTERNAL_H
 #define _ASM_POWERPC_FADUMP_INTERNAL_H
 
+/* Maximum number of memory regions kernel supports */
+#define FADUMP_MAX_MEM_REGS			128
+
 #ifndef CONFIG_PRESERVE_FA_DUMP
 
 /* The upper limit percentage for user specified boot memory size (25%) */
@@ -88,11 +91,20 @@ struct fw_dump {
 
 	unsigned long	boot_memory_size;
 	u64		boot_mem_dest_addr;
+	u64		boot_mem_addr[FADUMP_MAX_MEM_REGS];
+	u64		boot_mem_sz[FADUMP_MAX_MEM_REGS];
+	u64		boot_mem_top;
+	u64		boot_mem_regs_cnt;
 
 	unsigned long	fadumphdr_addr;
 	unsigned long	cpu_notes_buf_vaddr;
 	unsigned long	cpu_notes_buf_size;
 
+	/*
+	 * Maximum size supported by firmware to copy from source to
+	 * destination address per entry.
+	 */
+	u64		max_copy_size;
 	u64		kernel_metadata;
 
 	int		ibm_configure_kernel_dump;
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 2e139259474d..ed59855430b9 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -143,7 +143,7 @@ int is_fadump_memory_area(u64 addr, unsigned long size)
 	if (((addr + size) > d_start) && (addr <= d_end))
 		return 1;
 
-	return (addr <= fw_dump.boot_memory_size);
+	return (addr <= fw_dump.boot_mem_top);
 }
 
 int should_fadump_crash(void)
@@ -194,7 +194,20 @@ static bool is_fadump_mem_area_contiguous(u64 d_start, u64 d_end)
  */
 bool is_fadump_boot_mem_contiguous(void)
 {
-	return is_fadump_mem_area_contiguous(0, fw_dump.boot_memory_size);
+	unsigned long d_start, d_end;
+	bool ret = false;
+	int i;
+
+	for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) {
+		d_start = fw_dump.boot_mem_addr[i];
+		d_end   = d_start + fw_dump.boot_mem_sz[i];
+
+		ret = is_fadump_mem_area_contiguous(d_start, d_end);
+		if (!ret)
+			break;
+	}
+
+	return ret;
 }
 
 /*
@@ -213,6 +226,8 @@ bool is_fadump_reserved_mem_contiguous(void)
 /* Print firmware assisted dump configurations for debugging purpose. */
 static void fadump_show_config(void)
 {
+	int i;
+
 	pr_debug("Support for firmware-assisted dump (fadump): %s\n",
 			(fw_dump.fadump_supported ? "present" : "no support"));
 
@@ -226,7 +241,13 @@ static void fadump_show_config(void)
 	pr_debug("Dump section sizes:\n");
 	pr_debug("    CPU state data size: %lx\n", fw_dump.cpu_state_data_size);
 	pr_debug("    HPTE region size   : %lx\n", fw_dump.hpte_region_size);
-	pr_debug("Boot memory size  : %lx\n", fw_dump.boot_memory_size);
+	pr_debug("    Boot memory size   : %lx\n", fw_dump.boot_memory_size);
+	pr_debug("    Boot memory top    : %llx\n", fw_dump.boot_mem_top);
+	pr_debug("Boot memory regions cnt: %llx\n", fw_dump.boot_mem_regs_cnt);
+	for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) {
+		pr_debug("[%03d] base = %llx, size = %llx\n", i,
+			 fw_dump.boot_mem_addr[i], fw_dump.boot_mem_sz[i]);
+	}
 }
 
 /**
@@ -326,6 +347,88 @@ static unsigned long get_fadump_area_size(void)
 	return size;
 }
 
+static int __init add_boot_mem_region(unsigned long rstart,
+				      unsigned long rsize)
+{
+	int i = fw_dump.boot_mem_regs_cnt++;
+
+	if (fw_dump.boot_mem_regs_cnt > FADUMP_MAX_MEM_REGS) {
+		fw_dump.boot_mem_regs_cnt = FADUMP_MAX_MEM_REGS;
+		return 0;
+	}
+
+	pr_debug("Added boot memory range[%d] [%#016lx-%#016lx)\n",
+		 i, rstart, (rstart + rsize));
+	fw_dump.boot_mem_addr[i] = rstart;
+	fw_dump.boot_mem_sz[i] = rsize;
+	return 1;
+}
+
+/*
+ * Firmware usually has a hard limit on the data it can copy per region.
+ * Honour that by splitting a memory range into multiple regions.
+ */
+static int __init add_boot_mem_regions(unsigned long mstart,
+				       unsigned long msize)
+{
+	unsigned long rstart, rsize, max_size;
+	int ret = 1;
+
+	rstart = mstart;
+	max_size = fw_dump.max_copy_size ? fw_dump.max_copy_size : msize;
+	while (msize) {
+		if (msize > max_size)
+			rsize = max_size;
+		else
+			rsize = msize;
+
+		ret = add_boot_mem_region(rstart, rsize);
+		if (!ret)
+			break;
+
+		msize -= rsize;
+		rstart += rsize;
+	}
+
+	return ret;
+}
+
+static int __init fadump_get_boot_mem_regions(void)
+{
+	unsigned long base, size, cur_size, hole_size, last_end;
+	unsigned long mem_size = fw_dump.boot_memory_size;
+	struct memblock_region *reg;
+	int ret = 1;
+
+	fw_dump.boot_mem_regs_cnt = 0;
+
+	last_end = 0;
+	hole_size = 0;
+	cur_size = 0;
+	for_each_memblock(memory, reg) {
+		base = reg->base;
+		size = reg->size;
+		hole_size += (base - last_end);
+
+		if ((cur_size + size) >= mem_size) {
+			size = (mem_size - cur_size);
+			ret = add_boot_mem_regions(base, size);
+			break;
+		}
+
+		mem_size -= size;
+		cur_size += size;
+		ret = add_boot_mem_regions(base, size);
+		if (!ret)
+			break;
+
+		last_end = base + size;
+	}
+	fw_dump.boot_mem_top = PAGE_ALIGN(fw_dump.boot_memory_size + hole_size);
+
+	return ret;
+}
+
 int __init fadump_reserve_mem(void)
 {
 	u64 base, size, mem_boundary, bootmem_min, align = PAGE_SIZE;
@@ -362,6 +465,11 @@ int __init fadump_reserve_mem(void)
 			       fw_dump.boot_memory_size, bootmem_min);
 			goto error_out;
 		}
+
+		if (!fadump_get_boot_mem_regions()) {
+			pr_err("Too many holes in boot memory area to enable fadump\n");
+			goto error_out;
+		}
 	}
 
 	/*
@@ -385,7 +493,7 @@ int __init fadump_reserve_mem(void)
 	else
 		mem_boundary = memblock_end_of_DRAM();
 
-	base = fw_dump.boot_memory_size;
+	base = fw_dump.boot_mem_top;
 	size = get_fadump_area_size();
 	fw_dump.reserve_dump_area_size = size;
 	if (fw_dump.dump_active) {
@@ -769,34 +877,35 @@ static int fadump_setup_crash_memory_ranges(void)
 {
 	struct memblock_region *reg;
 	u64 start, end;
-	int ret;
+	int i, ret;
 
 	pr_debug("Setup crash memory ranges.\n");
 	crash_mrange_info.mem_range_cnt = 0;
 
 	/*
-	 * add the first memory chunk (0 through boot_memory_size) as
-	 * a separate memory chunk. The reason is, at the time crash firmware
-	 * will move the content of this memory chunk to different location
-	 * specified during fadump registration. We need to create a separate
-	 * program header for this chunk with the correct offset.
+	 * Boot memory region(s) registered with firmware are moved to a
+	 * different location at the time of crash. Create separate program
+	 * header(s) for these memory chunk(s) with the correct offsets.
 	 */
-	ret = fadump_add_mem_range(&crash_mrange_info,
-				   0, fw_dump.boot_memory_size);
-	if (ret)
-		return ret;
+	for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) {
+		start = fw_dump.boot_mem_addr[i];
+		end = start + fw_dump.boot_mem_sz[i];
+		ret = fadump_add_mem_range(&crash_mrange_info, start, end);
+		if (ret)
+			return ret;
+	}
 
 	for_each_memblock(memory, reg) {
 		start = (u64)reg->base;
 		end = start + (u64)reg->size;
 
 		/*
-		 * skip the first memory chunk that is already added
-		 * (0 through boot_memory_size).
+		 * skip the memory chunk that is already added
+		 * (0 through boot_memory_top).
 		 */
-		if (start < fw_dump.boot_memory_size) {
-			if (end > fw_dump.boot_memory_size)
-				start = fw_dump.boot_memory_size;
+		if (start < fw_dump.boot_mem_top) {
+			if (end > fw_dump.boot_mem_top)
+				start = fw_dump.boot_mem_top;
 			else
 				continue;
 		}
@@ -817,17 +926,35 @@ static int fadump_setup_crash_memory_ranges(void)
  */
 static inline unsigned long fadump_relocate(unsigned long paddr)
 {
-	if ((paddr > 0) && (paddr < fw_dump.boot_memory_size))
-		return fw_dump.boot_mem_dest_addr + paddr;
-	else
-		return paddr;
+	unsigned long raddr, rstart, rend, rlast, hole_size;
+	int i;
+
+	hole_size = 0;
+	rlast = 0;
+	raddr = paddr;
+	for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) {
+		rstart = fw_dump.boot_mem_addr[i];
+		rend = rstart + fw_dump.boot_mem_sz[i];
+		hole_size += (rstart - rlast);
+
+		if (paddr >= rstart && paddr < rend) {
+			raddr += fw_dump.boot_mem_dest_addr - hole_size;
+			break;
+		}
+
+		rlast = rend;
+	}
+
+	pr_debug("vmcoreinfo: paddr = 0x%lx, raddr = 0x%lx\n", paddr, raddr);
+	return raddr;
 }
 
 static int fadump_create_elfcore_headers(char *bufp)
 {
-	struct elfhdr *elf;
+	unsigned long long raddr, offset;
 	struct elf_phdr *phdr;
-	int i;
+	struct elfhdr *elf;
+	int i, j;
 
 	fadump_init_elfcore_header(bufp);
 	elf = (struct elfhdr *)bufp;
@@ -870,7 +997,9 @@ static int fadump_create_elfcore_headers(char *bufp)
 	(elf->e_phnum)++;
 
 	/* setup PT_LOAD sections. */
-
+	j = 0;
+	offset = 0;
+	raddr = fw_dump.boot_mem_addr[0];
 	for (i = 0; i < crash_mrange_info.mem_range_cnt; i++) {
 		u64 mbase, msize;
 
@@ -885,13 +1014,17 @@ static int fadump_create_elfcore_headers(char *bufp)
 		phdr->p_flags	= PF_R|PF_W|PF_X;
 		phdr->p_offset	= mbase;
 
-		if (mbase == 0) {
+		if (mbase == raddr) {
 			/*
 			 * The entire real memory region will be moved by
 			 * firmware to the specified destination_address.
 			 * Hence set the correct offset.
 			 */
-			phdr->p_offset = fw_dump.boot_mem_dest_addr;
+			phdr->p_offset = fw_dump.boot_mem_dest_addr + offset;
+			if (j < (fw_dump.boot_mem_regs_cnt - 1)) {
+				offset += fw_dump.boot_mem_sz[j];
+				raddr = fw_dump.boot_mem_addr[++j];
+			}
 		}
 
 		phdr->p_paddr = mbase;
@@ -1177,7 +1310,7 @@ static void fadump_invalidate_release_mem(void)
 	fadump_cleanup();
 	mutex_unlock(&fadump_mutex);
 
-	fadump_release_memory(fw_dump.boot_memory_size, memblock_end_of_DRAM());
+	fadump_release_memory(fw_dump.boot_mem_top, memblock_end_of_DRAM());
 	fadump_free_cpu_notes_buf();
 
 	/*
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
index 006648e4d5e6..d361d37d975f 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -115,19 +115,28 @@ static void opal_fadump_update_config(struct fw_dump *fadump_conf,
 static void opal_fadump_get_config(struct fw_dump *fadump_conf,
 				   const struct opal_fadump_mem_struct *fdm)
 {
+	unsigned long base, size, last_end, hole_size;
 	int i;
 
 	if (!fadump_conf->dump_active)
 		return;
 
+	last_end = 0;
+	hole_size = 0;
 	fadump_conf->boot_memory_size = 0;
 
 	pr_debug("Boot memory regions:\n");
 	for (i = 0; i < fdm->region_cnt; i++) {
-		pr_debug("\t%d. base: 0x%llx, size: 0x%llx\n",
-			 (i + 1), fdm->rgn[i].src, fdm->rgn[i].size);
+		base = fdm->rgn[i].src;
+		size = fdm->rgn[i].size;
+		pr_debug("\t[%03d] base: 0x%lx, size: 0x%lx\n", i, base, size);
 
-		fadump_conf->boot_memory_size += fdm->rgn[i].size;
+		fadump_conf->boot_mem_addr[i] = base;
+		fadump_conf->boot_mem_sz[i] = size;
+		fadump_conf->boot_memory_size += size;
+		hole_size += (base - last_end);
+
+		last_end = base + size;
 	}
 
 	/*
@@ -160,6 +169,8 @@ static void opal_fadump_get_config(struct fw_dump *fadump_conf,
 		pr_warn("WARNING: If the unsaved regions contain kernel pages, the vmcore will be corrupted.\n");
 	}
 
+	fadump_conf->boot_mem_top = (fadump_conf->boot_memory_size + hole_size);
+	fadump_conf->boot_mem_regs_cnt = fdm->region_cnt;
 	opal_fadump_update_config(fadump_conf, fdm);
 }
 
@@ -174,33 +185,20 @@ static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm)
 
 static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf)
 {
-	int max_copy_size, cur_size, size;
-	u64 src_addr, dest_addr;
+	u64 addr = fadump_conf->reserve_dump_area_start;
+	int i;
 
 	opal_fdm = __va(fadump_conf->kernel_metadata);
 	opal_fadump_init_metadata(opal_fdm);
 
-	/*
-	 * Firmware supports 32-bit field for size. Align it to PAGE_SIZE
-	 * and request firmware to copy multiple kernel boot memory regions.
-	 */
-	max_copy_size = _ALIGN_DOWN(U32_MAX, PAGE_SIZE);
-
 	/* Boot memory regions */
-	src_addr = 0;
-	dest_addr = fadump_conf->reserve_dump_area_start;
-	size = fadump_conf->boot_memory_size;
-	while (size) {
-		cur_size = size > max_copy_size ? max_copy_size : size;
-
-		opal_fdm->rgn[opal_fdm->region_cnt].src  = src_addr;
-		opal_fdm->rgn[opal_fdm->region_cnt].dest = dest_addr;
-		opal_fdm->rgn[opal_fdm->region_cnt].size = cur_size;
+	for (i = 0; i < fadump_conf->boot_mem_regs_cnt; i++) {
+		opal_fdm->rgn[i].src	= fadump_conf->boot_mem_addr[i];
+		opal_fdm->rgn[i].dest	= addr;
+		opal_fdm->rgn[i].size	= fadump_conf->boot_mem_sz[i];
 
 		opal_fdm->region_cnt++;
-		dest_addr	+= cur_size;
-		src_addr	+= cur_size;
-		size		-= cur_size;
+		addr += fadump_conf->boot_mem_sz[i];
 	}
 
 	/*
@@ -212,7 +210,7 @@ static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf)
 
 	opal_fadump_update_config(fadump_conf, opal_fdm);
 
-	return dest_addr;
+	return addr;
 }
 
 static u64 opal_fadump_get_metadata_size(void)
@@ -254,7 +252,7 @@ static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf)
 	 * by a kernel that intends to preserve crash'ed kernel's memory.
 	 */
 	ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_BOOT_MEM,
-				      fadump_conf->boot_memory_size);
+				      fadump_conf->boot_mem_top);
 	if (ret != OPAL_SUCCESS) {
 		pr_err("Failed to set boot memory tag!\n");
 		err = -EPERM;
@@ -669,6 +667,12 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
 	fadump_conf->ops		= &opal_fadump_ops;
 	fadump_conf->fadump_supported	= 1;
 
+	/*
+	 * Firmware supports 32-bit field for size. Align it to PAGE_SIZE
+	 * and request firmware to copy multiple kernel boot memory regions.
+	 */
+	fadump_conf->max_copy_size = _ALIGN_DOWN(U32_MAX, PAGE_SIZE);
+
 	/*
 	 * Check if dump has been initiated on last reboot.
 	 */
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h
index e630cb0f108f..f1e9ecf548c5 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.h
+++ b/arch/powerpc/platforms/powernv/opal-fadump.h
@@ -27,9 +27,6 @@
  */
 #define OPAL_FADUMP_VERSION			0x1
 
-/* Maximum number of memory regions kernel supports */
-#define OPAL_FADUMP_MAX_MEM_REGS		128
-
 /*
  * OPAL FADump kernel metadata
  *
@@ -42,7 +39,7 @@ struct opal_fadump_mem_struct {
 	u16	region_cnt;		/* number of regions */
 	u16	registered_regions;	/* Regions registered for MPIPL */
 	u64	fadumphdr_addr;
-	struct opal_mpipl_region	rgn[OPAL_FADUMP_MAX_MEM_REGS];
+	struct opal_mpipl_region	rgn[FADUMP_MAX_MEM_REGS];
 } __packed;
 
 /*
diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c
index a525180c1f0f..70c3013fdd07 100644
--- a/arch/powerpc/platforms/pseries/rtas-fadump.c
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.c
@@ -42,7 +42,13 @@ static void rtas_fadump_update_config(struct fw_dump *fadump_conf,
 static void rtas_fadump_get_config(struct fw_dump *fadump_conf,
 				   const struct rtas_fadump_mem_struct *fdm)
 {
-	fadump_conf->boot_memory_size = be64_to_cpu(fdm->rmr_region.source_len);
+	fadump_conf->boot_mem_addr[0] =
+		be64_to_cpu(fdm->rmr_region.source_address);
+	fadump_conf->boot_mem_sz[0] = be64_to_cpu(fdm->rmr_region.source_len);
+	fadump_conf->boot_memory_size = fadump_conf->boot_mem_sz[0];
+
+	fadump_conf->boot_mem_top = fadump_conf->boot_memory_size;
+	fadump_conf->boot_mem_regs_cnt = 1;
 
 	/*
 	 * Start address of reserve dump area (permanent reservation) for
@@ -499,6 +505,9 @@ void __init rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
 	fadump_conf->ops		= &rtas_fadump_ops;
 	fadump_conf->fadump_supported	= 1;
 
+	/* Firmware supports 64-bit value for size, align it to pagesize. */
+	fadump_conf->max_copy_size = _ALIGN_DOWN(U64_MAX, PAGE_SIZE);
+
 	/*
 	 * The 'ibm,kernel-dump' rtas node is present only if there is
 	 * dump data waiting for us.
diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.h b/arch/powerpc/platforms/pseries/rtas-fadump.h
index 6602ff69e10d..fd59bd7ca9c3 100644
--- a/arch/powerpc/platforms/pseries/rtas-fadump.h
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.h
@@ -69,6 +69,11 @@ struct rtas_fadump_mem_struct {
 	/* Kernel dump sections */
 	struct rtas_fadump_section		cpu_state_data;
 	struct rtas_fadump_section		hpte_region;
+
+	/*
+	 * TODO: Extend multiple boot memory regions support in the kernel
+	 *       for this platform.
+	 */
 	struct rtas_fadump_section		rmr_region;
 };
 

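The reworked fadump_relocate() above maps a physical address that falls in
one of the registered boot memory regions to its relocated location, by
accumulating the size of the holes between regions. The standalone sketch
below models that arithmetic; the region layout and destination address are
made-up values, not anything the kernel computes.

#include <stdio.h>

#define NR_REGS 2

/* Hypothetical boot memory regions and copy destination. */
static unsigned long mem_addr[NR_REGS] = { 0x00000000, 0x40000000 };
static unsigned long mem_sz[NR_REGS]   = { 0x10000000, 0x10000000 };
static unsigned long dest_addr         = 0x80000000;

static unsigned long relocate(unsigned long paddr)
{
	unsigned long raddr = paddr, rlast = 0, hole_size = 0;
	int i;

	for (i = 0; i < NR_REGS; i++) {
		unsigned long rstart = mem_addr[i];
		unsigned long rend = rstart + mem_sz[i];

		/* Gap between the previous region's end and this start. */
		hole_size += rstart - rlast;

		if (paddr >= rstart && paddr < rend) {
			/*
			 * Regions are copied back-to-back at dest_addr, so
			 * the holes accumulated so far must be subtracted.
			 */
			raddr += dest_addr - hole_size;
			break;
		}
		rlast = rend;
	}
	return raddr;	/* unchanged if paddr is in no region */
}

int main(void)
{
	/* 0x10 into the second region lands at dest + first region's
	 * size + 0x10, i.e. 0x90000010 here. */
	printf("0x%lx -> 0x%lx\n", 0x40000010UL, relocate(0x40000010UL));
	return 0;
}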
From ec5b705c48365549c483fab17d68d15d83bef265 Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Fri, 23 Aug 2019 10:22:00 -0400
Subject: [PATCH 256/264] powerpc/mm/radix: remove useless kernel messages

Booting a POWER9 PowerNV system generates the messages below containing
"____ptrval____", because pointers printed without a specifier
extension (i.e. unadorned %p) are hashed to prevent leaking information
about the kernel memory layout.

radix-mmu: Initializing Radix MMU
radix-mmu: Partition table (____ptrval____)
radix-mmu: Mapped 0x0000000000000000-0x0000000040000000 with 1.00 GiB
pages (exec)
radix-mmu: Mapped 0x0000000040000000-0x0000002000000000 with 1.00 GiB
pages
radix-mmu: Mapped 0x0000200000000000-0x0000202000000000 with 1.00 GiB
pages
radix-mmu: Process table (____ptrval____) and radix root for kernel:
(____ptrval____)

Signed-off-by: Qian Cai <cai@lca.pw>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/1566570120-16529-1-git-send-email-cai@lca.pw
---
 arch/powerpc/mm/book3s64/radix_pgtable.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 0d1107fb34c1..3a1fbf9cb8f8 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -369,8 +369,6 @@ static void __init radix_init_pgtable(void)
 	rts_field = radix__get_tree_size();
 	process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
 
-	pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
-
 	/*
 	 * The init_mm context is given the first available (non-zero) PID,
 	 * which is the "guard PID" and contains no page table. PIDR should
@@ -399,7 +397,6 @@ static void __init radix_init_partition_table(void)
 	mmu_partition_table_set_entry(0, dw0, dw1, false);
 
 	pr_info("Initializing Radix MMU\n");
-	pr_info("Partition table %p\n", partition_tb);
 }
 
 static int __init get_idx_from_shift(unsigned int shift)

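For cases where a raw pointer really is needed in the log, printk provides
the %px extension, which bypasses the hashing that %p applies. A minimal
module sketch contrasting the two (the module name is illustrative):

// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
#include <linux/printk.h>

static int ptrdemo_target;

static int __init ptrdemo_init(void)
{
	/* %p is hashed to avoid leaking the memory layout... */
	pr_info("hashed: %p\n", &ptrdemo_target);
	/* ...%px prints the raw address; for debugging only. */
	pr_info("raw:    %px\n", &ptrdemo_target);
	return 0;
}

static void __exit ptrdemo_exit(void)
{
}

module_init(ptrdemo_init);
module_exit(ptrdemo_exit);
MODULE_LICENSE("GPL");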
From 5896163f7f91c0560cc41908c808661eee4c4121 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@kaod.org>
Date: Tue, 10 Sep 2019 10:18:49 +0200
Subject: [PATCH 257/264] powerpc/xmon: Improve output of XIVE interrupts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When looping over the list of interrupts, print the current value of
the PQ bits, obtained with a load from the ESB page. This has the side
effect of faulting in the ESB pages of all interrupts.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190910081850.26038-2-clg@kaod.org
---
 arch/powerpc/include/asm/xive.h   |  3 +--
 arch/powerpc/sysdev/xive/common.c | 29 ++++++++++++++++++++++++++---
 arch/powerpc/xmon/xmon.c          | 15 ++++-----------
 3 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index 967d6ab3c977..71f52f22c36b 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -99,8 +99,7 @@ extern void xive_flush_interrupt(void);
 
 /* xmon hook */
 extern void xmon_xive_do_dump(int cpu);
-extern int xmon_xive_get_irq_config(u32 irq, u32 *target, u8 *prio,
-				    u32 *sw_irq);
+extern int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d);
 
 /* APIs used by KVM */
 extern u32 xive_native_default_eq_shift(void);
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index f75a660365e5..b10919f08250 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -258,10 +258,33 @@ notrace void xmon_xive_do_dump(int cpu)
 #endif
 }
 
-int xmon_xive_get_irq_config(u32 irq, u32 *target, u8 *prio,
-			     u32 *sw_irq)
+int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d)
 {
-	return xive_ops->get_irq_config(irq, target, prio, sw_irq);
+	int rc;
+	u32 target;
+	u8 prio;
+	u32 lirq;
+
+	rc = xive_ops->get_irq_config(hw_irq, &target, &prio, &lirq);
+	if (rc) {
+		xmon_printf("IRQ 0x%08x : no config rc=%d\n", hw_irq, rc);
+		return rc;
+	}
+
+	xmon_printf("IRQ 0x%08x : target=0x%x prio=%02x lirq=0x%x ",
+		    hw_irq, target, prio, lirq);
+
+	if (d) {
+		struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+		u64 val = xive_esb_read(xd, XIVE_ESB_GET);
+
+		xmon_printf("PQ=%c%c",
+			    val & XIVE_ESB_VAL_P ? 'P' : '-',
+			    val & XIVE_ESB_VAL_Q ? 'Q' : '-');
+	}
+
+	xmon_printf("\n");
+	return 0;
 }
 
 #endif /* CONFIG_XMON */
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index dc9832e06256..d83364ebc5c5 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2572,16 +2572,9 @@ static void dump_all_xives(void)
 		dump_one_xive(cpu);
 }
 
-static void dump_one_xive_irq(u32 num)
+static void dump_one_xive_irq(u32 num, struct irq_data *d)
 {
-	int rc;
-	u32 target;
-	u8 prio;
-	u32 lirq;
-
-	rc = xmon_xive_get_irq_config(num, &target, &prio, &lirq);
-	xmon_printf("IRQ 0x%08x : target=0x%x prio=%d lirq=0x%x (rc=%d)\n",
-		    num, target, prio, lirq, rc);
+	xmon_xive_get_irq_config(num, d);
 }
 
 static void dump_all_xive_irq(void)
@@ -2599,7 +2592,7 @@ static void dump_all_xive_irq(void)
 		hwirq = (unsigned int)irqd_to_hwirq(d);
 		/* IPIs are special (HW number 0) */
 		if (hwirq)
-			dump_one_xive_irq(hwirq);
+			dump_one_xive_irq(hwirq, d);
 	}
 }
 
@@ -2619,7 +2612,7 @@ static void dump_xives(void)
 		return;
 	} else if (c == 'i') {
 		if (scanhex(&num))
-			dump_one_xive_irq(num);
+			dump_one_xive_irq(num, NULL);
 		else
 			dump_all_xive_irq();
 		return;

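The PQ pair printed above encodes each interrupt's ESB state machine. A
standalone decode of the two bits follows; the bit values are assumed to
mirror XIVE_ESB_VAL_P/XIVE_ESB_VAL_Q (0x2 and 0x1 at the time of this
series), so treat them as assumptions rather than a stable ABI.

#include <stdio.h>

#define ESB_VAL_P 0x2	/* assumed to mirror XIVE_ESB_VAL_P */
#define ESB_VAL_Q 0x1	/* assumed to mirror XIVE_ESB_VAL_Q */

static void print_pq(unsigned long long val)
{
	/* P: interrupt pending/masked; Q: a further event queued. */
	printf("PQ=%c%c\n",
	       (val & ESB_VAL_P) ? 'P' : '-',
	       (val & ESB_VAL_Q) ? 'Q' : '-');
}

int main(void)
{
	print_pq(0x0);	/* PQ=-- : idle */
	print_pq(0x2);	/* PQ=P- : pending */
	print_pq(0x3);	/* PQ=PQ : pending with a queued event */
	return 0;
}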
From 855d9140a394229f21fd4fd216f377ed45bd93a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@kaod.org>
Date: Tue, 10 Sep 2019 10:18:50 +0200
Subject: [PATCH 258/264] powerpc/xmon: Fix output of XIVE IPI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When dumping the XIVE state of a CPU IPI, xmon does not check whether
the CPU is started, which can cause an error. Add a check for that,
and change the output to fit on one line, just like the other XIVE
interrupts of the machine.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190910081850.26038-3-clg@kaod.org
---
 arch/powerpc/sysdev/xive/common.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index b10919f08250..df832b09e3e9 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -237,25 +237,30 @@ static notrace void xive_dump_eq(const char *name, struct xive_q *q)
 	i0 = be32_to_cpup(q->qpage + idx);
 	idx = (idx + 1) & q->msk;
 	i1 = be32_to_cpup(q->qpage + idx);
-	xmon_printf("  %s Q T=%d %08x %08x ...\n", name,
-		    q->toggle, i0, i1);
+	xmon_printf("%s idx=%d T=%d %08x %08x ...", name,
+		     q->idx, q->toggle, i0, i1);
 }
 
 notrace void xmon_xive_do_dump(int cpu)
 {
 	struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
 
-	xmon_printf("XIVE state for CPU %d:\n", cpu);
-	xmon_printf("  pp=%02x cppr=%02x\n", xc->pending_prio, xc->cppr);
-	xive_dump_eq("IRQ", &xc->queue[xive_irq_priority]);
+	xmon_printf("CPU %d:", cpu);
+	if (xc) {
+		xmon_printf("pp=%02x CPPR=%02x ", xc->pending_prio, xc->cppr);
+
 #ifdef CONFIG_SMP
-	{
-		u64 val = xive_esb_read(&xc->ipi_data, XIVE_ESB_GET);
-		xmon_printf("  IPI state: %x:%c%c\n", xc->hw_ipi,
-			val & XIVE_ESB_VAL_P ? 'P' : 'p',
-			val & XIVE_ESB_VAL_Q ? 'Q' : 'q');
-	}
+		{
+			u64 val = xive_esb_read(&xc->ipi_data, XIVE_ESB_GET);
+
+			xmon_printf("IPI=0x%08x PQ=%c%c ", xc->hw_ipi,
+				    val & XIVE_ESB_VAL_P ? 'P' : '-',
+				    val & XIVE_ESB_VAL_Q ? 'Q' : '-');
+		}
 #endif
+		xive_dump_eq("EQ", &xc->queue[xive_irq_priority]);
+	}
+	xmon_printf("\n");
 }
 
 int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d)

From ad9b48c52296ddb7620e3f2b66a7479b78632dff Mon Sep 17 00:00:00 2001
From: Adam Zerella <adam.zerella@gmail.com>
Date: Sun, 15 Sep 2019 15:29:05 +1000
Subject: [PATCH 259/264] docs: powerpc: Add missing documentation reference

The documentation pages for 'elfnote' and 'ultravisor'
are not included in the powerpc documentation index, which
generates Sphinx warnings:

WARNING: document isn't included in any toctree

Additionally, once these missing doc pages are included,
more Sphinx warnings appear. Unused footnote references,
syntax highlighting and table-of-contents ordering have
been adjusted to address them.

Signed-off-by: Adam Zerella <adam.zerella@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190915052905.13431-1-adam.zerella@gmail.com
---
 Documentation/powerpc/elfnote.rst    | 29 ++++++++++++++--------------
 Documentation/powerpc/index.rst      |  2 ++
 Documentation/powerpc/ultravisor.rst | 13 +++++--------
 3 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/Documentation/powerpc/elfnote.rst b/Documentation/powerpc/elfnote.rst
index 2a5c4beeb809..06602248621c 100644
--- a/Documentation/powerpc/elfnote.rst
+++ b/Documentation/powerpc/elfnote.rst
@@ -8,8 +8,7 @@ capabilities and information which can be used by a bootloader or userland.
 Types and Descriptors
 ---------------------
 
-The types to be used with the "PowerPC" namesapce are defined in the
-include/uapi/asm/elfnote.h
+The types to be used with the "PowerPC" namespace are defined in [#f1]_.
 
 	1) PPC_ELFNOTE_CAPABILITIES
 
@@ -18,25 +17,25 @@ bitmap as "descriptor" field. Each bit is described below:
 
 - Ultravisor-capable bit (PowerNV only).
 
+.. code-block:: c
+
 	#define PPCCAP_ULTRAVISOR_BIT (1 << 0)
 
-	Indicate that the powerpc kernel binary knows how to run in an
-	ultravisor-enabled system.
+Indicate that the powerpc kernel binary knows how to run in an
+ultravisor-enabled system.
 
-	In an ultravisor-enabled system, some machine resources are now controlled
-	by the ultravisor. If the kernel is not ultravisor-capable, but it ends up
-	being run on a machine with ultravisor, the kernel will probably crash
-	trying to access ultravisor resources. For instance, it may crash in early
-	boot trying to set the partition table entry 0.
+In an ultravisor-enabled system, some machine resources are now controlled
+by the ultravisor. If the kernel is not ultravisor-capable, but it ends up
+being run on a machine with ultravisor, the kernel will probably crash
+trying to access ultravisor resources. For instance, it may crash in early
+boot trying to set the partition table entry 0.
 
-	In an ultravisor-enabled system, a bootloader could warn the user or prevent
-   	the kernel from being run if the PowerPC ultravisor capability doesn't exist
-	or the Ultravisor-capable bit is not set.
+In an ultravisor-enabled system, a bootloader could warn the user or prevent
+the kernel from being run if the PowerPC ultravisor capability doesn't exist
+or the Ultravisor-capable bit is not set.
 
 References
 ----------
 
-arch/powerpc/include/asm/elfnote.h
-arch/powerpc/kernel/note.S
-
+.. [#f1] arch/powerpc/include/asm/elfnote.h
 
diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst
index 549b1cdd77ae..db7b6a880f52 100644
--- a/Documentation/powerpc/index.rst
+++ b/Documentation/powerpc/index.rst
@@ -15,6 +15,7 @@ powerpc
     dawr-power9
     dscr
     eeh-pci-error-recovery
+    elfnote
     firmware-assisted-dump
     hvcs
     isa-versions
@@ -25,6 +26,7 @@ powerpc
     qe_firmware
     syscall64-abi
     transactional_memory
+    ultravisor
 
 .. only::  subproject and html
 
diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst
index 94a149f34ec3..730854f73830 100644
--- a/Documentation/powerpc/ultravisor.rst
+++ b/Documentation/powerpc/ultravisor.rst
@@ -1,5 +1,5 @@
 .. SPDX-License-Identifier: GPL-2.0
-.. _ultravisor:
+.. _ultravisor:
 
 ============================
 Protected Execution Facility
@@ -8,9 +8,6 @@ Protected Execution Facility
 .. contents::
     :depth: 3
 
-.. sectnum::
-    :depth: 3
-
 Protected Execution Facility
 ############################
 
@@ -255,10 +252,10 @@ Ultravisor calls API
     be made available in the public/OpenPower version of the PAPR
     specification.
 
-    **Note**
+    .. note::
 
-    If PEF is not enabled, the ultracalls will be redirected to the
-    Hypervisor which must handle/fail the calls.
+        If PEF is not enabled, the ultracalls will be redirected to the
+        Hypervisor which must handle/fail the calls.
 
 Ultracalls used by Hypervisor
 =============================
@@ -1054,4 +1051,4 @@ Use cases
 References
 ##########
 
-.. [1] `Supporting Protected Computing on IBM Power Architecture <https://developer.ibm.com/articles/l-support-protected-computing/>`_
+- `Supporting Protected Computing on IBM Power Architecture <https://developer.ibm.com/articles/l-support-protected-computing/>`_

From e7ca44ed3ba77fc26cf32650bb71584896662474 Mon Sep 17 00:00:00 2001
From: Ganesh Goudar <ganeshgr@linux.ibm.com>
Date: Wed, 4 Sep 2019 13:29:49 +0530
Subject: [PATCH 260/264] powerpc: dump kernel log before carrying out fadump
 or kdump

Since commit 4388c9b3a6ee ("powerpc: Do not send system reset request
through the oops path"), the pstore dmesg file is not updated when a
dump is triggered from the HMC. That commit modified the system reset
(sreset) handler to invoke fadump or kdump (if configured) without
pushing dmesg to pstore. This leaves pstore holding stale dmesg data,
which won't be much help if kdump fails to capture the dump. Fix that
by calling kmsg_dump() before heading to fadump or kdump.

Fixes: 4388c9b3a6ee ("powerpc: Do not send system reset request through the oops path")
Reviewed-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190904075949.15607-1-ganeshgr@linux.ibm.com
---
 arch/powerpc/kernel/traps.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 11caa0291254..82f43535e686 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -472,6 +472,7 @@ void system_reset_exception(struct pt_regs *regs)
 	if (debugger(regs))
 		goto out;
 
+	kmsg_dump(KMSG_DUMP_OOPS);
 	/*
 	 * A system reset is a request to dump, so we always send
 	 * it through the crashdump code (if fadump or kdump are

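kmsg_dump() hands the tail of the kernel log to every registered dumper
(pstore registers one), so persistent storage sees the messages before the
crash path runs. A minimal dumper sketch against the v5.3-era API this
series targets; the callback signatures changed in later kernels, so treat
them as an assumption, and all names here are illustrative.

// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
#include <linux/kmsg_dump.h>

static char demo_buf[4096];
static size_t demo_len;

static void demo_dump(struct kmsg_dumper *dumper,
		      enum kmsg_dump_reason reason)
{
	/* Grab the log tail; avoid printk while in this path. */
	if (!kmsg_dump_get_buffer(dumper, true, demo_buf,
				  sizeof(demo_buf), &demo_len))
		demo_len = 0;
}

static struct kmsg_dumper demo_dumper = {
	.dump = demo_dump,
};

static int __init demo_init(void)
{
	return kmsg_dump_register(&demo_dumper);
}

static void __exit demo_exit(void)
{
	kmsg_dump_unregister(&demo_dumper);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");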
From a3db31ff6ce31f5a544a66b61613a098029031cc Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Thu, 5 Sep 2019 23:50:28 +0530
Subject: [PATCH 261/264] ftrace: Look up the address of return_to_handler()
 using helpers

This ensures that we use the right address on architectures that use
function descriptors.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/8f6f14d192a994008ac370ce14036bbe67224c7d.1567707399.git.naveen.n.rao@linux.vnet.ibm.com
---
 kernel/trace/fgraph.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 8dfd5021b933..7950a0356042 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -276,7 +276,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
 	int index = task->curr_ret_stack;
 	int i;
 
-	if (ret != (unsigned long)return_to_handler)
+	if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
 		return ret;
 
 	if (index < 0)
@@ -294,7 +294,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
 {
 	int task_idx;
 
-	if (ret != (unsigned long)return_to_handler)
+	if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
 		return ret;
 
 	task_idx = task->curr_ret_stack;

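On powerpc64 ELFv1 (and a few other ABIs), the address of a function symbol
points at a descriptor rather than at the code itself. A sketch of the
layout behind dereference_kernel_function_descriptor(); the struct mirrors
the kernel's func_desc, and the helper name is illustrative.

/* A ppc64 ELFv1 function descriptor (OPD entry). */
struct func_desc {
	unsigned long addr;	/* actual entry point of the code */
	unsigned long toc;	/* TOC base the function expects in r2 */
	unsigned long env;	/* environment pointer, unused by C */
};

/*
 * The address recorded on the shadow return stack is the code address,
 * so comparing it against return_to_handler (a descriptor address on
 * ELFv1) never matches; the descriptor's ->addr must be used instead,
 * which is what dereference_kernel_function_descriptor() resolves.
 */
static unsigned long desc_entry(const struct func_desc *fd)
{
	return fd->addr;
}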
From 370011a27028d6f05e598ed6211a0ca2dc0213f7 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Thu, 5 Sep 2019 23:50:29 +0530
Subject: [PATCH 262/264] powerpc/ftrace: Enable
 HAVE_FUNCTION_GRAPH_RET_ADDR_PTR

This associates entries in the ftrace_ret_stack with corresponding stack
frames, enabling more robust stack unwinding. Also update the only user
of ftrace_graph_ret_addr() to pass the stack pointer.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/0224f2d0971b069c678e2ff678cfc2cd1e114cfe.1567707399.git.naveen.n.rao@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/asm-prototypes.h      | 3 ++-
 arch/powerpc/include/asm/ftrace.h              | 2 ++
 arch/powerpc/kernel/stacktrace.c               | 2 +-
 arch/powerpc/kernel/trace/ftrace.c             | 5 +++--
 arch/powerpc/kernel/trace/ftrace_32.S          | 1 +
 arch/powerpc/kernel/trace/ftrace_64_mprofile.S | 1 +
 arch/powerpc/kernel/trace/ftrace_64_pg.S       | 1 +
 7 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index 49196d35e3bb..8561498e653c 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -134,7 +134,8 @@ extern int __ucmpdi2(u64, u64);
 
 /* tracing */
 void _mcount(void);
-unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip);
+unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip,
+						unsigned long sp);
 
 void pnv_power9_force_smt4_catch(void);
 void pnv_power9_force_smt4_release(void);
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index 3dfb80b86561..f54a08a2cd70 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -8,6 +8,8 @@
 #define MCOUNT_ADDR		((unsigned long)(_mcount))
 #define MCOUNT_INSN_SIZE	4 /* sizeof mcount call */
 
+#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+
 #ifdef __ASSEMBLY__
 
 /* Based off of objdump optput from glibc */
diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c
index 1e2276963f6d..e2a46cfed5fd 100644
--- a/arch/powerpc/kernel/stacktrace.c
+++ b/arch/powerpc/kernel/stacktrace.c
@@ -182,7 +182,7 @@ static int __save_stack_trace_tsk_reliable(struct task_struct *tsk,
 		 * FIXME: IMHO these tests do not belong in
 		 * arch-dependent code, they are generic.
 		 */
-		ip = ftrace_graph_ret_addr(tsk, &graph_idx, ip, NULL);
+		ip = ftrace_graph_ret_addr(tsk, &graph_idx, ip, stack);
 #ifdef CONFIG_KPROBES
 		/*
 		 * Mark stacktraces with kretprobed functions on them
diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
index be1ca98fce5c..7ea0ca044b65 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -944,7 +944,8 @@ int ftrace_disable_ftrace_graph_caller(void)
  * Hook the return address and push it in the stack of return addrs
  * in current thread info. Return the address we want to divert to.
  */
-unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip)
+unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip,
+						unsigned long sp)
 {
 	unsigned long return_hooker;
 
@@ -956,7 +957,7 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip)
 
 	return_hooker = ppc_function_entry(return_to_handler);
 
-	if (!function_graph_enter(parent, ip, 0, NULL))
+	if (!function_graph_enter(parent, ip, 0, (unsigned long *)sp))
 		parent = return_hooker;
 out:
 	return parent;
diff --git a/arch/powerpc/kernel/trace/ftrace_32.S b/arch/powerpc/kernel/trace/ftrace_32.S
index 183f608efb81..e023ae59c429 100644
--- a/arch/powerpc/kernel/trace/ftrace_32.S
+++ b/arch/powerpc/kernel/trace/ftrace_32.S
@@ -50,6 +50,7 @@ _GLOBAL(ftrace_stub)
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 _GLOBAL(ftrace_graph_caller)
+	addi	r5, r1, 48
 	/* load r4 with local address */
 	lwz	r4, 44(r1)
 	subi	r4, r4, MCOUNT_INSN_SIZE
diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
index 74acbf16a666..f9fd5f743eba 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
+++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
@@ -294,6 +294,7 @@ _GLOBAL(ftrace_graph_caller)
 	std	r2, 24(r1)
 	ld	r2, PACATOC(r13)	/* get kernel TOC in r2 */
 
+	addi	r5, r1, 112
 	mfctr	r4		/* ftrace_caller has moved local addr here */
 	std	r4, 40(r1)
 	mflr	r3		/* ftrace_caller has restored LR from stack */
diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.S b/arch/powerpc/kernel/trace/ftrace_64_pg.S
index e41a7d13c99c..6708e24db0ab 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_pg.S
+++ b/arch/powerpc/kernel/trace/ftrace_64_pg.S
@@ -41,6 +41,7 @@ _GLOBAL(ftrace_stub)
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 _GLOBAL(ftrace_graph_caller)
+	addi	r5, r1, 112
 	/* load r4 with local address */
 	ld	r4, 128(r1)
 	subi	r4, r4, MCOUNT_INSN_SIZE

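With HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, each shadow-stack entry records the
stack slot (retp) whose saved LR was redirected, letting the unwinder match
frames exactly instead of by position. A simplified model of the lookup
ftrace_graph_ret_addr() performs; the struct and function names are
condensed from kernel/trace/fgraph.c, not the kernel's exact ones.

struct ret_stack_entry {
	unsigned long ret;	/* original return address */
	unsigned long *retp;	/* stack slot that held it */
};

/*
 * Walk the shadow stack from the deepest frame and return the original
 * address for the entry whose recorded stack slot matches 'retp'.
 */
static unsigned long graph_ret_addr(const struct ret_stack_entry *stk,
				    int depth, unsigned long ret,
				    unsigned long *retp)
{
	int i;

	/* Caller is assumed to have checked that 'ret' points at
	 * return_to_handler before asking for a fix-up. */
	for (i = depth - 1; i >= 0; i--)
		if (stk[i].retp == retp)
			return stk[i].ret;

	return ret;	/* not a hooked frame; address already genuine */
}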
From 7c1bb6bbf75d8ca5ec878627d3170effcaf54f27 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Thu, 5 Sep 2019 23:50:30 +0530
Subject: [PATCH 263/264] powerpc: Use ftrace_graph_ret_addr() when unwinding

With support for HAVE_FUNCTION_GRAPH_RET_ADDR_PTR,
ftrace_graph_ret_addr() provides more robust unwinding when the
function graph tracer is in use. Update show_stack() to use it.

With dump_stack() added to sysrq_sysctl_handler(), before this patch:
  root@(none):/sys/kernel/debug/tracing# cat /proc/sys/kernel/sysrq
  CPU: 0 PID: 218 Comm: cat Not tainted 5.3.0-rc7-00868-g8453ad4a078c-dirty #20
  Call Trace:
  [c0000000d1e13c30] [c00000000006ab98] return_to_handler+0x0/0x40 (dump_stack+0xe8/0x164) (unreliable)
  [c0000000d1e13c80] [c000000000145680] sysrq_sysctl_handler+0x48/0xb8
  [c0000000d1e13cd0] [c00000000006ab98] return_to_handler+0x0/0x40 (proc_sys_call_handler+0x274/0x2a0)
  [c0000000d1e13d60] [c00000000006ab98] return_to_handler+0x0/0x40 (return_to_handler+0x0/0x40)
  [c0000000d1e13d80] [c00000000006ab98] return_to_handler+0x0/0x40 (__vfs_read+0x3c/0x70)
  [c0000000d1e13dd0] [c00000000006ab98] return_to_handler+0x0/0x40 (vfs_read+0xb8/0x1b0)
  [c0000000d1e13e20] [c00000000006ab98] return_to_handler+0x0/0x40 (ksys_read+0x7c/0x140)

After this patch:
  Call Trace:
  [c0000000d1e33c30] [c00000000006ab58] return_to_handler+0x0/0x40 (dump_stack+0xe8/0x164) (unreliable)
  [c0000000d1e33c80] [c000000000145680] sysrq_sysctl_handler+0x48/0xb8
  [c0000000d1e33cd0] [c00000000006ab58] return_to_handler+0x0/0x40 (proc_sys_call_handler+0x274/0x2a0)
  [c0000000d1e33d60] [c00000000006ab58] return_to_handler+0x0/0x40 (__vfs_read+0x3c/0x70)
  [c0000000d1e33d80] [c00000000006ab58] return_to_handler+0x0/0x40 (vfs_read+0xb8/0x1b0)
  [c0000000d1e33dd0] [c00000000006ab58] return_to_handler+0x0/0x40 (ksys_read+0x7c/0x140)
  [c0000000d1e33e20] [c00000000006ab58] return_to_handler+0x0/0x40 (system_call+0x5c/0x68)

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/dc89c9a887121342d9c7819482c3dabdece2a323.1567707399.git.naveen.n.rao@linux.vnet.ibm.com
---
 arch/powerpc/kernel/process.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 24621e7e5033..f289bdd2b562 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -2047,10 +2047,8 @@ void show_stack(struct task_struct *tsk, unsigned long *stack)
 	int count = 0;
 	int firstframe = 1;
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	struct ftrace_ret_stack *ret_stack;
-	extern void return_to_handler(void);
-	unsigned long rth = (unsigned long)return_to_handler;
-	int curr_frame = 0;
+	unsigned long ret_addr;
+	int ftrace_idx = 0;
 #endif
 
 	if (tsk == NULL)
@@ -2079,15 +2077,10 @@ void show_stack(struct task_struct *tsk, unsigned long *stack)
 		if (!firstframe || ip != lr) {
 			printk("["REG"] ["REG"] %pS", sp, ip, (void *)ip);
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-			if ((ip == rth) && curr_frame >= 0) {
-				ret_stack = ftrace_graph_get_ret_stack(current,
-								  curr_frame++);
-				if (ret_stack)
-					pr_cont(" (%pS)",
-						(void *)ret_stack->ret);
-				else
-					curr_frame = -1;
-			}
+			ret_addr = ftrace_graph_ret_addr(current,
+						&ftrace_idx, ip, stack);
+			if (ret_addr != ip)
+				pr_cont(" (%pS)", (void *)ret_addr);
 #endif
 			if (firstframe)
 				pr_cont(" (unreliable)");

From d9101bfa6adc831bda8836c4d774820553c14942 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Wed, 18 Sep 2019 20:23:28 +0530
Subject: [PATCH 264/264] powerpc/mm/mce: Keep irqs disabled during lockless
 page table walk

__find_linux_pte() returns a page table entry pointer after walking
the page table without holding locks. To make it safe against a THP
split and/or collapse, we disable interrupts around the lockless page
table walk. However, we need to keep interrupts disabled for as long
as we use the page table entry pointer that is returned.

Fix addr_to_pfn() to do that.

Fixes: ba41e1e1ccb9 ("powerpc/mce: Hookup derror (load/store) UE errors")
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
[mpe: Rearrange code slightly and tweak change log wording]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190918145328.28602-1-aneesh.kumar@linux.ibm.com
---
 arch/powerpc/kernel/mce_power.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 356e7b99f661..1cbf7f1a4e3d 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -29,7 +29,7 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
 {
 	pte_t *ptep;
 	unsigned int shift;
-	unsigned long flags;
+	unsigned long pfn, flags;
 	struct mm_struct *mm;
 
 	if (user_mode(regs))
@@ -39,18 +39,22 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
 
 	local_irq_save(flags);
 	ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift);
-	local_irq_restore(flags);
 
-	if (!ptep || pte_special(*ptep))
-		return ULONG_MAX;
-
-	if (shift > PAGE_SHIFT) {
-		unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
-
-		return pte_pfn(__pte(pte_val(*ptep) | (addr & rpnmask)));
+	if (!ptep || pte_special(*ptep)) {
+		pfn = ULONG_MAX;
+		goto out;
 	}
 
-	return pte_pfn(*ptep);
+	if (shift <= PAGE_SHIFT)
+		pfn = pte_pfn(*ptep);
+	else {
+		unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
+		pfn = pte_pfn(__pte(pte_val(*ptep) | (addr & rpnmask)));
+	}
+
+out:
+	local_irq_restore(flags);
+	return pfn;
 }
 
 /* flush SLBs and reload */
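Condensed, the rule this fix enforces: a pte pointer from a lockless walk
is only valid inside the irq-disabled window, because a concurrent THP
split or collapse may free the page table once interrupts are re-enabled.
A kernel-context sketch of the corrected shape (not standalone; it mirrors
the addr_to_pfn() change above):

	local_irq_save(flags);
	ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift);
	if (ptep && !pte_special(*ptep))
		pfn = pte_pfn(*ptep);	/* dereference while IRQs are off */
	local_irq_restore(flags);	/* ptep must not be used past here */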