From 5290ae2b8e5fecc465e2fe92350ff02aa2e5acae Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Wed, 11 Dec 2019 13:35:52 +1100 Subject: [PATCH 001/131] powerpc/64: Use {SAVE,REST}_NVGPRS macros In entry_64.S there are places that open code saving and restoring the non-volatile registers. There are already macros for doing this so use them. Signed-off-by: Jordan Niethe Reviewed-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191211023552.16480-1-jniethe5@gmail.com --- arch/powerpc/kernel/entry_64.S | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 3fd3ef352e3f..0a82bfd44010 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -597,8 +597,7 @@ _GLOBAL(_switch) std r0,16(r1) stdu r1,-SWITCH_FRAME_SIZE(r1) /* r3-r13 are caller saved -- Cort */ - SAVE_8GPRS(14, r1) - SAVE_10GPRS(22, r1) + SAVE_NVGPRS(r1) std r0,_NIP(r1) /* Return to switch caller */ mfcr r23 std r23,_CCR(r1) @@ -722,8 +721,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) mtcrf 0xFF,r6 /* r3-r13 are destroyed -- Cort */ - REST_8GPRS(14, r1) - REST_10GPRS(22, r1) + REST_NVGPRS(r1) /* convert old thread to its task_struct for return value */ addi r3,r3,-THREAD @@ -1155,8 +1153,7 @@ _GLOBAL(enter_rtas) */ SAVE_GPR(2, r1) /* Save the TOC */ SAVE_GPR(13, r1) /* Save paca */ - SAVE_8GPRS(14, r1) /* Save the non-volatiles */ - SAVE_10GPRS(22, r1) /* ditto */ + SAVE_NVGPRS(r1) /* Save the non-volatiles */ mfcr r4 std r4,_CCR(r1) @@ -1263,8 +1260,7 @@ rtas_restore_regs: /* relocation is on at this point */ REST_GPR(2, r1) /* Restore the TOC */ REST_GPR(13, r1) /* Restore paca */ - REST_8GPRS(14, r1) /* Restore the non-volatiles */ - REST_10GPRS(22, r1) /* ditto */ + REST_NVGPRS(r1) /* Restore the non-volatiles */ GET_PACA(r13) @@ -1298,8 +1294,7 @@ _GLOBAL(enter_prom) */ SAVE_GPR(2, r1) SAVE_GPR(13, r1) - SAVE_8GPRS(14, r1) - SAVE_10GPRS(22, r1) + SAVE_NVGPRS(r1) mfcr r10 mfmsr r11 std r10,_CCR(r1) @@ -1343,8 +1338,7 @@ _GLOBAL(enter_prom) /* Restore other registers */ REST_GPR(2, r1) REST_GPR(13, r1) - REST_8GPRS(14, r1) - REST_10GPRS(22, r1) + REST_NVGPRS(r1) ld r4,_CCR(r1) mtcr r4 From c3aae14e5d468d18dbb5d7c0c8c7e2968cc14aad Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 9 Dec 2019 13:03:38 -0700 Subject: [PATCH 002/131] powerpc/44x: Adjust indentation in ibm4xx_denali_fixup_memsize Clang warns: ../arch/powerpc/boot/4xx.c:231:3: warning: misleading indentation; statement is not part of the previous 'else' [-Wmisleading-indentation] val = SDRAM0_READ(DDR0_42); ^ ../arch/powerpc/boot/4xx.c:227:2: note: previous statement is here else ^ This is because there is a space at the beginning of this line; remove it so that the indentation is consistent according to the Linux kernel coding style and clang no longer warns. Fixes: d23f5099297c ("[POWERPC] 4xx: Adds decoding of 440SPE memory size to boot wrapper library") Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Signed-off-by: Michael Ellerman Link: https://github.com/ClangBuiltLinux/linux/issues/780 Link: https://lore.kernel.org/r/20191209200338.12546-1-natechancellor@gmail.com --- arch/powerpc/boot/4xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/boot/4xx.c b/arch/powerpc/boot/4xx.c index 1699e9531552..00c4d843a023 100644 --- a/arch/powerpc/boot/4xx.c +++ b/arch/powerpc/boot/4xx.c @@ -228,7 +228,7 @@ void ibm4xx_denali_fixup_memsize(void) dpath = 8; /* 64 bits */ /* get address pins (rows) */ - val = SDRAM0_READ(DDR0_42); + val = SDRAM0_READ(DDR0_42); row = DDR_GET_VAL(val, DDR_APIN, DDR_APIN_SHIFT); if (row > max_row) From 1a3ec143a90a4674e01099c3ba47c3268536a462 Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Mon, 28 Oct 2019 21:08:16 +1100 Subject: [PATCH 003/131] powerpc/imc: Add documentation for IMC and trace-mode Documentation for IMC (In-Memory Collection Counters) infrastructure and trace-mode of IMC. Signed-off-by: Anju T Sudhakar [mpe: Convert to rst, minor rewording, make PMI example more concise] Signed-off-by: Michael Ellerman Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191028100816.6270-1-mpe@ellerman.id.au --- Documentation/powerpc/imc.rst | 199 ++++++++++++++++++++++++++++++++ Documentation/powerpc/index.rst | 1 + 2 files changed, 200 insertions(+) create mode 100644 Documentation/powerpc/imc.rst diff --git a/Documentation/powerpc/imc.rst b/Documentation/powerpc/imc.rst new file mode 100644 index 000000000000..633bcee7dc85 --- /dev/null +++ b/Documentation/powerpc/imc.rst @@ -0,0 +1,199 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. _imc: + +=================================== +IMC (In-Memory Collection Counters) +=================================== + +Anju T Sudhakar, 10 May 2019 + +.. contents:: + :depth: 3 + + +Basic overview +============== + +IMC (In-Memory collection counters) is a hardware monitoring facility that +collects large numbers of hardware performance events at Nest level (these are +on-chip but off-core), Core level and Thread level. + +The Nest PMU counters are handled by a Nest IMC microcode which runs in the OCC +(On-Chip Controller) complex. The microcode collects the counter data and moves +the nest IMC counter data to memory. + +The Core and Thread IMC PMU counters are handled in the core. Core level PMU +counters give us the IMC counters' data per core and thread level PMU counters +give us the IMC counters' data per CPU thread. + +OPAL obtains the IMC PMU and supported events information from the IMC Catalog +and passes on to the kernel via the device tree. The event's information +contains: + +- Event name +- Event Offset +- Event description + +and possibly also: + +- Event scale +- Event unit + +Some PMUs may have a common scale and unit values for all their supported +events. For those cases, the scale and unit properties for those events must be +inherited from the PMU. + +The event offset in the memory is where the counter data gets accumulated. + +IMC catalog is available at: + https://github.com/open-power/ima-catalog + +The kernel discovers the IMC counters information in the device tree at the +`imc-counters` device node which has a compatible field +`ibm,opal-in-memory-counters`. From the device tree, the kernel parses the PMUs +and their event's information and register the PMU and its attributes in the +kernel. + +IMC example usage +================= + +.. code-block:: sh + + # perf list + [...] + nest_mcs01/PM_MCS01_64B_RD_DISP_PORT01/ [Kernel PMU event] + nest_mcs01/PM_MCS01_64B_RD_DISP_PORT23/ [Kernel PMU event] + [...] + core_imc/CPM_0THRD_NON_IDLE_PCYC/ [Kernel PMU event] + core_imc/CPM_1THRD_NON_IDLE_INST/ [Kernel PMU event] + [...] + thread_imc/CPM_0THRD_NON_IDLE_PCYC/ [Kernel PMU event] + thread_imc/CPM_1THRD_NON_IDLE_INST/ [Kernel PMU event] + +To see per chip data for nest_mcs0/PM_MCS_DOWN_128B_DATA_XFER_MC0/: + +.. code-block:: sh + + # ./perf stat -e "nest_mcs01/PM_MCS01_64B_WR_DISP_PORT01/" -a --per-socket + +To see non-idle instructions for core 0: + +.. code-block:: sh + + # ./perf stat -e "core_imc/CPM_NON_IDLE_INST/" -C 0 -I 1000 + +To see non-idle instructions for a "make": + +.. code-block:: sh + + # ./perf stat -e "thread_imc/CPM_NON_IDLE_PCYC/" make + + +IMC Trace-mode +=============== + +POWER9 supports two modes for IMC which are the Accumulation mode and Trace +mode. In Accumulation mode, event counts are accumulated in system Memory. +Hypervisor then reads the posted counts periodically or when requested. In IMC +Trace mode, the 64 bit trace SCOM value is initialized with the event +information. The CPMCxSEL and CPMC_LOAD in the trace SCOM, specifies the event +to be monitored and the sampling duration. On each overflow in the CPMCxSEL, +hardware snapshots the program counter along with event counts and writes into +memory pointed by LDBAR. + +LDBAR is a 64 bit special purpose per thread register, it has bits to indicate +whether hardware is configured for accumulation or trace mode. + +LDBAR Register Layout +--------------------- + + +-------+----------------------+ + | 0 | Enable/Disable | + +-------+----------------------+ + | 1 | 0: Accumulation Mode | + | +----------------------+ + | | 1: Trace Mode | + +-------+----------------------+ + | 2:3 | Reserved | + +-------+----------------------+ + | 4-6 | PB scope | + +-------+----------------------+ + | 7 | Reserved | + +-------+----------------------+ + | 8:50 | Counter Address | + +-------+----------------------+ + | 51:63 | Reserved | + +-------+----------------------+ + +TRACE_IMC_SCOM bit representation +--------------------------------- + + +-------+------------+ + | 0:1 | SAMPSEL | + +-------+------------+ + | 2:33 | CPMC_LOAD | + +-------+------------+ + | 34:40 | CPMC1SEL | + +-------+------------+ + | 41:47 | CPMC2SEL | + +-------+------------+ + | 48:50 | BUFFERSIZE | + +-------+------------+ + | 51:63 | RESERVED | + +-------+------------+ + +CPMC_LOAD contains the sampling duration. SAMPSEL and CPMCxSEL determines the +event to count. BUFFERSIZE indicates the memory range. On each overflow, +hardware snapshots the program counter along with event counts and updates the +memory and reloads the CMPC_LOAD value for the next sampling duration. IMC +hardware does not support exceptions, so it quietly wraps around if memory +buffer reaches the end. + +*Currently the event monitored for trace-mode is fixed as cycle.* + +Trace IMC example usage +======================= + +.. code-block:: sh + + # perf list + [....] + trace_imc/trace_cycles/ [Kernel PMU event] + +To record an application/process with trace-imc event: + +.. code-block:: sh + + # perf record -e trace_imc/trace_cycles/ yes > /dev/null + [ perf record: Woken up 1 times to write data ] + [ perf record: Captured and wrote 0.012 MB perf.data (21 samples) ] + +The `perf.data` generated, can be read using perf report. + +Benefits of using IMC trace-mode +================================ + +PMI (Performance Monitoring Interrupts) interrupt handling is avoided, since IMC +trace mode snapshots the program counter and updates to the memory. And this +also provide a way for the operating system to do instruction sampling in real +time without PMI processing overhead. + +Performance data using `perf top` with and without trace-imc event. + +PMI interrupts count when `perf top` command is executed without trace-imc event. + +.. code-block:: sh + + # grep PMI /proc/interrupts + PMI: 0 0 0 0 Performance monitoring interrupts + # ./perf top + ... + # grep PMI /proc/interrupts + PMI: 39735 8710 17338 17801 Performance monitoring interrupts + # ./perf top -e trace_imc/trace_cycles/ + ... + # grep PMI /proc/interrupts + PMI: 39735 8710 17338 17801 Performance monitoring interrupts + + +That is, the PMI interrupt counts do not increment when using the `trace_imc` event. diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst index ba5edb3211c0..2df04259db65 100644 --- a/Documentation/powerpc/index.rst +++ b/Documentation/powerpc/index.rst @@ -18,6 +18,7 @@ powerpc elfnote firmware-assisted-dump hvcs + imc isa-versions kaslr-booke32 mpc52xx From 5eb7cfb3a2b178f3d443301cda0825bb9f475657 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 20 May 2019 20:20:51 +1000 Subject: [PATCH 004/131] selftests/powerpc: Add a test of bad (out-of-range) accesses Userspace isn't allowed to access certain address ranges, make sure we actually test that to at least some degree. This would have caught the recent bug where the SLB fault handler was incorrectly called on an out-of-range access when using the Radix MMU. It also would have caught the bug we had in get_region_id() where we were inserting SLB entries for bad addresses. Signed-off-by: Michael Ellerman Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190520102051.12103-1-mpe@ellerman.id.au --- tools/testing/selftests/powerpc/mm/.gitignore | 1 + tools/testing/selftests/powerpc/mm/Makefile | 3 +- .../selftests/powerpc/mm/bad_accesses.c | 171 ++++++++++++++++++ 3 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/mm/bad_accesses.c diff --git a/tools/testing/selftests/powerpc/mm/.gitignore b/tools/testing/selftests/powerpc/mm/.gitignore index 7101ffd08d66..0ebeaea22641 100644 --- a/tools/testing/selftests/powerpc/mm/.gitignore +++ b/tools/testing/selftests/powerpc/mm/.gitignore @@ -5,3 +5,4 @@ prot_sao segv_errors wild_bctr large_vm_fork_separation +bad_accesses diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile index ed1565809d2b..b9103c4bb414 100644 --- a/tools/testing/selftests/powerpc/mm/Makefile +++ b/tools/testing/selftests/powerpc/mm/Makefile @@ -3,7 +3,7 @@ noarg: $(MAKE) -C ../ TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors wild_bctr \ - large_vm_fork_separation + large_vm_fork_separation bad_accesses TEST_GEN_PROGS_EXTENDED := tlbie_test TEST_GEN_FILES := tempfile @@ -16,6 +16,7 @@ $(OUTPUT)/prot_sao: ../utils.c $(OUTPUT)/wild_bctr: CFLAGS += -m64 $(OUTPUT)/large_vm_fork_separation: CFLAGS += -m64 +$(OUTPUT)/bad_accesses: CFLAGS += -m64 $(OUTPUT)/tempfile: dd if=/dev/zero of=$@ bs=64k count=1 diff --git a/tools/testing/selftests/powerpc/mm/bad_accesses.c b/tools/testing/selftests/powerpc/mm/bad_accesses.c new file mode 100644 index 000000000000..adc465f499ef --- /dev/null +++ b/tools/testing/selftests/powerpc/mm/bad_accesses.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Copyright 2019, Michael Ellerman, IBM Corp. +// +// Test that out-of-bounds reads/writes behave as expected. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" + +// Old distros (Ubuntu 16.04 at least) don't define this +#ifndef SEGV_BNDERR +#define SEGV_BNDERR 3 +#endif + +// 64-bit kernel is always here +#define PAGE_OFFSET (0xcul << 60) + +static unsigned long kernel_virt_end; + +static volatile int fault_code; +static volatile unsigned long fault_addr; +static jmp_buf setjmp_env; + +static void segv_handler(int n, siginfo_t *info, void *ctxt_v) +{ + fault_code = info->si_code; + fault_addr = (unsigned long)info->si_addr; + siglongjmp(setjmp_env, 1); +} + +int bad_access(char *p, bool write) +{ + char x; + + fault_code = 0; + fault_addr = 0; + + if (sigsetjmp(setjmp_env, 1) == 0) { + if (write) + *p = 1; + else + x = *p; + + printf("Bad - no SEGV! (%c)\n", x); + return 1; + } + + // If we see MAPERR that means we took a page fault rather than an SLB + // miss. We only expect to take page faults for addresses within the + // valid kernel range. + FAIL_IF(fault_code == SEGV_MAPERR && \ + (fault_addr < PAGE_OFFSET || fault_addr >= kernel_virt_end)); + + FAIL_IF(fault_code != SEGV_MAPERR && fault_code != SEGV_BNDERR); + + return 0; +} + +static int using_hash_mmu(bool *using_hash) +{ + char line[128]; + FILE *f; + int rc; + + f = fopen("/proc/cpuinfo", "r"); + FAIL_IF(!f); + + rc = 0; + while (fgets(line, sizeof(line), f) != NULL) { + if (strcmp(line, "MMU : Hash\n") == 0) { + *using_hash = true; + goto out; + } + + if (strcmp(line, "MMU : Radix\n") == 0) { + *using_hash = false; + goto out; + } + } + + rc = -1; +out: + fclose(f); + return rc; +} + +static int test(void) +{ + unsigned long i, j, addr, region_shift, page_shift, page_size; + struct sigaction sig; + bool hash_mmu; + + sig = (struct sigaction) { + .sa_sigaction = segv_handler, + .sa_flags = SA_SIGINFO, + }; + + FAIL_IF(sigaction(SIGSEGV, &sig, NULL) != 0); + + FAIL_IF(using_hash_mmu(&hash_mmu)); + + page_size = sysconf(_SC_PAGESIZE); + if (page_size == (64 * 1024)) + page_shift = 16; + else + page_shift = 12; + + if (page_size == (64 * 1024) || !hash_mmu) { + region_shift = 52; + + // We have 7 512T regions (4 kernel linear, vmalloc, io, vmemmap) + kernel_virt_end = PAGE_OFFSET + (7 * (512ul << 40)); + } else if (page_size == (4 * 1024) && hash_mmu) { + region_shift = 46; + + // We have 7 64T regions (4 kernel linear, vmalloc, io, vmemmap) + kernel_virt_end = PAGE_OFFSET + (7 * (64ul << 40)); + } else + FAIL_IF(true); + + printf("Using %s MMU, PAGE_SIZE = %dKB start address 0x%016lx\n", + hash_mmu ? "hash" : "radix", + (1 << page_shift) >> 10, + 1ul << region_shift); + + // This generates access patterns like: + // 0x0010000000000000 + // 0x0010000000010000 + // 0x0010000000020000 + // ... + // 0x0014000000000000 + // 0x0018000000000000 + // 0x0020000000000000 + // 0x0020000000010000 + // 0x0020000000020000 + // ... + // 0xf400000000000000 + // 0xf800000000000000 + + for (i = 1; i <= ((0xful << 60) >> region_shift); i++) { + for (j = page_shift - 1; j < 60; j++) { + unsigned long base, delta; + + base = i << region_shift; + delta = 1ul << j; + + if (delta >= base) + break; + + addr = (base | delta) & ~((1 << page_shift) - 1); + + FAIL_IF(bad_access((char *)addr, false)); + FAIL_IF(bad_access((char *)addr, true)); + } + } + + return 0; +} + +int main(void) +{ + return test_harness(test, "bad_accesses"); +} From 0eb59382dff23910e7104c397b617fb0fede538e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 2 Dec 2019 12:08:55 +0530 Subject: [PATCH 005/131] powerpc/papr_scm: Update debug message Resource struct p->res is assigned later. Avoid using %pR before the resource struct is assigned. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191202063855.154321-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/platforms/pseries/papr_scm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index c2ef320ba1bf..cdd316c6cef3 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -69,7 +69,8 @@ static int drc_pmem_bind(struct papr_scm_priv *p) return rc; p->bound_addr = saved; - dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index, &p->res); + dev_dbg(&p->pdev->dev, "bound drc 0x%x to 0x%lx\n", + p->drc_index, (unsigned long)saved); return rc; } @@ -133,7 +134,7 @@ static int drc_pmem_query_n_bind(struct papr_scm_priv *p) goto err_out; p->bound_addr = start_addr; - dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index, &p->res); + dev_dbg(&p->pdev->dev, "bound drc 0x%x to 0x%lx\n", p->drc_index, start_addr); return rc; err_out: From 3b5b9997b331e77ce967eba2c4bc80dc3134a7fe Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Mon, 28 Oct 2019 19:54:22 +1100 Subject: [PATCH 006/131] powerpc/powernv/iov: Ensure the pdn for VFs always contains a valid PE number On pseries there is a bug with adding hotplugged devices to an IOMMU group. For a number of dumb reasons fixing that bug first requires re-working how VFs are configured on PowerNV. For background, on PowerNV we use the pcibios_sriov_enable() hook to do two things: 1. Create a pci_dn structure for each of the VFs, and 2. Configure the PHB's internal BARs so the MMIO range for each VF maps to a unique PE. Roughly speaking a PE is the hardware counterpart to a Linux IOMMU group since all the devices in a PE share the same IOMMU table. A PE also defines the set of devices that should be isolated in response to a PCI error (i.e. bad DMA, UR/CA, AER events, etc). When isolated all MMIO and DMA traffic to and from devicein the PE is blocked by the root complex until the PE is recovered by the OS. The requirement to block MMIO causes a giant headache because the P8 PHB generally uses a fixed mapping between MMIO addresses and PEs. As a result we need to delay configuring the IOMMU groups for device until after MMIO resources are assigned. For physical devices (i.e. non-VFs) the PE assignment is done in pcibios_setup_bridge() which is called immediately after the MMIO resources for downstream devices (and the bridge's windows) are assigned. For VFs the setup is more complicated because: a) pcibios_setup_bridge() is not called again when VFs are activated, and b) The pci_dev for VFs are created by generic code which runs after pcibios_sriov_enable() is called. The work around for this is a two step process: 1. A fixup in pcibios_add_device() is used to initialised the cached pe_number in pci_dn, then 2. A bus notifier then adds the device to the IOMMU group for the PE specified in pci_dn->pe_number. A side effect fixing the pseries bug mentioned in the first paragraph is moving the fixup out of pcibios_add_device() and into pcibios_bus_add_device(), which is called much later. This results in step 2. failing because pci_dn->pe_number won't be initialised when the bus notifier is run. We can fix this by removing the need for the fixup. The PE for a VF is known before the VF is even scanned so we can initialise pci_dn->pe_number pcibios_sriov_enable() instead. Unfortunately, moving the initialisation causes two problems: 1. We trip the WARN_ON() in the current fixup code, and 2. The EEH core clears pdn->pe_number when recovering a VF and relies on the fixup to correctly re-set it. The only justification for either of these is a comment in eeh_rmv_device() suggesting that pdn->pe_number *must* be set to IODA_INVALID_PE in order for the VF to be scanned. However, this comment appears to have no basis in reality. Both bugs can be fixed by just deleting the code. Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191028085424.12006-1-oohall@gmail.com --- arch/powerpc/kernel/eeh_driver.c | 6 ------ arch/powerpc/platforms/powernv/pci-ioda.c | 19 +++++++++++++++---- arch/powerpc/platforms/powernv/pci.c | 4 ---- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 3dd1a422fc29..a1eaffe868de 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -525,12 +525,6 @@ static void eeh_rmv_device(struct eeh_dev *edev, void *userdata) pci_iov_remove_virtfn(edev->physfn, pdn->vf_index); edev->pdev = NULL; - - /* - * We have to set the VF PE number to invalid one, which is - * required to plug the VF successfully. - */ - pdn->pe_number = IODA_INVALID_PE; #endif if (rmv_data) list_add(&edev->rmv_entry, &rmv_data->removed_vf_list); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index da1068a9c263..4374836b033b 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1558,6 +1558,10 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) /* Reserve PE for each VF */ for (vf_index = 0; vf_index < num_vfs; vf_index++) { + int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index); + int vf_bus = pci_iov_virtfn_bus(pdev, vf_index); + struct pci_dn *vf_pdn; + if (pdn->m64_single_mode) pe_num = pdn->pe_num_map[vf_index]; else @@ -1570,13 +1574,11 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) pe->pbus = NULL; pe->parent_dev = pdev; pe->mve_number = -1; - pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) | - pci_iov_virtfn_devfn(pdev, vf_index); + pe->rid = (vf_bus << 8) | vf_devfn; pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n", hose->global_number, pdev->bus->number, - PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)), - PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num); + PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num); if (pnv_ioda_configure_pe(phb, pe)) { /* XXX What do we do here ? */ @@ -1590,6 +1592,15 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) list_add_tail(&pe->list, &phb->ioda.pe_list); mutex_unlock(&phb->ioda.pe_list_mutex); + /* associate this pe to it's pdn */ + list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) { + if (vf_pdn->busno == vf_bus && + vf_pdn->devfn == vf_devfn) { + vf_pdn->pe_number = pe_num; + break; + } + } + pnv_pci_ioda2_setup_dma_pe(phb, pe); #ifdef CONFIG_IOMMU_API iommu_register_group(&pe->table_group, diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index c0bea75ac27b..e8e58a2cccdd 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -816,16 +816,12 @@ void pnv_pci_dma_dev_setup(struct pci_dev *pdev) struct pnv_phb *phb = hose->private_data; #ifdef CONFIG_PCI_IOV struct pnv_ioda_pe *pe; - struct pci_dn *pdn; /* Fix the VF pdn PE number */ if (pdev->is_virtfn) { - pdn = pci_get_pdn(pdev); - WARN_ON(pdn->pe_number != IODA_INVALID_PE); list_for_each_entry(pe, &phb->ioda.pe_list, list) { if (pe->rid == ((pdev->bus->number << 8) | (pdev->devfn & 0xff))) { - pdn->pe_number = pe->pe_number; pe->pdev = pdev; break; } From 30d87ef8b38d471ab5c0f1226926ebd856da8647 Mon Sep 17 00:00:00 2001 From: Shawn Anastasio Date: Mon, 28 Oct 2019 19:54:23 +1100 Subject: [PATCH 007/131] powerpc/pci: Fix pcibios_setup_device() ordering Move PCI device setup from pcibios_add_device() and pcibios_fixup_bus() to pcibios_bus_add_device(). This ensures that platform-specific DMA and IOMMU setup occurs after the device has been registered in sysfs, which is a requirement for IOMMU group assignment to work This fixes IOMMU group assignment for hotplugged devices on pseries, where the existing behavior results in IOMMU assignment before registration. Thanks to Lukas Wunner for the suggestion. Signed-off-by: Shawn Anastasio Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191028085424.12006-2-oohall@gmail.com --- arch/powerpc/kernel/pci-common.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 1c448cf25506..b89925ede935 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -261,12 +261,6 @@ int pcibios_sriov_disable(struct pci_dev *pdev) #endif /* CONFIG_PCI_IOV */ -void pcibios_bus_add_device(struct pci_dev *pdev) -{ - if (ppc_md.pcibios_bus_add_device) - ppc_md.pcibios_bus_add_device(pdev); -} - static resource_size_t pcibios_io_size(const struct pci_controller *hose) { #ifdef CONFIG_PPC64 @@ -987,15 +981,17 @@ static void pcibios_setup_device(struct pci_dev *dev) ppc_md.pci_irq_fixup(dev); } +void pcibios_bus_add_device(struct pci_dev *pdev) +{ + /* Perform platform-specific device setup */ + pcibios_setup_device(pdev); + + if (ppc_md.pcibios_bus_add_device) + ppc_md.pcibios_bus_add_device(pdev); +} + int pcibios_add_device(struct pci_dev *dev) { - /* - * We can only call pcibios_setup_device() after bus setup is complete, - * since some of the platform specific DMA setup code depends on it. - */ - if (dev->bus->is_added) - pcibios_setup_device(dev); - #ifdef CONFIG_PCI_IOV if (ppc_md.pcibios_fixup_sriov) ppc_md.pcibios_fixup_sriov(dev); @@ -1037,9 +1033,6 @@ void pcibios_fixup_bus(struct pci_bus *bus) /* Now fixup the bus bus */ pcibios_setup_bus_self(bus); - - /* Now fixup devices on that bus */ - pcibios_setup_bus_devices(bus); } EXPORT_SYMBOL(pcibios_fixup_bus); From 1c7f4fe86f17f296bafab1d418c040533324bf4f Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Mon, 28 Oct 2019 19:54:24 +1100 Subject: [PATCH 008/131] powerpc/pci: Remove pcibios_setup_bus_devices() With the previous patch applied pcibios_setup_device() will always be run when pcibios_bus_add_device() is called. There are several code paths where pcibios_setup_bus_device() is still called (the PowerPC specific PCI hotplug support is one) so with just the previous patch applied the setup can be run multiple times on a device, once before the device is added to the bus and once after. There's no need to run the setup in the early case any more so just remove it entirely. Signed-off-by: Oliver O'Halloran Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191028085424.12006-3-oohall@gmail.com --- arch/powerpc/include/asm/pci.h | 1 - arch/powerpc/kernel/pci-common.c | 25 ------------------------- arch/powerpc/kernel/pci-hotplug.c | 1 - arch/powerpc/kernel/pci_of_scan.c | 1 - 4 files changed, 28 deletions(-) diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 327567b8f7d6..63ed7e3b0ba3 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -113,7 +113,6 @@ extern pgprot_t pci_phys_mem_access_prot(struct file *file, pgprot_t prot); extern resource_size_t pcibios_io_space_offset(struct pci_controller *hose); -extern void pcibios_setup_bus_devices(struct pci_bus *bus); extern void pcibios_setup_bus_self(struct pci_bus *bus); extern void pcibios_setup_phb_io_space(struct pci_controller *hose); extern void pcibios_scan_phb(struct pci_controller *hose); diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index b89925ede935..f8a59d7b724c 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1000,24 +1000,6 @@ int pcibios_add_device(struct pci_dev *dev) return 0; } -void pcibios_setup_bus_devices(struct pci_bus *bus) -{ - struct pci_dev *dev; - - pr_debug("PCI: Fixup bus devices %d (%s)\n", - bus->number, bus->self ? pci_name(bus->self) : "PHB"); - - list_for_each_entry(dev, &bus->devices, bus_list) { - /* Cardbus can call us to add new devices to a bus, so ignore - * those who are already fully discovered - */ - if (pci_dev_is_added(dev)) - continue; - - pcibios_setup_device(dev); - } -} - void pcibios_set_master(struct pci_dev *dev) { /* No special bus mastering setup handling */ @@ -1036,13 +1018,6 @@ void pcibios_fixup_bus(struct pci_bus *bus) } EXPORT_SYMBOL(pcibios_fixup_bus); -void pci_fixup_cardbus(struct pci_bus *bus) -{ - /* Now fixup devices on that bus */ - pcibios_setup_bus_devices(bus); -} - - static int skip_isa_ioresource_align(struct pci_dev *dev) { if (pci_has_flag(PCI_CAN_SKIP_ISA_ALIGN) && diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c index fc62c4bc47b1..d6a67f814983 100644 --- a/arch/powerpc/kernel/pci-hotplug.c +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -134,7 +134,6 @@ void pci_hp_add_devices(struct pci_bus *bus) */ slotno = PCI_SLOT(PCI_DN(dn->child)->devfn); pci_scan_slot(bus, PCI_DEVFN(slotno, 0)); - pcibios_setup_bus_devices(bus); max = bus->busn_res.start; /* * Scan bridges that are already configured. We don't touch diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c index f91d7e94872e..c3024f104765 100644 --- a/arch/powerpc/kernel/pci_of_scan.c +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -414,7 +414,6 @@ static void __of_scan_bus(struct device_node *node, struct pci_bus *bus, */ if (!rescan_existing) pcibios_setup_bus_self(bus); - pcibios_setup_bus_devices(bus); /* Now scan child busses */ for_each_pci_bridge(dev, bus) From fb185a4052b18c97ebc98f6a8db30a60abca35e0 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 17 Dec 2019 09:37:30 +0200 Subject: [PATCH 009/131] powerpc/512x: Use dma_request_chan() instead dma_request_slave_channel() dma_request_slave_channel() is a wrapper on top of dma_request_chan() eating up the error code. By using dma_request_chan() directly the driver can support deferred probing against DMA. Signed-off-by: Peter Ujfalusi Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191217073730.21249-1-peter.ujfalusi@ti.com --- arch/powerpc/platforms/512x/mpc512x_lpbfifo.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/512x/mpc512x_lpbfifo.c b/arch/powerpc/platforms/512x/mpc512x_lpbfifo.c index 13631f35cd14..04bf6ecf7d55 100644 --- a/arch/powerpc/platforms/512x/mpc512x_lpbfifo.c +++ b/arch/powerpc/platforms/512x/mpc512x_lpbfifo.c @@ -434,9 +434,9 @@ static int mpc512x_lpbfifo_probe(struct platform_device *pdev) memset(&lpbfifo, 0, sizeof(struct lpbfifo_data)); spin_lock_init(&lpbfifo.lock); - lpbfifo.chan = dma_request_slave_channel(&pdev->dev, "rx-tx"); - if (lpbfifo.chan == NULL) - return -EPROBE_DEFER; + lpbfifo.chan = dma_request_chan(&pdev->dev, "rx-tx"); + if (IS_ERR(lpbfifo.chan)) + return PTR_ERR(lpbfifo.chan); if (of_address_to_resource(pdev->dev.of_node, 0, &r) != 0) { dev_err(&pdev->dev, "bad 'reg' in 'sclpc' device tree node\n"); From 4a8e274e2d8cc5628d3027be0900e8835a2dfa7b Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 19 Dec 2019 23:26:38 +1100 Subject: [PATCH 010/131] powerpc/pseries: Remove redundant select of PPC_DOORBELL Commit d4e58e5928f8 ("powerpc/powernv: Enable POWER8 doorbell IPIs") added a select of PPC_DOORBELL to PPC_PSERIES, but it already had a select of PPC_DOORBELL. One is enough. Reported-by: Jason A. Donenfeld Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191219125840.32592-1-mpe@ellerman.id.au --- arch/powerpc/platforms/pseries/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 595e9f8a6539..24c18362e5ea 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -21,7 +21,6 @@ config PPC_PSERIES select PPC_DOORBELL select HOTPLUG_CPU select ARCH_RANDOM - select PPC_DOORBELL select FORCE_SMP select SWIOTLB default y From d862b44133b7a1d7de25288e09eabf4df415e971 Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Mon, 16 Dec 2019 15:19:21 +1100 Subject: [PATCH 011/131] Revert "powerpc/pseries/iommu: Don't use dma_iommu_ops on secure guests" This reverts commit edea902c1c1efb855f77e041f9daf1abe7a9768a. At the time the change allowed direct DMA ops for secure VMs; however since then we switched on using SWIOTLB backed with IOMMU (direct mapping) and to make this work, we need dma_iommu_ops which handles all cases including TCE mapping I/O pages in the presence of an IOMMU. Fixes: edea902c1c1e ("powerpc/pseries/iommu: Don't use dma_iommu_ops on secure guests") Signed-off-by: Ram Pai [aik: added "revert" and "fixes:"] Signed-off-by: Alexey Kardashevskiy Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191216041924.42318-2-aik@ozlabs.ru --- arch/powerpc/platforms/pseries/iommu.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 6ba081dd61c9..df7db33ca93b 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -36,7 +36,6 @@ #include #include #include -#include #include "pseries.h" @@ -1320,15 +1319,7 @@ void iommu_init_early_pSeries(void) of_reconfig_notifier_register(&iommu_reconfig_nb); register_memory_notifier(&iommu_mem_nb); - /* - * Secure guest memory is inacessible to devices so regular DMA isn't - * possible. - * - * In that case keep devices' dma_map_ops as NULL so that the generic - * DMA code path will use SWIOTLB to bounce buffers for DMA. - */ - if (!is_secure_guest()) - set_pci_dma_ops(&dma_iommu_ops); + set_pci_dma_ops(&dma_iommu_ops); } static int __init disable_multitce(char *str) From 7559d3d295f3365ea7ac0c0274c05e633fe4f594 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 16 Dec 2019 15:19:22 +1100 Subject: [PATCH 012/131] powerpc/pseries: Allow not having ibm, hypertas-functions::hcall-multi-tce for DDW By default a pseries guest supports a H_PUT_TCE hypercall which maps a single IOMMU page in a DMA window. Additionally the hypervisor may support H_PUT_TCE_INDIRECT/H_STUFF_TCE which update multiple TCEs at once; this is advertised via the device tree /rtas/ibm,hypertas-functions property which Linux converts to FW_FEATURE_MULTITCE. FW_FEATURE_MULTITCE is checked when dma_iommu_ops is used; however the code managing the huge DMA window (DDW) ignores it and calls H_PUT_TCE_INDIRECT even if it is explicitly disabled via the "multitce=off" kernel command line parameter. This adds FW_FEATURE_MULTITCE checking to the DDW code path. This changes tce_build_pSeriesLP to take liobn and page size as the huge window does not have iommu_table descriptor which usually the place to store these numbers. Fixes: 4e8b0cf46b25 ("powerpc/pseries: Add support for dynamic dma windows") Signed-off-by: Alexey Kardashevskiy Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191216041924.42318-3-aik@ozlabs.ru --- arch/powerpc/platforms/pseries/iommu.c | 43 +++++++++++++++++--------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index df7db33ca93b..b4ce9d472dfe 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -132,10 +132,10 @@ static unsigned long tce_get_pseries(struct iommu_table *tbl, long index) return be64_to_cpu(*tcep); } -static void tce_free_pSeriesLP(struct iommu_table*, long, long); +static void tce_free_pSeriesLP(unsigned long liobn, long, long); static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long); -static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, +static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, long npages, unsigned long uaddr, enum dma_data_direction direction, unsigned long attrs) @@ -146,25 +146,25 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, int ret = 0; long tcenum_start = tcenum, npages_start = npages; - rpn = __pa(uaddr) >> TCE_SHIFT; + rpn = __pa(uaddr) >> tceshift; proto_tce = TCE_PCI_READ; if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; while (npages--) { - tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; - rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce); + tce = proto_tce | (rpn & TCE_RPN_MASK) << tceshift; + rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce); if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; - tce_free_pSeriesLP(tbl, tcenum_start, + tce_free_pSeriesLP(liobn, tcenum_start, (npages_start - (npages + 1))); break; } if (rc && printk_ratelimit()) { printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); - printk("\tindex = 0x%llx\n", (u64)tbl->it_index); + printk("\tindex = 0x%llx\n", (u64)liobn); printk("\ttcenum = 0x%llx\n", (u64)tcenum); printk("\ttce val = 0x%llx\n", tce ); dump_stack(); @@ -193,7 +193,8 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, unsigned long flags; if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) { - return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + return tce_build_pSeriesLP(tbl->it_index, tcenum, + tbl->it_page_shift, npages, uaddr, direction, attrs); } @@ -209,8 +210,9 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, /* If allocation fails, fall back to the loop implementation */ if (!tcep) { local_irq_restore(flags); - return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, - direction, attrs); + return tce_build_pSeriesLP(tbl->it_index, tcenum, + tbl->it_page_shift, + npages, uaddr, direction, attrs); } __this_cpu_write(tce_page, tcep); } @@ -261,16 +263,16 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, return ret; } -static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages) +static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long npages) { u64 rc; while (npages--) { - rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, 0); + rc = plpar_tce_put((u64)liobn, (u64)tcenum << 12, 0); if (rc && printk_ratelimit()) { printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); - printk("\tindex = 0x%llx\n", (u64)tbl->it_index); + printk("\tindex = 0x%llx\n", (u64)liobn); printk("\ttcenum = 0x%llx\n", (u64)tcenum); dump_stack(); } @@ -285,7 +287,7 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n u64 rc; if (!firmware_has_feature(FW_FEATURE_MULTITCE)) - return tce_free_pSeriesLP(tbl, tcenum, npages); + return tce_free_pSeriesLP(tbl->it_index, tcenum, npages); rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages); @@ -400,6 +402,19 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, u64 rc = 0; long l, limit; + if (!firmware_has_feature(FW_FEATURE_MULTITCE)) { + unsigned long tceshift = be32_to_cpu(maprange->tce_shift); + unsigned long dmastart = (start_pfn << PAGE_SHIFT) + + be64_to_cpu(maprange->dma_base); + unsigned long tcenum = dmastart >> tceshift; + unsigned long npages = num_pfn << PAGE_SHIFT >> tceshift; + void *uaddr = __va(start_pfn << PAGE_SHIFT); + + return tce_build_pSeriesLP(be32_to_cpu(maprange->liobn), + tcenum, tceshift, npages, (unsigned long) uaddr, + DMA_BIDIRECTIONAL, 0); + } + local_irq_disable(); /* to protect tcep and the page behind it */ tcep = __this_cpu_read(tce_page); From 17a0364cb07c173f64cefe973a93b8dbd9c61795 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 16 Dec 2019 15:19:23 +1100 Subject: [PATCH 013/131] powerpc/pseries/iommu: Separate FW_FEATURE_MULTITCE to put/stuff features H_PUT_TCE_INDIRECT allows packing up to 512 TCE updates into a single hypercall; H_STUFF_TCE can clear lots in a single hypercall too. However, unlike H_STUFF_TCE (which writes the same TCE to all entries), H_PUT_TCE_INDIRECT uses a 4K page with new TCEs. In a secure VM environment this means sharing a secure VM page with a hypervisor which we would rather avoid. This splits the FW_FEATURE_MULTITCE feature into FW_FEATURE_PUT_TCE_IND and FW_FEATURE_STUFF_TCE. "hcall-multi-tce" in the "/rtas/ibm,hypertas-functions" device tree property sets both; the "multitce=off" kernel command line parameter disables both. This should not cause behavioural change. Signed-off-by: Alexey Kardashevskiy Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191216041924.42318-4-aik@ozlabs.ru --- arch/powerpc/include/asm/firmware.h | 6 ++++-- arch/powerpc/platforms/pseries/firmware.c | 3 ++- arch/powerpc/platforms/pseries/iommu.c | 12 +++++++----- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index b3e214a97f3a..ca33f4ef6cb4 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -33,7 +33,7 @@ #define FW_FEATURE_LLAN ASM_CONST(0x0000000000010000) #define FW_FEATURE_BULK_REMOVE ASM_CONST(0x0000000000020000) #define FW_FEATURE_XDABR ASM_CONST(0x0000000000040000) -#define FW_FEATURE_MULTITCE ASM_CONST(0x0000000000080000) +#define FW_FEATURE_PUT_TCE_IND ASM_CONST(0x0000000000080000) #define FW_FEATURE_SPLPAR ASM_CONST(0x0000000000100000) #define FW_FEATURE_LPAR ASM_CONST(0x0000000000400000) #define FW_FEATURE_PS3_LV1 ASM_CONST(0x0000000000800000) @@ -51,6 +51,7 @@ #define FW_FEATURE_BLOCK_REMOVE ASM_CONST(0x0000001000000000) #define FW_FEATURE_PAPR_SCM ASM_CONST(0x0000002000000000) #define FW_FEATURE_ULTRAVISOR ASM_CONST(0x0000004000000000) +#define FW_FEATURE_STUFF_TCE ASM_CONST(0x0000008000000000) #ifndef __ASSEMBLY__ @@ -63,7 +64,8 @@ enum { FW_FEATURE_MIGRATE | FW_FEATURE_PERFMON | FW_FEATURE_CRQ | FW_FEATURE_VIO | FW_FEATURE_RDMA | FW_FEATURE_LLAN | FW_FEATURE_BULK_REMOVE | FW_FEATURE_XDABR | - FW_FEATURE_MULTITCE | FW_FEATURE_SPLPAR | FW_FEATURE_LPAR | + FW_FEATURE_PUT_TCE_IND | FW_FEATURE_STUFF_TCE | + FW_FEATURE_SPLPAR | FW_FEATURE_LPAR | FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO | FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY | FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN | diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index d4a8f1702417..d3acff23f2e3 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -55,7 +55,8 @@ hypertas_fw_features_table[] = { {FW_FEATURE_LLAN, "hcall-lLAN"}, {FW_FEATURE_BULK_REMOVE, "hcall-bulk"}, {FW_FEATURE_XDABR, "hcall-xdabr"}, - {FW_FEATURE_MULTITCE, "hcall-multi-tce"}, + {FW_FEATURE_PUT_TCE_IND | FW_FEATURE_STUFF_TCE, + "hcall-multi-tce"}, {FW_FEATURE_SPLPAR, "hcall-splpar"}, {FW_FEATURE_VPHN, "hcall-vphn"}, {FW_FEATURE_SET_MODE, "hcall-set-mode"}, diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index b4ce9d472dfe..2e0a8eab5588 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -192,7 +192,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, int ret = 0; unsigned long flags; - if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) { + if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) { return tce_build_pSeriesLP(tbl->it_index, tcenum, tbl->it_page_shift, npages, uaddr, direction, attrs); @@ -286,7 +286,7 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n { u64 rc; - if (!firmware_has_feature(FW_FEATURE_MULTITCE)) + if (!firmware_has_feature(FW_FEATURE_STUFF_TCE)) return tce_free_pSeriesLP(tbl->it_index, tcenum, npages); rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages); @@ -402,7 +402,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, u64 rc = 0; long l, limit; - if (!firmware_has_feature(FW_FEATURE_MULTITCE)) { + if (!firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) { unsigned long tceshift = be32_to_cpu(maprange->tce_shift); unsigned long dmastart = (start_pfn << PAGE_SHIFT) + be64_to_cpu(maprange->dma_base); @@ -1341,9 +1341,11 @@ static int __init disable_multitce(char *str) { if (strcmp(str, "off") == 0 && firmware_has_feature(FW_FEATURE_LPAR) && - firmware_has_feature(FW_FEATURE_MULTITCE)) { + (firmware_has_feature(FW_FEATURE_PUT_TCE_IND) || + firmware_has_feature(FW_FEATURE_STUFF_TCE))) { printk(KERN_INFO "Disabling MULTITCE firmware feature\n"); - powerpc_firmware_features &= ~FW_FEATURE_MULTITCE; + powerpc_firmware_features &= + ~(FW_FEATURE_PUT_TCE_IND | FW_FEATURE_STUFF_TCE); } return 1; } From 978bff4e521dab0a72ee4f2627f9245f7d3d6f64 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 16 Dec 2019 15:19:24 +1100 Subject: [PATCH 014/131] powerpc/pseries/svm: Allow IOMMU to work in SVM H_PUT_TCE_INDIRECT uses a shared page to send up to 512 TCE to a hypervisor in a single hypercall. This does not work for secure VMs as the page needs to be shared or the VM should use H_PUT_TCE instead. This disables H_PUT_TCE_INDIRECT by clearing the FW_FEATURE_PUT_TCE_IND feature bit so SVMs will map TCEs using H_PUT_TCE. This is not a part of init_svm() as it is called too late after FW patching is done and may result in a warning like this: [ 3.727716] Firmware features changed after feature patching! [ 3.727965] WARNING: CPU: 0 PID: 1 at (...)arch/powerpc/lib/feature-fixups.c:466 check_features+0xa4/0xc0 Signed-off-by: Alexey Kardashevskiy Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191216041924.42318-5-aik@ozlabs.ru --- arch/powerpc/platforms/pseries/firmware.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index d3acff23f2e3..3e49cc23a97a 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "pseries.h" @@ -101,6 +102,12 @@ static void __init fw_hypertas_feature_init(const char *hypertas, } } + if (is_secure_guest() && + (powerpc_firmware_features & FW_FEATURE_PUT_TCE_IND)) { + powerpc_firmware_features &= ~FW_FEATURE_PUT_TCE_IND; + pr_debug("SVM: disabling PUT_TCE_IND firmware feature\n"); + } + pr_debug(" <- fw_hypertas_feature_init()\n"); } From 3a9d970f17e05a7b26f782beb8f7f2118d1741ea Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 19 Dec 2019 16:16:02 +0100 Subject: [PATCH 015/131] powerpc/85xx: Get twr_p102x to compile again With CONFIG_QUICC_ENGINE enabled and CONFIG_UCC_GETH + CONFIG_SERIAL_QE disabled we have an unused variable (np). The code won't compile with -Werror. Move the np variable to the block where it is actually used. Signed-off-by: Sebastian Andrzej Siewior Acked-by: Scott Wood Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191219151602.1908411-1-bigeasy@linutronix.de --- arch/powerpc/platforms/85xx/twr_p102x.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/85xx/twr_p102x.c b/arch/powerpc/platforms/85xx/twr_p102x.c index 6c3c0cdaee9a..b301ef9d6ce7 100644 --- a/arch/powerpc/platforms/85xx/twr_p102x.c +++ b/arch/powerpc/platforms/85xx/twr_p102x.c @@ -60,10 +60,6 @@ static void __init twr_p1025_pic_init(void) */ static void __init twr_p1025_setup_arch(void) { -#ifdef CONFIG_QUICC_ENGINE - struct device_node *np; -#endif - if (ppc_md.progress) ppc_md.progress("twr_p1025_setup_arch()", 0); @@ -77,6 +73,7 @@ static void __init twr_p1025_setup_arch(void) #if IS_ENABLED(CONFIG_UCC_GETH) || IS_ENABLED(CONFIG_SERIAL_QE) if (machine_is(twr_p1025)) { struct ccsr_guts __iomem *guts; + struct device_node *np; np = of_find_compatible_node(NULL, NULL, "fsl,p1021-guts"); if (np) { From 5084ff33cac0988c1b979814501dcc2e1ecbf9c0 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 1 Jan 2020 08:43:27 +0100 Subject: [PATCH 016/131] powerpc/mpic: constify copied structure The mpic_ipi_chip and mpic_irq_ht_chip structures are only copied into other structures, so make them const. The opportunity for this change was found using Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1577864614-5543-10-git-send-email-Julia.Lawall@inria.fr --- arch/powerpc/sysdev/mpic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index 934a77324f6b..a3a72b780e67 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -964,7 +964,7 @@ static struct irq_chip mpic_irq_chip = { }; #ifdef CONFIG_SMP -static struct irq_chip mpic_ipi_chip = { +static const struct irq_chip mpic_ipi_chip = { .irq_mask = mpic_mask_ipi, .irq_unmask = mpic_unmask_ipi, .irq_eoi = mpic_end_ipi, @@ -978,7 +978,7 @@ static struct irq_chip mpic_tm_chip = { }; #ifdef CONFIG_MPIC_U3_HT_IRQS -static struct irq_chip mpic_irq_ht_chip = { +static const struct irq_chip mpic_irq_ht_chip = { .irq_startup = mpic_startup_ht_irq, .irq_shutdown = mpic_shutdown_ht_irq, .irq_mask = mpic_mask_irq, From bfbe37f0ce994e7a9945653d7624fadc5c500a9f Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 1 Jan 2020 18:49:45 +0100 Subject: [PATCH 017/131] powerpc/83xx: use resource_size Use resource_size rather than a verbose computation on the end and start fields. The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) @@ struct resource ptr; @@ - (ptr.end - ptr.start + 1) + resource_size(&ptr) Signed-off-by: Julia Lawall Acked-by: Scott Wood Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1577900990-8588-6-git-send-email-Julia.Lawall@inria.fr --- arch/powerpc/platforms/83xx/km83xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/83xx/km83xx.c b/arch/powerpc/platforms/83xx/km83xx.c index 273145aed90a..b0d5471f620d 100644 --- a/arch/powerpc/platforms/83xx/km83xx.c +++ b/arch/powerpc/platforms/83xx/km83xx.c @@ -64,7 +64,7 @@ static void quirk_mpc8360e_qe_enet10(void) return; } - base = ioremap(res.start, res.end - res.start + 1); + base = ioremap(res.start, resource_size(&res)); /* * set output delay adjustments to default values according From 552aa086944a9aeabd599892007c2c7faedb894e Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 1 Jan 2020 18:49:50 +0100 Subject: [PATCH 018/131] powerpc/powernv: use resource_size Use resource_size rather than a verbose computation on the end and start fields. The semantic patch that makes these changes is as follows: (http://coccinelle.lip6.fr/) @@ struct resource ptr; @@ - (ptr.end - ptr.start + 1) + resource_size(&ptr) Signed-off-by: Julia Lawall Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1577900990-8588-11-git-send-email-Julia.Lawall@inria.fr --- arch/powerpc/platforms/powernv/pci-ioda.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 4374836b033b..62f17c015f9f 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -792,7 +792,7 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; parent = pe->pbus->self; if (pe->flags & PNV_IODA_PE_BUS_ALL) - count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; + count = resource_size(&pe->pbus->busn_res); else count = 1; @@ -874,7 +874,7 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; parent = pe->pbus->self; if (pe->flags & PNV_IODA_PE_BUS_ALL) - count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; + count = resource_size(&pe->pbus->busn_res); else count = 1; From 6ad4afc97bc6c5cca9786030492ddfab871ce79e Mon Sep 17 00:00:00 2001 From: Bai Yingjie Date: Mon, 6 Jan 2020 12:29:53 +0800 Subject: [PATCH 019/131] powerpc32/booke: consistently return phys_addr_t in __pa() When CONFIG_RELOCATABLE=y is set, VIRT_PHYS_OFFSET is a 64bit variable, thus __pa() returns as 64bit value. But when CONFIG_RELOCATABLE=n, __pa() returns 32bit value. When CONFIG_PHYS_64BIT is set, __pa() should consistently return as 64bit value irrelevant to CONFIG_RELOCATABLE. So we'd make __pa() consistently return phys_addr_t, which is 64bit when CONFIG_PHYS_64BIT is set. Signed-off-by: Bai Yingjie Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200106042957.26494-1-yingjie_bai@126.com --- arch/powerpc/include/asm/page.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 7f1fd41e3065..86332080399a 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -209,7 +209,7 @@ static inline bool pfn_valid(unsigned long pfn) */ #if defined(CONFIG_PPC32) && defined(CONFIG_BOOKE) #define __va(x) ((void *)(unsigned long)((phys_addr_t)(x) + VIRT_PHYS_OFFSET)) -#define __pa(x) ((unsigned long)(x) - VIRT_PHYS_OFFSET) +#define __pa(x) ((phys_addr_t)(unsigned long)(x) - VIRT_PHYS_OFFSET) #else #ifdef CONFIG_PPC64 /* From eeb09917c138ccd6d9a1c8410891ca2fa5feb5ea Mon Sep 17 00:00:00 2001 From: Bai Yingjie Date: Mon, 6 Jan 2020 12:29:54 +0800 Subject: [PATCH 020/131] powerpc/mpc85xx: also write addr_h to spin table for 64bit boot entry CPU like P4080 has 36bit physical address, its DDR physical start address can be configured above 4G by LAW registers. For such systems in which their physical memory start address was configured higher than 4G, we need also to write addr_h into the spin table of the target secondary CPU, so that addr_h and addr_l together represent a 64bit physical address. Otherwise the secondary core can not get correct entry to start from. Signed-off-by: Bai Yingjie Acked-by: Scott Wood Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200106042957.26494-2-yingjie_bai@126.com --- arch/powerpc/platforms/85xx/smp.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c index 8c7ea2486bc0..48f7d96ae37d 100644 --- a/arch/powerpc/platforms/85xx/smp.c +++ b/arch/powerpc/platforms/85xx/smp.c @@ -252,6 +252,15 @@ static int smp_85xx_start_cpu(int cpu) out_be64((u64 *)(&spin_table->addr_h), __pa(ppc_function_entry(generic_secondary_smp_init))); #else +#ifdef CONFIG_PHYS_ADDR_T_64BIT + /* + * We need also to write addr_h to spin table for systems + * in which their physical memory start address was configured + * to above 4G, otherwise the secondary core can not get + * correct entry to start from. + */ + out_be32(&spin_table->addr_h, __pa(__early_start) >> 32); +#endif out_be32(&spin_table->addr_l, __pa(__early_start)); #endif flush_spin_table(spin_table); From c2a20711fc181e7f22ee5c16c28cb9578af84729 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Mon, 6 Jan 2020 13:50:02 -0600 Subject: [PATCH 021/131] powerpc/xmon: don't access ASDR in VMs ASDR is HV-privileged and must only be accessed in HV-mode. Fixes a Program Check (0x700) when xmon in a VM dumps SPRs. Fixes: d1e1b351f50f ("powerpc/xmon: Add ISA v3.0 SPRs to SPR dump") Cc: stable@vger.kernel.org # v4.14+ Signed-off-by: Sukadev Bhattiprolu Reviewed-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200107021633.GB29843@us.ibm.com --- arch/powerpc/xmon/xmon.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index a7056049709e..03d23075ac43 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1949,15 +1949,14 @@ static void dump_300_sprs(void) printf("pidr = %.16lx tidr = %.16lx\n", mfspr(SPRN_PID), mfspr(SPRN_TIDR)); - printf("asdr = %.16lx psscr = %.16lx\n", - mfspr(SPRN_ASDR), hv ? mfspr(SPRN_PSSCR) - : mfspr(SPRN_PSSCR_PR)); + printf("psscr = %.16lx\n", + hv ? mfspr(SPRN_PSSCR) : mfspr(SPRN_PSSCR_PR)); if (!hv) return; - printf("ptcr = %.16lx\n", - mfspr(SPRN_PTCR)); + printf("ptcr = %.16lx asdr = %.16lx\n", + mfspr(SPRN_PTCR), mfspr(SPRN_ASDR)); #endif } From fbee6ba2dca30d302efe6bddb3a886f5e964a257 Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Fri, 10 Jan 2020 12:54:02 +0800 Subject: [PATCH 022/131] powerpc/pseries: Advance pfn if section is not present in lmb_is_removable() In lmb_is_removable(), if a section is not present, it should continue to test the rest of the sections in the block. But the current code fails to do so. Fixes: 51925fb3c5c9 ("powerpc/pseries: Implement memory hotplug remove in the kernel") Cc: stable@vger.kernel.org # v4.1+ Signed-off-by: Pingfan Liu Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1578632042-12415-1-git-send-email-kernelfans@gmail.com --- arch/powerpc/platforms/pseries/hotplug-memory.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index c126b94d1943..a4d40a3ceea3 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -360,8 +360,10 @@ static bool lmb_is_removable(struct drmem_lmb *lmb) for (i = 0; i < scns_per_block; i++) { pfn = PFN_DOWN(phys_addr); - if (!pfn_present(pfn)) + if (!pfn_present(pfn)) { + phys_addr += MIN_MEMORY_BLOCK_SIZE; continue; + } rc = rc && is_mem_section_removable(pfn, PAGES_PER_SECTION); phys_addr += MIN_MEMORY_BLOCK_SIZE; From 970d54f99ceac5bbf27929cb5ebfe18338ba1543 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Tue, 24 Dec 2019 17:41:25 +1100 Subject: [PATCH 023/131] powerpc/book3s64/hash: Disable 16M linear mapping size if not aligned With STRICT_KERNEL_RWX on in a relocatable kernel under the hash MMU, if the position the kernel is loaded at is not 16M aligned things go horribly wrong. Specifically hash__mark_initmem_nx() will call hash__change_memory_range() which then aligns down the start address, and due to the text not being 16M aligned causes some of the kernel text to be marked non-executable. We can avoid this when selecting the linear mapping size, so do so and print a warning. I tested this for various alignments and as long as the position is 64K aligned it's fine (the base requirement for powerpc). Signed-off-by: Russell Currey [mpe: Add details of the failure mode] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191224064126.183670-1-ruscur@russell.cc --- arch/powerpc/mm/book3s64/hash_utils.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index b30435c7d804..523d4d39d11e 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -652,6 +652,7 @@ static void init_hpte_page_sizes(void) static void __init htab_init_page_sizes(void) { + bool aligned = true; init_hpte_page_sizes(); if (!debug_pagealloc_enabled()) { @@ -659,7 +660,15 @@ static void __init htab_init_page_sizes(void) * Pick a size for the linear mapping. Currently, we only * support 16M, 1M and 4K which is the default */ - if (mmu_psize_defs[MMU_PAGE_16M].shift) + if (IS_ENABLED(STRICT_KERNEL_RWX) && + (unsigned long)_stext % 0x1000000) { + if (mmu_psize_defs[MMU_PAGE_16M].shift) + pr_warn("Kernel not 16M aligned, " + "disabling 16M linear map alignment"); + aligned = false; + } + + if (mmu_psize_defs[MMU_PAGE_16M].shift && aligned) mmu_linear_psize = MMU_PAGE_16M; else if (mmu_psize_defs[MMU_PAGE_1M].shift) mmu_linear_psize = MMU_PAGE_1M; From c55d7b5e64265fdca45c85b639013e770bde2d0e Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Tue, 24 Dec 2019 17:41:26 +1100 Subject: [PATCH 024/131] powerpc: Remove STRICT_KERNEL_RWX incompatibility with RELOCATABLE I have tested this with the Radix MMU and everything seems to work, and the previous patch for Hash seems to fix everything too. STRICT_KERNEL_RWX should still be disabled by default for now. Please test STRICT_KERNEL_RWX + RELOCATABLE! Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191224064126.183670-2-ruscur@russell.cc --- arch/powerpc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1ec34e16ed65..6093c48976bf 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -133,7 +133,7 @@ config PPC select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64 - select ARCH_HAS_STRICT_KERNEL_RWX if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION) + select ARCH_HAS_STRICT_KERNEL_RWX if ((PPC_BOOK3S_64 || PPC32) && !HIBERNATION) select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE select ARCH_HAS_UACCESS_MCSAFE if PPC64 From 30e813cf46ccaeea6508607632e49b4a1d743d2a Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 29 Dec 2019 16:42:55 +0100 Subject: [PATCH 025/131] misc: cxl: use mmgrab Mmgrab was introduced in commit f1f1007644ff ("mm: add new mmgrab() helper") and most of the kernel was updated to use it. Update a remaining file. The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) @@ expression e; @@ - atomic_inc(&e->mm_count); + mmgrab(e); Signed-off-by: Julia Lawall Acked-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1577634178-22530-2-git-send-email-Julia.Lawall@inria.fr --- drivers/misc/cxl/context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index aed9c445d1e2..fb2eff69e449 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -352,7 +352,7 @@ void cxl_context_free(struct cxl_context *ctx) void cxl_context_mm_count_get(struct cxl_context *ctx) { if (ctx->mm) - atomic_inc(&ctx->mm->mm_count); + mmgrab(ctx->mm); } void cxl_context_mm_count_put(struct cxl_context *ctx) From ed0bc98f8cbe4f8254759d333a47aedc816ff8c5 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 Jul 2019 12:24:03 +1000 Subject: [PATCH 026/131] powerpc/64s: Reimplement power4_idle code in C This implements the tricky tracing and soft irq handling bits in C, leaving the low level bit to asm. A functional difference is that this redirects the interrupt exit to a return stub to execute blr, rather than the lr address itself. This is probably barely measurable on real hardware, but it keeps the link stack balanced. Tested with QEMU. Signed-off-by: Nicholas Piggin [mpe: Move power4_fixup_nap back into exceptions-64s.S] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190711022404.18132-1-npiggin@gmail.com --- arch/powerpc/include/asm/processor.h | 3 + arch/powerpc/kernel/Makefile | 3 +- arch/powerpc/kernel/exceptions-64s.S | 13 ++++- arch/powerpc/kernel/idle.c | 25 +++++++++ arch/powerpc/kernel/idle_book3s.S | 20 +++++++ arch/powerpc/kernel/idle_power4.S | 83 ---------------------------- arch/powerpc/platforms/Kconfig | 4 ++ 7 files changed, 64 insertions(+), 87 deletions(-) delete mode 100644 arch/powerpc/kernel/idle_power4.S diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index a9993e7a443b..e114eb36721c 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -412,6 +412,9 @@ static inline unsigned long get_clean_sp(unsigned long sp, int is_32) extern unsigned long isa300_idle_stop_noloss(unsigned long psscr_val); extern unsigned long isa300_idle_stop_mayloss(unsigned long psscr_val); extern unsigned long isa206_idle_insn_mayloss(unsigned long type); +#ifdef CONFIG_PPC_970_NAP +extern void power4_idle_nap(void); +#endif extern unsigned long cpuidle_disable; enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 157b0147921f..78a1b22d4fd8 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -62,8 +62,7 @@ obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_book3e.o obj-$(CONFIG_PPC_BARRIER_NOSPEC) += security.o obj-$(CONFIG_PPC64) += vdso64/ obj-$(CONFIG_ALTIVEC) += vecemu.o -obj-$(CONFIG_PPC_970_NAP) += idle_power4.o -obj-$(CONFIG_PPC_P7_NAP) += idle_book3s.o +obj-$(CONFIG_PPC_BOOK3S_IDLE) += idle_book3s.o procfs-y := proc_powerpc.o obj-$(CONFIG_PROC_FS) += $(procfs-y) rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI) := rtas_pci.o diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 46508b148e16..b049c0bfa883 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -2208,11 +2208,20 @@ __end_interrupts: DEFINE_FIXED_SYMBOL(__end_interrupts) #ifdef CONFIG_PPC_970_NAP + /* + * Called by exception entry code if _TLF_NAPPING was set, this clears + * the NAPPING flag, and redirects the exception exit to + * power4_fixup_nap_return. + */ + .globl power4_fixup_nap EXC_COMMON_BEGIN(power4_fixup_nap) andc r9,r9,r10 std r9,TI_LOCAL_FLAGS(r11) - ld r10,_LINK(r1) /* make idle task do the */ - std r10,_NIP(r1) /* equivalent of a blr */ + LOAD_REG_ADDR(r10, power4_idle_nap_return) + std r10,_NIP(r1) + blr + +power4_idle_nap_return: blr #endif diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index a36fd053c3db..422e31d2f5a2 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -77,6 +77,31 @@ void arch_cpu_idle(void) int powersave_nap; +#ifdef CONFIG_PPC_970_NAP +void power4_idle(void) +{ + if (!cpu_has_feature(CPU_FTR_CAN_NAP)) + return; + + if (!powersave_nap) + return; + + if (!prep_irq_for_idle()) + return; + + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + asm volatile("DSSALL ; sync" ::: "memory"); + + power4_idle_nap(); + + /* + * power4_idle_nap returns with interrupts enabled (soft and hard). + * to our caller with interrupts enabled (soft and hard). Our caller + * can cope with either interrupts disabled or enabled upon return. + */ +} +#endif + #ifdef CONFIG_SYSCTL /* * Register the sysctl to set/clear powersave_nap. diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index d32751994a62..22f249b6f58d 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -15,7 +15,9 @@ #include #include #include +#include /* TLF_NAPPING */ +#ifdef CONFIG_PPC_P7_NAP /* * Desired PSSCR in r3 * @@ -181,4 +183,22 @@ _GLOBAL(isa206_idle_insn_mayloss) bne 2f IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP) 2: IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE) +#endif +#ifdef CONFIG_PPC_970_NAP +_GLOBAL(power4_idle_nap) + LOAD_REG_IMMEDIATE(r7, MSR_KERNEL|MSR_EE|MSR_POW) + ld r9,PACA_THREAD_INFO(r13) + ld r8,TI_LOCAL_FLAGS(r9) + ori r8,r8,_TLF_NAPPING + std r8,TI_LOCAL_FLAGS(r9) + /* + * NAPPING bit is set, from this point onward power4_fixup_nap + * will cause exceptions to return to power4_idle_nap_return. + */ +1: sync + isync + mtmsrd r7 + isync + b 1b +#endif diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S deleted file mode 100644 index 33c625329078..000000000000 --- a/arch/powerpc/kernel/idle_power4.S +++ /dev/null @@ -1,83 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * This file contains the power_save function for 970-family CPUs. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#undef DEBUG - - .text - -_GLOBAL(power4_idle) -BEGIN_FTR_SECTION - blr -END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP) - /* Now check if user or arch enabled NAP mode */ - LOAD_REG_ADDRBASE(r3,powersave_nap) - lwz r4,ADDROFF(powersave_nap)(r3) - cmpwi 0,r4,0 - beqlr - - /* This sequence is similar to prep_irq_for_idle() */ - - /* Hard disable interrupts */ - mfmsr r7 - rldicl r0,r7,48,1 - rotldi r0,r0,16 - mtmsrd r0,1 - - /* Check if something happened while soft-disabled */ - lbz r0,PACAIRQHAPPENED(r13) - cmpwi cr0,r0,0 - bne- 2f - - /* - * Soft-enable interrupts. This will make power4_fixup_nap return - * to our caller with interrupts enabled (soft and hard). The caller - * can cope with either interrupts disabled or enabled upon return. - */ -#ifdef CONFIG_TRACE_IRQFLAGS - /* Tell the tracer interrupts are on, because idle responds to them. */ - mflr r0 - std r0,16(r1) - stdu r1,-128(r1) - bl trace_hardirqs_on - addi r1,r1,128 - ld r0,16(r1) - mtlr r0 - mfmsr r7 -#endif /* CONFIG_TRACE_IRQFLAGS */ - - li r0,IRQS_ENABLED - stb r0,PACAIRQSOFTMASK(r13) /* we'll hard-enable shortly */ -BEGIN_FTR_SECTION - DSSALL - sync -END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) - ld r9, PACA_THREAD_INFO(r13) - ld r8,TI_LOCAL_FLAGS(r9) /* set napping bit */ - ori r8,r8,_TLF_NAPPING /* so when we take an exception */ - std r8,TI_LOCAL_FLAGS(r9) /* it will return to our caller */ - ori r7,r7,MSR_EE - oris r7,r7,MSR_POW@h -1: sync - isync - mtmsrd r7 - isync - b 1b - -2: /* Return if an interrupt had happened while soft disabled */ - /* Set the HARD_DIS flag because interrupts are now hard disabled */ - ori r0,r0,PACA_IRQ_HARD_DIS - stb r0,PACAIRQHAPPENED(r13) - blr diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index e28df298df56..1f8025383caa 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -177,6 +177,10 @@ config PPC_970_NAP config PPC_P7_NAP bool +config PPC_BOOK3S_IDLE + def_bool y + depends on (PPC_970_NAP || PPC_P7_NAP) + config PPC_INDIRECT_PIO bool select GENERIC_IOMAP From 1e1c8b2cc37afb333c1829e8e0360321813bf220 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 14 Jan 2020 07:14:40 +0000 Subject: [PATCH 027/131] powerpc/ptdump: don't entirely rebuild kernel when selecting CONFIG_PPC_DEBUG_WX Selecting CONFIG_PPC_DEBUG_WX only impacts ptdump and pgtable_32/64 init calls. Declaring related functions in asm/pgtable.h implies rebuilding almost everything. Move ptdump_check_wx() declaration in mm/mmu_decl.h Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/bf34fd9dca61eadf9a134a9f89ebbc162cfd5f86.1578986011.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/pgtable.h | 6 ------ arch/powerpc/mm/mmu_decl.h | 6 ++++++ arch/powerpc/mm/ptdump/ptdump.c | 2 ++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 0e4ec8cc37b7..8cc543ed114c 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -94,12 +94,6 @@ void mark_initmem_nx(void); static inline void mark_initmem_nx(void) { } #endif -#ifdef CONFIG_PPC_DEBUG_WX -void ptdump_check_wx(void); -#else -static inline void ptdump_check_wx(void) { } -#endif - /* * When used, PTE_FRAG_NR is defined in subarch pgtable.h * so we are sure it is included when arriving here. diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 8e99649c24fc..7097e07a209a 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -181,3 +181,9 @@ void mmu_mark_rodata_ro(void); static inline void mmu_mark_initmem_nx(void) { } static inline void mmu_mark_rodata_ro(void) { } #endif + +#ifdef CONFIG_PPC_DEBUG_WX +void ptdump_check_wx(void); +#else +static inline void ptdump_check_wx(void) { } +#endif diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 2f9ddc29c535..4af0d5d9589e 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -24,6 +24,8 @@ #include #include +#include + #include "ptdump.h" /* From e26ad936dd89d79f66c2b567f700e0c2a7103070 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 14 Jan 2020 08:13:08 +0000 Subject: [PATCH 028/131] powerpc/ptdump: Fix W+X verification call in mark_rodata_ro() ptdump_check_wx() also have to be called when pages are mapped by blocks. Fixes: 453d87f6a8ae ("powerpc/mm: Warn if W+X pages found on boot") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/37517da8310f4457f28921a4edb88fb21d27b62a.1578989531.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/pgtable_32.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 73b84166d06a..5fb90edd865e 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -218,6 +218,7 @@ void mark_rodata_ro(void) if (v_block_mapped((unsigned long)_sinittext)) { mmu_mark_rodata_ro(); + ptdump_check_wx(); return; } From d80ae83f1f932ab7af47b54d0d3bef4f4dba489f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 14 Jan 2020 08:13:09 +0000 Subject: [PATCH 029/131] powerpc/ptdump: Fix W+X verification Verification cannot rely on simple bit checking because on some platforms PAGE_RW is 0, checking that a page is not W means checking that PAGE_RO is set instead of checking that PAGE_RW is not set. Use pte helpers instead of checking bits. Fixes: 453d87f6a8ae ("powerpc/mm: Warn if W+X pages found on boot") Cc: stable@vger.kernel.org # v5.2+ Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/0d894839fdbb19070f0e1e4140363be4f2bb62fc.1578989540.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/ptdump/ptdump.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 4af0d5d9589e..206156255247 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -175,10 +175,12 @@ static void dump_addr(struct pg_state *st, unsigned long addr) static void note_prot_wx(struct pg_state *st, unsigned long addr) { + pte_t pte = __pte(st->current_flags); + if (!IS_ENABLED(CONFIG_PPC_DEBUG_WX) || !st->check_wx) return; - if (!((st->current_flags & pgprot_val(PAGE_KERNEL_X)) == pgprot_val(PAGE_KERNEL_X))) + if (!pte_write(pte) || !pte_exec(pte)) return; WARN_ONCE(1, "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n", From f509247b08f2dcf7754d9ed85ad69a7972aa132b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 14 Jan 2020 08:13:10 +0000 Subject: [PATCH 030/131] powerpc/ptdump: Only enable PPC_CHECK_WX with STRICT_KERNEL_RWX ptdump_check_wx() is called from mark_rodata_ro() which only exists when CONFIG_STRICT_KERNEL_RWX is selected. Fixes: 453d87f6a8ae ("powerpc/mm: Warn if W+X pages found on boot") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/922d4939c735c6b52b4137838bcc066fffd4fc33.1578989545.git.christophe.leroy@c-s.fr --- arch/powerpc/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 4e1d39847462..0b063830eea8 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -371,7 +371,7 @@ config PPC_PTDUMP config PPC_DEBUG_WX bool "Warn on W+X mappings at boot" - depends on PPC_PTDUMP + depends on PPC_PTDUMP && STRICT_KERNEL_RWX help Generate a warning if any W+X mappings are found at boot. From 991d656d722dbc783481f408d6e4cbcce2e8bb78 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 26 Nov 2019 13:16:50 +0000 Subject: [PATCH 031/131] powerpc/8xx: Fix permanently mapped IMMR region. When not using large TLBs, the IMMR region is still mapped as a whole block in the FIXMAP area. Properly report that the IMMR region is block-mapped even when not using large TLBs. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/45f4f414bcd7198b0755cf4287ff216fbfc24b9d.1574774187.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/nohash/8xx.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c index 96eb8e43f39b..3189308dece4 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -21,33 +21,34 @@ extern int __map_without_ltlbs; static unsigned long block_mapped_ram; /* - * Return PA for this VA if it is in an area mapped with LTLBs. + * Return PA for this VA if it is in an area mapped with LTLBs or fixmap. * Otherwise, returns 0 */ phys_addr_t v_block_mapped(unsigned long va) { unsigned long p = PHYS_IMMR_BASE; - if (__map_without_ltlbs) - return 0; if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE) return p + va - VIRT_IMMR_BASE; + if (__map_without_ltlbs) + return 0; if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram) return __pa(va); return 0; } /* - * Return VA for a given PA mapped with LTLBs or 0 if not mapped + * Return VA for a given PA mapped with LTLBs or fixmap + * Return 0 if not mapped */ unsigned long p_block_mapped(phys_addr_t pa) { unsigned long p = PHYS_IMMR_BASE; - if (__map_without_ltlbs) - return 0; if (pa >= p && pa < p + IMMR_SIZE) return VIRT_IMMR_BASE + pa - p; + if (__map_without_ltlbs) + return 0; if (pa < block_mapped_ram) return (unsigned long)__va(pa); return 0; From 39413ae009674c6ba745850515b551bbb9d6374b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 26 Nov 2019 17:43:29 +0000 Subject: [PATCH 032/131] powerpc/hw_breakpoints: Rewrite 8xx breakpoints to allow any address range size. Unlike standard powerpc, Powerpc 8xx doesn't have SPRN_DABR, but it has a breakpoint support based on a set of comparators which allow more flexibility. Commit 4ad8622dc548 ("powerpc/8xx: Implement hw_breakpoint") implemented breakpoints by emulating the DABR behaviour. It did this by setting one comparator the match 4 bytes at breakpoint address and the other comparator to match 4 bytes at breakpoint address + 4. Rewrite 8xx hw_breakpoint to make breakpoints match all addresses defined by the breakpoint address and length by making full use of comparators. Now, comparator E is set to match any address greater than breakpoint address minus one. Comparator F is set to match any address lower than breakpoint address plus breakpoint length. Addresses are aligned to 32 bits. When the breakpoint range starts at address 0, the breakpoint is set to match comparator F only. When the breakpoint range end at address 0xffffffff, the breakpoint is set to match comparator E only. Otherwise the breakpoint is set to match comparator E and F. At the same time, use registers bit names instead of hardcoded values. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/05105deeaf63bc02151aea2cdeaf525534e0e9d4.1574790198.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/hw_breakpoint.h | 4 ++ arch/powerpc/include/asm/reg_8xx.h | 14 ++++++ arch/powerpc/kernel/hw_breakpoint.c | 15 ++++--- arch/powerpc/kernel/process.c | 57 +++++++++++++++--------- 4 files changed, 61 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h index 27ac6f5d2891..f2f8d8aa8e3b 100644 --- a/arch/powerpc/include/asm/hw_breakpoint.h +++ b/arch/powerpc/include/asm/hw_breakpoint.h @@ -34,7 +34,11 @@ struct arch_hw_breakpoint { #define HW_BRK_TYPE_PRIV_ALL (HW_BRK_TYPE_USER | HW_BRK_TYPE_KERNEL | \ HW_BRK_TYPE_HYP) +#ifdef CONFIG_PPC_8xx +#define HW_BREAKPOINT_ALIGN 0x3 +#else #define HW_BREAKPOINT_ALIGN 0x7 +#endif #define DABR_MAX_LEN 8 #define DAWR_MAX_LEN 512 diff --git a/arch/powerpc/include/asm/reg_8xx.h b/arch/powerpc/include/asm/reg_8xx.h index 07df35ee8cbc..299ee7be0f67 100644 --- a/arch/powerpc/include/asm/reg_8xx.h +++ b/arch/powerpc/include/asm/reg_8xx.h @@ -35,7 +35,21 @@ #define SPRN_CMPE 152 #define SPRN_CMPF 153 #define SPRN_LCTRL1 156 +#define LCTRL1_CTE_GT 0xc0000000 +#define LCTRL1_CTF_LT 0x14000000 +#define LCTRL1_CRWE_RW 0x00000000 +#define LCTRL1_CRWE_RO 0x00040000 +#define LCTRL1_CRWE_WO 0x000c0000 +#define LCTRL1_CRWF_RW 0x00000000 +#define LCTRL1_CRWF_RO 0x00010000 +#define LCTRL1_CRWF_WO 0x00030000 #define SPRN_LCTRL2 157 +#define LCTRL2_LW0EN 0x80000000 +#define LCTRL2_LW0LA_E 0x00000000 +#define LCTRL2_LW0LA_F 0x04000000 +#define LCTRL2_LW0LA_EandF 0x08000000 +#define LCTRL2_LW0LADC 0x02000000 +#define LCTRL2_SLW0EN 0x00000002 #ifdef CONFIG_PPC_8xx #define SPRN_ICTRL 158 #endif diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 58ce3d37c2a3..2462cd7c565c 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -160,6 +160,9 @@ static int hw_breakpoint_validate_len(struct arch_hw_breakpoint *hw) /* DAWR region can't cross 512 bytes boundary */ if ((start_addr >> 9) != (end_addr >> 9)) return -EINVAL; + } else if (IS_ENABLED(CONFIG_PPC_8xx)) { + /* 8xx can setup a range without limitation */ + max_len = U16_MAX; } if (hw_len > max_len) @@ -328,13 +331,11 @@ int hw_breakpoint_handler(struct die_args *args) } info->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ; - if (IS_ENABLED(CONFIG_PPC_8xx)) { - if (!dar_within_range(regs->dar, info)) - info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; - } else { - if (!stepping_handler(regs, bp, info)) - goto out; - } + if (!dar_within_range(regs->dar, info)) + info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; + + if (!IS_ENABLED(CONFIG_PPC_8xx) && !stepping_handler(regs, bp, info)) + goto out; /* * As a policy, the callback is invoked in a 'trigger-after-execute' diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 4df94b6e2f32..7fcf72e58826 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -740,28 +740,6 @@ static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) mtspr(SPRN_DABRX, dabrx); return 0; } -#elif defined(CONFIG_PPC_8xx) -static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) -{ - unsigned long addr = dabr & ~HW_BRK_TYPE_DABR; - unsigned long lctrl1 = 0x90000000; /* compare type: equal on E & F */ - unsigned long lctrl2 = 0x8e000002; /* watchpoint 1 on cmp E | F */ - - if ((dabr & HW_BRK_TYPE_RDWR) == HW_BRK_TYPE_READ) - lctrl1 |= 0xa0000; - else if ((dabr & HW_BRK_TYPE_RDWR) == HW_BRK_TYPE_WRITE) - lctrl1 |= 0xf0000; - else if ((dabr & HW_BRK_TYPE_RDWR) == 0) - lctrl2 = 0; - - mtspr(SPRN_LCTRL2, 0); - mtspr(SPRN_CMPE, addr); - mtspr(SPRN_CMPF, addr + 4); - mtspr(SPRN_LCTRL1, lctrl1); - mtspr(SPRN_LCTRL2, lctrl2); - - return 0; -} #else static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) { @@ -782,6 +760,39 @@ static inline int set_dabr(struct arch_hw_breakpoint *brk) return __set_dabr(dabr, dabrx); } +static inline int set_breakpoint_8xx(struct arch_hw_breakpoint *brk) +{ + unsigned long lctrl1 = LCTRL1_CTE_GT | LCTRL1_CTF_LT | LCTRL1_CRWE_RW | + LCTRL1_CRWF_RW; + unsigned long lctrl2 = LCTRL2_LW0EN | LCTRL2_LW0LADC | LCTRL2_SLW0EN; + unsigned long start_addr = brk->address & ~HW_BREAKPOINT_ALIGN; + unsigned long end_addr = (brk->address + brk->len - 1) | HW_BREAKPOINT_ALIGN; + + if (start_addr == 0) + lctrl2 |= LCTRL2_LW0LA_F; + else if (end_addr == ~0U) + lctrl2 |= LCTRL2_LW0LA_E; + else + lctrl2 |= LCTRL2_LW0LA_EandF; + + mtspr(SPRN_LCTRL2, 0); + + if ((brk->type & HW_BRK_TYPE_RDWR) == 0) + return 0; + + if ((brk->type & HW_BRK_TYPE_RDWR) == HW_BRK_TYPE_READ) + lctrl1 |= LCTRL1_CRWE_RO | LCTRL1_CRWF_RO; + if ((brk->type & HW_BRK_TYPE_RDWR) == HW_BRK_TYPE_WRITE) + lctrl1 |= LCTRL1_CRWE_WO | LCTRL1_CRWF_WO; + + mtspr(SPRN_CMPE, start_addr - 1); + mtspr(SPRN_CMPF, end_addr + 1); + mtspr(SPRN_LCTRL1, lctrl1); + mtspr(SPRN_LCTRL2, lctrl2); + + return 0; +} + void __set_breakpoint(struct arch_hw_breakpoint *brk) { memcpy(this_cpu_ptr(¤t_brk), brk, sizeof(*brk)); @@ -789,6 +800,8 @@ void __set_breakpoint(struct arch_hw_breakpoint *brk) if (dawr_enabled()) // Power8 or later set_dawr(brk); + else if (IS_ENABLED(CONFIG_PPC_8xx)) + set_breakpoint_8xx(brk); else if (!cpu_has_feature(CPU_FTR_ARCH_207S)) // Power7 or earlier set_dabr(brk); From a426ea9bb3889b3ce3bec52f2467c1becdcc330a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 26 Nov 2019 17:43:30 +0000 Subject: [PATCH 033/131] selftests/powerpc: Enable range tests on 8xx in ptrace-hwbreak.c selftest 8xx is now able to support any range length so range tests can be enabled. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/081e3b4e3a17a8ec9fdac46b505e3a29ca15f209.1574790198.git.christophe.leroy@c-s.fr --- tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c index 7deedbc16b0b..fc477dfe86a2 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c @@ -455,9 +455,8 @@ run_tests(pid_t child_pid, struct ppc_debug_info *dbginfo, bool dawr) if (dbginfo->features & PPC_DEBUG_FEATURE_DATA_BP_RANGE) { test_sethwdebug_exact(child_pid); - if (!is_8xx) - test_sethwdebug_range_aligned(child_pid); - if (dawr && !is_8xx) { + test_sethwdebug_range_aligned(child_pid); + if (dawr || is_8xx) { test_sethwdebug_range_unaligned(child_pid); test_sethwdebug_range_unaligned_dar(child_pid); test_sethwdebug_dawr_max_range(child_pid); From 8c452a889821ca0cd2a5f2e3e87fbc01e56408cb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 28 Nov 2019 12:16:35 +0000 Subject: [PATCH 034/131] powerpc/devicetrees: Change 'gpios' to 'cs-gpios' on fsl, spi nodes Since commit 0f0581b24bd0 ("spi: fsl: Convert to use CS GPIO descriptors"), the prefered way to define chipselect GPIOs is using 'cs-gpios' property instead of the legacy 'gpios' property. Signed-off-by: Christophe Leroy Reviewed-by: Linus Walleij Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/7556683b57d8ce100855857f03d1cd3d2903d045.1574943062.git.christophe.leroy@c-s.fr --- Documentation/devicetree/bindings/spi/fsl-spi.txt | 8 ++++---- arch/powerpc/boot/dts/mgcoge.dts | 2 +- arch/powerpc/boot/dts/mpc832x_rdb.dts | 2 +- arch/powerpc/boot/dts/mpc8610_hpcd.dts | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/devicetree/bindings/spi/fsl-spi.txt b/Documentation/devicetree/bindings/spi/fsl-spi.txt index 411375eac54d..0654380eb751 100644 --- a/Documentation/devicetree/bindings/spi/fsl-spi.txt +++ b/Documentation/devicetree/bindings/spi/fsl-spi.txt @@ -15,13 +15,13 @@ Required properties: - clock-frequency : input clock frequency to non FSL_SOC cores Optional properties: -- gpios : specifies the gpio pins to be used for chipselects. +- cs-gpios : specifies the gpio pins to be used for chipselects. The gpios will be referred to as reg = in the SPI child nodes. If unspecified, a single SPI device without a chip select can be used. - fsl,spisel_boot : for the MPC8306 and MPC8309, specifies that the SPISEL_BOOT signal is used as chip select for a slave device. Use reg = in the corresponding child node, i.e. 0 if - the gpios property is not present. + the cs-gpios property is not present. Example: spi@4c0 { @@ -31,8 +31,8 @@ Example: interrupts = <82 0>; interrupt-parent = <700>; mode = "cpu"; - gpios = <&gpio 18 1 // device reg=<0> - &gpio 19 1>; // device reg=<1> + cs-gpios = <&gpio 18 1 // device reg=<0> + &gpio 19 1>; // device reg=<1> }; diff --git a/arch/powerpc/boot/dts/mgcoge.dts b/arch/powerpc/boot/dts/mgcoge.dts index a2dd5f1da621..7de068991bde 100644 --- a/arch/powerpc/boot/dts/mgcoge.dts +++ b/arch/powerpc/boot/dts/mgcoge.dts @@ -224,7 +224,7 @@ reg = <0x11a80 0x40 0x89fc 0x2>; interrupts = <2 8>; interrupt-parent = <&PIC>; - gpios = < &cpm2_pio_d 19 0>; + cs-gpios = < &cpm2_pio_d 19 0>; #address-cells = <1>; #size-cells = <0>; ds3106@1 { diff --git a/arch/powerpc/boot/dts/mpc832x_rdb.dts b/arch/powerpc/boot/dts/mpc832x_rdb.dts index b6257186528e..ecebc27a2898 100644 --- a/arch/powerpc/boot/dts/mpc832x_rdb.dts +++ b/arch/powerpc/boot/dts/mpc832x_rdb.dts @@ -249,7 +249,7 @@ reg = <0x4c0 0x40>; interrupts = <2>; interrupt-parent = <&qeic>; - gpios = <&qe_pio_d 13 0>; + cs-gpios = <&qe_pio_d 13 0>; mode = "cpu-qe"; mmc-slot@0 { diff --git a/arch/powerpc/boot/dts/mpc8610_hpcd.dts b/arch/powerpc/boot/dts/mpc8610_hpcd.dts index 1a8321ac105a..33bbe58c1ad0 100644 --- a/arch/powerpc/boot/dts/mpc8610_hpcd.dts +++ b/arch/powerpc/boot/dts/mpc8610_hpcd.dts @@ -200,7 +200,7 @@ interrupts = <59 2>; interrupt-parent = <&mpic>; mode = "cpu"; - gpios = <&sdcsr_pio 7 0>; + cs-gpios = <&sdcsr_pio 7 0>; sleep = <&pmc 0x00000800 0>; mmc-slot@0 { From 902137ba8e469ed07c7f120a390161937a6288fb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 2 Dec 2019 07:57:27 +0000 Subject: [PATCH 035/131] powerpc/32: Add VDSO version of getcpu on non SMP Commit 18ad51dd342a ("powerpc: Add VDSO version of getcpu") added getcpu() for PPC64 only, by making use of a user readable general purpose SPR. PPC32 doesn't have any such SPR. For non SMP, just return CPU id 0 from the VDSO directly. PPC32 doesn't support CONFIG_NUMA so NUMA node is always 0. Before the patch, vdsotest reported: getcpu: syscall: 1572 nsec/call getcpu: libc: 1787 nsec/call getcpu: vdso: not tested Now, vdsotest reports: getcpu: syscall: 1582 nsec/call getcpu: libc: 502 nsec/call getcpu: vdso: 187 nsec/call Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/eaac4b6494ecff1811220fccc895bf282aab884a.1575273217.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/vdso32/Makefile | 4 +--- arch/powerpc/kernel/vdso32/getcpu.S | 17 +++++++++++++++++ arch/powerpc/kernel/vdso32/vdso32.lds.S | 2 +- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile index 06f54d947057..e147bbdc12cd 100644 --- a/arch/powerpc/kernel/vdso32/Makefile +++ b/arch/powerpc/kernel/vdso32/Makefile @@ -2,9 +2,7 @@ # List of files in the vdso, has to be asm only for now -obj-vdso32-$(CONFIG_PPC64) = getcpu.o -obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o \ - $(obj-vdso32-y) +obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o # Build rules diff --git a/arch/powerpc/kernel/vdso32/getcpu.S b/arch/powerpc/kernel/vdso32/getcpu.S index 63e914539e1a..90b39af14383 100644 --- a/arch/powerpc/kernel/vdso32/getcpu.S +++ b/arch/powerpc/kernel/vdso32/getcpu.S @@ -15,6 +15,7 @@ * int __kernel_getcpu(unsigned *cpu, unsigned *node); * */ +#if defined(CONFIG_PPC64) V_FUNCTION_BEGIN(__kernel_getcpu) .cfi_startproc mfspr r5,SPRN_SPRG_VDSO_READ @@ -31,3 +32,19 @@ V_FUNCTION_BEGIN(__kernel_getcpu) blr .cfi_endproc V_FUNCTION_END(__kernel_getcpu) +#elif !defined(CONFIG_SMP) +V_FUNCTION_BEGIN(__kernel_getcpu) + .cfi_startproc + cmpwi cr0, r3, 0 + cmpwi cr1, r4, 0 + li r5, 0 + beq cr0, 1f + stw r5, 0(r3) +1: li r3, 0 /* always success */ + crclr cr0*4+so + beqlr cr1 + stw r5, 0(r4) + blr + .cfi_endproc +V_FUNCTION_END(__kernel_getcpu) +#endif diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S index 00c025ba4a92..5206c2eb2a1d 100644 --- a/arch/powerpc/kernel/vdso32/vdso32.lds.S +++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S @@ -155,7 +155,7 @@ VERSION __kernel_sync_dicache_p5; __kernel_sigtramp32; __kernel_sigtramp_rt32; -#ifdef CONFIG_PPC64 +#if defined(CONFIG_PPC64) || !defined(CONFIG_SMP) __kernel_getcpu; #endif From 654abc69ef2e69712e6d4e8a6cb9292b97a4aa39 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 2 Dec 2019 07:57:28 +0000 Subject: [PATCH 036/131] powerpc/vdso32: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE This is copied and adapted from commit 5c929885f1bb ("powerpc/vdso64: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE") from Santosh Sivaraj Benchmark from vdsotest-all: clock-gettime-realtime: syscall: 3601 nsec/call clock-gettime-realtime: libc: 1072 nsec/call clock-gettime-realtime: vdso: 931 nsec/call clock-gettime-monotonic: syscall: 4034 nsec/call clock-gettime-monotonic: libc: 1213 nsec/call clock-gettime-monotonic: vdso: 1076 nsec/call clock-gettime-realtime-coarse: syscall: 2722 nsec/call clock-gettime-realtime-coarse: libc: 805 nsec/call clock-gettime-realtime-coarse: vdso: 668 nsec/call clock-gettime-monotonic-coarse: syscall: 2949 nsec/call clock-gettime-monotonic-coarse: libc: 882 nsec/call clock-gettime-monotonic-coarse: vdso: 745 nsec/call Additional test passed with: vdsotest -d 30 clock-gettime-monotonic-coarse verify Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://github.com/linuxppc/issues/issues/41 Link: https://lore.kernel.org/r/d1d24a376e396540194eeb85a2efe481e92ade24.1575273217.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/vdso32/gettimeofday.S | 64 ++++++++++++++++++++--- 1 file changed, 57 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index 3306672f57a9..e9ce8ee56edb 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -69,7 +69,13 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) cmpli cr0,r3,CLOCK_REALTIME cmpli cr1,r3,CLOCK_MONOTONIC cror cr0*4+eq,cr0*4+eq,cr1*4+eq - bne cr0,99f + + cmpli cr5,r3,CLOCK_REALTIME_COARSE + cmpli cr6,r3,CLOCK_MONOTONIC_COARSE + cror cr5*4+eq,cr5*4+eq,cr6*4+eq + + cror cr0*4+eq,cr0*4+eq,cr5*4+eq + bne cr0, .Lgettime_fallback mflr r12 /* r12 saves lr */ .cfi_register lr,r12 @@ -78,8 +84,10 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) mr r9,r3 /* datapage ptr in r9 */ lis r7,NSEC_PER_SEC@h /* want nanoseconds */ ori r7,r7,NSEC_PER_SEC@l -50: bl __do_get_tspec@local /* get sec/nsec from tb & kernel */ - bne cr1,80f /* not monotonic -> all done */ + beq cr5, .Lcoarse_clocks +.Lprecise_clocks: + bl __do_get_tspec@local /* get sec/nsec from tb & kernel */ + bne cr1, .Lfinish /* not monotonic -> all done */ /* * CLOCK_MONOTONIC @@ -103,12 +111,53 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) add r9,r9,r0 lwz r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9) cmpl cr0,r8,r0 /* check if updated */ - bne- 50b + bne- .Lprecise_clocks + b .Lfinish_monotonic + + /* + * For coarse clocks we get data directly from the vdso data page, so + * we don't need to call __do_get_tspec, but we still need to do the + * counter trick. + */ +.Lcoarse_clocks: + lwz r8,(CFG_TB_UPDATE_COUNT+LOPART)(r9) + andi. r0,r8,1 /* pending update ? loop */ + bne- .Lcoarse_clocks + add r9,r9,r0 /* r0 is already 0 */ + + /* + * CLOCK_REALTIME_COARSE, below values are needed for MONOTONIC_COARSE + * too + */ + lwz r3,STAMP_XTIME_SEC+LOPART(r9) + lwz r4,STAMP_XTIME_NSEC+LOPART(r9) + bne cr6,1f + + /* CLOCK_MONOTONIC_COARSE */ + lwz r5,(WTOM_CLOCK_SEC+LOPART)(r9) + lwz r6,WTOM_CLOCK_NSEC(r9) + + /* check if counter has updated */ + or r0,r6,r5 +1: or r0,r0,r3 + or r0,r0,r4 + xor r0,r0,r0 + add r3,r3,r0 + lwz r0,CFG_TB_UPDATE_COUNT+LOPART(r9) + cmpl cr0,r0,r8 /* check if updated */ + bne- .Lcoarse_clocks + + /* Counter has not updated, so continue calculating proper values for + * sec and nsec if monotonic coarse, or just return with the proper + * values for realtime. + */ + bne cr6, .Lfinish /* Calculate and store result. Note that this mimics the C code, * which may cause funny results if nsec goes negative... is that * possible at all ? */ +.Lfinish_monotonic: add r3,r3,r5 add r4,r4,r6 cmpw cr0,r4,r7 @@ -116,11 +165,12 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) blt 1f subf r4,r7,r4 addi r3,r3,1 -1: bge cr1,80f +1: bge cr1, .Lfinish addi r3,r3,-1 add r4,r4,r7 -80: stw r3,TSPC32_TV_SEC(r11) +.Lfinish: + stw r3,TSPC32_TV_SEC(r11) stw r4,TSPC32_TV_NSEC(r11) mtlr r12 @@ -131,7 +181,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) /* * syscall fallback */ -99: +.Lgettime_fallback: li r0,__NR_clock_gettime .cfi_restore lr sc From ec0895f08f99515194e9fcfe1338becf6f759d38 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 2 Dec 2019 07:57:30 +0000 Subject: [PATCH 037/131] powerpc/vdso32: inline __get_datapage() __get_datapage() is only a few instructions to retrieve the address of the page where the kernel stores data to the VDSO. By inlining this function into its users, a bl/blr pair and a mflr/mtlr pair is avoided, plus a few reg moves. The improvement is noticeable (about 55 nsec/call on an 8xx) vdsotest before the patch: gettimeofday: vdso: 731 nsec/call clock-gettime-realtime-coarse: vdso: 668 nsec/call clock-gettime-monotonic-coarse: vdso: 745 nsec/call vdsotest after the patch: gettimeofday: vdso: 677 nsec/call clock-gettime-realtime-coarse: vdso: 613 nsec/call clock-gettime-monotonic-coarse: vdso: 690 nsec/call Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c39ef7f3dfa25356b01e211d539671f279086c09.1575273217.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/vdso_datapage.h | 10 ++++++++ arch/powerpc/kernel/vdso32/cacheflush.S | 9 ++++---- arch/powerpc/kernel/vdso32/datapage.S | 28 +++-------------------- arch/powerpc/kernel/vdso32/gettimeofday.S | 12 ++++------ 4 files changed, 22 insertions(+), 37 deletions(-) diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h index 40f13f3626d3..ee5319a6f4e3 100644 --- a/arch/powerpc/include/asm/vdso_datapage.h +++ b/arch/powerpc/include/asm/vdso_datapage.h @@ -118,6 +118,16 @@ struct vdso_data { extern struct vdso_data *vdso_data; +#else /* __ASSEMBLY__ */ + +.macro get_datapage ptr, tmp + bcl 20, 31, .+4 + mflr \ptr + addi \ptr, \ptr, (__kernel_datapage_offset - (.-4))@l + lwz \tmp, 0(\ptr) + add \ptr, \tmp, \ptr +.endm + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/vdso32/cacheflush.S b/arch/powerpc/kernel/vdso32/cacheflush.S index 7f882e7b9f43..d178ec8c279d 100644 --- a/arch/powerpc/kernel/vdso32/cacheflush.S +++ b/arch/powerpc/kernel/vdso32/cacheflush.S @@ -8,6 +8,7 @@ #include #include #include +#include #include .text @@ -24,14 +25,12 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache) .cfi_startproc mflr r12 .cfi_register lr,r12 - mr r11,r3 - bl __get_datapage@local + get_datapage r10, r0 mtlr r12 - mr r10,r3 lwz r7,CFG_DCACHE_BLOCKSZ(r10) addi r5,r7,-1 - andc r6,r11,r5 /* round low to line bdy */ + andc r6,r3,r5 /* round low to line bdy */ subf r8,r6,r4 /* compute length */ add r8,r8,r5 /* ensure we get enough */ lwz r9,CFG_DCACHE_LOGBLOCKSZ(r10) @@ -48,7 +47,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache) lwz r7,CFG_ICACHE_BLOCKSZ(r10) addi r5,r7,-1 - andc r6,r11,r5 /* round low to line bdy */ + andc r6,r3,r5 /* round low to line bdy */ subf r8,r6,r4 /* compute length */ add r8,r8,r5 lwz r9,CFG_ICACHE_LOGBLOCKSZ(r10) diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S index 6c7401bd284e..1095d818f94a 100644 --- a/arch/powerpc/kernel/vdso32/datapage.S +++ b/arch/powerpc/kernel/vdso32/datapage.S @@ -10,35 +10,13 @@ #include #include #include +#include .text .global __kernel_datapage_offset; __kernel_datapage_offset: .long 0 -V_FUNCTION_BEGIN(__get_datapage) - .cfi_startproc - /* We don't want that exposed or overridable as we want other objects - * to be able to bl directly to here - */ - .protected __get_datapage - .hidden __get_datapage - - mflr r0 - .cfi_register lr,r0 - - bcl 20,31,data_page_branch -data_page_branch: - mflr r3 - mtlr r0 - addi r3, r3, __kernel_datapage_offset-data_page_branch - lwz r0,0(r3) - .cfi_restore lr - add r3,r0,r3 - blr - .cfi_endproc -V_FUNCTION_END(__get_datapage) - /* * void *__kernel_get_syscall_map(unsigned int *syscall_count) ; * @@ -53,7 +31,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map) mflr r12 .cfi_register lr,r12 mr r4,r3 - bl __get_datapage@local + get_datapage r3, r0 mtlr r12 addi r3,r3,CFG_SYSCALL_MAP32 cmpli cr0,r4,0 @@ -75,7 +53,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq) .cfi_startproc mflr r12 .cfi_register lr,r12 - bl __get_datapage@local + get_datapage r3, r0 lwz r4,(CFG_TB_TICKS_PER_SEC + 4)(r3) lwz r3,CFG_TB_TICKS_PER_SEC(r3) mtlr r12 diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index e9ce8ee56edb..74973548529a 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -33,8 +34,7 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday) mr r10,r3 /* r10 saves tv */ mr r11,r4 /* r11 saves tz */ - bl __get_datapage@local /* get data page */ - mr r9, r3 /* datapage ptr in r9 */ + get_datapage r9, r0 cmplwi r10,0 /* check if tv is NULL */ beq 3f lis r7,1000000@ha /* load up USEC_PER_SEC */ @@ -80,8 +80,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) mflr r12 /* r12 saves lr */ .cfi_register lr,r12 mr r11,r4 /* r11 saves tp */ - bl __get_datapage@local /* get data page */ - mr r9,r3 /* datapage ptr in r9 */ + get_datapage r9, r0 lis r7,NSEC_PER_SEC@h /* want nanoseconds */ ori r7,r7,NSEC_PER_SEC@l beq cr5, .Lcoarse_clocks @@ -206,7 +205,7 @@ V_FUNCTION_BEGIN(__kernel_clock_getres) mflr r12 .cfi_register lr,r12 - bl __get_datapage@local /* get data page */ + get_datapage r3, r0 lwz r5, CLOCK_HRTIMER_RES(r3) mtlr r12 li r3,0 @@ -240,8 +239,7 @@ V_FUNCTION_BEGIN(__kernel_time) .cfi_register lr,r12 mr r11,r3 /* r11 holds t */ - bl __get_datapage@local - mr r9, r3 /* datapage ptr in r9 */ + get_datapage r9, r0 lwz r3,STAMP_XTIME_SEC+LOPART(r9) From 2c29eef9fc6f61cfb81b54683be8f116dac80e7a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 2 Dec 2019 07:57:31 +0000 Subject: [PATCH 038/131] powerpc/vdso32: Don't read cache line size from the datapage on PPC32. On PPC32, the cache lines have a fixed size known at build time. Don't read it from the datapage. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/dfa7b35e27e01964fcda84bf1ed8b2b31cf93826.1575273217.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/vdso_datapage.h | 4 ---- arch/powerpc/kernel/asm-offsets.c | 2 +- arch/powerpc/kernel/vdso.c | 5 ----- arch/powerpc/kernel/vdso32/cacheflush.S | 23 +++++++++++++++++++++++ 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h index ee5319a6f4e3..b9ef6cf50ea5 100644 --- a/arch/powerpc/include/asm/vdso_datapage.h +++ b/arch/powerpc/include/asm/vdso_datapage.h @@ -108,10 +108,6 @@ struct vdso_data { __u32 stamp_sec_fraction; /* fractional seconds of stamp_xtime */ __u32 hrtimer_res; /* hrtimer resolution */ __u32 syscall_map_32[SYSCALL_MAP_SIZE]; /* map of syscalls */ - __u32 dcache_block_size; /* L1 d-cache block size */ - __u32 icache_block_size; /* L1 i-cache block size */ - __u32 dcache_log_block_size; /* L1 d-cache log block size */ - __u32 icache_log_block_size; /* L1 i-cache log block size */ }; #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 3d47aec7becf..0013197d89a6 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -389,11 +389,11 @@ int main(void) OFFSET(STAMP_XTIME_NSEC, vdso_data, stamp_xtime_nsec); OFFSET(STAMP_SEC_FRAC, vdso_data, stamp_sec_fraction); OFFSET(CLOCK_HRTIMER_RES, vdso_data, hrtimer_res); +#ifdef CONFIG_PPC64 OFFSET(CFG_ICACHE_BLOCKSZ, vdso_data, icache_block_size); OFFSET(CFG_DCACHE_BLOCKSZ, vdso_data, dcache_block_size); OFFSET(CFG_ICACHE_LOGBLOCKSZ, vdso_data, icache_log_block_size); OFFSET(CFG_DCACHE_LOGBLOCKSZ, vdso_data, dcache_log_block_size); -#ifdef CONFIG_PPC64 OFFSET(CFG_SYSCALL_MAP64, vdso_data, syscall_map_64); OFFSET(TVAL64_TV_SEC, __kernel_old_timeval, tv_sec); OFFSET(TVAL64_TV_USEC, __kernel_old_timeval, tv_usec); diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index eae9ddaecbcf..b9a108411c0d 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -728,11 +728,6 @@ static int __init vdso_init(void) */ vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT; DBG("vdso64_kbase: %p, 0x%x pages\n", vdso64_kbase, vdso64_pages); -#else - vdso_data->dcache_block_size = L1_CACHE_BYTES; - vdso_data->dcache_log_block_size = L1_CACHE_SHIFT; - vdso_data->icache_block_size = L1_CACHE_BYTES; - vdso_data->icache_log_block_size = L1_CACHE_SHIFT; #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/vdso32/cacheflush.S b/arch/powerpc/kernel/vdso32/cacheflush.S index d178ec8c279d..3440ddf21c8b 100644 --- a/arch/powerpc/kernel/vdso32/cacheflush.S +++ b/arch/powerpc/kernel/vdso32/cacheflush.S @@ -10,6 +10,7 @@ #include #include #include +#include .text @@ -23,28 +24,44 @@ */ V_FUNCTION_BEGIN(__kernel_sync_dicache) .cfi_startproc +#ifdef CONFIG_PPC64 mflr r12 .cfi_register lr,r12 get_datapage r10, r0 mtlr r12 +#endif +#ifdef CONFIG_PPC64 lwz r7,CFG_DCACHE_BLOCKSZ(r10) addi r5,r7,-1 +#else + li r5, L1_CACHE_BYTES - 1 +#endif andc r6,r3,r5 /* round low to line bdy */ subf r8,r6,r4 /* compute length */ add r8,r8,r5 /* ensure we get enough */ +#ifdef CONFIG_PPC64 lwz r9,CFG_DCACHE_LOGBLOCKSZ(r10) srw. r8,r8,r9 /* compute line count */ +#else + srwi. r8, r8, L1_CACHE_SHIFT + mr r7, r6 +#endif crclr cr0*4+so beqlr /* nothing to do? */ mtctr r8 1: dcbst 0,r6 +#ifdef CONFIG_PPC64 add r6,r6,r7 +#else + addi r6, r6, L1_CACHE_BYTES +#endif bdnz 1b sync /* Now invalidate the instruction cache */ +#ifdef CONFIG_PPC64 lwz r7,CFG_ICACHE_BLOCKSZ(r10) addi r5,r7,-1 andc r6,r3,r5 /* round low to line bdy */ @@ -54,9 +71,15 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache) srw. r8,r8,r9 /* compute line count */ crclr cr0*4+so beqlr /* nothing to do? */ +#endif mtctr r8 +#ifdef CONFIG_PPC64 2: icbi 0,r6 add r6,r6,r7 +#else +2: icbi 0, r7 + addi r7, r7, L1_CACHE_BYTES +#endif bdnz 2b isync li r3,0 From 6e2f9e9cfd560f5d9ce86503e5ae66330ccb97fc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 2 Dec 2019 07:57:32 +0000 Subject: [PATCH 039/131] powerpc/vdso32: use LOAD_REG_IMMEDIATE() Use LOAD_REG_IMMEDIATE() to load registers with immediate value. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/36f111437e66e601929308f5d5dce230e1ce472f.1575273217.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/vdso32/gettimeofday.S | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index 74973548529a..9aafacea9c4a 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -37,8 +37,7 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday) get_datapage r9, r0 cmplwi r10,0 /* check if tv is NULL */ beq 3f - lis r7,1000000@ha /* load up USEC_PER_SEC */ - addi r7,r7,1000000@l /* so we get microseconds in r4 */ + LOAD_REG_IMMEDIATE(r7, 1000000) /* load up USEC_PER_SEC */ bl __do_get_tspec@local /* get sec/usec from tb & kernel */ stw r3,TVAL32_TV_SEC(r10) stw r4,TVAL32_TV_USEC(r10) @@ -81,8 +80,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) .cfi_register lr,r12 mr r11,r4 /* r11 saves tp */ get_datapage r9, r0 - lis r7,NSEC_PER_SEC@h /* want nanoseconds */ - ori r7,r7,NSEC_PER_SEC@l + LOAD_REG_IMMEDIATE(r7, NSEC_PER_SEC) /* load up NSEC_PER_SEC */ beq cr5, .Lcoarse_clocks .Lprecise_clocks: bl __do_get_tspec@local /* get sec/nsec from tb & kernel */ From e33ffc956b086675551b86f5f83cf50ca47aa71c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 2 Dec 2019 07:57:33 +0000 Subject: [PATCH 040/131] powerpc/vdso32: implement clock_getres entirely clock_getres returns hrtimer_res for all clocks but coarse ones for which it returns KTIME_LOW_RES. return EINVAL for unknown clocks. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/37f94e47c91070b7606fb3ec3fe6fd2302a475a0.1575273217.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/asm-offsets.c | 3 +++ arch/powerpc/kernel/vdso32/gettimeofday.S | 19 +++++++++++-------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 0013197d89a6..90e53d432f2e 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -413,7 +413,10 @@ int main(void) DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC); DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE); DEFINE(CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE); + DEFINE(CLOCK_MAX, CLOCK_TAI); DEFINE(NSEC_PER_SEC, NSEC_PER_SEC); + DEFINE(EINVAL, EINVAL); + DEFINE(KTIME_LOW_RES, KTIME_LOW_RES); #ifdef CONFIG_BUG DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry)); diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index 9aafacea9c4a..20ae38f3a5a3 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -196,17 +196,20 @@ V_FUNCTION_END(__kernel_clock_gettime) V_FUNCTION_BEGIN(__kernel_clock_getres) .cfi_startproc /* Check for supported clock IDs */ - cmpwi cr0,r3,CLOCK_REALTIME - cmpwi cr1,r3,CLOCK_MONOTONIC - cror cr0*4+eq,cr0*4+eq,cr1*4+eq - bne cr0,99f + cmplwi cr0, r3, CLOCK_MAX + cmpwi cr1, r3, CLOCK_REALTIME_COARSE + cmpwi cr7, r3, CLOCK_MONOTONIC_COARSE + bgt cr0, 99f + LOAD_REG_IMMEDIATE(r5, KTIME_LOW_RES) + beq cr1, 1f + beq cr7, 1f mflr r12 .cfi_register lr,r12 get_datapage r3, r0 lwz r5, CLOCK_HRTIMER_RES(r3) mtlr r12 - li r3,0 +1: li r3,0 cmpli cr0,r4,0 crclr cr0*4+so beqlr @@ -215,11 +218,11 @@ V_FUNCTION_BEGIN(__kernel_clock_getres) blr /* - * syscall fallback + * invalid clock */ 99: - li r0,__NR_clock_getres - sc + li r3, EINVAL + crset so blr .cfi_endproc V_FUNCTION_END(__kernel_clock_getres) From bfc2eae0ad72a43d2a270acce2581c065223aa12 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 2 Dec 2019 07:57:35 +0000 Subject: [PATCH 041/131] powerpc/vdso32: miscellaneous optimisations Various optimisations by inverting branches and removing redundant instructions. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b4e79f963845545bcce1459cd6fcfe46bdde7863.1575273217.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/vdso32/datapage.S | 3 +-- arch/powerpc/kernel/vdso32/getcpu.S | 6 +++--- arch/powerpc/kernel/vdso32/gettimeofday.S | 18 +++++++++--------- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S index 1095d818f94a..217bb630f8f9 100644 --- a/arch/powerpc/kernel/vdso32/datapage.S +++ b/arch/powerpc/kernel/vdso32/datapage.S @@ -30,11 +30,10 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map) .cfi_startproc mflr r12 .cfi_register lr,r12 - mr r4,r3 + mr. r4,r3 get_datapage r3, r0 mtlr r12 addi r3,r3,CFG_SYSCALL_MAP32 - cmpli cr0,r4,0 beqlr li r0,NR_syscalls stw r0,0(r4) diff --git a/arch/powerpc/kernel/vdso32/getcpu.S b/arch/powerpc/kernel/vdso32/getcpu.S index 90b39af14383..ff5e214fec41 100644 --- a/arch/powerpc/kernel/vdso32/getcpu.S +++ b/arch/powerpc/kernel/vdso32/getcpu.S @@ -25,10 +25,10 @@ V_FUNCTION_BEGIN(__kernel_getcpu) rlwinm r7,r5,16,31-15,31-0 beq cr0,1f stw r6,0(r3) -1: beq cr1,2f - stw r7,0(r4) -2: crclr cr0*4+so +1: crclr cr0*4+so li r3,0 /* always success */ + beqlr cr1 + stw r7,0(r4) blr .cfi_endproc V_FUNCTION_END(__kernel_getcpu) diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index 20ae38f3a5a3..a3951567118a 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -32,10 +32,9 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday) mflr r12 .cfi_register lr,r12 - mr r10,r3 /* r10 saves tv */ + mr. r10,r3 /* r10 saves tv */ mr r11,r4 /* r11 saves tz */ get_datapage r9, r0 - cmplwi r10,0 /* check if tv is NULL */ beq 3f LOAD_REG_IMMEDIATE(r7, 1000000) /* load up USEC_PER_SEC */ bl __do_get_tspec@local /* get sec/usec from tb & kernel */ @@ -43,15 +42,16 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday) stw r4,TVAL32_TV_USEC(r10) 3: cmplwi r11,0 /* check if tz is NULL */ - beq 1f + mtlr r12 + crclr cr0*4+so + li r3,0 + beqlr + lwz r4,CFG_TZ_MINUTEWEST(r9)/* fill tz */ lwz r5,CFG_TZ_DSTTIME(r9) stw r4,TZONE_TZ_MINWEST(r11) stw r5,TZONE_TZ_DSTTIME(r11) -1: mtlr r12 - crclr cr0*4+so - li r3,0 blr .cfi_endproc V_FUNCTION_END(__kernel_gettimeofday) @@ -245,10 +245,10 @@ V_FUNCTION_BEGIN(__kernel_time) lwz r3,STAMP_XTIME_SEC+LOPART(r9) cmplwi r11,0 /* check if t is NULL */ - beq 2f - stw r3,0(r11) /* store result at *t */ -2: mtlr r12 + mtlr r12 crclr cr0*4+so + beqlr + stw r3,0(r11) /* store result at *t */ blr .cfi_endproc V_FUNCTION_END(__kernel_time) From 05dd7da76986937fb288b4213b1fa10dbe0d1b33 Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Thu, 21 Nov 2019 14:49:08 +0100 Subject: [PATCH 042/131] powerpc/powernv/ioda: Fix ref count for devices with their own PE The pci_dn structure used to store a pointer to the struct pci_dev, so taking a reference on the device was required. However, the pci_dev pointer was later removed from the pci_dn structure, but the reference was kept for the npu device. See commit 902bdc57451c ("powerpc/powernv/idoa: Remove unnecessary pcidev from pci_dn"). We don't need to take a reference on the device when assigning the PE as the struct pnv_ioda_pe is cleaned up at the same time as the (physical) device is released. Doing so prevents the device from being released, which is a problem for opencapi devices, since we want to be able to remove them through PCI hotplug. Now the ugly part: nvlink npu devices are not meant to be released. Because of the above, we've always leaked a reference and simply removing it now is dangerous and would likely require more work. There's currently no release device callback for nvlink devices for example. So to be safe, this patch leaks a reference on the npu device, but only for nvlink and not opencapi. Signed-off-by: Frederic Barrat Reviewed-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191121134918.7155-2-fbarrat@linux.ibm.com --- arch/powerpc/platforms/powernv/pci-ioda.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 62f17c015f9f..82e81fe83637 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1062,14 +1062,13 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) return NULL; } - /* NOTE: We get only one ref to the pci_dev for the pdn, not for the - * pointer in the PE data structure, both should be destroyed at the - * same time. However, this needs to be looked at more closely again - * once we actually start removing things (Hotplug, SR-IOV, ...) + /* NOTE: We don't get a reference for the pointer in the PE + * data structure, both the device and PE structures should be + * destroyed at the same time. However, removing nvlink + * devices will need some work. * * At some point we want to remove the PDN completely anyways */ - pci_dev_get(dev); pdn->pe_number = pe->pe_number; pe->flags = PNV_IODA_PE_DEV; pe->pdev = dev; @@ -1084,7 +1083,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) pnv_ioda_free_pe(pe); pdn->pe_number = IODA_INVALID_PE; pe->pdev = NULL; - pci_dev_put(dev); return NULL; } @@ -1205,6 +1203,14 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus); struct pnv_phb *phb = hose->private_data; + /* + * Intentionally leak a reference on the npu device (for + * nvlink only; this is not an opencapi path) to make sure it + * never goes away, as it's been the case all along and some + * work is needed otherwise. + */ + pci_dev_get(npu_pdev); + /* * Due to a hardware errata PE#0 on the NPU is reserved for * error handling. This means we only have three PEs remaining @@ -1228,7 +1234,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) */ dev_info(&npu_pdev->dev, "Associating to existing PE %x\n", pe_num); - pci_dev_get(npu_pdev); npu_pdn = pci_get_pdn(npu_pdev); rid = npu_pdev->bus->number << 8 | npu_pdn->devfn; npu_pdn->pe_number = pe_num; From 80f1ff83fa113aa718ff4009684981386c574954 Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Thu, 21 Nov 2019 14:49:09 +0100 Subject: [PATCH 043/131] powerpc/powernv/ioda: Protect PE list Protect the PHB's list of PE. Probably not needed as long as it was populated during PHB creation, but it feels right and will become required once we can add/remove opencapi devices on hotplug. Reviewed-by: Andrew Donnellan Signed-off-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191121134918.7155-3-fbarrat@linux.ibm.com --- arch/powerpc/platforms/powernv/pci-ioda.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 82e81fe83637..4bf67d8a3385 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1087,8 +1087,9 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) } /* Put PE to the list */ + mutex_lock(&phb->ioda.pe_list_mutex); list_add_tail(&pe->list, &phb->ioda.pe_list); - + mutex_unlock(&phb->ioda.pe_list_mutex); return pe; } @@ -3528,7 +3529,10 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe) struct pnv_phb *phb = pe->phb; struct pnv_ioda_pe *slave, *tmp; + mutex_lock(&phb->ioda.pe_list_mutex); list_del(&pe->list); + mutex_unlock(&phb->ioda.pe_list_mutex); + switch (phb->type) { case PNV_PHB_IODA1: pnv_pci_ioda1_release_pe_dma(pe); From c1a2feade085fb4213e2823dabaa2401626843c8 Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Thu, 21 Nov 2019 14:49:10 +0100 Subject: [PATCH 044/131] powerpc/powernv/ioda: set up PE on opencapi device when enabling The PE for an opencapi device was set as part of a late PHB fixup operation, when creating the PHB. To use the PCI hotplug framework, this is not going to work, as the PHB stays the same, it's only the devices underneath which are updated. For regular PCI devices, it is done as part of the reconfiguration of the bridge, but for opencapi PHBs, we don't have an intermediate bridge. So let's define the PE when the device is enabled. PEs are meaningless for opencapi, the NPU doesn't define them and opal is not doing anything with them. Reviewed-by: Alastair D'Silva Reviewed-by: Andrew Donnellan Signed-off-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191121134918.7155-4-fbarrat@linux.ibm.com --- arch/powerpc/platforms/powernv/pci-ioda.c | 31 +++++++++++++++++------ 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 4bf67d8a3385..99291a8f41c6 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1274,8 +1274,6 @@ static void pnv_pci_ioda_setup_PEs(void) { struct pci_controller *hose; struct pnv_phb *phb; - struct pci_bus *bus; - struct pci_dev *pdev; struct pnv_ioda_pe *pe; list_for_each_entry(hose, &hose_list, list_node) { @@ -1287,11 +1285,6 @@ static void pnv_pci_ioda_setup_PEs(void) if (phb->model == PNV_PHB_MODEL_NPU2) WARN_ON_ONCE(pnv_npu2_init(hose)); } - if (phb->type == PNV_PHB_NPU_OCAPI) { - bus = hose->bus; - list_for_each_entry(pdev, &bus->devices, bus_list) - pnv_ioda_setup_dev_PE(pdev); - } } list_for_each_entry(hose, &hose_list, list_node) { phb = hose->private_data; @@ -3400,6 +3393,28 @@ static bool pnv_pci_enable_device_hook(struct pci_dev *dev) return true; } +static bool pnv_ocapi_enable_device_hook(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn; + struct pnv_ioda_pe *pe; + + if (!phb->initialized) + return true; + + pdn = pci_get_pdn(dev); + if (!pdn) + return false; + + if (pdn->pe_number == IODA_INVALID_PE) { + pe = pnv_ioda_setup_dev_PE(dev); + if (!pe) + return false; + } + return true; +} + static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group, int num) { @@ -3640,7 +3655,7 @@ static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { }; static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = { - .enable_device_hook = pnv_pci_enable_device_hook, + .enable_device_hook = pnv_ocapi_enable_device_hook, .window_alignment = pnv_pci_window_alignment, .reset_secondary_bus = pnv_pci_reset_secondary_bus, .shutdown = pnv_pci_ioda_shutdown, From f724385fea0136af31a850fbcff01a6e8cf8c889 Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Thu, 21 Nov 2019 14:49:11 +0100 Subject: [PATCH 045/131] powerpc/powernv/ioda: Release opencapi device With hotplug, an opencapi device can now go away. It needs to be released, mostly to clean up its PE state. We were previously not defining any device callback. We can reuse the standard PCI release callback, it does a bit too much for an opencapi device, but it's harmless, and only needs minor tuning. Also separate the undo of the PELT-V code in a separate function, it is not needed for NPU devices and it improves a bit the readability of the code. Signed-off-by: Frederic Barrat Reviewed-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191121134918.7155-5-fbarrat@linux.ibm.com --- arch/powerpc/platforms/powernv/pci-ioda.c | 59 +++++++++++++++-------- 1 file changed, 40 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 99291a8f41c6..e41d38299ae9 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -188,7 +188,7 @@ static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) unsigned int pe_num = pe->pe_number; WARN_ON(pe->pdev); - WARN_ON(pe->npucomp); /* NPUs are not supposed to be freed */ + WARN_ON(pe->npucomp); /* NPUs for nvlink are not supposed to be freed */ kfree(pe->npucomp); memset(pe, 0, sizeof(struct pnv_ioda_pe)); clear_bit(pe_num, phb->ioda.pe_alloc); @@ -777,6 +777,34 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb, return 0; } +static void pnv_ioda_unset_peltv(struct pnv_phb *phb, + struct pnv_ioda_pe *pe, + struct pci_dev *parent) +{ + int64_t rc; + + while (parent) { + struct pci_dn *pdn = pci_get_pdn(parent); + + if (pdn && pdn->pe_number != IODA_INVALID_PE) { + rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, + pe->pe_number, + OPAL_REMOVE_PE_FROM_DOMAIN); + /* XXX What to do in case of error ? */ + } + parent = parent->bus->self; + } + + opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + + /* Disassociate PE in PELT */ + rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, + pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); + if (rc) + pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc); +} + static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { struct pci_dev *parent; @@ -827,25 +855,13 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) for (rid = pe->rid; rid < rid_end; rid++) phb->ioda.pe_rmap[rid] = IODA_INVALID_PE; - /* Release from all parents PELT-V */ - while (parent) { - struct pci_dn *pdn = pci_get_pdn(parent); - if (pdn && pdn->pe_number != IODA_INVALID_PE) { - rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, - pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); - /* XXX What to do in case of error ? */ - } - parent = parent->bus->self; - } + /* + * Release from all parents PELT-V. NPUs don't have a PELTV + * table + */ + if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI) + pnv_ioda_unset_peltv(phb, pe, parent); - opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number, - OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); - - /* Disassociate PE in PELT */ - rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, - pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); - if (rc) - pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc); rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, bcomp, dcomp, fcomp, OPAL_UNMAP_PE); if (rc) @@ -1075,6 +1091,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) pe->pbus = NULL; pe->mve_number = -1; pe->rid = dev->bus->number << 8 | pdn->devfn; + pe->device_count++; pe_info(pe, "Associated device to PE\n"); @@ -1239,6 +1256,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) rid = npu_pdev->bus->number << 8 | npu_pdn->devfn; npu_pdn->pe_number = pe_num; phb->ioda.pe_rmap[rid] = pe->pe_number; + pe->device_count++; /* Map the PE to this link */ rc = opal_pci_set_pe(phb->opal_id, pe_num, rid, @@ -3555,6 +3573,8 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe) case PNV_PHB_IODA2: pnv_pci_ioda2_release_pe_dma(pe); break; + case PNV_PHB_NPU_OCAPI: + break; default: WARN_ON(1); } @@ -3656,6 +3676,7 @@ static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = { .enable_device_hook = pnv_ocapi_enable_device_hook, + .release_device = pnv_pci_release_device, .window_alignment = pnv_pci_window_alignment, .reset_secondary_bus = pnv_pci_reset_secondary_bus, .shutdown = pnv_pci_ioda_shutdown, From bbb789046084d18fff1a32a7bcdeca36dfa0ac14 Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Thu, 21 Nov 2019 14:49:12 +0100 Subject: [PATCH 046/131] powerpc/powernv/ioda: Find opencapi slot for a device node Unlike real PCI slots, opencapi slots are directly associated to the (virtual) opencapi PHB, there's no intermediate bridge. So when looking for a slot ID, we must start the search from the device node itself and not its parent. Also, the slot ID is not attached to a specific bdfn, so let's build it from the PHB ID, like skiboot. Signed-off-by: Frederic Barrat Reviewed-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191121134918.7155-6-fbarrat@linux.ibm.com --- arch/powerpc/include/asm/pnv-pci.h | 1 + arch/powerpc/platforms/powernv/pci.c | 24 ++++++++++++++---------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h index edcb1fc50aeb..d0ee0ede5767 100644 --- a/arch/powerpc/include/asm/pnv-pci.h +++ b/arch/powerpc/include/asm/pnv-pci.h @@ -15,6 +15,7 @@ #define PCI_SLOT_ID_PREFIX (1UL << 63) #define PCI_SLOT_ID(phb_id, bdfn) \ (PCI_SLOT_ID_PREFIX | ((uint64_t)(bdfn) << 16) | (phb_id)) +#define PCI_PHB_SLOT_ID(phb_id) (phb_id) extern int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id); extern int pnv_pci_get_device_tree(uint32_t phandle, void *buf, uint64_t len); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index e8e58a2cccdd..929e0241d862 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -38,7 +38,7 @@ static DEFINE_MUTEX(tunnel_mutex); int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id) { - struct device_node *parent = np; + struct device_node *node = np; u32 bdfn; u64 phbid; int ret; @@ -48,25 +48,29 @@ int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id) return -ENXIO; bdfn = ((bdfn & 0x00ffff00) >> 8); - while ((parent = of_get_parent(parent))) { - if (!PCI_DN(parent)) { - of_node_put(parent); + for (node = np; node; node = of_get_parent(node)) { + if (!PCI_DN(node)) { + of_node_put(node); break; } - if (!of_device_is_compatible(parent, "ibm,ioda2-phb") && - !of_device_is_compatible(parent, "ibm,ioda3-phb")) { - of_node_put(parent); + if (!of_device_is_compatible(node, "ibm,ioda2-phb") && + !of_device_is_compatible(node, "ibm,ioda3-phb") && + !of_device_is_compatible(node, "ibm,ioda2-npu2-opencapi-phb")) { + of_node_put(node); continue; } - ret = of_property_read_u64(parent, "ibm,opal-phbid", &phbid); + ret = of_property_read_u64(node, "ibm,opal-phbid", &phbid); if (ret) { - of_node_put(parent); + of_node_put(node); return -ENXIO; } - *id = PCI_SLOT_ID(phbid, bdfn); + if (of_device_is_compatible(node, "ibm,ioda2-npu2-opencapi-phb")) + *id = PCI_PHB_SLOT_ID(phbid); + else + *id = PCI_SLOT_ID(phbid, bdfn); return 0; } From 658ab186dd22060408d94f5c5a6d02d809baba44 Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Thu, 21 Nov 2019 14:49:13 +0100 Subject: [PATCH 047/131] pci/hotplug/pnv-php: Remove erroneous warning On powernv, when removing a device through hotplug, the following warning is logged: Invalid refcount <.> on <...> It may be incorrect, the refcount may be set to a higher value than 1 and be valid. of_detach_node() can drop more than one reference. As it doesn't seem trivial to assert the correct value, let's remove the warning. Reviewed-by: Alastair D'Silva Reviewed-by: Andrew Donnellan Signed-off-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191121134918.7155-7-fbarrat@linux.ibm.com --- drivers/pci/hotplug/pnv_php.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c index d7b2b47bc33e..6037983c6e46 100644 --- a/drivers/pci/hotplug/pnv_php.c +++ b/drivers/pci/hotplug/pnv_php.c @@ -151,17 +151,11 @@ static void pnv_php_rmv_pdns(struct device_node *dn) static void pnv_php_detach_device_nodes(struct device_node *parent) { struct device_node *dn; - int refcount; for_each_child_of_node(parent, dn) { pnv_php_detach_device_nodes(dn); of_node_put(dn); - refcount = kref_read(&dn->kobj.kref); - if (refcount != 1) - pr_warn("Invalid refcount %d on <%pOF>\n", - refcount, dn); - of_detach_node(dn); } } From 323c2a26ff43500a96799250330fab68903d776f Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Thu, 21 Nov 2019 14:49:14 +0100 Subject: [PATCH 048/131] pci/hotplug/pnv-php: Improve error msg on power state change failure When changing the slot state, if opal hits an error and tells as such in the asynchronous reply, the warning "Wrong msg" is logged, which is rather confusing. Instead we can reuse the better message which is already used when we couldn't submit the asynchronous opal request initially. Reviewed-by: Alastair D'Silva Reviewed-by: Andrew Donnellan Signed-off-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191121134918.7155-8-fbarrat@linux.ibm.com --- drivers/pci/hotplug/pnv_php.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c index 6037983c6e46..c5f88e937fd5 100644 --- a/drivers/pci/hotplug/pnv_php.c +++ b/drivers/pci/hotplug/pnv_php.c @@ -336,18 +336,19 @@ int pnv_php_set_slot_power_state(struct hotplug_slot *slot, ret = pnv_pci_set_power_state(php_slot->id, state, &msg); if (ret > 0) { if (be64_to_cpu(msg.params[1]) != php_slot->dn->phandle || - be64_to_cpu(msg.params[2]) != state || - be64_to_cpu(msg.params[3]) != OPAL_SUCCESS) { + be64_to_cpu(msg.params[2]) != state) { pci_warn(php_slot->pdev, "Wrong msg (%lld, %lld, %lld)\n", be64_to_cpu(msg.params[1]), be64_to_cpu(msg.params[2]), be64_to_cpu(msg.params[3])); return -ENOMSG; } + if (be64_to_cpu(msg.params[3]) != OPAL_SUCCESS) { + ret = -ENODEV; + goto error; + } } else if (ret < 0) { - pci_warn(php_slot->pdev, "Error %d powering %s\n", - ret, (state == OPAL_PCI_SLOT_POWER_ON) ? "on" : "off"); - return ret; + goto error; } if (state == OPAL_PCI_SLOT_POWER_OFF || state == OPAL_PCI_SLOT_OFFLINE) @@ -356,6 +357,11 @@ int pnv_php_set_slot_power_state(struct hotplug_slot *slot, ret = pnv_php_add_devtree(php_slot); return ret; + +error: + pci_warn(php_slot->pdev, "Error %d powering %s\n", + ret, (state == OPAL_PCI_SLOT_POWER_ON) ? "on" : "off"); + return ret; } EXPORT_SYMBOL_GPL(pnv_php_set_slot_power_state); From ea53919b15bf1e821b034b9c89758008de521437 Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Thu, 21 Nov 2019 14:49:15 +0100 Subject: [PATCH 049/131] pci/hotplug/pnv-php: Register opencapi slots Add the opencapi PHBs to the list of PHBs being scanned to look for slots. Signed-off-by: Frederic Barrat Reviewed-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191121134918.7155-9-fbarrat@linux.ibm.com --- drivers/pci/hotplug/pnv_php.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c index c5f88e937fd5..0cb6a4897905 100644 --- a/drivers/pci/hotplug/pnv_php.c +++ b/drivers/pci/hotplug/pnv_php.c @@ -1009,6 +1009,8 @@ static int __init pnv_php_init(void) for_each_compatible_node(dn, NULL, "ibm,ioda3-phb") pnv_php_register(dn); + for_each_compatible_node(dn, NULL, "ibm,ioda2-npu2-opencapi-phb") + pnv_php_register_one(dn); /* slot directly under the PHB */ return 0; } @@ -1021,6 +1023,9 @@ static void __exit pnv_php_exit(void) for_each_compatible_node(dn, NULL, "ibm,ioda3-phb") pnv_php_unregister(dn); + + for_each_compatible_node(dn, NULL, "ibm,ioda2-npu2-opencapi-phb") + pnv_php_unregister_one(dn); /* slot directly under the PHB */ } module_init(pnv_php_init); From be1611e043de63948c21cc7dd27ea6798ffd6fa4 Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Thu, 21 Nov 2019 14:49:16 +0100 Subject: [PATCH 050/131] pci/hotplug/pnv-php: Relax check when disabling slot The driver only allows to disable a slot in the POPULATED state. However, if an error occurs while enabling the slot, say because the link couldn't be trained, then the POPULATED state may not be reached, yet the power state of the slot is on. So allow to disable a slot in the REGISTERED state. Removing the devices will do nothing since it's not populated, and we'll set the power state of the slot back to off. Reviewed-by: Alastair D'Silva Reviewed-by: Andrew Donnellan Signed-off-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191121134918.7155-10-fbarrat@linux.ibm.com --- drivers/pci/hotplug/pnv_php.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c index 0cb6a4897905..f2de76390aff 100644 --- a/drivers/pci/hotplug/pnv_php.c +++ b/drivers/pci/hotplug/pnv_php.c @@ -566,7 +566,13 @@ static int pnv_php_disable_slot(struct hotplug_slot *slot) struct pnv_php_slot *php_slot = to_pnv_php_slot(slot); int ret; - if (php_slot->state != PNV_PHP_STATE_POPULATED) + /* + * Allow to disable a slot already in the registered state to + * cover cases where the slot couldn't be enabled and never + * reached the populated state + */ + if (php_slot->state != PNV_PHP_STATE_POPULATED && + php_slot->state != PNV_PHP_STATE_REGISTERED) return 0; /* Remove all devices behind the slot */ From 748ac391ab9acd8d7f3c93cbf3e63c773c0b2638 Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Thu, 21 Nov 2019 14:49:17 +0100 Subject: [PATCH 051/131] pci/hotplug/pnv-php: Wrap warnings in macro An opencapi slot doesn't have an associated bridge device. It's not needed for operation, but any warning is displayed through pci_warn() which uses the pci_dev struct of the assocated bridge device. So wrap those warning so that a different trace mechanism can be used if it's an opencapi slot. Reviewed-by: Alastair D'Silva Reviewed-by: Andrew Donnellan Signed-off-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191121134918.7155-11-fbarrat@linux.ibm.com --- drivers/pci/hotplug/pnv_php.c | 51 +++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c index f2de76390aff..04565162a449 100644 --- a/drivers/pci/hotplug/pnv_php.c +++ b/drivers/pci/hotplug/pnv_php.c @@ -18,6 +18,9 @@ #define DRIVER_AUTHOR "Gavin Shan, IBM Corporation" #define DRIVER_DESC "PowerPC PowerNV PCI Hotplug Driver" +#define SLOT_WARN(sl, x...) \ + ((sl)->pdev ? pci_warn((sl)->pdev, x) : dev_warn(&(sl)->bus->dev, x)) + struct pnv_php_event { bool added; struct pnv_php_slot *php_slot; @@ -265,7 +268,7 @@ static int pnv_php_add_devtree(struct pnv_php_slot *php_slot) ret = pnv_pci_get_device_tree(php_slot->dn->phandle, fdt1, 0x10000); if (ret) { - pci_warn(php_slot->pdev, "Error %d getting FDT blob\n", ret); + SLOT_WARN(php_slot, "Error %d getting FDT blob\n", ret); goto free_fdt1; } @@ -279,7 +282,7 @@ static int pnv_php_add_devtree(struct pnv_php_slot *php_slot) dt = of_fdt_unflatten_tree(fdt, php_slot->dn, NULL); if (!dt) { ret = -EINVAL; - pci_warn(php_slot->pdev, "Cannot unflatten FDT\n"); + SLOT_WARN(php_slot, "Cannot unflatten FDT\n"); goto free_fdt; } @@ -289,15 +292,15 @@ static int pnv_php_add_devtree(struct pnv_php_slot *php_slot) ret = pnv_php_populate_changeset(&php_slot->ocs, php_slot->dn); if (ret) { pnv_php_reverse_nodes(php_slot->dn); - pci_warn(php_slot->pdev, "Error %d populating changeset\n", - ret); + SLOT_WARN(php_slot, "Error %d populating changeset\n", + ret); goto free_dt; } php_slot->dn->child = NULL; ret = of_changeset_apply(&php_slot->ocs); if (ret) { - pci_warn(php_slot->pdev, "Error %d applying changeset\n", ret); + SLOT_WARN(php_slot, "Error %d applying changeset\n", ret); goto destroy_changeset; } @@ -337,10 +340,10 @@ int pnv_php_set_slot_power_state(struct hotplug_slot *slot, if (ret > 0) { if (be64_to_cpu(msg.params[1]) != php_slot->dn->phandle || be64_to_cpu(msg.params[2]) != state) { - pci_warn(php_slot->pdev, "Wrong msg (%lld, %lld, %lld)\n", - be64_to_cpu(msg.params[1]), - be64_to_cpu(msg.params[2]), - be64_to_cpu(msg.params[3])); + SLOT_WARN(php_slot, "Wrong msg (%lld, %lld, %lld)\n", + be64_to_cpu(msg.params[1]), + be64_to_cpu(msg.params[2]), + be64_to_cpu(msg.params[3])); return -ENOMSG; } if (be64_to_cpu(msg.params[3]) != OPAL_SUCCESS) { @@ -359,8 +362,8 @@ int pnv_php_set_slot_power_state(struct hotplug_slot *slot, return ret; error: - pci_warn(php_slot->pdev, "Error %d powering %s\n", - ret, (state == OPAL_PCI_SLOT_POWER_ON) ? "on" : "off"); + SLOT_WARN(php_slot, "Error %d powering %s\n", + ret, (state == OPAL_PCI_SLOT_POWER_ON) ? "on" : "off"); return ret; } EXPORT_SYMBOL_GPL(pnv_php_set_slot_power_state); @@ -378,8 +381,8 @@ static int pnv_php_get_power_state(struct hotplug_slot *slot, u8 *state) */ ret = pnv_pci_get_power_state(php_slot->id, &power_state); if (ret) { - pci_warn(php_slot->pdev, "Error %d getting power status\n", - ret); + SLOT_WARN(php_slot, "Error %d getting power status\n", + ret); } else { *state = power_state; } @@ -402,7 +405,7 @@ static int pnv_php_get_adapter_state(struct hotplug_slot *slot, u8 *state) *state = presence; ret = 0; } else { - pci_warn(php_slot->pdev, "Error %d getting presence\n", ret); + SLOT_WARN(php_slot, "Error %d getting presence\n", ret); } return ret; @@ -681,7 +684,7 @@ static int pnv_php_register_slot(struct pnv_php_slot *php_slot) ret = pci_hp_register(&php_slot->slot, php_slot->bus, php_slot->slot_no, php_slot->name); if (ret) { - pci_warn(php_slot->pdev, "Error %d registering slot\n", ret); + SLOT_WARN(php_slot, "Error %d registering slot\n", ret); return ret; } @@ -734,7 +737,7 @@ static int pnv_php_enable_msix(struct pnv_php_slot *php_slot) /* Enable MSIx */ ret = pci_enable_msix_exact(pdev, &entry, 1); if (ret) { - pci_warn(pdev, "Error %d enabling MSIx\n", ret); + SLOT_WARN(php_slot, "Error %d enabling MSIx\n", ret); return ret; } @@ -784,8 +787,9 @@ static irqreturn_t pnv_php_interrupt(int irq, void *data) (sts & PCI_EXP_SLTSTA_PDC)) { ret = pnv_pci_get_presence_state(php_slot->id, &presence); if (ret) { - pci_warn(pdev, "PCI slot [%s] error %d getting presence (0x%04x), to retry the operation.\n", - php_slot->name, ret, sts); + SLOT_WARN(php_slot, + "PCI slot [%s] error %d getting presence (0x%04x), to retry the operation.\n", + php_slot->name, ret, sts); return IRQ_HANDLED; } @@ -815,8 +819,9 @@ static irqreturn_t pnv_php_interrupt(int irq, void *data) */ event = kzalloc(sizeof(*event), GFP_ATOMIC); if (!event) { - pci_warn(pdev, "PCI slot [%s] missed hotplug event 0x%04x\n", - php_slot->name, sts); + SLOT_WARN(php_slot, + "PCI slot [%s] missed hotplug event 0x%04x\n", + php_slot->name, sts); return IRQ_HANDLED; } @@ -840,7 +845,7 @@ static void pnv_php_init_irq(struct pnv_php_slot *php_slot, int irq) /* Allocate workqueue */ php_slot->wq = alloc_workqueue("pciehp-%s", 0, 0, php_slot->name); if (!php_slot->wq) { - pci_warn(pdev, "Cannot alloc workqueue\n"); + SLOT_WARN(php_slot, "Cannot alloc workqueue\n"); pnv_php_disable_irq(php_slot, true); return; } @@ -864,7 +869,7 @@ static void pnv_php_init_irq(struct pnv_php_slot *php_slot, int irq) php_slot->name, php_slot); if (ret) { pnv_php_disable_irq(php_slot, true); - pci_warn(pdev, "Error %d enabling IRQ %d\n", ret, irq); + SLOT_WARN(php_slot, "Error %d enabling IRQ %d\n", ret, irq); return; } @@ -900,7 +905,7 @@ static void pnv_php_enable_irq(struct pnv_php_slot *php_slot) ret = pci_enable_device(pdev); if (ret) { - pci_warn(pdev, "Error %d enabling device\n", ret); + SLOT_WARN(php_slot, "Error %d enabling device\n", ret); return; } From 49ce94b8677c7d7a15c4d7cbbb9ff1cd8387827b Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Thu, 21 Nov 2019 14:49:18 +0100 Subject: [PATCH 052/131] ocxl: Add PCI hotplug dependency to Kconfig The PCI hotplug framework is used to update the devices when a new image is written to the FPGA. Reviewed-by: Alastair D'Silva Reviewed-by: Andrew Donnellan Signed-off-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191121134918.7155-12-fbarrat@linux.ibm.com --- drivers/misc/ocxl/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/misc/ocxl/Kconfig b/drivers/misc/ocxl/Kconfig index 1916fa65f2f2..2d2266c1439e 100644 --- a/drivers/misc/ocxl/Kconfig +++ b/drivers/misc/ocxl/Kconfig @@ -11,6 +11,7 @@ config OCXL tristate "OpenCAPI coherent accelerator support" depends on PPC_POWERNV && PCI && EEH select OCXL_BASE + select HOTPLUG_PCI_POWERNV default m help Select this option to enable the ocxl driver for Open From b1268f4cdba71c0cd40b533778812340d36de8ae Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Mon, 15 Jul 2019 18:56:08 +1000 Subject: [PATCH 053/131] powerpc/eeh_cache: Don't use pci_dn when inserting new ranges At the point where we start inserting ranges into the EEH address cache the binding between pci_dev and eeh_dev has already been set up. Instead of consulting the pci_dn tree we can retrieve the eeh_dev directly using pci_dev_to_eeh_dev(). Signed-off-by: Oliver O'Halloran Reviewed-by: Sam Bobroff Tested-by: Sam Bobroff Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190715085612.8802-2-oohall@gmail.com --- arch/powerpc/kernel/eeh_cache.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index cf11277ebd02..6b50bf15d8c1 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -159,18 +159,10 @@ eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo, static void __eeh_addr_cache_insert_dev(struct pci_dev *dev) { - struct pci_dn *pdn; struct eeh_dev *edev; int i; - pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn); - if (!pdn) { - pr_warn("PCI: no pci dn found for dev=%s\n", - pci_name(dev)); - return; - } - - edev = pdn_to_eeh_dev(pdn); + edev = pci_dev_to_eeh_dev(dev); if (!edev) { pr_warn("PCI: no EEH dev found for %s\n", pci_name(dev)); From 89f51839bd75b1aa8fa062a5f2b4bbcde560f6e2 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Mon, 15 Jul 2019 18:56:09 +1000 Subject: [PATCH 054/131] powerpc/eeh_sysfs: Fix incorrect comment The EEH_ATTR_SHOW() helper is used to display fields from struct eeh_dev not struct pci_dn. Signed-off-by: Oliver O'Halloran Reviewed-by: Sam Bobroff Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190715085612.8802-3-oohall@gmail.com --- arch/powerpc/kernel/eeh_sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c index ab44d965a53c..6afd82662b76 100644 --- a/arch/powerpc/kernel/eeh_sysfs.c +++ b/arch/powerpc/kernel/eeh_sysfs.c @@ -14,7 +14,7 @@ /** * EEH_SHOW_ATTR -- Create sysfs entry for eeh statistic * @_name: name of file in sysfs directory - * @_memb: name of member in struct pci_dn to access + * @_memb: name of member in struct eeh_dev to access * @_format: printf format for display * * All of the attributes look very similar, so just From 4107248c5615ab20b64073f2dd5917b3961239dc Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Mon, 15 Jul 2019 18:56:10 +1000 Subject: [PATCH 055/131] powerpc/eeh_sysfs: ifdef pseries sr-iov sysfs properties There are several EEH sysfs properties that only exists when the "ibm,is-open-sriov-pf" property appears in the device tree node of the PCI device. This used on pseries to indicate to the guest that the hypervisor allows the guest to configure the SR-IOV capability. Doing this requires some handshaking between the guest, hypervisor and userspace when a VF is EEH frozen which is why these properties exist. This is all dead code on non-pseries platforms so wrap it in an #ifdef CONFIG_PPC_PSERIES to make the dependency clearer. Signed-off-by: Oliver O'Halloran Tested-by: Sam Bobroff Reviewed-by: Sam Bobroff Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190715085612.8802-4-oohall@gmail.com --- arch/powerpc/kernel/eeh_sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c index 6afd82662b76..64a6dd519bf6 100644 --- a/arch/powerpc/kernel/eeh_sysfs.c +++ b/arch/powerpc/kernel/eeh_sysfs.c @@ -75,7 +75,7 @@ static ssize_t eeh_pe_state_store(struct device *dev, static DEVICE_ATTR_RW(eeh_pe_state); -#ifdef CONFIG_PCI_IOV +#if defined(CONFIG_PCI_IOV) && defined(CONFIG_PPC_PSERIES) static ssize_t eeh_notify_resume_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -132,7 +132,7 @@ static void eeh_notify_resume_remove(struct pci_dev *pdev) #else static inline int eeh_notify_resume_add(struct pci_dev *pdev) { return 0; } static inline void eeh_notify_resume_remove(struct pci_dev *pdev) { } -#endif /* CONFIG_PCI_IOV */ +#endif /* CONFIG_PCI_IOV && CONFIG PPC_PSERIES*/ void eeh_sysfs_add_device(struct pci_dev *pdev) { From 758b423275f0738fa4d382057c03f3d45c12905d Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Mon, 15 Jul 2019 18:56:11 +1000 Subject: [PATCH 056/131] powerpc/eeh_sysfs: Remove double pci_dn lookup. In eeh_notify_resume_show() the pci_dn for the device is looked up once in the declaration block and then once after checking for a NULL eeh_dev. Remove the second lookup since it's pointless. Signed-off-by: Oliver O'Halloran Reviewed-by: Sam Bobroff Tested-by: Sam Bobroff Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190715085612.8802-5-oohall@gmail.com --- arch/powerpc/kernel/eeh_sysfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c index 64a6dd519bf6..a6638ea54797 100644 --- a/arch/powerpc/kernel/eeh_sysfs.c +++ b/arch/powerpc/kernel/eeh_sysfs.c @@ -86,7 +86,6 @@ static ssize_t eeh_notify_resume_show(struct device *dev, if (!edev || !edev->pe) return -ENODEV; - pdn = pci_get_pdn(pdev); return sprintf(buf, "%d\n", pdn->last_allow_rc); } From 3489cdc417b20d929b33eff4d8312b4dd1f39ca1 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Mon, 15 Jul 2019 18:56:12 +1000 Subject: [PATCH 057/131] powerpc/eeh_sysfs: Make clearing EEH_DEV_SYSFS saner The eeh_sysfs_remove_device() function is supposed to clear the EEH_DEV_SYSFS flag since it indicates the EEH sysfs entries have been added for a pci_dev. When the sysfs files are removed eeh_remove_device() the eeh_dev and the pci_dev have already been de-associated. This then causes the pci_dev_to_eeh_dev() call in eeh_sysfs_remove_device() to return NULL so the flag can't be cleared from the still-live eeh_dev. This problem is worked around in the caller by clearing the flag manually. However, this behaviour doesn't make a whole lot of sense, so this patch fixes it by: a) Re-ordering eeh_remove_device() so that eeh_sysfs_remove_device() is called before de-associating the pci_dev and eeh_dev. b) Making eeh_sysfs_remove_device() emit a warning if there's no corresponding eeh_dev for a pci_dev. The paths where the sysfs files are only reachable if EEH was setup for the device for the device in the first place so hitting this warning indicates a programming error. Signed-off-by: Oliver O'Halloran Reviewed-by: Sam Bobroff Tested-by: Sam Bobroff Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190715085612.8802-6-oohall@gmail.com --- arch/powerpc/kernel/eeh.c | 30 +++++++++++++++++------------- arch/powerpc/kernel/eeh_sysfs.c | 15 ++++++++------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index bc8a551013be..daf9ff34a255 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1191,7 +1191,6 @@ void eeh_add_device_late(struct pci_dev *dev) eeh_rmv_from_parent_pe(edev); eeh_addr_cache_rmv_dev(edev->pdev); eeh_sysfs_remove_device(edev->pdev); - edev->mode &= ~EEH_DEV_SYSFS; /* * We definitely should have the PCI device removed @@ -1295,6 +1294,23 @@ void eeh_remove_device(struct pci_dev *dev) */ edev->pdev = NULL; + /* + * eeh_sysfs_remove_device() uses pci_dev_to_eeh_dev() so we need to + * remove the sysfs files before clearing dev.archdata.edev + */ + if (edev->mode & EEH_DEV_SYSFS) + eeh_sysfs_remove_device(dev); + + /* + * We're removing from the PCI subsystem, that means + * the PCI device driver can't support EEH or not + * well. So we rely on hotplug completely to do recovery + * for the specific PCI device. + */ + edev->mode |= EEH_DEV_NO_HANDLER; + + eeh_addr_cache_rmv_dev(dev); + /* * The flag "in_error" is used to trace EEH devices for VFs * in error state or not. It's set in eeh_report_error(). If @@ -1307,18 +1323,6 @@ void eeh_remove_device(struct pci_dev *dev) eeh_rmv_from_parent_pe(edev); else edev->mode |= EEH_DEV_DISCONNECTED; - - /* - * We're removing from the PCI subsystem, that means - * the PCI device driver can't support EEH or not - * well. So we rely on hotplug completely to do recovery - * for the specific PCI device. - */ - edev->mode |= EEH_DEV_NO_HANDLER; - - eeh_addr_cache_rmv_dev(dev); - eeh_sysfs_remove_device(dev); - edev->mode &= ~EEH_DEV_SYSFS; } int eeh_unfreeze_pe(struct eeh_pe *pe) diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c index a6638ea54797..4fb0f1e1017a 100644 --- a/arch/powerpc/kernel/eeh_sysfs.c +++ b/arch/powerpc/kernel/eeh_sysfs.c @@ -159,22 +159,23 @@ void eeh_sysfs_remove_device(struct pci_dev *pdev) { struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev); + if (!edev) { + WARN_ON(eeh_enabled()); + return; + } + + edev->mode &= ~EEH_DEV_SYSFS; + /* * The parent directory might have been removed. We needn't * continue for that case. */ - if (!pdev->dev.kobj.sd) { - if (edev) - edev->mode &= ~EEH_DEV_SYSFS; + if (!pdev->dev.kobj.sd) return; - } device_remove_file(&pdev->dev, &dev_attr_eeh_mode); device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); device_remove_file(&pdev->dev, &dev_attr_eeh_pe_state); eeh_notify_resume_remove(pdev); - - if (edev) - edev->mode &= ~EEH_DEV_SYSFS; } From 1fb4124ca9d456656a324f1ee29b7bf942f59ac8 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Wed, 21 Aug 2019 16:26:53 +1000 Subject: [PATCH 058/131] powerpc/sriov: Remove VF eeh_dev state when disabling SR-IOV When disabling virtual functions on an SR-IOV adapter we currently do not correctly remove the EEH state for the now-dead virtual functions. When removing the pci_dn that was created for the VF when SR-IOV was enabled we free the corresponding eeh_dev without removing it from the child device list of the eeh_pe that contained it. This can result in crashes due to the use-after-free. Signed-off-by: Oliver O'Halloran Reviewed-by: Sam Bobroff Tested-by: Sam Bobroff Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190821062655.19735-1-oohall@gmail.com --- arch/powerpc/kernel/pci_dn.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index 9524009ca1ae..d876eda92609 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -244,9 +244,22 @@ void remove_dev_pci_data(struct pci_dev *pdev) continue; #ifdef CONFIG_EEH - /* Release EEH device for the VF */ + /* + * Release EEH state for this VF. The PCI core + * has already torn down the pci_dev for this VF, but + * we're responsible to removing the eeh_dev since it + * has the same lifetime as the pci_dn that spawned it. + */ edev = pdn_to_eeh_dev(pdn); if (edev) { + /* + * We allocate pci_dn's for the totalvfs count, + * but only only the vfs that were activated + * have a configured PE. + */ + if (edev->pe) + eeh_rmv_from_parent_pe(edev); + pdn->edev = NULL; kfree(edev); } From 8cd6aacc640116213f9373808a627ecd3728accc Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Wed, 21 Aug 2019 16:26:54 +1000 Subject: [PATCH 059/131] powerpc/pcidn: Make VF pci_dn management CONFIG_PCI_IOV specific The powerpc PCI code requires that a pci_dn structure exists for all devices in the system. This is fine for real devices since at boot a pci_dn is created for each PCI device in the DT and it's fine for hotplugged devices since the hotplug slot driver will manage the pci_dn's devices in hotplug slots. For SR-IOV, we need the platform / pcibios to manage the pci_dn for virtual functions since firmware is unaware of VFs, and they aren't "hot plugged" in the traditional sense. Management of the pci_dn is handled by the, poorly named, functions: add_pci_dev_data() and remove_pci_dev_data(). The entire body of these functions is #ifdef`ed around CONFIG_PCI_IOV and they cannot be used in any other context, so make them only available when CONFIG_PCI_IOV is selected, and rename them to reflect their actual usage rather than having them masquerade as generic code. Signed-off-by: Oliver O'Halloran Reviewed-by: Sam Bobroff Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190821062655.19735-2-oohall@gmail.com --- arch/powerpc/include/asm/pci-bridge.h | 7 +++++-- arch/powerpc/kernel/pci_dn.c | 15 +++++---------- arch/powerpc/platforms/powernv/pci-ioda.c | 4 ++-- arch/powerpc/platforms/pseries/pci.c | 4 ++-- 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index ea6ec65970ef..69f4cb3b7c56 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -223,12 +223,15 @@ struct pci_dn { extern struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus, int devfn); extern struct pci_dn *pci_get_pdn(struct pci_dev *pdev); -extern struct pci_dn *add_dev_pci_data(struct pci_dev *pdev); -extern void remove_dev_pci_data(struct pci_dev *pdev); extern struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, struct device_node *dn); extern void pci_remove_device_node_info(struct device_node *dn); +#ifdef CONFIG_PCI_IOV +struct pci_dn *add_sriov_vf_pdns(struct pci_dev *pdev); +void remove_sriov_vf_pdns(struct pci_dev *pdev); +#endif + static inline int pci_device_from_OF_node(struct device_node *np, u8 *bus, u8 *devfn) { diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index d876eda92609..c158ac897efb 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -125,7 +125,7 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev) } #ifdef CONFIG_PCI_IOV -static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, +static struct pci_dn *add_one_sriov_vf_pdn(struct pci_dn *parent, int vf_index, int busno, int devfn) { @@ -151,11 +151,9 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, return pdn; } -#endif -struct pci_dn *add_dev_pci_data(struct pci_dev *pdev) +struct pci_dn *add_sriov_vf_pdns(struct pci_dev *pdev) { -#ifdef CONFIG_PCI_IOV struct pci_dn *parent, *pdn; int i; @@ -176,7 +174,7 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev) for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { struct eeh_dev *edev __maybe_unused; - pdn = add_one_dev_pci_data(parent, i, + pdn = add_one_sriov_vf_pdn(parent, i, pci_iov_virtfn_bus(pdev, i), pci_iov_virtfn_devfn(pdev, i)); if (!pdn) { @@ -192,14 +190,11 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev) edev->physfn = pdev; #endif /* CONFIG_EEH */ } -#endif /* CONFIG_PCI_IOV */ - return pci_get_pdn(pdev); } -void remove_dev_pci_data(struct pci_dev *pdev) +void remove_sriov_vf_pdns(struct pci_dev *pdev) { -#ifdef CONFIG_PCI_IOV struct pci_dn *parent; struct pci_dn *pdn, *tmp; int i; @@ -271,8 +266,8 @@ void remove_dev_pci_data(struct pci_dev *pdev) kfree(pdn); } } -#endif /* CONFIG_PCI_IOV */ } +#endif /* CONFIG_PCI_IOV */ struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, struct device_node *dn) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index e41d38299ae9..d178c572c474 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1747,14 +1747,14 @@ int pnv_pcibios_sriov_disable(struct pci_dev *pdev) pnv_pci_sriov_disable(pdev); /* Release PCI data */ - remove_dev_pci_data(pdev); + remove_sriov_vf_pdns(pdev); return 0; } int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) { /* Allocate PCI data */ - add_dev_pci_data(pdev); + add_sriov_vf_pdns(pdev); return pnv_pci_sriov_enable(pdev, num_vfs); } diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index 722830978639..911534b89c85 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -192,7 +192,7 @@ int pseries_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) int pseries_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) { /* Allocate PCI data */ - add_dev_pci_data(pdev); + add_sriov_vf_pdns(pdev); return pseries_pci_sriov_enable(pdev, num_vfs); } @@ -204,7 +204,7 @@ int pseries_pcibios_sriov_disable(struct pci_dev *pdev) /* Releasing pe_num_map */ kfree(pdn->pe_num_map); /* Release PCI data */ - remove_dev_pci_data(pdev); + remove_sriov_vf_pdns(pdev); pci_vf_drivers_autoprobe(pdev, true); return 0; } From a4af49f34f76f1e6125b22b50d18580426d935e0 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Wed, 21 Aug 2019 16:26:55 +1000 Subject: [PATCH 060/131] powerpc/pcidn: Warn when sriov pci_dn management is used incorrectly These functions can only be used on a SR-IOV capable physical function and they're only called in pcibios_sriov_enable / disable. Make them emit a warning in the future if they're used incorrectly and remove the dead code that checks if the device is a VF. Signed-off-by: Oliver O'Halloran Reviewed-by: Sam Bobroff Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190821062655.19735-3-oohall@gmail.com --- arch/powerpc/kernel/pci_dn.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index c158ac897efb..4e654df55969 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -158,8 +158,8 @@ struct pci_dn *add_sriov_vf_pdns(struct pci_dev *pdev) int i; /* Only support IOV for now */ - if (!pdev->is_physfn) - return pci_get_pdn(pdev); + if (WARN_ON(!pdev->is_physfn)) + return NULL; /* Check if VFs have been populated */ pdn = pci_get_pdn(pdev); @@ -199,19 +199,8 @@ void remove_sriov_vf_pdns(struct pci_dev *pdev) struct pci_dn *pdn, *tmp; int i; - /* - * VF and VF PE are created/released dynamically, so we need to - * bind/unbind them. Otherwise the VF and VF PE would be mismatched - * when re-enabling SR-IOV. - */ - if (pdev->is_virtfn) { - pdn = pci_get_pdn(pdev); - pdn->pe_number = IODA_INVALID_PE; - return; - } - /* Only support IOV PF for now */ - if (!pdev->is_physfn) + if (WARN_ON(!pdev->is_physfn)) return; /* Check if VFs have been populated */ From 22ba7289079be12c85895fee41602139e9553c93 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Thu, 12 Sep 2019 15:29:43 +1000 Subject: [PATCH 061/131] powernv/pci: Use pnv_phb as the private data for debugfs entries Use the pnv_phb structure as the private data pointer for the debugfs files. This lets us delete some code and an open-coded use of hose->private_data. Signed-off-by: Oliver O'Halloran Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190912052945.12589-1-oohall@gmail.com --- arch/powerpc/platforms/powernv/pci-ioda.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index d178c572c474..f2c755626887 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3090,19 +3090,12 @@ static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe) #ifdef CONFIG_DEBUG_FS static int pnv_pci_diag_data_set(void *data, u64 val) { - struct pci_controller *hose; - struct pnv_phb *phb; + struct pnv_phb *phb = data; s64 ret; if (val != 1ULL) return -EINVAL; - hose = (struct pci_controller *)data; - if (!hose || !hose->private_data) - return -ENODEV; - - phb = hose->private_data; - /* Retrieve the diag data from firmware */ ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data, phb->diag_data_size); @@ -3141,7 +3134,7 @@ static void pnv_pci_ioda_create_dbgfs(void) } debugfs_create_file_unsafe("dump_diag_regs", 0200, phb->dbgfs, - hose, &pnv_pci_diag_data_fops); + phb, &pnv_pci_diag_data_fops); } #endif /* CONFIG_DEBUG_FS */ } From c13a17b73eb0b30070a347111b8c386f4e6a844e Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Thu, 12 Sep 2019 15:29:44 +1000 Subject: [PATCH 062/131] powernv/pci: Allow any write trigger the diag dump Make the dump trigger off any input rather than just '1'. This allows you to write "echo 1> dump_diag_data" and it'll do what you want rather than erroring out pointlessly. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190912052945.12589-2-oohall@gmail.com --- arch/powerpc/platforms/powernv/pci-ioda.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index f2c755626887..f0425bd4edc6 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3093,9 +3093,6 @@ static int pnv_pci_diag_data_set(void *data, u64 val) struct pnv_phb *phb = data; s64 ret; - if (val != 1ULL) - return -EINVAL; - /* Retrieve the diag data from firmware */ ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data, phb->diag_data_size); From 18697d2b08622b35278fc4312e86e6b5d3cd758d Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Thu, 12 Sep 2019 15:29:45 +1000 Subject: [PATCH 063/131] powernv/pci: Add a debugfs entry to dump PHB's IODA PE state Add a debugfs entry to dump the state of the active IODA PEs. The IODA PE state reflects how the PHB's internal concept of a PE is configured. This is separate to the EEH PE state and is managed power the PowerNV PCI backend rather than the EEH core. Signed-off-by: Oliver O'Halloran Reviewed-by: Alexey Kardashevskiy [mpe: Use DEFINE_DEBUGFS_ATTRIBUTE] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190912052945.12589-3-oohall@gmail.com --- arch/powerpc/platforms/powernv/pci-ioda.c | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index f0425bd4edc6..40b2928efe78 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3107,6 +3107,33 @@ static int pnv_pci_diag_data_set(void *data, u64 val) DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_diag_data_fops, NULL, pnv_pci_diag_data_set, "%llu\n"); +static int pnv_pci_ioda_pe_dump(void *data, u64 val) +{ + struct pnv_phb *phb = data; + int pe_num; + + for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) { + struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_num]; + + if (!test_bit(pe_num, phb->ioda.pe_alloc)) + continue; + + pe_warn(pe, "rid: %04x dev count: %2d flags: %s%s%s%s%s%s\n", + pe->rid, pe->device_count, + (pe->flags & PNV_IODA_PE_DEV) ? "dev " : "", + (pe->flags & PNV_IODA_PE_BUS) ? "bus " : "", + (pe->flags & PNV_IODA_PE_BUS_ALL) ? "all " : "", + (pe->flags & PNV_IODA_PE_MASTER) ? "master " : "", + (pe->flags & PNV_IODA_PE_SLAVE) ? "slave " : "", + (pe->flags & PNV_IODA_PE_VF) ? "vf " : ""); + } + + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_ioda_pe_dump_fops, NULL, + pnv_pci_ioda_pe_dump, "%llu\n"); + #endif /* CONFIG_DEBUG_FS */ static void pnv_pci_ioda_create_dbgfs(void) @@ -3132,6 +3159,8 @@ static void pnv_pci_ioda_create_dbgfs(void) debugfs_create_file_unsafe("dump_diag_regs", 0200, phb->dbgfs, phb, &pnv_pci_diag_data_fops); + debugfs_create_file_unsafe("dump_ioda_pe_state", 0200, phb->dbgfs, + phb, &pnv_pci_ioda_pe_dump_fops); } #endif /* CONFIG_DEBUG_FS */ } From 4e0942c0302b5ad76b228b1a7b8c09f658a1d58a Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Wed, 16 Oct 2019 12:25:36 +1100 Subject: [PATCH 064/131] powerpc/eeh: Only dump stack once if an MMIO loop is detected Many drivers don't check for errors when they get a 0xFFs response from an MMIO load. As a result after an EEH event occurs a driver can get stuck in a polling loop unless it some kind of internal timeout logic. Currently EEH tries to detect and report stuck drivers by dumping a stack trace after eeh_dev_check_failure() is called EEH_MAX_FAILS times on an already frozen PE. The value of EEH_MAX_FAILS was chosen so that a dump would occur every few seconds if the driver was spinning in a loop. This results in a lot of spurious stack traces in the kernel log. Fix this by limiting it to printing one stack trace for each PE freeze. If the driver is truely stuck the kernel's hung task detector is better suited to reporting the probelm anyway. Signed-off-by: Oliver O'Halloran Reviewed-by: Sam Bobroff Tested-by: Sam Bobroff Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191016012536.22588-1-oohall@gmail.com --- arch/powerpc/kernel/eeh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index daf9ff34a255..17cb3e9b5697 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -503,7 +503,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) rc = 1; if (pe->state & EEH_PE_ISOLATED) { pe->check_count++; - if (pe->check_count % EEH_MAX_FAILS == 0) { + if (pe->check_count == EEH_MAX_FAILS) { dn = pci_device_to_OF_node(dev); if (dn) location = of_get_property(dn, "ibm,loc-code", From db93361260e2411f854514e8b54b6c31a5b2d5bd Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 1 Nov 2019 17:26:10 +1100 Subject: [PATCH 065/131] powerpc/powernv: Rework exports to support subnodes Originally we only had a handful of exported memory ranges, but we'd to export the per-core trace buffers. This results in a lot of files in the exports directory which is a but unfortunate. We can clean things up a bit by turning subnodes into subdirectories of the exports directory. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191101062611.32610-1-oohall@gmail.com --- arch/powerpc/platforms/powernv/opal.c | 114 ++++++++++++++++---------- 1 file changed, 72 insertions(+), 42 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index a6ee08009f0f..14348561243f 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -834,6 +834,75 @@ static ssize_t export_attr_read(struct file *fp, struct kobject *kobj, bin_attr->size); } +static int opal_add_one_export(struct kobject *parent, const char *export_name, + struct device_node *np, const char *prop_name) +{ + struct bin_attribute *attr = NULL; + const char *name = NULL; + u64 vals[2]; + int rc; + + rc = of_property_read_u64_array(np, prop_name, &vals[0], 2); + if (rc) + goto out; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + name = kstrdup(export_name, GFP_KERNEL); + if (!name) { + rc = -ENOMEM; + goto out; + } + + sysfs_bin_attr_init(attr); + attr->attr.name = name; + attr->attr.mode = 0400; + attr->read = export_attr_read; + attr->private = __va(vals[0]); + attr->size = vals[1]; + + rc = sysfs_create_bin_file(parent, attr); +out: + if (rc) { + kfree(name); + kfree(attr); + } + + return rc; +} + +static void opal_add_exported_attrs(struct device_node *np, + struct kobject *kobj) +{ + struct device_node *child; + struct property *prop; + + for_each_property_of_node(np, prop) { + int rc; + + if (!strcmp(prop->name, "name") || + !strcmp(prop->name, "phandle")) + continue; + + rc = opal_add_one_export(kobj, prop->name, np, prop->name); + if (rc) { + pr_warn("Unable to add export %pOF/%s, rc = %d!\n", + np, prop->name, rc); + } + } + + for_each_child_of_node(np, child) { + struct kobject *child_kobj; + + child_kobj = kobject_create_and_add(child->name, kobj); + if (!child_kobj) { + pr_err("Unable to create export dir for %pOF\n", child); + continue; + } + + opal_add_exported_attrs(child, child_kobj); + } +} + /* * opal_export_attrs: creates a sysfs node for each property listed in * the device-tree under /ibm,opal/firmware/exports/ @@ -843,12 +912,8 @@ static ssize_t export_attr_read(struct file *fp, struct kobject *kobj, */ static void opal_export_attrs(void) { - struct bin_attribute *attr; struct device_node *np; - struct property *prop; struct kobject *kobj; - u64 vals[2]; - int rc; np = of_find_node_by_path("/ibm,opal/firmware/exports"); if (!np) @@ -861,41 +926,7 @@ static void opal_export_attrs(void) return; } - for_each_property_of_node(np, prop) { - if (!strcmp(prop->name, "name") || !strcmp(prop->name, "phandle")) - continue; - - if (of_property_read_u64_array(np, prop->name, &vals[0], 2)) - continue; - - attr = kzalloc(sizeof(*attr), GFP_KERNEL); - - if (attr == NULL) { - pr_warn("Failed kmalloc for bin_attribute!"); - continue; - } - - sysfs_bin_attr_init(attr); - attr->attr.name = kstrdup(prop->name, GFP_KERNEL); - attr->attr.mode = 0400; - attr->read = export_attr_read; - attr->private = __va(vals[0]); - attr->size = vals[1]; - - if (attr->attr.name == NULL) { - pr_warn("Failed kstrdup for bin_attribute attr.name"); - kfree(attr); - continue; - } - - rc = sysfs_create_bin_file(kobj, attr); - if (rc) { - pr_warn("Error %d creating OPAL sysfs exports/%s file\n", - rc, prop->name); - kfree(attr->attr.name); - kfree(attr); - } - } + opal_add_exported_attrs(np, kobj); of_node_put(np); } @@ -1056,11 +1087,10 @@ static int __init opal_init(void) opal_sys_param_init(); /* Setup message log sysfs interface. */ opal_msglog_sysfs_init(); + /* Add all export properties*/ + opal_export_attrs(); } - /* Export all properties */ - opal_export_attrs(); - /* Initialize platform devices: IPMI backend, PRD & flash interface */ opal_pdev_init("ibm,opal-ipmi"); opal_pdev_init("ibm,opal-flash"); From 846a17a53aeeee426cbc1252f517a6660eab1427 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 1 Nov 2019 17:26:11 +1100 Subject: [PATCH 066/131] powerpc/powernv: Use common code for the symbol_map export Long before we had a generic way for firmware to export memory ranges of interest we added a special case for the skiboot symbol map. The code is pretty much identical to the generic export so re-use the code. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191101062611.32610-2-oohall@gmail.com --- arch/powerpc/platforms/powernv/opal.c | 48 ++++++--------------------- 1 file changed, 10 insertions(+), 38 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 14348561243f..2b3dfd0b6cdd 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -790,42 +790,6 @@ static int opal_sysfs_init(void) return 0; } -static ssize_t symbol_map_read(struct file *fp, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t count) -{ - return memory_read_from_buffer(buf, count, &off, bin_attr->private, - bin_attr->size); -} - -static struct bin_attribute symbol_map_attr = { - .attr = {.name = "symbol_map", .mode = 0400}, - .read = symbol_map_read -}; - -static void opal_export_symmap(void) -{ - const __be64 *syms; - unsigned int size; - struct device_node *fw; - int rc; - - fw = of_find_node_by_path("/ibm,opal/firmware"); - if (!fw) - return; - syms = of_get_property(fw, "symbol-map", &size); - if (!syms || size != 2 * sizeof(__be64)) - return; - - /* Setup attributes */ - symbol_map_attr.private = __va(be64_to_cpu(syms[0])); - symbol_map_attr.size = be64_to_cpu(syms[1]); - - rc = sysfs_create_bin_file(opal_kobj, &symbol_map_attr); - if (rc) - pr_warn("Error %d creating OPAL symbols file\n", rc); -} - static ssize_t export_attr_read(struct file *fp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) @@ -914,6 +878,7 @@ static void opal_export_attrs(void) { struct device_node *np; struct kobject *kobj; + int rc; np = of_find_node_by_path("/ibm,opal/firmware/exports"); if (!np) @@ -928,6 +893,15 @@ static void opal_export_attrs(void) opal_add_exported_attrs(np, kobj); + /* + * NB: symbol_map existed before the generic export interface so it + * lives under the top level opal_kobj. + */ + rc = opal_add_one_export(opal_kobj, "symbol_map", + np->parent, "symbol-map"); + if (rc) + pr_warn("Error %d creating OPAL symbols file\n", rc); + of_node_put(np); } @@ -1073,8 +1047,6 @@ static int __init opal_init(void) /* Create "opal" kobject under /sys/firmware */ rc = opal_sysfs_init(); if (rc == 0) { - /* Export symbol map to userspace */ - opal_export_symmap(); /* Setup dump region interface */ opal_dump_region_init(); /* Setup error log interface */ From 2d9b332d99b24f27f77e9ba38ce3c8beb11a81c0 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 1 Nov 2019 19:55:21 +1100 Subject: [PATCH 067/131] powerpc/xmon: Allow passing an argument to ppc_md.restart() On PowerNV a few different kinds of reboot are supported. We'd like to be able to exercise these from xmon so allow 'zr' to take an argument, and pass that to the ppc_md.restart() function. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191101085522.3055-1-oohall@gmail.com --- arch/powerpc/xmon/xmon.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 03d23075ac43..e8c84d265602 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1192,16 +1192,19 @@ static int do_step(struct pt_regs *regs) static void bootcmds(void) { + char tmp[64]; int cmd; cmd = inchar(); - if (cmd == 'r') - ppc_md.restart(NULL); - else if (cmd == 'h') + if (cmd == 'r') { + getstring(tmp, 64); + ppc_md.restart(tmp); + } else if (cmd == 'h') { ppc_md.halt(); - else if (cmd == 'p') + } else if (cmd == 'p') { if (pm_power_off) pm_power_off(); + } } static int cpu_cmd(void) From 37f6f8e88caef5f69774b82117ca35bdc4a985d8 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 1 Nov 2019 19:55:22 +1100 Subject: [PATCH 068/131] powerpc/powernv: Allow manually invoking special reboots OPAL provides several different kinds of reboot for the kernel to use, namely forcing a full reboot, platform error reboot and MPIPL. Right now triggering the alternative resets requires some ad-hoc method such as triggering a kernel crash and hoping the stars align. It's sometimes handy to be able to trigger one of these resets directly, so add a way to do that. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191101085522.3055-2-oohall@gmail.com --- arch/powerpc/platforms/powernv/setup.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 83498604d322..11fdae81b5dd 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -233,6 +233,10 @@ static void __noreturn pnv_restart(char *cmd) rc = opal_cec_reboot(); else if (strcmp(cmd, "full") == 0) rc = opal_cec_reboot2(OPAL_REBOOT_FULL_IPL, NULL); + else if (strcmp(cmd, "mpipl") == 0) + rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, NULL); + else if (strcmp(cmd, "error") == 0) + rc = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, NULL); else rc = OPAL_UNSUPPORTED; From 3ab3f3c9df348324029e3fbdf381f551b1df8f1e Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 10 Jan 2020 18:02:03 +1100 Subject: [PATCH 069/131] powerpc/pci: Fold pcibios_setup_device() into pcibios_bus_add_device() pcibios_bus_add_device() is the only caller of pcibios_setup_device(). Fold them together since there's no real reason to keep them separate. Signed-off-by: Oliver O'Halloran Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200110070207.439-2-oohall@gmail.com --- arch/powerpc/kernel/pci-common.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index f8a59d7b724c..c6c03416a151 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -958,7 +958,7 @@ void pcibios_setup_bus_self(struct pci_bus *bus) phb->controller_ops.dma_bus_setup(bus); } -static void pcibios_setup_device(struct pci_dev *dev) +void pcibios_bus_add_device(struct pci_dev *dev) { struct pci_controller *phb; /* Fixup NUMA node as it may not be setup yet by the generic @@ -979,15 +979,9 @@ static void pcibios_setup_device(struct pci_dev *dev) pci_read_irq_line(dev); if (ppc_md.pci_irq_fixup) ppc_md.pci_irq_fixup(dev); -} - -void pcibios_bus_add_device(struct pci_dev *pdev) -{ - /* Perform platform-specific device setup */ - pcibios_setup_device(pdev); if (ppc_md.pcibios_bus_add_device) - ppc_md.pcibios_bus_add_device(pdev); + ppc_md.pcibios_bus_add_device(dev); } int pcibios_add_device(struct pci_dev *dev) From ac1172019593f80818a2fdc406d3d30d8bbb05f2 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 10 Jan 2020 18:02:04 +1100 Subject: [PATCH 070/131] powernv/pci: Remove dma_dev_setup() for NPU PHBs The pnv_pci_dma_dev_setup() only does something when: 1) There PHB contains VFs, or 2) The PHB defines a dma_dev_setup() callback in the pnv_phb structure. Neither is true for NPU PHBs so there's no reason to set the callback. Reviewed-by: Alexey Kardashevskiy Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200110070207.439-3-oohall@gmail.com --- arch/powerpc/platforms/powernv/pci-ioda.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 40b2928efe78..2b664b0caea3 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3683,7 +3683,6 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { }; static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { - .dma_dev_setup = pnv_pci_dma_dev_setup, .setup_msi_irqs = pnv_setup_msi_irqs, .teardown_msi_irqs = pnv_teardown_msi_irqs, .enable_device_hook = pnv_pci_enable_device_hook, From 965c94f309be58fbcc6c8d3e4f123376c5970d79 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 10 Jan 2020 18:02:05 +1100 Subject: [PATCH 071/131] powerpc/iov: Move VF pdev fixup into pcibios_fixup_iov() An ioda_pe for each VF is allocated in pnv_pci_sriov_enable() before the pci_dev for the VF is created. We need to set the pe->pdev pointer at some point after the pci_dev is created. Currently we do that in: pcibios_bus_add_device() pnv_pci_dma_dev_setup() (via phb->ops.dma_dev_setup) /* fixup is done here */ pnv_pci_ioda_dma_dev_setup() (via pnv_phb->dma_dev_setup) The fixup needs to be done before setting up DMA for for the VF's PE, but there's no real reason to delay it until this point. Move the fixup into pnv_pci_ioda_fixup_iov() so the ordering is: pcibios_add_device() pnv_pci_ioda_fixup_iov() (via ppc_md.pcibios_fixup_sriov) pcibios_bus_add_device() ... This isn't strictly required, but it's a slightly more logical place to do the fixup and it simplifies pnv_pci_dma_dev_setup(). Signed-off-by: Oliver O'Halloran Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200110070207.439-4-oohall@gmail.com --- arch/powerpc/platforms/powernv/pci-ioda.c | 29 +++++++++++++++++++---- arch/powerpc/platforms/powernv/pci.c | 14 ----------- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 2b664b0caea3..5ff8ac8c70f8 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2917,9 +2917,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) struct pci_dn *pdn; int mul, total_vfs; - if (!pdev->is_physfn || pci_dev_is_added(pdev)) - return; - pdn = pci_get_pdn(pdev); pdn->vfs_expanded = 0; pdn->m64_single_mode = false; @@ -2994,6 +2991,30 @@ truncate_iov: res->end = res->start - 1; } } + +static void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev) +{ + if (WARN_ON(pci_dev_is_added(pdev))) + return; + + if (pdev->is_virtfn) { + struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev); + + /* + * VF PEs are single-device PEs so their pdev pointer needs to + * be set. The pdev doesn't exist when the PE is allocated (in + * (pcibios_sriov_enable()) so we fix it up here. + */ + pe->pdev = pdev; + WARN_ON(!(pe->flags & PNV_IODA_PE_VF)); + } else if (pdev->is_physfn) { + /* + * For PFs adjust their allocated IOV resources to match what + * the PHB can support using it's M64 BAR table. + */ + pnv_pci_ioda_fixup_iov_resources(pdev); + } +} #endif /* CONFIG_PCI_IOV */ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, @@ -3936,7 +3957,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, ppc_md.pcibios_default_alignment = pnv_pci_default_alignment; #ifdef CONFIG_PCI_IOV - ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources; + ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov; ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment; ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable; ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable; diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 929e0241d862..72ffc98dd43f 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -818,20 +818,6 @@ void pnv_pci_dma_dev_setup(struct pci_dev *pdev) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); struct pnv_phb *phb = hose->private_data; -#ifdef CONFIG_PCI_IOV - struct pnv_ioda_pe *pe; - - /* Fix the VF pdn PE number */ - if (pdev->is_virtfn) { - list_for_each_entry(pe, &phb->ioda.pe_list, list) { - if (pe->rid == ((pdev->bus->number << 8) | - (pdev->devfn & 0xff))) { - pe->pdev = pdev; - break; - } - } - } -#endif /* CONFIG_PCI_IOV */ if (phb && phb->dma_dev_setup) phb->dma_dev_setup(phb, pdev); From 0a25d9c40161269067c47f6c09a12d76cfe72581 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 10 Jan 2020 18:02:06 +1100 Subject: [PATCH 072/131] powernv/pci: Fold pnv_pci_dma_dev_setup() into the pci-ioda.c version pnv_pci_dma_dev_setup() does nothing but call the phb->dma_dev_setup() callback, if one exists. That callback is only set for normal PCIe PHBs so we can remove the layer of indirection and use the ioda version in the pci_controller_ops. Signed-off-by: Oliver O'Halloran Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200110070207.439-5-oohall@gmail.com --- arch/powerpc/platforms/powernv/pci-ioda.c | 7 ++++--- arch/powerpc/platforms/powernv/pci.c | 9 --------- arch/powerpc/platforms/powernv/pci.h | 2 -- 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 5ff8ac8c70f8..6755d01eae8e 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1760,8 +1760,10 @@ int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) } #endif /* CONFIG_PCI_IOV */ -static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev) +static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev) { + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; @@ -3690,7 +3692,7 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose) } static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { - .dma_dev_setup = pnv_pci_dma_dev_setup, + .dma_dev_setup = pnv_pci_ioda_dma_dev_setup, .dma_bus_setup = pnv_pci_dma_bus_setup, .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported, .setup_msi_irqs = pnv_setup_msi_irqs, @@ -3950,7 +3952,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops; break; default: - phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; hose->controller_ops = pnv_pci_ioda_controller_ops; } diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 72ffc98dd43f..36a8fbf6907d 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -814,15 +814,6 @@ struct iommu_table *pnv_pci_table_alloc(int nid) return tbl; } -void pnv_pci_dma_dev_setup(struct pci_dev *pdev) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - - if (phb && phb->dma_dev_setup) - phb->dma_dev_setup(phb, pdev); -} - void pnv_pci_dma_bus_setup(struct pci_bus *bus) { struct pci_controller *hose = bus->sysdata; diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index f914f0b14e4e..0cdc9ba1a541 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -108,7 +108,6 @@ struct pnv_phb { int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev, unsigned int hwirq, unsigned int virq, unsigned int is_64, struct msi_msg *msg); - void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev); int (*init_m64)(struct pnv_phb *phb); int (*get_pe_state)(struct pnv_phb *phb, int pe_no); void (*freeze_pe)(struct pnv_phb *phb, int pe_no); @@ -189,7 +188,6 @@ extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr); extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option); -extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev); extern void pnv_pci_dma_bus_setup(struct pci_bus *bus); extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); From 946743d035bd2cfff059ae79012ab22148caeac2 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 10 Jan 2020 18:02:07 +1100 Subject: [PATCH 073/131] powernv/pci: Move pnv_pci_dma_bus_setup() to pci-ioda.c This is only used in pci-ioda.c so move it there and rename it to match. Signed-off-by: Oliver O'Halloran Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200110070207.439-6-oohall@gmail.com --- arch/powerpc/platforms/powernv/pci-ioda.c | 22 +++++++++++++++++++++- arch/powerpc/platforms/powernv/pci.c | 20 -------------------- arch/powerpc/platforms/powernv/pci.h | 1 - 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 6755d01eae8e..22c22cd7bd82 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3691,9 +3691,29 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose) OPAL_ASSERT_RESET); } +static void pnv_pci_ioda_dma_bus_setup(struct pci_bus *bus) +{ + struct pci_controller *hose = bus->sysdata; + struct pnv_phb *phb = hose->private_data; + struct pnv_ioda_pe *pe; + + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))) + continue; + + if (!pe->pbus) + continue; + + if (bus->number == ((pe->rid >> 8) & 0xFF)) { + pe->pbus = bus; + break; + } + } +} + static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .dma_dev_setup = pnv_pci_ioda_dma_dev_setup, - .dma_bus_setup = pnv_pci_dma_bus_setup, + .dma_bus_setup = pnv_pci_ioda_dma_bus_setup, .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported, .setup_msi_irqs = pnv_setup_msi_irqs, .teardown_msi_irqs = pnv_teardown_msi_irqs, diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 36a8fbf6907d..5bf818246339 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -814,26 +814,6 @@ struct iommu_table *pnv_pci_table_alloc(int nid) return tbl; } -void pnv_pci_dma_bus_setup(struct pci_bus *bus) -{ - struct pci_controller *hose = bus->sysdata; - struct pnv_phb *phb = hose->private_data; - struct pnv_ioda_pe *pe; - - list_for_each_entry(pe, &phb->ioda.pe_list, list) { - if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))) - continue; - - if (!pe->pbus) - continue; - - if (bus->number == ((pe->rid >> 8) & 0xFF)) { - pe->pbus = bus; - break; - } - } -} - struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev) { struct pci_controller *hose = pci_bus_to_host(dev->bus); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 0cdc9ba1a541..d3bbdeab3a32 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -188,7 +188,6 @@ extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr); extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option); -extern void pnv_pci_dma_bus_setup(struct pci_bus *bus); extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); From 173bf44bdfc768af3c07cd0aeeb6ad8d1331b77d Mon Sep 17 00:00:00 2001 From: Laurentiu Tudor Date: Tue, 14 Jan 2020 11:00:25 +0000 Subject: [PATCH 074/131] MAINTAINERS: Add myself as maintainer of ehv_bytechan tty driver Michael Ellerman made a call for volunteers from NXP to maintain this driver and I offered myself. Signed-off-by: Laurentiu Tudor Acked-by: Timur Tabi Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200114110012.17351-1-laurentiu.tudor@nxp.com --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8982c6e013b3..d10f6b00d808 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6155,6 +6155,12 @@ M: Maxim Levitsky S: Maintained F: drivers/media/rc/ene_ir.* +EPAPR HYPERVISOR BYTE CHANNEL DEVICE DRIVER +M: Laurentiu Tudor +L: linuxppc-dev@lists.ozlabs.org +S: Maintained +F: drivers/tty/ehv_bytechan.c + EPSON S1D13XXX FRAMEBUFFER DRIVER M: Kristoffer Ericson S: Maintained From 25dd118f4b2773490df12b9d190c1956cf3250c3 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 20 Nov 2019 21:41:15 +0800 Subject: [PATCH 075/131] macintosh: Fix Kconfig indentation Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Signed-off-by: Krzysztof Kozlowski Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191120134115.14918-1-krzk@kernel.org --- drivers/macintosh/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig index 574e122ae105..cbd46c1c5bf7 100644 --- a/drivers/macintosh/Kconfig +++ b/drivers/macintosh/Kconfig @@ -178,7 +178,7 @@ config THERM_ADT746X depends on I2C && I2C_POWERMAC && PPC_PMAC && !PPC_PMAC64 help This driver provides some thermostat and fan control for the - iBook G4, and the ATI based aluminium PowerBooks, allowing slightly + iBook G4, and the ATI based aluminium PowerBooks, allowing slightly better fan behaviour by default, and some manual control. config WINDFARM @@ -214,7 +214,7 @@ config WINDFARM_PM91 select I2C_POWERMAC help This driver provides thermal control for the PowerMac9,1 - which is the recent (SMU based) single CPU desktop G5 + which is the recent (SMU based) single CPU desktop G5 config WINDFARM_PM112 tristate "Support for thermal management on PowerMac11,2" @@ -242,7 +242,7 @@ config PMAC_RACKMETER depends on PPC_PMAC help This driver provides some support to control the front panel - blue LEDs "vu-meter" of the XServer macs. + blue LEDs "vu-meter" of the XServer macs. config SENSORS_AMS tristate "Apple Motion Sensor driver" From 6a3163212f311daaf2ca3b676db2e11cfd81c6b3 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Thu, 4 Jul 2019 00:03:19 +0200 Subject: [PATCH 076/131] KVM: PPC: Book3S HV: XIVE: Fix typo in comment Signed-off-by: Greg Kurz Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156219139988.578018.1046848908285019838.stgit@bahia.lan --- arch/powerpc/kvm/book3s_xive.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 66858b7d3c6b..85215e79db42 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -484,7 +484,7 @@ static void xive_finish_unmask(struct kvmppc_xive *xive, kvmppc_xive_select_irq(state, &hw_num, &xd); /* - * See command in xive_lock_and_mask() concerning masking + * See comment in xive_lock_and_mask() concerning masking * via firmware. */ if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) { From b059c63620fbba8a5da60f01d99d003681447e3c Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 15 Nov 2019 19:10:58 +0100 Subject: [PATCH 077/131] powerpc/xive: Drop extern qualifiers from header function prototypes As reported by ./scripts/checkpatch.pl --strict: CHECK: extern prototypes should be avoided in .h files Signed-off-by: Greg Kurz Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/157384145834.181768.944827793193636924.stgit@bahia.lan --- arch/powerpc/include/asm/xive.h | 84 ++++++++++++++++----------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index 24cdf97376c4..93f982dbb3d4 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -87,56 +87,56 @@ extern bool __xive_enabled; static inline bool xive_enabled(void) { return __xive_enabled; } -extern bool xive_spapr_init(void); -extern bool xive_native_init(void); -extern void xive_smp_probe(void); -extern int xive_smp_prepare_cpu(unsigned int cpu); -extern void xive_smp_setup_cpu(void); -extern void xive_smp_disable_cpu(void); -extern void xive_teardown_cpu(void); -extern void xive_shutdown(void); -extern void xive_flush_interrupt(void); +bool xive_spapr_init(void); +bool xive_native_init(void); +void xive_smp_probe(void); +int xive_smp_prepare_cpu(unsigned int cpu); +void xive_smp_setup_cpu(void); +void xive_smp_disable_cpu(void); +void xive_teardown_cpu(void); +void xive_shutdown(void); +void xive_flush_interrupt(void); /* xmon hook */ -extern void xmon_xive_do_dump(int cpu); -extern int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d); +void xmon_xive_do_dump(int cpu); +int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d); /* APIs used by KVM */ -extern u32 xive_native_default_eq_shift(void); -extern u32 xive_native_alloc_vp_block(u32 max_vcpus); -extern void xive_native_free_vp_block(u32 vp_base); -extern int xive_native_populate_irq_data(u32 hw_irq, - struct xive_irq_data *data); -extern void xive_cleanup_irq_data(struct xive_irq_data *xd); -extern u32 xive_native_alloc_irq(void); -extern void xive_native_free_irq(u32 irq); -extern int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); +u32 xive_native_default_eq_shift(void); +u32 xive_native_alloc_vp_block(u32 max_vcpus); +void xive_native_free_vp_block(u32 vp_base); +int xive_native_populate_irq_data(u32 hw_irq, + struct xive_irq_data *data); +void xive_cleanup_irq_data(struct xive_irq_data *xd); +u32 xive_native_alloc_irq(void); +void xive_native_free_irq(u32 irq); +int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); -extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, - __be32 *qpage, u32 order, bool can_escalate); -extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio); +int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, + __be32 *qpage, u32 order, bool can_escalate); +void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio); -extern void xive_native_sync_source(u32 hw_irq); -extern void xive_native_sync_queue(u32 hw_irq); -extern bool is_xive_irq(struct irq_chip *chip); -extern int xive_native_enable_vp(u32 vp_id, bool single_escalation); -extern int xive_native_disable_vp(u32 vp_id); -extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id); -extern bool xive_native_has_single_escalation(void); +void xive_native_sync_source(u32 hw_irq); +void xive_native_sync_queue(u32 hw_irq); +bool is_xive_irq(struct irq_chip *chip); +int xive_native_enable_vp(u32 vp_id, bool single_escalation); +int xive_native_disable_vp(u32 vp_id); +int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id); +bool xive_native_has_single_escalation(void); -extern int xive_native_get_queue_info(u32 vp_id, uint32_t prio, - u64 *out_qpage, - u64 *out_qsize, - u64 *out_qeoi_page, - u32 *out_escalate_irq, - u64 *out_qflags); +int xive_native_get_queue_info(u32 vp_id, uint32_t prio, + u64 *out_qpage, + u64 *out_qsize, + u64 *out_qeoi_page, + u32 *out_escalate_irq, + u64 *out_qflags); -extern int xive_native_get_queue_state(u32 vp_id, uint32_t prio, u32 *qtoggle, - u32 *qindex); -extern int xive_native_set_queue_state(u32 vp_id, uint32_t prio, u32 qtoggle, - u32 qindex); -extern int xive_native_get_vp_state(u32 vp_id, u64 *out_state); -extern bool xive_native_has_queue_state_support(void); +int xive_native_get_queue_state(u32 vp_id, uint32_t prio, u32 *qtoggle, + u32 *qindex); +int xive_native_set_queue_state(u32 vp_id, uint32_t prio, u32 qtoggle, + u32 qindex); +int xive_native_get_vp_state(u32 vp_id, u64 *out_state); +bool xive_native_has_queue_state_support(void); #else From 7e6f8cbc5e10cf7601c762db267b795273d53078 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 8 Jan 2020 12:16:47 +0530 Subject: [PATCH 078/131] powerpc/papr_scm: Don't enable direct map for a region by default Setting ND_REGION_PAGEMAP flag implies namespace mode defaults to fsdax mode. This also means kernel ends up creating struct page backing for these namspace ranges. With large namespaces that is not the right thing to do. We should let the user select the mode he/she wants the namespace to be created with. Hence disable ND_REGION_PAGEMAP for papr_scm regions. We still keep the flag for of_pmem because it supports only small persistent memory regions. This is similar to what is done for x86 with commit commit: 004f1afbe199 ("libnvdimm, pmem: direct map legacy pmem by default") Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200108064647.169637-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/platforms/pseries/papr_scm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index cdd316c6cef3..8da39a9c5569 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -357,7 +357,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) ndr_desc.mapping = &mapping; ndr_desc.num_mappings = 1; ndr_desc.nd_set = &p->nd_set; - set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); if (p->is_volatile) p->region = nvdimm_volatile_region_create(p->bus, &ndr_desc); From aff8c8242bc638ba57247ae1ec5f272ac3ed3b92 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Mon, 20 Jan 2020 14:10:02 -0800 Subject: [PATCH 079/131] powerpc/pseries/vio: Fix iommu_table use-after-free refcount warning Commit e5afdf9dd515 ("powerpc/vfio_spapr_tce: Add reference counting to iommu_table") missed an iommu_table allocation in the pseries vio code. The iommu_table is allocated with kzalloc and as a result the associated kref gets a value of zero. This has the side effect that during a DLPAR remove of the associated virtual IOA the iommu_tce_table_put() triggers a use-after-free underflow warning. Call Trace: [c0000002879e39f0] [c00000000071ecb4] refcount_warn_saturate+0x184/0x190 (unreliable) [c0000002879e3a50] [c0000000000500ac] iommu_tce_table_put+0x9c/0xb0 [c0000002879e3a70] [c0000000000f54e4] vio_dev_release+0x34/0x70 [c0000002879e3aa0] [c00000000087cfa4] device_release+0x54/0xf0 [c0000002879e3b10] [c000000000d64c84] kobject_cleanup+0xa4/0x240 [c0000002879e3b90] [c00000000087d358] put_device+0x28/0x40 [c0000002879e3bb0] [c0000000007a328c] dlpar_remove_slot+0x15c/0x250 [c0000002879e3c50] [c0000000007a348c] remove_slot_store+0xac/0xf0 [c0000002879e3cd0] [c000000000d64220] kobj_attr_store+0x30/0x60 [c0000002879e3cf0] [c0000000004ff13c] sysfs_kf_write+0x6c/0xa0 [c0000002879e3d10] [c0000000004fde4c] kernfs_fop_write+0x18c/0x260 [c0000002879e3d60] [c000000000410f3c] __vfs_write+0x3c/0x70 [c0000002879e3d80] [c000000000415408] vfs_write+0xc8/0x250 [c0000002879e3dd0] [c0000000004157dc] ksys_write+0x7c/0x120 [c0000002879e3e20] [c00000000000b278] system_call+0x5c/0x68 Further, since the refcount was always zero the iommu_tce_table_put() fails to call the iommu_table release function resulting in a leak. Fix this issue be initilizing the iommu_table kref immediately after allocation. Fixes: e5afdf9dd515 ("powerpc/vfio_spapr_tce: Add reference counting to iommu_table") Signed-off-by: Tyrel Datwyler Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1579558202-26052-1-git-send-email-tyreld@linux.ibm.com --- arch/powerpc/platforms/pseries/vio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 79e2287991db..f682b7babc09 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -1176,6 +1176,8 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) if (tbl == NULL) return NULL; + kref_init(&tbl->it_kref); + of_parse_dma_window(dev->dev.of_node, dma_window, &tbl->it_index, &offset, &size); From 1e3531982ee70adf1880715a968d9c3365f321ed Mon Sep 17 00:00:00 2001 From: Chen Zhou Date: Tue, 21 Jan 2020 09:31:53 +0800 Subject: [PATCH 080/131] powerpc/maple: Fix comparing pointer to 0 Fixes coccicheck warning: arch/powerpc/platforms/maple/setup.c:232:15-16: WARNING comparing pointer to 0 Compare pointer-typed values to NULL rather than 0. Signed-off-by: Chen Zhou Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200121013153.9937-1-chenzhou10@huawei.com --- arch/powerpc/platforms/maple/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/maple/setup.c b/arch/powerpc/platforms/maple/setup.c index 9cd6f3e1000b..bd5f9b1bc1a1 100644 --- a/arch/powerpc/platforms/maple/setup.c +++ b/arch/powerpc/platforms/maple/setup.c @@ -232,7 +232,7 @@ static void __init maple_init_IRQ(void) root = of_find_node_by_path("/"); naddr = of_n_addr_cells(root); opprop = of_get_property(root, "platform-open-pic", &opplen); - if (opprop != 0) { + if (opprop) { openpic_addr = of_read_number(opprop, naddr); has_isus = (opplen > naddr); printk(KERN_DEBUG "OpenPIC addr: %lx, has ISUs: %d\n", From def0bfdbd6039e96a9eb2baaa4470b079daab0d4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 23 Jan 2020 17:30:47 +0000 Subject: [PATCH 081/131] powerpc: use probe_user_read() and probe_user_write() Instead of opencoding, use probe_user_read() to failessly read a user location and probe_user_write() for writing to user. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/e041f5eedb23f09ab553be8a91c3de2087147320.1579800517.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/process.c | 12 +----------- arch/powerpc/kvm/book3s_64_mmu_radix.c | 6 ++---- arch/powerpc/mm/fault.c | 6 +----- arch/powerpc/oprofile/backtrace.c | 14 ++------------ arch/powerpc/perf/callchain.c | 20 +++----------------- arch/powerpc/perf/core-book3s.c | 8 +------- arch/powerpc/sysdev/fsl_pci.c | 10 ++++------ 7 files changed, 14 insertions(+), 62 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 7fcf72e58826..fad50db9dcf2 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1277,16 +1277,6 @@ void show_user_instructions(struct pt_regs *regs) pc = regs->nip - (NR_INSN_TO_PRINT * 3 / 4 * sizeof(int)); - /* - * Make sure the NIP points at userspace, not kernel text/data or - * elsewhere. - */ - if (!__access_ok(pc, NR_INSN_TO_PRINT * sizeof(int), USER_DS)) { - pr_info("%s[%d]: Bad NIP, not dumping instructions.\n", - current->comm, current->pid); - return; - } - seq_buf_init(&s, buf, sizeof(buf)); while (n) { @@ -1297,7 +1287,7 @@ void show_user_instructions(struct pt_regs *regs) for (i = 0; i < 8 && n; i++, n--, pc += sizeof(int)) { int instr; - if (probe_kernel_address((const void *)pc, instr)) { + if (probe_user_read(&instr, (void __user *)pc, sizeof(instr))) { seq_buf_printf(&s, "XXXXXXXX "); continue; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index da857c8ba6e4..231410dc9db4 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -63,12 +63,10 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, } isync(); - pagefault_disable(); if (is_load) - ret = raw_copy_from_user(to, from, n); + ret = probe_user_read(to, (const void __user *)from, n); else - ret = raw_copy_to_user(to, from, n); - pagefault_enable(); + ret = probe_user_write((void __user *)to, from, n); /* switch the pid first to avoid running host with unallocated pid */ if (quadrant == 1 && pid != old_pid) diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index b5047f9b5dec..9e119f98a725 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -279,12 +279,8 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) && access_ok(nip, sizeof(*nip))) { unsigned int inst; - int res; - pagefault_disable(); - res = __get_user_inatomic(inst, nip); - pagefault_enable(); - if (!res) + if (!probe_user_read(&inst, nip, sizeof(inst))) return !store_updates_sp(inst); *must_retry = true; } diff --git a/arch/powerpc/oprofile/backtrace.c b/arch/powerpc/oprofile/backtrace.c index 43245f4a9bcb..2799b922f780 100644 --- a/arch/powerpc/oprofile/backtrace.c +++ b/arch/powerpc/oprofile/backtrace.c @@ -28,15 +28,12 @@ static unsigned int user_getsp32(unsigned int sp, int is_first) unsigned int stack_frame[2]; void __user *p = compat_ptr(sp); - if (!access_ok(p, sizeof(stack_frame))) - return 0; - /* * The most likely reason for this is that we returned -EFAULT, * which means that we've done all that we can do from * interrupt context. */ - if (__copy_from_user_inatomic(stack_frame, p, sizeof(stack_frame))) + if (probe_user_read(stack_frame, (void __user *)p, sizeof(stack_frame))) return 0; if (!is_first) @@ -54,11 +51,7 @@ static unsigned long user_getsp64(unsigned long sp, int is_first) { unsigned long stack_frame[3]; - if (!access_ok((void __user *)sp, sizeof(stack_frame))) - return 0; - - if (__copy_from_user_inatomic(stack_frame, (void __user *)sp, - sizeof(stack_frame))) + if (probe_user_read(stack_frame, (void __user *)sp, sizeof(stack_frame))) return 0; if (!is_first) @@ -103,7 +96,6 @@ void op_powerpc_backtrace(struct pt_regs * const regs, unsigned int depth) first_frame = 0; } } else { - pagefault_disable(); #ifdef CONFIG_PPC64 if (!is_32bit_task()) { while (depth--) { @@ -112,7 +104,6 @@ void op_powerpc_backtrace(struct pt_regs * const regs, unsigned int depth) break; first_frame = 0; } - pagefault_enable(); return; } #endif @@ -123,6 +114,5 @@ void op_powerpc_backtrace(struct pt_regs * const regs, unsigned int depth) break; first_frame = 0; } - pagefault_enable(); } } diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index 35d542515faf..cbc251981209 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -155,12 +155,8 @@ static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret) ((unsigned long)ptr & 7)) return -EFAULT; - pagefault_disable(); - if (!__get_user_inatomic(*ret, ptr)) { - pagefault_enable(); + if (!probe_user_read(ret, ptr, sizeof(*ret))) return 0; - } - pagefault_enable(); return read_user_stack_slow(ptr, ret, 8); } @@ -171,12 +167,8 @@ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) ((unsigned long)ptr & 3)) return -EFAULT; - pagefault_disable(); - if (!__get_user_inatomic(*ret, ptr)) { - pagefault_enable(); + if (!probe_user_read(ret, ptr, sizeof(*ret))) return 0; - } - pagefault_enable(); return read_user_stack_slow(ptr, ret, 4); } @@ -293,17 +285,11 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, */ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) { - int rc; - if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) || ((unsigned long)ptr & 3)) return -EFAULT; - pagefault_disable(); - rc = __get_user_inatomic(*ret, ptr); - pagefault_enable(); - - return rc; + return probe_user_read(ret, ptr, sizeof(*ret)); } static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 48604625ab31..3086055bf681 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -415,7 +415,6 @@ static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) static __u64 power_pmu_bhrb_to(u64 addr) { unsigned int instr; - int ret; __u64 target; if (is_kernel_addr(addr)) { @@ -426,13 +425,8 @@ static __u64 power_pmu_bhrb_to(u64 addr) } /* Userspace: need copy instruction here then translate it */ - pagefault_disable(); - ret = __get_user_inatomic(instr, (unsigned int __user *)addr); - if (ret) { - pagefault_enable(); + if (probe_user_read(&instr, (unsigned int __user *)addr, sizeof(instr))) return 0; - } - pagefault_enable(); target = branch_target(&instr); if ((!target) || (instr & BRANCH_ABSOLUTE)) diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index 617a443d673d..4a8874bc1057 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -1065,13 +1065,11 @@ int fsl_pci_mcheck_exception(struct pt_regs *regs) addr += mfspr(SPRN_MCAR); if (is_in_pci_mem_space(addr)) { - if (user_mode(regs)) { - pagefault_disable(); - ret = get_user(inst, (__u32 __user *)regs->nip); - pagefault_enable(); - } else { + if (user_mode(regs)) + ret = probe_user_read(&inst, (void __user *)regs->nip, + sizeof(inst)); + else ret = probe_kernel_address((void *)regs->nip, inst); - } if (!ret && mcheck_handle_load(regs, inst)) { regs->nip += 4; From 493a55f1e7724dee5e71bc726d5b819292094587 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 22 Jan 2020 18:57:18 -0600 Subject: [PATCH 082/131] powerpc/xmon: Fix compile error in print_insn* functions Fix couple of compile errors I stumbled upon with CONFIG_XMON=y and CONFIG_XMON_DISASSEMBLY=n Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200123010455.GA15080@us.ibm.com --- arch/powerpc/xmon/dis-asm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/xmon/dis-asm.h b/arch/powerpc/xmon/dis-asm.h index c4d246ebca37..c4c982d6402e 100644 --- a/arch/powerpc/xmon/dis-asm.h +++ b/arch/powerpc/xmon/dis-asm.h @@ -13,13 +13,13 @@ extern int print_insn_spu(unsigned long insn, unsigned long memaddr); #else static inline int print_insn_powerpc(unsigned long insn, unsigned long memaddr) { - printf("%.8x", insn); + printf("%.8lx", insn); return 0; } static inline int print_insn_spu(unsigned long insn, unsigned long memaddr) { - printf("%.8x", insn); + printf("%.8lx", insn); return 0; } #endif From 5649607a8d0b0e019a4db14aab3de1e16c3a2b4f Mon Sep 17 00:00:00 2001 From: Vaibhav Jain Date: Wed, 22 Jan 2020 21:21:40 +0530 Subject: [PATCH 083/131] powerpc/papr_scm: Fix leaking 'bus_desc.provider_name' in some paths String 'bus_desc.provider_name' allocated inside papr_scm_nvdimm_init() will leaks in case call to nvdimm_bus_register() fails or when papr_scm_remove() is called. This minor patch ensures that 'bus_desc.provider_name' is freed in error path for nvdimm_bus_register() as well as in papr_scm_remove(). Fixes: b5beae5e224f ("powerpc/pseries: Add driver for PAPR SCM regions") Signed-off-by: Vaibhav Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200122155140.120429-1-vaibhav@linux.ibm.com --- arch/powerpc/platforms/pseries/papr_scm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index 8da39a9c5569..0b4467e378e5 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -323,6 +323,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) p->bus = nvdimm_bus_register(NULL, &p->bus_desc); if (!p->bus) { dev_err(dev, "Error creating nvdimm bus %pOF\n", p->dn); + kfree(p->bus_desc.provider_name); return -ENXIO; } @@ -477,6 +478,7 @@ static int papr_scm_remove(struct platform_device *pdev) nvdimm_bus_unregister(p->bus); drc_pmem_unbind(p); + kfree(p->bus_desc.provider_name); kfree(p); return 0; From 736bcdd3a9fc672af33fb83230ecd0570ec38ec6 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Fri, 6 Dec 2019 14:17:22 +1100 Subject: [PATCH 084/131] powerpc/mm: Remove kvm radix prefetch workaround for Power9 DD2.2 Commit a25bd72badfa ("powerpc/mm/radix: Workaround prefetch issue with KVM") introduced a number of workarounds as coming out of a guest with the mmu enabled would make the cpu would start running in hypervisor state with the PID value from the guest. The cpu will then start prefetching for the hypervisor with that PID value. In Power9 DD2.2 the cpu behaviour was modified to fix this. When accessing Quadrant 0 in hypervisor mode with LPID != 0 prefetching will not be performed. This means that we can get rid of the workarounds for Power9 DD2.2 and later revisions. Add a new cpu feature CPU_FTR_P9_RADIX_PREFETCH_BUG to indicate if the workarounds are needed. Signed-off-by: Jordan Niethe Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191206031722.25781-1-jniethe5@gmail.com --- arch/powerpc/include/asm/cputable.h | 7 +++++-- arch/powerpc/kernel/dt_cpu_ftrs.c | 13 ++++++++----- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2 ++ arch/powerpc/mm/book3s64/radix_pgtable.c | 6 +++++- arch/powerpc/mm/book3s64/radix_tlb.c | 3 +++ 5 files changed, 23 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index cf00ff0d121d..40a4d3c6fd99 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -212,6 +212,7 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTR_P9_TLBIE_STQ_BUG LONG_ASM_CONST(0x0000400000000000) #define CPU_FTR_P9_TIDR LONG_ASM_CONST(0x0000800000000000) #define CPU_FTR_P9_TLBIE_ERAT_BUG LONG_ASM_CONST(0x0001000000000000) +#define CPU_FTR_P9_RADIX_PREFETCH_BUG LONG_ASM_CONST(0x0002000000000000) #ifndef __ASSEMBLY__ @@ -459,8 +460,10 @@ static inline void cpu_feature_keys_init(void) { } CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \ CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | CPU_FTR_PKEY | \ CPU_FTR_P9_TLBIE_STQ_BUG | CPU_FTR_P9_TLBIE_ERAT_BUG | CPU_FTR_P9_TIDR) -#define CPU_FTRS_POWER9_DD2_0 CPU_FTRS_POWER9 -#define CPU_FTRS_POWER9_DD2_1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1) +#define CPU_FTRS_POWER9_DD2_0 (CPU_FTRS_POWER9 | CPU_FTR_P9_RADIX_PREFETCH_BUG) +#define CPU_FTRS_POWER9_DD2_1 (CPU_FTRS_POWER9 | \ + CPU_FTR_P9_RADIX_PREFETCH_BUG | \ + CPU_FTR_POWER9_DD2_1) #define CPU_FTRS_POWER9_DD2_2 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1 | \ CPU_FTR_P9_TM_HV_ASSIST | \ CPU_FTR_P9_TM_XER_SO_BUG) diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 180b3a5d1001..182b4047c1ef 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -727,17 +727,20 @@ static __init void cpufeatures_cpu_quirks(void) /* * Not all quirks can be derived from the cpufeatures device tree. */ - if ((version & 0xffffefff) == 0x004e0200) - ; /* DD2.0 has no feature flag */ - else if ((version & 0xffffefff) == 0x004e0201) + if ((version & 0xffffefff) == 0x004e0200) { + /* DD2.0 has no feature flag */ + cur_cpu_spec->cpu_features |= CPU_FTR_P9_RADIX_PREFETCH_BUG; + } else if ((version & 0xffffefff) == 0x004e0201) { cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; - else if ((version & 0xffffefff) == 0x004e0202) { + cur_cpu_spec->cpu_features |= CPU_FTR_P9_RADIX_PREFETCH_BUG; + } else if ((version & 0xffffefff) == 0x004e0202) { cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_HV_ASSIST; cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_XER_SO_BUG; cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; - } else if ((version & 0xffff0000) == 0x004e0000) + } else if ((version & 0xffff0000) == 0x004e0000) { /* DD2.1 and up have DD2_1 */ cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; + } if ((version & 0xffff0000) == 0x004e0000) { cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index c6fbbd29bd87..dbc2fecc37f0 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1801,6 +1801,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) tlbsync ptesync +BEGIN_FTR_SECTION /* Radix: Handle the case where the guest used an illegal PID */ LOAD_REG_ADDR(r4, mmu_base_pid) lwz r3, VCPU_GUEST_PID(r9) @@ -1830,6 +1831,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) addi r7,r7,0x1000 bdnz 1b ptesync +END_FTR_SECTION_IFSET(CPU_FTR_P9_RADIX_PREFETCH_BUG) 2: #endif /* CONFIG_PPC_RADIX_MMU */ diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 974109bb85db..dd1bea45325c 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -337,7 +337,11 @@ static void __init radix_init_pgtable(void) } /* Find out how many PID bits are supported */ - if (cpu_has_feature(CPU_FTR_HVMODE)) { + if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) { + if (!mmu_pid_bits) + mmu_pid_bits = 20; + mmu_base_pid = 1; + } else if (cpu_has_feature(CPU_FTR_HVMODE)) { if (!mmu_pid_bits) mmu_pid_bits = 20; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index a95175c0972b..03f43c924e00 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -1161,6 +1161,9 @@ extern void radix_kvm_prefetch_workaround(struct mm_struct *mm) if (unlikely(pid == MMU_NO_CONTEXT)) return; + if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) + return; + /* * If this context hasn't run on that CPU before and KVM is * around, there's a slim chance that the guest on another From f1dbc1c5c70d0d4c60b5d467ba941fba167c12f6 Mon Sep 17 00:00:00 2001 From: Michael Bringmann Date: Wed, 15 Jan 2020 08:53:59 -0600 Subject: [PATCH 085/131] powerpc/pseries/lparcfg: Fix display of Maximum Memory Correct overflow problem in calculation and display of Maximum Memory value to syscfg. Signed-off-by: Michael Bringmann [mpe: Only n_lmbs needs casting to unsigned long] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/5577aef8-1d5a-ca95-ff0a-9c7b5977e5bf@linux.ibm.com --- arch/powerpc/platforms/pseries/lparcfg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index e33e8bc4b69b..38c306551f76 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -435,10 +435,10 @@ static void maxmem_data(struct seq_file *m) { unsigned long maxmem = 0; - maxmem += drmem_info->n_lmbs * drmem_info->lmb_size; + maxmem += (unsigned long)drmem_info->n_lmbs * drmem_info->lmb_size; maxmem += hugetlb_total_pages() * PAGE_SIZE; - seq_printf(m, "MaxMem=%ld\n", maxmem); + seq_printf(m, "MaxMem=%lu\n", maxmem); } static int pseries_lparcfg_data(struct seq_file *m, void *v) From 414f50434aa2463202a5b35e844f4125dd1a7101 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Wed, 22 Jan 2020 14:11:25 +1100 Subject: [PATCH 086/131] selftests/eeh: Bump EEH wait time to 60s Some newer cards supported by aacraid can take up to 40s to recover after an EEH event. This causes spurious failures in the basic EEH self-test since the current maximim timeout is only 30s. Fix the immediate issue by bumping the timeout to a default of 60s, and allow the wait time to be specified via an environmental variable (EEH_MAX_WAIT). Reported-by: Steve Best Suggested-by: Douglas Miller Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200122031125.25991-1-oohall@gmail.com --- tools/testing/selftests/powerpc/eeh/eeh-functions.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/powerpc/eeh/eeh-functions.sh b/tools/testing/selftests/powerpc/eeh/eeh-functions.sh index 26112ab5cdf4..f52ed92b53e7 100755 --- a/tools/testing/selftests/powerpc/eeh/eeh-functions.sh +++ b/tools/testing/selftests/powerpc/eeh/eeh-functions.sh @@ -53,9 +53,13 @@ eeh_one_dev() { # is a no-op. echo $dev >/sys/kernel/debug/powerpc/eeh_dev_check - # Enforce a 30s timeout for recovery. Even the IPR, which is infamously - # slow to reset, should recover within 30s. - max_wait=30 + # Default to a 60s timeout when waiting for a device to recover. This + # is an arbitrary default which can be overridden by setting the + # EEH_MAX_WAIT environmental variable when required. + + # The current record holder for longest recovery time is: + # "Adaptec Series 8 12G SAS/PCIe 3" at 39 seconds + max_wait=${EEH_MAX_WAIT:=60} for i in `seq 0 ${max_wait}` ; do if pe_ok $dev ; then From 39bccfd164970557c5cfc60b2db029f70542549f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 21 Dec 2019 08:32:22 +0000 Subject: [PATCH 087/131] powerpc/32: replace MTMSRD() by mtmsr On PPC32, MTMSRD() is simply defined as mtmsr. Replace MTMSRD(reg) by mtmsr reg in files dedicated to PPC32, this makes the code less obscure. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/22469e78230edea3dbd0c79a555d73124f6c6d93.1576916812.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/entry_32.S | 18 +++++++++--------- arch/powerpc/kernel/head_32.h | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index d60908ea37fb..6273b4862482 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -397,7 +397,7 @@ ret_from_syscall: LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) /* doesn't include MSR_EE */ /* Note: We don't bother telling lockdep about it */ SYNC - MTMSRD(r10) + mtmsr r10 lwz r9,TI_FLAGS(r2) li r8,-MAX_ERRNO andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) @@ -554,7 +554,7 @@ syscall_exit_work: */ ori r10,r10,MSR_EE SYNC - MTMSRD(r10) + mtmsr r10 /* Save NVGPRS if they're not saved already */ lwz r4,_TRAP(r1) @@ -697,7 +697,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPE) and. r0,r0,r11 /* FP or altivec or SPE enabled? */ beq+ 1f andc r11,r11,r0 - MTMSRD(r11) + mtmsr r11 isync 1: stw r11,_MSR(r1) mfcr r10 @@ -831,7 +831,7 @@ ret_from_except: /* Note: We don't bother telling lockdep about it */ LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) SYNC /* Some chip revs have problems here... */ - MTMSRD(r10) /* disable interrupts */ + mtmsr r10 /* disable interrupts */ lwz r3,_MSR(r1) /* Returning to user mode? */ andi. r0,r3,MSR_PR @@ -998,7 +998,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) */ LOAD_REG_IMMEDIATE(r10,MSR_KERNEL & ~MSR_RI) SYNC - MTMSRD(r10) /* clear the RI bit */ + mtmsr r10 /* clear the RI bit */ .globl exc_exit_restart exc_exit_restart: lwz r12,_NIP(r1) @@ -1234,7 +1234,7 @@ do_resched: /* r10 contains MSR_KERNEL here */ #endif ori r10,r10,MSR_EE SYNC - MTMSRD(r10) /* hard-enable interrupts */ + mtmsr r10 /* hard-enable interrupts */ bl schedule recheck: /* Note: And we don't tell it we are disabling them again @@ -1243,7 +1243,7 @@ recheck: */ LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) SYNC - MTMSRD(r10) /* disable interrupts */ + mtmsr r10 /* disable interrupts */ lwz r9,TI_FLAGS(r2) andi. r0,r9,_TIF_NEED_RESCHED bne- do_resched @@ -1252,7 +1252,7 @@ recheck: do_user_signal: /* r10 contains MSR_KERNEL here */ ori r10,r10,MSR_EE SYNC - MTMSRD(r10) /* hard-enable interrupts */ + mtmsr r10 /* hard-enable interrupts */ /* save r13-r31 in the exception frame, if not already done */ lwz r3,_TRAP(r1) andi. r0,r3,1 @@ -1341,7 +1341,7 @@ _GLOBAL(enter_rtas) stw r9,8(r1) LOAD_REG_IMMEDIATE(r0,MSR_KERNEL) SYNC /* disable interrupts so SRR0/1 */ - MTMSRD(r0) /* don't get trashed */ + mtmsr r0 /* don't get trashed */ li r9,MSR_KERNEL & ~(MSR_IR|MSR_DR) mtlr r6 stw r7, THREAD + RTAS_SP(r2) diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 8abc7783dbe5..b2ca8c9ffd8b 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -50,7 +50,7 @@ rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ #else li r10,MSR_KERNEL & ~(MSR_IR|MSR_DR) /* can take exceptions */ - MTMSRD(r10) /* (except for mach check in rtas) */ + mtmsr r10 /* (except for mach check in rtas) */ #endif stw r0,GPR0(r11) lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ @@ -80,7 +80,7 @@ rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ #else LOAD_REG_IMMEDIATE(r10, MSR_KERNEL & ~(MSR_IR|MSR_DR)) /* can take exceptions */ - MTMSRD(r10) /* (except for mach check in rtas) */ + mtmsr r10 /* (except for mach check in rtas) */ #endif lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ stw r2,GPR2(r11) From 1f1c4d0122eebd204468684f21f0270ea8a4999d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 21 Dec 2019 08:32:23 +0000 Subject: [PATCH 088/131] powerpc/32: Add EXCEPTION_PROLOG_0 in head_32.h This patch creates a macro for the very first part of exception prolog, this will help when implementing CONFIG_VMAP_STACK Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/2249fe62f481121a180e9655ad2b998093f318f3.1576916812.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/head_32.S | 4 +--- arch/powerpc/kernel/head_32.h | 9 ++++++--- arch/powerpc/kernel/head_8xx.S | 9 ++------- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 4a24f8f026c7..9e868567b716 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -272,9 +272,7 @@ __secondary_hold_acknowledge: */ . = 0x200 DO_KVM 0x200 - mtspr SPRN_SPRG_SCRATCH0,r10 - mtspr SPRN_SPRG_SCRATCH1,r11 - mfcr r10 + EXCEPTION_PROLOG_0 #ifdef CONFIG_PPC_CHRP mfspr r11, SPRN_SPRG_THREAD lwz r11, RTAS_SP(r11) diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index b2ca8c9ffd8b..8e345f8d4b0e 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -10,13 +10,16 @@ * We assume sprg3 has the physical address of the current * task's thread_struct. */ - .macro EXCEPTION_PROLOG + EXCEPTION_PROLOG_0 + EXCEPTION_PROLOG_1 + EXCEPTION_PROLOG_2 +.endm + +.macro EXCEPTION_PROLOG_0 mtspr SPRN_SPRG_SCRATCH0,r10 mtspr SPRN_SPRG_SCRATCH1,r11 mfcr r10 - EXCEPTION_PROLOG_1 - EXCEPTION_PROLOG_2 .endm .macro EXCEPTION_PROLOG_1 diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 19f583e18402..dac7c0a34eea 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -494,10 +494,7 @@ InstructionTLBError: */ . = 0x1400 DataTLBError: - mtspr SPRN_SPRG_SCRATCH0, r10 - mtspr SPRN_SPRG_SCRATCH1, r11 - mfcr r10 - + EXCEPTION_PROLOG_0 mfspr r11, SPRN_DAR cmpwi cr0, r11, RPN_PATTERN beq- FixupDAR /* must be a buggy dcbX, icbi insn. */ @@ -530,9 +527,7 @@ DARFixed:/* Return from dcbx instruction bug workaround */ */ . = 0x1c00 DataBreakpoint: - mtspr SPRN_SPRG_SCRATCH0, r10 - mtspr SPRN_SPRG_SCRATCH1, r11 - mfcr r10 + EXCEPTION_PROLOG_0 mfspr r11, SPRN_SRR0 cmplwi cr0, r11, (.Ldtlbie - PAGE_OFFSET)@l cmplwi cr7, r11, (.Litlbie - PAGE_OFFSET)@l From 1ca9db5b6556c38aacac4c50a617aa3f7efbb306 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 21 Dec 2019 08:32:24 +0000 Subject: [PATCH 089/131] powerpc/32: save DEAR/DAR before calling handle_page_fault handle_page_fault() is the only function that save DAR/DEAR itself. Save DAR/DEAR before calling handle_page_fault() to prepare for VMAP stack which will require to save even before. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/3a4d58d378091086f00fde42b59610c80289e120.1576916812.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/entry_32.S | 1 - arch/powerpc/kernel/head_32.S | 2 ++ arch/powerpc/kernel/head_40x.S | 2 ++ arch/powerpc/kernel/head_8xx.S | 2 ++ arch/powerpc/kernel/head_booke.h | 2 ++ arch/powerpc/kernel/head_fsl_booke.S | 1 + 6 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 6273b4862482..317ad9df8ba8 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -621,7 +621,6 @@ ppc_swapcontext: */ .globl handle_page_fault handle_page_fault: - stw r4,_DAR(r1) addi r3,r1,STACK_FRAME_OVERHEAD #ifdef CONFIG_PPC_BOOK3S_32 andis. r0,r5,DSISR_DABRMATCH@h diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 9e868567b716..bebb49d877f2 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -310,6 +310,7 @@ BEGIN_MMU_FTR_SECTION END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) 1: lwz r5,_DSISR(r11) /* get DSISR value */ mfspr r4,SPRN_DAR + stw r4, _DAR(r11) EXC_XFER_LITE(0x300, handle_page_fault) @@ -327,6 +328,7 @@ BEGIN_MMU_FTR_SECTION END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) 1: mr r4,r12 andis. r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */ + stw r4, _DAR(r11) EXC_XFER_LITE(0x400, handle_page_fault) /* External interrupt */ diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 585ea1976550..9bb663977e84 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -313,6 +313,7 @@ _ENTRY(saved_ksp_limit) START_EXCEPTION(0x0400, InstructionAccess) EXCEPTION_PROLOG mr r4,r12 /* Pass SRR0 as arg2 */ + stw r4, _DEAR(r11) li r5,0 /* Pass zero as arg3 */ EXC_XFER_LITE(0x400, handle_page_fault) @@ -676,6 +677,7 @@ DataAccess: mfspr r5,SPRN_ESR /* Grab the ESR, save it, pass arg3 */ stw r5,_ESR(r11) mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */ + stw r4, _DEAR(r11) EXC_XFER_LITE(0x300, handle_page_fault) /* Other PowerPC processors, namely those derived from the 6xx-series diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index dac7c0a34eea..fb284d95c76a 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -486,6 +486,7 @@ InstructionTLBError: tlbie r4 /* 0x400 is InstructionAccess exception, needed by bad_page_fault() */ .Litlbie: + stw r4, _DAR(r11) EXC_XFER_LITE(0x400, handle_page_fault) /* This is the data TLB error on the MPC8xx. This could be due to @@ -504,6 +505,7 @@ DARFixed:/* Return from dcbx instruction bug workaround */ mfspr r5,SPRN_DSISR stw r5,_DSISR(r11) mfspr r4,SPRN_DAR + stw r4, _DAR(r11) andis. r10,r5,DSISR_NOHPTE@h beq+ .Ldtlbie tlbie r4 diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 2ae635df9026..37fc84ed90e3 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -467,6 +467,7 @@ label: mfspr r5,SPRN_ESR; /* Grab the ESR and save it */ \ stw r5,_ESR(r11); \ mfspr r4,SPRN_DEAR; /* Grab the DEAR */ \ + stw r4, _DEAR(r11); \ EXC_XFER_LITE(0x0300, handle_page_fault) #define INSTRUCTION_STORAGE_EXCEPTION \ @@ -475,6 +476,7 @@ label: mfspr r5,SPRN_ESR; /* Grab the ESR and save it */ \ stw r5,_ESR(r11); \ mr r4,r12; /* Pass SRR0 as arg2 */ \ + stw r4, _DEAR(r11); \ li r5,0; /* Pass zero as arg3 */ \ EXC_XFER_LITE(0x0400, handle_page_fault) diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 6f7a3a7162c5..840af004041e 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -378,6 +378,7 @@ interrupt_base: mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */ andis. r10,r5,(ESR_ILK|ESR_DLK)@h bne 1f + stw r4, _DEAR(r11) EXC_XFER_LITE(0x0300, handle_page_fault) 1: addi r3,r1,STACK_FRAME_OVERHEAD From 5ae8fabc644632d31babbc7162aa876c79aaadb3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 21 Dec 2019 08:32:25 +0000 Subject: [PATCH 090/131] powerpc/32: move MSR_PR test into EXCEPTION_PROLOG_0 In order to simplify VMAP stack implementation, move MSR_PR test into EXCEPTION_PROLOG_0. This requires to not modify cr0 between EXCEPTION_PROLOG_0 and EXCEPTION_PROLOG_1. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/5c8b5bba692b92654dbd363a229a1ba91db725bb.1576916812.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/head_32.h | 4 ++-- arch/powerpc/kernel/head_8xx.S | 39 +++++++++++++++++----------------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 8e345f8d4b0e..436ffd862d2a 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -19,12 +19,12 @@ .macro EXCEPTION_PROLOG_0 mtspr SPRN_SPRG_SCRATCH0,r10 mtspr SPRN_SPRG_SCRATCH1,r11 + mfspr r11, SPRN_SRR1 /* check whether user or kernel */ mfcr r10 + andi. r11, r11, MSR_PR .endm .macro EXCEPTION_PROLOG_1 - mfspr r11,SPRN_SRR1 /* check whether user or kernel */ - andi. r11,r11,MSR_PR tophys(r11,r1) /* use tophys(r1) if kernel */ beq 1f mfspr r11,SPRN_SPRG_THREAD diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index fb284d95c76a..175c3cfc8014 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -497,8 +497,8 @@ InstructionTLBError: DataTLBError: EXCEPTION_PROLOG_0 mfspr r11, SPRN_DAR - cmpwi cr0, r11, RPN_PATTERN - beq- FixupDAR /* must be a buggy dcbX, icbi insn. */ + cmpwi cr1, r11, RPN_PATTERN + beq- cr1, FixupDAR /* must be a buggy dcbX, icbi insn. */ DARFixed:/* Return from dcbx instruction bug workaround */ EXCEPTION_PROLOG_1 EXCEPTION_PROLOG_2 @@ -531,9 +531,9 @@ DARFixed:/* Return from dcbx instruction bug workaround */ DataBreakpoint: EXCEPTION_PROLOG_0 mfspr r11, SPRN_SRR0 - cmplwi cr0, r11, (.Ldtlbie - PAGE_OFFSET)@l + cmplwi cr1, r11, (.Ldtlbie - PAGE_OFFSET)@l cmplwi cr7, r11, (.Litlbie - PAGE_OFFSET)@l - beq- cr0, 11f + beq- cr1, 11f beq- cr7, 11f EXCEPTION_PROLOG_1 EXCEPTION_PROLOG_2 @@ -578,9 +578,9 @@ FixupDAR:/* Entry point for dcbx workaround. */ mfspr r10, SPRN_SRR0 mtspr SPRN_MD_EPN, r10 rlwinm r11, r10, 16, 0xfff8 - cmpli cr0, r11, PAGE_OFFSET@h + cmpli cr1, r11, PAGE_OFFSET@h mfspr r11, SPRN_M_TWB /* Get level 1 table */ - blt+ 3f + blt+ cr1, 3f rlwinm r11, r10, 16, 0xfff8 0: cmpli cr7, r11, (PAGE_OFFSET + 0x1800000)@h @@ -595,7 +595,7 @@ FixupDAR:/* Entry point for dcbx workaround. */ 3: lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11) /* Get the level 1 entry */ mtspr SPRN_MD_TWC, r11 - mtcr r11 + mtcrf 0x01, r11 mfspr r11, SPRN_MD_TWC lwz r11, 0(r11) /* Get the pte */ bt 28,200f /* bit 28 = Large page (8M) */ @@ -608,16 +608,16 @@ FixupDAR:/* Entry point for dcbx workaround. */ * no need to include them here */ xoris r10, r11, 0x7c00 /* check if major OP code is 31 */ rlwinm r10, r10, 0, 21, 5 - cmpwi cr0, r10, 2028 /* Is dcbz? */ - beq+ 142f - cmpwi cr0, r10, 940 /* Is dcbi? */ - beq+ 142f - cmpwi cr0, r10, 108 /* Is dcbst? */ - beq+ 144f /* Fix up store bit! */ - cmpwi cr0, r10, 172 /* Is dcbf? */ - beq+ 142f - cmpwi cr0, r10, 1964 /* Is icbi? */ - beq+ 142f + cmpwi cr1, r10, 2028 /* Is dcbz? */ + beq+ cr1, 142f + cmpwi cr1, r10, 940 /* Is dcbi? */ + beq+ cr1, 142f + cmpwi cr1, r10, 108 /* Is dcbst? */ + beq+ cr1, 144f /* Fix up store bit! */ + cmpwi cr1, r10, 172 /* Is dcbf? */ + beq+ cr1, 142f + cmpwi cr1, r10, 1964 /* Is icbi? */ + beq+ cr1, 142f 141: mfspr r10,SPRN_M_TW b DARFixed /* Nope, go back to normal TLB processing */ @@ -676,8 +676,9 @@ FixupDAR:/* Entry point for dcbx workaround. */ add r10, r10, r30 ;b 151f add r10, r10, r31 151: - rlwinm. r11,r11,19,24,28 /* offset into jump table for reg RA */ - beq 152f /* if reg RA is zero, don't add it */ + rlwinm r11,r11,19,24,28 /* offset into jump table for reg RA */ + cmpwi cr1, r11, 0 + beq cr1, 152f /* if reg RA is zero, don't add it */ addi r11, r11, 150b@l /* add start of table */ mtctr r11 /* load ctr with jump address */ rlwinm r11,r11,0,16,10 /* make sure we don't execute this more than once */ From c9c84fd945bbaadbf69d953be308e6f35737fef6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 21 Dec 2019 08:32:26 +0000 Subject: [PATCH 091/131] powerpc/32: add a macro to get and/or save DAR and DSISR on stack. Refactor reading and saving of DAR and DSISR in exception vectors. This will ease the implementation of VMAP stack. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1286b3e51b07727c6b4b05f2df9af3f9b1717fb5.1576916812.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/head_32.S | 5 +---- arch/powerpc/kernel/head_32.h | 11 +++++++++++ arch/powerpc/kernel/head_8xx.S | 23 +++++++---------------- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index bebb49d877f2..449625b4ff03 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -339,10 +339,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) DO_KVM 0x600 Alignment: EXCEPTION_PROLOG - mfspr r4,SPRN_DAR - stw r4,_DAR(r11) - mfspr r5,SPRN_DSISR - stw r5,_DSISR(r11) + save_dar_dsisr_on_stack r4, r5, r11 addi r3,r1,STACK_FRAME_OVERHEAD EXC_XFER_STD(0x600, alignment_exception) diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 436ffd862d2a..f19a1ab91fb5 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -144,6 +144,17 @@ RFI /* jump to handler, enable MMU */ .endm +.macro save_dar_dsisr_on_stack reg1, reg2, sp + mfspr \reg1, SPRN_DAR + mfspr \reg2, SPRN_DSISR + stw \reg1, _DAR(\sp) + stw \reg2, _DSISR(\sp) +.endm + +.macro get_and_save_dar_dsisr_on_stack reg1, reg2, sp + save_dar_dsisr_on_stack \reg1, \reg2, \sp +.endm + /* * Note: code which follows this uses cr0.eq (set if from kernel), * r11, r12 (SRR0), and r9 (SRR1). diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 175c3cfc8014..25e19af49705 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -128,12 +128,9 @@ instruction_counter: . = 0x200 MachineCheck: EXCEPTION_PROLOG - mfspr r4,SPRN_DAR - stw r4,_DAR(r11) - li r5,RPN_PATTERN - mtspr SPRN_DAR,r5 /* Tag DAR, to be used in DTLB Error */ - mfspr r5,SPRN_DSISR - stw r5,_DSISR(r11) + save_dar_dsisr_on_stack r4, r5, r11 + li r6, RPN_PATTERN + mtspr SPRN_DAR, r6 /* Tag DAR, to be used in DTLB Error */ addi r3,r1,STACK_FRAME_OVERHEAD EXC_XFER_STD(0x200, machine_check_exception) @@ -156,12 +153,9 @@ InstructionAccess: . = 0x600 Alignment: EXCEPTION_PROLOG - mfspr r4,SPRN_DAR - stw r4,_DAR(r11) - li r5,RPN_PATTERN - mtspr SPRN_DAR,r5 /* Tag DAR, to be used in DTLB Error */ - mfspr r5,SPRN_DSISR - stw r5,_DSISR(r11) + save_dar_dsisr_on_stack r4, r5, r11 + li r6, RPN_PATTERN + mtspr SPRN_DAR, r6 /* Tag DAR, to be used in DTLB Error */ addi r3,r1,STACK_FRAME_OVERHEAD EXC_XFER_STD(0x600, alignment_exception) @@ -502,10 +496,7 @@ DataTLBError: DARFixed:/* Return from dcbx instruction bug workaround */ EXCEPTION_PROLOG_1 EXCEPTION_PROLOG_2 - mfspr r5,SPRN_DSISR - stw r5,_DSISR(r11) - mfspr r4,SPRN_DAR - stw r4, _DAR(r11) + get_and_save_dar_dsisr_on_stack r4, r5, r11 andis. r10,r5,DSISR_NOHPTE@h beq+ .Ldtlbie tlbie r4 From 028474876f472c3b6eee633aed528a1206609657 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 21 Dec 2019 08:32:27 +0000 Subject: [PATCH 092/131] powerpc/32: prepare for CONFIG_VMAP_STACK To support CONFIG_VMAP_STACK, the kernel has to activate Data MMU Translation for accessing the stack. Before doing that it must save SRR0, SRR1 and also DAR and DSISR when relevant, in order to not loose them in case there is a Data TLB Miss once the translation is reactivated. This patch adds fields in thread struct for saving those registers. It prepares entry_32.S to handle exception entry with Data MMU Translation enabled and alters EXCEPTION_PROLOG macros to save SRR0, SRR1, DAR and DSISR then reenables Data MMU. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a775a1fea60f190e0f63503463fb775310a2009b.1576916812.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/processor.h | 6 ++ arch/powerpc/include/asm/thread_info.h | 5 + arch/powerpc/kernel/asm-offsets.c | 6 ++ arch/powerpc/kernel/entry_32.S | 4 +- arch/powerpc/kernel/head_32.h | 128 ++++++++++++++++++++++--- 5 files changed, 133 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index e114eb36721c..8387698bd5b6 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -162,6 +162,12 @@ struct thread_struct { #endif #if defined(CONFIG_PPC_BOOK3S_32) && defined(CONFIG_PPC_KUAP) unsigned long kuap; /* opened segments for user access */ +#endif +#ifdef CONFIG_VMAP_STACK + unsigned long srr0; + unsigned long srr1; + unsigned long dar; + unsigned long dsisr; #endif /* Debug Registers */ struct debug_reg debug; diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 8e1d0195ac36..488d5c4670ff 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -10,10 +10,15 @@ #define _ASM_POWERPC_THREAD_INFO_H #include +#include #ifdef __KERNEL__ +#if defined(CONFIG_VMAP_STACK) && CONFIG_THREAD_SHIFT < PAGE_SHIFT +#define THREAD_SHIFT PAGE_SHIFT +#else #define THREAD_SHIFT CONFIG_THREAD_SHIFT +#endif #define THREAD_SIZE (1 << THREAD_SHIFT) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 90e53d432f2e..c25e562f1cd9 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -127,6 +127,12 @@ int main(void) OFFSET(KSP_VSID, thread_struct, ksp_vsid); #else /* CONFIG_PPC64 */ OFFSET(PGDIR, thread_struct, pgdir); +#ifdef CONFIG_VMAP_STACK + OFFSET(SRR0, thread_struct, srr0); + OFFSET(SRR1, thread_struct, srr1); + OFFSET(DAR, thread_struct, dar); + OFFSET(DSISR, thread_struct, dsisr); +#endif #ifdef CONFIG_SPE OFFSET(THREAD_EVR0, thread_struct, evr[0]); OFFSET(THREAD_ACC, thread_struct, acc); diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 317ad9df8ba8..8f6617cf2689 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -140,6 +140,7 @@ transfer_to_handler: stw r12,_CTR(r11) stw r2,_XER(r11) mfspr r12,SPRN_SPRG_THREAD + tovirt_vmstack r12, r12 beq 2f /* if from user, fix up THREAD.regs */ addi r2, r12, -THREAD addi r11,r1,STACK_FRAME_OVERHEAD @@ -195,7 +196,8 @@ transfer_to_handler: transfer_to_handler_cont: 3: mflr r9 - tovirt(r2, r2) /* set r2 to current */ + tovirt_novmstack r2, r2 /* set r2 to current */ + tovirt_vmstack r9, r9 lwz r11,0(r9) /* virtual address of handler */ lwz r9,4(r9) /* where to go when done */ #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index f19a1ab91fb5..e36987aede86 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -10,31 +10,54 @@ * We assume sprg3 has the physical address of the current * task's thread_struct. */ -.macro EXCEPTION_PROLOG - EXCEPTION_PROLOG_0 +.macro EXCEPTION_PROLOG handle_dar_dsisr=0 + EXCEPTION_PROLOG_0 handle_dar_dsisr=\handle_dar_dsisr EXCEPTION_PROLOG_1 - EXCEPTION_PROLOG_2 + EXCEPTION_PROLOG_2 handle_dar_dsisr=\handle_dar_dsisr .endm -.macro EXCEPTION_PROLOG_0 +.macro EXCEPTION_PROLOG_0 handle_dar_dsisr=0 mtspr SPRN_SPRG_SCRATCH0,r10 mtspr SPRN_SPRG_SCRATCH1,r11 +#ifdef CONFIG_VMAP_STACK + mfspr r10, SPRN_SPRG_THREAD + .if \handle_dar_dsisr + mfspr r11, SPRN_DAR + stw r11, DAR(r10) + mfspr r11, SPRN_DSISR + stw r11, DSISR(r10) + .endif + mfspr r11, SPRN_SRR0 + stw r11, SRR0(r10) +#endif mfspr r11, SPRN_SRR1 /* check whether user or kernel */ +#ifdef CONFIG_VMAP_STACK + stw r11, SRR1(r10) +#endif mfcr r10 andi. r11, r11, MSR_PR .endm .macro EXCEPTION_PROLOG_1 +#ifdef CONFIG_VMAP_STACK + li r11, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */ + mtmsr r11 + isync + subi r11, r1, INT_FRAME_SIZE /* use r1 if kernel */ +#else tophys(r11,r1) /* use tophys(r1) if kernel */ + subi r11, r11, INT_FRAME_SIZE /* alloc exc. frame */ +#endif beq 1f mfspr r11,SPRN_SPRG_THREAD + tovirt_vmstack r11, r11 lwz r11,TASK_STACK-THREAD(r11) - addi r11,r11,THREAD_SIZE - tophys(r11,r11) -1: subi r11,r11,INT_FRAME_SIZE /* alloc exc. frame */ + addi r11, r11, THREAD_SIZE - INT_FRAME_SIZE + tophys_novmstack r11, r11 +1: .endm -.macro EXCEPTION_PROLOG_2 +.macro EXCEPTION_PROLOG_2 handle_dar_dsisr=0 stw r10,_CCR(r11) /* save registers */ stw r12,GPR12(r11) stw r9,GPR9(r11) @@ -44,15 +67,32 @@ stw r12,GPR11(r11) mflr r10 stw r10,_LINK(r11) +#ifdef CONFIG_VMAP_STACK + mfspr r12, SPRN_SPRG_THREAD + tovirt(r12, r12) + .if \handle_dar_dsisr + lwz r10, DAR(r12) + stw r10, _DAR(r11) + lwz r10, DSISR(r12) + stw r10, _DSISR(r11) + .endif + lwz r9, SRR1(r12) + lwz r12, SRR0(r12) +#else mfspr r12,SPRN_SRR0 mfspr r9,SPRN_SRR1 +#endif stw r1,GPR1(r11) stw r1,0(r11) - tovirt(r1,r11) /* set new kernel sp */ + tovirt_novmstack r1, r11 /* set new kernel sp */ #ifdef CONFIG_40x rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ +#else +#ifdef CONFIG_VMAP_STACK + li r10, MSR_KERNEL & ~MSR_IR /* can take exceptions */ #else li r10,MSR_KERNEL & ~(MSR_IR|MSR_DR) /* can take exceptions */ +#endif mtmsr r10 /* (except for mach check in rtas) */ #endif stw r0,GPR0(r11) @@ -65,24 +105,45 @@ .macro SYSCALL_ENTRY trapno mfspr r12,SPRN_SPRG_THREAD +#ifdef CONFIG_VMAP_STACK + mfspr r9, SPRN_SRR0 + mfspr r11, SPRN_SRR1 + stw r9, SRR0(r12) + stw r11, SRR1(r12) +#endif mfcr r10 lwz r11,TASK_STACK-THREAD(r12) - mflr r9 - addi r11,r11,THREAD_SIZE - INT_FRAME_SIZE rlwinm r10,r10,0,4,2 /* Clear SO bit in CR */ - tophys(r11,r11) + addi r11, r11, THREAD_SIZE - INT_FRAME_SIZE +#ifdef CONFIG_VMAP_STACK + li r9, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */ + mtmsr r9 + isync +#endif + tovirt_vmstack r12, r12 + tophys_novmstack r11, r11 + mflr r9 stw r10,_CCR(r11) /* save registers */ + stw r9, _LINK(r11) +#ifdef CONFIG_VMAP_STACK + lwz r10, SRR0(r12) + lwz r9, SRR1(r12) +#else mfspr r10,SPRN_SRR0 - stw r9,_LINK(r11) mfspr r9,SPRN_SRR1 +#endif stw r1,GPR1(r11) stw r1,0(r11) - tovirt(r1,r11) /* set new kernel sp */ + tovirt_novmstack r1, r11 /* set new kernel sp */ stw r10,_NIP(r11) #ifdef CONFIG_40x rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ +#else +#ifdef CONFIG_VMAP_STACK + LOAD_REG_IMMEDIATE(r10, MSR_KERNEL & ~MSR_IR) /* can take exceptions */ #else LOAD_REG_IMMEDIATE(r10, MSR_KERNEL & ~(MSR_IR|MSR_DR)) /* can take exceptions */ +#endif mtmsr r10 /* (except for mach check in rtas) */ #endif lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ @@ -121,7 +182,7 @@ #endif 3: - tovirt(r2, r2) /* set r2 to current */ + tovirt_novmstack r2, r2 /* set r2 to current */ lis r11, transfer_to_syscall@h ori r11, r11, transfer_to_syscall@l #ifdef CONFIG_TRACE_IRQFLAGS @@ -145,14 +206,51 @@ .endm .macro save_dar_dsisr_on_stack reg1, reg2, sp +#ifndef CONFIG_VMAP_STACK mfspr \reg1, SPRN_DAR mfspr \reg2, SPRN_DSISR stw \reg1, _DAR(\sp) stw \reg2, _DSISR(\sp) +#endif .endm .macro get_and_save_dar_dsisr_on_stack reg1, reg2, sp +#ifdef CONFIG_VMAP_STACK + lwz \reg1, _DAR(\sp) + lwz \reg2, _DSISR(\sp) +#else save_dar_dsisr_on_stack \reg1, \reg2, \sp +#endif +.endm + +.macro tovirt_vmstack dst, src +#ifdef CONFIG_VMAP_STACK + tovirt(\dst, \src) +#else + .ifnc \dst, \src + mr \dst, \src + .endif +#endif +.endm + +.macro tovirt_novmstack dst, src +#ifndef CONFIG_VMAP_STACK + tovirt(\dst, \src) +#else + .ifnc \dst, \src + mr \dst, \src + .endif +#endif +.endm + +.macro tophys_novmstack dst, src +#ifndef CONFIG_VMAP_STACK + tophys(\dst, \src) +#else + .ifnc \dst, \src + mr \dst, \src + .endif +#endif .endm /* From 63289e7d3a645a4291ec43c1d526dd4a811da1a0 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 21 Dec 2019 08:32:28 +0000 Subject: [PATCH 093/131] powerpc: align stack to 2 * THREAD_SIZE with VMAP_STACK In order to ease stack overflow detection, align stack to 2 * THREAD_SIZE when using VMAP_STACK. This allows overflow detection using a single bit check. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/60e9ae86b7d2cdcf21468787076d345663648f46.1576916812.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/thread_info.h | 13 +++++++++++++ arch/powerpc/kernel/setup_32.c | 2 +- arch/powerpc/kernel/setup_64.c | 2 +- arch/powerpc/kernel/vmlinux.lds.S | 2 +- 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 488d5c4670ff..a2270749b282 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -22,6 +22,19 @@ #define THREAD_SIZE (1 << THREAD_SHIFT) +/* + * By aligning VMAP'd stacks to 2 * THREAD_SIZE, we can detect overflow by + * checking sp & (1 << THREAD_SHIFT), which we can do cheaply in the entry + * assembly. + */ +#ifdef CONFIG_VMAP_STACK +#define THREAD_ALIGN_SHIFT (THREAD_SHIFT + 1) +#else +#define THREAD_ALIGN_SHIFT THREAD_SHIFT +#endif + +#define THREAD_ALIGN (1 << THREAD_ALIGN_SHIFT) + #ifndef __ASSEMBLY__ #include #include diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index dcffe927f5b9..f014c4f7a337 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -140,7 +140,7 @@ arch_initcall(ppc_init); static void *__init alloc_stack(void) { - void *ptr = memblock_alloc(THREAD_SIZE, THREAD_SIZE); + void *ptr = memblock_alloc(THREAD_SIZE, THREAD_ALIGN); if (!ptr) panic("cannot allocate %d bytes for stack at %pS\n", diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 6104917a282d..e05e6dd67ae6 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -633,7 +633,7 @@ static void *__init alloc_stack(unsigned long limit, int cpu) BUILD_BUG_ON(STACK_INT_FRAME_SIZE % 16); - ptr = memblock_alloc_try_nid(THREAD_SIZE, THREAD_SIZE, + ptr = memblock_alloc_try_nid(THREAD_SIZE, THREAD_ALIGN, MEMBLOCK_LOW_LIMIT, limit, early_cpu_to_node(cpu)); if (!ptr) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 8834220036a5..b4c89a1acebb 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -323,7 +323,7 @@ SECTIONS #endif /* The initial task and kernel stack */ - INIT_TASK_DATA_SECTION(THREAD_SIZE) + INIT_TASK_DATA_SECTION(THREAD_ALIGN) .data..page_aligned : AT(ADDR(.data..page_aligned) - LOAD_OFFSET) { PAGE_ALIGNED_DATA(PAGE_SIZE) From 3978eb78517c1fc06dd75405d732af45c80f5bd2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 21 Dec 2019 08:32:29 +0000 Subject: [PATCH 094/131] powerpc/32: Add early stack overflow detection with VMAP stack. To avoid recursive faults, stack overflow detection has to be performed before writing in the stack in exception prologs. Do it by checking the alignment. If the stack pointer alignment is wrong, it means it is pointing to the following or preceding page. Without VMAP stack, a stack overflow is catastrophic. With VMAP stack, a stack overflow isn't destructive, so don't panic. Kill the task with SIGSEGV instead. A dedicated overflow stack is set up for each CPU. lkdtm: Performing direct entry EXHAUST_STACK lkdtm: Calling function with 512 frame size to depth 32 ... lkdtm: loop 32/32 ... lkdtm: loop 31/32 ... lkdtm: loop 30/32 ... lkdtm: loop 29/32 ... lkdtm: loop 28/32 ... lkdtm: loop 27/32 ... lkdtm: loop 26/32 ... lkdtm: loop 25/32 ... lkdtm: loop 24/32 ... lkdtm: loop 23/32 ... lkdtm: loop 22/32 ... lkdtm: loop 21/32 ... lkdtm: loop 20/32 ... Kernel stack overflow in process test[359], r1=c900c008 Oops: Kernel stack overflow, sig: 6 [#1] BE PAGE_SIZE=4K MMU=Hash PowerMac Modules linked in: CPU: 0 PID: 359 Comm: test Not tainted 5.3.0-rc7+ #2225 NIP: c0622060 LR: c0626710 CTR: 00000000 REGS: c0895f48 TRAP: 0000 Not tainted (5.3.0-rc7+) MSR: 00001032 CR: 28004224 XER: 00000000 GPR00: c0626ca4 c900c008 c783c000 c07335cc c900c010 c07335cc c900c0f0 c07335cc GPR08: c900c0f0 00000001 00000000 00000000 28008222 00000000 00000000 00000000 GPR16: 00000000 00000000 10010128 10010000 b799c245 10010158 c07335cc 00000025 GPR24: c0690000 c08b91d4 c068f688 00000020 c900c0f0 c068f668 c08b95b4 c08b91d4 NIP [c0622060] format_decode+0x0/0x4d4 LR [c0626710] vsnprintf+0x80/0x5fc Call Trace: [c900c068] [c0626ca4] vscnprintf+0x18/0x48 [c900c078] [c007b944] vprintk_store+0x40/0x214 [c900c0b8] [c007bf50] vprintk_emit+0x90/0x1dc [c900c0e8] [c007c5cc] printk+0x50/0x60 [c900c128] [c03da5b0] recursive_loop+0x44/0x6c [c900c338] [c03da5c4] recursive_loop+0x58/0x6c [c900c548] [c03da5c4] recursive_loop+0x58/0x6c [c900c758] [c03da5c4] recursive_loop+0x58/0x6c [c900c968] [c03da5c4] recursive_loop+0x58/0x6c [c900cb78] [c03da5c4] recursive_loop+0x58/0x6c [c900cd88] [c03da5c4] recursive_loop+0x58/0x6c [c900cf98] [c03da5c4] recursive_loop+0x58/0x6c [c900d1a8] [c03da5c4] recursive_loop+0x58/0x6c [c900d3b8] [c03da5c4] recursive_loop+0x58/0x6c [c900d5c8] [c03da5c4] recursive_loop+0x58/0x6c [c900d7d8] [c03da5c4] recursive_loop+0x58/0x6c [c900d9e8] [c03da5c4] recursive_loop+0x58/0x6c [c900dbf8] [c03da5c4] recursive_loop+0x58/0x6c [c900de08] [c03da67c] lkdtm_EXHAUST_STACK+0x30/0x4c [c900de18] [c03da3e8] direct_entry+0xc8/0x140 [c900de48] [c029fb40] full_proxy_write+0x64/0xcc [c900de68] [c01500f8] __vfs_write+0x30/0x1d0 [c900dee8] [c0152cb8] vfs_write+0xb8/0x1d4 [c900df08] [c0152f7c] ksys_write+0x58/0xe8 [c900df38] [c0014208] ret_from_syscall+0x0/0x34 --- interrupt: c01 at 0xf806664 LR = 0x1000c868 Instruction dump: 4bffff91 80010014 7c832378 7c0803a6 38210010 4e800020 3d20c08a 3ca0c089 8089a0cc 38a58f0c 38600001 4ba2d494 <9421ffe0> 7c0802a6 bfc10018 7c9f2378 Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1b89c121b4070c7ee99e4f22cc178f15a736b07b.1576916812.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/entry_32.S | 4 ++++ arch/powerpc/kernel/head_32.h | 28 ++++++++++++++++++++++++++++ arch/powerpc/kernel/setup.h | 2 +- arch/powerpc/kernel/setup_32.c | 12 ++++++++++++ arch/powerpc/kernel/traps.c | 9 +++++++++ 5 files changed, 54 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 8f6617cf2689..7e5a1722e4f2 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -182,9 +182,11 @@ transfer_to_handler: */ kuap_save_and_lock r11, r12, r9, r2, r0 addi r2, r12, -THREAD +#ifndef CONFIG_VMAP_STACK lwz r9,KSP_LIMIT(r12) cmplw r1,r9 /* if r1 <= ksp_limit */ ble- stack_ovf /* then the kernel stack overflowed */ +#endif 5: #if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) lwz r12,TI_LOCAL_FLAGS(r2) @@ -289,6 +291,7 @@ reenable_mmu: b fast_exception_return #endif +#ifndef CONFIG_VMAP_STACK /* * On kernel stack overflow, load up an initial stack pointer * and call StackOverflow(regs), which should not return. @@ -314,6 +317,7 @@ stack_ovf: mtspr SPRN_SRR1,r10 SYNC RFI +#endif #ifdef CONFIG_TRACE_IRQFLAGS trace_syscall_entry_irq_off: diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index e36987aede86..c39209d56020 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -55,6 +55,10 @@ addi r11, r11, THREAD_SIZE - INT_FRAME_SIZE tophys_novmstack r11, r11 1: +#ifdef CONFIG_VMAP_STACK + mtcrf 0x7f, r11 + bt 32 - THREAD_ALIGN_SHIFT, stack_overflow +#endif .endm .macro EXCEPTION_PROLOG_2 handle_dar_dsisr=0 @@ -299,4 +303,28 @@ label: EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, transfer_to_handler, \ ret_from_except) +.macro vmap_stack_overflow_exception +#ifdef CONFIG_VMAP_STACK +#ifdef CONFIG_SMP + mfspr r11, SPRN_SPRG_THREAD + tovirt(r11, r11) + lwz r11, TASK_CPU - THREAD(r11) + slwi r11, r11, 3 + addis r11, r11, emergency_ctx@ha +#else + lis r11, emergency_ctx@ha +#endif + lwz r11, emergency_ctx@l(r11) + cmpwi cr1, r11, 0 + bne cr1, 1f + lis r11, init_thread_union@ha + addi r11, r11, init_thread_union@l +1: addi r11, r11, THREAD_SIZE - INT_FRAME_SIZE + EXCEPTION_PROLOG_2 + SAVE_NVGPRS(r11) + addi r3, r1, STACK_FRAME_OVERHEAD + EXC_XFER_STD(0, stack_overflow_exception) +#endif +.endm + #endif /* __HEAD_32_H__ */ diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h index c82577c4b15d..2dd0d9cb5a20 100644 --- a/arch/powerpc/kernel/setup.h +++ b/arch/powerpc/kernel/setup.h @@ -35,7 +35,7 @@ void exc_lvl_early_init(void); static inline void exc_lvl_early_init(void) { }; #endif -#ifdef CONFIG_PPC64 +#if defined(CONFIG_PPC64) || defined(CONFIG_VMAP_STACK) void emergency_stack_init(void); #else static inline void emergency_stack_init(void) { }; diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index f014c4f7a337..a55b4d9ab824 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -161,6 +161,18 @@ void __init irqstack_early_init(void) } } +#ifdef CONFIG_VMAP_STACK +void *emergency_ctx[NR_CPUS] __ro_after_init; + +void __init emergency_stack_init(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + emergency_ctx[i] = alloc_stack(); +} +#endif + #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) void __init exc_lvl_early_init(void) { diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 014ff0701f24..82a3438300fd 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1637,6 +1637,15 @@ void StackOverflow(struct pt_regs *regs) panic("kernel stack overflow"); } +void stack_overflow_exception(struct pt_regs *regs) +{ + enum ctx_state prev_state = exception_enter(); + + die("Kernel stack overflow", regs, SIGSEGV); + + exception_exit(prev_state); +} + void kernel_fp_unavailable_exception(struct pt_regs *regs) { enum ctx_state prev_state = exception_enter(); From 547db12fd8a0ce753c6bafd130827f1755c93fe6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 21 Dec 2019 08:32:30 +0000 Subject: [PATCH 095/131] powerpc/32: Use vmapped stacks for interrupts In order to also catch overflows on IRQ stacks, use vmapped stacks. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/d33ad1b36ddff4dcc19f96c592c12a61cf85d406.1576916812.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/irq.c | 22 ++++++++++++++++++++++ arch/powerpc/kernel/setup_32.c | 3 +++ 2 files changed, 25 insertions(+) diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index add67498c126..5c9b11878555 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -664,8 +665,29 @@ void do_IRQ(struct pt_regs *regs) set_irq_regs(old_regs); } +static void *__init alloc_vm_stack(void) +{ + return __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, VMALLOC_START, + VMALLOC_END, THREADINFO_GFP, PAGE_KERNEL, + 0, NUMA_NO_NODE, (void*)_RET_IP_); +} + +static void __init vmap_irqstack_init(void) +{ + int i; + + for_each_possible_cpu(i) { + softirq_ctx[i] = alloc_vm_stack(); + hardirq_ctx[i] = alloc_vm_stack(); + } +} + + void __init init_IRQ(void) { + if (IS_ENABLED(CONFIG_VMAP_STACK)) + vmap_irqstack_init(); + if (ppc_md.init_IRQ) ppc_md.init_IRQ(); } diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index a55b4d9ab824..5b49b26eb154 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -153,6 +153,9 @@ void __init irqstack_early_init(void) { unsigned int i; + if (IS_ENABLED(CONFIG_VMAP_STACK)) + return; + /* interrupt stacks must be in lowmem, we get that for free on ppc32 * as the memblock is limited to lowmem by default */ for_each_possible_cpu(i) { From 6edc318585410b9cae5a7e4d156f09c8e5e3a700 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 21 Dec 2019 08:32:31 +0000 Subject: [PATCH 096/131] powerpc/8xx: Use alternative scratch registers in DTLB miss handler In preparation of handling CONFIG_VMAP_STACK, DTLB miss handler need to use different scratch registers than other exception handlers in order to not jeopardise exception entry on stack DTLB misses. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c5287ea59ae9630f505019b309bf94029241635f.1576916812.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/head_8xx.S | 27 ++++++++++++++------------- arch/powerpc/perf/8xx-pmu.c | 12 ++++++++---- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 25e19af49705..3de9c5f1746c 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -193,8 +193,9 @@ SystemCall: 0: lwz r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) addi r10, r10, 1 stw r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) - mfspr r10, SPRN_SPRG_SCRATCH0 - mfspr r11, SPRN_SPRG_SCRATCH1 + mfspr r10, SPRN_DAR + mtspr SPRN_DAR, r11 /* Tag DAR */ + mfspr r11, SPRN_M_TW rfi #endif @@ -337,8 +338,8 @@ ITLBMissLinear: . = 0x1200 DataStoreTLBMiss: - mtspr SPRN_SPRG_SCRATCH0, r10 - mtspr SPRN_SPRG_SCRATCH1, r11 + mtspr SPRN_DAR, r10 + mtspr SPRN_M_TW, r11 mfcr r11 /* If we are faulting a kernel address, we have to use the @@ -403,10 +404,10 @@ DataStoreTLBMiss: mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ /* Restore registers */ - mtspr SPRN_DAR, r11 /* Tag DAR */ -0: mfspr r10, SPRN_SPRG_SCRATCH0 - mfspr r11, SPRN_SPRG_SCRATCH1 +0: mfspr r10, SPRN_DAR + mtspr SPRN_DAR, r11 /* Tag DAR */ + mfspr r11, SPRN_M_TW rfi patch_site 0b, patch__dtlbmiss_exit_1 @@ -422,10 +423,10 @@ DTLBMissIMMR: mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ li r11, RPN_PATTERN - mtspr SPRN_DAR, r11 /* Tag DAR */ -0: mfspr r10, SPRN_SPRG_SCRATCH0 - mfspr r11, SPRN_SPRG_SCRATCH1 +0: mfspr r10, SPRN_DAR + mtspr SPRN_DAR, r11 /* Tag DAR */ + mfspr r11, SPRN_M_TW rfi patch_site 0b, patch__dtlbmiss_exit_2 @@ -459,10 +460,10 @@ DTLBMissLinear: mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ li r11, RPN_PATTERN - mtspr SPRN_DAR, r11 /* Tag DAR */ -0: mfspr r10, SPRN_SPRG_SCRATCH0 - mfspr r11, SPRN_SPRG_SCRATCH1 +0: mfspr r10, SPRN_DAR + mtspr SPRN_DAR, r11 /* Tag DAR */ + mfspr r11, SPRN_M_TW rfi patch_site 0b, patch__dtlbmiss_exit_3 diff --git a/arch/powerpc/perf/8xx-pmu.c b/arch/powerpc/perf/8xx-pmu.c index 19124b0b171a..1ad03c55c88c 100644 --- a/arch/powerpc/perf/8xx-pmu.c +++ b/arch/powerpc/perf/8xx-pmu.c @@ -157,10 +157,6 @@ static void mpc8xx_pmu_read(struct perf_event *event) static void mpc8xx_pmu_del(struct perf_event *event, int flags) { - /* mfspr r10, SPRN_SPRG_SCRATCH0 */ - unsigned int insn = PPC_INST_MFSPR | __PPC_RS(R10) | - __PPC_SPR(SPRN_SPRG_SCRATCH0); - mpc8xx_pmu_read(event); /* If it was the last user, stop counting to avoid useles overhead */ @@ -173,6 +169,10 @@ static void mpc8xx_pmu_del(struct perf_event *event, int flags) break; case PERF_8xx_ID_ITLB_LOAD_MISS: if (atomic_dec_return(&itlb_miss_ref) == 0) { + /* mfspr r10, SPRN_SPRG_SCRATCH0 */ + unsigned int insn = PPC_INST_MFSPR | __PPC_RS(R10) | + __PPC_SPR(SPRN_SPRG_SCRATCH0); + patch_instruction_site(&patch__itlbmiss_exit_1, insn); #ifndef CONFIG_PIN_TLB_TEXT patch_instruction_site(&patch__itlbmiss_exit_2, insn); @@ -181,6 +181,10 @@ static void mpc8xx_pmu_del(struct perf_event *event, int flags) break; case PERF_8xx_ID_DTLB_LOAD_MISS: if (atomic_dec_return(&dtlb_miss_ref) == 0) { + /* mfspr r10, SPRN_DAR */ + unsigned int insn = PPC_INST_MFSPR | __PPC_RS(R10) | + __PPC_SPR(SPRN_DAR); + patch_instruction_site(&patch__dtlbmiss_exit_1, insn); patch_instruction_site(&patch__dtlbmiss_exit_2, insn); patch_instruction_site(&patch__dtlbmiss_exit_3, insn); From 9260f76ae2533787e92be58d5e509e81b72f5f9c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 21 Dec 2019 08:32:32 +0000 Subject: [PATCH 097/131] powerpc/8xx: Drop exception entries for non-existing exceptions head_8xx.S has entries for all exceptions from 0x100 to 0x1f00. Several of them do not exist and are never generated by the 8xx in accordance with the documentation. Remove those entry points to make some room for future growing exception code. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/66f92866fe9524cf0f056016921c7d53adaef3a0.1576916812.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/head_8xx.S | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 3de9c5f1746c..5aa63693f790 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -134,18 +134,6 @@ MachineCheck: addi r3,r1,STACK_FRAME_OVERHEAD EXC_XFER_STD(0x200, machine_check_exception) -/* Data access exception. - * This is "never generated" by the MPC8xx. - */ - . = 0x300 -DataAccess: - -/* Instruction access exception. - * This is "never generated" by the MPC8xx. - */ - . = 0x400 -InstructionAccess: - /* External interrupt */ EXCEPTION(0x500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE) @@ -162,16 +150,9 @@ Alignment: /* Program check exception */ EXCEPTION(0x700, ProgramCheck, program_check_exception, EXC_XFER_STD) -/* No FPU on MPC8xx. This exception is not supposed to happen. -*/ - EXCEPTION(0x800, FPUnavailable, unknown_exception, EXC_XFER_STD) - /* Decrementer */ EXCEPTION(0x900, Decrementer, timer_interrupt, EXC_XFER_LITE) - EXCEPTION(0xa00, Trap_0a, unknown_exception, EXC_XFER_STD) - EXCEPTION(0xb00, Trap_0b, unknown_exception, EXC_XFER_STD) - /* System call */ . = 0xc00 SystemCall: @@ -179,8 +160,6 @@ SystemCall: /* Single step - not used on 601 */ EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) - EXCEPTION(0xe00, Trap_0e, unknown_exception, EXC_XFER_STD) - EXCEPTION(0xf00, Trap_0f, unknown_exception, EXC_XFER_STD) /* On the MPC8xx, this is a software emulation interrupt. It occurs * for all unimplemented and illegal instructions. @@ -507,14 +486,6 @@ DARFixed:/* Return from dcbx instruction bug workaround */ /* 0x300 is DataAccess exception, needed by bad_page_fault() */ EXC_XFER_LITE(0x300, handle_page_fault) - EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1600, Trap_16, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1700, Trap_17, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1a00, Trap_1a, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1b00, Trap_1b, unknown_exception, EXC_XFER_STD) - /* On the MPC8xx, these next four traps are used for development * support of breakpoints and such. Someday I will get around to * using them. From 596419afed72656a4529d3d67475e4b2b1912485 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 21 Dec 2019 08:32:33 +0000 Subject: [PATCH 098/131] powerpc/8xx: Move DataStoreTLBMiss perf handler Move DataStoreTLBMiss perf handler in order to cope with future growing exception prolog. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/75dd28b04efd2cbdbf01153173d99c11cdff2f08.1576916812.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/head_8xx.S | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 5aa63693f790..1e718e47fe3c 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -166,18 +166,6 @@ SystemCall: */ EXCEPTION(0x1000, SoftEmu, program_check_exception, EXC_XFER_STD) -/* Called from DataStoreTLBMiss when perf TLB misses events are activated */ -#ifdef CONFIG_PERF_EVENTS - patch_site 0f, patch__dtlbmiss_perf -0: lwz r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) - addi r10, r10, 1 - stw r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) - mfspr r10, SPRN_DAR - mtspr SPRN_DAR, r11 /* Tag DAR */ - mfspr r11, SPRN_M_TW - rfi -#endif - . = 0x1100 /* * For the MPC8xx, this is a software tablewalk to load the instruction @@ -486,6 +474,18 @@ DARFixed:/* Return from dcbx instruction bug workaround */ /* 0x300 is DataAccess exception, needed by bad_page_fault() */ EXC_XFER_LITE(0x300, handle_page_fault) +/* Called from DataStoreTLBMiss when perf TLB misses events are activated */ +#ifdef CONFIG_PERF_EVENTS + patch_site 0f, patch__dtlbmiss_perf +0: lwz r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) + addi r10, r10, 1 + stw r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) + mfspr r10, SPRN_DAR + mtspr SPRN_DAR, r11 /* Tag DAR */ + mfspr r11, SPRN_M_TW + rfi +#endif + /* On the MPC8xx, these next four traps are used for development * support of breakpoints and such. Someday I will get around to * using them. From afe1ec5ab83029baf0f8368a255dc6b998bde576 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 21 Dec 2019 08:32:34 +0000 Subject: [PATCH 099/131] powerpc/8xx: Split breakpoint exception Breakpoint exception is big. Split it to support future growth on exception prolog. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1dda3293d86d0f715b13b2633c95d2188a42a02c.1576916812.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/head_8xx.S | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 1e718e47fe3c..225e242ce1c5 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -490,14 +490,7 @@ DARFixed:/* Return from dcbx instruction bug workaround */ * support of breakpoints and such. Someday I will get around to * using them. */ - . = 0x1c00 -DataBreakpoint: - EXCEPTION_PROLOG_0 - mfspr r11, SPRN_SRR0 - cmplwi cr1, r11, (.Ldtlbie - PAGE_OFFSET)@l - cmplwi cr7, r11, (.Litlbie - PAGE_OFFSET)@l - beq- cr1, 11f - beq- cr7, 11f +do_databreakpoint: EXCEPTION_PROLOG_1 EXCEPTION_PROLOG_2 addi r3,r1,STACK_FRAME_OVERHEAD @@ -505,7 +498,15 @@ DataBreakpoint: stw r4,_DAR(r11) mfspr r5,SPRN_DSISR EXC_XFER_STD(0x1c00, do_break) -11: + + . = 0x1c00 +DataBreakpoint: + EXCEPTION_PROLOG_0 + mfspr r11, SPRN_SRR0 + cmplwi cr1, r11, (.Ldtlbie - PAGE_OFFSET)@l + cmplwi cr7, r11, (.Litlbie - PAGE_OFFSET)@l + cror 4*cr1+eq, 4*cr1+eq, 4*cr7+eq + bne cr1, do_databreakpoint mtcr r10 mfspr r10, SPRN_SPRG_SCRATCH0 mfspr r11, SPRN_SPRG_SCRATCH1 From d52668f6b3e052fa97e93e536c492f5880c73474 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sun, 26 Jan 2020 00:20:16 +1100 Subject: [PATCH 100/131] powerpc/8xx: Move tail of alignment exception out of line When we enable VMAP_STACK there will not be enough room for the alignment handler at 0x600 in head_8xx.S. For now move the tail of the alignment handler out of line, and branch to it. Suggested-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_8xx.S | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 225e242ce1c5..a6b1b7af2217 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -145,7 +145,7 @@ Alignment: li r6, RPN_PATTERN mtspr SPRN_DAR, r6 /* Tag DAR, to be used in DTLB Error */ addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_STD(0x600, alignment_exception) + b .Lalignment_exception_ool /* Program check exception */ EXCEPTION(0x700, ProgramCheck, program_check_exception, EXC_XFER_STD) @@ -153,6 +153,11 @@ Alignment: /* Decrementer */ EXCEPTION(0x900, Decrementer, timer_interrupt, EXC_XFER_LITE) + /* With VMAP_STACK there's not enough room for this at 0x600 */ + . = 0xa00 +.Lalignment_exception_ool: + EXC_XFER_STD(0x600, alignment_exception) + /* System call */ . = 0xc00 SystemCall: From 99b229161f8d99742cdfd6be21059ebd9eb7b696 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 21 Dec 2019 08:32:35 +0000 Subject: [PATCH 101/131] powerpc/8xx: Enable CONFIG_VMAP_STACK This patch enables CONFIG_VMAP_STACK. For that, a few changes are done in head_8xx.S. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/d7ba1e34e80898310d6a314cbebe48baa32894ef.1576916812.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/head_8xx.S | 32 +++++++++++++++++++++----- arch/powerpc/platforms/Kconfig.cputype | 1 + 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index a6b1b7af2217..9922306ae512 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -127,7 +127,7 @@ instruction_counter: /* Machine check */ . = 0x200 MachineCheck: - EXCEPTION_PROLOG + EXCEPTION_PROLOG handle_dar_dsisr=1 save_dar_dsisr_on_stack r4, r5, r11 li r6, RPN_PATTERN mtspr SPRN_DAR, r6 /* Tag DAR, to be used in DTLB Error */ @@ -140,7 +140,7 @@ MachineCheck: /* Alignment exception */ . = 0x600 Alignment: - EXCEPTION_PROLOG + EXCEPTION_PROLOG handle_dar_dsisr=1 save_dar_dsisr_on_stack r4, r5, r11 li r6, RPN_PATTERN mtspr SPRN_DAR, r6 /* Tag DAR, to be used in DTLB Error */ @@ -462,20 +462,26 @@ InstructionTLBError: */ . = 0x1400 DataTLBError: - EXCEPTION_PROLOG_0 + EXCEPTION_PROLOG_0 handle_dar_dsisr=1 mfspr r11, SPRN_DAR cmpwi cr1, r11, RPN_PATTERN beq- cr1, FixupDAR /* must be a buggy dcbX, icbi insn. */ DARFixed:/* Return from dcbx instruction bug workaround */ +#ifdef CONFIG_VMAP_STACK + li r11, RPN_PATTERN + mtspr SPRN_DAR, r11 /* Tag DAR, to be used in DTLB Error */ +#endif EXCEPTION_PROLOG_1 - EXCEPTION_PROLOG_2 + EXCEPTION_PROLOG_2 handle_dar_dsisr=1 get_and_save_dar_dsisr_on_stack r4, r5, r11 andis. r10,r5,DSISR_NOHPTE@h beq+ .Ldtlbie tlbie r4 .Ldtlbie: +#ifndef CONFIG_VMAP_STACK li r10,RPN_PATTERN mtspr SPRN_DAR,r10 /* Tag DAR, to be used in DTLB Error */ +#endif /* 0x300 is DataAccess exception, needed by bad_page_fault() */ EXC_XFER_LITE(0x300, handle_page_fault) @@ -491,22 +497,29 @@ DARFixed:/* Return from dcbx instruction bug workaround */ rfi #endif +stack_overflow: + vmap_stack_overflow_exception + /* On the MPC8xx, these next four traps are used for development * support of breakpoints and such. Someday I will get around to * using them. */ do_databreakpoint: EXCEPTION_PROLOG_1 - EXCEPTION_PROLOG_2 + EXCEPTION_PROLOG_2 handle_dar_dsisr=1 addi r3,r1,STACK_FRAME_OVERHEAD mfspr r4,SPRN_BAR stw r4,_DAR(r11) +#ifdef CONFIG_VMAP_STACK + lwz r5,_DSISR(r11) +#else mfspr r5,SPRN_DSISR +#endif EXC_XFER_STD(0x1c00, do_break) . = 0x1c00 DataBreakpoint: - EXCEPTION_PROLOG_0 + EXCEPTION_PROLOG_0 handle_dar_dsisr=1 mfspr r11, SPRN_SRR0 cmplwi cr1, r11, (.Ldtlbie - PAGE_OFFSET)@l cmplwi cr7, r11, (.Litlbie - PAGE_OFFSET)@l @@ -655,7 +668,14 @@ FixupDAR:/* Entry point for dcbx workaround. */ 152: mfdar r11 mtctr r11 /* restore ctr reg from DAR */ +#ifdef CONFIG_VMAP_STACK + mfspr r11, SPRN_SPRG_THREAD + stw r10, DAR(r11) + mfspr r10, SPRN_DSISR + stw r10, DSISR(r11) +#else mtdar r10 /* save fault EA to DAR */ +#endif mfspr r10,SPRN_M_TW b DARFixed /* Go back to normal TLB handling */ diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 8d7f9c3dc771..000dd297337c 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -49,6 +49,7 @@ config PPC_8xx select PPC_HAVE_KUEP select PPC_HAVE_KUAP select PPC_MM_SLICES if HUGETLB_PAGE + select HAVE_ARCH_VMAP_STACK config 40x bool "AMCC 40x" From 2e15001ea9ea110837a6eb6dba518416dedb4c39 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 21 Dec 2019 08:32:36 +0000 Subject: [PATCH 102/131] powerpc/32s: Reorganise DSI handler. The part decidated to handling hash_page() is fully unneeded for processors not having real hash pages like the 603. Lets enlarge the content of the feature fixup, and provide an alternative which jumps directly instead of getting NIPs. Also, in preparation of VMAP stacks, the end of DSI handler has moved to later in the code as it won't fit anymore once VMAP stacks are there. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c31b22c91af8b011d0a4fd9e52ad6afb4b593f71.1576916812.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/head_32.S | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 449625b4ff03..7ec780858299 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -295,24 +295,20 @@ __secondary_hold_acknowledge: DO_KVM 0x300 DataAccess: EXCEPTION_PROLOG - mfspr r10,SPRN_DSISR - stw r10,_DSISR(r11) -#ifdef CONFIG_PPC_KUAP - andis. r0,r10,(DSISR_BAD_FAULT_32S | DSISR_DABRMATCH | DSISR_PROTFAULT)@h -#else - andis. r0,r10,(DSISR_BAD_FAULT_32S|DSISR_DABRMATCH)@h -#endif - bne 1f /* if not, try to put a PTE */ - mfspr r4,SPRN_DAR /* into the hash table */ - rlwinm r3,r10,32-15,21,21 /* DSISR_STORE -> _PAGE_RW */ + get_and_save_dar_dsisr_on_stack r4, r5, r11 BEGIN_MMU_FTR_SECTION +#ifdef CONFIG_PPC_KUAP + andis. r0, r5, (DSISR_BAD_FAULT_32S | DSISR_DABRMATCH | DSISR_PROTFAULT)@h +#else + andis. r0, r5, (DSISR_BAD_FAULT_32S | DSISR_DABRMATCH)@h +#endif + bne handle_page_fault_tramp_2 /* if not, try to put a PTE */ + rlwinm r3, r5, 32 - 15, 21, 21 /* DSISR_STORE -> _PAGE_RW */ bl hash_page -END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) -1: lwz r5,_DSISR(r11) /* get DSISR value */ - mfspr r4,SPRN_DAR - stw r4, _DAR(r11) - EXC_XFER_LITE(0x300, handle_page_fault) - + b handle_page_fault_tramp_1 +FTR_SECTION_ELSE + b handle_page_fault_tramp_2 +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HPTE_TABLE) /* Instruction access exception. */ . = 0x400 @@ -642,6 +638,13 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) . = 0x3000 +handle_page_fault_tramp_1: + lwz r4, _DAR(r11) + lwz r5, _DSISR(r11) + /* fall through */ +handle_page_fault_tramp_2: + EXC_XFER_LITE(0x300, handle_page_fault) + AltiVecUnavailable: EXCEPTION_PROLOG #ifdef CONFIG_ALTIVEC From 94dd54c51a410b9ffa6356c3ed2ab0317f998ded Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 21 Dec 2019 08:32:37 +0000 Subject: [PATCH 103/131] powerpc/32s: Avoid crossing page boundary while changing SRR0/1. Trying VMAP_STACK with KVM, vmlinux was not starting. This was due to SRR0 and SRR1 clobbered by an ISI due to the rfi being in a different page than the mtsrr0/1: c0003fe0 : c0003fe0: 38 83 00 54 addi r4,r3,84 c0003fe4: 7c 60 00 a6 mfmsr r3 c0003fe8: 70 60 00 30 andi. r0,r3,48 c0003fec: 4d 82 00 20 beqlr c0003ff0: 7c 63 00 78 andc r3,r3,r0 c0003ff4: 7c 9a 03 a6 mtsrr0 r4 c0003ff8: 7c 7b 03 a6 mtsrr1 r3 c0003ffc: 7c 00 04 ac hwsync c0004000: 4c 00 00 64 rfi Align the 4 instruction block used to deactivate MMU to order 4, so that the block never crosses a page boundary. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/30d2cda111b7977227fff067fa7e358440e2b3a4.1576916812.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/head_32.S | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 7ec780858299..90ef355e958b 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -917,6 +917,8 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) ori r4,r4,2f@l tophys(r4,r4) li r3,MSR_KERNEL & ~(MSR_IR|MSR_DR) + + .align 4 mtspr SPRN_SRR0,r4 mtspr SPRN_SRR1,r3 SYNC @@ -1058,6 +1060,8 @@ _ENTRY(update_bats) rlwinm r0, r6, 0, ~MSR_RI rlwinm r0, r0, 0, ~MSR_EE mtmsr r0 + + .align 4 mtspr SPRN_SRR0, r4 mtspr SPRN_SRR1, r3 SYNC @@ -1097,6 +1101,8 @@ mmu_off: andi. r0,r3,MSR_DR|MSR_IR /* MMU enabled? */ beqlr andc r3,r3,r0 + + .align 4 mtspr SPRN_SRR0,r4 mtspr SPRN_SRR1,r3 sync From cd08f109e26231b279bcc0388428afcac6408ec6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 21 Dec 2019 08:32:38 +0000 Subject: [PATCH 104/131] powerpc/32s: Enable CONFIG_VMAP_STACK A few changes to retrieve DAR and DSISR from struct regs instead of retrieving them directly, as they may have changed due to a TLB miss. Also modifies hash_page() and friends to work with virtual data addresses instead of physical ones. Same on load_up_fpu() and load_up_altivec(). Signed-off-by: Christophe Leroy [mpe: Fix tovirt_vmstack call in head_32.S to fix CHRP build] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/2e2509a242fd5f3e23df4a06530c18060c4d321e.1576916812.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/entry_32.S | 2 +- arch/powerpc/kernel/fpu.S | 3 ++ arch/powerpc/kernel/head_32.S | 16 +++++++-- arch/powerpc/kernel/head_32.h | 4 ++- arch/powerpc/kernel/vector.S | 3 ++ arch/powerpc/mm/book3s32/hash_low.S | 46 ++++++++++++++++---------- arch/powerpc/mm/book3s32/mmu.c | 9 +++-- arch/powerpc/platforms/Kconfig.cputype | 2 ++ 8 files changed, 61 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 7e5a1722e4f2..3795654d15d1 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -1339,7 +1339,7 @@ _GLOBAL(enter_rtas) lis r6,1f@ha /* physical return address for rtas */ addi r6,r6,1f@l tophys(r6,r6) - tophys(r7,r1) + tophys_novmstack r7, r1 lwz r8,RTASENTRY(r4) lwz r4,RTASBASE(r4) mfmsr r9 diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S index 0bb991ddd264..3235a8da6af7 100644 --- a/arch/powerpc/kernel/fpu.S +++ b/arch/powerpc/kernel/fpu.S @@ -94,6 +94,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) /* enable use of FP after return */ #ifdef CONFIG_PPC32 mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ +#ifdef CONFIG_VMAP_STACK + tovirt(r5, r5) +#endif lwz r4,THREAD_FPEXC_MODE(r5) ori r9,r9,MSR_FP /* enable FP for current */ or r9,r9,r4 diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 90ef355e958b..0493fcac6409 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -272,14 +272,21 @@ __secondary_hold_acknowledge: */ . = 0x200 DO_KVM 0x200 +MachineCheck: EXCEPTION_PROLOG_0 +#ifdef CONFIG_VMAP_STACK + li r11, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */ + mtmsr r11 + isync +#endif #ifdef CONFIG_PPC_CHRP mfspr r11, SPRN_SPRG_THREAD + tovirt_vmstack r11, r11 lwz r11, RTAS_SP(r11) cmpwi cr1, r11, 0 bne cr1, 7f #endif /* CONFIG_PPC_CHRP */ - EXCEPTION_PROLOG_1 + EXCEPTION_PROLOG_1 for_rtas=1 7: EXCEPTION_PROLOG_2 addi r3,r1,STACK_FRAME_OVERHEAD #ifdef CONFIG_PPC_CHRP @@ -294,7 +301,7 @@ __secondary_hold_acknowledge: . = 0x300 DO_KVM 0x300 DataAccess: - EXCEPTION_PROLOG + EXCEPTION_PROLOG handle_dar_dsisr=1 get_and_save_dar_dsisr_on_stack r4, r5, r11 BEGIN_MMU_FTR_SECTION #ifdef CONFIG_PPC_KUAP @@ -334,7 +341,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) . = 0x600 DO_KVM 0x600 Alignment: - EXCEPTION_PROLOG + EXCEPTION_PROLOG handle_dar_dsisr=1 save_dar_dsisr_on_stack r4, r5, r11 addi r3,r1,STACK_FRAME_OVERHEAD EXC_XFER_STD(0x600, alignment_exception) @@ -645,6 +652,9 @@ handle_page_fault_tramp_1: handle_page_fault_tramp_2: EXC_XFER_LITE(0x300, handle_page_fault) +stack_overflow: + vmap_stack_overflow_exception + AltiVecUnavailable: EXCEPTION_PROLOG #ifdef CONFIG_ALTIVEC diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index c39209d56020..a6a5fbbf8504 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -38,11 +38,13 @@ andi. r11, r11, MSR_PR .endm -.macro EXCEPTION_PROLOG_1 +.macro EXCEPTION_PROLOG_1 for_rtas=0 #ifdef CONFIG_VMAP_STACK + .ifeq \for_rtas li r11, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */ mtmsr r11 isync + .endif subi r11, r1, INT_FRAME_SIZE /* use r1 if kernel */ #else tophys(r11,r1) /* use tophys(r1) if kernel */ diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S index 8eb867dbad5f..25c14a0981bf 100644 --- a/arch/powerpc/kernel/vector.S +++ b/arch/powerpc/kernel/vector.S @@ -67,6 +67,9 @@ _GLOBAL(load_up_altivec) #ifdef CONFIG_PPC32 mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ oris r9,r9,MSR_VEC@h +#ifdef CONFIG_VMAP_STACK + tovirt(r5, r5) +#endif #else ld r4,PACACURRENT(r13) addi r5,r4,THREAD /* Get THREAD */ diff --git a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S index 8bbbd9775c8a..c11b0a005196 100644 --- a/arch/powerpc/mm/book3s32/hash_low.S +++ b/arch/powerpc/mm/book3s32/hash_low.S @@ -25,6 +25,12 @@ #include #include +#ifdef CONFIG_VMAP_STACK +#define ADDR_OFFSET 0 +#else +#define ADDR_OFFSET PAGE_OFFSET +#endif + #ifdef CONFIG_SMP .section .bss .align 2 @@ -47,8 +53,8 @@ mmu_hash_lock: .text _GLOBAL(hash_page) #ifdef CONFIG_SMP - lis r8, (mmu_hash_lock - PAGE_OFFSET)@h - ori r8, r8, (mmu_hash_lock - PAGE_OFFSET)@l + lis r8, (mmu_hash_lock - ADDR_OFFSET)@h + ori r8, r8, (mmu_hash_lock - ADDR_OFFSET)@l lis r0,0x0fff b 10f 11: lwz r6,0(r8) @@ -66,9 +72,12 @@ _GLOBAL(hash_page) cmplw 0,r4,r0 ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ mfspr r5, SPRN_SPRG_PGDIR /* phys page-table root */ +#ifdef CONFIG_VMAP_STACK + tovirt(r5, r5) +#endif blt+ 112f /* assume user more likely */ - lis r5, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ - addi r5 ,r5 ,(swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ + lis r5, (swapper_pg_dir - ADDR_OFFSET)@ha /* if kernel address, use */ + addi r5 ,r5 ,(swapper_pg_dir - ADDR_OFFSET)@l /* kernel page table */ rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ 112: #ifndef CONFIG_PTE_64BIT @@ -80,6 +89,9 @@ _GLOBAL(hash_page) lwzx r8,r8,r5 /* Get L1 entry */ rlwinm. r8,r8,0,0,20 /* extract pt base address */ #endif +#ifdef CONFIG_VMAP_STACK + tovirt(r8, r8) +#endif #ifdef CONFIG_SMP beq- hash_page_out /* return if no mapping */ #else @@ -137,9 +149,9 @@ retry: #ifdef CONFIG_SMP eieio - lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha + lis r8, (mmu_hash_lock - ADDR_OFFSET)@ha li r0,0 - stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) + stw r0, (mmu_hash_lock - ADDR_OFFSET)@l(r8) #endif /* Return from the exception */ @@ -152,9 +164,9 @@ retry: #ifdef CONFIG_SMP hash_page_out: eieio - lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha + lis r8, (mmu_hash_lock - ADDR_OFFSET)@ha li r0,0 - stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) + stw r0, (mmu_hash_lock - ADDR_OFFSET)@l(r8) blr #endif /* CONFIG_SMP */ @@ -329,7 +341,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) patch_site 1f, patch__hash_page_A1 patch_site 2f, patch__hash_page_A2 /* Get the address of the primary PTE group in the hash table (r3) */ -0: lis r0, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */ +0: lis r0, (Hash_base - ADDR_OFFSET)@h /* base address of hash table */ 1: rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ 2: rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ xor r3,r3,r0 /* make primary hash */ @@ -343,10 +355,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) beq+ 10f /* no PTE: go look for an empty slot */ tlbie r4 - lis r4, (htab_hash_searches - PAGE_OFFSET)@ha - lwz r6, (htab_hash_searches - PAGE_OFFSET)@l(r4) + lis r4, (htab_hash_searches - ADDR_OFFSET)@ha + lwz r6, (htab_hash_searches - ADDR_OFFSET)@l(r4) addi r6,r6,1 /* count how many searches we do */ - stw r6, (htab_hash_searches - PAGE_OFFSET)@l(r4) + stw r6, (htab_hash_searches - ADDR_OFFSET)@l(r4) /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ mtctr r0 @@ -378,10 +390,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) beq+ found_empty /* update counter of times that the primary PTEG is full */ - lis r4, (primary_pteg_full - PAGE_OFFSET)@ha - lwz r6, (primary_pteg_full - PAGE_OFFSET)@l(r4) + lis r4, (primary_pteg_full - ADDR_OFFSET)@ha + lwz r6, (primary_pteg_full - ADDR_OFFSET)@l(r4) addi r6,r6,1 - stw r6, (primary_pteg_full - PAGE_OFFSET)@l(r4) + stw r6, (primary_pteg_full - ADDR_OFFSET)@l(r4) patch_site 0f, patch__hash_page_C /* Search the secondary PTEG for an empty slot */ @@ -415,8 +427,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) * lockup here but that shouldn't happen */ -1: lis r4, (next_slot - PAGE_OFFSET)@ha /* get next evict slot */ - lwz r6, (next_slot - PAGE_OFFSET)@l(r4) +1: lis r4, (next_slot - ADDR_OFFSET)@ha /* get next evict slot */ + lwz r6, (next_slot - ADDR_OFFSET)@l(r4) addi r6,r6,HPTE_SIZE /* search for candidate */ andi. r6,r6,7*HPTE_SIZE stw r6,next_slot@l(r4) diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 69b2419accef..0a1c65a2c565 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -413,6 +413,7 @@ void __init MMU_init_hw(void) void __init MMU_init_hw_patch(void) { unsigned int hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); + unsigned int hash; if (ppc_md.progress) ppc_md.progress("hash:patch", 0x345); @@ -424,8 +425,12 @@ void __init MMU_init_hw_patch(void) /* * Patch up the instructions in hashtable.S:create_hpte */ - modify_instruction_site(&patch__hash_page_A0, 0xffff, - ((unsigned int)Hash - PAGE_OFFSET) >> 16); + if (IS_ENABLED(CONFIG_VMAP_STACK)) + hash = (unsigned int)Hash; + else + hash = (unsigned int)Hash - PAGE_OFFSET; + + modify_instruction_site(&patch__hash_page_A0, 0xffff, hash >> 16); modify_instruction_site(&patch__hash_page_A1, 0x7c0, hash_mb << 6); modify_instruction_site(&patch__hash_page_A2, 0x7c0, hash_mb2 << 6); modify_instruction_site(&patch__hash_page_B, 0xffff, hmask); diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 000dd297337c..e90bbb7f74b6 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -31,12 +31,14 @@ config PPC_BOOK3S_6xx select PPC_HAVE_PMU_SUPPORT select PPC_HAVE_KUEP select PPC_HAVE_KUAP + select HAVE_ARCH_VMAP_STACK config PPC_BOOK3S_601 bool "PowerPC 601" select PPC_BOOK3S_32 select PPC_FPU select PPC_HAVE_KUAP + select HAVE_ARCH_VMAP_STACK config PPC_85xx bool "Freescale 85xx" From 0f9aee0cb9da7db7d96f63cfa2dc5e4f1bffeb87 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 23 Dec 2019 07:54:22 +0000 Subject: [PATCH 105/131] powerpc/mm: Don't log user reads to 0xffffffff Running vdsotest leaves many times the following log: [ 79.629901] vdsotest[396]: User access of kernel address (ffffffff) - exploit attempt? (uid: 0) A pointer set to (-1) is likely a programming error similar to a NULL pointer and is not worth logging as an exploit attempt. Don't log user accesses to 0xffffffff. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/0728849e826ba16f1fbd6fa7f5c6cc87bd64e097.1577087627.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/fault.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 9e119f98a725..7534ee5bf9b2 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -350,6 +350,9 @@ static void sanity_check_fault(bool is_write, bool is_user, * Userspace trying to access kernel address, we get PROTFAULT for that. */ if (is_user && address >= TASK_SIZE) { + if ((long)address == -1) + return; + pr_crit_ratelimited("%s[%d]: User access of kernel address (%lx) - exploit attempt? (uid: %d)\n", current->comm, current->pid, address, from_kuid(&init_user_ns, current_uid())); From 3d4247fcc938d0ab5cf6fdb752dae07fdeab9736 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 14 Jan 2020 17:54:00 +0000 Subject: [PATCH 106/131] powerpc/32: Add support of KASAN_VMALLOC Add support of KASAN_VMALLOC on PPC32. To allow this, the early shadow covering the VMALLOC space need to be removed once high_memory var is set and before freeing memblock. And the VMALLOC area need to be aligned such that boundaries are covered by a full shadow page. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/031dec5487bde9b2181c8b3c9800e1879cf98c1a.1579024426.git.christophe.leroy@c-s.fr --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/book3s/32/pgtable.h | 5 +++ arch/powerpc/include/asm/kasan.h | 2 ++ arch/powerpc/include/asm/nohash/32/pgtable.h | 5 +++ arch/powerpc/mm/kasan/kasan_init_32.c | 33 +++++++++++++++++++- arch/powerpc/mm/mem.c | 4 +++ 6 files changed, 49 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 6093c48976bf..a7f10a1b79f7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -173,6 +173,7 @@ config PPC select HAVE_ARCH_HUGE_VMAP if PPC_BOOK3S_64 && PPC_RADIX_MMU select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if PPC32 + select HAVE_ARCH_KASAN_VMALLOC if PPC32 select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 0796533d37dd..5b39c11e884a 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -193,7 +193,12 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); #else #define VMALLOC_START ((((long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))) #endif + +#ifdef CONFIG_KASAN_VMALLOC +#define VMALLOC_END _ALIGN_DOWN(ioremap_bot, PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) +#else #define VMALLOC_END ioremap_bot +#endif #ifndef __ASSEMBLY__ #include diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h index 296e51c2f066..fbff9ff9032e 100644 --- a/arch/powerpc/include/asm/kasan.h +++ b/arch/powerpc/include/asm/kasan.h @@ -31,9 +31,11 @@ void kasan_early_init(void); void kasan_mmu_init(void); void kasan_init(void); +void kasan_late_init(void); #else static inline void kasan_init(void) { } static inline void kasan_mmu_init(void) { } +static inline void kasan_late_init(void) { } #endif #endif /* __ASSEMBLY */ diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 552b96eef0c8..60c4d829152e 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -114,7 +114,12 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); #else #define VMALLOC_START ((((long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))) #endif + +#ifdef CONFIG_KASAN_VMALLOC +#define VMALLOC_END _ALIGN_DOWN(ioremap_bot, PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) +#else #define VMALLOC_END ioremap_bot +#endif /* * Bits in a linux-style PTE. These match the bits in the diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index 0e6ed4413eea..88036fb88350 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -129,6 +129,31 @@ static void __init kasan_remap_early_shadow_ro(void) flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END); } +static void __init kasan_unmap_early_shadow_vmalloc(void) +{ + unsigned long k_start = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_START); + unsigned long k_end = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_END); + unsigned long k_cur; + phys_addr_t pa = __pa(kasan_early_shadow_page); + + if (!early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) { + int ret = kasan_init_shadow_page_tables(k_start, k_end); + + if (ret) + panic("kasan: kasan_init_shadow_page_tables() failed"); + } + for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) { + pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur); + pte_t *ptep = pte_offset_kernel(pmd, k_cur); + + if ((pte_val(*ptep) & PTE_RPN_MASK) != pa) + continue; + + __set_pte_at(&init_mm, k_cur, ptep, __pte(0), 0); + } + flush_tlb_kernel_range(k_start, k_end); +} + void __init kasan_mmu_init(void) { int ret; @@ -165,7 +190,13 @@ void __init kasan_init(void) pr_info("KASAN init done\n"); } -#ifdef CONFIG_MODULES +void __init kasan_late_init(void) +{ + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) + kasan_unmap_early_shadow_vmalloc(); +} + +#if defined(CONFIG_MODULES) && !defined(CONFIG_KASAN_VMALLOC) void *module_alloc(unsigned long size) { void *base; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index f5535eae637f..ef7b1119b2e2 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -49,6 +49,7 @@ #include #include #include +#include #include @@ -301,6 +302,9 @@ void __init mem_init(void) high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); set_max_mapnr(max_pfn); + + kasan_late_init(); + memblock_free_all(); #ifdef CONFIG_HIGHMEM From af1725d2493dcad4eeb7e63141678181fcd8a2ff Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 14 Jan 2020 17:54:01 +0000 Subject: [PATCH 107/131] powerpc/kconfig: Move CONFIG_PPC32 into Kconfig.cputype Move CONFIG_PPC32 at the same place as CONFIG_PPC64 for consistency. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/6f28085c2a1aa987093d50db17586633bbf8e206.1579024426.git.christophe.leroy@c-s.fr --- arch/powerpc/Kconfig | 4 ---- arch/powerpc/platforms/Kconfig.cputype | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a7f10a1b79f7..62752c3bfabf 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -1,10 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 source "arch/powerpc/platforms/Kconfig.cputype" -config PPC32 - bool - default y if !PPC64 - config 32BIT bool default y if PPC32 diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index e90bbb7f74b6..1c4f24a81580 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -1,4 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 +config PPC32 + bool + default y if !PPC64 + config PPC64 bool "64-bit kernel" select ZLIB_DEFLATE From 47febbeeec440eec213960e3d25c57a8312d5340 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 14 Jan 2020 17:54:02 +0000 Subject: [PATCH 108/131] powerpc/32: Force KASAN_VMALLOC for modules Unloading/Reloading of modules seems to fail with KASAN_VMALLOC but works properly with it. Force selection of KASAN_VMALLOC when MODULES are selected, and drop module_alloc() which was dedicated to KASAN for modules. Reported-by: Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://bugzilla.kernel.org/show_bug.cgi?id=205283 Link: https://lore.kernel.org/r/f909da11aecb59ab7f32ba01fae6f356eaa4d7bc.1579024426.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/kasan/kasan_init_32.c | 31 +++++--------------------- arch/powerpc/platforms/Kconfig.cputype | 1 + 2 files changed, 6 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index 88036fb88350..b782d92622b4 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -12,7 +12,7 @@ #include #include -static pgprot_t kasan_prot_ro(void) +static pgprot_t __init kasan_prot_ro(void) { if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) return PAGE_READONLY; @@ -20,7 +20,7 @@ static pgprot_t kasan_prot_ro(void) return PAGE_KERNEL_RO; } -static void kasan_populate_pte(pte_t *ptep, pgprot_t prot) +static void __init kasan_populate_pte(pte_t *ptep, pgprot_t prot) { unsigned long va = (unsigned long)kasan_early_shadow_page; phys_addr_t pa = __pa(kasan_early_shadow_page); @@ -30,7 +30,7 @@ static void kasan_populate_pte(pte_t *ptep, pgprot_t prot) __set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 0); } -static int __ref kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end) +static int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end) { pmd_t *pmd; unsigned long k_cur, k_next; @@ -70,7 +70,7 @@ static int __ref kasan_init_shadow_page_tables(unsigned long k_start, unsigned l return 0; } -static void __ref *kasan_get_one_page(void) +static void __init *kasan_get_one_page(void) { if (slab_is_available()) return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); @@ -78,7 +78,7 @@ static void __ref *kasan_get_one_page(void) return memblock_alloc(PAGE_SIZE, PAGE_SIZE); } -static int __ref kasan_init_region(void *start, size_t size) +static int __init kasan_init_region(void *start, size_t size) { unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start); unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size); @@ -196,27 +196,6 @@ void __init kasan_late_init(void) kasan_unmap_early_shadow_vmalloc(); } -#if defined(CONFIG_MODULES) && !defined(CONFIG_KASAN_VMALLOC) -void *module_alloc(unsigned long size) -{ - void *base; - - base = __vmalloc_node_range(size, MODULE_ALIGN, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __builtin_return_address(0)); - - if (!base) - return NULL; - - if (!kasan_init_region(base, size)) - return base; - - vfree(base); - - return NULL; -} -#endif - #ifdef CONFIG_PPC_BOOK3S_32 u8 __initdata early_hash[256 << 10] __aligned(256 << 10) = {0}; diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 1c4f24a81580..6caedc88474f 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -2,6 +2,7 @@ config PPC32 bool default y if !PPC64 + select KASAN_VMALLOC if KASAN && MODULES config PPC64 bool "64-bit kernel" From 509cd3f2b473330238c768bb21a4f2cdc80393fa Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 14 Jan 2020 17:54:03 +0000 Subject: [PATCH 109/131] powerpc/32: Simplify KASAN init Since kasan_init_region() is not used anymore for modules, KASAN init is done while slab_is_available() is false. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/84b27bf08b41c8343efd88e10f2eccd8e9f85593.1579024426.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/kasan/kasan_init_32.c | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index b782d92622b4..c4bf9ed04f88 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -34,7 +34,6 @@ static int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned { pmd_t *pmd; unsigned long k_cur, k_next; - pgprot_t prot = slab_is_available() ? kasan_prot_ro() : PAGE_KERNEL; pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start); @@ -45,14 +44,11 @@ static int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte) continue; - if (slab_is_available()) - new = pte_alloc_one_kernel(&init_mm); - else - new = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE); + new = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE); if (!new) return -ENOMEM; - kasan_populate_pte(new, prot); + kasan_populate_pte(new, PAGE_KERNEL); smp_wmb(); /* See comment in __pte_alloc */ @@ -63,39 +59,27 @@ static int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned new = NULL; } spin_unlock(&init_mm.page_table_lock); - - if (new && slab_is_available()) - pte_free_kernel(&init_mm, new); } return 0; } -static void __init *kasan_get_one_page(void) -{ - if (slab_is_available()) - return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - - return memblock_alloc(PAGE_SIZE, PAGE_SIZE); -} - static int __init kasan_init_region(void *start, size_t size) { unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start); unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size); unsigned long k_cur; int ret; - void *block = NULL; + void *block; ret = kasan_init_shadow_page_tables(k_start, k_end); if (ret) return ret; - if (!slab_is_available()) - block = memblock_alloc(k_end - k_start, PAGE_SIZE); + block = memblock_alloc(k_end - k_start, PAGE_SIZE); for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) { pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur); - void *va = block ? block + k_cur - k_start : kasan_get_one_page(); + void *va = block + k_cur - k_start; pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL); if (!va) From 21613cfad181c882b1effd227dcfbddc61dc80f7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 14 Jan 2020 17:54:04 +0000 Subject: [PATCH 110/131] powerpc/32: Reuse orphaned memblocks in kasan_init_shadow_page_tables() If concurrent PMD population has happened, re-use orphaned memblocks. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b29ffffb9206dc14541fa420c17604240728041b.1579024426.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/kasan/kasan_init_32.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index c4bf9ed04f88..d3cacd462560 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -34,17 +34,17 @@ static int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned { pmd_t *pmd; unsigned long k_cur, k_next; + pte_t *new = NULL; pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start); for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) { - pte_t *new; - k_next = pgd_addr_end(k_cur, k_end); if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte) continue; - new = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE); + if (!new) + new = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE); if (!new) return -ENOMEM; From 9933819099c4600b41a042f27a074470a43cf6b9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 27 Jan 2020 10:42:04 +0000 Subject: [PATCH 111/131] powerpc/32s: Fix CPU wake-up from sleep mode Commit f7354ccac844 ("powerpc/32: Remove CURRENT_THREAD_INFO and rename TI_CPU") broke the CPU wake-up from sleep mode (i.e. when _TLF_SLEEPING is set) by delaying the tovirt(r2, r2). This is because r2 is not restored by fast_exception_return. It used to work (by chance ?) because CPU wake-up interrupt never comes from user, so r2 is expected to point to 'current' on return. Commit e2fb9f544431 ("powerpc/32: Prepare for Kernel Userspace Access Protection") broke it even more by clobbering r0 which is not restored by fast_exception_return either. Use r6 instead of r0. This is possible because r3-r6 are restored by fast_exception_return and only r3-r5 are used for exception arguments. For r2 it could be converted back to virtual address, but stay on the safe side and restore it from the stack instead. It should be live in the cache at that moment, so loading from the stack should make no difference compared to converting it from phys to virt. Fixes: f7354ccac844 ("powerpc/32: Remove CURRENT_THREAD_INFO and rename TI_CPU") Fixes: e2fb9f544431 ("powerpc/32: Prepare for Kernel Userspace Access Protection") Cc: stable@vger.kernel.org Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/6d02c3ae6ad77af34392e98117e44c2bf6d13ba1.1580121710.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/entry_32.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 3795654d15d1..ac27b6b136d0 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -180,7 +180,7 @@ transfer_to_handler: 2: /* if from kernel, check interrupted DOZE/NAP mode and * check for stack overflow */ - kuap_save_and_lock r11, r12, r9, r2, r0 + kuap_save_and_lock r11, r12, r9, r2, r6 addi r2, r12, -THREAD #ifndef CONFIG_VMAP_STACK lwz r9,KSP_LIMIT(r12) @@ -288,6 +288,7 @@ reenable_mmu: rlwinm r9,r9,0,~MSR_EE lwz r12,_LINK(r11) /* and return to address in LR */ kuap_restore r11, r2, r3, r4, r5 + lwz r2, GPR2(r11) b fast_exception_return #endif From 6ec20aa2e510b6297906c45f009aa08b2d97269a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 24 Jan 2020 11:54:40 +0000 Subject: [PATCH 112/131] powerpc/32s: Fix bad_kuap_fault() At the moment, bad_kuap_fault() reports a fault only if a bad access to userspace occurred while access to userspace was not granted. But if a fault occurs for a write outside the allowed userspace segment(s) that have been unlocked, bad_kuap_fault() fails to detect it and the kernel loops forever in do_page_fault(). Fix it by checking that the accessed address is within the allowed range. Fixes: a68c31fc01ef ("powerpc/32s: Implement Kernel Userspace Access Protection") Cc: stable@vger.kernel.org # v5.2+ Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/f48244e9485ada0a304ed33ccbb8da271180c80d.1579866752.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/book3s/32/kup.h | 9 +++++++-- arch/powerpc/include/asm/book3s/64/kup-radix.h | 3 ++- arch/powerpc/include/asm/kup.h | 6 +++++- arch/powerpc/include/asm/nohash/32/kup-8xx.h | 3 ++- arch/powerpc/mm/fault.c | 2 +- 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h index f9dc597b0b86..d88008c8eb85 100644 --- a/arch/powerpc/include/asm/book3s/32/kup.h +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -131,12 +131,17 @@ static inline void prevent_user_access(void __user *to, const void __user *from, kuap_update_sr(mfsrin(addr) | SR_KS, addr, end); /* set Ks */ } -static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) +static inline bool +bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { + unsigned long begin = regs->kuap & 0xf0000000; + unsigned long end = regs->kuap << 28; + if (!is_write) return false; - return WARN(!regs->kuap, "Bug: write fault blocked by segment registers !"); + return WARN(address < begin || address >= end, + "Bug: write fault blocked by segment registers !"); } #endif /* CONFIG_PPC_KUAP */ diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h index f254de956d6a..dbbd22cb80f5 100644 --- a/arch/powerpc/include/asm/book3s/64/kup-radix.h +++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h @@ -95,7 +95,8 @@ static inline void prevent_user_access(void __user *to, const void __user *from, set_kuap(AMR_KUAP_BLOCKED); } -static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) +static inline bool +bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { return WARN(mmu_has_feature(MMU_FTR_RADIX_KUAP) && (regs->kuap & (is_write ? AMR_KUAP_BLOCK_WRITE : AMR_KUAP_BLOCK_READ)), diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 5b5e39643a27..812e66f31934 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -45,7 +45,11 @@ static inline void allow_user_access(void __user *to, const void __user *from, unsigned long size) { } static inline void prevent_user_access(void __user *to, const void __user *from, unsigned long size) { } -static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) { return false; } +static inline bool +bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) +{ + return false; +} #endif /* CONFIG_PPC_KUAP */ static inline void allow_read_from_user(const void __user *from, unsigned long size) diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h index 1006a427e99c..f2fea603b929 100644 --- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h @@ -46,7 +46,8 @@ static inline void prevent_user_access(void __user *to, const void __user *from, mtspr(SPRN_MD_AP, MD_APG_KUAP); } -static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) +static inline bool +bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { return WARN(!((regs->kuap ^ MD_APG_KUAP) & 0xf0000000), "Bug: fault blocked by AP register !"); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index b5047f9b5dec..1baeb045f7f4 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -233,7 +233,7 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, // Read/write fault in a valid region (the exception table search passed // above), but blocked by KUAP is bad, it can never succeed. - if (bad_kuap_fault(regs, is_write)) + if (bad_kuap_fault(regs, address, is_write)) return true; // What's left? Kernel fault on user in well defined regions (extable From 1d8f739b07bd538f272f60bf53f10e7e6248d295 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 24 Jan 2020 11:54:41 +0000 Subject: [PATCH 113/131] powerpc/kuap: Fix set direction in allow/prevent_user_access() __builtin_constant_p() always return 0 for pointers, so on RADIX we always end up opening both direction (by writing 0 in SPR29): 0000000000000170 <._copy_to_user>: ... 1b0: 4c 00 01 2c isync 1b4: 39 20 00 00 li r9,0 1b8: 7d 3d 03 a6 mtspr 29,r9 1bc: 4c 00 01 2c isync 1c0: 48 00 00 01 bl 1c0 <._copy_to_user+0x50> 1c0: R_PPC64_REL24 .__copy_tofrom_user ... 0000000000000220 <._copy_from_user>: ... 2ac: 4c 00 01 2c isync 2b0: 39 20 00 00 li r9,0 2b4: 7d 3d 03 a6 mtspr 29,r9 2b8: 4c 00 01 2c isync 2bc: 7f c5 f3 78 mr r5,r30 2c0: 7f 83 e3 78 mr r3,r28 2c4: 48 00 00 01 bl 2c4 <._copy_from_user+0xa4> 2c4: R_PPC64_REL24 .__copy_tofrom_user ... Use an explicit parameter for direction selection, so that GCC is able to see it is a constant: 00000000000001b0 <._copy_to_user>: ... 1f0: 4c 00 01 2c isync 1f4: 3d 20 40 00 lis r9,16384 1f8: 79 29 07 c6 rldicr r9,r9,32,31 1fc: 7d 3d 03 a6 mtspr 29,r9 200: 4c 00 01 2c isync 204: 48 00 00 01 bl 204 <._copy_to_user+0x54> 204: R_PPC64_REL24 .__copy_tofrom_user ... 0000000000000260 <._copy_from_user>: ... 2ec: 4c 00 01 2c isync 2f0: 39 20 ff ff li r9,-1 2f4: 79 29 00 04 rldicr r9,r9,0,0 2f8: 7d 3d 03 a6 mtspr 29,r9 2fc: 4c 00 01 2c isync 300: 7f c5 f3 78 mr r5,r30 304: 7f 83 e3 78 mr r3,r28 308: 48 00 00 01 bl 308 <._copy_from_user+0xa8> 308: R_PPC64_REL24 .__copy_tofrom_user ... Signed-off-by: Christophe Leroy [mpe: Spell out the directions, s/KUAP_R/KUAP_READ/ etc.] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/f4e88ec4941d5facb35ce75026b0112f980086c3.1579866752.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/book3s/32/kup.h | 13 ++++++-- .../powerpc/include/asm/book3s/64/kup-radix.h | 11 +++---- arch/powerpc/include/asm/kup.h | 30 ++++++++++++++----- arch/powerpc/include/asm/nohash/32/kup-8xx.h | 4 +-- arch/powerpc/include/asm/uaccess.h | 4 +-- 5 files changed, 43 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h index d88008c8eb85..91c8f1d9bcee 100644 --- a/arch/powerpc/include/asm/book3s/32/kup.h +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -102,11 +102,13 @@ static inline void kuap_update_sr(u32 sr, u32 addr, u32 end) isync(); /* Context sync required after mtsrin() */ } -static inline void allow_user_access(void __user *to, const void __user *from, u32 size) +static __always_inline void allow_user_access(void __user *to, const void __user *from, + u32 size, unsigned long dir) { u32 addr, end; - if (__builtin_constant_p(to) && to == NULL) + BUILD_BUG_ON(!__builtin_constant_p(dir)); + if (!(dir & KUAP_WRITE)) return; addr = (__force u32)to; @@ -119,11 +121,16 @@ static inline void allow_user_access(void __user *to, const void __user *from, u kuap_update_sr(mfsrin(addr) & ~SR_KS, addr, end); /* Clear Ks */ } -static inline void prevent_user_access(void __user *to, const void __user *from, u32 size) +static __always_inline void prevent_user_access(void __user *to, const void __user *from, + u32 size, unsigned long dir) { u32 addr = (__force u32)to; u32 end = min(addr + size, TASK_SIZE); + BUILD_BUG_ON(!__builtin_constant_p(dir)); + if (!(dir & KUAP_WRITE)) + return; + if (!addr || addr >= TASK_SIZE || !size) return; diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h index dbbd22cb80f5..c8d1076e0ebb 100644 --- a/arch/powerpc/include/asm/book3s/64/kup-radix.h +++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h @@ -77,20 +77,21 @@ static inline void set_kuap(unsigned long value) isync(); } -static inline void allow_user_access(void __user *to, const void __user *from, - unsigned long size) +static __always_inline void allow_user_access(void __user *to, const void __user *from, + unsigned long size, unsigned long dir) { // This is written so we can resolve to a single case at build time - if (__builtin_constant_p(to) && to == NULL) + BUILD_BUG_ON(!__builtin_constant_p(dir)); + if (dir == KUAP_READ) set_kuap(AMR_KUAP_BLOCK_WRITE); - else if (__builtin_constant_p(from) && from == NULL) + else if (dir == KUAP_WRITE) set_kuap(AMR_KUAP_BLOCK_READ); else set_kuap(0); } static inline void prevent_user_access(void __user *to, const void __user *from, - unsigned long size) + unsigned long size, unsigned long dir) { set_kuap(AMR_KUAP_BLOCKED); } diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 812e66f31934..94f24928916a 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -2,6 +2,10 @@ #ifndef _ASM_POWERPC_KUP_H_ #define _ASM_POWERPC_KUP_H_ +#define KUAP_READ 1 +#define KUAP_WRITE 2 +#define KUAP_READ_WRITE (KUAP_READ | KUAP_WRITE) + #ifdef CONFIG_PPC64 #include #endif @@ -42,9 +46,9 @@ void setup_kuap(bool disabled); #else static inline void setup_kuap(bool disabled) { } static inline void allow_user_access(void __user *to, const void __user *from, - unsigned long size) { } + unsigned long size, unsigned long dir) { } static inline void prevent_user_access(void __user *to, const void __user *from, - unsigned long size) { } + unsigned long size, unsigned long dir) { } static inline bool bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { @@ -54,24 +58,36 @@ bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) static inline void allow_read_from_user(const void __user *from, unsigned long size) { - allow_user_access(NULL, from, size); + allow_user_access(NULL, from, size, KUAP_READ); } static inline void allow_write_to_user(void __user *to, unsigned long size) { - allow_user_access(to, NULL, size); + allow_user_access(to, NULL, size, KUAP_WRITE); +} + +static inline void allow_read_write_user(void __user *to, const void __user *from, + unsigned long size) +{ + allow_user_access(to, from, size, KUAP_READ_WRITE); } static inline void prevent_read_from_user(const void __user *from, unsigned long size) { - prevent_user_access(NULL, from, size); + prevent_user_access(NULL, from, size, KUAP_READ); } static inline void prevent_write_to_user(void __user *to, unsigned long size) { - prevent_user_access(to, NULL, size); + prevent_user_access(to, NULL, size, KUAP_WRITE); +} + +static inline void prevent_read_write_user(void __user *to, const void __user *from, + unsigned long size) +{ + prevent_user_access(to, from, size, KUAP_READ_WRITE); } #endif /* !__ASSEMBLY__ */ -#endif /* _ASM_POWERPC_KUP_H_ */ +#endif /* _ASM_POWERPC_KUAP_H_ */ diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h index f2fea603b929..1d70c80366fd 100644 --- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h @@ -35,13 +35,13 @@ #include static inline void allow_user_access(void __user *to, const void __user *from, - unsigned long size) + unsigned long size, unsigned long dir) { mtspr(SPRN_MD_AP, MD_APG_INIT); } static inline void prevent_user_access(void __user *to, const void __user *from, - unsigned long size) + unsigned long size, unsigned long dir) { mtspr(SPRN_MD_AP, MD_APG_KUAP); } diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index c92fe7fe9692..cafad1960e76 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -313,9 +313,9 @@ raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) unsigned long ret; barrier_nospec(); - allow_user_access(to, from, n); + allow_read_write_user(to, from, n); ret = __copy_tofrom_user(to, from, n); - prevent_user_access(to, from, n); + prevent_read_write_user(to, from, n); return ret; } #endif /* __powerpc64__ */ From 88f8c080d47f307871840a0a825ef556673eb592 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 24 Jan 2020 11:54:42 +0000 Subject: [PATCH 114/131] powerpc/32s: Drop NULL addr verification NULL addr is a user address. Don't waste time checking it. If someone tries to access it, it will SIGFAULT the same way as for address 1, so no need to make it special. The special case is when not doing a write, in that case we want to drop the entire function. This is now handled by 'dir' param and not by the nulity of 'to' anymore. Also make beginning of prevent_user_access() similar to beginning of allow_user_access(), and tell the compiler that writing in kernel space or with a 0 length is unlikely Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/85e971223dfe6ace734637db1841678939a76155.1579866752.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/book3s/32/kup.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h index 91c8f1d9bcee..de29fb752cca 100644 --- a/arch/powerpc/include/asm/book3s/32/kup.h +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -113,7 +113,7 @@ static __always_inline void allow_user_access(void __user *to, const void __user addr = (__force u32)to; - if (!addr || addr >= TASK_SIZE || !size) + if (unlikely(addr >= TASK_SIZE || !size)) return; end = min(addr + size, TASK_SIZE); @@ -124,16 +124,18 @@ static __always_inline void allow_user_access(void __user *to, const void __user static __always_inline void prevent_user_access(void __user *to, const void __user *from, u32 size, unsigned long dir) { - u32 addr = (__force u32)to; - u32 end = min(addr + size, TASK_SIZE); + u32 addr, end; BUILD_BUG_ON(!__builtin_constant_p(dir)); if (!(dir & KUAP_WRITE)) return; - if (!addr || addr >= TASK_SIZE || !size) + addr = (__force u32)to; + + if (unlikely(addr >= TASK_SIZE || !size)) return; + end = min(addr + size, TASK_SIZE); current->thread.kuap = 0; kuap_update_sr(mfsrin(addr) | SR_KS, addr, end); /* set Ks */ } From bedb4dbe443c11ff551b4ae4e48c8676fdc96467 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 24 Jan 2020 11:54:43 +0000 Subject: [PATCH 115/131] powerpc/32s: Prepare prevent_user_access() for user_access_end() In preparation of implementing user_access_begin and friends on powerpc, the book3s/32 version of prevent_user_access() need to be prepared for user_access_end(). user_access_end() doesn't provide the address and size which were passed to user_access_begin(), required by prevent_user_access() to know which segment to modify. The list of segments which where unprotected by allow_user_access() are available in current->kuap. But we don't want prevent_user_access() to read this all the time, especially everytime it is 0 (for instance because the access was not a write access). Implement a special direction named KUAP_CURRENT. In this case only, the addr and end are retrieved from current->kuap. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/55bcc1f25d8200892a31f67a0b024ff3b816c3cc.1579866752.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/book3s/32/kup.h | 27 ++++++++++++++----- .../powerpc/include/asm/book3s/64/kup-radix.h | 4 ++- arch/powerpc/include/asm/kup.h | 11 ++++++++ 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h index de29fb752cca..17e069291c72 100644 --- a/arch/powerpc/include/asm/book3s/32/kup.h +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -108,6 +108,8 @@ static __always_inline void allow_user_access(void __user *to, const void __user u32 addr, end; BUILD_BUG_ON(!__builtin_constant_p(dir)); + BUILD_BUG_ON(dir == KUAP_CURRENT); + if (!(dir & KUAP_WRITE)) return; @@ -117,6 +119,7 @@ static __always_inline void allow_user_access(void __user *to, const void __user return; end = min(addr + size, TASK_SIZE); + current->thread.kuap = (addr & 0xf0000000) | ((((end - 1) >> 28) + 1) & 0xf); kuap_update_sr(mfsrin(addr) & ~SR_KS, addr, end); /* Clear Ks */ } @@ -127,15 +130,25 @@ static __always_inline void prevent_user_access(void __user *to, const void __us u32 addr, end; BUILD_BUG_ON(!__builtin_constant_p(dir)); - if (!(dir & KUAP_WRITE)) + + if (dir == KUAP_CURRENT) { + u32 kuap = current->thread.kuap; + + if (unlikely(!kuap)) + return; + + addr = kuap & 0xf0000000; + end = kuap << 28; + } else if (dir & KUAP_WRITE) { + addr = (__force u32)to; + end = min(addr + size, TASK_SIZE); + + if (unlikely(addr >= TASK_SIZE || !size)) + return; + } else { return; + } - addr = (__force u32)to; - - if (unlikely(addr >= TASK_SIZE || !size)) - return; - - end = min(addr + size, TASK_SIZE); current->thread.kuap = 0; kuap_update_sr(mfsrin(addr) | SR_KS, addr, end); /* set Ks */ } diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h index c8d1076e0ebb..a0263e94df33 100644 --- a/arch/powerpc/include/asm/book3s/64/kup-radix.h +++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h @@ -86,8 +86,10 @@ static __always_inline void allow_user_access(void __user *to, const void __user set_kuap(AMR_KUAP_BLOCK_WRITE); else if (dir == KUAP_WRITE) set_kuap(AMR_KUAP_BLOCK_READ); - else + else if (dir == KUAP_READ_WRITE) set_kuap(0); + else + BUILD_BUG(); } static inline void prevent_user_access(void __user *to, const void __user *from, diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 94f24928916a..c3ce7e8ae9ea 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -5,6 +5,12 @@ #define KUAP_READ 1 #define KUAP_WRITE 2 #define KUAP_READ_WRITE (KUAP_READ | KUAP_WRITE) +/* + * For prevent_user_access() only. + * Use the current saved situation instead of the to/from/size params. + * Used on book3s/32 + */ +#define KUAP_CURRENT 4 #ifdef CONFIG_PPC64 #include @@ -88,6 +94,11 @@ static inline void prevent_read_write_user(void __user *to, const void __user *f prevent_user_access(to, from, size, KUAP_READ_WRITE); } +static inline void prevent_current_access_user(void) +{ + prevent_user_access(NULL, NULL, ~0UL, KUAP_CURRENT); +} + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_KUAP_H_ */ From 5cd623333e7cf4e3a334c70529268b65f2a6c2c7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 24 Jan 2020 11:54:44 +0000 Subject: [PATCH 116/131] powerpc: Implement user_access_begin and friends Today, when a function like strncpy_from_user() is called, the userspace access protection is de-activated and re-activated for every word read. By implementing user_access_begin and friends, the protection is de-activated at the beginning of the copy and re-activated at the end. Implement user_access_begin(), user_access_end() and unsafe_get_user(), unsafe_put_user() and unsafe_copy_to_user() For the time being, we keep user_access_save() and user_access_restore() as nops. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/36d4fbf9e56a75994aca4ee2214c77b26a5a8d35.1579866752.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/uaccess.h | 85 +++++++++++++++++++++++------- 1 file changed, 66 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index cafad1960e76..af905d7fc1df 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -91,9 +91,14 @@ static inline int __access_ok(unsigned long addr, unsigned long size, __put_user_check((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) #define __get_user(x, ptr) \ - __get_user_nocheck((x), (ptr), sizeof(*(ptr))) + __get_user_nocheck((x), (ptr), sizeof(*(ptr)), true) #define __put_user(x, ptr) \ - __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) + __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), true) + +#define __get_user_allowed(x, ptr) \ + __get_user_nocheck((x), (ptr), sizeof(*(ptr)), false) +#define __put_user_allowed(x, ptr) \ + __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), false) #define __get_user_inatomic(x, ptr) \ __get_user_nosleep((x), (ptr), sizeof(*(ptr))) @@ -138,10 +143,9 @@ extern long __put_user_bad(void); : "r" (x), "b" (addr), "i" (-EFAULT), "0" (err)) #endif /* __powerpc64__ */ -#define __put_user_size(x, ptr, size, retval) \ +#define __put_user_size_allowed(x, ptr, size, retval) \ do { \ retval = 0; \ - allow_write_to_user(ptr, size); \ switch (size) { \ case 1: __put_user_asm(x, ptr, retval, "stb"); break; \ case 2: __put_user_asm(x, ptr, retval, "sth"); break; \ @@ -149,17 +153,26 @@ do { \ case 8: __put_user_asm2(x, ptr, retval); break; \ default: __put_user_bad(); \ } \ +} while (0) + +#define __put_user_size(x, ptr, size, retval) \ +do { \ + allow_write_to_user(ptr, size); \ + __put_user_size_allowed(x, ptr, size, retval); \ prevent_write_to_user(ptr, size); \ } while (0) -#define __put_user_nocheck(x, ptr, size) \ +#define __put_user_nocheck(x, ptr, size, do_allow) \ ({ \ long __pu_err; \ __typeof__(*(ptr)) __user *__pu_addr = (ptr); \ if (!is_kernel_addr((unsigned long)__pu_addr)) \ might_fault(); \ __chk_user_ptr(ptr); \ - __put_user_size((x), __pu_addr, (size), __pu_err); \ + if (do_allow) \ + __put_user_size((x), __pu_addr, (size), __pu_err); \ + else \ + __put_user_size_allowed((x), __pu_addr, (size), __pu_err); \ __pu_err; \ }) @@ -236,13 +249,12 @@ extern long __get_user_bad(void); : "b" (addr), "i" (-EFAULT), "0" (err)) #endif /* __powerpc64__ */ -#define __get_user_size(x, ptr, size, retval) \ +#define __get_user_size_allowed(x, ptr, size, retval) \ do { \ retval = 0; \ __chk_user_ptr(ptr); \ if (size > sizeof(x)) \ (x) = __get_user_bad(); \ - allow_read_from_user(ptr, size); \ switch (size) { \ case 1: __get_user_asm(x, ptr, retval, "lbz"); break; \ case 2: __get_user_asm(x, ptr, retval, "lhz"); break; \ @@ -250,6 +262,12 @@ do { \ case 8: __get_user_asm2(x, ptr, retval); break; \ default: (x) = __get_user_bad(); \ } \ +} while (0) + +#define __get_user_size(x, ptr, size, retval) \ +do { \ + allow_read_from_user(ptr, size); \ + __get_user_size_allowed(x, ptr, size, retval); \ prevent_read_from_user(ptr, size); \ } while (0) @@ -260,7 +278,7 @@ do { \ #define __long_type(x) \ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) -#define __get_user_nocheck(x, ptr, size) \ +#define __get_user_nocheck(x, ptr, size, do_allow) \ ({ \ long __gu_err; \ __long_type(*(ptr)) __gu_val; \ @@ -269,7 +287,10 @@ do { \ if (!is_kernel_addr((unsigned long)__gu_addr)) \ might_fault(); \ barrier_nospec(); \ - __get_user_size(__gu_val, __gu_addr, (size), __gu_err); \ + if (do_allow) \ + __get_user_size(__gu_val, __gu_addr, (size), __gu_err); \ + else \ + __get_user_size_allowed(__gu_val, __gu_addr, (size), __gu_err); \ (x) = (__typeof__(*(ptr)))__gu_val; \ __gu_err; \ }) @@ -356,33 +377,40 @@ static inline unsigned long raw_copy_from_user(void *to, return ret; } -static inline unsigned long raw_copy_to_user(void __user *to, - const void *from, unsigned long n) +static inline unsigned long +raw_copy_to_user_allowed(void __user *to, const void *from, unsigned long n) { - unsigned long ret; if (__builtin_constant_p(n) && (n <= 8)) { - ret = 1; + unsigned long ret = 1; switch (n) { case 1: - __put_user_size(*(u8 *)from, (u8 __user *)to, 1, ret); + __put_user_size_allowed(*(u8 *)from, (u8 __user *)to, 1, ret); break; case 2: - __put_user_size(*(u16 *)from, (u16 __user *)to, 2, ret); + __put_user_size_allowed(*(u16 *)from, (u16 __user *)to, 2, ret); break; case 4: - __put_user_size(*(u32 *)from, (u32 __user *)to, 4, ret); + __put_user_size_allowed(*(u32 *)from, (u32 __user *)to, 4, ret); break; case 8: - __put_user_size(*(u64 *)from, (u64 __user *)to, 8, ret); + __put_user_size_allowed(*(u64 *)from, (u64 __user *)to, 8, ret); break; } if (ret == 0) return 0; } + return __copy_tofrom_user(to, (__force const void __user *)from, n); +} + +static inline unsigned long +raw_copy_to_user(void __user *to, const void *from, unsigned long n) +{ + unsigned long ret; + allow_write_to_user(to, n); - ret = __copy_tofrom_user(to, (__force const void __user *)from, n); + ret = raw_copy_to_user_allowed(to, from, n); prevent_write_to_user(to, n); return ret; } @@ -428,4 +456,23 @@ extern long __copy_from_user_flushcache(void *dst, const void __user *src, extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset, size_t len); +static __must_check inline bool user_access_begin(const void __user *ptr, size_t len) +{ + if (unlikely(!access_ok(ptr, len))) + return false; + allow_read_write_user((void __user *)ptr, ptr, len); + return true; +} +#define user_access_begin user_access_begin +#define user_access_end prevent_current_access_user + +static inline unsigned long user_access_save(void) { return 0UL; } +static inline void user_access_restore(unsigned long flags) { } + +#define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) +#define unsafe_get_user(x, p, e) unsafe_op_wrap(__get_user_allowed(x, p), e) +#define unsafe_put_user(x, p, e) unsafe_op_wrap(__put_user_allowed(x, p), e) +#define unsafe_copy_to_user(d, s, l, e) \ + unsafe_op_wrap(raw_copy_to_user_allowed(d, s, l), e) + #endif /* _ARCH_POWERPC_UACCESS_H */ From 3d7dfd632f9b60cfce069b4da517e6b1a1c3f613 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 24 Jan 2020 11:54:45 +0000 Subject: [PATCH 117/131] powerpc: Implement user_access_save() and user_access_restore() Implement user_access_save() and user_access_restore() On 8xx and radix: - On save, get the value of the associated special register then prevent user access. - On restore, set back the saved value to the associated special register. On book3s/32: - On save, get the value stored in current->thread.kuap and prevent user access. - On restore, regenerate address range from the stored value and reopen read/write access for that range. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/54f2f74938006b33c55a416674807b42ef222068.1579866752.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/book3s/32/kup.h | 23 +++++++++++++++++++ .../powerpc/include/asm/book3s/64/kup-radix.h | 22 ++++++++++++++++++ arch/powerpc/include/asm/kup.h | 2 ++ arch/powerpc/include/asm/nohash/32/kup-8xx.h | 14 +++++++++++ arch/powerpc/include/asm/uaccess.h | 5 ++-- 5 files changed, 63 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h index 17e069291c72..3c0ba22dc360 100644 --- a/arch/powerpc/include/asm/book3s/32/kup.h +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -153,6 +153,29 @@ static __always_inline void prevent_user_access(void __user *to, const void __us kuap_update_sr(mfsrin(addr) | SR_KS, addr, end); /* set Ks */ } +static inline unsigned long prevent_user_access_return(void) +{ + unsigned long flags = current->thread.kuap; + unsigned long addr = flags & 0xf0000000; + unsigned long end = flags << 28; + void __user *to = (__force void __user *)addr; + + if (flags) + prevent_user_access(to, to, end - addr, KUAP_READ_WRITE); + + return flags; +} + +static inline void restore_user_access(unsigned long flags) +{ + unsigned long addr = flags & 0xf0000000; + unsigned long end = flags << 28; + void __user *to = (__force void __user *)addr; + + if (flags) + allow_user_access(to, to, end - addr, KUAP_READ_WRITE); +} + static inline bool bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h index a0263e94df33..90dd3a3fc8c7 100644 --- a/arch/powerpc/include/asm/book3s/64/kup-radix.h +++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h @@ -63,6 +63,14 @@ * because that would require an expensive read/modify write of the AMR. */ +static inline unsigned long get_kuap(void) +{ + if (!early_mmu_has_feature(MMU_FTR_RADIX_KUAP)) + return 0; + + return mfspr(SPRN_AMR); +} + static inline void set_kuap(unsigned long value) { if (!early_mmu_has_feature(MMU_FTR_RADIX_KUAP)) @@ -98,6 +106,20 @@ static inline void prevent_user_access(void __user *to, const void __user *from, set_kuap(AMR_KUAP_BLOCKED); } +static inline unsigned long prevent_user_access_return(void) +{ + unsigned long flags = get_kuap(); + + set_kuap(AMR_KUAP_BLOCKED); + + return flags; +} + +static inline void restore_user_access(unsigned long flags) +{ + set_kuap(flags); +} + static inline bool bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index c3ce7e8ae9ea..92bcd1a26d73 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -55,6 +55,8 @@ static inline void allow_user_access(void __user *to, const void __user *from, unsigned long size, unsigned long dir) { } static inline void prevent_user_access(void __user *to, const void __user *from, unsigned long size, unsigned long dir) { } +static inline unsigned long prevent_user_access_return(void) { return 0UL; } +static inline void restore_user_access(unsigned long flags) { } static inline bool bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h index 1d70c80366fd..85ed2390fb99 100644 --- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h @@ -46,6 +46,20 @@ static inline void prevent_user_access(void __user *to, const void __user *from, mtspr(SPRN_MD_AP, MD_APG_KUAP); } +static inline unsigned long prevent_user_access_return(void) +{ + unsigned long flags = mfspr(SPRN_MD_AP); + + mtspr(SPRN_MD_AP, MD_APG_KUAP); + + return flags; +} + +static inline void restore_user_access(unsigned long flags) +{ + mtspr(SPRN_MD_AP, flags); +} + static inline bool bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index af905d7fc1df..2f500debae21 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -465,9 +465,8 @@ static __must_check inline bool user_access_begin(const void __user *ptr, size_t } #define user_access_begin user_access_begin #define user_access_end prevent_current_access_user - -static inline unsigned long user_access_save(void) { return 0UL; } -static inline void user_access_restore(unsigned long flags) { } +#define user_access_save prevent_user_access_return +#define user_access_restore restore_user_access #define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) #define unsafe_get_user(x, p, e) unsafe_op_wrap(__get_user_allowed(x, p), e) From 58b278f568f0509497e2df7310bfd719156a60d1 Mon Sep 17 00:00:00 2001 From: Vaibhav Jain Date: Wed, 28 Aug 2019 13:57:29 +0530 Subject: [PATCH 118/131] powerpc: Provide initial documentation for PAPR hcalls This doc patch provides an initial description of the hcall op-codes that are used by Linux kernel running as a guest (LPAR) on top of PowerVM or any other sPAPR compliant hyper-visor (e.g qemu). Apart from documenting the hcalls the doc-patch also provides a rudimentary overview of how hcall ABI, how they are issued with the Linux kernel and how information/control flows between the guest and hypervisor. Signed-off-by: Vaibhav Jain Reviewed-by: Laurent Dufour Acked-by: Nicholas Piggin [mpe: Add SPDX tag, add it to index.rst] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190828082729.16695-1-vaibhav@linux.ibm.com --- Documentation/powerpc/index.rst | 1 + Documentation/powerpc/papr_hcalls.rst | 250 ++++++++++++++++++++++++++ arch/powerpc/kernel/exceptions-64s.S | 19 +- 3 files changed, 254 insertions(+), 16 deletions(-) create mode 100644 Documentation/powerpc/papr_hcalls.rst diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst index 2df04259db65..0d45f0fc8e57 100644 --- a/Documentation/powerpc/index.rst +++ b/Documentation/powerpc/index.rst @@ -22,6 +22,7 @@ powerpc isa-versions kaslr-booke32 mpc52xx + papr_hcalls pci_iov_resource_on_powernv pmu-ebb ptrace diff --git a/Documentation/powerpc/papr_hcalls.rst b/Documentation/powerpc/papr_hcalls.rst new file mode 100644 index 000000000000..3493631a60f8 --- /dev/null +++ b/Documentation/powerpc/papr_hcalls.rst @@ -0,0 +1,250 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================== +Hypercall Op-codes (hcalls) +=========================== + +Overview +========= + +Virtualization on 64-bit Power Book3S Platforms is based on the PAPR +specification [1]_ which describes the run-time environment for a guest +operating system and how it should interact with the hypervisor for +privileged operations. Currently there are two PAPR compliant hypervisors: + +- **IBM PowerVM (PHYP)**: IBM's proprietary hypervisor that supports AIX, + IBM-i and Linux as supported guests (termed as Logical Partitions + or LPARS). It supports the full PAPR specification. + +- **Qemu/KVM**: Supports PPC64 linux guests running on a PPC64 linux host. + Though it only implements a subset of PAPR specification called LoPAPR [2]_. + +On PPC64 arch a guest kernel running on top of a PAPR hypervisor is called +a *pSeries guest*. A pseries guest runs in a supervisor mode (HV=0) and must +issue hypercalls to the hypervisor whenever it needs to perform an action +that is hypervisor priviledged [3]_ or for other services managed by the +hypervisor. + +Hence a Hypercall (hcall) is essentially a request by the pseries guest +asking hypervisor to perform a privileged operation on behalf of the guest. The +guest issues a with necessary input operands. The hypervisor after performing +the privilege operation returns a status code and output operands back to the +guest. + +HCALL ABI +========= +The ABI specification for a hcall between a pseries guest and PAPR hypervisor +is covered in section 14.5.3 of ref [2]_. Switch to the Hypervisor context is +done via the instruction **HVCS** that expects the Opcode for hcall is set in *r3* +and any in-arguments for the hcall are provided in registers *r4-r12*. If values +have to be passed through a memory buffer, the data stored in that buffer should be +in Big-endian byte order. + +Once control is returns back to the guest after hypervisor has serviced the +'HVCS' instruction the return value of the hcall is available in *r3* and any +out values are returned in registers *r4-r12*. Again like in case of in-arguments, +any out values stored in a memory buffer will be in Big-endian byte order. + +Powerpc arch code provides convenient wrappers named **plpar_hcall_xxx** defined +in a arch specific header [4]_ to issue hcalls from the linux kernel +running as pseries guest. + +Register Conventions +==================== + +Any hcall should follow same register convention as described in section 2.2.1.1 +of "64-Bit ELF V2 ABI Specification: Power Architecture"[5]_. Table below +summarizes these conventions: + ++----------+----------+-------------------------------------------+ +| Register |Volatile | Purpose | +| Range |(Y/N) | | ++==========+==========+===========================================+ +| r0 | Y | Optional-usage | ++----------+----------+-------------------------------------------+ +| r1 | N | Stack Pointer | ++----------+----------+-------------------------------------------+ +| r2 | N | TOC | ++----------+----------+-------------------------------------------+ +| r3 | Y | hcall opcode/return value | ++----------+----------+-------------------------------------------+ +| r4-r10 | Y | in and out values | ++----------+----------+-------------------------------------------+ +| r11 | Y | Optional-usage/Environmental pointer | ++----------+----------+-------------------------------------------+ +| r12 | Y | Optional-usage/Function entry address at | +| | | global entry point | ++----------+----------+-------------------------------------------+ +| r13 | N | Thread-Pointer | ++----------+----------+-------------------------------------------+ +| r14-r31 | N | Local Variables | ++----------+----------+-------------------------------------------+ +| LR | Y | Link Register | ++----------+----------+-------------------------------------------+ +| CTR | Y | Loop Counter | ++----------+----------+-------------------------------------------+ +| XER | Y | Fixed-point exception register. | ++----------+----------+-------------------------------------------+ +| CR0-1 | Y | Condition register fields. | ++----------+----------+-------------------------------------------+ +| CR2-4 | N | Condition register fields. | ++----------+----------+-------------------------------------------+ +| CR5-7 | Y | Condition register fields. | ++----------+----------+-------------------------------------------+ +| Others | N | | ++----------+----------+-------------------------------------------+ + +DRC & DRC Indexes +================= +:: + + DR1 Guest + +--+ +------------+ +---------+ + | | <----> | | | User | + +--+ DRC1 | | DRC | Space | + | PAPR | Index +---------+ + DR2 | Hypervisor | | | + +--+ | | <-----> | Kernel | + | | <----> | | Hcall | | + +--+ DRC2 +------------+ +---------+ + +PAPR hypervisor terms shared hardware resources like PCI devices, NVDIMMs etc +available for use by LPARs as Dynamic Resource (DR). When a DR is allocated to +an LPAR, PHYP creates a data-structure called Dynamic Resource Connector (DRC) +to manage LPAR access. An LPAR refers to a DRC via an opaque 32-bit number +called DRC-Index. The DRC-index value is provided to the LPAR via device-tree +where its present as an attribute in the device tree node associated with the +DR. + +HCALL Return-values +=================== + +After servicing the hcall, hypervisor sets the return-value in *r3* indicating +success or failure of the hcall. In case of a failure an error code indicates +the cause for error. These codes are defined and documented in arch specific +header [4]_. + +In some cases a hcall can potentially take a long time and need to be issued +multiple times in order to be completely serviced. These hcalls will usually +accept an opaque value *continue-token* within there argument list and a +return value of *H_CONTINUE* indicates that hypervisor hasn't still finished +servicing the hcall yet. + +To make such hcalls the guest need to set *continue-token == 0* for the +initial call and use the hypervisor returned value of *continue-token* +for each subsequent hcall until hypervisor returns a non *H_CONTINUE* +return value. + +HCALL Op-codes +============== + +Below is a partial list of HCALLs that are supported by PHYP. For the +corresponding opcode values please look into the arch specific header [4]_: + +**H_SCM_READ_METADATA** + +| Input: *drcIndex, offset, buffer-address, numBytesToRead* +| Out: *numBytesRead* +| Return Value: *H_Success, H_Parameter, H_P2, H_P3, H_Hardware* + +Given a DRC Index of an NVDIMM, read N-bytes from the the metadata area +associated with it, at a specified offset and copy it to provided buffer. +The metadata area stores configuration information such as label information, +bad-blocks etc. The metadata area is located out-of-band of NVDIMM storage +area hence a separate access semantics is provided. + +**H_SCM_WRITE_METADATA** + +| Input: *drcIndex, offset, data, numBytesToWrite* +| Out: *None* +| Return Value: *H_Success, H_Parameter, H_P2, H_P4, H_Hardware* + +Given a DRC Index of an NVDIMM, write N-bytes to the metadata area +associated with it, at the specified offset and from the provided buffer. + +**H_SCM_BIND_MEM** + +| Input: *drcIndex, startingScmBlockIndex, numScmBlocksToBind,* +| *targetLogicalMemoryAddress, continue-token* +| Out: *continue-token, targetLogicalMemoryAddress, numScmBlocksToBound* +| Return Value: *H_Success, H_Parameter, H_P2, H_P3, H_P4, H_Overlap,* +| *H_Too_Big, H_P5, H_Busy* + +Given a DRC-Index of an NVDIMM, map a continuous SCM blocks range +*(startingScmBlockIndex, startingScmBlockIndex+numScmBlocksToBind)* to the guest +at *targetLogicalMemoryAddress* within guest physical address space. In +case *targetLogicalMemoryAddress == 0xFFFFFFFF_FFFFFFFF* then hypervisor +assigns a target address to the guest. The HCALL can fail if the Guest has +an active PTE entry to the SCM block being bound. + +**H_SCM_UNBIND_MEM** +| Input: drcIndex, startingScmLogicalMemoryAddress, numScmBlocksToUnbind +| Out: numScmBlocksUnbound +| Return Value: *H_Success, H_Parameter, H_P2, H_P3, H_In_Use, H_Overlap,* +| *H_Busy, H_LongBusyOrder1mSec, H_LongBusyOrder10mSec* + +Given a DRC-Index of an NVDimm, unmap *numScmBlocksToUnbind* SCM blocks starting +at *startingScmLogicalMemoryAddress* from guest physical address space. The +HCALL can fail if the Guest has an active PTE entry to the SCM block being +unbound. + +**H_SCM_QUERY_BLOCK_MEM_BINDING** + +| Input: *drcIndex, scmBlockIndex* +| Out: *Guest-Physical-Address* +| Return Value: *H_Success, H_Parameter, H_P2, H_NotFound* + +Given a DRC-Index and an SCM Block index return the guest physical address to +which the SCM block is mapped to. + +**H_SCM_QUERY_LOGICAL_MEM_BINDING** + +| Input: *Guest-Physical-Address* +| Out: *drcIndex, scmBlockIndex* +| Return Value: *H_Success, H_Parameter, H_P2, H_NotFound* + +Given a guest physical address return which DRC Index and SCM block is mapped +to that address. + +**H_SCM_UNBIND_ALL** + +| Input: *scmTargetScope, drcIndex* +| Out: *None* +| Return Value: *H_Success, H_Parameter, H_P2, H_P3, H_In_Use, H_Busy,* +| *H_LongBusyOrder1mSec, H_LongBusyOrder10mSec* + +Depending on the Target scope unmap all SCM blocks belonging to all NVDIMMs +or all SCM blocks belonging to a single NVDIMM identified by its drcIndex +from the LPAR memory. + +**H_SCM_HEALTH** + +| Input: drcIndex +| Out: *health-bitmap, health-bit-valid-bitmap* +| Return Value: *H_Success, H_Parameter, H_Hardware* + +Given a DRC Index return the info on predictive failure and overall health of +the NVDIMM. The asserted bits in the health-bitmap indicate a single predictive +failure and health-bit-valid-bitmap indicate which bits in health-bitmap are +valid. + +**H_SCM_PERFORMANCE_STATS** + +| Input: drcIndex, resultBuffer Addr +| Out: None +| Return Value: *H_Success, H_Parameter, H_Unsupported, H_Hardware, H_Authority, H_Privilege* + +Given a DRC Index collect the performance statistics for NVDIMM and copy them +to the resultBuffer. + +References +========== +.. [1] "Power Architecture Platform Reference" + https://en.wikipedia.org/wiki/Power_Architecture_Platform_Reference +.. [2] "Linux on Power Architecture Platform Reference" + https://members.openpowerfoundation.org/document/dl/469 +.. [3] "Definitions and Notation" Book III-Section 14.5.3 + https://openpowerfoundation.org/?resource_lib=power-isa-version-3-0 +.. [4] arch/powerpc/include/asm/hvcall.h +.. [5] "64-Bit ELF V2 ABI Specification: Power Architecture" + https://openpowerfoundation.org/?resource_lib=64-bit-elf-v2-abi-specification-power-architecture diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index b049c0bfa883..ffc15f4f079d 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1408,22 +1408,9 @@ EXC_VIRT_NONE(0x4b00, 0x100) * * Call convention: * - * syscall register convention is in Documentation/powerpc/syscall64-abi.rst - * - * For hypercalls, the register convention is as follows: - * r0 volatile - * r1-2 nonvolatile - * r3 volatile parameter and return value for status - * r4-r10 volatile input and output value - * r11 volatile hypercall number and output value - * r12 volatile input and output value - * r13-r31 nonvolatile - * LR nonvolatile - * CTR volatile - * XER volatile - * CR0-1 CR5-7 volatile - * CR2-4 nonvolatile - * Other registers nonvolatile + * syscall and hypercalls register conventions are documented in + * Documentation/powerpc/syscall64-abi.rst and + * Documentation/powerpc/papr_hcalls.rst respectively. * * The intersection of volatile registers that don't contain possible * inputs is: cr0, xer, ctr. We may use these as scratch regs upon entry From 76be4414be4a0d17e29e2337167bf976533149cd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 28 Jan 2020 18:22:25 -0800 Subject: [PATCH 119/131] powerpc: indent to improve Kconfig readability Indent a Kconfig continuation line to improve readability. Signed-off-by: Randy Dunlap Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/ff8729c1-3a4b-c720-48ba-a1a42b0ef892@infradead.org --- arch/powerpc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 62752c3bfabf..4e1a6e2f99e0 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -483,7 +483,7 @@ config MPROFILE_KERNEL config HOTPLUG_CPU bool "Support for enabling/disabling CPUs" depends on SMP && (PPC_PSERIES || \ - PPC_PMAC || PPC_POWERNV || FSL_SOC_BOOKE) + PPC_PMAC || PPC_POWERNV || FSL_SOC_BOOKE) help Say Y here to be able to disable and re-enable individual CPUs at runtime on SMP machines. From 41196224883a64e56e0ef237c19eb837058df071 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 29 Jan 2020 12:34:36 +0000 Subject: [PATCH 120/131] powerpc/32s: Fix kasan_early_hash_table() for CONFIG_VMAP_STACK On book3s/32 CPUs that are handling MMU through a hash table, MMU_init_hw() function was adapted for VMAP_STACK in order to handle virtual addresses instead of physical addresses in the low level hash functions. When using KASAN, the same adaptations are required for the early hash table set up by kasan_early_hash_table() function. Fixes: cd08f109e262 ("powerpc/32s: Enable CONFIG_VMAP_STACK") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/fc8390a33c2a470105f01abbcbdc7916c30c0a54.1580301269.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/kasan/kasan_init_32.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index d3cacd462560..16dd95bd0749 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -185,8 +185,11 @@ u8 __initdata early_hash[256 << 10] __aligned(256 << 10) = {0}; static void __init kasan_early_hash_table(void) { - modify_instruction_site(&patch__hash_page_A0, 0xffff, __pa(early_hash) >> 16); - modify_instruction_site(&patch__flush_hash_A0, 0xffff, __pa(early_hash) >> 16); + unsigned int hash = IS_ENABLED(CONFIG_VMAP_STACK) ? (unsigned int)early_hash : + __pa(early_hash); + + modify_instruction_site(&patch__hash_page_A0, 0xffff, hash >> 16); + modify_instruction_site(&patch__flush_hash_A0, 0xffff, hash >> 16); Hash = (struct hash_pte *)early_hash; } From 43e76cd368fbb67e767da5363ffeaa3989993c8c Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Sat, 18 Jan 2020 12:03:35 -0500 Subject: [PATCH 121/131] powerpc: Do not consider weak unresolved symbol relocations as bad Commit 8580ac9404f6 ("bpf: Process in-kernel BTF") introduced two weak symbols that may be unresolved at link time which result in an absolute relocation to 0. relocs_check.sh emits the following warning: "WARNING: 2 bad relocations c000000001a41478 R_PPC64_ADDR64 _binary__btf_vmlinux_bin_start c000000001a41480 R_PPC64_ADDR64 _binary__btf_vmlinux_bin_end" whereas those relocations are legitimate even for a relocatable kernel compiled with -pie option. relocs_check.sh already excluded some weak unresolved symbols explicitly: remove those hardcoded symbols and add some logic that parses the symbols using nm, retrieves all the weak unresolved symbols and excludes those from the list of the potential bad relocations. Reported-by: Stephen Rothwell Signed-off-by: Alexandre Ghiti Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200118170335.21440-1-alex@ghiti.fr --- arch/powerpc/Makefile.postlink | 4 ++-- arch/powerpc/tools/relocs_check.sh | 20 ++++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/Makefile.postlink b/arch/powerpc/Makefile.postlink index 134f12f89b92..2268396ff4bb 100644 --- a/arch/powerpc/Makefile.postlink +++ b/arch/powerpc/Makefile.postlink @@ -17,11 +17,11 @@ quiet_cmd_head_check = CHKHEAD $@ quiet_cmd_relocs_check = CHKREL $@ ifdef CONFIG_PPC_BOOK3S_64 cmd_relocs_check = \ - $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$@" ; \ + $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$(NM)" "$@" ; \ $(BASH) $(srctree)/arch/powerpc/tools/unrel_branch_check.sh "$(OBJDUMP)" "$@" else cmd_relocs_check = \ - $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$@" + $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$(NM)" "$@" endif # `@true` prevents complaint when there is nothing to be done diff --git a/arch/powerpc/tools/relocs_check.sh b/arch/powerpc/tools/relocs_check.sh index 7b9fe0a567cf..014e00e74d2b 100755 --- a/arch/powerpc/tools/relocs_check.sh +++ b/arch/powerpc/tools/relocs_check.sh @@ -10,14 +10,21 @@ # based on relocs_check.pl # Copyright © 2009 IBM Corporation -if [ $# -lt 2 ]; then - echo "$0 [path to objdump] [path to vmlinux]" 1>&2 +if [ $# -lt 3 ]; then + echo "$0 [path to objdump] [path to nm] [path to vmlinux]" 1>&2 exit 1 fi -# Have Kbuild supply the path to objdump so we handle cross compilation. +# Have Kbuild supply the path to objdump and nm so we handle cross compilation. objdump="$1" -vmlinux="$2" +nm="$2" +vmlinux="$3" + +# Remove from the bad relocations those that match an undefined weak symbol +# which will result in an absolute relocation to 0. +# Weak unresolved symbols are of that form in nm output: +# " w _binary__btf_vmlinux_bin_end" +undef_weak_symbols=$($nm "$vmlinux" | awk '$1 ~ /w/ { print $2 }') bad_relocs=$( $objdump -R "$vmlinux" | @@ -26,8 +33,6 @@ $objdump -R "$vmlinux" | # These relocations are okay # On PPC64: # R_PPC64_RELATIVE, R_PPC64_NONE - # R_PPC64_ADDR64 mach_ - # R_PPC64_ADDR64 __crc_ # On PPC: # R_PPC_RELATIVE, R_PPC_ADDR16_HI, # R_PPC_ADDR16_HA,R_PPC_ADDR16_LO, @@ -39,8 +44,7 @@ R_PPC_ADDR16_HI R_PPC_ADDR16_HA R_PPC_RELATIVE R_PPC_NONE' | - grep -E -v '\ Date: Tue, 21 Jan 2020 15:29:51 +1100 Subject: [PATCH 122/131] powerpc/configs: Drop CONFIG_QLGE which moved to staging The QLGE driver moved to staging in commit 955315b0dc8c ("qlge: Move drivers/net/ethernet/qlogic/qlge/ to drivers/staging/qlge/"), meaning our defconfigs that enable it have no effect as we don't enable CONFIG_STAGING. It sounds like the device is obsolete, so drop the driver. Signed-off-by: Michael Ellerman Acked-by: Joel Stanley Link: https://lore.kernel.org/r/20200121043000.16212-1-mpe@ellerman.id.au --- arch/powerpc/configs/powernv_defconfig | 1 - arch/powerpc/configs/ppc64_defconfig | 1 - arch/powerpc/configs/ppc6xx_defconfig | 1 - arch/powerpc/configs/pseries_defconfig | 1 - arch/powerpc/configs/skiroot_defconfig | 1 - 5 files changed, 5 deletions(-) diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index 32841456a573..71749377d164 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -181,7 +181,6 @@ CONFIG_MLX5_FPGA=y CONFIG_MLX5_CORE_EN=y CONFIG_MLX5_CORE_IPOIB=y CONFIG_MYRI10GE=m -CONFIG_QLGE=m CONFIG_NETXEN_NIC=m CONFIG_USB_NET_DRIVERS=m # CONFIG_WLAN is not set diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index b250e6f5a7ca..7e68cb222c7b 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -189,7 +189,6 @@ CONFIG_MLX4_EN=m CONFIG_MYRI10GE=m CONFIG_S2IO=m CONFIG_PASEMI_MAC=y -CONFIG_QLGE=m CONFIG_NETXEN_NIC=m CONFIG_SUNGEM=y CONFIG_GELIC_NET=m diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index 7e28919041cf..3e2f44f38ac5 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -507,7 +507,6 @@ CONFIG_FORCEDETH=m CONFIG_HAMACHI=m CONFIG_YELLOWFIN=m CONFIG_QLA3XXX=m -CONFIG_QLGE=m CONFIG_NETXEN_NIC=m CONFIG_8139CP=m CONFIG_8139TOO=m diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 26126b4d4de3..6b68109e248f 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -169,7 +169,6 @@ CONFIG_IXGBE=m CONFIG_I40E=m CONFIG_MLX4_EN=m CONFIG_MYRI10GE=m -CONFIG_QLGE=m CONFIG_NETXEN_NIC=m CONFIG_PPP=m CONFIG_PPP_BSDCOMP=m diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 069f67f12731..7ff1ff1ddc28 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -171,7 +171,6 @@ CONFIG_MYRI10GE=m # CONFIG_NET_VENDOR_NVIDIA is not set # CONFIG_NET_VENDOR_OKI is not set # CONFIG_NET_VENDOR_PACKET_ENGINES is not set -CONFIG_QLGE=m CONFIG_NETXEN_NIC=m CONFIG_QED=m CONFIG_QEDE=m From f3e96a71aaa9aa84fd40f93c417d3da917b8f13d Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 21 Jan 2020 15:29:52 +1100 Subject: [PATCH 123/131] powerpc/configs: NET_CADENCE became NET_VENDOR_CADENCE The NET_CADENCE symbol was renamed to NET_VENDOR_CADENCE, so we don't need to disable the former, see commit 0df5f81c481e ("net: ethernet: Add missing VENDOR to Cadence and Packet Engines symbols"). Signed-off-by: Michael Ellerman Acked-by: Joel Stanley Link: https://lore.kernel.org/r/20200121043000.16212-2-mpe@ellerman.id.au --- arch/powerpc/configs/skiroot_defconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 7ff1ff1ddc28..eaaffe9ae8b9 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -138,7 +138,6 @@ CONFIG_TIGON3=m CONFIG_BNX2X=m # CONFIG_NET_VENDOR_BROCADE is not set # CONFIG_NET_VENDOR_CADENCE is not set -# CONFIG_NET_CADENCE is not set # CONFIG_NET_VENDOR_CAVIUM is not set CONFIG_CHELSIO_T1=m # CONFIG_NET_VENDOR_CISCO is not set From 7115bf789c4b907e4b82dc0a2965b4ee668c545d Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 21 Jan 2020 15:29:53 +1100 Subject: [PATCH 124/131] powerpc/configs: Drop NET_VENDOR_HP which moved to staging The HP network driver moved to staging in commit 52340b82cf1a ("hp100: Move 100BaseVG AnyLAN driver to staging") meaning we don't need to disable it any more in our defconfigs. Signed-off-by: Michael Ellerman Acked-by: Joel Stanley Link: https://lore.kernel.org/r/20200121043000.16212-3-mpe@ellerman.id.au --- arch/powerpc/configs/44x/akebono_defconfig | 1 - arch/powerpc/configs/skiroot_defconfig | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/powerpc/configs/44x/akebono_defconfig b/arch/powerpc/configs/44x/akebono_defconfig index f0c8a07cc274..7705a5c3f4ea 100644 --- a/arch/powerpc/configs/44x/akebono_defconfig +++ b/arch/powerpc/configs/44x/akebono_defconfig @@ -59,7 +59,6 @@ CONFIG_BLK_DEV_SD=y # CONFIG_NET_VENDOR_DLINK is not set # CONFIG_NET_VENDOR_EMULEX is not set # CONFIG_NET_VENDOR_EXAR is not set -# CONFIG_NET_VENDOR_HP is not set CONFIG_IBM_EMAC=y # CONFIG_NET_VENDOR_MARVELL is not set # CONFIG_NET_VENDOR_MELLANOX is not set diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index eaaffe9ae8b9..3eee39c50941 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -146,7 +146,6 @@ CONFIG_CHELSIO_T1=m # CONFIG_NET_VENDOR_DLINK is not set CONFIG_BE2NET=m # CONFIG_NET_VENDOR_EZCHIP is not set -# CONFIG_NET_VENDOR_HP is not set # CONFIG_NET_VENDOR_HUAWEI is not set CONFIG_E1000=m CONFIG_E1000E=m From 028fb6ded72f89b090a455656bb2ce091fbf996f Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 21 Jan 2020 15:29:54 +1100 Subject: [PATCH 125/131] powerpc/configs/skiroot: Drop HID_LOGITECH Commit bdd08fff4915 ("HID: logitech: Add depends on LEDS_CLASS to Logitech Kconfig entry") made HID_LOGITECH depend on LEDS_CLASS which we do not enable, meaning we are not actually enabling those drivers any more. The Kconfig help text suggests USB HID compliant Logictech devices will continue to work without HID_LOGITECH, so just drop it. Signed-off-by: Michael Ellerman Acked-by: Joel Stanley Link: https://lore.kernel.org/r/20200121043000.16212-4-mpe@ellerman.id.au --- arch/powerpc/configs/skiroot_defconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 3eee39c50941..74cffb854c0f 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -235,7 +235,6 @@ CONFIG_HID_CYPRESS=y CONFIG_HID_EZKEY=y CONFIG_HID_ITE=y CONFIG_HID_KENSINGTON=y -CONFIG_HID_LOGITECH=y CONFIG_HID_MICROSOFT=y CONFIG_HID_MONTEREY=y CONFIG_USB_HIDDEV=y From 81881e0998bcdf4dd151fa7162ad9f9fd00e0fe3 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 21 Jan 2020 15:29:55 +1100 Subject: [PATCH 126/131] powerpc/configs/skiroot: Drop default n CONFIG_CRYPTO_ECHAINIV It's default n so we don't need to disable it. Signed-off-by: Michael Ellerman Acked-by: Joel Stanley Link: https://lore.kernel.org/r/20200121043000.16212-5-mpe@ellerman.id.au --- arch/powerpc/configs/skiroot_defconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 74cffb854c0f..0aa060eef06c 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -293,5 +293,4 @@ CONFIG_WQ_WATCHDOG=y CONFIG_XMON=y CONFIG_XMON_DEFAULT=y CONFIG_ENCRYPTED_KEYS=y -# CONFIG_CRYPTO_ECHAINIV is not set # CONFIG_CRYPTO_HW is not set From cdc7b23f1e90983cb5fb0c9a0a5e6e08b5d71563 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 21 Jan 2020 15:29:56 +1100 Subject: [PATCH 127/131] powerpc/configs/skiroot: Update for symbol movement only Signed-off-by: Michael Ellerman Acked-by: Joel Stanley Link: https://lore.kernel.org/r/20200121043000.16212-6-mpe@ellerman.id.au --- arch/powerpc/configs/skiroot_defconfig | 42 +++++++++++++------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 0aa060eef06c..24a210fe0049 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -1,8 +1,3 @@ -CONFIG_PPC64=y -CONFIG_ALTIVEC=y -CONFIG_VSX=y -CONFIG_NR_CPUS=2048 -CONFIG_CPU_LITTLE_ENDIAN=y CONFIG_KERNEL_XZ=y # CONFIG_SWAP is not set CONFIG_SYSVIPC=y @@ -29,16 +24,11 @@ CONFIG_EXPERT=y CONFIG_PERF_EVENTS=y # CONFIG_COMPAT_BRK is not set CONFIG_SLAB_FREELIST_HARDENED=y -CONFIG_JUMP_LABEL=y -CONFIG_STRICT_KERNEL_RWX=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_SIG=y -CONFIG_MODULE_SIG_FORCE=y -CONFIG_MODULE_SIG_SHA512=y -CONFIG_PARTITION_ADVANCED=y -# CONFIG_MQ_IOSCHED_DEADLINE is not set -# CONFIG_MQ_IOSCHED_KYBER is not set +CONFIG_PPC64=y +CONFIG_ALTIVEC=y +CONFIG_VSX=y +CONFIG_NR_CPUS=2048 +CONFIG_CPU_LITTLE_ENDIAN=y # CONFIG_PPC_VAS is not set # CONFIG_PPC_PSERIES is not set # CONFIG_PPC_OF_BOOT_TRAMPOLINE is not set @@ -49,14 +39,24 @@ CONFIG_KEXEC=y CONFIG_PRESERVE_FA_DUMP=y CONFIG_IRQ_ALL_CPUS=y CONFIG_NUMA=y -# CONFIG_COMPACTION is not set -# CONFIG_MIGRATION is not set CONFIG_PPC_64K_PAGES=y CONFIG_SCHED_SMT=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE="console=tty0 console=hvc0 ipr.fast_reboot=1 quiet" # CONFIG_SECCOMP is not set # CONFIG_PPC_MEM_KEYS is not set +CONFIG_JUMP_LABEL=y +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_SIG=y +CONFIG_MODULE_SIG_FORCE=y +CONFIG_MODULE_SIG_SHA512=y +CONFIG_PARTITION_ADVANCED=y +# CONFIG_MQ_IOSCHED_DEADLINE is not set +# CONFIG_MQ_IOSCHED_KYBER is not set +# CONFIG_COMPACTION is not set +# CONFIG_MIGRATION is not set CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -153,7 +153,6 @@ CONFIG_IGB=m CONFIG_IXGB=m CONFIG_IXGBE=m CONFIG_I40E=m -CONFIG_S2IO=m # CONFIG_NET_VENDOR_MARVELL is not set CONFIG_MLX4_EN=m # CONFIG_MLX4_CORE_GEN2 is not set @@ -164,6 +163,7 @@ CONFIG_MLX5_CORE_EN=y # CONFIG_NET_VENDOR_MICROSEMI is not set CONFIG_MYRI10GE=m # CONFIG_NET_VENDOR_NATSEMI is not set +CONFIG_S2IO=m # CONFIG_NET_VENDOR_NETRONOME is not set # CONFIG_NET_VENDOR_NI is not set # CONFIG_NET_VENDOR_NVIDIA is not set @@ -271,6 +271,8 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ASCII=y CONFIG_NLS_ISO8859_1=y CONFIG_NLS_UTF8=y +CONFIG_ENCRYPTED_KEYS=y +# CONFIG_CRYPTO_HW is not set CONFIG_CRC16=y CONFIG_CRC_ITU_T=y CONFIG_LIBCRC32C=y @@ -289,8 +291,6 @@ CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y CONFIG_WQ_WATCHDOG=y # CONFIG_SCHED_DEBUG is not set # CONFIG_FTRACE is not set -# CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_XMON=y CONFIG_XMON_DEFAULT=y -CONFIG_ENCRYPTED_KEYS=y -# CONFIG_CRYPTO_HW is not set +# CONFIG_RUNTIME_TESTING_MENU is not set From 579baeece66ef9360da7d815fd328faf271a3008 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 21 Jan 2020 15:29:57 +1100 Subject: [PATCH 128/131] powerpc/configs/skiroot: Enable security features This turns on HARDENED_USERCOPY with HARDENED_USERCOPY_PAGESPAN, and FORTIFY_SOURCE. It also enables SECURITY_LOCKDOWN_LSM with _EARLY and LOCK_DOWN_KERNEL_FORCE_INTEGRITY options enabled. This still allows xmon to be used in read-only mode. MODULE_SIG is selected by lockdown, so it is still enabled. Because we're setting LOCK_DOWN_KERNELFORCE_INTEGRITY=y we also need to enable KEXEC_FILE=y so that kexec continues to work. Signed-off-by: Joel Stanley [mpe: Switch to lockdown integrity mode per oohal, enable KEXEC_FILE as reported by jms] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200121043000.16212-7-mpe@ellerman.id.au --- arch/powerpc/configs/skiroot_defconfig | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 24a210fe0049..715d29a43372 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -36,6 +36,7 @@ CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_IDLE=y CONFIG_HZ_100=y CONFIG_KEXEC=y +CONFIG_KEXEC_FILE=y CONFIG_PRESERVE_FA_DUMP=y CONFIG_IRQ_ALL_CPUS=y CONFIG_NUMA=y @@ -49,7 +50,6 @@ CONFIG_JUMP_LABEL=y CONFIG_STRICT_KERNEL_RWX=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_SIG=y CONFIG_MODULE_SIG_FORCE=y CONFIG_MODULE_SIG_SHA512=y CONFIG_PARTITION_ADVANCED=y @@ -272,6 +272,16 @@ CONFIG_NLS_ASCII=y CONFIG_NLS_ISO8859_1=y CONFIG_NLS_UTF8=y CONFIG_ENCRYPTED_KEYS=y +CONFIG_SECURITY=y +CONFIG_HARDENED_USERCOPY=y +# CONFIG_HARDENED_USERCOPY_FALLBACK is not set +CONFIG_HARDENED_USERCOPY_PAGESPAN=y +CONFIG_FORTIFY_SOURCE=y +CONFIG_SECURITY_LOCKDOWN_LSM=y +CONFIG_SECURITY_LOCKDOWN_LSM_EARLY=y +CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY=y +# CONFIG_INTEGRITY is not set +CONFIG_LSM="yama,loadpin,safesetid,integrity" # CONFIG_CRYPTO_HW is not set CONFIG_CRC16=y CONFIG_CRC_ITU_T=y From 3554c12d835d16824afb0abaa6689bd89d26a009 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 21 Jan 2020 15:29:58 +1100 Subject: [PATCH 129/131] powerpc/configs/skiroot: Disable xmon default & enable reboot on panic If the skiroot kernel crashes we don't want it sitting at an xmon prompt forever. Instead it's more helpful to reboot and bring the boot loader back up, and if the crash was transient we can then boot successfully. Similarly if we panic we should reboot, with a short timeout in case someone is watching the console. Signed-off-by: Michael Ellerman Acked-by: Joel Stanley Link: https://lore.kernel.org/r/20200121043000.16212-8-mpe@ellerman.id.au --- arch/powerpc/configs/skiroot_defconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 715d29a43372..b05769d05fd6 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -29,6 +29,7 @@ CONFIG_ALTIVEC=y CONFIG_VSX=y CONFIG_NR_CPUS=2048 CONFIG_CPU_LITTLE_ENDIAN=y +CONFIG_PANIC_TIMEOUT=30 # CONFIG_PPC_VAS is not set # CONFIG_PPC_PSERIES is not set # CONFIG_PPC_OF_BOOT_TRAMPOLINE is not set @@ -294,6 +295,7 @@ CONFIG_LIBCRC32C=y CONFIG_PRINTK_TIME=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_STACKOVERFLOW=y +CONFIG_PANIC_ON_OOPS=y CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y CONFIG_HARDLOCKUP_DETECTOR=y @@ -302,5 +304,4 @@ CONFIG_WQ_WATCHDOG=y # CONFIG_SCHED_DEBUG is not set # CONFIG_FTRACE is not set CONFIG_XMON=y -CONFIG_XMON_DEFAULT=y # CONFIG_RUNTIME_TESTING_MENU is not set From 5e84dd547bce965c203cae71418cacb8592a0d3f Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 21 Jan 2020 15:29:59 +1100 Subject: [PATCH 130/131] powerpc/configs/skiroot: Enable some more hardening options Enable more hardening options. Note BUG_ON_DATA_CORRUPTION selects DEBUG_LIST and is essentially just a synonym for it. DEBUG_SG, DEBUG_NOTIFIERS, DEBUG_LIST, DEBUG_CREDENTIALS and SCHED_STACK_END_CHECK should all be low overhead and just add a few extra checks. SLAB_FREELIST_RANDOM, and SLUB_DEBUG_ON will add some overhead to the SLAB allocator, but nothing that should be meaningful for skiroot. Unselecting SLAB_MERGE_DEFAULT causes the SLAB to use more memory, but the skiroot kernel shouldn't be memory constrained on any of our systems, all it does is run a small bootloader. Disabling merging has some security/robustness benefit as it means a user-after-free or overflow will be limited to the objects in that slab, rather than potentially affecting objects from unrelated slabs that have been merged. Note also that slab merging is disabled anyway by enabling SLUB_DEBUG_ON, because of the SLAB_NEVER_MERGE mask. Signed-off-by: Michael Ellerman Acked-by: Joel Stanley Link: https://lore.kernel.org/r/20200121043000.16212-9-mpe@ellerman.id.au --- arch/powerpc/configs/skiroot_defconfig | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index b05769d05fd6..1b6bdad36b13 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -23,6 +23,8 @@ CONFIG_EXPERT=y # CONFIG_AIO is not set CONFIG_PERF_EVENTS=y # CONFIG_COMPAT_BRK is not set +# CONFIG_SLAB_MERGE_DEFAULT is not set +CONFIG_SLAB_FREELIST_RANDOM=y CONFIG_SLAB_FREELIST_HARDENED=y CONFIG_PPC64=y CONFIG_ALTIVEC=y @@ -294,6 +296,8 @@ CONFIG_LIBCRC32C=y # CONFIG_XZ_DEC_SPARC is not set CONFIG_PRINTK_TIME=y CONFIG_MAGIC_SYSRQ=y +CONFIG_SLUB_DEBUG_ON=y +CONFIG_SCHED_STACK_END_CHECK=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_PANIC_ON_OOPS=y CONFIG_SOFTLOCKUP_DETECTOR=y @@ -302,6 +306,10 @@ CONFIG_HARDLOCKUP_DETECTOR=y CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y CONFIG_WQ_WATCHDOG=y # CONFIG_SCHED_DEBUG is not set +CONFIG_DEBUG_SG=y +CONFIG_DEBUG_NOTIFIERS=y +CONFIG_BUG_ON_DATA_CORRUPTION=y +CONFIG_DEBUG_CREDENTIALS=y # CONFIG_FTRACE is not set CONFIG_XMON=y # CONFIG_RUNTIME_TESTING_MENU is not set From 34b5a946a9543ce38d8ad1aacc4362533a813db7 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 30 Jan 2020 20:52:23 +0100 Subject: [PATCH 131/131] powerpc: configs: Cleanup old Kconfig options CONFIG_ENABLE_WARN_DEPRECATED is gone since commit 771c035372a0 ("deprecate the '__deprecated' attribute warnings entirely and for good"). CONFIG_IOSCHED_DEADLINE and CONFIG_IOSCHED_CFQ are gone since commit f382fb0bcef4 ("block: remove legacy IO schedulers"). The IOSCHED_DEADLINE was replaced by MQ_IOSCHED_DEADLINE and it will be now enabled by default (along with MQ_IOSCHED_KYBER). Signed-off-by: Krzysztof Kozlowski Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200130195223.3843-1-krzk@kernel.org --- arch/powerpc/configs/44x/sam440ep_defconfig | 2 -- arch/powerpc/configs/52xx/pcm030_defconfig | 2 -- arch/powerpc/configs/83xx/kmeter1_defconfig | 2 -- arch/powerpc/configs/adder875_defconfig | 1 - arch/powerpc/configs/ep8248e_defconfig | 1 - arch/powerpc/configs/ep88xc_defconfig | 1 - arch/powerpc/configs/mgcoge_defconfig | 1 - arch/powerpc/configs/mpc512x_defconfig | 1 - arch/powerpc/configs/mpc885_ads_defconfig | 1 - arch/powerpc/configs/storcenter_defconfig | 1 - arch/powerpc/configs/tqm8xx_defconfig | 1 - 11 files changed, 14 deletions(-) diff --git a/arch/powerpc/configs/44x/sam440ep_defconfig b/arch/powerpc/configs/44x/sam440ep_defconfig index ed02f12dbd54..22dc0dadf576 100644 --- a/arch/powerpc/configs/44x/sam440ep_defconfig +++ b/arch/powerpc/configs/44x/sam440ep_defconfig @@ -10,8 +10,6 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set CONFIG_PARTITION_ADVANCED=y CONFIG_AMIGA_PARTITION=y -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set # CONFIG_EBONY is not set CONFIG_SAM440EP=y CONFIG_CMDLINE_BOOL=y diff --git a/arch/powerpc/configs/52xx/pcm030_defconfig b/arch/powerpc/configs/52xx/pcm030_defconfig index fdb11daeb688..789622ffd844 100644 --- a/arch/powerpc/configs/52xx/pcm030_defconfig +++ b/arch/powerpc/configs/52xx/pcm030_defconfig @@ -14,8 +14,6 @@ CONFIG_SLAB=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set # CONFIG_PPC_CHRP is not set CONFIG_PPC_MPC52xx=y CONFIG_PPC_MPC5200_SIMPLE=y diff --git a/arch/powerpc/configs/83xx/kmeter1_defconfig b/arch/powerpc/configs/83xx/kmeter1_defconfig index 648c6b3dccf9..24bf1bde1bb4 100644 --- a/arch/powerpc/configs/83xx/kmeter1_defconfig +++ b/arch/powerpc/configs/83xx/kmeter1_defconfig @@ -11,8 +11,6 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set CONFIG_PARTITION_ADVANCED=y # CONFIG_MSDOS_PARTITION is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set # CONFIG_PPC_CHRP is not set # CONFIG_PPC_PMAC is not set CONFIG_PPC_83xx=y diff --git a/arch/powerpc/configs/adder875_defconfig b/arch/powerpc/configs/adder875_defconfig index 510f7fd1f6a3..f55e23cb176c 100644 --- a/arch/powerpc/configs/adder875_defconfig +++ b/arch/powerpc/configs/adder875_defconfig @@ -9,7 +9,6 @@ CONFIG_EXPERT=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_BLK_DEV_BSG is not set CONFIG_PARTITION_ADVANCED=y -# CONFIG_IOSCHED_CFQ is not set CONFIG_PPC_ADDER875=y CONFIG_8xx_COPYBACK=y CONFIG_GEN_RTC=y diff --git a/arch/powerpc/configs/ep8248e_defconfig b/arch/powerpc/configs/ep8248e_defconfig index 6e08d9502d89..00d69965f898 100644 --- a/arch/powerpc/configs/ep8248e_defconfig +++ b/arch/powerpc/configs/ep8248e_defconfig @@ -6,7 +6,6 @@ CONFIG_EXPERT=y CONFIG_KALLSYMS_ALL=y CONFIG_SLAB=y CONFIG_PARTITION_ADVANCED=y -# CONFIG_IOSCHED_CFQ is not set # CONFIG_PPC_CHRP is not set # CONFIG_PPC_PMAC is not set CONFIG_PPC_82xx=y diff --git a/arch/powerpc/configs/ep88xc_defconfig b/arch/powerpc/configs/ep88xc_defconfig index 9c1bf60f1e19..0e2e5e81a359 100644 --- a/arch/powerpc/configs/ep88xc_defconfig +++ b/arch/powerpc/configs/ep88xc_defconfig @@ -11,7 +11,6 @@ CONFIG_EXPERT=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_BLK_DEV_BSG is not set CONFIG_PARTITION_ADVANCED=y -# CONFIG_IOSCHED_CFQ is not set CONFIG_PPC_EP88XC=y CONFIG_8xx_COPYBACK=y CONFIG_GEN_RTC=y diff --git a/arch/powerpc/configs/mgcoge_defconfig b/arch/powerpc/configs/mgcoge_defconfig index 6ce4f206eac7..dcc8dccf54f3 100644 --- a/arch/powerpc/configs/mgcoge_defconfig +++ b/arch/powerpc/configs/mgcoge_defconfig @@ -12,7 +12,6 @@ CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_SLAB=y CONFIG_PARTITION_ADVANCED=y -# CONFIG_IOSCHED_CFQ is not set # CONFIG_PPC_PMAC is not set CONFIG_PPC_82xx=y CONFIG_MGCOGE=y diff --git a/arch/powerpc/configs/mpc512x_defconfig b/arch/powerpc/configs/mpc512x_defconfig index 1f3a045ab081..e39346b3dc3b 100644 --- a/arch/powerpc/configs/mpc512x_defconfig +++ b/arch/powerpc/configs/mpc512x_defconfig @@ -9,7 +9,6 @@ CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set CONFIG_PARTITION_ADVANCED=y -# CONFIG_IOSCHED_CFQ is not set # CONFIG_PPC_CHRP is not set CONFIG_PPC_MPC512x=y CONFIG_MPC512x_LPBFIFO=y diff --git a/arch/powerpc/configs/mpc885_ads_defconfig b/arch/powerpc/configs/mpc885_ads_defconfig index 0327a329316f..82a008c04eae 100644 --- a/arch/powerpc/configs/mpc885_ads_defconfig +++ b/arch/powerpc/configs/mpc885_ads_defconfig @@ -11,7 +11,6 @@ CONFIG_EXPERT=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_BLK_DEV_BSG is not set CONFIG_PARTITION_ADVANCED=y -# CONFIG_IOSCHED_CFQ is not set CONFIG_8xx_COPYBACK=y CONFIG_GEN_RTC=y CONFIG_HZ_100=y diff --git a/arch/powerpc/configs/storcenter_defconfig b/arch/powerpc/configs/storcenter_defconfig index 29b19ec7e5d7..b964084e4056 100644 --- a/arch/powerpc/configs/storcenter_defconfig +++ b/arch/powerpc/configs/storcenter_defconfig @@ -77,5 +77,4 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_NLS_UTF8=y CONFIG_CRC_T10DIF=y -# CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set diff --git a/arch/powerpc/configs/tqm8xx_defconfig b/arch/powerpc/configs/tqm8xx_defconfig index ffed2b4256d6..eda8bfb2d0a3 100644 --- a/arch/powerpc/configs/tqm8xx_defconfig +++ b/arch/powerpc/configs/tqm8xx_defconfig @@ -14,7 +14,6 @@ CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_SRCVERSION_ALL=y # CONFIG_BLK_DEV_BSG is not set CONFIG_PARTITION_ADVANCED=y -# CONFIG_IOSCHED_CFQ is not set CONFIG_TQM8XX=y CONFIG_8xx_COPYBACK=y # CONFIG_8xx_CPU15 is not set