2dd57d3415
Patch series "device-dax: Support sub-dividing soft-reserved ranges", v5. The device-dax facility allows an address range to be directly mapped through a chardev, or optionally hotplugged to the core kernel page allocator as System-RAM. It is the mechanism for converting persistent memory (pmem) to be used as another volatile memory pool i.e. the current Memory Tiering hot topic on linux-mm. In the case of pmem the nvdimm-namespace-label mechanism can sub-divide it, but that labeling mechanism is not available / applicable to soft-reserved ("EFI specific purpose") memory [3]. This series provides a sysfs-mechanism for the daxctl utility to enable provisioning of volatile-soft-reserved memory ranges. The motivations for this facility are: 1/ Allow performance differentiated memory ranges to be split between kernel-managed and directly-accessed use cases. 2/ Allow physical memory to be provisioned along performance relevant address boundaries. For example, divide a memory-side cache [4] along cache-color boundaries. 3/ Parcel out soft-reserved memory to VMs using device-dax as a security / permissions boundary [5]. Specifically I have seen people (ab)using memmap=nn!ss (mark System-RAM as Persistent Memory) just to get the device-dax interface on custom address ranges. A follow-on for the VM use case is to teach device-dax to dynamically allocate 'struct page' at runtime to reduce the duplication of 'struct page' space in both the guest and the host kernel for the same physical pages. 
[2]: http://lore.kernel.org/r/20200713160837.13774-11-joao.m.martins@oracle.com [3]: http://lore.kernel.org/r/157309097008.1579826.12818463304589384434.stgit@dwillia2-desk3.amr.corp.intel.com [4]: http://lore.kernel.org/r/154899811738.3165233.12325692939590944259.stgit@dwillia2-desk3.amr.corp.intel.com [5]: http://lore.kernel.org/r/20200110190313.17144-1-joao.m.martins@oracle.com This patch (of 23): In preparation for adding a new numa= option, clean up the existing ones to avoid ifdefs in numa_setup(), and provide feedback when the numa=fake= option is invalid due to kernel config. The same does not need to be done for numa=noacpi, since the capability is already hard disabled at compile-time. Suggested-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Ard Biesheuvel <ardb@kernel.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Ben Skeggs <bskeggs@redhat.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Brice Goglin <Brice.Goglin@inria.fr> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Daniel Vetter <daniel@ffwll.ch> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Dave Jiang <dave.jiang@intel.com> Cc: David Airlie <airlied@linux.ie> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Ira Weiny <ira.weiny@intel.com> Cc: Jason Gunthorpe <jgg@mellanox.com> Cc: Jeff Moyer <jmoyer@redhat.com> Cc: Jia He <justin.he@arm.com> Cc: Joao Martins <joao.m.martins@oracle.com> Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Mike Rapoport <rppt@linux.ibm.com> Cc: Paul Mackerras <paulus@ozlabs.org> Cc: Pavel Tatashin <pasha.tatashin@soleen.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rafael J. 
Wysocki <rafael.j.wysocki@intel.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tom Lendacky <thomas.lendacky@amd.com> Cc: Vishal Verma <vishal.l.verma@intel.com> Cc: Wei Yang <richard.weiyang@linux.alibaba.com> Cc: Will Deacon <will@kernel.org> Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: Bjorn Helgaas <bhelgaas@google.com> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> Cc: Hulk Robot <hulkci@huawei.com> Cc: Jason Yan <yanaijie@huawei.com> Cc: "Jérôme Glisse" <jglisse@redhat.com> Cc: Juergen Gross <jgross@suse.com> Cc: kernel test robot <lkp@intel.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Stefano Stabellini <sstabellini@kernel.org> Cc: Vivek Goyal <vgoyal@redhat.com> Link: https://lkml.kernel.org/r/160106109960.30709.7379926726669669398.stgit@dwillia2-desk3.amr.corp.intel.com Link: https://lkml.kernel.org/r/159643094279.4062302.17779410714418721328.stgit@dwillia2-desk3.amr.corp.intel.com Link: https://lkml.kernel.org/r/159643094925.4062302.14979872973043772305.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
447 lines
11 KiB
C
447 lines
11 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
/*
|
|
* acpi_numa.c - ACPI NUMA support
|
|
*
|
|
* Copyright (C) 2002 Takayoshi Kochi <t-kochi@bq.jp.nec.com>
|
|
*/
|
|
|
|
#define pr_fmt(fmt) "ACPI: " fmt
|
|
|
|
#include <linux/module.h>
|
|
#include <linux/init.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/types.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/acpi.h>
|
|
#include <linux/memblock.h>
|
|
#include <linux/numa.h>
|
|
#include <linux/nodemask.h>
|
|
#include <linux/topology.h>
|
|
|
|
static nodemask_t nodes_found_map = NODE_MASK_NONE;
|
|
|
|
/* maps to convert between proximity domain and logical node ID */
|
|
static int pxm_to_node_map[MAX_PXM_DOMAINS]
|
|
= { [0 ... MAX_PXM_DOMAINS - 1] = NUMA_NO_NODE };
|
|
static int node_to_pxm_map[MAX_NUMNODES]
|
|
= { [0 ... MAX_NUMNODES - 1] = PXM_INVAL };
|
|
|
|
unsigned char acpi_srat_revision __initdata;
|
|
static int acpi_numa __initdata;
|
|
|
|
/*
 * disable_srat - stop using the SRAT
 *
 * Makes acpi_numa negative, which is the condition srat_disabled()
 * reports as "disabled".
 */
void __init disable_srat(void)
{
	acpi_numa = -1;
}
|
|
|
|
int pxm_to_node(int pxm)
|
|
{
|
|
if (pxm < 0)
|
|
return NUMA_NO_NODE;
|
|
return pxm_to_node_map[pxm];
|
|
}
|
|
EXPORT_SYMBOL(pxm_to_node);
|
|
|
|
int node_to_pxm(int node)
|
|
{
|
|
if (node < 0)
|
|
return PXM_INVAL;
|
|
return node_to_pxm_map[node];
|
|
}
|
|
|
|
static void __acpi_map_pxm_to_node(int pxm, int node)
|
|
{
|
|
if (pxm_to_node_map[pxm] == NUMA_NO_NODE || node < pxm_to_node_map[pxm])
|
|
pxm_to_node_map[pxm] = node;
|
|
if (node_to_pxm_map[node] == PXM_INVAL || pxm < node_to_pxm_map[node])
|
|
node_to_pxm_map[node] = pxm;
|
|
}
|
|
|
|
int acpi_map_pxm_to_node(int pxm)
|
|
{
|
|
int node;
|
|
|
|
if (pxm < 0 || pxm >= MAX_PXM_DOMAINS || numa_off)
|
|
return NUMA_NO_NODE;
|
|
|
|
node = pxm_to_node_map[pxm];
|
|
|
|
if (node == NUMA_NO_NODE) {
|
|
if (nodes_weight(nodes_found_map) >= MAX_NUMNODES)
|
|
return NUMA_NO_NODE;
|
|
node = first_unset_node(nodes_found_map);
|
|
__acpi_map_pxm_to_node(pxm, node);
|
|
node_set(node, nodes_found_map);
|
|
}
|
|
|
|
return node;
|
|
}
|
|
EXPORT_SYMBOL(acpi_map_pxm_to_node);
|
|
|
|
static void __init
|
|
acpi_table_print_srat_entry(struct acpi_subtable_header *header)
|
|
{
|
|
switch (header->type) {
|
|
case ACPI_SRAT_TYPE_CPU_AFFINITY:
|
|
{
|
|
struct acpi_srat_cpu_affinity *p =
|
|
(struct acpi_srat_cpu_affinity *)header;
|
|
pr_debug("SRAT Processor (id[0x%02x] eid[0x%02x]) in proximity domain %d %s\n",
|
|
p->apic_id, p->local_sapic_eid,
|
|
p->proximity_domain_lo,
|
|
(p->flags & ACPI_SRAT_CPU_ENABLED) ?
|
|
"enabled" : "disabled");
|
|
}
|
|
break;
|
|
|
|
case ACPI_SRAT_TYPE_MEMORY_AFFINITY:
|
|
{
|
|
struct acpi_srat_mem_affinity *p =
|
|
(struct acpi_srat_mem_affinity *)header;
|
|
pr_debug("SRAT Memory (0x%llx length 0x%llx) in proximity domain %d %s%s%s\n",
|
|
(unsigned long long)p->base_address,
|
|
(unsigned long long)p->length,
|
|
p->proximity_domain,
|
|
(p->flags & ACPI_SRAT_MEM_ENABLED) ?
|
|
"enabled" : "disabled",
|
|
(p->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
|
|
" hot-pluggable" : "",
|
|
(p->flags & ACPI_SRAT_MEM_NON_VOLATILE) ?
|
|
" non-volatile" : "");
|
|
}
|
|
break;
|
|
|
|
case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY:
|
|
{
|
|
struct acpi_srat_x2apic_cpu_affinity *p =
|
|
(struct acpi_srat_x2apic_cpu_affinity *)header;
|
|
pr_debug("SRAT Processor (x2apicid[0x%08x]) in proximity domain %d %s\n",
|
|
p->apic_id,
|
|
p->proximity_domain,
|
|
(p->flags & ACPI_SRAT_CPU_ENABLED) ?
|
|
"enabled" : "disabled");
|
|
}
|
|
break;
|
|
|
|
case ACPI_SRAT_TYPE_GICC_AFFINITY:
|
|
{
|
|
struct acpi_srat_gicc_affinity *p =
|
|
(struct acpi_srat_gicc_affinity *)header;
|
|
pr_debug("SRAT Processor (acpi id[0x%04x]) in proximity domain %d %s\n",
|
|
p->acpi_processor_uid,
|
|
p->proximity_domain,
|
|
(p->flags & ACPI_SRAT_GICC_ENABLED) ?
|
|
"enabled" : "disabled");
|
|
}
|
|
break;
|
|
|
|
default:
|
|
pr_warn("Found unsupported SRAT entry (type = 0x%x)\n",
|
|
header->type);
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* A lot of BIOS fill in 10 (= no distance) everywhere. This messes
|
|
* up the NUMA heuristics which wants the local node to have a smaller
|
|
* distance than the others.
|
|
* Do some quick checks here and only use the SLIT if it passes.
|
|
*/
|
|
static int __init slit_valid(struct acpi_table_slit *slit)
|
|
{
|
|
int i, j;
|
|
int d = slit->locality_count;
|
|
for (i = 0; i < d; i++) {
|
|
for (j = 0; j < d; j++) {
|
|
u8 val = slit->entry[d*i + j];
|
|
if (i == j) {
|
|
if (val != LOCAL_DISTANCE)
|
|
return 0;
|
|
} else if (val <= LOCAL_DISTANCE)
|
|
return 0;
|
|
}
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
/*
 * bad_srat - give up on the SRAT
 *
 * Logs that the SRAT is not being used and then disables it via
 * disable_srat().
 */
void __init bad_srat(void)
{
	pr_err("SRAT: SRAT not used.\n");
	disable_srat();
}
|
|
|
|
/*
 * srat_disabled - report whether the SRAT has been disabled
 *
 * Returns nonzero when acpi_numa is negative (the state set by
 * disable_srat()), zero otherwise.
 */
int __init srat_disabled(void)
{
	return acpi_numa < 0;
}
|
|
|
|
#if defined(CONFIG_X86) || defined(CONFIG_ARM64)
|
|
/*
|
|
* Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for
|
|
* I/O localities since SRAT does not list them. I/O localities are
|
|
* not supported at this point.
|
|
*/
|
|
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
|
|
{
|
|
int i, j;
|
|
|
|
for (i = 0; i < slit->locality_count; i++) {
|
|
const int from_node = pxm_to_node(i);
|
|
|
|
if (from_node == NUMA_NO_NODE)
|
|
continue;
|
|
|
|
for (j = 0; j < slit->locality_count; j++) {
|
|
const int to_node = pxm_to_node(j);
|
|
|
|
if (to_node == NUMA_NO_NODE)
|
|
continue;
|
|
|
|
numa_set_distance(from_node, to_node,
|
|
slit->entry[slit->locality_count * i + j]);
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Default callback for parsing of the Proximity Domain <-> Memory
|
|
* Area mappings
|
|
*/
|
|
int __init
|
|
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
|
|
{
|
|
u64 start, end;
|
|
u32 hotpluggable;
|
|
int node, pxm;
|
|
|
|
if (srat_disabled())
|
|
goto out_err;
|
|
if (ma->header.length < sizeof(struct acpi_srat_mem_affinity)) {
|
|
pr_err("SRAT: Unexpected header length: %d\n",
|
|
ma->header.length);
|
|
goto out_err_bad_srat;
|
|
}
|
|
if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
|
|
goto out_err;
|
|
hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
|
|
if (hotpluggable && !IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
|
|
goto out_err;
|
|
|
|
start = ma->base_address;
|
|
end = start + ma->length;
|
|
pxm = ma->proximity_domain;
|
|
if (acpi_srat_revision <= 1)
|
|
pxm &= 0xff;
|
|
|
|
node = acpi_map_pxm_to_node(pxm);
|
|
if (node == NUMA_NO_NODE) {
|
|
pr_err("SRAT: Too many proximity domains.\n");
|
|
goto out_err_bad_srat;
|
|
}
|
|
|
|
if (numa_add_memblk(node, start, end) < 0) {
|
|
pr_err("SRAT: Failed to add memblk to node %u [mem %#010Lx-%#010Lx]\n",
|
|
node, (unsigned long long) start,
|
|
(unsigned long long) end - 1);
|
|
goto out_err_bad_srat;
|
|
}
|
|
|
|
node_set(node, numa_nodes_parsed);
|
|
|
|
pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n",
|
|
node, pxm,
|
|
(unsigned long long) start, (unsigned long long) end - 1,
|
|
hotpluggable ? " hotplug" : "",
|
|
ma->flags & ACPI_SRAT_MEM_NON_VOLATILE ? " non-volatile" : "");
|
|
|
|
/* Mark hotplug range in memblock. */
|
|
if (hotpluggable && memblock_mark_hotplug(start, ma->length))
|
|
pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n",
|
|
(unsigned long long)start, (unsigned long long)end - 1);
|
|
|
|
max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1));
|
|
|
|
return 0;
|
|
out_err_bad_srat:
|
|
bad_srat();
|
|
out_err:
|
|
return -EINVAL;
|
|
}
|
|
#endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */
|
|
|
|
/*
 * Table handler for the SLIT: run slit_valid() on the table and, if it
 * passes, hand it to acpi_numa_slit_init().
 *
 * Returns 0 when the table was used, -EINVAL when it was rejected.
 */
static int __init acpi_parse_slit(struct acpi_table_header *table)
{
	struct acpi_table_slit *slit = (struct acpi_table_slit *)table;

	if (slit_valid(slit)) {
		acpi_numa_slit_init(slit);
		return 0;
	}

	pr_info("SLIT table looks invalid. Not used.\n");
	return -EINVAL;
}
|
|
|
|
void __init __weak
|
|
acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
|
|
{
|
|
pr_warn("Found unsupported x2apic [0x%08x] SRAT entry\n", pa->apic_id);
|
|
}
|
|
|
|
static int __init
|
|
acpi_parse_x2apic_affinity(union acpi_subtable_headers *header,
|
|
const unsigned long end)
|
|
{
|
|
struct acpi_srat_x2apic_cpu_affinity *processor_affinity;
|
|
|
|
processor_affinity = (struct acpi_srat_x2apic_cpu_affinity *)header;
|
|
|
|
acpi_table_print_srat_entry(&header->common);
|
|
|
|
/* let architecture-dependent part to do it */
|
|
acpi_numa_x2apic_affinity_init(processor_affinity);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int __init
|
|
acpi_parse_processor_affinity(union acpi_subtable_headers *header,
|
|
const unsigned long end)
|
|
{
|
|
struct acpi_srat_cpu_affinity *processor_affinity;
|
|
|
|
processor_affinity = (struct acpi_srat_cpu_affinity *)header;
|
|
|
|
acpi_table_print_srat_entry(&header->common);
|
|
|
|
/* let architecture-dependent part to do it */
|
|
acpi_numa_processor_affinity_init(processor_affinity);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int __init
|
|
acpi_parse_gicc_affinity(union acpi_subtable_headers *header,
|
|
const unsigned long end)
|
|
{
|
|
struct acpi_srat_gicc_affinity *processor_affinity;
|
|
|
|
processor_affinity = (struct acpi_srat_gicc_affinity *)header;
|
|
|
|
acpi_table_print_srat_entry(&header->common);
|
|
|
|
/* let architecture-dependent part to do it */
|
|
acpi_numa_gicc_affinity_init(processor_affinity);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int __initdata parsed_numa_memblks;
|
|
|
|
static int __init
|
|
acpi_parse_memory_affinity(union acpi_subtable_headers * header,
|
|
const unsigned long end)
|
|
{
|
|
struct acpi_srat_mem_affinity *memory_affinity;
|
|
|
|
memory_affinity = (struct acpi_srat_mem_affinity *)header;
|
|
|
|
acpi_table_print_srat_entry(&header->common);
|
|
|
|
/* let architecture-dependent part to do it */
|
|
if (!acpi_numa_memory_affinity_init(memory_affinity))
|
|
parsed_numa_memblks++;
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Table handler for the SRAT header: remember the table revision in
 * acpi_srat_revision.  Always returns 0.
 */
static int __init acpi_parse_srat(struct acpi_table_header *table)
{
	/* Real work done in acpi_table_parse_srat below. */
	acpi_srat_revision =
		((struct acpi_table_srat *)table)->header.revision;

	return 0;
}
|
|
|
|
static int __init
|
|
acpi_table_parse_srat(enum acpi_srat_type id,
|
|
acpi_tbl_entry_handler handler, unsigned int max_entries)
|
|
{
|
|
return acpi_table_parse_entries(ACPI_SIG_SRAT,
|
|
sizeof(struct acpi_table_srat), id,
|
|
handler, max_entries);
|
|
}
|
|
|
|
/*
 * acpi_numa_init - parse the SRAT and SLIT
 *
 * Returns -EINVAL when ACPI is disabled, the negative result of the
 * memory affinity parse if that failed, -ENOENT when no memory
 * affinity entry was accepted, and 0 otherwise.
 */
int __init acpi_numa_init(void)
{
	int cnt = 0;

	if (acpi_disabled)
		return -EINVAL;

	/*
	 * Should not limit number with cpu num that is from NR_CPUS or nr_cpus=
	 * SRAT cpu entries could have different order with that in MADT.
	 * So go over all cpu entries in SRAT to get apicid to node mapping.
	 */

	/* SRAT: System Resource Affinity Table */
	if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
		struct acpi_subtable_proc srat_proc[3] = {
			[0] = {
				.id = ACPI_SRAT_TYPE_CPU_AFFINITY,
				.handler = acpi_parse_processor_affinity,
			},
			[1] = {
				.id = ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
				.handler = acpi_parse_x2apic_affinity,
			},
			[2] = {
				.id = ACPI_SRAT_TYPE_GICC_AFFINITY,
				.handler = acpi_parse_gicc_affinity,
			},
		};

		/* All processor-type entries are handled in one call. */
		acpi_table_parse_entries_array(ACPI_SIG_SRAT,
					sizeof(struct acpi_table_srat),
					srat_proc, ARRAY_SIZE(srat_proc), 0);

		cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
					    acpi_parse_memory_affinity, 0);
	}

	/* SLIT: System Locality Information Table */
	acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);

	if (cnt < 0)
		return cnt;

	return parsed_numa_memblks ? 0 : -ENOENT;
}
|
|
|
|
/*
 * Find the proximity domain for an ACPI object.
 *
 * Evaluates "_PXM" on @h and then on each successive parent until one
 * evaluation succeeds.  Returns that value (converted to int), or -1
 * once acpi_get_parent() fails without any _PXM having been found.
 */
static int acpi_get_pxm(acpi_handle h)
{
	unsigned long long pxm;
	acpi_status status;
	acpi_handle cur = h;
	acpi_handle parent;

	for (;;) {
		status = acpi_evaluate_integer(cur, "_PXM", NULL, &pxm);
		if (ACPI_SUCCESS(status))
			return pxm;

		status = acpi_get_parent(cur, &parent);
		if (!ACPI_SUCCESS(status))
			return -1;

		cur = parent;
	}
}
|
|
|
|
/*
 * acpi_get_node - map an ACPI object to a logical node
 * @handle: ACPI object to look up
 *
 * Feeds the result of acpi_get_pxm() to acpi_map_pxm_to_node().  When
 * no _PXM is found, acpi_get_pxm() returns -1 and the result is
 * NUMA_NO_NODE.
 */
int acpi_get_node(acpi_handle handle)
{
	return acpi_map_pxm_to_node(acpi_get_pxm(handle));
}
EXPORT_SYMBOL(acpi_get_node);
|