s390/numa: add core infrastructure

Enable core NUMA support for s390 and add one simple default mode "plain"
that creates one single NUMA node.

This patch contains several changes from Michael Holzheu.

Signed-off-by: Philipp Hachtmann <phacht@linux.vnet.ibm.com>
Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
This commit is contained in:
Philipp Hachtmann 2014-03-06 18:25:13 +01:00 committed by Martin Schwidefsky
parent 199071f108
commit 3a368f742d
15 changed files with 375 additions and 26 deletions

View File

@ -6,3 +6,4 @@ obj-$(CONFIG_S390_HYPFS_FS) += hypfs/
obj-$(CONFIG_APPLDATA_BASE) += appldata/
obj-y += net/
obj-$(CONFIG_PCI) += pci/
obj-$(CONFIG_NUMA) += numa/

View File

@ -153,6 +153,10 @@ config S390
select TTY
select VIRT_CPU_ACCOUNTING
select VIRT_TO_BUS
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_WANTS_PROT_NUMA_PROT_NONE
select HAVE_ARCH_EARLY_PFN_TO_NID
config SCHED_OMIT_FRAME_POINTER
def_bool y
@ -386,6 +390,39 @@ config HOTPLUG_CPU
config SCHED_SMT
def_bool n
# Some NUMA nodes have memory ranges that span
# other nodes. Even though a pfn is valid and
# between a node's start and end pfns, it may not
# reside on that node. See memmap_init_zone()
# for details. <- They meant memory holes!
config NODES_SPAN_OTHER_NODES
def_bool NUMA
config NUMA
bool "NUMA support"
depends on SMP && 64BIT && SCHED_TOPOLOGY
default n
help
Enable NUMA support
This option adds NUMA support to the kernel.
An operation mode can be selected by appending
numa=<method> to the kernel command line.
The default behaviour is identical to appending numa=plain to
the command line. This will create just one node with all
available memory and all CPUs in it.
config NODES_SHIFT
int "Maximum NUMA nodes (as a power of 2)"
range 1 10
depends on NUMA
default "4"
help
Specify the maximum number of NUMA nodes available on the target
system. Increases memory reserved to accommodate various tables.
config SCHED_MC
def_bool n

View File

@ -0,0 +1,16 @@
/*
* NUMA support for s390
*
* Copyright IBM Corp. 2015
*/
#ifndef _ASM_S390_MMZONE_H
#define _ASM_S390_MMZONE_H
#ifdef CONFIG_NUMA
/* Table of per-node pglist_data pointers, indexed by node id. */
extern struct pglist_data *node_data[];
/* Yields the node_data[] slot for node 'nid'; 'nid' is not range checked. */
#define NODE_DATA(nid) (node_data[nid])
#endif /* CONFIG_NUMA */
#endif /* _ASM_S390_MMZONE_H */

View File

@ -0,0 +1,31 @@
/*
* NUMA support for s390
*
* Declare the NUMA core code structures and functions.
*
* Copyright IBM Corp. 2015
*/
#ifndef _ASM_S390_NUMA_H
#define _ASM_S390_NUMA_H
#ifdef CONFIG_NUMA
#include <linux/numa.h>
#include <linux/cpumask.h>
void numa_setup(void);
int numa_pfn_to_nid(unsigned long pfn);
int __node_distance(int a, int b);
void numa_update_cpu_topology(void);
extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
extern int numa_debug_enabled;
#else
static inline void numa_setup(void) { }
static inline void numa_update_cpu_topology(void) { }
#endif /* CONFIG_NUMA */
#endif /* _ASM_S390_NUMA_H */

View File

@ -192,4 +192,20 @@ void zpci_debug_init_device(struct zpci_dev *);
void zpci_debug_exit_device(struct zpci_dev *);
void zpci_debug_info(struct zpci_dev *, struct seq_file *);
#ifdef CONFIG_NUMA
/*
 * Returns the node based on PCI bus.
 * No bus-to-node mapping is implemented here: 'bus' is ignored and
 * NUMA_NO_NODE is returned for every bus.
 */
static inline int __pcibus_to_node(const struct pci_bus *bus)
{
return NUMA_NO_NODE;
}
/*
 * Returns the CPU mask associated with a PCI bus.
 * 'bus' is ignored: the mask of all online CPUs is returned for every bus.
 */
static inline const struct cpumask *
cpumask_of_pcibus(const struct pci_bus *bus)
{
return cpu_online_mask;
}
#endif /* CONFIG_NUMA */
#endif

View File

@ -2,6 +2,7 @@
#define _ASM_S390_TOPOLOGY_H
#include <linux/cpumask.h>
#include <asm/numa.h>
struct sysinfo_15_1_x;
struct cpu;
@ -13,6 +14,7 @@ struct cpu_topology_s390 {
unsigned short core_id;
unsigned short socket_id;
unsigned short book_id;
unsigned short node_id;
cpumask_t thread_mask;
cpumask_t core_mask;
cpumask_t book_mask;
@ -52,6 +54,43 @@ static inline void topology_expect_change(void) { }
#define POLARIZATION_VM (2)
#define POLARIZATION_VH (3)
#define SD_BOOK_INIT SD_CPU_INIT
#ifdef CONFIG_NUMA
/*
 * Returns the node id stored in the per-CPU topology entry of 'cpu'.
 * NOTE(review): the self-referencing #define presumably lets
 * <asm-generic/topology.h> detect this override - confirm.
 */
#define cpu_to_node cpu_to_node
static inline int cpu_to_node(int cpu)
{
return per_cpu(cpu_topology, cpu).node_id;
}
/*
 * Returns a pointer to the cpumask of CPUs on node 'node'.
 * 'node' is used directly as an index into node_to_cpumask_map[] and is
 * not range checked, so callers must pass a valid node id.
 */
#define cpumask_of_node cpumask_of_node
static inline const struct cpumask *cpumask_of_node(int node)
{
return node_to_cpumask_map[node];
}
/*
* Returns the number of the node containing node 'node'. This
* architecture is flat, so it is a pretty simple function!
*/
#define parent_node(node) (node)
#define pcibus_to_node(bus) __pcibus_to_node(bus)
#define node_distance(a, b) __node_distance(a, b)
#else /* !CONFIG_NUMA */
/* Without CONFIG_NUMA every caller is reported to run on node 0. */
#define numa_node_id numa_node_id
static inline int numa_node_id(void)
{
return 0;
}
#endif /* CONFIG_NUMA */
#include <asm-generic/topology.h>
#endif /* _ASM_S390_TOPOLOGY_H */

View File

@ -11,12 +11,12 @@
#define __IGNORE_time
/* Ignore NUMA system calls. Not wired up on s390. */
#define __IGNORE_mbind
#define __IGNORE_get_mempolicy
#define __IGNORE_set_mempolicy
#define __IGNORE_migrate_pages
#define __IGNORE_move_pages
/* NUMA system calls (all five macros share the __ARCH_WANT_ prefix) */
#define __ARCH_WANT_mbind
#define __ARCH_WANT_get_mempolicy
#define __ARCH_WANT_set_mempolicy
#define __ARCH_WANT_migrate_pages
#define __ARCH_WANT_move_pages
/* Ignore system calls that are also reachable via sys_socket */
#define __IGNORE_recvmmsg

View File

@ -204,9 +204,9 @@
#define __NR_statfs64 265
#define __NR_fstatfs64 266
#define __NR_remap_file_pages 267
/* Number 268 is reserved for new sys_mbind */
/* Number 269 is reserved for new sys_get_mempolicy */
/* Number 270 is reserved for new sys_set_mempolicy */
#define __NR_mbind 268
#define __NR_get_mempolicy 269
#define __NR_set_mempolicy 270
#define __NR_mq_open 271
#define __NR_mq_unlink 272
#define __NR_mq_timedsend 273
@ -223,7 +223,7 @@
#define __NR_inotify_init 284
#define __NR_inotify_add_watch 285
#define __NR_inotify_rm_watch 286
/* Number 287 is reserved for new sys_migrate_pages */
#define __NR_migrate_pages 287
#define __NR_openat 288
#define __NR_mkdirat 289
#define __NR_mknodat 290
@ -245,7 +245,7 @@
#define __NR_sync_file_range 307
#define __NR_tee 308
#define __NR_vmsplice 309
/* Number 310 is reserved for new sys_move_pages */
#define __NR_move_pages 310
#define __NR_getcpu 311
#define __NR_epoll_pwait 312
#define __NR_utimes 313

View File

@ -62,6 +62,7 @@
#include <asm/os_info.h>
#include <asm/sclp.h>
#include <asm/sysinfo.h>
#include <asm/numa.h>
#include "entry.h"
/*
@ -879,6 +880,7 @@ void __init setup_arch(char **cmdline_p)
setup_lowcore();
smp_fill_possible_mask();
cpu_init();
numa_setup();
/*
* Setup capabilities (ELF_HWCAP & ELF_PLATFORM).

View File

@ -276,9 +276,9 @@ SYSCALL(sys_ni_syscall,compat_sys_s390_fadvise64_64)
SYSCALL(sys_statfs64,compat_sys_statfs64)
SYSCALL(sys_fstatfs64,compat_sys_fstatfs64)
SYSCALL(sys_remap_file_pages,compat_sys_remap_file_pages)
NI_SYSCALL /* 268 sys_mbind */
NI_SYSCALL /* 269 sys_get_mempolicy */
NI_SYSCALL /* 270 sys_set_mempolicy */
SYSCALL(sys_mbind,compat_sys_mbind)
SYSCALL(sys_get_mempolicy,compat_sys_get_mempolicy)
SYSCALL(sys_set_mempolicy,compat_sys_set_mempolicy)
SYSCALL(sys_mq_open,compat_sys_mq_open)
SYSCALL(sys_mq_unlink,compat_sys_mq_unlink)
SYSCALL(sys_mq_timedsend,compat_sys_mq_timedsend)
@ -295,7 +295,7 @@ SYSCALL(sys_ioprio_get,compat_sys_ioprio_get)
SYSCALL(sys_inotify_init,sys_inotify_init)
SYSCALL(sys_inotify_add_watch,compat_sys_inotify_add_watch) /* 285 */
SYSCALL(sys_inotify_rm_watch,compat_sys_inotify_rm_watch)
NI_SYSCALL /* 287 sys_migrate_pages */
SYSCALL(sys_migrate_pages,compat_sys_migrate_pages)
SYSCALL(sys_openat,compat_sys_openat)
SYSCALL(sys_mkdirat,compat_sys_mkdirat)
SYSCALL(sys_mknodat,compat_sys_mknodat) /* 290 */
@ -318,7 +318,7 @@ SYSCALL(sys_splice,compat_sys_splice)
SYSCALL(sys_sync_file_range,compat_sys_s390_sync_file_range)
SYSCALL(sys_tee,compat_sys_tee)
SYSCALL(sys_vmsplice,compat_sys_vmsplice)
NI_SYSCALL /* 310 sys_move_pages */
SYSCALL(sys_move_pages,compat_sys_move_pages)
SYSCALL(sys_getcpu,compat_sys_getcpu)
SYSCALL(sys_epoll_pwait,compat_sys_epoll_pwait)
SYSCALL(sys_utimes,compat_sys_utimes)

View File

@ -18,7 +18,10 @@
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/mm.h>
#include <linux/nodemask.h>
#include <linux/node.h>
#include <asm/sysinfo.h>
#include <asm/numa.h>
#define PTF_HORIZONTAL (0UL)
#define PTF_VERTICAL (1UL)
@ -260,6 +263,7 @@ static void update_cpu_masks(void)
}
}
spin_unlock_irqrestore(&topology_lock, flags);
numa_update_cpu_topology();
}
void store_topology(struct sysinfo_15_1_x *info)
@ -274,21 +278,21 @@ int arch_update_cpu_topology(void)
{
struct sysinfo_15_1_x *info = tl_info;
struct device *dev;
int cpu;
int cpu, rc = 0;
if (!MACHINE_HAS_TOPOLOGY) {
update_cpu_masks();
topology_update_polarization_simple();
return 0;
if (MACHINE_HAS_TOPOLOGY) {
rc = 1;
store_topology(info);
tl_to_masks(info);
}
store_topology(info);
tl_to_masks(info);
update_cpu_masks();
if (!MACHINE_HAS_TOPOLOGY)
topology_update_polarization_simple();
for_each_online_cpu(cpu) {
dev = get_cpu_device(cpu);
kobject_uevent(&dev->kobj, KOBJ_CHANGE);
}
return 1;
return rc;
}
static void topology_work_fn(struct work_struct *work)
@ -450,7 +454,6 @@ static struct sched_domain_topology_level s390_topology[] = {
{ cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
{ cpu_book_mask, SD_INIT_NAME(BOOK) },
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};

View File

@ -139,7 +139,7 @@ void __init mem_init(void)
cpumask_set_cpu(0, mm_cpumask(&init_mm));
atomic_set(&init_mm.context.attach_count, 1);
max_mapnr = max_low_pfn;
set_max_mapnr(max_low_pfn);
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
/* Setup guest page hinting */

1
arch/s390/numa/Makefile Normal file
View File

@ -0,0 +1 @@
obj-y += numa.o

180
arch/s390/numa/numa.c Normal file
View File

@ -0,0 +1,180 @@
/*
* NUMA support for s390
*
* Implement NUMA core code.
*
* Copyright IBM Corp. 2015
*/
#define KMSG_COMPONENT "numa"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mmzone.h>
#include <linux/cpumask.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/slab.h>
#include <linux/node.h>
#include <asm/numa.h>
#include "numa_mode.h"
/* Per-node pg_data_t pointers, indexed by node id; filled by numa_setup_memory(). */
pg_data_t *node_data[MAX_NUMNODES];
EXPORT_SYMBOL(node_data);
/* Per-node CPU masks, indexed by node id. */
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);
/*
 * Default mode "plain": only the name is set. Every hook stays NULL, so the
 * core code in this file uses its built-in fallbacks.
 */
const struct numa_mode numa_mode_plain = {
.name = "plain",
};
/* Currently active mode; starts as "plain" and may be reassigned by parse_numa(). */
static const struct numa_mode *mode = &numa_mode_plain;
/*
 * numa_pfn_to_nid() - Translate a page frame number into a node id
 *
 * Delegates to the active mode's __pfn_to_nid hook. A mode without that
 * hook places every page frame on node 0.
 */
int numa_pfn_to_nid(unsigned long pfn)
{
	if (!mode->__pfn_to_nid)
		return 0;
	return mode->__pfn_to_nid(pfn);
}
/*
 * numa_update_cpu_topology() - Pass a topology update on to the active mode
 *
 * Does nothing when the active mode has no update_cpu_topology hook.
 */
void numa_update_cpu_topology(void)
{
	void (*hook)(void) = mode->update_cpu_topology;

	if (!hook)
		return;
	hook();
}
/*
 * __node_distance() - Distance between two nodes
 *
 * Delegates to the active mode's distance hook. A mode without that hook
 * reports a distance of 0 for every pair of nodes.
 */
int __node_distance(int a, int b)
{
	if (!mode->distance)
		return 0;
	return mode->distance(a, b);
}
int numa_debug_enabled;
/*
 * alloc_node_data() - Allocate node data
 *
 * Obtains memory for one pg_data_t from memblock and clears it.
 *
 * The allocation is aligned to the natural alignment of pg_data_t. A
 * one-byte alignment would allow the structure, and with it any lock or
 * atomic member it contains, to start at an arbitrary address.
 *
 * Panics if no memory could be obtained, so the returned pointer is
 * never NULL.
 */
static __init pg_data_t *alloc_node_data(void)
{
	pg_data_t *res;

	/* Cast kept as in the original: memblock_alloc()'s return type is not pg_data_t *. */
	res = (pg_data_t *) memblock_alloc(sizeof(pg_data_t),
					   __alignof__(pg_data_t));
	if (!res)
		panic("Could not allocate memory for node data!\n");
	memset(res, 0, sizeof(pg_data_t));
	return res;
}
/*
 * numa_setup_memory() - Assign bootmem to nodes
 *
 * The memory is first added to memblock without any respect to nodes.
 * This is fixed before remaining memblock memory is handed over to the
 * buddy allocator.
 * An important side effect is that large bootmem allocations might easily
 * cross node boundaries, which can be needed for large allocations with
 * smaller memory stripes in each node (i.e. when using NUMA emulation).
 *
 * Memory defines nodes:
 * Therefore this routine also sets the nodes online with memory.
 *
 * Steps performed:
 *  1. Walk the address range from 0 up to the end of DRAM in steps of the
 *     mode's alignment, set the owning node online and tag the memblock
 *     range with that node.
 *  2. Allocate a zeroed pg_data_t for every possible node id
 *     (0 .. MAX_NUMNODES - 1), not only for the online ones.
 *  3. For each online node, record its id and the number of page frames
 *     spanned between its lowest start and highest end pfn.
 */
static void __init numa_setup_memory(void)
{
unsigned long cur_base, align, end_of_dram;
int nid = 0;
end_of_dram = memblock_end_of_DRAM();
/*
 * Without an align hook the step is ULONG_MAX: the loop below then runs
 * exactly once, covering everything starting at address 0.
 */
align = mode->align ? mode->align() : ULONG_MAX;
/*
 * Step through all available memory and assign it to the nodes
 * indicated by the mode implementation.
 * All nodes which are seen here will be set online.
 */
cur_base = 0;
do {
nid = numa_pfn_to_nid(PFN_DOWN(cur_base));
node_set_online(nid);
memblock_set_node(cur_base, align, &memblock.memory, nid);
cur_base += align;
} while (cur_base < end_of_dram);
/* Allocate and fill out node_data */
for (nid = 0; nid < MAX_NUMNODES; nid++)
NODE_DATA(nid) = alloc_node_data();
for_each_online_node(nid) {
unsigned long start_pfn, end_pfn;
unsigned long t_start, t_end;
int i;
/* Seed with extreme values so the first range always replaces them. */
start_pfn = ULONG_MAX;
end_pfn = 0;
for_each_mem_pfn_range(i, nid, &t_start, &t_end, NULL) {
if (t_start < start_pfn)
start_pfn = t_start;
if (t_end > end_pfn)
end_pfn = t_end;
}
/*
 * NOTE(review): if the iteration yields no range for this node, the
 * seeds remain and the unsigned subtraction 0 - ULONG_MAX wraps to 1.
 * Confirm that an online node always owns at least one range.
 */
NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
NODE_DATA(nid)->node_id = nid;
}
}
/*
 * numa_setup() - Earliest initialization
 *
 * Announce the active mode, run the mode's own setup routine if it has one,
 * then assign memory to nodes and dump the resulting memblock state.
 */
void __init numa_setup(void)
{
	void (*mode_setup)(void) = mode->setup;

	pr_info("NUMA mode: %s\n", mode->name);
	if (mode_setup)
		mode_setup();
	numa_setup_memory();
	memblock_dump_all();
}
/*
 * numa_init_early() - Initialization initcall
 *
 * This runs when only one CPU is online and before the first
 * topology update is called for by the scheduler.
 *
 * Only the CPU mask of node 0 is written here; the masks of all other
 * nodes are left untouched. Always returns 0.
 */
static int __init numa_init_early(void)
{
/* Attach all possible CPUs to node 0 for now. */
cpumask_copy(node_to_cpumask_map[0], cpu_possible_mask);
return 0;
}
early_initcall(numa_init_early);
/*
 * numa_init_late() - Initialization initcall
 *
 * Register every online NUMA node. The result of the individual
 * registrations is not evaluated; the initcall always reports success.
 */
static int __init numa_init_late(void)
{
	int node;

	for_each_online_node(node) {
		register_one_node(node);
	}
	return 0;
}
device_initcall(numa_init_late);
/*
 * parse_debug() - Handle the "numa_debug" kernel parameter
 *
 * Switches numa_debug_enabled on. The parameter value 'parm' is not
 * looked at. Always returns 0.
 */
static int __init parse_debug(char *parm)
{
numa_debug_enabled = 1;
return 0;
}
early_param("numa_debug", parse_debug);
/*
 * parse_numa() - Handle the "numa=<method>" kernel parameter
 *
 * Selects the NUMA mode whose name equals the given value. A value that
 * matches no known mode leaves the current mode unchanged.
 *
 * A bare "numa" without "=<method>" reaches early parameter handlers as a
 * NULL pointer; it is rejected here instead of being passed to strcmp().
 *
 * Returns 0 on success, -EINVAL if no value was supplied.
 */
static int __init parse_numa(char *parm)
{
	if (!parm)
		return -EINVAL;
	if (strcmp(parm, numa_mode_plain.name) == 0)
		mode = &numa_mode_plain;
	return 0;
}
early_param("numa", parse_numa);

View File

@ -0,0 +1,23 @@
/*
* NUMA support for s390
*
* Define declarations used for communication between NUMA mode
* implementations and NUMA core functionality.
*
* Copyright IBM Corp. 2015
*/
#ifndef __S390_NUMA_MODE_H
#define __S390_NUMA_MODE_H
/*
 * Description of one NUMA operation mode.
 *
 * All function pointers are optional. The NUMA core checks each one for
 * NULL before calling it and otherwise uses a fallback (node 0, distance 0,
 * an alignment of ULONG_MAX, or simply no call).
 */
struct numa_mode {
char *name; /* Name of mode */
void (*setup)(void); /* Initialize mode */
void (*update_cpu_topology)(void); /* Called by topology code */
int (*__pfn_to_nid)(unsigned long pfn); /* PFN to node ID */
unsigned long (*align)(void); /* Minimum node alignment */
int (*distance)(int a, int b); /* Distance between two nodes */
};
extern const struct numa_mode numa_mode_plain;
#endif /* __S390_NUMA_MODE_H */