8b375f64dc
The setup_node_data() function allocates a pg_data_t object, inserts it into the node_data[] array and initializes the following fields: node_id, node_start_pfn and node_spanned_pages. However, a few function calls later during the kernel boot, free_area_init_node() re-initializes those fields, possibly with setup_node_data() is not used. This causes a small glitch when running Linux as a hyperv numa guest: SRAT: PXM 0 -> APIC 0x00 -> Node 0 SRAT: PXM 0 -> APIC 0x01 -> Node 0 SRAT: PXM 1 -> APIC 0x02 -> Node 1 SRAT: PXM 1 -> APIC 0x03 -> Node 1 SRAT: Node 0 PXM 0 [mem 0x00000000-0x7fffffff] SRAT: Node 1 PXM 1 [mem 0x80200000-0xf7ffffff] SRAT: Node 1 PXM 1 [mem 0x100000000-0x1081fffff] NUMA: Node 1 [mem 0x80200000-0xf7ffffff] + [mem 0x100000000-0x1081fffff] -> [mem 0x80200000-0x1081fffff] Initmem setup node 0 [mem 0x00000000-0x7fffffff] NODE_DATA [mem 0x7ffdc000-0x7ffeffff] Initmem setup node 1 [mem 0x80800000-0x1081fffff] NODE_DATA [mem 0x1081ea000-0x1081fdfff] crashkernel: memory value expected [ffffea0000000000-ffffea0001ffffff] PMD -> [ffff88007de00000-ffff88007fdfffff] on node 0 [ffffea0002000000-ffffea00043fffff] PMD -> [ffff880105600000-ffff8801077fffff] on node 1 Zone ranges: DMA [mem 0x00001000-0x00ffffff] DMA32 [mem 0x01000000-0xffffffff] Normal [mem 0x100000000-0x1081fffff] Movable zone start for each node Early memory node ranges node 0: [mem 0x00001000-0x0009efff] node 0: [mem 0x00100000-0x7ffeffff] node 1: [mem 0x80200000-0xf7ffffff] node 1: [mem 0x100000000-0x1081fffff] On node 0 totalpages: 524174 DMA zone: 64 pages used for memmap DMA zone: 21 pages reserved DMA zone: 3998 pages, LIFO batch:0 DMA32 zone: 8128 pages used for memmap DMA32 zone: 520176 pages, LIFO batch:31 On node 1 totalpages: 524288 DMA32 zone: 7672 pages used for memmap DMA32 zone: 491008 pages, LIFO batch:31 Normal zone: 520 pages used for memmap Normal zone: 33280 pages, LIFO batch:7 In this dmesg, the SRAT table reports that the memory range for node 1 starts at 0x80200000. However, the line starting with "Initmem" reports that node 1 memory range starts at 0x80800000. The "Initmem" line is reported by setup_node_data() and is wrong, because the kernel ends up using the range as reported in the SRAT table. This commit drops all that dead code from setup_node_data(), renames it to alloc_node_data() and adds a printk() to free_area_init_node() so that we report a node's memory range accurately. Here's the same dmesg section with this patch applied: SRAT: PXM 0 -> APIC 0x00 -> Node 0 SRAT: PXM 0 -> APIC 0x01 -> Node 0 SRAT: PXM 1 -> APIC 0x02 -> Node 1 SRAT: PXM 1 -> APIC 0x03 -> Node 1 SRAT: Node 0 PXM 0 [mem 0x00000000-0x7fffffff] SRAT: Node 1 PXM 1 [mem 0x80200000-0xf7ffffff] SRAT: Node 1 PXM 1 [mem 0x100000000-0x1081fffff] NUMA: Node 1 [mem 0x80200000-0xf7ffffff] + [mem 0x100000000-0x1081fffff] -> [mem 0x80200000-0x1081fffff] NODE_DATA(0) allocated [mem 0x7ffdc000-0x7ffeffff] NODE_DATA(1) allocated [mem 0x1081ea000-0x1081fdfff] crashkernel: memory value expected [ffffea0000000000-ffffea0001ffffff] PMD -> [ffff88007de00000-ffff88007fdfffff] on node 0 [ffffea0002000000-ffffea00043fffff] PMD -> [ffff880105600000-ffff8801077fffff] on node 1 Zone ranges: DMA [mem 0x00001000-0x00ffffff] DMA32 [mem 0x01000000-0xffffffff] Normal [mem 0x100000000-0x1081fffff] Movable zone start for each node Early memory node ranges node 0: [mem 0x00001000-0x0009efff] node 0: [mem 0x00100000-0x7ffeffff] node 1: [mem 0x80200000-0xf7ffffff] node 1: [mem 0x100000000-0x1081fffff] Initmem setup node 0 [mem 0x00001000-0x7ffeffff] On node 0 totalpages: 524174 DMA zone: 64 pages used for memmap DMA zone: 21 pages reserved DMA zone: 3998 pages, LIFO batch:0 DMA32 zone: 8128 pages used for memmap DMA32 zone: 520176 pages, LIFO batch:31 Initmem setup node 1 [mem 0x80200000-0x1081fffff] On node 1 totalpages: 524288 DMA32 zone: 7672 pages used for memmap DMA32 zone: 491008 pages, LIFO batch:31 Normal zone: 520 pages used for memmap Normal zone: 33280 pages, LIFO batch:7 This commit was tested on a two node bare-metal NUMA machine and Linux as a numa guest on hyperv and qemu/kvm. PS: The wrong memory range reported by setup_node_data() seems to be harmless in the current kernel because it's just not used. However, that bad range is used in kernel 2.6.32 to initialize the old boot memory allocator, which causes a crash during boot. Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com> Acked-by: Rik van Riel <riel@redhat.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: David Rientjes <rientjes@google.com> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@kernel.org>
83 lines
2.1 KiB
C
83 lines
2.1 KiB
C
#ifndef _ASM_X86_NUMA_H
|
|
#define _ASM_X86_NUMA_H
|
|
|
|
#include <linux/nodemask.h>
|
|
|
|
#include <asm/topology.h>
|
|
#include <asm/apicdef.h>
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
|
|
|
|
/*
|
|
* Too small node sizes may confuse the VM badly. Usually they
|
|
* result from BIOS bugs. So dont recognize nodes as standalone
|
|
* NUMA entities that have less than this amount of RAM listed:
|
|
*/
|
|
#define NODE_MIN_SIZE (4*1024*1024)
|
|
|
|
extern int numa_off;
|
|
|
|
/*
|
|
* __apicid_to_node[] stores the raw mapping between physical apicid and
|
|
* node and is used to initialize cpu_to_node mapping.
|
|
*
|
|
* The mapping may be overridden by apic->numa_cpu_node() on 32bit and thus
|
|
* should be accessed by the accessors - set_apicid_to_node() and
|
|
* numa_cpu_node().
|
|
*/
|
|
extern s16 __apicid_to_node[MAX_LOCAL_APIC];
|
|
extern nodemask_t numa_nodes_parsed __initdata;
|
|
|
|
extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
|
|
extern void __init numa_set_distance(int from, int to, int distance);
|
|
|
|
static inline void set_apicid_to_node(int apicid, s16 node)
|
|
{
|
|
__apicid_to_node[apicid] = node;
|
|
}
|
|
|
|
extern int numa_cpu_node(int cpu);
|
|
|
|
#else /* CONFIG_NUMA */
|
|
static inline void set_apicid_to_node(int apicid, s16 node)
|
|
{
|
|
}
|
|
|
|
static inline int numa_cpu_node(int cpu)
|
|
{
|
|
return NUMA_NO_NODE;
|
|
}
|
|
#endif /* CONFIG_NUMA */
|
|
|
|
#ifdef CONFIG_X86_32
|
|
# include <asm/numa_32.h>
|
|
#endif
|
|
|
|
#ifdef CONFIG_NUMA
|
|
extern void numa_set_node(int cpu, int node);
|
|
extern void numa_clear_node(int cpu);
|
|
extern void __init init_cpu_to_node(void);
|
|
extern void numa_add_cpu(int cpu);
|
|
extern void numa_remove_cpu(int cpu);
|
|
#else /* CONFIG_NUMA */
|
|
static inline void numa_set_node(int cpu, int node) { }
|
|
static inline void numa_clear_node(int cpu) { }
|
|
static inline void init_cpu_to_node(void) { }
|
|
static inline void numa_add_cpu(int cpu) { }
|
|
static inline void numa_remove_cpu(int cpu) { }
|
|
#endif /* CONFIG_NUMA */
|
|
|
|
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
|
|
void debug_cpumask_set_cpu(int cpu, int node, bool enable);
|
|
#endif
|
|
|
|
#ifdef CONFIG_NUMA_EMU
|
|
#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
|
|
#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
|
|
void numa_emu_cmdline(char *);
|
|
#endif /* CONFIG_NUMA_EMU */
|
|
|
|
#endif /* _ASM_X86_NUMA_H */
|