mirror of
https://github.com/torvalds/linux.git
synced 2024-12-02 09:01:34 +00:00
f2bb0b4f15
This per CPU variable is just yet another form of voodoo programming. The boot ordering is: per_cpu(x86_cpu_to_logical_apicid, cpu) = 1U << cpu; ..... setup_apic() apic->init_apic_ldr() default_init_apic_ldr() apic_write(SET_APIC_LOGICAL_ID(1UL << smp_processor_id(), APIC_LDR); id = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR); WARN_ON(id != per_cpu(x86_cpu_to_logical_apicid, cpu)); per_cpu(x86_cpu_to_logical_apicid, cpu) = id; So first write the default into LDR and then validate it against the same default which was set up during early boot APIC enumeration. Brilliant, isn't it? The comment above the per CPU variable declaration describes it well: 'Let's keep it ugly for now.' Remove the useless gunk and use '1U << cpu' consistently all over the place. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Tested-by: Michael Kelley <mikelley@microsoft.com> Tested-by: Sohil Mehta <sohil.mehta@intel.com> Tested-by: Juergen Gross <jgross@suse.com> # Xen PV (dom0 and unpriv. guest)
235 lines
6.5 KiB
C
235 lines
6.5 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
#include <linux/kernel.h>
|
|
#include <linux/export.h>
|
|
#include <linux/init.h>
|
|
#include <linux/memblock.h>
|
|
#include <linux/percpu.h>
|
|
#include <linux/kexec.h>
|
|
#include <linux/crash_dump.h>
|
|
#include <linux/smp.h>
|
|
#include <linux/topology.h>
|
|
#include <linux/pfn.h>
|
|
#include <linux/stackprotector.h>
|
|
#include <asm/sections.h>
|
|
#include <asm/processor.h>
|
|
#include <asm/desc.h>
|
|
#include <asm/setup.h>
|
|
#include <asm/mpspec.h>
|
|
#include <asm/apicdef.h>
|
|
#include <asm/highmem.h>
|
|
#include <asm/proto.h>
|
|
#include <asm/cpumask.h>
|
|
#include <asm/cpu.h>
|
|
|
|
#ifdef CONFIG_X86_64
|
|
#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
|
|
#else
|
|
#define BOOT_PERCPU_OFFSET 0
|
|
#endif
|
|
|
|
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
|
|
EXPORT_PER_CPU_SYMBOL(this_cpu_off);
|
|
|
|
unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = {
|
|
[0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
|
|
};
|
|
EXPORT_SYMBOL(__per_cpu_offset);
|
|
|
|
/*
|
|
* On x86_64 symbols referenced from code should be reachable using
|
|
* 32bit relocations. Reserve space for static percpu variables in
|
|
* modules so that they are always served from the first chunk which
|
|
* is located at the percpu segment base. On x86_32, anything can
|
|
* address anywhere. No need to reserve space in the first chunk.
|
|
*/
|
|
#ifdef CONFIG_X86_64
|
|
#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE
|
|
#else
|
|
#define PERCPU_FIRST_CHUNK_RESERVE 0
|
|
#endif
|
|
|
|
#ifdef CONFIG_X86_32
|
|
/**
|
|
* pcpu_need_numa - determine percpu allocation needs to consider NUMA
|
|
*
|
|
* If NUMA is not configured or there is only one NUMA node available,
|
|
* there is no reason to consider NUMA. This function determines
|
|
* whether percpu allocation should consider NUMA or not.
|
|
*
|
|
* RETURNS:
|
|
* true if NUMA should be considered; otherwise, false.
|
|
*/
|
|
static bool __init pcpu_need_numa(void)
|
|
{
|
|
#ifdef CONFIG_NUMA
|
|
pg_data_t *last = NULL;
|
|
unsigned int cpu;
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
int node = early_cpu_to_node(cpu);
|
|
|
|
if (node_online(node) && NODE_DATA(node) &&
|
|
last && last != NODE_DATA(node))
|
|
return true;
|
|
|
|
last = NODE_DATA(node);
|
|
}
|
|
#endif
|
|
return false;
|
|
}
|
|
#endif
|
|
|
|
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
|
|
{
|
|
#ifdef CONFIG_NUMA
|
|
if (early_cpu_to_node(from) == early_cpu_to_node(to))
|
|
return LOCAL_DISTANCE;
|
|
else
|
|
return REMOTE_DISTANCE;
|
|
#else
|
|
return LOCAL_DISTANCE;
|
|
#endif
|
|
}
|
|
|
|
static int __init pcpu_cpu_to_node(int cpu)
|
|
{
|
|
return early_cpu_to_node(cpu);
|
|
}
|
|
|
|
void __init pcpu_populate_pte(unsigned long addr)
|
|
{
|
|
populate_extra_pte(addr);
|
|
}
|
|
|
|
static inline void setup_percpu_segment(int cpu)
|
|
{
|
|
#ifdef CONFIG_X86_32
|
|
struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu),
|
|
0xFFFFF);
|
|
|
|
write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S);
|
|
#endif
|
|
}
|
|
|
|
void __init setup_per_cpu_areas(void)
|
|
{
|
|
unsigned int cpu;
|
|
unsigned long delta;
|
|
int rc;
|
|
|
|
pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%u\n",
|
|
NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
|
|
|
|
/*
|
|
* Allocate percpu area. Embedding allocator is our favorite;
|
|
* however, on NUMA configurations, it can result in very
|
|
* sparse unit mapping and vmalloc area isn't spacious enough
|
|
* on 32bit. Use page in that case.
|
|
*/
|
|
#ifdef CONFIG_X86_32
|
|
if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
|
|
pcpu_chosen_fc = PCPU_FC_PAGE;
|
|
#endif
|
|
rc = -EINVAL;
|
|
if (pcpu_chosen_fc != PCPU_FC_PAGE) {
|
|
const size_t dyn_size = PERCPU_MODULE_RESERVE +
|
|
PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
|
|
size_t atom_size;
|
|
|
|
/*
|
|
* On 64bit, use PMD_SIZE for atom_size so that embedded
|
|
* percpu areas are aligned to PMD. This, in the future,
|
|
* can also allow using PMD mappings in vmalloc area. Use
|
|
* PAGE_SIZE on 32bit as vmalloc space is highly contended
|
|
* and large vmalloc area allocs can easily fail.
|
|
*/
|
|
#ifdef CONFIG_X86_64
|
|
atom_size = PMD_SIZE;
|
|
#else
|
|
atom_size = PAGE_SIZE;
|
|
#endif
|
|
rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
|
|
dyn_size, atom_size,
|
|
pcpu_cpu_distance,
|
|
pcpu_cpu_to_node);
|
|
if (rc < 0)
|
|
pr_warn("%s allocator failed (%d), falling back to page size\n",
|
|
pcpu_fc_names[pcpu_chosen_fc], rc);
|
|
}
|
|
if (rc < 0)
|
|
rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
|
|
pcpu_cpu_to_node);
|
|
if (rc < 0)
|
|
panic("cannot initialize percpu area (err=%d)", rc);
|
|
|
|
/* alrighty, percpu areas up and running */
|
|
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
|
|
for_each_possible_cpu(cpu) {
|
|
per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
|
|
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
|
|
per_cpu(pcpu_hot.cpu_number, cpu) = cpu;
|
|
setup_percpu_segment(cpu);
|
|
/*
|
|
* Copy data used in early init routines from the
|
|
* initial arrays to the per cpu data areas. These
|
|
* arrays then become expendable and the *_early_ptr's
|
|
* are zeroed indicating that the static arrays are
|
|
* gone.
|
|
*/
|
|
#ifdef CONFIG_X86_LOCAL_APIC
|
|
per_cpu(x86_cpu_to_apicid, cpu) =
|
|
early_per_cpu_map(x86_cpu_to_apicid, cpu);
|
|
per_cpu(x86_cpu_to_acpiid, cpu) =
|
|
early_per_cpu_map(x86_cpu_to_acpiid, cpu);
|
|
#endif
|
|
#ifdef CONFIG_NUMA
|
|
per_cpu(x86_cpu_to_node_map, cpu) =
|
|
early_per_cpu_map(x86_cpu_to_node_map, cpu);
|
|
/*
|
|
* Ensure that the boot cpu numa_node is correct when the boot
|
|
* cpu is on a node that doesn't have memory installed.
|
|
* Also cpu_up() will call cpu_to_node() for APs when
|
|
* MEMORY_HOTPLUG is defined, before per_cpu(numa_node) is set
|
|
* up later with c_init aka intel_init/amd_init.
|
|
* So set them all (boot cpu and all APs).
|
|
*/
|
|
set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
|
|
#endif
|
|
/*
|
|
* Up to this point, the boot CPU has been using .init.data
|
|
* area. Reload any changed state for the boot CPU.
|
|
*/
|
|
if (!cpu)
|
|
switch_gdt_and_percpu_base(cpu);
|
|
}
|
|
|
|
/* indicate the early static arrays will soon be gone */
|
|
#ifdef CONFIG_X86_LOCAL_APIC
|
|
early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
|
|
early_per_cpu_ptr(x86_cpu_to_acpiid) = NULL;
|
|
#endif
|
|
#ifdef CONFIG_NUMA
|
|
early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
|
|
#endif
|
|
|
|
/* Setup node to cpumask map */
|
|
setup_node_to_cpumask_map();
|
|
|
|
/* Setup cpu initialized, callin, callout masks */
|
|
setup_cpu_local_masks();
|
|
|
|
/*
|
|
* Sync back kernel address range again. We already did this in
|
|
* setup_arch(), but percpu data also needs to be available in
|
|
* the smpboot asm and arch_sync_kernel_mappings() doesn't sync to
|
|
* swapper_pg_dir on 32-bit. The per-cpu mappings need to be available
|
|
* there too.
|
|
*
|
|
* FIXME: Can the later sync in setup_cpu_entry_areas() replace
|
|
* this call?
|
|
*/
|
|
sync_initial_page_table();
|
|
}
|