linux/arch/x86/kernel/cpu/mshyperv.c
Dexuan Cui d981059e13 x86/hyperv: Enable 15-bit APIC ID if the hypervisor supports it
When a Linux VM runs on Hyper-V, if the VM has CPUs with >255 APIC IDs,
the CPUs can't be the destination of IOAPIC interrupts, because the
IOAPIC RTE's Dest Field has only 8 bits. Currently the hackery driver
drivers/iommu/hyperv-iommu.c is used to ensure IOAPIC interrupts are
only routed to CPUs that don't have >255 APIC IDs. However, there is
an issue with kdump, because the kdump kernel can run on any CPU, and
hence IOAPIC interrupts can't work if the kdump kernel run on a CPU
with a >255 APIC ID.

The kdump issue can be fixed by the Extended Dest ID, which is introduced
recently by David Woodhouse (for IOAPIC, see the field virt_destid_8_14 in
struct IO_APIC_route_entry). Of course, the Extended Dest ID needs the
support of the underlying hypervisor. The latest Hyper-V has added the
support recently: with this commit, on such a Hyper-V host, Linux VM
does not use hyperv-iommu.c because hyperv_prepare_irq_remapping()
returns -ENODEV; instead, Linux kernel's generic support of Extended Dest
ID from David is used, meaning that Linux VM is able to support up to
32K CPUs, and IOAPIC interrupts can be routed to all the CPUs.

On an old Hyper-V host that doesn't support the Extended Dest ID, nothing
changes with this commit: Linux VM is still able to bring up the CPUs with
> 255 APIC IDs with the help of hyperv-iommu.c, but IOAPIC interrupts still
can not go to such CPUs, and the kdump kernel still can not work properly
on such CPUs.

[ tglx: Updated comment as suggested by David ]

Signed-off-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: David Woodhouse <dwmw@amazon.co.uk>
Link: https://lore.kernel.org/r/20201103011136.59108-1-decui@microsoft.com
2020-11-04 11:10:52 +01:00

404 lines
10 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* HyperV Detection code.
*
* Copyright (C) 2010, Novell, Inc.
* Author : K. Y. Srinivasan <ksrinivasan@novell.com>
*/
#include <linux/types.h>
#include <linux/time.h>
#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/hardirq.h>
#include <linux/efi.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/kexec.h>
#include <linux/i8253.h>
#include <linux/random.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
#include <asm/desc.h>
#include <asm/idtentry.h>
#include <asm/irq_regs.h>
#include <asm/i8259.h>
#include <asm/apic.h>
#include <asm/timer.h>
#include <asm/reboot.h>
#include <asm/nmi.h>
#include <clocksource/hyperv_timer.h>
struct ms_hyperv_info ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);
#if IS_ENABLED(CONFIG_HYPERV)
static void (*vmbus_handler)(void);
static void (*hv_stimer0_handler)(void);
static void (*hv_kexec_handler)(void);
static void (*hv_crash_handler)(struct pt_regs *regs);
DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
{
struct pt_regs *old_regs = set_irq_regs(regs);
inc_irq_stat(irq_hv_callback_count);
if (vmbus_handler)
vmbus_handler();
if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED)
ack_APIC_irq();
set_irq_regs(old_regs);
}
int hv_setup_vmbus_irq(int irq, void (*handler)(void))
{
/*
* The 'irq' argument is ignored on x86/x64 because a hard-coded
* interrupt vector is used for Hyper-V interrupts.
*/
vmbus_handler = handler;
return 0;
}
void hv_remove_vmbus_irq(void)
{
/* We have no way to deallocate the interrupt gate */
vmbus_handler = NULL;
}
EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq);
EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq);
/*
* Routines to do per-architecture handling of stimer0
* interrupts when in Direct Mode
*/
DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0)
{
struct pt_regs *old_regs = set_irq_regs(regs);
inc_irq_stat(hyperv_stimer0_count);
if (hv_stimer0_handler)
hv_stimer0_handler();
add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0);
ack_APIC_irq();
set_irq_regs(old_regs);
}
int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void))
{
*vector = HYPERV_STIMER0_VECTOR;
*irq = -1; /* Unused on x86/x64 */
hv_stimer0_handler = handler;
return 0;
}
EXPORT_SYMBOL_GPL(hv_setup_stimer0_irq);
void hv_remove_stimer0_irq(int irq)
{
/* We have no way to deallocate the interrupt gate */
hv_stimer0_handler = NULL;
}
EXPORT_SYMBOL_GPL(hv_remove_stimer0_irq);
void hv_setup_kexec_handler(void (*handler)(void))
{
hv_kexec_handler = handler;
}
EXPORT_SYMBOL_GPL(hv_setup_kexec_handler);
void hv_remove_kexec_handler(void)
{
hv_kexec_handler = NULL;
}
EXPORT_SYMBOL_GPL(hv_remove_kexec_handler);
void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs))
{
hv_crash_handler = handler;
}
EXPORT_SYMBOL_GPL(hv_setup_crash_handler);
void hv_remove_crash_handler(void)
{
hv_crash_handler = NULL;
}
EXPORT_SYMBOL_GPL(hv_remove_crash_handler);
#ifdef CONFIG_KEXEC_CORE
static void hv_machine_shutdown(void)
{
if (kexec_in_progress && hv_kexec_handler)
hv_kexec_handler();
native_machine_shutdown();
}
static void hv_machine_crash_shutdown(struct pt_regs *regs)
{
if (hv_crash_handler)
hv_crash_handler(regs);
native_machine_crash_shutdown(regs);
}
#endif /* CONFIG_KEXEC_CORE */
#endif /* CONFIG_HYPERV */
static uint32_t __init ms_hyperv_platform(void)
{
u32 eax;
u32 hyp_signature[3];
if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
return 0;
cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
&eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
if (eax >= HYPERV_CPUID_MIN &&
eax <= HYPERV_CPUID_MAX &&
!memcmp("Microsoft Hv", hyp_signature, 12))
return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
return 0;
}
static unsigned char hv_get_nmi_reason(void)
{
return 0;
}
#ifdef CONFIG_X86_LOCAL_APIC
/*
* Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes
* it dificult to process CHANNELMSG_UNLOAD in case of crash. Handle
* unknown NMI on the first CPU which gets it.
*/
static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
{
static atomic_t nmi_cpu = ATOMIC_INIT(-1);
if (!unknown_nmi_panic)
return NMI_DONE;
if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1)
return NMI_HANDLED;
return NMI_DONE;
}
#endif
static unsigned long hv_get_tsc_khz(void)
{
unsigned long freq;
rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq);
return freq / 1000;
}
#if defined(CONFIG_SMP) && IS_ENABLED(CONFIG_HYPERV)
static void __init hv_smp_prepare_boot_cpu(void)
{
native_smp_prepare_boot_cpu();
#if defined(CONFIG_X86_64) && defined(CONFIG_PARAVIRT_SPINLOCKS)
hv_init_spinlocks();
#endif
}
#endif
static void __init ms_hyperv_init_platform(void)
{
int hv_host_info_eax;
int hv_host_info_ebx;
int hv_host_info_ecx;
int hv_host_info_edx;
#ifdef CONFIG_PARAVIRT
pv_info.name = "Hyper-V";
#endif
/*
* Extract the features and hints
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
/*
* Extract host information.
*/
if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >=
HYPERV_CPUID_VERSION) {
hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION);
hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION);
hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION);
hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION);
pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n",
hv_host_info_eax, hv_host_info_ebx >> 16,
hv_host_info_ebx & 0xFFFF, hv_host_info_ecx,
hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF);
}
if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
x86_platform.calibrate_tsc = hv_get_tsc_khz;
x86_platform.calibrate_cpu = hv_get_tsc_khz;
}
if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) {
ms_hyperv.nested_features =
cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
}
/*
* Hyper-V expects to get crash register data or kmsg when
* crash enlightment is available and system crashes. Set
* crash_kexec_post_notifiers to be true to make sure that
* calling crash enlightment interface before running kdump
* kernel.
*/
if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE)
crash_kexec_post_notifiers = true;
#ifdef CONFIG_X86_LOCAL_APIC
if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
/*
* Get the APIC frequency.
*/
u64 hv_lapic_frequency;
rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
lapic_timer_period = hv_lapic_frequency;
pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n",
lapic_timer_period);
}
register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST,
"hv_nmi_unknown");
#endif
#ifdef CONFIG_X86_IO_APIC
no_timer_check = 1;
#endif
#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE)
machine_ops.shutdown = hv_machine_shutdown;
machine_ops.crash_shutdown = hv_machine_crash_shutdown;
#endif
if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) {
wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1);
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
} else {
mark_tsc_unstable("running on Hyper-V");
}
/*
* Generation 2 instances don't support reading the NMI status from
* 0x61 port.
*/
if (efi_enabled(EFI_BOOT))
x86_platform.get_nmi_reason = hv_get_nmi_reason;
/*
* Hyper-V VMs have a PIT emulation quirk such that zeroing the
* counter register during PIT shutdown restarts the PIT. So it
* continues to interrupt @18.2 HZ. Setting i8253_clear_counter
* to false tells pit_shutdown() not to zero the counter so that
* the PIT really is shutdown. Generation 2 VMs don't have a PIT,
* and setting this value has no effect.
*/
i8253_clear_counter_on_shutdown = false;
#if IS_ENABLED(CONFIG_HYPERV)
/*
* Setup the hook to get control post apic initialization.
*/
x86_platform.apic_post_init = hyperv_init;
hyperv_setup_mmu_ops();
/* Setup the IDT for hypervisor callback */
alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback);
/* Setup the IDT for reenlightenment notifications */
if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) {
alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR,
asm_sysvec_hyperv_reenlightenment);
}
/* Setup the IDT for stimer0 */
if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) {
alloc_intr_gate(HYPERV_STIMER0_VECTOR,
asm_sysvec_hyperv_stimer0);
}
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
# endif
/*
* Hyper-V doesn't provide irq remapping for IO-APIC. To enable x2apic,
* set x2apic destination mode to physcial mode when x2apic is available
* and Hyper-V IOMMU driver makes sure cpus assigned with IO-APIC irqs
* have 8-bit APIC id.
*/
# ifdef CONFIG_X86_X2APIC
if (x2apic_supported())
x2apic_phys = 1;
# endif
/* Register Hyper-V specific clocksource */
hv_init_clocksource();
#endif
}
static bool __init ms_hyperv_x2apic_available(void)
{
return x2apic_supported();
}
/*
* If ms_hyperv_msi_ext_dest_id() returns true, hyperv_prepare_irq_remapping()
* returns -ENODEV and the Hyper-V IOMMU driver is not used; instead, the
* generic support of the 15-bit APIC ID is used: see __irq_msi_compose_msg().
*
* Note: for a VM on Hyper-V, the I/O-APIC is the only device which
* (logically) generates MSIs directly to the system APIC irq domain.
* There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the
* pci-hyperv host bridge.
*/
static bool __init ms_hyperv_msi_ext_dest_id(void)
{
u32 eax;
eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_INTERFACE);
if (eax != HYPERV_VS_INTERFACE_EAX_SIGNATURE)
return false;
eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES);
return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
}
const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.name = "Microsoft Hyper-V",
.detect = ms_hyperv_platform,
.type = X86_HYPER_MS_HYPERV,
.init.x2apic_available = ms_hyperv_x2apic_available,
.init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id,
.init.init_platform = ms_hyperv_init_platform,
};