From 1a06d017fb3f388734ffbe5dedee6f8c3af5f2db Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Sat, 11 Apr 2020 20:50:35 -0700 Subject: [PATCH 1/5] Drivers: hv: vmbus: Fix Suspend-to-Idle for Generation-2 VM Before the hibernation patchset (e.g. f53335e3289f), in a Generation-2 Linux VM on Hyper-V, the user can run "echo freeze > /sys/power/state" to freeze the system, i.e. Suspend-to-Idle. The user can press the keyboard or move the mouse to wake up the VM. With the hibernation patchset, Linux VM on Hyper-V can hibernate to disk, but Suspend-to-Idle is broken: when the synthetic keyboard/mouse are suspended, there is no way to wake up the VM. Fix the issue by not suspending and resuming the vmbus devices upon Suspend-to-Idle. Fixes: f53335e3289f ("Drivers: hv: vmbus: Suspend/resume the vmbus itself for hibernation") Cc: stable@vger.kernel.org Reviewed-by: Michael Kelley Signed-off-by: Dexuan Cui Link: https://lore.kernel.org/r/1586663435-36243-1-git-send-email-decui@microsoft.com Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 43 +++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index a68bce4d0ddb..e06c6b9555cf 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -978,6 +978,9 @@ static int vmbus_resume(struct device *child_device) return drv->resume(dev); } +#else +#define vmbus_suspend NULL +#define vmbus_resume NULL #endif /* CONFIG_PM_SLEEP */ /* @@ -997,11 +1000,22 @@ static void vmbus_device_release(struct device *device) } /* - * Note: we must use SET_NOIRQ_SYSTEM_SLEEP_PM_OPS rather than - * SET_SYSTEM_SLEEP_PM_OPS: see the comment before vmbus_bus_pm. + * Note: we must use the "noirq" ops: see the comment before vmbus_bus_pm. + * + * suspend_noirq/resume_noirq are set to NULL to support Suspend-to-Idle: we + * shouldn't suspend the vmbus devices upon Suspend-to-Idle, otherwise there + * is no way to wake up a Generation-2 VM. + * + * The other 4 ops are for hibernation. */ + static const struct dev_pm_ops vmbus_pm = { - SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(vmbus_suspend, vmbus_resume) + .suspend_noirq = NULL, + .resume_noirq = NULL, + .freeze_noirq = vmbus_suspend, + .thaw_noirq = vmbus_resume, + .poweroff_noirq = vmbus_suspend, + .restore_noirq = vmbus_resume, }; /* The one and only one */ @@ -2281,6 +2295,9 @@ static int vmbus_bus_resume(struct device *dev) return 0; } +#else +#define vmbus_bus_suspend NULL +#define vmbus_bus_resume NULL #endif /* CONFIG_PM_SLEEP */ static const struct acpi_device_id vmbus_acpi_device_ids[] = { @@ -2291,16 +2308,24 @@ static const struct acpi_device_id vmbus_acpi_device_ids[] = { MODULE_DEVICE_TABLE(acpi, vmbus_acpi_device_ids); /* - * Note: we must use SET_NOIRQ_SYSTEM_SLEEP_PM_OPS rather than - * SET_SYSTEM_SLEEP_PM_OPS, otherwise NIC SR-IOV can not work, because the - * "pci_dev_pm_ops" uses the "noirq" callbacks: in the resume path, the - * pci "noirq" restore callback runs before "non-noirq" callbacks (see + * Note: we must use the "no_irq" ops, otherwise hibernation can not work with + * PCI device assignment, because "pci_dev_pm_ops" uses the "noirq" ops: in + * the resume path, the pci "noirq" restore op runs before "non-noirq" op (see * resume_target_kernel() -> dpm_resume_start(), and hibernation_restore() -> * dpm_resume_end()). This means vmbus_bus_resume() and the pci-hyperv's - * resume callback must also run via the "noirq" callbacks. + * resume callback must also run via the "noirq" ops. + * + * Set suspend_noirq/resume_noirq to NULL for Suspend-to-Idle: see the comment + * earlier in this file before vmbus_pm. */ + static const struct dev_pm_ops vmbus_bus_pm = { - SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(vmbus_bus_suspend, vmbus_bus_resume) + .suspend_noirq = NULL, + .resume_noirq = NULL, + .freeze_noirq = vmbus_bus_suspend, + .thaw_noirq = vmbus_bus_resume, + .poweroff_noirq = vmbus_bus_suspend, + .restore_noirq = vmbus_bus_resume }; static struct acpi_driver vmbus_acpi_driver = { From 2ddddd0b4e89e1fc30ed257653239005d2f31f5b Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 20 Apr 2020 09:49:26 -0700 Subject: [PATCH 2/5] Drivers: hv: Move AEOI determination to architecture dependent code Hyper-V on ARM64 doesn't provide a flag for the AEOI recommendation in ms_hyperv.hints, so having the test in architecture independent code doesn't work. Resolve this by moving the check of the flag to an architecture dependent helper function. No functionality is changed. Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/20200420164926.24471-1-mikelley@microsoft.com Signed-off-by: Wei Liu --- arch/x86/include/asm/mshyperv.h | 2 ++ drivers/hv/hv.c | 6 +----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 6b79515abb82..7c2bbd6675dc 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -34,6 +34,8 @@ typedef int (*hyperv_fill_flush_list_func)( rdmsrl(HV_X64_MSR_SINT0 + int_num, val) #define hv_set_synint_state(int_num, val) \ wrmsrl(HV_X64_MSR_SINT0 + int_num, val) +#define hv_recommend_using_aeoi() \ + (!(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED)) #define hv_get_crash_ctl(val) \ rdmsrl(HV_X64_MSR_CRASH_CTL, val) diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 6098e0cbdb4b..533c8b82b344 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -184,11 +184,7 @@ void hv_synic_enable_regs(unsigned int cpu) shared_sint.vector = HYPERVISOR_CALLBACK_VECTOR; shared_sint.masked = false; - if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED) - shared_sint.auto_eoi = false; - else - shared_sint.auto_eoi = true; - + shared_sint.auto_eoi = hv_recommend_using_aeoi(); hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64); /* Enable the global synic bit */ From 421f090c819d695942a470051cd624dc43deaf95 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Mon, 20 Apr 2020 19:46:11 -0700 Subject: [PATCH 3/5] x86/hyperv: Suspend/resume the VP assist page for hibernation Unlike the other CPUs, CPU0 is never offlined during hibernation, so in the resume path, the "new" kernel's VP assist page is not suspended (i.e. not disabled), and later when we jump to the "old" kernel, the page is not properly re-enabled for CPU0 with the allocated page from the old kernel. So far, the VP assist page is used by hv_apic_eoi_write(), and is also used in the case of nested virtualization (running KVM atop Hyper-V). For hv_apic_eoi_write(), when the page is not properly re-enabled, hvp->apic_assist is always 0, so the HV_X64_MSR_EOI MSR is always written. This is not ideal with respect to performance, but Hyper-V can still correctly handle this according to the Hyper-V spec; nevertheless, Linux still must update the Hyper-V hypervisor with the correct VP assist page to prevent Hyper-V from writing to the stale page, which causes guest memory corruption and consequently may have caused the hangs and triple faults seen during non-boot CPUs resume. Fix the issue by calling hv_cpu_die()/hv_cpu_init() in the syscore ops. Without the fix, hibernation can fail at a rate of 1/300 ~ 1/500. With the fix, hibernation can pass a long-haul test of 2000 runs. In the case of nested virtualization, disabling/reenabling the assist page upon hibernation may be unsafe if there are active L2 guests. It looks KVM should be enhanced to abort the hibernation request if there is any active L2 guest. Fixes: 05bd330a7fd8 ("x86/hyperv: Suspend/resume the hypercall page for hibernation") Cc: stable@vger.kernel.org Signed-off-by: Dexuan Cui Link: https://lore.kernel.org/r/1587437171-2472-1-git-send-email-decui@microsoft.com Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_init.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 624f5d9b0f79..fd51bac11b46 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -73,7 +73,8 @@ static int hv_cpu_init(unsigned int cpu) struct page *pg; input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); - pg = alloc_page(GFP_KERNEL); + /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */ + pg = alloc_page(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL); if (unlikely(!pg)) return -ENOMEM; *input_arg = page_address(pg); @@ -254,6 +255,7 @@ static int __init hv_pci_init(void) static int hv_suspend(void) { union hv_x64_msr_hypercall_contents hypercall_msr; + int ret; /* * Reset the hypercall page as it is going to be invalidated @@ -270,12 +272,17 @@ static int hv_suspend(void) hypercall_msr.enable = 0; wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); - return 0; + ret = hv_cpu_die(0); + return ret; } static void hv_resume(void) { union hv_x64_msr_hypercall_contents hypercall_msr; + int ret; + + ret = hv_cpu_init(0); + WARN_ON(ret); /* Re-enable the hypercall page */ rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); @@ -288,6 +295,7 @@ static void hv_resume(void) hv_hypercall_pg_saved = NULL; } +/* Note: when the ops are called, only CPU0 is online and IRQs are disabled. */ static struct syscore_ops hv_syscore_ops = { .suspend = hv_suspend, .resume = hv_resume, From 1d3c9c075462e9c5a474248e4b433861572f33e9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 22 Apr 2020 15:59:37 +0300 Subject: [PATCH 4/5] hyper-v: Use UUID API for exporting the GUID There is export_guid() function which exports guid_t to the u8 array. Use it instead of open coding variant. This allows to hide the uuid_t internals. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200422125937.38355-1-andriy.shevchenko@linux.intel.com Signed-off-by: Wei Liu --- drivers/hv/hv_trace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hv/hv_trace.h b/drivers/hv/hv_trace.h index e70783e33680..f9d14db980cb 100644 --- a/drivers/hv/hv_trace.h +++ b/drivers/hv/hv_trace.h @@ -286,8 +286,8 @@ TRACE_EVENT(vmbus_send_tl_connect_request, __field(int, ret) ), TP_fast_assign( - memcpy(__entry->guest_id, &msg->guest_endpoint_id.b, 16); - memcpy(__entry->host_id, &msg->host_service_id.b, 16); + export_guid(__entry->guest_id, &msg->guest_endpoint_id); + export_guid(__entry->host_id, &msg->host_service_id); __entry->ret = ret; ), TP_printk("sending guest_endpoint_id %pUl, host_service_id %pUl, " From f081bbb3fd03f949bcdc5aed95a827d7c65e0f30 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 22 Apr 2020 16:18:18 +0300 Subject: [PATCH 5/5] hyper-v: Remove internal types from UAPI header The uuid_le mistakenly comes to be an UAPI type. Since it's luckily not used by Hyper-V APIs, we may replace with POD types, i.e. __u8 array. Note, previously shared uuid_be had been removed from UAPI few releases ago. This is a continuation of that process towards removing uuid_le one. Note, there is no ABI change! Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200422131818.23088-1-andriy.shevchenko@linux.intel.com Signed-off-by: Wei Liu --- include/uapi/linux/hyperv.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/hyperv.h b/include/uapi/linux/hyperv.h index 991b2b7ada7a..8f24404ad04f 100644 --- a/include/uapi/linux/hyperv.h +++ b/include/uapi/linux/hyperv.h @@ -119,8 +119,8 @@ enum hv_fcopy_op { struct hv_fcopy_hdr { __u32 operation; - uuid_le service_id0; /* currently unused */ - uuid_le service_id1; /* currently unused */ + __u8 service_id0[16]; /* currently unused */ + __u8 service_id1[16]; /* currently unused */ } __attribute__((packed)); #define OVER_WRITE 0x1