bf09fb6cba
Remove support for context switching between the guest's and host's
desired UMWAIT_CONTROL. Propagating the guest's value to hardware isn't
required for correct functionality, e.g. KVM intercepts reads and writes
to the MSR, and the latency effects of the settings controlled by the
MSR are not architecturally visible.
As a general rule, KVM should not allow the guest to control power
management settings unless explicitly enabled by userspace, e.g. see
KVM_CAP_X86_DISABLE_EXITS. E.g. Intel's SDM explicitly states that C0.2
can improve the performance of SMT siblings. A devious guest could
disable C0.2 so as to improve the performance of their workloads to the
detriment of workloads running in the host or on other VMs.
Wholesale removal of UMWAIT_CONTROL context switching also fixes a race
condition where updates from the host may cause KVM to enter the guest
with the incorrect value. Because updates are propagated to all
CPUs via IPI (SMP function callback), the value in hardware may be
stale with respect to the cached value and KVM could enter the guest
with the wrong value in hardware. As above, the guest can't observe the
bad value, but it's a weird and confusing wart in the implementation.
Removal also fixes the unnecessary usage of VMX's atomic load/store MSR
lists. Using the lists is only necessary for MSRs that are required for
correct functionality immediately upon VM-Enter/VM-Exit, e.g. EFER on
old hardware, or for MSRs that need to-the-uop precision, e.g. perf
related MSRs. For UMWAIT_CONTROL, the effects are only visible in the
kernel via TPAUSE/delay(), and KVM doesn't do any form of delay in
vcpu_vmx_run(). Using the atomic lists is undesirable as they are more
expensive than direct RDMSR/WRMSR.
Furthermore, even if giving the guest control of the MSR is legitimate,
e.g. in pass-through scenarios, it's not clear that the benefits would
outweigh the overhead. E.g. saving and restoring an MSR across a VMX
roundtrip costs ~250 cycles, and if the guest diverged from the host
that cost would be paid on every run of the guest. In other words, if
there is a legitimate use case then it should be enabled by a new
per-VM capability.
Note, KVM still needs to emulate MSR_IA32_UMWAIT_CONTROL so that it can
correctly expose other WAITPKG features to the guest, e.g. TPAUSE,
UMWAIT and UMONITOR.
Fixes: 6e3ba4abce ("KVM: vmx: Emulate MSR IA32_UMWAIT_CONTROL")
Cc: stable@vger.kernel.org
Cc: Jingqi Liu <jingqi.liu@intel.com>
Cc: Tao Xu <tao3.xu@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200623005135.10414-1-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
239 lines
6.0 KiB
C
239 lines
6.0 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#include <linux/syscore_ops.h>
|
|
#include <linux/suspend.h>
|
|
#include <linux/cpu.h>
|
|
|
|
#include <asm/msr.h>
|
|
#include <asm/mwait.h>
|
|
|
|
/* C0.2-disable argument for UMWAIT_CTRL_VAL() that leaves C0.2 enabled. */
#define UMWAIT_C02_ENABLE	0

/*
 * Compose an IA32_UMWAIT_CONTROL value: the C0.2-disable bit taken from
 * @c02_disable combined with the maximum-time field taken from @max_time.
 * Each argument is masked to its own field before being OR'ed together.
 */
#define UMWAIT_CTRL_VAL(max_time, c02_disable)				\
	(((c02_disable) & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE) |	\
	 ((max_time) & MSR_IA32_UMWAIT_CONTROL_TIME_MASK))
|
|
|
|
/*
|
|
* Cache IA32_UMWAIT_CONTROL MSR. This is a systemwide control. By default,
|
|
* umwait max time is 100000 in TSC-quanta and C0.2 is enabled
|
|
*/
|
|
static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE);
|
|
|
|
/*
|
|
* Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by
|
|
* hardware or BIOS before kernel boot.
|
|
*/
|
|
static u32 orig_umwait_control_cached __ro_after_init;
|
|
|
|
/*
 * Held by the sysfs store handlers around their read-modify-write of
 * umwait_control_cached and the resulting IA32_UMWAIT_CONTROL MSR update,
 * so that concurrent sysfs writers are serialized.
 */
static DEFINE_MUTEX(umwait_lock);
|
|
|
|
static void umwait_update_control_msr(void * unused)
|
|
{
|
|
lockdep_assert_irqs_disabled();
|
|
wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
|
|
}
|
|
|
|
/*
|
|
* The CPU hotplug callback sets the control MSR to the global control
|
|
* value.
|
|
*
|
|
* Disable interrupts so the read of umwait_control_cached and the WRMSR
|
|
* are protected against a concurrent sysfs write. Otherwise the sysfs
|
|
* write could update the cached value after it had been read on this CPU
|
|
* and issue the IPI before the old value had been written. The IPI would
|
|
* interrupt, write the new value and after return from IPI the previous
|
|
* value would be written by this CPU.
|
|
*
|
|
* With interrupts disabled the upcoming CPU either sees the new control
|
|
* value or the IPI is updating this CPU to the new control value after
|
|
* interrupts have been reenabled.
|
|
*/
|
|
static int umwait_cpu_online(unsigned int cpu)
|
|
{
|
|
local_irq_disable();
|
|
umwait_update_control_msr(NULL);
|
|
local_irq_enable();
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* The CPU hotplug callback sets the control MSR to the original control
|
|
* value.
|
|
*/
|
|
static int umwait_cpu_offline(unsigned int cpu)
|
|
{
|
|
/*
|
|
* This code is protected by the CPU hotplug already and
|
|
* orig_umwait_control_cached is never changed after it caches
|
|
* the original control MSR value in umwait_init(). So there
|
|
* is no race condition here.
|
|
*/
|
|
wrmsr(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached, 0);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* On resume, restore IA32_UMWAIT_CONTROL MSR on the boot processor which
|
|
* is the only active CPU at this time. The MSR is set up on the APs via the
|
|
* CPU hotplug callback.
|
|
*
|
|
* This function is invoked on resume from suspend and hibernation. On
|
|
* resume from suspend the restore should be not required, but we neither
|
|
* trust the firmware nor does it matter if the same value is written
|
|
* again.
|
|
*/
|
|
static void umwait_syscore_resume(void)
|
|
{
|
|
umwait_update_control_msr(NULL);
|
|
}
|
|
|
|
static struct syscore_ops umwait_syscore_ops = {
|
|
.resume = umwait_syscore_resume,
|
|
};
|
|
|
|
/* sysfs interface */
|
|
|
|
/*
|
|
* When bit 0 in IA32_UMWAIT_CONTROL MSR is 1, C0.2 is disabled.
|
|
* Otherwise, C0.2 is enabled.
|
|
*/
|
|
static inline bool umwait_ctrl_c02_enabled(u32 ctrl)
|
|
{
|
|
return !(ctrl & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE);
|
|
}
|
|
|
|
/* Extract the maximum-time field from the control value @ctrl. */
static inline u32 umwait_ctrl_max_time(u32 ctrl)
{
	u32 max_time = ctrl & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;

	return max_time;
}
|
|
|
|
/*
 * Build a new control value from @maxtime (masked to the time field) and
 * @c02_enable, store it in umwait_control_cached, and then have every CPU
 * write it to its IA32_UMWAIT_CONTROL MSR, waiting for all of them to
 * finish.
 */
static inline void umwait_update_control(u32 maxtime, bool c02_enable)
{
	u32 new_ctrl = maxtime & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;

	/* The MSR bit has disable semantics, so set it when C0.2 is off. */
	new_ctrl |= c02_enable ? 0 : MSR_IA32_UMWAIT_CONTROL_C02_DISABLE;

	/* Publish the value first; the per-CPU callback reads it back. */
	WRITE_ONCE(umwait_control_cached, new_ctrl);
	on_each_cpu(umwait_update_control_msr, NULL, 1);
}
|
|
|
|
static ssize_t
|
|
enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf)
|
|
{
|
|
u32 ctrl = READ_ONCE(umwait_control_cached);
|
|
|
|
return sprintf(buf, "%d\n", umwait_ctrl_c02_enabled(ctrl));
|
|
}
|
|
|
|
/*
 * sysfs store handler for "enable_c02". Parses @buf as a boolean and, if
 * it differs from the currently cached C0.2 state, updates the cached
 * control value and propagates it to all CPUs. The max-time field of the
 * cached value is carried over unchanged.
 *
 * Returns @count on success or the kstrtobool() error on a parse failure.
 */
static ssize_t enable_c02_store(struct device *dev,
				struct device_attribute *attr,
				const char *buf, size_t count)
{
	bool c02_enable;
	u32 cur;
	int err = kstrtobool(buf, &c02_enable);

	if (err)
		return err;

	mutex_lock(&umwait_lock);

	/* Skip the cross-CPU update when nothing would change. */
	cur = READ_ONCE(umwait_control_cached);
	if (umwait_ctrl_c02_enabled(cur) != c02_enable)
		umwait_update_control(cur, c02_enable);

	mutex_unlock(&umwait_lock);

	return count;
}
static DEVICE_ATTR_RW(enable_c02);
|
|
|
|
static ssize_t
|
|
max_time_show(struct device *kobj, struct device_attribute *attr, char *buf)
|
|
{
|
|
u32 ctrl = READ_ONCE(umwait_control_cached);
|
|
|
|
return sprintf(buf, "%u\n", umwait_ctrl_max_time(ctrl));
|
|
}
|
|
|
|
/*
 * sysfs store handler for "max_time". Parses @buf as a u32 (base
 * auto-detected), rejects values that have bits set outside the time
 * field, and, if the value differs from the cached max time, updates the
 * cached control value and propagates it to all CPUs. The C0.2 state of
 * the cached value is carried over unchanged.
 *
 * Returns @count on success, the kstrtou32() error on a parse failure, or
 * -EINVAL when bits[1:0] of the value are not zero.
 */
static ssize_t max_time_store(struct device *kobj,
			      struct device_attribute *attr,
			      const char *buf, size_t count)
{
	u32 max_time, cur;
	int err = kstrtou32(buf, 0, &max_time);

	if (err)
		return err;

	/* bits[1:0] must be zero */
	if ((max_time & ~MSR_IA32_UMWAIT_CONTROL_TIME_MASK) != 0)
		return -EINVAL;

	mutex_lock(&umwait_lock);

	/* Skip the cross-CPU update when nothing would change. */
	cur = READ_ONCE(umwait_control_cached);
	if (umwait_ctrl_max_time(cur) != max_time)
		umwait_update_control(max_time, umwait_ctrl_c02_enabled(cur));

	mutex_unlock(&umwait_lock);

	return count;
}
static DEVICE_ATTR_RW(max_time);
|
|
|
|
static struct attribute *umwait_attrs[] = {
|
|
&dev_attr_enable_c02.attr,
|
|
&dev_attr_max_time.attr,
|
|
NULL
|
|
};
|
|
|
|
static struct attribute_group umwait_attr_group = {
|
|
.attrs = umwait_attrs,
|
|
.name = "umwait_control",
|
|
};
|
|
|
|
/*
 * Initcall that sets up umwait control.
 *
 * Returns -ENODEV when the boot CPU lacks X86_FEATURE_WAITPKG, a negative
 * error from cpuhp_setup_state() if registering the hotplug callbacks
 * fails, and otherwise the result of creating the sysfs attribute group.
 */
static int __init umwait_init(void)
{
	int ret;

	if (!boot_cpu_has(X86_FEATURE_WAITPKG))
		return -ENODEV;

	/*
	 * Capture the original control MSR value before anything changes
	 * it. This is the only place orig_umwait_control_cached is written.
	 */
	rdmsrl(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached);

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online",
				umwait_cpu_online, umwait_cpu_offline);
	/*
	 * On failure, the control MSR on all CPUs has the original control
	 * value.
	 */
	if (ret < 0)
		return ret;

	register_syscore_ops(&umwait_syscore_ops);

	/*
	 * Add the umwait control interface under the CPU subsystem root
	 * device. The result is returned as the initcall status, but
	 * nothing set up above is undone if this fails, so the default
	 * control values stay in effect.
	 */
	return sysfs_create_group(&cpu_subsys.dev_root->kobj,
				  &umwait_attr_group);
}
device_initcall(umwait_init);
|