linux/arch/x86/kvm/xen.c
Wei Wang 896046474f KVM: x86: Introduce kvm_x86_call() to simplify static calls of kvm_x86_ops
Introduces kvm_x86_call(), to streamline the usage of static calls of
kvm_x86_ops. The current implementation of these calls is verbose and
could lead to alignment challenges. This makes the code susceptible to
exceeding the "80 columns per single line of code" limit as defined in
the coding-style document. Another issue with the existing implementation
is that the addition of kvm_x86_ prefix to hooks at the static_call sites
hinders code readability and navigation. kvm_x86_call() is added to
improve code readability and maintainability, while adhering to the coding
style guidelines.

Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Link: https://lore.kernel.org/r/20240507133103.15052-3-wei.w.wang@intel.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-07-16 12:14:12 -04:00

2305 lines
63 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
* Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* KVM Xen emulation
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "x86.h"
#include "xen.h"
#include "hyperv.h"
#include "irq.h"
#include <linux/eventfd.h>
#include <linux/kvm_host.h>
#include <linux/sched/stat.h>
#include <trace/events/kvm.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/version.h>
#include <xen/interface/event_channel.h>
#include <xen/interface/sched.h>
#include <asm/xen/cpuid.h>
#include <asm/pvclock.h>
#include "cpuid.h"
#include "trace.h"
static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm);
static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r);
DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
static int kvm_xen_shared_info_init(struct kvm *kvm)
{
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
struct pvclock_wall_clock *wc;
u32 *wc_sec_hi;
u32 wc_version;
u64 wall_nsec;
int ret = 0;
int idx = srcu_read_lock(&kvm->srcu);
read_lock_irq(&gpc->lock);
while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
read_unlock_irq(&gpc->lock);
ret = kvm_gpc_refresh(gpc, PAGE_SIZE);
if (ret)
goto out;
read_lock_irq(&gpc->lock);
}
/*
* This code mirrors kvm_write_wall_clock() except that it writes
* directly through the pfn cache and doesn't mark the page dirty.
*/
wall_nsec = kvm_get_wall_clock_epoch(kvm);
/* Paranoia checks on the 32-bit struct layout */
BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
#ifdef CONFIG_X86_64
/* Paranoia checks on the 64-bit struct layout */
BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);
if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
struct shared_info *shinfo = gpc->khva;
wc_sec_hi = &shinfo->wc_sec_hi;
wc = &shinfo->wc;
} else
#endif
{
struct compat_shared_info *shinfo = gpc->khva;
wc_sec_hi = &shinfo->arch.wc_sec_hi;
wc = &shinfo->wc;
}
/* Increment and ensure an odd value */
wc_version = wc->version = (wc->version + 1) | 1;
smp_wmb();
wc->nsec = do_div(wall_nsec, NSEC_PER_SEC);
wc->sec = (u32)wall_nsec;
*wc_sec_hi = wall_nsec >> 32;
smp_wmb();
wc->version = wc_version + 1;
read_unlock_irq(&gpc->lock);
kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
out:
srcu_read_unlock(&kvm->srcu, idx);
return ret;
}
void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
{
if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) {
struct kvm_xen_evtchn e;
e.vcpu_id = vcpu->vcpu_id;
e.vcpu_idx = vcpu->vcpu_idx;
e.port = vcpu->arch.xen.timer_virq;
e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
kvm_xen_set_evtchn(&e, vcpu->kvm);
vcpu->arch.xen.timer_expires = 0;
atomic_set(&vcpu->arch.xen.timer_pending, 0);
}
}
static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
{
struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
arch.xen.timer);
struct kvm_xen_evtchn e;
int rc;
if (atomic_read(&vcpu->arch.xen.timer_pending))
return HRTIMER_NORESTART;
e.vcpu_id = vcpu->vcpu_id;
e.vcpu_idx = vcpu->vcpu_idx;
e.port = vcpu->arch.xen.timer_virq;
e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm);
if (rc != -EWOULDBLOCK) {
vcpu->arch.xen.timer_expires = 0;
return HRTIMER_NORESTART;
}
atomic_inc(&vcpu->arch.xen.timer_pending);
kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
kvm_vcpu_kick(vcpu);
return HRTIMER_NORESTART;
}
static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
bool linux_wa)
{
int64_t kernel_now, delta;
uint64_t guest_now;
/*
* The guest provides the requested timeout in absolute nanoseconds
* of the KVM clock — as *it* sees it, based on the scaled TSC and
* the pvclock information provided by KVM.
*
* The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW
* so use CLOCK_MONOTONIC. In the timescales covered by timers, the
* difference won't matter much as there is no cumulative effect.
*
* Calculate the time for some arbitrary point in time around "now"
* in terms of both kvmclock and CLOCK_MONOTONIC. Calculate the
* delta between the kvmclock "now" value and the guest's requested
* timeout, apply the "Linux workaround" described below, and add
* the resulting delta to the CLOCK_MONOTONIC "now" value, to get
* the absolute CLOCK_MONOTONIC time at which the timer should
* fire.
*/
if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock &&
static_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
uint64_t host_tsc, guest_tsc;
if (!IS_ENABLED(CONFIG_64BIT) ||
!kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
/*
* Don't fall back to get_kvmclock_ns() because it's
* broken; it has a systemic error in its results
* because it scales directly from host TSC to
* nanoseconds, and doesn't scale first to guest TSC
* and *then* to nanoseconds as the guest does.
*
* There is a small error introduced here because time
* continues to elapse between the ktime_get() and the
* subsequent rdtsc(). But not the systemic drift due
* to get_kvmclock_ns().
*/
kernel_now = ktime_get(); /* This is CLOCK_MONOTONIC */
host_tsc = rdtsc();
}
/* Calculate the guest kvmclock as the guest would do it. */
guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock,
guest_tsc);
} else {
/*
* Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
*
* Also if the guest PV clock hasn't been set up yet, as is
* likely to be the case during migration when the vCPU has
* not been run yet. It would be possible to calculate the
* scaling factors properly in that case but there's not much
* point in doing so. The get_kvmclock_ns() drift accumulates
* over time, so it's OK to use it at startup. Besides, on
* migration there's going to be a little bit of skew in the
* precise moment at which timers fire anyway. Often they'll
* be in the "past" by the time the VM is running again after
* migration.
*/
guest_now = get_kvmclock_ns(vcpu->kvm);
kernel_now = ktime_get();
}
delta = guest_abs - guest_now;
/*
* Xen has a 'Linux workaround' in do_set_timer_op() which checks for
* negative absolute timeout values (caused by integer overflow), and
* for values about 13 days in the future (2^50ns) which would be
* caused by jiffies overflow. For those cases, Xen sets the timeout
* 100ms in the future (not *too* soon, since if a guest really did
* set a long timeout on purpose we don't want to keep churning CPU
* time by waking it up). Emulate Xen's workaround when starting the
* timer in response to __HYPERVISOR_set_timer_op.
*/
if (linux_wa &&
unlikely((int64_t)guest_abs < 0 ||
(delta > 0 && (uint32_t) (delta >> 50) != 0))) {
delta = 100 * NSEC_PER_MSEC;
guest_abs = guest_now + delta;
}
/*
* Avoid races with the old timer firing. Checking timer_expires
* to avoid calling hrtimer_cancel() will only have false positives
* so is fine.
*/
if (vcpu->arch.xen.timer_expires)
hrtimer_cancel(&vcpu->arch.xen.timer);
atomic_set(&vcpu->arch.xen.timer_pending, 0);
vcpu->arch.xen.timer_expires = guest_abs;
if (delta <= 0)
xen_timer_callback(&vcpu->arch.xen.timer);
else
hrtimer_start(&vcpu->arch.xen.timer,
ktime_add_ns(kernel_now, delta),
HRTIMER_MODE_ABS_HARD);
}
static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
{
hrtimer_cancel(&vcpu->arch.xen.timer);
vcpu->arch.xen.timer_expires = 0;
atomic_set(&vcpu->arch.xen.timer_pending, 0);
}
static void kvm_xen_init_timer(struct kvm_vcpu *vcpu)
{
hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS_HARD);
vcpu->arch.xen.timer.function = xen_timer_callback;
}
static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
{
struct kvm_vcpu_xen *vx = &v->arch.xen;
struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
size_t user_len, user_len1, user_len2;
struct vcpu_runstate_info rs;
unsigned long flags;
size_t times_ofs;
uint8_t *update_bit = NULL;
uint64_t entry_time;
uint64_t *rs_times;
int *rs_state;
/*
* The only difference between 32-bit and 64-bit versions of the
* runstate struct is the alignment of uint64_t in 32-bit, which
* means that the 64-bit version has an additional 4 bytes of
* padding after the first field 'state'. Let's be really really
* paranoid about that, and matching it with our internal data
* structures that we memcpy into it...
*/
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
#ifdef CONFIG_X86_64
/*
* The 64-bit structure has 4 bytes of padding before 'state_entry_time'
* so each subsequent field is shifted by 4, and it's 4 bytes longer.
*/
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
offsetof(struct compat_vcpu_runstate_info, time) + 4);
BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4);
#endif
/*
* The state field is in the same place at the start of both structs,
* and is the same size (int) as vx->current_runstate.
*/
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
offsetof(struct compat_vcpu_runstate_info, state));
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
sizeof(vx->current_runstate));
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
sizeof(vx->current_runstate));
/*
* The state_entry_time field is 64 bits in both versions, and the
* XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86
* is little-endian means that it's in the last *byte* of the word.
* That detail is important later.
*/
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
sizeof(uint64_t));
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
sizeof(uint64_t));
BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80);
/*
* The time array is four 64-bit quantities in both versions, matching
* the vx->runstate_times and immediately following state_entry_time.
*/
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t));
BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t));
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
sizeof_field(struct compat_vcpu_runstate_info, time));
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
sizeof(vx->runstate_times));
if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
user_len = sizeof(struct vcpu_runstate_info);
times_ofs = offsetof(struct vcpu_runstate_info,
state_entry_time);
} else {
user_len = sizeof(struct compat_vcpu_runstate_info);
times_ofs = offsetof(struct compat_vcpu_runstate_info,
state_entry_time);
}
/*
* There are basically no alignment constraints. The guest can set it
* up so it crosses from one page to the next, and at arbitrary byte
* alignment (and the 32-bit ABI doesn't align the 64-bit integers
* anyway, even if the overall struct had been 64-bit aligned).
*/
if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
user_len2 = user_len - user_len1;
} else {
user_len1 = user_len;
user_len2 = 0;
}
BUG_ON(user_len1 + user_len2 != user_len);
retry:
/*
* Attempt to obtain the GPC lock on *both* (if there are two)
* gfn_to_pfn caches that cover the region.
*/
if (atomic) {
local_irq_save(flags);
if (!read_trylock(&gpc1->lock)) {
local_irq_restore(flags);
return;
}
} else {
read_lock_irqsave(&gpc1->lock, flags);
}
while (!kvm_gpc_check(gpc1, user_len1)) {
read_unlock_irqrestore(&gpc1->lock, flags);
/* When invoked from kvm_sched_out() we cannot sleep */
if (atomic)
return;
if (kvm_gpc_refresh(gpc1, user_len1))
return;
read_lock_irqsave(&gpc1->lock, flags);
}
if (likely(!user_len2)) {
/*
* Set up three pointers directly to the runstate_info
* struct in the guest (via the GPC).
*
* • @rs_state → state field
* • @rs_times → state_entry_time field.
* • @update_bit → last byte of state_entry_time, which
* contains the XEN_RUNSTATE_UPDATE bit.
*/
rs_state = gpc1->khva;
rs_times = gpc1->khva + times_ofs;
if (v->kvm->arch.xen.runstate_update_flag)
update_bit = ((void *)(&rs_times[1])) - 1;
} else {
/*
* The guest's runstate_info is split across two pages and we
* need to hold and validate both GPCs simultaneously. We can
* declare a lock ordering GPC1 > GPC2 because nothing else
* takes them more than one at a time. Set a subclass on the
* gpc1 lock to make lockdep shut up about it.
*/
lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
if (atomic) {
if (!read_trylock(&gpc2->lock)) {
read_unlock_irqrestore(&gpc1->lock, flags);
return;
}
} else {
read_lock(&gpc2->lock);
}
if (!kvm_gpc_check(gpc2, user_len2)) {
read_unlock(&gpc2->lock);
read_unlock_irqrestore(&gpc1->lock, flags);
/* When invoked from kvm_sched_out() we cannot sleep */
if (atomic)
return;
/*
* Use kvm_gpc_activate() here because if the runstate
* area was configured in 32-bit mode and only extends
* to the second page now because the guest changed to
* 64-bit mode, the second GPC won't have been set up.
*/
if (kvm_gpc_activate(gpc2, gpc1->gpa + user_len1,
user_len2))
return;
/*
* We dropped the lock on GPC1 so we have to go all the
* way back and revalidate that too.
*/
goto retry;
}
/*
* In this case, the runstate_info struct will be assembled on
* the kernel stack (compat or not as appropriate) and will
* be copied to GPC1/GPC2 with a dual memcpy. Set up the three
* rs pointers accordingly.
*/
rs_times = &rs.state_entry_time;
/*
* The rs_state pointer points to the start of what we'll
* copy to the guest, which in the case of a compat guest
* is the 32-bit field that the compiler thinks is padding.
*/
rs_state = ((void *)rs_times) - times_ofs;
/*
* The update_bit is still directly in the guest memory,
* via one GPC or the other.
*/
if (v->kvm->arch.xen.runstate_update_flag) {
if (user_len1 >= times_ofs + sizeof(uint64_t))
update_bit = gpc1->khva + times_ofs +
sizeof(uint64_t) - 1;
else
update_bit = gpc2->khva + times_ofs +
sizeof(uint64_t) - 1 - user_len1;
}
#ifdef CONFIG_X86_64
/*
* Don't leak kernel memory through the padding in the 64-bit
* version of the struct.
*/
memset(&rs, 0, offsetof(struct vcpu_runstate_info, state_entry_time));
#endif
}
/*
* First, set the XEN_RUNSTATE_UPDATE bit in the top bit of the
* state_entry_time field, directly in the guest. We need to set
* that (and write-barrier) before writing to the rest of the
* structure, and clear it last. Just as Xen does, we address the
* single *byte* in which it resides because it might be in a
* different cache line to the rest of the 64-bit word, due to
* the (lack of) alignment constraints.
*/
entry_time = vx->runstate_entry_time;
if (update_bit) {
entry_time |= XEN_RUNSTATE_UPDATE;
*update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56;
smp_wmb();
}
/*
* Now assemble the actual structure, either on our kernel stack
* or directly in the guest according to how the rs_state and
* rs_times pointers were set up above.
*/
*rs_state = vx->current_runstate;
rs_times[0] = entry_time;
memcpy(rs_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
/* For the split case, we have to then copy it to the guest. */
if (user_len2) {
memcpy(gpc1->khva, rs_state, user_len1);
memcpy(gpc2->khva, ((void *)rs_state) + user_len1, user_len2);
}
smp_wmb();
/* Finally, clear the XEN_RUNSTATE_UPDATE bit. */
if (update_bit) {
entry_time &= ~XEN_RUNSTATE_UPDATE;
*update_bit = entry_time >> 56;
smp_wmb();
}
if (user_len2) {
kvm_gpc_mark_dirty_in_slot(gpc2);
read_unlock(&gpc2->lock);
}
kvm_gpc_mark_dirty_in_slot(gpc1);
read_unlock_irqrestore(&gpc1->lock, flags);
}
void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
{
struct kvm_vcpu_xen *vx = &v->arch.xen;
u64 now = get_kvmclock_ns(v->kvm);
u64 delta_ns = now - vx->runstate_entry_time;
u64 run_delay = current->sched_info.run_delay;
if (unlikely(!vx->runstate_entry_time))
vx->current_runstate = RUNSTATE_offline;
/*
* Time waiting for the scheduler isn't "stolen" if the
* vCPU wasn't running anyway.
*/
if (vx->current_runstate == RUNSTATE_running) {
u64 steal_ns = run_delay - vx->last_steal;
delta_ns -= steal_ns;
vx->runstate_times[RUNSTATE_runnable] += steal_ns;
}
vx->last_steal = run_delay;
vx->runstate_times[vx->current_runstate] += delta_ns;
vx->current_runstate = state;
vx->runstate_entry_time = now;
if (vx->runstate_cache.active)
kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable);
}
void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
{
struct kvm_lapic_irq irq = { };
irq.dest_id = v->vcpu_id;
irq.vector = v->arch.xen.upcall_vector;
irq.dest_mode = APIC_DEST_PHYSICAL;
irq.shorthand = APIC_DEST_NOSHORT;
irq.delivery_mode = APIC_DM_FIXED;
irq.level = 1;
kvm_irq_delivery_to_apic(v->kvm, NULL, &irq, NULL);
}
/*
* On event channel delivery, the vcpu_info may not have been accessible.
* In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which
* need to be marked into the vcpu_info (and evtchn_upcall_pending set).
* Do so now that we can sleep in the context of the vCPU to bring the
* page in, and refresh the pfn cache for it.
*/
void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
{
unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
unsigned long flags;
if (!evtchn_pending_sel)
return;
/*
* Yes, this is an open-coded loop. But that's just what put_user()
* does anyway. Page it in and retry the instruction. We're just a
* little more honest about it.
*/
read_lock_irqsave(&gpc->lock, flags);
while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
read_unlock_irqrestore(&gpc->lock, flags);
if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info)))
return;
read_lock_irqsave(&gpc->lock, flags);
}
/* Now gpc->khva is a valid kernel address for the vcpu_info */
if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
struct vcpu_info *vi = gpc->khva;
asm volatile(LOCK_PREFIX "orq %0, %1\n"
"notq %0\n"
LOCK_PREFIX "andq %0, %2\n"
: "=r" (evtchn_pending_sel),
"+m" (vi->evtchn_pending_sel),
"+m" (v->arch.xen.evtchn_pending_sel)
: "0" (evtchn_pending_sel));
WRITE_ONCE(vi->evtchn_upcall_pending, 1);
} else {
u32 evtchn_pending_sel32 = evtchn_pending_sel;
struct compat_vcpu_info *vi = gpc->khva;
asm volatile(LOCK_PREFIX "orl %0, %1\n"
"notl %0\n"
LOCK_PREFIX "andl %0, %2\n"
: "=r" (evtchn_pending_sel32),
"+m" (vi->evtchn_pending_sel),
"+m" (v->arch.xen.evtchn_pending_sel)
: "0" (evtchn_pending_sel32));
WRITE_ONCE(vi->evtchn_upcall_pending, 1);
}
kvm_gpc_mark_dirty_in_slot(gpc);
read_unlock_irqrestore(&gpc->lock, flags);
/* For the per-vCPU lapic vector, deliver it as MSI. */
if (v->arch.xen.upcall_vector)
kvm_xen_inject_vcpu_vector(v);
}
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
{
struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
unsigned long flags;
u8 rc = 0;
/*
* If the global upcall vector (HVMIRQ_callback_vector) is set and
* the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
*/
/* No need for compat handling here */
BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
BUILD_BUG_ON(sizeof(rc) !=
sizeof_field(struct vcpu_info, evtchn_upcall_pending));
BUILD_BUG_ON(sizeof(rc) !=
sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
read_lock_irqsave(&gpc->lock, flags);
while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
read_unlock_irqrestore(&gpc->lock, flags);
/*
* This function gets called from kvm_vcpu_block() after setting the
* task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
* from a HLT. So we really mustn't sleep. If the page ended up absent
* at that point, just return 1 in order to trigger an immediate wake,
* and we'll end up getting called again from a context where we *can*
* fault in the page and wait for it.
*/
if (in_atomic() || !task_is_running(current))
return 1;
if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) {
/*
* If this failed, userspace has screwed up the
* vcpu_info mapping. No interrupts for you.
*/
return 0;
}
read_lock_irqsave(&gpc->lock, flags);
}
rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
read_unlock_irqrestore(&gpc->lock, flags);
return rc;
}
int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
{
int r = -ENOENT;
switch (data->type) {
case KVM_XEN_ATTR_TYPE_LONG_MODE:
if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
r = -EINVAL;
} else {
mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.long_mode = !!data->u.long_mode;
/*
* Re-initialize shared_info to put the wallclock in the
* correct place. Whilst it's not necessary to do this
* unless the mode is actually changed, it does no harm
* to make the call anyway.
*/
r = kvm->arch.xen.shinfo_cache.active ?
kvm_xen_shared_info_init(kvm) : 0;
mutex_unlock(&kvm->arch.xen.xen_lock);
}
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: {
int idx;
mutex_lock(&kvm->arch.xen.xen_lock);
idx = srcu_read_lock(&kvm->srcu);
if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) {
gfn_t gfn = data->u.shared_info.gfn;
if (gfn == KVM_XEN_INVALID_GFN) {
kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
r = 0;
} else {
r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache,
gfn_to_gpa(gfn), PAGE_SIZE);
}
} else {
void __user * hva = u64_to_user_ptr(data->u.shared_info.hva);
if (!PAGE_ALIGNED(hva)) {
r = -EINVAL;
} else if (!hva) {
kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
r = 0;
} else {
r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache,
(unsigned long)hva, PAGE_SIZE);
}
}
srcu_read_unlock(&kvm->srcu, idx);
if (!r && kvm->arch.xen.shinfo_cache.active)
r = kvm_xen_shared_info_init(kvm);
mutex_unlock(&kvm->arch.xen.xen_lock);
break;
}
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
if (data->u.vector && data->u.vector < 0x10)
r = -EINVAL;
else {
mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.upcall_vector = data->u.vector;
mutex_unlock(&kvm->arch.xen.xen_lock);
r = 0;
}
break;
case KVM_XEN_ATTR_TYPE_EVTCHN:
r = kvm_xen_setattr_evtchn(kvm, data);
break;
case KVM_XEN_ATTR_TYPE_XEN_VERSION:
mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.xen_version = data->u.xen_version;
mutex_unlock(&kvm->arch.xen.xen_lock);
r = 0;
break;
case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
if (!sched_info_on()) {
r = -EOPNOTSUPP;
break;
}
mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag;
mutex_unlock(&kvm->arch.xen.xen_lock);
r = 0;
break;
default:
break;
}
return r;
}
int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
{
int r = -ENOENT;
mutex_lock(&kvm->arch.xen.xen_lock);
switch (data->type) {
case KVM_XEN_ATTR_TYPE_LONG_MODE:
data->u.long_mode = kvm->arch.xen.long_mode;
r = 0;
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
if (kvm_gpc_is_gpa_active(&kvm->arch.xen.shinfo_cache))
data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
else
data->u.shared_info.gfn = KVM_XEN_INVALID_GFN;
r = 0;
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA:
if (kvm_gpc_is_hva_active(&kvm->arch.xen.shinfo_cache))
data->u.shared_info.hva = kvm->arch.xen.shinfo_cache.uhva;
else
data->u.shared_info.hva = 0;
r = 0;
break;
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
data->u.vector = kvm->arch.xen.upcall_vector;
r = 0;
break;
case KVM_XEN_ATTR_TYPE_XEN_VERSION:
data->u.xen_version = kvm->arch.xen.xen_version;
r = 0;
break;
case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
if (!sched_info_on()) {
r = -EOPNOTSUPP;
break;
}
data->u.runstate_update_flag = kvm->arch.xen.runstate_update_flag;
r = 0;
break;
default:
break;
}
mutex_unlock(&kvm->arch.xen.xen_lock);
return r;
}
int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
{
int idx, r = -ENOENT;
mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
idx = srcu_read_lock(&vcpu->kvm->srcu);
switch (data->type) {
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
/* No compat necessary here. */
BUILD_BUG_ON(sizeof(struct vcpu_info) !=
sizeof(struct compat_vcpu_info));
BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
offsetof(struct compat_vcpu_info, time));
if (data->type == KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO) {
if (data->u.gpa == KVM_XEN_INVALID_GPA) {
kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
r = 0;
break;
}
r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
data->u.gpa, sizeof(struct vcpu_info));
} else {
if (data->u.hva == 0) {
kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
r = 0;
break;
}
r = kvm_gpc_activate_hva(&vcpu->arch.xen.vcpu_info_cache,
data->u.hva, sizeof(struct vcpu_info));
}
if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
break;
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
if (data->u.gpa == KVM_XEN_INVALID_GPA) {
kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);
r = 0;
break;
}
r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_time_info_cache,
data->u.gpa,
sizeof(struct pvclock_vcpu_time_info));
if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
break;
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: {
size_t sz, sz1, sz2;
if (!sched_info_on()) {
r = -EOPNOTSUPP;
break;
}
if (data->u.gpa == KVM_XEN_INVALID_GPA) {
r = 0;
deactivate_out:
kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
break;
}
/*
* If the guest switches to 64-bit mode after setting the runstate
* address, that's actually OK. kvm_xen_update_runstate_guest()
* will cope.
*/
if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode)
sz = sizeof(struct vcpu_runstate_info);
else
sz = sizeof(struct compat_vcpu_runstate_info);
/* How much fits in the (first) page? */
sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK);
r = kvm_gpc_activate(&vcpu->arch.xen.runstate_cache,
data->u.gpa, sz1);
if (r)
goto deactivate_out;
/* Either map the second page, or deactivate the second GPC */
if (sz1 >= sz) {
kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
} else {
sz2 = sz - sz1;
BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK);
r = kvm_gpc_activate(&vcpu->arch.xen.runstate2_cache,
data->u.gpa + sz1, sz2);
if (r)
goto deactivate_out;
}
kvm_xen_update_runstate_guest(vcpu, false);
break;
}
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
if (!sched_info_on()) {
r = -EOPNOTSUPP;
break;
}
if (data->u.runstate.state > RUNSTATE_offline) {
r = -EINVAL;
break;
}
kvm_xen_update_runstate(vcpu, data->u.runstate.state);
r = 0;
break;
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
if (!sched_info_on()) {
r = -EOPNOTSUPP;
break;
}
if (data->u.runstate.state > RUNSTATE_offline) {
r = -EINVAL;
break;
}
if (data->u.runstate.state_entry_time !=
(data->u.runstate.time_running +
data->u.runstate.time_runnable +
data->u.runstate.time_blocked +
data->u.runstate.time_offline)) {
r = -EINVAL;
break;
}
if (get_kvmclock_ns(vcpu->kvm) <
data->u.runstate.state_entry_time) {
r = -EINVAL;
break;
}
vcpu->arch.xen.current_runstate = data->u.runstate.state;
vcpu->arch.xen.runstate_entry_time =
data->u.runstate.state_entry_time;
vcpu->arch.xen.runstate_times[RUNSTATE_running] =
data->u.runstate.time_running;
vcpu->arch.xen.runstate_times[RUNSTATE_runnable] =
data->u.runstate.time_runnable;
vcpu->arch.xen.runstate_times[RUNSTATE_blocked] =
data->u.runstate.time_blocked;
vcpu->arch.xen.runstate_times[RUNSTATE_offline] =
data->u.runstate.time_offline;
vcpu->arch.xen.last_steal = current->sched_info.run_delay;
r = 0;
break;
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
if (!sched_info_on()) {
r = -EOPNOTSUPP;
break;
}
if (data->u.runstate.state > RUNSTATE_offline &&
data->u.runstate.state != (u64)-1) {
r = -EINVAL;
break;
}
/* The adjustment must add up */
if (data->u.runstate.state_entry_time !=
(data->u.runstate.time_running +
data->u.runstate.time_runnable +
data->u.runstate.time_blocked +
data->u.runstate.time_offline)) {
r = -EINVAL;
break;
}
if (get_kvmclock_ns(vcpu->kvm) <
(vcpu->arch.xen.runstate_entry_time +
data->u.runstate.state_entry_time)) {
r = -EINVAL;
break;
}
vcpu->arch.xen.runstate_entry_time +=
data->u.runstate.state_entry_time;
vcpu->arch.xen.runstate_times[RUNSTATE_running] +=
data->u.runstate.time_running;
vcpu->arch.xen.runstate_times[RUNSTATE_runnable] +=
data->u.runstate.time_runnable;
vcpu->arch.xen.runstate_times[RUNSTATE_blocked] +=
data->u.runstate.time_blocked;
vcpu->arch.xen.runstate_times[RUNSTATE_offline] +=
data->u.runstate.time_offline;
if (data->u.runstate.state <= RUNSTATE_offline)
kvm_xen_update_runstate(vcpu, data->u.runstate.state);
else if (vcpu->arch.xen.runstate_cache.active)
kvm_xen_update_runstate_guest(vcpu, false);
r = 0;
break;
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
if (data->u.vcpu_id >= KVM_MAX_VCPUS)
r = -EINVAL;
else {
vcpu->arch.xen.vcpu_id = data->u.vcpu_id;
r = 0;
}
break;
case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
if (data->u.timer.port &&
data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) {
r = -EINVAL;
break;
}
if (!vcpu->arch.xen.timer.function)
kvm_xen_init_timer(vcpu);
/* Stop the timer (if it's running) before changing the vector */
kvm_xen_stop_timer(vcpu);
vcpu->arch.xen.timer_virq = data->u.timer.port;
/* Start the timer if the new value has a valid vector+expiry. */
if (data->u.timer.port && data->u.timer.expires_ns)
kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, false);
r = 0;
break;
case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
if (data->u.vector && data->u.vector < 0x10)
r = -EINVAL;
else {
vcpu->arch.xen.upcall_vector = data->u.vector;
r = 0;
}
break;
default:
break;
}
srcu_read_unlock(&vcpu->kvm->srcu, idx);
mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
return r;
}
int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
{
int r = -ENOENT;
mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
switch (data->type) {
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
if (kvm_gpc_is_gpa_active(&vcpu->arch.xen.vcpu_info_cache))
data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
else
data->u.gpa = KVM_XEN_INVALID_GPA;
r = 0;
break;
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
if (kvm_gpc_is_hva_active(&vcpu->arch.xen.vcpu_info_cache))
data->u.hva = vcpu->arch.xen.vcpu_info_cache.uhva;
else
data->u.hva = 0;
r = 0;
break;
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
if (vcpu->arch.xen.vcpu_time_info_cache.active)
data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
else
data->u.gpa = KVM_XEN_INVALID_GPA;
r = 0;
break;
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
if (!sched_info_on()) {
r = -EOPNOTSUPP;
break;
}
if (vcpu->arch.xen.runstate_cache.active) {
data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
r = 0;
}
break;
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
if (!sched_info_on()) {
r = -EOPNOTSUPP;
break;
}
data->u.runstate.state = vcpu->arch.xen.current_runstate;
r = 0;
break;
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
if (!sched_info_on()) {
r = -EOPNOTSUPP;
break;
}
data->u.runstate.state = vcpu->arch.xen.current_runstate;
data->u.runstate.state_entry_time =
vcpu->arch.xen.runstate_entry_time;
data->u.runstate.time_running =
vcpu->arch.xen.runstate_times[RUNSTATE_running];
data->u.runstate.time_runnable =
vcpu->arch.xen.runstate_times[RUNSTATE_runnable];
data->u.runstate.time_blocked =
vcpu->arch.xen.runstate_times[RUNSTATE_blocked];
data->u.runstate.time_offline =
vcpu->arch.xen.runstate_times[RUNSTATE_offline];
r = 0;
break;
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
r = -EINVAL;
break;
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
data->u.vcpu_id = vcpu->arch.xen.vcpu_id;
r = 0;
break;
case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
/*
* Ensure a consistent snapshot of state is captured, with a
* timer either being pending, or the event channel delivered
* to the corresponding bit in the shared_info. Not still
* lurking in the timer_pending flag for deferred delivery.
* Purely as an optimisation, if the timer_expires field is
* zero, that means the timer isn't active (or even in the
* timer_pending flag) and there is no need to cancel it.
*/
if (vcpu->arch.xen.timer_expires) {
hrtimer_cancel(&vcpu->arch.xen.timer);
kvm_xen_inject_timer_irqs(vcpu);
}
data->u.timer.port = vcpu->arch.xen.timer_virq;
data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
data->u.timer.expires_ns = vcpu->arch.xen.timer_expires;
/*
* The hrtimer may trigger and raise the IRQ immediately,
* while the returned state causes it to be set up and
* raised again on the destination system after migration.
* That's fine, as the guest won't even have had a chance
* to run and handle the interrupt. Asserting an already
* pending event channel is idempotent.
*/
if (vcpu->arch.xen.timer_expires)
hrtimer_start_expires(&vcpu->arch.xen.timer,
HRTIMER_MODE_ABS_HARD);
r = 0;
break;
case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
data->u.vector = vcpu->arch.xen.upcall_vector;
r = 0;
break;
default:
break;
}
mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
return r;
}
int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm *kvm = vcpu->kvm;
u32 page_num = data & ~PAGE_MASK;
u64 page_addr = data & PAGE_MASK;
bool lm = is_long_mode(vcpu);
int r = 0;
mutex_lock(&kvm->arch.xen.xen_lock);
if (kvm->arch.xen.long_mode != lm) {
kvm->arch.xen.long_mode = lm;
/*
* Re-initialize shared_info to put the wallclock in the
* correct place.
*/
if (kvm->arch.xen.shinfo_cache.active &&
kvm_xen_shared_info_init(kvm))
r = 1;
}
mutex_unlock(&kvm->arch.xen.xen_lock);
if (r)
return r;
/*
* If Xen hypercall intercept is enabled, fill the hypercall
* page with VMCALL/VMMCALL instructions since that's what
* we catch. Else the VMM has provided the hypercall pages
* with instructions of its own choosing, so use those.
*/
if (kvm_xen_hypercall_enabled(kvm)) {
u8 instructions[32];
int i;
if (page_num)
return 1;
/* mov imm32, %eax */
instructions[0] = 0xb8;
/* vmcall / vmmcall */
kvm_x86_call(patch_hypercall)(vcpu, instructions + 5);
/* ret */
instructions[8] = 0xc3;
/* int3 to pad */
memset(instructions + 9, 0xcc, sizeof(instructions) - 9);
for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
*(u32 *)&instructions[1] = i;
if (kvm_vcpu_write_guest(vcpu,
page_addr + (i * sizeof(instructions)),
instructions, sizeof(instructions)))
return 1;
}
} else {
/*
* Note, truncation is a non-issue as 'lm' is guaranteed to be
* false for a 32-bit kernel, i.e. when hva_t is only 4 bytes.
*/
hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64
: kvm->arch.xen_hvm_config.blob_addr_32;
u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
: kvm->arch.xen_hvm_config.blob_size_32;
u8 *page;
int ret;
if (page_num >= blob_size)
return 1;
blob_addr += page_num * PAGE_SIZE;
page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE);
if (IS_ERR(page))
return PTR_ERR(page);
ret = kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE);
kfree(page);
if (ret)
return 1;
}
return 0;
}
int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
{
/* Only some feature flags need to be *enabled* by userspace */
u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
u32 old_flags;
if (xhc->flags & ~permitted_flags)
return -EINVAL;
/*
* With hypercall interception the kernel generates its own
* hypercall page so it must not be provided.
*/
if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
(xhc->blob_addr_32 || xhc->blob_addr_64 ||
xhc->blob_size_32 || xhc->blob_size_64))
return -EINVAL;
mutex_lock(&kvm->arch.xen.xen_lock);
if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
static_branch_inc(&kvm_xen_enabled.key);
else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
static_branch_slow_dec_deferred(&kvm_xen_enabled);
old_flags = kvm->arch.xen_hvm_config.flags;
memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
mutex_unlock(&kvm->arch.xen.xen_lock);
if ((old_flags ^ xhc->flags) & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
return 0;
}
static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
{
kvm_rax_write(vcpu, result);
return kvm_skip_emulated_instruction(vcpu);
}
static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip)))
return 1;
return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
}
static inline int max_evtchn_port(struct kvm *kvm)
{
if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
return EVTCHN_2L_NR_CHANNELS;
else
return COMPAT_EVTCHN_2L_NR_CHANNELS;
}
static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
evtchn_port_t *ports)
{
struct kvm *kvm = vcpu->kvm;
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
unsigned long *pending_bits;
unsigned long flags;
bool ret = true;
int idx, i;
idx = srcu_read_lock(&kvm->srcu);
read_lock_irqsave(&gpc->lock, flags);
if (!kvm_gpc_check(gpc, PAGE_SIZE))
goto out_rcu;
ret = false;
if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
struct shared_info *shinfo = gpc->khva;
pending_bits = (unsigned long *)&shinfo->evtchn_pending;
} else {
struct compat_shared_info *shinfo = gpc->khva;
pending_bits = (unsigned long *)&shinfo->evtchn_pending;
}
for (i = 0; i < nr_ports; i++) {
if (test_bit(ports[i], pending_bits)) {
ret = true;
break;
}
}
out_rcu:
read_unlock_irqrestore(&gpc->lock, flags);
srcu_read_unlock(&kvm->srcu, idx);
return ret;
}
static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
u64 param, u64 *r)
{
struct sched_poll sched_poll;
evtchn_port_t port, *ports;
struct x86_exception e;
int i;
if (!lapic_in_kernel(vcpu) ||
!(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
return false;
if (IS_ENABLED(CONFIG_64BIT) && !longmode) {
struct compat_sched_poll sp32;
/* Sanity check that the compat struct definition is correct */
BUILD_BUG_ON(sizeof(sp32) != 16);
if (kvm_read_guest_virt(vcpu, param, &sp32, sizeof(sp32), &e)) {
*r = -EFAULT;
return true;
}
/*
* This is a 32-bit pointer to an array of evtchn_port_t which
* are uint32_t, so once it's converted no further compat
* handling is needed.
*/
sched_poll.ports = (void *)(unsigned long)(sp32.ports);
sched_poll.nr_ports = sp32.nr_ports;
sched_poll.timeout = sp32.timeout;
} else {
if (kvm_read_guest_virt(vcpu, param, &sched_poll,
sizeof(sched_poll), &e)) {
*r = -EFAULT;
return true;
}
}
if (unlikely(sched_poll.nr_ports > 1)) {
/* Xen (unofficially) limits number of pollers to 128 */
if (sched_poll.nr_ports > 128) {
*r = -EINVAL;
return true;
}
ports = kmalloc_array(sched_poll.nr_ports,
sizeof(*ports), GFP_KERNEL);
if (!ports) {
*r = -ENOMEM;
return true;
}
} else
ports = &port;
if (kvm_read_guest_virt(vcpu, (gva_t)sched_poll.ports, ports,
sched_poll.nr_ports * sizeof(*ports), &e)) {
*r = -EFAULT;
return true;
}
for (i = 0; i < sched_poll.nr_ports; i++) {
if (ports[i] >= max_evtchn_port(vcpu->kvm)) {
*r = -EINVAL;
goto out;
}
}
if (sched_poll.nr_ports == 1)
vcpu->arch.xen.poll_evtchn = port;
else
vcpu->arch.xen.poll_evtchn = -1;
set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
if (sched_poll.timeout)
mod_timer(&vcpu->arch.xen.poll_timer,
jiffies + nsecs_to_jiffies(sched_poll.timeout));
kvm_vcpu_halt(vcpu);
if (sched_poll.timeout)
del_timer(&vcpu->arch.xen.poll_timer);
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
vcpu->arch.xen.poll_evtchn = 0;
*r = 0;
out:
/* Really, this is only needed in case of timeout */
clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
if (unlikely(sched_poll.nr_ports > 1))
kfree(ports);
return true;
}
static void cancel_evtchn_poll(struct timer_list *t)
{
struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer);
kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
kvm_vcpu_kick(vcpu);
}
static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode,
int cmd, u64 param, u64 *r)
{
switch (cmd) {
case SCHEDOP_poll:
if (kvm_xen_schedop_poll(vcpu, longmode, param, r))
return true;
fallthrough;
case SCHEDOP_yield:
kvm_vcpu_on_spin(vcpu, true);
*r = 0;
return true;
default:
break;
}
return false;
}
struct compat_vcpu_set_singleshot_timer {
uint64_t timeout_abs_ns;
uint32_t flags;
} __attribute__((packed));
static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
int vcpu_id, u64 param, u64 *r)
{
struct vcpu_set_singleshot_timer oneshot;
struct x86_exception e;
if (!kvm_xen_timer_enabled(vcpu))
return false;
switch (cmd) {
case VCPUOP_set_singleshot_timer:
if (vcpu->arch.xen.vcpu_id != vcpu_id) {
*r = -EINVAL;
return true;
}
/*
* The only difference for 32-bit compat is the 4 bytes of
* padding after the interesting part of the structure. So
* for a faithful emulation of Xen we have to *try* to copy
* the padding and return -EFAULT if we can't. Otherwise we
* might as well just have copied the 12-byte 32-bit struct.
*/
BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns));
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns));
BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) !=
offsetof(struct vcpu_set_singleshot_timer, flags));
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) !=
sizeof_field(struct vcpu_set_singleshot_timer, flags));
if (kvm_read_guest_virt(vcpu, param, &oneshot, longmode ? sizeof(oneshot) :
sizeof(struct compat_vcpu_set_singleshot_timer), &e)) {
*r = -EFAULT;
return true;
}
kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, false);
*r = 0;
return true;
case VCPUOP_stop_singleshot_timer:
if (vcpu->arch.xen.vcpu_id != vcpu_id) {
*r = -EINVAL;
return true;
}
kvm_xen_stop_timer(vcpu);
*r = 0;
return true;
}
return false;
}
static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
u64 *r)
{
if (!kvm_xen_timer_enabled(vcpu))
return false;
if (timeout)
kvm_xen_start_timer(vcpu, timeout, true);
else
kvm_xen_stop_timer(vcpu);
*r = 0;
return true;
}
int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
{
bool longmode;
u64 input, params[6], r = -ENOSYS;
bool handled = false;
u8 cpl;
input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
/* Hyper-V hypercalls get bit 31 set in EAX */
if ((input & 0x80000000) &&
kvm_hv_hypercall_enabled(vcpu))
return kvm_hv_hypercall(vcpu);
longmode = is_64_bit_hypercall(vcpu);
if (!longmode) {
params[0] = (u32)kvm_rbx_read(vcpu);
params[1] = (u32)kvm_rcx_read(vcpu);
params[2] = (u32)kvm_rdx_read(vcpu);
params[3] = (u32)kvm_rsi_read(vcpu);
params[4] = (u32)kvm_rdi_read(vcpu);
params[5] = (u32)kvm_rbp_read(vcpu);
}
#ifdef CONFIG_X86_64
else {
params[0] = (u64)kvm_rdi_read(vcpu);
params[1] = (u64)kvm_rsi_read(vcpu);
params[2] = (u64)kvm_rdx_read(vcpu);
params[3] = (u64)kvm_r10_read(vcpu);
params[4] = (u64)kvm_r8_read(vcpu);
params[5] = (u64)kvm_r9_read(vcpu);
}
#endif
cpl = kvm_x86_call(get_cpl)(vcpu);
trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2],
params[3], params[4], params[5]);
/*
* Only allow hypercall acceleration for CPL0. The rare hypercalls that
* are permitted in guest userspace can be handled by the VMM.
*/
if (unlikely(cpl > 0))
goto handle_in_userspace;
switch (input) {
case __HYPERVISOR_xen_version:
if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) {
r = vcpu->kvm->arch.xen.xen_version;
handled = true;
}
break;
case __HYPERVISOR_event_channel_op:
if (params[0] == EVTCHNOP_send)
handled = kvm_xen_hcall_evtchn_send(vcpu, params[1], &r);
break;
case __HYPERVISOR_sched_op:
handled = kvm_xen_hcall_sched_op(vcpu, longmode, params[0],
params[1], &r);
break;
case __HYPERVISOR_vcpu_op:
handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, params[0], params[1],
params[2], &r);
break;
case __HYPERVISOR_set_timer_op: {
u64 timeout = params[0];
/* In 32-bit mode, the 64-bit timeout is in two 32-bit params. */
if (!longmode)
timeout |= params[1] << 32;
handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r);
break;
}
default:
break;
}
if (handled)
return kvm_xen_hypercall_set_result(vcpu, r);
handle_in_userspace:
vcpu->run->exit_reason = KVM_EXIT_XEN;
vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
vcpu->run->xen.u.hcall.longmode = longmode;
vcpu->run->xen.u.hcall.cpl = cpl;
vcpu->run->xen.u.hcall.input = input;
vcpu->run->xen.u.hcall.params[0] = params[0];
vcpu->run->xen.u.hcall.params[1] = params[1];
vcpu->run->xen.u.hcall.params[2] = params[2];
vcpu->run->xen.u.hcall.params[3] = params[3];
vcpu->run->xen.u.hcall.params[4] = params[4];
vcpu->run->xen.u.hcall.params[5] = params[5];
vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io =
kvm_xen_hypercall_complete_userspace;
return 0;
}
static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
{
int poll_evtchn = vcpu->arch.xen.poll_evtchn;
if ((poll_evtchn == port || poll_evtchn == -1) &&
test_and_clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask)) {
kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
kvm_vcpu_kick(vcpu);
}
}
/*
* The return value from this function is propagated to kvm_set_irq() API,
* so it returns:
* < 0 Interrupt was ignored (masked or not delivered for other reasons)
* = 0 Interrupt was coalesced (previous irq is still pending)
* > 0 Number of CPUs interrupt was delivered to
*
* It is also called directly from kvm_arch_set_irq_inatomic(), where the
* only check on its return value is a comparison with -EWOULDBLOCK'.
*/
int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
{
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
struct kvm_vcpu *vcpu;
unsigned long *pending_bits, *mask_bits;
unsigned long flags;
int port_word_bit;
bool kick_vcpu = false;
int vcpu_idx, idx, rc;
vcpu_idx = READ_ONCE(xe->vcpu_idx);
if (vcpu_idx >= 0)
vcpu = kvm_get_vcpu(kvm, vcpu_idx);
else {
vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id);
if (!vcpu)
return -EINVAL;
WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx);
}
if (xe->port >= max_evtchn_port(kvm))
return -EINVAL;
rc = -EWOULDBLOCK;
idx = srcu_read_lock(&kvm->srcu);
read_lock_irqsave(&gpc->lock, flags);
if (!kvm_gpc_check(gpc, PAGE_SIZE))
goto out_rcu;
if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
struct shared_info *shinfo = gpc->khva;
pending_bits = (unsigned long *)&shinfo->evtchn_pending;
mask_bits = (unsigned long *)&shinfo->evtchn_mask;
port_word_bit = xe->port / 64;
} else {
struct compat_shared_info *shinfo = gpc->khva;
pending_bits = (unsigned long *)&shinfo->evtchn_pending;
mask_bits = (unsigned long *)&shinfo->evtchn_mask;
port_word_bit = xe->port / 32;
}
/*
* If this port wasn't already set, and if it isn't masked, then
* we try to set the corresponding bit in the in-kernel shadow of
* evtchn_pending_sel for the target vCPU. And if *that* wasn't
* already set, then we kick the vCPU in question to write to the
* *real* evtchn_pending_sel in its own guest vcpu_info struct.
*/
if (test_and_set_bit(xe->port, pending_bits)) {
rc = 0; /* It was already raised */
} else if (test_bit(xe->port, mask_bits)) {
rc = -ENOTCONN; /* Masked */
kvm_xen_check_poller(vcpu, xe->port);
} else {
rc = 1; /* Delivered to the bitmap in shared_info. */
/* Now switch to the vCPU's vcpu_info to set the index and pending_sel */
read_unlock_irqrestore(&gpc->lock, flags);
gpc = &vcpu->arch.xen.vcpu_info_cache;
read_lock_irqsave(&gpc->lock, flags);
if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
/*
* Could not access the vcpu_info. Set the bit in-kernel
* and prod the vCPU to deliver it for itself.
*/
if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
kick_vcpu = true;
goto out_rcu;
}
if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
struct vcpu_info *vcpu_info = gpc->khva;
if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) {
WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
kick_vcpu = true;
}
} else {
struct compat_vcpu_info *vcpu_info = gpc->khva;
if (!test_and_set_bit(port_word_bit,
(unsigned long *)&vcpu_info->evtchn_pending_sel)) {
WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
kick_vcpu = true;
}
}
/* For the per-vCPU lapic vector, deliver it as MSI. */
if (kick_vcpu && vcpu->arch.xen.upcall_vector) {
kvm_xen_inject_vcpu_vector(vcpu);
kick_vcpu = false;
}
}
out_rcu:
read_unlock_irqrestore(&gpc->lock, flags);
srcu_read_unlock(&kvm->srcu, idx);
if (kick_vcpu) {
kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
kvm_vcpu_kick(vcpu);
}
return rc;
}
static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
{
bool mm_borrowed = false;
int rc;
rc = kvm_xen_set_evtchn_fast(xe, kvm);
if (rc != -EWOULDBLOCK)
return rc;
if (current->mm != kvm->mm) {
/*
* If not on a thread which already belongs to this KVM,
* we'd better be in the irqfd workqueue.
*/
if (WARN_ON_ONCE(current->mm))
return -EINVAL;
kthread_use_mm(kvm->mm);
mm_borrowed = true;
}
/*
* It is theoretically possible for the page to be unmapped
* and the MMU notifier to invalidate the shared_info before
* we even get to use it. In that case, this looks like an
* infinite loop. It was tempting to do it via the userspace
* HVA instead... but that just *hides* the fact that it's
* an infinite loop, because if a fault occurs and it waits
* for the page to come back, it can *still* immediately
* fault and have to wait again, repeatedly.
*
* Conversely, the page could also have been reinstated by
* another thread before we even obtain the mutex above, so
* check again *first* before remapping it.
*/
do {
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
int idx;
rc = kvm_xen_set_evtchn_fast(xe, kvm);
if (rc != -EWOULDBLOCK)
break;
idx = srcu_read_lock(&kvm->srcu);
rc = kvm_gpc_refresh(gpc, PAGE_SIZE);
srcu_read_unlock(&kvm->srcu, idx);
} while(!rc);
if (mm_borrowed)
kthread_unuse_mm(kvm->mm);
return rc;
}
/* This is the version called from kvm_set_irq() as the .set function */
static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
int irq_source_id, int level, bool line_status)
{
if (!level)
return -EINVAL;
return kvm_xen_set_evtchn(&e->xen_evtchn, kvm);
}
/*
* Set up an event channel interrupt from the KVM IRQ routing table.
* Used for e.g. PIRQ from passed through physical devices.
*/
int kvm_xen_setup_evtchn(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
struct kvm_vcpu *vcpu;
if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
return -EINVAL;
/* We only support 2 level event channels for now */
if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
return -EINVAL;
/*
* Xen gives us interesting mappings from vCPU index to APIC ID,
* which means kvm_get_vcpu_by_id() has to iterate over all vCPUs
* to find it. Do that once at setup time, instead of every time.
* But beware that on live update / live migration, the routing
* table might be reinstated before the vCPU threads have finished
* recreating their vCPUs.
*/
vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
if (vcpu)
e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx;
else
e->xen_evtchn.vcpu_idx = -1;
e->xen_evtchn.port = ue->u.xen_evtchn.port;
e->xen_evtchn.vcpu_id = ue->u.xen_evtchn.vcpu;
e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
e->set = evtchn_set_fn;
return 0;
}
/*
* Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl.
*/
int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *uxe)
{
struct kvm_xen_evtchn e;
int ret;
if (!uxe->port || uxe->port >= max_evtchn_port(kvm))
return -EINVAL;
/* We only support 2 level event channels for now */
if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
return -EINVAL;
e.port = uxe->port;
e.vcpu_id = uxe->vcpu;
e.vcpu_idx = -1;
e.priority = uxe->priority;
ret = kvm_xen_set_evtchn(&e, kvm);
/*
* None of that 'return 1 if it actually got delivered' nonsense.
* We don't care if it was masked (-ENOTCONN) either.
*/
if (ret > 0 || ret == -ENOTCONN)
ret = 0;
return ret;
}
/*
* Support for *outbound* event channel events via the EVTCHNOP_send hypercall.
*/
struct evtchnfd {
u32 send_port;
u32 type;
union {
struct kvm_xen_evtchn port;
struct {
u32 port; /* zero */
struct eventfd_ctx *ctx;
} eventfd;
} deliver;
};
/*
* Update target vCPU or priority for a registered sending channel.
*/
static int kvm_xen_eventfd_update(struct kvm *kvm,
struct kvm_xen_hvm_attr *data)
{
u32 port = data->u.evtchn.send_port;
struct evtchnfd *evtchnfd;
int ret;
/* Protect writes to evtchnfd as well as the idr lookup. */
mutex_lock(&kvm->arch.xen.xen_lock);
evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port);
ret = -ENOENT;
if (!evtchnfd)
goto out_unlock;
/* For an UPDATE, nothing may change except the priority/vcpu */
ret = -EINVAL;
if (evtchnfd->type != data->u.evtchn.type)
goto out_unlock;
/*
* Port cannot change, and if it's zero that was an eventfd
* which can't be changed either.
*/
if (!evtchnfd->deliver.port.port ||
evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port)
goto out_unlock;
/* We only support 2 level event channels for now */
if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
goto out_unlock;
evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) {
evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
evtchnfd->deliver.port.vcpu_idx = -1;
}
ret = 0;
out_unlock:
mutex_unlock(&kvm->arch.xen.xen_lock);
return ret;
}
/*
* Configure the target (eventfd or local port delivery) for sending on
* a given event channel.
*/
static int kvm_xen_eventfd_assign(struct kvm *kvm,
struct kvm_xen_hvm_attr *data)
{
u32 port = data->u.evtchn.send_port;
struct eventfd_ctx *eventfd = NULL;
struct evtchnfd *evtchnfd;
int ret = -EINVAL;
evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL);
if (!evtchnfd)
return -ENOMEM;
switch(data->u.evtchn.type) {
case EVTCHNSTAT_ipi:
/* IPI must map back to the same port# */
if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port)
goto out_noeventfd; /* -EINVAL */
break;
case EVTCHNSTAT_interdomain:
if (data->u.evtchn.deliver.port.port) {
if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm))
goto out_noeventfd; /* -EINVAL */
} else {
eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd);
if (IS_ERR(eventfd)) {
ret = PTR_ERR(eventfd);
goto out_noeventfd;
}
}
break;
case EVTCHNSTAT_virq:
case EVTCHNSTAT_closed:
case EVTCHNSTAT_unbound:
case EVTCHNSTAT_pirq:
default: /* Unknown event channel type */
goto out; /* -EINVAL */
}
evtchnfd->send_port = data->u.evtchn.send_port;
evtchnfd->type = data->u.evtchn.type;
if (eventfd) {
evtchnfd->deliver.eventfd.ctx = eventfd;
} else {
/* We only support 2 level event channels for now */
if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
goto out; /* -EINVAL; */
evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port;
evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
evtchnfd->deliver.port.vcpu_idx = -1;
evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
}
mutex_lock(&kvm->arch.xen.xen_lock);
ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1,
GFP_KERNEL);
mutex_unlock(&kvm->arch.xen.xen_lock);
if (ret >= 0)
return 0;
if (ret == -ENOSPC)
ret = -EEXIST;
out:
if (eventfd)
eventfd_ctx_put(eventfd);
out_noeventfd:
kfree(evtchnfd);
return ret;
}
static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
{
struct evtchnfd *evtchnfd;
mutex_lock(&kvm->arch.xen.xen_lock);
evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port);
mutex_unlock(&kvm->arch.xen.xen_lock);
if (!evtchnfd)
return -ENOENT;
synchronize_srcu(&kvm->srcu);
if (!evtchnfd->deliver.port.port)
eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
kfree(evtchnfd);
return 0;
}
static int kvm_xen_eventfd_reset(struct kvm *kvm)
{
struct evtchnfd *evtchnfd, **all_evtchnfds;
int i;
int n = 0;
mutex_lock(&kvm->arch.xen.xen_lock);
/*
* Because synchronize_srcu() cannot be called inside the
* critical section, first collect all the evtchnfd objects
* in an array as they are removed from evtchn_ports.
*/
idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i)
n++;
all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL);
if (!all_evtchnfds) {
mutex_unlock(&kvm->arch.xen.xen_lock);
return -ENOMEM;
}
n = 0;
idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
all_evtchnfds[n++] = evtchnfd;
idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port);
}
mutex_unlock(&kvm->arch.xen.xen_lock);
synchronize_srcu(&kvm->srcu);
while (n--) {
evtchnfd = all_evtchnfds[n];
if (!evtchnfd->deliver.port.port)
eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
kfree(evtchnfd);
}
kfree(all_evtchnfds);
return 0;
}
static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
{
u32 port = data->u.evtchn.send_port;
if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET)
return kvm_xen_eventfd_reset(kvm);
if (!port || port >= max_evtchn_port(kvm))
return -EINVAL;
if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN)
return kvm_xen_eventfd_deassign(kvm, port);
if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE)
return kvm_xen_eventfd_update(kvm, data);
if (data->u.evtchn.flags)
return -EINVAL;
return kvm_xen_eventfd_assign(kvm, data);
}
static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r)
{
struct evtchnfd *evtchnfd;
struct evtchn_send send;
struct x86_exception e;
/* Sanity check: this structure is the same for 32-bit and 64-bit */
BUILD_BUG_ON(sizeof(send) != 4);
if (kvm_read_guest_virt(vcpu, param, &send, sizeof(send), &e)) {
*r = -EFAULT;
return true;
}
/*
* evtchnfd is protected by kvm->srcu; the idr lookup instead
* is protected by RCU.
*/
rcu_read_lock();
evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port);
rcu_read_unlock();
if (!evtchnfd)
return false;
if (evtchnfd->deliver.port.port) {
int ret = kvm_xen_set_evtchn(&evtchnfd->deliver.port, vcpu->kvm);
if (ret < 0 && ret != -ENOTCONN)
return false;
} else {
eventfd_signal(evtchnfd->deliver.eventfd.ctx);
}
*r = 0;
return true;
}
void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
{
vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx;
vcpu->arch.xen.poll_evtchn = 0;
timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm);
kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm);
kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm);
kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm);
}
void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
{
if (kvm_xen_timer_enabled(vcpu))
kvm_xen_stop_timer(vcpu);
kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);
del_timer_sync(&vcpu->arch.xen.poll_timer);
}
void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *entry;
u32 function;
if (!vcpu->arch.xen.cpuid.base)
return;
function = vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3);
if (function > vcpu->arch.xen.cpuid.limit)
return;
entry = kvm_find_cpuid_entry_index(vcpu, function, 1);
if (entry) {
entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul;
entry->edx = vcpu->arch.hv_clock.tsc_shift;
}
entry = kvm_find_cpuid_entry_index(vcpu, function, 2);
if (entry)
entry->eax = vcpu->arch.hw_tsc_khz;
}
void kvm_xen_init_vm(struct kvm *kvm)
{
mutex_init(&kvm->arch.xen.xen_lock);
idr_init(&kvm->arch.xen.evtchn_ports);
kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm);
}
void kvm_xen_destroy_vm(struct kvm *kvm)
{
struct evtchnfd *evtchnfd;
int i;
kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
if (!evtchnfd->deliver.port.port)
eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
kfree(evtchnfd);
}
idr_destroy(&kvm->arch.xen.evtchn_ports);
if (kvm->arch.xen_hvm_config.msr)
static_branch_slow_dec_deferred(&kvm_xen_enabled);
}