ARM:
 - New page table code for both hypervisor and guest stage-2
 - Introduction of a new EL2-private host context
 - Allow EL2 to have its own private per-CPU variables
 - Support of PMU event filtering
 - Complete rework of the Spectre mitigation

PPC:
 - Fix for running nested guests with in-kernel IRQ chip
 - Fix race condition causing occasional host hard lockup
 - Minor cleanups and bugfixes

x86:
 - allow trapping unknown MSRs to userspace
 - allow userspace to force #GP on specific MSRs
 - INVPCID support on AMD
 - nested AMD cleanup, on demand allocation of nested SVM state
 - hide PV MSRs and hypercalls for features not enabled in CPUID
 - new test for MSR_IA32_TSC writes from host and guest
 - cleanups: MMU, CPUID, shared MSRs
 - LAPIC latency optimizations and bugfixes

For x86, also included in this pull request is a new alternative and (in
the future) more scalable implementation of extended page tables that
does not need a reverse map from guest physical addresses to host
physical addresses. For now it is disabled by default because it is
still lacking a few of the existing MMU's bells and whistles. However
it is a very solid piece of work and it is already available for people
to hammer on it.

-----BEGIN PGP SIGNATURE-----

iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAl+S8dsUHHBib256aW5p
QHJlZGhhdC5jb20ACgkQv/vSX3jHroM40Af+M46NJmuS5rcwFfybvK/c42KT6svX
Co1NrZDwzSQ2mMy3WQzH9qeLvb+nbY4sT3n5BPNPNsT+aIDPOTDt//qJ2/Ip9UUs
tRNea0MAR96JWLE7MSeeRxnTaQIrw/AAZC0RXFzZvxcgytXwdqBExugw4im+b+dn
Dcz8QxX1EkwT+4lTm5HC0hKZAuo4apnK1QkqCq4SdD2QVJ1YE6+z7pgj4wX7xitr
STKD6q/Yt/0ndwqS0GSGbyg0jy6mE620SN6isFRkJYwqfwLJci6KnqvEK67EcNMu
qeE017K+d93yIVC46/6TfVHzLR/D1FpQ8LZ16Yl6S13OuGIfAWBkQZtPRg==
=AD6a
-----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Paolo Bonzini:
 "For x86, there is a new alternative and (in the future) more scalable
  implementation of extended page tables that does not need a reverse
  map from guest physical addresses to host physical addresses. For now
  it is disabled by default because it is still lacking a few of the
  existing MMU's bells and whistles. However it is a very solid piece
  of work and it is already available for people to hammer on it.

  Other updates:

  ARM:
   - New page table code for both hypervisor and guest stage-2
   - Introduction of a new EL2-private host context
   - Allow EL2 to have its own private per-CPU variables
   - Support of PMU event filtering
   - Complete rework of the Spectre mitigation

  PPC:
   - Fix for running nested guests with in-kernel IRQ chip
   - Fix race condition causing occasional host hard lockup
   - Minor cleanups and bugfixes

  x86:
   - allow trapping unknown MSRs to userspace
   - allow userspace to force #GP on specific MSRs
   - INVPCID support on AMD
   - nested AMD cleanup, on demand allocation of nested SVM state
   - hide PV MSRs and hypercalls for features not enabled in CPUID
   - new test for MSR_IA32_TSC writes from host and guest
   - cleanups: MMU, CPUID, shared MSRs
   - LAPIC latency optimizations and bugfixes"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (232 commits)
  kvm: x86/mmu: NX largepage recovery for TDP MMU
  kvm: x86/mmu: Don't clear write flooding count for direct roots
  kvm: x86/mmu: Support MMIO in the TDP MMU
  kvm: x86/mmu: Support write protection for nesting in tdp MMU
  kvm: x86/mmu: Support disabling dirty logging for the tdp MMU
  kvm: x86/mmu: Support dirty logging for the TDP MMU
  kvm: x86/mmu: Support changed pte notifier in tdp MMU
  kvm: x86/mmu: Add access tracking for tdp_mmu
  kvm: x86/mmu: Support invalidate range MMU notifier for TDP MMU
  kvm: x86/mmu: Allocate struct kvm_mmu_pages for all pages in TDP MMU
  kvm: x86/mmu: Add TDP MMU PF handler
  kvm: x86/mmu: Remove disallowed_hugepage_adjust shadow_walk_iterator arg
  kvm: x86/mmu: Support zapping SPTEs in the TDP MMU
  KVM: Cache as_id in kvm_memory_slot
  kvm: x86/mmu: Add functions to handle changed TDP SPTEs
  kvm: x86/mmu: Allocate and free TDP MMU roots
  kvm: x86/mmu: Init / Uninit the TDP MMU
  kvm: x86/mmu: Introduce tdp_iter
  KVM: mmu: extract spte.h and spte.c
  KVM: mmu: Separate updating a PTE from kvm_set_pte_rmapp
  ...
commit f9a705ad1c
@@ -4498,11 +4498,14 @@ Currently, the following list of CPUID leaves are returned:
- HYPERV_CPUID_ENLIGHTMENT_INFO
- HYPERV_CPUID_IMPLEMENT_LIMITS
- HYPERV_CPUID_NESTED_FEATURES
- HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS
- HYPERV_CPUID_SYNDBG_INTERFACE
- HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES

HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened VMCS was
enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS).

Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure
Userspace invokes KVM_GET_SUPPORTED_HV_CPUID by passing a kvm_cpuid2 structure
with the 'nent' field indicating the number of entries in the variable-size
array 'entries'. If the number of entries is too low to describe all Hyper-V
feature leaves, an error (E2BIG) is returned. If the number is more or equal
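As an illustration of the sizing contract described above, here is a minimal userspace sketch (not part of this commit) that queries the Hyper-V CPUID leaves and grows the buffer whenever the kernel returns E2BIG. The vcpu_fd argument and the starting entry count are assumptions for illustration::

  #include <linux/kvm.h>
  #include <stdlib.h>
  #include <errno.h>
  #include <sys/ioctl.h>

  /* Query Hyper-V CPUID leaves from an existing vCPU fd (hypothetical helper). */
  static struct kvm_cpuid2 *get_hv_cpuid(int vcpu_fd)
  {
          int nent = 8;   /* arbitrary starting size */

          for (;;) {
                  struct kvm_cpuid2 *cpuid;

                  cpuid = calloc(1, sizeof(*cpuid) +
                                    nent * sizeof(struct kvm_cpuid_entry2));
                  if (!cpuid)
                          return NULL;
                  cpuid->nent = nent;

                  if (!ioctl(vcpu_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid))
                          return cpuid;   /* success: entries filled in */

                  free(cpuid);
                  if (errno != E2BIG)
                          return NULL;    /* real error */
                  nent *= 2;              /* buffer too small, retry bigger */
          }
  }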
@@ -4704,6 +4707,106 @@ KVM_PV_VM_VERIFY
Verify the integrity of the unpacked image. Only if this succeeds,
KVM is allowed to start protected VCPUs.

4.126 KVM_X86_SET_MSR_FILTER
----------------------------

:Capability: KVM_X86_SET_MSR_FILTER
:Architectures: x86
:Type: vm ioctl
:Parameters: struct kvm_msr_filter
:Returns: 0 on success, < 0 on error

::

struct kvm_msr_filter_range {
#define KVM_MSR_FILTER_READ (1 << 0)
#define KVM_MSR_FILTER_WRITE (1 << 1)
__u32 flags;
__u32 nmsrs; /* number of msrs in bitmap */
__u32 base; /* MSR index the bitmap starts at */
__u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */
};

#define KVM_MSR_FILTER_MAX_RANGES 16
struct kvm_msr_filter {
#define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0)
#define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0)
__u32 flags;
struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES];
};

flags values for ``struct kvm_msr_filter_range``:

``KVM_MSR_FILTER_READ``

Filter read accesses to MSRs using the given bitmap. A 0 in the bitmap
indicates that a read should immediately fail, while a 1 indicates that
a read for a particular MSR should be handled regardless of the default
filter action.

``KVM_MSR_FILTER_WRITE``

Filter write accesses to MSRs using the given bitmap. A 0 in the bitmap
indicates that a write should immediately fail, while a 1 indicates that
a write for a particular MSR should be handled regardless of the default
filter action.

``KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE``

Filter both read and write accesses to MSRs using the given bitmap. A 0
in the bitmap indicates that both reads and writes should immediately fail,
while a 1 indicates that reads and writes for a particular MSR are not
filtered by this range.

flags values for ``struct kvm_msr_filter``:

``KVM_MSR_FILTER_DEFAULT_ALLOW``

If no filter range matches an MSR index that is getting accessed, KVM will
fall back to allowing access to the MSR.

``KVM_MSR_FILTER_DEFAULT_DENY``

If no filter range matches an MSR index that is getting accessed, KVM will
fall back to rejecting access to the MSR. In this mode, all MSRs that should
be processed by KVM need to explicitly be marked as allowed in the bitmaps.

This ioctl allows user space to define up to 16 bitmaps of MSR ranges to
specify whether a certain MSR access should be explicitly filtered for or not.

If this ioctl has never been invoked, MSR accesses are not guarded and the
default KVM in-kernel emulation behavior is fully preserved.

Calling this ioctl with an empty set of ranges (all nmsrs == 0) disables MSR
filtering. In that mode, ``KVM_MSR_FILTER_DEFAULT_DENY`` is invalid and causes
an error.

As soon as the filtering is in place, every MSR access is processed through
the filtering except for accesses to the x2APIC MSRs (from 0x800 to 0x8ff);
x2APIC MSRs are always allowed, independent of the ``default_allow`` setting,
and their behavior depends on the ``X2APIC_ENABLE`` bit of the APIC base
register.

If a bit is within one of the defined ranges, read and write accesses are
guarded by the bitmap's value for the MSR index if the kind of access
is included in the ``struct kvm_msr_filter_range`` flags. If no range
cover this particular access, the behavior is determined by the flags
field in the kvm_msr_filter struct: ``KVM_MSR_FILTER_DEFAULT_ALLOW``
and ``KVM_MSR_FILTER_DEFAULT_DENY``.

Each bitmap range specifies a range of MSRs to potentially allow access on.
The range goes from MSR index [base .. base+nmsrs]. The flags field
indicates whether reads, writes or both reads and writes are filtered
by setting a 1 bit in the bitmap for the corresponding MSR index.

If an MSR access is not permitted through the filtering, it generates a
#GP inside the guest. When combined with KVM_CAP_X86_USER_SPACE_MSR, that
allows user space to deflect and potentially handle various MSR accesses
into user space.

If a vCPU is in running state while this ioctl is invoked, the vCPU may
experience inconsistent filtering behavior on MSR accesses.

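As a usage sketch for the structures documented above (not part of this commit), the snippet below denies guest reads and writes of 16 MSRs starting at an illustrative base index, while keeping the default-allow policy for everything else; the vm_fd parameter and the chosen base index are assumptions::

  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  /*
   * Deny reads and writes of 16 MSRs starting at a hypothetical base index,
   * leaving all other MSRs on the default in-kernel handling.
   * 'vm_fd' is assumed to be an open VM file descriptor.
   */
  static int install_msr_filter(int vm_fd)
  {
          __u8 bitmap[2] = { 0 };          /* 16 MSRs, all bits 0 => deny */
          struct kvm_msr_filter filter = {
                  .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
          };

          filter.ranges[0] = (struct kvm_msr_filter_range) {
                  .flags  = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE,
                  .base   = 0x1f0,         /* illustrative MSR index */
                  .nmsrs  = 16,
                  .bitmap = bitmap,
          };

          return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
  }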
5. The kvm_run structure
========================
@@ -4869,14 +4972,13 @@ to the byte array.

.. note::

For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR and
KVM_EXIT_EPR the corresponding
operations are complete (and guest state is consistent) only after userspace
has re-entered the kernel with KVM_RUN. The kernel side will first finish
incomplete operations and then check for pending signals. Userspace
can re-enter the guest with an unmasked signal pending to complete
pending operations.
For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR,
KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
operations are complete (and guest state is consistent) only after userspace
has re-entered the kernel with KVM_RUN. The kernel side will first finish
incomplete operations and then check for pending signals. Userspace
can re-enter the guest with an unmasked signal pending to complete
pending operations.

::

@@ -5163,6 +5265,44 @@ Note that KVM does not skip the faulting instruction as it does for
KVM_EXIT_MMIO, but userspace has to emulate any change to the processing state
if it decides to decode and emulate the instruction.

::

/* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */
struct {
__u8 error; /* user -> kernel */
__u8 pad[7];
__u32 reason; /* kernel -> user */
__u32 index; /* kernel -> user */
__u64 data; /* kernel <-> user */
} msr;

Used on x86 systems. When the VM capability KVM_CAP_X86_USER_SPACE_MSR is
enabled, MSR accesses to registers that would invoke a #GP by KVM kernel code
will instead trigger a KVM_EXIT_X86_RDMSR exit for reads and KVM_EXIT_X86_WRMSR
exit for writes.

The "reason" field specifies why the MSR trap occurred. User space will only
receive MSR exit traps when a particular reason was requested during through
ENABLE_CAP. Currently valid exit reasons are:

KVM_MSR_EXIT_REASON_UNKNOWN - access to MSR that is unknown to KVM
KVM_MSR_EXIT_REASON_INVAL - access to invalid MSRs or reserved bits
KVM_MSR_EXIT_REASON_FILTER - access blocked by KVM_X86_SET_MSR_FILTER

For KVM_EXIT_X86_RDMSR, the "index" field tells user space which MSR the guest
wants to read. To respond to this request with a successful read, user space
writes the respective data into the "data" field and must continue guest
execution to ensure the read data is transferred into guest register state.

If the RDMSR request was unsuccessful, user space indicates that with a "1" in
the "error" field. This will inject a #GP into the guest when the VCPU is
executed again.

For KVM_EXIT_X86_WRMSR, the "index" field tells user space which MSR the guest
wants to write. Once finished processing the event, user space must continue
vCPU execution. If the MSR write was unsuccessful, user space also sets the
"error" field to "1".

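A hedged sketch of how a VMM run loop might service these exits, based only on the fields documented above; emulate_rdmsr() and emulate_wrmsr() are hypothetical helpers, not part of KVM::

  #include <linux/kvm.h>

  /* Hypothetical helpers provided by the VMM; return 0 on success. */
  int emulate_rdmsr(__u32 index, __u64 *data);
  int emulate_wrmsr(__u32 index, __u64 data);

  /* Handle one MSR exit; 'run' is the mmap'ed struct kvm_run of the vCPU. */
  static void handle_msr_exit(struct kvm_run *run)
  {
          switch (run->exit_reason) {
          case KVM_EXIT_X86_RDMSR:
                  /* error = 1 makes KVM inject a #GP on the next KVM_RUN. */
                  run->msr.error = emulate_rdmsr(run->msr.index,
                                                 &run->msr.data) ? 1 : 0;
                  break;
          case KVM_EXIT_X86_WRMSR:
                  run->msr.error = emulate_wrmsr(run->msr.index,
                                                 run->msr.data) ? 1 : 0;
                  break;
          }
          /* The vCPU must be re-entered with KVM_RUN to complete the access. */
  }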
::

/* Fix the size of the union. */
@@ -5852,6 +5992,28 @@ controlled by the kvm module parameter halt_poll_ns. This capability allows
the maximum halt time to specified on a per-VM basis, effectively overriding
the module parameter for the target VM.

7.21 KVM_CAP_X86_USER_SPACE_MSR
-------------------------------

:Architectures: x86
:Target: VM
:Parameters: args[0] contains the mask of KVM_MSR_EXIT_REASON_* events to report
:Returns: 0 on success; -1 on error

This capability enables trapping of #GP invoking RDMSR and WRMSR instructions
into user space.

When a guest requests to read or write an MSR, KVM may not implement all MSRs
that are relevant to a respective system. It also does not differentiate by
CPU type.

To allow more fine grained control over MSR handling, user space may enable
this capability. With it enabled, MSR accesses that match the mask specified in
args[0] and trigger a #GP event inside the guest by KVM will instead trigger
KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications which user space
can then handle to implement model specific MSR handling and/or user notifications
to inform a user that an MSR was not handled.

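For illustration, the capability can be turned on with KVM_ENABLE_CAP on the VM file descriptor; the reason mask used below (unknown plus filtered MSRs) is just one plausible choice, and vm_fd is an assumption::

  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  /* Ask KVM to bounce unknown and filtered MSR accesses to user space. */
  static int enable_user_space_msr(int vm_fd)
  {
          struct kvm_enable_cap cap = {
                  .cap     = KVM_CAP_X86_USER_SPACE_MSR,
                  .args[0] = KVM_MSR_EXIT_REASON_UNKNOWN |
                             KVM_MSR_EXIT_REASON_FILTER,
          };

          return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }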
8. Other capabilities.
======================

@@ -6193,3 +6355,39 @@ distribution...)

If this capability is available, then the CPNC and CPVC can be synchronized
between KVM and userspace via the sync regs mechanism (KVM_SYNC_DIAG318).

8.26 KVM_CAP_X86_USER_SPACE_MSR
-------------------------------

:Architectures: x86

This capability indicates that KVM supports deflection of MSR reads and
writes to user space. It can be enabled on a VM level. If enabled, MSR
accesses that would usually trigger a #GP by KVM into the guest will
instead get bounced to user space through the KVM_EXIT_X86_RDMSR and
KVM_EXIT_X86_WRMSR exit notifications.

8.25 KVM_X86_SET_MSR_FILTER
---------------------------

:Architectures: x86

This capability indicates that KVM supports that accesses to user defined MSRs
may be rejected. With this capability exposed, KVM exports new VM ioctl
KVM_X86_SET_MSR_FILTER which user space can call to specify bitmaps of MSR
ranges that KVM should reject access to.

In combination with KVM_CAP_X86_USER_SPACE_MSR, this allows user space to
trap and emulate MSRs that are outside of the scope of KVM as well as
limit the attack surface on KVM's MSR emulation code.


8.26 KVM_CAP_ENFORCE_PV_CPUID
-----------------------------

Architectures: x86

When enabled, KVM will disable paravirtual features provided to the
guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf
(0x40000001). Otherwise, a guest may use the paravirtual features
regardless of what has actually been exposed through the CPUID leaf.

@@ -38,64 +38,64 @@ returns::

where ``flag`` is defined as below:

================================= =========== ================================
flag value meaning
================================= =========== ================================
KVM_FEATURE_CLOCKSOURCE 0 kvmclock available at msrs
0x11 and 0x12
================================== =========== ================================
flag value meaning
================================== =========== ================================
KVM_FEATURE_CLOCKSOURCE 0 kvmclock available at msrs
0x11 and 0x12

KVM_FEATURE_NOP_IO_DELAY 1 not necessary to perform delays
on PIO operations
KVM_FEATURE_NOP_IO_DELAY 1 not necessary to perform delays
on PIO operations

KVM_FEATURE_MMU_OP 2 deprecated
KVM_FEATURE_MMU_OP 2 deprecated

KVM_FEATURE_CLOCKSOURCE2 3 kvmclock available at msrs
0x4b564d00 and 0x4b564d01
KVM_FEATURE_CLOCKSOURCE2 3 kvmclock available at msrs
0x4b564d00 and 0x4b564d01

KVM_FEATURE_ASYNC_PF 4 async pf can be enabled by
writing to msr 0x4b564d02
KVM_FEATURE_ASYNC_PF 4 async pf can be enabled by
writing to msr 0x4b564d02

KVM_FEATURE_STEAL_TIME 5 steal time can be enabled by
writing to msr 0x4b564d03
KVM_FEATURE_STEAL_TIME 5 steal time can be enabled by
writing to msr 0x4b564d03

KVM_FEATURE_PV_EOI 6 paravirtualized end of interrupt
handler can be enabled by
writing to msr 0x4b564d04
KVM_FEATURE_PV_EOI 6 paravirtualized end of interrupt
handler can be enabled by
writing to msr 0x4b564d04

KVM_FEATURE_PV_UNHAULT 7 guest checks this feature bit
before enabling paravirtualized
spinlock support
KVM_FEATURE_PV_UNHALT 7 guest checks this feature bit
before enabling paravirtualized
spinlock support

KVM_FEATURE_PV_TLB_FLUSH 9 guest checks this feature bit
before enabling paravirtualized
tlb flush
KVM_FEATURE_PV_TLB_FLUSH 9 guest checks this feature bit
before enabling paravirtualized
tlb flush

KVM_FEATURE_ASYNC_PF_VMEXIT 10 paravirtualized async PF VM EXIT
can be enabled by setting bit 2
when writing to msr 0x4b564d02
KVM_FEATURE_ASYNC_PF_VMEXIT 10 paravirtualized async PF VM EXIT
can be enabled by setting bit 2
when writing to msr 0x4b564d02

KVM_FEATURE_PV_SEND_IPI 11 guest checks this feature bit
before enabling paravirtualized
sebd IPIs
KVM_FEATURE_PV_SEND_IPI 11 guest checks this feature bit
before enabling paravirtualized
send IPIs

KVM_FEATURE_POLL_CONTROL 12 host-side polling on HLT can
be disabled by writing
to msr 0x4b564d05.
KVM_FEATURE_POLL_CONTROL 12 host-side polling on HLT can
be disabled by writing
to msr 0x4b564d05.

KVM_FEATURE_PV_SCHED_YIELD 13 guest checks this feature bit
before using paravirtualized
sched yield.
KVM_FEATURE_PV_SCHED_YIELD 13 guest checks this feature bit
before using paravirtualized
sched yield.

KVM_FEATURE_ASYNC_PF_INT 14 guest checks this feature bit
before using the second async
pf control msr 0x4b564d06 and
async pf acknowledgment msr
0x4b564d07.
KVM_FEATURE_ASYNC_PF_INT 14 guest checks this feature bit
before using the second async
pf control msr 0x4b564d06 and
async pf acknowledgment msr
0x4b564d07.

KVM_FEATURE_CLOCSOURCE_STABLE_BIT 24 host will warn if no guest-side
per-cpu warps are expeced in
kvmclock
================================= =========== ================================
KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 host will warn if no guest-side
per-cpu warps are expected in
kvmclock
================================== =========== ================================

::

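Since the table describes feature bits reported in CPUID leaf 0x40000001 (KVM_CPUID_FEATURES), a guest could test for one of them roughly as sketched below. This is an illustrative snippet, not part of the commit; it assumes the KVM CPUID base sits at 0x40000000, and a robust guest would first verify the "KVMKVMKVM" signature at that leaf::

  #include <cpuid.h>
  #include <stdbool.h>

  /* Check a KVM paravirtual feature bit from inside an x86 guest. */
  static bool kvm_feature_present(unsigned int feature_bit)
  {
          unsigned int eax, ebx, ecx, edx;

          /* KVM_CPUID_FEATURES lives at base + 1; base is assumed unmoved. */
          __cpuid(0x40000001, eax, ebx, ecx, edx);

          return eax & (1u << feature_bit);
  }

  /* Example: kvm_feature_present(3) tests KVM_FEATURE_CLOCKSOURCE2. */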
@@ -25,8 +25,10 @@ Returns:

======= ========================================================
-EBUSY The PMU overflow interrupt is already set
-ENXIO The overflow interrupt not set when attempting to get it
-ENODEV PMUv3 not supported
-EFAULT Error reading interrupt number
-ENXIO PMUv3 not supported or the overflow interrupt not set
when attempting to get it
-ENODEV KVM_ARM_VCPU_PMU_V3 feature missing from VCPU
-EINVAL Invalid PMU overflow interrupt number supplied or
trying to set the IRQ number without using an in-kernel
irqchip.
@@ -45,9 +47,10 @@ all vcpus, while as an SPI it must be a separate number per vcpu.
Returns:

======= ======================================================
-EEXIST Interrupt number already used
-ENODEV PMUv3 not supported or GIC not initialized
-ENXIO PMUv3 not properly configured or in-kernel irqchip not
configured as required prior to calling this attribute
-ENXIO PMUv3 not supported, missing VCPU feature or interrupt
number not set
-EBUSY PMUv3 already initialized
======= ======================================================

@@ -55,6 +58,52 @@ Request the initialization of the PMUv3. If using the PMUv3 with an in-kernel
virtual GIC implementation, this must be done after initializing the in-kernel
irqchip.

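To tie the two attributes above together, a VMM might configure the PMU overflow interrupt and then initialise the PMU roughly as follows; this is a sketch, with the vcpu_fd parameter and the PPI number 23 chosen arbitrarily::

  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  /* Configure and initialise the vPMU on one vCPU (illustrative). */
  static int init_vcpu_pmu(int vcpu_fd)
  {
          int irq = 23;   /* arbitrary PPI number for the overflow interrupt */
          struct kvm_device_attr attr = {
                  .group = KVM_ARM_VCPU_PMU_V3_CTRL,
                  .attr  = KVM_ARM_VCPU_PMU_V3_IRQ,
                  .addr  = (__u64)(unsigned long)&irq,
          };

          if (ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr))
                  return -1;

          /* Must be done after the in-kernel irqchip is initialised. */
          attr.attr = KVM_ARM_VCPU_PMU_V3_INIT;
          attr.addr = 0;
          return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);
  }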
1.3 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_FILTER
-----------------------------------------

:Parameters: in kvm_device_attr.addr the address for a PMU event filter is a
pointer to a struct kvm_pmu_event_filter

:Returns:

======= ======================================================
-ENODEV PMUv3 not supported or GIC not initialized
-ENXIO PMUv3 not properly configured or in-kernel irqchip not
configured as required prior to calling this attribute
-EBUSY PMUv3 already initialized
-EINVAL Invalid filter range
======= ======================================================

Request the installation of a PMU event filter described as follows::

struct kvm_pmu_event_filter {
__u16 base_event;
__u16 nevents;

#define KVM_PMU_EVENT_ALLOW 0
#define KVM_PMU_EVENT_DENY 1

__u8 action;
__u8 pad[3];
};

A filter range is defined as the range [@base_event, @base_event + @nevents),
together with an @action (KVM_PMU_EVENT_ALLOW or KVM_PMU_EVENT_DENY). The
first registered range defines the global policy (global ALLOW if the first
@action is DENY, global DENY if the first @action is ALLOW). Multiple ranges
can be programmed, and must fit within the event space defined by the PMU
architecture (10 bits on ARMv8.0, 16 bits from ARMv8.1 onwards).

Note: "Cancelling" a filter by registering the opposite action for the same
range doesn't change the default action. For example, installing an ALLOW
filter for event range [0:10) as the first filter and then applying a DENY
action for the same range will leave the whole range as disabled.

Restrictions: Event 0 (SW_INCR) is never filtered, as it doesn't count a
hardware event. Filtering event 0x1E (CHAIN) has no effect either, as it
isn't strictly speaking an event. Filtering the cycle counter is possible
using event 0x11 (CPU_CYCLES).

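As a usage sketch for the new attribute (not part of this commit), the snippet below installs a single ALLOW range, which per the text above makes the global policy deny-by-default; the vcpu_fd parameter and the chosen event range are assumptions::

  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  /* Allow only events [0x8, 0x8 + 4) and implicitly deny everything else. */
  static int install_pmu_filter(int vcpu_fd)
  {
          struct kvm_pmu_event_filter filter = {
                  .base_event = 0x8,      /* illustrative event number */
                  .nevents    = 4,
                  .action     = KVM_PMU_EVENT_ALLOW,
          };
          struct kvm_device_attr attr = {
                  .group = KVM_ARM_VCPU_PMU_V3_CTRL,
                  .attr  = KVM_ARM_VCPU_PMU_V3_FILTER,
                  .addr  = (__u64)(unsigned long)&filter,
          };

          return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);
  }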
2. GROUP: KVM_ARM_VCPU_TIMER_CTRL
=================================

@@ -218,6 +218,23 @@ lr .req x30 // link register
str \src, [\tmp, :lo12:\sym]
.endm

/*
* @dst: destination register
*/
#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__)
.macro this_cpu_offset, dst
mrs \dst, tpidr_el2
.endm
#else
.macro this_cpu_offset, dst
alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
mrs \dst, tpidr_el1
alternative_else
mrs \dst, tpidr_el2
alternative_endif
.endm
#endif

/*
* @dst: Result of per_cpu(sym, smp_processor_id()) (can be SP)
* @sym: The name of the per-cpu variable
@@ -226,11 +243,7 @@ lr .req x30 // link register
.macro adr_this_cpu, dst, sym, tmp
adrp \tmp, \sym
add \dst, \tmp, #:lo12:\sym
alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
mrs \tmp, tpidr_el1
alternative_else
mrs \tmp, tpidr_el2
alternative_endif
this_cpu_offset \tmp
add \dst, \dst, \tmp
.endm

@@ -241,11 +254,7 @@ alternative_endif
*/
.macro ldr_this_cpu dst, sym, tmp
adr_l \dst, \sym
alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
mrs \tmp, tpidr_el1
alternative_else
mrs \tmp, tpidr_el2
alternative_endif
this_cpu_offset \tmp
ldr \dst, [\dst, \tmp]
.endm

arch/arm64/include/asm/hyp_image.h (new file, 36 lines)
@@ -0,0 +1,36 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2020 Google LLC.
* Written by David Brazdil <dbrazdil@google.com>
*/

#ifndef __ARM64_HYP_IMAGE_H__
#define __ARM64_HYP_IMAGE_H__

/*
* KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_,
* to separate it from the kernel proper.
*/
#define kvm_nvhe_sym(sym) __kvm_nvhe_##sym

#ifdef LINKER_SCRIPT

/*
* KVM nVHE ELF section names are prefixed with .hyp, to separate them
* from the kernel proper.
*/
#define HYP_SECTION_NAME(NAME) .hyp##NAME

/* Defines an ELF hyp section from input section @NAME and its subsections. */
#define HYP_SECTION(NAME) \
HYP_SECTION_NAME(NAME) : { *(NAME NAME##.*) }

/*
* Defines a linker script alias of a kernel-proper symbol referenced by
* KVM nVHE hyp code.
*/
#define KVM_NVHE_ALIAS(sym) kvm_nvhe_sym(sym) = sym;

#endif /* LINKER_SCRIPT */

#endif /* __ARM64_HYP_IMAGE_H__ */
@@ -7,6 +7,7 @@
#ifndef __ARM_KVM_ASM_H__
#define __ARM_KVM_ASM_H__

#include <asm/hyp_image.h>
#include <asm/virt.h>

#define ARM_EXIT_WITH_SERROR_BIT 31
@@ -35,17 +36,34 @@

#define __SMCCC_WORKAROUND_1_SMC_SZ 36

#define KVM_HOST_SMCCC_ID(id) \
ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
ARM_SMCCC_SMC_64, \
ARM_SMCCC_OWNER_VENDOR_HYP, \
(id))

#define KVM_HOST_SMCCC_FUNC(name) KVM_HOST_SMCCC_ID(__KVM_HOST_SMCCC_FUNC_##name)

#define __KVM_HOST_SMCCC_FUNC___kvm_hyp_init 0
#define __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run 1
#define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2
#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3
#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4
#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_vmid 5
#define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6
#define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_ich_vtr_el2 8
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr 9
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr 10
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs 11
#define __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2 12
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14

#ifndef __ASSEMBLY__

#include <linux/mm.h>

/*
* Translate name of a symbol defined in nVHE hyp to the name seen
* by kernel proper. All nVHE symbols are prefixed by the build system
* to avoid clashes with the VHE variants.
*/
#define kvm_nvhe_sym(sym) __kvm_nvhe_##sym

#define DECLARE_KVM_VHE_SYM(sym) extern char sym[]
#define DECLARE_KVM_NVHE_SYM(sym) extern char kvm_nvhe_sym(sym)[]

@@ -57,10 +75,53 @@
DECLARE_KVM_VHE_SYM(sym); \
DECLARE_KVM_NVHE_SYM(sym)

#define CHOOSE_VHE_SYM(sym) sym
#define CHOOSE_NVHE_SYM(sym) kvm_nvhe_sym(sym)
#define DECLARE_KVM_VHE_PER_CPU(type, sym) \
DECLARE_PER_CPU(type, sym)
#define DECLARE_KVM_NVHE_PER_CPU(type, sym) \
DECLARE_PER_CPU(type, kvm_nvhe_sym(sym))

#define DECLARE_KVM_HYP_PER_CPU(type, sym) \
DECLARE_KVM_VHE_PER_CPU(type, sym); \
DECLARE_KVM_NVHE_PER_CPU(type, sym)

/*
* Compute pointer to a symbol defined in nVHE percpu region.
* Returns NULL if percpu memory has not been allocated yet.
*/
#define this_cpu_ptr_nvhe_sym(sym) per_cpu_ptr_nvhe_sym(sym, smp_processor_id())
#define per_cpu_ptr_nvhe_sym(sym, cpu) \
({ \
unsigned long base, off; \
base = kvm_arm_hyp_percpu_base[cpu]; \
off = (unsigned long)&CHOOSE_NVHE_SYM(sym) - \
(unsigned long)&CHOOSE_NVHE_SYM(__per_cpu_start); \
base ? (typeof(CHOOSE_NVHE_SYM(sym))*)(base + off) : NULL; \
})

#if defined(__KVM_NVHE_HYPERVISOR__)

#define CHOOSE_NVHE_SYM(sym) sym
#define CHOOSE_HYP_SYM(sym) CHOOSE_NVHE_SYM(sym)

/* The nVHE hypervisor shouldn't even try to access VHE symbols */
extern void *__nvhe_undefined_symbol;
#define CHOOSE_VHE_SYM(sym) __nvhe_undefined_symbol
#define this_cpu_ptr_hyp_sym(sym) (&__nvhe_undefined_symbol)
#define per_cpu_ptr_hyp_sym(sym, cpu) (&__nvhe_undefined_symbol)

#elif defined(__KVM_VHE_HYPERVISOR__)

#define CHOOSE_VHE_SYM(sym) sym
#define CHOOSE_HYP_SYM(sym) CHOOSE_VHE_SYM(sym)

/* The VHE hypervisor shouldn't even try to access nVHE symbols */
extern void *__vhe_undefined_symbol;
#define CHOOSE_NVHE_SYM(sym) __vhe_undefined_symbol
#define this_cpu_ptr_hyp_sym(sym) (&__vhe_undefined_symbol)
#define per_cpu_ptr_hyp_sym(sym, cpu) (&__vhe_undefined_symbol)

#else

#ifndef __KVM_NVHE_HYPERVISOR__
/*
* BIG FAT WARNINGS:
*
@@ -72,12 +133,21 @@
* - Don't let the nVHE hypervisor have access to this, as it will
* pick the *wrong* symbol (yes, it runs at EL2...).
*/
#define CHOOSE_HYP_SYM(sym) (is_kernel_in_hyp_mode() ? CHOOSE_VHE_SYM(sym) \
#define CHOOSE_HYP_SYM(sym) (is_kernel_in_hyp_mode() \
? CHOOSE_VHE_SYM(sym) \
: CHOOSE_NVHE_SYM(sym))
#else
/* The nVHE hypervisor shouldn't even try to access anything */
extern void *__nvhe_undefined_symbol;
#define CHOOSE_HYP_SYM(sym) __nvhe_undefined_symbol

#define this_cpu_ptr_hyp_sym(sym) (is_kernel_in_hyp_mode() \
? this_cpu_ptr(&sym) \
: this_cpu_ptr_nvhe_sym(sym))

#define per_cpu_ptr_hyp_sym(sym, cpu) (is_kernel_in_hyp_mode() \
? per_cpu_ptr(&sym, cpu) \
: per_cpu_ptr_nvhe_sym(sym, cpu))

#define CHOOSE_VHE_SYM(sym) sym
#define CHOOSE_NVHE_SYM(sym) kvm_nvhe_sym(sym)

#endif

/* Translate a kernel address @ptr into its equivalent linear mapping */
@@ -95,10 +165,16 @@ struct kvm_vcpu;
struct kvm_s2_mmu;

DECLARE_KVM_NVHE_SYM(__kvm_hyp_init);
DECLARE_KVM_NVHE_SYM(__kvm_hyp_host_vector);
DECLARE_KVM_HYP_SYM(__kvm_hyp_vector);
#define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init)
#define __kvm_hyp_host_vector CHOOSE_NVHE_SYM(__kvm_hyp_host_vector)
#define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector)

extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
DECLARE_KVM_NVHE_SYM(__per_cpu_start);
DECLARE_KVM_NVHE_SYM(__per_cpu_end);

extern atomic_t arm64_el2_vector_last_slot;
DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs);
#define __bp_harden_hyp_vecs CHOOSE_HYP_SYM(__bp_harden_hyp_vecs)
@@ -144,26 +220,6 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
addr; \
})

/*
* Home-grown __this_cpu_{ptr,read} variants that always work at HYP,
* provided that sym is really a *symbol* and not a pointer obtained from
* a data structure. As for SHIFT_PERCPU_PTR(), the creative casting keeps
* sparse quiet.
*/
#define __hyp_this_cpu_ptr(sym) \
({ \
void *__ptr; \
__verify_pcpu_ptr(&sym); \
__ptr = hyp_symbol_addr(sym); \
__ptr += read_sysreg(tpidr_el2); \
(typeof(sym) __kernel __force *)__ptr; \
})

#define __hyp_this_cpu_read(sym) \
({ \
*__hyp_this_cpu_ptr(sym); \
})

#define __KVM_EXTABLE(from, to) \
" .pushsection __kvm_ex_table, \"a\"\n" \
" .align 3\n" \
@@ -194,20 +250,8 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];

#else /* __ASSEMBLY__ */

.macro hyp_adr_this_cpu reg, sym, tmp
adr_l \reg, \sym
mrs \tmp, tpidr_el2
add \reg, \reg, \tmp
.endm

.macro hyp_ldr_this_cpu reg, sym, tmp
adr_l \reg, \sym
mrs \tmp, tpidr_el2
ldr \reg, [\reg, \tmp]
.endm

.macro get_host_ctxt reg, tmp
hyp_adr_this_cpu \reg, kvm_host_data, \tmp
adr_this_cpu \reg, kvm_host_data, \tmp
add \reg, \reg, #HOST_DATA_CONTEXT
.endm

@@ -216,6 +260,16 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
ldr \vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
.endm

.macro get_loaded_vcpu vcpu, ctxt
adr_this_cpu \ctxt, kvm_hyp_ctxt, \vcpu
ldr \vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
.endm

.macro set_loaded_vcpu vcpu, ctxt, tmp
adr_this_cpu \ctxt, kvm_hyp_ctxt, \tmp
str \vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
.endm

/*
* KVM extable for unexpected exceptions.
* In the same format _asm_extable, but output to a different section so that
@@ -231,6 +285,45 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
.popsection
.endm

#define CPU_XREG_OFFSET(x) (CPU_USER_PT_REGS + 8*x)
#define CPU_LR_OFFSET CPU_XREG_OFFSET(30)
#define CPU_SP_EL0_OFFSET (CPU_LR_OFFSET + 8)

/*
* We treat x18 as callee-saved as the host may use it as a platform
* register (e.g. for shadow call stack).
*/
.macro save_callee_saved_regs ctxt
str x18, [\ctxt, #CPU_XREG_OFFSET(18)]
stp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
stp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
stp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
stp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
stp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
stp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
.endm

.macro restore_callee_saved_regs ctxt
// We require \ctxt is not x18-x28
ldr x18, [\ctxt, #CPU_XREG_OFFSET(18)]
ldp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
ldp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
ldp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
ldp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
ldp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
.endm

.macro save_sp_el0 ctxt, tmp
mrs \tmp, sp_el0
str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
.endm

.macro restore_sp_el0 ctxt, tmp
ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
msr sp_el0, \tmp
.endm

#endif

#endif /* __ARM_KVM_ASM_H__ */
@@ -11,6 +11,7 @@
#ifndef __ARM64_KVM_HOST_H__
#define __ARM64_KVM_HOST_H__

#include <linux/arm-smccc.h>
#include <linux/bitmap.h>
#include <linux/types.h>
#include <linux/jump_label.h>
@@ -79,8 +80,8 @@ struct kvm_s2_mmu {
* for vEL1/EL0 with vHCR_EL2.VM == 0. In that case, we use the
* canonical stage-2 page tables.
*/
pgd_t *pgd;
phys_addr_t pgd_phys;
struct kvm_pgtable *pgt;

/* The last vcpu id that ran on each physical CPU */
int __percpu *last_vcpu_ran;
@@ -110,6 +111,13 @@ struct kvm_arch {
* supported.
*/
bool return_nisv_io_abort_to_user;

/*
* VM-wide PMU filter, implemented as a bitmap and big enough for
* up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+).
*/
unsigned long *pmu_filter;
unsigned int pmuver;
};

struct kvm_vcpu_fault_info {
@@ -262,8 +270,6 @@ struct kvm_host_data {
struct kvm_pmu_events pmu_events;
};

typedef struct kvm_host_data kvm_host_data_t;

struct vcpu_reset_state {
unsigned long pc;
unsigned long r0;
@@ -480,18 +486,15 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_arm_halt_guest(struct kvm *kvm);
void kvm_arm_resume_guest(struct kvm *kvm);

u64 __kvm_call_hyp(void *hypfn, ...);

#define kvm_call_hyp_nvhe(f, ...) \
do { \
DECLARE_KVM_NVHE_SYM(f); \
__kvm_call_hyp(kvm_ksym_ref_nvhe(f), ##__VA_ARGS__); \
} while(0)

#define kvm_call_hyp_nvhe_ret(f, ...) \
#define kvm_call_hyp_nvhe(f, ...) \
({ \
DECLARE_KVM_NVHE_SYM(f); \
__kvm_call_hyp(kvm_ksym_ref_nvhe(f), ##__VA_ARGS__); \
struct arm_smccc_res res; \
\
arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(f), \
##__VA_ARGS__, &res); \
WARN_ON(res.a0 != SMCCC_RET_SUCCESS); \
\
res.a1; \
})

/*
@@ -517,7 +520,7 @@ u64 __kvm_call_hyp(void *hypfn, ...);
ret = f(__VA_ARGS__); \
isb(); \
} else { \
ret = kvm_call_hyp_nvhe_ret(f, ##__VA_ARGS__); \
ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \
} \
\
ret; \
@@ -565,7 +568,7 @@ void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);

struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);

DECLARE_PER_CPU(kvm_host_data_t, kvm_host_data);
DECLARE_KVM_HYP_PER_CPU(struct kvm_host_data, kvm_host_data);

static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt)
{
@@ -12,6 +12,9 @@
#include <asm/alternative.h>
#include <asm/sysreg.h>

DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
DECLARE_PER_CPU(unsigned long, kvm_hyp_vector);

#define read_sysreg_elx(r,nvh,vh) \
({ \
u64 reg; \
@@ -87,11 +90,11 @@ void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
void deactivate_traps_vhe_put(void);
#endif

u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
u64 __guest_enter(struct kvm_vcpu *vcpu);

void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt);
void __noreturn hyp_panic(void);
#ifdef __KVM_NVHE_HYPERVISOR__
void __noreturn __hyp_do_panic(unsigned long, ...);
void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
#endif

#endif /* __ARM64_KVM_HYP_H__ */
@@ -44,16 +44,6 @@
* HYP_VA_MIN = 1 << (VA_BITS - 1)
* HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1
*
* This of course assumes that the trampoline page exists within the
* VA_BITS range. If it doesn't, then it means we're in the odd case
* where the kernel idmap (as well as HYP) uses more levels than the
* kernel runtime page tables (as seen when the kernel is configured
* for 4k pages, 39bits VA, and yet memory lives just above that
* limit, forcing the idmap to use 4 levels of page tables while the
* kernel itself only uses 3). In this particular case, it doesn't
* matter which side of VA_BITS we use, as we're guaranteed not to
* conflict with anything.
*
* When using VHE, there are no separate hyp mappings and all KVM
* functionality is already mapped as part of the main kernel
* mappings, and none of this applies in that case.
@@ -118,15 +108,10 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
#define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm))
#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL))

static inline bool kvm_page_empty(void *ptr)
{
struct page *ptr_page = virt_to_page(ptr);
return page_count(ptr_page) == 1;
}

#include <asm/kvm_pgtable.h>
#include <asm/stage2_pgtable.h>

int create_hyp_mappings(void *from, void *to, pgprot_t prot);
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
void __iomem **kaddr,
void __iomem **haddr);
@@ -142,149 +127,9 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,

int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);

void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);

phys_addr_t kvm_mmu_get_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
int kvm_mmu_init(void);
void kvm_clear_hyp_idmap(void);

#define kvm_mk_pmd(ptep) \
__pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE)
#define kvm_mk_pud(pmdp) \
__pud(__phys_to_pud_val(__pa(pmdp)) | PMD_TYPE_TABLE)
#define kvm_mk_p4d(pmdp) \
__p4d(__phys_to_p4d_val(__pa(pmdp)) | PUD_TYPE_TABLE)

#define kvm_set_pud(pudp, pud) set_pud(pudp, pud)

#define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot)
#define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot)
#define kvm_pfn_pud(pfn, prot) pfn_pud(pfn, prot)

#define kvm_pud_pfn(pud) pud_pfn(pud)

#define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd)
#define kvm_pud_mkhuge(pud) pud_mkhuge(pud)

static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
{
pte_val(pte) |= PTE_S2_RDWR;
return pte;
}

static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
{
pmd_val(pmd) |= PMD_S2_RDWR;
return pmd;
}

static inline pud_t kvm_s2pud_mkwrite(pud_t pud)
{
pud_val(pud) |= PUD_S2_RDWR;
return pud;
}

static inline pte_t kvm_s2pte_mkexec(pte_t pte)
{
pte_val(pte) &= ~PTE_S2_XN;
return pte;
}

static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd)
{
pmd_val(pmd) &= ~PMD_S2_XN;
return pmd;
}

static inline pud_t kvm_s2pud_mkexec(pud_t pud)
{
pud_val(pud) &= ~PUD_S2_XN;
return pud;
}

static inline void kvm_set_s2pte_readonly(pte_t *ptep)
{
pteval_t old_pteval, pteval;

pteval = READ_ONCE(pte_val(*ptep));
do {
old_pteval = pteval;
pteval &= ~PTE_S2_RDWR;
pteval |= PTE_S2_RDONLY;
pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
} while (pteval != old_pteval);
}

static inline bool kvm_s2pte_readonly(pte_t *ptep)
{
return (READ_ONCE(pte_val(*ptep)) & PTE_S2_RDWR) == PTE_S2_RDONLY;
}

static inline bool kvm_s2pte_exec(pte_t *ptep)
{
return !(READ_ONCE(pte_val(*ptep)) & PTE_S2_XN);
}

static inline void kvm_set_s2pmd_readonly(pmd_t *pmdp)
{
kvm_set_s2pte_readonly((pte_t *)pmdp);
}

static inline bool kvm_s2pmd_readonly(pmd_t *pmdp)
{
return kvm_s2pte_readonly((pte_t *)pmdp);
}

static inline bool kvm_s2pmd_exec(pmd_t *pmdp)
{
return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN);
}

static inline void kvm_set_s2pud_readonly(pud_t *pudp)
{
kvm_set_s2pte_readonly((pte_t *)pudp);
}

static inline bool kvm_s2pud_readonly(pud_t *pudp)
{
return kvm_s2pte_readonly((pte_t *)pudp);
}

static inline bool kvm_s2pud_exec(pud_t *pudp)
{
return !(READ_ONCE(pud_val(*pudp)) & PUD_S2_XN);
}

static inline pud_t kvm_s2pud_mkyoung(pud_t pud)
{
return pud_mkyoung(pud);
}

static inline bool kvm_s2pud_young(pud_t pud)
{
return pud_young(pud);
}

#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)

#ifdef __PAGETABLE_PMD_FOLDED
#define hyp_pmd_table_empty(pmdp) (0)
#else
#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
#endif

#ifdef __PAGETABLE_PUD_FOLDED
#define hyp_pud_table_empty(pudp) (0)
#else
#define hyp_pud_table_empty(pudp) kvm_page_empty(pudp)
#endif

#ifdef __PAGETABLE_P4D_FOLDED
#define hyp_p4d_table_empty(p4dp) (0)
#else
#define hyp_p4d_table_empty(p4dp) kvm_page_empty(p4dp)
#endif

struct kvm;

@@ -326,77 +171,9 @@ static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
}
}

static inline void __kvm_flush_dcache_pte(pte_t pte)
{
if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
struct page *page = pte_page(pte);
kvm_flush_dcache_to_poc(page_address(page), PAGE_SIZE);
}
}

static inline void __kvm_flush_dcache_pmd(pmd_t pmd)
{
if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
struct page *page = pmd_page(pmd);
kvm_flush_dcache_to_poc(page_address(page), PMD_SIZE);
}
}

static inline void __kvm_flush_dcache_pud(pud_t pud)
{
if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
struct page *page = pud_page(pud);
kvm_flush_dcache_to_poc(page_address(page), PUD_SIZE);
}
}

void kvm_set_way_flush(struct kvm_vcpu *vcpu);
void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);

static inline bool __kvm_cpu_uses_extended_idmap(void)
{
return __cpu_uses_extended_idmap_level();
}

static inline unsigned long __kvm_idmap_ptrs_per_pgd(void)
{
return idmap_ptrs_per_pgd;
}

/*
* Can't use pgd_populate here, because the extended idmap adds an extra level
* above CONFIG_PGTABLE_LEVELS (which is 2 or 3 if we're using the extended
* idmap), and pgd_populate is only available if CONFIG_PGTABLE_LEVELS = 4.
*/
static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd,
pgd_t *hyp_pgd,
pgd_t *merged_hyp_pgd,
unsigned long hyp_idmap_start)
{
int idmap_idx;
u64 pgd_addr;

/*
* Use the first entry to access the HYP mappings. It is
* guaranteed to be free, otherwise we wouldn't use an
* extended idmap.
*/
VM_BUG_ON(pgd_val(merged_hyp_pgd[0]));
pgd_addr = __phys_to_pgd_val(__pa(hyp_pgd));
merged_hyp_pgd[0] = __pgd(pgd_addr | PMD_TYPE_TABLE);

/*
* Create another extended level entry that points to the boot HYP map,
* which contains an ID mapping of the HYP init code. We essentially
* merge the boot and runtime HYP maps by doing so, but they don't
* overlap anyway, so this is fine.
*/
idmap_idx = hyp_idmap_start >> VA_BITS;
VM_BUG_ON(pgd_val(merged_hyp_pgd[idmap_idx]));
pgd_addr = __phys_to_pgd_val(__pa(boot_hyp_pgd));
merged_hyp_pgd[idmap_idx] = __pgd(pgd_addr | PMD_TYPE_TABLE);
}

static inline unsigned int kvm_get_vmid_bits(void)
{
int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
@@ -479,30 +256,6 @@ static inline void *kvm_get_hyp_vector(void)

#define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr)

/*
* Get the magic number 'x' for VTTBR:BADDR of this KVM instance.
* With v8.2 LVA extensions, 'x' should be a minimum of 6 with
* 52bit IPS.
*/
static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels)
{
int x = ARM64_VTTBR_X(ipa_shift, levels);

return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x;
}

static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels)
{
unsigned int x = arm64_vttbr_x(ipa_shift, levels);

return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x);
}

static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm)
{
return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm));
}

static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
{
struct kvm_vmid *vmid = &mmu->vmid;
309
arch/arm64/include/asm/kvm_pgtable.h
Normal file
309
arch/arm64/include/asm/kvm_pgtable.h
Normal file
@ -0,0 +1,309 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
* Copyright (C) 2020 Google LLC
|
||||
* Author: Will Deacon <will@kernel.org>
|
||||
*/
|
||||
|
||||
#ifndef __ARM64_KVM_PGTABLE_H__
|
||||
#define __ARM64_KVM_PGTABLE_H__
|
||||
|
||||
#include <linux/bits.h>
|
||||
#include <linux/kvm_host.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
typedef u64 kvm_pte_t;
|
||||
|
||||
/**
|
||||
* struct kvm_pgtable - KVM page-table.
|
||||
* @ia_bits: Maximum input address size, in bits.
|
||||
* @start_level: Level at which the page-table walk starts.
|
||||
* @pgd: Pointer to the first top-level entry of the page-table.
|
||||
* @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
|
||||
*/
|
||||
struct kvm_pgtable {
|
||||
u32 ia_bits;
|
||||
u32 start_level;
|
||||
kvm_pte_t *pgd;
|
||||
|
||||
/* Stage-2 only */
|
||||
struct kvm_s2_mmu *mmu;
|
||||
};
|
||||
|
||||
/**
|
||||
* enum kvm_pgtable_prot - Page-table permissions and attributes.
|
||||
* @KVM_PGTABLE_PROT_X: Execute permission.
|
||||
* @KVM_PGTABLE_PROT_W: Write permission.
|
||||
* @KVM_PGTABLE_PROT_R: Read permission.
|
||||
* @KVM_PGTABLE_PROT_DEVICE: Device attributes.
|
||||
*/
|
||||
enum kvm_pgtable_prot {
|
||||
KVM_PGTABLE_PROT_X = BIT(0),
|
||||
KVM_PGTABLE_PROT_W = BIT(1),
|
||||
KVM_PGTABLE_PROT_R = BIT(2),
|
||||
|
||||
KVM_PGTABLE_PROT_DEVICE = BIT(3),
|
||||
};
|
||||
|
||||
#define PAGE_HYP (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
|
||||
#define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X)
|
||||
#define PAGE_HYP_RO (KVM_PGTABLE_PROT_R)
|
||||
#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
|
||||
|
||||
/**
|
||||
* enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
|
||||
* @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid
|
||||
* entries.
|
||||
* @KVM_PGTABLE_WALK_TABLE_PRE: Visit table entries before their
|
||||
* children.
|
||||
* @KVM_PGTABLE_WALK_TABLE_POST: Visit table entries after their
|
||||
* children.
|
||||
*/
|
||||
enum kvm_pgtable_walk_flags {
|
||||
KVM_PGTABLE_WALK_LEAF = BIT(0),
|
||||
KVM_PGTABLE_WALK_TABLE_PRE = BIT(1),
|
||||
KVM_PGTABLE_WALK_TABLE_POST = BIT(2),
|
||||
};
|
||||
|
||||
typedef int (*kvm_pgtable_visitor_fn_t)(u64 addr, u64 end, u32 level,
|
||||
kvm_pte_t *ptep,
|
||||
enum kvm_pgtable_walk_flags flag,
|
||||
void * const arg);
|
||||
|
||||
/**
|
||||
* struct kvm_pgtable_walker - Hook into a page-table walk.
|
||||
* @cb: Callback function to invoke during the walk.
|
||||
* @arg: Argument passed to the callback function.
|
||||
* @flags: Bitwise-OR of flags to identify the entry types on which to
|
||||
* invoke the callback function.
|
||||
*/
|
||||
struct kvm_pgtable_walker {
|
||||
const kvm_pgtable_visitor_fn_t cb;
|
||||
void * const arg;
|
||||
const enum kvm_pgtable_walk_flags flags;
|
||||
};
|
||||
|
||||
/**
|
||||
* kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
|
||||
* @pgt: Uninitialised page-table structure to initialise.
|
||||
* @va_bits: Maximum virtual address bits.
|
||||
*
|
||||
* Return: 0 on success, negative error code on failure.
|
||||
*/
|
||||
int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits);
|
||||
|
||||
/**
|
||||
* kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table.
|
||||
* @pgt: Page-table structure initialised by kvm_pgtable_hyp_init().
|
||||
*
|
||||
* The page-table is assumed to be unreachable by any hardware walkers prior
|
||||
* to freeing and therefore no TLB invalidation is performed.
|
||||
*/
|
||||
void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt);
|
||||
|
||||
/**
|
||||
* kvm_pgtable_hyp_map() - Install a mapping in a hypervisor stage-1 page-table.
|
||||
* @pgt: Page-table structure initialised by kvm_pgtable_hyp_init().
|
||||
* @addr: Virtual address at which to place the mapping.
|
||||
* @size: Size of the mapping.
|
||||
* @phys: Physical address of the memory to map.
|
||||
* @prot: Permissions and attributes for the mapping.
|
||||
*
|
||||
* The offset of @addr within a page is ignored, @size is rounded-up to
|
||||
* the next page boundary and @phys is rounded-down to the previous page
|
||||
* boundary.
|
||||
*
|
||||
* If device attributes are not explicitly requested in @prot, then the
|
||||
* mapping will be normal, cacheable. Attempts to install a new mapping
|
||||
* for a virtual address that is already mapped will be rejected with an
|
||||
* error and a WARN().
|
||||
*
|
||||
* Return: 0 on success, negative error code on failure.
|
||||
*/
|
||||
int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
|
||||
enum kvm_pgtable_prot prot);
|
||||
|
||||
/**
 * kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
 * @pgt:	Uninitialised page-table structure to initialise.
 * @kvm:	KVM structure representing the guest virtual machine.
 *
 * Return: 0 on success, negative error code on failure.
 */
int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm);

/**
 * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table.
 * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
 *
 * The page-table is assumed to be unreachable by any hardware walkers prior
 * to freeing and therefore no TLB invalidation is performed.
 */
void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);

/**
 * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
 * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
 * @addr:	Intermediate physical address at which to place the mapping.
 * @size:	Size of the mapping.
 * @phys:	Physical address of the memory to map.
 * @prot:	Permissions and attributes for the mapping.
 * @mc:		Cache of pre-allocated GFP_PGTABLE_USER memory from which to
 *		allocate page-table pages.
 *
 * The offset of @addr within a page is ignored, @size is rounded-up to
 * the next page boundary and @phys is rounded-down to the previous page
 * boundary.
 *
 * If device attributes are not explicitly requested in @prot, then the
 * mapping will be normal, cacheable.
 *
 * Note that this function will both coalesce existing table entries and split
 * existing block mappings, relying on page-faults to fault back areas outside
 * of the new mapping lazily.
 *
 * Return: 0 on success, negative error code on failure.
 */
int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
			   u64 phys, enum kvm_pgtable_prot prot,
			   struct kvm_mmu_memory_cache *mc);

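A sketch of how a caller might feed the @mc argument, assuming the generic kvm_mmu_topup_memory_cache()/kvm_mmu_free_memory_cache() helpers from linux/kvm_host.h; the real stage-2 fault path in mmu.c is more involved:

static int stage2_map_one_page_example(struct kvm_pgtable *pgt, u64 ipa,
				       phys_addr_t pa, int min_pages)
{
	struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
	int ret;

	/* Pre-allocate enough pages for a worst-case table walk. */
	ret = kvm_mmu_topup_memory_cache(&cache, min_pages);
	if (ret)
		return ret;

	ret = kvm_pgtable_stage2_map(pgt, ipa, PAGE_SIZE, pa,
				     KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W,
				     &cache);

	kvm_mmu_free_memory_cache(&cache);
	return ret;
}
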
/**
 * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table.
 * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
 * @addr:	Intermediate physical address from which to remove the mapping.
 * @size:	Size of the mapping.
 *
 * The offset of @addr within a page is ignored and @size is rounded-up to
 * the next page boundary.
 *
 * TLB invalidation is performed for each page-table entry cleared during the
 * unmapping operation and the reference count for the page-table page
 * containing the cleared entry is decremented, with unreferenced pages being
 * freed. Unmapping a cacheable page will ensure that it is clean to the PoC if
 * FWB is not supported by the CPU.
 *
 * Return: 0 on success, negative error code on failure.
 */
int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);

/**
 * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range
 *                                  without TLB invalidation.
 * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
 * @addr:	Intermediate physical address from which to write-protect.
 * @size:	Size of the range.
 *
 * The offset of @addr within a page is ignored and @size is rounded-up to
 * the next page boundary.
 *
 * Note that it is the caller's responsibility to invalidate the TLB after
 * calling this function to ensure that the updated permissions are visible
 * to the CPUs.
 *
 * Return: 0 on success, negative error code on failure.
 */
int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);

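A sketch of the write-protect/TLB-invalidate pairing the comment above requires, for example when enabling dirty logging for a memslot; kvm_flush_remote_tlbs() is assumed here to be the host-side helper that performs the VMID-wide invalidation, the exact helper used by mmu.c may differ:

static void wp_range_example(struct kvm *kvm, struct kvm_pgtable *pgt,
			     phys_addr_t start, phys_addr_t end)
{
	/* No TLB maintenance happens here... */
	WARN_ON(kvm_pgtable_stage2_wrprotect(pgt, start, end - start));

	/* ...so the caller must make the new permissions visible itself. */
	kvm_flush_remote_tlbs(kvm);
}
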
/**
 * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry.
 * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
 * @addr:	Intermediate physical address to identify the page-table entry.
 *
 * The offset of @addr within a page is ignored.
 *
 * If there is a valid, leaf page-table entry used to translate @addr, then
 * set the access flag in that entry.
 *
 * Return: The old page-table entry prior to setting the flag, 0 on failure.
 */
kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);

/**
 * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry.
 * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
 * @addr:	Intermediate physical address to identify the page-table entry.
 *
 * The offset of @addr within a page is ignored.
 *
 * If there is a valid, leaf page-table entry used to translate @addr, then
 * clear the access flag in that entry.
 *
 * Note that it is the caller's responsibility to invalidate the TLB after
 * calling this function to ensure that the updated permissions are visible
 * to the CPUs.
 *
 * Return: The old page-table entry prior to clearing the flag, 0 on failure.
 */
kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);

/**
 * kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a
 *                                    page-table entry.
 * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
 * @addr:	Intermediate physical address to identify the page-table entry.
 * @prot:	Additional permissions to grant for the mapping.
 *
 * The offset of @addr within a page is ignored.
 *
 * If there is a valid, leaf page-table entry used to translate @addr, then
 * relax the permissions in that entry according to the read, write and
 * execute permissions specified by @prot. No permissions are removed, and
 * TLB invalidation is performed after updating the entry.
 *
 * Return: 0 on success, negative error code on failure.
 */
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
				   enum kvm_pgtable_prot prot);

/**
 * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the
 *                                 access flag set.
 * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
 * @addr:	Intermediate physical address to identify the page-table entry.
 *
 * The offset of @addr within a page is ignored.
 *
 * Return: True if the page-table entry has the access flag set, false otherwise.
 */
bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);

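A sketch of how the access-flag helpers can back MMU-notifier-style page aging; the 0/1 return convention below is illustrative only, the real handlers in mmu.c wrap these calls differently:

static int age_ipa_example(struct kvm_pgtable *pgt, u64 ipa)
{
	/* A non-zero old PTE means there was a valid leaf entry to age. */
	return kvm_pgtable_stage2_mkold(pgt, ipa) ? 1 : 0;
}

static bool test_age_ipa_example(struct kvm_pgtable *pgt, u64 ipa)
{
	return kvm_pgtable_stage2_is_young(pgt, ipa);
}
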
/**
 * kvm_pgtable_stage2_flush() - Clean and invalidate data cache to Point of
 *                              Coherency for guest stage-2 address range.
 * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
 * @addr:	Intermediate physical address from which to flush.
 * @size:	Size of the range.
 *
 * The offset of @addr within a page is ignored and @size is rounded-up to
 * the next page boundary.
 *
 * Return: 0 on success, negative error code on failure.
 */
int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);

/**
 * kvm_pgtable_walk() - Walk a page-table.
 * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
 * @addr:	Input address for the start of the walk.
 * @size:	Size of the range to walk.
 * @walker:	Walker callback description.
 *
 * The offset of @addr within a page is ignored and @size is rounded-up to
 * the next page boundary.
 *
 * The walker will walk the page-table entries corresponding to the input
 * address range specified, visiting entries according to the walker flags.
 * Invalid entries are treated as leaf entries. Leaf entries are reloaded
 * after invoking the walker callback, allowing the walker to descend into
 * a newly installed table.
 *
 * Returning a negative error code from the walker callback function will
 * terminate the walk immediately with the same error code.
 *
 * Return: 0 on success, negative error code on failure.
 */
int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
		     struct kvm_pgtable_walker *walker);

#endif /* __ARM64_KVM_PGTABLE_H__ */

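A sketch of a kvm_pgtable_walk() user that counts valid leaf entries in a range; struct kvm_pgtable_walker, the visitor signature and KVM_PGTABLE_WALK_LEAF are assumed from the same header, which is not shown in full in this excerpt:

static int count_leaf_cb(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
			 enum kvm_pgtable_walk_flags flag, void * const arg)
{
	u64 *count = arg;

	if (*ptep & BIT(0))	/* bit 0 of an ARM descriptor is the valid bit */
		(*count)++;

	return 0;		/* a negative return would abort the walk */
}

static u64 count_leaves_example(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	u64 count = 0;
	struct kvm_pgtable_walker walker = {
		.cb	= count_leaf_cb,
		.arg	= &count,
		.flags	= KVM_PGTABLE_WALK_LEAF,
	};

	WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
	return count;
}
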
@@ -60,7 +60,7 @@
 .endm

 /*
- * Both ptrauth_switch_to_guest and ptrauth_switch_to_host macros will
+ * Both ptrauth_switch_to_guest and ptrauth_switch_to_hyp macros will
  * check for the presence ARM64_HAS_ADDRESS_AUTH, which is defined as
  * (ARM64_HAS_ADDRESS_AUTH_ARCH || ARM64_HAS_ADDRESS_AUTH_IMP_DEF) and
  * then proceed ahead with the save/restore of Pointer Authentication
@@ -78,7 +78,7 @@ alternative_else_nop_endif
 .L__skip_switch\@:
 .endm

-.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3
+.macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3
 alternative_if_not ARM64_HAS_ADDRESS_AUTH
 	b	.L__skip_switch\@
 alternative_else_nop_endif
@@ -96,7 +96,7 @@ alternative_else_nop_endif
 #else /* !CONFIG_ARM64_PTR_AUTH */
 .macro ptrauth_switch_to_guest g_ctxt, reg1, reg2, reg3
 .endm
-.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3
+.macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3
 .endm
 #endif /* CONFIG_ARM64_PTR_AUTH */
 #endif /* __ASSEMBLY__ */

@@ -19,7 +19,16 @@ static inline void set_my_cpu_offset(unsigned long off)
 			:: "r" (off) : "memory");
 }

-static inline unsigned long __my_cpu_offset(void)
+static inline unsigned long __hyp_my_cpu_offset(void)
+{
+	/*
+	 * Non-VHE hyp code runs with preemption disabled. No need to hazard
+	 * the register access against barrier() as in __kern_my_cpu_offset.
+	 */
+	return read_sysreg(tpidr_el2);
+}
+
+static inline unsigned long __kern_my_cpu_offset(void)
 {
 	unsigned long off;

@@ -35,7 +44,12 @@ static inline unsigned long __my_cpu_offset(void)

 	return off;
 }
-#define __my_cpu_offset __my_cpu_offset()
+
+#ifdef __KVM_NVHE_HYPERVISOR__
+#define __my_cpu_offset __hyp_my_cpu_offset()
+#else
+#define __my_cpu_offset __kern_my_cpu_offset()
+#endif

 #define PERCPU_RW_OPS(sz)						\
 static inline unsigned long __percpu_read_##sz(void *ptr)		\
@@ -227,4 +241,14 @@ PERCPU_RET_OP(add, add, ldadd)

 #include <asm-generic/percpu.h>

+/* Redefine macros for nVHE hyp under DEBUG_PREEMPT to avoid its dependencies. */
+#if defined(__KVM_NVHE_HYPERVISOR__) && defined(CONFIG_DEBUG_PREEMPT)
+#undef	this_cpu_ptr
+#define	this_cpu_ptr		raw_cpu_ptr
+#undef	__this_cpu_read
+#define	__this_cpu_read		raw_cpu_read
+#undef	__this_cpu_write
+#define	__this_cpu_write	raw_cpu_write
+#endif
+
 #endif /* __ASM_PERCPU_H */

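A minimal sketch of what this enables: nVHE hyp code can declare and use its own per-CPU variables with the ordinary accessors, resolved through tpidr_el2; the variable and helper below are made up for illustration:

DEFINE_PER_CPU(unsigned long, hyp_example_counter);	/* hypothetical */

static void bump_example_counter(void)
{
	/*
	 * __my_cpu_offset resolves to __hyp_my_cpu_offset(), i.e. tpidr_el2,
	 * when this file is built with __KVM_NVHE_HYPERVISOR__ defined.
	 */
	__this_cpu_write(hyp_example_counter,
			 __this_cpu_read(hyp_example_counter) + 1);
}
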
@@ -146,7 +146,6 @@
 #define PTE_CONT		(_AT(pteval_t, 1) << 52)	/* Contiguous range */
 #define PTE_PXN			(_AT(pteval_t, 1) << 53)	/* Privileged XN */
 #define PTE_UXN			(_AT(pteval_t, 1) << 54)	/* User XN */
-#define PTE_HYP_XN		(_AT(pteval_t, 1) << 54)	/* HYP XN */

 #define PTE_ADDR_LOW		(((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
 #ifdef CONFIG_ARM64_PA_BITS_52
@@ -162,34 +161,11 @@
 #define PTE_ATTRINDX(t)		(_AT(pteval_t, (t)) << 2)
 #define PTE_ATTRINDX_MASK	(_AT(pteval_t, 7) << 2)

-/*
- * 2nd stage PTE definitions
- */
-#define PTE_S2_RDONLY		(_AT(pteval_t, 1) << 6)   /* HAP[2:1] */
-#define PTE_S2_RDWR		(_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
-#define PTE_S2_XN		(_AT(pteval_t, 2) << 53)  /* XN[1:0] */
-#define PTE_S2_SW_RESVD		(_AT(pteval_t, 15) << 55) /* Reserved for SW */
-
-#define PMD_S2_RDONLY		(_AT(pmdval_t, 1) << 6)   /* HAP[2:1] */
-#define PMD_S2_RDWR		(_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
-#define PMD_S2_XN		(_AT(pmdval_t, 2) << 53)  /* XN[1:0] */
-#define PMD_S2_SW_RESVD		(_AT(pmdval_t, 15) << 55) /* Reserved for SW */
-
-#define PUD_S2_RDONLY		(_AT(pudval_t, 1) << 6)   /* HAP[2:1] */
-#define PUD_S2_RDWR		(_AT(pudval_t, 3) << 6)   /* HAP[2:1] */
-#define PUD_S2_XN		(_AT(pudval_t, 2) << 53)  /* XN[1:0] */
-
 /*
  * Memory Attribute override for Stage-2 (MemAttr[3:0])
  */
 #define PTE_S2_MEMATTR(t)	(_AT(pteval_t, (t)) << 2)

-/*
- * EL2/HYP PTE/PMD definitions
- */
-#define PMD_HYP			PMD_SECT_USER
-#define PTE_HYP			PTE_USER
-
 /*
  * Highest possible physical address supported.
  */

@@ -64,7 +64,6 @@ extern bool arm64_use_ng_mappings;
 #define PROT_SECT_NORMAL_EXEC	(PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL))

 #define _PAGE_DEFAULT		(_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL))
-#define _HYP_PAGE_DEFAULT	_PAGE_DEFAULT

 #define PAGE_KERNEL		__pgprot(PROT_NORMAL)
 #define PAGE_KERNEL_TAGGED	__pgprot(PROT_NORMAL_TAGGED)
@@ -73,11 +72,6 @@ extern bool arm64_use_ng_mappings;
 #define PAGE_KERNEL_EXEC	__pgprot(PROT_NORMAL & ~PTE_PXN)
 #define PAGE_KERNEL_EXEC_CONT	__pgprot((PROT_NORMAL & ~PTE_PXN) | PTE_CONT)

-#define PAGE_HYP		__pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN)
-#define PAGE_HYP_EXEC		__pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY)
-#define PAGE_HYP_RO		__pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN)
-#define PAGE_HYP_DEVICE		__pgprot(_PROT_DEFAULT | PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_HYP | PTE_HYP_XN)
-
 #define PAGE_S2_MEMATTR(attr)						\
 	({								\
 		u64 __val;						\
@@ -88,19 +82,6 @@ extern bool arm64_use_ng_mappings;
 		__val;							\
 	 })

-#define PAGE_S2_XN							\
-	({								\
-		u64 __val;						\
-		if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))		\
-			__val = 0;					\
-		else							\
-			__val = PTE_S2_XN;				\
-		__val;							\
-	})
-
-#define PAGE_S2			__pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN)
-#define PAGE_S2_DEVICE		__pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN)
-
 #define PAGE_NONE		__pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
 /* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */
 #define PAGE_SHARED		__pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE)

@ -8,7 +8,6 @@
|
||||
#ifndef __ARM64_S2_PGTABLE_H_
|
||||
#define __ARM64_S2_PGTABLE_H_
|
||||
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/pgtable.h>
|
||||
|
||||
/*
|
||||
@ -36,21 +35,6 @@
|
||||
#define stage2_pgdir_size(kvm) (1ULL << stage2_pgdir_shift(kvm))
|
||||
#define stage2_pgdir_mask(kvm) ~(stage2_pgdir_size(kvm) - 1)
|
||||
|
||||
/*
|
||||
* The number of PTRS across all concatenated stage2 tables given by the
|
||||
* number of bits resolved at the initial level.
|
||||
* If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA),
|
||||
* in which case, stage2_pgd_ptrs will have one entry.
|
||||
*/
|
||||
#define pgd_ptrs_shift(ipa, pgdir_shift) \
|
||||
((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0)
|
||||
#define __s2_pgd_ptrs(ipa, lvls) \
|
||||
(1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls))))
|
||||
#define __s2_pgd_size(ipa, lvls) (__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t))
|
||||
|
||||
#define stage2_pgd_ptrs(kvm) __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
|
||||
#define stage2_pgd_size(kvm) __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
|
||||
|
||||
/*
|
||||
* kvm_mmu_cache_min_pages() is the number of pages required to install
|
||||
* a stage-2 translation. We pre-allocate the entry level page table at
|
||||
@ -58,196 +42,6 @@
|
||||
*/
|
||||
#define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1)
|
||||
|
||||
/* Stage2 PUD definitions when the level is present */
|
||||
static inline bool kvm_stage2_has_pud(struct kvm *kvm)
|
||||
{
|
||||
return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3);
|
||||
}
|
||||
|
||||
#define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
|
||||
#define S2_PUD_SIZE (1UL << S2_PUD_SHIFT)
|
||||
#define S2_PUD_MASK (~(S2_PUD_SIZE - 1))
|
||||
|
||||
#define stage2_pgd_none(kvm, pgd) pgd_none(pgd)
|
||||
#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd)
|
||||
#define stage2_pgd_present(kvm, pgd) pgd_present(pgd)
|
||||
#define stage2_pgd_populate(kvm, pgd, p4d) pgd_populate(NULL, pgd, p4d)
|
||||
|
||||
static inline p4d_t *stage2_p4d_offset(struct kvm *kvm,
|
||||
pgd_t *pgd, unsigned long address)
|
||||
{
|
||||
return p4d_offset(pgd, address);
|
||||
}
|
||||
|
||||
static inline void stage2_p4d_free(struct kvm *kvm, p4d_t *p4d)
|
||||
{
|
||||
}
|
||||
|
||||
static inline bool stage2_p4d_table_empty(struct kvm *kvm, p4d_t *p4dp)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline phys_addr_t stage2_p4d_addr_end(struct kvm *kvm,
|
||||
phys_addr_t addr, phys_addr_t end)
|
||||
{
|
||||
return end;
|
||||
}
|
||||
|
||||
static inline bool stage2_p4d_none(struct kvm *kvm, p4d_t p4d)
|
||||
{
|
||||
if (kvm_stage2_has_pud(kvm))
|
||||
return p4d_none(p4d);
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void stage2_p4d_clear(struct kvm *kvm, p4d_t *p4dp)
|
||||
{
|
||||
if (kvm_stage2_has_pud(kvm))
|
||||
p4d_clear(p4dp);
|
||||
}
|
||||
|
||||
static inline bool stage2_p4d_present(struct kvm *kvm, p4d_t p4d)
|
||||
{
|
||||
if (kvm_stage2_has_pud(kvm))
|
||||
return p4d_present(p4d);
|
||||
else
|
||||
return 1;
|
||||
}
|
||||
|
||||
static inline void stage2_p4d_populate(struct kvm *kvm, p4d_t *p4d, pud_t *pud)
|
||||
{
|
||||
if (kvm_stage2_has_pud(kvm))
|
||||
p4d_populate(NULL, p4d, pud);
|
||||
}
|
||||
|
||||
static inline pud_t *stage2_pud_offset(struct kvm *kvm,
|
||||
p4d_t *p4d, unsigned long address)
|
||||
{
|
||||
if (kvm_stage2_has_pud(kvm))
|
||||
return pud_offset(p4d, address);
|
||||
else
|
||||
return (pud_t *)p4d;
|
||||
}
|
||||
|
||||
static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud)
|
||||
{
|
||||
if (kvm_stage2_has_pud(kvm))
|
||||
free_page((unsigned long)pud);
|
||||
}
|
||||
|
||||
static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp)
|
||||
{
|
||||
if (kvm_stage2_has_pud(kvm))
|
||||
return kvm_page_empty(pudp);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline phys_addr_t
|
||||
stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
|
||||
{
|
||||
if (kvm_stage2_has_pud(kvm)) {
|
||||
phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
|
||||
|
||||
return (boundary - 1 < end - 1) ? boundary : end;
|
||||
} else {
|
||||
return end;
|
||||
}
|
||||
}
|
||||
|
||||
/* Stage2 PMD definitions when the level is present */
|
||||
static inline bool kvm_stage2_has_pmd(struct kvm *kvm)
|
||||
{
|
||||
return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2);
|
||||
}
|
||||
|
||||
#define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2)
|
||||
#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT)
|
||||
#define S2_PMD_MASK (~(S2_PMD_SIZE - 1))
|
||||
|
||||
static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud)
|
||||
{
|
||||
if (kvm_stage2_has_pmd(kvm))
|
||||
return pud_none(pud);
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud)
|
||||
{
|
||||
if (kvm_stage2_has_pmd(kvm))
|
||||
pud_clear(pud);
|
||||
}
|
||||
|
||||
static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud)
|
||||
{
|
||||
if (kvm_stage2_has_pmd(kvm))
|
||||
return pud_present(pud);
|
||||
else
|
||||
return 1;
|
||||
}
|
||||
|
||||
static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd)
|
||||
{
|
||||
if (kvm_stage2_has_pmd(kvm))
|
||||
pud_populate(NULL, pud, pmd);
|
||||
}
|
||||
|
||||
static inline pmd_t *stage2_pmd_offset(struct kvm *kvm,
|
||||
pud_t *pud, unsigned long address)
|
||||
{
|
||||
if (kvm_stage2_has_pmd(kvm))
|
||||
return pmd_offset(pud, address);
|
||||
else
|
||||
return (pmd_t *)pud;
|
||||
}
|
||||
|
||||
static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd)
|
||||
{
|
||||
if (kvm_stage2_has_pmd(kvm))
|
||||
free_page((unsigned long)pmd);
|
||||
}
|
||||
|
||||
static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud)
|
||||
{
|
||||
if (kvm_stage2_has_pmd(kvm))
|
||||
return pud_huge(pud);
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp)
|
||||
{
|
||||
if (kvm_stage2_has_pmd(kvm))
|
||||
return kvm_page_empty(pmdp);
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline phys_addr_t
|
||||
stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
|
||||
{
|
||||
if (kvm_stage2_has_pmd(kvm)) {
|
||||
phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
|
||||
|
||||
return (boundary - 1 < end - 1) ? boundary : end;
|
||||
} else {
|
||||
return end;
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep)
|
||||
{
|
||||
return kvm_page_empty(ptep);
|
||||
}
|
||||
|
||||
static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr)
|
||||
{
|
||||
return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1));
|
||||
}
|
||||
|
||||
static inline phys_addr_t
|
||||
stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
|
||||
{
|
||||
@ -256,13 +50,4 @@ stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
|
||||
return (boundary - 1 < end - 1) ? boundary : end;
|
||||
}
|
||||
|
||||
/*
|
||||
* Level values for the ARMv8.4-TTL extension, mapping PUD/PMD/PTE and
|
||||
* the architectural page-table level.
|
||||
*/
|
||||
#define S2_NO_LEVEL_HINT 0
|
||||
#define S2_PUD_LEVEL 1
|
||||
#define S2_PMD_LEVEL 2
|
||||
#define S2_PTE_LEVEL 3
|
||||
|
||||
#endif /* __ARM64_S2_PGTABLE_H_ */
|
||||
|
@@ -159,6 +159,21 @@ struct kvm_sync_regs {
 struct kvm_arch_memory_slot {
 };

+/*
+ * PMU filter structure. Describe a range of events with a particular
+ * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER.
+ */
+struct kvm_pmu_event_filter {
+	__u16	base_event;
+	__u16	nevents;
+
+#define KVM_PMU_EVENT_ALLOW	0
+#define KVM_PMU_EVENT_DENY	1
+
+	__u8	action;
+	__u8	pad[3];
+};
+
 /* for KVM_GET/SET_VCPU_EVENTS */
 struct kvm_vcpu_events {
 	struct {
@@ -338,6 +353,7 @@ struct kvm_vcpu_events {
 #define KVM_ARM_VCPU_PMU_V3_CTRL	0
 #define KVM_ARM_VCPU_PMU_V3_IRQ		0
 #define KVM_ARM_VCPU_PMU_V3_INIT	1
+#define KVM_ARM_VCPU_PMU_V3_FILTER	2
 #define KVM_ARM_VCPU_TIMER_CTRL		1
 #define KVM_ARM_VCPU_TIMER_IRQ_VTIMER	0
 #define KVM_ARM_VCPU_TIMER_IRQ_PTIMER	1

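A userspace sketch of programming the filter, assuming it is set through the vcpu device-attribute interface (KVM_SET_DEVICE_ATTR with the KVM_ARM_VCPU_PMU_V3_CTRL group) like the other PMU_V3 attributes; see the KVM documentation for the authoritative contract:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Deny a contiguous range of PMU event numbers for one vcpu. */
static int deny_pmu_events(int vcpu_fd, __u16 first_event, __u16 nevents)
{
	struct kvm_pmu_event_filter filter = {
		.base_event	= first_event,
		.nevents	= nevents,
		.action		= KVM_PMU_EVENT_DENY,
	};
	struct kvm_device_attr attr = {
		.group	= KVM_ARM_VCPU_PMU_V3_CTRL,
		.attr	= KVM_ARM_VCPU_PMU_V3_FILTER,
		.addr	= (__u64)(unsigned long)&filter,
	};

	return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);
}
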
@@ -61,14 +61,11 @@ __efistub__ctype = _ctype;
  * memory mappings.
  */

 #define KVM_NVHE_ALIAS(sym) __kvm_nvhe_##sym = sym;

 /* Alternative callbacks for init-time patching of nVHE hyp code. */
 KVM_NVHE_ALIAS(kvm_patch_vector_branch);
 KVM_NVHE_ALIAS(kvm_update_va_mask);

 /* Global kernel state accessed by nVHE hyp code. */
-KVM_NVHE_ALIAS(kvm_host_data);
 KVM_NVHE_ALIAS(kvm_vgic_global_state);

 /* Kernel constant needed to compute idmap addresses. */

@@ -10,6 +10,7 @@

#include <asm-generic/vmlinux.lds.h>
#include <asm/cache.h>
#include <asm/hyp_image.h>
#include <asm/kernel-pgtable.h>
#include <asm/memory.h>
#include <asm/page.h>
@@ -22,12 +23,23 @@ ENTRY(_text)
jiffies = jiffies_64;


#ifdef CONFIG_KVM
#define HYPERVISOR_EXTABLE					\
	. = ALIGN(SZ_8);					\
	__start___kvm_ex_table = .;				\
	*(__kvm_ex_table)					\
	__stop___kvm_ex_table = .;

#define HYPERVISOR_PERCPU_SECTION				\
	. = ALIGN(PAGE_SIZE);					\
	HYP_SECTION_NAME(.data..percpu) : {			\
		*(HYP_SECTION_NAME(.data..percpu))		\
	}
#else /* CONFIG_KVM */
#define HYPERVISOR_EXTABLE
#define HYPERVISOR_PERCPU_SECTION
#endif

#define HYPERVISOR_TEXT						\
	/*							\
	 * Align to 4 KB so that				\
@@ -196,6 +208,7 @@ SECTIONS
	}

	PERCPU_SECTION(L1_CACHE_BYTES)
	HYPERVISOR_PERCPU_SECTION

	.rela.dyn : ALIGN(8) {
		*(.rela .rela*)

@@ -13,7 +13,7 @@ obj-$(CONFIG_KVM) += hyp/
 kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
	 $(KVM)/vfio.o $(KVM)/irqchip.o \
	 arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
-	 inject_fault.o regmap.o va_layout.o hyp.o handle_exit.o \
+	 inject_fault.o regmap.o va_layout.o handle_exit.o \
	 guest.o debug.o reset.o sys_regs.o \
	 vgic-sys-reg-v3.o fpsimd.o pmu.o \
	 aarch32.o arch_timer.o \

@ -46,8 +46,10 @@
|
||||
__asm__(".arch_extension virt");
|
||||
#endif
|
||||
|
||||
DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data);
|
||||
DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
|
||||
|
||||
static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
|
||||
unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
|
||||
|
||||
/* The VMID used in the VTTBR */
|
||||
static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
|
||||
@ -145,6 +147,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
|
||||
{
|
||||
int i;
|
||||
|
||||
bitmap_free(kvm->arch.pmu_filter);
|
||||
|
||||
kvm_vgic_destroy(kvm);
|
||||
|
||||
for (i = 0; i < KVM_MAX_VCPUS; ++i) {
|
||||
@ -286,7 +290,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
|
||||
if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
|
||||
static_branch_dec(&userspace_irqchip_in_use);
|
||||
|
||||
kvm_mmu_free_memory_caches(vcpu);
|
||||
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
|
||||
kvm_timer_vcpu_terminate(vcpu);
|
||||
kvm_pmu_vcpu_destroy(vcpu);
|
||||
|
||||
@ -1259,6 +1263,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned long nvhe_percpu_size(void)
|
||||
{
|
||||
return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
|
||||
(unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start);
|
||||
}
|
||||
|
||||
static unsigned long nvhe_percpu_order(void)
|
||||
{
|
||||
unsigned long size = nvhe_percpu_size();
|
||||
|
||||
return size ? get_order(size) : 0;
|
||||
}
|
||||
|
||||
static int kvm_map_vectors(void)
|
||||
{
|
||||
/*
|
||||
@ -1299,6 +1316,7 @@ static void cpu_init_hyp_mode(void)
|
||||
unsigned long hyp_stack_ptr;
|
||||
unsigned long vector_ptr;
|
||||
unsigned long tpidr_el2;
|
||||
struct arm_smccc_res res;
|
||||
|
||||
/* Switch from the HYP stub to our own HYP init vector */
|
||||
__hyp_set_vectors(kvm_get_idmap_vector());
|
||||
@ -1308,12 +1326,13 @@ static void cpu_init_hyp_mode(void)
|
||||
* kernel's mapping to the linear mapping, and store it in tpidr_el2
|
||||
* so that we can use adr_l to access per-cpu variables in EL2.
|
||||
*/
|
||||
tpidr_el2 = ((unsigned long)this_cpu_ptr(&kvm_host_data) -
|
||||
(unsigned long)kvm_ksym_ref(&kvm_host_data));
|
||||
tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) -
|
||||
(unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
|
||||
|
||||
pgd_ptr = kvm_mmu_get_httbr();
|
||||
hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE;
|
||||
vector_ptr = (unsigned long)kvm_get_hyp_vector();
|
||||
hyp_stack_ptr = kern_hyp_va(hyp_stack_ptr);
|
||||
vector_ptr = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector));
|
||||
|
||||
/*
|
||||
* Call initialization code, and switch to the full blown HYP code.
|
||||
@ -1322,7 +1341,9 @@ static void cpu_init_hyp_mode(void)
|
||||
* cpus_have_const_cap() wrapper.
|
||||
*/
|
||||
BUG_ON(!system_capabilities_finalized());
|
||||
__kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2);
|
||||
arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init),
|
||||
pgd_ptr, tpidr_el2, hyp_stack_ptr, vector_ptr, &res);
|
||||
WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
|
||||
|
||||
/*
|
||||
* Disabling SSBD on a non-VHE system requires us to enable SSBS
|
||||
@ -1342,10 +1363,12 @@ static void cpu_hyp_reset(void)
|
||||
|
||||
static void cpu_hyp_reinit(void)
|
||||
{
|
||||
kvm_init_host_cpu_context(&this_cpu_ptr(&kvm_host_data)->host_ctxt);
|
||||
kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt);
|
||||
|
||||
cpu_hyp_reset();
|
||||
|
||||
*this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)kvm_get_hyp_vector();
|
||||
|
||||
if (is_kernel_in_hyp_mode())
|
||||
kvm_timer_init_vhe();
|
||||
else
|
||||
@ -1496,8 +1519,10 @@ static void teardown_hyp_mode(void)
|
||||
int cpu;
|
||||
|
||||
free_hyp_pgds();
|
||||
for_each_possible_cpu(cpu)
|
||||
for_each_possible_cpu(cpu) {
|
||||
free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
|
||||
free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1530,6 +1555,24 @@ static int init_hyp_mode(void)
|
||||
per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate and initialize pages for Hypervisor-mode percpu regions.
|
||||
*/
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct page *page;
|
||||
void *page_addr;
|
||||
|
||||
page = alloc_pages(GFP_KERNEL, nvhe_percpu_order());
|
||||
if (!page) {
|
||||
err = -ENOMEM;
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
page_addr = page_address(page);
|
||||
memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
|
||||
kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Map the Hyp-code called directly from the host
|
||||
*/
|
||||
@ -1574,14 +1617,17 @@ static int init_hyp_mode(void)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Map Hyp percpu pages
|
||||
*/
|
||||
for_each_possible_cpu(cpu) {
|
||||
kvm_host_data_t *cpu_data;
|
||||
char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu];
|
||||
char *percpu_end = percpu_begin + nvhe_percpu_size();
|
||||
|
||||
cpu_data = per_cpu_ptr(&kvm_host_data, cpu);
|
||||
err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP);
|
||||
err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
|
||||
|
||||
if (err) {
|
||||
kvm_err("Cannot map host CPU state: %d\n", err);
|
||||
kvm_err("Cannot map hyp percpu region\n");
|
||||
goto out_err;
|
||||
}
|
||||
}
|
||||
|
@ -1,34 +0,0 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (C) 2012,2013 - ARM Ltd
|
||||
* Author: Marc Zyngier <marc.zyngier@arm.com>
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
#include <asm/alternative.h>
|
||||
#include <asm/assembler.h>
|
||||
#include <asm/cpufeature.h>
|
||||
|
||||
/*
|
||||
* u64 __kvm_call_hyp(void *hypfn, ...);
|
||||
*
|
||||
* This is not really a variadic function in the classic C-way and care must
|
||||
* be taken when calling this to ensure parameters are passed in registers
|
||||
* only, since the stack will change between the caller and the callee.
|
||||
*
|
||||
* Call the function with the first argument containing a pointer to the
|
||||
* function you wish to call in Hyp mode, and subsequent arguments will be
|
||||
* passed as x0, x1, and x2 (a maximum of 3 arguments in addition to the
|
||||
* function pointer can be passed). The function being called must be mapped
|
||||
* in Hyp mode (see init_hyp_mode in arch/arm/kvm/arm.c). Return values are
|
||||
* passed in x0.
|
||||
*
|
||||
* A function pointer with a value less than 0xfff has a special meaning,
|
||||
* and is used to implement hyp stubs in the same way as in
|
||||
* arch/arm64/kernel/hyp_stub.S.
|
||||
*/
|
||||
SYM_FUNC_START(__kvm_call_hyp)
|
||||
hvc #0
|
||||
ret
|
||||
SYM_FUNC_END(__kvm_call_hyp)
|
@@ -10,4 +10,4 @@ subdir-ccflags-y := -I$(incdir) \
		    -DDISABLE_BRANCH_PROFILING	\
		    $(DISABLE_STACKLEAK_PLUGIN)

-obj-$(CONFIG_KVM) += vhe/ nvhe/ smccc_wa.o
+obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o smccc_wa.o

@ -7,7 +7,6 @@
|
||||
#include <linux/linkage.h>
|
||||
|
||||
#include <asm/alternative.h>
|
||||
#include <asm/asm-offsets.h>
|
||||
#include <asm/assembler.h>
|
||||
#include <asm/fpsimdmacros.h>
|
||||
#include <asm/kvm.h>
|
||||
@ -16,66 +15,28 @@
|
||||
#include <asm/kvm_mmu.h>
|
||||
#include <asm/kvm_ptrauth.h>
|
||||
|
||||
#define CPU_XREG_OFFSET(x) (CPU_USER_PT_REGS + 8*x)
|
||||
#define CPU_SP_EL0_OFFSET (CPU_XREG_OFFSET(30) + 8)
|
||||
|
||||
.text
|
||||
|
||||
/*
|
||||
* We treat x18 as callee-saved as the host may use it as a platform
|
||||
* register (e.g. for shadow call stack).
|
||||
*/
|
||||
.macro save_callee_saved_regs ctxt
|
||||
str x18, [\ctxt, #CPU_XREG_OFFSET(18)]
|
||||
stp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
|
||||
stp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
|
||||
stp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
|
||||
stp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
|
||||
stp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
|
||||
stp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
|
||||
.endm
|
||||
|
||||
.macro restore_callee_saved_regs ctxt
|
||||
// We require \ctxt is not x18-x28
|
||||
ldr x18, [\ctxt, #CPU_XREG_OFFSET(18)]
|
||||
ldp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
|
||||
ldp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
|
||||
ldp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
|
||||
ldp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
|
||||
ldp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
|
||||
ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
|
||||
.endm
|
||||
|
||||
.macro save_sp_el0 ctxt, tmp
|
||||
mrs \tmp, sp_el0
|
||||
str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
|
||||
.endm
|
||||
|
||||
.macro restore_sp_el0 ctxt, tmp
|
||||
ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
|
||||
msr sp_el0, \tmp
|
||||
.endm
|
||||
|
||||
/*
|
||||
* u64 __guest_enter(struct kvm_vcpu *vcpu,
|
||||
* struct kvm_cpu_context *host_ctxt);
|
||||
* u64 __guest_enter(struct kvm_vcpu *vcpu);
|
||||
*/
|
||||
SYM_FUNC_START(__guest_enter)
|
||||
// x0: vcpu
|
||||
// x1: host context
|
||||
// x2-x17: clobbered by macros
|
||||
// x1-x17: clobbered by macros
|
||||
// x29: guest context
|
||||
|
||||
// Store the host regs
|
||||
adr_this_cpu x1, kvm_hyp_ctxt, x2
|
||||
|
||||
// Store the hyp regs
|
||||
save_callee_saved_regs x1
|
||||
|
||||
// Save the host's sp_el0
|
||||
// Save hyp's sp_el0
|
||||
save_sp_el0 x1, x2
|
||||
|
||||
// Now the host state is stored if we have a pending RAS SError it must
|
||||
// affect the host. If any asynchronous exception is pending we defer
|
||||
// the guest entry. The DSB isn't necessary before v8.2 as any SError
|
||||
// would be fatal.
|
||||
// Now the hyp state is stored if we have a pending RAS SError it must
|
||||
// affect the host or hyp. If any asynchronous exception is pending we
|
||||
// defer the guest entry. The DSB isn't necessary before v8.2 as any
|
||||
// SError would be fatal.
|
||||
alternative_if ARM64_HAS_RAS_EXTN
|
||||
dsb nshst
|
||||
isb
|
||||
@ -86,6 +47,8 @@ alternative_else_nop_endif
|
||||
ret
|
||||
|
||||
1:
|
||||
set_loaded_vcpu x0, x1, x2
|
||||
|
||||
add x29, x0, #VCPU_CONTEXT
|
||||
|
||||
// Macro ptrauth_switch_to_guest format:
|
||||
@ -116,6 +79,26 @@ alternative_else_nop_endif
|
||||
eret
|
||||
sb
|
||||
|
||||
SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL)
|
||||
// x2-x29,lr: vcpu regs
|
||||
// vcpu x0-x1 on the stack
|
||||
|
||||
// If the hyp context is loaded, go straight to hyp_panic
|
||||
get_loaded_vcpu x0, x1
|
||||
cbz x0, hyp_panic
|
||||
|
||||
// The hyp context is saved so make sure it is restored to allow
|
||||
// hyp_panic to run at hyp and, subsequently, panic to run in the host.
|
||||
// This makes use of __guest_exit to avoid duplication but sets the
|
||||
// return address to tail call into hyp_panic. As a side effect, the
|
||||
// current state is saved to the guest context but it will only be
|
||||
// accurate if the guest had been completely restored.
|
||||
adr_this_cpu x0, kvm_hyp_ctxt, x1
|
||||
adr x1, hyp_panic
|
||||
str x1, [x0, #CPU_XREG_OFFSET(30)]
|
||||
|
||||
get_vcpu_ptr x1, x0
|
||||
|
||||
SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
|
||||
// x0: return code
|
||||
// x1: vcpu
|
||||
@ -148,21 +131,23 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
|
||||
// Store the guest's sp_el0
|
||||
save_sp_el0 x1, x2
|
||||
|
||||
get_host_ctxt x2, x3
|
||||
adr_this_cpu x2, kvm_hyp_ctxt, x3
|
||||
|
||||
// Macro ptrauth_switch_to_guest format:
|
||||
// ptrauth_switch_to_host(guest cxt, host cxt, tmp1, tmp2, tmp3)
|
||||
// Macro ptrauth_switch_to_hyp format:
|
||||
// ptrauth_switch_to_hyp(guest cxt, host cxt, tmp1, tmp2, tmp3)
|
||||
// The below macro to save/restore keys is not implemented in C code
|
||||
// as it may cause Pointer Authentication key signing mismatch errors
|
||||
// when this feature is enabled for kernel code.
|
||||
ptrauth_switch_to_host x1, x2, x3, x4, x5
|
||||
ptrauth_switch_to_hyp x1, x2, x3, x4, x5
|
||||
|
||||
// Restore the hosts's sp_el0
|
||||
// Restore hyp's sp_el0
|
||||
restore_sp_el0 x2, x3
|
||||
|
||||
// Now restore the host regs
|
||||
// Now restore the hyp regs
|
||||
restore_callee_saved_regs x2
|
||||
|
||||
set_loaded_vcpu xzr, x1, x2
|
||||
|
||||
alternative_if ARM64_HAS_RAS_EXTN
|
||||
// If we have the RAS extensions we can consume a pending error
|
||||
// without an unmask-SError and isb. The ESB-instruction consumed any
|
||||
|
@ -12,7 +12,6 @@
|
||||
#include <asm/cpufeature.h>
|
||||
#include <asm/kvm_arm.h>
|
||||
#include <asm/kvm_asm.h>
|
||||
#include <asm/kvm_mmu.h>
|
||||
#include <asm/mmu.h>
|
||||
|
||||
.macro save_caller_saved_regs_vect
|
||||
@ -41,20 +40,6 @@
|
||||
|
||||
.text
|
||||
|
||||
.macro do_el2_call
|
||||
/*
|
||||
* Shuffle the parameters before calling the function
|
||||
* pointed to in x0. Assumes parameters in x[1,2,3].
|
||||
*/
|
||||
str lr, [sp, #-16]!
|
||||
mov lr, x0
|
||||
mov x0, x1
|
||||
mov x1, x2
|
||||
mov x2, x3
|
||||
blr lr
|
||||
ldr lr, [sp], #16
|
||||
.endm
|
||||
|
||||
el1_sync: // Guest trapped into EL2
|
||||
|
||||
mrs x0, esr_el2
|
||||
@ -63,44 +48,6 @@ el1_sync: // Guest trapped into EL2
|
||||
ccmp x0, #ESR_ELx_EC_HVC32, #4, ne
|
||||
b.ne el1_trap
|
||||
|
||||
#ifdef __KVM_NVHE_HYPERVISOR__
|
||||
mrs x1, vttbr_el2 // If vttbr is valid, the guest
|
||||
cbnz x1, el1_hvc_guest // called HVC
|
||||
|
||||
/* Here, we're pretty sure the host called HVC. */
|
||||
ldp x0, x1, [sp], #16
|
||||
|
||||
/* Check for a stub HVC call */
|
||||
cmp x0, #HVC_STUB_HCALL_NR
|
||||
b.hs 1f
|
||||
|
||||
/*
|
||||
* Compute the idmap address of __kvm_handle_stub_hvc and
|
||||
* jump there. Since we use kimage_voffset, do not use the
|
||||
* HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead
|
||||
* (by loading it from the constant pool).
|
||||
*
|
||||
* Preserve x0-x4, which may contain stub parameters.
|
||||
*/
|
||||
ldr x5, =__kvm_handle_stub_hvc
|
||||
ldr_l x6, kimage_voffset
|
||||
|
||||
/* x5 = __pa(x5) */
|
||||
sub x5, x5, x6
|
||||
br x5
|
||||
|
||||
1:
|
||||
/*
|
||||
* Perform the EL2 call
|
||||
*/
|
||||
kern_hyp_va x0
|
||||
do_el2_call
|
||||
|
||||
eret
|
||||
sb
|
||||
#endif /* __KVM_NVHE_HYPERVISOR__ */
|
||||
|
||||
el1_hvc_guest:
|
||||
/*
|
||||
* Fastest possible path for ARM_SMCCC_ARCH_WORKAROUND_1.
|
||||
* The workaround has already been applied on the host,
|
||||
@ -169,24 +116,7 @@ el2_error:
|
||||
eret
|
||||
sb
|
||||
|
||||
#ifdef __KVM_NVHE_HYPERVISOR__
|
||||
SYM_FUNC_START(__hyp_do_panic)
|
||||
mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
|
||||
PSR_MODE_EL1h)
|
||||
msr spsr_el2, lr
|
||||
ldr lr, =panic
|
||||
msr elr_el2, lr
|
||||
eret
|
||||
sb
|
||||
SYM_FUNC_END(__hyp_do_panic)
|
||||
#endif
|
||||
|
||||
SYM_CODE_START(__hyp_panic)
|
||||
get_host_ctxt x0, x1
|
||||
b hyp_panic
|
||||
SYM_CODE_END(__hyp_panic)
|
||||
|
||||
.macro invalid_vector label, target = __hyp_panic
|
||||
.macro invalid_vector label, target = __guest_exit_panic
|
||||
.align 2
|
||||
SYM_CODE_START(\label)
|
||||
b \target
|
||||
@ -198,7 +128,6 @@ SYM_CODE_END(\label)
|
||||
invalid_vector el2t_irq_invalid
|
||||
invalid_vector el2t_fiq_invalid
|
||||
invalid_vector el2t_error_invalid
|
||||
invalid_vector el2h_sync_invalid
|
||||
invalid_vector el2h_irq_invalid
|
||||
invalid_vector el2h_fiq_invalid
|
||||
invalid_vector el1_fiq_invalid
|
||||
@ -228,10 +157,9 @@ check_preamble_length 661b, 662b
|
||||
.macro invalid_vect target
|
||||
.align 7
|
||||
661:
|
||||
b \target
|
||||
nop
|
||||
stp x0, x1, [sp, #-16]!
|
||||
662:
|
||||
ldp x0, x1, [sp], #16
|
||||
b \target
|
||||
|
||||
check_preamble_length 661b, 662b
|
||||
|
@@ -135,7 +135,7 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu)
 	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
 		return;

-	host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+	host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
 	guest_ctxt = &vcpu->arch.ctxt;
 	host_dbg = &vcpu->arch.host_debug_state.regs;
 	guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
@@ -154,7 +154,7 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu)
 	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
 		return;

-	host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+	host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
 	guest_ctxt = &vcpu->arch.ctxt;
 	host_dbg = &vcpu->arch.host_debug_state.regs;
 	guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);

@ -126,11 +126,6 @@ static inline void ___deactivate_traps(struct kvm_vcpu *vcpu)
|
||||
}
|
||||
}
|
||||
|
||||
static inline void __activate_vm(struct kvm_s2_mmu *mmu)
|
||||
{
|
||||
__load_guest_stage2(mmu);
|
||||
}
|
||||
|
||||
static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
|
||||
{
|
||||
u64 par, tmp;
|
||||
@ -377,6 +372,8 @@ static inline bool esr_is_ptrauth_trap(u32 esr)
|
||||
ctxt_sys_reg(ctxt, key ## KEYHI_EL1) = __val; \
|
||||
} while(0)
|
||||
|
||||
DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
|
||||
|
||||
static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct kvm_cpu_context *ctxt;
|
||||
@ -386,7 +383,7 @@ static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu)
|
||||
!esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu)))
|
||||
return false;
|
||||
|
||||
ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
|
||||
ctxt = this_cpu_ptr(&kvm_hyp_ctxt);
|
||||
__ptrauth_save_key(ctxt, APIA);
|
||||
__ptrauth_save_key(ctxt, APIB);
|
||||
__ptrauth_save_key(ctxt, APDA);
|
||||
@ -481,14 +478,13 @@ exit:
|
||||
|
||||
static inline void __kvm_unexpected_el2_exception(void)
|
||||
{
|
||||
extern char __guest_exit_panic[];
|
||||
unsigned long addr, fixup;
|
||||
struct kvm_cpu_context *host_ctxt;
|
||||
struct exception_table_entry *entry, *end;
|
||||
unsigned long elr_el2 = read_sysreg(elr_el2);
|
||||
|
||||
entry = hyp_symbol_addr(__start___kvm_ex_table);
|
||||
end = hyp_symbol_addr(__stop___kvm_ex_table);
|
||||
host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
|
||||
|
||||
while (entry < end) {
|
||||
addr = (unsigned long)&entry->insn + entry->insn;
|
||||
@ -503,7 +499,8 @@ static inline void __kvm_unexpected_el2_exception(void)
|
||||
return;
|
||||
}
|
||||
|
||||
hyp_panic(host_ctxt);
|
||||
/* Trigger a panic after restoring the hyp context. */
|
||||
write_sysreg(__guest_exit_panic, elr_el2);
|
||||
}
|
||||
|
||||
#endif /* __ARM64_KVM_HYP_SWITCH_H__ */
|
||||
|
arch/arm64/kvm/hyp/nvhe/.gitignore (new file, 2 lines)
@@ -0,0 +1,2 @@
# SPDX-License-Identifier: GPL-2.0-only
hyp.lds

@ -6,44 +6,50 @@
|
||||
asflags-y := -D__KVM_NVHE_HYPERVISOR__
|
||||
ccflags-y := -D__KVM_NVHE_HYPERVISOR__
|
||||
|
||||
obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o
|
||||
obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o hyp-main.o
|
||||
obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
|
||||
../fpsimd.o ../hyp-entry.o
|
||||
|
||||
obj-y := $(patsubst %.o,%.hyp.o,$(obj-y))
|
||||
extra-y := $(patsubst %.hyp.o,%.hyp.tmp.o,$(obj-y))
|
||||
##
|
||||
## Build rules for compiling nVHE hyp code
|
||||
## Output of this folder is `kvm_nvhe.o`, a partially linked object
|
||||
## file containing all nVHE hyp code and data.
|
||||
##
|
||||
|
||||
$(obj)/%.hyp.tmp.o: $(src)/%.c FORCE
|
||||
hyp-obj := $(patsubst %.o,%.nvhe.o,$(obj-y))
|
||||
obj-y := kvm_nvhe.o
|
||||
extra-y := $(hyp-obj) kvm_nvhe.tmp.o hyp.lds
|
||||
|
||||
# 1) Compile all source files to `.nvhe.o` object files. The file extension
|
||||
# avoids file name clashes for files shared with VHE.
|
||||
$(obj)/%.nvhe.o: $(src)/%.c FORCE
|
||||
$(call if_changed_rule,cc_o_c)
|
||||
$(obj)/%.hyp.tmp.o: $(src)/%.S FORCE
|
||||
$(obj)/%.nvhe.o: $(src)/%.S FORCE
|
||||
$(call if_changed_rule,as_o_S)
|
||||
$(obj)/%.hyp.o: $(obj)/%.hyp.tmp.o FORCE
|
||||
|
||||
# 2) Compile linker script.
|
||||
$(obj)/hyp.lds: $(src)/hyp.lds.S FORCE
|
||||
$(call if_changed_dep,cpp_lds_S)
|
||||
|
||||
# 3) Partially link all '.nvhe.o' files and apply the linker script.
|
||||
# Prefixes names of ELF sections with '.hyp', eg. '.hyp.text'.
|
||||
# Note: The following rule assumes that the 'ld' rule puts LDFLAGS before
|
||||
# the list of dependencies to form '-T $(obj)/hyp.lds'. This is to
|
||||
# keep the dependency on the target while avoiding an error from
|
||||
# GNU ld if the linker script is passed to it twice.
|
||||
LDFLAGS_kvm_nvhe.tmp.o := -r -T
|
||||
$(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE
|
||||
$(call if_changed,ld)
|
||||
|
||||
# 4) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'.
|
||||
# Prefixes names of ELF symbols with '__kvm_nvhe_'.
|
||||
$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.tmp.o FORCE
|
||||
$(call if_changed,hypcopy)
|
||||
|
||||
# Disable reordering functions by GCC (enabled at -O2).
|
||||
# This pass puts functions into '.text.*' sections to aid the linker
|
||||
# in optimizing ELF layout. See HYPCOPY comment below for more info.
|
||||
ccflags-y += $(call cc-option,-fno-reorder-functions)
|
||||
|
||||
# The HYPCOPY command uses `objcopy` to prefix all ELF symbol names
|
||||
# and relevant ELF section names to avoid clashes with VHE code/data.
|
||||
#
|
||||
# Hyp code is assumed to be in the '.text' section of the input object
|
||||
# files (with the exception of specialized sections such as
|
||||
# '.hyp.idmap.text'). This assumption may be broken by a compiler that
|
||||
# divides code into sections like '.text.unlikely' so as to optimize
|
||||
# ELF layout. HYPCOPY checks that no such sections exist in the input
|
||||
# using `objdump`, otherwise they would be linked together with other
|
||||
# kernel code and not memory-mapped correctly at runtime.
|
||||
# to avoid clashes with VHE code/data.
|
||||
quiet_cmd_hypcopy = HYPCOPY $@
|
||||
cmd_hypcopy = \
|
||||
if $(OBJDUMP) -h $< | grep -F '.text.'; then \
|
||||
echo "$@: function reordering not supported in nVHE hyp code" >&2; \
|
||||
/bin/false; \
|
||||
fi; \
|
||||
$(OBJCOPY) --prefix-symbols=__kvm_nvhe_ \
|
||||
--rename-section=.text=.hyp.text \
|
||||
$< $@
|
||||
cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@
|
||||
|
||||
# Remove ftrace and Shadow Call Stack CFLAGS.
|
||||
# This is equivalent to the 'notrace' and '__noscs' annotations.
|
||||
|
arch/arm64/kvm/hyp/nvhe/host.S (new file, 187 lines)
@@ -0,0 +1,187 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (C) 2020 - Google Inc
|
||||
* Author: Andrew Scull <ascull@google.com>
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
#include <asm/assembler.h>
|
||||
#include <asm/kvm_asm.h>
|
||||
#include <asm/kvm_mmu.h>
|
||||
|
||||
.text
|
||||
|
||||
SYM_FUNC_START(__host_exit)
|
||||
stp x0, x1, [sp, #-16]!
|
||||
|
||||
get_host_ctxt x0, x1
|
||||
|
||||
ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
|
||||
|
||||
/* Store the host regs x2 and x3 */
|
||||
stp x2, x3, [x0, #CPU_XREG_OFFSET(2)]
|
||||
|
||||
/* Retrieve the host regs x0-x1 from the stack */
|
||||
ldp x2, x3, [sp], #16 // x0, x1
|
||||
|
||||
/* Store the host regs x0-x1 and x4-x17 */
|
||||
stp x2, x3, [x0, #CPU_XREG_OFFSET(0)]
|
||||
stp x4, x5, [x0, #CPU_XREG_OFFSET(4)]
|
||||
stp x6, x7, [x0, #CPU_XREG_OFFSET(6)]
|
||||
stp x8, x9, [x0, #CPU_XREG_OFFSET(8)]
|
||||
stp x10, x11, [x0, #CPU_XREG_OFFSET(10)]
|
||||
stp x12, x13, [x0, #CPU_XREG_OFFSET(12)]
|
||||
stp x14, x15, [x0, #CPU_XREG_OFFSET(14)]
|
||||
stp x16, x17, [x0, #CPU_XREG_OFFSET(16)]
|
||||
|
||||
/* Store the host regs x18-x29, lr */
|
||||
save_callee_saved_regs x0
|
||||
|
||||
/* Save the host context pointer in x29 across the function call */
|
||||
mov x29, x0
|
||||
bl handle_trap
|
||||
|
||||
/* Restore host regs x0-x17 */
|
||||
ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)]
|
||||
ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)]
|
||||
ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)]
|
||||
ldp x6, x7, [x29, #CPU_XREG_OFFSET(6)]
|
||||
|
||||
/* x0-7 are used for panic arguments */
|
||||
__host_enter_for_panic:
|
||||
ldp x8, x9, [x29, #CPU_XREG_OFFSET(8)]
|
||||
ldp x10, x11, [x29, #CPU_XREG_OFFSET(10)]
|
||||
ldp x12, x13, [x29, #CPU_XREG_OFFSET(12)]
|
||||
ldp x14, x15, [x29, #CPU_XREG_OFFSET(14)]
|
||||
ldp x16, x17, [x29, #CPU_XREG_OFFSET(16)]
|
||||
|
||||
/* Restore host regs x18-x29, lr */
|
||||
restore_callee_saved_regs x29
|
||||
|
||||
/* Do not touch any register after this! */
|
||||
__host_enter_without_restoring:
|
||||
eret
|
||||
sb
|
||||
SYM_FUNC_END(__host_exit)
|
||||
|
||||
/*
|
||||
* void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
|
||||
*/
|
||||
SYM_FUNC_START(__hyp_do_panic)
|
||||
/* Load the format arguments into x1-7 */
|
||||
mov x6, x3
|
||||
get_vcpu_ptr x7, x3
|
||||
|
||||
mrs x3, esr_el2
|
||||
mrs x4, far_el2
|
||||
mrs x5, hpfar_el2
|
||||
|
||||
/* Prepare and exit to the host's panic function. */
|
||||
mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
|
||||
PSR_MODE_EL1h)
|
||||
msr spsr_el2, lr
|
||||
ldr lr, =panic
|
||||
msr elr_el2, lr
|
||||
|
||||
/*
|
||||
* Set the panic format string and enter the host, conditionally
|
||||
* restoring the host context.
|
||||
*/
|
||||
cmp x0, xzr
|
||||
ldr x0, =__hyp_panic_string
|
||||
b.eq __host_enter_without_restoring
|
||||
b __host_enter_for_panic
|
||||
SYM_FUNC_END(__hyp_do_panic)
|
||||
|
||||
.macro host_el1_sync_vect
|
||||
.align 7
|
||||
.L__vect_start\@:
|
||||
stp x0, x1, [sp, #-16]!
|
||||
mrs x0, esr_el2
|
||||
lsr x0, x0, #ESR_ELx_EC_SHIFT
|
||||
cmp x0, #ESR_ELx_EC_HVC64
|
||||
ldp x0, x1, [sp], #16
|
||||
b.ne __host_exit
|
||||
|
||||
/* Check for a stub HVC call */
|
||||
cmp x0, #HVC_STUB_HCALL_NR
|
||||
b.hs __host_exit
|
||||
|
||||
/*
|
||||
* Compute the idmap address of __kvm_handle_stub_hvc and
|
||||
* jump there. Since we use kimage_voffset, do not use the
|
||||
* HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead
|
||||
* (by loading it from the constant pool).
|
||||
*
|
||||
* Preserve x0-x4, which may contain stub parameters.
|
||||
*/
|
||||
ldr x5, =__kvm_handle_stub_hvc
|
||||
ldr_l x6, kimage_voffset
|
||||
|
||||
/* x5 = __pa(x5) */
|
||||
sub x5, x5, x6
|
||||
br x5
|
||||
.L__vect_end\@:
|
||||
.if ((.L__vect_end\@ - .L__vect_start\@) > 0x80)
|
||||
.error "host_el1_sync_vect larger than vector entry"
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro invalid_host_el2_vect
|
||||
.align 7
|
||||
/* If a guest is loaded, panic out of it. */
|
||||
stp x0, x1, [sp, #-16]!
|
||||
get_loaded_vcpu x0, x1
|
||||
cbnz x0, __guest_exit_panic
|
||||
add sp, sp, #16
|
||||
|
||||
/*
|
||||
* The panic may not be clean if the exception is taken before the host
|
||||
* context has been saved by __host_exit or after the hyp context has
|
||||
* been partially clobbered by __host_enter.
|
||||
*/
|
||||
b hyp_panic
|
||||
.endm
|
||||
|
||||
.macro invalid_host_el1_vect
|
||||
.align 7
|
||||
mov x0, xzr /* restore_host = false */
|
||||
mrs x1, spsr_el2
|
||||
mrs x2, elr_el2
|
||||
mrs x3, par_el1
|
||||
b __hyp_do_panic
|
||||
.endm
|
||||
|
||||
/*
|
||||
* The host vector does not use an ESB instruction in order to avoid consuming
|
||||
* SErrors that should only be consumed by the host. Guest entry is deferred by
|
||||
* __guest_enter if there are any pending asynchronous exceptions so hyp will
|
||||
* always return to the host without having consumed host SErrors.
|
||||
*
|
||||
* CONFIG_KVM_INDIRECT_VECTORS is not applied to the host vectors because the
|
||||
* host knows about the EL2 vectors already, and there is no point in hiding
|
||||
* them.
|
||||
*/
|
||||
.align 11
|
||||
SYM_CODE_START(__kvm_hyp_host_vector)
|
||||
invalid_host_el2_vect // Synchronous EL2t
|
||||
invalid_host_el2_vect // IRQ EL2t
|
||||
invalid_host_el2_vect // FIQ EL2t
|
||||
invalid_host_el2_vect // Error EL2t
|
||||
|
||||
invalid_host_el2_vect // Synchronous EL2h
|
||||
invalid_host_el2_vect // IRQ EL2h
|
||||
invalid_host_el2_vect // FIQ EL2h
|
||||
invalid_host_el2_vect // Error EL2h
|
||||
|
||||
host_el1_sync_vect // Synchronous 64-bit EL1
|
||||
invalid_host_el1_vect // IRQ 64-bit EL1
|
||||
invalid_host_el1_vect // FIQ 64-bit EL1
|
||||
invalid_host_el1_vect // Error 64-bit EL1
|
||||
|
||||
invalid_host_el1_vect // Synchronous 32-bit EL1
|
||||
invalid_host_el1_vect // IRQ 32-bit EL1
|
||||
invalid_host_el1_vect // FIQ 32-bit EL1
|
||||
invalid_host_el1_vect // Error 32-bit EL1
|
||||
SYM_CODE_END(__kvm_hyp_host_vector)
|
@ -4,11 +4,13 @@
|
||||
* Author: Marc Zyngier <marc.zyngier@arm.com>
|
||||
*/
|
||||
|
||||
#include <linux/arm-smccc.h>
|
||||
#include <linux/linkage.h>
|
||||
|
||||
#include <asm/alternative.h>
|
||||
#include <asm/assembler.h>
|
||||
#include <asm/kvm_arm.h>
|
||||
#include <asm/kvm_asm.h>
|
||||
#include <asm/kvm_mmu.h>
|
||||
#include <asm/pgtable-hwdef.h>
|
||||
#include <asm/sysreg.h>
|
||||
@ -44,27 +46,37 @@ __invalid:
|
||||
b .
|
||||
|
||||
/*
|
||||
* x0: HYP pgd
|
||||
* x1: HYP stack
|
||||
* x2: HYP vectors
|
||||
* x3: per-CPU offset
|
||||
* x0: SMCCC function ID
|
||||
* x1: HYP pgd
|
||||
* x2: per-CPU offset
|
||||
* x3: HYP stack
|
||||
* x4: HYP vectors
|
||||
*/
|
||||
__do_hyp_init:
|
||||
/* Check for a stub HVC call */
|
||||
cmp x0, #HVC_STUB_HCALL_NR
|
||||
b.lo __kvm_handle_stub_hvc
|
||||
|
||||
phys_to_ttbr x4, x0
|
||||
alternative_if ARM64_HAS_CNP
|
||||
orr x4, x4, #TTBR_CNP_BIT
|
||||
alternative_else_nop_endif
|
||||
msr ttbr0_el2, x4
|
||||
/* Set tpidr_el2 for use by HYP to free a register */
|
||||
msr tpidr_el2, x2
|
||||
|
||||
mrs x4, tcr_el1
|
||||
mov_q x5, TCR_EL2_MASK
|
||||
and x4, x4, x5
|
||||
mov x5, #TCR_EL2_RES1
|
||||
orr x4, x4, x5
|
||||
mov x2, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init)
|
||||
cmp x0, x2
|
||||
b.eq 1f
|
||||
mov x0, #SMCCC_RET_NOT_SUPPORTED
|
||||
eret
|
||||
|
||||
1: phys_to_ttbr x0, x1
|
||||
alternative_if ARM64_HAS_CNP
|
||||
orr x0, x0, #TTBR_CNP_BIT
|
||||
alternative_else_nop_endif
|
||||
msr ttbr0_el2, x0
|
||||
|
||||
mrs x0, tcr_el1
|
||||
mov_q x1, TCR_EL2_MASK
|
||||
and x0, x0, x1
|
||||
mov x1, #TCR_EL2_RES1
|
||||
orr x0, x0, x1
|
||||
|
||||
/*
|
||||
* The ID map may be configured to use an extended virtual address
|
||||
@ -80,18 +92,18 @@ alternative_else_nop_endif
|
||||
*
|
||||
* So use the same T0SZ value we use for the ID map.
|
||||
*/
|
||||
ldr_l x5, idmap_t0sz
|
||||
bfi x4, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
|
||||
ldr_l x1, idmap_t0sz
|
||||
bfi x0, x1, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
|
||||
|
||||
/*
|
||||
* Set the PS bits in TCR_EL2.
|
||||
*/
|
||||
tcr_compute_pa_size x4, #TCR_EL2_PS_SHIFT, x5, x6
|
||||
tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2
|
||||
|
||||
msr tcr_el2, x4
|
||||
msr tcr_el2, x0
|
||||
|
||||
mrs x4, mair_el1
|
||||
msr mair_el2, x4
|
||||
mrs x0, mair_el1
|
||||
msr mair_el2, x0
|
||||
isb
|
||||
|
||||
/* Invalidate the stale TLBs from Bootloader */
|
||||
@ -103,25 +115,22 @@ alternative_else_nop_endif
|
||||
* as well as the EE bit on BE. Drop the A flag since the compiler
|
||||
* is allowed to generate unaligned accesses.
|
||||
*/
|
||||
mov_q x4, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
|
||||
CPU_BE( orr x4, x4, #SCTLR_ELx_EE)
|
||||
mov_q x0, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
|
||||
CPU_BE( orr x0, x0, #SCTLR_ELx_EE)
|
||||
alternative_if ARM64_HAS_ADDRESS_AUTH
|
||||
mov_q x5, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
|
||||
mov_q x1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
|
||||
SCTLR_ELx_ENDA | SCTLR_ELx_ENDB)
|
||||
orr x4, x4, x5
|
||||
orr x0, x0, x1
|
||||
alternative_else_nop_endif
|
||||
msr sctlr_el2, x4
|
||||
msr sctlr_el2, x0
|
||||
isb
|
||||
|
||||
/* Set the stack and new vectors */
|
||||
kern_hyp_va x1
|
||||
mov sp, x1
|
||||
msr vbar_el2, x2
|
||||
|
||||
/* Set tpidr_el2 for use by HYP */
|
||||
msr tpidr_el2, x3
|
||||
mov sp, x3
|
||||
msr vbar_el2, x4
|
||||
|
||||
/* Hello, World! */
|
||||
mov x0, #SMCCC_RET_SUCCESS
|
||||
eret
|
||||
SYM_CODE_END(__kvm_hyp_init)
|
||||
|
||||
|
arch/arm64/kvm/hyp/nvhe/hyp-main.c (new file, 117 lines)
@@ -0,0 +1,117 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
* Copyright (C) 2020 - Google Inc
|
||||
* Author: Andrew Scull <ascull@google.com>
|
||||
*/
|
||||
|
||||
#include <hyp/switch.h>
|
||||
|
||||
#include <asm/kvm_asm.h>
|
||||
#include <asm/kvm_emulate.h>
|
||||
#include <asm/kvm_host.h>
|
||||
#include <asm/kvm_hyp.h>
|
||||
#include <asm/kvm_mmu.h>
|
||||
|
||||
#include <kvm/arm_hypercalls.h>
|
||||
|
||||
static void handle_host_hcall(unsigned long func_id,
|
||||
struct kvm_cpu_context *host_ctxt)
|
||||
{
|
||||
unsigned long ret = 0;
|
||||
|
||||
switch (func_id) {
|
||||
case KVM_HOST_SMCCC_FUNC(__kvm_vcpu_run): {
|
||||
unsigned long r1 = host_ctxt->regs.regs[1];
|
||||
struct kvm_vcpu *vcpu = (struct kvm_vcpu *)r1;
|
||||
|
||||
ret = __kvm_vcpu_run(kern_hyp_va(vcpu));
|
||||
break;
|
||||
}
|
||||
case KVM_HOST_SMCCC_FUNC(__kvm_flush_vm_context):
|
||||
__kvm_flush_vm_context();
|
||||
break;
|
||||
case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid_ipa): {
|
||||
unsigned long r1 = host_ctxt->regs.regs[1];
|
||||
struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
|
||||
phys_addr_t ipa = host_ctxt->regs.regs[2];
|
||||
int level = host_ctxt->regs.regs[3];
|
||||
|
||||
__kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level);
|
||||
break;
|
||||
}
|
||||
case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid): {
|
||||
unsigned long r1 = host_ctxt->regs.regs[1];
|
||||
struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
|
||||
|
||||
__kvm_tlb_flush_vmid(kern_hyp_va(mmu));
|
||||
break;
|
||||
}
|
||||
case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_local_vmid): {
|
||||
unsigned long r1 = host_ctxt->regs.regs[1];
|
||||
struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
|
||||
|
||||
__kvm_tlb_flush_local_vmid(kern_hyp_va(mmu));
|
||||
break;
|
||||
}
|
||||
case KVM_HOST_SMCCC_FUNC(__kvm_timer_set_cntvoff): {
|
||||
u64 cntvoff = host_ctxt->regs.regs[1];
|
||||
|
||||
__kvm_timer_set_cntvoff(cntvoff);
|
||||
break;
|
||||
}
|
||||
case KVM_HOST_SMCCC_FUNC(__kvm_enable_ssbs):
|
||||
__kvm_enable_ssbs();
|
||||
break;
|
||||
case KVM_HOST_SMCCC_FUNC(__vgic_v3_get_ich_vtr_el2):
|
||||
ret = __vgic_v3_get_ich_vtr_el2();
|
||||
break;
|
||||
case KVM_HOST_SMCCC_FUNC(__vgic_v3_read_vmcr):
|
||||
ret = __vgic_v3_read_vmcr();
|
||||
break;
|
||||
case KVM_HOST_SMCCC_FUNC(__vgic_v3_write_vmcr): {
|
||||
u32 vmcr = host_ctxt->regs.regs[1];
|
||||
|
||||
__vgic_v3_write_vmcr(vmcr);
|
||||
break;
|
||||
}
|
||||
case KVM_HOST_SMCCC_FUNC(__vgic_v3_init_lrs):
|
||||
__vgic_v3_init_lrs();
|
||||
break;
|
||||
case KVM_HOST_SMCCC_FUNC(__kvm_get_mdcr_el2):
|
||||
ret = __kvm_get_mdcr_el2();
|
||||
break;
|
||||
case KVM_HOST_SMCCC_FUNC(__vgic_v3_save_aprs): {
|
||||
unsigned long r1 = host_ctxt->regs.regs[1];
|
||||
struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1;
|
||||
|
||||
__vgic_v3_save_aprs(kern_hyp_va(cpu_if));
|
||||
break;
|
||||
}
|
||||
case KVM_HOST_SMCCC_FUNC(__vgic_v3_restore_aprs): {
|
||||
unsigned long r1 = host_ctxt->regs.regs[1];
|
||||
struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1;
|
||||
|
||||
__vgic_v3_restore_aprs(kern_hyp_va(cpu_if));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
/* Invalid host HVC. */
|
||||
host_ctxt->regs.regs[0] = SMCCC_RET_NOT_SUPPORTED;
|
||||
return;
|
||||
}
|
||||
|
||||
host_ctxt->regs.regs[0] = SMCCC_RET_SUCCESS;
|
||||
host_ctxt->regs.regs[1] = ret;
|
||||
}
|
||||
|
||||
void handle_trap(struct kvm_cpu_context *host_ctxt)
|
||||
{
|
||||
u64 esr = read_sysreg_el2(SYS_ESR);
|
||||
unsigned long func_id;
|
||||
|
||||
if (ESR_ELx_EC(esr) != ESR_ELx_EC_HVC64)
|
||||
hyp_panic();
|
||||
|
||||
func_id = host_ctxt->regs.regs[0];
|
||||
handle_host_hcall(func_id, host_ctxt);
|
||||
}
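/*
 * Illustrative sketch only, not the in-tree call path: the host reaches
 * handle_trap() by issuing an HVC with the function ID in x0 and arguments
 * in x1..x3; on return, x0 holds SMCCC_RET_SUCCESS and x1 the result. A
 * hypothetical host-side helper using the generic SMCCC wrappers from
 * <linux/arm-smccc.h> could look like the snippet below (the real kernel
 * goes through the kvm_call_hyp() machinery instead).
 */
#include <linux/arm-smccc.h>

static unsigned long example_host_hyp_call(unsigned long func_id,
					   unsigned long arg1)
{
	struct arm_smccc_res res;

	arm_smccc_1_1_hvc(func_id, arg1, &res);
	WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
	return res.a1;
}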
|
19
arch/arm64/kvm/hyp/nvhe/hyp.lds.S
Normal file
@@ -0,0 +1,19 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2020 Google LLC.
 * Written by David Brazdil <dbrazdil@google.com>
 *
 * Linker script used for partial linking of nVHE EL2 object files.
 */

#include <asm/hyp_image.h>
#include <asm-generic/vmlinux.lds.h>
#include <asm/cache.h>
#include <asm/memory.h>

SECTIONS {
	HYP_SECTION(.text)
	HYP_SECTION_NAME(.data..percpu) : {
		PERCPU_INPUT(L1_CACHE_BYTES)
	}
}
@ -27,6 +27,11 @@
|
||||
#include <asm/processor.h>
|
||||
#include <asm/thread_info.h>
|
||||
|
||||
/* Non-VHE specific context */
|
||||
DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
|
||||
DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
|
||||
DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
|
||||
|
||||
static void __activate_traps(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
u64 val;
|
||||
@ -42,6 +47,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
|
||||
}
|
||||
|
||||
write_sysreg(val, cptr_el2);
|
||||
write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2);
|
||||
|
||||
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
|
||||
struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
|
||||
@ -60,6 +66,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
|
||||
|
||||
static void __deactivate_traps(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
extern char __kvm_hyp_host_vector[];
|
||||
u64 mdcr_el2;
|
||||
|
||||
___deactivate_traps(vcpu);
|
||||
@ -91,9 +98,10 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
|
||||
write_sysreg(mdcr_el2, mdcr_el2);
|
||||
write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2);
|
||||
write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
|
||||
write_sysreg(__kvm_hyp_host_vector, vbar_el2);
|
||||
}
|
||||
|
||||
static void __deactivate_vm(struct kvm_vcpu *vcpu)
|
||||
static void __load_host_stage2(void)
|
||||
{
|
||||
write_sysreg(0, vttbr_el2);
|
||||
}
|
||||
@ -173,9 +181,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
|
||||
pmr_sync();
|
||||
}
|
||||
|
||||
vcpu = kern_hyp_va(vcpu);
|
||||
|
||||
host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
|
||||
host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
|
||||
host_ctxt->__hyp_running_vcpu = vcpu;
|
||||
guest_ctxt = &vcpu->arch.ctxt;
|
||||
|
||||
@ -194,7 +200,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
|
||||
__sysreg32_restore_state(vcpu);
|
||||
__sysreg_restore_state_nvhe(guest_ctxt);
|
||||
|
||||
__activate_vm(kern_hyp_va(vcpu->arch.hw_mmu));
|
||||
__load_guest_stage2(kern_hyp_va(vcpu->arch.hw_mmu));
|
||||
__activate_traps(vcpu);
|
||||
|
||||
__hyp_vgic_restore_state(vcpu);
|
||||
@ -204,7 +210,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
|
||||
|
||||
do {
|
||||
/* Jump in the fire! */
|
||||
exit_code = __guest_enter(vcpu, host_ctxt);
|
||||
exit_code = __guest_enter(vcpu);
|
||||
|
||||
/* And we're baaack! */
|
||||
} while (fixup_guest_exit(vcpu, &exit_code));
|
||||
@ -215,7 +221,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
|
||||
__hyp_vgic_save_state(vcpu);
|
||||
|
||||
__deactivate_traps(vcpu);
|
||||
__deactivate_vm(vcpu);
|
||||
__load_host_stage2();
|
||||
|
||||
__sysreg_restore_state_nvhe(host_ctxt);
|
||||
|
||||
@ -235,35 +241,31 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
|
||||
if (system_uses_irq_prio_masking())
|
||||
gic_write_pmr(GIC_PRIO_IRQOFF);
|
||||
|
||||
host_ctxt->__hyp_running_vcpu = NULL;
|
||||
|
||||
return exit_code;
|
||||
}
|
||||
|
||||
void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
|
||||
void __noreturn hyp_panic(void)
|
||||
{
|
||||
u64 spsr = read_sysreg_el2(SYS_SPSR);
|
||||
u64 elr = read_sysreg_el2(SYS_ELR);
|
||||
u64 par = read_sysreg(par_el1);
|
||||
struct kvm_vcpu *vcpu = host_ctxt->__hyp_running_vcpu;
|
||||
unsigned long str_va;
|
||||
bool restore_host = true;
|
||||
struct kvm_cpu_context *host_ctxt;
|
||||
struct kvm_vcpu *vcpu;
|
||||
|
||||
if (read_sysreg(vttbr_el2)) {
|
||||
host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
|
||||
vcpu = host_ctxt->__hyp_running_vcpu;
|
||||
|
||||
if (vcpu) {
|
||||
__timer_disable_traps(vcpu);
|
||||
__deactivate_traps(vcpu);
|
||||
__deactivate_vm(vcpu);
|
||||
__load_host_stage2();
|
||||
__sysreg_restore_state_nvhe(host_ctxt);
|
||||
}
|
||||
|
||||
/*
|
||||
* Force the panic string to be loaded from the literal pool,
|
||||
* making sure it is a kernel address and not a PC-relative
|
||||
* reference.
|
||||
*/
|
||||
asm volatile("ldr %0, =%1" : "=r" (str_va) : "S" (__hyp_panic_string));
|
||||
|
||||
__hyp_do_panic(str_va,
|
||||
spsr, elr,
|
||||
read_sysreg(esr_el2), read_sysreg_el2(SYS_FAR),
|
||||
read_sysreg(hpfar_el2), par, vcpu);
|
||||
__hyp_do_panic(restore_host, spsr, elr, par);
|
||||
unreachable();
|
||||
}
|
||||
|
||||
|
@ -61,7 +61,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
|
||||
dsb(ishst);
|
||||
|
||||
/* Switch to requested VMID */
|
||||
mmu = kern_hyp_va(mmu);
|
||||
__tlb_switch_to_guest(mmu, &cxt);
|
||||
|
||||
/*
|
||||
@ -115,7 +114,6 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
|
||||
dsb(ishst);
|
||||
|
||||
/* Switch to requested VMID */
|
||||
mmu = kern_hyp_va(mmu);
|
||||
__tlb_switch_to_guest(mmu, &cxt);
|
||||
|
||||
__tlbi(vmalls12e1is);
|
||||
|
892
arch/arm64/kvm/hyp/pgtable.c
Normal file
@@ -0,0 +1,892 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
* Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
|
||||
* No bombay mix was harmed in the writing of this file.
|
||||
*
|
||||
* Copyright (C) 2020 Google LLC
|
||||
* Author: Will Deacon <will@kernel.org>
|
||||
*/
|
||||
|
||||
#include <linux/bitfield.h>
|
||||
#include <asm/kvm_pgtable.h>
|
||||
|
||||
#define KVM_PGTABLE_MAX_LEVELS 4U
|
||||
|
||||
#define KVM_PTE_VALID BIT(0)
|
||||
|
||||
#define KVM_PTE_TYPE BIT(1)
|
||||
#define KVM_PTE_TYPE_BLOCK 0
|
||||
#define KVM_PTE_TYPE_PAGE 1
|
||||
#define KVM_PTE_TYPE_TABLE 1
|
||||
|
||||
#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT)
|
||||
#define KVM_PTE_ADDR_51_48 GENMASK(15, 12)
|
||||
|
||||
#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2)
|
||||
|
||||
#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
|
||||
#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6)
|
||||
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3
|
||||
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1
|
||||
#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8)
|
||||
#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3
|
||||
#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10)
|
||||
|
||||
#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2)
|
||||
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6)
|
||||
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7)
|
||||
#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8)
|
||||
#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3
|
||||
#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10)
|
||||
|
||||
#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51)
|
||||
|
||||
#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54)
|
||||
|
||||
#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
|
||||
|
||||
struct kvm_pgtable_walk_data {
|
||||
struct kvm_pgtable *pgt;
|
||||
struct kvm_pgtable_walker *walker;
|
||||
|
||||
u64 addr;
|
||||
u64 end;
|
||||
};
|
||||
|
||||
static u64 kvm_granule_shift(u32 level)
|
||||
{
|
||||
/* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
|
||||
return ARM64_HW_PGTABLE_LEVEL_SHIFT(level);
|
||||
}
|
||||
|
||||
static u64 kvm_granule_size(u32 level)
|
||||
{
|
||||
return BIT(kvm_granule_shift(level));
|
||||
}
|
||||
|
||||
static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
|
||||
{
|
||||
u64 granule = kvm_granule_size(level);
|
||||
|
||||
/*
|
||||
* Reject invalid block mappings and don't bother with 4TB mappings for
|
||||
* 52-bit PAs.
|
||||
*/
|
||||
if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1))
|
||||
return false;
|
||||
|
||||
if (granule > (end - addr))
|
||||
return false;
|
||||
|
||||
return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
|
||||
}
|
||||
|
||||
static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
|
||||
{
|
||||
u64 shift = kvm_granule_shift(level);
|
||||
u64 mask = BIT(PAGE_SHIFT - 3) - 1;
|
||||
|
||||
return (data->addr >> shift) & mask;
|
||||
}
|
||||
|
||||
static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
|
||||
{
|
||||
u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
|
||||
u64 mask = BIT(pgt->ia_bits) - 1;
|
||||
|
||||
return (addr & mask) >> shift;
|
||||
}
|
||||
|
||||
static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data)
|
||||
{
|
||||
return __kvm_pgd_page_idx(data->pgt, data->addr);
|
||||
}
|
||||
|
||||
static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
|
||||
{
|
||||
struct kvm_pgtable pgt = {
|
||||
.ia_bits = ia_bits,
|
||||
.start_level = start_level,
|
||||
};
|
||||
|
||||
return __kvm_pgd_page_idx(&pgt, -1ULL) + 1;
|
||||
}
|
||||
|
||||
static bool kvm_pte_valid(kvm_pte_t pte)
|
||||
{
|
||||
return pte & KVM_PTE_VALID;
|
||||
}
|
||||
|
||||
static bool kvm_pte_table(kvm_pte_t pte, u32 level)
|
||||
{
|
||||
if (level == KVM_PGTABLE_MAX_LEVELS - 1)
|
||||
return false;
|
||||
|
||||
if (!kvm_pte_valid(pte))
|
||||
return false;
|
||||
|
||||
return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
|
||||
}
|
||||
|
||||
static u64 kvm_pte_to_phys(kvm_pte_t pte)
|
||||
{
|
||||
u64 pa = pte & KVM_PTE_ADDR_MASK;
|
||||
|
||||
if (PAGE_SHIFT == 16)
|
||||
pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
|
||||
|
||||
return pa;
|
||||
}
|
||||
|
||||
static kvm_pte_t kvm_phys_to_pte(u64 pa)
|
||||
{
|
||||
kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
|
||||
|
||||
if (PAGE_SHIFT == 16)
|
||||
pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
|
||||
|
||||
return pte;
|
||||
}
|
||||
|
||||
static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte)
|
||||
{
|
||||
return __va(kvm_pte_to_phys(pte));
|
||||
}
|
||||
|
||||
static void kvm_set_invalid_pte(kvm_pte_t *ptep)
|
||||
{
|
||||
kvm_pte_t pte = *ptep;
|
||||
WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID);
|
||||
}
|
||||
|
||||
static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
|
||||
{
|
||||
kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp));
|
||||
|
||||
pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
|
||||
pte |= KVM_PTE_VALID;
|
||||
|
||||
WARN_ON(kvm_pte_valid(old));
|
||||
smp_store_release(ptep, pte);
|
||||
}
|
||||
|
||||
static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
|
||||
u32 level)
|
||||
{
|
||||
kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa);
|
||||
u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
|
||||
KVM_PTE_TYPE_BLOCK;
|
||||
|
||||
pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
|
||||
pte |= FIELD_PREP(KVM_PTE_TYPE, type);
|
||||
pte |= KVM_PTE_VALID;
|
||||
|
||||
/* Tolerate KVM recreating the exact same mapping. */
|
||||
if (kvm_pte_valid(old))
|
||||
return old == pte;
|
||||
|
||||
smp_store_release(ptep, pte);
|
||||
return true;
|
||||
}
|
||||
|
||||
static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
|
||||
u32 level, kvm_pte_t *ptep,
|
||||
enum kvm_pgtable_walk_flags flag)
|
||||
{
|
||||
struct kvm_pgtable_walker *walker = data->walker;
|
||||
return walker->cb(addr, data->end, level, ptep, flag, walker->arg);
|
||||
}
|
||||
|
||||
static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
|
||||
kvm_pte_t *pgtable, u32 level);
|
||||
|
||||
static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
|
||||
kvm_pte_t *ptep, u32 level)
|
||||
{
|
||||
int ret = 0;
|
||||
u64 addr = data->addr;
|
||||
kvm_pte_t *childp, pte = *ptep;
|
||||
bool table = kvm_pte_table(pte, level);
|
||||
enum kvm_pgtable_walk_flags flags = data->walker->flags;
|
||||
|
||||
if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
|
||||
ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
|
||||
KVM_PGTABLE_WALK_TABLE_PRE);
|
||||
}
|
||||
|
||||
if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) {
|
||||
ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
|
||||
KVM_PGTABLE_WALK_LEAF);
|
||||
pte = *ptep;
|
||||
table = kvm_pte_table(pte, level);
|
||||
}
|
||||
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if (!table) {
|
||||
data->addr += kvm_granule_size(level);
|
||||
goto out;
|
||||
}
|
||||
|
||||
childp = kvm_pte_follow(pte);
|
||||
ret = __kvm_pgtable_walk(data, childp, level + 1);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if (flags & KVM_PGTABLE_WALK_TABLE_POST) {
|
||||
ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
|
||||
KVM_PGTABLE_WALK_TABLE_POST);
|
||||
}
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
|
||||
kvm_pte_t *pgtable, u32 level)
|
||||
{
|
||||
u32 idx;
|
||||
int ret = 0;
|
||||
|
||||
if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
|
||||
return -EINVAL;
|
||||
|
||||
for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
|
||||
kvm_pte_t *ptep = &pgtable[idx];
|
||||
|
||||
if (data->addr >= data->end)
|
||||
break;
|
||||
|
||||
ret = __kvm_pgtable_visit(data, ptep, level);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data)
|
||||
{
|
||||
u32 idx;
|
||||
int ret = 0;
|
||||
struct kvm_pgtable *pgt = data->pgt;
|
||||
u64 limit = BIT(pgt->ia_bits);
|
||||
|
||||
if (data->addr > limit || data->end > limit)
|
||||
return -ERANGE;
|
||||
|
||||
if (!pgt->pgd)
|
||||
return -EINVAL;
|
||||
|
||||
for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) {
|
||||
kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE];
|
||||
|
||||
ret = __kvm_pgtable_walk(data, ptep, pgt->start_level);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
|
||||
struct kvm_pgtable_walker *walker)
|
||||
{
|
||||
struct kvm_pgtable_walk_data walk_data = {
|
||||
.pgt = pgt,
|
||||
.addr = ALIGN_DOWN(addr, PAGE_SIZE),
|
||||
.end = PAGE_ALIGN(walk_data.addr + size),
|
||||
.walker = walker,
|
||||
};
|
||||
|
||||
return _kvm_pgtable_walk(&walk_data);
|
||||
}
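/*
 * Hypothetical usage sketch (not part of the patch): a walker that counts
 * the valid leaf entries in a range. The callback signature and the
 * kvm_pgtable_walker fields mirror the walkers defined later in this file;
 * count_leaves() itself is invented for illustration.
 */
static int count_leaf_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
			     enum kvm_pgtable_walk_flags flag, void * const arg)
{
	u64 *nr = arg;

	if (kvm_pte_valid(*ptep))
		(*nr)++;

	return 0;
}

static u64 count_leaves(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	u64 nr = 0;
	struct kvm_pgtable_walker walker = {
		.cb	= count_leaf_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= &nr,
	};

	WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
	return nr;
}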
|
||||
|
||||
struct hyp_map_data {
|
||||
u64 phys;
|
||||
kvm_pte_t attr;
|
||||
};
|
||||
|
||||
static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
|
||||
struct hyp_map_data *data)
|
||||
{
|
||||
bool device = prot & KVM_PGTABLE_PROT_DEVICE;
|
||||
u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
|
||||
kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
|
||||
u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
|
||||
u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
|
||||
KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;
|
||||
|
||||
if (!(prot & KVM_PGTABLE_PROT_R))
|
||||
return -EINVAL;
|
||||
|
||||
if (prot & KVM_PGTABLE_PROT_X) {
|
||||
if (prot & KVM_PGTABLE_PROT_W)
|
||||
return -EINVAL;
|
||||
|
||||
if (device)
|
||||
return -EINVAL;
|
||||
} else {
|
||||
attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
|
||||
}
|
||||
|
||||
attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
|
||||
attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
|
||||
attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
|
||||
data->attr = attr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
|
||||
kvm_pte_t *ptep, struct hyp_map_data *data)
|
||||
{
|
||||
u64 granule = kvm_granule_size(level), phys = data->phys;
|
||||
|
||||
if (!kvm_block_mapping_supported(addr, end, phys, level))
|
||||
return false;
|
||||
|
||||
WARN_ON(!kvm_set_valid_leaf_pte(ptep, phys, data->attr, level));
|
||||
data->phys += granule;
|
||||
return true;
|
||||
}
|
||||
|
||||
static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
|
||||
enum kvm_pgtable_walk_flags flag, void * const arg)
|
||||
{
|
||||
kvm_pte_t *childp;
|
||||
|
||||
if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg))
|
||||
return 0;
|
||||
|
||||
if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
|
||||
return -EINVAL;
|
||||
|
||||
childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
|
||||
if (!childp)
|
||||
return -ENOMEM;
|
||||
|
||||
kvm_set_table_pte(ptep, childp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
|
||||
enum kvm_pgtable_prot prot)
|
||||
{
|
||||
int ret;
|
||||
struct hyp_map_data map_data = {
|
||||
.phys = ALIGN_DOWN(phys, PAGE_SIZE),
|
||||
};
|
||||
struct kvm_pgtable_walker walker = {
|
||||
.cb = hyp_map_walker,
|
||||
.flags = KVM_PGTABLE_WALK_LEAF,
|
||||
.arg = &map_data,
|
||||
};
|
||||
|
||||
ret = hyp_map_set_prot_attr(prot, &map_data);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = kvm_pgtable_walk(pgt, addr, size, &walker);
|
||||
dsb(ishst);
|
||||
isb();
|
||||
return ret;
|
||||
}
|
||||
|
||||
int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits)
|
||||
{
|
||||
u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);
|
||||
|
||||
pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
|
||||
if (!pgt->pgd)
|
||||
return -ENOMEM;
|
||||
|
||||
pgt->ia_bits = va_bits;
|
||||
pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels;
|
||||
pgt->mmu = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
|
||||
enum kvm_pgtable_walk_flags flag, void * const arg)
|
||||
{
|
||||
free_page((unsigned long)kvm_pte_follow(*ptep));
|
||||
return 0;
|
||||
}
|
||||
|
||||
void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
|
||||
{
|
||||
struct kvm_pgtable_walker walker = {
|
||||
.cb = hyp_free_walker,
|
||||
.flags = KVM_PGTABLE_WALK_TABLE_POST,
|
||||
};
|
||||
|
||||
WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
|
||||
free_page((unsigned long)pgt->pgd);
|
||||
pgt->pgd = NULL;
|
||||
}
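/*
 * Minimal usage sketch, assuming a page-aligned hypervisor VA and physical
 * address: create a hyp stage-1 table covering a 48-bit VA space (the width
 * is only an example) and map one read/write, non-executable page into it.
 * Error paths other than the init failure are left out.
 */
static int example_hyp_map_one(struct kvm_pgtable *pgt, u64 va, u64 phys)
{
	int ret;

	ret = kvm_pgtable_hyp_init(pgt, 48);
	if (ret)
		return ret;

	return kvm_pgtable_hyp_map(pgt, va, PAGE_SIZE, phys,
				   KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W);
}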
|
||||
|
||||
struct stage2_map_data {
|
||||
u64 phys;
|
||||
kvm_pte_t attr;
|
||||
|
||||
kvm_pte_t *anchor;
|
||||
|
||||
struct kvm_s2_mmu *mmu;
|
||||
struct kvm_mmu_memory_cache *memcache;
|
||||
};
|
||||
|
||||
static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
|
||||
struct stage2_map_data *data)
|
||||
{
|
||||
bool device = prot & KVM_PGTABLE_PROT_DEVICE;
|
||||
kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) :
|
||||
PAGE_S2_MEMATTR(NORMAL);
|
||||
u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
|
||||
|
||||
if (!(prot & KVM_PGTABLE_PROT_X))
|
||||
attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
|
||||
else if (device)
|
||||
return -EINVAL;
|
||||
|
||||
if (prot & KVM_PGTABLE_PROT_R)
|
||||
attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
|
||||
|
||||
if (prot & KVM_PGTABLE_PROT_W)
|
||||
attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
|
||||
|
||||
attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
|
||||
attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
|
||||
data->attr = attr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
|
||||
kvm_pte_t *ptep,
|
||||
struct stage2_map_data *data)
|
||||
{
|
||||
u64 granule = kvm_granule_size(level), phys = data->phys;
|
||||
|
||||
if (!kvm_block_mapping_supported(addr, end, phys, level))
|
||||
return false;
|
||||
|
||||
if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level))
|
||||
goto out;
|
||||
|
||||
/* There's an existing valid leaf entry, so perform break-before-make */
|
||||
kvm_set_invalid_pte(ptep);
|
||||
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
|
||||
kvm_set_valid_leaf_pte(ptep, phys, data->attr, level);
|
||||
out:
|
||||
data->phys += granule;
|
||||
return true;
|
||||
}
|
||||
|
||||
static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
|
||||
kvm_pte_t *ptep,
|
||||
struct stage2_map_data *data)
|
||||
{
|
||||
if (data->anchor)
|
||||
return 0;
|
||||
|
||||
if (!kvm_block_mapping_supported(addr, end, data->phys, level))
|
||||
return 0;
|
||||
|
||||
kvm_set_invalid_pte(ptep);
|
||||
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, 0);
|
||||
data->anchor = ptep;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
|
||||
struct stage2_map_data *data)
|
||||
{
|
||||
kvm_pte_t *childp, pte = *ptep;
|
||||
struct page *page = virt_to_page(ptep);
|
||||
|
||||
if (data->anchor) {
|
||||
if (kvm_pte_valid(pte))
|
||||
put_page(page);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (stage2_map_walker_try_leaf(addr, end, level, ptep, data))
|
||||
goto out_get_page;
|
||||
|
||||
if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
|
||||
return -EINVAL;
|
||||
|
||||
if (!data->memcache)
|
||||
return -ENOMEM;
|
||||
|
||||
childp = kvm_mmu_memory_cache_alloc(data->memcache);
|
||||
if (!childp)
|
||||
return -ENOMEM;
|
||||
|
||||
/*
|
||||
* If we've run into an existing block mapping then replace it with
|
||||
* a table. Accesses beyond 'end' that fall within the new table
|
||||
* will be mapped lazily.
|
||||
*/
|
||||
if (kvm_pte_valid(pte)) {
|
||||
kvm_set_invalid_pte(ptep);
|
||||
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
|
||||
put_page(page);
|
||||
}
|
||||
|
||||
kvm_set_table_pte(ptep, childp);
|
||||
|
||||
out_get_page:
|
||||
get_page(page);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
|
||||
kvm_pte_t *ptep,
|
||||
struct stage2_map_data *data)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
if (!data->anchor)
|
||||
return 0;
|
||||
|
||||
free_page((unsigned long)kvm_pte_follow(*ptep));
|
||||
put_page(virt_to_page(ptep));
|
||||
|
||||
if (data->anchor == ptep) {
|
||||
data->anchor = NULL;
|
||||
ret = stage2_map_walk_leaf(addr, end, level, ptep, data);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* This is a little fiddly, as we use all three of the walk flags. The idea
|
||||
* is that the TABLE_PRE callback runs for table entries on the way down,
|
||||
* looking for table entries which we could conceivably replace with a
|
||||
* block entry for this mapping. If it finds one, then it sets the 'anchor'
|
||||
* field in 'struct stage2_map_data' to point at the table entry, before
|
||||
* clearing the entry to zero and descending into the now detached table.
|
||||
*
|
||||
* The behaviour of the LEAF callback then depends on whether or not the
|
||||
* anchor has been set. If not, then we're not using a block mapping higher
|
||||
* up the table and we perform the mapping at the existing leaves instead.
|
||||
* If, on the other hand, the anchor _is_ set, then we drop references to
|
||||
* all valid leaves so that the pages beneath the anchor can be freed.
|
||||
*
|
||||
* Finally, the TABLE_POST callback does nothing if the anchor has not
|
||||
* been set, but otherwise frees the page-table pages while walking back up
|
||||
* the page-table, installing the block entry when it revisits the anchor
|
||||
* pointer and clearing the anchor to NULL.
|
||||
*/
|
||||
static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
|
||||
enum kvm_pgtable_walk_flags flag, void * const arg)
|
||||
{
|
||||
struct stage2_map_data *data = arg;
|
||||
|
||||
switch (flag) {
|
||||
case KVM_PGTABLE_WALK_TABLE_PRE:
|
||||
return stage2_map_walk_table_pre(addr, end, level, ptep, data);
|
||||
case KVM_PGTABLE_WALK_LEAF:
|
||||
return stage2_map_walk_leaf(addr, end, level, ptep, data);
|
||||
case KVM_PGTABLE_WALK_TABLE_POST:
|
||||
return stage2_map_walk_table_post(addr, end, level, ptep, data);
|
||||
}
|
||||
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
|
||||
u64 phys, enum kvm_pgtable_prot prot,
|
||||
struct kvm_mmu_memory_cache *mc)
|
||||
{
|
||||
int ret;
|
||||
struct stage2_map_data map_data = {
|
||||
.phys = ALIGN_DOWN(phys, PAGE_SIZE),
|
||||
.mmu = pgt->mmu,
|
||||
.memcache = mc,
|
||||
};
|
||||
struct kvm_pgtable_walker walker = {
|
||||
.cb = stage2_map_walker,
|
||||
.flags = KVM_PGTABLE_WALK_TABLE_PRE |
|
||||
KVM_PGTABLE_WALK_LEAF |
|
||||
KVM_PGTABLE_WALK_TABLE_POST,
|
||||
.arg = &map_data,
|
||||
};
|
||||
|
||||
ret = stage2_map_set_prot_attr(prot, &map_data);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = kvm_pgtable_walk(pgt, addr, size, &walker);
|
||||
dsb(ishst);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void stage2_flush_dcache(void *addr, u64 size)
|
||||
{
|
||||
if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
|
||||
return;
|
||||
|
||||
__flush_dcache_area(addr, size);
|
||||
}
|
||||
|
||||
static bool stage2_pte_cacheable(kvm_pte_t pte)
|
||||
{
|
||||
u64 memattr = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR, pte);
|
||||
return memattr == PAGE_S2_MEMATTR(NORMAL);
|
||||
}
|
||||
|
||||
static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
|
||||
enum kvm_pgtable_walk_flags flag,
|
||||
void * const arg)
|
||||
{
|
||||
struct kvm_s2_mmu *mmu = arg;
|
||||
kvm_pte_t pte = *ptep, *childp = NULL;
|
||||
bool need_flush = false;
|
||||
|
||||
if (!kvm_pte_valid(pte))
|
||||
return 0;
|
||||
|
||||
if (kvm_pte_table(pte, level)) {
|
||||
childp = kvm_pte_follow(pte);
|
||||
|
||||
if (page_count(virt_to_page(childp)) != 1)
|
||||
return 0;
|
||||
} else if (stage2_pte_cacheable(pte)) {
|
||||
need_flush = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* This is similar to the map() path in that we unmap the entire
|
||||
* block entry and rely on the remaining portions being faulted
|
||||
* back lazily.
|
||||
*/
|
||||
kvm_set_invalid_pte(ptep);
|
||||
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
|
||||
put_page(virt_to_page(ptep));
|
||||
|
||||
if (need_flush) {
|
||||
stage2_flush_dcache(kvm_pte_follow(pte),
|
||||
kvm_granule_size(level));
|
||||
}
|
||||
|
||||
if (childp)
|
||||
free_page((unsigned long)childp);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
|
||||
{
|
||||
struct kvm_pgtable_walker walker = {
|
||||
.cb = stage2_unmap_walker,
|
||||
.arg = pgt->mmu,
|
||||
.flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
|
||||
};
|
||||
|
||||
return kvm_pgtable_walk(pgt, addr, size, &walker);
|
||||
}
|
||||
|
||||
struct stage2_attr_data {
|
||||
kvm_pte_t attr_set;
|
||||
kvm_pte_t attr_clr;
|
||||
kvm_pte_t pte;
|
||||
u32 level;
|
||||
};
|
||||
|
||||
static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
|
||||
enum kvm_pgtable_walk_flags flag,
|
||||
void * const arg)
|
||||
{
|
||||
kvm_pte_t pte = *ptep;
|
||||
struct stage2_attr_data *data = arg;
|
||||
|
||||
if (!kvm_pte_valid(pte))
|
||||
return 0;
|
||||
|
||||
data->level = level;
|
||||
data->pte = pte;
|
||||
pte &= ~data->attr_clr;
|
||||
pte |= data->attr_set;
|
||||
|
||||
/*
|
||||
* We may race with the CPU trying to set the access flag here,
|
||||
* but worst-case the access flag update gets lost and will be
|
||||
* set on the next access instead.
|
||||
*/
|
||||
if (data->pte != pte)
|
||||
WRITE_ONCE(*ptep, pte);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
|
||||
u64 size, kvm_pte_t attr_set,
|
||||
kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
|
||||
u32 *level)
|
||||
{
|
||||
int ret;
|
||||
kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
|
||||
struct stage2_attr_data data = {
|
||||
.attr_set = attr_set & attr_mask,
|
||||
.attr_clr = attr_clr & attr_mask,
|
||||
};
|
||||
struct kvm_pgtable_walker walker = {
|
||||
.cb = stage2_attr_walker,
|
||||
.arg = &data,
|
||||
.flags = KVM_PGTABLE_WALK_LEAF,
|
||||
};
|
||||
|
||||
ret = kvm_pgtable_walk(pgt, addr, size, &walker);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (orig_pte)
|
||||
*orig_pte = data.pte;
|
||||
|
||||
if (level)
|
||||
*level = data.level;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
|
||||
{
|
||||
return stage2_update_leaf_attrs(pgt, addr, size, 0,
|
||||
KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
|
||||
NULL, NULL);
|
||||
}
|
||||
|
||||
kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
|
||||
{
|
||||
kvm_pte_t pte = 0;
|
||||
stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
|
||||
&pte, NULL);
|
||||
dsb(ishst);
|
||||
return pte;
|
||||
}
|
||||
|
||||
kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
|
||||
{
|
||||
kvm_pte_t pte = 0;
|
||||
stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
|
||||
&pte, NULL);
|
||||
/*
|
||||
* "But where's the TLBI?!", you scream.
|
||||
* "Over in the core code", I sigh.
|
||||
*
|
||||
* See the '->clear_flush_young()' callback on the KVM mmu notifier.
|
||||
*/
|
||||
return pte;
|
||||
}
|
||||
|
||||
bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
|
||||
{
|
||||
kvm_pte_t pte = 0;
|
||||
stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL);
|
||||
return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
|
||||
}
|
||||
|
||||
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
|
||||
enum kvm_pgtable_prot prot)
|
||||
{
|
||||
int ret;
|
||||
u32 level;
|
||||
kvm_pte_t set = 0, clr = 0;
|
||||
|
||||
if (prot & KVM_PGTABLE_PROT_R)
|
||||
set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
|
||||
|
||||
if (prot & KVM_PGTABLE_PROT_W)
|
||||
set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
|
||||
|
||||
if (prot & KVM_PGTABLE_PROT_X)
|
||||
clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
|
||||
|
||||
ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level);
|
||||
if (!ret)
|
||||
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
|
||||
enum kvm_pgtable_walk_flags flag,
|
||||
void * const arg)
|
||||
{
|
||||
kvm_pte_t pte = *ptep;
|
||||
|
||||
if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pte))
|
||||
return 0;
|
||||
|
||||
stage2_flush_dcache(kvm_pte_follow(pte), kvm_granule_size(level));
|
||||
return 0;
|
||||
}
|
||||
|
||||
int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
|
||||
{
|
||||
struct kvm_pgtable_walker walker = {
|
||||
.cb = stage2_flush_walker,
|
||||
.flags = KVM_PGTABLE_WALK_LEAF,
|
||||
};
|
||||
|
||||
if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
|
||||
return 0;
|
||||
|
||||
return kvm_pgtable_walk(pgt, addr, size, &walker);
|
||||
}
|
||||
|
||||
int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm)
|
||||
{
|
||||
size_t pgd_sz;
|
||||
u64 vtcr = kvm->arch.vtcr;
|
||||
u32 ia_bits = VTCR_EL2_IPA(vtcr);
|
||||
u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
|
||||
u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
|
||||
|
||||
pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
|
||||
pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL | __GFP_ZERO);
|
||||
if (!pgt->pgd)
|
||||
return -ENOMEM;
|
||||
|
||||
pgt->ia_bits = ia_bits;
|
||||
pgt->start_level = start_level;
|
||||
pgt->mmu = &kvm->arch.mmu;
|
||||
|
||||
/* Ensure zeroed PGD pages are visible to the hardware walker */
|
||||
dsb(ishst);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
|
||||
enum kvm_pgtable_walk_flags flag,
|
||||
void * const arg)
|
||||
{
|
||||
kvm_pte_t pte = *ptep;
|
||||
|
||||
if (!kvm_pte_valid(pte))
|
||||
return 0;
|
||||
|
||||
put_page(virt_to_page(ptep));
|
||||
|
||||
if (kvm_pte_table(pte, level))
|
||||
free_page((unsigned long)kvm_pte_follow(pte));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
|
||||
{
|
||||
size_t pgd_sz;
|
||||
struct kvm_pgtable_walker walker = {
|
||||
.cb = stage2_free_walker,
|
||||
.flags = KVM_PGTABLE_WALK_LEAF |
|
||||
KVM_PGTABLE_WALK_TABLE_POST,
|
||||
};
|
||||
|
||||
WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
|
||||
pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
|
||||
free_pages_exact(pgt->pgd, pgd_sz);
|
||||
pgt->pgd = NULL;
|
||||
}
|
@ -28,6 +28,11 @@
|
||||
|
||||
const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
|
||||
|
||||
/* VHE specific context */
|
||||
DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
|
||||
DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
|
||||
DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
|
||||
|
||||
static void __activate_traps(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
u64 val;
|
||||
@ -59,7 +64,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
|
||||
|
||||
write_sysreg(val, cpacr_el1);
|
||||
|
||||
write_sysreg(kvm_get_hyp_vector(), vbar_el1);
|
||||
write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1);
|
||||
}
|
||||
NOKPROBE_SYMBOL(__activate_traps);
|
||||
|
||||
@ -108,7 +113,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
|
||||
struct kvm_cpu_context *guest_ctxt;
|
||||
u64 exit_code;
|
||||
|
||||
host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
|
||||
host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
|
||||
host_ctxt->__hyp_running_vcpu = vcpu;
|
||||
guest_ctxt = &vcpu->arch.ctxt;
|
||||
|
||||
@ -120,12 +125,12 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
|
||||
* HCR_EL2.TGE.
|
||||
*
|
||||
* We have already configured the guest's stage 1 translation in
|
||||
* kvm_vcpu_load_sysregs_vhe above. We must now call __activate_vm
|
||||
* before __activate_traps, because __activate_vm configures
|
||||
* stage 2 translation, and __activate_traps clear HCR_EL2.TGE
|
||||
* (among other things).
|
||||
* kvm_vcpu_load_sysregs_vhe above. We must now call
|
||||
* __load_guest_stage2 before __activate_traps, because
|
||||
* __load_guest_stage2 configures stage 2 translation, and
|
||||
* __activate_traps clear HCR_EL2.TGE (among other things).
|
||||
*/
|
||||
__activate_vm(vcpu->arch.hw_mmu);
|
||||
__load_guest_stage2(vcpu->arch.hw_mmu);
|
||||
__activate_traps(vcpu);
|
||||
|
||||
sysreg_restore_guest_state_vhe(guest_ctxt);
|
||||
@ -133,7 +138,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
|
||||
|
||||
do {
|
||||
/* Jump in the fire! */
|
||||
exit_code = __guest_enter(vcpu, host_ctxt);
|
||||
exit_code = __guest_enter(vcpu);
|
||||
|
||||
/* And we're baaack! */
|
||||
} while (fixup_guest_exit(vcpu, &exit_code));
|
||||
@ -188,10 +193,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __hyp_call_panic(u64 spsr, u64 elr, u64 par,
|
||||
struct kvm_cpu_context *host_ctxt)
|
||||
static void __hyp_call_panic(u64 spsr, u64 elr, u64 par)
|
||||
{
|
||||
struct kvm_cpu_context *host_ctxt;
|
||||
struct kvm_vcpu *vcpu;
|
||||
|
||||
host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
|
||||
vcpu = host_ctxt->__hyp_running_vcpu;
|
||||
|
||||
__deactivate_traps(vcpu);
|
||||
@ -204,13 +211,13 @@ static void __hyp_call_panic(u64 spsr, u64 elr, u64 par,
|
||||
}
|
||||
NOKPROBE_SYMBOL(__hyp_call_panic);
|
||||
|
||||
void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
|
||||
void __noreturn hyp_panic(void)
|
||||
{
|
||||
u64 spsr = read_sysreg_el2(SYS_SPSR);
|
||||
u64 elr = read_sysreg_el2(SYS_ELR);
|
||||
u64 par = read_sysreg(par_el1);
|
||||
|
||||
__hyp_call_panic(spsr, elr, par, host_ctxt);
|
||||
__hyp_call_panic(spsr, elr, par);
|
||||
unreachable();
|
||||
}
|
||||
|
||||
|
@ -66,7 +66,7 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu)
|
||||
struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
|
||||
struct kvm_cpu_context *host_ctxt;
|
||||
|
||||
host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
|
||||
host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
|
||||
__sysreg_save_user_state(host_ctxt);
|
||||
|
||||
/*
|
||||
@ -100,7 +100,7 @@ void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu)
|
||||
struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
|
||||
struct kvm_cpu_context *host_ctxt;
|
||||
|
||||
host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
|
||||
host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
|
||||
deactivate_traps_vhe_put();
|
||||
|
||||
__sysreg_save_el1_state(guest_ctxt);
|
||||
|
@ -202,6 +202,7 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
|
||||
|
||||
/**
|
||||
* kvm_inject_undefined - inject an undefined instruction into the guest
|
||||
* @vcpu: The vCPU in which to inject the exception
|
||||
*
|
||||
* It is assumed that this code is called from the VCPU thread and that the
|
||||
* VCPU therefore is not currently executing guest code.
|
||||
|
1611
arch/arm64/kvm/mmu.c
File diff suppressed because it is too large
@ -20,6 +20,21 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc);
|
||||
|
||||
#define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1
|
||||
|
||||
static u32 kvm_pmu_event_mask(struct kvm *kvm)
|
||||
{
|
||||
switch (kvm->arch.pmuver) {
|
||||
case 1: /* ARMv8.0 */
|
||||
return GENMASK(9, 0);
|
||||
case 4: /* ARMv8.1 */
|
||||
case 5: /* ARMv8.4 */
|
||||
case 6: /* ARMv8.5 */
|
||||
return GENMASK(15, 0);
|
||||
default: /* Shouldn't be here, just for sanity */
|
||||
WARN_ONCE(1, "Unknown PMU version %d\n", kvm->arch.pmuver);
|
||||
return 0;
|
||||
}
|
||||
}
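/*
 * Worked example (informative only): an ARMv8.1 PMU reports pmuver == 4, so
 * the mask above is GENMASK(15, 0) == 0xffff, and the event space later used
 * for the filter has kvm_pmu_event_mask() + 1 == 65536 entries.
 */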
|
||||
|
||||
/**
|
||||
* kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter
|
||||
* @vcpu: The vcpu pointer
|
||||
@ -100,7 +115,7 @@ static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx)
|
||||
return false;
|
||||
|
||||
reg = PMEVTYPER0_EL0 + select_idx;
|
||||
eventsel = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_EVENT;
|
||||
eventsel = __vcpu_sys_reg(vcpu, reg) & kvm_pmu_event_mask(vcpu->kvm);
|
||||
|
||||
return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN;
|
||||
}
|
||||
@ -516,7 +531,7 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
|
||||
|
||||
/* PMSWINC only applies to ... SW_INC! */
|
||||
type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i);
|
||||
type &= ARMV8_PMU_EVTYPE_EVENT;
|
||||
type &= kvm_pmu_event_mask(vcpu->kvm);
|
||||
if (type != ARMV8_PMUV3_PERFCTR_SW_INCR)
|
||||
continue;
|
||||
|
||||
@ -599,11 +614,21 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
|
||||
data = __vcpu_sys_reg(vcpu, reg);
|
||||
|
||||
kvm_pmu_stop_counter(vcpu, pmc);
|
||||
eventsel = data & ARMV8_PMU_EVTYPE_EVENT;
|
||||
if (pmc->idx == ARMV8_PMU_CYCLE_IDX)
|
||||
eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
|
||||
else
|
||||
eventsel = data & kvm_pmu_event_mask(vcpu->kvm);
|
||||
|
||||
/* Software increment event does't need to be backed by a perf event */
|
||||
if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR &&
|
||||
pmc->idx != ARMV8_PMU_CYCLE_IDX)
|
||||
/* Software increment event doesn't need to be backed by a perf event */
|
||||
if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If we have a filter in place and that the event isn't allowed, do
|
||||
* not install a perf event either.
|
||||
*/
|
||||
if (vcpu->kvm->arch.pmu_filter &&
|
||||
!test_bit(eventsel, vcpu->kvm->arch.pmu_filter))
|
||||
return;
|
||||
|
||||
memset(&attr, 0, sizeof(struct perf_event_attr));
|
||||
@ -615,8 +640,7 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
|
||||
attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0;
|
||||
attr.exclude_hv = 1; /* Don't count EL2 events */
|
||||
attr.exclude_host = 1; /* Don't count host events */
|
||||
attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ?
|
||||
ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel;
|
||||
attr.config = eventsel;
|
||||
|
||||
counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
|
||||
|
||||
@ -700,17 +724,95 @@ static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx)
|
||||
void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
|
||||
u64 select_idx)
|
||||
{
|
||||
u64 reg, event_type = data & ARMV8_PMU_EVTYPE_MASK;
|
||||
u64 reg, mask;
|
||||
|
||||
mask = ARMV8_PMU_EVTYPE_MASK;
|
||||
mask &= ~ARMV8_PMU_EVTYPE_EVENT;
|
||||
mask |= kvm_pmu_event_mask(vcpu->kvm);
|
||||
|
||||
reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
|
||||
? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx;
|
||||
|
||||
__vcpu_sys_reg(vcpu, reg) = event_type;
|
||||
__vcpu_sys_reg(vcpu, reg) = data & mask;
|
||||
|
||||
kvm_pmu_update_pmc_chained(vcpu, select_idx);
|
||||
kvm_pmu_create_perf_event(vcpu, select_idx);
|
||||
}
|
||||
|
||||
static int kvm_pmu_probe_pmuver(void)
|
||||
{
|
||||
struct perf_event_attr attr = { };
|
||||
struct perf_event *event;
|
||||
struct arm_pmu *pmu;
|
||||
int pmuver = 0xf;
|
||||
|
||||
/*
|
||||
* Create a dummy event that only counts user cycles. As we'll never
|
||||
* leave this function with the event being live, it will never
|
||||
* count anything. But it allows us to probe some of the PMU
|
||||
* details. Yes, this is terrible.
|
||||
*/
|
||||
attr.type = PERF_TYPE_RAW;
|
||||
attr.size = sizeof(attr);
|
||||
attr.pinned = 1;
|
||||
attr.disabled = 0;
|
||||
attr.exclude_user = 0;
|
||||
attr.exclude_kernel = 1;
|
||||
attr.exclude_hv = 1;
|
||||
attr.exclude_host = 1;
|
||||
attr.config = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
|
||||
attr.sample_period = GENMASK(63, 0);
|
||||
|
||||
event = perf_event_create_kernel_counter(&attr, -1, current,
|
||||
kvm_pmu_perf_overflow, &attr);
|
||||
|
||||
if (IS_ERR(event)) {
|
||||
pr_err_once("kvm: pmu event creation failed %ld\n",
|
||||
PTR_ERR(event));
|
||||
return 0xf;
|
||||
}
|
||||
|
||||
if (event->pmu) {
|
||||
pmu = to_arm_pmu(event->pmu);
|
||||
if (pmu->pmuver)
|
||||
pmuver = pmu->pmuver;
|
||||
}
|
||||
|
||||
perf_event_disable(event);
|
||||
perf_event_release_kernel(event);
|
||||
|
||||
return pmuver;
|
||||
}
|
||||
|
||||
u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
|
||||
{
|
||||
unsigned long *bmap = vcpu->kvm->arch.pmu_filter;
|
||||
u64 val, mask = 0;
|
||||
int base, i;
|
||||
|
||||
if (!pmceid1) {
|
||||
val = read_sysreg(pmceid0_el0);
|
||||
base = 0;
|
||||
} else {
|
||||
val = read_sysreg(pmceid1_el0);
|
||||
base = 32;
|
||||
}
|
||||
|
||||
if (!bmap)
|
||||
return val;
|
||||
|
||||
for (i = 0; i < 32; i += 8) {
|
||||
u64 byte;
|
||||
|
||||
byte = bitmap_get_value8(bmap, base + i);
|
||||
mask |= byte << i;
|
||||
byte = bitmap_get_value8(bmap, 0x4000 + base + i);
|
||||
mask |= byte << (32 + i);
|
||||
}
|
||||
|
||||
return val & mask;
|
||||
}
|
||||
|
||||
bool kvm_arm_support_pmu_v3(void)
|
||||
{
|
||||
/*
|
||||
@ -756,15 +858,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
|
||||
|
||||
static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (!kvm_arm_support_pmu_v3())
|
||||
return -ENODEV;
|
||||
|
||||
if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
|
||||
return -ENXIO;
|
||||
|
||||
if (vcpu->arch.pmu.created)
|
||||
return -EBUSY;
|
||||
|
||||
if (irqchip_in_kernel(vcpu->kvm)) {
|
||||
int ret;
|
||||
|
||||
@ -820,6 +913,19 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
|
||||
|
||||
int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
|
||||
{
|
||||
if (!kvm_arm_support_pmu_v3() ||
|
||||
!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
|
||||
return -ENODEV;
|
||||
|
||||
if (vcpu->arch.pmu.created)
|
||||
return -EBUSY;
|
||||
|
||||
if (!vcpu->kvm->arch.pmuver)
|
||||
vcpu->kvm->arch.pmuver = kvm_pmu_probe_pmuver();
|
||||
|
||||
if (vcpu->kvm->arch.pmuver == 0xf)
|
||||
return -ENODEV;
|
||||
|
||||
switch (attr->attr) {
|
||||
case KVM_ARM_VCPU_PMU_V3_IRQ: {
|
||||
int __user *uaddr = (int __user *)(long)attr->addr;
|
||||
@ -828,9 +934,6 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
|
||||
if (!irqchip_in_kernel(vcpu->kvm))
|
||||
return -EINVAL;
|
||||
|
||||
if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
|
||||
return -ENODEV;
|
||||
|
||||
if (get_user(irq, uaddr))
|
||||
return -EFAULT;
|
||||
|
||||
@ -848,6 +951,53 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
|
||||
vcpu->arch.pmu.irq_num = irq;
|
||||
return 0;
|
||||
}
|
||||
case KVM_ARM_VCPU_PMU_V3_FILTER: {
|
||||
struct kvm_pmu_event_filter __user *uaddr;
|
||||
struct kvm_pmu_event_filter filter;
|
||||
int nr_events;
|
||||
|
||||
nr_events = kvm_pmu_event_mask(vcpu->kvm) + 1;
|
||||
|
||||
uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr;
|
||||
|
||||
if (copy_from_user(&filter, uaddr, sizeof(filter)))
|
||||
return -EFAULT;
|
||||
|
||||
if (((u32)filter.base_event + filter.nevents) > nr_events ||
|
||||
(filter.action != KVM_PMU_EVENT_ALLOW &&
|
||||
filter.action != KVM_PMU_EVENT_DENY))
|
||||
return -EINVAL;
|
||||
|
||||
mutex_lock(&vcpu->kvm->lock);
|
||||
|
||||
if (!vcpu->kvm->arch.pmu_filter) {
|
||||
vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL);
|
||||
if (!vcpu->kvm->arch.pmu_filter) {
|
||||
mutex_unlock(&vcpu->kvm->lock);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/*
|
||||
* The default depends on the first applied filter.
|
||||
* If it allows events, the default is to deny.
|
||||
* Conversely, if the first filter denies a set of
|
||||
* events, the default is to allow.
|
||||
*/
|
||||
if (filter.action == KVM_PMU_EVENT_ALLOW)
|
||||
bitmap_zero(vcpu->kvm->arch.pmu_filter, nr_events);
|
||||
else
|
||||
bitmap_fill(vcpu->kvm->arch.pmu_filter, nr_events);
|
||||
}
|
||||
|
||||
if (filter.action == KVM_PMU_EVENT_ALLOW)
|
||||
bitmap_set(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents);
|
||||
else
|
||||
bitmap_clear(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents);
|
||||
|
||||
mutex_unlock(&vcpu->kvm->lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
case KVM_ARM_VCPU_PMU_V3_INIT:
|
||||
return kvm_arm_pmu_v3_init(vcpu);
|
||||
}
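/*
 * Hypothetical userspace sketch for the filter attribute handled above:
 * allow only the CPU cycle counter event (0x11); because the first filter
 * applied is an ALLOW, every other event then defaults to denied. The
 * KVM_ARM_VCPU_PMU_V3_CTRL group and the KVM_SET_DEVICE_ATTR vcpu ioctl are
 * the usual device-attr plumbing and are assumed here rather than shown in
 * this hunk; <sys/ioctl.h> and <linux/kvm.h> are required.
 */
static int example_set_cycle_filter(int vcpu_fd)
{
	struct kvm_pmu_event_filter filter = {
		.base_event	= 0x11,	/* ARMV8_PMUV3_PERFCTR_CPU_CYCLES */
		.nevents	= 1,
		.action		= KVM_PMU_EVENT_ALLOW,
	};
	struct kvm_device_attr dev_attr = {
		.group	= KVM_ARM_VCPU_PMU_V3_CTRL,
		.attr	= KVM_ARM_VCPU_PMU_V3_FILTER,
		.addr	= (__u64)&filter,
	};

	return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &dev_attr);
}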
|
||||
@ -884,6 +1034,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
|
||||
switch (attr->attr) {
|
||||
case KVM_ARM_VCPU_PMU_V3_IRQ:
|
||||
case KVM_ARM_VCPU_PMU_V3_INIT:
|
||||
case KVM_ARM_VCPU_PMU_V3_FILTER:
|
||||
if (kvm_arm_support_pmu_v3() &&
|
||||
test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
|
||||
return 0;
|
||||
|
@ -31,9 +31,9 @@ static bool kvm_pmu_switch_needed(struct perf_event_attr *attr)
|
||||
*/
|
||||
void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
|
||||
{
|
||||
struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data);
|
||||
struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
|
||||
|
||||
if (!kvm_pmu_switch_needed(attr))
|
||||
if (!ctx || !kvm_pmu_switch_needed(attr))
|
||||
return;
|
||||
|
||||
if (!attr->exclude_host)
|
||||
@ -47,7 +47,10 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
|
||||
*/
|
||||
void kvm_clr_pmu_events(u32 clr)
|
||||
{
|
||||
struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data);
|
||||
struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
|
||||
|
||||
if (!ctx)
|
||||
return;
|
||||
|
||||
ctx->pmu_events.events_host &= ~clr;
|
||||
ctx->pmu_events.events_guest &= ~clr;
|
||||
@ -173,7 +176,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
|
||||
return;
|
||||
|
||||
preempt_disable();
|
||||
host = this_cpu_ptr(&kvm_host_data);
|
||||
host = this_cpu_ptr_hyp_sym(kvm_host_data);
|
||||
events_guest = host->pmu_events.events_guest;
|
||||
events_host = host->pmu_events.events_host;
|
||||
|
||||
@ -193,7 +196,7 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu)
|
||||
if (!has_vhe())
|
||||
return;
|
||||
|
||||
host = this_cpu_ptr(&kvm_host_data);
|
||||
host = this_cpu_ptr_hyp_sym(kvm_host_data);
|
||||
events_guest = host->pmu_events.events_guest;
|
||||
events_host = host->pmu_events.events_host;
|
||||
|
||||
|
@ -335,7 +335,7 @@ u32 get_kvm_ipa_limit(void)
|
||||
|
||||
int kvm_set_ipa_limit(void)
|
||||
{
|
||||
unsigned int ipa_max, pa_max, va_max, parange, tgran_2;
|
||||
unsigned int parange, tgran_2;
|
||||
u64 mmfr0;
|
||||
|
||||
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
|
||||
@ -372,39 +372,11 @@ int kvm_set_ipa_limit(void)
|
||||
break;
|
||||
}
|
||||
|
||||
pa_max = id_aa64mmfr0_parange_to_phys_shift(parange);
|
||||
|
||||
/* Clamp the IPA limit to the PA size supported by the kernel */
|
||||
ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max;
|
||||
/*
|
||||
* Since our stage2 table is dependent on the stage1 page table code,
|
||||
* we must always honor the following condition:
|
||||
*
|
||||
* Number of levels in Stage1 >= Number of levels in Stage2.
|
||||
*
|
||||
* So clamp the ipa limit further down to limit the number of levels.
|
||||
* Since we can concatenate upto 16 tables at entry level, we could
|
||||
* go upto 4bits above the maximum VA addressable with the current
|
||||
* number of levels.
|
||||
*/
|
||||
va_max = PGDIR_SHIFT + PAGE_SHIFT - 3;
|
||||
va_max += 4;
|
||||
|
||||
if (va_max < ipa_max)
|
||||
ipa_max = va_max;
|
||||
|
||||
/*
|
||||
* If the final limit is lower than the real physical address
|
||||
* limit of the CPUs, report the reason.
|
||||
*/
|
||||
if (ipa_max < pa_max)
|
||||
pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n",
|
||||
(va_max < pa_max) ? "Virtual" : "Physical");
|
||||
|
||||
WARN(ipa_max < KVM_PHYS_SHIFT,
|
||||
"KVM IPA limit (%d bit) is smaller than default size\n", ipa_max);
|
||||
kvm_ipa_limit = ipa_max;
|
||||
kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit);
|
||||
kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange);
|
||||
WARN(kvm_ipa_limit < KVM_PHYS_SHIFT,
|
||||
"KVM IPA Size Limit (%d bits) is smaller than default size\n",
|
||||
kvm_ipa_limit);
|
||||
kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -769,10 +769,7 @@ static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
|
||||
if (pmu_access_el0_disabled(vcpu))
|
||||
return false;
|
||||
|
||||
if (!(p->Op2 & 1))
|
||||
pmceid = read_sysreg(pmceid0_el0);
|
||||
else
|
||||
pmceid = read_sysreg(pmceid1_el0);
|
||||
pmceid = kvm_pmu_get_pmceid(vcpu, (p->Op2 & 1));
|
||||
|
||||
p->regval = pmceid;
|
||||
|
||||
|
@ -260,34 +260,14 @@ static int vgic_debug_show(struct seq_file *s, void *v)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct seq_operations vgic_debug_seq_ops = {
|
||||
static const struct seq_operations vgic_debug_sops = {
|
||||
.start = vgic_debug_start,
|
||||
.next = vgic_debug_next,
|
||||
.stop = vgic_debug_stop,
|
||||
.show = vgic_debug_show
|
||||
};
|
||||
|
||||
static int debug_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
int ret;
|
||||
ret = seq_open(file, &vgic_debug_seq_ops);
|
||||
if (!ret) {
|
||||
struct seq_file *seq;
|
||||
/* seq_open will have modified file->private_data */
|
||||
seq = file->private_data;
|
||||
seq->private = inode->i_private;
|
||||
}
|
||||
|
||||
return ret;
|
||||
};
|
||||
|
||||
static const struct file_operations vgic_debug_fops = {
|
||||
.owner = THIS_MODULE,
|
||||
.open = debug_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release
|
||||
};
|
||||
DEFINE_SEQ_ATTRIBUTE(vgic_debug);
|
||||
|
||||
void vgic_debug_init(struct kvm *kvm)
|
||||
{
|
||||
|
@ -662,7 +662,7 @@ void vgic_v3_load(struct kvm_vcpu *vcpu)
|
||||
if (likely(cpu_if->vgic_sre))
|
||||
kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
|
||||
|
||||
kvm_call_hyp(__vgic_v3_restore_aprs, kern_hyp_va(cpu_if));
|
||||
kvm_call_hyp(__vgic_v3_restore_aprs, cpu_if);
|
||||
|
||||
if (has_vhe())
|
||||
__vgic_v3_activate_traps(cpu_if);
|
||||
@ -686,7 +686,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu)
|
||||
|
||||
vgic_v3_vmcr_sync(vcpu);
|
||||
|
||||
kvm_call_hyp(__vgic_v3_save_aprs, kern_hyp_va(cpu_if));
|
||||
kvm_call_hyp(__vgic_v3_save_aprs, cpu_if);
|
||||
|
||||
if (has_vhe())
|
||||
__vgic_v3_deactivate_traps(cpu_if);
|
||||
|
@ -341,7 +341,7 @@ struct kvm_mips_tlb {
|
||||
#define KVM_MIPS_GUEST_TLB_SIZE 64
|
||||
struct kvm_vcpu_arch {
|
||||
void *guest_ebase;
|
||||
int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu);
|
||||
int (*vcpu_run)(struct kvm_vcpu *vcpu);
|
||||
|
||||
/* Host registers preserved across guest mode execution */
|
||||
unsigned long host_stack;
|
||||
@ -852,7 +852,7 @@ int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks);
|
||||
/* Debug: dump vcpu state */
|
||||
int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu);
|
||||
|
||||
extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu);
|
||||
extern int kvm_mips_handle_exit(struct kvm_vcpu *vcpu);
|
||||
|
||||
/* Building of entry/exception code */
|
||||
int kvm_mips_entry_setup(void);
|
||||
|
@ -205,7 +205,7 @@ static inline void build_set_exc_base(u32 **p, unsigned int reg)
|
||||
* Assemble the start of the vcpu_run function to run a guest VCPU. The function
|
||||
* conforms to the following prototype:
|
||||
*
|
||||
* int vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu);
|
||||
* int vcpu_run(struct kvm_vcpu *vcpu);
|
||||
*
|
||||
* The exit from the guest and return to the caller is handled by the code
|
||||
* generated by kvm_mips_build_ret_to_host().
|
||||
@ -218,8 +218,7 @@ void *kvm_mips_build_vcpu_run(void *addr)
|
||||
unsigned int i;
|
||||
|
||||
/*
|
||||
* A0: run
|
||||
* A1: vcpu
|
||||
* A0: vcpu
|
||||
*/
|
||||
|
||||
/* k0/k1 not being used in host kernel context */
|
||||
@ -238,10 +237,10 @@ void *kvm_mips_build_vcpu_run(void *addr)
|
||||
kvm_mips_build_save_scratch(&p, V1, K1);
|
||||
|
||||
/* VCPU scratch register has pointer to vcpu */
|
||||
UASM_i_MTC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
|
||||
UASM_i_MTC0(&p, A0, scratch_vcpu[0], scratch_vcpu[1]);
|
||||
|
||||
/* Offset into vcpu->arch */
|
||||
UASM_i_ADDIU(&p, K1, A1, offsetof(struct kvm_vcpu, arch));
|
||||
UASM_i_ADDIU(&p, K1, A0, offsetof(struct kvm_vcpu, arch));
|
||||
|
||||
/*
|
||||
* Save the host stack to VCPU, used for exception processing
|
||||
@ -645,10 +644,7 @@ void *kvm_mips_build_exit(void *addr)
|
||||
/* Now that context has been saved, we can use other registers */
|
||||
|
||||
/* Restore vcpu */
|
||||
UASM_i_MFC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]);
|
||||
|
||||
/* Restore run (vcpu->run) */
|
||||
UASM_i_LW(&p, S0, offsetof(struct kvm_vcpu, run), S1);
|
||||
UASM_i_MFC0(&p, S0, scratch_vcpu[0], scratch_vcpu[1]);
|
||||
|
||||
/*
|
||||
* Save Host level EPC, BadVaddr and Cause to VCPU, useful to process
|
||||
@ -810,7 +806,6 @@ void *kvm_mips_build_exit(void *addr)
|
||||
* with this in the kernel
|
||||
*/
|
||||
uasm_i_move(&p, A0, S0);
|
||||
uasm_i_move(&p, A1, S1);
|
||||
UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit);
|
||||
uasm_i_jalr(&p, RA, T9);
|
||||
UASM_i_ADDIU(&p, SP, SP, -CALLFRAME_SIZ);
|
||||
@ -852,7 +847,7 @@ static void *kvm_mips_build_ret_from_exit(void *addr)
|
||||
* guest, reload k1
|
||||
*/
|
||||
|
||||
uasm_i_move(&p, K1, S1);
|
||||
uasm_i_move(&p, K1, S0);
|
||||
UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
|
||||
|
||||
/*
|
||||
@ -886,8 +881,8 @@ static void *kvm_mips_build_ret_to_guest(void *addr)
|
||||
{
|
||||
u32 *p = addr;
|
||||
|
||||
/* Put the saved pointer to vcpu (s1) back into the scratch register */
|
||||
UASM_i_MTC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]);
|
||||
/* Put the saved pointer to vcpu (s0) back into the scratch register */
|
||||
UASM_i_MTC0(&p, S0, scratch_vcpu[0], scratch_vcpu[1]);
|
||||
|
||||
/* Load up the Guest EBASE to minimize the window where BEV is set */
|
||||
UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1);
|
||||
|
@@ -1199,8 +1199,9 @@ static void kvm_mips_set_c0_status(void)
/*
* Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV)
*/
int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
int kvm_mips_handle_exit(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
u32 cause = vcpu->arch.host_cp0_cause;
u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
u32 __user *opc = (u32 __user *) vcpu->arch.pc;

@@ -1241,7 +1241,7 @@ static int kvm_trap_emul_vcpu_run(struct kvm_vcpu *vcpu)
*/
kvm_mips_suspend_mm(cpu);

r = vcpu->arch.vcpu_run(vcpu->run, vcpu);
r = vcpu->arch.vcpu_run(vcpu);

/* We may have migrated while handling guest exits */
cpu = smp_processor_id();

@@ -3266,7 +3266,7 @@ static int kvm_vz_vcpu_run(struct kvm_vcpu *vcpu)
kvm_vz_vcpu_load_tlb(vcpu, cpu);
kvm_vz_vcpu_load_wired(vcpu);

r = vcpu->arch.vcpu_run(vcpu->run, vcpu);
r = vcpu->arch.vcpu_run(vcpu);

kvm_vz_vcpu_save_wired(vcpu);
@@ -326,6 +326,7 @@ struct kvm_arch {
#endif
#ifdef CONFIG_KVM_XICS
struct kvmppc_xics *xics;
struct kvmppc_xics *xics_device;
struct kvmppc_xive *xive; /* Current XIVE device in use */
struct {
struct kvmppc_xive *native;

@@ -558,12 +558,12 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
return -ENOTSUPP;
return -EOPNOTSUPP;
}

int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
return -ENOTSUPP;
return -EOPNOTSUPP;
}

int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,

@@ -879,13 +879,15 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
#ifdef CONFIG_KVM_XICS
/*
* Free the XIVE devices which are not directly freed by the
* Free the XIVE and XICS devices which are not directly freed by the
* device 'release' method
*/
kfree(kvm->arch.xive_devices.native);
kvm->arch.xive_devices.native = NULL;
kfree(kvm->arch.xive_devices.xics_on_xive);
kvm->arch.xive_devices.xics_on_xive = NULL;
kfree(kvm->arch.xics_device);
kvm->arch.xics_device = NULL;
#endif /* CONFIG_KVM_XICS */
}
@@ -347,7 +347,7 @@ static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
return __radix_pte_update(ptep, clr, set);
}

void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);

@@ -283,7 +283,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvmppc_spapr_tce_table *siter;
struct mm_struct *mm = kvm->mm;
unsigned long npages, size = args->size;
int ret = -ENOMEM;
int ret;

if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
(args->offset + args->size > (ULLONG_MAX >> args->page_shift)))

@@ -489,7 +489,7 @@ static long kvmppc_tce_iommu_unmap(struct kvm *kvm,
return ret;
}

long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
static long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
unsigned long entry, unsigned long ua,
enum dma_data_direction dir)
{

@@ -237,7 +237,7 @@ static long iommu_tce_xchg_no_kill_rm(struct mm_struct *mm,
return ret;
}

extern void iommu_tce_kill_rm(struct iommu_table *tbl,
static void iommu_tce_kill_rm(struct iommu_table *tbl,
unsigned long entry, unsigned long pages)
{
if (tbl->it_ops->tce_kill)
@ -111,7 +111,7 @@ module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
|
||||
MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)");
|
||||
|
||||
#ifdef CONFIG_KVM_XICS
|
||||
static struct kernel_param_ops module_param_ops = {
|
||||
static const struct kernel_param_ops module_param_ops = {
|
||||
.set = param_set_int,
|
||||
.get = param_get_int,
|
||||
};
|
||||
@ -3442,9 +3442,19 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
|
||||
unsigned long host_psscr = mfspr(SPRN_PSSCR);
|
||||
unsigned long host_pidr = mfspr(SPRN_PID);
|
||||
|
||||
/*
|
||||
* P8 and P9 suppress the HDEC exception when LPCR[HDICE] = 0,
|
||||
* so set HDICE before writing HDEC.
|
||||
*/
|
||||
mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr | LPCR_HDICE);
|
||||
isync();
|
||||
|
||||
hdec = time_limit - mftb();
|
||||
if (hdec < 0)
|
||||
if (hdec < 0) {
|
||||
mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
|
||||
isync();
|
||||
return BOOK3S_INTERRUPT_HV_DECREMENTER;
|
||||
}
|
||||
mtspr(SPRN_HDEC, hdec);
|
||||
|
||||
if (vc->tb_offset) {
|
||||
@ -3565,7 +3575,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
|
||||
* Virtual-mode guest entry for POWER9 and later when the host and
|
||||
* guest are both using the radix MMU. The LPIDR has already been set.
|
||||
*/
|
||||
int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
|
||||
static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
|
||||
unsigned long lpcr)
|
||||
{
|
||||
struct kvmppc_vcore *vc = vcpu->arch.vcore;
|
||||
@ -3579,7 +3589,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
|
||||
|
||||
dec = mfspr(SPRN_DEC);
|
||||
tb = mftb();
|
||||
if (dec < 512)
|
||||
if (dec < 0)
|
||||
return BOOK3S_INTERRUPT_HV_DECREMENTER;
|
||||
local_paca->kvm_hstate.dec_expires = dec + tb;
|
||||
if (local_paca->kvm_hstate.dec_expires < time_limit)
|
||||
@ -5257,6 +5267,12 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp,
|
||||
case KVM_PPC_ALLOCATE_HTAB: {
|
||||
u32 htab_order;
|
||||
|
||||
/* If we're a nested hypervisor, we currently only support radix */
|
||||
if (kvmhv_on_pseries()) {
|
||||
r = -EOPNOTSUPP;
|
||||
break;
|
||||
}
|
||||
|
||||
r = -EFAULT;
|
||||
if (get_user(htab_order, (u32 __user *)argp))
|
||||
break;
|
||||
|
@ -58,13 +58,16 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
|
||||
/*
|
||||
* Put whatever is in the decrementer into the
|
||||
* hypervisor decrementer.
|
||||
* Because of a hardware deviation in P8 and P9,
|
||||
* we need to set LPCR[HDICE] before writing HDEC.
|
||||
*/
|
||||
BEGIN_FTR_SECTION
|
||||
ld r5, HSTATE_KVM_VCORE(r13)
|
||||
ld r6, VCORE_KVM(r5)
|
||||
ld r9, KVM_HOST_LPCR(r6)
|
||||
andis. r9, r9, LPCR_LD@h
|
||||
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
|
||||
ori r8, r9, LPCR_HDICE
|
||||
mtspr SPRN_LPCR, r8
|
||||
isync
|
||||
andis. r0, r9, LPCR_LD@h
|
||||
mfspr r8,SPRN_DEC
|
||||
mftb r7
|
||||
BEGIN_FTR_SECTION
|
||||
|
@ -569,7 +569,7 @@ static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp)
|
||||
kvmhv_set_nested_ptbl(gp);
|
||||
}
|
||||
|
||||
struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
|
||||
static struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
|
||||
{
|
||||
struct kvm_nested_guest *gp;
|
||||
long shadow_lpid;
|
||||
|
@ -764,7 +764,7 @@ int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
|
||||
return ics_rm_eoi(vcpu, irq);
|
||||
}
|
||||
|
||||
unsigned long eoi_rc;
|
||||
static unsigned long eoi_rc;
|
||||
|
||||
static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
|
||||
{
|
||||
|
@ -569,7 +569,7 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
|
||||
#endif
|
||||
}
|
||||
|
||||
void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr)
|
||||
static void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr)
|
||||
{
|
||||
u32 host_pvr;
|
||||
|
||||
|
@ -1334,47 +1334,97 @@ static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
|
||||
return -ENXIO;
|
||||
}
|
||||
|
||||
static void kvmppc_xics_free(struct kvm_device *dev)
|
||||
/*
|
||||
* Called when device fd is closed. kvm->lock is held.
|
||||
*/
|
||||
static void kvmppc_xics_release(struct kvm_device *dev)
|
||||
{
|
||||
struct kvmppc_xics *xics = dev->private;
|
||||
int i;
|
||||
struct kvm *kvm = xics->kvm;
|
||||
struct kvm_vcpu *vcpu;
|
||||
|
||||
pr_devel("Releasing xics device\n");
|
||||
|
||||
/*
|
||||
* Since this is the device release function, we know that
|
||||
* userspace does not have any open fd referring to the
|
||||
* device. Therefore there can not be any of the device
|
||||
* attribute set/get functions being executed concurrently,
|
||||
* and similarly, the connect_vcpu and set/clr_mapped
|
||||
* functions also cannot be being executed.
|
||||
*/
|
||||
|
||||
debugfs_remove(xics->dentry);
|
||||
|
||||
/*
|
||||
* We should clean up the vCPU interrupt presenters first.
|
||||
*/
|
||||
kvm_for_each_vcpu(i, vcpu, kvm) {
|
||||
/*
|
||||
* Take vcpu->mutex to ensure that no one_reg get/set ioctl
|
||||
* (i.e. kvmppc_xics_[gs]et_icp) can be done concurrently.
|
||||
* Holding the vcpu->mutex also means that execution is
|
||||
* excluded for the vcpu until the ICP was freed. When the vcpu
|
||||
* can execute again, vcpu->arch.icp and vcpu->arch.irq_type
|
||||
* have been cleared and the vcpu will not be going into the
|
||||
* XICS code anymore.
|
||||
*/
|
||||
mutex_lock(&vcpu->mutex);
|
||||
kvmppc_xics_free_icp(vcpu);
|
||||
mutex_unlock(&vcpu->mutex);
|
||||
}
|
||||
|
||||
if (kvm)
|
||||
kvm->arch.xics = NULL;
|
||||
|
||||
for (i = 0; i <= xics->max_icsid; i++)
|
||||
for (i = 0; i <= xics->max_icsid; i++) {
|
||||
kfree(xics->ics[i]);
|
||||
kfree(xics);
|
||||
xics->ics[i] = NULL;
|
||||
}
|
||||
/*
|
||||
* A reference of the kvmppc_xics pointer is now kept under
|
||||
* the xics_device pointer of the machine for reuse. It is
|
||||
* freed when the VM is destroyed for now until we fix all the
|
||||
* execution paths.
|
||||
*/
|
||||
kfree(dev);
|
||||
}
|
||||
|
||||
static struct kvmppc_xics *kvmppc_xics_get_device(struct kvm *kvm)
|
||||
{
|
||||
struct kvmppc_xics **kvm_xics_device = &kvm->arch.xics_device;
|
||||
struct kvmppc_xics *xics = *kvm_xics_device;
|
||||
|
||||
if (!xics) {
|
||||
xics = kzalloc(sizeof(*xics), GFP_KERNEL);
|
||||
*kvm_xics_device = xics;
|
||||
} else {
|
||||
memset(xics, 0, sizeof(*xics));
|
||||
}
|
||||
|
||||
return xics;
|
||||
}
|
||||
|
||||
static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
|
||||
{
|
||||
struct kvmppc_xics *xics;
|
||||
struct kvm *kvm = dev->kvm;
|
||||
int ret = 0;
|
||||
|
||||
xics = kzalloc(sizeof(*xics), GFP_KERNEL);
|
||||
pr_devel("Creating xics for partition\n");
|
||||
|
||||
/* Already there ? */
|
||||
if (kvm->arch.xics)
|
||||
return -EEXIST;
|
||||
|
||||
xics = kvmppc_xics_get_device(kvm);
|
||||
if (!xics)
|
||||
return -ENOMEM;
|
||||
|
||||
dev->private = xics;
|
||||
xics->dev = dev;
|
||||
xics->kvm = kvm;
|
||||
|
||||
/* Already there ? */
|
||||
if (kvm->arch.xics)
|
||||
ret = -EEXIST;
|
||||
else
|
||||
kvm->arch.xics = xics;
|
||||
|
||||
if (ret) {
|
||||
kfree(xics);
|
||||
return ret;
|
||||
}
|
||||
kvm->arch.xics = xics;
|
||||
|
||||
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
|
||||
if (cpu_has_feature(CPU_FTR_ARCH_206) &&
|
||||
@ -1399,7 +1449,7 @@ struct kvm_device_ops kvm_xics_ops = {
|
||||
.name = "kvm-xics",
|
||||
.create = kvmppc_xics_create,
|
||||
.init = kvmppc_xics_init,
|
||||
.destroy = kvmppc_xics_free,
|
||||
.release = kvmppc_xics_release,
|
||||
.set_attr = xics_set_attr,
|
||||
.get_attr = xics_get_attr,
|
||||
.has_attr = xics_has_attr,
|
||||
@ -1415,7 +1465,7 @@ int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
|
||||
return -EPERM;
|
||||
if (xics->kvm != vcpu->kvm)
|
||||
return -EPERM;
|
||||
if (vcpu->arch.irq_type)
|
||||
if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
|
||||
return -EBUSY;
|
||||
|
||||
r = kvmppc_xics_create_icp(vcpu, xcpu);
|
||||
|
@@ -1227,17 +1227,7 @@ static int xive_native_debug_show(struct seq_file *m, void *private)
return 0;
}

static int xive_native_debug_open(struct inode *inode, struct file *file)
{
return single_open(file, xive_native_debug_show, inode->i_private);
}

static const struct file_operations xive_native_debug_fops = {
.open = xive_native_debug_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
DEFINE_SHOW_ATTRIBUTE(xive_native_debug);

static void xive_native_debugfs_init(struct kvmppc_xive *xive)
{
@@ -1747,12 +1747,12 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
return -ENOTSUPP;
return -EOPNOTSUPP;
}

int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
return -ENOTSUPP;
return -EOPNOTSUPP;
}

int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,

@@ -1773,7 +1773,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
{
return -ENOTSUPP;
return -EOPNOTSUPP;
}

void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
@ -80,13 +80,14 @@
|
||||
#define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21)
|
||||
#define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22)
|
||||
#define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23)
|
||||
#define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24)
|
||||
#define KVM_REQ_GET_NESTED_STATE_PAGES KVM_ARCH_REQ(24)
|
||||
#define KVM_REQ_APICV_UPDATE \
|
||||
KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
|
||||
#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26)
|
||||
#define KVM_REQ_HV_TLB_FLUSH \
|
||||
KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
|
||||
#define KVM_REQ_APF_READY KVM_ARCH_REQ(28)
|
||||
#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29)
|
||||
|
||||
#define CR0_RESERVED_BITS \
|
||||
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
|
||||
@ -132,7 +133,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
|
||||
#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
|
||||
#define KVM_MIN_FREE_MMU_PAGES 5
|
||||
#define KVM_REFILL_PAGES 25
|
||||
#define KVM_MAX_CPUID_ENTRIES 80
|
||||
#define KVM_MAX_CPUID_ENTRIES 256
|
||||
#define KVM_NR_FIXED_MTRR_REGION 88
|
||||
#define KVM_NR_VAR_MTRR 8
|
||||
|
||||
@ -636,7 +637,7 @@ struct kvm_vcpu_arch {
|
||||
int halt_request; /* real mode on Intel only */
|
||||
|
||||
int cpuid_nent;
|
||||
struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
|
||||
struct kvm_cpuid_entry2 *cpuid_entries;
|
||||
|
||||
int maxphyaddr;
|
||||
int max_tdp_level;
|
||||
@ -788,6 +789,21 @@ struct kvm_vcpu_arch {
|
||||
|
||||
/* AMD MSRC001_0015 Hardware Configuration */
|
||||
u64 msr_hwcr;
|
||||
|
||||
/* pv related cpuid info */
|
||||
struct {
|
||||
/*
|
||||
* value of the eax register in the KVM_CPUID_FEATURES CPUID
|
||||
* leaf.
|
||||
*/
|
||||
u32 features;
|
||||
|
||||
/*
|
||||
* indicates whether pv emulation should be disabled if features
|
||||
* are not present in the guest's cpuid
|
||||
*/
|
||||
bool enforce;
|
||||
} pv_cpuid;
|
||||
};
|
||||
|
||||
struct kvm_lpage_info {
|
||||
@ -860,6 +876,13 @@ struct kvm_hv {
|
||||
struct kvm_hv_syndbg hv_syndbg;
|
||||
};
|
||||
|
||||
struct msr_bitmap_range {
|
||||
u32 flags;
|
||||
u32 nmsrs;
|
||||
u32 base;
|
||||
unsigned long *bitmap;
|
||||
};
|
||||
|
||||
enum kvm_irqchip_mode {
|
||||
KVM_IRQCHIP_NONE,
|
||||
KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */
|
||||
@ -961,8 +984,31 @@ struct kvm_arch {
|
||||
bool guest_can_read_msr_platform_info;
|
||||
bool exception_payload_enabled;
|
||||
|
||||
/* Deflect RDMSR and WRMSR to user space when they trigger a #GP */
|
||||
u32 user_space_msr_mask;
|
||||
|
||||
struct {
|
||||
u8 count;
|
||||
bool default_allow:1;
|
||||
struct msr_bitmap_range ranges[16];
|
||||
} msr_filter;
|
||||
|
||||
struct kvm_pmu_event_filter *pmu_event_filter;
|
||||
struct task_struct *nx_lpage_recovery_thread;
|
||||
|
||||
/*
|
||||
* Whether the TDP MMU is enabled for this VM. This contains a
|
||||
* snapshot of the TDP MMU module parameter from when the VM was
|
||||
* created and remains unchanged for the life of the VM. If this is
|
||||
* true, TDP MMU handler functions will run for various MMU
|
||||
* operations.
|
||||
*/
|
||||
bool tdp_mmu_enabled;
|
||||
|
||||
/* List of struct tdp_mmu_pages being used as roots */
|
||||
struct list_head tdp_mmu_roots;
|
||||
/* List of struct tdp_mmu_pages not being used as roots */
|
||||
struct list_head tdp_mmu_pages;
|
||||
};
|
||||
|
||||
struct kvm_vm_stat {
|
||||
@ -1069,7 +1115,7 @@ struct kvm_x86_ops {
|
||||
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
|
||||
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
|
||||
int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
|
||||
void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
|
||||
int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
|
||||
void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
|
||||
void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
|
||||
void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
|
||||
@ -1143,7 +1189,12 @@ struct kvm_x86_ops {
|
||||
/* Returns actual tsc_offset set in active VMCS */
|
||||
u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
|
||||
|
||||
void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
|
||||
/*
|
||||
* Retrieve somewhat arbitrary exit information. Intended to be used
|
||||
* only from within tracepoints to avoid VMREADs when tracing is off.
|
||||
*/
|
||||
void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
|
||||
u32 *exit_int_info, u32 *exit_int_info_err_code);
|
||||
|
||||
int (*check_intercept)(struct kvm_vcpu *vcpu,
|
||||
struct x86_instruction_info *info,
|
||||
@ -1221,12 +1272,13 @@ struct kvm_x86_ops {
|
||||
|
||||
int (*get_msr_feature)(struct kvm_msr_entry *entry);
|
||||
|
||||
bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu);
|
||||
bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, void *insn, int insn_len);
|
||||
|
||||
bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
|
||||
int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
|
||||
|
||||
void (*migrate_timers)(struct kvm_vcpu *vcpu);
|
||||
void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
|
||||
};
|
||||
|
||||
struct kvm_x86_nested_ops {
|
||||
@ -1238,7 +1290,7 @@ struct kvm_x86_nested_ops {
|
||||
int (*set_state)(struct kvm_vcpu *vcpu,
|
||||
struct kvm_nested_state __user *user_kvm_nested_state,
|
||||
struct kvm_nested_state *kvm_state);
|
||||
bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
|
||||
bool (*get_nested_state_pages)(struct kvm_vcpu *vcpu);
|
||||
int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
|
||||
|
||||
int (*enable_evmcs)(struct kvm_vcpu *vcpu,
|
||||
@ -1612,8 +1664,8 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
|
||||
unsigned long ipi_bitmap_high, u32 min,
|
||||
unsigned long icr, int op_64_bit);
|
||||
|
||||
void kvm_define_shared_msr(unsigned index, u32 msr);
|
||||
int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
|
||||
void kvm_define_user_return_msr(unsigned index, u32 msr);
|
||||
int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
|
||||
|
||||
u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc);
|
||||
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
|
||||
|
@ -3,10 +3,54 @@
|
||||
#define __SVM_H
|
||||
|
||||
#include <uapi/asm/svm.h>
|
||||
#include <uapi/asm/kvm.h>
|
||||
|
||||
/*
|
||||
* 32-bit intercept words in the VMCB Control Area, starting
|
||||
* at Byte offset 000h.
|
||||
*/
|
||||
|
||||
enum intercept_words {
|
||||
INTERCEPT_CR = 0,
|
||||
INTERCEPT_DR,
|
||||
INTERCEPT_EXCEPTION,
|
||||
INTERCEPT_WORD3,
|
||||
INTERCEPT_WORD4,
|
||||
INTERCEPT_WORD5,
|
||||
MAX_INTERCEPT,
|
||||
};
|
||||
|
||||
enum {
|
||||
INTERCEPT_INTR,
|
||||
/* Byte offset 000h (word 0) */
|
||||
INTERCEPT_CR0_READ = 0,
|
||||
INTERCEPT_CR3_READ = 3,
|
||||
INTERCEPT_CR4_READ = 4,
|
||||
INTERCEPT_CR8_READ = 8,
|
||||
INTERCEPT_CR0_WRITE = 16,
|
||||
INTERCEPT_CR3_WRITE = 16 + 3,
|
||||
INTERCEPT_CR4_WRITE = 16 + 4,
|
||||
INTERCEPT_CR8_WRITE = 16 + 8,
|
||||
/* Byte offset 004h (word 1) */
|
||||
INTERCEPT_DR0_READ = 32,
|
||||
INTERCEPT_DR1_READ,
|
||||
INTERCEPT_DR2_READ,
|
||||
INTERCEPT_DR3_READ,
|
||||
INTERCEPT_DR4_READ,
|
||||
INTERCEPT_DR5_READ,
|
||||
INTERCEPT_DR6_READ,
|
||||
INTERCEPT_DR7_READ,
|
||||
INTERCEPT_DR0_WRITE = 48,
|
||||
INTERCEPT_DR1_WRITE,
|
||||
INTERCEPT_DR2_WRITE,
|
||||
INTERCEPT_DR3_WRITE,
|
||||
INTERCEPT_DR4_WRITE,
|
||||
INTERCEPT_DR5_WRITE,
|
||||
INTERCEPT_DR6_WRITE,
|
||||
INTERCEPT_DR7_WRITE,
|
||||
/* Byte offset 008h (word 2) */
|
||||
INTERCEPT_EXCEPTION_OFFSET = 64,
|
||||
/* Byte offset 00Ch (word 3) */
|
||||
INTERCEPT_INTR = 96,
|
||||
INTERCEPT_NMI,
|
||||
INTERCEPT_SMI,
|
||||
INTERCEPT_INIT,
|
||||
@ -38,7 +82,8 @@ enum {
|
||||
INTERCEPT_TASK_SWITCH,
|
||||
INTERCEPT_FERR_FREEZE,
|
||||
INTERCEPT_SHUTDOWN,
|
||||
INTERCEPT_VMRUN,
|
||||
/* Byte offset 010h (word 4) */
|
||||
INTERCEPT_VMRUN = 128,
|
||||
INTERCEPT_VMMCALL,
|
||||
INTERCEPT_VMLOAD,
|
||||
INTERCEPT_VMSAVE,
|
||||
@ -53,15 +98,18 @@ enum {
|
||||
INTERCEPT_MWAIT_COND,
|
||||
INTERCEPT_XSETBV,
|
||||
INTERCEPT_RDPRU,
|
||||
/* Byte offset 014h (word 5) */
|
||||
INTERCEPT_INVLPGB = 160,
|
||||
INTERCEPT_INVLPGB_ILLEGAL,
|
||||
INTERCEPT_INVPCID,
|
||||
INTERCEPT_MCOMMIT,
|
||||
INTERCEPT_TLBSYNC,
|
||||
};
|
||||
|
||||
|
||||
struct __attribute__ ((__packed__)) vmcb_control_area {
|
||||
u32 intercept_cr;
|
||||
u32 intercept_dr;
|
||||
u32 intercept_exceptions;
|
||||
u64 intercept;
|
||||
u8 reserved_1[40];
|
||||
u32 intercepts[MAX_INTERCEPT];
|
||||
u32 reserved_1[15 - MAX_INTERCEPT];
|
||||
u16 pause_filter_thresh;
|
||||
u16 pause_filter_count;
|
||||
u64 iopm_base_pa;
|
||||
@ -287,32 +335,6 @@ struct vmcb {
|
||||
#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
|
||||
#define SVM_SELECTOR_CODE_MASK (1 << 3)
|
||||
|
||||
#define INTERCEPT_CR0_READ 0
|
||||
#define INTERCEPT_CR3_READ 3
|
||||
#define INTERCEPT_CR4_READ 4
|
||||
#define INTERCEPT_CR8_READ 8
|
||||
#define INTERCEPT_CR0_WRITE (16 + 0)
|
||||
#define INTERCEPT_CR3_WRITE (16 + 3)
|
||||
#define INTERCEPT_CR4_WRITE (16 + 4)
|
||||
#define INTERCEPT_CR8_WRITE (16 + 8)
|
||||
|
||||
#define INTERCEPT_DR0_READ 0
|
||||
#define INTERCEPT_DR1_READ 1
|
||||
#define INTERCEPT_DR2_READ 2
|
||||
#define INTERCEPT_DR3_READ 3
|
||||
#define INTERCEPT_DR4_READ 4
|
||||
#define INTERCEPT_DR5_READ 5
|
||||
#define INTERCEPT_DR6_READ 6
|
||||
#define INTERCEPT_DR7_READ 7
|
||||
#define INTERCEPT_DR0_WRITE (16 + 0)
|
||||
#define INTERCEPT_DR1_WRITE (16 + 1)
|
||||
#define INTERCEPT_DR2_WRITE (16 + 2)
|
||||
#define INTERCEPT_DR3_WRITE (16 + 3)
|
||||
#define INTERCEPT_DR4_WRITE (16 + 4)
|
||||
#define INTERCEPT_DR5_WRITE (16 + 5)
|
||||
#define INTERCEPT_DR6_WRITE (16 + 6)
|
||||
#define INTERCEPT_DR7_WRITE (16 + 7)
|
||||
|
||||
#define SVM_EVTINJ_VEC_MASK 0xff
|
||||
|
||||
#define SVM_EVTINJ_TYPE_SHIFT 8
|
||||
|
@@ -52,7 +52,7 @@
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES VMCS_CONTROL_BIT(VIRT_APIC_ACCESSES)
#define SECONDARY_EXEC_ENABLE_EPT VMCS_CONTROL_BIT(EPT)
#define SECONDARY_EXEC_DESC VMCS_CONTROL_BIT(DESC_EXITING)
#define SECONDARY_EXEC_RDTSCP VMCS_CONTROL_BIT(RDTSCP)
#define SECONDARY_EXEC_ENABLE_RDTSCP VMCS_CONTROL_BIT(RDTSCP)
#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE VMCS_CONTROL_BIT(VIRTUAL_X2APIC)
#define SECONDARY_EXEC_ENABLE_VPID VMCS_CONTROL_BIT(VPID)
#define SECONDARY_EXEC_WBINVD_EXITING VMCS_CONTROL_BIT(WBINVD_EXITING)
@@ -192,6 +192,26 @@ struct kvm_msr_list {
__u32 indices[0];
};

/* Maximum size of any access bitmap in bytes */
#define KVM_MSR_FILTER_MAX_BITMAP_SIZE 0x600

/* for KVM_X86_SET_MSR_FILTER */
struct kvm_msr_filter_range {
#define KVM_MSR_FILTER_READ (1 << 0)
#define KVM_MSR_FILTER_WRITE (1 << 1)
__u32 flags;
__u32 nmsrs; /* number of msrs in bitmap */
__u32 base; /* MSR index the bitmap starts at */
__u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */
};

#define KVM_MSR_FILTER_MAX_RANGES 16
struct kvm_msr_filter {
#define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0)
#define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0)
__u32 flags;
struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES];
};

struct kvm_cpuid_entry {
__u32 function;
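The new uapi above is consumed by the VMM via the KVM_X86_SET_MSR_FILTER vm ioctl. A hypothetical userspace sketch (not part of this series; vm_fd and the chosen base index are assumptions) installing a deny-by-default filter with one fully allowed range:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Deny all MSRs by default; allow guest reads and writes of the MSRs
 * covered by one bitmap range starting at 'base'. vm_fd is an open VM fd. */
static int set_msr_filter(int vm_fd, __u32 base)
{
	static __u8 bitmap[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
	struct kvm_msr_filter filter;

	memset(&filter, 0, sizeof(filter));
	memset(bitmap, 0xff, sizeof(bitmap));		/* 1 = allow */

	filter.flags = KVM_MSR_FILTER_DEFAULT_DENY;
	filter.ranges[0].flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE;
	filter.ranges[0].base = base;
	filter.ranges[0].nmsrs = 8 * sizeof(bitmap);	/* one bit per MSR */
	filter.ranges[0].bitmap = bitmap;

	return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
}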
@@ -77,6 +77,7 @@
#define SVM_EXIT_MWAIT_COND 0x08c
#define SVM_EXIT_XSETBV 0x08d
#define SVM_EXIT_RDPRU 0x08e
#define SVM_EXIT_INVPCID 0x0a2
#define SVM_EXIT_NPF 0x400
#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402

@@ -182,6 +183,7 @@
{ SVM_EXIT_MONITOR, "monitor" }, \
{ SVM_EXIT_MWAIT, "mwait" }, \
{ SVM_EXIT_XSETBV, "xsetbv" }, \
{ SVM_EXIT_INVPCID, "invpcid" }, \
{ SVM_EXIT_NPF, "npf" }, \
{ SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \
{ SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \
@@ -975,7 +975,7 @@ void arch_haltpoll_disable(unsigned int cpu)
if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
return;

/* Enable guest halt poll disables host halt poll */
/* Disable guest halt poll enables host halt poll */
smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1);
}
EXPORT_SYMBOL_GPL(arch_haltpoll_disable);

@@ -66,6 +66,7 @@ config KVM_WERROR
default y if X86_64 && !KASAN
# We use the dependency on !COMPILE_TEST to not be enabled
# blindly in allmodconfig or allyesconfig configurations
depends on KVM
depends on (X86_64 && !KASAN) || !COMPILE_TEST
depends on EXPERT
help

@@ -15,9 +15,11 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
mmu/spte.o mmu/tdp_iter.o mmu/tdp_mmu.o

kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o

obj-$(CONFIG_KVM) += kvm.o
@ -54,7 +54,24 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
|
||||
|
||||
#define F feature_bit
|
||||
|
||||
static int kvm_check_cpuid(struct kvm_vcpu *vcpu)
|
||||
static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
|
||||
struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
|
||||
{
|
||||
struct kvm_cpuid_entry2 *e;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < nent; i++) {
|
||||
e = &entries[i];
|
||||
|
||||
if (e->function == function && (e->index == index ||
|
||||
!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX)))
|
||||
return e;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent)
|
||||
{
|
||||
struct kvm_cpuid_entry2 *best;
|
||||
|
||||
@ -62,7 +79,7 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu)
|
||||
* The existing code assumes virtual address is 48-bit or 57-bit in the
|
||||
* canonical address checks; exit if it is ever changed.
|
||||
*/
|
||||
best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
|
||||
best = cpuid_entry2_find(entries, nent, 0x80000008, 0);
|
||||
if (best) {
|
||||
int vaddr_bits = (best->eax & 0xff00) >> 8;
|
||||
|
||||
@ -107,6 +124,13 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
|
||||
(best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
|
||||
best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
|
||||
|
||||
/*
|
||||
* save the feature bitmap to avoid cpuid lookup for every PV
|
||||
* operation
|
||||
*/
|
||||
if (best)
|
||||
vcpu->arch.pv_cpuid.features = best->eax;
|
||||
|
||||
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
|
||||
best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
|
||||
if (best)
|
||||
@ -121,8 +145,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
|
||||
struct kvm_lapic *apic = vcpu->arch.apic;
|
||||
struct kvm_cpuid_entry2 *best;
|
||||
|
||||
kvm_x86_ops.vcpu_after_set_cpuid(vcpu);
|
||||
|
||||
best = kvm_find_cpuid_entry(vcpu, 1, 0);
|
||||
if (best && apic) {
|
||||
if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
|
||||
@ -146,7 +168,9 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
|
||||
kvm_pmu_refresh(vcpu);
|
||||
vcpu->arch.cr4_guest_rsvd_bits =
|
||||
__cr4_reserved_bits(guest_cpuid_has, vcpu);
|
||||
kvm_x86_ops.update_exception_bitmap(vcpu);
|
||||
|
||||
/* Invoke the vendor callback only after the above state is updated. */
|
||||
kvm_x86_ops.vcpu_after_set_cpuid(vcpu);
|
||||
}
|
||||
|
||||
static int is_efer_nx(void)
|
||||
@ -186,7 +210,6 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
|
||||
not_found:
|
||||
return 36;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr);
|
||||
|
||||
/* when an old userspace process fills a new kernel module */
|
||||
int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
|
||||
@ -194,46 +217,53 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
|
||||
struct kvm_cpuid_entry __user *entries)
|
||||
{
|
||||
int r, i;
|
||||
struct kvm_cpuid_entry *cpuid_entries = NULL;
|
||||
struct kvm_cpuid_entry *e = NULL;
|
||||
struct kvm_cpuid_entry2 *e2 = NULL;
|
||||
|
||||
r = -E2BIG;
|
||||
if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
|
||||
goto out;
|
||||
return -E2BIG;
|
||||
|
||||
if (cpuid->nent) {
|
||||
cpuid_entries = vmemdup_user(entries,
|
||||
array_size(sizeof(struct kvm_cpuid_entry),
|
||||
cpuid->nent));
|
||||
if (IS_ERR(cpuid_entries)) {
|
||||
r = PTR_ERR(cpuid_entries);
|
||||
goto out;
|
||||
e = vmemdup_user(entries, array_size(sizeof(*e), cpuid->nent));
|
||||
if (IS_ERR(e))
|
||||
return PTR_ERR(e);
|
||||
|
||||
e2 = kvmalloc_array(cpuid->nent, sizeof(*e2), GFP_KERNEL_ACCOUNT);
|
||||
if (!e2) {
|
||||
r = -ENOMEM;
|
||||
goto out_free_cpuid;
|
||||
}
|
||||
}
|
||||
for (i = 0; i < cpuid->nent; i++) {
|
||||
vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
|
||||
vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
|
||||
vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
|
||||
vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
|
||||
vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
|
||||
vcpu->arch.cpuid_entries[i].index = 0;
|
||||
vcpu->arch.cpuid_entries[i].flags = 0;
|
||||
vcpu->arch.cpuid_entries[i].padding[0] = 0;
|
||||
vcpu->arch.cpuid_entries[i].padding[1] = 0;
|
||||
vcpu->arch.cpuid_entries[i].padding[2] = 0;
|
||||
e2[i].function = e[i].function;
|
||||
e2[i].eax = e[i].eax;
|
||||
e2[i].ebx = e[i].ebx;
|
||||
e2[i].ecx = e[i].ecx;
|
||||
e2[i].edx = e[i].edx;
|
||||
e2[i].index = 0;
|
||||
e2[i].flags = 0;
|
||||
e2[i].padding[0] = 0;
|
||||
e2[i].padding[1] = 0;
|
||||
e2[i].padding[2] = 0;
|
||||
}
|
||||
vcpu->arch.cpuid_nent = cpuid->nent;
|
||||
r = kvm_check_cpuid(vcpu);
|
||||
|
||||
r = kvm_check_cpuid(e2, cpuid->nent);
|
||||
if (r) {
|
||||
vcpu->arch.cpuid_nent = 0;
|
||||
kvfree(cpuid_entries);
|
||||
goto out;
|
||||
kvfree(e2);
|
||||
goto out_free_cpuid;
|
||||
}
|
||||
|
||||
kvfree(vcpu->arch.cpuid_entries);
|
||||
vcpu->arch.cpuid_entries = e2;
|
||||
vcpu->arch.cpuid_nent = cpuid->nent;
|
||||
|
||||
cpuid_fix_nx_cap(vcpu);
|
||||
kvm_update_cpuid_runtime(vcpu);
|
||||
kvm_vcpu_after_set_cpuid(vcpu);
|
||||
|
||||
kvfree(cpuid_entries);
|
||||
out:
|
||||
out_free_cpuid:
|
||||
kvfree(e);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
@ -241,26 +271,32 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
|
||||
struct kvm_cpuid2 *cpuid,
|
||||
struct kvm_cpuid_entry2 __user *entries)
|
||||
{
|
||||
struct kvm_cpuid_entry2 *e2 = NULL;
|
||||
int r;
|
||||
|
||||
r = -E2BIG;
|
||||
if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
|
||||
goto out;
|
||||
r = -EFAULT;
|
||||
if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
|
||||
cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
|
||||
goto out;
|
||||
vcpu->arch.cpuid_nent = cpuid->nent;
|
||||
r = kvm_check_cpuid(vcpu);
|
||||
if (r) {
|
||||
vcpu->arch.cpuid_nent = 0;
|
||||
goto out;
|
||||
return -E2BIG;
|
||||
|
||||
if (cpuid->nent) {
|
||||
e2 = vmemdup_user(entries, array_size(sizeof(*e2), cpuid->nent));
|
||||
if (IS_ERR(e2))
|
||||
return PTR_ERR(e2);
|
||||
}
|
||||
|
||||
r = kvm_check_cpuid(e2, cpuid->nent);
|
||||
if (r) {
|
||||
kvfree(e2);
|
||||
return r;
|
||||
}
|
||||
|
||||
kvfree(vcpu->arch.cpuid_entries);
|
||||
vcpu->arch.cpuid_entries = e2;
|
||||
vcpu->arch.cpuid_nent = cpuid->nent;
|
||||
|
||||
kvm_update_cpuid_runtime(vcpu);
|
||||
kvm_vcpu_after_set_cpuid(vcpu);
|
||||
out:
|
||||
return r;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
|
||||
@ -941,17 +977,8 @@ out_free:
|
||||
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
|
||||
u32 function, u32 index)
|
||||
{
|
||||
struct kvm_cpuid_entry2 *e;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
|
||||
e = &vcpu->arch.cpuid_entries[i];
|
||||
|
||||
if (e->function == function && (e->index == index ||
|
||||
!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX)))
|
||||
return e;
|
||||
}
|
||||
return NULL;
|
||||
return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
|
||||
function, index);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
|
||||
|
||||
|
@@ -5,6 +5,7 @@
#include "x86.h"
#include <asm/cpu.h>
#include <asm/processor.h>
#include <uapi/asm/kvm_para.h>

extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
void kvm_set_cpu_caps(void);

@@ -34,6 +35,11 @@ static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
return vcpu->arch.maxphyaddr;
}

static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu)));
}

struct cpuid_reg {
u32 function;
u32 index;

@@ -308,4 +314,13 @@ static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
}

static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu,
unsigned int kvm_feature)
{
if (!vcpu->arch.pv_cpuid.enforce)
return true;

return vcpu->arch.pv_cpuid.features & (1u << kvm_feature);
}

#endif
@@ -3606,7 +3606,7 @@ static int em_rdpid(struct x86_emulate_ctxt *ctxt)
u64 tsc_aux = 0;

if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
return emulate_gp(ctxt, 0);
return emulate_ud(ctxt);
ctxt->dst.val = tsc_aux;
return X86EMUL_CONTINUE;
}

@@ -3701,21 +3701,35 @@ static int em_dr_write(struct x86_emulate_ctxt *ctxt)
static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
{
u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX);
u64 msr_data;
int r;

msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
| ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data))
r = ctxt->ops->set_msr(ctxt, msr_index, msr_data);

if (r == X86EMUL_IO_NEEDED)
return r;

if (r > 0)
return emulate_gp(ctxt, 0);

return X86EMUL_CONTINUE;
return r < 0 ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
}

static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
{
u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX);
u64 msr_data;
int r;

if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data))
r = ctxt->ops->get_msr(ctxt, msr_index, &msr_data);

if (r == X86EMUL_IO_NEEDED)
return r;

if (r)
return emulate_gp(ctxt, 0);

*reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
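With set_msr/get_msr now allowed to return X86EMUL_IO_NEEDED, an emulated MSR access caught by the filter can be completed in userspace instead of unconditionally injecting #GP. A hedged sketch of the VMM side, assuming KVM_CAP_X86_USER_SPACE_MSR has been enabled on the VM and "run" is the vCPU's mmapped struct kvm_run:

#include <linux/kvm.h>

/* Handle the userspace MSR exits in a VMM run loop (illustrative only). */
static void handle_user_msr_exit(struct kvm_run *run)
{
	switch (run->exit_reason) {
	case KVM_EXIT_X86_RDMSR:
		/* Supply a value for run->msr.index, or set .error to have
		 * KVM inject #GP into the guest instead. */
		run->msr.data = 0;
		run->msr.error = 0;
		break;
	case KVM_EXIT_X86_WRMSR:
		/* Consume run->msr.data for run->msr.index, or flag an error. */
		run->msr.error = 0;
		break;
	}
}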
@@ -633,6 +633,11 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
{
union hv_stimer_config new_config = {.as_uint64 = config},
old_config = {.as_uint64 = stimer->config.as_uint64};
struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);

if (!synic->active && !host)
return 1;

trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, config, host);

@@ -652,6 +657,12 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
bool host)
{
struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);

if (!synic->active && !host)
return 1;

trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, count, host);
@@ -7,7 +7,7 @@
#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
#define KVM_POSSIBLE_CR4_GUEST_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
| X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE | X86_CR4_TSD)
| X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE)

#define BUILD_KVM_GPR_ACCESSORS(lname, uname) \
static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\
@ -310,6 +310,12 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
|
||||
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
|
||||
}
|
||||
|
||||
static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val)
|
||||
{
|
||||
kvm_lapic_set_reg(apic, APIC_DFR, val);
|
||||
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
|
||||
}
|
||||
|
||||
static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
|
||||
{
|
||||
return ((id >> 4) << 16) | (1 << (id & 0xf));
|
||||
@ -488,6 +494,12 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
|
||||
}
|
||||
}
|
||||
|
||||
void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec)
|
||||
{
|
||||
apic_clear_irr(vec, vcpu->arch.apic);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_apic_clear_irr);
|
||||
|
||||
static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
|
||||
{
|
||||
struct kvm_vcpu *vcpu;
|
||||
@ -1576,9 +1588,6 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
|
||||
struct kvm_lapic *apic = vcpu->arch.apic;
|
||||
u64 guest_tsc, tsc_deadline;
|
||||
|
||||
if (apic->lapic_timer.expired_tscdeadline == 0)
|
||||
return;
|
||||
|
||||
tsc_deadline = apic->lapic_timer.expired_tscdeadline;
|
||||
apic->lapic_timer.expired_tscdeadline = 0;
|
||||
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
|
||||
@ -1593,7 +1602,10 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
|
||||
|
||||
void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (lapic_timer_int_injected(vcpu))
|
||||
if (lapic_in_kernel(vcpu) &&
|
||||
vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
|
||||
vcpu->arch.apic->lapic_timer.timer_advance_ns &&
|
||||
lapic_timer_int_injected(vcpu))
|
||||
__kvm_wait_lapic_expire(vcpu);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire);
|
||||
@ -1629,14 +1641,15 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
|
||||
}
|
||||
|
||||
if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
|
||||
if (apic->lapic_timer.timer_advance_ns)
|
||||
__kvm_wait_lapic_expire(vcpu);
|
||||
kvm_wait_lapic_expire(vcpu);
|
||||
kvm_apic_inject_pending_timer_irqs(apic);
|
||||
return;
|
||||
}
|
||||
|
||||
atomic_inc(&apic->lapic_timer.pending);
|
||||
kvm_set_pending_timer(vcpu);
|
||||
kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
|
||||
if (from_timer_fn)
|
||||
kvm_vcpu_kick(vcpu);
|
||||
}
|
||||
|
||||
static void start_sw_tscdeadline(struct kvm_lapic *apic)
|
||||
@ -1984,10 +1997,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
|
||||
break;
|
||||
|
||||
case APIC_DFR:
|
||||
if (!apic_x2apic_mode(apic)) {
|
||||
kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
|
||||
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
|
||||
} else
|
||||
if (!apic_x2apic_mode(apic))
|
||||
kvm_apic_set_dfr(apic, val | 0x0FFFFFFF);
|
||||
else
|
||||
ret = 1;
|
||||
break;
|
||||
|
||||
@ -2183,8 +2195,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct kvm_lapic *apic = vcpu->arch.apic;
|
||||
|
||||
if (!lapic_in_kernel(vcpu) ||
|
||||
!apic_lvtt_tscdeadline(apic))
|
||||
if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic))
|
||||
return 0;
|
||||
|
||||
return apic->lapic_timer.tscdeadline;
|
||||
@ -2194,8 +2205,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
|
||||
{
|
||||
struct kvm_lapic *apic = vcpu->arch.apic;
|
||||
|
||||
if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) ||
|
||||
apic_lvtt_period(apic))
|
||||
if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic))
|
||||
return;
|
||||
|
||||
hrtimer_cancel(&apic->lapic_timer.timer);
|
||||
@ -2303,7 +2313,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
|
||||
SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
|
||||
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
|
||||
|
||||
kvm_lapic_set_reg(apic, APIC_DFR, 0xffffffffU);
|
||||
kvm_apic_set_dfr(apic, 0xffffffffU);
|
||||
apic_set_spiv(apic, 0xff);
|
||||
kvm_lapic_set_reg(apic, APIC_TASKPRI, 0);
|
||||
if (!apic_x2apic_mode(apic))
|
||||
@ -2461,6 +2471,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
|
||||
__apic_update_ppr(apic, &ppr);
|
||||
return apic_has_interrupt_for_ppr(apic, ppr);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_apic_has_interrupt);
|
||||
|
||||
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
|
@ -89,6 +89,7 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
|
||||
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
|
||||
int shorthand, unsigned int dest, int dest_mode);
|
||||
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
|
||||
void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec);
|
||||
bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr);
|
||||
bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr);
|
||||
void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
|
||||
|
@@ -155,11 +155,6 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
}

static inline bool kvm_mmu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu)));
}

/*
* Check if a given access (described through the I/D, W/R and U/S bits of a
* page fault error code pfec) causes a permission fault with the given PTE

(File diff suppressed because it is too large.)
@ -3,9 +3,23 @@
|
||||
#define __KVM_X86_MMU_INTERNAL_H
|
||||
|
||||
#include <linux/types.h>
|
||||
|
||||
#include <linux/kvm_host.h>
|
||||
#include <asm/kvm_host.h>
|
||||
|
||||
#undef MMU_DEBUG
|
||||
|
||||
#ifdef MMU_DEBUG
|
||||
extern bool dbg;
|
||||
|
||||
#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
|
||||
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
|
||||
#define MMU_WARN_ON(x) WARN_ON(x)
|
||||
#else
|
||||
#define pgprintk(x...) do { } while (0)
|
||||
#define rmap_printk(x...) do { } while (0)
|
||||
#define MMU_WARN_ON(x) do { } while (0)
|
||||
#endif
|
||||
|
||||
struct kvm_mmu_page {
|
||||
struct list_head link;
|
||||
struct hlist_node hash_link;
|
||||
@ -41,8 +55,12 @@ struct kvm_mmu_page {
|
||||
|
||||
/* Number of writes since the last time traversal visited this page. */
|
||||
atomic_t write_flooding_count;
|
||||
|
||||
bool tdp_mmu_page;
|
||||
};
|
||||
|
||||
extern struct kmem_cache *mmu_page_header_cache;
|
||||
|
||||
static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
|
||||
{
|
||||
struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
|
||||
@ -55,9 +73,77 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
|
||||
return to_shadow_page(__pa(sptep));
|
||||
}
|
||||
|
||||
static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
/*
|
||||
* When using the EPT page-modification log, the GPAs in the log
|
||||
* would come from L2 rather than L1. Therefore, we need to rely
|
||||
* on write protection to record dirty pages. This also bypasses
|
||||
* PML, since writes now result in a vmexit.
|
||||
*/
|
||||
return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
|
||||
}
|
||||
|
||||
bool is_nx_huge_page_enabled(void);
|
||||
bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
|
||||
bool can_unsync);
|
||||
|
||||
void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
|
||||
void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
|
||||
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
|
||||
struct kvm_memory_slot *slot, u64 gfn);
|
||||
void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
|
||||
u64 start_gfn, u64 pages);
|
||||
|
||||
static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp)
|
||||
{
|
||||
BUG_ON(!sp->root_count);
|
||||
lockdep_assert_held(&kvm->mmu_lock);
|
||||
|
||||
++sp->root_count;
|
||||
}
|
||||
|
||||
static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp)
|
||||
{
|
||||
lockdep_assert_held(&kvm->mmu_lock);
|
||||
--sp->root_count;
|
||||
|
||||
return !sp->root_count;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault().
|
||||
*
|
||||
* RET_PF_RETRY: let CPU fault again on the address.
|
||||
* RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
|
||||
* RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
|
||||
* RET_PF_FIXED: The faulting entry has been fixed.
|
||||
* RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
|
||||
*/
|
||||
enum {
|
||||
RET_PF_RETRY = 0,
|
||||
RET_PF_EMULATE,
|
||||
RET_PF_INVALID,
|
||||
RET_PF_FIXED,
|
||||
RET_PF_SPURIOUS,
|
||||
};
|
||||
|
||||
/* Bits which may be returned by set_spte() */
|
||||
#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
|
||||
#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
|
||||
#define SET_SPTE_SPURIOUS BIT(2)
|
||||
|
||||
int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
|
||||
int max_level, kvm_pfn_t *pfnp,
|
||||
bool huge_page_disallowed, int *req_level);
|
||||
void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
|
||||
kvm_pfn_t *pfnp, int *goal_levelp);
|
||||
|
||||
bool is_nx_huge_page_enabled(void);
|
||||
|
||||
void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
|
||||
|
||||
void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
|
||||
void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
|
||||
|
||||
#endif /* __KVM_X86_MMU_INTERNAL_H */
|
||||
|
@ -202,8 +202,8 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
|
||||
|
||||
TRACE_EVENT(
|
||||
mark_mmio_spte,
|
||||
TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen),
|
||||
TP_ARGS(sptep, gfn, access, gen),
|
||||
TP_PROTO(u64 *sptep, gfn_t gfn, u64 spte),
|
||||
TP_ARGS(sptep, gfn, spte),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(void *, sptep)
|
||||
@ -215,8 +215,8 @@ TRACE_EVENT(
|
||||
TP_fast_assign(
|
||||
__entry->sptep = sptep;
|
||||
__entry->gfn = gfn;
|
||||
__entry->access = access;
|
||||
__entry->gen = gen;
|
||||
__entry->access = spte & ACC_ALL;
|
||||
__entry->gen = get_mmio_spte_generation(spte);
|
||||
),
|
||||
|
||||
TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep,
|
||||
@ -244,14 +244,11 @@ TRACE_EVENT(
|
||||
__entry->access)
|
||||
);
|
||||
|
||||
#define __spte_satisfied(__spte) \
|
||||
(__entry->retry && is_writable_pte(__entry->__spte))
|
||||
|
||||
TRACE_EVENT(
|
||||
fast_page_fault,
|
||||
TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code,
|
||||
u64 *sptep, u64 old_spte, bool retry),
|
||||
TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry),
|
||||
u64 *sptep, u64 old_spte, int ret),
|
||||
TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, ret),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(int, vcpu_id)
|
||||
@ -260,7 +257,7 @@ TRACE_EVENT(
|
||||
__field(u64 *, sptep)
|
||||
__field(u64, old_spte)
|
||||
__field(u64, new_spte)
|
||||
__field(bool, retry)
|
||||
__field(int, ret)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
@ -270,7 +267,7 @@ TRACE_EVENT(
|
||||
__entry->sptep = sptep;
|
||||
__entry->old_spte = old_spte;
|
||||
__entry->new_spte = *sptep;
|
||||
__entry->retry = retry;
|
||||
__entry->ret = ret;
|
||||
),
|
||||
|
||||
TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx"
|
||||
@ -278,7 +275,7 @@ TRACE_EVENT(
|
||||
__entry->cr2_or_gpa, __print_flags(__entry->error_code, "|",
|
||||
kvm_mmu_trace_pferr_flags), __entry->sptep,
|
||||
__entry->old_spte, __entry->new_spte,
|
||||
__spte_satisfied(old_spte), __spte_satisfied(new_spte)
|
||||
__entry->ret == RET_PF_SPURIOUS, __entry->ret == RET_PF_FIXED
|
||||
)
|
||||
);
|
||||
|
||||
|
@ -550,7 +550,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
|
||||
* we call mmu_set_spte() with host_writable = true because
|
||||
* pte_prefetch_gfn_to_pfn always gets a writable pfn.
|
||||
*/
|
||||
mmu_set_spte(vcpu, spte, pte_access, 0, PG_LEVEL_4K, gfn, pfn,
|
||||
mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn,
|
||||
true, true);
|
||||
|
||||
kvm_release_pfn_clean(pfn);
|
||||
@ -625,15 +625,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
|
||||
* emulate this operation, return 1 to indicate this case.
|
||||
*/
|
||||
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
|
||||
struct guest_walker *gw,
|
||||
int write_fault, int max_level,
|
||||
kvm_pfn_t pfn, bool map_writable, bool prefault,
|
||||
bool lpage_disallowed)
|
||||
struct guest_walker *gw, u32 error_code,
|
||||
int max_level, kvm_pfn_t pfn, bool map_writable,
|
||||
bool prefault)
|
||||
{
|
||||
bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
|
||||
bool write_fault = error_code & PFERR_WRITE_MASK;
|
||||
bool exec = error_code & PFERR_FETCH_MASK;
|
||||
bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
|
||||
struct kvm_mmu_page *sp = NULL;
|
||||
struct kvm_shadow_walk_iterator it;
|
||||
unsigned direct_access, access = gw->pt_access;
|
||||
int top_level, hlevel, ret;
|
||||
int top_level, level, req_level, ret;
|
||||
gfn_t base_gfn = gw->gfn;
|
||||
|
||||
direct_access = gw->pte_access;
|
||||
@ -679,7 +682,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
|
||||
link_shadow_page(vcpu, it.sptep, sp);
|
||||
}
|
||||
|
||||
hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn);
|
||||
level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn,
|
||||
huge_page_disallowed, &req_level);
|
||||
|
||||
trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
|
||||
|
||||
@ -690,10 +694,12 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
|
||||
* We cannot overwrite existing page tables with an NX
|
||||
* large page, as the leaf could be executable.
|
||||
*/
|
||||
disallowed_hugepage_adjust(it, gw->gfn, &pfn, &hlevel);
|
||||
if (nx_huge_page_workaround_enabled)
|
||||
disallowed_hugepage_adjust(*it.sptep, gw->gfn, it.level,
|
||||
&pfn, &level);
|
||||
|
||||
base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
|
||||
if (it.level == hlevel)
|
||||
if (it.level == level)
|
||||
break;
|
||||
|
||||
validate_direct_spte(vcpu, it.sptep, direct_access);
|
||||
@ -704,13 +710,16 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
|
||||
sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
|
||||
it.level - 1, true, direct_access);
|
||||
link_shadow_page(vcpu, it.sptep, sp);
|
||||
if (lpage_disallowed)
|
||||
if (huge_page_disallowed && req_level >= it.level)
|
||||
account_huge_nx_page(vcpu->kvm, sp);
|
||||
}
|
||||
}
|
||||
|
||||
ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
|
||||
it.level, base_gfn, pfn, prefault, map_writable);
|
||||
if (ret == RET_PF_SPURIOUS)
|
||||
return ret;
|
||||
|
||||
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
|
||||
++vcpu->stat.pf_fixed;
|
||||
return ret;
|
||||
@ -738,7 +747,7 @@ out_gpte_changed:
|
||||
*/
|
||||
static bool
|
||||
FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
|
||||
struct guest_walker *walker, int user_fault,
|
||||
struct guest_walker *walker, bool user_fault,
|
||||
bool *write_fault_to_shadow_pgtable)
|
||||
{
|
||||
int level;
|
||||
@ -776,15 +785,13 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
|
||||
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
|
||||
bool prefault)
|
||||
{
|
||||
int write_fault = error_code & PFERR_WRITE_MASK;
|
||||
int user_fault = error_code & PFERR_USER_MASK;
|
||||
bool write_fault = error_code & PFERR_WRITE_MASK;
|
||||
bool user_fault = error_code & PFERR_USER_MASK;
|
||||
struct guest_walker walker;
|
||||
int r;
|
||||
kvm_pfn_t pfn;
|
||||
unsigned long mmu_seq;
|
||||
bool map_writable, is_self_change_mapping;
|
||||
bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
|
||||
is_nx_huge_page_enabled();
|
||||
int max_level;
|
||||
|
||||
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
|
||||
@@ -825,7 +832,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
|
||||
is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
|
||||
&walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
|
||||
|
||||
if (lpage_disallowed || is_self_change_mapping)
|
||||
if (is_self_change_mapping)
|
||||
max_level = PG_LEVEL_4K;
|
||||
else
|
||||
max_level = walker.level;
|
||||
@@ -869,8 +876,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
|
||||
r = make_mmu_pages_available(vcpu);
|
||||
if (r)
|
||||
goto out_unlock;
|
||||
r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn,
|
||||
map_writable, prefault, lpage_disallowed);
|
||||
r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn,
|
||||
map_writable, prefault);
|
||||
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
|
||||
|
||||
out_unlock:
|
||||
@@ -895,6 +902,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
|
||||
{
|
||||
struct kvm_shadow_walk_iterator iterator;
|
||||
struct kvm_mmu_page *sp;
|
||||
u64 old_spte;
|
||||
int level;
|
||||
u64 *sptep;
|
||||
|
||||
@@ -917,7 +925,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
|
||||
sptep = iterator.sptep;
|
||||
|
||||
sp = sptep_to_sp(sptep);
|
||||
if (is_last_spte(*sptep, level)) {
|
||||
old_spte = *sptep;
|
||||
if (is_last_spte(old_spte, level)) {
|
||||
pt_element_t gpte;
|
||||
gpa_t pte_gpa;
|
||||
|
||||
@@ -927,7 +936,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
|
||||
pte_gpa = FNAME(get_level1_sp_gpa)(sp);
|
||||
pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
|
||||
|
||||
if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
|
||||
mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
|
||||
if (is_shadow_present_pte(old_spte))
|
||||
kvm_flush_remote_tlbs_with_address(vcpu->kvm,
|
||||
sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
|
||||
|
||||
|
arch/x86/kvm/mmu/spte.c (new file, 318 lines)
@@ -0,0 +1,318 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
* Kernel-based Virtual Machine driver for Linux
|
||||
*
|
||||
* Macros and functions to access KVM PTEs (also known as SPTEs)
|
||||
*
|
||||
* Copyright (C) 2006 Qumranet, Inc.
|
||||
* Copyright 2020 Red Hat, Inc. and/or its affiliates.
|
||||
*/
|
||||
|
||||
|
||||
#include <linux/kvm_host.h>
|
||||
#include "mmu.h"
|
||||
#include "mmu_internal.h"
|
||||
#include "x86.h"
|
||||
#include "spte.h"
|
||||
|
||||
#include <asm/e820/api.h>
|
||||
|
||||
u64 __read_mostly shadow_nx_mask;
|
||||
u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
|
||||
u64 __read_mostly shadow_user_mask;
|
||||
u64 __read_mostly shadow_accessed_mask;
|
||||
u64 __read_mostly shadow_dirty_mask;
|
||||
u64 __read_mostly shadow_mmio_value;
|
||||
u64 __read_mostly shadow_mmio_access_mask;
|
||||
u64 __read_mostly shadow_present_mask;
|
||||
u64 __read_mostly shadow_me_mask;
|
||||
u64 __read_mostly shadow_acc_track_mask;
|
||||
|
||||
u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
|
||||
u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
|
||||
|
||||
u8 __read_mostly shadow_phys_bits;
|
||||
|
||||
static u64 generation_mmio_spte_mask(u64 gen)
|
||||
{
|
||||
u64 mask;
|
||||
|
||||
WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
|
||||
BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
|
||||
|
||||
mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
|
||||
mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
|
||||
return mask;
|
||||
}
|
||||
|
||||
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
|
||||
{
|
||||
u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
|
||||
u64 mask = generation_mmio_spte_mask(gen);
|
||||
u64 gpa = gfn << PAGE_SHIFT;
|
||||
|
||||
access &= shadow_mmio_access_mask;
|
||||
mask |= shadow_mmio_value | access;
|
||||
mask |= gpa | shadow_nonpresent_or_rsvd_mask;
|
||||
mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
|
||||
<< shadow_nonpresent_or_rsvd_mask_len;
|
||||
|
||||
return mask;
|
||||
}
|
||||
|
||||
static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
|
||||
{
|
||||
if (pfn_valid(pfn))
|
||||
return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
|
||||
/*
|
||||
* Some reserved pages, such as those from NVDIMM
|
||||
* DAX devices, are not for MMIO, and can be mapped
|
||||
* with cached memory type for better performance.
|
||||
* However, the above check misconceives those pages
|
||||
* as MMIO, and results in KVM mapping them with UC
|
||||
* memory type, which would hurt the performance.
|
||||
* Therefore, we check the host memory type in addition
|
||||
* and only treat UC/UC-/WC pages as MMIO.
|
||||
*/
|
||||
(!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
|
||||
|
||||
return !e820__mapped_raw_any(pfn_to_hpa(pfn),
|
||||
pfn_to_hpa(pfn + 1) - 1,
|
||||
E820_TYPE_RAM);
|
||||
}
|
||||
|
||||
int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
|
||||
gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative,
|
||||
bool can_unsync, bool host_writable, bool ad_disabled,
|
||||
u64 *new_spte)
|
||||
{
|
||||
u64 spte = 0;
|
||||
int ret = 0;
|
||||
|
||||
if (ad_disabled)
|
||||
spte |= SPTE_AD_DISABLED_MASK;
|
||||
else if (kvm_vcpu_ad_need_write_protect(vcpu))
|
||||
spte |= SPTE_AD_WRPROT_ONLY_MASK;
|
||||
|
||||
/*
|
||||
* For the EPT case, shadow_present_mask is 0 if hardware
|
||||
* supports exec-only page table entries. In that case,
|
||||
* ACC_USER_MASK and shadow_user_mask are used to represent
|
||||
* read access. See FNAME(gpte_access) in paging_tmpl.h.
|
||||
*/
|
||||
spte |= shadow_present_mask;
|
||||
if (!speculative)
|
||||
spte |= spte_shadow_accessed_mask(spte);
|
||||
|
||||
if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
|
||||
is_nx_huge_page_enabled()) {
|
||||
pte_access &= ~ACC_EXEC_MASK;
|
||||
}
|
||||
|
||||
if (pte_access & ACC_EXEC_MASK)
|
||||
spte |= shadow_x_mask;
|
||||
else
|
||||
spte |= shadow_nx_mask;
|
||||
|
||||
if (pte_access & ACC_USER_MASK)
|
||||
spte |= shadow_user_mask;
|
||||
|
||||
if (level > PG_LEVEL_4K)
|
||||
spte |= PT_PAGE_SIZE_MASK;
|
||||
if (tdp_enabled)
|
||||
spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
|
||||
kvm_is_mmio_pfn(pfn));
|
||||
|
||||
if (host_writable)
|
||||
spte |= SPTE_HOST_WRITEABLE;
|
||||
else
|
||||
pte_access &= ~ACC_WRITE_MASK;
|
||||
|
||||
if (!kvm_is_mmio_pfn(pfn))
|
||||
spte |= shadow_me_mask;
|
||||
|
||||
spte |= (u64)pfn << PAGE_SHIFT;
|
||||
|
||||
if (pte_access & ACC_WRITE_MASK) {
|
||||
spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
|
||||
|
||||
/*
|
||||
* Optimization: for pte sync, if spte was writable the hash
|
||||
* lookup is unnecessary (and expensive). Write protection
|
||||
* is responsibility of mmu_get_page / kvm_sync_page.
|
||||
* Same reasoning can be applied to dirty page accounting.
|
||||
*/
|
||||
if (!can_unsync && is_writable_pte(old_spte))
|
||||
goto out;
|
||||
|
||||
if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
|
||||
pgprintk("%s: found shadow page for %llx, marking ro\n",
|
||||
__func__, gfn);
|
||||
ret |= SET_SPTE_WRITE_PROTECTED_PT;
|
||||
pte_access &= ~ACC_WRITE_MASK;
|
||||
spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
|
||||
}
|
||||
}
|
||||
|
||||
if (pte_access & ACC_WRITE_MASK)
|
||||
spte |= spte_shadow_dirty_mask(spte);
|
||||
|
||||
if (speculative)
|
||||
spte = mark_spte_for_access_track(spte);
|
||||
|
||||
out:
|
||||
*new_spte = spte;
|
||||
return ret;
|
||||
}
|
||||
|
||||
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
|
||||
{
|
||||
u64 spte;
|
||||
|
||||
spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
|
||||
shadow_user_mask | shadow_x_mask | shadow_me_mask;
|
||||
|
||||
if (ad_disabled)
|
||||
spte |= SPTE_AD_DISABLED_MASK;
|
||||
else
|
||||
spte |= shadow_accessed_mask;
|
||||
|
||||
return spte;
|
||||
}
|
||||
|
||||
u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
|
||||
{
|
||||
u64 new_spte;
|
||||
|
||||
new_spte = old_spte & ~PT64_BASE_ADDR_MASK;
|
||||
new_spte |= (u64)new_pfn << PAGE_SHIFT;
|
||||
|
||||
new_spte &= ~PT_WRITABLE_MASK;
|
||||
new_spte &= ~SPTE_HOST_WRITEABLE;
|
||||
|
||||
new_spte = mark_spte_for_access_track(new_spte);
|
||||
|
||||
return new_spte;
|
||||
}
|
||||
|
||||
static u8 kvm_get_shadow_phys_bits(void)
|
||||
{
|
||||
/*
|
||||
* boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
|
||||
* in CPU detection code, but the processor treats those reduced bits as
|
||||
* 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
|
||||
* the physical address bits reported by CPUID.
|
||||
*/
|
||||
if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
|
||||
return cpuid_eax(0x80000008) & 0xff;
|
||||
|
||||
/*
|
||||
* Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
|
||||
* custom CPUID. Proceed with whatever the kernel found since these features
|
||||
* aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
|
||||
*/
|
||||
return boot_cpu_data.x86_phys_bits;
|
||||
}
|
||||
|
||||
u64 mark_spte_for_access_track(u64 spte)
|
||||
{
|
||||
if (spte_ad_enabled(spte))
|
||||
return spte & ~shadow_accessed_mask;
|
||||
|
||||
if (is_access_track_spte(spte))
|
||||
return spte;
|
||||
|
||||
/*
|
||||
* Making an Access Tracking PTE will result in removal of write access
|
||||
* from the PTE. So, verify that we will be able to restore the write
|
||||
* access in the fast page fault path later on.
|
||||
*/
|
||||
WARN_ONCE((spte & PT_WRITABLE_MASK) &&
|
||||
!spte_can_locklessly_be_made_writable(spte),
|
||||
"kvm: Writable SPTE is not locklessly dirty-trackable\n");
|
||||
|
||||
WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
|
||||
shadow_acc_track_saved_bits_shift),
|
||||
"kvm: Access Tracking saved bit locations are not zero\n");
|
||||
|
||||
spte |= (spte & shadow_acc_track_saved_bits_mask) <<
|
||||
shadow_acc_track_saved_bits_shift;
|
||||
spte &= ~shadow_acc_track_mask;
|
||||
|
||||
return spte;
|
||||
}
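To make the save/clear above concrete, a worked example with assumed mask values: on VMX with EPT A/D bits disabled the vendor code (outside these hunks) sets shadow_acc_track_mask to the EPT RWX bits, and the saved-bits mask/shift are the constants declared in spte.h later in this diff.
/*
 * Worked example (assumed EPT setup):
 *   shadow_acc_track_mask            = RWX   = 0x7
 *   shadow_acc_track_saved_bits_mask = R | X = 0x5
 *   shadow_acc_track_saved_bits_shift        = 54
 *
 *   spte |= (spte & 0x5) << 54;   R (bit 0) is copied to bit 54, X (bit 2) to bit 56
 *   spte &= ~0x7;                 the entry is now not-present to the hardware walker
 *
 * A later fast page fault can shift bits 54/56 back down to restore the
 * original R/X permissions without taking the MMU lock.
 */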
|
||||
|
||||
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
|
||||
{
|
||||
BUG_ON((u64)(unsigned)access_mask != access_mask);
|
||||
WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
|
||||
WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
|
||||
shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
|
||||
shadow_mmio_access_mask = access_mask;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
|
||||
|
||||
/*
|
||||
* Sets the shadow PTE masks used by the MMU.
|
||||
*
|
||||
* Assumptions:
|
||||
* - Setting either @accessed_mask or @dirty_mask requires setting both
|
||||
* - At least one of @accessed_mask or @acc_track_mask must be set
|
||||
*/
|
||||
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
|
||||
u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
|
||||
u64 acc_track_mask, u64 me_mask)
|
||||
{
|
||||
BUG_ON(!dirty_mask != !accessed_mask);
|
||||
BUG_ON(!accessed_mask && !acc_track_mask);
|
||||
BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
|
||||
|
||||
shadow_user_mask = user_mask;
|
||||
shadow_accessed_mask = accessed_mask;
|
||||
shadow_dirty_mask = dirty_mask;
|
||||
shadow_nx_mask = nx_mask;
|
||||
shadow_x_mask = x_mask;
|
||||
shadow_present_mask = p_mask;
|
||||
shadow_acc_track_mask = acc_track_mask;
|
||||
shadow_me_mask = me_mask;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
|
||||
|
||||
void kvm_mmu_reset_all_pte_masks(void)
|
||||
{
|
||||
u8 low_phys_bits;
|
||||
|
||||
shadow_user_mask = 0;
|
||||
shadow_accessed_mask = 0;
|
||||
shadow_dirty_mask = 0;
|
||||
shadow_nx_mask = 0;
|
||||
shadow_x_mask = 0;
|
||||
shadow_present_mask = 0;
|
||||
shadow_acc_track_mask = 0;
|
||||
|
||||
shadow_phys_bits = kvm_get_shadow_phys_bits();
|
||||
|
||||
/*
|
||||
* If the CPU has 46 or less physical address bits, then set an
|
||||
* appropriate mask to guard against L1TF attacks. Otherwise, it is
|
||||
* assumed that the CPU is not vulnerable to L1TF.
|
||||
*
|
||||
* Some Intel CPUs address the L1 cache using more PA bits than are
|
||||
* reported by CPUID. Use the PA width of the L1 cache when possible
|
||||
* to achieve more effective mitigation, e.g. if system RAM overlaps
|
||||
* the most significant bits of legal physical address space.
|
||||
*/
|
||||
shadow_nonpresent_or_rsvd_mask = 0;
|
||||
low_phys_bits = boot_cpu_data.x86_phys_bits;
|
||||
if (boot_cpu_has_bug(X86_BUG_L1TF) &&
|
||||
!WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
|
||||
52 - shadow_nonpresent_or_rsvd_mask_len)) {
|
||||
low_phys_bits = boot_cpu_data.x86_cache_bits
|
||||
- shadow_nonpresent_or_rsvd_mask_len;
|
||||
shadow_nonpresent_or_rsvd_mask =
|
||||
rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
|
||||
}
|
||||
|
||||
shadow_nonpresent_or_rsvd_lower_gfn_mask =
|
||||
GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
|
||||
}
|
arch/x86/kvm/mmu/spte.h (new file, 252 lines)
@@ -0,0 +1,252 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
|
||||
#ifndef KVM_X86_MMU_SPTE_H
|
||||
#define KVM_X86_MMU_SPTE_H
|
||||
|
||||
#include "mmu_internal.h"
|
||||
|
||||
#define PT_FIRST_AVAIL_BITS_SHIFT 10
|
||||
#define PT64_SECOND_AVAIL_BITS_SHIFT 54
|
||||
|
||||
/*
|
||||
* The mask used to denote special SPTEs, which can be either MMIO SPTEs or
|
||||
* Access Tracking SPTEs.
|
||||
*/
|
||||
#define SPTE_SPECIAL_MASK (3ULL << 52)
|
||||
#define SPTE_AD_ENABLED_MASK (0ULL << 52)
|
||||
#define SPTE_AD_DISABLED_MASK (1ULL << 52)
|
||||
#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
|
||||
#define SPTE_MMIO_MASK (3ULL << 52)
|
||||
|
||||
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
|
||||
#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
|
||||
#else
|
||||
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
|
||||
#endif
|
||||
#define PT64_LVL_ADDR_MASK(level) \
|
||||
(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
|
||||
* PT64_LEVEL_BITS))) - 1))
|
||||
#define PT64_LVL_OFFSET_MASK(level) \
|
||||
(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
|
||||
* PT64_LEVEL_BITS))) - 1))
|
||||
|
||||
#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
|
||||
| shadow_x_mask | shadow_nx_mask | shadow_me_mask)
|
||||
|
||||
#define ACC_EXEC_MASK 1
|
||||
#define ACC_WRITE_MASK PT_WRITABLE_MASK
|
||||
#define ACC_USER_MASK PT_USER_MASK
|
||||
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
|
||||
|
||||
/* The mask for the R/X bits in EPT PTEs */
|
||||
#define PT64_EPT_READABLE_MASK 0x1ull
|
||||
#define PT64_EPT_EXECUTABLE_MASK 0x4ull
|
||||
|
||||
#define PT64_LEVEL_BITS 9
|
||||
|
||||
#define PT64_LEVEL_SHIFT(level) \
|
||||
(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
|
||||
|
||||
#define PT64_INDEX(address, level)\
|
||||
(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
|
||||
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
|
||||
|
||||
|
||||
#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
|
||||
#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
|
||||
|
||||
/*
|
||||
* Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
|
||||
* the memslots generation and is derived as follows:
|
||||
*
|
||||
* Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
|
||||
* Bits 9-18 of the MMIO generation are propagated to spte bits 52-61
|
||||
*
|
||||
* The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
|
||||
* the MMIO generation number, as doing so would require stealing a bit from
|
||||
* the "real" generation number and thus effectively halve the maximum number
|
||||
* of MMIO generations that can be handled before encountering a wrap (which
|
||||
* requires a full MMU zap). The flag is instead explicitly queried when
|
||||
* checking for MMIO spte cache hits.
|
||||
*/
|
||||
#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0)
|
||||
|
||||
#define MMIO_SPTE_GEN_LOW_START 3
|
||||
#define MMIO_SPTE_GEN_LOW_END 11
|
||||
#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
|
||||
MMIO_SPTE_GEN_LOW_START)
|
||||
|
||||
#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT
|
||||
#define MMIO_SPTE_GEN_HIGH_END 62
|
||||
#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
|
||||
MMIO_SPTE_GEN_HIGH_START)
|
||||
|
||||
extern u64 __read_mostly shadow_nx_mask;
|
||||
extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
|
||||
extern u64 __read_mostly shadow_user_mask;
|
||||
extern u64 __read_mostly shadow_accessed_mask;
|
||||
extern u64 __read_mostly shadow_dirty_mask;
|
||||
extern u64 __read_mostly shadow_mmio_value;
|
||||
extern u64 __read_mostly shadow_mmio_access_mask;
|
||||
extern u64 __read_mostly shadow_present_mask;
|
||||
extern u64 __read_mostly shadow_me_mask;
|
||||
|
||||
/*
|
||||
* SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
|
||||
* shadow_acc_track_mask is the set of bits to be cleared in non-accessed
|
||||
* pages.
|
||||
*/
|
||||
extern u64 __read_mostly shadow_acc_track_mask;
|
||||
|
||||
/*
|
||||
* This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
|
||||
* to guard against L1TF attacks.
|
||||
*/
|
||||
extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
|
||||
|
||||
/*
|
||||
* The mask/shift to use for saving the original R/X bits when marking the PTE
|
||||
* as not-present for access tracking purposes. We do not save the W bit as the
|
||||
* PTEs being access tracked also need to be dirty tracked, so the W bit will be
|
||||
* restored only when a write is attempted to the page.
|
||||
*/
|
||||
static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
|
||||
PT64_EPT_EXECUTABLE_MASK;
|
||||
static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
|
||||
|
||||
/*
|
||||
* The number of high-order 1 bits to use in the mask above.
|
||||
*/
|
||||
static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
|
||||
|
||||
/*
|
||||
* In some cases, we need to preserve the GFN of a non-present or reserved
|
||||
* SPTE when we usurp the upper five bits of the physical address space to
|
||||
* defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll
|
||||
* shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
|
||||
* left into the reserved bits, i.e. the GFN in the SPTE will be split into
|
||||
* high and low parts. This mask covers the lower bits of the GFN.
|
||||
*/
|
||||
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
|
||||
|
||||
/*
|
||||
* The number of non-reserved physical address bits irrespective of features
|
||||
* that repurpose legal bits, e.g. MKTME.
|
||||
*/
|
||||
extern u8 __read_mostly shadow_phys_bits;
|
||||
|
||||
static inline bool is_mmio_spte(u64 spte)
|
||||
{
|
||||
return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
|
||||
}
|
||||
|
||||
static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
|
||||
{
|
||||
return sp->role.ad_disabled;
|
||||
}
|
||||
|
||||
static inline bool spte_ad_enabled(u64 spte)
|
||||
{
|
||||
MMU_WARN_ON(is_mmio_spte(spte));
|
||||
return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
|
||||
}
|
||||
|
||||
static inline bool spte_ad_need_write_protect(u64 spte)
|
||||
{
|
||||
MMU_WARN_ON(is_mmio_spte(spte));
|
||||
return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
|
||||
}
|
||||
|
||||
static inline u64 spte_shadow_accessed_mask(u64 spte)
|
||||
{
|
||||
MMU_WARN_ON(is_mmio_spte(spte));
|
||||
return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
|
||||
}
|
||||
|
||||
static inline u64 spte_shadow_dirty_mask(u64 spte)
|
||||
{
|
||||
MMU_WARN_ON(is_mmio_spte(spte));
|
||||
return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
|
||||
}
|
||||
|
||||
static inline bool is_access_track_spte(u64 spte)
|
||||
{
|
||||
return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
|
||||
}
|
||||
|
||||
static inline int is_shadow_present_pte(u64 pte)
|
||||
{
|
||||
return (pte != 0) && !is_mmio_spte(pte);
|
||||
}
|
||||
|
||||
static inline int is_large_pte(u64 pte)
|
||||
{
|
||||
return pte & PT_PAGE_SIZE_MASK;
|
||||
}
|
||||
|
||||
static inline int is_last_spte(u64 pte, int level)
|
||||
{
|
||||
if (level == PG_LEVEL_4K)
|
||||
return 1;
|
||||
if (is_large_pte(pte))
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline bool is_executable_pte(u64 spte)
|
||||
{
|
||||
return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
|
||||
}
|
||||
|
||||
static inline kvm_pfn_t spte_to_pfn(u64 pte)
|
||||
{
|
||||
return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
|
||||
}
|
||||
|
||||
static inline bool is_accessed_spte(u64 spte)
|
||||
{
|
||||
u64 accessed_mask = spte_shadow_accessed_mask(spte);
|
||||
|
||||
return accessed_mask ? spte & accessed_mask
|
||||
: !is_access_track_spte(spte);
|
||||
}
|
||||
|
||||
static inline bool is_dirty_spte(u64 spte)
|
||||
{
|
||||
u64 dirty_mask = spte_shadow_dirty_mask(spte);
|
||||
|
||||
return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
|
||||
}
|
||||
|
||||
static inline bool spte_can_locklessly_be_made_writable(u64 spte)
|
||||
{
|
||||
return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
|
||||
(SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
|
||||
}
|
||||
|
||||
static inline u64 get_mmio_spte_generation(u64 spte)
|
||||
{
|
||||
u64 gen;
|
||||
|
||||
gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
|
||||
gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
|
||||
return gen;
|
||||
}
|
||||
|
||||
/* Bits which may be returned by set_spte() */
|
||||
#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
|
||||
#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
|
||||
#define SET_SPTE_SPURIOUS BIT(2)
|
||||
|
||||
int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
|
||||
gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative,
|
||||
bool can_unsync, bool host_writable, bool ad_disabled,
|
||||
u64 *new_spte);
|
||||
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
|
||||
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
|
||||
u64 mark_spte_for_access_track(u64 spte);
|
||||
u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn);
|
||||
|
||||
void kvm_mmu_reset_all_pte_masks(void);
|
||||
|
||||
#endif
|
arch/x86/kvm/mmu/tdp_iter.c (new file, 182 lines)
@@ -0,0 +1,182 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "mmu_internal.h"
|
||||
#include "tdp_iter.h"
|
||||
#include "spte.h"
|
||||
|
||||
/*
|
||||
* Recalculates the pointer to the SPTE for the current GFN and level and
|
||||
* reread the SPTE.
|
||||
*/
|
||||
static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
|
||||
{
|
||||
iter->sptep = iter->pt_path[iter->level - 1] +
|
||||
SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
|
||||
iter->old_spte = READ_ONCE(*iter->sptep);
|
||||
}
|
||||
|
||||
static gfn_t round_gfn_for_level(gfn_t gfn, int level)
|
||||
{
|
||||
return gfn & -KVM_PAGES_PER_HPAGE(level);
|
||||
}
|
||||
|
||||
/*
|
||||
* Sets a TDP iterator to walk a pre-order traversal of the paging structure
|
||||
* rooted at root_pt, starting with the walk to translate goal_gfn.
|
||||
*/
|
||||
void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
|
||||
int min_level, gfn_t goal_gfn)
|
||||
{
|
||||
WARN_ON(root_level < 1);
|
||||
WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
|
||||
|
||||
iter->goal_gfn = goal_gfn;
|
||||
iter->root_level = root_level;
|
||||
iter->min_level = min_level;
|
||||
iter->level = root_level;
|
||||
iter->pt_path[iter->level - 1] = root_pt;
|
||||
|
||||
iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level);
|
||||
tdp_iter_refresh_sptep(iter);
|
||||
|
||||
iter->valid = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Given an SPTE and its level, returns a pointer containing the host virtual
|
||||
* address of the child page table referenced by the SPTE. Returns null if
|
||||
* there is no such entry.
|
||||
*/
|
||||
u64 *spte_to_child_pt(u64 spte, int level)
|
||||
{
|
||||
/*
|
||||
* There's no child entry if this entry isn't present or is a
|
||||
* last-level entry.
|
||||
*/
|
||||
if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
|
||||
return NULL;
|
||||
|
||||
return __va(spte_to_pfn(spte) << PAGE_SHIFT);
|
||||
}
|
||||
|
||||
/*
|
||||
* Steps down one level in the paging structure towards the goal GFN. Returns
|
||||
* true if the iterator was able to step down a level, false otherwise.
|
||||
*/
|
||||
static bool try_step_down(struct tdp_iter *iter)
|
||||
{
|
||||
u64 *child_pt;
|
||||
|
||||
if (iter->level == iter->min_level)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* Reread the SPTE before stepping down to avoid traversing into page
|
||||
* tables that are no longer linked from this entry.
|
||||
*/
|
||||
iter->old_spte = READ_ONCE(*iter->sptep);
|
||||
|
||||
child_pt = spte_to_child_pt(iter->old_spte, iter->level);
|
||||
if (!child_pt)
|
||||
return false;
|
||||
|
||||
iter->level--;
|
||||
iter->pt_path[iter->level - 1] = child_pt;
|
||||
iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level);
|
||||
tdp_iter_refresh_sptep(iter);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Steps to the next entry in the current page table, at the current page table
|
||||
* level. The next entry could point to a page backing guest memory or another
|
||||
* page table, or it could be non-present. Returns true if the iterator was
|
||||
* able to step to the next entry in the page table, false if the iterator was
|
||||
* already at the end of the current page table.
|
||||
*/
|
||||
static bool try_step_side(struct tdp_iter *iter)
|
||||
{
|
||||
/*
|
||||
* Check if the iterator is already at the end of the current page
|
||||
* table.
|
||||
*/
|
||||
if (SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level) ==
|
||||
(PT64_ENT_PER_PAGE - 1))
|
||||
return false;
|
||||
|
||||
iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
|
||||
iter->goal_gfn = iter->gfn;
|
||||
iter->sptep++;
|
||||
iter->old_spte = READ_ONCE(*iter->sptep);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Tries to traverse back up a level in the paging structure so that the walk
|
||||
* can continue from the next entry in the parent page table. Returns true on a
|
||||
* successful step up, false if already in the root page.
|
||||
*/
|
||||
static bool try_step_up(struct tdp_iter *iter)
|
||||
{
|
||||
if (iter->level == iter->root_level)
|
||||
return false;
|
||||
|
||||
iter->level++;
|
||||
iter->gfn = round_gfn_for_level(iter->gfn, iter->level);
|
||||
tdp_iter_refresh_sptep(iter);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Step to the next SPTE in a pre-order traversal of the paging structure.
|
||||
* To get to the next SPTE, the iterator either steps down towards the goal
|
||||
* GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a
|
||||
* higher GFN.
|
||||
*
|
||||
* The basic algorithm is as follows:
|
||||
* 1. If the current SPTE is a non-last-level SPTE, step down into the page
|
||||
* table it points to.
|
||||
* 2. If the iterator cannot step down, it will try to step to the next SPTE
|
||||
* in the current page of the paging structure.
|
||||
* 3. If the iterator cannot step to the next entry in the current page, it will
|
||||
* try to step up to the parent paging structure page. In this case, that
|
||||
* SPTE will have already been visited, and so the iterator must also step
|
||||
* to the side again.
|
||||
*/
|
||||
void tdp_iter_next(struct tdp_iter *iter)
|
||||
{
|
||||
if (try_step_down(iter))
|
||||
return;
|
||||
|
||||
do {
|
||||
if (try_step_side(iter))
|
||||
return;
|
||||
} while (try_step_up(iter));
|
||||
iter->valid = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Restart the walk over the paging structure from the root, starting from the
|
||||
* highest gfn the iterator had previously reached. Assumes that the entire
|
||||
* paging structure, except the root page, may have been completely torn down
|
||||
* and rebuilt.
|
||||
*/
|
||||
void tdp_iter_refresh_walk(struct tdp_iter *iter)
|
||||
{
|
||||
gfn_t goal_gfn = iter->goal_gfn;
|
||||
|
||||
if (iter->gfn > goal_gfn)
|
||||
goal_gfn = iter->gfn;
|
||||
|
||||
tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
|
||||
iter->root_level, iter->min_level, goal_gfn);
|
||||
}
|
||||
|
||||
u64 *tdp_iter_root_pt(struct tdp_iter *iter)
|
||||
{
|
||||
return iter->pt_path[iter->root_level - 1];
|
||||
}
|
||||
|
arch/x86/kvm/mmu/tdp_iter.h (new file, 60 lines)
@@ -0,0 +1,60 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#ifndef __KVM_X86_MMU_TDP_ITER_H
|
||||
#define __KVM_X86_MMU_TDP_ITER_H
|
||||
|
||||
#include <linux/kvm_host.h>
|
||||
|
||||
#include "mmu.h"
|
||||
|
||||
/*
|
||||
* A TDP iterator performs a pre-order walk over a TDP paging structure.
|
||||
*/
|
||||
struct tdp_iter {
|
||||
/*
|
||||
* The iterator will traverse the paging structure towards the mapping
|
||||
* for this GFN.
|
||||
*/
|
||||
gfn_t goal_gfn;
|
||||
/* Pointers to the page tables traversed to reach the current SPTE */
|
||||
u64 *pt_path[PT64_ROOT_MAX_LEVEL];
|
||||
/* A pointer to the current SPTE */
|
||||
u64 *sptep;
|
||||
/* The lowest GFN mapped by the current SPTE */
|
||||
gfn_t gfn;
|
||||
/* The level of the root page given to the iterator */
|
||||
int root_level;
|
||||
/* The lowest level the iterator should traverse to */
|
||||
int min_level;
|
||||
/* The iterator's current level within the paging structure */
|
||||
int level;
|
||||
/* A snapshot of the value at sptep */
|
||||
u64 old_spte;
|
||||
/*
|
||||
* Whether the iterator has a valid state. This will be false if the
|
||||
* iterator walks off the end of the paging structure.
|
||||
*/
|
||||
bool valid;
|
||||
};
|
||||
|
||||
/*
|
||||
* Iterates over every SPTE mapping the GFN range [start, end) in a
|
||||
* preorder traversal.
|
||||
*/
|
||||
#define for_each_tdp_pte_min_level(iter, root, root_level, min_level, start, end) \
|
||||
for (tdp_iter_start(&iter, root, root_level, min_level, start); \
|
||||
iter.valid && iter.gfn < end; \
|
||||
tdp_iter_next(&iter))
|
||||
|
||||
#define for_each_tdp_pte(iter, root, root_level, start, end) \
|
||||
for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end)
|
||||
|
||||
u64 *spte_to_child_pt(u64 pte, int level);
|
||||
|
||||
void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
|
||||
int min_level, gfn_t goal_gfn);
|
||||
void tdp_iter_next(struct tdp_iter *iter);
|
||||
void tdp_iter_refresh_walk(struct tdp_iter *iter);
|
||||
u64 *tdp_iter_root_pt(struct tdp_iter *iter);
|
||||
|
||||
#endif /* __KVM_X86_MMU_TDP_ITER_H */
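The iterator and the for_each_tdp_pte() wrappers above are consumed by tdp_mmu.c, whose diff is suppressed below. A minimal sketch of a caller, relying only on the declarations in this header and in spte.h; the helper itself is illustrative and not part of the patch:
static unsigned long count_present_leaf_sptes(u64 *root_pt, int root_level,
                                              gfn_t start, gfn_t end)
{
        struct tdp_iter iter;
        unsigned long present = 0;

        /* Pre-order walk over every SPTE mapping a GFN in [start, end). */
        for_each_tdp_pte(iter, root_pt, root_level, start, end) {
                if (is_shadow_present_pte(iter.old_spte) &&
                    is_last_spte(iter.old_spte, iter.level))
                        present++;
        }

        return present;
}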
|
arch/x86/kvm/mmu/tdp_mmu.c (new file, 1157 lines): diff suppressed because it is too large
arch/x86/kvm/mmu/tdp_mmu.h (new file, 48 lines)
@@ -0,0 +1,48 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#ifndef __KVM_X86_MMU_TDP_MMU_H
|
||||
#define __KVM_X86_MMU_TDP_MMU_H
|
||||
|
||||
#include <linux/kvm_host.h>
|
||||
|
||||
void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
|
||||
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
|
||||
|
||||
bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root);
|
||||
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
|
||||
void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
|
||||
|
||||
bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end);
|
||||
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
|
||||
|
||||
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
|
||||
int map_writable, int max_level, kvm_pfn_t pfn,
|
||||
bool prefault);
|
||||
|
||||
int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
|
||||
unsigned long end);
|
||||
|
||||
int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
|
||||
unsigned long end);
|
||||
int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva);
|
||||
|
||||
int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
|
||||
pte_t *host_ptep);
|
||||
|
||||
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
|
||||
int min_level);
|
||||
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
|
||||
struct kvm_memory_slot *slot);
|
||||
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
|
||||
struct kvm_memory_slot *slot,
|
||||
gfn_t gfn, unsigned long mask,
|
||||
bool wrprot);
|
||||
bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot);
|
||||
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
|
||||
const struct kvm_memory_slot *slot);
|
||||
|
||||
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
|
||||
struct kvm_memory_slot *slot, gfn_t gfn);
|
||||
|
||||
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes);
|
||||
#endif /* __KVM_X86_MMU_TDP_MMU_H */
|
@@ -153,20 +153,18 @@ int avic_vm_init(struct kvm *kvm)
|
||||
return 0;
|
||||
|
||||
/* Allocating physical APIC ID table (4KB) */
|
||||
p_page = alloc_page(GFP_KERNEL_ACCOUNT);
|
||||
p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
|
||||
if (!p_page)
|
||||
goto free_avic;
|
||||
|
||||
kvm_svm->avic_physical_id_table_page = p_page;
|
||||
clear_page(page_address(p_page));
|
||||
|
||||
/* Allocating logical APIC ID table (4KB) */
|
||||
l_page = alloc_page(GFP_KERNEL_ACCOUNT);
|
||||
l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
|
||||
if (!l_page)
|
||||
goto free_avic;
|
||||
|
||||
kvm_svm->avic_logical_id_table_page = l_page;
|
||||
clear_page(page_address(l_page));
|
||||
|
||||
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
|
||||
again:
|
||||
@@ -868,6 +866,7 @@ int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
|
||||
* - Tell IOMMU to use legacy mode for this interrupt.
|
||||
* - Retrieve ga_tag of prior interrupt remapping data.
|
||||
*/
|
||||
pi.prev_ga_tag = 0;
|
||||
pi.is_guest_mode = false;
|
||||
ret = irq_set_vcpu_affinity(host_irq, &pi);
|
||||
|
||||
|
@@ -98,6 +98,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
|
||||
void recalc_intercepts(struct vcpu_svm *svm)
|
||||
{
|
||||
struct vmcb_control_area *c, *h, *g;
|
||||
unsigned int i;
|
||||
|
||||
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
|
||||
|
||||
@@ -108,42 +109,37 @@ void recalc_intercepts(struct vcpu_svm *svm)
|
||||
h = &svm->nested.hsave->control;
|
||||
g = &svm->nested.ctl;
|
||||
|
||||
svm->nested.host_intercept_exceptions = h->intercept_exceptions;
|
||||
|
||||
c->intercept_cr = h->intercept_cr;
|
||||
c->intercept_dr = h->intercept_dr;
|
||||
c->intercept_exceptions = h->intercept_exceptions;
|
||||
c->intercept = h->intercept;
|
||||
for (i = 0; i < MAX_INTERCEPT; i++)
|
||||
c->intercepts[i] = h->intercepts[i];
|
||||
|
||||
if (g->int_ctl & V_INTR_MASKING_MASK) {
|
||||
/* We only want the cr8 intercept bits of L1 */
|
||||
c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ);
|
||||
c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE);
|
||||
vmcb_clr_intercept(c, INTERCEPT_CR8_READ);
|
||||
vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
|
||||
|
||||
/*
|
||||
* Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
|
||||
* affect any interrupt we may want to inject; therefore,
|
||||
* interrupt window vmexits are irrelevant to L0.
|
||||
*/
|
||||
c->intercept &= ~(1ULL << INTERCEPT_VINTR);
|
||||
vmcb_clr_intercept(c, INTERCEPT_VINTR);
|
||||
}
|
||||
|
||||
/* We don't want to see VMMCALLs from a nested guest */
|
||||
c->intercept &= ~(1ULL << INTERCEPT_VMMCALL);
|
||||
vmcb_clr_intercept(c, INTERCEPT_VMMCALL);
|
||||
|
||||
c->intercept_cr |= g->intercept_cr;
|
||||
c->intercept_dr |= g->intercept_dr;
|
||||
c->intercept_exceptions |= g->intercept_exceptions;
|
||||
c->intercept |= g->intercept;
|
||||
for (i = 0; i < MAX_INTERCEPT; i++)
|
||||
c->intercepts[i] |= g->intercepts[i];
|
||||
}
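The vmcb_clr_intercept() and vmcb_is_intercept() helpers used above are defined in svm.h, outside the hunks shown here. A sketch of the shape such accessors take once all intercept vectors live in the single intercepts[] array, treating the intercept code as a bit index; the exact kernel definitions may differ:
static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
{
        WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
        __set_bit(bit, (unsigned long *)&control->intercepts);
}

static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit)
{
        WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
        __clear_bit(bit, (unsigned long *)&control->intercepts);
}

static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
{
        WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
        return test_bit(bit, (unsigned long *)&control->intercepts);
}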
|
||||
|
||||
static void copy_vmcb_control_area(struct vmcb_control_area *dst,
|
||||
struct vmcb_control_area *from)
|
||||
{
|
||||
dst->intercept_cr = from->intercept_cr;
|
||||
dst->intercept_dr = from->intercept_dr;
|
||||
dst->intercept_exceptions = from->intercept_exceptions;
|
||||
dst->intercept = from->intercept;
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < MAX_INTERCEPT; i++)
|
||||
dst->intercepts[i] = from->intercepts[i];
|
||||
|
||||
dst->iopm_base_pa = from->iopm_base_pa;
|
||||
dst->msrpm_base_pa = from->msrpm_base_pa;
|
||||
dst->tsc_offset = from->tsc_offset;
|
||||
@@ -176,7 +172,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
|
||||
*/
|
||||
int i;
|
||||
|
||||
if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT)))
|
||||
if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
|
||||
return true;
|
||||
|
||||
for (i = 0; i < MSRPM_OFFSETS; i++) {
|
||||
@@ -200,9 +196,23 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
if (!nested_svm_vmrun_msrpm(svm)) {
|
||||
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
|
||||
vcpu->run->internal.suberror =
|
||||
KVM_INTERNAL_ERROR_EMULATION;
|
||||
vcpu->run->internal.ndata = 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
|
||||
{
|
||||
if ((control->intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
|
||||
if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0)
|
||||
return false;
|
||||
|
||||
if (control->asid == 0)
|
||||
@@ -215,41 +225,39 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb)
|
||||
static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
|
||||
{
|
||||
bool nested_vmcb_lma;
|
||||
if ((vmcb->save.efer & EFER_SVME) == 0)
|
||||
bool vmcb12_lma;
|
||||
|
||||
if ((vmcb12->save.efer & EFER_SVME) == 0)
|
||||
return false;
|
||||
|
||||
if (((vmcb->save.cr0 & X86_CR0_CD) == 0) &&
|
||||
(vmcb->save.cr0 & X86_CR0_NW))
|
||||
if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW))
|
||||
return false;
|
||||
|
||||
if (!kvm_dr6_valid(vmcb->save.dr6) || !kvm_dr7_valid(vmcb->save.dr7))
|
||||
if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7))
|
||||
return false;
|
||||
|
||||
nested_vmcb_lma =
|
||||
(vmcb->save.efer & EFER_LME) &&
|
||||
(vmcb->save.cr0 & X86_CR0_PG);
|
||||
vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG);
|
||||
|
||||
if (!nested_vmcb_lma) {
|
||||
if (vmcb->save.cr4 & X86_CR4_PAE) {
|
||||
if (vmcb->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK)
|
||||
if (!vmcb12_lma) {
|
||||
if (vmcb12->save.cr4 & X86_CR4_PAE) {
|
||||
if (vmcb12->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK)
|
||||
return false;
|
||||
} else {
|
||||
if (vmcb->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK)
|
||||
if (vmcb12->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK)
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (!(vmcb->save.cr4 & X86_CR4_PAE) ||
|
||||
!(vmcb->save.cr0 & X86_CR0_PE) ||
|
||||
(vmcb->save.cr3 & MSR_CR3_LONG_RESERVED_MASK))
|
||||
if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
|
||||
!(vmcb12->save.cr0 & X86_CR0_PE) ||
|
||||
(vmcb12->save.cr3 & MSR_CR3_LONG_MBZ_MASK))
|
||||
return false;
|
||||
}
|
||||
if (kvm_valid_cr4(&svm->vcpu, vmcb->save.cr4))
|
||||
if (kvm_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
|
||||
return false;
|
||||
|
||||
return nested_vmcb_check_controls(&vmcb->control);
|
||||
return nested_vmcb_check_controls(&vmcb12->control);
|
||||
}
|
||||
|
||||
static void load_nested_vmcb_control(struct vcpu_svm *svm,
|
||||
@@ -296,7 +304,7 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm)
|
||||
* EXIT_INT_INFO.
|
||||
*/
|
||||
static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
|
||||
struct vmcb *nested_vmcb)
|
||||
struct vmcb *vmcb12)
|
||||
{
|
||||
struct kvm_vcpu *vcpu = &svm->vcpu;
|
||||
u32 exit_int_info = 0;
|
||||
@@ -308,7 +316,7 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
|
||||
|
||||
if (vcpu->arch.exception.has_error_code) {
|
||||
exit_int_info |= SVM_EVTINJ_VALID_ERR;
|
||||
nested_vmcb->control.exit_int_info_err =
|
||||
vmcb12->control.exit_int_info_err =
|
||||
vcpu->arch.exception.error_code;
|
||||
}
|
||||
|
||||
@@ -325,7 +333,7 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
|
||||
exit_int_info |= SVM_EVTINJ_TYPE_INTR;
|
||||
}
|
||||
|
||||
nested_vmcb->control.exit_int_info = exit_int_info;
|
||||
vmcb12->control.exit_int_info = exit_int_info;
|
||||
}
|
||||
|
||||
static inline bool nested_npt_enabled(struct vcpu_svm *svm)
|
||||
@@ -364,31 +372,31 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb)
|
||||
static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
|
||||
{
|
||||
/* Load the nested guest state */
|
||||
svm->vmcb->save.es = nested_vmcb->save.es;
|
||||
svm->vmcb->save.cs = nested_vmcb->save.cs;
|
||||
svm->vmcb->save.ss = nested_vmcb->save.ss;
|
||||
svm->vmcb->save.ds = nested_vmcb->save.ds;
|
||||
svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
|
||||
svm->vmcb->save.idtr = nested_vmcb->save.idtr;
|
||||
kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
|
||||
svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
|
||||
svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
|
||||
svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
|
||||
svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
|
||||
kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax);
|
||||
kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp);
|
||||
kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip);
|
||||
svm->vmcb->save.es = vmcb12->save.es;
|
||||
svm->vmcb->save.cs = vmcb12->save.cs;
|
||||
svm->vmcb->save.ss = vmcb12->save.ss;
|
||||
svm->vmcb->save.ds = vmcb12->save.ds;
|
||||
svm->vmcb->save.gdtr = vmcb12->save.gdtr;
|
||||
svm->vmcb->save.idtr = vmcb12->save.idtr;
|
||||
kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags);
|
||||
svm_set_efer(&svm->vcpu, vmcb12->save.efer);
|
||||
svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
|
||||
svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
|
||||
svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;
|
||||
kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
|
||||
kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
|
||||
kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
|
||||
|
||||
/* In case we don't even reach vcpu_run, the fields are not updated */
|
||||
svm->vmcb->save.rax = nested_vmcb->save.rax;
|
||||
svm->vmcb->save.rsp = nested_vmcb->save.rsp;
|
||||
svm->vmcb->save.rip = nested_vmcb->save.rip;
|
||||
svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
|
||||
svm->vcpu.arch.dr6 = nested_vmcb->save.dr6;
|
||||
svm->vmcb->save.cpl = nested_vmcb->save.cpl;
|
||||
svm->vmcb->save.rax = vmcb12->save.rax;
|
||||
svm->vmcb->save.rsp = vmcb12->save.rsp;
|
||||
svm->vmcb->save.rip = vmcb12->save.rip;
|
||||
svm->vmcb->save.dr7 = vmcb12->save.dr7;
|
||||
svm->vcpu.arch.dr6 = vmcb12->save.dr6;
|
||||
svm->vmcb->save.cpl = vmcb12->save.cpl;
|
||||
}
|
||||
|
||||
static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
|
||||
@@ -426,17 +434,17 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
|
||||
vmcb_mark_all_dirty(svm->vmcb);
|
||||
}
|
||||
|
||||
int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
|
||||
struct vmcb *nested_vmcb)
|
||||
int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
|
||||
struct vmcb *vmcb12)
|
||||
{
|
||||
int ret;
|
||||
|
||||
svm->nested.vmcb = vmcb_gpa;
|
||||
load_nested_vmcb_control(svm, &nested_vmcb->control);
|
||||
nested_prepare_vmcb_save(svm, nested_vmcb);
|
||||
svm->nested.vmcb12_gpa = vmcb12_gpa;
|
||||
load_nested_vmcb_control(svm, &vmcb12->control);
|
||||
nested_prepare_vmcb_save(svm, vmcb12);
|
||||
nested_prepare_vmcb_control(svm);
|
||||
|
||||
ret = nested_svm_load_cr3(&svm->vcpu, nested_vmcb->save.cr3,
|
||||
ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
|
||||
nested_npt_enabled(svm));
|
||||
if (ret)
|
||||
return ret;
|
||||
@@ -449,19 +457,19 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
|
||||
int nested_svm_vmrun(struct vcpu_svm *svm)
|
||||
{
|
||||
int ret;
|
||||
struct vmcb *nested_vmcb;
|
||||
struct vmcb *vmcb12;
|
||||
struct vmcb *hsave = svm->nested.hsave;
|
||||
struct vmcb *vmcb = svm->vmcb;
|
||||
struct kvm_host_map map;
|
||||
u64 vmcb_gpa;
|
||||
u64 vmcb12_gpa;
|
||||
|
||||
if (is_smm(&svm->vcpu)) {
|
||||
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
|
||||
return 1;
|
||||
}
|
||||
|
||||
vmcb_gpa = svm->vmcb->save.rax;
|
||||
ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
|
||||
vmcb12_gpa = svm->vmcb->save.rax;
|
||||
ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map);
|
||||
if (ret == -EINVAL) {
|
||||
kvm_inject_gp(&svm->vcpu, 0);
|
||||
return 1;
|
||||
@@ -471,26 +479,31 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
|
||||
|
||||
ret = kvm_skip_emulated_instruction(&svm->vcpu);
|
||||
|
||||
nested_vmcb = map.hva;
|
||||
vmcb12 = map.hva;
|
||||
|
||||
if (!nested_vmcb_checks(svm, nested_vmcb)) {
|
||||
nested_vmcb->control.exit_code = SVM_EXIT_ERR;
|
||||
nested_vmcb->control.exit_code_hi = 0;
|
||||
nested_vmcb->control.exit_info_1 = 0;
|
||||
nested_vmcb->control.exit_info_2 = 0;
|
||||
if (WARN_ON_ONCE(!svm->nested.initialized))
|
||||
return -EINVAL;
|
||||
|
||||
if (!nested_vmcb_checks(svm, vmcb12)) {
|
||||
vmcb12->control.exit_code = SVM_EXIT_ERR;
|
||||
vmcb12->control.exit_code_hi = 0;
|
||||
vmcb12->control.exit_info_1 = 0;
|
||||
vmcb12->control.exit_info_2 = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
|
||||
nested_vmcb->save.rip,
|
||||
nested_vmcb->control.int_ctl,
|
||||
nested_vmcb->control.event_inj,
|
||||
nested_vmcb->control.nested_ctl);
|
||||
trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
|
||||
vmcb12->save.rip,
|
||||
vmcb12->control.int_ctl,
|
||||
vmcb12->control.event_inj,
|
||||
vmcb12->control.nested_ctl);
|
||||
|
||||
trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
|
||||
nested_vmcb->control.intercept_cr >> 16,
|
||||
nested_vmcb->control.intercept_exceptions,
|
||||
nested_vmcb->control.intercept);
|
||||
trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
|
||||
vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
|
||||
vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
|
||||
vmcb12->control.intercepts[INTERCEPT_WORD3],
|
||||
vmcb12->control.intercepts[INTERCEPT_WORD4],
|
||||
vmcb12->control.intercepts[INTERCEPT_WORD5]);
|
||||
|
||||
/* Clear internal status */
|
||||
kvm_clear_exception_queue(&svm->vcpu);
|
||||
@@ -522,7 +535,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
|
||||
|
||||
svm->nested.nested_run_pending = 1;
|
||||
|
||||
if (enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb))
|
||||
if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12))
|
||||
goto out_exit_err;
|
||||
|
||||
if (nested_svm_vmrun_msrpm(svm))
|
||||
@@ -563,23 +576,23 @@ void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
|
||||
int nested_svm_vmexit(struct vcpu_svm *svm)
|
||||
{
|
||||
int rc;
|
||||
struct vmcb *nested_vmcb;
|
||||
struct vmcb *vmcb12;
|
||||
struct vmcb *hsave = svm->nested.hsave;
|
||||
struct vmcb *vmcb = svm->vmcb;
|
||||
struct kvm_host_map map;
|
||||
|
||||
rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map);
|
||||
rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
|
||||
if (rc) {
|
||||
if (rc == -EINVAL)
|
||||
kvm_inject_gp(&svm->vcpu, 0);
|
||||
return 1;
|
||||
}
|
||||
|
||||
nested_vmcb = map.hva;
|
||||
vmcb12 = map.hva;
|
||||
|
||||
/* Exit Guest-Mode */
|
||||
leave_guest_mode(&svm->vcpu);
|
||||
svm->nested.vmcb = 0;
|
||||
svm->nested.vmcb12_gpa = 0;
|
||||
WARN_ON_ONCE(svm->nested.nested_run_pending);
|
||||
|
||||
/* in case we halted in L2 */
|
||||
@@ -587,45 +600,45 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
|
||||
|
||||
/* Give the current vmcb to the guest */
|
||||
|
||||
nested_vmcb->save.es = vmcb->save.es;
|
||||
nested_vmcb->save.cs = vmcb->save.cs;
|
||||
nested_vmcb->save.ss = vmcb->save.ss;
|
||||
nested_vmcb->save.ds = vmcb->save.ds;
|
||||
nested_vmcb->save.gdtr = vmcb->save.gdtr;
|
||||
nested_vmcb->save.idtr = vmcb->save.idtr;
|
||||
nested_vmcb->save.efer = svm->vcpu.arch.efer;
|
||||
nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
|
||||
nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
|
||||
nested_vmcb->save.cr2 = vmcb->save.cr2;
|
||||
nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
|
||||
nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
|
||||
nested_vmcb->save.rip = kvm_rip_read(&svm->vcpu);
|
||||
nested_vmcb->save.rsp = kvm_rsp_read(&svm->vcpu);
|
||||
nested_vmcb->save.rax = kvm_rax_read(&svm->vcpu);
|
||||
nested_vmcb->save.dr7 = vmcb->save.dr7;
|
||||
nested_vmcb->save.dr6 = svm->vcpu.arch.dr6;
|
||||
nested_vmcb->save.cpl = vmcb->save.cpl;
|
||||
vmcb12->save.es = vmcb->save.es;
|
||||
vmcb12->save.cs = vmcb->save.cs;
|
||||
vmcb12->save.ss = vmcb->save.ss;
|
||||
vmcb12->save.ds = vmcb->save.ds;
|
||||
vmcb12->save.gdtr = vmcb->save.gdtr;
|
||||
vmcb12->save.idtr = vmcb->save.idtr;
|
||||
vmcb12->save.efer = svm->vcpu.arch.efer;
|
||||
vmcb12->save.cr0 = kvm_read_cr0(&svm->vcpu);
|
||||
vmcb12->save.cr3 = kvm_read_cr3(&svm->vcpu);
|
||||
vmcb12->save.cr2 = vmcb->save.cr2;
|
||||
vmcb12->save.cr4 = svm->vcpu.arch.cr4;
|
||||
vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu);
|
||||
vmcb12->save.rip = kvm_rip_read(&svm->vcpu);
|
||||
vmcb12->save.rsp = kvm_rsp_read(&svm->vcpu);
|
||||
vmcb12->save.rax = kvm_rax_read(&svm->vcpu);
|
||||
vmcb12->save.dr7 = vmcb->save.dr7;
|
||||
vmcb12->save.dr6 = svm->vcpu.arch.dr6;
|
||||
vmcb12->save.cpl = vmcb->save.cpl;
|
||||
|
||||
nested_vmcb->control.int_state = vmcb->control.int_state;
|
||||
nested_vmcb->control.exit_code = vmcb->control.exit_code;
|
||||
nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi;
|
||||
nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1;
|
||||
nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
|
||||
vmcb12->control.int_state = vmcb->control.int_state;
|
||||
vmcb12->control.exit_code = vmcb->control.exit_code;
|
||||
vmcb12->control.exit_code_hi = vmcb->control.exit_code_hi;
|
||||
vmcb12->control.exit_info_1 = vmcb->control.exit_info_1;
|
||||
vmcb12->control.exit_info_2 = vmcb->control.exit_info_2;
|
||||
|
||||
if (nested_vmcb->control.exit_code != SVM_EXIT_ERR)
|
||||
nested_vmcb_save_pending_event(svm, nested_vmcb);
|
||||
if (vmcb12->control.exit_code != SVM_EXIT_ERR)
|
||||
nested_vmcb_save_pending_event(svm, vmcb12);
|
||||
|
||||
if (svm->nrips_enabled)
|
||||
nested_vmcb->control.next_rip = vmcb->control.next_rip;
|
||||
vmcb12->control.next_rip = vmcb->control.next_rip;
|
||||
|
||||
nested_vmcb->control.int_ctl = svm->nested.ctl.int_ctl;
|
||||
nested_vmcb->control.tlb_ctl = svm->nested.ctl.tlb_ctl;
|
||||
nested_vmcb->control.event_inj = svm->nested.ctl.event_inj;
|
||||
nested_vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err;
|
||||
vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
|
||||
vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl;
|
||||
vmcb12->control.event_inj = svm->nested.ctl.event_inj;
|
||||
vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
|
||||
|
||||
nested_vmcb->control.pause_filter_count =
|
||||
vmcb12->control.pause_filter_count =
|
||||
svm->vmcb->control.pause_filter_count;
|
||||
nested_vmcb->control.pause_filter_thresh =
|
||||
vmcb12->control.pause_filter_thresh =
|
||||
svm->vmcb->control.pause_filter_thresh;
|
||||
|
||||
/* Restore the original control entries */
|
||||
@@ -659,11 +672,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
|
||||
|
||||
vmcb_mark_all_dirty(svm->vmcb);
|
||||
|
||||
trace_kvm_nested_vmexit_inject(nested_vmcb->control.exit_code,
|
||||
nested_vmcb->control.exit_info_1,
|
||||
nested_vmcb->control.exit_info_2,
|
||||
nested_vmcb->control.exit_int_info,
|
||||
nested_vmcb->control.exit_int_info_err,
|
||||
trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
|
||||
vmcb12->control.exit_info_1,
|
||||
vmcb12->control.exit_info_2,
|
||||
vmcb12->control.exit_int_info,
|
||||
vmcb12->control.exit_int_info_err,
|
||||
KVM_ISA_SVM);
|
||||
|
||||
kvm_vcpu_unmap(&svm->vcpu, &map, true);
|
||||
@@ -688,6 +701,45 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
|
||||
return 0;
|
||||
}
|
||||
|
||||
int svm_allocate_nested(struct vcpu_svm *svm)
|
||||
{
|
||||
struct page *hsave_page;
|
||||
|
||||
if (svm->nested.initialized)
|
||||
return 0;
|
||||
|
||||
hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
|
||||
if (!hsave_page)
|
||||
return -ENOMEM;
|
||||
svm->nested.hsave = page_address(hsave_page);
|
||||
|
||||
svm->nested.msrpm = svm_vcpu_alloc_msrpm();
|
||||
if (!svm->nested.msrpm)
|
||||
goto err_free_hsave;
|
||||
svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
|
||||
|
||||
svm->nested.initialized = true;
|
||||
return 0;
|
||||
|
||||
err_free_hsave:
|
||||
__free_page(hsave_page);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
void svm_free_nested(struct vcpu_svm *svm)
|
||||
{
|
||||
if (!svm->nested.initialized)
|
||||
return;
|
||||
|
||||
svm_vcpu_free_msrpm(svm->nested.msrpm);
|
||||
svm->nested.msrpm = NULL;
|
||||
|
||||
__free_page(virt_to_page(svm->nested.hsave));
|
||||
svm->nested.hsave = NULL;
|
||||
|
||||
svm->nested.initialized = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Forcibly leave nested mode in order to be able to reset the VCPU later on.
|
||||
*/
|
||||
@@ -702,6 +754,8 @@ void svm_leave_nested(struct vcpu_svm *svm)
|
||||
copy_vmcb_control_area(&vmcb->control, &hsave->control);
|
||||
nested_svm_uninit_mmu_context(&svm->vcpu);
|
||||
}
|
||||
|
||||
kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
|
||||
}
|
||||
|
||||
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
|
||||
@@ -709,7 +763,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
|
||||
u32 offset, msr, value;
|
||||
int write, mask;
|
||||
|
||||
if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT)))
|
||||
if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
|
||||
return NESTED_EXIT_HOST;
|
||||
|
||||
msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
|
||||
@@ -736,7 +790,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
|
||||
u8 start_bit;
|
||||
u64 gpa;
|
||||
|
||||
if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
|
||||
if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
|
||||
return NESTED_EXIT_HOST;
|
||||
|
||||
port = svm->vmcb->control.exit_info_1 >> 16;
|
||||
@@ -767,14 +821,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
|
||||
vmexit = nested_svm_intercept_ioio(svm);
|
||||
break;
|
||||
case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
|
||||
u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
|
||||
if (svm->nested.ctl.intercept_cr & bit)
|
||||
if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
|
||||
vmexit = NESTED_EXIT_DONE;
|
||||
break;
|
||||
}
|
||||
case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
|
||||
u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
|
||||
if (svm->nested.ctl.intercept_dr & bit)
|
||||
if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
|
||||
vmexit = NESTED_EXIT_DONE;
|
||||
break;
|
||||
}
|
||||
@@ -792,8 +844,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
|
||||
if (svm->nested.ctl.intercept & exit_bits)
|
||||
if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
|
||||
vmexit = NESTED_EXIT_DONE;
|
||||
}
|
||||
}
|
||||
@ -833,7 +884,7 @@ static bool nested_exit_on_exception(struct vcpu_svm *svm)
|
||||
{
|
||||
unsigned int nr = svm->vcpu.arch.exception.nr;
|
||||
|
||||
return (svm->nested.ctl.intercept_exceptions & (1 << nr));
|
||||
return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(nr));
|
||||
}
|
||||
|
||||
static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
|
||||
@ -901,7 +952,7 @@ static void nested_svm_intr(struct vcpu_svm *svm)
|
||||
|
||||
static inline bool nested_exit_on_init(struct vcpu_svm *svm)
|
||||
{
|
||||
return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INIT));
|
||||
return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
|
||||
}
|
||||
|
||||
static void nested_svm_init(struct vcpu_svm *svm)
|
||||
@ -982,7 +1033,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
|
||||
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
|
||||
u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
|
||||
|
||||
if (get_host_vmcb(svm)->control.intercept_exceptions & excp_bits)
|
||||
if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] &
|
||||
excp_bits)
|
||||
return NESTED_EXIT_HOST;
|
||||
else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
|
||||
svm->vcpu.arch.apf.host_apf_flags)
|
||||
@ -1020,7 +1072,7 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
|
||||
|
||||
/* First fill in the header and copy it out. */
|
||||
if (is_guest_mode(vcpu)) {
|
||||
kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb;
|
||||
kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa;
|
||||
kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
|
||||
kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
|
||||
|
||||
@ -1094,7 +1146,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
|
||||
|
||||
if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
|
||||
svm_leave_nested(svm);
|
||||
goto out_set_gif;
|
||||
svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa))
|
||||
@ -1143,16 +1196,11 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
|
||||
copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
|
||||
hsave->save = *save;
|
||||
|
||||
svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
|
||||
svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
|
||||
load_nested_vmcb_control(svm, ctl);
|
||||
nested_prepare_vmcb_control(svm);
|
||||
|
||||
if (!nested_svm_vmrun_msrpm(svm))
|
||||
return -EINVAL;
|
||||
|
||||
out_set_gif:
|
||||
svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
|
||||
|
||||
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
|
||||
ret = 0;
|
||||
out_free:
|
||||
kfree(save);
|
||||
@ -1163,6 +1211,7 @@ out_free:
|
||||
|
||||
struct kvm_x86_nested_ops svm_nested_ops = {
|
||||
.check_events = svm_check_nested_events,
|
||||
.get_nested_state_pages = svm_get_nested_state_pages,
|
||||
.get_state = svm_get_nested_state,
|
||||
.set_state = svm_set_nested_state,
|
||||
};
|
||||
|
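The new svm_allocate_nested()/svm_free_nested() pair above moves the nested state (hsave page and nested MSR permission map) to on-demand allocation instead of allocating it at vCPU creation. The shape of that lifecycle, reduced to a standalone userspace sketch (illustrative names and sizes only, not kernel code):

    #include <stdbool.h>
    #include <stdlib.h>
    #include <stdio.h>

    struct nested_state {
        void *hsave;          /* stand-in for the host-save area page */
        unsigned char *msrpm; /* stand-in for the nested MSR permission map */
        bool initialized;
    };

    /* Idempotent: safe to call every time the guest turns nesting on. */
    static int nested_alloc(struct nested_state *n)
    {
        if (n->initialized)
            return 0;

        n->hsave = calloc(1, 4096);
        if (!n->hsave)
            return -1;

        n->msrpm = calloc(2 * 4096, 1);
        if (!n->msrpm) {
            free(n->hsave);
            n->hsave = NULL;
            return -1;
        }

        n->initialized = true;
        return 0;
    }

    /* Called when the guest turns nesting off again. */
    static void nested_free(struct nested_state *n)
    {
        if (!n->initialized)
            return;
        free(n->msrpm);
        free(n->hsave);
        n->msrpm = NULL;
        n->hsave = NULL;
        n->initialized = false;
    }

    int main(void)
    {
        struct nested_state n = { 0 };
        printf("alloc: %d\n", nested_alloc(&n));
        printf("alloc again (no-op): %d\n", nested_alloc(&n));
        nested_free(&n);
        return 0;
    }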
@@ -447,10 +447,8 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
}

/*
* The LAUNCH_UPDATE command will perform in-place encryption of the
* memory content (i.e it will write the same memory region with C=1).
* It's possible that the cache may contain the data with C=0, i.e.,
* unencrypted so invalidate it first.
* Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
* place; the cache may contain the data that was written unencrypted.
*/
sev_clflush_pages(inpages, npages);

@@ -806,10 +804,9 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
}

/*
* The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the
* memory content (i.e it will write the same memory region with C=1).
* It's possible that the cache may contain the data with C=0, i.e.,
* unencrypted so invalidate it first.
* Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify
* the pages; flush the destination too so that future accesses do not
* see stale data.
*/
sev_clflush_pages(src_p, 1);
sev_clflush_pages(dst_p, 1);
@@ -857,7 +854,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
struct kvm_sev_launch_secret params;
struct page **pages;
void *blob, *hdr;
unsigned long n;
unsigned long n, i;
int ret, offset;

if (!sev_guest(kvm))
@@ -870,6 +867,12 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (IS_ERR(pages))
return PTR_ERR(pages);

/*
* Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in
* place; the cache may contain the data that was written unencrypted.
*/
sev_clflush_pages(pages, n);

/*
* The secret must be copied into contiguous memory region, lets verify
* that userspace memory pages are contiguous before we issue command.
@@ -915,6 +918,11 @@ e_free_blob:
e_free:
kfree(data);
e_unpin_memory:
/* content of memory is updated, mark pages dirty */
for (i = 0; i < n; i++) {
set_page_dirty_lock(pages[i]);
mark_page_accessed(pages[i]);
}
sev_unpin_memory(kvm, pages, n);
return ret;
}
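The reworded comments above all state the same rule: on CPUs whose caches are not coherent with respect to the SEV encryption bit, flush the cache lines of a buffer before the firmware rewrites it in place with C=1, so no stale unencrypted lines linger. A minimal userspace illustration of flushing every cache line of a buffer with the x86 CLFLUSH intrinsic (assuming a 64-byte line size; this is not the kernel helper itself):

    #include <immintrin.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define CACHE_LINE 64 /* assumed line size; the kernel queries this at boot */

    /* Flush every cache line covering buf[0..len) back to memory. */
    static void flush_buffer(void *buf, size_t len)
    {
        uintptr_t p = (uintptr_t)buf & ~(uintptr_t)(CACHE_LINE - 1);
        uintptr_t end = (uintptr_t)buf + len;

        for (; p < end; p += CACHE_LINE)
            _mm_clflush((void *)p);
        _mm_mfence(); /* order the flushes before the in-place rewrite */
    }

    int main(void)
    {
        void *page = aligned_alloc(4096, 4096);
        if (!page)
            return 1;
        flush_buffer(page, 4096);
        free(page);
        return 0;
    }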
@@ -91,7 +91,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio);
static const struct svm_direct_access_msrs {
u32 index; /* Index of the MSR */
bool always; /* True if intercept is always on */
} direct_access_msrs[] = {
} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
{ .index = MSR_STAR, .always = true },
{ .index = MSR_IA32_SYSENTER_CS, .always = true },
#ifdef CONFIG_X86_64
@@ -263,9 +263,10 @@ static int get_max_npt_level(void)
#endif
}

void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 old_efer = vcpu->arch.efer;
vcpu->arch.efer = efer;

if (!npt_enabled) {
@@ -276,13 +277,32 @@ void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
efer &= ~EFER_LME;
}

if (!(efer & EFER_SVME)) {
svm_leave_nested(svm);
svm_set_gif(svm, true);
if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
if (!(efer & EFER_SVME)) {
svm_leave_nested(svm);
svm_set_gif(svm, true);

/*
* Free the nested guest state, unless we are in SMM.
* In this case we will return to the nested guest
* as soon as we leave SMM.
*/
if (!is_smm(&svm->vcpu))
svm_free_nested(svm);

} else {
int ret = svm_allocate_nested(svm);

if (ret) {
vcpu->arch.efer = old_efer;
return ret;
}
}
}

svm->vmcb->save.efer = efer | EFER_SVME;
vmcb_mark_dirty(svm->vmcb, VMCB_CR);
return 0;
}

static int is_external_interrupt(u32 info)
@@ -553,18 +573,44 @@ free_cpu_data:

}

static bool valid_msr_intercept(u32 index)
static int direct_access_msr_slot(u32 msr)
{
int i;
u32 i;

for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
if (direct_access_msrs[i].index == index)
return true;
if (direct_access_msrs[i].index == msr)
return i;

return false;
return -ENOENT;
}

static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
int write)
{
struct vcpu_svm *svm = to_svm(vcpu);
int slot = direct_access_msr_slot(msr);

if (slot == -ENOENT)
return;

/* Set the shadow bitmaps to the desired intercept states */
if (read)
set_bit(slot, svm->shadow_msr_intercept.read);
else
clear_bit(slot, svm->shadow_msr_intercept.read);

if (write)
set_bit(slot, svm->shadow_msr_intercept.write);
else
clear_bit(slot, svm->shadow_msr_intercept.write);
}

static bool valid_msr_intercept(u32 index)
{
return direct_access_msr_slot(index) != -ENOENT;
}

static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
u8 bit_write;
unsigned long tmp;
@@ -583,8 +629,8 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
return !!test_bit(bit_write, &tmp);
}

static void set_msr_interception(u32 *msrpm, unsigned msr,
int read, int write)
static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
u32 msr, int read, int write)
{
u8 bit_read, bit_write;
unsigned long tmp;
@@ -596,6 +642,13 @@ static void set_msr_interception(u32 *msrpm, unsigned msr,
*/
WARN_ON(!valid_msr_intercept(msr));

/* Enforce non allowed MSRs to trap */
if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
read = 0;

if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
write = 0;

offset = svm_msrpm_offset(msr);
bit_read = 2 * (msr & 0x0f);
bit_write = 2 * (msr & 0x0f) + 1;
@@ -609,17 +662,60 @@ static void set_msr_interception(u32 *msrpm, unsigned msr,
msrpm[offset] = tmp;
}

static void svm_vcpu_init_msrpm(u32 *msrpm)
static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
int read, int write)
{
set_shadow_msr_intercept(vcpu, msr, read, write);
set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
}

u32 *svm_vcpu_alloc_msrpm(void)
{
struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
u32 *msrpm;

if (!pages)
return NULL;

msrpm = page_address(pages);
memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));

return msrpm;
}

void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
{
int i;

memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));

for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
if (!direct_access_msrs[i].always)
continue;
set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
}
}

set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);

void svm_vcpu_free_msrpm(u32 *msrpm)
{
__free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER);
}

static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
u32 i;

/*
* Set intercept permissions for all direct access MSRs again. They
* will automatically get filtered through the MSR filter, so we are
* back in sync after this.
*/
for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
u32 msr = direct_access_msrs[i].index;
u32 read = test_bit(i, svm->shadow_msr_intercept.read);
u32 write = test_bit(i, svm->shadow_msr_intercept.write);

set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
}
}

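For reference, the MSR permission map that set_msr_interception_bitmap() edits packs two intercept bits per MSR into 32-bit words: bit 2*(msr & 0x0f) covers reads and the following bit covers writes, at a word offset derived from the MSR number. A standalone sketch of that bit packing (the offset computation is simplified here and the names are illustrative; the real code maps several MSR ranges):

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Two permission bits per MSR inside a u32: an even bit for read
     * intercept, the following odd bit for write intercept, so sixteen
     * MSRs share one 32-bit word.  A set bit means "intercept".
     */
    static void set_msr_bits(uint32_t *msrpm, uint32_t msr, int read, int write)
    {
        uint32_t offset = msr / 16;  /* simplified stand-in for the offset lookup */
        uint8_t bit_read = 2 * (msr & 0x0f);
        uint8_t bit_write = 2 * (msr & 0x0f) + 1;
        uint32_t tmp = msrpm[offset];

        /* read/write == 1 requests pass-through, i.e. clears the intercept bit */
        if (read)
            tmp &= ~(1u << bit_read);
        else
            tmp |= (1u << bit_read);

        if (write)
            tmp &= ~(1u << bit_write);
        else
            tmp |= (1u << bit_write);

        msrpm[offset] = tmp;
    }

    int main(void)
    {
        uint32_t msrpm[2] = { 0xffffffffu, 0xffffffffu }; /* start fully intercepted */

        set_msr_bits(msrpm, 0x11, 1, 0); /* pass through reads of MSR 0x11 only */
        printf("word1 = %08x\n", msrpm[1]);
        return 0;
    }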
@@ -666,26 +762,26 @@ static void init_msrpm_offsets(void)
}
}

static void svm_enable_lbrv(struct vcpu_svm *svm)
static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
u32 *msrpm = svm->msrpm;
struct vcpu_svm *svm = to_svm(vcpu);

svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
}

static void svm_disable_lbrv(struct vcpu_svm *svm)
static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
{
u32 *msrpm = svm->msrpm;
struct vcpu_svm *svm = to_svm(vcpu);

svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}

void disable_nmi_singlestep(struct vcpu_svm *svm)
@@ -813,6 +909,9 @@ static __init void svm_set_cpu_caps(void)
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);

/* Enable INVPCID feature */
kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
}

static __init int svm_hardware_setup(void)
@@ -985,6 +1084,21 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
return svm->vmcb->control.tsc_offset;
}

static void svm_check_invpcid(struct vcpu_svm *svm)
{
/*
* Intercept INVPCID instruction only if shadow page table is
* enabled. Interception is not required with nested page table
* enabled.
*/
if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
if (!npt_enabled)
svm_set_intercept(svm, INTERCEPT_INVPCID);
else
svm_clr_intercept(svm, INTERCEPT_INVPCID);
}
}

static void init_vmcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
@@ -992,14 +1106,14 @@ static void init_vmcb(struct vcpu_svm *svm)

svm->vcpu.arch.hflags = 0;

set_cr_intercept(svm, INTERCEPT_CR0_READ);
set_cr_intercept(svm, INTERCEPT_CR3_READ);
set_cr_intercept(svm, INTERCEPT_CR4_READ);
set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
svm_set_intercept(svm, INTERCEPT_CR0_READ);
svm_set_intercept(svm, INTERCEPT_CR3_READ);
svm_set_intercept(svm, INTERCEPT_CR4_READ);
svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
if (!kvm_vcpu_apicv_active(&svm->vcpu))
set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
svm_set_intercept(svm, INTERCEPT_CR8_WRITE);

set_dr_intercepts(svm);

@@ -1094,15 +1208,15 @@ static void init_vmcb(struct vcpu_svm *svm)
control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
svm_clr_intercept(svm, INTERCEPT_INVLPG);
clr_exception_intercept(svm, PF_VECTOR);
clr_cr_intercept(svm, INTERCEPT_CR3_READ);
clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
svm_clr_intercept(svm, INTERCEPT_CR3_READ);
svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
save->g_pat = svm->vcpu.arch.pat;
save->cr3 = 0;
save->cr4 = 0;
}
svm->asid_generation = 0;

svm->nested.vmcb = 0;
svm->nested.vmcb12_gpa = 0;
svm->vcpu.arch.hflags = 0;

if (!kvm_pause_in_guest(svm->vcpu.kvm)) {
@@ -1114,6 +1228,8 @@ static void init_vmcb(struct vcpu_svm *svm)
svm_clr_intercept(svm, INTERCEPT_PAUSE);
}

svm_check_invpcid(svm);

if (kvm_vcpu_apicv_active(&svm->vcpu))
avic_init_vmcb(svm);

@@ -1171,35 +1287,20 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
static int svm_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
struct page *page;
struct page *msrpm_pages;
struct page *hsave_page;
struct page *nested_msrpm_pages;
struct page *vmcb_page;
int err;

BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
svm = to_svm(vcpu);

err = -ENOMEM;
page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!page)
vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!vmcb_page)
goto out;

msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
if (!msrpm_pages)
goto free_page1;

nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
if (!nested_msrpm_pages)
goto free_page2;

hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!hsave_page)
goto free_page3;

err = avic_init_vcpu(svm);
if (err)
goto free_page4;
goto error_free_vmcb_page;

/* We initialize this flag to true to make sure that the is_running
* bit would be set the first time the vcpu is loaded.
@@ -1207,18 +1308,14 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
svm->avic_is_running = true;

svm->nested.hsave = page_address(hsave_page);
clear_page(svm->nested.hsave);
svm->msrpm = svm_vcpu_alloc_msrpm();
if (!svm->msrpm)
goto error_free_vmcb_page;

svm->msrpm = page_address(msrpm_pages);
svm_vcpu_init_msrpm(svm->msrpm);
svm_vcpu_init_msrpm(vcpu, svm->msrpm);

svm->nested.msrpm = page_address(nested_msrpm_pages);
svm_vcpu_init_msrpm(svm->nested.msrpm);

svm->vmcb = page_address(page);
clear_page(svm->vmcb);
svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
svm->vmcb = page_address(vmcb_page);
svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT);
svm->asid_generation = 0;
init_vmcb(svm);

@@ -1227,14 +1324,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)

return 0;

free_page4:
__free_page(hsave_page);
free_page3:
__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
free_page2:
__free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
free_page1:
__free_page(page);
error_free_vmcb_page:
__free_page(vmcb_page);
out:
return err;
}
@@ -1258,10 +1349,10 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
*/
svm_clear_current_vmcb(svm->vmcb);

svm_free_nested(svm);

__free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
__free_page(virt_to_page(svm->nested.hsave));
__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
}

static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1549,11 +1640,11 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
vmcb_mark_dirty(svm->vmcb, VMCB_CR);

if (gcr0 == *hcr0) {
clr_cr_intercept(svm, INTERCEPT_CR0_READ);
clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
svm_clr_intercept(svm, INTERCEPT_CR0_READ);
svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
} else {
set_cr_intercept(svm, INTERCEPT_CR0_READ);
set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
svm_set_intercept(svm, INTERCEPT_CR0_READ);
svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
}
}

@@ -2224,12 +2315,9 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
{
unsigned long cr0 = svm->vcpu.arch.cr0;
bool ret = false;
u64 intercept;

intercept = svm->nested.ctl.intercept;

if (!is_guest_mode(&svm->vcpu) ||
(!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
(!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
return false;

cr0 &= ~SVM_CR0_SELECTIVE_MASK;
@@ -2267,6 +2355,7 @@ static int cr_interception(struct vcpu_svm *svm)
if (cr >= 16) { /* mov to cr */
cr -= 16;
val = kvm_register_read(&svm->vcpu, reg);
trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
if (!check_selective_cr0_intercepted(svm, val))
@@ -2312,6 +2401,7 @@ static int cr_interception(struct vcpu_svm *svm)
return 1;
}
kvm_register_write(&svm->vcpu, reg, val);
trace_kvm_cr_read(cr, val);
}
return kvm_complete_insn_gp(&svm->vcpu, err);
}
@@ -2562,7 +2652,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
* We update the L1 MSR bit as well since it will end up
* touching the MSR anyway now.
*/
set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
break;
case MSR_IA32_PRED_CMD:
if (!msr->host_initiated &&
@@ -2577,7 +2667,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;

wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
break;
case MSR_AMD64_VIRT_SPEC_CTRL:
if (!msr->host_initiated &&
@@ -2641,9 +2731,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->vmcb->save.dbgctl = data;
vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
if (data & (1ULL<<0))
svm_enable_lbrv(svm);
svm_enable_lbrv(vcpu);
else
svm_disable_lbrv(svm);
svm_disable_lbrv(vcpu);
break;
case MSR_VM_HSAVE_PA:
svm->nested.hsave_msr = data;
@@ -2739,6 +2829,33 @@ static int mwait_interception(struct vcpu_svm *svm)
return nop_interception(svm);
}

static int invpcid_interception(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
unsigned long type;
gva_t gva;

if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}

/*
* For an INVPCID intercept:
* EXITINFO1 provides the linear address of the memory operand.
* EXITINFO2 provides the contents of the register operand.
*/
type = svm->vmcb->control.exit_info_2;
gva = svm->vmcb->control.exit_info_1;

if (type > 3) {
kvm_inject_gp(vcpu, 0);
return 1;
}

return kvm_handle_invpcid(vcpu, type, gva);
}

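The new invpcid_interception() above decodes everything it needs from the two exit-info fields: EXITINFO1 carries the linear address of the memory operand and EXITINFO2 the register operand, which is the INVPCID type. A small standalone sketch of that decode convention (illustrative only, outside the kernel):

    #include <stdint.h>
    #include <stdio.h>

    struct exit_info {
        uint64_t exit_info_1; /* linear address of the descriptor operand */
        uint64_t exit_info_2; /* contents of the register operand: the type */
    };

    /* Returns 0 on success, -1 for an invalid INVPCID type (would inject #GP). */
    static int decode_invpcid(const struct exit_info *e,
                              unsigned long *type, uint64_t *gva)
    {
        *type = (unsigned long)e->exit_info_2;
        *gva = e->exit_info_1;

        return *type > 3 ? -1 : 0;
    }

    int main(void)
    {
        struct exit_info e = { .exit_info_1 = 0x7f0000001000ULL, .exit_info_2 = 1 };
        unsigned long type;
        uint64_t gva;

        if (!decode_invpcid(&e, &type, &gva))
            printf("type %lu gva 0x%llx\n", type, (unsigned long long)gva);
        return 0;
    }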
static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
@@ -2801,6 +2918,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_MWAIT] = mwait_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception,
[SVM_EXIT_RDPRU] = rdpru_interception,
[SVM_EXIT_INVPCID] = invpcid_interception,
[SVM_EXIT_NPF] = npf_interception,
[SVM_EXIT_RSM] = rsm_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
@@ -2819,12 +2937,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
}

pr_err("VMCB Control Area:\n");
pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
pr_err("%-20s%08x %08x\n", "intercepts:",
control->intercepts[INTERCEPT_WORD3],
control->intercepts[INTERCEPT_WORD4]);
pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
pr_err("%-20s%d\n", "pause filter threshold:",
control->pause_filter_thresh);
@@ -2923,12 +3043,19 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
"excp_to:", save->last_excp_to);
}

static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
u32 *intr_info, u32 *error_code)
{
struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;

*info1 = control->exit_info_1;
*info2 = control->exit_info_2;
*intr_info = control->exit_int_info;
if ((*intr_info & SVM_EXITINTINFO_VALID) &&
(*intr_info & SVM_EXITINTINFO_VALID_ERR))
*error_code = control->exit_int_info_err;
else
*error_code = 0;
}

static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
@@ -2939,7 +3066,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)

trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);

if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
vcpu->arch.cr0 = svm->vmcb->save.cr0;
if (npt_enabled)
vcpu->arch.cr3 = svm->vmcb->save.cr3;
@@ -2947,12 +3074,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
if (is_guest_mode(vcpu)) {
int vmexit;

trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
svm->vmcb->control.exit_info_1,
svm->vmcb->control.exit_info_2,
svm->vmcb->control.exit_int_info,
svm->vmcb->control.exit_int_info_err,
KVM_ISA_SVM);
trace_kvm_nested_vmexit(exit_code, vcpu, KVM_ISA_SVM);

vmexit = nested_svm_exit_special(svm);

@@ -3062,13 +3184,13 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
if (nested_svm_virtualize_tpr(vcpu))
return;

clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);

if (irr == -1)
return;

if (tpr >= irr)
set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
}

bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
@@ -3256,7 +3378,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
if (nested_svm_virtualize_tpr(vcpu))
return;

if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
kvm_set_cr8(vcpu, cr8);
}
@@ -3353,8 +3475,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)

static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
if (!is_guest_mode(vcpu) &&
to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
to_svm(vcpu)->vmcb->control.exit_info_1)
return handle_fastpath_set_msr_irqoff(vcpu);

@@ -3419,7 +3540,6 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,

static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
{
fastpath_t exit_fastpath;
struct vcpu_svm *svm = to_svm(vcpu);

svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
@@ -3460,9 +3580,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
clgi();
kvm_load_guest_xsave_state(vcpu);

if (lapic_in_kernel(vcpu) &&
vcpu->arch.apic->lapic_timer.timer_advance_ns)
kvm_wait_lapic_expire(vcpu);
kvm_wait_lapic_expire(vcpu);

/*
* If this vCPU has touched SPEC_CTRL, restore the guest's value if
@@ -3542,8 +3660,11 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
svm_handle_mce(svm);

svm_complete_interrupts(svm);
exit_fastpath = svm_exit_handlers_fastpath(vcpu);
return exit_fastpath;

if (is_guest_mode(vcpu))
return EXIT_FASTPATH_NONE;

return svm_exit_handlers_fastpath(vcpu);
}

static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root,
@@ -3629,6 +3750,9 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);

/* Check again if INVPCID interception if required */
svm_check_invpcid(svm);

if (!kvm_vcpu_apicv_active(vcpu))
return;

@@ -3743,7 +3867,6 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
break;
case SVM_EXIT_WRITE_CR0: {
unsigned long cr0, val;
u64 intercept;

if (info->intercept == x86_intercept_cr_write)
icpt_info.exit_code += info->modrm_reg;
@@ -3752,9 +3875,8 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
info->intercept == x86_intercept_clts)
break;

intercept = svm->nested.ctl.intercept;

if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
if (!(vmcb_is_intercept(&svm->nested.ctl,
INTERCEPT_SELECTIVE_CR0)))
break;

cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
@@ -3889,7 +4011,7 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
/* FED8h - SVM Guest */
put_smstate(u64, smstate, 0x7ed8, 1);
/* FEE0h - SVM Guest VMCB Physical Address */
put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);
put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);

svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -3911,7 +4033,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8);
u64 vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);
u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);

if (guest) {
if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
@@ -3921,10 +4043,13 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
return 1;

if (kvm_vcpu_map(&svm->vcpu,
gpa_to_gfn(vmcb), &map) == -EINVAL)
gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
return 1;

ret = enter_svm_guest_mode(svm, vmcb, map.hva);
if (svm_allocate_nested(svm))
return 1;

ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva);
kvm_vcpu_unmap(&svm->vcpu, &map, true);
}
}
@@ -3945,19 +4070,10 @@ static void enable_smi_window(struct kvm_vcpu *vcpu)
}
}

static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
{
unsigned long cr4 = kvm_read_cr4(vcpu);
bool smep = cr4 & X86_CR4_SMEP;
bool smap = cr4 & X86_CR4_SMAP;
bool is_user = svm_get_cpl(vcpu) == 3;

/*
* If RIP is invalid, go ahead with emulation which will cause an
* internal error exit.
*/
if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
return true;
bool smep, smap, is_user;
unsigned long cr4;

/*
* Detect and workaround Errata 1096 Fam_17h_00_0Fh.
@@ -3999,6 +4115,20 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
* instruction pointer so we will not able to workaround it. Lets
* print the error and request to kill the guest.
*/
if (likely(!insn || insn_len))
return true;

/*
* If RIP is invalid, go ahead with emulation which will cause an
* internal error exit.
*/
if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
return true;

cr4 = kvm_read_cr4(vcpu);
smep = cr4 & X86_CR4_SMEP;
smap = cr4 & X86_CR4_SMAP;
is_user = svm_get_cpl(vcpu) == 3;
if (smap && (!smep || is_user)) {
if (!sev_guest(vcpu->kvm))
return true;
@@ -4022,7 +4152,7 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
* if an INIT signal is pending.
*/
return !gif_set(svm) ||
(svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
(vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT));
}

static void svm_vm_destroy(struct kvm *kvm)
@@ -4160,9 +4290,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.mem_enc_reg_region = svm_register_enc_region,
.mem_enc_unreg_region = svm_unregister_enc_region,

.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
.can_emulate_instruction = svm_can_emulate_instruction,

.apic_init_signal_blocked = svm_apic_init_signal_blocked,

.msr_filter_changed = svm_msr_filter_changed,
};

static struct kvm_x86_init_ops svm_init_ops __initdata = {
@@ -31,6 +31,7 @@ static const u32 host_save_user_msrs[] = {

#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)

#define MAX_DIRECT_ACCESS_MSRS 15
#define MSRPM_OFFSETS 16
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
@@ -85,8 +86,7 @@ struct svm_nested_state {
struct vmcb *hsave;
u64 hsave_msr;
u64 vm_cr_msr;
u64 vmcb;
u32 host_intercept_exceptions;
u64 vmcb12_gpa;

/* These are the merged vectors */
u32 *msrpm;
@@ -97,6 +97,8 @@ struct svm_nested_state {

/* cache for control fields of the guest */
struct vmcb_control_area ctl;

bool initialized;
};

struct vcpu_svm {
@@ -158,6 +160,12 @@ struct vcpu_svm {
*/
struct list_head ir_list;
spinlock_t ir_list_lock;

/* Save desired MSR intercept (read: pass-through) state */
struct {
DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS);
DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS);
} shadow_msr_intercept;
};

struct svm_cpu_data {
@@ -214,51 +222,44 @@ static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
return svm->vmcb;
}

static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);

vmcb->control.intercept_cr |= (1U << bit);

recalc_intercepts(svm);
WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
__set_bit(bit, (unsigned long *)&control->intercepts);
}

static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);

vmcb->control.intercept_cr &= ~(1U << bit);

recalc_intercepts(svm);
WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
__clear_bit(bit, (unsigned long *)&control->intercepts);
}

static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);

return vmcb->control.intercept_cr & (1U << bit);
WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
return test_bit(bit, (unsigned long *)&control->intercepts);
}

static inline void set_dr_intercepts(struct vcpu_svm *svm)
{
struct vmcb *vmcb = get_host_vmcb(svm);

vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
| (1 << INTERCEPT_DR1_READ)
| (1 << INTERCEPT_DR2_READ)
| (1 << INTERCEPT_DR3_READ)
| (1 << INTERCEPT_DR4_READ)
| (1 << INTERCEPT_DR5_READ)
| (1 << INTERCEPT_DR6_READ)
| (1 << INTERCEPT_DR7_READ)
| (1 << INTERCEPT_DR0_WRITE)
| (1 << INTERCEPT_DR1_WRITE)
| (1 << INTERCEPT_DR2_WRITE)
| (1 << INTERCEPT_DR3_WRITE)
| (1 << INTERCEPT_DR4_WRITE)
| (1 << INTERCEPT_DR5_WRITE)
| (1 << INTERCEPT_DR6_WRITE)
| (1 << INTERCEPT_DR7_WRITE);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);

recalc_intercepts(svm);
}
@@ -267,25 +268,27 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm)
{
struct vmcb *vmcb = get_host_vmcb(svm);

vmcb->control.intercept_dr = 0;
vmcb->control.intercepts[INTERCEPT_DR] = 0;

recalc_intercepts(svm);
}

static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);

vmcb->control.intercept_exceptions |= (1U << bit);
WARN_ON_ONCE(bit >= 32);
vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);

recalc_intercepts(svm);
}

static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);

vmcb->control.intercept_exceptions &= ~(1U << bit);
WARN_ON_ONCE(bit >= 32);
vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);

recalc_intercepts(svm);
}
@@ -294,7 +297,7 @@ static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);

vmcb->control.intercept |= (1ULL << bit);
vmcb_set_intercept(&vmcb->control, bit);

recalc_intercepts(svm);
}
@@ -303,14 +306,14 @@ static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);

vmcb->control.intercept &= ~(1ULL << bit);
vmcb_clr_intercept(&vmcb->control, bit);

recalc_intercepts(svm);
}

static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
{
return (svm->vmcb->control.intercept & (1ULL << bit)) != 0;
return vmcb_is_intercept(&svm->vmcb->control, bit);
}

static inline bool vgif_enabled(struct vcpu_svm *svm)
@@ -345,11 +348,15 @@ static inline bool gif_set(struct vcpu_svm *svm)
/* svm.c */
#define MSR_CR3_LEGACY_RESERVED_MASK 0xfe7U
#define MSR_CR3_LEGACY_PAE_RESERVED_MASK 0x7U
#define MSR_CR3_LONG_RESERVED_MASK 0xfff0000000000fe7U
#define MSR_CR3_LONG_MBZ_MASK 0xfff0000000000000U
#define MSR_INVALID 0xffffffffU

u32 svm_msrpm_offset(u32 msr);
void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
u32 *svm_vcpu_alloc_msrpm(void);
void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm);
void svm_vcpu_free_msrpm(u32 *msrpm);

int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void svm_flush_tlb(struct kvm_vcpu *vcpu);
@@ -374,22 +381,24 @@ static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu)

static inline bool nested_exit_on_smi(struct vcpu_svm *svm)
{
return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_SMI));
return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SMI);
}

static inline bool nested_exit_on_intr(struct vcpu_svm *svm)
{
return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INTR));
return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INTR);
}

static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
{
return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_NMI));
return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
}

int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
struct vmcb *nested_vmcb);
void svm_leave_nested(struct vcpu_svm *svm);
void svm_free_nested(struct vcpu_svm *svm);
int svm_allocate_nested(struct vcpu_svm *svm);
int nested_svm_vmrun(struct vcpu_svm *svm);
void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
int nested_svm_vmexit(struct vcpu_svm *svm);
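The vmcb_set_intercept()/vmcb_clr_intercept()/vmcb_is_intercept() helpers above collapse the old per-category intercept fields (intercept_cr, intercept_dr, intercept_exceptions, intercept) into one array of 32-bit words addressed by a single global bit number. The underlying word/bit arithmetic, as a standalone sketch (MAX_WORDS is an assumed stand-in for the kernel's MAX_INTERCEPT):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_WORDS 5 /* assumed number of 32-bit intercept words */

    struct control_area {
        uint32_t intercepts[MAX_WORDS];
    };

    static void set_intercept(struct control_area *c, unsigned int bit)
    {
        c->intercepts[bit / 32] |= 1u << (bit % 32);
    }

    static void clr_intercept(struct control_area *c, unsigned int bit)
    {
        c->intercepts[bit / 32] &= ~(1u << (bit % 32));
    }

    static bool is_intercept(const struct control_area *c, unsigned int bit)
    {
        return c->intercepts[bit / 32] & (1u << (bit % 32));
    }

    int main(void)
    {
        struct control_area c = { { 0 } };

        /* e.g. one bit in word 0 and one in word 3, all through the same helpers */
        set_intercept(&c, 16);
        set_intercept(&c, 3 * 32 + 1);
        printf("%d %d\n", is_intercept(&c, 16), is_intercept(&c, 17));
        clr_intercept(&c, 16);
        printf("%d\n", is_intercept(&c, 16));
        return 0;
    }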
@@ -15,18 +15,20 @@
* Tracepoint for guest mode entry.
*/
TRACE_EVENT(kvm_entry,
TP_PROTO(unsigned int vcpu_id),
TP_ARGS(vcpu_id),
TP_PROTO(struct kvm_vcpu *vcpu),
TP_ARGS(vcpu),

TP_STRUCT__entry(
__field( unsigned int, vcpu_id )
__field( unsigned long, rip )
),

TP_fast_assign(
__entry->vcpu_id = vcpu_id;
__entry->vcpu_id = vcpu->vcpu_id;
__entry->rip = kvm_rip_read(vcpu);
),

TP_printk("vcpu %u", __entry->vcpu_id)
TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip)
);

/*
@@ -233,36 +235,45 @@ TRACE_EVENT(kvm_apic,
(isa == KVM_ISA_VMX) ? \
__print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : ""

#define TRACE_EVENT_KVM_EXIT(name) \
TRACE_EVENT(name, \
TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), \
TP_ARGS(exit_reason, vcpu, isa), \
\
TP_STRUCT__entry( \
__field( unsigned int, exit_reason ) \
__field( unsigned long, guest_rip ) \
__field( u32, isa ) \
__field( u64, info1 ) \
__field( u64, info2 ) \
__field( u32, intr_info ) \
__field( u32, error_code ) \
__field( unsigned int, vcpu_id ) \
), \
\
TP_fast_assign( \
__entry->exit_reason = exit_reason; \
__entry->guest_rip = kvm_rip_read(vcpu); \
__entry->isa = isa; \
__entry->vcpu_id = vcpu->vcpu_id; \
kvm_x86_ops.get_exit_info(vcpu, &__entry->info1, \
&__entry->info2, \
&__entry->intr_info, \
&__entry->error_code); \
), \
\
TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " \
"info2 0x%016llx intr_info 0x%08x error_code 0x%08x", \
__entry->vcpu_id, \
kvm_print_exit_reason(__entry->exit_reason, __entry->isa), \
__entry->guest_rip, __entry->info1, __entry->info2, \
__entry->intr_info, __entry->error_code) \
)

/*
* Tracepoint for kvm guest exit:
*/
TRACE_EVENT(kvm_exit,
TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
TP_ARGS(exit_reason, vcpu, isa),

TP_STRUCT__entry(
__field( unsigned int, exit_reason )
__field( unsigned long, guest_rip )
__field( u32, isa )
__field( u64, info1 )
__field( u64, info2 )
__field( unsigned int, vcpu_id )
),

TP_fast_assign(
__entry->exit_reason = exit_reason;
__entry->guest_rip = kvm_rip_read(vcpu);
__entry->isa = isa;
__entry->vcpu_id = vcpu->vcpu_id;
kvm_x86_ops.get_exit_info(vcpu, &__entry->info1,
&__entry->info2);
),

TP_printk("vcpu %u reason %s%s%s rip 0x%lx info %llx %llx",
__entry->vcpu_id,
kvm_print_exit_reason(__entry->exit_reason, __entry->isa),
__entry->guest_rip, __entry->info1, __entry->info2)
);
TRACE_EVENT_KVM_EXIT(kvm_exit);

/*
* Tracepoint for kvm interrupt injection:
@@ -544,63 +555,38 @@ TRACE_EVENT(kvm_nested_vmrun,
);

TRACE_EVENT(kvm_nested_intercepts,
TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
TP_ARGS(cr_read, cr_write, exceptions, intercept),
TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions,
__u32 intercept1, __u32 intercept2, __u32 intercept3),
TP_ARGS(cr_read, cr_write, exceptions, intercept1,
intercept2, intercept3),

TP_STRUCT__entry(
__field( __u16, cr_read )
__field( __u16, cr_write )
__field( __u32, exceptions )
__field( __u64, intercept )
__field( __u32, intercept1 )
__field( __u32, intercept2 )
__field( __u32, intercept3 )
),

TP_fast_assign(
__entry->cr_read = cr_read;
__entry->cr_write = cr_write;
__entry->exceptions = exceptions;
__entry->intercept = intercept;
__entry->intercept1 = intercept1;
__entry->intercept2 = intercept2;
__entry->intercept3 = intercept3;
),

TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
__entry->cr_read, __entry->cr_write, __entry->exceptions,
__entry->intercept)
TP_printk("cr_read: %04x cr_write: %04x excp: %08x "
"intercepts: %08x %08x %08x",
__entry->cr_read, __entry->cr_write, __entry->exceptions,
__entry->intercept1, __entry->intercept2, __entry->intercept3)
);
/*
* Tracepoint for #VMEXIT while nested
*/
TRACE_EVENT(kvm_nested_vmexit,
TP_PROTO(__u64 rip, __u32 exit_code,
__u64 exit_info1, __u64 exit_info2,
__u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
TP_ARGS(rip, exit_code, exit_info1, exit_info2,
exit_int_info, exit_int_info_err, isa),

TP_STRUCT__entry(
__field( __u64, rip )
__field( __u32, exit_code )
__field( __u64, exit_info1 )
__field( __u64, exit_info2 )
__field( __u32, exit_int_info )
__field( __u32, exit_int_info_err )
__field( __u32, isa )
),

TP_fast_assign(
__entry->rip = rip;
__entry->exit_code = exit_code;
__entry->exit_info1 = exit_info1;
__entry->exit_info2 = exit_info2;
__entry->exit_int_info = exit_int_info;
__entry->exit_int_info_err = exit_int_info_err;
__entry->isa = isa;
),
TP_printk("rip: 0x%016llx reason: %s%s%s ext_inf1: 0x%016llx "
"ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
__entry->rip,
kvm_print_exit_reason(__entry->exit_code, __entry->isa),
__entry->exit_info1, __entry->exit_info2,
__entry->exit_int_info, __entry->exit_int_info_err)
);
TRACE_EVENT_KVM_EXIT(kvm_nested_vmexit);

/*
* Tracepoint for #VMEXIT reinjected to the guest
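With the hunks above, kvm_exit and kvm_nested_vmexit share one body through the TRACE_EVENT_KVM_EXIT(name) macro instead of carrying two near-identical TRACE_EVENT definitions. The deduplication pattern itself, reduced to a standalone sketch with plain functions instead of tracepoints (names here are illustrative):

    #include <stdio.h>

    /* One macro stamps out identically-shaped reporting functions per name. */
    #define DEFINE_EXIT_REPORT(name)                                    \
        static void report_##name(unsigned int vcpu_id,                 \
                                  unsigned int exit_reason,             \
                                  unsigned long rip)                    \
        {                                                               \
            printf(#name ": vcpu %u reason %u rip 0x%lx\n",             \
                   vcpu_id, exit_reason, rip);                          \
        }

    DEFINE_EXIT_REPORT(exit)
    DEFINE_EXIT_REPORT(nested_exit)

    int main(void)
    {
        report_exit(0, 1, 0xffffffff81000000UL);
        report_nested_exit(0, 72, 0xffffffff81000000UL);
        return 0;
    }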
@@ -151,7 +151,7 @@ static inline bool vmx_umip_emulated(void)
static inline bool cpu_has_vmx_rdtscp(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_RDTSCP;
SECONDARY_EXEC_ENABLE_RDTSCP;
}

static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
@@ -196,7 +196,7 @@ static inline bool cpu_has_vmx_ple(void)
SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}

static inline bool vmx_rdrand_supported(void)
static inline bool cpu_has_vmx_rdrand(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_RDRAND_EXITING;
@@ -233,7 +233,7 @@ static inline bool cpu_has_vmx_encls_vmexit(void)
SECONDARY_EXEC_ENCLS_EXITING;
}

static inline bool vmx_rdseed_supported(void)
static inline bool cpu_has_vmx_rdseed(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_RDSEED_EXITING;
@@ -244,13 +244,13 @@ static inline bool cpu_has_vmx_pml(void)
return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
}

static inline bool vmx_xsaves_supported(void)
static inline bool cpu_has_vmx_xsaves(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_XSAVES;
}

static inline bool vmx_waitpkg_supported(void)
static inline bool cpu_has_vmx_waitpkg(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
@ -233,6 +233,44 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
|
||||
vmx->nested.hv_evmcs = NULL;
|
||||
}
|
||||
|
||||
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
|
||||
struct loaded_vmcs *prev)
|
||||
{
|
||||
struct vmcs_host_state *dest, *src;
|
||||
|
||||
if (unlikely(!vmx->guest_state_loaded))
|
||||
return;
|
||||
|
||||
src = &prev->host_state;
|
||||
dest = &vmx->loaded_vmcs->host_state;
|
||||
|
||||
vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
|
||||
dest->ldt_sel = src->ldt_sel;
|
||||
#ifdef CONFIG_X86_64
|
||||
dest->ds_sel = src->ds_sel;
|
||||
dest->es_sel = src->es_sel;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
|
||||
{
|
||||
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
||||
struct loaded_vmcs *prev;
|
||||
int cpu;
|
||||
|
||||
if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
|
||||
return;
|
||||
|
||||
cpu = get_cpu();
|
||||
prev = vmx->loaded_vmcs;
|
||||
vmx->loaded_vmcs = vmcs;
|
||||
vmx_vcpu_load_vmcs(vcpu, cpu, prev);
|
||||
vmx_sync_vmcs_host_state(vmx, prev);
|
||||
put_cpu();
|
||||
|
||||
vmx_register_cache_reset(vcpu);
|
||||
}
|
||||
|
||||
/*
|
||||
* Free whatever needs to be freed from vmx->nested when L1 goes down, or
|
||||
* just stops using VMX.
|
||||
@ -241,10 +279,13 @@ static void free_nested(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
||||
|
||||
if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
|
||||
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
|
||||
|
||||
if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
|
||||
return;
|
||||
|
||||
kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
|
||||
kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
|
||||
|
||||
vmx->nested.vmxon = false;
|
||||
vmx->nested.smm.vmxon = false;
|
||||
@ -277,44 +318,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
|
||||
free_loaded_vmcs(&vmx->nested.vmcs02);
|
||||
}
|
||||
|
||||
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
|
||||
struct loaded_vmcs *prev)
|
||||
{
|
||||
struct vmcs_host_state *dest, *src;
|
||||
|
||||
if (unlikely(!vmx->guest_state_loaded))
|
||||
return;
|
||||
|
||||
src = &prev->host_state;
|
||||
dest = &vmx->loaded_vmcs->host_state;
|
||||
|
||||
vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
|
||||
dest->ldt_sel = src->ldt_sel;
|
||||
#ifdef CONFIG_X86_64
|
||||
dest->ds_sel = src->ds_sel;
|
||||
dest->es_sel = src->es_sel;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
|
||||
{
|
||||
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
||||
struct loaded_vmcs *prev;
|
||||
int cpu;
|
||||
|
||||
if (vmx->loaded_vmcs == vmcs)
|
||||
return;
|
||||
|
||||
cpu = get_cpu();
|
||||
prev = vmx->loaded_vmcs;
|
||||
vmx->loaded_vmcs = vmcs;
|
||||
vmx_vcpu_load_vmcs(vcpu, cpu, prev);
|
||||
vmx_sync_vmcs_host_state(vmx, prev);
|
||||
put_cpu();
|
||||
|
||||
vmx_register_cache_reset(vcpu);
|
||||
}
|
||||
|
||||
/*
|
||||
* Ensure that the current vmcs of the logical processor is the
|
||||
* vmcs01 of the vcpu before calling free_nested().
|
||||
@ -323,8 +326,6 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
vcpu_load(vcpu);
|
||||
vmx_leave_nested(vcpu);
|
||||
vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
|
||||
free_nested(vcpu);
|
||||
vcpu_put(vcpu);
|
||||
}
|
||||
|
||||
@ -938,11 +939,11 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
|
||||
* VM-exit in L0, use the more accurate value.
|
||||
*/
|
||||
if (msr_index == MSR_IA32_TSC) {
|
||||
int index = vmx_find_msr_index(&vmx->msr_autostore.guest,
|
||||
MSR_IA32_TSC);
|
||||
int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
|
||||
MSR_IA32_TSC);
|
||||
|
||||
if (index >= 0) {
|
||||
u64 val = vmx->msr_autostore.guest.val[index].value;
|
||||
if (i >= 0) {
|
||||
u64 val = vmx->msr_autostore.guest.val[i].value;
|
||||
|
||||
*data = kvm_read_l1_tsc(vcpu, val);
|
||||
return true;
|
||||
@ -1031,16 +1032,16 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
|
||||
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
||||
struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
|
||||
bool in_vmcs12_store_list;
|
||||
int msr_autostore_index;
|
||||
int msr_autostore_slot;
|
||||
bool in_autostore_list;
|
||||
int last;
|
||||
|
||||
msr_autostore_index = vmx_find_msr_index(autostore, msr_index);
|
||||
in_autostore_list = msr_autostore_index >= 0;
|
||||
msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
|
||||
in_autostore_list = msr_autostore_slot >= 0;
|
||||
in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
|
||||
|
||||
if (in_vmcs12_store_list && !in_autostore_list) {
|
||||
if (autostore->nr == NR_LOADSTORE_MSRS) {
|
||||
if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
|
||||
/*
|
||||
* Emulated VMEntry does not fail here. Instead a less
|
||||
* accurate value will be returned by
|
||||
@ -1057,7 +1058,7 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
|
||||
autostore->val[last].index = msr_index;
|
||||
} else if (!in_vmcs12_store_list && in_autostore_list) {
|
||||
last = --autostore->nr;
|
||||
autostore->val[msr_autostore_index] = autostore->val[last];
|
||||
autostore->val[msr_autostore_slot] = autostore->val[last];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2286,7 +2287,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
		/* Take the following fields only from vmcs12 */
		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
				  SECONDARY_EXEC_ENABLE_INVPCID |
-				  SECONDARY_EXEC_RDTSCP |
+				  SECONDARY_EXEC_ENABLE_RDTSCP |
				  SECONDARY_EXEC_XSAVES |
				  SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
@@ -2314,6 +2315,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
			vmcs_write16(GUEST_INTR_STATUS,
				     vmcs12->guest_intr_status);

+		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
+			exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+
		secondary_exec_controls_set(vmx, exec_control);
	}

@@ -2408,6 +2412,8 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
		vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
		vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
		vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+
+		vmx->segment_cache.bitmask = 0;
	}

	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
@@ -2571,7 +2577,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
	 * which means L1 attempted VMEntry to L2 with invalid state.
	 * Fail the VMEntry.
	 */
-	if (vmx->emulation_required) {
+	if (CC(!vmx_guest_state_valid(vcpu))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}
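Several checks in the hunks below are now wrapped in CC(). In nested.c this macro evaluates a consistency check and records the failing expression so that nested VM-Enter failures can be traced; a rough sketch of that shape (not the kernel's exact definition):

/* Approximation of the consistency-check wrapper used in nested.c. */
#define EXAMPLE_CC(check)					\
({								\
	bool failed = (check);					\
	if (failed)						\
		pr_debug("nested VM-Enter check failed: %s\n", #check);	\
	failed;							\
})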
@@ -3344,8 +3350,10 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
	prepare_vmcs02_early(vmx, vmcs12);

	if (from_vmentry) {
-		if (unlikely(!nested_get_vmcs12_pages(vcpu)))
+		if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
+			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
			return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
+		}

		if (nested_vmx_check_vmentry_hw(vcpu)) {
			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
@@ -3387,7 +3395,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
		 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
		 * have already been set at vmentry time and should not be reset.
		 */
-		kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
	}

/*
@@ -3468,11 +3476,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
	if (evmptrld_status == EVMPTRLD_ERROR) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
-	} else if (evmptrld_status == EVMPTRLD_VMFAIL) {
+	} else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) {
		return nested_vmx_failInvalid(vcpu);
	}

-	if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
+	if (CC(!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull))
		return nested_vmx_failInvalid(vcpu);

	vmcs12 = get_vmcs12(vcpu);
@@ -3483,7 +3491,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
	 * rather than RFLAGS.ZF, and no error number is stored to the
	 * VM-instruction error field.
	 */
-	if (vmcs12->hdr.shadow_vmcs)
+	if (CC(vmcs12->hdr.shadow_vmcs))
		return nested_vmx_failInvalid(vcpu);

	if (vmx->nested.hv_evmcs) {
@@ -3504,10 +3512,10 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
	 * for misconfigurations which will anyway be caught by the processor
	 * when using the merged vmcs02.
	 */
-	if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
+	if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);

-	if (vmcs12->launch_state == launch)
+	if (CC(vmcs12->launch_state == launch))
		return nested_vmx_fail(vcpu,
			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
@@ -3528,6 +3536,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
	if (unlikely(status != NVMX_VMENTRY_SUCCESS))
		goto vmentry_failed;

+	/* Emulate processing of posted interrupts on VM-Enter. */
+	if (nested_cpu_has_posted_intr(vmcs12) &&
+	    kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
+		vmx->nested.pi_pending = true;
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+		kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
+	}
+
	/* Hide L1D cache contents from the nested guest. */
	vmx->vcpu.arch.l1tf_flush_l1d = true;
@@ -4257,7 +4273,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,

static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
{
-	struct shared_msr_entry *efer_msr;
+	struct vmx_uret_msr *efer_msr;
	unsigned int i;

	if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
@@ -4271,7 +4287,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
			return vmx->msr_autoload.guest.val[i].value;
	}

-	efer_msr = find_msr_entry(vmx, MSR_EFER);
+	efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
	if (efer_msr)
		return efer_msr->data;

@@ -4696,7 +4712,7 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,

	r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
	if (r != X86EMUL_CONTINUE) {
-		*ret = vmx_handle_memory_failure(vcpu, r, &e);
+		*ret = kvm_handle_memory_failure(vcpu, r, &e);
		return -EINVAL;
	}

@@ -4760,7 +4776,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)

	if (vmx_pt_mode_is_host_guest()) {
		vmx->pt_desc.guest.ctl = 0;
-		pt_update_intercept_for_msr(vmx);
+		pt_update_intercept_for_msr(vcpu);
	}

	return 0;
@@ -5003,7 +5019,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
		/* _system ok, nested_vmx_check_permission has verified cpl=0 */
		r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
		if (r != X86EMUL_CONTINUE)
-			return vmx_handle_memory_failure(vcpu, r, &e);
+			return kvm_handle_memory_failure(vcpu, r, &e);
	}

	return nested_vmx_succeed(vcpu);
@@ -5076,7 +5092,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
			return 1;
		r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
		if (r != X86EMUL_CONTINUE)
-			return vmx_handle_memory_failure(vcpu, r, &e);
+			return kvm_handle_memory_failure(vcpu, r, &e);
	}

	field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
@@ -5238,7 +5254,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
	r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
					sizeof(gpa_t), &e);
	if (r != X86EMUL_CONTINUE)
-		return vmx_handle_memory_failure(vcpu, r, &e);
+		return kvm_handle_memory_failure(vcpu, r, &e);

	return nested_vmx_succeed(vcpu);
}
@@ -5291,7 +5307,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
		return 1;
	r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
	if (r != X86EMUL_CONTINUE)
-		return vmx_handle_memory_failure(vcpu, r, &e);
+		return kvm_handle_memory_failure(vcpu, r, &e);

	/*
	 * Nested EPT roots are always held through guest_mmu,
@@ -5373,7 +5389,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
		return 1;
	r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
	if (r != X86EMUL_CONTINUE)
-		return vmx_handle_memory_failure(vcpu, r, &e);
+		return kvm_handle_memory_failure(vcpu, r, &e);

	if (operand.vpid >> 16)
		return nested_vmx_fail(vcpu,
@@ -5918,13 +5934,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
		goto reflect_vmexit;
	}

-	exit_intr_info = vmx_get_intr_info(vcpu);
-	exit_qual = vmx_get_exit_qual(vcpu);
-
-	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual,
-				vmx->idt_vectoring_info, exit_intr_info,
-				vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
-				KVM_ISA_VMX);
+	trace_kvm_nested_vmexit(exit_reason, vcpu, KVM_ISA_VMX);

	/* If L0 (KVM) wants the exit, it trumps L1's desires. */
	if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
@@ -5940,14 +5950,14 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
	 * need to be synthesized by querying the in-kernel LAPIC, but external
	 * interrupts are never reflected to L1 so it's a non-issue.
	 */
-	if ((exit_intr_info &
-	     (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
-	    (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
+	exit_intr_info = vmx_get_intr_info(vcpu);
+	if (is_exception_with_error_code(exit_intr_info)) {
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

		vmcs12->vm_exit_intr_error_code =
			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
	}
+	exit_qual = vmx_get_exit_qual(vcpu);

reflect_vmexit:
	nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual);
@@ -6182,7 +6192,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
			 * restored yet. EVMCS will be mapped from
			 * nested_get_vmcs12_pages().
			 */
-			kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+			kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
		} else {
			return -EINVAL;
		}
@@ -6318,7 +6328,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
-		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+		VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
@@ -6337,7 +6348,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
-		VM_ENTRY_LOAD_IA32_PAT;
+		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
+		VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);

@@ -6391,7 +6403,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
	msrs->secondary_ctls_low = 0;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
-		SECONDARY_EXEC_RDTSCP |
+		SECONDARY_EXEC_ENABLE_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
@@ -6561,7 +6573,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
	.hv_timer_pending = nested_vmx_preemption_timer_pending,
	.get_state = vmx_get_nested_state,
	.set_state = vmx_set_nested_state,
-	.get_vmcs12_pages = nested_get_vmcs12_pages,
+	.get_nested_state_pages = nested_get_vmcs12_pages,
	.write_log_dirty = nested_vmx_write_pml_buffer,
	.enable_evmcs = nested_enable_evmcs,
	.get_evmcs_version = nested_get_evmcs_version,
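The VM-exit/VM-entry control hunks above widen what is advertised to L1 (adding the BNDCFGS and IA32_PERF_GLOBAL_CTRL load/clear bits). As a reminder of how these fields are consumed, the *_ctls_high words mirror the allowed-1 halves of the VMX capability MSRs; a hypothetical helper for checking a single control bit:

/* Hypothetical helper: is a given VM-exit control advertised to L1? */
static bool example_nested_has_exit_ctrl(struct nested_vmx_msrs *msrs, u32 ctrl)
{
	/* exit_ctls_high holds the allowed-1 bits of IA32_VMX_EXIT_CTLS. */
	return (msrs->exit_ctls_high & ctrl) == ctrl;
}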
arch/x86/kvm/vmx/posted_intr.c (new file, 332 lines)
@@ -0,0 +1,332 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kvm_host.h>

#include <asm/irq_remapping.h>
#include <asm/cpu.h>

#include "lapic.h"
#include "posted_intr.h"
#include "trace.h"
#include "vmx.h"

/*
 * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
 * can find which vCPU should be waken up.
 */
static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);

static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{
	return &(to_vmx(vcpu)->pi_desc);
}

void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
	struct pi_desc old, new;
	unsigned int dest;

	/*
	 * In case of hot-plug or hot-unplug, we may have to undo
	 * vmx_vcpu_pi_put even if there is no assigned device. And we
	 * always keep PI.NDST up to date for simplicity: it makes the
	 * code easier, and CPU migration is not a fast path.
	 */
	if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
		return;

	/*
	 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
	 * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
	 * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
	 * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
	 * correctly.
	 */
	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
		pi_clear_sn(pi_desc);
		goto after_clear_sn;
	}

	/* The full case. */
	do {
		old.control = new.control = pi_desc->control;

		dest = cpu_physical_id(cpu);

		if (x2apic_enabled())
			new.ndst = dest;
		else
			new.ndst = (dest << 8) & 0xFF00;

		new.sn = 0;
	} while (cmpxchg64(&pi_desc->control, old.control,
			   new.control) != old.control);

after_clear_sn:

	/*
	 * Clear SN before reading the bitmap. The VT-d firmware
	 * writes the bitmap and reads SN atomically (5.2.3 in the
	 * spec), so it doesn't really have a memory barrier that
	 * pairs with this, but we cannot do that and we need one.
	 */
	smp_mb__after_atomic();

	if (!pi_is_pir_empty(pi_desc))
		pi_set_on(pi_desc);
}

void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
	    !irq_remapping_cap(IRQ_POSTING_CAP) ||
	    !kvm_vcpu_apicv_active(vcpu))
		return;

	/* Set SN when the vCPU is preempted */
	if (vcpu->preempted)
		pi_set_sn(pi_desc);
}

static void __pi_post_block(struct kvm_vcpu *vcpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
	struct pi_desc old, new;
	unsigned int dest;

	do {
		old.control = new.control = pi_desc->control;
		WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
		     "Wakeup handler not enabled while the VCPU is blocked\n");

		dest = cpu_physical_id(vcpu->cpu);

		if (x2apic_enabled())
			new.ndst = dest;
		else
			new.ndst = (dest << 8) & 0xFF00;

		/* set 'NV' to 'notification vector' */
		new.nv = POSTED_INTR_VECTOR;
	} while (cmpxchg64(&pi_desc->control, old.control,
			   new.control) != old.control);

	if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
		list_del(&vcpu->blocked_vcpu_list);
		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
		vcpu->pre_pcpu = -1;
	}
}

/*
 * This routine does the following things for vCPU which is going
 * to be blocked if VT-d PI is enabled.
 * - Store the vCPU to the wakeup list, so when interrupts happen
 *   we can find the right vCPU to wake up.
 * - Change the Posted-interrupt descriptor as below:
 *      'NDST' <-- vcpu->pre_pcpu
 *      'NV' <-- POSTED_INTR_WAKEUP_VECTOR
 * - If 'ON' is set during this process, which means at least one
 *   interrupt is posted for this vCPU, we cannot block it, in
 *   this case, return 1, otherwise, return 0.
 *
 */
int pi_pre_block(struct kvm_vcpu *vcpu)
{
	unsigned int dest;
	struct pi_desc old, new;
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
	    !irq_remapping_cap(IRQ_POSTING_CAP) ||
	    !kvm_vcpu_apicv_active(vcpu))
		return 0;

	WARN_ON(irqs_disabled());
	local_irq_disable();
	if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
		vcpu->pre_pcpu = vcpu->cpu;
		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
		list_add_tail(&vcpu->blocked_vcpu_list,
			      &per_cpu(blocked_vcpu_on_cpu,
				       vcpu->pre_pcpu));
		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
	}

	do {
		old.control = new.control = pi_desc->control;

		WARN((pi_desc->sn == 1),
		     "Warning: SN field of posted-interrupts "
		     "is set before blocking\n");

		/*
		 * Since vCPU can be preempted during this process,
		 * vcpu->cpu could be different with pre_pcpu, we
		 * need to set pre_pcpu as the destination of wakeup
		 * notification event, then we can find the right vCPU
		 * to wakeup in wakeup handler if interrupts happen
		 * when the vCPU is in blocked state.
		 */
		dest = cpu_physical_id(vcpu->pre_pcpu);

		if (x2apic_enabled())
			new.ndst = dest;
		else
			new.ndst = (dest << 8) & 0xFF00;

		/* set 'NV' to 'wakeup vector' */
		new.nv = POSTED_INTR_WAKEUP_VECTOR;
	} while (cmpxchg64(&pi_desc->control, old.control,
			   new.control) != old.control);

	/* We should not block the vCPU if an interrupt is posted for it. */
	if (pi_test_on(pi_desc) == 1)
		__pi_post_block(vcpu);

	local_irq_enable();
	return (vcpu->pre_pcpu == -1);
}

void pi_post_block(struct kvm_vcpu *vcpu)
{
	if (vcpu->pre_pcpu == -1)
		return;

	WARN_ON(irqs_disabled());
	local_irq_disable();
	__pi_post_block(vcpu);
	local_irq_enable();
}

/*
 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
 */
void pi_wakeup_handler(void)
{
	struct kvm_vcpu *vcpu;
	int cpu = smp_processor_id();

	spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
	list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
			blocked_vcpu_list) {
		struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

		if (pi_test_on(pi_desc) == 1)
			kvm_vcpu_kick(vcpu);
	}
	spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
}

void __init pi_init(int cpu)
{
	INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
	spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
}

bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

	return pi_test_on(pi_desc) ||
		(pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
}


/*
 * pi_update_irte - set IRTE for Posted-Interrupts
 *
 * @kvm: kvm
 * @host_irq: host irq of the interrupt
 * @guest_irq: gsi of the interrupt
 * @set: set or unset PI
 * returns 0 on success, < 0 on failure
 */
int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
		   bool set)
{
	struct kvm_kernel_irq_routing_entry *e;
	struct kvm_irq_routing_table *irq_rt;
	struct kvm_lapic_irq irq;
	struct kvm_vcpu *vcpu;
	struct vcpu_data vcpu_info;
	int idx, ret = 0;

	if (!kvm_arch_has_assigned_device(kvm) ||
	    !irq_remapping_cap(IRQ_POSTING_CAP) ||
	    !kvm_vcpu_apicv_active(kvm->vcpus[0]))
		return 0;

	idx = srcu_read_lock(&kvm->irq_srcu);
	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
	if (guest_irq >= irq_rt->nr_rt_entries ||
	    hlist_empty(&irq_rt->map[guest_irq])) {
		pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
			     guest_irq, irq_rt->nr_rt_entries);
		goto out;
	}

	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
		if (e->type != KVM_IRQ_ROUTING_MSI)
			continue;
		/*
		 * VT-d PI cannot support posting multicast/broadcast
		 * interrupts to a vCPU, we still use interrupt remapping
		 * for these kind of interrupts.
		 *
		 * For lowest-priority interrupts, we only support
		 * those with single CPU as the destination, e.g. user
		 * configures the interrupts via /proc/irq or uses
		 * irqbalance to make the interrupts single-CPU.
		 *
		 * We will support full lowest-priority interrupt later.
		 *
		 * In addition, we can only inject generic interrupts using
		 * the PI mechanism, refuse to route others through it.
		 */

		kvm_set_msi_irq(kvm, e, &irq);
		if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
		    !kvm_irq_is_postable(&irq)) {
			/*
			 * Make sure the IRTE is in remapped mode if
			 * we don't handle it in posted mode.
			 */
			ret = irq_set_vcpu_affinity(host_irq, NULL);
			if (ret < 0) {
				printk(KERN_INFO
				   "failed to back to remapped mode, irq: %u\n",
				   host_irq);
				goto out;
			}

			continue;
		}

		vcpu_info.pi_desc_addr = __pa(&to_vmx(vcpu)->pi_desc);
		vcpu_info.vector = irq.vector;

		trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
				vcpu_info.vector, vcpu_info.pi_desc_addr, set);

		if (set)
			ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
		else
			ret = irq_set_vcpu_affinity(host_irq, NULL);

		if (ret < 0) {
			printk(KERN_INFO "%s: failed to update PI IRTE\n",
					__func__);
			goto out;
		}
	}

	ret = 0;
out:
	srcu_read_unlock(&kvm->irq_srcu, idx);
	return ret;
}
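The NDST update recurs three times in this file; the destination encoding differs between xAPIC and x2APIC mode, which the repeated expression captures. Restated on its own (illustrative only, not part of the new file):

/* Illustrative only: how the code above encodes PI.NDST for a target CPU. */
static inline u32 example_pi_ndst(u32 apic_id, bool x2apic)
{
	/* x2APIC: full 32-bit APIC ID; xAPIC: 8-bit ID placed in bits 15:8. */
	return x2apic ? apic_id : (apic_id << 8) & 0xFF00;
}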
arch/x86/kvm/vmx/posted_intr.h (new file, 99 lines)
@@ -0,0 +1,99 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KVM_X86_VMX_POSTED_INTR_H
#define __KVM_X86_VMX_POSTED_INTR_H

#define POSTED_INTR_ON  0
#define POSTED_INTR_SN  1

/* Posted-Interrupt Descriptor */
struct pi_desc {
	u32 pir[8];     /* Posted interrupt requested */
	union {
		struct {
				/* bit 256 - Outstanding Notification */
			u16	on	: 1,
				/* bit 257 - Suppress Notification */
				sn	: 1,
				/* bit 271:258 - Reserved */
				rsvd_1	: 14;
				/* bit 279:272 - Notification Vector */
			u8	nv;
				/* bit 287:280 - Reserved */
			u8	rsvd_2;
				/* bit 319:288 - Notification Destination */
			u32	ndst;
		};
		u64 control;
	};
	u32 rsvd[6];
} __aligned(64);

static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
{
	return test_and_set_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
{
	return test_and_clear_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}

static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
{
	return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
}

static inline void pi_set_sn(struct pi_desc *pi_desc)
{
	set_bit(POSTED_INTR_SN,
		(unsigned long *)&pi_desc->control);
}

static inline void pi_set_on(struct pi_desc *pi_desc)
{
	set_bit(POSTED_INTR_ON,
		(unsigned long *)&pi_desc->control);
}

static inline void pi_clear_on(struct pi_desc *pi_desc)
{
	clear_bit(POSTED_INTR_ON,
		(unsigned long *)&pi_desc->control);
}

static inline void pi_clear_sn(struct pi_desc *pi_desc)
{
	clear_bit(POSTED_INTR_SN,
		(unsigned long *)&pi_desc->control);
}

static inline int pi_test_on(struct pi_desc *pi_desc)
{
	return test_bit(POSTED_INTR_ON,
		(unsigned long *)&pi_desc->control);
}

static inline int pi_test_sn(struct pi_desc *pi_desc)
{
	return test_bit(POSTED_INTR_SN,
		(unsigned long *)&pi_desc->control);
}

void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
int pi_pre_block(struct kvm_vcpu *vcpu);
void pi_post_block(struct kvm_vcpu *vcpu);
void pi_wakeup_handler(void);
void __init pi_init(int cpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
		   bool set);

#endif /* __KVM_X86_VMX_POSTED_INTR_H */
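The helpers above are what a sender uses to post a vector: set the vector's bit in the PIR, then set ON and, only if ON was previously clear, notify the target CPU. A hedged sketch of that sequence (the real sender lives in vmx.c and also handles sending the notification IPI and its fallbacks):

/* Sketch only: posting a vector with the pi_desc helpers declared above. */
static bool example_post_vector(struct pi_desc *pi_desc, int vector)
{
	if (pi_test_and_set_pir(vector, pi_desc))
		return false;	/* vector was already pending */

	if (pi_test_and_set_on(pi_desc))
		return false;	/* a notification is already outstanding */

	/* Caller would now send POSTED_INTR_VECTOR to the CPU named by NDST. */
	return true;
}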
@@ -138,6 +138,13 @@ static inline bool is_external_intr(u32 intr_info)
	return is_intr_type(intr_info, INTR_TYPE_EXT_INTR);
}

+static inline bool is_exception_with_error_code(u32 intr_info)
+{
+	const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK;
+
+	return (intr_info & mask) == mask;
+}
+
enum vmcs_field_width {
	VMCS_FIELD_WIDTH_U16 = 0,
	VMCS_FIELD_WIDTH_U64 = 1,
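For concreteness, a worked example of the new predicate, using the VM-exit interruption-information layout (vector in bits 7:0, type in bits 10:8, error-code-valid in bit 11, valid in bit 31):

/*
 * Example: a #PF delivered with an error code is reported as
 * intr_info = 0x80000b0e (vector 14, type 3 = hardware exception,
 * bit 11 and bit 31 set), so is_exception_with_error_code(0x80000b0e)
 * is true, while an external interrupt (bit 11 clear) evaluates false.
 */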
Some files were not shown because too many files have changed in this diff.