A collection of x86 and ARM bugfixes, and some improvements to documentation.
On top of this, a cleanup of kvm_para.h headers, which were exported by
some architectures even though they do not support KVM at all. This is
responsible for all the Kbuild changes in the diffstat.

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v2.0.22 (GNU/Linux)

iQEcBAABAgAGBQJcoM5VAAoJEL/70l94x66DU3EH/A8sYdsfeqALWElm2Sy9TYas
mntz+oTWsl3vDy8s8zp1ET2NpF7oBlBEMmCWhVEJaD+1qW3VpTRAseR3Zr9ML9xD
k+BQM8SKv47o86ZN+y4XALl30Ckb3DXh/X1xsrV5hF6J3ofC+Ce2tF560l8C9ygC
WyHDxwNHMWVA/6TyW3mhunzuVKgZ/JND9+0zlyY1LKmUQ0BQLle23gseIhhI0YDm
B4VGIYU2Mf8jCH5Ir3N/rQ8pLdo8U7f5P/MMfgXQafksvUHJBg6B6vOhLJh94dLh
J2wixYp1zlT0drBBkvJ0jPZ75skooWWj0o3otEA7GNk/hRj6MTllgfL5SajTHZg=
=/A7u
-----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM fixes from Paolo Bonzini:
 "A collection of x86 and ARM bugfixes, and some improvements to
  documentation.

  On top of this, a cleanup of kvm_para.h headers, which were exported
  by some architectures even though they do not support KVM at all.
  This is responsible for all the Kbuild changes in the diffstat"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (28 commits)
  Documentation: kvm: clarify KVM_SET_USER_MEMORY_REGION
  KVM: doc: Document the life cycle of a VM and its resources
  KVM: selftests: complete IO before migrating guest state
  KVM: selftests: disable stack protector for all KVM tests
  KVM: selftests: explicitly disable PIE for tests
  KVM: selftests: assert on exit reason in CR4/cpuid sync test
  KVM: x86: update %rip after emulating IO
  x86/kvm/hyper-v: avoid spurious pending stimer on vCPU init
  kvm/x86: Move MSR_IA32_ARCH_CAPABILITIES to array emulated_msrs
  KVM: x86: Emulate MSR_IA32_ARCH_CAPABILITIES on AMD hosts
  kvm: don't redefine flags as something else
  kvm: mmu: Used range based flushing in slot_handle_level_range
  KVM: export <linux/kvm_para.h> and <asm/kvm_para.h> iif KVM is supported
  KVM: x86: remove check on nr_mmu_pages in kvm_arch_commit_memory_region()
  kvm: nVMX: Add a vmentry check for HOST_SYSENTER_ESP and HOST_SYSENTER_EIP fields
  KVM: SVM: Workaround errata#1096 (insn_len maybe zero on SMAP violation)
  KVM: Reject device ioctls from processes other than the VM's creator
  KVM: doc: Fix incorrect word ordering regarding supported use of APIs
  KVM: x86: fix handling of role.cr4_pae and rename it to 'gpte_size'
  KVM: nVMX: Do not inherit quadrant and invalid for the root shadow EPT
  ...
commit 63fc9c2348
				| @ -5,25 +5,32 @@ The Definitive KVM (Kernel-based Virtual Machine) API Documentation | ||||
| ---------------------- | ||||
| 
 | ||||
| The kvm API is a set of ioctls that are issued to control various aspects | ||||
| of a virtual machine.  The ioctls belong to three classes | ||||
| of a virtual machine.  The ioctls belong to three classes: | ||||
| 
 | ||||
|  - System ioctls: These query and set global attributes which affect the | ||||
|    whole kvm subsystem.  In addition a system ioctl is used to create | ||||
|    virtual machines | ||||
|    virtual machines. | ||||
| 
 | ||||
|  - VM ioctls: These query and set attributes that affect an entire virtual | ||||
|    machine, for example memory layout.  In addition a VM ioctl is used to | ||||
|    create virtual cpus (vcpus). | ||||
|    create virtual cpus (vcpus) and devices. | ||||
| 
 | ||||
|    Only run VM ioctls from the same process (address space) that was used | ||||
|    to create the VM. | ||||
|    VM ioctls must be issued from the same process (address space) that was | ||||
|    used to create the VM. | ||||
| 
 | ||||
|  - vcpu ioctls: These query and set attributes that control the operation | ||||
|    of a single virtual cpu. | ||||
| 
 | ||||
|    Only run vcpu ioctls from the same thread that was used to create the | ||||
|    vcpu. | ||||
|    vcpu ioctls should be issued from the same thread that was used to create | ||||
|    the vcpu, except for asynchronous vcpu ioctls that are marked as such in | ||||
|    the documentation.  Otherwise, the first ioctl after switching threads | ||||
|    could see a performance impact. | ||||
| 
 | ||||
|  - device ioctls: These query and set attributes that control the operation | ||||
|    of a single device. | ||||
| 
 | ||||
|    device ioctls must be issued from the same process (address space) that | ||||
|    was used to create the VM. | ||||
| 
 | ||||
| 2. File descriptors | ||||
| ------------------- | ||||
| @ -32,17 +39,34 @@ The kvm API is centered around file descriptors.  An initial | ||||
| open("/dev/kvm") obtains a handle to the kvm subsystem; this handle | ||||
| can be used to issue system ioctls.  A KVM_CREATE_VM ioctl on this | ||||
| handle will create a VM file descriptor which can be used to issue VM | ||||
| ioctls.  A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu | ||||
| and return a file descriptor pointing to it.  Finally, ioctls on a vcpu | ||||
| fd can be used to control the vcpu, including the important task of | ||||
| actually running guest code. | ||||
| ioctls.  A KVM_CREATE_VCPU or KVM_CREATE_DEVICE ioctl on a VM fd will | ||||
| create a virtual cpu or device and return a file descriptor pointing to | ||||
| the new resource.  Finally, ioctls on a vcpu or device fd can be used | ||||
| to control the vcpu or device.  For vcpus, this includes the important | ||||
| task of actually running guest code. | ||||
| 
 | ||||
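For illustration, a minimal user-space sketch of the fd hierarchy described above (error handling omitted; this snippet is not part of the patch):

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm  = open("/dev/kvm", O_RDWR | O_CLOEXEC); /* system fd */
            int vm   = ioctl(kvm, KVM_CREATE_VM, 0);         /* VM fd */
            int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);        /* vcpu fd */

            /* vcpu ioctls, e.g. KVM_RUN, are then issued on 'vcpu' from
             * this same thread, per the rules in section 1. */
            return 0;
    }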
| In general file descriptors can be migrated among processes by means | ||||
| of fork() and the SCM_RIGHTS facility of unix domain socket.  These | ||||
| kinds of tricks are explicitly not supported by kvm.  While they will | ||||
| not cause harm to the host, their actual behavior is not guaranteed by | ||||
| the API.  The only supported use is one virtual machine per process, | ||||
| and one vcpu per thread. | ||||
| the API.  See "General description" for details on the ioctl usage | ||||
| model that is supported by KVM. | ||||
| 
 | ||||
| It is important to note that although VM ioctls may only be issued from | ||||
| the process that created the VM, a VM's lifecycle is associated with its | ||||
| file descriptor, not its creator (process).  In other words, the VM and | ||||
| its resources, *including the associated address space*, are not freed | ||||
| until the last reference to the VM's file descriptor has been released. | ||||
| For example, if fork() is issued after ioctl(KVM_CREATE_VM), the VM will | ||||
| not be freed until both the parent (original) process and its child have | ||||
| put their references to the VM's file descriptor. | ||||
| 
 | ||||
| Because a VM's resources are not freed until the last reference to its | ||||
| file descriptor is released, creating additional references to a VM via | ||||
| fork(), dup(), etc... without careful consideration is strongly | ||||
| discouraged and may have unwanted side effects, e.g. memory allocated | ||||
| by and on behalf of the VM's process may not be freed/unaccounted when | ||||
| the VM is shut down. | ||||
| 
 | ||||
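As a simplified illustration of the life-cycle rule described above (no error handling; not part of the patch):

    #include <unistd.h>
    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);
            int vm  = ioctl(kvm, KVM_CREATE_VM, 0);

            if (fork() == 0)
                    pause();   /* child inherits a reference to the VM fd */

            close(vm);         /* parent's reference is gone, but the VM and
                                * its resources stay allocated until the
                                * child also releases its copy of the fd. */
            return 0;
    }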
| 
 | ||||
| It is important to note that although VM ioctls may only be issued from | ||||
| @ -515,11 +539,15 @@ c) KVM_INTERRUPT_SET_LEVEL | ||||
| Note that any value for 'irq' other than the ones stated above is invalid | ||||
| and incurs unexpected behavior. | ||||
| 
 | ||||
| This is an asynchronous vcpu ioctl and can be invoked from any thread. | ||||
| 
 | ||||
| MIPS: | ||||
| 
 | ||||
| Queues an external interrupt to be injected into the virtual CPU. A negative | ||||
| interrupt number dequeues the interrupt. | ||||
| 
 | ||||
| This is an asynchronous vcpu ioctl and can be invoked from any thread. | ||||
| 
 | ||||
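A sketch of queueing an interrupt from a separate thread, per the asynchronous note above ('vcpu_fd' and the interrupt number are illustrative):

    struct kvm_interrupt irq = { .irq = 9 };   /* example interrupt number */

    /* As an asynchronous vcpu ioctl, this may be issued from a thread other
     * than the one executing KVM_RUN on vcpu_fd. */
    if (ioctl(vcpu_fd, KVM_INTERRUPT, &irq) < 0)
            perror("KVM_INTERRUPT");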
| 
 | ||||
| 4.17 KVM_DEBUG_GUEST | ||||
| 
 | ||||
| @ -1086,14 +1114,12 @@ struct kvm_userspace_memory_region { | ||||
| #define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0) | ||||
| #define KVM_MEM_READONLY	(1UL << 1) | ||||
| 
 | ||||
| This ioctl allows the user to create or modify a guest physical memory | ||||
| slot.  When changing an existing slot, it may be moved in the guest | ||||
| physical memory space, or its flags may be modified.  It may not be | ||||
| resized.  Slots may not overlap in guest physical address space. | ||||
| Bits 0-15 of "slot" specifies the slot id and this value should be | ||||
| less than the maximum number of user memory slots supported per VM. | ||||
| The maximum allowed slots can be queried using KVM_CAP_NR_MEMSLOTS, | ||||
| if this capability is supported by the architecture. | ||||
| This ioctl allows the user to create, modify or delete a guest physical | ||||
| memory slot.  Bits 0-15 of "slot" specify the slot id and this value | ||||
| should be less than the maximum number of user memory slots supported per | ||||
| VM.  The maximum allowed slots can be queried using KVM_CAP_NR_MEMSLOTS, | ||||
| if this capability is supported by the architecture.  Slots may not | ||||
| overlap in guest physical address space. | ||||
| 
 | ||||
| If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of "slot" | ||||
| specifies the address space which is being modified.  They must be | ||||
| @ -1102,6 +1128,10 @@ KVM_CAP_MULTI_ADDRESS_SPACE capability.  Slots in separate address spaces | ||||
| are unrelated; the restriction on overlapping slots only applies within | ||||
| each address space. | ||||
| 
 | ||||
| Deleting a slot is done by passing zero for memory_size.  When changing | ||||
| an existing slot, it may be moved in the guest physical memory space, | ||||
| or its flags may be modified, but it may not be resized. | ||||
| 
 | ||||
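For illustration, a minimal sketch of creating and then deleting a slot ('vm' is a VM fd; error handling omitted; not part of the patch):

    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <linux/kvm.h>

    void memslot_example(int vm)
    {
            void *mem = mmap(NULL, 0x200000, PROT_READ | PROT_WRITE,
                             MAP_SHARED | MAP_ANONYMOUS, -1, 0);
            struct kvm_userspace_memory_region region = {
                    .slot            = 0,          /* bits 0-15: slot id */
                    .guest_phys_addr = 0x100000,
                    .memory_size     = 0x200000,   /* 2 MiB */
                    .userspace_addr  = (unsigned long)mem,
            };

            ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region); /* create */

            region.memory_size = 0;                         /* zero size... */
            ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region); /* ...deletes it */
    }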
| Memory for the region is taken starting at the address denoted by the | ||||
| field userspace_addr, which must point at user addressable memory for | ||||
| the entire memory slot size.  Any object may back this memory, including | ||||
| @ -2493,7 +2523,7 @@ KVM_S390_MCHK (vm, vcpu) - machine check interrupt; cr 14 bits in parm, | ||||
|                            machine checks needing further payload are not | ||||
|                            supported by this ioctl) | ||||
| 
 | ||||
| Note that the vcpu ioctl is asynchronous to vcpu execution. | ||||
| This is an asynchronous vcpu ioctl and can be invoked from any thread. | ||||
| 
 | ||||
| 4.78 KVM_PPC_GET_HTAB_FD | ||||
| 
 | ||||
| @ -3042,8 +3072,7 @@ KVM_S390_INT_EMERGENCY - sigp emergency; parameters in .emerg | ||||
| KVM_S390_INT_EXTERNAL_CALL - sigp external call; parameters in .extcall | ||||
| KVM_S390_MCHK - machine check interrupt; parameters in .mchk | ||||
| 
 | ||||
| 
 | ||||
| Note that the vcpu ioctl is asynchronous to vcpu execution. | ||||
| This is an asynchronous vcpu ioctl and can be invoked from any thread. | ||||
| 
 | ||||
| 4.94 KVM_S390_GET_IRQ_STATE | ||||
| 
 | ||||
|  | ||||
| @ -142,7 +142,7 @@ Shadow pages contain the following information: | ||||
|     If clear, this page corresponds to a guest page table denoted by the gfn | ||||
|     field. | ||||
|   role.quadrant: | ||||
|     When role.cr4_pae=0, the guest uses 32-bit gptes while the host uses 64-bit | ||||
|     When role.gpte_is_8_bytes=0, the guest uses 32-bit gptes while the host uses 64-bit | ||||
|     sptes.  That means a guest page table contains more ptes than the host, | ||||
|     so multiple shadow pages are needed to shadow one guest page. | ||||
|     For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the | ||||
| @ -158,9 +158,9 @@ Shadow pages contain the following information: | ||||
|     The page is invalid and should not be used.  It is a root page that is | ||||
|     currently pinned (by a cpu hardware register pointing to it); once it is | ||||
|     unpinned it will be destroyed. | ||||
|   role.cr4_pae: | ||||
|     Contains the value of cr4.pae for which the page is valid (e.g. whether | ||||
|     32-bit or 64-bit gptes are in use). | ||||
|   role.gpte_is_8_bytes: | ||||
|     Reflects the size of the guest PTE for which the page is valid, i.e. '1' | ||||
|     if 64-bit gptes are in use, '0' if 32-bit gptes are in use. | ||||
|   role.nxe: | ||||
|     Contains the value of efer.nxe for which the page is valid. | ||||
|   role.cr0_wp: | ||||
| @ -173,6 +173,9 @@ Shadow pages contain the following information: | ||||
|     Contains the value of cr4.smap && !cr0.wp for which the page is valid | ||||
|     (pages for which this is true are different from other pages; see the | ||||
|     treatment of cr0.wp=0 below). | ||||
|   role.ept_sp: | ||||
|     This is a virtual flag to denote a shadowed nested EPT page.  ept_sp | ||||
|     is true if "cr0_wp && smap_andnot_wp", an otherwise invalid combination. | ||||
|   role.smm: | ||||
|     Is 1 if the page is valid in system management mode.  This field | ||||
|     determines which of the kvm_memslots array was used to build this | ||||
|  | ||||
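A concrete reading of role.quadrant above: with 4 KiB pages, a guest page table of 4-byte (32-bit) gptes holds 1024 entries, while a shadow page of 8-byte sptes holds only 512, so each guest table needs two shadow pages and quadrant selects which half of the guest table a given shadow page covers.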
| @ -6,6 +6,7 @@ generic-y += exec.h | ||||
| generic-y += export.h | ||||
| generic-y += fb.h | ||||
| generic-y += irq_work.h | ||||
| generic-y += kvm_para.h | ||||
| generic-y += mcs_spinlock.h | ||||
| generic-y += mm-arch-hooks.h | ||||
| generic-y += preempt.h | ||||
|  | ||||
| @ -1,2 +0,0 @@ | ||||
| /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ | ||||
| #include <asm-generic/kvm_para.h> | ||||
| @ -11,6 +11,7 @@ generic-y += hardirq.h | ||||
| generic-y += hw_irq.h | ||||
| generic-y += irq_regs.h | ||||
| generic-y += irq_work.h | ||||
| generic-y += kvm_para.h | ||||
| generic-y += local.h | ||||
| generic-y += local64.h | ||||
| generic-y += mcs_spinlock.h | ||||
|  | ||||
| @ -1,2 +1 @@ | ||||
| generic-y += kvm_para.h | ||||
| generic-y += ucontext.h | ||||
|  | ||||
| @ -381,6 +381,17 @@ static inline int kvm_read_guest_lock(struct kvm *kvm, | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, | ||||
| 				       const void *data, unsigned long len) | ||||
| { | ||||
| 	int srcu_idx = srcu_read_lock(&kvm->srcu); | ||||
| 	int ret = kvm_write_guest(kvm, gpa, data, len); | ||||
| 
 | ||||
| 	srcu_read_unlock(&kvm->srcu, srcu_idx); | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static inline void *kvm_get_hyp_vector(void) | ||||
| { | ||||
| 	switch(read_cpuid_part()) { | ||||
|  | ||||
| @ -75,6 +75,8 @@ static inline bool kvm_stage2_has_pud(struct kvm *kvm) | ||||
| 
 | ||||
| #define S2_PMD_MASK				PMD_MASK | ||||
| #define S2_PMD_SIZE				PMD_SIZE | ||||
| #define S2_PUD_MASK				PUD_MASK | ||||
| #define S2_PUD_SIZE				PUD_SIZE | ||||
| 
 | ||||
| static inline bool kvm_stage2_has_pmd(struct kvm *kvm) | ||||
| { | ||||
|  | ||||
| @ -3,3 +3,4 @@ | ||||
| generated-y += unistd-common.h | ||||
| generated-y += unistd-oabi.h | ||||
| generated-y += unistd-eabi.h | ||||
| generic-y += kvm_para.h | ||||
|  | ||||
| @ -1,2 +0,0 @@ | ||||
| /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ | ||||
| #include <asm-generic/kvm_para.h> | ||||
| @ -445,6 +445,17 @@ static inline int kvm_read_guest_lock(struct kvm *kvm, | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, | ||||
| 				       const void *data, unsigned long len) | ||||
| { | ||||
| 	int srcu_idx = srcu_read_lock(&kvm->srcu); | ||||
| 	int ret = kvm_write_guest(kvm, gpa, data, len); | ||||
| 
 | ||||
| 	srcu_read_unlock(&kvm->srcu, srcu_idx); | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| #ifdef CONFIG_KVM_INDIRECT_VECTORS | ||||
| /*
 | ||||
|  * EL2 vectors can be mapped and rerouted in a number of ways, | ||||
|  | ||||
| @ -123,6 +123,9 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) | ||||
| 	int ret = -EINVAL; | ||||
| 	bool loaded; | ||||
| 
 | ||||
| 	/* Reset PMU outside of the non-preemptible section */ | ||||
| 	kvm_pmu_vcpu_reset(vcpu); | ||||
| 
 | ||||
| 	preempt_disable(); | ||||
| 	loaded = (vcpu->cpu != -1); | ||||
| 	if (loaded) | ||||
| @ -170,9 +173,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) | ||||
| 		vcpu->arch.reset_state.reset = false; | ||||
| 	} | ||||
| 
 | ||||
| 	/* Reset PMU */ | ||||
| 	kvm_pmu_vcpu_reset(vcpu); | ||||
| 
 | ||||
| 	/* Default workaround setup is enabled (if supported) */ | ||||
| 	if (kvm_arm_have_ssbd() == KVM_SSBD_KERNEL) | ||||
| 		vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG; | ||||
|  | ||||
| @ -19,6 +19,7 @@ generic-y += irq_work.h | ||||
| generic-y += kdebug.h | ||||
| generic-y += kmap_types.h | ||||
| generic-y += kprobes.h | ||||
| generic-y += kvm_para.h | ||||
| generic-y += local.h | ||||
| generic-y += mcs_spinlock.h | ||||
| generic-y += mm-arch-hooks.h | ||||
|  | ||||
| @ -1,2 +1 @@ | ||||
| generic-y += kvm_para.h | ||||
| generic-y += ucontext.h | ||||
|  | ||||
| @ -23,6 +23,7 @@ generic-y += irq_work.h | ||||
| generic-y += kdebug.h | ||||
| generic-y += kmap_types.h | ||||
| generic-y += kprobes.h | ||||
| generic-y += kvm_para.h | ||||
| generic-y += linkage.h | ||||
| generic-y += local.h | ||||
| generic-y += local64.h | ||||
|  | ||||
| @ -1,2 +1 @@ | ||||
| generic-y += kvm_para.h | ||||
| generic-y += ucontext.h | ||||
|  | ||||
| @ -19,6 +19,7 @@ generic-y += irq_work.h | ||||
| generic-y += kdebug.h | ||||
| generic-y += kmap_types.h | ||||
| generic-y += kprobes.h | ||||
| generic-y += kvm_para.h | ||||
| generic-y += local.h | ||||
| generic-y += local64.h | ||||
| generic-y += mcs_spinlock.h | ||||
|  | ||||
| @ -1,2 +0,0 @@ | ||||
| /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ | ||||
| #include <asm-generic/kvm_para.h> | ||||
| @ -2,6 +2,7 @@ generated-y += syscall_table.h | ||||
| generic-y += compat.h | ||||
| generic-y += exec.h | ||||
| generic-y += irq_work.h | ||||
| generic-y += kvm_para.h | ||||
| generic-y += mcs_spinlock.h | ||||
| generic-y += mm-arch-hooks.h | ||||
| generic-y += preempt.h | ||||
|  | ||||
| @ -1,2 +1 @@ | ||||
| generated-y += unistd_64.h | ||||
| generic-y += kvm_para.h | ||||
|  | ||||
| @ -13,6 +13,7 @@ generic-y += irq_work.h | ||||
| generic-y += kdebug.h | ||||
| generic-y += kmap_types.h | ||||
| generic-y += kprobes.h | ||||
| generic-y += kvm_para.h | ||||
| generic-y += local.h | ||||
| generic-y += local64.h | ||||
| generic-y += mcs_spinlock.h | ||||
|  | ||||
| @ -1,2 +1 @@ | ||||
| generated-y += unistd_32.h | ||||
| generic-y += kvm_para.h | ||||
|  | ||||
| @ -17,6 +17,7 @@ generic-y += irq_work.h | ||||
| generic-y += kdebug.h | ||||
| generic-y += kmap_types.h | ||||
| generic-y += kprobes.h | ||||
| generic-y += kvm_para.h | ||||
| generic-y += linkage.h | ||||
| generic-y += local.h | ||||
| generic-y += local64.h | ||||
|  | ||||
| @ -1,3 +1,2 @@ | ||||
| generated-y += unistd_32.h | ||||
| generic-y += kvm_para.h | ||||
| generic-y += ucontext.h | ||||
|  | ||||
| @ -23,6 +23,7 @@ generic-y += irq_work.h | ||||
| generic-y += kdebug.h | ||||
| generic-y += kmap_types.h | ||||
| generic-y += kprobes.h | ||||
| generic-y += kvm_para.h | ||||
| generic-y += local.h | ||||
| generic-y += mcs_spinlock.h | ||||
| generic-y += mm-arch-hooks.h | ||||
|  | ||||
| @ -1,2 +1 @@ | ||||
| generic-y += kvm_para.h | ||||
| generic-y += ucontext.h | ||||
|  | ||||
| @ -20,6 +20,7 @@ generic-y += irq_work.h | ||||
| generic-y += kdebug.h | ||||
| generic-y += kmap_types.h | ||||
| generic-y += kprobes.h | ||||
| generic-y += kvm_para.h | ||||
| generic-y += local.h | ||||
| generic-y += mcs_spinlock.h | ||||
| generic-y += mm-arch-hooks.h | ||||
|  | ||||
| @ -1,2 +1 @@ | ||||
| generic-y += kvm_para.h | ||||
| generic-y += ucontext.h | ||||
|  | ||||
| @ -11,6 +11,7 @@ generic-y += irq_regs.h | ||||
| generic-y += irq_work.h | ||||
| generic-y += kdebug.h | ||||
| generic-y += kprobes.h | ||||
| generic-y += kvm_para.h | ||||
| generic-y += local.h | ||||
| generic-y += local64.h | ||||
| generic-y += mcs_spinlock.h | ||||
|  | ||||
| @ -1,3 +1,2 @@ | ||||
| generated-y += unistd_32.h | ||||
| generated-y += unistd_64.h | ||||
| generic-y += kvm_para.h | ||||
|  | ||||
| @ -9,6 +9,7 @@ generic-y += emergency-restart.h | ||||
| generic-y += exec.h | ||||
| generic-y += irq_regs.h | ||||
| generic-y += irq_work.h | ||||
| generic-y += kvm_para.h | ||||
| generic-y += local.h | ||||
| generic-y += local64.h | ||||
| generic-y += mcs_spinlock.h | ||||
|  | ||||
| @ -1,5 +1,4 @@ | ||||
| # SPDX-License-Identifier: GPL-2.0 | ||||
| 
 | ||||
| generated-y += unistd_32.h | ||||
| generic-y += kvm_para.h | ||||
| generic-y += ucontext.h | ||||
|  | ||||
| @ -9,6 +9,7 @@ generic-y += exec.h | ||||
| generic-y += export.h | ||||
| generic-y += irq_regs.h | ||||
| generic-y += irq_work.h | ||||
| generic-y += kvm_para.h | ||||
| generic-y += linkage.h | ||||
| generic-y += local.h | ||||
| generic-y += local64.h | ||||
|  | ||||
| @ -1,2 +0,0 @@ | ||||
| /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ | ||||
| #include <asm-generic/kvm_para.h> | ||||
| @ -18,6 +18,7 @@ generic-y += irq_work.h | ||||
| generic-y += kdebug.h | ||||
| generic-y += kmap_types.h | ||||
| generic-y += kprobes.h | ||||
| generic-y += kvm_para.h | ||||
| generic-y += local.h | ||||
| generic-y += mcs_spinlock.h | ||||
| generic-y += mm-arch-hooks.h | ||||
|  | ||||
| @ -1,2 +1 @@ | ||||
| generic-y += kvm_para.h | ||||
| generic-y += ucontext.h | ||||
|  | ||||
| @ -253,14 +253,14 @@ struct kvm_mmu_memory_cache { | ||||
|  * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used | ||||
|  * by indirect shadow page can not be more than 15 bits. | ||||
|  * | ||||
|  * Currently, we used 14 bits that are @level, @cr4_pae, @quadrant, @access, | ||||
|  * Currently, we used 14 bits that are @level, @gpte_is_8_bytes, @quadrant, @access, | ||||
|  * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp. | ||||
|  */ | ||||
| union kvm_mmu_page_role { | ||||
| 	u32 word; | ||||
| 	struct { | ||||
| 		unsigned level:4; | ||||
| 		unsigned cr4_pae:1; | ||||
| 		unsigned gpte_is_8_bytes:1; | ||||
| 		unsigned quadrant:2; | ||||
| 		unsigned direct:1; | ||||
| 		unsigned access:3; | ||||
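As a quick check of the "14 bits" figure in the comment above: level (4) + gpte_is_8_bytes (1) + quadrant (2) + access (3) + nxe (1) + cr0_wp (1) + smep_andnot_wp (1) + smap_andnot_wp (1) = 14, within the 15-bit limit imposed by the 16-bit gfn_track field.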
| @ -350,6 +350,7 @@ struct kvm_mmu_page { | ||||
| }; | ||||
| 
 | ||||
| struct kvm_pio_request { | ||||
| 	unsigned long linear_rip; | ||||
| 	unsigned long count; | ||||
| 	int in; | ||||
| 	int port; | ||||
| @ -568,6 +569,7 @@ struct kvm_vcpu_arch { | ||||
| 	bool tpr_access_reporting; | ||||
| 	u64 ia32_xss; | ||||
| 	u64 microcode_version; | ||||
| 	u64 arch_capabilities; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Paging state of the vcpu | ||||
| @ -1192,6 +1194,8 @@ struct kvm_x86_ops { | ||||
| 	int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu, | ||||
| 				   uint16_t *vmcs_version); | ||||
| 	uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu); | ||||
| 
 | ||||
| 	bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); | ||||
| }; | ||||
| 
 | ||||
| struct kvm_arch_async_pf { | ||||
| @ -1252,7 +1256,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, | ||||
| 				   gfn_t gfn_offset, unsigned long mask); | ||||
| void kvm_mmu_zap_all(struct kvm *kvm); | ||||
| void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); | ||||
| unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); | ||||
| unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); | ||||
| void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); | ||||
| 
 | ||||
| int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); | ||||
|  | ||||
| @ -526,7 +526,9 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, | ||||
| 		new_config.enable = 0; | ||||
| 	stimer->config.as_uint64 = new_config.as_uint64; | ||||
| 
 | ||||
| 	stimer_mark_pending(stimer, false); | ||||
| 	if (stimer->config.enable) | ||||
| 		stimer_mark_pending(stimer, false); | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| @ -542,7 +544,10 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, | ||||
| 		stimer->config.enable = 0; | ||||
| 	else if (stimer->config.auto_enable) | ||||
| 		stimer->config.enable = 1; | ||||
| 	stimer_mark_pending(stimer, false); | ||||
| 
 | ||||
| 	if (stimer->config.enable) | ||||
| 		stimer_mark_pending(stimer, false); | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
|  | ||||
| @ -182,7 +182,7 @@ struct kvm_shadow_walk_iterator { | ||||
| 
 | ||||
| static const union kvm_mmu_page_role mmu_base_role_mask = { | ||||
| 	.cr0_wp = 1, | ||||
| 	.cr4_pae = 1, | ||||
| 	.gpte_is_8_bytes = 1, | ||||
| 	.nxe = 1, | ||||
| 	.smep_andnot_wp = 1, | ||||
| 	.smap_andnot_wp = 1, | ||||
| @ -2205,6 +2205,7 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, | ||||
| static void kvm_mmu_commit_zap_page(struct kvm *kvm, | ||||
| 				    struct list_head *invalid_list); | ||||
| 
 | ||||
| 
 | ||||
| #define for_each_valid_sp(_kvm, _sp, _gfn)				\ | ||||
| 	hlist_for_each_entry(_sp,					\ | ||||
| 	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ | ||||
| @ -2215,12 +2216,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, | ||||
| 	for_each_valid_sp(_kvm, _sp, _gfn)				\ | ||||
| 		if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else | ||||
| 
 | ||||
| static inline bool is_ept_sp(struct kvm_mmu_page *sp) | ||||
| { | ||||
| 	return sp->role.cr0_wp && sp->role.smap_andnot_wp; | ||||
| } | ||||
| 
 | ||||
| /* @sp->gfn should be write-protected at the call site */ | ||||
| static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | ||||
| 			    struct list_head *invalid_list) | ||||
| { | ||||
| 	if (sp->role.cr4_pae != !!is_pae(vcpu) | ||||
| 	    || vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { | ||||
| 	if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) || | ||||
| 	    vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { | ||||
| 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); | ||||
| 		return false; | ||||
| 	} | ||||
| @ -2423,7 +2429,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | ||||
| 	role.level = level; | ||||
| 	role.direct = direct; | ||||
| 	if (role.direct) | ||||
| 		role.cr4_pae = 0; | ||||
| 		role.gpte_is_8_bytes = true; | ||||
| 	role.access = access; | ||||
| 	if (!vcpu->arch.mmu->direct_map | ||||
| 	    && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { | ||||
| @ -4794,7 +4800,6 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, | ||||
| 
 | ||||
| 	role.base.access = ACC_ALL; | ||||
| 	role.base.nxe = !!is_nx(vcpu); | ||||
| 	role.base.cr4_pae = !!is_pae(vcpu); | ||||
| 	role.base.cr0_wp = is_write_protection(vcpu); | ||||
| 	role.base.smm = is_smm(vcpu); | ||||
| 	role.base.guest_mode = is_guest_mode(vcpu); | ||||
| @ -4815,6 +4820,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) | ||||
| 	role.base.ad_disabled = (shadow_accessed_mask == 0); | ||||
| 	role.base.level = kvm_x86_ops->get_tdp_level(vcpu); | ||||
| 	role.base.direct = true; | ||||
| 	role.base.gpte_is_8_bytes = true; | ||||
| 
 | ||||
| 	return role; | ||||
| } | ||||
| @ -4879,6 +4885,7 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) | ||||
| 	role.base.smap_andnot_wp = role.ext.cr4_smap && | ||||
| 		!is_write_protection(vcpu); | ||||
| 	role.base.direct = !is_paging(vcpu); | ||||
| 	role.base.gpte_is_8_bytes = !!is_pae(vcpu); | ||||
| 
 | ||||
| 	if (!is_long_mode(vcpu)) | ||||
| 		role.base.level = PT32E_ROOT_LEVEL; | ||||
| @ -4918,18 +4925,26 @@ static union kvm_mmu_role | ||||
| kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, | ||||
| 				   bool execonly) | ||||
| { | ||||
| 	union kvm_mmu_role role; | ||||
| 	union kvm_mmu_role role = {0}; | ||||
| 
 | ||||
| 	/* Base role is inherited from root_mmu */ | ||||
| 	role.base.word = vcpu->arch.root_mmu.mmu_role.base.word; | ||||
| 	role.ext = kvm_calc_mmu_role_ext(vcpu); | ||||
| 	/* SMM flag is inherited from root_mmu */ | ||||
| 	role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm; | ||||
| 
 | ||||
| 	role.base.level = PT64_ROOT_4LEVEL; | ||||
| 	role.base.gpte_is_8_bytes = true; | ||||
| 	role.base.direct = false; | ||||
| 	role.base.ad_disabled = !accessed_dirty; | ||||
| 	role.base.guest_mode = true; | ||||
| 	role.base.access = ACC_ALL; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * WP=1 and NOT_WP=1 is an impossible combination, use WP and the | ||||
| 	 * SMAP variation to denote shadow EPT entries. | ||||
| 	 */ | ||||
| 	role.base.cr0_wp = true; | ||||
| 	role.base.smap_andnot_wp = true; | ||||
| 
 | ||||
| 	role.ext = kvm_calc_mmu_role_ext(vcpu); | ||||
| 	role.ext.execonly = execonly; | ||||
| 
 | ||||
| 	return role; | ||||
| @ -5179,7 +5194,7 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, | ||||
| 		 gpa, bytes, sp->role.word); | ||||
| 
 | ||||
| 	offset = offset_in_page(gpa); | ||||
| 	pte_size = sp->role.cr4_pae ? 8 : 4; | ||||
| 	pte_size = sp->role.gpte_is_8_bytes ? 8 : 4; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Sometimes, the OS only writes the last one bytes to update status | ||||
| @ -5203,7 +5218,7 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) | ||||
| 	page_offset = offset_in_page(gpa); | ||||
| 	level = sp->role.level; | ||||
| 	*nspte = 1; | ||||
| 	if (!sp->role.cr4_pae) { | ||||
| 	if (!sp->role.gpte_is_8_bytes) { | ||||
| 		page_offset <<= 1;	/* 32->64 */ | ||||
| 		/*
 | ||||
| 		 * A 32-bit pde maps 4MB while the shadow pdes map | ||||
| @ -5393,10 +5408,12 @@ emulate: | ||||
| 	 * This can happen if a guest gets a page-fault on data access but the HW | ||||
| 	 * table walker is not able to read the instruction page (e.g instruction | ||||
| 	 * page is not present in memory). In those cases we simply restart the | ||||
| 	 * guest. | ||||
| 	 * guest, with the exception of AMD Erratum 1096 which is unrecoverable. | ||||
| 	 */ | ||||
| 	if (unlikely(insn && !insn_len)) | ||||
| 		return 1; | ||||
| 	if (unlikely(insn && !insn_len)) { | ||||
| 		if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu)) | ||||
| 			return 1; | ||||
| 	} | ||||
| 
 | ||||
| 	er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); | ||||
| 
 | ||||
| @ -5509,7 +5526,9 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, | ||||
| 
 | ||||
| 		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { | ||||
| 			if (flush && lock_flush_tlb) { | ||||
| 				kvm_flush_remote_tlbs(kvm); | ||||
| 				kvm_flush_remote_tlbs_with_address(kvm, | ||||
| 						start_gfn, | ||||
| 						iterator.gfn - start_gfn + 1); | ||||
| 				flush = false; | ||||
| 			} | ||||
| 			cond_resched_lock(&kvm->mmu_lock); | ||||
| @ -5517,7 +5536,8 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, | ||||
| 	} | ||||
| 
 | ||||
| 	if (flush && lock_flush_tlb) { | ||||
| 		kvm_flush_remote_tlbs(kvm); | ||||
| 		kvm_flush_remote_tlbs_with_address(kvm, start_gfn, | ||||
| 						   end_gfn - start_gfn + 1); | ||||
| 		flush = false; | ||||
| 	} | ||||
| 
 | ||||
| @ -6011,7 +6031,7 @@ out: | ||||
| /*
 | ||||
|  * Calculate mmu pages needed for kvm. | ||||
|  */ | ||||
| unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) | ||||
| unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm) | ||||
| { | ||||
| 	unsigned int nr_mmu_pages; | ||||
| 	unsigned int  nr_pages = 0; | ||||
|  | ||||
| @ -29,10 +29,10 @@ | ||||
| 								        \ | ||||
| 	role.word = __entry->role;					\ | ||||
| 									\ | ||||
| 	trace_seq_printf(p, "sp gfn %llx l%u%s q%u%s %s%s"		\ | ||||
| 	trace_seq_printf(p, "sp gfn %llx l%u %u-byte q%u%s %s%s"	\ | ||||
| 			 " %snxe %sad root %u %s%c",			\ | ||||
| 			 __entry->gfn, role.level,			\ | ||||
| 			 role.cr4_pae ? " pae" : "",			\ | ||||
| 			 role.gpte_is_8_bytes ? 8 : 4,			\ | ||||
| 			 role.quadrant,					\ | ||||
| 			 role.direct ? " direct" : "",			\ | ||||
| 			 access_str[role.access],			\ | ||||
|  | ||||
| @ -7098,6 +7098,36 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu, | ||||
| 	return -ENODEV; | ||||
| } | ||||
| 
 | ||||
| static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) | ||||
| { | ||||
| 	bool is_user, smap; | ||||
| 
 | ||||
| 	is_user = svm_get_cpl(vcpu) == 3; | ||||
| 	smap = !kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Detect and workaround Errata 1096 Fam_17h_00_0Fh | ||||
| 	 * | ||||
| 	 * In non SEV guest, hypervisor will be able to read the guest | ||||
| 	 * memory to decode the instruction pointer when insn_len is zero | ||||
| 	 * so we return true to indicate that decoding is possible. | ||||
| 	 * | ||||
| 	 * But in the SEV guest, the guest memory is encrypted with the | ||||
| 	 * guest specific key and hypervisor will not be able to decode the | ||||
| 	 * instruction pointer so we will not be able to work around it. Let's | ||||
| 	 * print the error and request to kill the guest. | ||||
| 	 */ | ||||
| 	if (is_user && smap) { | ||||
| 		if (!sev_guest(vcpu->kvm)) | ||||
| 			return true; | ||||
| 
 | ||||
| 		pr_err_ratelimited("KVM: Guest triggered AMD Erratum 1096\n"); | ||||
| 		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); | ||||
| 	} | ||||
| 
 | ||||
| 	return false; | ||||
| } | ||||
| 
 | ||||
| static struct kvm_x86_ops svm_x86_ops __ro_after_init = { | ||||
| 	.cpu_has_kvm_support = has_svm, | ||||
| 	.disabled_by_bios = is_disabled, | ||||
| @ -7231,6 +7261,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { | ||||
| 
 | ||||
| 	.nested_enable_evmcs = nested_enable_evmcs, | ||||
| 	.nested_get_evmcs_version = nested_get_evmcs_version, | ||||
| 
 | ||||
| 	.need_emulation_on_page_fault = svm_need_emulation_on_page_fault, | ||||
| }; | ||||
| 
 | ||||
| static int __init svm_init(void) | ||||
|  | ||||
| @ -2585,6 +2585,11 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, | ||||
| 	    !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || | ||||
| 	    !nested_cr3_valid(vcpu, vmcs12->host_cr3)) | ||||
| 		return -EINVAL; | ||||
| 
 | ||||
| 	if (is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu) || | ||||
| 	    is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)) | ||||
| 		return -EINVAL; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If the load IA32_EFER VM-exit control is 1, bits reserved in the | ||||
| 	 * IA32_EFER MSR must be 0 in the field for that register. In addition, | ||||
|  | ||||
| @ -1683,12 +1683,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) | ||||
| 
 | ||||
| 		msr_info->data = to_vmx(vcpu)->spec_ctrl; | ||||
| 		break; | ||||
| 	case MSR_IA32_ARCH_CAPABILITIES: | ||||
| 		if (!msr_info->host_initiated && | ||||
| 		    !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) | ||||
| 			return 1; | ||||
| 		msr_info->data = to_vmx(vcpu)->arch_capabilities; | ||||
| 		break; | ||||
| 	case MSR_IA32_SYSENTER_CS: | ||||
| 		msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); | ||||
| 		break; | ||||
| @ -1895,11 +1889,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) | ||||
| 		vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, | ||||
| 					      MSR_TYPE_W); | ||||
| 		break; | ||||
| 	case MSR_IA32_ARCH_CAPABILITIES: | ||||
| 		if (!msr_info->host_initiated) | ||||
| 			return 1; | ||||
| 		vmx->arch_capabilities = data; | ||||
| 		break; | ||||
| 	case MSR_IA32_CR_PAT: | ||||
| 		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { | ||||
| 			if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) | ||||
| @ -4088,8 +4077,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) | ||||
| 		++vmx->nmsrs; | ||||
| 	} | ||||
| 
 | ||||
| 	vmx->arch_capabilities = kvm_get_arch_capabilities(); | ||||
| 
 | ||||
| 	vm_exit_controls_init(vmx, vmx_vmexit_ctrl()); | ||||
| 
 | ||||
| 	/* 22.2.1, 20.8.1 */ | ||||
| @ -7409,6 +7396,11 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) | ||||
| { | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static __init int hardware_setup(void) | ||||
| { | ||||
| 	unsigned long host_bndcfgs; | ||||
| @ -7711,6 +7703,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { | ||||
| 	.set_nested_state = NULL, | ||||
| 	.get_vmcs12_pages = NULL, | ||||
| 	.nested_enable_evmcs = NULL, | ||||
| 	.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, | ||||
| }; | ||||
| 
 | ||||
| static void vmx_cleanup_l1d_flush(void) | ||||
|  | ||||
| @ -190,7 +190,6 @@ struct vcpu_vmx { | ||||
| 	u64		      msr_guest_kernel_gs_base; | ||||
| #endif | ||||
| 
 | ||||
| 	u64		      arch_capabilities; | ||||
| 	u64		      spec_ctrl; | ||||
| 
 | ||||
| 	u32 vm_entry_controls_shadow; | ||||
|  | ||||
| @ -1125,7 +1125,7 @@ static u32 msrs_to_save[] = { | ||||
| #endif | ||||
| 	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, | ||||
| 	MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, | ||||
| 	MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES, | ||||
| 	MSR_IA32_SPEC_CTRL, | ||||
| 	MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, | ||||
| 	MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, | ||||
| 	MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, | ||||
| @ -1158,6 +1158,7 @@ static u32 emulated_msrs[] = { | ||||
| 
 | ||||
| 	MSR_IA32_TSC_ADJUST, | ||||
| 	MSR_IA32_TSCDEADLINE, | ||||
| 	MSR_IA32_ARCH_CAPABILITIES, | ||||
| 	MSR_IA32_MISC_ENABLE, | ||||
| 	MSR_IA32_MCG_STATUS, | ||||
| 	MSR_IA32_MCG_CTL, | ||||
| @ -2443,6 +2444,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) | ||||
| 		if (msr_info->host_initiated) | ||||
| 			vcpu->arch.microcode_version = data; | ||||
| 		break; | ||||
| 	case MSR_IA32_ARCH_CAPABILITIES: | ||||
| 		if (!msr_info->host_initiated) | ||||
| 			return 1; | ||||
| 		vcpu->arch.arch_capabilities = data; | ||||
| 		break; | ||||
| 	case MSR_EFER: | ||||
| 		return set_efer(vcpu, data); | ||||
| 	case MSR_K7_HWCR: | ||||
| @ -2747,6 +2753,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) | ||||
| 	case MSR_IA32_UCODE_REV: | ||||
| 		msr_info->data = vcpu->arch.microcode_version; | ||||
| 		break; | ||||
| 	case MSR_IA32_ARCH_CAPABILITIES: | ||||
| 		if (!msr_info->host_initiated && | ||||
| 		    !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) | ||||
| 			return 1; | ||||
| 		msr_info->data = vcpu->arch.arch_capabilities; | ||||
| 		break; | ||||
| 	case MSR_IA32_TSC: | ||||
| 		msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset; | ||||
| 		break; | ||||
| @ -6523,14 +6535,27 @@ int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer); | ||||
| 
 | ||||
| static int complete_fast_pio_out(struct kvm_vcpu *vcpu) | ||||
| { | ||||
| 	vcpu->arch.pio.count = 0; | ||||
| 
 | ||||
| 	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) | ||||
| 		return 1; | ||||
| 
 | ||||
| 	return kvm_skip_emulated_instruction(vcpu); | ||||
| } | ||||
| 
 | ||||
| static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, | ||||
| 			    unsigned short port) | ||||
| { | ||||
| 	unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); | ||||
| 	int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, | ||||
| 					    size, port, &val, 1); | ||||
| 	/* do not return to emulator after return from userspace */ | ||||
| 	vcpu->arch.pio.count = 0; | ||||
| 
 | ||||
| 	if (!ret) { | ||||
| 		vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); | ||||
| 		vcpu->arch.complete_userspace_io = complete_fast_pio_out; | ||||
| 	} | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| @ -6541,6 +6566,11 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) | ||||
| 	/* We should only ever be called with arch.pio.count equal to 1 */ | ||||
| 	BUG_ON(vcpu->arch.pio.count != 1); | ||||
| 
 | ||||
| 	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) { | ||||
| 		vcpu->arch.pio.count = 0; | ||||
| 		return 1; | ||||
| 	} | ||||
| 
 | ||||
| 	/* For size less than 4 we merge, else we zero extend */ | ||||
| 	val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) | ||||
| 					: 0; | ||||
| @ -6553,7 +6583,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) | ||||
| 				 vcpu->arch.pio.port, &val, 1); | ||||
| 	kvm_register_write(vcpu, VCPU_REGS_RAX, val); | ||||
| 
 | ||||
| 	return 1; | ||||
| 	return kvm_skip_emulated_instruction(vcpu); | ||||
| } | ||||
| 
 | ||||
| static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, | ||||
| @ -6572,6 +6602,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, | ||||
| 		return ret; | ||||
| 	} | ||||
| 
 | ||||
| 	vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); | ||||
| 	vcpu->arch.complete_userspace_io = complete_fast_pio_in; | ||||
| 
 | ||||
| 	return 0; | ||||
| @ -6579,16 +6610,13 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, | ||||
| 
 | ||||
| int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in) | ||||
| { | ||||
| 	int ret = kvm_skip_emulated_instruction(vcpu); | ||||
| 	int ret; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered | ||||
| 	 * KVM_EXIT_DEBUG here. | ||||
| 	 */ | ||||
| 	if (in) | ||||
| 		return kvm_fast_pio_in(vcpu, size, port) && ret; | ||||
| 		ret = kvm_fast_pio_in(vcpu, size, port); | ||||
| 	else | ||||
| 		return kvm_fast_pio_out(vcpu, size, port) && ret; | ||||
| 		ret = kvm_fast_pio_out(vcpu, size, port); | ||||
| 	return ret && kvm_skip_emulated_instruction(vcpu); | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(kvm_fast_pio); | ||||
| 
 | ||||
| @ -8733,6 +8761,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, | ||||
| 
 | ||||
| int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | ||||
| { | ||||
| 	vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); | ||||
| 	vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; | ||||
| 	kvm_vcpu_mtrr_init(vcpu); | ||||
| 	vcpu_load(vcpu); | ||||
| @ -9429,13 +9458,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, | ||||
| 				const struct kvm_memory_slot *new, | ||||
| 				enum kvm_mr_change change) | ||||
| { | ||||
| 	int nr_mmu_pages = 0; | ||||
| 
 | ||||
| 	if (!kvm->arch.n_requested_mmu_pages) | ||||
| 		nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); | ||||
| 
 | ||||
| 	if (nr_mmu_pages) | ||||
| 		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); | ||||
| 		kvm_mmu_change_mmu_pages(kvm, | ||||
| 				kvm_mmu_calculate_default_mmu_pages(kvm)); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Dirty logging tracks sptes in 4k granularity, meaning that large | ||||
|  | ||||
| @ -15,6 +15,7 @@ generic-y += irq_work.h | ||||
| generic-y += kdebug.h | ||||
| generic-y += kmap_types.h | ||||
| generic-y += kprobes.h | ||||
| generic-y += kvm_para.h | ||||
| generic-y += local.h | ||||
| generic-y += local64.h | ||||
| generic-y += mcs_spinlock.h | ||||
|  | ||||
| @ -1,2 +1 @@ | ||||
| generated-y += unistd_32.h | ||||
| generic-y += kvm_para.h | ||||
|  | ||||
| @ -7,5 +7,7 @@ no-export-headers += kvm.h | ||||
| endif | ||||
| 
 | ||||
| ifeq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/kvm_para.h),) | ||||
| ifeq ($(wildcard $(objtree)/arch/$(SRCARCH)/include/generated/uapi/asm/kvm_para.h),) | ||||
| no-export-headers += kvm_para.h | ||||
| endif | ||||
| endif | ||||
|  | ||||
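In effect, together with the per-architecture Kbuild changes elsewhere in this diff, <linux/kvm_para.h> (and the asm/kvm_para.h it pulls in) is exported by "make headers_install" only when the architecture either ships its own uapi asm/kvm_para.h or generates one via a generic-y entry in its uapi Kbuild; architectures without KVM support now provide kvm_para.h only on the kernel side, so the header is no longer exported for them.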
| @ -29,8 +29,8 @@ LIBKVM += $(LIBKVM_$(UNAME_M)) | ||||
| INSTALL_HDR_PATH = $(top_srcdir)/usr | ||||
| LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ | ||||
| LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include | ||||
| CFLAGS += -O2 -g -std=gnu99 -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -Iinclude/$(UNAME_M) -I.. | ||||
| LDFLAGS += -pthread | ||||
| CFLAGS += -O2 -g -std=gnu99 -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -Iinclude/$(UNAME_M) -I.. | ||||
| LDFLAGS += -pthread -no-pie | ||||
| 
 | ||||
| # After inclusion, $(OUTPUT) is defined and
 | ||||
| # $(TEST_GEN_PROGS) starts with $(OUTPUT)/
 | ||||
|  | ||||
| @ -102,6 +102,7 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva); | ||||
| struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid); | ||||
| void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); | ||||
| int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); | ||||
| void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid); | ||||
| void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, | ||||
| 		       struct kvm_mp_state *mp_state); | ||||
| void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs); | ||||
|  | ||||
| @ -1121,6 +1121,22 @@ int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) | ||||
| 	return rc; | ||||
| } | ||||
| 
 | ||||
| void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid) | ||||
| { | ||||
| 	struct vcpu *vcpu = vcpu_find(vm, vcpuid); | ||||
| 	int ret; | ||||
| 
 | ||||
| 	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); | ||||
| 
 | ||||
| 	vcpu->state->immediate_exit = 1; | ||||
| 	ret = ioctl(vcpu->fd, KVM_RUN, NULL); | ||||
| 	vcpu->state->immediate_exit = 0; | ||||
| 
 | ||||
| 	TEST_ASSERT(ret == -1 && errno == EINTR, | ||||
| 		    "KVM_RUN IOCTL didn't exit immediately, rc: %i, errno: %i", | ||||
| 		    ret, errno); | ||||
| } | ||||
| 
 | ||||
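A sketch of the intended use of this helper (mirroring the state_test change later in this diff): when the last exit was KVM_EXIT_IO, complete the IO before saving state, e.g.

    /* finish pending IO so the saved register state is consistent */
    vcpu_run_complete_io(vm, VCPU_ID);
    state = vcpu_save_state(vm, VCPU_ID);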
| /*
 | ||||
|  * VM VCPU Set MP State | ||||
|  * | ||||
|  | ||||
| @ -87,22 +87,25 @@ int main(int argc, char *argv[]) | ||||
| 	while (1) { | ||||
| 		rc = _vcpu_run(vm, VCPU_ID); | ||||
| 
 | ||||
| 		if (run->exit_reason == KVM_EXIT_IO) { | ||||
| 			switch (get_ucall(vm, VCPU_ID, &uc)) { | ||||
| 			case UCALL_SYNC: | ||||
| 				/* emulate hypervisor clearing CR4.OSXSAVE */ | ||||
| 				vcpu_sregs_get(vm, VCPU_ID, &sregs); | ||||
| 				sregs.cr4 &= ~X86_CR4_OSXSAVE; | ||||
| 				vcpu_sregs_set(vm, VCPU_ID, &sregs); | ||||
| 				break; | ||||
| 			case UCALL_ABORT: | ||||
| 				TEST_ASSERT(false, "Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit."); | ||||
| 				break; | ||||
| 			case UCALL_DONE: | ||||
| 				goto done; | ||||
| 			default: | ||||
| 				TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd); | ||||
| 			} | ||||
| 		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, | ||||
| 			    "Unexpected exit reason: %u (%s),\n", | ||||
| 			    run->exit_reason, | ||||
| 			    exit_reason_str(run->exit_reason)); | ||||
| 
 | ||||
| 		switch (get_ucall(vm, VCPU_ID, &uc)) { | ||||
| 		case UCALL_SYNC: | ||||
| 			/* emulate hypervisor clearing CR4.OSXSAVE */ | ||||
| 			vcpu_sregs_get(vm, VCPU_ID, &sregs); | ||||
| 			sregs.cr4 &= ~X86_CR4_OSXSAVE; | ||||
| 			vcpu_sregs_set(vm, VCPU_ID, &sregs); | ||||
| 			break; | ||||
| 		case UCALL_ABORT: | ||||
| 			TEST_ASSERT(false, "Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit."); | ||||
| 			break; | ||||
| 		case UCALL_DONE: | ||||
| 			goto done; | ||||
| 		default: | ||||
| 			TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
|  | ||||
| @ -134,6 +134,11 @@ int main(int argc, char *argv[]) | ||||
| 
 | ||||
| 	struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1); | ||||
| 
 | ||||
| 	if (!kvm_check_cap(KVM_CAP_IMMEDIATE_EXIT)) { | ||||
| 		fprintf(stderr, "immediate_exit not available, skipping test\n"); | ||||
| 		exit(KSFT_SKIP); | ||||
| 	} | ||||
| 
 | ||||
| 	/* Create VM */ | ||||
| 	vm = vm_create_default(VCPU_ID, 0, guest_code); | ||||
| 	vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); | ||||
| @ -156,8 +161,6 @@ int main(int argc, char *argv[]) | ||||
| 			    stage, run->exit_reason, | ||||
| 			    exit_reason_str(run->exit_reason)); | ||||
| 
 | ||||
| 		memset(&regs1, 0, sizeof(regs1)); | ||||
| 		vcpu_regs_get(vm, VCPU_ID, &regs1); | ||||
| 		switch (get_ucall(vm, VCPU_ID, &uc)) { | ||||
| 		case UCALL_ABORT: | ||||
| 			TEST_ASSERT(false, "%s at %s:%d", (const char *)uc.args[0], | ||||
| @ -176,6 +179,17 @@ int main(int argc, char *argv[]) | ||||
| 			    uc.args[1] == stage, "Unexpected register values vmexit #%lx, got %lx", | ||||
| 			    stage, (ulong)uc.args[1]); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * When KVM exits to userspace with KVM_EXIT_IO, KVM guarantees | ||||
| 		 * guest state is consistent only after userspace re-enters the | ||||
| 		 * kernel with KVM_RUN.  Complete IO prior to migrating state | ||||
| 		 * to a new VM. | ||||
| 		 */ | ||||
| 		vcpu_run_complete_io(vm, VCPU_ID); | ||||
| 
 | ||||
| 		memset(&regs1, 0, sizeof(regs1)); | ||||
| 		vcpu_regs_get(vm, VCPU_ID, &regs1); | ||||
| 
 | ||||
| 		state = vcpu_save_state(vm, VCPU_ID); | ||||
| 		kvm_vm_release(vm); | ||||
| 
 | ||||
|  | ||||
| @ -222,7 +222,7 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if (used_lrs) { | ||||
| 	if (used_lrs || cpu_if->its_vpe.its_vm) { | ||||
| 		int i; | ||||
| 		u32 elrsr; | ||||
| 
 | ||||
| @ -247,7 +247,7 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) | ||||
| 	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; | ||||
| 	int i; | ||||
| 
 | ||||
| 	if (used_lrs) { | ||||
| 	if (used_lrs || cpu_if->its_vpe.its_vm) { | ||||
| 		write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); | ||||
| 
 | ||||
| 		for (i = 0; i < used_lrs; i++) | ||||
|  | ||||
| @ -102,8 +102,7 @@ static bool kvm_is_device_pfn(unsigned long pfn) | ||||
|  * @addr:	IPA | ||||
|  * @pmd:	pmd pointer for IPA | ||||
|  * | ||||
|  * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all | ||||
|  * pages in the range dirty. | ||||
|  * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. | ||||
|  */ | ||||
| static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) | ||||
| { | ||||
| @ -121,8 +120,7 @@ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) | ||||
|  * @addr:	IPA | ||||
|  * @pud:	pud pointer for IPA | ||||
|  * | ||||
|  * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. Marks all | ||||
|  * pages in the range dirty. | ||||
|  * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. | ||||
|  */ | ||||
| static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp) | ||||
| { | ||||
| @ -899,9 +897,8 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, | ||||
|  * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. | ||||
|  * @kvm:	The KVM struct pointer for the VM. | ||||
|  * | ||||
|  * Allocates only the stage-2 HW PGD level table(s) (can support either full | ||||
|  * 40-bit input addresses or limited to 32-bit input addresses). Clears the | ||||
|  * allocated pages. | ||||
|  * Allocates only the stage-2 HW PGD level table(s) of size defined by | ||||
|  * stage2_pgd_size(kvm). | ||||
|  * | ||||
|  * Note we don't need locking here as this is only called when the VM is | ||||
|  * created, which can only be done once. | ||||
| @ -1067,25 +1064,43 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache | ||||
| { | ||||
| 	pmd_t *pmd, old_pmd; | ||||
| 
 | ||||
| retry: | ||||
| 	pmd = stage2_get_pmd(kvm, cache, addr); | ||||
| 	VM_BUG_ON(!pmd); | ||||
| 
 | ||||
| 	old_pmd = *pmd; | ||||
| 	/*
 | ||||
| 	 * Multiple vcpus faulting on the same PMD entry, can | ||||
| 	 * lead to them sequentially updating the PMD with the | ||||
| 	 * same value. Following the break-before-make | ||||
| 	 * (pmd_clear() followed by tlb_flush()) process can | ||||
| 	 * hinder forward progress due to refaults generated | ||||
| 	 * on missing translations. | ||||
| 	 * | ||||
| 	 * Skip updating the page table if the entry is | ||||
| 	 * unchanged. | ||||
| 	 */ | ||||
| 	if (pmd_val(old_pmd) == pmd_val(*new_pmd)) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	if (pmd_present(old_pmd)) { | ||||
| 		/*
 | ||||
| 		 * Multiple vcpus faulting on the same PMD entry, can | ||||
| 		 * lead to them sequentially updating the PMD with the | ||||
| 		 * same value. Following the break-before-make | ||||
| 		 * (pmd_clear() followed by tlb_flush()) process can | ||||
| 		 * hinder forward progress due to refaults generated | ||||
| 		 * on missing translations. | ||||
| 		 * If we already have PTE level mapping for this block, | ||||
| 		 * we must unmap it to avoid inconsistent TLB state and | ||||
| 		 * leaking the table page. We could end up in this situation | ||||
| 		 * if the memory slot was marked for dirty logging and was | ||||
| 		 * reverted, leaving PTE level mappings for the pages accessed | ||||
| 		 * during the period. So, unmap the PTE level mapping for this | ||||
| 		 * block and retry, as we could have released the upper level | ||||
| 		 * table in the process. | ||||
| 		 * | ||||
| 		 * Skip updating the page table if the entry is | ||||
| 		 * unchanged. | ||||
| 		 * Normal THP split/merge follows mmu_notifier callbacks and do | ||||
| 		 * get handled accordingly. | ||||
| 		 */ | ||||
| 		if (pmd_val(old_pmd) == pmd_val(*new_pmd)) | ||||
| 			return 0; | ||||
| 
 | ||||
| 		if (!pmd_thp_or_huge(old_pmd)) { | ||||
| 			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE); | ||||
| 			goto retry; | ||||
| 		} | ||||
| 		/*
 | ||||
| 		 * Mapping in huge pages should only happen through a | ||||
| 		 * fault.  If a page is merged into a transparent huge | ||||
| @ -1097,8 +1112,7 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache | ||||
| 		 * should become splitting first, unmapped, merged, | ||||
| 		 * and mapped back in on-demand. | ||||
| 		 */ | ||||
| 		VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); | ||||
| 
 | ||||
| 		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); | ||||
| 		pmd_clear(pmd); | ||||
| 		kvm_tlb_flush_vmid_ipa(kvm, addr); | ||||
| 	} else { | ||||
| @ -1114,6 +1128,7 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac | ||||
| { | ||||
| 	pud_t *pudp, old_pud; | ||||
| 
 | ||||
| retry: | ||||
| 	pudp = stage2_get_pud(kvm, cache, addr); | ||||
| 	VM_BUG_ON(!pudp); | ||||
| 
 | ||||
| @ -1121,14 +1136,23 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * A large number of vcpus faulting on the same stage 2 entry, | ||||
| 	 * can lead to a refault due to the | ||||
| 	 * stage2_pud_clear()/tlb_flush(). Skip updating the page | ||||
| 	 * tables if there is no change. | ||||
| 	 * can lead to a refault due to the stage2_pud_clear()/tlb_flush(). | ||||
| 	 * Skip updating the page tables if there is no change. | ||||
| 	 */ | ||||
| 	if (pud_val(old_pud) == pud_val(*new_pudp)) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	if (stage2_pud_present(kvm, old_pud)) { | ||||
| 		/*
 | ||||
| 		 * If we already have table level mapping for this block, unmap | ||||
| 		 * the range for this block and retry. | ||||
| 		 */ | ||||
| 		if (!stage2_pud_huge(kvm, old_pud)) { | ||||
| 			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE); | ||||
| 			goto retry; | ||||
| 		} | ||||
| 
 | ||||
| 		WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp)); | ||||
| 		stage2_pud_clear(kvm, pudp); | ||||
| 		kvm_tlb_flush_vmid_ipa(kvm, addr); | ||||
| 	} else { | ||||
| @ -1451,13 +1475,11 @@ static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud, | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|   * stage2_wp_puds - write protect PGD range | ||||
|   * @pgd:	pointer to pgd entry | ||||
|   * @addr:	range start address | ||||
|   * @end:	range end address | ||||
|   * | ||||
|   * Process PUD entries, for a huge PUD we cause a panic. | ||||
|   */ | ||||
|  * stage2_wp_puds - write protect PGD range | ||||
|  * @pgd:	pointer to pgd entry | ||||
|  * @addr:	range start address | ||||
|  * @end:	range end address | ||||
|  */ | ||||
| static void  stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, | ||||
| 			    phys_addr_t addr, phys_addr_t end) | ||||
| { | ||||
| @ -1594,8 +1616,9 @@ static void kvm_send_hwpoison_signal(unsigned long address, | ||||
| 	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); | ||||
| } | ||||
| 
 | ||||
| static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot, | ||||
| 					       unsigned long hva) | ||||
| static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, | ||||
| 					       unsigned long hva, | ||||
| 					       unsigned long map_size) | ||||
| { | ||||
| 	gpa_t gpa_start; | ||||
| 	hva_t uaddr_start, uaddr_end; | ||||
| @ -1610,34 +1633,34 @@ static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot, | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Pages belonging to memslots that don't have the same alignment | ||||
| 	 * within a PMD for userspace and IPA cannot be mapped with stage-2 | ||||
| 	 * PMD entries, because we'll end up mapping the wrong pages. | ||||
| 	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 | ||||
| 	 * PMD/PUD entries, because we'll end up mapping the wrong pages. | ||||
| 	 * | ||||
| 	 * Consider a layout like the following: | ||||
| 	 * | ||||
| 	 *    memslot->userspace_addr: | ||||
| 	 *    +-----+--------------------+--------------------+---+ | ||||
| 	 *    |abcde|fgh  Stage-1 PMD    |    Stage-1 PMD   tv|xyz| | ||||
| 	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz| | ||||
| 	 *    +-----+--------------------+--------------------+---+ | ||||
| 	 * | ||||
| 	 *    memslot->base_gfn << PAGE_SIZE: | ||||
| 	 *      +---+--------------------+--------------------+-----+ | ||||
| 	 *      |abc|def  Stage-2 PMD    |    Stage-2 PMD     |tvxyz| | ||||
| 	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz| | ||||
| 	 *      +---+--------------------+--------------------+-----+ | ||||
| 	 * | ||||
| 	 * If we create those stage-2 PMDs, we'll end up with this incorrect | ||||
| 	 * If we create those stage-2 blocks, we'll end up with this incorrect | ||||
| 	 * mapping: | ||||
| 	 *   d -> f | ||||
| 	 *   e -> g | ||||
| 	 *   f -> h | ||||
| 	 */ | ||||
| 	if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK)) | ||||
| 	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) | ||||
| 		return false; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Next, let's make sure we're not trying to map anything not covered | ||||
| 	 * by the memslot. This means we have to prohibit PMD size mappings | ||||
| 	 * for the beginning and end of a non-PMD aligned and non-PMD sized | ||||
| 	 * by the memslot. This means we have to prohibit block size mappings | ||||
| 	 * for the beginning and end of a non-block aligned and non-block sized | ||||
| 	 * memory slot (illustrated by the head and tail parts of the | ||||
| 	 * userspace view above containing pages 'abcde' and 'xyz', | ||||
| 	 * respectively). | ||||
| @ -1646,8 +1669,8 @@ static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot, | ||||
| 	 * userspace_addr or the base_gfn, as both are equally aligned (per | ||||
| 	 * the check above) and equally sized. | ||||
| 	 */ | ||||
| 	return (hva & S2_PMD_MASK) >= uaddr_start && | ||||
| 	       (hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end; | ||||
| 	return (hva & ~(map_size - 1)) >= uaddr_start && | ||||
| 	       (hva & ~(map_size - 1)) + map_size <= uaddr_end; | ||||
| } | ||||
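
The generalised check above only relies on map_size being a power of two: the low bits give the offset inside one block, so the userspace address and the IPA can share a stage-2 block mapping only if those offsets agree, and the whole block around the faulting address must sit inside the memslot. A small stand-alone demonstration of the same arithmetic, with hypothetical addresses not taken from the patch:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Same arithmetic as the check above, on plain integers.  map_size must be a
 * power of two (e.g. 2 MiB for a PMD block, 1 GiB for a PUD block). */
static bool supports_huge_mapping(uint64_t gpa_start, uint64_t uaddr_start,
                                  uint64_t uaddr_end, uint64_t hva,
                                  uint64_t map_size)
{
        /* Offsets inside one block must match, or the block mapping would
         * translate to the wrong pages. */
        if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
                return false;

        /* The block containing the faulting address must be fully covered
         * by the memslot's userspace range. */
        return (hva & ~(map_size - 1)) >= uaddr_start &&
               (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}

int main(void)
{
        uint64_t two_mib = 2ULL << 20;

        /* Aligned case: both start 0x1000 bytes into a 2 MiB block -> usable. */
        printf("%d\n", supports_huge_mapping(0x80001000, 0x7f0001000ULL,
                                             0x7f0001000ULL + (64ULL << 20),
                                             0x7f0200000ULL, two_mib));
        /* Misaligned case: offsets differ -> must fall back to PTE mappings. */
        printf("%d\n", supports_huge_mapping(0x80003000, 0x7f0001000ULL,
                                             0x7f0001000ULL + (64ULL << 20),
                                             0x7f0200000ULL, two_mib));
        return 0;
}
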
| 
 | ||||
| static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, | ||||
| @ -1676,12 +1699,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, | ||||
| 		return -EFAULT; | ||||
| 	} | ||||
| 
 | ||||
| 	if (!fault_supports_stage2_pmd_mappings(memslot, hva)) | ||||
| 		force_pte = true; | ||||
| 
 | ||||
| 	if (logging_active) | ||||
| 		force_pte = true; | ||||
| 
 | ||||
| 	/* Let's check if we will get back a huge page backed by hugetlbfs */ | ||||
| 	down_read(¤t->mm->mmap_sem); | ||||
| 	vma = find_vma_intersection(current->mm, hva, hva + 1); | ||||
| @ -1692,6 +1709,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, | ||||
| 	} | ||||
| 
 | ||||
| 	vma_pagesize = vma_kernel_pagesize(vma); | ||||
| 	if (logging_active || | ||||
| 	    !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) { | ||||
| 		force_pte = true; | ||||
| 		vma_pagesize = PAGE_SIZE; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * The stage2 has a minimum of 2 level table (For arm64 see | ||||
| 	 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can | ||||
| @ -1699,11 +1722,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, | ||||
| 	 * As for PUD huge maps, we must make sure that we have at least | ||||
| 	 * 3 levels, i.e, PMD is not folded. | ||||
| 	 */ | ||||
| 	if ((vma_pagesize == PMD_SIZE || | ||||
| 	     (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) && | ||||
| 	    !force_pte) { | ||||
| 	if (vma_pagesize == PMD_SIZE || | ||||
| 	    (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) | ||||
| 		gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; | ||||
| 	} | ||||
| 	up_read(¤t->mm->mmap_sem); | ||||
| 
 | ||||
| 	/* We need minimum second+third level pages */ | ||||
|  | ||||
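
When the backing VMA turns out to be PMD- or PUD-sized, the hunks above recompute the gfn by masking the faulting IPA down to the start of that huge page, so a single stage-2 entry covers the whole block. A worked example of that masking for a 2 MiB (PMD-sized) backing page, with a made-up fault address:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT      12
#define PMD_SIZE_EX     (2ULL << 20)    /* 2 MiB, illustrative value */
#define HUGE_MASK(sz)   (~((sz) - 1))   /* plays the role of huge_page_mask() */

int main(void)
{
        uint64_t fault_ipa = 0x40123456; /* hypothetical guest physical fault address */
        uint64_t gfn = (fault_ipa & HUGE_MASK(PMD_SIZE_EX)) >> PAGE_SHIFT;

        /* 0x40123456 rounds down to 0x40000000, i.e. gfn 0x40000; the 2 MiB
         * block starting there is what gets installed in the stage-2 table. */
        printf("gfn = 0x%llx\n", (unsigned long long)gfn);
        return 0;
}
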
| @ -754,8 +754,9 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, | ||||
| 	u64 indirect_ptr, type = GITS_BASER_TYPE(baser); | ||||
| 	phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser); | ||||
| 	int esz = GITS_BASER_ENTRY_SIZE(baser); | ||||
| 	int index; | ||||
| 	int index, idx; | ||||
| 	gfn_t gfn; | ||||
| 	bool ret; | ||||
| 
 | ||||
| 	switch (type) { | ||||
| 	case GITS_BASER_TYPE_DEVICE: | ||||
| @ -782,7 +783,8 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, | ||||
| 
 | ||||
| 		if (eaddr) | ||||
| 			*eaddr = addr; | ||||
| 		return kvm_is_visible_gfn(its->dev->kvm, gfn); | ||||
| 
 | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	/* calculate and check the index into the 1st level */ | ||||
| @ -812,7 +814,12 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, | ||||
| 
 | ||||
| 	if (eaddr) | ||||
| 		*eaddr = indirect_ptr; | ||||
| 	return kvm_is_visible_gfn(its->dev->kvm, gfn); | ||||
| 
 | ||||
| out: | ||||
| 	idx = srcu_read_lock(&its->dev->kvm->srcu); | ||||
| 	ret = kvm_is_visible_gfn(its->dev->kvm, gfn); | ||||
| 	srcu_read_unlock(&its->dev->kvm->srcu, idx); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static int vgic_its_alloc_collection(struct vgic_its *its, | ||||
| @ -1729,8 +1736,8 @@ static void vgic_its_destroy(struct kvm_device *kvm_dev) | ||||
| 	kfree(its); | ||||
| } | ||||
| 
 | ||||
| int vgic_its_has_attr_regs(struct kvm_device *dev, | ||||
| 			   struct kvm_device_attr *attr) | ||||
| static int vgic_its_has_attr_regs(struct kvm_device *dev, | ||||
| 				  struct kvm_device_attr *attr) | ||||
| { | ||||
| 	const struct vgic_register_region *region; | ||||
| 	gpa_t offset = attr->attr; | ||||
| @ -1750,9 +1757,9 @@ int vgic_its_has_attr_regs(struct kvm_device *dev, | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| int vgic_its_attr_regs_access(struct kvm_device *dev, | ||||
| 			      struct kvm_device_attr *attr, | ||||
| 			      u64 *reg, bool is_write) | ||||
| static int vgic_its_attr_regs_access(struct kvm_device *dev, | ||||
| 				     struct kvm_device_attr *attr, | ||||
| 				     u64 *reg, bool is_write) | ||||
| { | ||||
| 	const struct vgic_register_region *region; | ||||
| 	struct vgic_its *its; | ||||
| @ -1919,7 +1926,7 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, | ||||
| 	       ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | | ||||
| 		ite->collection->collection_id; | ||||
| 	val = cpu_to_le64(val); | ||||
| 	return kvm_write_guest(kvm, gpa, &val, ite_esz); | ||||
| 	return kvm_write_guest_lock(kvm, gpa, &val, ite_esz); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
| @ -2066,7 +2073,7 @@ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, | ||||
| 	       (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) | | ||||
| 		(dev->num_eventid_bits - 1)); | ||||
| 	val = cpu_to_le64(val); | ||||
| 	return kvm_write_guest(kvm, ptr, &val, dte_esz); | ||||
| 	return kvm_write_guest_lock(kvm, ptr, &val, dte_esz); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
| @ -2246,7 +2253,7 @@ static int vgic_its_save_cte(struct vgic_its *its, | ||||
| 	       ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) | | ||||
| 	       collection->collection_id); | ||||
| 	val = cpu_to_le64(val); | ||||
| 	return kvm_write_guest(its->dev->kvm, gpa, &val, esz); | ||||
| 	return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz); | ||||
| } | ||||
| 
 | ||||
| static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) | ||||
| @ -2317,7 +2324,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its) | ||||
| 	 */ | ||||
| 	val = 0; | ||||
| 	BUG_ON(cte_esz > sizeof(val)); | ||||
| 	ret = kvm_write_guest(its->dev->kvm, gpa, &val, cte_esz); | ||||
| 	ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
|  | ||||
| @ -358,7 +358,7 @@ retry: | ||||
| 	if (status) { | ||||
| 		/* clear consumed data */ | ||||
| 		val &= ~(1 << bit_nr); | ||||
| 		ret = kvm_write_guest(kvm, ptr, &val, 1); | ||||
| 		ret = kvm_write_guest_lock(kvm, ptr, &val, 1); | ||||
| 		if (ret) | ||||
| 			return ret; | ||||
| 	} | ||||
| @ -409,7 +409,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm) | ||||
| 		else | ||||
| 			val &= ~(1 << bit_nr); | ||||
| 
 | ||||
| 		ret = kvm_write_guest(kvm, ptr, &val, 1); | ||||
| 		ret = kvm_write_guest_lock(kvm, ptr, &val, 1); | ||||
| 		if (ret) | ||||
| 			return ret; | ||||
| 	} | ||||
|  | ||||
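
The kvm_write_guest() call sites above (the ITS table savers and the GICv3 pending-table writer), like the kvm_is_visible_gfn() lookup in vgic_its_check_id(), walk memslots that are protected by kvm->srcu, so they must run under the SRCU read-side lock. They are switched to kvm_write_guest_lock(); a minimal sketch of what such a wrapper amounts to, assuming it simply brackets the write with the SRCU read lock (the real helper lives in the vgic headers):

/* Sketch only -- the point is the srcu_read_lock()/srcu_read_unlock()
 * pairing around the memslot access, matching what the vgic_its_check_id()
 * hunk does by hand. */
static inline int kvm_write_guest_locked(struct kvm *kvm, gpa_t gpa,
                                         const void *data, unsigned long len)
{
        int srcu_idx, ret;

        srcu_idx = srcu_read_lock(&kvm->srcu);
        ret = kvm_write_guest(kvm, gpa, data, len);
        srcu_read_unlock(&kvm->srcu, srcu_idx);

        return ret;
}
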
| @ -867,15 +867,21 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) | ||||
| 	 * either observe the new interrupt before or after doing this check, | ||||
| 	 * and introducing additional synchronization mechanism doesn't change | ||||
| 	 * this. | ||||
| 	 * | ||||
| 	 * Note that we still need to go through the whole thing if anything | ||||
| 	 * can be directly injected (GICv4). | ||||
| 	 */ | ||||
| 	if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) | ||||
| 	if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) && | ||||
| 	    !vgic_supports_direct_msis(vcpu->kvm)) | ||||
| 		return; | ||||
| 
 | ||||
| 	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); | ||||
| 
 | ||||
| 	raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); | ||||
| 	vgic_flush_lr_state(vcpu); | ||||
| 	raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); | ||||
| 	if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) { | ||||
| 		raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); | ||||
| 		vgic_flush_lr_state(vcpu); | ||||
| 		raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); | ||||
| 	} | ||||
| 
 | ||||
| 	if (can_access_vgic_from_kernel()) | ||||
| 		vgic_restore_state(vcpu); | ||||
|  | ||||
| @ -214,9 +214,9 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) | ||||
| 
 | ||||
| 	if (flags & EPOLLHUP) { | ||||
| 		/* The eventfd is closing, detach from KVM */ | ||||
| 		unsigned long flags; | ||||
| 		unsigned long iflags; | ||||
| 
 | ||||
| 		spin_lock_irqsave(&kvm->irqfds.lock, flags); | ||||
| 		spin_lock_irqsave(&kvm->irqfds.lock, iflags); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * We must check if someone deactivated the irqfd before | ||||
| @ -230,7 +230,7 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) | ||||
| 		if (irqfd_is_active(irqfd)) | ||||
| 			irqfd_deactivate(irqfd); | ||||
| 
 | ||||
| 		spin_unlock_irqrestore(&kvm->irqfds.lock, flags); | ||||
| 		spin_unlock_irqrestore(&kvm->irqfds.lock, iflags); | ||||
| 	} | ||||
| 
 | ||||
| 	return 0; | ||||
|  | ||||
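
The eventfd.c hunk above fixes a shadowing problem: the inner unsigned long used for spin_lock_irqsave() reused the name of the function's existing flags value, so the saved IRQ state silently hid the outer variable. A tiny stand-alone illustration of the hazard (hypothetical values; compiling with -Wshadow makes the compiler point it out):

#include <stdio.h>

int main(void)
{
        unsigned int flags = 0x8;               /* stands in for the poll flags */

        {
                /* Inner declaration shadows the outer one: anything written
                 * here (e.g. saved IRQ state) is invisible outside the block,
                 * and the outer value is unreachable inside it. */
                unsigned long flags = 0xdeadbeef;
                (void)flags;
        }

        printf("outer flags = %#x\n", flags);   /* still 0x8, untouched */
        return 0;
}
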
| @ -2905,6 +2905,9 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, | ||||
| { | ||||
| 	struct kvm_device *dev = filp->private_data; | ||||
| 
 | ||||
| 	if (dev->kvm->mm != current->mm) | ||||
| 		return -EIO; | ||||
| 
 | ||||
| 	switch (ioctl) { | ||||
| 	case KVM_SET_DEVICE_ATTR: | ||||
| 		return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); | ||||
|  | ||||
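
The final hunk makes device fds follow the same ownership rule as VM and vCPU fds: ioctls are honoured only from the process (address space) that created the VM, and anything else gets -EIO. A sketch of that shared guard, assuming the same mm comparison the VM ioctl path already performs:

/* Sketch of the common pattern (not the exact kernel code): any KVM file
 * descriptor tied to a VM first checks that the caller's address space is
 * the one that created the VM before acting on the ioctl. */
static long guarded_ioctl(struct kvm *kvm, unsigned int ioctl, unsigned long arg)
{
        if (kvm->mm != current->mm)     /* e.g. a fork()ed child, or a process
                                           that received the fd over a socket */
                return -EIO;

        return do_real_ioctl(kvm, ioctl, arg);  /* hypothetical dispatch helper */
}
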