From a88a9ec1729a4687370856567919e1ab79a02483 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Wed, 10 Apr 2024 16:25:03 +0100
Subject: [PATCH 1/8] KVM: arm64: Improve out-of-order sysreg table diagnostics

Adding new entries to our system register tables is a painful
exercise, as we require them to be ordered by Op0,Op1,CRn,CRm,Op2.
If an entry is misordered, we output an error that indicates the
pointer to the entry and the number *of the last valid one*. That's
not very helpful, and it would be much better to print the number of
the *offending* entry as well as its name (which is present in the
vast majority of cases).

This makes debugging new additions to the tables much easier.

Reviewed-by: Zenghui Yu
Link: https://lore.kernel.org/r/20240410152503.3593890-1-maz@kernel.org
Signed-off-by: Marc Zyngier
---
 arch/arm64/kvm/sys_regs.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index c9f4f387155f..33efa441e21d 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -3069,12 +3069,14 @@ static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n,
 
 	for (i = 0; i < n; i++) {
 		if (!is_32 && table[i].reg && !table[i].reset) {
-			kvm_err("sys_reg table %pS entry %d lacks reset\n", &table[i], i);
+			kvm_err("sys_reg table %pS entry %d (%s) lacks reset\n",
+				&table[i], i, table[i].name);
 			return false;
 		}
 
 		if (i && cmp_sys_reg(&table[i-1], &table[i]) >= 0) {
-			kvm_err("sys_reg table %pS entry %d out of order\n", &table[i - 1], i - 1);
+			kvm_err("sys_reg table %pS entry %d (%s -> %s) out of order\n",
+				&table[i], i, table[i - 1].name, table[i].name);
 			return false;
 		}
 	}

From ae69e7740770d8e02915b299d060c9280a3db01c Mon Sep 17 00:00:00 2001
From: Sebastian Ene
Date: Thu, 11 Apr 2024 13:56:59 +0000
Subject: [PATCH 2/8] KVM: arm64: Remove FFA_MSG_SEND_DIRECT_REQ from the denylist

The denylist blocks the 32-bit version of the call but allows the
64-bit version of it. There is no reason to block only one of them,
and the hypervisor should support these calls.

Signed-off-by: Sebastian Ene
Link: https://lore.kernel.org/r/20240411135700.2140550-1-sebastianene@google.com
Signed-off-by: Marc Zyngier
---
 arch/arm64/kvm/hyp/nvhe/ffa.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c
index 320f2eaa14a9..02746f9d0980 100644
--- a/arch/arm64/kvm/hyp/nvhe/ffa.c
+++ b/arch/arm64/kvm/hyp/nvhe/ffa.c
@@ -600,7 +600,6 @@ static bool ffa_call_supported(u64 func_id)
 	case FFA_MSG_POLL:
 	case FFA_MSG_WAIT:
 	/* 32-bit variants of 64-bit calls */
-	case FFA_MSG_SEND_DIRECT_REQ:
 	case FFA_MSG_SEND_DIRECT_RESP:
 	case FFA_RXTX_MAP:
 	case FFA_MEM_DONATE:

From e8533e58cae02923e8cbffca516d3d821cee1649 Mon Sep 17 00:00:00 2001
From: Russell King
Date: Wed, 1 May 2024 15:29:05 +0100
Subject: [PATCH 3/8] KVM: arm64: Remove duplicated AA64MMFR1_EL1 XNX

Commit d5a32b60dc18 ("KVM: arm64: Allow userspace to change
ID_AA64MMFR{0-2}_EL1") made certain fields in these registers
writable, but in doing so, ID_AA64MMFR1_EL1_XNX was listed twice.
Remove the duplication.
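
Both this patch and patch 1 touch the sorted sys_reg_descs[] table. For
readers unfamiliar with the encoding scheme, here is a standalone sketch of
the lexicographic (Op0,Op1,CRn,CRm,Op2) ordering rule that patch 1's
check_sysreg_table() enforces, together with its improved diagnostic. This
is an illustration only: demo_desc and demo_cmp() are simplified stand-ins
for the kernel's struct sys_reg_desc and cmp_sys_reg(), not the actual
implementation.

/*
 * Standalone sketch (userspace C, not kernel code) of the sysreg
 * table ordering invariant: entries are compared field by field,
 * most-significant encoding field first.
 */
#include <stdio.h>

struct demo_desc {
	const char *name;
	unsigned int Op0, Op1, CRn, CRm, Op2;
};

static int demo_cmp(const struct demo_desc *a, const struct demo_desc *b)
{
	if (a->Op0 != b->Op0)
		return (int)a->Op0 - (int)b->Op0;
	if (a->Op1 != b->Op1)
		return (int)a->Op1 - (int)b->Op1;
	if (a->CRn != b->CRn)
		return (int)a->CRn - (int)b->CRn;
	if (a->CRm != b->CRm)
		return (int)a->CRm - (int)b->CRm;
	return (int)a->Op2 - (int)b->Op2;
}

int main(void)
{
	/* Real encodings: SCTLR_EL1 < CPACR_EL1 < TTBR0_EL1 */
	struct demo_desc table[] = {
		{ "SCTLR_EL1", 3, 0, 1, 0, 0 },
		{ "CPACR_EL1", 3, 0, 1, 0, 2 },
		{ "TTBR0_EL1", 3, 0, 2, 0, 0 },
	};

	for (int i = 1; i < 3; i++) {
		/* Same check and message format as the patched kvm_err() */
		if (demo_cmp(&table[i - 1], &table[i]) >= 0)
			printf("entry %d (%s -> %s) out of order\n",
			       i, table[i - 1].name, table[i].name);
	}

	return 0;
}

Run as-is, the table is correctly ordered and nothing is printed; swapping
the first two entries prints "entry 1 (CPACR_EL1 -> SCTLR_EL1) out of
order", mirroring the new kvm_err() format.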
Signed-off-by: Russell King (Oracle)
Reviewed-by: Zenghui Yu
Link: https://lore.kernel.org/r/E1s2AxF-00AWLv-03@rmk-PC.armlinux.org.uk
Signed-off-by: Marc Zyngier
---
 arch/arm64/kvm/sys_regs.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 33efa441e21d..3c7007553adf 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -2338,7 +2338,6 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 					ID_AA64MMFR0_EL1_TGRAN16_2)),
 	ID_WRITABLE(ID_AA64MMFR1_EL1, ~(ID_AA64MMFR1_EL1_RES0 |
 					ID_AA64MMFR1_EL1_HCX |
-					ID_AA64MMFR1_EL1_XNX |
 					ID_AA64MMFR1_EL1_TWED |
 					ID_AA64MMFR1_EL1_XNX |
 					ID_AA64MMFR1_EL1_VH |

From 03b3d00a70b55857439511c1b558ca00a99f4126 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Thu, 2 May 2024 16:45:45 +0100
Subject: [PATCH 4/8] KVM: arm64: vgic: Allocate private interrupts on demand

Private interrupts are currently part of the CPU interface structure
embedded in each and every vcpu we create. With 32 of them per vcpu,
this results in a per-vcpu array that is just shy of 4kB. On its own,
that's no big deal, but it gets in the way of other things:

- each vcpu gets mapped at EL2 on nVHE/hVHE configurations. This
  requires memory that is physically contiguous. However, the EL2
  code has no reason to look at the interrupt structures and could
  do without them being mapped.

- supporting features such as EPPIs, which extend the number of
  private interrupts past the limit of 32, would make the array even
  larger, even for VMs that do not use the EPPI feature.

Address these issues by moving the private interrupt array outside of
the vcpu and replacing it with a simple pointer. We take this
opportunity to make it obvious what gets initialised when, as that
path was remarkably opaque, and to tighten the locking.
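
The allocation path this introduces is the usual idempotent, lock-guarded
lazy-init pattern. A reduced sketch, with hypothetical demo_* names standing
in for the vgic types and kvm->arch.config_lock (the real code follows in
the diff below):

/*
 * Sketch of on-demand allocation behind a shared lock. Illustrative
 * names; the real code operates on vgic_cpu->private_irqs under
 * kvm->arch.config_lock.
 */
#include <linux/mutex.h>
#include <linux/slab.h>

struct demo_irq { int intid; /* ... */ };

struct demo_vcpu {
	struct mutex *config_lock;	/* shared, per-VM lock */
	struct demo_irq *private_irqs;	/* NULL until first use */
};

/* Caller already holds the lock; safe to call more than once. */
static int demo_alloc_private_irqs_locked(struct demo_vcpu *vcpu)
{
	lockdep_assert_held(vcpu->config_lock);

	if (vcpu->private_irqs)		/* already allocated: idempotent */
		return 0;

	vcpu->private_irqs = kcalloc(32, sizeof(*vcpu->private_irqs),
				     GFP_KERNEL_ACCOUNT);
	return vcpu->private_irqs ? 0 : -ENOMEM;
}

/* Unlocked wrapper, mirroring vgic_allocate_private_irqs() below. */
static int demo_alloc_private_irqs(struct demo_vcpu *vcpu)
{
	int ret;

	mutex_lock(vcpu->config_lock);
	ret = demo_alloc_private_irqs_locked(vcpu);
	mutex_unlock(vcpu->config_lock);

	return ret;
}

Splitting the locked helper from the wrapper lets callers that already hold
the config lock (such as vgic_init()) reuse the same idempotent path.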
Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20240502154545.3012089-1-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-init.c | 82 +++++++++++++++++++++++++-------- include/kvm/arm_vgic.h | 2 +- 2 files changed, 64 insertions(+), 20 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index f20941f83a07..d3787fb09251 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -182,27 +182,22 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) return 0; } -/** - * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data - * structures and register VCPU-specific KVM iodevs - * - * @vcpu: pointer to the VCPU being created and initialized - * - * Only do initialization, but do not actually enable the - * VGIC CPU interface - */ -int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) +static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - int ret = 0; int i; - vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; + lockdep_assert_held(&vcpu->kvm->arch.config_lock); - INIT_LIST_HEAD(&vgic_cpu->ap_list_head); - raw_spin_lock_init(&vgic_cpu->ap_list_lock); - atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0); + if (vgic_cpu->private_irqs) + return 0; + + vgic_cpu->private_irqs = kcalloc(VGIC_NR_PRIVATE_IRQS, + sizeof(struct vgic_irq), + GFP_KERNEL_ACCOUNT); + + if (!vgic_cpu->private_irqs) + return -ENOMEM; /* * Enable and configure all SGIs to be edge-triggered and @@ -227,9 +222,48 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) } } + return 0; +} + +static int vgic_allocate_private_irqs(struct kvm_vcpu *vcpu) +{ + int ret; + + mutex_lock(&vcpu->kvm->arch.config_lock); + ret = vgic_allocate_private_irqs_locked(vcpu); + mutex_unlock(&vcpu->kvm->arch.config_lock); + + return ret; +} + +/** + * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data + * structures and register VCPU-specific KVM iodevs + * + * @vcpu: pointer to the VCPU being created and initialized + * + * Only do initialization, but do not actually enable the + * VGIC CPU interface + */ +int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + int ret = 0; + + vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; + + INIT_LIST_HEAD(&vgic_cpu->ap_list_head); + raw_spin_lock_init(&vgic_cpu->ap_list_lock); + atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0); + if (!irqchip_in_kernel(vcpu->kvm)) return 0; + ret = vgic_allocate_private_irqs(vcpu); + if (ret) + return ret; + /* * If we are creating a VCPU with a GICv3 we must also register the * KVM io device for the redistributor that belongs to this VCPU. 
@@ -285,10 +319,13 @@ int vgic_init(struct kvm *kvm) /* Initialize groups on CPUs created before the VGIC type was known */ kvm_for_each_vcpu(idx, vcpu, kvm) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + ret = vgic_allocate_private_irqs_locked(vcpu); + if (ret) + goto out; for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { - struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; + struct vgic_irq *irq = vgic_get_irq(kvm, vcpu, i); + switch (dist->vgic_model) { case KVM_DEV_TYPE_ARM_VGIC_V3: irq->group = 1; @@ -300,8 +337,12 @@ int vgic_init(struct kvm *kvm) break; default: ret = -EINVAL; - goto out; } + + vgic_put_irq(kvm, irq); + + if (ret) + goto out; } } @@ -381,6 +422,9 @@ static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) vgic_flush_pending_lpis(vcpu); INIT_LIST_HEAD(&vgic_cpu->ap_list_head); + kfree(vgic_cpu->private_irqs); + vgic_cpu->private_irqs = NULL; + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { vgic_unregister_redist_iodev(vcpu); vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 47035946648e..a7397f37f4dd 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -330,7 +330,7 @@ struct vgic_cpu { struct vgic_v3_cpu_if vgic_v3; }; - struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS]; + struct vgic_irq *private_irqs; raw_spinlock_t ap_list_lock; /* Protects the ap_list */ From 838d992b84486311e6039170d28b79a7a0633f06 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 2 May 2024 16:42:47 +0100 Subject: [PATCH 5/8] KVM: arm64: Convert kvm_mpidr_index() to bitmap_gather() Linux 6.9 has introduced new bitmap manipulation helpers, with bitmap_gather() being of special interest, as it does exactly what kvm_mpidr_index() is already doing. Make the latter a wrapper around the former. Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20240502154247.3012042-1-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 9e8a496fb284..403d7479c0bc 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -220,20 +220,10 @@ struct kvm_mpidr_data { static inline u16 kvm_mpidr_index(struct kvm_mpidr_data *data, u64 mpidr) { - unsigned long mask = data->mpidr_mask; - u64 aff = mpidr & MPIDR_HWID_BITMASK; - int nbits, bit, bit_idx = 0; - u16 index = 0; + unsigned long index = 0, mask = data->mpidr_mask; + unsigned long aff = mpidr & MPIDR_HWID_BITMASK; - /* - * If this looks like RISC-V's BEXT or x86's PEXT - * instructions, it isn't by accident. - */ - nbits = fls(mask); - for_each_set_bit(bit, &mask, nbits) { - index |= (aff & BIT(bit)) >> (bit - bit_idx); - bit_idx++; - } + bitmap_gather(&index, &aff, &mask, fls(mask)); return index; } From 3c142f9d02b992aec5d96b82917e4cc07850c4df Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 1 May 2024 17:33:59 +0100 Subject: [PATCH 6/8] KVM: arm64: Fix hvhe/nvhe early alias parsing Booting a kernel with "arm64_sw.hvhe=1 kvm-arm.mode=nvhe" on the command-line results in KVM initialising using hVHE, whereas one might expect the latter option to override the former. Fix this by adding "arm64_sw.hvhe=0" to the alias expansion for "kvm-arm.mode=nvhe". 
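
To see why the added "arm64_sw.hvhe=0" fixes the ordering problem, consider
a toy model of the expansion: aliases are replaced by their expansion
string, options are scanned left to right, and a later assignment to a key
overrides an earlier one (the behaviour a user would expect, per the report
above). This is a standalone sketch under those assumptions, not the real
parser in arch/arm64/kernel/pi/idreg-override.c:

/*
 * Toy model of alias expansion plus last-write-wins option parsing.
 * Standalone and hypothetical; only the alias string itself is copied
 * from the patched table.
 */
#include <stdio.h>
#include <string.h>

static const char *expand(const char *opt)
{
	/* Mirrors the patched alias entry in the diff below. */
	if (!strcmp(opt, "kvm-arm.mode=nvhe"))
		return "arm64_sw.hvhe=0 id_aa64mmfr1.vh=0";
	return opt;
}

int main(void)
{
	const char *cmdline[] = { "arm64_sw.hvhe=1", "kvm-arm.mode=nvhe" };
	char hvhe[8] = "unset";
	char buf[128];

	for (int i = 0; i < 2; i++) {
		snprintf(buf, sizeof(buf), "%s", expand(cmdline[i]));
		/* Scan the expanded words left to right; last write wins. */
		for (char *w = strtok(buf, " "); w; w = strtok(NULL, " ")) {
			if (!strncmp(w, "arm64_sw.hvhe=", 14))
				snprintf(hvhe, sizeof(hvhe), "%s", w + 14);
		}
	}

	printf("hvhe=%s\n", hvhe); /* "hvhe=0": the later nvhe option wins */
	return 0;
}

Without the expanded "arm64_sw.hvhe=0", nothing in the nvhe alias would
overwrite the earlier hvhe=1, which is exactly the misbehaviour described
above.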
Signed-off-by: Will Deacon Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20240501163400.15838-2-will@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kernel/pi/idreg-override.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index aad399796e81..39c9253fcf23 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -209,7 +209,7 @@ static const struct { char alias[FTR_ALIAS_NAME_LEN]; char feature[FTR_ALIAS_OPTION_LEN]; } aliases[] __initconst = { - { "kvm_arm.mode=nvhe", "id_aa64mmfr1.vh=0" }, + { "kvm_arm.mode=nvhe", "arm64_sw.hvhe=0 id_aa64mmfr1.vh=0" }, { "kvm_arm.mode=protected", "id_aa64mmfr1.vh=0" }, { "arm64.nosve", "id_aa64pfr0.sve=0" }, { "arm64.nosme", "id_aa64pfr1.sme=0" }, From 5053c3f0519cd4c746577e3a6a7756f7c04b03dd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 1 May 2024 17:34:00 +0100 Subject: [PATCH 7/8] KVM: arm64: Use hVHE in pKVM by default on CPUs with VHE support The early command line parsing treats "kvm-arm.mode=protected" as an alias for "id_aa64mmfr1.vh=0", forcing the use of nVHE so that the host kernel runs at EL1 with the pKVM hypervisor at EL2. With the introduction of hVHE support in ad744e8cb346 ("arm64: Allow arm64_sw.hvhe on command line"), the hypervisor can run using the EL2+0 translation regime. This is interesting for unusual CPUs that have VH stuck to 1, but also because it opens the possibility of a hypervisor "userspace" in the distant future which could be used to isolate vCPU contexts in the hypervisor (see Marc's talk from KVM Forum 2022 [1]). Repaint the "kvm-arm.mode=protected" alias to map to "arm64_sw.hvhe=1", which will use hVHE on CPUs that support it and remain with nVHE otherwise. [1] https://www.youtube.com/watch?v=1F_Mf2j9eIo Signed-off-by: Will Deacon Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20240501163400.15838-3-will@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kernel/pi/idreg-override.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index 39c9253fcf23..c20549e43a77 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -210,7 +210,7 @@ static const struct { char feature[FTR_ALIAS_OPTION_LEN]; } aliases[] __initconst = { { "kvm_arm.mode=nvhe", "arm64_sw.hvhe=0 id_aa64mmfr1.vh=0" }, - { "kvm_arm.mode=protected", "id_aa64mmfr1.vh=0" }, + { "kvm_arm.mode=protected", "arm64_sw.hvhe=1" }, { "arm64.nosve", "id_aa64pfr0.sve=0" }, { "arm64.nosme", "id_aa64pfr1.sme=0" }, { "arm64.nobti", "id_aa64pfr1.bt=0" }, From ce5d2448eb8fe83aed331db53a08612286a137dd Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 8 May 2024 07:19:52 +0000 Subject: [PATCH 8/8] KVM: arm64: Destroy mpidr_data for 'late' vCPU creation A particularly annoying userspace could create a vCPU after KVM has computed mpidr_data for the VM, either by racing against VGIC initialization or having a userspace irqchip. In any case, this means mpidr_data no longer fully describes the VM, and attempts to find the new vCPU with kvm_mpidr_to_vcpu() will fail. The fix is to discard mpidr_data altogether, as it is only a performance optimization and not required for correctness. In all likelihood KVM will recompute the mappings when KVM_RUN is called on the new vCPU. 
Note that reads of mpidr_data are not guarded by a lock; promote to RCU to cope with the possibility of mpidr_data being invalidated at runtime. Fixes: 54a8006d0b49 ("KVM: arm64: Fast-track kvm_mpidr_to_vcpu() when mpidr_data is available") Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20240508071952.2035422-1-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arm.c | 52 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 3dee5490eea9..3dbf43e97f42 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -195,6 +195,23 @@ void kvm_arch_create_vm_debugfs(struct kvm *kvm) kvm_sys_regs_create_debugfs(kvm); } +static void kvm_destroy_mpidr_data(struct kvm *kvm) +{ + struct kvm_mpidr_data *data; + + mutex_lock(&kvm->arch.config_lock); + + data = rcu_dereference_protected(kvm->arch.mpidr_data, + lockdep_is_held(&kvm->arch.config_lock)); + if (data) { + rcu_assign_pointer(kvm->arch.mpidr_data, NULL); + synchronize_rcu(); + kfree(data); + } + + mutex_unlock(&kvm->arch.config_lock); +} + /** * kvm_arch_destroy_vm - destroy the VM data structure * @kvm: pointer to the KVM struct @@ -209,7 +226,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) if (is_protected_kvm_enabled()) pkvm_destroy_hyp_vm(kvm); - kfree(kvm->arch.mpidr_data); + kvm_destroy_mpidr_data(kvm); + kfree(kvm->arch.sysreg_masks); kvm_destroy_vcpus(kvm); @@ -395,6 +413,13 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; + /* + * This vCPU may have been created after mpidr_data was initialized. + * Throw out the pre-computed mappings if that is the case which forces + * KVM to fall back to iteratively searching the vCPUs. + */ + kvm_destroy_mpidr_data(vcpu->kvm); + err = kvm_vgic_vcpu_init(vcpu); if (err) return err; @@ -594,7 +619,8 @@ static void kvm_init_mpidr_data(struct kvm *kvm) mutex_lock(&kvm->arch.config_lock); - if (kvm->arch.mpidr_data || atomic_read(&kvm->online_vcpus) == 1) + if (rcu_access_pointer(kvm->arch.mpidr_data) || + atomic_read(&kvm->online_vcpus) == 1) goto out; kvm_for_each_vcpu(c, vcpu, kvm) { @@ -631,7 +657,7 @@ static void kvm_init_mpidr_data(struct kvm *kvm) data->cmpidr_to_idx[index] = c; } - kvm->arch.mpidr_data = data; + rcu_assign_pointer(kvm->arch.mpidr_data, data); out: mutex_unlock(&kvm->arch.config_lock); } @@ -2470,22 +2496,28 @@ out_err: struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) { - struct kvm_vcpu *vcpu; + struct kvm_vcpu *vcpu = NULL; + struct kvm_mpidr_data *data; unsigned long i; mpidr &= MPIDR_HWID_BITMASK; - if (kvm->arch.mpidr_data) { - u16 idx = kvm_mpidr_index(kvm->arch.mpidr_data, mpidr); + rcu_read_lock(); + data = rcu_dereference(kvm->arch.mpidr_data); - vcpu = kvm_get_vcpu(kvm, - kvm->arch.mpidr_data->cmpidr_to_idx[idx]); + if (data) { + u16 idx = kvm_mpidr_index(data, mpidr); + + vcpu = kvm_get_vcpu(kvm, data->cmpidr_to_idx[idx]); if (mpidr != kvm_vcpu_get_mpidr_aff(vcpu)) vcpu = NULL; - - return vcpu; } + rcu_read_unlock(); + + if (vcpu) + return vcpu; + kvm_for_each_vcpu(i, vcpu, kvm) { if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu)) return vcpu;
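
A closing note on the RCU conversion: the lifecycle applied to mpidr_data is
the standard publish/retire pattern. A reduced sketch with hypothetical
demo_* names; writers are assumed to be serialized externally, as
kvm->arch.config_lock does in the patch:

/*
 * Sketch of the RCU publish/retire lifecycle. Readers run locklessly;
 * the writer unpublishes, waits out all readers, then frees.
 */
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_data { int payload; };

static struct demo_data __rcu *demo_ptr;

/* Lockless reader: the pointer is only valid inside the RCU section. */
static int demo_read(void)
{
	struct demo_data *d;
	int val = -1;

	rcu_read_lock();
	d = rcu_dereference(demo_ptr);
	if (d)
		val = d->payload;	/* use d only inside this section */
	rcu_read_unlock();

	return val;
}

/* Writer, serialized by the caller: unpublish, wait, then free. */
static void demo_destroy(void)
{
	struct demo_data *d;

	d = rcu_dereference_protected(demo_ptr, 1);
	if (d) {
		rcu_assign_pointer(demo_ptr, NULL);
		synchronize_rcu();	/* no reader can still see d */
		kfree(d);
	}
}

The synchronize_rcu() in the destroy path is what makes the lockless read
side safe: once it returns, no CPU can still hold a pointer obtained under
rcu_read_lock(), so the kfree() cannot race with a reader.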