From 9bcbb6104a344d3526e185ee1e7b985509914e90 Mon Sep 17 00:00:00 2001 From: Lokesh Vutla Date: Tue, 21 Jan 2025 04:40:16 +0000 Subject: [PATCH 1/7] KVM: arm64: Flush hyp bss section after initialization of variables in bss To determine CPU features during initialization, the nVHE hypervisor utilizes sanitized values of the host's CPU features registers. These values, stored in u64 idaa64*_el1_sys_val variables are updated by the kvm_hyp_init_symbols() function at EL1. To ensure EL2 visibility with the MMU off, the data cache needs to be flushed after these updates. However, individually flushing each variable using kvm_flush_dcache_to_poc() is inefficient. These cpu feature variables would be part of the bss section of the hypervisor. Hence, flush the entire bss section of hypervisor once the initialization is complete. Fixes: 6c30bfb18d0b ("KVM: arm64: Add handlers for protected VM System Registers") Suggested-by: Fuad Tabba Signed-off-by: Lokesh Vutla Link: https://lore.kernel.org/r/20250121044016.2219256-1-lokeshvutla@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index bcc4f7e92634..0725a0b50a3e 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2400,6 +2400,13 @@ static void kvm_hyp_init_symbols(void) kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1); kvm_nvhe_sym(__icache_flags) = __icache_flags; kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; + + /* + * Flush entire BSS since part of its data containing init symbols is read + * while the MMU is off. + */ + kvm_flush_dcache_to_poc(kvm_ksym_ref(__hyp_bss_start), + kvm_ksym_ref(__hyp_bss_end) - kvm_ksym_ref(__hyp_bss_start)); } static int __init kvm_hyp_init_protection(u32 hyp_va_bits) From 0f1a6c5c9784eff7e31e4915e17285fb89ad3644 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 31 Jan 2025 22:29:24 +0000 Subject: [PATCH 2/7] KVM: arm64: Flush/sync debug state in protected mode The recent changes to debug state management broke self-hosted debug for guests when running in protected mode, since both the debug owner and the debug state itself aren't shared with the hyp's view of the vcpu. Fix it by flushing/syncing the relevant bits with the hyp vcpu. Fixes: beb470d96cec ("KVM: arm64: Use debug_owner to track if debug regs need save/restore") Reported-by: Mark Brown Closes: https://lore.kernel.org/kvmarm/5f62740f-a065-42d9-9f56-8fb648b9c63f@sirena.org.uk/ Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20250131222922.1548780-3-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 5c134520e180..6e12c070832f 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -91,11 +91,34 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; } +static void flush_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + hyp_vcpu->vcpu.arch.debug_owner = host_vcpu->arch.debug_owner; + + if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu)) + hyp_vcpu->vcpu.arch.vcpu_debug_state = host_vcpu->arch.vcpu_debug_state; + else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu)) + hyp_vcpu->vcpu.arch.external_debug_state = host_vcpu->arch.external_debug_state; +} + +static void sync_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu)) + host_vcpu->arch.vcpu_debug_state = hyp_vcpu->vcpu.arch.vcpu_debug_state; + else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu)) + host_vcpu->arch.external_debug_state = hyp_vcpu->vcpu.arch.external_debug_state; +} + static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; fpsimd_sve_flush(); + flush_debug_state(hyp_vcpu); hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; @@ -123,6 +146,7 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) unsigned int i; fpsimd_sve_sync(&hyp_vcpu->vcpu); + sync_debug_state(hyp_vcpu); host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; From 32392e04cb50d87bb7a6a7d9213f44a1a0961820 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 3 Feb 2025 15:15:43 -0800 Subject: [PATCH 3/7] KVM: arm64: Fail protected mode init if no vgic hardware is present Protected mode assumes that at minimum vgic-v3 is present, however KVM fails to actually enforce this at the time of initialization. As such, when running protected mode in a half-baked state on GICv2 hardware we see the hyp go belly up at vcpu_load() when it tries to restore the vgic-v3 cpuif: $ ./arch_timer_edge_cases [ 130.599140] kvm [4518]: nVHE hyp panic at: [] __kvm_nvhe___vgic_v3_restore_vmcr_aprs+0x8/0x84! [ 130.603685] kvm [4518]: Cannot dump pKVM nVHE stacktrace: !CONFIG_PROTECTED_NVHE_STACKTRACE [ 130.611962] kvm [4518]: Hyp Offset: 0xfffeca95ed000000 [ 130.617053] Kernel panic - not syncing: HYP panic: [ 130.617053] PS:800003c9 PC:0000b56a94102b58 ESR:0000000002000000 [ 130.617053] FAR:ffff00007b98d4d0 HPFAR:00000000007b98d0 PAR:0000000000000000 [ 130.617053] VCPU:0000000000000000 [ 130.638013] CPU: 0 UID: 0 PID: 4518 Comm: arch_timer_edge Tainted: G C 6.13.0-rc3-00009-gf7d03fcbf1f4 #1 [ 130.648790] Tainted: [C]=CRAP [ 130.651721] Hardware name: Libre Computer AML-S905X-CC (DT) [ 130.657242] Call trace: [ 130.659656] show_stack+0x18/0x24 (C) [ 130.663279] dump_stack_lvl+0x38/0x90 [ 130.666900] dump_stack+0x18/0x24 [ 130.670178] panic+0x388/0x3e8 [ 130.673196] nvhe_hyp_panic_handler+0x104/0x208 [ 130.677681] kvm_arch_vcpu_load+0x290/0x548 [ 130.681821] vcpu_load+0x50/0x80 [ 130.685013] kvm_arch_vcpu_ioctl_run+0x30/0x868 [ 130.689498] kvm_vcpu_ioctl+0x2e0/0x974 [ 130.693293] __arm64_sys_ioctl+0xb4/0xec [ 130.697174] invoke_syscall+0x48/0x110 [ 130.700883] el0_svc_common.constprop.0+0x40/0xe0 [ 130.705540] do_el0_svc+0x1c/0x28 [ 130.708818] el0_svc+0x30/0xd0 [ 130.711837] el0t_64_sync_handler+0x10c/0x138 [ 130.716149] el0t_64_sync+0x198/0x19c [ 130.719774] SMP: stopping secondary CPUs [ 130.723660] Kernel Offset: disabled [ 130.727103] CPU features: 0x000,00000800,02800000,0200421b [ 130.732537] Memory Limit: none [ 130.735561] ---[ end Kernel panic - not syncing: HYP panic: [ 130.735561] PS:800003c9 PC:0000b56a94102b58 ESR:0000000002000000 [ 130.735561] FAR:ffff00007b98d4d0 HPFAR:00000000007b98d0 PAR:0000000000000000 [ 130.735561] VCPU:0000000000000000 ]--- Fix it by failing KVM initialization if the system doesn't implement vgic-v3, as protected mode will never do anything useful on such hardware. Reported-by: Mark Brown Closes: https://lore.kernel.org/kvmarm/5ca7588c-7bf2-4352-8661-e4a56a9cd9aa@sirena.org.uk/ Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20250203231543.233511-1-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arm.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 0725a0b50a3e..62c650c2f7b6 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2290,6 +2290,19 @@ static int __init init_subsystems(void) break; case -ENODEV: case -ENXIO: + /* + * No VGIC? No pKVM for you. + * + * Protected mode assumes that VGICv3 is present, so no point + * in trying to hobble along if vgic initialization fails. + */ + if (is_protected_kvm_enabled()) + goto out; + + /* + * Otherwise, userspace could choose to implement a GIC for its + * guest on non-cooperative hardware. + */ vgic_present = false; err = 0; break; From 5417a2e9b130a78bf48cb4cf92630efcee5ccf38 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 4 Feb 2025 14:55:54 +0000 Subject: [PATCH 4/7] KVM: arm64: Fix nested S2 MMU structures reallocation For each vcpu that userspace creates, we allocate a number of s2_mmu structures that will eventually contain our shadow S2 page tables. Since this is a dynamically allocated array, we reallocate the array and initialise the newly allocated elements. Once everything is correctly initialised, we adjust pointer and size in the kvm structure, and move on. But should that initialisation fail *and* the reallocation triggered a copy to another location, we end-up returning early, with the kvm structure still containing the (now stale) old pointer. Weeee! Cure it by assigning the pointer early, and use this to perform the initialisation. If everything succeeds, we adjust the size. Otherwise, we just leave the size as it was, no harm done, and the new memory is as good as the ol' one (we hope...). Fixes: 4f128f8e1aaac ("KVM: arm64: nv: Support multiple nested Stage-2 mmu structures") Reported-by: Alexander Potapenko Tested-by: Alexander Potapenko Link: https://lore.kernel.org/r/20250204145554.774427-1-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/nested.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 33d2ace68665..0c9387d2f507 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -67,26 +67,27 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) if (!tmp) return -ENOMEM; + swap(kvm->arch.nested_mmus, tmp); + /* * If we went through a realocation, adjust the MMU back-pointers in * the previously initialised kvm_pgtable structures. */ if (kvm->arch.nested_mmus != tmp) for (int i = 0; i < kvm->arch.nested_mmus_size; i++) - tmp[i].pgt->mmu = &tmp[i]; + kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i]; for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++) - ret = init_nested_s2_mmu(kvm, &tmp[i]); + ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]); if (ret) { for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++) - kvm_free_stage2_pgd(&tmp[i]); + kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]); return ret; } kvm->arch.nested_mmus_size = num_mmus; - kvm->arch.nested_mmus = tmp; return 0; } From b450dcce93bc2cf6d2bfaf5a0de88a94ebad8f89 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 4 Feb 2025 11:00:48 +0000 Subject: [PATCH 5/7] KVM: arm64: timer: Always evaluate the need for a soft timer When updating the interrupt state for an emulated timer, we return early and skip the setup of a soft timer that runs in parallel with the guest. While this is OK if we have set the interrupt pending, it is pretty wrong if the guest moved CVAL into the future. In that case, no timer is armed and the guest can wait for a very long time (it will take a full put/load cycle for the situation to resolve). This is specially visible with EDK2 running at EL2, but still using the EL1 virtual timer, which in that case is fully emulated. Any key-press takes ages to be captured, as there is no UART interrupt and EDK2 relies on polling from a timer... The fix is simply to drop the early return. If the timer interrupt is pending, we will still return early, and otherwise arm the soft timer. Fixes: 4d74ecfa6458b ("KVM: arm64: Don't arm a hrtimer for an already pending timer") Cc: stable@vger.kernel.org Tested-by: Dmytro Terletskyi Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250204110050.150560-2-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index d3d243366536..035e43f5d4f9 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -471,10 +471,8 @@ static void timer_emulate(struct arch_timer_context *ctx) trace_kvm_timer_emulate(ctx, should_fire); - if (should_fire != ctx->irq.level) { + if (should_fire != ctx->irq.level) kvm_timer_update_irq(ctx->vcpu, should_fire, ctx); - return; - } kvm_timer_update_status(ctx, should_fire); From 1b8705ad5365b5333240b46d5cd24e88ef2ddb14 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 4 Feb 2025 11:00:49 +0000 Subject: [PATCH 6/7] KVM: arm64: timer: Correctly handle EL1 timer emulation when !FEAT_ECV Both Wei-Lin Chang and Volodymyr Babchuk report that the way we handle the emulation of EL1 timers with NV is completely wrong, specially in the case of HCR_EL2.E2H==0. There are three problems in about as many lines of code: - With E2H==0, the EL1 timers are overwritten with the EL1 state, while they should actually contain the EL2 state (as per the timer map) - With E2H==1, we run the full EL1 timer emulation even when ECV is present, hiding a bug in timer_emulate() (see previous patch) - The comments are actively misleading, and say all the wrong things. This is only attributable to the code having been initially written for FEAT_NV, hacked up to handle FEAT_NV2 *in parallel*, and vaguely hacked again to be FEAT_NV2 only. Oh, and yours truly being a gold plated idiot. The fix is obvious: just delete most of the E2H==0 code, have a unified handling of the timers (because they really are E2H agnostic), and make sure we don't execute any of that when FEAT_ECV is present. Fixes: 4bad3068cfa9f ("KVM: arm64: nv: Sync nested timer state with FEAT_NV2") Reported-by: Wei-Lin Chang Reported-by: Volodymyr Babchuk Link: https://lore.kernel.org/r/fqiqfjzwpgbzdtouu2pwqlu7llhnf5lmy4hzv5vo6ph4v3vyls@jdcfy3fjjc5k Link: https://lore.kernel.org/r/87frl51tse.fsf@epam.com Tested-by: Dmytro Terletskyi Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250204110050.150560-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 035e43f5d4f9..e59836e0260c 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -974,31 +974,21 @@ void kvm_timer_sync_nested(struct kvm_vcpu *vcpu) * which allows trapping of the timer registers even with NV2. * Still, this is still worse than FEAT_NV on its own. Meh. */ - if (!vcpu_el2_e2h_is_set(vcpu)) { - if (cpus_have_final_cap(ARM64_HAS_ECV)) - return; - - /* - * A non-VHE guest hypervisor doesn't have any direct access - * to its timers: the EL2 registers trap (and the HW is - * fully emulated), while the EL0 registers access memory - * despite the access being notionally direct. Boo. - * - * We update the hardware timer registers with the - * latest value written by the guest to the VNCR page - * and let the hardware take care of the rest. - */ - write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTV_CTL_EL0), SYS_CNTV_CTL); - write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTV_CVAL_EL0), SYS_CNTV_CVAL); - write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTP_CTL_EL0), SYS_CNTP_CTL); - write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTP_CVAL_EL0), SYS_CNTP_CVAL); - } else { + if (!cpus_have_final_cap(ARM64_HAS_ECV)) { /* * For a VHE guest hypervisor, the EL2 state is directly - * stored in the host EL1 timers, while the emulated EL0 + * stored in the host EL1 timers, while the emulated EL1 * state is stored in the VNCR page. The latter could have * been updated behind our back, and we must reset the * emulation of the timers. + * + * A non-VHE guest hypervisor doesn't have any direct access + * to its timers: the EL2 registers trap despite being + * notionally direct (we use the EL1 HW, as for VHE), while + * the EL1 registers access memory. + * + * In both cases, process the emulated timers on each guest + * exit. Boo. */ struct timer_map map; get_timer_map(vcpu, &map); From 0e459810285503fb354537e84049e212c5917c33 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 4 Feb 2025 11:00:50 +0000 Subject: [PATCH 7/7] KVM: arm64: timer: Don't adjust the EL2 virtual timer offset The way we deal with the EL2 virtual timer is a bit odd. We try to cope with E2H being flipped, and adjust which offset applies to that timer depending on the current E2H value. But that's a complexity we shouldn't have to worry about. What we have to deal with is either E2H being RES1, in which case there is no offset, or E2H being RES0, and the virtual timer simply does not exist. Drop the adjusting of the timer offset, which makes things a bit simpler. At the same time, make sure that accessing the HV timer when E2H is RES0 results in an UNDEF in the guest. Suggested-by: Oliver Upton Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250204110050.150560-4-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 15 --------------- arch/arm64/kvm/sys_regs.c | 16 +++++++++++++--- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index e59836e0260c..231c0cd9c7b4 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -759,21 +759,6 @@ static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu, timer_irq(map->direct_ptimer), &arch_timer_irq_ops); WARN_ON_ONCE(ret); - - /* - * The virtual offset behaviour is "interesting", as it - * always applies when HCR_EL2.E2H==0, but only when - * accessed from EL1 when HCR_EL2.E2H==1. So make sure we - * track E2H when putting the HV timer in "direct" mode. - */ - if (map->direct_vtimer == vcpu_hvtimer(vcpu)) { - struct arch_timer_offset *offs = &map->direct_vtimer->offset; - - if (vcpu_el2_e2h_is_set(vcpu)) - offs->vcpu_offset = NULL; - else - offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2); - } } } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 526d66f24e34..7968bee0d27e 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1452,6 +1452,16 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } +static bool access_hv_timer(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!vcpu_el2_e2h_is_set(vcpu)) + return undef_access(vcpu, p, r); + + return access_arch_timer(vcpu, p, r); +} + static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, s64 new, s64 cur) { @@ -3099,9 +3109,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(CNTHP_CTL_EL2, access_arch_timer, reset_val, 0), EL2_REG(CNTHP_CVAL_EL2, access_arch_timer, reset_val, 0), - { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_arch_timer }, - EL2_REG(CNTHV_CTL_EL2, access_arch_timer, reset_val, 0), - EL2_REG(CNTHV_CVAL_EL2, access_arch_timer, reset_val, 0), + { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_hv_timer }, + EL2_REG(CNTHV_CTL_EL2, access_hv_timer, reset_val, 0), + EL2_REG(CNTHV_CVAL_EL2, access_hv_timer, reset_val, 0), { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 },