Mirror of https://github.com/Dasharo/linux.git, synced 2026-03-06 15:25:10 -08:00
Merge tag 'kvm-x86-fixes-6.14-rcN.2' of https://github.com/kvm-x86/linux into HEAD
KVM x86 fixes for 6.14-rcN #2

 - Set RFLAGS.IF in C code on SVM to get VMRUN out of the STI shadow.

 - Ensure DEBUGCTL is context switched on AMD to avoid running the guest with
   the host's value, which can lead to unexpected bus lock #DBs.

 - Suppress DEBUGCTL.BTF on AMD (to match Intel), as KVM doesn't properly
   emulate BTF.  KVM's lack of context switching has meant BTF has always been
   broken to some extent.

 - Always save DR masks for SNP vCPUs if DebugSwap is *supported*, as the
   guest can enable DebugSwap without KVM's knowledge.

 - Fix a bug in mmu_stress_tests where a vCPU could finish the "writes to RO
   memory" phase without actually generating a write-protection fault.

 - Fix a printf() goof in the SEV smoke test that causes build failures with
   -Werror.

 - Explicitly zero EAX and EBX in CPUID.0x8000_0022 output when PERFMON_V2
   isn't supported by KVM.
@@ -780,6 +780,7 @@ struct kvm_vcpu_arch {
 	u32 pkru;
 	u32 hflags;
 	u64 efer;
+	u64 host_debugctl;
 	u64 apic_base;
 	struct kvm_lapic *apic;	/* kernel irqchip context */
 	bool load_eoi_exitmap_pending;
@@ -1763,7 +1763,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 
 		entry->ecx = entry->edx = 0;
 		if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) {
-			entry->eax = entry->ebx;
+			entry->eax = entry->ebx = 0;
 			break;
 		}
 
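Why this matters: a guest that probes CPUID.0x8000_0022 sizes its PMU from EAX/EBX, so leftover junk in those registers can make it believe PerfMonV2 counters exist. For reference, the same leaf can be probed from userspace with GCC's <cpuid.h>; run on bare metal this shows what the hardware reports, not what KVM advertises to a guest:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 0x80000022: AMD extended performance monitoring and debug. */
	if (!__get_cpuid_count(0x80000022, 0, &eax, &ebx, &ecx, &edx)) {
		puts("leaf 0x80000022 not reported");
		return 0;
	}

	/* A guest without PERFMON_V2 support should see all of these as 0. */
	printf("eax=%#x ebx=%#x ecx=%#x edx=%#x\n", eax, ebx, ecx, edx);
	return 0;
}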
@@ -4590,6 +4590,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)
 
 void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
 {
+	struct kvm *kvm = svm->vcpu.kvm;
+
 	/*
 	 * All host state for SEV-ES guests is categorized into three swap types
 	 * based on how it is handled by hardware during a world switch:
@@ -4613,14 +4615,22 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are
 
 	/*
 	 * If DebugSwap is enabled, debug registers are loaded but NOT saved by
-	 * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both
-	 * saves and loads debug registers (Type-A).
+	 * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does
+	 * not save or load debug registers.  Sadly, KVM can't prevent SNP
+	 * guests from lying about DebugSwap on secondary vCPUs, i.e. the
+	 * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what
+	 * the guest has actually enabled (or not!) in the VMSA.
+	 *
+	 * If DebugSwap is *possible*, save the masks so that they're restored
+	 * if the guest enables DebugSwap.  But for the DRs themselves, do NOT
+	 * rely on the CPU to restore the host values; KVM will restore them as
+	 * needed in common code, via hw_breakpoint_restore().  Note, KVM does
+	 * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs
+	 * don't need to be restored per se, KVM just needs to ensure they are
+	 * loaded with the correct values *if* the CPU writes the MSRs.
 	 */
-	if (sev_vcpu_has_debug_swap(svm)) {
-		hostsa->dr0 = native_get_debugreg(0);
-		hostsa->dr1 = native_get_debugreg(1);
-		hostsa->dr2 = native_get_debugreg(2);
-		hostsa->dr3 = native_get_debugreg(3);
+	if (sev_vcpu_has_debug_swap(svm) ||
+	    (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) {
 		hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
 		hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
 		hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
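A small standalone sketch of the decision the new condition encodes (helper and parameter names are illustrative, not KVM's): the DR address masks must be stashed whenever the guest *could* have DebugSwap enabled, because an SNP guest can flip it on in its VMSA without KVM's knowledge.

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative only: mirrors the condition in sev_es_prepare_switch_to_guest().
 * "vcpu_has_debug_swap" = DebugSwap enabled in the vCPU's SEV_FEATURES as seen
 * by KVM; "snp_guest" and "cpu_has_debug_swap" describe the VM and host CPU.
 */
static bool must_save_dr_masks(bool vcpu_has_debug_swap, bool snp_guest,
			       bool cpu_has_debug_swap)
{
	return vcpu_has_debug_swap || (snp_guest && cpu_has_debug_swap);
}

int main(void)
{
	/* SNP guest that *claims* DebugSwap is off, on a DebugSwap-capable host:
	 * the masks must still be saved, since the guest may enable it anyway. */
	printf("save masks: %d\n", must_save_dr_masks(false, true, true));  /* 1 */

	/* Non-SNP SEV-ES guest without DebugSwap: nothing to save. */
	printf("save masks: %d\n", must_save_dr_masks(false, false, true)); /* 0 */
	return 0;
}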
@@ -3165,6 +3165,27 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 			kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
 			break;
 		}
+
+		/*
+		 * AMD changed the architectural behavior of bits 5:2.  On CPUs
+		 * without BusLockTrap, bits 5:2 control "external pins", but
+		 * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap
+		 * and bits 5:3 are reserved-to-zero.  Sadly, old KVM allowed
+		 * the guest to set bits 5:2 despite not actually virtualizing
+		 * Performance-Monitoring/Breakpoint external pins.  Drop bits
+		 * 5:2 for backwards compatibility.
+		 */
+		data &= ~GENMASK(5, 2);
+
+		/*
+		 * Suppress BTF as KVM doesn't virtualize BTF, but there's no
+		 * way to communicate lack of support to the guest.
+		 */
+		if (data & DEBUGCTLMSR_BTF) {
+			kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
+			data &= ~DEBUGCTLMSR_BTF;
+		}
+
 		if (data & DEBUGCTL_RESERVED_BITS)
 			return 1;
 
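The write-side filtering above can be condensed into a plain userspace sketch (the standalone defines and the helper name are illustrative, not kernel API): drop the legacy bits 5:2, suppress BTF with a one-time complaint, then reject anything outside LBR.

#include <stdint.h>
#include <stdio.h>

/* Bit layout assumed from the comments above, not taken from kernel headers. */
#define DEBUGCTL_LBR		(1ULL << 0)
#define DEBUGCTL_BTF		(1ULL << 1)
#define DEBUGCTL_BITS_5_2	(0xFULL << 2)		/* GENMASK(5, 2) */
#define DEBUGCTL_RESERVED	(~DEBUGCTL_LBR)		/* only LBR is supported */

/* Returns the value that would be accepted, or -1 if reserved bits remain set. */
static int64_t sanitize_guest_debugctl(uint64_t data)
{
	data &= ~DEBUGCTL_BITS_5_2;	/* drop legacy "external pins" bits */

	if (data & DEBUGCTL_BTF) {
		fprintf(stderr, "unsupported BTF write, suppressing\n");
		data &= ~DEBUGCTL_BTF;	/* BTF isn't virtualized */
	}

	if (data & DEBUGCTL_RESERVED)
		return -1;		/* reject everything else */

	return (int64_t)data;
}

int main(void)
{
	printf("%lld\n", (long long)sanitize_guest_debugctl(0x3));	  /* LBR|BTF -> 1 */
	printf("%lld\n", (long long)sanitize_guest_debugctl(0x3c));	  /* bits 5:2 -> 0 */
	printf("%lld\n", (long long)sanitize_guest_debugctl(1ULL << 63)); /* reserved -> -1 */
	return 0;
}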
@@ -4189,6 +4210,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
 
 	guest_state_enter_irqoff();
 
+	/*
+	 * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
+	 * VMRUN controls whether or not physical IRQs are masked (KVM always
+	 * runs with V_INTR_MASKING_MASK).  Toggle RFLAGS.IF here to avoid the
+	 * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
+	 * into guest state if delivery of an event during VMRUN triggers a
+	 * #VMEXIT, and the guest_state transitions already tell lockdep that
+	 * IRQs are being enabled/disabled.  Note!  GIF=0 for the entirety of
+	 * this path, so IRQs aren't actually unmasked while running host code.
+	 */
+	raw_local_irq_enable();
+
 	amd_clear_divider();
 
 	if (sev_es_guest(vcpu->kvm))
@@ -4197,6 +4230,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
 	else
 		__svm_vcpu_run(svm, spec_ctrl_intercepted);
 
+	raw_local_irq_disable();
+
 	guest_state_exit_irqoff();
 }
 
@@ -4253,6 +4288,16 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
 	clgi();
 	kvm_load_guest_xsave_state(vcpu);
 
+	/*
+	 * Hardware only context switches DEBUGCTL if LBR virtualization is
+	 * enabled.  Manually load DEBUGCTL if necessary (and restore it after
+	 * VM-Exit), as running with the host's DEBUGCTL can negatively affect
+	 * guest state and can even be fatal, e.g. due to Bus Lock Detect.
+	 */
+	if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
+	    vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
+		update_debugctlmsr(svm->vmcb->save.dbgctl);
+
 	kvm_wait_lapic_expire(vcpu);
 
 	/*
@@ -4280,6 +4325,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
 	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
 		kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
 
+	if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
+	    vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
+		update_debugctlmsr(vcpu->arch.host_debugctl);
+
 	kvm_load_host_xsave_state(vcpu);
 	stgi();
 
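Together with the x86.c and vmx.c hunks further down, the flow is: common code snapshots the host's DEBUGCTL immediately before entry, and the vendor module reloads it after VM-Exit whenever hardware won't (on SVM, whenever LBR virtualization is disabled). A self-contained userspace model of that sequence, with the MSR replaced by a plain global and all names illustrative rather than KVM's:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the real MSR; read/written only via the two helpers below. */
static uint64_t debugctl_msr;

static uint64_t get_debugctlmsr(void)		{ return debugctl_msr; }
static void update_debugctlmsr(uint64_t val)	{ debugctl_msr = val; }

struct vcpu {
	uint64_t host_debugctl;		/* snapshot taken by common code */
	uint64_t guest_debugctl;	/* value the guest programmed */
	int lbr_virt_enabled;		/* hardware swaps DEBUGCTL only then */
};

static void run_guest(struct vcpu *vcpu)
{
	vcpu->host_debugctl = get_debugctlmsr();	/* common x86 snapshot */

	if (!vcpu->lbr_virt_enabled &&
	    vcpu->host_debugctl != vcpu->guest_debugctl)
		update_debugctlmsr(vcpu->guest_debugctl); /* load guest value */

	/* ... VMRUN / VM-Exit would happen here ... */

	if (!vcpu->lbr_virt_enabled &&
	    vcpu->host_debugctl != vcpu->guest_debugctl)
		update_debugctlmsr(vcpu->host_debugctl);  /* restore host value */
}

int main(void)
{
	struct vcpu vcpu = { .guest_debugctl = 0x1, .lbr_virt_enabled = 0 };

	update_debugctlmsr(0x40);	/* pretend the host enabled Bus Lock Detect */
	run_guest(&vcpu);
	printf("host DEBUGCTL after exit: 0x%llx\n",
	       (unsigned long long)get_debugctlmsr());	/* 0x40 again */
	return 0;
}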
@@ -584,7 +584,7 @@ static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
 
 /* svm.c */
 #define MSR_INVALID				0xffffffffU
 
-#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR)
 
 extern bool dump_invalid_vmcb;
@@ -170,12 +170,8 @@ SYM_FUNC_START(__svm_vcpu_run)
 	mov VCPU_RDI(%_ASM_DI), %_ASM_DI
 
 	/* Enter guest mode */
-	sti
-
 3:	vmrun %_ASM_AX
 4:
-	cli
-
 	/* Pop @svm to RAX while it's the only available register. */
 	pop %_ASM_AX
 
@@ -340,12 +336,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
 	mov KVM_VMCB_pa(%rax), %rax
 
 	/* Enter guest mode */
-	sti
-
 1:	vmrun %rax
-
-2:	cli
-
+2:
 	/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
 	FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT
 
@@ -1514,16 +1514,12 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
  */
 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
 	if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
 		shrink_ple_window(vcpu);
 
 	vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
 
 	vmx_vcpu_pi_load(vcpu, cpu);
-
-	vmx->host_debugctlmsr = get_debugctlmsr();
 }
 
 void vmx_vcpu_put(struct kvm_vcpu *vcpu)
@@ -7458,8 +7454,8 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
 	}
 
 	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
-	if (vmx->host_debugctlmsr)
-		update_debugctlmsr(vmx->host_debugctlmsr);
+	if (vcpu->arch.host_debugctl)
+		update_debugctlmsr(vcpu->arch.host_debugctl);
 
 #ifndef CONFIG_X86_64
 	/*
@@ -340,8 +340,6 @@ struct vcpu_vmx {
 	/* apic deadline value in host tsc */
 	u64 hv_deadline_tsc;
 
-	unsigned long host_debugctlmsr;
-
 	/*
 	 * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
 	 * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
@@ -10968,6 +10968,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		set_debugreg(0, 7);
 	}
 
+	vcpu->arch.host_debugctl = get_debugctlmsr();
+
 	guest_timing_enter_irqoff();
 
 	for (;;) {
@@ -18,6 +18,7 @@
 #include "ucall_common.h"
 
 static bool mprotect_ro_done;
+static bool all_vcpus_hit_ro_fault;
 
 static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
 {
@@ -36,9 +37,9 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
 
 	/*
 	 * Write to the region while mprotect(PROT_READ) is underway.  Keep
-	 * looping until the memory is guaranteed to be read-only, otherwise
-	 * vCPUs may complete their writes and advance to the next stage
-	 * prematurely.
+	 * looping until the memory is guaranteed to be read-only and a fault
+	 * has occurred, otherwise vCPUs may complete their writes and advance
+	 * to the next stage prematurely.
 	 *
 	 * For architectures that support skipping the faulting instruction,
 	 * generate the store via inline assembly to ensure the exact length
@@ -56,7 +57,7 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
 #else
 		vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa);
 #endif
-	} while (!READ_ONCE(mprotect_ro_done));
+	} while (!READ_ONCE(mprotect_ro_done) || !READ_ONCE(all_vcpus_hit_ro_fault));
 
 	/*
 	 * Only architectures that write the entire range can explicitly sync,
@@ -81,6 +82,7 @@ struct vcpu_info {
 
 static int nr_vcpus;
 static atomic_t rendezvous;
+static atomic_t nr_ro_faults;
 
 static void rendezvous_with_boss(void)
 {
@@ -148,12 +150,16 @@ static void *vcpu_worker(void *data)
 	 * be stuck on the faulting instruction for other architectures. Go to
 	 * stage 3 without a rendezvous
 	 */
-	do {
-		r = _vcpu_run(vcpu);
-	} while (!r);
+	r = _vcpu_run(vcpu);
 	TEST_ASSERT(r == -1 && errno == EFAULT,
 		    "Expected EFAULT on write to RO memory, got r = %d, errno = %d", r, errno);
 
+	atomic_inc(&nr_ro_faults);
+	if (atomic_read(&nr_ro_faults) == nr_vcpus) {
+		WRITE_ONCE(all_vcpus_hit_ro_fault, true);
+		sync_global_to_guest(vm, all_vcpus_hit_ro_fault);
+	}
+
 #if defined(__x86_64__) || defined(__aarch64__)
 	/*
 	 * Verify *all* writes from the guest hit EFAULT due to the VMA now
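The synchronization added above (each worker bumps an atomic counter after taking its expected fault, and the last one flips a flag that the guest-side loop spins on) can be sketched in plain C with pthreads standing in for vCPU threads; all names here are illustrative, not the selftest's:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_WORKERS 4

static atomic_int nr_faults;
static atomic_bool all_workers_faulted;

static void *worker(void *arg)
{
	(void)arg;

	/* ... the expected fault would be taken here ... */

	/* The last worker to fault releases everyone spinning on the flag. */
	if (atomic_fetch_add(&nr_faults, 1) + 1 == NR_WORKERS)
		atomic_store(&all_workers_faulted, true);

	/* Spin until *all* workers have faulted, mirroring the guest loop. */
	while (!atomic_load(&all_workers_faulted))
		;

	return NULL;
}

int main(void)
{
	pthread_t threads[NR_WORKERS];

	for (int i = 0; i < NR_WORKERS; i++)
		pthread_create(&threads[i], NULL, worker, NULL);
	for (int i = 0; i < NR_WORKERS; i++)
		pthread_join(threads[i], NULL);

	printf("all %d workers faulted\n", atomic_load(&nr_faults));
	return 0;
}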
@@ -378,7 +384,6 @@ int main(int argc, char *argv[])
 	rendezvous_with_vcpus(&time_run2, "run 2");
 
 	mprotect(mem, slot_size, PROT_READ);
-	usleep(10);
 	mprotect_ro_done = true;
 	sync_global_to_guest(vm, mprotect_ro_done);
 
@@ -85,6 +85,7 @@ static void svm_run_l2(struct svm_test_data *svm, void *l2_code, int vector,
 
 	GUEST_ASSERT_EQ(ctrl->exit_code, (SVM_EXIT_EXCP_BASE + vector));
 	GUEST_ASSERT_EQ(ctrl->exit_info_1, error_code);
+	GUEST_ASSERT(!ctrl->int_state);
 }
 
 static void l1_svm_code(struct svm_test_data *svm)
@@ -122,6 +123,7 @@ static void vmx_run_l2(void *l2_code, int vector, uint32_t error_code)
 	GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI);
 	GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), vector);
 	GUEST_ASSERT_EQ(vmreadz(VM_EXIT_INTR_ERROR_CODE), error_code);
+	GUEST_ASSERT(!vmreadz(GUEST_INTERRUPTIBILITY_INFO));
 }
 
 static void l1_vmx_code(struct vmx_pages *vmx)
@@ -52,7 +52,8 @@ static void compare_xsave(u8 *from_host, u8 *from_guest)
 	bool bad = false;
 	for (i = 0; i < 4095; i++) {
 		if (from_host[i] != from_guest[i]) {
-			printf("mismatch at %02hhx | %02hhx %02hhx\n", i, from_host[i], from_guest[i]);
+			printf("mismatch at %u | %02hhx %02hhx\n",
+			       i, from_host[i], from_guest[i]);
 			bad = true;
 		}
 	}
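The old format string handed an int loop counter to %02hhx, which -Wformat-style checks flag (clang in particular) and -Werror then turns into a build failure. A minimal illustration of the corrected form, outside the test:

#include <stdio.h>

int main(void)
{
	unsigned char host = 0xab, guest = 0xcd;
	int i = 42;

	/* printf("mismatch at %02hhx ...", i, ...) draws a format warning from
	 * compilers that check it, and a hard error when built with -Werror. */
	printf("mismatch at %u | %02hhx %02hhx\n", i, host, guest);
	return 0;
}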