Files
gvisor/pkg/ring0/kernel_amd64.go
Jamie Liu 2247aceb99 kvm: enable CPUID faulting on all VCPUs
This feature is controlled by an MSR; MSRs are per-CPU.

The Intel SDM doesn't document CPUID faulting, at least as of the Dec 2024
revision; despite the deleted comment in ring0/kernel_amd64.go, there is no
Vol. 3 Table 2-43, and every table in Vol. 4 ("Model-Specific Registers") lists
bit 31 in MSR_PLATFORM_INFO as "reserved". The only documentation seems to be
that cited by Linux's e9ea1e7f53b85 ("x86/arch_prctl: Add
ARCH_[GET|SET]_CPUID"): "Intel Virtualization Technology FlexMigration
Application Note" 323850-004, 2012. This document positions CPUID faulting as
an alternative way to support cross-CPU migration for VMs that don't use VMX;
consequently it does not clarify if CPUID faulting is effective in guest ("VMX
non-root") mode, or if the CPUID VM exit takes precedence. If the former is the
case then CPUID faulting is probably faster than setting app CPUID with
KVM_SET_CPUID2, and vice versa. But regardless, this is much simpler.

PiperOrigin-RevId: 733113944
2025-03-03 17:15:39 -08:00

324 lines
9.6 KiB
Go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build amd64
// +build amd64
package ring0
import (
"encoding/binary"
"reflect"
"gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/sentry/arch"
)
// HaltAndWriteFSBase halts execution. On resume, it sets FS_BASE from the
// value in regs.
//
// The declaration has no Go body, so the implementation is supplied outside
// this file (presumably in assembly; confirm against the package's .s files).
func HaltAndWriteFSBase(regs *arch.Registers)
// init initializes architecture-specific state.
//
// It allocates the per-CPU kernelEntry array so that it starts on a page
// boundary and covers at least one full page, allocates the IDT that is
// shared by all CPUs, and installs every entry of handlers into that IDT.
// It panics if the IDT type is not exactly one page in size or if the
// allocated IDT is not page aligned.
func (k *Kernel) init(maxCPUs int) {
	perEntry := reflect.TypeOf(kernelEntry{}).Size()

	// Keep re-allocating with a doubled padding factor until the backing
	// array is page aligned and at least one page long.
	for padding := 1; ; padding <<= 1 {
		count := maxCPUs + padding - 1
		candidate := make([]kernelEntry, count)
		span := perEntry * uintptr(count)
		base := reflect.ValueOf(&candidate[0]).Pointer()
		if base&(hostarch.PageSize-1) == 0 && span >= hostarch.PageSize {
			// The runtime forces power-of-2 alignment for allocations,
			// so an aligned first address plus a chunk of at least a
			// full page is sufficient.
			k.cpuEntries = candidate
			break
		}
	}

	k.globalIDT = &idt64{}
	if reflect.TypeOf(idt64{}).Size() != hostarch.PageSize {
		panic("Size of globalIDT should be PageSize")
	}
	if reflect.ValueOf(k.globalIDT).Pointer()&(hostarch.PageSize-1) != 0 {
		panic("Allocated globalIDT should be page aligned")
	}

	// Populate the IDT; it is identical for every CPU.
	for vec, entry := range handlers {
		// Breakpoint and Overflow may be raised from any privilege
		// level; every other vector is restricted to level zero.
		priv := 0
		switch vec {
		case Breakpoint, Overflow:
			priv = 3
		}
		// Every gate uses interrupt stack 1 (the ist argument); the
		// matching ist1 fields are filled in when each CPU's TSS is
		// initialized.
		k.globalIDT[vec].setInterrupt(Kcode, uint64(kernelFunc(entry)), priv, 1 /* ist */)
	}
}
// EntryRegions returns the set of kernel entry regions (must be mapped).
//
// The result maps each region's start address (after RoundDown) to its end
// address (after RoundUp). Two regions are reported: the per-CPU kernelEntry
// array and the global IDT.
func (k *Kernel) EntryRegions() map[uintptr]uintptr {
	regions := map[uintptr]uintptr{}

	// record adds the range [start, start+length) to regions after
	// rounding both ends. The second result of RoundUp is discarded.
	record := func(start, length uintptr) {
		limit, _ := hostarch.Addr(start + length).RoundUp()
		regions[uintptr(hostarch.Addr(start).RoundDown())] = uintptr(limit)
	}

	// Per-CPU kernel entries.
	record(
		reflect.ValueOf(&k.cpuEntries[0]).Pointer(),
		reflect.TypeOf(kernelEntry{}).Size()*uintptr(len(k.cpuEntries)),
	)

	// The IDT shared by all CPUs.
	record(
		reflect.ValueOf(k.globalIDT).Pointer(),
		reflect.TypeOf(idt64{}).Size(),
	)

	return regions
}
// init initializes architecture-specific state.
//
// It binds the CPU to its kernelEntry slot (indexed by cpuID), fills in the
// GDT including the two-slot TSS descriptor, points the TSS stack fields at
// the top of this CPU's stack, places the I/O bitmap base past the end of
// the TSS, and seeds the kernel register set and feature flags.
func (c *CPU) init(cpuID int) {
	c.kernelEntry = &c.kernel.cpuEntries[cpuID]
	c.cpuSelf = c

	// Slot zero is the null segment; the fixed kernel and user segments
	// follow.
	c.gdt[0].setNull()
	c.gdt[segKcode] = KernelCodeSegment
	c.gdt[segKdata] = KernelDataSegment
	c.gdt[segUcode32] = UserCodeSegment32
	c.gdt[segUdata] = UserDataSegment
	c.gdt[segUcode64] = UserCodeSegment64

	// The task segment descriptor spans two entries: the first carries the
	// low 32 bits of the base and the limit, the second the high 32 bits
	// of the base.
	base, limit, _ := c.TSS()
	c.gdt[segTss].set(
		uint32(base),
		uint32(limit),
		0, // Privilege level zero.
		SegmentDescriptorPresent|
			SegmentDescriptorAccess|
			SegmentDescriptorWrite|
			SegmentDescriptorExecute)
	c.gdt[segTssHi].setHi(uint32(base >> 32))

	// Both rsp0 and ist1 in the TSS point at the top of the kernel stack
	// (virtual address), split into 32-bit halves.
	top := c.StackTop()
	topLo, topHi := uint32(top), uint32(top>>32)
	c.stackTop = top
	c.tss.rsp0Lo, c.tss.rsp0Hi = topLo, topHi
	c.tss.ist1Lo, c.tss.ist1Hi = topLo, topHi

	// Put the I/O bitmap base address beyond the last byte of the TSS to
	// block access to the entire I/O address range.
	//
	// From section 18.5.2 "I/O Permission Bit Map" from Intel SDM vol1:
	// I/O addresses not spanned by the map are treated as if they had set
	// bits in the map.
	c.tss.ioPerm = limit + 1

	// The kernel segments are set once and never change: CS uses the
	// kernel code selector, every other segment the kernel data selector.
	kdata := uint64(Kdata)
	c.registers.Cs = uint64(Kcode)
	c.registers.Ds = kdata
	c.registers.Es = kdata
	c.registers.Ss = kdata
	c.registers.Fs = kdata
	c.registers.Gs = kdata

	// Mandatory flags.
	c.registers.Eflags = KernelFlagsSet

	// Copy the package-level feature flags onto the CPU.
	c.hasXSAVE = hasXSAVE
	c.hasXSAVEOPT = hasXSAVEOPT
	c.hasFSGSBASE = hasFSGSBASE
}
// StackTop returns the kernel's stack address.
//
// The value is the kernel address of c.stack[0] plus len(c.stack).
//
//go:nosplit
func (c *CPU) StackTop() uint64 {
	bottom := uint64(kernelAddr(&c.stack[0]))
	return bottom + uint64(len(c.stack))
}
// IDT returns the CPU's IDT base and limit.
//
// The base is the kernel address of the first entry of the IDT shared by all
// CPUs. The limit is the encoded size of that table in bytes, minus one.
//
// globalIDT is already a pointer to the table, so it is handed to
// binary.Size as-is. binary.Size follows only a single level of indirection
// and reports -1 for a pointer to a pointer; taking the address of the field
// would therefore turn the limit into uint16(-2) == 0xfffe rather than the
// table size minus one.
//
//go:nosplit
func (c *CPU) IDT() (uint64, uint16) {
	idt := c.kernel.globalIDT
	return uint64(kernelAddr(&idt[0])), uint16(binary.Size(idt) - 1)
}
// GDT returns the CPU's GDT base and limit.
//
// The base is the kernel address of the first GDT entry; the limit is
// 8*segLast bytes minus one.
//
//go:nosplit
func (c *CPU) GDT() (uint64, uint16) {
	base := uint64(kernelAddr(&c.gdt[0]))
	limit := uint16(8*segLast - 1)
	return base, limit
}
// TSS returns the CPU's TSS base, limit and value.
//
// The base is the kernel address of c.tss, the limit is its encoded size
// minus one, and the value is a pointer to the TSS descriptor slot in the
// GDT.
//
//go:nosplit
func (c *CPU) TSS() (uint64, uint16, *SegmentDescriptor) {
	var (
		base  = uint64(kernelAddr(&c.tss))
		limit = uint16(binary.Size(&c.tss) - 1)
	)
	return base, limit, &c.gdt[segTss]
}
// CR0 returns the CPU's CR0 value.
//
// The value is a fixed combination of the PE, PG, AM, ET and NE bits; it
// does not depend on any state of c.
//
//go:nosplit
func (c *CPU) CR0() uint64 {
	var cr0 uint64 = _CR0_PE |
		_CR0_PG |
		_CR0_AM |
		_CR0_ET |
		_CR0_NE
	return cr0
}
// CR4 returns the CPU's CR4 value.
//
// A fixed set of bits (PAE, PSE, PGE, OSFXSR, OSXMMEXCPT) is always present.
// Each remaining bit is added only when the corresponding package-level
// feature flag is set.
//
//go:nosplit
func (c *CPU) CR4() uint64 {
	value := uint64(_CR4_PAE | _CR4_PSE | _CR4_PGE | _CR4_OSFXSR | _CR4_OSXMMEXCPT)

	// Feature-dependent bits, paired with the flag that enables each one.
	optional := [...]struct {
		enabled bool
		bit     uint64
	}{
		{hasPCID, _CR4_PCIDE},
		{hasXSAVE, _CR4_OSXSAVE},
		{hasSMEP, _CR4_SMEP},
		{hasSMAP, _CR4_SMAP},
		{hasFSGSBASE, _CR4_FSGSBASE},
		{hasUMIP, _CR4_UMIP},
	}
	for i := range optional {
		if optional[i].enabled {
			value |= optional[i].bit
		}
	}
	return value
}
// EFER returns the CPU's EFER value.
//
// The value is a fixed combination of the LME, LMA, SCE and NX bits; it does
// not depend on any state of c.
//
//go:nosplit
func (c *CPU) EFER() uint64 {
	var efer uint64 = _EFER_LME |
		_EFER_LMA |
		_EFER_SCE |
		_EFER_NX
	return efer
}
// IsCanonical indicates whether addr is canonical per the amd64 spec.
//
// An address is accepted when bits 63 through 47 are either all clear
// (addr <= 0x00007fffffffffff) or all set (addr >= 0xffff800000000000).
//
//go:nosplit
func IsCanonical(addr uint64) bool {
	// Shifting out the low 47 bits leaves the 17 bits that must agree.
	switch addr >> 47 {
	case 0, 0x1ffff:
		return true
	}
	return false
}
// SwitchToUser performs either a sysret or an iret.
//
// The return value is the vector that interrupted execution.
//
// switchOpts supplies the user page tables and PCIDs, the register set to
// load, the floating point state, and the FullRestore flag, which is
// forwarded to doSwitchToUser as needIRET (1 when set, 0 otherwise).
//
// This function will not split the stack. Callers will probably want to call
// runtime.entersyscall (and pair with a call to runtime.exitsyscall) prior to
// calling this function.
//
// When this is done, this region is quite sensitive to things like system
// calls. After calling entersyscall, any memory used must have been allocated
// and no function calls without go:nosplit are permitted. Any calls made here
// are protected appropriately (e.g. IsCanonical and CR3).
//
// Also note that this function transitively depends on the compiler generating
// code that uses IP-relative addressing instead of absolute addresses. That's
// the case for amd64, but may not be the case for other architectures.
//
// Precondition: the Rip, Rsp, Fs and Gs registers must be canonical.
//
// +checkescape:all
//
//go:nosplit
func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
// Compute the user CR3 and record the kernel CR3 on the CPU. The first CR3
// argument is the negation of switchOpts.Flush for the user tables and is
// always true for the kernel tables.
userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID)
c.kernelCR3 = uintptr(c.kernel.PageTables.CR3(true, switchOpts.KernelPCID))
// Sanitize registers.
//
// regs is a pointer (it is passed on as *arch.Registers), so these writes
// modify the caller's register set in place.
regs := switchOpts.Registers
regs.Eflags &= ^uint64(UserFlagsClear)
regs.Eflags |= UserFlagsSet
regs.Cs = uint64(Ucode64) // Required for iret.
regs.Ss = uint64(Udata) // Ditto.
// Perform the switch.
needIRET := uint64(0)
if switchOpts.FullRestore {
needIRET = 1
}
vector = doSwitchToUser(c, regs, switchOpts.FloatingPointState.BytePointer(), userCR3, needIRET) // escapes: no.
return
}
// doSwitchToUser is the low-level switch routine invoked by SwitchToUser.
//
// The declaration has no Go body, so the implementation is supplied outside
// this file (presumably in assembly, given the frame-offset annotations on
// each parameter; confirm against the package's .s files).
//
// SwitchToUser passes needIRET as 1 when a full restore is requested and 0
// otherwise, and returns the resulting Vector to its own caller.
func doSwitchToUser(
cpu *CPU, // +0(FP)
regs *arch.Registers, // +8(FP)
fpState *byte, // +16(FP)
userCR3 uint64, // +24(FP)
needIRET uint64) Vector // +32(FP), +40(FP)
// startGo is the CPU entrypoint.
//
// This is called from the start asm stub (see entry_amd64.go); on return the
// registers in c.registers will be restored (not segments).
//
// In order, it passes the kernel address of this CPU's kernelEntry to
// writeGS, initializes floating point state, loads XCR0 when XSAVE is
// available, and programs the LSTAR, SYSCALL_MASK, STAR and CSTAR MSRs.
//
// Note that any code written in Go should adhere to Go expected environment:
//   - Initialized floating point state (required for optimizations using
//     floating point instructions).
//   - Go TLS in FS_BASE (this is required by splittable functions, calls into
//     the runtime, calls to assembly functions (Go 1.17+ ABI wrappers access
//     TLS)).
//
//go:nosplit
func startGo(c *CPU) {
// Save per-cpu.
writeGS(kernelAddr(c.kernelEntry))
// Initialize floating point state.
//
// TODO(mpratt): Note that per the note above, this should be done
// before entering Go code. However for simplicity we leave it here for
// now, since the small critical sections with undefined FPU state
// should only contain very limited use of floating point instructions
// (notably, use of XMM15 as a zero register).
fninit()
// Need to sync XCR0 with the host, because xsave and xrstor can be
// called from different contexts.
if hasXSAVE {
// Exclude MPX bits. MPX has been deprecated and we have seen
// cases when it isn't supported in VM.
xcr0 := localXCR0 &^ (cpuid.XSAVEFeatureBNDCSR | cpuid.XSAVEFeatureBNDREGS)
xsetbv(0, xcr0)
}
// Set the syscall target.
wrmsr(_MSR_LSTAR, kernelFunc(addrOfSysenter()))
wrmsr(_MSR_SYSCALL_MASK, KernelFlagsClear|_RFLAGS_DF)
// NOTE: This depends on having the 64-bit segments immediately
// following the 32-bit user segments. This is simply the way the
// sysret instruction is designed to work (it assumes they follow).
wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48))
// CSTAR is pointed at the same entry as LSTAR.
wrmsr(_MSR_CSTAR, kernelFunc(addrOfSysenter()))
}
// ReadCR2 reads the current CR2 value.
//
// It is the exported wrapper around the package-private readCR2.
//
//go:nosplit
func ReadCR2() uintptr {
	cr2 := readCR2()
	return cr2
}