Files
Jamie Liu aecf514158 Don't XSAVE PKRU state.
We don't implement any of the `pkey_*` syscalls, so applications can't use
protection keys.

Updates #10087

PiperOrigin-RevId: 611287272
2024-02-28 17:47:27 -08:00

714 lines
26 KiB
ArmAsm

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "funcdata.h"
#include "textflag.h"
// CPU offsets.
//
// Byte offsets of fields inside the CPU and CPUArchState structures. The
// trailing "+checkoffset" annotation on each line names the field the
// constant corresponds to. In this file these offsets are applied to a vCPU
// pointer (typically loaded from ENTRY_CPU_SELF).
#define CPU_REGISTERS 72 // +checkoffset . CPU.registers
#define CPU_FPU_STATE 288 // +checkoffset . CPU.floatingPointState
#define CPU_ARCH_STATE 16 // +checkoffset . CPU.CPUArchState
#define CPU_ERROR_CODE CPU_ARCH_STATE+0 // +checkoffset . CPUArchState.errorCode
#define CPU_ERROR_TYPE CPU_ARCH_STATE+8 // +checkoffset . CPUArchState.errorType
#define CPU_VECTOR CPU_ARCH_STATE+16 // +checkoffset . CPUArchState.vector
#define CPU_FAULT_ADDR CPU_ARCH_STATE+24 // +checkoffset . CPUArchState.faultAddr
#define CPU_ENTRY CPU_ARCH_STATE+32 // +checkoffset . CPUArchState.kernelEntry
#define CPU_APP_GS_BASE CPU_ARCH_STATE+40 // +checkoffset . CPUArchState.appGsBase
#define CPU_HAS_XSAVE CPU_ARCH_STATE+48 // +checkoffset . CPUArchState.hasXSAVE
#define CPU_HAS_XSAVEOPT CPU_ARCH_STATE+49 // +checkoffset . CPUArchState.hasXSAVEOPT
#define CPU_HAS_FSGSBASE CPU_ARCH_STATE+50 // +checkoffset . CPUArchState.hasFSGSBASE
// kernelEntry offsets. In this file these are applied either to GS or to a
// register holding the value loaded from CPU_ENTRY.
#define ENTRY_SCRATCH0 256 // +checkoffset . kernelEntry.scratch0
#define ENTRY_STACK_TOP 264 // +checkoffset . kernelEntry.stackTop
#define ENTRY_CPU_SELF 272 // +checkoffset . kernelEntry.cpuSelf
#define ENTRY_KERNEL_CR3 280 // +checkoffset . kernelEntry.kernelCR3
// Bits.
//
// RFLAGS bit values: _RFLAGS_IF is bit 9 (512), _RFLAGS_IOPL0 is bit 12
// (4096) and _KERNEL_FLAGS is bit 1 (2). _RFLAGS_IOPL0 is what sysenter and
// exception test to tell user mode from kernel mode; _KERNEL_FLAGS is the
// value exception loads into RFLAGS via POPFQ on the user path.
#define _RFLAGS_IF 512 // +checkconst . _RFLAGS_IF
#define _RFLAGS_IOPL0 4096 // +checkconst . _RFLAGS_IOPL0
#define _KERNEL_FLAGS 2 // +checkconst . KernelFlagsSet
// Vectors.
//
// Vector numbers pushed by the exception stubs at the end of this file and
// reported back as the "vector" result. Syscall (256) is not pushed by any
// stub; it is the value sysenter writes as its output vector.
#define DivideByZero 0 // +checkconst . DivideByZero
#define Debug 1 // +checkconst . Debug
#define NMI 2 // +checkconst . NMI
#define Breakpoint 3 // +checkconst . Breakpoint
#define Overflow 4 // +checkconst . Overflow
#define BoundRangeExceeded 5 // +checkconst . BoundRangeExceeded
#define InvalidOpcode 6 // +checkconst . InvalidOpcode
#define DeviceNotAvailable 7 // +checkconst . DeviceNotAvailable
#define DoubleFault 8 // +checkconst . DoubleFault
#define CoprocessorSegmentOverrun 9 // +checkconst . CoprocessorSegmentOverrun
#define InvalidTSS 10 // +checkconst . InvalidTSS
#define SegmentNotPresent 11 // +checkconst . SegmentNotPresent
#define StackSegmentFault 12 // +checkconst . StackSegmentFault
#define GeneralProtectionFault 13 // +checkconst . GeneralProtectionFault
#define PageFault 14 // +checkconst . PageFault
#define X87FloatingPointException 16 // +checkconst . X87FloatingPointException
#define AlignmentCheck 17 // +checkconst . AlignmentCheck
#define MachineCheck 18 // +checkconst . MachineCheck
#define SIMDFloatingPointException 19 // +checkconst . SIMDFloatingPointException
#define VirtualizationException 20 // +checkconst . VirtualizationException
#define SecurityException 30 // +checkconst . SecurityException
#define SyscallInt80 128 // +checkconst . SyscallInt80
#define Syscall 256 // +checkconst . Syscall
// PtraceRegs offsets.
//
// Byte offsets of each field inside linux.PtraceRegs, in ascending order.
// Every field is 8 bytes wide (consecutive offsets differ by 8).
#define PTRACE_R15 0 // +checkoffset linux PtraceRegs.R15
#define PTRACE_R14 8 // +checkoffset linux PtraceRegs.R14
#define PTRACE_R13 16 // +checkoffset linux PtraceRegs.R13
#define PTRACE_R12 24 // +checkoffset linux PtraceRegs.R12
#define PTRACE_RBP 32 // +checkoffset linux PtraceRegs.Rbp
#define PTRACE_RBX 40 // +checkoffset linux PtraceRegs.Rbx
#define PTRACE_R11 48 // +checkoffset linux PtraceRegs.R11
#define PTRACE_R10 56 // +checkoffset linux PtraceRegs.R10
#define PTRACE_R9 64 // +checkoffset linux PtraceRegs.R9
#define PTRACE_R8 72 // +checkoffset linux PtraceRegs.R8
#define PTRACE_RAX 80 // +checkoffset linux PtraceRegs.Rax
#define PTRACE_RCX 88 // +checkoffset linux PtraceRegs.Rcx
#define PTRACE_RDX 96 // +checkoffset linux PtraceRegs.Rdx
#define PTRACE_RSI 104 // +checkoffset linux PtraceRegs.Rsi
#define PTRACE_RDI 112 // +checkoffset linux PtraceRegs.Rdi
#define PTRACE_ORIGRAX 120 // +checkoffset linux PtraceRegs.Orig_rax
#define PTRACE_RIP 128 // +checkoffset linux PtraceRegs.Rip
#define PTRACE_CS 136 // +checkoffset linux PtraceRegs.Cs
#define PTRACE_FLAGS 144 // +checkoffset linux PtraceRegs.Eflags
#define PTRACE_RSP 152 // +checkoffset linux PtraceRegs.Rsp
#define PTRACE_SS 160 // +checkoffset linux PtraceRegs.Ss
#define PTRACE_FS_BASE 168 // +checkoffset linux PtraceRegs.Fs_base
#define PTRACE_GS_BASE 176 // +checkoffset linux PtraceRegs.Gs_base
// The value for XCR0 is defined to xsave/xrstor everything except for PKRU and
// AMX regions.
//
// XCR0_EAX and XCR0_EDX are loaded into AX and DX immediately before each
// hand-encoded XSAVE, XSAVEOPT and XRSTOR in this file; those instructions
// take their state-component mask in EDX:EAX. In XCR0_DISABLED_MASK, bit 9 is
// the PKRU component and bits 17 and 18 are the AMX TILECFG and TILEDATA
// components. All other bits, including the whole upper half, are requested.
// TODO(gvisor.dev/issues/9896): Implement AMX support.
// TODO(gvisor.dev/issues/10087): Implement PKRU support.
#define XCR0_DISABLED_MASK ((1 << 9) | (1 << 17) | (1 << 18))
#define XCR0_EAX (0xffffffff ^ XCR0_DISABLED_MASK)
#define XCR0_EDX 0xffffffff
// REGISTERS_SAVE stores the general-purpose registers into the PtraceRegs
// layout found at offset(reg).
//
// It is a macro rather than a function because it may need to be executed in
// contexts where no stack is available for a call.
//
// Left untouched (the caller must save these itself): AX, SP, IP, FLAGS and
// all segment registers. The stores are independent of one another and are
// listed in ascending PtraceRegs offset order.
#define REGISTERS_SAVE(reg, offset) \
	MOVQ R15, offset+PTRACE_R15(reg); \
	MOVQ R14, offset+PTRACE_R14(reg); \
	MOVQ R13, offset+PTRACE_R13(reg); \
	MOVQ R12, offset+PTRACE_R12(reg); \
	MOVQ BP,  offset+PTRACE_RBP(reg); \
	MOVQ BX,  offset+PTRACE_RBX(reg); \
	MOVQ R11, offset+PTRACE_R11(reg); \
	MOVQ R10, offset+PTRACE_R10(reg); \
	MOVQ R9,  offset+PTRACE_R9(reg);  \
	MOVQ R8,  offset+PTRACE_R8(reg);  \
	MOVQ CX,  offset+PTRACE_RCX(reg); \
	MOVQ DX,  offset+PTRACE_RDX(reg); \
	MOVQ SI,  offset+PTRACE_RSI(reg); \
	MOVQ DI,  offset+PTRACE_RDI(reg);
// REGISTERS_LOAD reloads the general-purpose registers from the PtraceRegs
// layout found at offset(reg).
//
// It is a macro rather than a function because it may need to be executed in
// contexts where no stack is available for a call.
//
// Not loaded (the caller must restore these itself): AX, SP, IP, FLAGS and
// all segment registers. reg must not be one of the registers loaded here;
// every use in this file passes AX. The loads are listed in ascending
// PtraceRegs offset order.
#define REGISTERS_LOAD(reg, offset) \
	MOVQ offset+PTRACE_R15(reg), R15; \
	MOVQ offset+PTRACE_R14(reg), R14; \
	MOVQ offset+PTRACE_R13(reg), R13; \
	MOVQ offset+PTRACE_R12(reg), R12; \
	MOVQ offset+PTRACE_RBP(reg), BP;  \
	MOVQ offset+PTRACE_RBX(reg), BX;  \
	MOVQ offset+PTRACE_R11(reg), R11; \
	MOVQ offset+PTRACE_R10(reg), R10; \
	MOVQ offset+PTRACE_R9(reg),  R9;  \
	MOVQ offset+PTRACE_R8(reg),  R8;  \
	MOVQ offset+PTRACE_RCX(reg), CX;  \
	MOVQ offset+PTRACE_RDX(reg), DX;  \
	MOVQ offset+PTRACE_RSI(reg), SI;  \
	MOVQ offset+PTRACE_RDI(reg), DI;
// WRITE_CR3() writes the given CR3 value.
//
// The value to be written must already be in AX; the instruction is emitted
// as raw bytes.
//
// The code corresponds to:
//
// mov %rax, %cr3
//
#define WRITE_CR3() \
BYTE $0x0f; BYTE $0x22; BYTE $0xd8;
// SWAP_GS swaps the kernel GS (CPU).
//
// The bytes 0F 01 F8 encode the SWAPGS instruction, which exchanges the
// current GS base with the value held in the kernel GS base MSR.
#define SWAP_GS() \
BYTE $0x0F; BYTE $0x01; BYTE $0xf8;
// IRET returns from an interrupt frame.
//
// The bytes 48 CF encode IRET with a REX.W prefix (IRETQ), i.e. the 64-bit
// form. The callers in this file push SS, SP, FLAGS, CS and IP (in that
// order) before using it.
#define IRET() \
BYTE $0x48; BYTE $0xcf;
// SYSRET64 executes the sysret instruction.
//
// The bytes 48 0F 07 encode SYSRET with a REX.W prefix (return to 64-bit
// mode). SYSRET takes the return IP from CX and the flags from R11, which is
// why sysret loads those two registers right before using this macro.
#define SYSRET64() \
BYTE $0x48; BYTE $0x0f; BYTE $0x07;
// LOAD_KERNEL_STACK loads the kernel stack.
//
// It sets SP to the stackTop field of the kernelEntry addressed by entry.
// entry is either a register holding a kernelEntry pointer or GS.
#define LOAD_KERNEL_STACK(entry) \
MOVQ ENTRY_STACK_TOP(entry), SP;
// ADDR_OF_FUNC defines a function named 'name' that returns the address of
// 'symbol'.
//
// The generated function has no locals and a single 8-byte result (frame
// "$0-8"); it uses AX as scratch to move the address into the result slot.
#define ADDR_OF_FUNC(name, symbol) \
TEXT name,$0-8; \
MOVQ $symbol, AX; \
MOVQ AX, ret+0(FP); \
RET
// See kernel.go.
//
// Halt executes a single HLT instruction and returns once execution continues
// past it. It takes no arguments and uses no stack frame.
TEXT ·Halt(SB),NOSPLIT|NOFRAME,$0
HLT
RET
// See kernel_amd64.go.
//
// HaltAndWriteFSBase executes HLT and, once execution continues, reloads
// FS_BASE from the PtraceRegs pointed to by its single argument (regs) by
// calling writeFS. The 8-byte frame is the slot used to pass that value.
TEXT ·HaltAndWriteFSBase(SB),NOSPLIT,$8-8
HLT
// Restore FS_BASE.
MOVQ regs+0(FP), AX
MOVQ PTRACE_FS_BASE(AX), AX
PUSHQ AX // First argument (FS_BASE)
CALL ·writeFS(SB)
POPQ AX
RET
// jumpToKernel changes execution to the kernel address space.
//
// It does not jump anywhere itself. It rewrites its own return address,
// which sits at 0(SP) because the function is NOFRAME, by ORing in
// KernelStartAddress. The RET then returns to the kernel version of the
// caller's address.
//
// Clobbers AX and the flags.
TEXT ·jumpToKernel(SB),NOSPLIT|NOFRAME,$0
	MOVQ ·KernelStartAddress(SB), AX
	ORQ  0(SP), AX                   // AX = return address | KernelStartAddress.
	MOVQ AX, 0(SP)                   // Future return value.
	RET
// jumpToUser changes execution to the user address space.
//
// Like jumpToKernel, it works by rewriting its own return address at 0(SP),
// so that the RET lands on the user (lower-half) version of the caller's
// address.
//
// Clobbers AX, BX and the flags.
TEXT ·jumpToUser(SB),NOSPLIT|NOFRAME,$0
	// N.B. KernelStartAddress cannot be read from the upper half (its data
	// pages are not available there), so all the upper bits are simply
	// cleared with a constant mask. This assumes a 47-bit virtual address
	// space.
	MOVQ 0(SP), BX                   // Current return address.
	MOVQ $0x00007fffffffffff, AX     // Lower-half mask.
	ANDQ BX, AX                      // Future return value.
	MOVQ AX, 0(SP)
	RET
// See kernel_amd64.go.
//
// doSwitchToUser restores the application floating point state, installs the
// application GS base, enters the application through sysret or iret, and on
// the way back saves the application floating point state again.
//
// Argument layout, as referenced below:
//
//   cpu      +0(FP)
//   regs     +8(FP)
//   fpState  +16(FP)
//   userCR3  +24(FP)
//   needIRET +32(FP)
//   ret      +40(FP)   (vector reported by sysret/iret)
//
// The 16-byte frame size is for the saved values of MXCSR and the x87 control
// word.
TEXT ·doSwitchToUser(SB),NOSPLIT,$16-48
// We are passed pointers to heap objects, but do not store them in our
// local frame.
NO_LOCAL_POINTERS
// MXCSR and the x87 control word are the only floating point state
// that is callee-save and thus we must save.
STMXCSR mxcsr-0(SP)
FSTCW cw-8(SP)
// Restore application floating point state.
MOVQ cpu+0(FP), SI
MOVQ fpState+16(FP), DI
MOVB ·hasXSAVE(SB), BX
TESTB BX, BX
JZ no_xrstor
// Use xrstor to restore all available fp state.
// The component mask is passed in DX:AX.
MOVL $XCR0_EAX, AX
MOVL $XCR0_EDX, DX
BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f // XRSTOR64 0(DI)
JMP fprestore_done
no_xrstor:
// Fall back to fxrstor if xsave is not available.
FXRSTOR64 0(DI)
fprestore_done:
// Set application GS.
// The GS base is only rewritten when regs.Gs_base differs from the value
// cached in the CPU (appGsBase); the cache is updated first.
MOVQ regs+8(FP), R8
SWAP_GS()
MOVQ PTRACE_GS_BASE(R8), AX
CMPQ AX, CPU_APP_GS_BASE(SI)
JE skip_gs
MOVQ AX, CPU_APP_GS_BASE(SI)
PUSHQ AX
CALL ·writeGS(SB)
POPQ AX
skip_gs:
// Call sysret() or iret().
// 32 bytes are reserved for the callee's three arguments (cpu, regs,
// userCR3) plus its 8-byte result slot (vector), read back from 24(SP).
MOVQ userCR3+24(FP), CX
MOVQ needIRET+32(FP), R9
ADDQ $-32, SP
MOVQ SI, 0(SP) // cpu
MOVQ R8, 8(SP) // regs
MOVQ CX, 16(SP) // userCR3
TESTQ R9, R9
JNZ do_iret
CALL ·sysret(SB)
JMP done_sysret_or_iret
do_iret:
CALL ·iret(SB)
done_sysret_or_iret:
MOVQ 24(SP), AX // vector
ADDQ $32, SP
MOVQ AX, ret+40(FP)
// Save application floating point state.
MOVQ fpState+16(FP), DI
MOVB ·hasXSAVE(SB), BX
MOVB ·hasXSAVEOPT(SB), CX
TESTB BX, BX
JZ no_xsave
// Use xsave/xsaveopt to save all extended state.
// The component mask is passed in DX:AX.
MOVL $XCR0_EAX, AX
MOVL $XCR0_EDX, DX
TESTB CX, CX
JZ no_xsaveopt
BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI)
JMP fpsave_done
no_xsaveopt:
BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI)
JMP fpsave_done
no_xsave:
FXSAVE64 0(DI)
fpsave_done:
// Restore MXCSR and the x87 control word after one of the two floating
// point save cases above, to ensure the application versions are saved
// before being clobbered here.
LDMXCSR mxcsr-0(SP)
// FLDCW is a "waiting" x87 instruction, meaning it checks for pending
// unmasked exceptions before executing. Thus if userspace has unmasked
// an exception and has one pending, it can be raised by FLDCW even
// though the new control word will mask exceptions. To prevent this,
// we must first clear pending exceptions (which will be restored by
// XRSTOR, et al).
BYTE $0xDB; BYTE $0xE2; // FNCLEX
FLDCW cw-8(SP)
RET
// See entry_amd64.go.
//
// sysret enters the application using SYSRET. Arguments: cpu+0(FP),
// regs+8(FP), userCR3+16(FP); the 8-byte result slot that follows receives
// the vector. This function never executes a RET of its own: the RET that
// returns to its caller is performed later by sysenter or exception, using
// the SP and BP recorded in the vCPU below.
TEXT ·sysret(SB),NOSPLIT|NOFRAME,$0-32
// Set application FS. We can't do this in Go because Go code needs FS.
MOVQ regs+8(FP), AX
MOVQ PTRACE_FS_BASE(AX), AX
PUSHQ AX
CALL ·writeFS(SB)
POPQ AX
CALL ·jumpToKernel(SB)
// Save original state and stack. sysenter() or exception()
// from APP(gr3) will switch to this stack, set the return
// value (vector: 32(SP)) and then do RET, which will also
// automatically return to the lower half.
//
// Note that the regs pointer (AX) is stored in the vCPU's RAX slot; the
// user paths of sysenter and exception read it back from there.
MOVQ cpu+0(FP), BX
MOVQ regs+8(FP), AX
MOVQ userCR3+16(FP), CX
MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)
// save SP AX userCR3 on the kernel stack.
// They are pushed here because AX is needed for WRITE_CR3 and the
// application values of AX and SP can only be restored afterwards.
MOVQ CPU_ENTRY(BX), BX
LOAD_KERNEL_STACK(BX)
PUSHQ PTRACE_RSP(AX)
PUSHQ PTRACE_RAX(AX)
PUSHQ CX
// Restore user register state.
REGISTERS_LOAD(AX, 0)
MOVQ PTRACE_RIP(AX), CX // Needed for SYSRET.
MOVQ PTRACE_FLAGS(AX), R11 // Needed for SYSRET.
// restore userCR3, AX, SP.
POPQ AX // Get userCR3.
WRITE_CR3() // Switch to userCR3.
POPQ AX // Restore AX.
POPQ SP // Restore SP.
SYSRET64()
// sysenter or exception will write our return value and return to our
// caller.
// See entry_amd64.go.
//
// iret enters the application using IRET. It takes the same arguments as
// sysret (cpu+0(FP), regs+8(FP), userCR3+16(FP), followed by the 8-byte
// result slot for the vector) and, like sysret, never executes a RET of its
// own: sysenter or exception later returns to its caller using the SP and BP
// recorded in the vCPU below.
TEXT ·iret(SB),NOSPLIT|NOFRAME,$0-32
// Set application FS. We can't do this in Go because Go code needs FS.
MOVQ regs+8(FP), AX
MOVQ PTRACE_FS_BASE(AX), AX
PUSHQ AX // First argument (FS_BASE)
CALL ·writeFS(SB)
POPQ AX
CALL ·jumpToKernel(SB)
// Save original state and stack. sysenter() or exception()
// from APP(gr3) will switch to this stack, set the return
// value (vector: 32(SP)) and then do RET, which will also
// automatically return to the lower half.
//
// Note that the regs pointer (AX) is stored in the vCPU's RAX slot; the
// user paths of sysenter and exception read it back from there.
MOVQ cpu+0(FP), BX
MOVQ regs+8(FP), AX
MOVQ userCR3+16(FP), CX
MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)
// Build an IRET frame & restore state.
// The frame is pushed in the order SS, SP, FLAGS, CS, IP; the application
// AX and userCR3 go on top so they can be popped around WRITE_CR3.
MOVQ CPU_ENTRY(BX), BX
LOAD_KERNEL_STACK(BX)
PUSHQ PTRACE_SS(AX)
PUSHQ PTRACE_RSP(AX)
PUSHQ PTRACE_FLAGS(AX)
PUSHQ PTRACE_CS(AX)
PUSHQ PTRACE_RIP(AX)
PUSHQ PTRACE_RAX(AX) // Save AX on kernel stack.
PUSHQ CX // Save userCR3 on kernel stack.
REGISTERS_LOAD(AX, 0) // Restore most registers.
POPQ AX // Get userCR3.
WRITE_CR3() // Switch to userCR3.
POPQ AX // Restore AX.
IRET()
// sysenter or exception will write our return value and return to our
// caller.
// See entry_amd64.go.
//
// resume restores the register state saved in the vCPU and continues
// execution there with IRET. The vCPU is located through ENTRY_CPU_SELF off
// GS. It is reached by JMP from start, sysenter and exception and does not
// return to them.
TEXT ·resume(SB),NOSPLIT|NOFRAME,$0
// See iret, above.
MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
// Build the IRET frame (SS, SP, FLAGS, CS, IP) from the saved registers.
PUSHQ CPU_REGISTERS+PTRACE_SS(AX)
PUSHQ CPU_REGISTERS+PTRACE_RSP(AX)
PUSHQ CPU_REGISTERS+PTRACE_FLAGS(AX)
PUSHQ CPU_REGISTERS+PTRACE_CS(AX)
PUSHQ CPU_REGISTERS+PTRACE_RIP(AX)
REGISTERS_LOAD(AX, CPU_REGISTERS)
// AX is restored last because it held the vCPU pointer until now.
MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX
IRET()
// See entry_amd64.go.
//
// start is the vCPU entrypoint. On entry AX holds the CPU pointer. It sets
// up a frame pointer, programs FS and GS, calls startGo with the CPU as its
// argument and then jumps to resume.
TEXT ·start(SB),NOSPLIT|NOFRAME,$0
// N.B. This is the vCPU entrypoint. It is not called from Go code and
// thus pushes and pops values on the stack until calling into Go
// (startGo) because we aren't usually a typical Go assembly frame.
PUSHQ $0x0 // Previous frame pointer.
MOVQ SP, BP // Set frame pointer.
PUSHQ AX // Save CPU.
// Set up environment required by Go before calling startGo: Go needs
// FS_BASE and floating point initialized.
MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX
PUSHQ BX // First argument (FS_BASE)
CALL ·writeFS(SB)
POPQ BX
// NOTE(review): AX is still used as the CPU pointer here, so this relies on
// writeFS leaving AX intact — confirm against writeFS.
MOVQ CPU_APP_GS_BASE(AX),BX
PUSHQ BX
CALL ·writeGS(SB)
POPQ BX
SWAP_GS()
// First argument (CPU) already at bottom of stack.
CALL ·startGo(SB) // Call Go hook.
JMP ·resume(SB) // Restore to registers.
// AddrOfStart returns the address of start.
ADDR_OF_FUNC(·AddrOfStart(SB), ·start(SB));
// See entry_amd64.go.
//
// sysenter is the entry point whose address is exported by addrOfSysenter.
// On entry CX holds the interrupted IP and R11 the interrupted flags (they
// are stored as such below). It dispatches on the _RFLAGS_IOPL0 bit of R11:
//
// 1) user: the application registers are saved into the PtraceRegs whose
//    pointer sysret/iret stashed in the vCPU's RAX slot, the stack and base
//    pointer recorded by sysret/iret are reloaded, and the function returns
//    to that frame with the vector set to Syscall.
//
// 2) kernel: the registers and floating point state are saved into the vCPU,
//    kernelSyscall is called on the kernel stack, and execution continues via
//    resume.
TEXT ·sysenter(SB),NOSPLIT|NOFRAME,$0
// _RFLAGS_IOPL0 is always set in the user mode and it is never set in
// the kernel mode. See the comment of UserFlagsSet for more details.
TESTL $_RFLAGS_IOPL0, R11
JZ kernel
user:
SWAP_GS()
MOVQ AX, ENTRY_SCRATCH0(GS) // Save user AX on scratch.
MOVQ ENTRY_KERNEL_CR3(GS), AX // Get kernel cr3 on AX.
WRITE_CR3() // Switch to kernel cr3.
MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs.
REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX.
MOVQ CX, PTRACE_RIP(AX)
MOVQ R11, PTRACE_FLAGS(AX)
MOVQ SP, PTRACE_RSP(AX)
MOVQ ENTRY_SCRATCH0(GS), CX // Load saved user AX value.
MOVQ CX, PTRACE_RAX(AX) // Save everything else.
MOVQ CX, PTRACE_ORIGRAX(AX)
// NOTE(review): CPU_HAS_FSGSBASE is an offset into the CPU structure, but it
// is applied to GS here, whereas every other GS-relative access in this file
// uses an ENTRY_* (kernelEntry) offset and CPU_* offsets are applied to the
// vCPU pointer loaded from ENTRY_CPU_SELF(GS). Confirm this reads the
// intended hasFSGSBASE field.
CMPB CPU_HAS_FSGSBASE(GS), $1
JNE sysenter_skip_gs
// Temporarily swap back to the application GS to read its base.
SWAP_GS()
BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xcb; // rdgsbase rbx
MOVQ BX, PTRACE_GS_BASE(AX)
SWAP_GS()
sysenter_skip_gs:
MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Get stacks.
MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code.
MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user.
CALL ·jumpToUser(SB)
// Restore kernel FS_BASE.
MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX
PUSHQ BX // First argument (FS_BASE)
CALL ·writeFS(SB)
POPQ BX
MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
// Return to the kernel, where the frame is:
//
// vector (sp+32)
// userCR3 (sp+24)
// regs (sp+16)
// cpu (sp+8)
// vcpu.Switch (sp+0)
//
MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer.
MOVQ $Syscall, 32(SP) // Output vector.
RET
kernel:
// We can't restore the original stack, but we can access the registers
// in the CPU state directly. No need for temporary juggling.
MOVQ AX, ENTRY_SCRATCH0(GS)
MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
REGISTERS_SAVE(AX, CPU_REGISTERS)
MOVQ CX, CPU_REGISTERS+PTRACE_RIP(AX)
MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(AX)
MOVQ SP, CPU_REGISTERS+PTRACE_RSP(AX)
MOVQ ENTRY_SCRATCH0(GS), BX
MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX)
MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX)
MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code.
MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel.
MOVQ $0xffffffffffffffff, CPU_VECTOR(AX) // Set vector to -1 (all ones).
// Save floating point state. CPU.floatingPointState is a slice, so the
// first word of CPU.floatingPointState is a pointer to the destination
// array.
// The XSAVE flags are read into BX and CX before AX (the vCPU pointer) is
// overwritten with the component mask.
MOVQ CPU_FPU_STATE(AX), DI
MOVB CPU_HAS_XSAVE(AX), BX
MOVB CPU_HAS_XSAVEOPT(AX), CX
TESTB BX, BX
JZ no_xsave
// Use xsave/xsaveopt to save all extended state.
MOVL $XCR0_EAX, AX
MOVL $XCR0_EDX, DX
TESTB CX, CX
JZ no_xsaveopt
BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI)
JMP fpsave_done
no_xsaveopt:
BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI)
JMP fpsave_done
no_xsave:
FXSAVE64 0(DI)
fpsave_done:
// Call the syscall trampoline.
LOAD_KERNEL_STACK(GS)
MOVQ ENTRY_CPU_SELF(GS), AX // AX contains the vCPU.
PUSHQ AX // First argument (vCPU).
CALL ·kernelSyscall(SB) // Call the trampoline.
POPQ AX // Pop vCPU.
// We only trigger a bluepill entry in the bluepill function, and can
// therefore be guaranteed that there is no floating point state to be
// loaded on resuming from halt.
JMP ·resume(SB)
// addrOfSysenter returns the address of sysenter.
ADDR_OF_FUNC(·addrOfSysenter(SB), ·sysenter(SB));
// exception is a generic exception handler.
//
// There are two cases handled:
//
// 1) An exception in kernel mode: this results in saving the state at the time
// of the exception and calling the defined hook.
//
// 2) An exception in guest mode: the original kernel frame is restored, and
// the vector & error codes are pushed as return values.
//
// It is entered by JMP from the per-vector stubs, which push the vector (and
// a zero error code for vectors that do not supply one) first.
//
// See below for the stubs that call exception.
TEXT ·exception(SB),NOSPLIT|NOFRAME,$0
// Determine whether the exception occurred in kernel mode or user
// mode, based on the flags. We expect the following stack:
//
// SS (sp+48)
// SP (sp+40)
// FLAGS (sp+32)
// CS (sp+24)
// IP (sp+16)
// ERROR_CODE (sp+8)
// VECTOR (sp+0)
//
TESTL $_RFLAGS_IOPL0, 32(SP)
JZ kernel
user:
SWAP_GS()
ADDQ $-8, SP // Adjust for flags.
MOVQ $_KERNEL_FLAGS, 0(SP); BYTE $0x9d; // Reset flags (POPFQ).
PUSHQ AX // Save user AX on stack.
MOVQ ENTRY_KERNEL_CR3(GS), AX // Get kernel cr3 on AX.
WRITE_CR3() // Switch to kernel cr3.
MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs.
REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX.
POPQ BX // Restore original AX.
MOVQ BX, PTRACE_RAX(AX) // Save it.
MOVQ BX, PTRACE_ORIGRAX(AX)
// NOTE(review): CPU_HAS_FSGSBASE is an offset into the CPU structure, but it
// is applied to GS here, whereas every other GS-relative access in this file
// uses an ENTRY_* (kernelEntry) offset and CPU_* offsets are applied to the
// vCPU pointer loaded from ENTRY_CPU_SELF(GS). Confirm this reads the
// intended hasFSGSBASE field.
CMPB CPU_HAS_FSGSBASE(GS), $1
JNE exception_skip_gs
// Temporarily swap back to the application GS to read its base.
SWAP_GS()
BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xcb; // rdgsbase rbx
MOVQ BX, PTRACE_GS_BASE(AX)
SWAP_GS()
exception_skip_gs:
// SP is back at VECTOR here, so the offsets match the layout above.
MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX)
MOVQ 24(SP), CX; MOVQ CX, PTRACE_CS(AX)
MOVQ 32(SP), DX; MOVQ DX, PTRACE_FLAGS(AX)
MOVQ 40(SP), DI; MOVQ DI, PTRACE_RSP(AX)
MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX)
CALL ·jumpToUser(SB)
// Restore kernel FS_BASE.
MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX
PUSHQ BX // First argument (FS_BASE)
CALL ·writeFS(SB)
POPQ BX
// Copy out and return.
// The vector and error code are read before SP is switched away from the
// exception stack.
MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
MOVQ 0(SP), BX // Load vector.
MOVQ 8(SP), CX // Load error code.
MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Original stack (kernel version).
MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer.
MOVQ CX, CPU_ERROR_CODE(AX) // Set error code.
MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user.
MOVQ BX, 32(SP) // Output vector.
RET
kernel:
// As per above, we can save directly.
PUSHQ AX
MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
REGISTERS_SAVE(AX, CPU_REGISTERS)
POPQ BX
MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX)
MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX)
MOVQ 16(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RIP(AX)
MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(AX)
MOVQ 40(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RSP(AX)
// Set the error code and adjust the stack.
MOVQ 8(SP), BX // Load the error code.
MOVQ BX, CPU_ERROR_CODE(AX) // Copy out to the CPU.
MOVQ 0(SP), BX // Load the vector.
MOVQ BX, CPU_VECTOR(AX) // Copy out to the CPU.
BYTE $0x0f; BYTE $0x20; BYTE $0xd3; // MOV CR2, RBX
MOVQ BX, CPU_FAULT_ADDR(AX)
MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel.
// Save floating point state. CPU.floatingPointState is a slice, so the
// first word of CPU.floatingPointState is a pointer to the destination
// array.
// The XSAVE flags are read into BX and CX before AX (the vCPU pointer) is
// overwritten with the component mask.
MOVQ CPU_FPU_STATE(AX), DI
MOVB CPU_HAS_XSAVE(AX), BX
MOVB CPU_HAS_XSAVEOPT(AX), CX
TESTB BX, BX
JZ no_xsave
// Use xsave/xsaveopt to save all extended state.
MOVL $XCR0_EAX, AX
MOVL $XCR0_EDX, DX
TESTB CX, CX
JZ no_xsaveopt
BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI)
JMP fpsave_done
no_xsaveopt:
BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI)
JMP fpsave_done
no_xsave:
FXSAVE64 0(DI)
fpsave_done:
// Call the exception trampoline.
// The vector must be read before the stack is switched.
MOVQ 0(SP), BX // BX contains the vector.
LOAD_KERNEL_STACK(GS)
MOVQ ENTRY_CPU_SELF(GS), AX // AX contains the vCPU.
PUSHQ BX // Second argument (vector).
PUSHQ AX // First argument (vCPU).
CALL ·kernelException(SB) // Call the trampoline.
POPQ BX // Pop vector.
POPQ AX // Pop vCPU.
// We only trigger a bluepill entry in the bluepill function, and can
// therefore be guaranteed that there is no floating point state to be
// loaded on resuming from halt.
JMP ·resume(SB)
// EXCEPTION_WITH_ERROR defines two functions: 'addr', which returns the
// address of 'symbol', and 'symbol' itself, a stub that pushes the vector
// number 'value' and jumps to exception. It is used for vectors where an
// error code is already on the stack when the stub runs, so only the vector
// has to be added to produce the layout exception expects.
#define EXCEPTION_WITH_ERROR(value, symbol, addr) \
ADDR_OF_FUNC(addr, symbol); \
TEXT symbol,NOSPLIT|NOFRAME,$0; \
PUSHQ $value; \
JMP ·exception(SB);
// EXCEPTION_WITHOUT_ERROR is the same as EXCEPTION_WITH_ERROR, except that
// the stub first pushes a zero in the error-code slot so that the stack
// layout seen by exception is identical in both cases.
#define EXCEPTION_WITHOUT_ERROR(value, symbol, addr) \
ADDR_OF_FUNC(addr, symbol); \
TEXT symbol,NOSPLIT|NOFRAME,$0; \
PUSHQ $0x0; \
PUSHQ $value; \
JMP ·exception(SB);
// Exception stubs, one per vector. Each line defines the stub itself and an
// addrOf* function returning its address. The WITH_ERROR variant is used for
// DoubleFault, InvalidTSS, SegmentNotPresent, StackSegmentFault,
// GeneralProtectionFault, PageFault, AlignmentCheck and SecurityException;
// all other vectors use the WITHOUT_ERROR variant, which pushes a zero error
// code.
EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB), ·addrOfDivideByZero(SB))
EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB), ·addrOfDebug(SB))
EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB), ·addrOfNMI(SB))
EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB), ·addrOfBreakpoint(SB))
EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB), ·addrOfOverflow(SB))
EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB), ·addrOfBoundRangeExceeded(SB))
EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB), ·addrOfInvalidOpcode(SB))
EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB), ·addrOfDeviceNotAvailable(SB))
EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB), ·addrOfDoubleFault(SB))
EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB), ·addrOfCoprocessorSegmentOverrun(SB))
EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB), ·addrOfInvalidTSS(SB))
EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB), ·addrOfSegmentNotPresent(SB))
EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB), ·addrOfStackSegmentFault(SB))
EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB), ·addrOfGeneralProtectionFault(SB))
EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB), ·addrOfPageFault(SB))
EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB), ·addrOfX87FloatingPointException(SB))
EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB), ·addrOfAlignmentCheck(SB))
EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB), ·addrOfMachineCheck(SB))
EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB), ·addrOfSimdFloatingPointException(SB))
EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB), ·addrOfVirtualizationException(SB))
EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB), ·addrOfSecurityException(SB))
EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB), ·addrOfSyscallInt80(SB))