mirror of
https://github.com/netbirdio/gvisor.git
synced 2026-05-22 17:12:49 -07:00
03a28d158e
Now we don't need to trigger a second fault to figure out whether it was write or read access. Fixes #11008 Co-developed-by: Jamie Liu <jamieliu@google.com> PiperOrigin-RevId: 697677262
379 lines
14 KiB
Go
379 lines
14 KiB
Go
// Copyright 2020 The gVisor Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// Package sysmsg provides a stub signal handler and a communication protocol
|
|
// between stub threads and the Sentry.
|
|
//
|
|
// Note that this package is allowlisted for use of sync/atomic.
|
|
//
|
|
// +checkalignedignore
|
|
package sysmsg
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
"sync/atomic"
|
|
|
|
"golang.org/x/sys/unix"
|
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
|
"gvisor.dev/gvisor/pkg/hostarch"
|
|
"gvisor.dev/gvisor/pkg/sentry/platform"
|
|
)
|
|
|
|
// LINT.IfChange
|
|
// Per-thread stack layout:
|
|
//
|
|
// *------------*
|
|
// | guard page |
|
|
// |------------|
|
|
// | |
|
|
// | sysstack |
|
|
// | |
|
|
// *------------*
|
|
// | guard page |
|
|
// |------------|
|
|
// | |
|
|
// | ^ |
|
|
// | / \ |
|
|
// | | |
|
|
// | altstack |
|
|
// |------------|
|
|
// | sysmsg |
|
|
// *------------*
|
|
const (
|
|
// PerThreadMemSize is the size of a per-thread memory region.
|
|
PerThreadMemSize = 8 * hostarch.PageSize
|
|
// GuardSize is the size of an unmapped region which is placed right
|
|
// before the signal stack.
|
|
GuardSize = hostarch.PageSize
|
|
PerThreadPrivateStackOffset = GuardSize
|
|
PerThreadPrivateStackSize = 2 * hostarch.PageSize
|
|
// PerThreadStackSharedSize is the size of a per-thread stack region.
|
|
PerThreadSharedStackSize = 4 * hostarch.PageSize
|
|
PerThreadSharedStackOffset = 4 * hostarch.PageSize
|
|
// MsgOffsetFromStack is the offset of the Msg structure on
|
|
// the thread stack.
|
|
MsgOffsetFromSharedStack = PerThreadMemSize - hostarch.PageSize - PerThreadSharedStackOffset
|
|
|
|
// SpinningQueueMemSize is the size of a spinning queue memory region.
|
|
SpinningQueueMemSize = hostarch.PageSize
|
|
)
|
|
|
|
// StackAddrToMsg returns an address of a sysmsg structure.
|
|
func StackAddrToMsg(sp uintptr) uintptr {
|
|
return sp + MsgOffsetFromSharedStack
|
|
}
|
|
|
|
// StackAddrToSyshandlerStack returns an address of a syshandler stack.
|
|
func StackAddrToSyshandlerStack(sp uintptr) uintptr {
|
|
return sp + PerThreadPrivateStackOffset + PerThreadPrivateStackSize
|
|
}
|
|
|
|
// MsgToStackAddr returns a start address of a stack.
|
|
func MsgToStackAddr(msg uintptr) uintptr {
|
|
return msg - MsgOffsetFromSharedStack
|
|
}
|
|
|
|
// ThreadState is used to store a state of the sysmsg thread.
|
|
type ThreadState uint32
|
|
|
|
// Set atomicaly sets the state value.
|
|
func (s *ThreadState) Set(state ThreadState) {
|
|
atomic.StoreUint32((*uint32)(s), uint32(state))
|
|
}
|
|
|
|
// CompareAndSwap atomicaly compares and swaps the state value.
|
|
func (s *ThreadState) CompareAndSwap(old, state ThreadState) bool {
|
|
return atomic.CompareAndSwapUint32((*uint32)(s), uint32(old), uint32(state))
|
|
}
|
|
|
|
// Get returns the current state value.
|
|
//
|
|
//go:nosplit
|
|
func (s *ThreadState) Get() ThreadState {
|
|
return ThreadState(atomic.LoadUint32((*uint32)(s)))
|
|
}
|
|
|
|
const (
|
|
// ThreadStateNone means that the thread is executing the user workload.
|
|
ThreadStateNone ThreadState = iota
|
|
// ThreadStateDone means that last event has been handled and the stub thread
|
|
// can be resumed.
|
|
ThreadStateDone
|
|
// ThreadStatePrep means that syshandler started filling the sysmsg struct.
|
|
ThreadStatePrep
|
|
// ThreadStateAsleep means that this thread fell asleep because there was not
|
|
// enough contexts to process in the context queue.
|
|
ThreadStateAsleep
|
|
// ThreadStateInitializing is only set once at sysmsg thread creation time. It
|
|
// is used to tell the signal handler that the thread does not yet have a
|
|
// context.
|
|
ThreadStateInitializing
|
|
)
|
|
|
|
// Msg contains the current state of the sysmsg thread.
|
|
type Msg struct {
|
|
// The next batch of fields is used to call the syshandler stub
|
|
// function. A system call can be replaced with a function call. When
|
|
// a function call is executed, it can't change the current process
|
|
// stack, so it needs to save stack and instruction registers, switch
|
|
// on its syshandler stack and call the jmp instruction to the syshandler
|
|
// address.
|
|
//
|
|
// Self is a pointer to itself in a process address space.
|
|
Self uint64
|
|
// RetAddr is a return address from the syshandler function.
|
|
RetAddr uint64
|
|
// Syshandler is an address of the syshandler function.
|
|
Syshandler uint64
|
|
// SyshandlerStack is an address of the thread syshandler stack.
|
|
SyshandlerStack uint64
|
|
// AppStack is a value of the stack register before calling the syshandler
|
|
// function.
|
|
AppStack uint64
|
|
// interrupt is non-zero if there is a postponed interrupt.
|
|
interrupt uint32
|
|
// State indicates to the sentry what the sysmsg thread is doing at a given
|
|
// moment.
|
|
State ThreadState
|
|
// Context is a pointer to the ThreadContext struct that the current sysmsg
|
|
// thread is processing.
|
|
Context uint64
|
|
|
|
// FaultJump is the size of a faulted instruction.
|
|
FaultJump int32
|
|
// Err is the error value with which the {sig|sys}handler crashes the stub
|
|
// thread (see sysmsg.h:__panic).
|
|
Err int32
|
|
// ErrAdditional is an error value that gives additional information
|
|
// about the panic.
|
|
ErrAdditional int32
|
|
// Line is the code line on which the {sig|sys}handler crashed the stub thread
|
|
// (see sysmsg.h:panic).
|
|
Line int32
|
|
// Debug is a variable to use to get visibility into the stub from the sentry.
|
|
Debug uint64
|
|
// ThreadID is the ID of the sysmsg thread.
|
|
ThreadID uint32
|
|
}
|
|
|
|
// ContextState defines the reason the context has exited back to the sentry,
|
|
// or ContextStateNone if running/ready-to-run.
|
|
type ContextState uint32
|
|
|
|
// Set atomicaly sets the state value.
|
|
func (s *ContextState) Set(state ContextState) {
|
|
atomic.StoreUint32((*uint32)(s), uint32(state))
|
|
}
|
|
|
|
// Get returns the current state value.
|
|
//
|
|
//go:nosplit
|
|
func (s *ContextState) Get() ContextState {
|
|
return ContextState(atomic.LoadUint32((*uint32)(s)))
|
|
}
|
|
|
|
// Context State types.
|
|
const (
|
|
// ContextStateNone means that is either running in the user task or is ready
|
|
// to run in the user task.
|
|
ContextStateNone ContextState = iota
|
|
// ContextStateSyscall means that a syscall event is triggered from the
|
|
// sighandler.
|
|
ContextStateSyscall
|
|
// ContextStateFault means that there is a fault event that needs to be
|
|
// handled.
|
|
ContextStateFault
|
|
// ContextStateSyscallTrap means that a syscall event is triggered from
|
|
// a function call (syshandler).
|
|
ContextStateSyscallTrap
|
|
// ContextStateSyscallCanBePatched means that the syscall can be replaced
|
|
// with a function call.
|
|
ContextStateSyscallCanBePatched
|
|
// ContextStateInvalid is an invalid state that the sentry should never see.
|
|
ContextStateInvalid
|
|
)
|
|
|
|
const (
|
|
// MaxFPStateLen is the largest possible FPState that we will save.
|
|
// Note: This value was chosen to be able to fit ThreadContext into one page.
|
|
MaxFPStateLen uint32 = 3584
|
|
|
|
// AllocatedSizeofThreadContextStruct defines how much memory to allocate for
|
|
// one instance of ThreadContext.
|
|
// We over allocate the memory for it because:
|
|
// - The next instances needs to align to 64 bytes for purposes of xsave.
|
|
// - It's nice to align it to the page boundary.
|
|
AllocatedSizeofThreadContextStruct uintptr = 4096
|
|
)
|
|
|
|
// ThreadContext contains the current context of the sysmsg thread. The struct
|
|
// facilitates switching contexts by allowing the sentry to switch pointers to
|
|
// this struct as it needs to.
|
|
type ThreadContext struct {
|
|
// FPState is a region of memory where:
|
|
// - syshandler saves FPU state to using xsave/fxsave
|
|
// - sighandler copies FPU state to from ucontext->uc_mcontext.fpregs
|
|
// Note that xsave requires this region of memory to be 64 byte aligned;
|
|
// therefore allocations of ThreadContext must be too.
|
|
FPState [MaxFPStateLen]byte
|
|
// FPStateChanged is set to true when the stub thread needs to restore FPState
|
|
// because the sentry changed it.
|
|
FPStateChanged uint64
|
|
// Regs is the context's GP register set. The {sig|sys}handler will save and
|
|
// restore the user app's registers here.
|
|
Regs linux.PtraceRegs
|
|
|
|
// SignalInfo is the siginfo struct.
|
|
SignalInfo linux.SignalInfo
|
|
// Signo is the signal that the stub is requesting the sentry to handle.
|
|
Signo int64
|
|
// State indicates the reason why the context has exited back to the sentry.
|
|
State ContextState
|
|
// Interrupt is set to indicate that this context has been interrupted.
|
|
Interrupt uint32
|
|
// ThreadID is the ID of the sysmsg thread that's currently working on the
|
|
// context.
|
|
ThreadID uint32
|
|
// LastThreadID is the ID of the previous sysmsg thread that ran the context
|
|
// (not the one currently working on it). This field is used by sysmsg threads
|
|
// to detect whether fpstate may have changed since the last time they ran a
|
|
// context.
|
|
LastThreadID uint32
|
|
// SentryFastPath is used to indicate to the stub thread that the sentry
|
|
// goroutine used for this thread context is busy-polling for a response
|
|
// instead of using FUTEX_WAIT.
|
|
SentryFastPath uint32
|
|
// AckedTime is used by sysmsg threads to signal to the sentry that this context
|
|
// has been picked up from the context queue and is actively being worked on.
|
|
// The stub thread puts down the timestamp at which it has started processing
|
|
// this context.
|
|
AckedTime uint64
|
|
// StateChangedTime is the time when the ThreadContext.State changed, as
|
|
// recorded by the stub thread when it gave it back to the sentry
|
|
// (the sentry does not populate this field except to reset it).
|
|
StateChangedTime uint64
|
|
// TLS is a pointer to a thread local storage.
|
|
// It is is only populated on ARM64.
|
|
TLS uint64
|
|
// Debug is a variable to use to get visibility into the stub from the sentry.
|
|
Debug uint64
|
|
// SigError is an error code that clarifies the nature of the signal.
|
|
SigError uint64
|
|
}
|
|
|
|
// StubError are values that represent known stub-thread failure modes.
|
|
// Since these errors originate from the stub threads, look at
|
|
// sysmsg.h:stub_error.
|
|
type StubError int32
|
|
|
|
const (
|
|
// StubErrorBadSysmsg indicates sysmsg->self did not match sysmsg.
|
|
StubErrorBadSysmsg StubError = 0x0bad0000 + iota
|
|
// StubErrorBadThreadState indicates sysmsg->state was invalid.
|
|
StubErrorBadThreadState
|
|
// StubErrorBadSpinningQueueDecref indicates stubs removed more threads
|
|
// from spinning queue than were put in.
|
|
StubErrorBadSpinningQueueDecref
|
|
// StubErrorArchPrctl indicates an error when calling arch_prctl.
|
|
StubErrorArchPrctl
|
|
// StubErrorFutex indicates an error when calling futex.
|
|
StubErrorFutex
|
|
// StubErrorBadContextID indicates a context received from the context
|
|
// queue was of unexpected value.
|
|
StubErrorBadContextID
|
|
// StubErrorFpStateBadHeader indicates that the floating point state
|
|
// header did not match the expected value.
|
|
StubErrorFpStateBadHeader
|
|
)
|
|
|
|
// LINT.ThenChange(sysmsg.h)
|
|
|
|
// Init initializes the message.
|
|
func (m *Msg) Init(threadID uint32) {
|
|
m.Err = 0
|
|
m.ErrAdditional = 0
|
|
m.Line = -1
|
|
m.ThreadID = threadID
|
|
m.Context = 0
|
|
}
|
|
|
|
// Init initializes the ThreadContext instance.
|
|
func (c *ThreadContext) Init(initialThreadID uint32) {
|
|
c.FPStateChanged = 1
|
|
c.Regs = linux.PtraceRegs{}
|
|
c.Signo = 0
|
|
c.SignalInfo = linux.SignalInfo{}
|
|
c.State = ContextStateNone
|
|
c.ThreadID = initialThreadID
|
|
}
|
|
|
|
// ConvertSysmsgErr converts m.Err to platform.ContextError.
|
|
func (m *Msg) ConvertSysmsgErr() *platform.ContextError {
|
|
err := &platform.ContextError{
|
|
Errno: unix.EPERM,
|
|
}
|
|
|
|
const prefix = "systrap stub thread failure:"
|
|
suffix := fmt.Sprintf("(failed on line %d; %s)", atomic.LoadInt32(&m.Line), m.String())
|
|
switch StubError(atomic.LoadInt32(&m.Err)) {
|
|
case StubErrorBadSysmsg:
|
|
err.Err = fmt.Errorf("%s sysmsg->self did not match sysmsg during sig/sys-handler %s", prefix, suffix)
|
|
case StubErrorBadThreadState:
|
|
err.Err = fmt.Errorf("%s sysmsg->state was invalid during sys-handler %s", prefix, suffix)
|
|
case StubErrorBadSpinningQueueDecref:
|
|
err.Err = fmt.Errorf("%s imbalanced use of spinning queue %s", prefix, suffix)
|
|
case StubErrorArchPrctl:
|
|
err.Err = fmt.Errorf("%s arch_prctl error=0x%x %s", prefix, atomic.LoadInt32(&m.ErrAdditional), suffix)
|
|
case StubErrorFutex:
|
|
err.Err = fmt.Errorf("%s futex error=0x%x %s", prefix, atomic.LoadInt32(&m.ErrAdditional), suffix)
|
|
case StubErrorBadContextID:
|
|
err.Err = fmt.Errorf("%s unexpected context ID (%d) from context queue %s", prefix, atomic.LoadInt32(&m.ErrAdditional), suffix)
|
|
case StubErrorFpStateBadHeader:
|
|
err.Err = fmt.Errorf("%s FP state context magic header (%d) does not match expected FPSIMD_MAGIC %s", prefix, atomic.LoadInt32(&m.ErrAdditional), suffix)
|
|
default:
|
|
err.Err = fmt.Errorf("%s unknown reason (0x%x) (possible shared memory corruption) %s", prefix, atomic.LoadInt32(&m.Err), suffix)
|
|
}
|
|
|
|
return err
|
|
}
|
|
|
|
func (m *Msg) String() string {
|
|
var b strings.Builder
|
|
fmt.Fprintf(&b, "sysmsg.Msg{msg: %x state %d", m.Self, m.State)
|
|
fmt.Fprintf(&b, " err %x line %d debug %x", m.Err, m.Line, m.Debug)
|
|
fmt.Fprintf(&b, " app stack %x", m.AppStack)
|
|
fmt.Fprintf(&b, " context %x", m.Context)
|
|
fmt.Fprintf(&b, " ThreadID %d", m.ThreadID)
|
|
b.WriteString("}")
|
|
|
|
return b.String()
|
|
}
|
|
|
|
func (c *ThreadContext) String() string {
|
|
var b strings.Builder
|
|
fmt.Fprintf(&b, "sysmsg.ThreadContext{state %d", c.State.Get())
|
|
fmt.Fprintf(&b, " fault addr %x syscall %d", c.SignalInfo.Addr(), c.SignalInfo.Syscall())
|
|
fmt.Fprintf(&b, " ip %x sp %x", c.Regs.InstructionPointer(), c.Regs.StackPointer())
|
|
fmt.Fprintf(&b, " FPStateChanged %d Regs %+v", c.FPStateChanged, c.Regs)
|
|
fmt.Fprintf(&b, " Interrupt %d", c.Interrupt)
|
|
fmt.Fprintf(&b, " ThreadID %d LastThreadID %d", c.ThreadID, c.LastThreadID)
|
|
fmt.Fprintf(&b, " SentryFastPath %d Acked %d", c.SentryFastPath, c.AckedTime)
|
|
fmt.Fprintf(&b, " signo: %d, siginfo: %+v", c.Signo, c.SignalInfo)
|
|
fmt.Fprintf(&b, " debug %d", atomic.LoadUint64(&c.Debug))
|
|
b.WriteString("}")
|
|
|
|
return b.String()
|
|
}
|