mirror of
https://github.com/netbirdio/gvisor.git
synced 2026-05-22 17:12:49 -07:00
8d5f3c982a
Call dumpAndPanicSyscallError for the rare case where we fail to kill the Sentry upon detecting an unexpected stub exit. This will provide enough information to determine whether a panic occurred due to a failed SIGKILL attempt or an unexpected event. PiperOrigin-RevId: 737751257
1205 lines
38 KiB
Go
1205 lines
38 KiB
Go
// Copyright 2018 The gVisor Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package systrap
|
|
|
|
import (
|
|
"fmt"
|
|
"runtime"
|
|
"sync"
|
|
"sync/atomic"
|
|
|
|
"golang.org/x/sys/unix"
|
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
|
"gvisor.dev/gvisor/pkg/atomicbitops"
|
|
"gvisor.dev/gvisor/pkg/hostarch"
|
|
"gvisor.dev/gvisor/pkg/hostsyscall"
|
|
"gvisor.dev/gvisor/pkg/log"
|
|
"gvisor.dev/gvisor/pkg/pool"
|
|
"gvisor.dev/gvisor/pkg/seccomp"
|
|
"gvisor.dev/gvisor/pkg/sentry/arch"
|
|
"gvisor.dev/gvisor/pkg/sentry/memmap"
|
|
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
|
|
"gvisor.dev/gvisor/pkg/sentry/platform"
|
|
"gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg"
|
|
"gvisor.dev/gvisor/pkg/sentry/platform/systrap/usertrap"
|
|
"gvisor.dev/gvisor/pkg/sentry/usage"
|
|
"gvisor.dev/gvisor/pkg/sighandling"
|
|
)
|
|
|
|
var (
|
|
// globalPool tracks all subprocesses in various state: active or available for
|
|
// reuse.
|
|
globalPool = subprocessPool{}
|
|
|
|
// maximumUserAddress is the largest possible user address.
|
|
maximumUserAddress = linux.TaskSize
|
|
|
|
// stubInitAddress is the initial attempt link address for the stub.
|
|
stubInitAddress = linux.TaskSize
|
|
|
|
// maxRandomOffsetOfStubAddress is the maximum offset for randomizing a
|
|
// stub address. It is set to the default value of mm.mmap_rnd_bits.
|
|
//
|
|
// Note: Tools like ThreadSanitizer don't like when the memory layout
|
|
// is changed significantly.
|
|
maxRandomOffsetOfStubAddress = (linux.TaskSize >> 7) & ^(uintptr(hostarch.PageSize) - 1)
|
|
|
|
// maxStubUserAddress is the largest possible user address for
|
|
// processes running inside gVisor. It is fixed because
|
|
// * we don't want to reveal a stub address.
|
|
// * it has to be the same across checkpoint/restore.
|
|
maxStubUserAddress = maximumUserAddress - maxRandomOffsetOfStubAddress
|
|
)
|
|
|
|
// Linux kernel errnos which "should never be seen by user programs", but will
|
|
// be revealed to ptrace syscall exit tracing.
|
|
//
|
|
// These constants are only used in subprocess.go.
|
|
const (
|
|
ERESTARTSYS = unix.Errno(512)
|
|
ERESTARTNOINTR = unix.Errno(513)
|
|
ERESTARTNOHAND = unix.Errno(514)
|
|
)
|
|
|
|
// thread is a traced thread; it is a thread identifier.
|
|
//
|
|
// This is a convenience type for defining ptrace operations.
|
|
type thread struct {
|
|
tgid int32
|
|
tid int32
|
|
|
|
// sysmsgStackID is a stack ID in subprocess.sysmsgStackPool.
|
|
sysmsgStackID uint64
|
|
|
|
// initRegs are the initial registers for the first thread.
|
|
//
|
|
// These are used for the register set for system calls.
|
|
initRegs arch.Registers
|
|
|
|
logPrefix atomic.Pointer[string]
|
|
}
|
|
|
|
// requestThread is used to request a new sysmsg thread. A thread identifier will
|
|
// be sent into the thread channel.
|
|
type requestThread struct {
|
|
thread chan *thread
|
|
}
|
|
|
|
// requestStub is used to request a new stub process.
|
|
type requestStub struct {
|
|
done chan *thread
|
|
}
|
|
|
|
var (
	// maxSysmsgThreads caps the number of sysmsg threads a subprocess may
	// create. It is derived from GOMAXPROCS and assigned only once, so it
	// must be set after GOMAXPROCS has been adjusted (see
	// loader.go:Args.NumCPU).
	maxSysmsgThreads int

	// maxChildThreads caps the total number of child system threads a
	// subprocess may create; sysmsg threads count towards this limit.
	maxChildThreads int
)
|
|
|
|
const (
	// maxGuestContexts is the largest number of task contexts a single
	// subprocess can handle.
	maxGuestContexts = 4095

	// invalidContextID is a sentinel denoting an invalid context ID.
	invalidContextID uint32 = 0xfefefefe

	// invalidThreadID is a sentinel meaning that no sysmsg thread is
	// currently working on a context.
	invalidThreadID uint32 = 0xfefefefe
)
|
|
|
|
// subprocess is a collection of threads being traced.
|
|
type subprocess struct {
|
|
platform.NoAddressSpaceIO
|
|
subprocessRefs
|
|
|
|
// requests is used to signal creation of new threads.
|
|
requests chan any
|
|
|
|
// sysmsgInitRegs is used to reset sysemu regs.
|
|
sysmsgInitRegs arch.Registers
|
|
|
|
// mu protects the following fields.
|
|
mu sync.Mutex
|
|
|
|
// faultedContexts is the set of contexts for which it's possible that
|
|
// platformContext.lastFaultSP == this subprocess.
|
|
faultedContexts map[*platformContext]struct{}
|
|
|
|
// sysmsgStackPool is a pool of available sysmsg stacks.
|
|
sysmsgStackPool pool.Pool
|
|
|
|
// threadContextPool is a pool of available sysmsg.ThreadContext IDs.
|
|
threadContextPool pool.Pool
|
|
|
|
// threadContextRegion defines the ThreadContext memory region start
|
|
// within the sentry address space.
|
|
threadContextRegion uintptr
|
|
|
|
// memoryFile is used to allocate a sysmsg stack which is shared between a
|
|
// stub process and the Sentry.
|
|
memoryFile *pgalloc.MemoryFile
|
|
|
|
// usertrap is the state of the usertrap table which contains syscall
|
|
// trampolines.
|
|
usertrap *usertrap.State
|
|
|
|
syscallThreadMu sync.Mutex
|
|
syscallThread *syscallThread
|
|
|
|
// sysmsgThreadsMu protects sysmsgThreads and numSysmsgThreads
|
|
sysmsgThreadsMu sync.Mutex
|
|
// sysmsgThreads is a collection of all active sysmsg threads in the
|
|
// subprocess.
|
|
sysmsgThreads map[uint32]*sysmsgThread
|
|
// numSysmsgThreads counts the number of active sysmsg threads; we use a
|
|
// counter instead of using len(sysmsgThreads) because we need to synchronize
|
|
// how many threads get created _before_ the creation happens.
|
|
numSysmsgThreads int
|
|
|
|
// contextQueue is a queue of all contexts that are ready to switch back to
|
|
// user mode.
|
|
contextQueue *contextQueue
|
|
|
|
// dead indicates whether the subprocess is alive or not.
|
|
dead atomicbitops.Bool
|
|
}
|
|
|
|
// seccompNotifyIsSupported starts out false.
// NOTE(review): nothing in the visible part of this file sets it to true.
var seccompNotifyIsSupported bool
|
|
|
|
func initSeccompNotify() {
|
|
errno := hostsyscall.RawSyscallErrno(seccomp.SYS_SECCOMP, linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_NEW_LISTENER, 0)
|
|
switch errno {
|
|
case unix.EFAULT:
|
|
// seccomp unotify is supported.
|
|
case unix.EINVAL:
|
|
log.Warningf("Seccomp user-space notification mechanism isn't " +
|
|
"supported by the kernel (available since Linux 5.0).")
|
|
default:
|
|
panic(fmt.Sprintf("seccomp returns unexpected code: %d", errno))
|
|
}
|
|
}
|
|
|
|
func (s *subprocess) initSyscallThread(ptraceThread *thread, seccompNotify bool) error {
|
|
s.syscallThreadMu.Lock()
|
|
defer s.syscallThreadMu.Unlock()
|
|
|
|
id, ok := s.sysmsgStackPool.Get()
|
|
if !ok {
|
|
panic("unable to allocate a sysmsg stub thread")
|
|
}
|
|
|
|
ptraceThread.sysmsgStackID = id
|
|
t := syscallThread{
|
|
subproc: s,
|
|
thread: ptraceThread,
|
|
}
|
|
|
|
if err := t.init(seccompNotify); err != nil {
|
|
panic(fmt.Sprintf("failed to create a syscall thread"))
|
|
}
|
|
s.syscallThread = &t
|
|
|
|
s.syscallThread.detach()
|
|
|
|
return nil
|
|
}
|
|
|
|
func handlePtraceSyscallRequestError(req any, format string, values ...any) {
|
|
switch req.(type) {
|
|
case requestThread:
|
|
req.(requestThread).thread <- nil
|
|
case requestStub:
|
|
req.(requestStub).done <- nil
|
|
}
|
|
log.Warningf("handlePtraceSyscallRequest failed: "+format, values...)
|
|
}
|
|
|
|
// handlePtraceSyscallRequest executes system calls that can't be run via
|
|
// syscallThread without using ptrace. Look at the description of syscallThread
|
|
// to get more details about its limitations.
|
|
func (s *subprocess) handlePtraceSyscallRequest(req any) {
|
|
s.syscallThreadMu.Lock()
|
|
defer s.syscallThreadMu.Unlock()
|
|
runtime.LockOSThread()
|
|
defer runtime.UnlockOSThread()
|
|
if err := s.syscallThread.attach(); err != nil {
|
|
handlePtraceSyscallRequestError(req, err.Error())
|
|
return
|
|
}
|
|
defer s.syscallThread.detach()
|
|
|
|
ptraceThread := s.syscallThread.thread
|
|
|
|
switch r := req.(type) {
|
|
case requestThread:
|
|
t, err := ptraceThread.clone()
|
|
if err != nil {
|
|
handlePtraceSyscallRequestError(req, "error initializing thread: %v", err)
|
|
return
|
|
}
|
|
|
|
// Since the new thread was created with
|
|
// clone(CLONE_PTRACE), it will begin execution with
|
|
// SIGSTOP pending and with this thread as its tracer.
|
|
// (Hopefully nobody tgkilled it with a signal <
|
|
// SIGSTOP before the SIGSTOP was delivered, in which
|
|
// case that signal would be delivered before SIGSTOP.)
|
|
if sig := t.wait(stopped); sig != unix.SIGSTOP {
|
|
handlePtraceSyscallRequestError(req, "error waiting for new clone: expected SIGSTOP, got %v", sig)
|
|
return
|
|
}
|
|
|
|
t.initRegs = ptraceThread.initRegs
|
|
// Set the parent death signal to SIGKILL.
|
|
_, err = t.syscallIgnoreInterrupt(&t.initRegs, unix.SYS_PRCTL,
|
|
arch.SyscallArgument{Value: linux.PR_SET_PDEATHSIG},
|
|
arch.SyscallArgument{Value: uintptr(unix.SIGKILL)},
|
|
arch.SyscallArgument{Value: 0},
|
|
arch.SyscallArgument{Value: 0},
|
|
arch.SyscallArgument{Value: 0},
|
|
arch.SyscallArgument{Value: 0},
|
|
)
|
|
if err != nil {
|
|
handlePtraceSyscallRequestError(req, "prctl: %v", err)
|
|
return
|
|
}
|
|
|
|
id, ok := s.sysmsgStackPool.Get()
|
|
if !ok {
|
|
handlePtraceSyscallRequestError(req, "unable to allocate a sysmsg stub thread")
|
|
return
|
|
}
|
|
t.sysmsgStackID = id
|
|
|
|
if e := hostsyscall.RawSyscallErrno(unix.SYS_TGKILL, uintptr(t.tgid), uintptr(t.tid), uintptr(unix.SIGSTOP)); e != 0 {
|
|
handlePtraceSyscallRequestError(req, "tkill failed: %v", e)
|
|
return
|
|
}
|
|
|
|
// Detach the thread.
|
|
t.detach()
|
|
|
|
// Return the thread.
|
|
r.thread <- t
|
|
case requestStub:
|
|
t, err := ptraceThread.createStub()
|
|
if err != nil {
|
|
handlePtraceSyscallRequestError(req, "unable to create a stub process: %v", err)
|
|
return
|
|
}
|
|
r.done <- t
|
|
|
|
}
|
|
}
|
|
|
|
// newSubprocess returns a usable subprocess.
|
|
//
|
|
// This will either be a newly created subprocess, or one from the global pool.
|
|
// The create function will be called in the latter case, which is guaranteed
|
|
// to happen with the runtime thread locked.
|
|
//
|
|
// seccompNotify indicates a ways of comunications with syscall threads.
|
|
// If it is false, futex-s are used. Otherwise, seccomp-unotify is used.
|
|
// seccomp-unotify can't be used for the source pool process, because it is a
|
|
// parent of all other stub processes, but only one filter can be installed
|
|
// with SECCOMP_FILTER_FLAG_NEW_LISTENER.
|
|
func newSubprocess(create func() (*thread, error), memoryFile *pgalloc.MemoryFile, seccompNotify bool) (*subprocess, error) {
|
|
if sp := globalPool.fetchAvailable(); sp != nil {
|
|
sp.subprocessRefs.InitRefs()
|
|
sp.usertrap = usertrap.New()
|
|
return sp, nil
|
|
}
|
|
|
|
// The following goroutine is responsible for creating the first traced
|
|
// thread, and responding to requests to make additional threads in the
|
|
// traced process. The process will be killed and reaped when the
|
|
// request channel is closed, which happens in Release below.
|
|
requests := make(chan any)
|
|
|
|
// Ready.
|
|
sp := &subprocess{
|
|
requests: requests,
|
|
faultedContexts: make(map[*platformContext]struct{}),
|
|
sysmsgStackPool: pool.Pool{Start: 0, Limit: uint64(maxChildThreads)},
|
|
threadContextPool: pool.Pool{Start: 0, Limit: maxGuestContexts},
|
|
memoryFile: memoryFile,
|
|
sysmsgThreads: make(map[uint32]*sysmsgThread),
|
|
}
|
|
sp.subprocessRefs.InitRefs()
|
|
runtime.LockOSThread()
|
|
defer runtime.UnlockOSThread()
|
|
|
|
// Initialize the syscall thread.
|
|
ptraceThread, err := create()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
sp.sysmsgInitRegs = ptraceThread.initRegs
|
|
|
|
if err := sp.initSyscallThread(ptraceThread, seccompNotify); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
go func() { // S/R-SAFE: Platform-related.
|
|
|
|
// Wait for requests to create threads.
|
|
for req := range requests {
|
|
sp.handlePtraceSyscallRequest(req)
|
|
}
|
|
|
|
// Requests should never be closed.
|
|
panic("unreachable")
|
|
}()
|
|
|
|
sp.unmap()
|
|
sp.usertrap = usertrap.New()
|
|
sp.mapSharedRegions()
|
|
sp.mapPrivateRegions()
|
|
|
|
// The main stub doesn't need sysmsg threads.
|
|
if seccompNotify {
|
|
// Create the initial sysmsg thread.
|
|
atomic.AddUint32(&sp.contextQueue.numThreadsToWakeup, 1)
|
|
if err := sp.createSysmsgThread(); err != nil {
|
|
return nil, err
|
|
}
|
|
sp.numSysmsgThreads++
|
|
}
|
|
|
|
return sp, nil
|
|
}
|
|
|
|
// mapSharedRegions maps the shared regions that are used between the subprocess
|
|
// and ALL of the subsequently created sysmsg threads into both the sentry and
|
|
// the syscall thread.
|
|
//
|
|
// Should be called before any sysmsg threads are created.
|
|
// Initializes s.contextQueue and s.threadContextRegion.
|
|
func (s *subprocess) mapSharedRegions() {
|
|
if s.contextQueue != nil || s.threadContextRegion != 0 {
|
|
panic("contextQueue or threadContextRegion was already initialized")
|
|
}
|
|
|
|
opts := pgalloc.AllocOpts{
|
|
Kind: usage.System,
|
|
Dir: pgalloc.TopDown,
|
|
}
|
|
|
|
// Map shared regions into the sentry.
|
|
contextQueueFR, contextQueue := mmapContextQueueForSentry(s.memoryFile, opts)
|
|
contextQueue.init()
|
|
|
|
// Map thread context region into the syscall thread.
|
|
_, err := s.syscallThread.syscall(
|
|
unix.SYS_MMAP,
|
|
arch.SyscallArgument{Value: uintptr(stubContextQueueRegion)},
|
|
arch.SyscallArgument{Value: uintptr(contextQueueFR.Length())},
|
|
arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
|
|
arch.SyscallArgument{Value: uintptr(unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED)},
|
|
arch.SyscallArgument{Value: uintptr(s.memoryFile.FD())},
|
|
arch.SyscallArgument{Value: uintptr(contextQueueFR.Start)})
|
|
if err != nil {
|
|
panic(fmt.Sprintf("failed to mmap context queue region into syscall thread: %v", err))
|
|
}
|
|
|
|
s.contextQueue = contextQueue
|
|
|
|
// Map thread context region into the sentry.
|
|
threadContextFR, err := s.memoryFile.Allocate(uint64(stubContextRegionLen), opts)
|
|
if err != nil {
|
|
panic(fmt.Sprintf("failed to allocate a new subprocess context memory region"))
|
|
}
|
|
sentryThreadContextRegionAddr, errno := hostsyscall.RawSyscall6(
|
|
unix.SYS_MMAP,
|
|
0,
|
|
uintptr(threadContextFR.Length()),
|
|
unix.PROT_WRITE|unix.PROT_READ,
|
|
unix.MAP_SHARED|unix.MAP_FILE,
|
|
uintptr(s.memoryFile.FD()), uintptr(threadContextFR.Start))
|
|
if errno != 0 {
|
|
panic(fmt.Sprintf("mmap failed for subprocess context memory region: %v", errno))
|
|
}
|
|
|
|
// Map thread context region into the syscall thread.
|
|
if _, err := s.syscallThread.syscall(
|
|
unix.SYS_MMAP,
|
|
arch.SyscallArgument{Value: uintptr(stubContextRegion)},
|
|
arch.SyscallArgument{Value: uintptr(threadContextFR.Length())},
|
|
arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
|
|
arch.SyscallArgument{Value: uintptr(unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED)},
|
|
arch.SyscallArgument{Value: uintptr(s.memoryFile.FD())},
|
|
arch.SyscallArgument{Value: uintptr(threadContextFR.Start)}); err != nil {
|
|
panic(fmt.Sprintf("failed to mmap context queue region into syscall thread: %v", err))
|
|
}
|
|
|
|
s.threadContextRegion = sentryThreadContextRegionAddr
|
|
}
|
|
|
|
func (s *subprocess) mapPrivateRegions() {
|
|
_, err := s.syscallThread.syscall(
|
|
unix.SYS_MMAP,
|
|
arch.SyscallArgument{Value: uintptr(stubSpinningThreadQueueAddr)},
|
|
arch.SyscallArgument{Value: uintptr(sysmsg.SpinningQueueMemSize)},
|
|
arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
|
|
arch.SyscallArgument{Value: uintptr(unix.MAP_PRIVATE | unix.MAP_ANONYMOUS | unix.MAP_FIXED)},
|
|
arch.SyscallArgument{Value: 0},
|
|
arch.SyscallArgument{Value: 0})
|
|
if err != nil {
|
|
panic(fmt.Sprintf("failed to mmap spinning queue region into syscall thread: %v", err))
|
|
}
|
|
}
|
|
|
|
// unmap unmaps non-stub regions of the process.
|
|
//
|
|
// This will panic on failure (which should never happen).
|
|
func (s *subprocess) unmap() {
|
|
s.Unmap(0, uint64(stubStart))
|
|
if maximumUserAddress != stubEnd {
|
|
s.Unmap(hostarch.Addr(stubEnd), uint64(maximumUserAddress-stubEnd))
|
|
}
|
|
}
|
|
|
|
// Release kills the subprocess.
|
|
//
|
|
// Just kidding! We can't safely coordinate the detaching of all the
|
|
// tracees (since the tracers are random runtime threads, and the process
|
|
// won't exit until tracers have been notifier).
|
|
//
|
|
// Therefore we simply unmap everything in the subprocess and return it to the
|
|
// globalPool. This has the added benefit of reducing creation time for new
|
|
// subprocesses.
|
|
func (s *subprocess) Release() {
|
|
if !s.alive() {
|
|
return
|
|
}
|
|
s.unmap()
|
|
s.DecRef(s.release)
|
|
}
|
|
|
|
// release returns the subprocess to the global pool.
|
|
func (s *subprocess) release() {
|
|
if s.alive() {
|
|
globalPool.markAvailable(s)
|
|
return
|
|
}
|
|
if s.syscallThread != nil && s.syscallThread.seccompNotify != nil {
|
|
s.syscallThread.seccompNotify.Close()
|
|
}
|
|
}
|
|
|
|
// attach attaches to the thread.
|
|
func (t *thread) attach() error {
|
|
if errno := hostsyscall.RawSyscallErrno(unix.SYS_PTRACE, unix.PTRACE_ATTACH, uintptr(t.tid), 0); errno != 0 {
|
|
return fmt.Errorf("unable to attach: %v", errno)
|
|
}
|
|
|
|
// PTRACE_ATTACH sends SIGSTOP, and wakes the tracee if it was already
|
|
// stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of
|
|
// newSubprocess), so we always expect to see signal-delivery-stop with
|
|
// SIGSTOP.
|
|
if sig := t.wait(stopped); sig != unix.SIGSTOP {
|
|
return fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
|
|
}
|
|
|
|
// Initialize options.
|
|
t.init()
|
|
return nil
|
|
}
|
|
|
|
func (t *thread) grabInitRegs() {
|
|
// Grab registers.
|
|
//
|
|
// Note that we adjust the current register RIP value to be just before
|
|
// the current system call executed. This depends on the definition of
|
|
// the stub itself.
|
|
if err := t.getRegs(&t.initRegs); err != nil {
|
|
panic(fmt.Sprintf("ptrace get regs failed: %v", err))
|
|
}
|
|
t.adjustInitRegsRip()
|
|
t.initRegs.SetStackPointer(0)
|
|
}
|
|
|
|
// detach detaches from the thread.
|
|
//
|
|
// Because the SIGSTOP is not suppressed, the thread will enter group-stop.
|
|
func (t *thread) detach() {
|
|
if errno := hostsyscall.RawSyscallErrno6(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(unix.SIGSTOP), 0, 0); errno != 0 {
|
|
panic(fmt.Sprintf("can't detach new clone: %v", errno))
|
|
}
|
|
}
|
|
|
|
// waitOutcome tells (*thread).wait which kind of status change to expect.
type waitOutcome int

const (
	// stopped means the process is expected to have been stopped.
	stopped waitOutcome = iota

	// killed means the process is expected to have exited or been killed.
	killed
)
|
|
|
|
func (t *thread) loadLogPrefix() *string {
|
|
p := t.logPrefix.Load()
|
|
if p == nil {
|
|
prefix := fmt.Sprintf("[% 4d:% 4d] ", t.tgid, t.tid)
|
|
t.logPrefix.Store(&prefix)
|
|
p = &prefix
|
|
}
|
|
return p
|
|
}
|
|
|
|
// Debugf logs with the debugging severity.
|
|
func (t *thread) Debugf(format string, v ...any) {
|
|
if log.IsLogging(log.Debug) {
|
|
log.DebugfAtDepth(1, *t.loadLogPrefix()+format, v...)
|
|
}
|
|
}
|
|
|
|
// Warningf logs with the warning severity.
|
|
func (t *thread) Warningf(format string, v ...any) {
|
|
if log.IsLogging(log.Warning) {
|
|
log.WarningfAtDepth(1, *t.loadLogPrefix()+format, v...)
|
|
}
|
|
}
|
|
|
|
func (t *thread) dumpAndPanic(message string) {
|
|
var regs arch.Registers
|
|
message += "\n"
|
|
if err := t.getRegs(®s); err == nil {
|
|
message += dumpRegs(®s)
|
|
} else {
|
|
log.Warningf("unable to get registers: %v", err)
|
|
}
|
|
message += fmt.Sprintf("stubStart\t = %016x\n", stubStart)
|
|
panic(message)
|
|
}
|
|
|
|
func (t *thread) dumpRegs(message string) {
|
|
var regs arch.Registers
|
|
message += "\n"
|
|
if err := t.getRegs(®s); err == nil {
|
|
message += dumpRegs(®s)
|
|
} else {
|
|
log.Warningf("unable to get registers: %v", err)
|
|
}
|
|
log.Infof("%s", message)
|
|
}
|
|
|
|
func (t *thread) unexpectedStubExit() {
|
|
msg, err := t.getEventMessage()
|
|
status := unix.WaitStatus(msg)
|
|
if status.Signaled() && status.Signal() == unix.SIGKILL {
|
|
// SIGKILL can be only sent by a user or OOM-killer. In both
|
|
// these cases, we don't need to panic. There is no reasons to
|
|
// think that something wrong in gVisor.
|
|
log.Warningf("The ptrace stub process %v has been killed by SIGKILL.", t.tgid)
|
|
err := sighandling.KillItself()
|
|
if err != nil {
|
|
t.dumpAndPanic(fmt.Sprintf(
|
|
"failed to kill the process %d:%d: %v", t.tgid, t.tid, err))
|
|
}
|
|
}
|
|
t.dumpAndPanic(fmt.Sprintf("wait failed: the process %d:%d exited: %x (err %v)", t.tgid, t.tid, msg, err))
|
|
}
|
|
|
|
// wait waits for a stop event.
|
|
//
|
|
// Precondition: outcome is a valid waitOutcome.
|
|
func (t *thread) wait(outcome waitOutcome) unix.Signal {
|
|
var status unix.WaitStatus
|
|
|
|
for {
|
|
r, err := unix.Wait4(int(t.tid), &status, unix.WALL|unix.WUNTRACED, nil)
|
|
if err == unix.EINTR || err == unix.EAGAIN {
|
|
// Wait was interrupted; wait again.
|
|
continue
|
|
} else if err != nil {
|
|
panic(fmt.Sprintf("ptrace wait failed: %v", err))
|
|
}
|
|
if int(r) != int(t.tid) {
|
|
panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid))
|
|
}
|
|
switch outcome {
|
|
case stopped:
|
|
if !status.Stopped() {
|
|
t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
|
|
}
|
|
stopSig := status.StopSignal()
|
|
if stopSig == 0 {
|
|
continue // Spurious stop.
|
|
}
|
|
if stopSig == unix.SIGTRAP {
|
|
if status.TrapCause() == unix.PTRACE_EVENT_EXIT {
|
|
t.unexpectedStubExit()
|
|
}
|
|
// Re-encode the trap cause the way it's expected.
|
|
return stopSig | unix.Signal(status.TrapCause()<<8)
|
|
}
|
|
// Not a trap signal.
|
|
return stopSig
|
|
case killed:
|
|
if !status.Exited() && !status.Signaled() {
|
|
t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status))
|
|
}
|
|
return unix.Signal(status.ExitStatus())
|
|
default:
|
|
// Should not happen.
|
|
t.dumpAndPanic(fmt.Sprintf("unknown outcome: %v", outcome))
|
|
}
|
|
}
|
|
}
|
|
|
|
// kill kills the thread;
|
|
func (t *thread) kill() {
|
|
unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(unix.SIGKILL))
|
|
}
|
|
|
|
// destroy kills and waits on the thread.
|
|
//
|
|
// Note that this should not be used in the general case; the death of threads
|
|
// will typically cause the death of the parent. This is a utility method for
|
|
// manually created threads.
|
|
func (t *thread) destroy() {
|
|
t.detach()
|
|
unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(unix.SIGKILL))
|
|
t.wait(killed)
|
|
}
|
|
|
|
// init initializes trace options.
|
|
func (t *thread) init() {
|
|
// Set the TRACESYSGOOD option to differentiate real SIGTRAP.
|
|
// set PTRACE_O_EXITKILL to ensure that the unexpected exit of the
|
|
// sentry will immediately kill the associated stubs.
|
|
errno := hostsyscall.RawSyscallErrno6(
|
|
unix.SYS_PTRACE,
|
|
unix.PTRACE_SETOPTIONS,
|
|
uintptr(t.tid),
|
|
0,
|
|
unix.PTRACE_O_TRACESYSGOOD|unix.PTRACE_O_TRACEEXIT|unix.PTRACE_O_EXITKILL,
|
|
0, 0)
|
|
if errno != 0 {
|
|
panic(fmt.Sprintf("ptrace set options failed: %v", errno))
|
|
}
|
|
}
|
|
|
|
// syscall executes a system call cycle in the traced context.
|
|
//
|
|
// This is _not_ for use by application system calls, rather it is for use when
|
|
// a system call must be injected into the remote context (e.g. mmap, munmap).
|
|
// Note that clones are handled separately.
|
|
func (t *thread) syscall(regs *arch.Registers) (uintptr, error) {
|
|
// Set registers.
|
|
if err := t.setRegs(regs); err != nil {
|
|
panic(fmt.Sprintf("ptrace set regs failed: %v", err))
|
|
}
|
|
|
|
for {
|
|
// Execute the syscall instruction. The task has to stop on the
|
|
// trap instruction which is right after the syscall
|
|
// instruction.
|
|
if errno := hostsyscall.RawSyscallErrno(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(t.tid), 0); errno != 0 {
|
|
panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
|
|
}
|
|
|
|
sig := t.wait(stopped)
|
|
if sig == unix.SIGTRAP {
|
|
// Reached syscall-enter-stop.
|
|
break
|
|
} else {
|
|
// Some other signal caused a thread stop; ignore.
|
|
if sig != unix.SIGSTOP && sig != unix.SIGCHLD {
|
|
log.Warningf("The thread %d:%d has been interrupted by %d", t.tgid, t.tid, sig)
|
|
}
|
|
continue
|
|
}
|
|
}
|
|
|
|
// Grab registers.
|
|
if err := t.getRegs(regs); err != nil {
|
|
panic(fmt.Sprintf("ptrace get regs failed: %v", err))
|
|
}
|
|
return syscallReturnValue(regs)
|
|
}
|
|
|
|
// syscallIgnoreInterrupt ignores interrupts on the system call thread and
|
|
// restarts the syscall if the kernel indicates that should happen.
|
|
func (t *thread) syscallIgnoreInterrupt(
|
|
initRegs *arch.Registers,
|
|
sysno uintptr,
|
|
args ...arch.SyscallArgument) (uintptr, error) {
|
|
for {
|
|
regs := createSyscallRegs(initRegs, sysno, args...)
|
|
rval, err := t.syscall(®s)
|
|
switch err {
|
|
case ERESTARTSYS:
|
|
continue
|
|
case ERESTARTNOINTR:
|
|
continue
|
|
case ERESTARTNOHAND:
|
|
continue
|
|
default:
|
|
return rval, err
|
|
}
|
|
}
|
|
}
|
|
|
|
// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
|
|
func (t *thread) NotifyInterrupt() {
|
|
unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(platform.SignalInterrupt))
|
|
}
|
|
|
|
func (s *subprocess) incAwakeContexts() {
|
|
nr := atomic.AddUint32(&s.contextQueue.numAwakeContexts, 1)
|
|
if nr > uint32(maxSysmsgThreads) {
|
|
return
|
|
}
|
|
fastpath.nrMaxAwakeStubThreads.Add(1)
|
|
}
|
|
|
|
func (s *subprocess) decAwakeContexts() {
|
|
nr := atomic.AddUint32(&s.contextQueue.numAwakeContexts, ^uint32(0))
|
|
if nr >= uint32(maxSysmsgThreads) {
|
|
return
|
|
}
|
|
fastpath.nrMaxAwakeStubThreads.Add(^uint32(0))
|
|
}
|
|
|
|
// switchToApp is called from the main SwitchToApp entrypoint.
|
|
//
|
|
// This function returns true on a system call, false on a signal.
|
|
// The second return value is true if a syscall instruction can be replaced on
|
|
// a function call.
|
|
func (s *subprocess) switchToApp(c *platformContext, ac *arch.Context64) (isSyscall bool, shouldPatchSyscall bool, at hostarch.AccessType, err *platform.ContextError) {
|
|
// Reset necessary registers.
|
|
regs := &ac.StateData().Regs
|
|
s.resetSysemuRegs(regs)
|
|
ctx := c.sharedContext
|
|
ctx.shared.Regs = regs.PtraceRegs
|
|
restoreArchSpecificState(ctx.shared, ac)
|
|
|
|
// Check for interrupts, and ensure that future interrupts signal the context.
|
|
if !c.interrupt.Enable(c.sharedContext) {
|
|
// Pending interrupt; simulate.
|
|
ctx.clearInterrupt()
|
|
c.signalInfo = linux.SignalInfo{Signo: int32(platform.SignalInterrupt)}
|
|
return false, false, hostarch.NoAccess, nil
|
|
}
|
|
defer func() {
|
|
ctx.clearInterrupt()
|
|
c.interrupt.Disable()
|
|
}()
|
|
|
|
restoreFPState(ctx, c, ac)
|
|
|
|
// Place the context onto the context queue.
|
|
if ctx.sleeping {
|
|
ctx.sleeping = false
|
|
s.incAwakeContexts()
|
|
}
|
|
ctx.setState(sysmsg.ContextStateNone)
|
|
if err := s.contextQueue.add(ctx); err != nil {
|
|
return false, false, hostarch.NoAccess, err
|
|
}
|
|
|
|
if err := s.waitOnState(ctx); err != nil {
|
|
return false, false, hostarch.NoAccess, corruptedSharedMemoryErr(err.Error())
|
|
}
|
|
|
|
// Check if there's been an error.
|
|
threadID := ctx.threadID()
|
|
if threadID != invalidThreadID {
|
|
if sysThread, ok := s.sysmsgThreads[threadID]; ok && sysThread.msg.Err != 0 {
|
|
return false, false, hostarch.NoAccess, sysThread.msg.ConvertSysmsgErr()
|
|
}
|
|
return false, false, hostarch.NoAccess, corruptedSharedMemoryErr(fmt.Sprintf("found unexpected ThreadContext.ThreadID field, expected %d found %d", invalidThreadID, threadID))
|
|
}
|
|
|
|
// Copy register state locally.
|
|
regs.PtraceRegs = ctx.shared.Regs
|
|
retrieveArchSpecificState(ctx.shared, ac)
|
|
c.needToPullFullState = true
|
|
// We have a signal. We verify however, that the signal was
|
|
// either delivered from the kernel or from this process. We
|
|
// don't respect other signals.
|
|
c.signalInfo = ctx.shared.SignalInfo
|
|
ctxState := ctx.state()
|
|
if ctxState == sysmsg.ContextStateSyscallCanBePatched {
|
|
ctxState = sysmsg.ContextStateSyscall
|
|
shouldPatchSyscall = true
|
|
}
|
|
|
|
if ctxState == sysmsg.ContextStateSyscall || ctxState == sysmsg.ContextStateSyscallTrap {
|
|
if maybePatchSignalInfo(regs, &c.signalInfo) {
|
|
return false, false, hostarch.Execute, nil
|
|
}
|
|
updateSyscallRegs(regs)
|
|
return true, shouldPatchSyscall, hostarch.NoAccess, nil
|
|
} else if ctxState != sysmsg.ContextStateFault {
|
|
return false, false, hostarch.NoAccess, corruptedSharedMemoryErr(fmt.Sprintf("unknown context state: %v", ctxState))
|
|
}
|
|
|
|
at = hostarch.NoAccess
|
|
if c.signalInfo.Signo == int32(linux.SIGSEGV) {
|
|
at = sigErrorToAccessType(ctx.shared.SigError)
|
|
}
|
|
return false, false, at, nil
|
|
}
|
|
|
|
func (s *subprocess) waitOnState(ctx *sharedContext) error {
|
|
ctx.kicked = false
|
|
slowPath := false
|
|
if !s.contextQueue.fastPathEnabled() || atomic.LoadUint32(&s.contextQueue.numActiveThreads) == 0 {
|
|
ctx.kicked = s.kickSysmsgThread()
|
|
}
|
|
for curState := ctx.state(); curState == sysmsg.ContextStateNone; curState = ctx.state() {
|
|
if !slowPath {
|
|
events := dispatcher.waitFor(ctx)
|
|
if events&sharedContextKicked != 0 {
|
|
if ctx.kicked {
|
|
continue
|
|
}
|
|
if ctx.isAcked() {
|
|
ctx.kicked = true
|
|
continue
|
|
}
|
|
s.kickSysmsgThread()
|
|
ctx.kicked = true
|
|
continue
|
|
}
|
|
if events&sharedContextSlowPath != 0 {
|
|
ctx.disableSentryFastPath()
|
|
slowPath = true
|
|
continue
|
|
}
|
|
} else {
|
|
// If the context already received a handshake then it knows it's being
|
|
// worked on.
|
|
if !ctx.kicked && !ctx.isAcked() {
|
|
ctx.kicked = s.kickSysmsgThread()
|
|
}
|
|
|
|
if err := ctx.sleepOnState(curState); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
|
|
ctx.recordLatency()
|
|
ctx.resetLatencyMeasures()
|
|
ctx.enableSentryFastPath()
|
|
|
|
return nil
|
|
}
|
|
|
|
// canKickSysmsgThread returns true if a new thread can be kicked.
|
|
// The second return value is the expected number of threads after kicking a
|
|
// new one.
|
|
func (s *subprocess) canKickSysmsgThread() (bool, uint32) {
|
|
// numActiveContexts and numActiveThreads can be changed from stub
|
|
// threads that handles the contextQueue without any locks. The idea
|
|
// here is that any stub thread that gets CPU time can make some
|
|
// progress. In stub threads, we can use only spinlock-like
|
|
// synchronizations, but they don't work well because a thread that
|
|
// holds a lock can be preempted by another thread that is waiting for
|
|
// the same lock.
|
|
nrActiveThreads := atomic.LoadUint32(&s.contextQueue.numActiveThreads)
|
|
nrThreadsToWakeup := atomic.LoadUint32(&s.contextQueue.numThreadsToWakeup)
|
|
nrActiveContexts := atomic.LoadUint32(&s.contextQueue.numActiveContexts)
|
|
|
|
nrActiveThreads += nrThreadsToWakeup + 1
|
|
if nrActiveThreads > nrActiveContexts {
|
|
// This can happen when one or more stub threads are
|
|
// waiting for cpu time. The host probably has more
|
|
// running tasks than a number of cpu-s.
|
|
return false, nrActiveThreads
|
|
}
|
|
return true, nrActiveThreads
|
|
}
|
|
|
|
// kickSysmsgThread returns true if it was able to wake up or create a new sysmsg
|
|
// stub thread.
|
|
func (s *subprocess) kickSysmsgThread() bool {
|
|
kick, _ := s.canKickSysmsgThread()
|
|
if !kick {
|
|
return false
|
|
}
|
|
|
|
s.sysmsgThreadsMu.Lock()
|
|
kick, nrThreads := s.canKickSysmsgThread()
|
|
if !kick {
|
|
s.sysmsgThreadsMu.Unlock()
|
|
return false
|
|
}
|
|
numTimesStubKicked.Increment()
|
|
atomic.AddUint32(&s.contextQueue.numThreadsToWakeup, 1)
|
|
if s.numSysmsgThreads < maxSysmsgThreads && s.numSysmsgThreads < int(nrThreads) {
|
|
s.numSysmsgThreads++
|
|
s.sysmsgThreadsMu.Unlock()
|
|
if err := s.createSysmsgThread(); err != nil {
|
|
log.Warningf("Unable to create a new stub thread: %s", err)
|
|
s.sysmsgThreadsMu.Lock()
|
|
s.numSysmsgThreads--
|
|
s.sysmsgThreadsMu.Unlock()
|
|
}
|
|
} else {
|
|
s.sysmsgThreadsMu.Unlock()
|
|
}
|
|
s.contextQueue.wakeupSysmsgThread()
|
|
|
|
return true
|
|
}
|
|
|
|
// syscall executes the given system call without handling interruptions.
|
|
func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) {
|
|
s.syscallThreadMu.Lock()
|
|
defer s.syscallThreadMu.Unlock()
|
|
|
|
return s.syscallThread.syscall(sysno, args...)
|
|
}
|
|
|
|
// MapFile implements platform.AddressSpace.MapFile.
|
|
func (s *subprocess) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error {
|
|
fd, err := f.DataFD(fr)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
var flags int
|
|
if precommit {
|
|
flags |= unix.MAP_POPULATE
|
|
}
|
|
_, err = s.syscall(
|
|
unix.SYS_MMAP,
|
|
arch.SyscallArgument{Value: uintptr(addr)},
|
|
arch.SyscallArgument{Value: uintptr(fr.Length())},
|
|
arch.SyscallArgument{Value: uintptr(at.Prot())},
|
|
arch.SyscallArgument{Value: uintptr(flags | unix.MAP_SHARED | unix.MAP_FIXED)},
|
|
arch.SyscallArgument{Value: uintptr(fd)},
|
|
arch.SyscallArgument{Value: uintptr(fr.Start)})
|
|
return err
|
|
}
|
|
|
|
// Unmap implements platform.AddressSpace.Unmap.
|
|
func (s *subprocess) Unmap(addr hostarch.Addr, length uint64) {
|
|
_, err := s.syscall(
|
|
unix.SYS_MUNMAP,
|
|
arch.SyscallArgument{Value: uintptr(addr)},
|
|
arch.SyscallArgument{Value: uintptr(length)})
|
|
if err != nil && err != errDeadSubprocess {
|
|
// We never expect this to happen.
|
|
panic(fmt.Sprintf("munmap(%x, %x)) failed: %v", addr, length, err))
|
|
}
|
|
}
|
|
|
|
func (s *subprocess) PullFullState(c *platformContext, ac *arch.Context64) error {
|
|
if !c.sharedContext.isActiveInSubprocess(s) {
|
|
panic("Attempted to PullFullState for context that is not used in subprocess")
|
|
}
|
|
saveFPState(c.sharedContext, ac)
|
|
return nil
|
|
}
|
|
|
|
// sysmsgThreadPriorityOnce ensures sysmsgThreadPriority is computed at most
// once (see initSysmsgThreadPriority).
var sysmsgThreadPriorityOnce sync.Once

// sysmsgThreadPriority is the priority value passed to unix.Setpriority for
// newly created sysmsg threads. It stays zero until
// initSysmsgThreadPriority has run.
var sysmsgThreadPriority int
|
|
|
|
// initSysmsgThreadPriority looks at the current priority of the process
|
|
// and updates `sysmsgThreadPriority` accordingly.
|
|
func initSysmsgThreadPriority() {
|
|
sysmsgThreadPriorityOnce.Do(func() {
|
|
prio, err := unix.Getpriority(unix.PRIO_PROCESS, 0)
|
|
if err != nil {
|
|
panic("unable to get current scheduling priority")
|
|
}
|
|
// Sysmsg threads are executed with a priority one lower than the Sentry.
|
|
sysmsgThreadPriority = 20 - prio + 1
|
|
})
|
|
}
|
|
|
|
// createSysmsgThread creates a new sysmsg thread.
|
|
// The thread starts processing any available context in the context queue.
|
|
func (s *subprocess) createSysmsgThread() error {
|
|
// Create a new seccomp process.
|
|
var r requestThread
|
|
r.thread = make(chan *thread)
|
|
s.requests <- r
|
|
p := <-r.thread
|
|
if p == nil {
|
|
return fmt.Errorf("createSysmsgThread: failed to get clone")
|
|
}
|
|
|
|
runtime.LockOSThread()
|
|
defer runtime.UnlockOSThread()
|
|
if err := p.attach(); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Skip SIGSTOP.
|
|
if errno := hostsyscall.RawSyscallErrno(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(p.tid), 0); errno != 0 {
|
|
panic(fmt.Sprintf("ptrace cont failed: %v", errno))
|
|
}
|
|
sig := p.wait(stopped)
|
|
if sig != unix.SIGSTOP {
|
|
panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig))
|
|
}
|
|
|
|
// Allocate a new stack for the BPF process.
|
|
opts := pgalloc.AllocOpts{
|
|
Kind: usage.System,
|
|
Dir: pgalloc.TopDown,
|
|
}
|
|
fr, err := s.memoryFile.Allocate(uint64(sysmsg.PerThreadSharedStackSize), opts)
|
|
if err != nil {
|
|
// TODO(b/144063246): Need to fail the clone system call.
|
|
panic(fmt.Sprintf("failed to allocate a new stack: %v", err))
|
|
}
|
|
sysThread := &sysmsgThread{
|
|
thread: p,
|
|
subproc: s,
|
|
stackRange: fr,
|
|
}
|
|
// Use the sysmsgStackID as a handle on this thread instead of host tid in
|
|
// order to be able to reliably specify invalidThreadID.
|
|
threadID := uint32(p.sysmsgStackID)
|
|
|
|
// Map the stack into the sentry.
|
|
sentryStackAddr, errno := hostsyscall.RawSyscall6(
|
|
unix.SYS_MMAP,
|
|
0,
|
|
sysmsg.PerThreadSharedStackSize,
|
|
unix.PROT_WRITE|unix.PROT_READ,
|
|
unix.MAP_SHARED|unix.MAP_FILE,
|
|
uintptr(s.memoryFile.FD()), uintptr(fr.Start))
|
|
if errno != 0 {
|
|
panic(fmt.Sprintf("mmap failed: %v", errno))
|
|
}
|
|
|
|
// Before installing the stub syscall filters, we need to call a few
|
|
// system calls (e.g. sigaltstack, sigaction) which have in-memory
|
|
// arguments. We need to prevent changing these parameters by other
|
|
// stub threads, so lets map the future BPF stack as read-only and
|
|
// fill syscall arguments from the Sentry.
|
|
sysmsgStackAddr := sysThread.sysmsgPerThreadMemAddr() + sysmsg.PerThreadSharedStackOffset
|
|
err = sysThread.mapStack(sysmsgStackAddr, true)
|
|
if err != nil {
|
|
panic(fmt.Sprintf("mmap failed: %v", err))
|
|
}
|
|
|
|
sysThread.init(sentryStackAddr, sysmsgStackAddr)
|
|
|
|
// Map the stack into the BPF process.
|
|
err = sysThread.mapStack(sysmsgStackAddr, false)
|
|
if err != nil {
|
|
s.memoryFile.DecRef(fr)
|
|
panic(fmt.Sprintf("mmap failed: %v", err))
|
|
}
|
|
|
|
// Map the stack into the BPF process.
|
|
privateStackAddr := sysThread.sysmsgPerThreadMemAddr() + sysmsg.PerThreadPrivateStackOffset
|
|
err = sysThread.mapPrivateStack(privateStackAddr, sysmsg.PerThreadPrivateStackSize)
|
|
if err != nil {
|
|
s.memoryFile.DecRef(fr)
|
|
panic(fmt.Sprintf("mmap failed: %v", err))
|
|
}
|
|
|
|
sysThread.setMsg(sysmsg.StackAddrToMsg(sentryStackAddr))
|
|
sysThread.msg.Init(threadID)
|
|
sysThread.msg.Self = uint64(sysmsgStackAddr + sysmsg.MsgOffsetFromSharedStack)
|
|
sysThread.msg.SyshandlerStack = uint64(sysmsg.StackAddrToSyshandlerStack(sysThread.sysmsgPerThreadMemAddr()))
|
|
sysThread.msg.Syshandler = uint64(stubSysmsgStart + uintptr(sysmsg.Sighandler_blob_offset____export_syshandler))
|
|
|
|
sysThread.msg.State.Set(sysmsg.ThreadStateInitializing)
|
|
|
|
if err := unix.Setpriority(unix.PRIO_PROCESS, int(p.tid), sysmsgThreadPriority); err != nil {
|
|
log.Warningf("Unable to change priority of a stub thread: %s", err)
|
|
}
|
|
|
|
// Install a pre-compiled seccomp rules for the BPF process.
|
|
_, err = p.syscallIgnoreInterrupt(&p.initRegs, unix.SYS_PRCTL,
|
|
arch.SyscallArgument{Value: uintptr(linux.PR_SET_NO_NEW_PRIVS)},
|
|
arch.SyscallArgument{Value: uintptr(1)},
|
|
arch.SyscallArgument{Value: uintptr(0)},
|
|
arch.SyscallArgument{Value: uintptr(0)},
|
|
arch.SyscallArgument{Value: uintptr(0)},
|
|
arch.SyscallArgument{Value: uintptr(0)})
|
|
if err != nil {
|
|
panic(fmt.Sprintf("prctl(PR_SET_NO_NEW_PRIVS) failed: %v", err))
|
|
}
|
|
|
|
_, err = p.syscallIgnoreInterrupt(&p.initRegs, seccomp.SYS_SECCOMP,
|
|
arch.SyscallArgument{Value: uintptr(linux.SECCOMP_SET_MODE_FILTER)},
|
|
arch.SyscallArgument{Value: uintptr(0)},
|
|
arch.SyscallArgument{Value: stubSysmsgRules})
|
|
if err != nil {
|
|
panic(fmt.Sprintf("seccomp failed: %v", err))
|
|
}
|
|
|
|
// Prepare to start the BPF process.
|
|
tregs := &arch.Registers{}
|
|
s.resetSysemuRegs(tregs)
|
|
setArchSpecificRegs(sysThread, tregs)
|
|
if err := p.setRegs(tregs); err != nil {
|
|
panic(fmt.Sprintf("ptrace set regs failed: %v", err))
|
|
}
|
|
archSpecificSysmsgThreadInit(sysThread)
|
|
// Skip SIGSTOP.
|
|
if e := hostsyscall.RawSyscallErrno(unix.SYS_TGKILL, uintptr(p.tgid), uintptr(p.tid), uintptr(unix.SIGCONT)); e != 0 {
|
|
panic(fmt.Sprintf("tkill failed: %v", e))
|
|
}
|
|
// Resume the BPF process.
|
|
if errno := hostsyscall.RawSyscallErrno(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(p.tid), 0); errno != 0 {
|
|
panic(fmt.Sprintf("can't detach new clone: %v", errno))
|
|
}
|
|
|
|
s.sysmsgThreadsMu.Lock()
|
|
s.sysmsgThreads[threadID] = sysThread
|
|
s.sysmsgThreadsMu.Unlock()
|
|
|
|
return nil
|
|
}
|
|
|
|
// PreFork implements platform.AddressSpace.PreFork.
|
|
// We need to take the usertrap lock to be sure that fork() will not be in the
|
|
// middle of applying a binary patch.
|
|
func (s *subprocess) PreFork() {
|
|
s.usertrap.PreFork()
|
|
}
|
|
|
|
// PostFork implements platform.AddressSpace.PostFork.
|
|
func (s *subprocess) PostFork() {
|
|
s.usertrap.PostFork() // +checklocksforce: PreFork acquires, above.
|
|
}
|
|
|
|
// activateContext activates the context in this subprocess.
|
|
// No-op if the context is already active within the subprocess; if not,
|
|
// deactivates it from its last subprocess.
|
|
func (s *subprocess) activateContext(c *platformContext) error {
|
|
if !c.sharedContext.isActiveInSubprocess(s) {
|
|
c.sharedContext.release()
|
|
c.sharedContext = nil
|
|
|
|
shared, err := s.getSharedContext()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
c.sharedContext = shared
|
|
}
|
|
return nil
|
|
}
|