Files

490 lines
16 KiB
Go
Raw Permalink Normal View History

// Copyright 2018 The gVisor Authors.
2018-04-27 10:37:02 -07:00
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package kernel
import (
"fmt"
"os"
2019-12-06 16:58:28 -08:00
"runtime/trace"
2018-04-27 10:37:02 -07:00
"golang.org/x/sys/unix"
2019-06-13 16:49:09 -07:00
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bits"
"gvisor.dev/gvisor/pkg/errors"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
2021-03-29 13:28:32 -07:00
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/marshal"
2019-06-13 16:49:09 -07:00
"gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/platform"
2022-04-29 14:47:17 -07:00
"gvisor.dev/gvisor/pkg/sentry/seccheck"
pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto"
2018-04-27 10:37:02 -07:00
)
// SyscallRestartBlock represents the restart block for a syscall that is
// restartable with a custom function. It encapsulates the state required to
// restart a syscall across a save/restore (S/R).
type SyscallRestartBlock interface {
// Restart restarts the syscall on behalf of t.
//
// NOTE(review): the results presumably have the same meaning as the return
// value and error of a regular syscall implementation; confirm against the
// callers of Restart.
Restart(t *Task) (uintptr, error)
}
// SyscallControl is returned by syscalls to control the behavior of
// Task.doSyscallInvoke once the syscall implementation has returned.
type SyscallControl struct {
// next is the state that the task goroutine should switch to. If next is
// nil, the task goroutine should continue to syscall exit as usual.
next taskRunState
// If ignoreReturn is true, Task.doSyscallInvoke should not store any value
// in the task's syscall return value register. If it is false, the
// syscall's return value is stored even when next is non-nil.
ignoreReturn bool
}
// Predefined SyscallControl values returned by syscall implementations and by
// Task.executeSyscall.
var (
	// CtrlDoExit is what the exit and exit_group syscall implementations
	// return. It sends the task straight into the exit path (runExit),
	// bypassing syscall exit tracing, and leaves the return value register
	// untouched.
	CtrlDoExit = &SyscallControl{
		next:         (*runExit)(nil),
		ignoreReturn: true,
	}

	// ctrlStopAndReinvokeSyscall is used when the external feature runs
	// before syscall execution. Task.doSyscallInvoke then returns
	// runSyscallReinvoke, which gives Task.run the chance to check for stops
	// before the syscall is invoked again. The reinvocation does not re-check
	// seccomp filters or ptrace, since doing so would confuse userspace
	// tracing.
	ctrlStopAndReinvokeSyscall = &SyscallControl{
		next:         (*runSyscallReinvoke)(nil),
		ignoreReturn: true,
	}

	// ctrlStopBeforeSyscallExit is for syscalls that initiate a stop as they
	// finish. Task.doSyscallInvoke stores the return value and then returns
	// runSyscallExit as the next state instead of tail-calling it, so that
	// stops can be checked before syscall exit.
	ctrlStopBeforeSyscallExit = &SyscallControl{
		next: (*runSyscallExit)(nil),
	}
)
// invokeExternal places t in an external stop and runs the syscall table's
// External hook on a separate goroutine. The external stop is ended when the
// hook returns.
func (t *Task) invokeExternal() {
	t.BeginExternalStop()
	go func() { // S/R-SAFE: External control flow.
		defer t.EndExternalStop()
		table := t.SyscallTable()
		table.External(t.Kernel())
	}()
}
func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) {
s := t.SyscallTable()
fe := s.FeatureEnable.Word(sysno)
2022-11-08 13:12:12 -08:00
var straceContext any
2018-04-27 10:37:02 -07:00
if bits.IsAnyOn32(fe, StraceEnableBits) {
straceContext = s.Stracer.SyscallEnter(t, sysno, args, fe)
}
if bits.IsAnyOn32(fe, SecCheckRawEnter) {
2022-04-29 14:47:17 -07:00
info := pb.Syscall{
Sysno: uint64(sysno),
Arg1: args[0].Uint64(),
Arg2: args[1].Uint64(),
Arg3: args[2].Uint64(),
Arg4: args[3].Uint64(),
Arg5: args[4].Uint64(),
Arg6: args[5].Uint64(),
}
fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallRawEnter, sysno))
if !fields.Context.Empty() {
info.ContextData = &pb.ContextData{}
LoadSeccheckData(t, fields.Context, info.ContextData)
}
2022-07-21 12:43:55 -07:00
seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
2022-04-29 14:47:17 -07:00
return c.RawSyscall(t, fields, &info)
})
}
if bits.IsAnyOn32(fe, SecCheckEnter) {
2022-04-29 14:47:17 -07:00
fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallEnter, sysno))
var ctxData *pb.ContextData
if !fields.Context.Empty() {
ctxData = &pb.ContextData{}
LoadSeccheckData(t, fields.Context, ctxData)
}
info := SyscallInfo{
Sysno: sysno,
Args: args,
}
cb := s.LookupSyscallToProto(sysno)
2022-05-05 19:27:16 -07:00
msg, msgType := cb(t, fields, ctxData, info)
2022-07-21 12:43:55 -07:00
seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
2022-05-05 19:27:16 -07:00
return c.Syscall(t, fields, ctxData, msgType, msg)
2022-04-29 14:47:17 -07:00
})
}
2018-04-27 10:37:02 -07:00
if bits.IsOn32(fe, ExternalBeforeEnable) && (s.ExternalFilterBefore == nil || s.ExternalFilterBefore(t, sysno, args)) {
t.invokeExternal()
// Ensure we check for stops, then invoke the syscall again.
ctrl = ctrlStopAndReinvokeSyscall
} else {
fn := s.Lookup(sysno)
2019-12-06 16:58:28 -08:00
var region *trace.Region // Only non-nil if tracing == true.
if trace.IsEnabled() {
region = trace.StartRegion(t.traceContext, s.LookupName(sysno))
}
2018-04-27 10:37:02 -07:00
if fn != nil {
// Call our syscall implementation.
rval, ctrl, err = fn(t, sysno, args)
2018-04-27 10:37:02 -07:00
} else {
// Use the missing function if not found.
rval, err = t.SyscallTable().Missing(t, sysno, args)
}
2019-12-06 16:58:28 -08:00
if region != nil {
region.End()
}
2018-04-27 10:37:02 -07:00
}
if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) {
t.invokeExternal()
// Don't reinvoke the unix.
2018-04-27 10:37:02 -07:00
}
if bits.IsAnyOn32(fe, StraceEnableBits) {
s.Stracer.SyscallExit(straceContext, t, sysno, rval, err)
}
if bits.IsAnyOn32(fe, SecCheckRawExit) {
2022-04-29 14:47:17 -07:00
info := pb.Syscall{
Sysno: uint64(sysno),
Arg1: args[0].Uint64(),
Arg2: args[1].Uint64(),
Arg3: args[2].Uint64(),
Arg4: args[3].Uint64(),
Arg5: args[4].Uint64(),
Arg6: args[5].Uint64(),
Exit: &pb.Exit{
Result: int64(rval),
Errorno: int64(ExtractErrno(err, int(sysno))),
},
}
fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallRawEnter, sysno))
if !fields.Context.Empty() {
info.ContextData = &pb.ContextData{}
LoadSeccheckData(t, fields.Context, info.ContextData)
}
2022-07-21 12:43:55 -07:00
seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
2022-04-29 14:47:17 -07:00
return c.RawSyscall(t, fields, &info)
})
}
if bits.IsAnyOn32(fe, SecCheckExit) {
2022-04-29 14:47:17 -07:00
fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallExit, sysno))
var ctxData *pb.ContextData
if !fields.Context.Empty() {
ctxData = &pb.ContextData{}
LoadSeccheckData(t, fields.Context, ctxData)
}
info := SyscallInfo{
Exit: true,
Sysno: sysno,
Args: args,
Rval: rval,
Errno: ExtractErrno(err, int(sysno)),
}
cb := s.LookupSyscallToProto(sysno)
2022-05-05 19:27:16 -07:00
msg, msgType := cb(t, fields, ctxData, info)
2022-07-21 12:43:55 -07:00
seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
2022-05-05 19:27:16 -07:00
return c.Syscall(t, fields, ctxData, msgType, msg)
2022-04-29 14:47:17 -07:00
})
}
2018-04-27 10:37:02 -07:00
return
}
// doSyscall is the entry point for an invocation of a system call specified by
// the current state of t's registers.
//
// The syscall path is very hot; avoid defer.
func (t *Task) doSyscall() taskRunState {
2020-03-11 03:21:34 +00:00
// Save value of the register which is clobbered in the following
// t.Arch().SetReturn(-ENOSYS) operation. This is dedicated to arm64.
//
// On x86, register rax was shared by syscall number and return
// value, and at the entry of the syscall handler, the rax was
2020-05-15 20:03:54 -07:00
// saved to regs.orig_rax which was exposed to userspace.
2020-03-11 03:21:34 +00:00
// But on arm64, syscall number was passed through X8, and the X0
// was shared by the first syscall argument and return value. The
2020-05-15 20:03:54 -07:00
// X0 was saved to regs.orig_x0 which was not exposed to userspace.
2020-03-11 03:21:34 +00:00
// So we have to do the same operation here to save the X0 value
// into the task context.
t.Arch().SyscallSaveOrig()
2018-04-27 10:37:02 -07:00
sysno := t.Arch().SyscallNo()
args := t.Arch().SyscallArgs()
// Tracers expect to see this between when the task traps into the kernel
// to perform a syscall and when the syscall is actually invoked.
// This useless-looking temporary is needed because Go.
tmp := uintptr(unix.ENOSYS)
2018-04-27 10:37:02 -07:00
t.Arch().SetReturn(-tmp)
// Check seccomp filters. The nil check is for performance (as seccomp use
// is rare), not needed for correctness.
if t.seccomp.Load() != nil {
2021-03-29 13:28:32 -07:00
switch r := t.checkSeccompSyscall(int32(sysno), args, hostarch.Addr(t.Arch().IP())); r {
2018-12-18 10:27:16 -08:00
case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP:
2018-04-27 10:37:02 -07:00
t.Debugf("Syscall %d: denied by seccomp", sysno)
return (*runSyscallExit)(nil)
2018-12-18 10:27:16 -08:00
case linux.SECCOMP_RET_ALLOW:
2018-04-27 10:37:02 -07:00
// ok
2018-12-18 10:27:16 -08:00
case linux.SECCOMP_RET_KILL_THREAD:
2018-04-27 10:37:02 -07:00
t.Debugf("Syscall %d: killed by seccomp", sysno)
t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS))
2018-04-27 10:37:02 -07:00
return (*runExit)(nil)
2018-12-18 10:27:16 -08:00
case linux.SECCOMP_RET_TRACE:
2018-04-27 10:37:02 -07:00
t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno)
return (*runSyscallAfterPtraceEventSeccomp)(nil)
default:
panic(fmt.Sprintf("Unknown seccomp result %d", r))
}
}
2022-05-06 12:18:45 -07:00
syscallCounter.Increment()
2018-04-27 10:37:02 -07:00
return t.doSyscallEnter(sysno, args)
}
// runSyscallAfterPtraceEventSeccomp is the run state that doSyscall selects
// when a seccomp filter returns SECCOMP_RET_TRACE.
type runSyscallAfterPtraceEventSeccomp struct{}

// execute continues the syscall after the PTRACE_EVENT_SECCOMP stop: it
// either abandons it (task killed), skips it (syscall number set to -1), or
// proceeds to syscall entry with the registers as they are now.
func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
	if t.killed() {
		// "[S]yscall-exit-stop is not generated prior to death by SIGKILL." -
		// ptrace(2)
		return (*runInterrupt)(nil)
	}

	// "The tracer can skip the system call by changing the syscall number to
	// -1." - Documentation/prctl/seccomp_filter.txt
	const skipped = ^uintptr(0)
	switch sysno := t.Arch().SyscallNo(); sysno {
	case skipped:
		return (*runSyscallExit)(nil).execute(t)
	default:
		return t.doSyscallEnter(sysno, t.Arch().SyscallArgs())
	}
}
// doSyscallEnter gives ptrace a chance to intercept syscall entry. If
// ptraceSyscallEnter supplies a run state, that state is returned; otherwise
// the syscall is invoked immediately.
func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState {
	next, intercepted := t.ptraceSyscallEnter()
	if !intercepted {
		return t.doSyscallInvoke(sysno, args)
	}
	return next
}
2018-08-02 10:41:44 -07:00
// +stateify savable
2018-04-27 10:37:02 -07:00
type runSyscallAfterSyscallEnterStop struct{}
func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState {
if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
t.tg.signalHandlers.mu.Lock()
2019-04-08 16:31:06 -07:00
t.sendSignalLocked(SignalInfoPriv(sig), false /* group */)
2018-04-27 10:37:02 -07:00
t.tg.signalHandlers.mu.Unlock()
}
if t.killed() {
return (*runInterrupt)(nil)
}
sysno := t.Arch().SyscallNo()
if sysno == ^uintptr(0) {
return (*runSyscallExit)(nil)
}
args := t.Arch().SyscallArgs()
2020-03-11 03:21:34 +00:00
2018-04-27 10:37:02 -07:00
return t.doSyscallInvoke(sysno, args)
}
2018-08-02 10:41:44 -07:00
// +stateify savable
2018-04-27 10:37:02 -07:00
type runSyscallAfterSysemuStop struct{}
func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState {
if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
t.tg.signalHandlers.mu.Lock()
2019-04-08 16:31:06 -07:00
t.sendSignalLocked(SignalInfoPriv(sig), false /* group */)
2018-04-27 10:37:02 -07:00
t.tg.signalHandlers.mu.Unlock()
}
if t.killed() {
return (*runInterrupt)(nil)
}
return (*runSyscallExit)(nil).execute(t)
}
func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState {
rval, ctrl, err := t.executeSyscall(sysno, args)
if ctrl != nil {
if !ctrl.ignoreReturn {
t.Arch().SetReturn(rval)
}
if ctrl.next != nil {
return ctrl.next
}
} else if err != nil {
2020-04-16 11:48:14 -07:00
t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
2018-04-27 10:37:02 -07:00
t.haveSyscallReturn = true
} else {
t.Arch().SetReturn(rval)
}
return (*runSyscallExit)(nil).execute(t)
}
2018-08-02 10:41:44 -07:00
// +stateify savable
2018-04-27 10:37:02 -07:00
type runSyscallReinvoke struct{}
func (*runSyscallReinvoke) execute(t *Task) taskRunState {
if t.killed() {
// It's possible that since the last execution, the task has
// been forcible killed. Invoking the system call here could
// result in an infinite loop if it is again preempted by an
// external stop and reinvoked.
return (*runInterrupt)(nil)
}
sysno := t.Arch().SyscallNo()
args := t.Arch().SyscallArgs()
return t.doSyscallInvoke(sysno, args)
}
2018-08-02 10:41:44 -07:00
// +stateify savable
2018-04-27 10:37:02 -07:00
type runSyscallExit struct{}
func (*runSyscallExit) execute(t *Task) taskRunState {
t.ptraceSyscallExit()
return (*runApp)(nil)
}
// doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as
// indicated by an execution fault at address addr. doVsyscall returns the
// task's next run state.
2021-03-29 13:28:32 -07:00
func (t *Task) doVsyscall(addr hostarch.Addr, sysno uintptr) taskRunState {
metric.WeirdnessMetric.Increment(&metric.WeirdnessTypeVsyscallCount)
2018-04-27 10:37:02 -07:00
// Grab the caller up front, to make sure there's a sensible stack.
caller := t.Arch().Native(uintptr(0))
2021-03-29 13:28:32 -07:00
if _, err := caller.CopyIn(t, hostarch.Addr(t.Arch().Stack())); err != nil {
2018-04-27 10:37:02 -07:00
t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err)
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
2019-04-08 16:31:06 -07:00
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
2018-04-27 10:37:02 -07:00
return (*runApp)(nil)
}
// For _vsyscalls_, there is no need to translate System V calling convention
// to syscall ABI because they both use RDI, RSI, and RDX for the first three
// arguments and none of the vsyscalls uses more than two arguments.
args := t.Arch().SyscallArgs()
if t.seccomp.Load() != nil {
2018-04-27 10:37:02 -07:00
switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r {
2018-12-18 10:27:16 -08:00
case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP:
2018-04-27 10:37:02 -07:00
t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller))
return (*runApp)(nil)
2018-12-18 10:27:16 -08:00
case linux.SECCOMP_RET_ALLOW:
2018-04-27 10:37:02 -07:00
// ok
2018-12-18 10:27:16 -08:00
case linux.SECCOMP_RET_TRACE:
2018-04-27 10:37:02 -07:00
t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller))
return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller}
2018-12-18 10:27:16 -08:00
case linux.SECCOMP_RET_KILL_THREAD:
t.Debugf("vsyscall %d: killed by seccomp", sysno)
t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS))
2018-12-18 10:27:16 -08:00
return (*runExit)(nil)
2018-04-27 10:37:02 -07:00
default:
panic(fmt.Sprintf("Unknown seccomp result %d", r))
}
}
return t.doVsyscallInvoke(sysno, args, caller)
}
type runVsyscallAfterPtraceEventSeccomp struct {
2021-03-29 13:28:32 -07:00
addr hostarch.Addr
2018-04-27 10:37:02 -07:00
sysno uintptr
caller marshal.Marshallable
2018-04-27 10:37:02 -07:00
}
func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
if t.killed() {
return (*runInterrupt)(nil)
}
sysno := t.Arch().SyscallNo()
// "... the syscall may not be changed to another system call using the
// orig_rax register. It may only be changed to -1 order [sic] to skip the
// currently emulated call. ... The tracer MUST NOT modify rip or rsp." -
// Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip
// causes do_exit(SIGSYS), and changing sp is ignored.
2021-03-29 13:28:32 -07:00
if (sysno != ^uintptr(0) && sysno != r.sysno) || hostarch.Addr(t.Arch().IP()) != r.addr {
t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS))
2018-04-27 10:37:02 -07:00
return (*runExit)(nil)
}
if sysno == ^uintptr(0) {
return (*runApp)(nil)
}
return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller)
}
func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller marshal.Marshallable) taskRunState {
2018-04-27 10:37:02 -07:00
rval, ctrl, err := t.executeSyscall(sysno, args)
if ctrl != nil {
t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl)
// Set the return value. The stack has already been adjusted.
t.Arch().SetReturn(0)
} else if err == nil {
t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller))
// Set the return value. The stack has already been adjusted.
t.Arch().SetReturn(uintptr(rval))
} else {
t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err)
if linuxerr.Equals(linuxerr.EFAULT, err) {
2018-04-27 10:37:02 -07:00
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
2019-04-08 16:31:06 -07:00
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
2018-04-27 10:37:02 -07:00
// A return is not emulated in this case.
return (*runApp)(nil)
}
2020-04-16 11:48:14 -07:00
t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
2018-04-27 10:37:02 -07:00
}
t.Arch().SetIP(t.Arch().Value(caller))
t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width()))
return (*runApp)(nil)
}
// ExtractErrno extracts an integer error number from the error.
// The syscall number is purely for context in the error case. Use -1 if
// syscall number is unknown.
2020-04-16 11:48:14 -07:00
func ExtractErrno(err error, sysno int) int {
2018-04-27 10:37:02 -07:00
switch err := err.(type) {
case nil:
return 0
case unix.Errno:
2018-04-27 10:37:02 -07:00
return int(err)
case *errors.Error:
return int(linuxerr.ToUnix(err))
2018-04-27 10:37:02 -07:00
case *memmap.BusError:
// Bus errors may generate SIGBUS, but for syscalls they still
// return EFAULT. See case in task_run.go where the fault is
// handled (and the SIGBUS is delivered).
return int(unix.EFAULT)
2018-04-27 10:37:02 -07:00
case *os.PathError:
2020-04-16 11:48:14 -07:00
return ExtractErrno(err.Err, sysno)
2018-04-27 10:37:02 -07:00
case *os.LinkError:
2020-04-16 11:48:14 -07:00
return ExtractErrno(err.Err, sysno)
2018-04-27 10:37:02 -07:00
case *os.SyscallError:
2020-04-16 11:48:14 -07:00
return ExtractErrno(err.Err, sysno)
case *platform.ContextError:
return int(err.Errno)
2018-04-27 10:37:02 -07:00
default:
2021-08-13 17:14:36 -07:00
if errno, ok := linuxerr.TranslateError(err); ok {
return int(linuxerr.ToUnix(errno))
2018-04-27 10:37:02 -07:00
}
}
panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err))
}