mirror of
https://github.com/netbirdio/gvisor.git
synced 2026-05-22 17:12:49 -07:00
9fcf0b5b53
PiperOrigin-RevId: 705785809
414 lines
12 KiB
Go
414 lines
12 KiB
Go
// Copyright 2018 The gVisor Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package kernel
|
|
|
|
import (
|
|
"fmt"
|
|
|
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
|
"gvisor.dev/gvisor/pkg/atomicbitops"
|
|
"gvisor.dev/gvisor/pkg/context"
|
|
"gvisor.dev/gvisor/pkg/errors/linuxerr"
|
|
"gvisor.dev/gvisor/pkg/hostarch"
|
|
"gvisor.dev/gvisor/pkg/sentry/inet"
|
|
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
|
|
"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
|
|
"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
|
|
"gvisor.dev/gvisor/pkg/sentry/usage"
|
|
"gvisor.dev/gvisor/pkg/sentry/vfs"
|
|
)
|
|
|
|
// TaskConfig defines the configuration of a new Task (see below).
|
|
type TaskConfig struct {
|
|
// Kernel is the owning Kernel.
|
|
Kernel *Kernel
|
|
|
|
// Parent is the new task's parent. Parent may be nil.
|
|
Parent *Task
|
|
|
|
// If InheritParent is not nil, use InheritParent's parent as the new
|
|
// task's parent.
|
|
InheritParent *Task
|
|
|
|
// ThreadGroup is the ThreadGroup the new task belongs to.
|
|
ThreadGroup *ThreadGroup
|
|
|
|
// SignalMask is the new task's initial signal mask.
|
|
SignalMask linux.SignalSet
|
|
|
|
// TaskImage is the TaskImage of the new task. Ownership of the
|
|
// TaskImage is transferred to TaskSet.NewTask, whether or not it
|
|
// succeeds.
|
|
TaskImage *TaskImage
|
|
|
|
// FSContext is the FSContext of the new task. A reference must be held on
|
|
// FSContext, which is transferred to TaskSet.NewTask whether or not it
|
|
// succeeds.
|
|
FSContext *FSContext
|
|
|
|
// FDTable is the FDTableof the new task. A reference must be held on
|
|
// FDMap, which is transferred to TaskSet.NewTask whether or not it
|
|
// succeeds.
|
|
FDTable *FDTable
|
|
|
|
// Credentials is the Credentials of the new task.
|
|
Credentials *auth.Credentials
|
|
|
|
// Niceness is the niceness of the new task.
|
|
Niceness int
|
|
|
|
// NetworkNamespace is the network namespace to be used for the new task.
|
|
NetworkNamespace *inet.Namespace
|
|
|
|
// AllowedCPUMask contains the cpus that this task can run on.
|
|
AllowedCPUMask sched.CPUSet
|
|
|
|
// UTSNamespace is the UTSNamespace of the new task.
|
|
UTSNamespace *UTSNamespace
|
|
|
|
// IPCNamespace is the IPCNamespace of the new task.
|
|
IPCNamespace *IPCNamespace
|
|
|
|
// MountNamespace is the MountNamespace of the new task.
|
|
MountNamespace *vfs.MountNamespace
|
|
|
|
// RSeqAddr is a pointer to the userspace linux.RSeq structure.
|
|
RSeqAddr hostarch.Addr
|
|
|
|
// RSeqSignature is the signature that the rseq abort IP must be signed
|
|
// with.
|
|
RSeqSignature uint32
|
|
|
|
// ContainerID is the container the new task belongs to.
|
|
ContainerID string
|
|
|
|
// InitialCgroups are the cgroups the container is initialised to.
|
|
InitialCgroups map[Cgroup]struct{}
|
|
|
|
// UserCounters is user resource counters.
|
|
UserCounters *UserCounters
|
|
|
|
// SessionKeyring is the session keyring associated with the parent task.
|
|
// It may be nil.
|
|
SessionKeyring *auth.Key
|
|
|
|
Origin TaskOrigin
|
|
}
|
|
|
|
// NewTask creates a new task defined by cfg.
|
|
//
|
|
// NewTask does not start the returned task; the caller must call Task.Start.
|
|
//
|
|
// If successful, NewTask transfers references held by cfg to the new task.
|
|
// Otherwise, NewTask releases them.
|
|
func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
|
|
var err error
|
|
cleanup := func() {
|
|
cfg.TaskImage.release(ctx)
|
|
cfg.FSContext.DecRef(ctx)
|
|
cfg.FDTable.DecRef(ctx)
|
|
cfg.UTSNamespace.DecRef(ctx)
|
|
cfg.IPCNamespace.DecRef(ctx)
|
|
cfg.NetworkNamespace.DecRef(ctx)
|
|
if cfg.MountNamespace != nil {
|
|
cfg.MountNamespace.DecRef(ctx)
|
|
}
|
|
}
|
|
if err := cfg.UserCounters.incRLimitNProc(ctx); err != nil {
|
|
cleanup()
|
|
return nil, err
|
|
}
|
|
t, err := ts.newTask(ctx, cfg)
|
|
if err != nil {
|
|
cfg.UserCounters.decRLimitNProc()
|
|
cleanup()
|
|
return nil, err
|
|
}
|
|
return t, nil
|
|
}
|
|
|
|
// newTask is a helper for TaskSet.NewTask that only takes ownership of parts
|
|
// of cfg if it succeeds.
|
|
func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
|
|
srcT := TaskFromContext(ctx)
|
|
tg := cfg.ThreadGroup
|
|
image := cfg.TaskImage
|
|
t := &Task{
|
|
taskNode: taskNode{
|
|
tg: tg,
|
|
parent: cfg.Parent,
|
|
children: make(map[*Task]struct{}),
|
|
},
|
|
runState: (*runApp)(nil),
|
|
interruptChan: make(chan struct{}, 1),
|
|
signalMask: atomicbitops.FromUint64(uint64(cfg.SignalMask)),
|
|
signalStack: linux.SignalStack{Flags: linux.SS_DISABLE},
|
|
image: *image,
|
|
fsContext: cfg.FSContext,
|
|
fdTable: cfg.FDTable,
|
|
k: cfg.Kernel,
|
|
ptraceTracees: make(map[*Task]struct{}),
|
|
allowedCPUMask: cfg.AllowedCPUMask.Copy(),
|
|
ioUsage: &usage.IO{},
|
|
niceness: cfg.Niceness,
|
|
utsns: cfg.UTSNamespace,
|
|
ipcns: cfg.IPCNamespace,
|
|
mountNamespace: cfg.MountNamespace,
|
|
rseqCPU: -1,
|
|
rseqAddr: cfg.RSeqAddr,
|
|
rseqSignature: cfg.RSeqSignature,
|
|
futexWaiter: futex.NewWaiter(),
|
|
containerID: cfg.ContainerID,
|
|
cgroups: make(map[Cgroup]struct{}),
|
|
userCounters: cfg.UserCounters,
|
|
sessionKeyring: cfg.SessionKeyring,
|
|
Origin: cfg.Origin,
|
|
onDestroyAction: make(map[TaskDestroyAction]struct{}),
|
|
}
|
|
t.netns = cfg.NetworkNamespace
|
|
t.creds.Store(cfg.Credentials)
|
|
t.endStopCond.L = &t.tg.signalHandlers.mu
|
|
// We don't construct t.blockingTimer until Task.run(); see that function
|
|
// for justification.
|
|
|
|
var (
|
|
cg Cgroup
|
|
charged, committed bool
|
|
)
|
|
|
|
// Reserve cgroup PIDs controller charge. This is either committed when the
|
|
// new task enters the cgroup below, or rolled back on failure.
|
|
//
|
|
// We may also get here from a non-task context (for example, when
|
|
// creating the init task, or from the exec control command). In these cases
|
|
// we skip charging the pids controller, as non-userspace task creation
|
|
// bypasses pid limits.
|
|
if srcT != nil {
|
|
var err error
|
|
if charged, cg, err = srcT.ChargeFor(t, CgroupControllerPIDs, CgroupResourcePID, 1); err != nil {
|
|
return nil, err
|
|
}
|
|
if charged {
|
|
defer func() {
|
|
if !committed {
|
|
if err := cg.Charge(t, cg.Dentry, CgroupControllerPIDs, CgroupResourcePID, -1); err != nil {
|
|
panic(fmt.Sprintf("Failed to clean up PIDs charge on task creation failure: %v", err))
|
|
}
|
|
}
|
|
// Ref from ChargeFor. Note that we need to drop this outside of
|
|
// TaskSet.mu critical sections.
|
|
cg.DecRef(ctx)
|
|
}()
|
|
}
|
|
}
|
|
|
|
// If the task was the first to be added to the thread group, check if
|
|
// it needs to be notified of CPU limits being exceeded.
|
|
// We use a defer here because we need to do this without holding the
|
|
// TaskSet or signalHandlers lock.
|
|
var isFirstTask bool
|
|
defer func() {
|
|
if isFirstTask {
|
|
tg.notifyRlimitCPUUpdated(t)
|
|
}
|
|
}()
|
|
|
|
// Make the new task (and possibly thread group) visible to the rest of
|
|
// the system atomically.
|
|
ts.mu.Lock()
|
|
defer ts.mu.Unlock()
|
|
tg.signalHandlers.mu.Lock()
|
|
defer tg.signalHandlers.mu.Unlock()
|
|
if tg.exiting || tg.execing != nil {
|
|
// If the caller is in the same thread group, then what we return
|
|
// doesn't matter too much since the caller will exit before it returns
|
|
// to userspace. If the caller isn't in the same thread group, then
|
|
// we're in uncharted territory and can return whatever we want.
|
|
return nil, linuxerr.EINTR
|
|
}
|
|
if ts.liveTasks == 0 && ts.noNewTasksIfZeroLive {
|
|
// Since liveTasks == 0, our caller cannot be a task goroutine invoking
|
|
// a syscall, so it's safe to return a non-errno error that is more
|
|
// explanatory.
|
|
return nil, fmt.Errorf("task creation disabled after Kernel.WaitExited() may have returned")
|
|
}
|
|
if err := ts.assignTIDsLocked(t); err != nil {
|
|
return nil, err
|
|
}
|
|
// Below this point, newTask is expected not to fail (there is no rollback
|
|
// of assignTIDsLocked or any of the following).
|
|
|
|
ts.liveTasks++
|
|
|
|
// Logging on t's behalf will panic if t.logPrefix hasn't been
|
|
// initialized. This is the earliest point at which we can do so
|
|
// (since t now has thread IDs).
|
|
t.updateInfoLocked()
|
|
|
|
if cfg.InheritParent != nil {
|
|
t.parent = cfg.InheritParent.parent
|
|
}
|
|
if t.parent != nil {
|
|
t.parent.children[t] = struct{}{}
|
|
}
|
|
|
|
// If InitialCgroups is not nil, the new task will be placed in the
|
|
// specified cgroups. Otherwise, if srcT is not nil, the new task will
|
|
// be placed in the srcT's cgroups. If neither is specified, the new task
|
|
// will be in the root cgroups.
|
|
t.EnterInitialCgroups(srcT, cfg.InitialCgroups)
|
|
committed = true
|
|
|
|
if isFirstTask = tg.leader == nil; isFirstTask {
|
|
// New thread group.
|
|
tg.leader = t
|
|
if parentPG := tg.parentPG(); parentPG == nil {
|
|
tg.createSession()
|
|
} else {
|
|
// Inherit the process group and terminal.
|
|
parentPG.incRefWithParent(parentPG)
|
|
tg.processGroup = parentPG
|
|
tg.tty = t.parent.tg.tty
|
|
}
|
|
|
|
// If our parent is a child subreaper, or if it has a child
|
|
// subreaper, then this new thread group does as well.
|
|
if t.parent != nil {
|
|
tg.hasChildSubreaper = t.parent.tg.isChildSubreaper || t.parent.tg.hasChildSubreaper
|
|
}
|
|
}
|
|
tg.tasks.PushBack(t)
|
|
tg.tasksCount++
|
|
tg.liveTasks++
|
|
tg.activeTasks++
|
|
|
|
// Propagate external TaskSet stops to the new task.
|
|
t.stopCount = atomicbitops.FromInt32(ts.stopCount)
|
|
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
|
|
t.cpu = atomicbitops.FromInt32(assignCPU(t.allowedCPUMask, ts.Root.tids[t]))
|
|
|
|
t.startTime = t.k.RealtimeClock().Now()
|
|
|
|
// As a final step, initialize the platform context. This may require
|
|
// other pieces to be initialized as the task is used the context.
|
|
t.p = cfg.Kernel.Platform.NewContext(t.AsyncContext())
|
|
|
|
return t, nil
|
|
}
|
|
|
|
// assignTIDsLocked ensures that new task t is visible in all PID namespaces in
|
|
// which it should be visible.
|
|
//
|
|
// Preconditions: ts.mu must be locked for writing.
|
|
func (ts *TaskSet) assignTIDsLocked(t *Task) error {
|
|
type allocatedTID struct {
|
|
ns *PIDNamespace
|
|
tid ThreadID
|
|
}
|
|
var allocatedTIDs []allocatedTID
|
|
var tid ThreadID
|
|
var err error
|
|
for ns := t.tg.pidns; ns != nil; ns = ns.parent {
|
|
if tid, err = ns.allocateTID(); err != nil {
|
|
break
|
|
}
|
|
if err = ns.addTask(t, tid); err != nil {
|
|
break
|
|
}
|
|
allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid})
|
|
}
|
|
if err != nil {
|
|
// Failure. Remove the tids we already allocated in descendant
|
|
// namespaces.
|
|
for _, a := range allocatedTIDs {
|
|
a.ns.deleteTask(t)
|
|
}
|
|
return err
|
|
}
|
|
t.tg.pidWithinNS.Store(int32(t.tg.pidns.tgids[t.tg]))
|
|
return nil
|
|
}
|
|
|
|
// allocateTID returns an unused ThreadID from ns.
|
|
//
|
|
// Preconditions: ns.owner.mu must be locked for writing.
|
|
func (ns *PIDNamespace) allocateTID() (ThreadID, error) {
|
|
if ns.exiting {
|
|
// "In this case, a subsequent fork(2) into this PID namespace will
|
|
// fail with the error ENOMEM; it is not possible to create a new
|
|
// processes [sic] in a PID namespace whose init process has
|
|
// terminated." - pid_namespaces(7)
|
|
return 0, linuxerr.ENOMEM
|
|
}
|
|
tid := ns.last
|
|
for {
|
|
// Next.
|
|
tid++
|
|
if tid > TasksLimit {
|
|
tid = initTID + 1
|
|
}
|
|
|
|
// Is it available?
|
|
tidInUse := func() bool {
|
|
if _, ok := ns.tasks[tid]; ok {
|
|
return true
|
|
}
|
|
if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok {
|
|
return true
|
|
}
|
|
if _, ok := ns.sessions[SessionID(tid)]; ok {
|
|
return true
|
|
}
|
|
return false
|
|
}()
|
|
|
|
if !tidInUse {
|
|
ns.last = tid
|
|
return tid, nil
|
|
}
|
|
|
|
// Did we do a full cycle?
|
|
if tid == ns.last {
|
|
// No tid available.
|
|
return 0, linuxerr.EAGAIN
|
|
}
|
|
}
|
|
}
|
|
|
|
// Start starts the task goroutine. Start must be called exactly once for each
|
|
// task returned by NewTask.
|
|
//
|
|
// 'tid' must be the task's TID in the root PID namespace and it's used for
|
|
// debugging purposes only (set as parameter to Task.run to make it visible
|
|
// in stack dumps).
|
|
func (t *Task) Start(tid ThreadID) {
|
|
// If the task was restored, it may be "starting" after having already exited.
|
|
if t.runState == nil {
|
|
return
|
|
}
|
|
t.goroutineStopped.Add(1)
|
|
t.tg.liveGoroutines.Add(1)
|
|
t.tg.pidns.owner.runningGoroutines.Add(1)
|
|
|
|
// Task is now running in system mode.
|
|
t.accountTaskGoroutineLeave(TaskGoroutineNonexistent)
|
|
|
|
// Use the task's TID in the root PID namespace to make it visible in stack dumps.
|
|
go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops
|
|
}
|