// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/inet"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
	"gvisor.dev/gvisor/pkg/sentry/usage"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// TaskConfig defines the configuration of a new Task (see below).
//
// Several fields carry a reference that TaskSet.NewTask takes over: on
// failure NewTask releases TaskImage, FSContext, FDTable, UTSNamespace,
// IPCNamespace, NetworkNamespace and (if non-nil) MountNamespace.
type TaskConfig struct {
	// Kernel is the owning Kernel.
	Kernel *Kernel

	// Parent is the new task's parent. Parent may be nil.
	Parent *Task

	// If InheritParent is not nil, use InheritParent's parent as the new
	// task's parent.
	InheritParent *Task

	// ThreadGroup is the ThreadGroup the new task belongs to.
	ThreadGroup *ThreadGroup

	// SignalMask is the new task's initial signal mask.
	SignalMask linux.SignalSet

	// TaskImage is the TaskImage of the new task. Ownership of the
	// TaskImage is transferred to TaskSet.NewTask, whether or not it
	// succeeds.
	TaskImage *TaskImage

	// FSContext is the FSContext of the new task. A reference must be held on
	// FSContext, which is transferred to TaskSet.NewTask whether or not it
	// succeeds.
	FSContext *FSContext

	// FDTable is the FDTable of the new task. A reference must be held on
	// FDTable, which is transferred to TaskSet.NewTask whether or not it
	// succeeds.
	FDTable *FDTable

	// Credentials is the Credentials of the new task.
	Credentials *auth.Credentials

	// Niceness is the niceness of the new task.
	Niceness int

	// NetworkNamespace is the network namespace to be used for the new task.
	NetworkNamespace *inet.Namespace

	// AllowedCPUMask contains the cpus that this task can run on.
	AllowedCPUMask sched.CPUSet

	// UTSNamespace is the UTSNamespace of the new task.
	UTSNamespace *UTSNamespace

	// IPCNamespace is the IPCNamespace of the new task.
	IPCNamespace *IPCNamespace

	// MountNamespace is the MountNamespace of the new task. It may be nil.
	MountNamespace *vfs.MountNamespace

	// RSeqAddr is a pointer to the userspace linux.RSeq structure.
	RSeqAddr hostarch.Addr

	// RSeqSignature is the signature that the rseq abort IP must be signed
	// with.
	RSeqSignature uint32

	// ContainerID is the container the new task belongs to.
	ContainerID string

	// InitialCgroups are the cgroups the container is initialised to.
	InitialCgroups map[Cgroup]struct{}

	// UserCounters is user resource counters.
	UserCounters *UserCounters

	// SessionKeyring is the session keyring associated with the parent task.
	// It may be nil.
	SessionKeyring *auth.Key

	// Origin is copied verbatim into the new task's Origin field.
	Origin TaskOrigin
}

// NewTask creates a new task defined by cfg.
//
// NewTask does not start the returned task; the caller must call Task.Start.
//
// If successful, NewTask transfers references held by cfg to the new task.
// Otherwise, NewTask releases them.
func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
	var err error
	// cleanup drops every reference that cfg carried in. It is only invoked
	// on the failure paths below; on success those references belong to the
	// new task.
	cleanup := func() {
		cfg.TaskImage.release(ctx)
		cfg.FSContext.DecRef(ctx)
		cfg.FDTable.DecRef(ctx)
		cfg.UTSNamespace.DecRef(ctx)
		cfg.IPCNamespace.DecRef(ctx)
		cfg.NetworkNamespace.DecRef(ctx)
		// MountNamespace is the only reference that is allowed to be nil.
		if cfg.MountNamespace != nil {
			cfg.MountNamespace.DecRef(ctx)
		}
	}
	// Account for the new task in the user's counters before creating it.
	if err := cfg.UserCounters.incRLimitNProc(ctx); err != nil {
		cleanup()
		return nil, err
	}
	t, err := ts.newTask(ctx, cfg)
	if err != nil {
		// Undo the incRLimitNProc above before releasing cfg's references.
		cfg.UserCounters.decRLimitNProc()
		cleanup()
		return nil, err
	}
	return t, nil
}

// newTask is a helper for TaskSet.NewTask that only takes ownership of parts
// of cfg if it succeeds.
//
// It builds the Task from cfg, optionally charges the PIDs cgroup controller
// of the calling task (if ctx belongs to one), and then, with ts.mu and the
// thread group's signalHandlers.mu held, assigns thread IDs and links the
// task into its parent, thread group and cgroups. On error it returns a nil
// Task and leaves releasing cfg's references to the caller.
func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
	// srcT is the task that ctx belongs to; it is nil when ctx is not a task
	// context.
	srcT := TaskFromContext(ctx)
	tg := cfg.ThreadGroup
	image := cfg.TaskImage
	t := &Task{
		taskNode: taskNode{
			tg:       tg,
			parent:   cfg.Parent,
			children: make(map[*Task]struct{}),
		},
		runState:      (*runApp)(nil),
		interruptChan: make(chan struct{}, 1),
		signalMask:    atomicbitops.FromUint64(uint64(cfg.SignalMask)),
		signalStack:   linux.SignalStack{Flags: linux.SS_DISABLE},
		// The TaskImage is copied by value into the task.
		image:           *image,
		fsContext:       cfg.FSContext,
		fdTable:         cfg.FDTable,
		k:               cfg.Kernel,
		ptraceTracees:   make(map[*Task]struct{}),
		allowedCPUMask:  cfg.AllowedCPUMask.Copy(),
		ioUsage:         &usage.IO{},
		niceness:        cfg.Niceness,
		utsns:           cfg.UTSNamespace,
		ipcns:           cfg.IPCNamespace,
		mountNamespace:  cfg.MountNamespace,
		rseqCPU:         -1,
		rseqAddr:        cfg.RSeqAddr,
		rseqSignature:   cfg.RSeqSignature,
		futexWaiter:     futex.NewWaiter(),
		containerID:     cfg.ContainerID,
		cgroups:         make(map[Cgroup]struct{}),
		userCounters:    cfg.UserCounters,
		sessionKeyring:  cfg.SessionKeyring,
		Origin:          cfg.Origin,
		onDestroyAction: make(map[TaskDestroyAction]struct{}),
	}
	t.netns = cfg.NetworkNamespace
	t.creds.Store(cfg.Credentials)
	// endStopCond shares the thread group's signalHandlers mutex as its
	// locker.
	t.endStopCond.L = &t.tg.signalHandlers.mu
	// We don't construct t.blockingTimer until Task.run(); see that function
	// for justification.

	var (
		cg                 Cgroup
		charged, committed bool
	)

	// Reserve cgroup PIDs controller charge. This is either committed when the
	// new task enters the cgroup below, or rolled back on failure.
	//
	// We may also get here from a non-task context (for example, when
	// creating the init task, or from the exec control command). In these cases
	// we skip charging the pids controller, as non-userspace task creation
	// bypasses pid limits.
	if srcT != nil {
		var err error
		if charged, cg, err = srcT.ChargeFor(t, CgroupControllerPIDs, CgroupResourcePID, 1); err != nil {
			return nil, err
		}
		if charged {
			// This is the first defer registered, so it runs last: after
			// every lock deferred further down has been released.
			defer func() {
				if !committed {
					if err := cg.Charge(t, cg.Dentry, CgroupControllerPIDs, CgroupResourcePID, -1); err != nil {
						panic(fmt.Sprintf("Failed to clean up PIDs charge on task creation failure: %v", err))
					}
				}
				// Ref from ChargeFor. Note that we need to drop this outside of
				// TaskSet.mu critical sections.
				cg.DecRef(ctx)
			}()
		}
	}

	// If the task was the first to be added to the thread group, check if
	// it needs to be notified of CPU limits being exceeded.
	// We use a defer here because we need to do this without holding the
	// TaskSet or signalHandlers lock.
	//
	// Deferred calls run in LIFO order, so registering this before the
	// unlocks below makes it run after them.
	var isFirstTask bool
	defer func() {
		if isFirstTask {
			tg.notifyRlimitCPUUpdated(t)
		}
	}()

	// Make the new task (and possibly thread group) visible to the rest of
	// the system atomically.
	ts.mu.Lock()
	defer ts.mu.Unlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()
	if tg.exiting || tg.execing != nil {
		// If the caller is in the same thread group, then what we return
		// doesn't matter too much since the caller will exit before it returns
		// to userspace. If the caller isn't in the same thread group, then
		// we're in uncharted territory and can return whatever we want.
		return nil, linuxerr.EINTR
	}
	if ts.liveTasks == 0 && ts.noNewTasksIfZeroLive {
		// Since liveTasks == 0, our caller cannot be a task goroutine invoking
		// a syscall, so it's safe to return a non-errno error that is more
		// explanatory.
		return nil, fmt.Errorf("task creation disabled after Kernel.WaitExited() may have returned")
	}
	if err := ts.assignTIDsLocked(t); err != nil {
		return nil, err
	}
	// Below this point, newTask is expected not to fail (there is no rollback
	// of assignTIDsLocked or any of the following).
	ts.liveTasks++

	// Logging on t's behalf will panic if t.logPrefix hasn't been
	// initialized. This is the earliest point at which we can do so
	// (since t now has thread IDs).
	t.updateInfoLocked()

	// InheritParent overrides cfg.Parent: the new task becomes a sibling of
	// InheritParent rather than its child.
	if cfg.InheritParent != nil {
		t.parent = cfg.InheritParent.parent
	}
	if t.parent != nil {
		t.parent.children[t] = struct{}{}
	}

	// If InitialCgroups is not nil, the new task will be placed in the
	// specified cgroups. Otherwise, if srcT is not nil, the new task will
	// be placed in the srcT's cgroups. If neither is specified, the new task
	// will be in the root cgroups.
	t.EnterInitialCgroups(srcT, cfg.InitialCgroups)
	// From here on the deferred PIDs-charge handler must not roll back.
	committed = true

	if isFirstTask = tg.leader == nil; isFirstTask {
		// New thread group.
		tg.leader = t
		if parentPG := tg.parentPG(); parentPG == nil {
			tg.createSession()
		} else {
			// Inherit the process group and terminal.
			parentPG.incRefWithParent(parentPG)
			tg.processGroup = parentPG
			tg.tty = t.parent.tg.tty
		}

		// If our parent is a child subreaper, or if it has a child
		// subreaper, then this new thread group does as well.
		if t.parent != nil {
			tg.hasChildSubreaper = t.parent.tg.isChildSubreaper || t.parent.tg.hasChildSubreaper
		}
	}
	tg.tasks.PushBack(t)
	tg.tasksCount++
	tg.liveTasks++
	tg.activeTasks++

	// Propagate external TaskSet stops to the new task.
	t.stopCount = atomicbitops.FromInt32(ts.stopCount)

	t.mu.Lock()
	defer t.mu.Unlock()

	// The CPU is chosen from the allowed mask using the task's TID in the
	// root PID namespace.
	t.cpu = atomicbitops.FromInt32(assignCPU(t.allowedCPUMask, ts.Root.tids[t]))

	t.startTime = t.k.RealtimeClock().Now()

	// As a final step, initialize the platform context. This may require
	// other pieces to be initialized as the task is used the context.
	t.p = cfg.Kernel.Platform.NewContext(t.AsyncContext())

	return t, nil
}

// assignTIDsLocked ensures that new task t is visible in all PID namespaces in
// which it should be visible.
//
// It walks from t's thread group's PID namespace up through each parent
// namespace, allocating a thread ID and registering t in every one. If any
// step fails, t is removed from the namespaces it was already added to and
// the error is returned.
//
// Preconditions: ts.mu must be locked for writing.
func (ts *TaskSet) assignTIDsLocked(t *Task) error {
	// allocatedTID records one namespace in which t was successfully
	// registered, so that the registration can be undone on failure.
	type allocatedTID struct {
		ns  *PIDNamespace
		tid ThreadID
	}
	var allocatedTIDs []allocatedTID
	var tid ThreadID
	var err error
	for ns := t.tg.pidns; ns != nil; ns = ns.parent {
		if tid, err = ns.allocateTID(); err != nil {
			break
		}
		if err = ns.addTask(t, tid); err != nil {
			break
		}
		// Only recorded once addTask has succeeded, so the namespace that
		// failed is not part of the rollback below.
		allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid})
	}
	if err != nil {
		// Failure. Remove the tids we already allocated in descendant
		// namespaces.
		for _, a := range allocatedTIDs {
			a.ns.deleteTask(t)
		}
		return err
	}
	// Cache the thread group's ID as seen from its own PID namespace.
	// NOTE(review): presumably tgids[t.tg] is populated by addTask above —
	// confirm against PIDNamespace.addTask.
	t.tg.pidWithinNS.Store(int32(t.tg.pidns.tgids[t.tg]))
	return nil
}

// allocateTID returns an unused ThreadID from ns.
//
// The search starts just after ns.last and wraps around to initTID+1 once it
// passes TasksLimit. An ID counts as in use if it is a key in ns.tasks,
// ns.processGroups or ns.sessions. On success ns.last is updated to the
// returned ID. It returns ENOMEM if ns is exiting and EAGAIN if the search
// comes back around to ns.last without finding a free ID.
//
// Preconditions: ns.owner.mu must be locked for writing.
func (ns *PIDNamespace) allocateTID() (ThreadID, error) {
	if ns.exiting {
		// "In this case, a subsequent fork(2) into this PID namespace will
		// fail with the error ENOMEM; it is not possible to create a new
		// processes [sic] in a PID namespace whose init process has
		// terminated." - pid_namespaces(7)
		return 0, linuxerr.ENOMEM
	}
	tid := ns.last
	for {
		// Next.
		tid++
		if tid > TasksLimit {
			tid = initTID + 1
		}

		// Is it available?
		tidInUse := func() bool {
			if _, ok := ns.tasks[tid]; ok {
				return true
			}
			if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok {
				return true
			}
			if _, ok := ns.sessions[SessionID(tid)]; ok {
				return true
			}
			return false
		}()

		if !tidInUse {
			ns.last = tid
			return tid, nil
		}

		// Did we do a full cycle?
		if tid == ns.last {
			// No tid available.
			return 0, linuxerr.EAGAIN
		}
	}
}

// Start starts the task goroutine. Start must be called exactly once for each
// task returned by NewTask.
// // 'tid' must be the task's TID in the root PID namespace and it's used for // debugging purposes only (set as parameter to Task.run to make it visible // in stack dumps). func (t *Task) Start(tid ThreadID) { // If the task was restored, it may be "starting" after having already exited. if t.runState == nil { return } t.goroutineStopped.Add(1) t.tg.liveGoroutines.Add(1) t.tg.pidns.owner.runningGoroutines.Add(1) // Task is now running in system mode. t.accountTaskGoroutineLeave(TaskGoroutineNonexistent) // Use the task's TID in the root PID namespace to make it visible in stack dumps. go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops }