mirror of
https://github.com/netbirdio/gvisor.git
synced 2026-05-22 17:12:49 -07:00
2d90353f9f
gVisor currently implements CPU clocks as follows:

- A per-sentry "CPU clock ticker goroutine" (task_sched.go:Kernel.runCPUClockTicker()) periodically advances Kernel.cpuClock, causing it to serve as a very coarse but inexpensive monotonic wall clock (that happens to be suspended when no tasks are running).

- Task goroutines observe the most recent value of Kernel.cpuClock when changing state (Task.gosched.Timestamp), and use it to compute the number of CPU clock ticks that have elapsed in a given state. Thus, task CPU clocks are approximately based on the wall time during which they were marked as running.

- ITIMER_VIRTUAL, ITIMER_PROF, and RLIMIT_CPU are checked by the CPU clock ticker goroutine after advancing Kernel.cpuClock. POSIX interval timers and timerfds check CPU clocks (taskClock/tgClock) in ktime.SampledTimer goroutines.

This has three major problems:

- ktime.SampledTimer goroutines for CPU clock timers run concurrently with the CPU clock ticker, and are not informed as to when corresponding tasks start or stop running (due to overhead on the task execution critical path), so they can't determine when CPU clocks have advanced or will advance; instead, they simply poll CPU clocks on a period equal to that of the represented timer, resulting in significant overhead for CPU-clock-based POSIX interval timers and timerfds.

- For the same reason, CPU clock interval timers and timerfds may expire much later than when the CPU clock is actually incremented; in the interval timer case, this can result in notification signals being sent long after tasks have stopped running. (This is the same problem as in b/116538398, which motivated the special-casing of ITIMER_VIRTUAL and ITIMER_PROF described above, but applied to POSIX interval timers.)

- The sentry does not impose a limit on the number of tasks that may be concurrently marked running, so if more tasks are marked running than the number of CPUs advertised to applications, application CPU utilization can appear to exceed 100%.

This CL fixes these problems by introducing explicit per-Task and ThreadGroup CPU clocks, directly advancing (up to Kernel.applicationCores of) them in the CPU clock ticker, and directly expiring CPU timers when doing so. Itimer and RLIMIT_CPU timers lose their special-casing and instead behave like other CPU timers (see task_acct.go). Kernel.cpuClock is still required, but only for the sentry watchdog.

Minor cleanup changes:

- Gather all stateify hooks in kernel_state.go.

- Replace kernel.randInt31n() with math/rand/v2, which fixes the same problem (https://go.dev/blog/randv2#problem.rand).

Test workload:

```
#include <err.h>
#include <signal.h>
#include <time.h>

#include <chrono>
#include <thread>

constexpr int kNumTimers = 1000;
constexpr long kTimerPeriodNS = 10000000;

int main(int argc, char** argv) {
  for (int i = 0; i < kNumTimers; i++) {
    struct sigevent sev = {.sigev_notify = SIGEV_NONE};
    timer_t timerid;
    if (timer_create(CLOCK_THREAD_CPUTIME_ID, &sev, &timerid) < 0) {
      err(1, "timer_create failed");
    }
    struct itimerspec it = {
        .it_interval = {0, kTimerPeriodNS},
        .it_value = {0, kTimerPeriodNS},
    };
    if (timer_settime(timerid, 0, &it, nullptr) < 0) {
      err(1, "timer_settime failed");
    }
  }
  std::this_thread::sleep_for(std::chrono::seconds(5));
  return 0;
}
```

Before this CL:

```
# /usr/bin/time ./runsc --ignore-cgroups --platform kvm --network none do $(pwd)/workloads/threadcputimers
1.50user 0.17system 0:05.25elapsed 31%CPU (0avgtext+0avgdata 35792maxresident)k
0inputs+184outputs (10major+20889minor)pagefaults 0swaps
```

After this CL:

```
# /usr/bin/time ./runsc --ignore-cgroups --platform kvm --network none do $(pwd)/workloads/threadcputimers
0.10user 0.12system 0:05.22elapsed 4%CPU (0avgtext+0avgdata 34040maxresident)k
0inputs+192outputs (6major+20929minor)pagefaults 0swaps
```

PiperOrigin-RevId: 695198313
444 lines
14 KiB
Go
444 lines
14 KiB
Go
// Copyright 2018 The gVisor Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package kernel
|
|
|
|
// CPU scheduling, real and fake.
|
|
|
|
import (
|
|
"fmt"
|
|
"math/rand/v2"
|
|
"time"
|
|
|
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
|
"gvisor.dev/gvisor/pkg/errors/linuxerr"
|
|
"gvisor.dev/gvisor/pkg/sentry/hostcpu"
|
|
"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
|
|
"gvisor.dev/gvisor/pkg/sentry/ktime"
|
|
"gvisor.dev/gvisor/pkg/sentry/usage"
|
|
)
|
|
|
|
// TaskGoroutineState coarsely describes what a kernel.Task goroutine is
// currently doing.
type TaskGoroutineState uint32

// Values of TaskGoroutineState. The numeric values are assigned by iota in
// declaration order, starting from zero.
const (
	// TaskGoroutineNonexistent means that the task goroutine does not
	// currently exist: either Task.Start() has not created it yet, or it has
	// already returned from Task.run(). This must remain the zero value of
	// TaskGoroutineState.
	TaskGoroutineNonexistent TaskGoroutineState = iota

	// TaskGoroutineRunningSys means that the task goroutine is running sentry
	// code.
	TaskGoroutineRunningSys

	// TaskGoroutineRunningApp means that the task goroutine is running
	// application code.
	TaskGoroutineRunningApp

	// TaskGoroutineBlockedInterruptible means that the task goroutine is
	// blocked inside Task.block(), so Task.interrupt() (e.g. on signal
	// delivery) may wake it.
	TaskGoroutineBlockedInterruptible

	// TaskGoroutineBlockedUninterruptible means that the task goroutine is
	// stopped somewhere other than Task.block() or Task.doStop(), so
	// Task.interrupt() cannot wake it.
	TaskGoroutineBlockedUninterruptible

	// TaskGoroutineStopped means that the task goroutine is blocked inside
	// Task.doStop(). It resembles TaskGoroutineBlockedUninterruptible, but is
	// kept distinct so that it is possible to tell when Task.stop is
	// meaningful.
	TaskGoroutineStopped
)
|
|
|
|
// TaskGoroutineState returns the current state of the task goroutine.
|
|
func (t *Task) TaskGoroutineState() TaskGoroutineState {
|
|
return TaskGoroutineState(t.gostate.Load())
|
|
}
|
|
|
|
// TaskGoroutineStateTime returns the current state of the task goroutine, and
|
|
// the value of Kernel.CPUClockNow() when that state was last updated or
|
|
// refreshed.
|
|
func (t *Task) TaskGoroutineStateTime() (state TaskGoroutineState, time ktime.Time) {
|
|
for {
|
|
epoch := t.gostateSeq.BeginRead()
|
|
state = t.TaskGoroutineState()
|
|
time = ktime.FromNanoseconds(t.gostateTime.Load())
|
|
if t.gostateSeq.ReadOk(epoch) {
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// Preconditions: The caller must be running on the task goroutine.
|
|
func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) {
|
|
if oldState := t.TaskGoroutineState(); oldState != TaskGoroutineRunningSys {
|
|
panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", oldState, TaskGoroutineRunningSys, state))
|
|
}
|
|
t.gostateSeq.BeginWrite()
|
|
t.gostate.Store(uint32(state))
|
|
t.touchGostateTime()
|
|
t.gostateSeq.EndWrite()
|
|
if state != TaskGoroutineRunningApp {
|
|
// Task is blocking/stopping.
|
|
t.k.decRunningTasks()
|
|
}
|
|
}
|
|
|
|
// Preconditions:
|
|
// - The caller must be running on the task goroutine.
|
|
// - The caller must be leaving a state indicated by a previous call to
|
|
// t.accountTaskGoroutineEnter(state).
|
|
func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
|
|
if state != TaskGoroutineRunningApp {
|
|
// Task is unblocking/continuing.
|
|
t.k.incRunningTasks()
|
|
}
|
|
if oldState := t.TaskGoroutineState(); oldState != state {
|
|
panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", oldState, state, TaskGoroutineRunningSys))
|
|
}
|
|
t.gostateSeq.BeginWrite()
|
|
t.gostate.Store(uint32(TaskGoroutineRunningSys))
|
|
t.touchGostateTime()
|
|
t.gostateSeq.EndWrite()
|
|
}
|
|
|
|
// Preconditions: The caller must be running on the task goroutine.
|
|
func (t *Task) accountTaskGoroutineRunning() {
|
|
if oldState := t.TaskGoroutineState(); oldState != TaskGoroutineRunningSys {
|
|
panic(fmt.Sprintf("Task goroutine in state %v (expected %v)", oldState, TaskGoroutineRunningSys))
|
|
}
|
|
t.touchGostateTime()
|
|
}
|
|
|
|
// Preconditions: The caller must be running on the task goroutine.
|
|
func (t *Task) touchGostateTime() {
|
|
t.gostateTime.Store(t.k.cpuClock.Load())
|
|
}
|
|
|
|
// CPUClockNow returns the current value of the kernel CPU clock, which
|
|
// coarsely approximates wall time but is suspended when no tasks are running.
|
|
func (k *Kernel) CPUClockNow() ktime.Time {
|
|
return ktime.FromNanoseconds(k.cpuClock.Load())
|
|
}
|
|
|
|
// UserCPUClock returns a clock measuring the CPU time the task has spent
|
|
// executing application code.
|
|
func (t *Task) UserCPUClock() ktime.Clock {
|
|
return &t.appCPUClock
|
|
}
|
|
|
|
// CPUClock returns a clock measuring the CPU time the task has spent executing
|
|
// application and "kernel" code.
|
|
func (t *Task) CPUClock() ktime.Clock {
|
|
return &t.appSysCPUClock
|
|
}
|
|
|
|
// UserCPUClock returns a ktime.Clock that measures the time that a thread
|
|
// group has spent executing.
|
|
func (tg *ThreadGroup) UserCPUClock() ktime.Clock {
|
|
return &tg.appCPUClock
|
|
}
|
|
|
|
// CPUClock returns a ktime.Clock that measures the time that a thread group
|
|
// has spent executing, including sentry time.
|
|
func (tg *ThreadGroup) CPUClock() ktime.Clock {
|
|
return &tg.appSysCPUClock
|
|
}
|
|
|
|
// CPUStats returns the CPU usage statistics of t.
|
|
func (t *Task) CPUStats() usage.CPUStats {
|
|
// The CPU clock ticker advances t.appCPUClock before t.appSysCPUClock, so
|
|
// it's possible for the former to transiently exceed the latter.
|
|
appNS := t.appCPUClock.Now().Nanoseconds()
|
|
appSysNS := t.appSysCPUClock.Now().Nanoseconds()
|
|
sysNS := max(appSysNS-appNS, 0)
|
|
return usage.CPUStats{
|
|
UserTime: time.Duration(appNS),
|
|
SysTime: time.Duration(sysNS),
|
|
VoluntarySwitches: t.yieldCount.Load(),
|
|
}
|
|
}
|
|
|
|
// CPUStats returns the combined CPU usage statistics of all past and present
|
|
// threads in tg.
|
|
func (tg *ThreadGroup) CPUStats() usage.CPUStats {
|
|
// The CPU clock ticker advances tg.appCPUClock before tg.appSysCPUClock,
|
|
// so it's possible for the former to transiently exceed the latter.
|
|
appNS := tg.appCPUClock.Now().Nanoseconds()
|
|
appSysNS := tg.appSysCPUClock.Now().Nanoseconds()
|
|
sysNS := max(appSysNS-appNS, 0)
|
|
return usage.CPUStats{
|
|
UserTime: time.Duration(appNS),
|
|
SysTime: time.Duration(sysNS),
|
|
VoluntarySwitches: tg.yieldCount.Load(),
|
|
}
|
|
}
|
|
|
|
// JoinedChildCPUStats implements the semantics of RUSAGE_CHILDREN: "Return
|
|
// resource usage statistics for all children of [tg] that have terminated and
|
|
// been waited for. These statistics will include the resources used by
|
|
// grandchildren, and further removed descendants, if all of the intervening
|
|
// descendants waited on their terminated children."
|
|
func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats {
|
|
tg.pidns.owner.mu.RLock()
|
|
defer tg.pidns.owner.mu.RUnlock()
|
|
return tg.childCPUStats
|
|
}
|
|
|
|
func (k *Kernel) runCPUClockTicker() {
|
|
// Storage reused between iterations of the main loop:
|
|
var (
|
|
allTasks []*Task
|
|
incTasks = make([]*Task, k.applicationCores)
|
|
)
|
|
|
|
for {
|
|
// Stop CPU clocks while nothing is running.
|
|
if k.runningTasks.Load() == 0 {
|
|
k.runningTasksMu.Lock()
|
|
if k.runningTasks.Load() == 0 {
|
|
k.cpuClockTickerRunning = false
|
|
k.cpuClockTickerStopCond.Broadcast()
|
|
k.runningTasksCond.Wait()
|
|
// k.cpuClockTickerRunning was set to true by our waker
|
|
// (Kernel.incRunningTasks()). For reasons described there, we must
|
|
// process at least one CPU clock tick between calls to
|
|
// k.runningTasksCond.Wait().
|
|
}
|
|
k.runningTasksMu.Unlock()
|
|
}
|
|
|
|
// Wait for the next CPU clock tick.
|
|
select {
|
|
case <-k.cpuClockTickTimer.C:
|
|
k.cpuClockTickTimer.Reset(linux.ClockTick)
|
|
case <-k.cpuClockTickerWakeCh:
|
|
continue
|
|
}
|
|
|
|
// Advance the "kernel CPU clock".
|
|
k.cpuClock.Add(linux.ClockTick.Nanoseconds())
|
|
|
|
// Advance CPU clocks. gVisor generally has no knowledge of when sentry
|
|
// or application code is actually running on a CPU (due to Go and/or
|
|
// host kernel scheduling, with significant variation between
|
|
// platforms), so CPU clocks are approximated. We do so by choosing up
|
|
// to applicationCores running tasks (randomly, using reservoir
|
|
// sampling) and accounting a full CPU clock tick to each of those
|
|
// tasks. The alternative would be to distribute CPU time evenly to all
|
|
// running tasks, but:
|
|
//
|
|
// - If the CPU time per task is between 0 and 1 (nanoseconds), then
|
|
// neither rounded value is desirable: 0 would cause all CPU clocks to
|
|
// cease advancing, while 1 would cause the total CPU time accrued by
|
|
// all tasks to exceed the number of claimed CPUs.
|
|
//
|
|
// - This would require us to mutate CPU clocks and check timers for
|
|
// all running tasks and their thread groups, rather than only up to
|
|
// applicationCores running tasks (and their thread groups).
|
|
allTasks = k.tasks.Root.TasksAppend(allTasks)
|
|
runningTasks := 0
|
|
for _, t := range allTasks {
|
|
state := t.TaskGoroutineState()
|
|
if state != TaskGoroutineRunningApp && state != TaskGoroutineRunningSys {
|
|
continue
|
|
}
|
|
if runningTasks < len(incTasks) {
|
|
incTasks[runningTasks] = t
|
|
runningTasks++
|
|
continue
|
|
}
|
|
runningTasks++
|
|
if i := rand.IntN(runningTasks); i < len(incTasks) {
|
|
incTasks[i] = t
|
|
}
|
|
}
|
|
numIncTasks := min(runningTasks, len(incTasks))
|
|
// Shuffle incTasks to ensure that if multiple tasks are in the same
|
|
// thread group, then all are equally likely to be
|
|
// ThreadGroup.app[Sys]CPUClockLast when a ThreadGroup CPU timer fires.
|
|
rand.Shuffle(numIncTasks, func(i, j int) {
|
|
incTasks[i], incTasks[j] = incTasks[j], incTasks[i]
|
|
})
|
|
for _, t := range incTasks[:numIncTasks] {
|
|
switch t.TaskGoroutineState() {
|
|
case TaskGoroutineRunningApp:
|
|
t.appCPUClock.Add(linux.ClockTick)
|
|
t.tg.appCPUClockLast.Store(t)
|
|
t.tg.appCPUClock.Add(linux.ClockTick)
|
|
fallthrough
|
|
case TaskGoroutineRunningSys:
|
|
t.appSysCPUClock.Add(linux.ClockTick)
|
|
t.tg.appSysCPUClockLast.Store(t)
|
|
t.tg.appSysCPUClock.Add(linux.ClockTick)
|
|
}
|
|
}
|
|
|
|
// Reset storage for the next iteration.
|
|
clear(allTasks)
|
|
allTasks = allTasks[:0]
|
|
clear(incTasks[:numIncTasks])
|
|
}
|
|
}
|
|
|
|
// StateStatus returns a string representation of the task's current state,
|
|
// appropriate for /proc/[pid]/status.
|
|
func (t *Task) StateStatus() string {
|
|
switch s := t.TaskGoroutineState(); s {
|
|
case TaskGoroutineNonexistent, TaskGoroutineRunningSys:
|
|
switch t.ExitState() {
|
|
case TaskExitZombie:
|
|
return "Z (zombie)"
|
|
case TaskExitDead:
|
|
return "X (dead)"
|
|
default:
|
|
// The task goroutine can't exit before passing through
|
|
// runExitNotify, so if s == TaskGoroutineNonexistent, the task has
|
|
// been created but the task goroutine hasn't yet started. The
|
|
// Linux equivalent is struct task_struct::state == TASK_NEW
|
|
// (kernel/fork.c:copy_process() =>
|
|
// kernel/sched/core.c:sched_fork()), but the TASK_NEW bit is
|
|
// masked out by TASK_REPORT for /proc/[pid]/status, leaving only
|
|
// TASK_RUNNING.
|
|
return "R (running)"
|
|
}
|
|
case TaskGoroutineRunningApp:
|
|
return "R (running)"
|
|
case TaskGoroutineBlockedInterruptible:
|
|
return "S (sleeping)"
|
|
case TaskGoroutineStopped:
|
|
sh := t.tg.signalLock()
|
|
defer sh.mu.Unlock()
|
|
switch t.stop.(type) {
|
|
case *groupStop:
|
|
return "T (stopped)"
|
|
case *ptraceStop:
|
|
return "t (tracing stop)"
|
|
}
|
|
fallthrough
|
|
case TaskGoroutineBlockedUninterruptible:
|
|
// This is the name Linux uses for TASK_UNINTERRUPTIBLE and
|
|
// TASK_KILLABLE (= TASK_UNINTERRUPTIBLE | TASK_WAKEKILL):
|
|
// fs/proc/array.c:task_state_array.
|
|
return "D (disk sleep)"
|
|
default:
|
|
panic(fmt.Sprintf("Invalid TaskGoroutineState: %v", s))
|
|
}
|
|
}
|
|
|
|
// CPUMask returns a copy of t's allowed CPU mask.
|
|
func (t *Task) CPUMask() sched.CPUSet {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
return t.allowedCPUMask.Copy()
|
|
}
|
|
|
|
// SetCPUMask sets t's allowed CPU mask based on mask. It takes ownership of
|
|
// mask.
|
|
//
|
|
// Preconditions: mask.Size() ==
|
|
// sched.CPUSetSize(t.Kernel().ApplicationCores()).
|
|
func (t *Task) SetCPUMask(mask sched.CPUSet) error {
|
|
if want := sched.CPUSetSize(t.k.applicationCores); mask.Size() != want {
|
|
panic(fmt.Sprintf("Invalid CPUSet %v (expected %d bytes)", mask, want))
|
|
}
|
|
|
|
// Remove CPUs in mask above Kernel.applicationCores.
|
|
mask.ClearAbove(t.k.applicationCores)
|
|
|
|
// Ensure that at least 1 CPU is still allowed.
|
|
if mask.NumCPUs() == 0 {
|
|
return linuxerr.EINVAL
|
|
}
|
|
|
|
if t.k.useHostCores {
|
|
// No-op; pretend the mask was immediately changed back.
|
|
return nil
|
|
}
|
|
|
|
t.tg.pidns.owner.mu.RLock()
|
|
rootTID := t.tg.pidns.owner.Root.tids[t]
|
|
t.tg.pidns.owner.mu.RUnlock()
|
|
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
t.allowedCPUMask = mask
|
|
t.cpu.Store(assignCPU(mask, rootTID))
|
|
return nil
|
|
}
|
|
|
|
// CPU returns the cpu id for a given task.
|
|
func (t *Task) CPU() int32 {
|
|
if t.k.useHostCores {
|
|
return int32(hostcpu.GetCPU())
|
|
}
|
|
|
|
return t.cpu.Load()
|
|
}
|
|
|
|
// assignCPU returns the virtualized CPU number for the task with global TID
|
|
// tid and allowedCPUMask allowed.
|
|
func assignCPU(allowed sched.CPUSet, tid ThreadID) (cpu int32) {
|
|
// To pretend that threads are evenly distributed to allowed CPUs, choose n
|
|
// to be less than the number of CPUs in allowed ...
|
|
n := int(tid) % int(allowed.NumCPUs())
|
|
// ... then pick the nth CPU in allowed.
|
|
allowed.ForEachCPU(func(c uint) {
|
|
if n == 0 {
|
|
cpu = int32(c)
|
|
}
|
|
n--
|
|
})
|
|
return cpu
|
|
}
|
|
|
|
// Niceness returns t's niceness.
|
|
func (t *Task) Niceness() int {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
return t.niceness
|
|
}
|
|
|
|
// Priority returns t's priority.
|
|
func (t *Task) Priority() int {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
return t.niceness + 20
|
|
}
|
|
|
|
// SetNiceness sets t's niceness to n.
|
|
func (t *Task) SetNiceness(n int) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
t.niceness = n
|
|
}
|
|
|
|
// NumaPolicy returns t's current numa policy.
|
|
func (t *Task) NumaPolicy() (policy linux.NumaPolicy, nodeMask uint64) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
return t.numaPolicy, t.numaNodeMask
|
|
}
|
|
|
|
// SetNumaPolicy sets t's numa policy.
|
|
func (t *Task) SetNumaPolicy(policy linux.NumaPolicy, nodeMask uint64) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
t.numaPolicy = policy
|
|
t.numaNodeMask = nodeMask
|
|
}
|