mirror of
https://github.com/netbirdio/gvisor.git
synced 2026-05-22 17:12:49 -07:00
2d90353f9f
gVisor currently implements CPU clocks as follows: - A per-sentry "CPU clock ticker goroutine" (task_sched.go:Kernel.runCPUClockTicker()) periodically advances Kernel.cpuClock, causing it to serve as a very coarse but inexpensive monotonic wall clock (that happens to be suspended when no tasks are running). - Task goroutines observe the most recent value of Kernel.cpuClock when changing state (Task.gosched.Timestamp), and use it to compute the number of CPU clock ticks that have elapsed in a given state. Thus, task CPU clocks are approximately based on the wall time during which they were marked as running. - ITIMER_VIRTUAL, ITIMER_PROF, and RLIMIT_CPU are checked by the CPU clock ticker goroutine after advancing Kernel.cpuClock. POSIX interval timers and timerfds check CPU clocks (taskClock/tgClock) in ktime.SampledTimer goroutines. This has three major problems: - ktime.SampledTimer goroutines for CPU clock timers run concurrently with the CPU clock ticker, and are not informed as to when corresponding tasks start or stop running (due to overhead on the task execution critical path), so they can't determine when CPU clocks have/will advance; instead, they simply poll CPU clocks on a period equal to that of the represented timer, resulting in significant overhead for CPU-clock-based POSIX interval timers and timerfds. - For the same reason, CPU clock interval timers and timerfds may expire much later than when the CPU clock is actually incremented; in the interval timer case, this can result in notification signals being sent long after tasks have stopped running. (This is the same problem as in b/116538398, which motivated the special-casing of ITIMER_VIRTUAL and ITIMER_PROF described above, but applied to POSIX interval timers.) - The sentry does not impose a limit on the number of tasks that may be concurrently marked running, so if more tasks are marked running than the number of CPUs advertised to applications, application CPU utilization can appear to exceed 100%. This CL fixes these problems by introducing explicit per-Task and ThreadGroup CPU clocks, directly advancing (up to Kernel.applicationCores of) them in the CPU clock ticker, and directly expiring CPU timers when doing so. Itimer and RLIMIT_CPU timers lose their special-casing and instead behave like other CPU timers (see task_acct.go). Kernel.cpuClock is still required, but only for the sentry watchdog. Minor cleanup changes: - Gather all stateify hooks in kernel_state.go. - Replace kernel.randInt31n() with math/rand/v2, which fixes the same problem (https://go.dev/blog/randv2#problem.rand). Test workload: ``` #include <err.h> #include <signal.h> #include <time.h> #include <chrono> #include <thread> constexpr int kNumTimers = 1000; constexpr long kTimerPeriodNS = 10000000; int main(int argc, char** argv) { for (int i = 0; i < kNumTimers; i++) { struct sigevent sev = {.sigev_notify = SIGEV_NONE}; timer_t timerid; if (timer_create(CLOCK_THREAD_CPUTIME_ID, &sev, &timerid) < 0) { err(1, "timer_create failed"); } struct itimerspec it = { .it_interval = {0, kTimerPeriodNS}, .it_value = {0, kTimerPeriodNS}, }; if (timer_settime(timerid, 0, &it, nullptr) < 0) { err(1, "timer_settime failed"); } } std::this_thread::sleep_for(std::chrono::seconds(5)); return 0; } ``` Before this CL: ``` # /usr/bin/time ./runsc --ignore-cgroups --platform kvm --network none do $(pwd)/workloads/threadcputimers 1.50user 0.17system 0:05.25elapsed 31%CPU (0avgtext+0avgdata 35792maxresident)k 0inputs+184outputs (10major+20889minor)pagefaults 0swaps ``` After this CL: ``` # /usr/bin/time ./runsc --ignore-cgroups --platform kvm --network none do $(pwd)/workloads/threadcputimers 0.10user 0.12system 0:05.22elapsed 4%CPU (0avgtext+0avgdata 34040maxresident)k 0inputs+192outputs (6major+20929minor)pagefaults 0swaps ``` PiperOrigin-RevId: 695198313
99 lines
2.8 KiB
Go
99 lines
2.8 KiB
Go
// Copyright 2018 The gVisor Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package kernel
|
|
|
|
import (
|
|
"context"
|
|
|
|
"gvisor.dev/gvisor/pkg/tcpip"
|
|
)
|
|
|
|
// afterLoad is invoked by stateify.
|
|
func (ts *TaskSet) afterLoad(_ context.Context) {
|
|
ts.zeroLiveTasksCond.L = &ts.mu
|
|
}
|
|
|
|
// saveDanglingEndpoints is invoked by stateify.
|
|
func (k *Kernel) saveDanglingEndpoints() []tcpip.Endpoint {
|
|
return tcpip.GetDanglingEndpoints()
|
|
}
|
|
|
|
// loadDanglingEndpoints is invoked by stateify.
|
|
func (k *Kernel) loadDanglingEndpoints(_ context.Context, es []tcpip.Endpoint) {
|
|
for _, e := range es {
|
|
tcpip.AddDanglingEndpoint(e)
|
|
}
|
|
}
|
|
|
|
// saveVforkParent is invoked by stateify.
|
|
func (t *Task) saveVforkParent() *Task {
|
|
return t.vforkParent.Load()
|
|
}
|
|
|
|
// loadVforkParent is invoked by stateify.
|
|
func (t *Task) loadVforkParent(_ context.Context, vforkParent *Task) {
|
|
t.vforkParent.Store(vforkParent)
|
|
}
|
|
|
|
// savePtraceTracer is invoked by stateify.
|
|
func (t *Task) savePtraceTracer() *Task {
|
|
return t.ptraceTracer.Load()
|
|
}
|
|
|
|
// loadPtraceTracer is invoked by stateify.
|
|
func (t *Task) loadPtraceTracer(_ context.Context, tracer *Task) {
|
|
t.ptraceTracer.Store(tracer)
|
|
}
|
|
|
|
// saveSeccomp is invoked by stateify.
|
|
func (t *Task) saveSeccomp() *taskSeccomp {
|
|
return t.seccomp.Load()
|
|
}
|
|
|
|
// loadSeccomp is invoked by stateify.
|
|
func (t *Task) loadSeccomp(_ context.Context, seccompData *taskSeccomp) {
|
|
t.seccomp.Store(seccompData)
|
|
}
|
|
|
|
// saveAppCPUClockLast is invoked by stateify.
|
|
func (tg *ThreadGroup) saveAppCPUClockLast() *Task {
|
|
return tg.appCPUClockLast.Load()
|
|
}
|
|
|
|
// loadAppCPUClockLast is invoked by stateify.
|
|
func (tg *ThreadGroup) loadAppCPUClockLast(_ context.Context, task *Task) {
|
|
tg.appCPUClockLast.Store(task)
|
|
}
|
|
|
|
// saveAppSysCPUClockLast is invoked by stateify.
|
|
func (tg *ThreadGroup) saveAppSysCPUClockLast() *Task {
|
|
return tg.appSysCPUClockLast.Load()
|
|
}
|
|
|
|
// loadAppSysCPUClockLast is invoked by stateify.
|
|
func (tg *ThreadGroup) loadAppSysCPUClockLast(_ context.Context, task *Task) {
|
|
tg.appSysCPUClockLast.Store(task)
|
|
}
|
|
|
|
// saveOldRSeqCritical is invoked by stateify.
|
|
func (tg *ThreadGroup) saveOldRSeqCritical() *OldRSeqCriticalRegion {
|
|
return tg.oldRSeqCritical.Load()
|
|
}
|
|
|
|
// loadOldRSeqCritical is invoked by stateify.
|
|
func (tg *ThreadGroup) loadOldRSeqCritical(_ context.Context, r *OldRSeqCriticalRegion) {
|
|
tg.oldRSeqCritical.Store(r)
|
|
}
|