Files
Ayush Ranjan e699298d58 Add Container.RestoreInTest() to handle known Docker bugs.
This can be used by all test users. Avoids duplicated code. We can handle all
known issues in one place.

There is a Docker bug which causes restore to fail sporadically. See
https://github.com/moby/moby/issues/42900. This has been broken at least since
Docker v19.03.12 (when the issue was reported) and was fixed in v25.0.4. Added
the handling for this issue.

Also got rid of the testutil.Poll() around restore. That can hide gVisor
restore flakiness issues. That was added in 0990ef7517 ("Make
checkpoint/restore e2e test less flaky"). The original sleep has been restored.

PiperOrigin-RevId: 734303878
2025-03-06 15:13:16 -08:00

79 lines
2.3 KiB
Go

// Copyright 2023 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package sr_test runs checkpoint/restore tests for nvproxy.
package sr_test
import (
"strings"
"testing"
"time"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/test/dockerutil"
"gvisor.dev/gvisor/pkg/test/testutil"
)
// TestGPUCheckpointRestore checks that a GPU workload can still be run in a
// container after that container has been checkpointed and restored.
//
// The test is skipped when testutil.IsCheckpointSupported reports false.
// Otherwise it spawns a "gpu/cuda-tests" container running "sleep infinity",
// runs the vectorAdd sample inside it, checkpoints the container, restores it
// from that checkpoint, and runs the same sample a second time.
func TestGPUCheckpointRestore(t *testing.T) {
	if !testutil.IsCheckpointSupported() {
		t.Skip("Checkpoint is not supported.")
	}
	dockerutil.EnsureDockerExperimentalEnabled()

	ctx := context.Background()
	c := dockerutil.MakeContainer(ctx, t)
	defer c.CleanUp(ctx)

	opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
		Capabilities: "all",
	})
	if err != nil {
		t.Fatalf("failed to get GPU run options: %v", err)
	}
	opts.Image = "gpu/cuda-tests"
	// The container's main process only sleeps; the GPU work is driven via
	// Exec below, both before the checkpoint and after the restore.
	if err := c.Spawn(ctx, opts, "sleep", "infinity"); err != nil {
		t.Fatalf("could not start cuda-tests container: %v", err)
	}
	// Dump the container logs when the test function exits. Deferred calls
	// also run on the t.Fatalf paths below, so failures come with logs.
	defer func() {
		logs, err := c.Logs(ctx)
		if err != nil {
			t.Errorf("Could not get container logs: %v", err)
		}
		// Logged even when Logs returned an error, so that whatever was
		// returned alongside the error is not lost.
		t.Logf("Container logs:\n%v", logs)
	}()

	// Run the vector add program.
	vectorAddCmd := []string{"/run_sample", "--timeout=120s", "0_Introduction/vectorAdd"}
	if output, err := c.Exec(ctx, dockerutil.ExecOpts{}, vectorAddCmd...); err != nil {
		t.Fatalf("docker exec failed: %v; output: %v", err, strings.TrimSpace(output))
	}

	// Create a snapshot.
	const ckptName = "test"
	if err := c.Checkpoint(ctx, ckptName); err != nil {
		t.Fatalf("docker checkpoint failed: %v", err)
	}
	// NOTE(review): presumably this waits for the container to stop after the
	// checkpoint, bounded to one minute — confirm against dockerutil.
	if err := c.WaitTimeout(ctx, time.Minute); err != nil {
		t.Fatalf("wait failed: %v", err)
	}

	// Restore the snapshot. RestoreInTest receives t and returns nothing that
	// is checked here, so restore failures are presumably reported through t.
	c.RestoreInTest(ctx, t, ckptName)

	// Run the vector add program again to ensure GPUs are functional. The
	// failure message is distinct from the pre-checkpoint one and carries the
	// command output, so a post-restore GPU failure can be diagnosed directly
	// from the test log.
	if output, err := c.Exec(ctx, dockerutil.ExecOpts{}, vectorAddCmd...); err != nil {
		t.Fatalf("docker exec after restore failed: %v; output: %v", err, strings.TrimSpace(output))
	}
}