gvisor/test/gpu/cuda_test.go

// Copyright 2024 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package cuda_test tests basic CUDA workloads.
package cuda_test

import (
	"context"
	"errors"
	"flag"
	"fmt"
	"math"
	"os"
	"runtime"
	"slices"
	"strconv"
	"strings"
	"sync"
	"testing"
	"time"

	"golang.org/x/sync/errgroup"
	"gvisor.dev/gvisor/pkg/test/dockerutil"
	"gvisor.dev/gvisor/pkg/test/testutil"
)

const (
	// defaultTestTimeout bounds how long a single CUDA sample test may run.
	defaultTestTimeout = 20 * time.Minute

	// hangingTestTimeout is the timeout applied to tests that finish quickly
	// when they pass, but never return when they don't.
	hangingTestTimeout = time.Minute

	// defaultContainersPerCPU is how many pooled containers are spawned per
	// CPU by default. Fractional values are allowed.
	// The value was found by experimentation and carries no deeper meaning:
	//   - Too low, and the test run is slower than it needs to be because
	//     there is not enough parallelism.
	//   - Too high, and the test run is *also* slower than it needs to be,
	//     because resource contention makes more tests fail in parallel
	//     mode, and each such test must then be re-run serially.
	defaultContainersPerCPU = 1.75

	// exitCodeWaived mirrors the EXIT_WAIVED constant of the CUDA samples.
	// A sample typically exits with this code when it needs a capability or
	// condition that the current test environment does not provide.
	exitCodeWaived = 2
)

// Flags.
// They control whether tests that are expected to fail are run anyway to
// verify their compatibility data, how much is logged, and how many parallel
// execution containers are spawned per CPU.
// --cuda_verify_compatibility defaults to true when the
// GVISOR_TEST_CUDA_VERIFY_COMPATIBILITY environment variable is set to "true".
var (
	verifyCompatibility = flag.Bool("cuda_verify_compatibility", os.Getenv("GVISOR_TEST_CUDA_VERIFY_COMPATIBILITY") == "true", "whether to verify that all tests are marked as compatible")
	logSuccessfulTests  = flag.Bool("cuda_log_successful_tests", false, "log console output of successful tests")
	debug               = flag.Bool("cuda_test_debug", false, "log more data as the test is running")
	containersPerCPU    = flag.Float64("cuda_containers_per_cpu", defaultContainersPerCPU, "number of parallel execution containers to spawn per CPU (floating point values allowed)")
)

// testSuiteCompatibility maps test suite names to compatibility data that
// applies to every test of that suite. A test belongs to a suite when its
// name starts with the suite name. Suite-level compatibility is combined
// with any per-test entry in `testCompatibility`.
var testSuiteCompatibility = map[string]Compatibility{
	"0_Introduction":            &NoCrossCompile{},
	"1_Utilities":               &NoCrossCompile{},
	"2_Concepts_and_Techniques": &NoCrossCompile{},
	"3_CUDA_Features":           &NoCrossCompile{},
	"4_CUDA_Libraries":          &NoCrossCompile{},
	"5_Domain_Specific":         &NoCrossCompile{},
	"6_Performance":             &NoCrossCompile{},
}

// testCompatibility holds the per-test compatibility data, keyed by test
// name ("<suite>/<sample>"). Entries are grouped by suite.
// A test without an entry here is assumed to be fully compatible.
var testCompatibility = map[string]Compatibility{
	// 0_Introduction
	"0_Introduction/simpleAttributes": RequiresFeatures(FeaturePersistentL2Caching),
	"0_Introduction/simpleCUDA2GL":    RequiresFeatures(FeatureGL),
	"0_Introduction/simpleP2P":        &RequiresP2P{},

	// 2_Concepts_and_Techniques
	"2_Concepts_and_Techniques/cuHook": &BrokenEverywhere{
		Reason: "Requires ancient version of glibc (<=2.33)",
	},
	"2_Concepts_and_Techniques/EGLStream_CUDA_Interop": &BrokenEverywhere{
		Reason: "Requires newer version of EGL libraries than Ubuntu has (eglCreateStreamKHR)",
	},
	"2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU": MultiCompatibility(
		&RequiresMultiGPU{},
		&BrokenEverywhere{
			Reason: "Requires newer version of EGL libraries than Ubuntu has (eglCreateStreamKHR)",
		},
	),
	"2_Concepts_and_Techniques/EGLSync_CUDAEvent_Interop":  &OnlyOnWindows{},
	"2_Concepts_and_Techniques/streamOrderedAllocationIPC": &BrokenInGVisor{},
	"2_Concepts_and_Techniques/streamOrderedAllocationP2P": &RequiresP2P{},

	// 3_CUDA_Features
	"3_CUDA_Features/bf16TensorCoreGemm":     RequiresFeatures(FeatureTensorCores),
	"3_CUDA_Features/cdpAdvancedQuicksort":   RequiresFeatures(FeatureDynamicParallelism),
	"3_CUDA_Features/cudaCompressibleMemory": RequiresFeatures(FeatureCompressibleMemory),
	"3_CUDA_Features/dmmaTensorCoreGemm":     RequiresFeatures(FeatureTensorCores),
	"3_CUDA_Features/memMapIPCDrv":           &RequiresMultiGPU{},
	"3_CUDA_Features/tf32TensorCoreGemm":     RequiresFeatures(FeatureTensorCores),

	// 4_CUDA_Libraries
	"4_CUDA_Libraries/conjugateGradientMultiDeviceCG": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
	"4_CUDA_Libraries/cudaNvSci":                      &RequiresNvSci{},
	"4_CUDA_Libraries/cudaNvSciNvMedia":               &RequiresNvSci{},
	"4_CUDA_Libraries/cuDLAErrorReporting":            &OnlyOnWindows{},
	"4_CUDA_Libraries/cuDLAHybridMode":                &OnlyOnWindows{},
	"4_CUDA_Libraries/cuDLAStandaloneMode":            &OnlyOnWindows{},
	"4_CUDA_Libraries/cuDLALayerwiseStatsHybrid":      &OnlyOnWindows{},
	"4_CUDA_Libraries/cuDLALayerwiseStatsStandalone":  &OnlyOnWindows{},
	"4_CUDA_Libraries/simpleCUFFT_2d_MGPU":            &RequiresMultiGPU{},
	"4_CUDA_Libraries/simpleCUFFT_MGPU":               &RequiresMultiGPU{},

	// 5_Domain_Specific
	"5_Domain_Specific/fluidsD3D9":              &OnlyOnWindows{},
	"5_Domain_Specific/fluidsGL":                RequiresFeatures(FeatureGL),
	"5_Domain_Specific/fluidsGLES":              &OnlyOnWindows{},
	"5_Domain_Specific/nbody_opengles":          &OnlyOnWindows{},
	"5_Domain_Specific/nbody_screen":            &OnlyOnWindows{},
	"5_Domain_Specific/postProcessGL":           RequiresFeatures(FeatureGL),
	"5_Domain_Specific/simpleD3D10":             &OnlyOnWindows{},
	"5_Domain_Specific/simpleD3D10RenderTarget": &OnlyOnWindows{},
	"5_Domain_Specific/simpleD3D10Texture":      &OnlyOnWindows{},
	"5_Domain_Specific/simpleD3D11":             &OnlyOnWindows{},
	"5_Domain_Specific/simpleD3D11Texture":      &OnlyOnWindows{},
	"5_Domain_Specific/simpleD3D12":             &OnlyOnWindows{},
	"5_Domain_Specific/simpleD3D9":              &OnlyOnWindows{},
	"5_Domain_Specific/simpleD3D9Texture":       &OnlyOnWindows{},
	"5_Domain_Specific/simpleGLES":              &OnlyOnWindows{},
	"5_Domain_Specific/simpleGLES_EGLOutput":    &OnlyOnWindows{},
	"5_Domain_Specific/simpleGLES_screen":       &OnlyOnWindows{},
	"5_Domain_Specific/simpleVulkan":            RequiresFeatures(FeatureGL),
	"5_Domain_Specific/simpleVulkanMMAP":        RequiresFeatures(FeatureGL),
	"5_Domain_Specific/SLID3D10Texture":         &OnlyOnWindows{},
	"5_Domain_Specific/VFlockingD3D10":          &OnlyOnWindows{},
	"5_Domain_Specific/vulkanImageCUDA":         RequiresFeatures(FeatureGL),
}

// flakyTests is the set of tests known to be flaky.
// Such tests get up to 3 attempts in parallel mode, followed by up to 3
// attempts in serial mode.
var flakyTests = map[string]struct{}{
	"3_CUDA_Features/cdpAdvancedQuicksort": struct{}{},
}

// exclusiveTests is the set of tests that need the machine to themselves,
// i.e. that are likely to fail if any other test runs at the same time.
// No parallel run is attempted for them; they go straight to exclusive mode.
// Performance tests and other resource-hungry tests usually belong here.
// Listing a test here both saves a pointless parallel attempt and protects
// the tests that would otherwise run next to it from spurious failures.
var exclusiveTests = map[string]struct{}{
	// Running alongside other tests can trigger
	// "launch failed because launch would exceed cudaLimitDevRuntimePendingLaunchCount".
	"3_CUDA_Features/cdpAdvancedQuicksort": struct{}{},

	// Heavy performance tests; their resource usage tends to make
	// concurrently-running tests flake.
	"6_Performance/alignedTypes":      struct{}{},
	"6_Performance/transpose":         struct{}{},
	"6_Performance/UnifiedMemoryPerf": struct{}{},
}

// alwaysSkippedTests maps the name of each test that must never run to the
// reason why it is skipped. These tests are skipped unconditionally, even
// when --cuda_verify_compatibility is set.
var alwaysSkippedTests = make(map[string]string)

// Feature is the name of a CUDA feature, spelled exactly as it appears in
// the output of /list_features.sh (after the "PRESENT: " or "ABSENT: "
// prefix of each line).
type Feature string

// All CUDA features listed by /list_features.sh.
// Every constant declared here must also be listed in `allFeatures`.
const (
	FeaturePersistentL2Caching Feature = "PERSISTENT_L2_CACHING"
	FeatureDynamicParallelism  Feature = "DYNAMIC_PARALLELISM"
	FeatureGL                  Feature = "GL"
	FeatureTensorCores         Feature = "TENSOR_CORES"
	FeatureCompressibleMemory  Feature = "COMPRESSIBLE_MEMORY"
	FeatureP2P                 Feature = "P2P"
)

// allFeatures is a list of all CUDA features above.
// `GetEnvironment` rejects any feature reported by /list_features.sh that
// is not in this list, and requires every feature in this list to be
// reported.
var allFeatures = []Feature{
	FeaturePersistentL2Caching,
	FeatureDynamicParallelism,
	FeatureGL,
	FeatureTensorCores,
	FeatureCompressibleMemory,
	FeatureP2P,
}

// TestEnvironment represents the environment in which a sample test runs.
//
// NumGPUs is the number of GPUs detected, RuntimeIsGVisor tells whether the
// container runtime was detected as gVisor, and Features maps each CUDA
// feature to whether /list_features.sh reported it as present.
// Instances are built by `GetEnvironment`.
type TestEnvironment struct {
	NumGPUs         int
	RuntimeIsGVisor bool
	Features        map[Feature]bool
}

// Compatibility encodes the compatibility of a test depending on the
// environment it runs in.
// Implementations describe both *whether* a test is expected to fail in a
// given environment and *how* that failure is expected to look.
type Compatibility interface {
	// WillFail returns a string explaining why the test is expected to fail
	// in the given environment, or "" if it isn't expected to fail.
	WillFail(ctx context.Context, env *TestEnvironment) string

	// IsExpectedFailure checks whether the `logs` (from a failed run of the test
	// in the given environment) matches the failure that this test expects in
	// that environment. If they match, this function should return nil.
	// Otherwise, it returns an error describing the mismatch.
	// `exitCode` is the exit status of the failed run; callers in this file
	// pass 0 when the exit status could not be determined.
	// It is only called when `WillFail` returns a non-empty string for the same
	// environment, so it may assume that `env` is non-compatible.
	IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error
}

// BrokenEverywhere is the `Compatibility` of a test that cannot pass in any
// environment.
type BrokenEverywhere struct {
	// Reason is a human-readable explanation of why the test is broken.
	Reason string
}

// WillFail implements `Compatibility.WillFail`.
// The environment is irrelevant: the test is always expected to fail, and
// the returned explanation embeds `Reason`.
func (be *BrokenEverywhere) WillFail(_ context.Context, _ *TestEnvironment) string {
	return fmt.Sprintf("Known-broken test: %s", be.Reason)
}

// IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
// Every failure is accepted as the expected one.
func (*BrokenEverywhere) IsExpectedFailure(_ context.Context, _ *TestEnvironment, _ string, _ int) error {
	return nil
}

// BrokenInGVisor is the `Compatibility` of a test that is broken when the
// runtime is gVisor, and works otherwise.
type BrokenInGVisor struct {
	// OnlyWhenMultipleGPU restricts the breakage to environments that have
	// more than one GPU. Set it for tests that run fine with either a single
	// or several GPUs, yet fail in gVisor specifically when several GPUs are
	// present. Do not set it for tests that *need* several GPUs in order to
	// run at all; those should use RequiresMultiGPU instead.
	OnlyWhenMultipleGPU bool
}

// WillFail implements `Compatibility.WillFail`.
func (big *BrokenInGVisor) WillFail(ctx context.Context, env *TestEnvironment) string {
	switch {
	case !env.RuntimeIsGVisor:
		return ""
	case !big.OnlyWhenMultipleGPU:
		return "Known to be broken in gVisor"
	case env.NumGPUs == 1:
		return ""
	default:
		return "Known to be broken in gVisor when multiple GPUs are present"
	}
}

// IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
// Every failure is accepted as the expected one.
func (*BrokenInGVisor) IsExpectedFailure(_ context.Context, _ *TestEnvironment, _ string, _ int) error {
	return nil
}

// RequiresMultiGPU is the `Compatibility` of a test that needs at least two
// GPUs to run.
type RequiresMultiGPU struct{}

// WillFail implements `Compatibility.WillFail`.
func (*RequiresMultiGPU) WillFail(ctx context.Context, env *TestEnvironment) string {
	if env.NumGPUs >= 2 {
		return ""
	}
	return "Requires >= 2 GPUs"
}

// IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
// The only accepted failure is the test waiving itself via EXIT_WAIVED.
func (*RequiresMultiGPU) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
	if exitCode == exitCodeWaived {
		return nil
	}
	return fmt.Errorf("exit code %d, expected EXIT_WAIVED (%d)", exitCode, exitCodeWaived)
}

// RequiresP2P implements `Compatibility` for tests that require
// peer-to-peer communication between GPUs.
// Implies RequiresMultiGPU, so tests do not need to specify both.
type RequiresP2P struct{}

// WillFail implements `Compatibility.WillFail`.
// The test is expected to fail if there are too few GPUs (as decided by
// RequiresMultiGPU), or if the environment lacks the P2P feature.
func (*RequiresP2P) WillFail(ctx context.Context, env *TestEnvironment) string {
	if notEnoughGPUs := (&RequiresMultiGPU{}).WillFail(ctx, env); notEnoughGPUs != "" {
		return notEnoughGPUs
	}
	if !env.Features[FeatureP2P] {
		return "Requires P2P support"
	}
	return ""
}

// IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
// A failure is expected if it is one that RequiresMultiGPU accepts, or if
// the logs show that the test waived itself for lack of P2P access.
func (*RequiresP2P) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
	if err := (&RequiresMultiGPU{}).IsExpectedFailure(ctx, env, logs, exitCode); err == nil {
		return nil
	}
	const wantLog = "Peer to Peer access is not available amongst GPUs in the system, waiving test"
	if strings.Contains(logs, wantLog) {
		return nil
	}
	return fmt.Errorf("exit code %d and logs %q, expected EXIT_WAIVED (%d) or log message %q", exitCode, logs, exitCodeWaived, wantLog)
}

// requiresFeatures is the `Compatibility` of a test that needs a set of
// CUDA features to all be available.
type requiresFeatures struct {
	// features lists every feature the test needs.
	features []Feature
}

// RequiresFeatures returns a `Compatibility` for a test that can only pass
// when all of the given features are available in the environment.
func RequiresFeatures(features ...Feature) Compatibility {
	return &requiresFeatures{features: features}
}

// WillFail implements `Compatibility.WillFail`.
// The explanation names the first required feature that is unavailable.
func (r *requiresFeatures) WillFail(ctx context.Context, env *TestEnvironment) string {
	missing := slices.IndexFunc(r.features, func(f Feature) bool {
		return !env.Features[f]
	})
	if missing < 0 {
		return ""
	}
	return fmt.Sprintf("Requires feature %s", r.features[missing])
}

// IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
func (r *requiresFeatures) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
	if exitCode == exitCodeWaived {
		return nil
	}
	// Some GL-requiring tests such as `5_Domain_Specific/postProcessGL`
	// and `5_Domain_Specific/fluidsGL` will incorrectly detect that GL
	// is supported, and fail with this error message rather than waiving.
	const glRegisterFailure = `code=999(cudaErrorUnknown) "cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone)"`
	needsMissingGL := slices.Contains(r.features, FeatureGL) && !env.Features[FeatureGL]
	if needsMissingGL && strings.Contains(logs, glRegisterFailure) {
		return nil
	}
	return fmt.Errorf("exit code %d, expected EXIT_WAIVED (%d)", exitCode, exitCodeWaived)
}

// OnlyOnWindows is the `Compatibility` of a test that is only expected to
// pass on Windows.
type OnlyOnWindows struct{}

// WillFail implements `Compatibility.WillFail`.
func (*OnlyOnWindows) WillFail(ctx context.Context, env *TestEnvironment) string {
	if runtime.GOOS == "windows" {
		return ""
	}
	return "Only runs on Windows"
}

// IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
// The failure is expected if the test waived itself via EXIT_WAIVED, or if
// its logs state that it is not supported on Linux.
func (*OnlyOnWindows) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
	if exitCode == exitCodeWaived || strings.Contains(logs, "is not supported on Linux") {
		return nil
	}
	return fmt.Errorf("exit code %d, expected EXIT_WAIVED (%d)", exitCode, exitCodeWaived)
}

// RequiresNvSci is the `Compatibility` of a test that depends on the NvSci
// library. Such a test is expected to fail in every environment.
type RequiresNvSci struct{}

// WillFail implements `Compatibility.WillFail`.
// The environment is irrelevant: the test is always expected to fail.
func (*RequiresNvSci) WillFail(_ context.Context, _ *TestEnvironment) string {
	return "Requires NvSci library which is not open-source"
}

// IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
// Every failure is accepted as the expected one.
func (*RequiresNvSci) IsExpectedFailure(_ context.Context, _ *TestEnvironment, _ string, _ int) error {
	return nil
}

// NoCrossCompile is the `Compatibility` of a test that is expected to fail
// when the test binary runs on an ARM architecture.
type NoCrossCompile struct{}

// WillFail implements `Compatibility.WillFail`.
// The test is expected to fail whenever GOARCH starts with "arm".
func (*NoCrossCompile) WillFail(ctx context.Context, env *TestEnvironment) string {
	if !strings.HasPrefix(runtime.GOARCH, "arm") {
		return ""
	}
	return "Test not supported on ARM. Cross compiled libraries not supported."
}

// IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
// The failure is expected only if the logs contain the cross-compilation
// error message; the exit code is not considered.
func (*NoCrossCompile) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, _ int) error {
	const wantLog = "cross compiling from sbsa to aarch64 is not supported!"
	if !strings.Contains(logs, wantLog) {
		return fmt.Errorf("expected log message %q not found: logs: %q", wantLog, logs)
	}
	return nil
}

// multiCompatibility is a `Compatibility` made of several others.
// The test is expected to fail if any of them expects it to fail.
type multiCompatibility struct {
	// compats are the underlying compatibilities, consulted in order.
	compats []Compatibility
}

// MultiCompatibility combines several `Compatibility` implementations into
// a single one. The combined test is expected to fail as soon as one of
// `compats` expects it to, and a failure is considered expected if at least
// one of those accepts it.
func MultiCompatibility(compats ...Compatibility) Compatibility {
	return &multiCompatibility{compats: compats}
}

// WillFail implements `Compatibility.WillFail`.
// It returns the explanation of the first compatibility that expects a
// failure.
func (mc *multiCompatibility) WillFail(ctx context.Context, env *TestEnvironment) string {
	for i := range mc.compats {
		why := mc.compats[i].WillFail(ctx, env)
		if why != "" {
			return why
		}
	}
	return ""
}

// IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
// Only the compatibilities that expect a failure in `env` are consulted;
// the failure is expected as soon as one of them recognizes it.
func (mc *multiCompatibility) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
	var mismatches []string
	for _, candidate := range mc.compats {
		why := candidate.WillFail(ctx, env)
		if why == "" {
			// Not expected to fail on account of this one; it cannot explain
			// the failure.
			continue
		}
		mismatch := candidate.IsExpectedFailure(ctx, env, logs, exitCode)
		if mismatch == nil {
			return nil
		}
		mismatches = append(mismatches, fmt.Sprintf("might have been broken because %s but %v", why, mismatch))
	}
	if len(mismatches) == 0 {
		return errors.New("no known explanation for this failure")
	}
	return fmt.Errorf("no known explanation for this failure: %v", strings.Join(mismatches, "; "))
}

// FullyCompatible is the `Compatibility` of a test that should pass no
// matter which environment it runs in.
type FullyCompatible struct{}

// WillFail implements `Compatibility.WillFail`.
// A fully-compatible test is never expected to fail.
func (*FullyCompatible) WillFail(_ context.Context, _ *TestEnvironment) string {
	return ""
}

// IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
// No failure is ever expected, so this always returns an error.
func (*FullyCompatible) IsExpectedFailure(_ context.Context, _ *TestEnvironment, _ string, _ int) error {
	return errors.New("test is expected to pass regardless of environment")
}

// getContainerOpts builds the options used to run the CUDA test image.
// It asks dockerutil for GPU run options with all capabilities, then points
// them at the "gpu/cuda-tests" image.
func getContainerOpts() (dockerutil.RunOpts, error) {
	sniffOpts := dockerutil.SniffGPUOpts{Capabilities: "all"}
	runOpts, err := dockerutil.GPURunOpts(sniffOpts)
	if err != nil {
		return dockerutil.RunOpts{}, fmt.Errorf("failed to get GPU run options: %w", err)
	}
	runOpts.Image = "gpu/cuda-tests"
	return runOpts, nil
}

// testLog records a formatted message in the log of test `t`.
// When the --cuda_test_debug flag is set, the message is additionally
// written to stderr right away, prefixed with the test name, which helps
// when debugging tests.
func testLog(t *testing.T, format string, values ...any) {
	t.Helper()
	message := fmt.Sprintf(format, values...)
	if *debug {
		fmt.Fprintf(os.Stderr, "[%s] %s\n", t.Name(), message)
	}
	t.Logf("%s", message)
}

// multiLineLog writes `output` to the log of `t` one line at a time, as one
// log message per line. Container logs usually span many lines, and logging
// them as a single message would leave awkward line breaks in the middle.
func multiLineLog(t *testing.T, output string) {
	t.Helper()
	lines := strings.Split(output, "\n")
	for i := range lines {
		// A line may itself contain `%` characters. Passing it as an argument
		// to `%s` (rather than as the format string) keeps those from
		// showing up as "MISSING" in the logs.
		testLog(t, "%s", lines[i])
	}
}

// GetEnvironment returns the environment in which a sample test runs.
//
// It determines:
//   - the number of GPUs;
//   - whether the container runtime is gVisor, in which case the runtime
//     arguments must contain --nvproxy-allowed-driver-capabilities=all;
//   - which CUDA features are available, by running /list_features.sh in a
//     throwaway container and parsing its "PRESENT: X" / "ABSENT: X" lines.
//
// An error is returned if no GPU is detected, if the gVisor runtime is not
// configured as required, if the feature list cannot be obtained or parsed,
// if it mentions an unknown feature or omits a known one, or if dynamic
// parallelism is reported as unavailable (which is taken as a sign that the
// feature listing script is broken).
func GetEnvironment(ctx context.Context, t *testing.T) (*TestEnvironment, error) {
	numGPU := dockerutil.NumGPU()
	if numGPU == 0 {
		return nil, errors.New("no GPUs detected")
	}
	if numGPU == 1 {
		testLog(t, "1 GPU detected")
	} else {
		testLog(t, "%d GPUs detected", numGPU)
	}
	runtimeIsGVisor, err := dockerutil.IsGVisorRuntime(ctx, t)
	if err != nil {
		return nil, fmt.Errorf("cannot determine if runtime is gVisor or not: %w", err)
	}
	if runtimeIsGVisor {
		testLog(t, "Runtime is detected as gVisor")
		runtimeArgs, err := dockerutil.RuntimeArgs()
		if err != nil {
			// Report this like every other failure of this function, rather
			// than aborting the test from within a helper.
			return nil, fmt.Errorf("failed to get runtime arguments: %w", err)
		}
		// The flag may be spelled `--flag=value` or `--flag value`.
		// If it is specified several times, the last occurrence wins.
		const nvCapsFlag = "--nvproxy-allowed-driver-capabilities"
		foundNVCaps := ""
		for i, arg := range runtimeArgs {
			if value, ok := strings.CutPrefix(arg, nvCapsFlag+"="); ok {
				foundNVCaps = value
			} else if arg == nvCapsFlag && i < len(runtimeArgs)-1 {
				foundNVCaps = runtimeArgs[i+1]
			}
		}
		if foundNVCaps == "" {
			return nil, errors.New("did not find --nvproxy-allowed-driver-capabilities=all flag in gVisor runtime arguments, please specify it for this test")
		}
		if foundNVCaps != "all" {
			return nil, fmt.Errorf("found --nvproxy-allowed-driver-capabilities=%q flag in gVisor runtime arguments, please specify --nvproxy-allowed-driver-capabilities=all for this test", foundNVCaps)
		}
	} else {
		testLog(t, "Runtime is detected as non-gVisor")
	}
	featuresContainer := dockerutil.MakeContainer(ctx, t)
	defer featuresContainer.CleanUp(ctx)
	runOpts, err := getContainerOpts()
	if err != nil {
		return nil, fmt.Errorf("failed to get container options: %w", err)
	}
	featuresList, err := featuresContainer.Run(ctx, runOpts, "/list_features.sh")
	if err != nil {
		return nil, fmt.Errorf("cannot get list of CUDA features: %w", err)
	}
	features := make(map[Feature]bool, len(allFeatures))
	for _, line := range strings.Split(featuresList, "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		if strings.HasPrefix(line, "//") {
			// Comment line from the script; relay it to the test log.
			testLog(t, "/list_features.sh: %s", line)
			continue
		}
		var (
			feature   Feature
			available bool
		)
		if name, ok := strings.CutPrefix(line, "PRESENT: "); ok {
			feature, available = Feature(name), true
		} else if name, ok := strings.CutPrefix(line, "ABSENT: "); ok {
			feature, available = Feature(name), false
		} else {
			return nil, fmt.Errorf("unexpected CUDA feature line: %q", line)
		}
		if !slices.Contains(allFeatures, feature) {
			return nil, fmt.Errorf("unknown CUDA feature: %s", string(feature))
		}
		features[feature] = available
		if available {
			testLog(t, "CUDA feature is available: %s", string(feature))
		} else {
			testLog(t, "CUDA feature is *not* available: %s", string(feature))
		}
	}
	// Every known feature must have been reported, one way or the other.
	for _, feature := range allFeatures {
		if _, ok := features[feature]; !ok {
			return nil, fmt.Errorf("CUDA feature not found in feature list: %s", string(feature))
		}
	}
	// Use CUDA dynamic parallelism as a litmus test to see if the features were
	// enumerated correctly. This must look at the reported availability, not
	// at the mere presence of the map key, which the loop above already
	// guarantees.
	if !features[FeatureDynamicParallelism] {
		return nil, errors.New("CUDA feature Dynamic Parallelism is not available yet should be available in all environments gVisor supports; this indicates a failure in the feature listing script")
	}
	return &TestEnvironment{
		NumGPUs:         numGPU,
		RuntimeIsGVisor: runtimeIsGVisor,
		Features:        features,
	}, nil
}

// runSampleTest runs a single CUDA sample test.
//
// Unless the test is listed in `exclusiveTests`, it is first run in a pooled
// container, in parallel with other tests (3 attempts if the test is listed
// in `flakyTests`, 1 otherwise). If no parallel attempt settles the outcome,
// the test is re-run in an exclusive container with the same number of
// attempts.
//
// The compatibility data of the test (`testCompatibility` combined with
// `testSuiteCompatibility`) decides whether it is expected to fail in `te`.
// A test that is expected to fail is skipped, unless
// --cuda_verify_compatibility is set, in which case it is run and must fail
// in the expected manner.
//
// It returns a skip reason (or empty if the test was not skipped), and
// an error if the test fails.
func runSampleTest(ctx context.Context, t *testing.T, testName string, te *TestEnvironment, cp *dockerutil.ContainerPool) (string, error) {
	var compatibilities []Compatibility
	if compat, found := testCompatibility[testName]; found {
		compatibilities = append(compatibilities, compat)
	}
	for suite, comp := range testSuiteCompatibility {
		if strings.HasPrefix(testName, suite) {
			compatibilities = append(compatibilities, comp)
		}
	}
	var compat Compatibility = &FullyCompatible{}
	if len(compatibilities) > 0 {
		compat = MultiCompatibility(compatibilities...)
	}
	willFailReason := compat.WillFail(ctx, te)
	if willFailReason != "" && !*verifyCompatibility {
		return fmt.Sprintf("this test is expected to fail (%s); use --cuda_verify_compatibility=true to verify compatibility", willFailReason), nil
	}
	if skipReason, isAlwaysSkipped := alwaysSkippedTests[testName]; isAlwaysSkipped {
		return fmt.Sprintf("this test is always skipped (%v)", skipReason), nil
	}

	// exitCodeOf extracts the exit status from an exec error, looking through
	// wrapped errors. It returns 0 if `err` carries no exit status (e.g. when
	// the execution timed out).
	exitCodeOf := func(err error) int {
		var execErr *dockerutil.ExecError
		if errors.As(err, &execErr) {
			return execErr.ExitStatus
		}
		return 0
	}

	testTimeout := defaultTestTimeout
	// Give /run_sample slightly less time than the exec call itself, so that
	// it gets a chance to time out on its own first.
	execTestTimeout := testTimeout - 15*time.Second
	testAttempts := 1
	if _, isFlakyTest := flakyTests[testName]; isFlakyTest {
		testAttempts = 3
	}
	parallelAttempts := testAttempts
	if _, isExclusiveTest := exclusiveTests[testName]; isExclusiveTest {
		parallelAttempts = 0
	}
	for attempt := 0; attempt < parallelAttempts; attempt++ {
		c, release, err := cp.Get(ctx)
		if err != nil {
			release()
			return "", fmt.Errorf("failed to get container: %w", err)
		}
		cp.SetContainerLabel(c, fmt.Sprintf("Running %s in parallel (attempt %d/%d)", testName, attempt+1, parallelAttempts))
		testLog(t, "Running test in parallel mode in container %s (attempt %d/%d)...", c.Name, attempt+1, parallelAttempts)
		parallelCtx, parallelCancel := context.WithTimeoutCause(ctx, testTimeout, errors.New("parallel execution took too long"))
		testStartedAt := time.Now()
		output, err := c.Exec(parallelCtx, dockerutil.ExecOpts{}, "/run_sample", fmt.Sprintf("--timeout=%v", execTestTimeout), testName)
		testDuration := time.Since(testStartedAt)
		parallelCancel()
		release()
		if err == nil {
			if willFailReason != "" {
				multiLineLog(t, output)
				return "", fmt.Errorf("test unexpectedly succeeded, but we expected it to fail: %s; please update `testCompatibility`", willFailReason)
			}
			// Only log the output when the test succeeds here.
			// If it fails, we'll run exclusively below, and the output from *that*
			// run will be logged instead.
			if *logSuccessfulTests {
				multiLineLog(t, output)
			}
			testLog(t, "Test passed in parallel mode in %v.", testDuration)
			return "", nil
		}
		if willFailReason != "" {
			// An unexpected kind of failure is not fatal at this point: it may
			// be due to contention, so the test is retried below.
			if isExpectedErr := compat.IsExpectedFailure(ctx, te, output, exitCodeOf(err)); isExpectedErr == nil {
				testLog(t, "Test failed as expected: %s (took %v)", willFailReason, testDuration)
				return "", nil
			}
		}
	}
	if parallelAttempts > 0 {
		testLog(t, "Will re-run the test in exclusive mode.")
	}
	c, release, err := cp.GetExclusive(ctx)
	defer release()
	if err != nil {
		return "", fmt.Errorf("failed to get exclusive container: %w", err)
	}
	var testErr error
	for attempt := 0; attempt < testAttempts; attempt++ {
		cp.SetContainerLabel(c, fmt.Sprintf("Running %s exclusively (attempt %d/%d)", testName, attempt+1, testAttempts))
		testLog(t, "Running test in exclusive mode in container %s (attempt %d/%d)...", c.Name, attempt+1, testAttempts)
		exclusiveCtx, exclusiveCancel := context.WithTimeoutCause(ctx, testTimeout, errors.New("exclusive execution took too long"))
		testStartedAt := time.Now()
		var output string
		output, testErr = c.Exec(exclusiveCtx, dockerutil.ExecOpts{}, "/run_sample", fmt.Sprintf("--timeout=%v", execTestTimeout), testName)
		testDuration := time.Since(testStartedAt)
		exclusiveCancel()
		if testErr == nil {
			if willFailReason != "" {
				multiLineLog(t, output)
				return "", fmt.Errorf("test unexpectedly succeeded, but we expected it to fail: %s; please update `testCompatibility`", willFailReason)
			}
			if *logSuccessfulTests {
				multiLineLog(t, output)
			}
			testLog(t, "Test passed in exclusive mode in %v.", testDuration)
			return "", nil
		}
		multiLineLog(t, output)
		if willFailReason != "" {
			isExpectedErr := compat.IsExpectedFailure(ctx, te, output, exitCodeOf(testErr))
			if isExpectedErr == nil {
				testLog(t, "Test failed as expected: %s (took %v)", willFailReason, testDuration)
				return "", nil
			}
			return "", fmt.Errorf("test was expected to fail (%s), but it failed with %w which is a different reason than expected: %v", willFailReason, testErr, isExpectedErr)
		}
	}
	return "", fmt.Errorf("test failed: %w", testErr)
}

// getDesiredTestParallelism computes how many tests should run in parallel:
// the number of CPUs times the --cuda_containers_per_cpu flag, rounded up.
// It panics if the number of CPUs cannot be determined.
func getDesiredTestParallelism() int {
	cores := runtime.NumCPU()
	if cores < 1 {
		panic("cannot detect number of cores")
	}
	containers := float64(cores) * *containersPerCPU
	return int(math.Ceil(containers))
}

// TestCUDA runs CUDA tests.
//
// It lists the available sample tests by running `/list_sample_tests.sh` in a
// container, verifies that every test named in `testCompatibility` still
// exists, narrows the list down to the current shard, spawns a pool of
// long-lived containers, and then runs every sample as a parallel sub-test
// via `runSampleTest`. Names of failed tests are collected and reported at
// the end, along with a command line for reproducing the first failure.
func TestCUDA(t *testing.T) {
	const defaultMaxDuration = 59*time.Minute + 30*time.Second

	testStart := time.Now()
	maxDuration := defaultMaxDuration
	// A registered, non-zero `timeout` flag overrides the default budget.
	if timeoutFlag := flag.Lookup("timeout"); timeoutFlag != nil {
		if timeoutFlagStr := timeoutFlag.Value.String(); timeoutFlagStr != "" {
			timeoutFlagValue, err := time.ParseDuration(timeoutFlagStr)
			if err != nil {
				t.Fatalf("--timeout flag %q is not a valid duration: %v", timeoutFlagStr, err)
			}
			if timeoutFlagValue != 0 {
				maxDuration = timeoutFlagValue
			}
		}
	}
	ctx, cancel := context.WithTimeoutCause(context.Background(), maxDuration, errors.New("overall test timed out"))
	defer cancel()
	testDeadline, ok := ctx.Deadline()
	if !ok {
		t.Fatal("context had no deadline")
	}
	testLog(t, "Test timeout is %v; started at %v, deadline is %v", maxDuration, testStart, testDeadline)

	te, err := GetEnvironment(ctx, t)
	if err != nil {
		t.Fatalf("Failed to get test environment: %v", err)
	}

	// Get a list of sample tests.
	listContainer := dockerutil.MakeContainer(ctx, t)
	defer listContainer.CleanUp(ctx)
	runOpts, err := getContainerOpts()
	if err != nil {
		t.Fatalf("Failed to get container options: %v", err)
	}
	testsList, err := listContainer.Run(ctx, runOpts, "/list_sample_tests.sh")
	if err != nil {
		t.Fatalf("Cannot list sample tests: %v", err)
	}
	// One test name per line; blank lines and surrounding whitespace are
	// dropped. The map is only used for the existence check below.
	testsSplit := strings.Split(testsList, "\n")
	allTests := make([]string, 0, len(testsSplit))
	allTestsMap := make(map[string]struct{}, len(testsSplit))
	for _, test := range testsSplit {
		testName := strings.TrimSpace(test)
		if testName == "" {
			continue
		}
		allTestsMap[testName] = struct{}{}
		allTests = append(allTests, testName)
	}
	numTests := len(allTests)
	testLog(t, "Number of CUDA sample tests detected: %d", numTests)

	// Check that all tests in test maps still exist.
	t.Run("CUDA test existence", func(t *testing.T) {
		for testName := range testCompatibility {
			if _, ok := allTestsMap[testName]; !ok {
				t.Errorf("CUDA test %q referenced in `testCompatibility` but it no longer exists, please remove it.", testName)
			}
		}
	})

	// Filter tests if partitioning is enabled.
	testIndices, err := testutil.TestIndicesForShard(numTests)
	if err != nil {
		t.Fatalf("Failed to get test indices for shard: %v", err)
	}
	if len(testIndices) != numTests {
		filteredTests := make([]string, 0, len(testIndices))
		for _, testIndex := range testIndices {
			filteredTests = append(filteredTests, allTests[testIndex])
		}
		testLog(t, "Filtered tests from sharding; %d -> %d tests.", numTests, len(filteredTests))
		allTests = filteredTests
		numTests = len(allTests)
	}

	// In order to go through tests efficiently, we reuse containers.
	// However, running tests serially within the same container would also be
	// slow. So this test spawns a pool of containers, sized by
	// getDesiredTestParallelism and capped at the number of tests to run.
	// This saves time because a lot of the time here is actually spent waiting
	// for compilation of the CUDA program on the CPU, and isn't actually
	// blocked on the GPU. However, it is possible that two CUDA tests do end
	// up running on the GPU at the same time, and that they don't work together
	// for some reason (e.g. out of GPU memory).
	// To address this, the test first runs every test in parallel. Then, if
	// any of them failed, it will run only the failed ones serially.
	numParallel := getDesiredTestParallelism()
	numContainers := min(numParallel, max(numTests, 1))
	if numContainers == numParallel {
		testLog(t, "Number of cores is %d, spawning %.1f CUDA containers for each (%d containers total)...", runtime.NumCPU(), *containersPerCPU, numContainers)
	} else {
		testLog(t, "%d tests to run, spawning %d CUDA containers...", numTests, numContainers)
	}
	spawnGroup, spawnCtx := errgroup.WithContext(ctx)
	containers := make([]*dockerutil.Container, numContainers)
	for i := 0; i < numContainers; i++ {
		spawnGroup.Go(func() error {
			c := dockerutil.MakeContainer(ctx, t)
			runOpts, err := getContainerOpts()
			if err != nil {
				return fmt.Errorf("failed to get container options: %w", err)
			}
			if err := c.Spawn(spawnCtx, runOpts, "/bin/sleep", "6h"); err != nil {
				spawnErr := fmt.Errorf("container %v failed to spawn: %w", c.Name, err)
				// This container is never stored in `containers`, so the
				// failure-path loop below cannot reach it. A failed spawn may
				// still have left a container behind, so clean it up here.
				// `ctx` is used rather than `spawnCtx` because the latter is
				// canceled as soon as any spawn fails.
				c.CleanUp(ctx)
				return spawnErr
			}
			containers[i] = c
			return nil
		})
	}
	if err := spawnGroup.Wait(); err != nil {
		// Tear down the containers that did spawn before giving up.
		for _, c := range containers {
			if c != nil {
				c.CleanUp(ctx)
			}
		}
		t.Fatalf("Failed to spawn containers: %v", err)
	}
	cp := dockerutil.NewContainerPool(containers)
	defer cp.CleanUp(ctx)

	// testMu guards testsDone and failedTests, which are updated by the
	// parallel sub-tests and read by statusFn.
	var testMu sync.Mutex
	testsDone := 0
	var failedTests []string
	// statusFn logs overall progress, the failed tests so far, and the state
	// of the container pool.
	statusFn := func() {
		now := time.Now()
		testMu.Lock()
		defer testMu.Unlock()
		donePct := 100.0 * float64(testsDone) / float64(numTests)
		startedAgo := now.Sub(testStart)
		deadlineIn := testDeadline.Sub(now)
		durationPct := 100.0 * float64(startedAgo) / float64(testDeadline.Sub(testStart))
		testLog(t, "[Timing] %d/%d tests (%.1f%%) finished executing. Test started %v ago, deadline in %v (%.1f%%).", testsDone, numTests, donePct, startedAgo.Truncate(time.Second), deadlineIn.Truncate(time.Second), durationPct)
		if len(failedTests) > 0 {
			testLog(t, "[Failed] %d test(s) failed: %v", len(failedTests), strings.Join(failedTests, ", "))
		}
		testLog(t, "[Pool] %v", cp.String())
	}
	if *debug {
		// Periodically log status while the test runs. The goroutine logs
		// through `t`, so it must have exited before this function returns:
		// logging on a test that has already completed panics. The deferred
		// function below stops it and waits for it. That defer runs after the
		// deferred testMu.Unlock registered further down, so a statusFn call
		// blocked on testMu can still finish and the wait cannot deadlock.
		statusCtx, stopStatus := context.WithCancel(ctx)
		var statusWG sync.WaitGroup
		statusWG.Add(1)
		go func() {
			defer statusWG.Done()
			ticker := time.NewTicker(5 * time.Second)
			defer ticker.Stop()
			for {
				select {
				case <-statusCtx.Done():
					return
				case <-ticker.C:
					statusFn()
				}
			}
		}()
		defer func() {
			stopStatus()
			statusWG.Wait()
		}()
	}
	var samplesTestName string
	t.Run("Samples", func(t *testing.T) {
		samplesTestName = t.Name()
		// Now spawn all subtests in parallel.
		// Each sub-test is handed the shared container pool; how a test gets
		// a container is decided inside runSampleTest, not here. This
		// function holds no lock while the sub-tests run.
		// NOTE(review): an earlier version of this comment described a
		// `serialMu` held by the main goroutine; no such mutex exists in this
		// function, so serialized re-runs are presumably arranged through the
		// container pool. Confirm against runSampleTest.
		testutil.NewTree(allTests, "/").RunParallel(t, func(t *testing.T, testName string) {
			t.Helper()
			skippedReason, err := runSampleTest(ctx, t, testName, te, cp)
			if err != nil {
				t.Errorf("%s: %v", testName, err)
			}
			testMu.Lock()
			defer testMu.Unlock()
			testsDone++
			// Only failures seen before the overall context ended are
			// recorded.
			if t.Failed() && ctx.Err() == nil {
				failedTests = append(failedTests, testName)
			}
			if skippedReason != "" {
				t.Skip(skippedReason)
			}
		})
	})
	statusFn()
	testMu.Lock()
	defer testMu.Unlock()
	if len(failedTests) > 0 {
		if ctx.Err() != nil {
			t.Errorf("%d tests failed prior to timeout:", len(failedTests))
			for _, testName := range failedTests {
				t.Errorf("  %s", testName)
			}
		}
		t.Errorf("To re-run a specific test locally, either re-run this test with filtering enabled (example: --test.run=%s/%s), or:", samplesTestName, failedTests[0])
		t.Errorf(
			"  $ docker run --runtime=%s --gpus=all -e %s --rm %s /run_sample %s",
			dockerutil.Runtime(),
			dockerutil.AllGPUCapabilitiesEnv,
			runOpts.Image,
			failedTests[0],
		)
	} else if poolUtilization := cp.Utilization(); poolUtilization < 0.6 {
		testLog(t, "WARNING: Container pool utilization was only %.1f%% during the test.", poolUtilization*100.0)
		testLog(t, "This test can be made faster and more efficient with proper test categorization,")
		testLog(t, "by identifying flaky tests and exclusive-requiring tests.")
		testLog(t, "Consider going over the logs to identify such tests and categorize them accordingly.")
	}
}

// TestMain overrides the `test.parallel` flag so that the number of parallel
// sub-tests matches getDesiredTestParallelism, then runs the tests and exits
// with their status code.
func TestMain(m *testing.M) {
	dockerutil.EnsureSupportedDockerVersion()
	flag.Parse()

	// By default the Go testing library caps parallel tests at GOMAXPROCS,
	// and that default is fixed at program initialization time, i.e. before
	// this function runs. Running more parallel tests than there are cores
	// therefore requires rewriting the `test.parallel` flag here, ahead of
	// `m.Run`.
	parallelFlag := flag.Lookup("test.parallel")
	if parallelFlag == nil {
		panic("cannot find -test.parallel flag")
	}
	desired := strconv.Itoa(getDesiredTestParallelism())
	if err := parallelFlag.Value.Set(desired); err != nil {
		panic(fmt.Sprintf("cannot set -test.parallel flag: %v", err))
	}

	exitCode := m.Run()
	os.Exit(exitCode)
}