Files
gvisor/pkg/devutil/devutil.go
T
Ayush Ranjan a5e93550c1 Move GPU device ownership to gofer process.
Tested on a T4 GPU with driver version 525.60.13:
```
$ docker run --runtime=runsc --gpus=all --rm -it nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubi8
[Vector addition of 50000 elements]
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 196 blocks of 256 threads
Copy output data from the CUDA device to the host memory
Test PASSED
Done
```

Also tested this on GKE with the same vectoradd workload. Checked that the
device gofer connection is actually being closed when the container is deleted.
Something to note is that the gofer logs for the GPU-container sometimes end
abruptly (the "All lisafs servers exited." line does not print). This is
because runsc/container/container.go:stop() SIGKILLs the gofer before it can
clean up naturally. The device gofer connection is only closed at the end of
Loader.destroySubcontainer(), which gives little time before the gofer is
SIGKILL-ed.

PiperOrigin-RevId: 581365665
2023-11-10 14:20:31 -08:00

119 lines
3.4 KiB
Go

// Copyright 2023 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package devutil provides device specific utilities.
package devutil
import (
"fmt"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fsutil"
"gvisor.dev/gvisor/pkg/lisafs"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/unet"
)
// GoferClient is the lisafs client for the /dev gofer connection.
type GoferClient struct {
	// clientFD is the lisafs FD created from the control FD of the inode
	// returned by lisafs.NewClient. Its Client() is the connection that all
	// RPCs in this type are issued on.
	clientFD lisafs.ClientFD

	// hostFD is the host file descriptor returned by lisafs.NewClient
	// alongside that inode. A negative value means no host FD is held, in
	// which case operations fall back to lisafs RPCs.
	hostFD int
}
// NewGoferClient establishes the LISAFS connection to the dev gofer server
// over the socket identified by fd and returns a client for it.
//
// NewGoferClient takes ownership of fd. Failures are logged as warnings on ctx
// and returned unchanged.
func NewGoferClient(ctx context.Context, fd int) (*GoferClient, error) {
	// Bracket the whole setup with the context's uninterruptible-sleep
	// markers.
	ctx.UninterruptibleSleepStart(false)
	defer ctx.UninterruptibleSleepFinish(false)

	conn, sockErr := unet.NewSocket(fd)
	if sockErr != nil {
		ctx.Warningf("failed to create socket for dev gofer client: %v", sockErr)
		return nil, sockErr
	}

	lisafsClient, rootInode, rootHostFD, clientErr := lisafs.NewClient(conn)
	if clientErr != nil {
		ctx.Warningf("failed to create dev gofer client: %v", clientErr)
		return nil, clientErr
	}

	goferClient := &GoferClient{
		clientFD: lisafsClient.NewFD(rootInode.ControlFD),
		hostFD:   rootHostFD,
	}
	return goferClient, nil
}
// Close closes the LISAFS connection and then the host FD, if one is held.
func (g *GoferClient) Close() {
	// Shutting down the connection to the server implicitly closes all FDs
	// that were opened on it.
	conn := g.clientFD.Client()
	conn.Close()

	if g.hostFD < 0 {
		// No host FD was held; nothing else to release.
		return
	}
	// The close error is deliberately discarded.
	_ = unix.Close(g.hostFD)
}
// DirentNames returns names of all the dirents for /dev on the gofer.
//
// When a host FD is held, the names are read directly from it via
// fsutil.DirentNames. Otherwise the directory is opened read-only over lisafs
// and Getdents64 RPCs are issued until one returns no entries.
//
// Errors from the lisafs path are wrapped with %w, so callers can still reach
// the underlying error through errors.Is / errors.As.
func (g *GoferClient) DirentNames(ctx context.Context) ([]string, error) {
	if g.hostFD >= 0 {
		return fsutil.DirentNames(g.hostFD)
	}

	client := g.clientFD.Client()
	openFDID, _, err := g.clientFD.OpenAt(ctx, unix.O_RDONLY)
	if err != nil {
		return nil, fmt.Errorf("failed to open dev from gofer: %w", err)
	}
	// The open FD is only needed while listing; close it on every return path.
	defer client.CloseFD(ctx, openFDID, true /* flush */)
	openFD := client.NewFD(openFDID)

	// count is passed to every Getdents64 RPC.
	// NOTE(review): presumably a per-call byte budget; confirm against lisafs.
	const count = int32(64 * 1024)
	var names []string
	for {
		dirents, err := openFD.Getdents64(ctx, count)
		if err != nil {
			return nil, fmt.Errorf("Getdents64 RPC failed: %w", err)
		}
		// An empty batch marks the end of the directory stream.
		if len(dirents) == 0 {
			break
		}
		for i := range dirents {
			names = append(names, string(dirents[i].Name))
		}
	}
	return names, nil
}
// OpenAt opens the device file at /dev/{name} on the gofer and returns a host
// file descriptor for it.
//
// Only the access-mode bits of flags (unix.O_ACCMODE) are honored. When a host
// FD for /dev is held, the file is opened with openat(2) relative to it, with
// O_NOFOLLOW added. Otherwise name is walked and opened over lisafs, and the
// host FD returned by that open RPC is handed to the caller.
//
// On failure the returned FD is always -1, matching what unix.Openat returns
// on the host-FD path, so a caller can never mistake the result for FD 0.
func (g *GoferClient) OpenAt(ctx context.Context, name string, flags uint32) (int, error) {
	flags &= unix.O_ACCMODE
	if g.hostFD >= 0 {
		return unix.Openat(g.hostFD, name, int(flags|unix.O_NOFOLLOW), 0)
	}

	childInode, err := g.clientFD.Walk(ctx, name)
	if err != nil {
		log.Infof("failed to walk %q from dev gofer FD", name)
		return -1, err
	}
	client := g.clientFD.Client()
	childFD := client.NewFD(childInode.ControlFD)
	childOpenFD, childHostFD, err := childFD.OpenAt(ctx, flags)
	if err != nil {
		log.Infof("failed to open %q from child FD", name)
		client.CloseFD(ctx, childFD.ID(), true /* flush */)
		return -1, err
	}
	// Only the host FD is handed to the caller; release both lisafs FDs. Only
	// the second close requests a flush.
	client.CloseFD(ctx, childFD.ID(), false /* flush */)
	client.CloseFD(ctx, childOpenFD, true /* flush */)

	// NOTE(review): assumes lisafs reports "no host FD donated" as a negative
	// value, as with the /dev host FD checked at the top of this function;
	// confirm against lisafs. Without this check the caller would receive an
	// unusable FD together with a nil error. An errno is returned so the
	// error type matches the other failure paths.
	if childHostFD < 0 {
		log.Infof("dev gofer did not donate a host FD for %q", name)
		return -1, unix.EIO
	}
	return childHostFD, nil
}