Files
Nicolas Lacasse c238e15234 Fix validation of close_range last fd argument.
The `last` fd argument can be up to max uint32, and some applications call it
with this maximum:
https://github.com/GNOME/glib/blob/26bc1d08ec574b387ff4bcd919a020a586727bbf/glib/glib-unix.c#L890

PiperOrigin-RevId: 718526878
2025-01-22 14:25:24 -08:00

1711 lines
49 KiB
Go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package linux
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/gohacks"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/fasync"
"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/vfs"
)
// Mknod implements Linux syscall mknod(2).
//
// The path address is taken from args[0], the mode from args[1] and the
// device number from args[2]; the path is looked up with AT_FDCWD as the
// directory FD.
func Mknod(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pathAddr := args[0].Pointer()
	fileMode := linux.FileMode(args[1].ModeT())
	devID := args[2].Uint()
	err := mknodat(t, linux.AT_FDCWD, pathAddr, fileMode, devID)
	return 0, nil, err
}
// Mknodat implements Linux syscall mknodat(2).
//
// args[0] is the directory FD, args[1] the path address, args[2] the mode and
// args[3] the device number.
func Mknodat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirFD := args[0].Int()
	pathAddr := args[1].Pointer()
	fileMode := linux.FileMode(args[2].ModeT())
	devID := args[3].Uint()
	err := mknodat(t, dirFD, pathAddr, fileMode, devID)
	return 0, nil, err
}
// mknodat is the shared implementation of Mknod and Mknodat. It copies in the
// path at addr, resolves it relative to dirfd without following a final
// symlink, and asks the VFS to create the node with the task's umask applied
// to mode.
func mknodat(t *kernel.Task, dirfd int32, addr hostarch.Addr, mode linux.FileMode, dev uint32) error {
	path, err := copyInPath(t, addr)
	if err != nil {
		return err
	}
	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return err
	}
	defer tpop.Release(t)

	// "Zero file type is equivalent to type S_IFREG." - mknod(2)
	if mode.FileType() == 0 {
		mode |= linux.ModeRegular
	}

	devMajor, devMinor := linux.DecodeDeviceID(dev)
	opts := vfs.MknodOptions{
		Mode:     mode &^ linux.FileMode(t.FSContext().Umask()),
		DevMajor: uint32(devMajor),
		DevMinor: devMinor,
	}
	return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &opts)
}
// Open implements Linux syscall open(2).
//
// It forwards to openat with AT_FDCWD as the directory FD; args[0] is the path
// address, args[1] the open flags and args[2] the creation mode.
func Open(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return openat(t, linux.AT_FDCWD, args[0].Pointer(), args[1].Uint(), args[2].ModeT())
}
// Openat implements Linux syscall openat(2).
//
// args[0] is the directory FD, args[1] the path address, args[2] the open
// flags and args[3] the creation mode.
func Openat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return openat(t, args[0].Int(), args[1].Pointer(), args[2].Uint(), args[3].ModeT())
}
// Creat implements Linux syscall creat(2).
//
// It is an openat relative to AT_FDCWD with the flags fixed to
// O_WRONLY|O_CREAT|O_TRUNC; args[0] is the path address and args[1] the mode.
func Creat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pathAddr := args[0].Pointer()
	fileMode := args[1].ModeT()
	return openat(t, linux.AT_FDCWD, pathAddr, linux.O_WRONLY|linux.O_CREAT|linux.O_TRUNC, fileMode)
}
// openat is the shared implementation of Open, Openat and Creat. It opens the
// file named by pathAddr relative to dirfd and installs it in the task's FD
// table, returning the new FD number.
//
// O_LARGEFILE is always added to flags, and mode is restricted to the
// permission, set-ID and sticky bits before the task's umask is applied.
func openat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, flags uint32, mode uint) (uintptr, *kernel.SyscallControl, error) {
	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return 0, nil, err
	}

	// The final symlink is followed unless O_NOFOLLOW was requested.
	follow := shouldFollowFinalSymlink(flags&linux.O_NOFOLLOW == 0)
	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, follow)
	if err != nil {
		return 0, nil, err
	}
	defer tpop.Release(t)

	opts := vfs.OpenOptions{
		Flags: flags | linux.O_LARGEFILE,
		Mode:  linux.FileMode(mode & (0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX) &^ t.FSContext().Umask()),
	}
	file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &opts)
	if err != nil {
		return 0, nil, err
	}
	defer file.DecRef(t)

	fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}
	newFD, err := t.NewFDFrom(0, file, fdFlags)
	return uintptr(newFD), nil, err
}
// Access implements Linux syscall access(2).
//
// The path at args[0] is checked relative to AT_FDCWD with the access mode in
// args[1] and no flags.
func Access(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pathAddr := args[0].Pointer()
	accessMode := args[1].Uint()
	err := accessAt(t, linux.AT_FDCWD, pathAddr, accessMode, 0 /* flags */)
	return 0, nil, err
}
// Faccessat implements Linux syscall faccessat(2).
//
// args[0] is the directory FD, args[1] the path address and args[2] the
// access mode. No flags are passed on to accessAt.
func Faccessat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirFD := args[0].Int()
	pathAddr := args[1].Pointer()
	accessMode := args[2].Uint()
	err := accessAt(t, dirFD, pathAddr, accessMode, 0 /* flags */)
	return 0, nil, err
}
// Faccessat2 implements Linux syscall faccessat2(2).
//
// args[0] is the directory FD, args[1] the path address, args[2] the access
// mode and args[3] the AT_* flags.
func Faccessat2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirFD := args[0].Int()
	pathAddr := args[1].Pointer()
	accessMode := args[2].Uint()
	atFlags := args[3].Int()
	err := accessAt(t, dirFD, pathAddr, accessMode, atFlags)
	return 0, nil, err
}
// accessAt is the shared implementation of Access, Faccessat and Faccessat2.
//
// It validates mode and flags (EINVAL on unknown bits), resolves the path
// relative to dirfd, and performs the access check. Unless AT_EACCESS is set,
// the check uses a forked copy of the task's credentials in which the
// effective IDs are replaced by the real IDs.
func accessAt(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, mode uint32, flags int32) error {
	const (
		rOK = 4
		wOK = 2
		xOK = 1
	)

	// Reject any mode bits other than R_OK, W_OK and X_OK.
	if mode&^(rOK|wOK|xOK) != 0 {
		return linuxerr.EINVAL
	}
	// faccessat2(2) isn't documented as supporting AT_EMPTY_PATH, but it does.
	if flags&^(linux.AT_EACCESS|linux.AT_SYMLINK_NOFOLLOW|linux.AT_EMPTY_PATH) != 0 {
		return linuxerr.EINVAL
	}

	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return err
	}
	emptyOK := shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0)
	follow := shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0)
	tpop, err := getTaskPathOperation(t, dirfd, path, emptyOK, follow)
	if err != nil {
		return err
	}
	defer tpop.Release(t)

	creds := t.Credentials()
	if flags&linux.AT_EACCESS == 0 {
		// access(2) and faccessat(2) check permissions using real
		// UID/GID, not effective UID/GID.
		//
		// "access() needs to use the real uid/gid, not the effective
		// uid/gid. We do this by temporarily clearing all FS-related
		// capabilities and switching the fsuid/fsgid around to the
		// real ones." -fs/open.c:faccessat
		realCreds := creds.Fork()
		realCreds.EffectiveKUID = realCreds.RealKUID
		realCreds.EffectiveKGID = realCreds.RealKGID
		// Capabilities are dropped unless the real UID is root in the
		// credentials' user namespace.
		realCreds.EffectiveCaps = 0
		if realCreds.EffectiveKUID.In(realCreds.UserNamespace) == auth.RootUID {
			realCreds.EffectiveCaps = realCreds.PermittedCaps
		}
		creds = realCreds
	}

	return t.Kernel().VFS().AccessAt(t, creds, vfs.AccessTypes(mode), &tpop.pop)
}
// Ioctl implements Linux syscall ioctl(2).
//
// args[0] is the file descriptor and args[1] the request. Requests that apply
// to every kind of file (FIONCLEX, FIOCLEX, FIONBIO, FIOASYNC,
// FIOGETOWN/SIOCGPGRP and FIOSETOWN/SIOCSPGRP) are handled here; any other
// request is forwarded to the file's own Ioctl implementation.
//
// EBADF is returned if the FD is not open or was opened with O_PATH.
func Ioctl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()

	file := t.GetFile(fd)
	if file == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer file.DecRef(t)

	if file.StatusFlags()&linux.O_PATH != 0 {
		return 0, nil, linuxerr.EBADF
	}

	// Handle ioctls that apply to all FDs.
	switch args[1].Int() {
	case linux.FIONCLEX:
		t.FDTable().SetFlags(t, fd, kernel.FDFlags{
			CloseOnExec: false,
		})
		return 0, nil, nil

	case linux.FIOCLEX:
		t.FDTable().SetFlags(t, fd, kernel.FDFlags{
			CloseOnExec: true,
		})
		return 0, nil, nil

	case linux.FIONBIO:
		var set int32
		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
			return 0, nil, err
		}
		flags := file.StatusFlags()
		if set != 0 {
			flags |= linux.O_NONBLOCK
		} else {
			flags &^= linux.O_NONBLOCK
		}
		return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags)

	case linux.FIOASYNC:
		var set int32
		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
			return 0, nil, err
		}
		flags := file.StatusFlags()
		if set != 0 {
			flags |= linux.O_ASYNC
		} else {
			flags &^= linux.O_ASYNC
		}
		// Report a failure to update the status flags to the caller, as the
		// FIONBIO case does, rather than claiming success unconditionally.
		return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags)

	case linux.FIOGETOWN, linux.SIOCGPGRP:
		// A process-group owner is reported as a negative ID; with no owner
		// set, zero is written out.
		var who int32
		owner, hasOwner := getAsyncOwner(t, file)
		if hasOwner {
			if owner.Type == linux.F_OWNER_PGRP {
				who = -owner.PID
			} else {
				who = owner.PID
			}
		}
		_, err := primitive.CopyInt32Out(t, args[2].Pointer(), who)
		return 0, nil, err

	case linux.FIOSETOWN, linux.SIOCSPGRP:
		var who int32
		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &who); err != nil {
			return 0, nil, err
		}
		// A negative value selects a process group by its absolute value.
		ownerType := int32(linux.F_OWNER_PID)
		if who < 0 {
			// Check for overflow before flipping the sign.
			if who-1 > who {
				return 0, nil, linuxerr.EINVAL
			}
			ownerType = linux.F_OWNER_PGRP
			who = -who
		}
		return 0, nil, setAsyncOwner(t, int(fd), file, ownerType, who)
	}

	ret, err := file.Ioctl(t, t.MemoryManager(), sysno, args)
	return ret, nil, err
}
// Getcwd implements Linux syscall getcwd(2).
//
// args[0] is the destination buffer address and args[1] its size. The
// working-directory pathname is written with a trailing NUL, and the number of
// bytes copied out is returned. ERANGE is returned if the buffer cannot hold
// the pathname plus its terminator.
func Getcwd(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	bufAddr := args[0].Pointer()
	bufSize := args[1].SizeT()

	root := t.FSContext().RootDirectory()
	wd := t.FSContext().WorkingDirectory()
	pathname, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd)
	// Both references are dropped before the lookup result is inspected.
	root.DecRef(t)
	wd.DecRef(t)
	if err != nil {
		return 0, nil, err
	}

	// Note this is <= because we need a terminator.
	if bufSize <= uint(len(pathname)) {
		return 0, nil, linuxerr.ERANGE
	}

	// Construct a byte slice containing a NUL terminator.
	out := t.CopyScratchBuffer(len(pathname) + 1)
	copy(out, pathname)
	out[len(out)-1] = 0

	// Write the pathname slice.
	written, err := t.CopyOutBytes(bufAddr, out)
	if err != nil {
		return 0, nil, err
	}
	return uintptr(written), nil, nil
}
// Chdir implements Linux syscall chdir(2).
//
// The path at args[0] is resolved relative to AT_FDCWD, following a final
// symlink, and the resulting dentry becomes the task's working directory.
func Chdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	path, err := copyInPath(t, args[0].Pointer())
	if err != nil {
		return 0, nil, err
	}
	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
	if err != nil {
		return 0, nil, err
	}
	defer tpop.Release(t)

	opts := vfs.GetDentryOptions{CheckSearchable: true}
	newWD, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &opts)
	if err != nil {
		return 0, nil, err
	}
	t.FSContext().SetWorkingDirectory(t, newWD)
	newWD.DecRef(t)
	return 0, nil, nil
}
// Fchdir implements Linux syscall fchdir(2).
//
// The directory is identified by the FD in args[0] together with an empty
// path; the resulting dentry becomes the task's working directory.
func Fchdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirFD := args[0].Int()
	tpop, err := getTaskPathOperation(t, dirFD, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return 0, nil, err
	}
	defer tpop.Release(t)

	opts := vfs.GetDentryOptions{CheckSearchable: true}
	newWD, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &opts)
	if err != nil {
		return 0, nil, err
	}
	t.FSContext().SetWorkingDirectory(t, newWD)
	newWD.DecRef(t)
	return 0, nil, nil
}
// Chroot implements Linux syscall chroot(2).
//
// The caller must hold CAP_SYS_CHROOT (EPERM otherwise). The path at args[0]
// is resolved relative to AT_FDCWD, following a final symlink, and the
// resulting dentry becomes the task's root directory.
func Chroot(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pathAddr := args[0].Pointer()

	// The capability check precedes any access to user memory.
	if !t.HasCapability(linux.CAP_SYS_CHROOT) {
		return 0, nil, linuxerr.EPERM
	}

	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return 0, nil, err
	}
	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
	if err != nil {
		return 0, nil, err
	}
	defer tpop.Release(t)

	opts := vfs.GetDentryOptions{CheckSearchable: true}
	newRoot, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &opts)
	if err != nil {
		return 0, nil, err
	}
	t.FSContext().SetRootDirectory(t, newRoot)
	newRoot.DecRef(t)
	return 0, nil, nil
}
// PivotRoot implements Linux syscall pivot_root(2).
//
// args[0] is the address of the new-root path and args[1] the address of the
// put-old path. The caller must hold CAP_SYS_ADMIN (EPERM otherwise). After
// the VFS performs the pivot, the kernel replaces the old root with the new
// root in its filesystem contexts.
func PivotRoot(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	newRootAddr := args[0].Pointer()
	putOldAddr := args[1].Pointer()

	if !t.HasCapability(linux.CAP_SYS_ADMIN) {
		return 0, nil, linuxerr.EPERM
	}

	// Resolve the new root.
	newRootPath, err := copyInPath(t, newRootAddr)
	if err != nil {
		return 0, nil, err
	}
	newRootTpop, err := getTaskPathOperation(t, linux.AT_FDCWD, newRootPath, disallowEmptyPath, followFinalSymlink)
	if err != nil {
		return 0, nil, err
	}
	defer newRootTpop.Release(t)

	// Resolve the location where the old root will be put.
	putOldPath, err := copyInPath(t, putOldAddr)
	if err != nil {
		return 0, nil, err
	}
	putOldTpop, err := getTaskPathOperation(t, linux.AT_FDCWD, putOldPath, disallowEmptyPath, followFinalSymlink)
	if err != nil {
		return 0, nil, err
	}
	defer putOldTpop.Release(t)

	vfsObj := t.Kernel().VFS()
	newRoot, oldRoot, err := vfsObj.PivotRoot(t, t.Credentials(), &newRootTpop.pop, &putOldTpop.pop)
	if err != nil {
		return 0, nil, err
	}
	defer newRoot.DecRef(t)
	defer oldRoot.DecRef(t)

	t.Kernel().ReplaceFSContextRoots(t, oldRoot, newRoot)
	return 0, nil, nil
}
// Close implements Linux syscall close(2).
//
// The FD in args[0] is removed from the task's FD table; EBADF is returned if
// it was not open. The result of the file's OnClose hook is passed through
// HandleIOError.
func Close(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	// Note that Remove provides a reference on the file that we may use to
	// flush. It is still active until we drop the final reference below
	// (and other reference-holding operations complete).
	file := t.FDTable().Remove(t, args[0].Int())
	if file == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer file.DecRef(t)

	return 0, nil, HandleIOError(t, false /* partial */, file.OnClose(t), linuxerr.EINTR, "close", file)
}
// CloseRange implements linux syscall close_range(2).
//
// args[0] and args[1] give the first and last file descriptors of the range
// to act on, and args[2] carries the CLOSE_RANGE_* flags. EINVAL is returned
// if first > last or if unknown flag bits are set.
//
// With CLOSE_RANGE_UNSHARE the task's FD table is unshared first. With
// CLOSE_RANGE_CLOEXEC the descriptors in the range are only marked
// close-on-exec; otherwise each one is removed from the table and closed.
func CloseRange(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	first := args[0].Uint()
	last := args[1].Uint()
	flags := args[2].Uint()

	if first > last {
		return 0, nil, linuxerr.EINVAL
	}

	if (flags & ^(linux.CLOSE_RANGE_CLOEXEC | linux.CLOSE_RANGE_UNSHARE)) != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	// close_range allows fd arguments to be up to MaxUint32, but only fds
	// up to kernel.MaxFdLimit are valid, so cap it here.
	if last > uint32(kernel.MaxFdLimit) {
		last = uint32(kernel.MaxFdLimit)
	}
	// NOTE(review): first is not capped, so int32(first) below is negative
	// when first exceeds MaxInt32. Confirm that UnshareFdTable,
	// SetFlagsForRange and RemoveNextInRange treat a negative start as an
	// empty range.

	cloexec := flags & linux.CLOSE_RANGE_CLOEXEC
	unshare := flags & linux.CLOSE_RANGE_UNSHARE

	if unshare != 0 {
		// If possible, we don't want to copy FDs to the new unshared table, because those FDs will
		// be promptly closed and no longer used. So in the case where we know the range extends all
		// the way to the end of the FdTable, we can simply copy the FdTable only up to the start of
		// the range that we are closing.
		if cloexec == 0 && int32(last) >= t.FDTable().GetLastFd() {
			t.UnshareFdTable(int32(first))
		} else {
			t.UnshareFdTable(kernel.MaxFdLimit)
		}
	}

	if cloexec != 0 {
		flagToApply := kernel.FDFlags{
			CloseOnExec: true,
		}
		t.FDTable().SetFlagsForRange(t.AsyncContext(), int32(first), int32(last), flagToApply)
		return 0, nil, nil
	}

	fdTable := t.FDTable()
	next := int32(first)
	for {
		fd, file := fdTable.RemoveNextInRange(t, next, int32(last))
		if file == nil {
			break
		}

		// Resume the scan just past the descriptor that was removed. The
		// result is deliberately stored in a separate variable: declaring
		// "fd, file :=" against an outer fd would shadow it and restart every
		// scan at first.
		next = fd + 1

		// Per the close_range(2) documentation, errors upon closing file descriptors are ignored.
		_ = file.OnClose(t)
		file.DecRef(t)
	}

	return 0, nil, nil
}
// Dup implements Linux syscall dup(2).
//
// The file behind the FD in args[0] is installed at a new FD, searching
// upward from 0. EBADF is returned if the FD is not open; any failure to
// allocate the new FD is reported as EMFILE.
func Dup(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldFD := args[0].Int()

	file := t.GetFile(oldFD)
	if file == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer file.DecRef(t)

	if newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{}); err == nil {
		return uintptr(newFD), nil, nil
	}
	return 0, nil, linuxerr.EMFILE
}
// Dup2 implements Linux syscall dup2(2).
//
// args[0] is the existing FD and args[1] the FD number to install it at. When
// both are equal the call only verifies that the FD is open.
func Dup2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldfd := args[0].Int()
	newfd := args[1].Int()

	if oldfd != newfd {
		return dup3(t, oldfd, newfd, 0)
	}

	// As long as oldfd is valid, dup2() does nothing and returns newfd.
	file := t.GetFile(oldfd)
	if file == nil {
		return 0, nil, linuxerr.EBADF
	}
	file.DecRef(t)
	return uintptr(newfd), nil, nil
}
// Dup3 implements Linux syscall dup3(2).
//
// args[0] is the existing FD, args[1] the target FD number and args[2] the
// flags. Unlike Dup2, identical FDs are rejected with EINVAL.
func Dup3(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldfd, newfd := args[0].Int(), args[1].Int()
	flags := args[2].Uint()

	if oldfd == newfd {
		return 0, nil, linuxerr.EINVAL
	}
	return dup3(t, oldfd, newfd, flags)
}
// dup3 is the shared implementation of Dup2 and Dup3. It installs the file
// behind oldfd at newfd and returns newfd.
//
// Only O_CLOEXEC is accepted in flags (EINVAL otherwise). EBADF is returned if
// oldfd is not open, and an EMFILE failure from the FD table is reported as
// EBADF.
func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) {
	if flags&^linux.O_CLOEXEC != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	file := t.GetFile(oldfd)
	if file == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer file.DecRef(t)

	fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}
	displaced, err := t.NewFDAt(newfd, file, fdFlags)
	if linuxerr.Equals(linuxerr.EMFILE, err) {
		err = linuxerr.EBADF
	}
	if err != nil {
		return 0, nil, err
	}

	if displaced != nil {
		// "If the file descriptor newfd was previously open, it is closed
		// before being reused; the close is performed silently (i.e., any
		// errors during the close are not reported by dup2())." - dup(2)
		_ = displaced.OnClose(t)
		displaced.DecRef(t)
	}
	return uintptr(newfd), nil, nil
}
// Fcntl implements linux syscall fcntl(2).
//
// args[0] is the file descriptor, args[1] the command and args[2] the
// command-specific argument. EBADF is returned if the FD is not open, or if
// it was opened with O_PATH and the command is not one of the few permitted
// on such FDs. Commands not handled below fail with EINVAL.
func Fcntl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	cmd := args[1].Int()

	file, fdFlags := t.FDTable().Get(fd)
	if file == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer file.DecRef(t)

	// Only a handful of commands may be issued on an O_PATH descriptor.
	if file.StatusFlags()&linux.O_PATH != 0 {
		permitted := cmd == linux.F_DUPFD ||
			cmd == linux.F_DUPFD_CLOEXEC ||
			cmd == linux.F_GETFD ||
			cmd == linux.F_SETFD ||
			cmd == linux.F_GETFL
		if !permitted {
			return 0, nil, linuxerr.EBADF
		}
	}

	switch cmd {
	case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
		newFD, err := t.NewFDFrom(args[2].Int(), file, kernel.FDFlags{
			CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC,
		})
		if err != nil {
			return 0, nil, err
		}
		return uintptr(newFD), nil, nil

	case linux.F_GETFD:
		return uintptr(fdFlags.ToLinuxFDFlags()), nil, nil

	case linux.F_SETFD:
		closeOnExec := args[2].Uint()&linux.FD_CLOEXEC != 0
		return 0, nil, t.FDTable().SetFlags(t, fd, kernel.FDFlags{
			CloseOnExec: closeOnExec,
		})

	case linux.F_GETFL:
		return uintptr(file.StatusFlags()), nil, nil

	case linux.F_SETFL:
		return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint())

	case linux.F_GETOWN:
		// A process-group owner is reported as a negative ID.
		owner, ok := getAsyncOwner(t, file)
		switch {
		case !ok:
			return 0, nil, nil
		case owner.Type == linux.F_OWNER_PGRP:
			return uintptr(-owner.PID), nil, nil
		default:
			return uintptr(owner.PID), nil, nil
		}

	case linux.F_SETOWN:
		// A negative argument selects a process group by its absolute value.
		id := args[2].Int()
		kind := int32(linux.F_OWNER_PID)
		if id < 0 {
			// Check for overflow before flipping the sign.
			if id-1 > id {
				return 0, nil, linuxerr.EINVAL
			}
			kind = linux.F_OWNER_PGRP
			id = -id
		}
		return 0, nil, setAsyncOwner(t, int(fd), file, kind, id)

	case linux.F_GETOWN_EX:
		owner, ok := getAsyncOwner(t, file)
		if !ok {
			return 0, nil, nil
		}
		_, err := owner.CopyOut(t, args[2].Pointer())
		return 0, nil, err

	case linux.F_SETOWN_EX:
		var owner linux.FOwnerEx
		if _, err := owner.CopyIn(t, args[2].Pointer()); err != nil {
			return 0, nil, err
		}
		return 0, nil, setAsyncOwner(t, int(fd), file, owner.Type, owner.PID)

	case linux.F_SETPIPE_SZ:
		pipeFD, isPipe := file.Impl().(*pipe.VFSPipeFD)
		if !isPipe {
			return 0, nil, linuxerr.EBADF
		}
		newSize, err := pipeFD.SetPipeSize(int64(args[2].Int()))
		if err != nil {
			return 0, nil, err
		}
		return uintptr(newSize), nil, nil

	case linux.F_GETPIPE_SZ:
		pipeFD, isPipe := file.Impl().(*pipe.VFSPipeFD)
		if !isPipe {
			return 0, nil, linuxerr.EBADF
		}
		return uintptr(pipeFD.PipeSize()), nil, nil

	case linux.F_GET_SEALS:
		seals, err := tmpfs.GetSeals(file)
		return uintptr(seals), nil, err

	case linux.F_ADD_SEALS:
		if !file.IsWritable() {
			return 0, nil, linuxerr.EPERM
		}
		return 0, nil, tmpfs.AddSeals(file, args[2].Uint())

	case linux.F_SETLK, linux.F_SETLKW:
		// Process-associated locks; only F_SETLKW blocks.
		return 0, nil, posixLock(t, args, file, false /* ofd */, cmd == linux.F_SETLKW /* block */)

	case linux.F_OFD_SETLK, linux.F_OFD_SETLKW:
		// Open-file-description locks; only F_OFD_SETLKW blocks.
		return 0, nil, posixLock(t, args, file, true /* ofd */, cmd == linux.F_OFD_SETLKW /* block */)

	case linux.F_GETLK, linux.F_OFD_GETLK:
		return 0, nil, posixTestLock(t, args, file, cmd == linux.F_OFD_GETLK /* ofd */)

	case linux.F_GETSIG:
		handler := file.AsyncHandler()
		if handler == nil {
			// Default behavior aka SIGIO.
			return 0, nil, nil
		}
		return uintptr(handler.(*fasync.FileAsync).Signal()), nil, nil

	case linux.F_SETSIG:
		handler, err := file.SetAsyncHandler(fasync.New(int(fd)))
		if err != nil {
			return 0, nil, err
		}
		return 0, nil, handler.(*fasync.FileAsync).SetSignal(linux.Signal(args[2].Int()))

	default:
		// Everything else is not yet supported.
		return 0, nil, linuxerr.EINVAL
	}
}
// getAsyncOwner reports the owner recorded by fd's async handler, with IDs
// translated into t's PID namespace.
//
// hasOwner is false only when fd has no async handler. If a handler exists
// but records no owner, a zero FOwnerEx is returned with hasOwner set to true.
func getAsyncOwner(t *kernel.Task, fd *vfs.FileDescription) (ownerEx linux.FOwnerEx, hasOwner bool) {
	handler := fd.AsyncHandler()
	if handler == nil {
		return linux.FOwnerEx{}, false
	}

	task, tg, pg := handler.(*fasync.FileAsync).Owner()
	if task != nil {
		return linux.FOwnerEx{
			Type: linux.F_OWNER_TID,
			PID:  int32(t.PIDNamespace().IDOfTask(task)),
		}, true
	}
	if tg != nil {
		return linux.FOwnerEx{
			Type: linux.F_OWNER_PID,
			PID:  int32(t.PIDNamespace().IDOfThreadGroup(tg)),
		}, true
	}
	if pg != nil {
		return linux.FOwnerEx{
			Type: linux.F_OWNER_PGRP,
			PID:  int32(t.PIDNamespace().IDOfProcessGroup(pg)),
		}, true
	}
	return linux.FOwnerEx{}, true
}
// setAsyncOwner sets the owner of file's async handler, installing a handler
// for fd when needed.
//
// ownerType must be F_OWNER_TID, F_OWNER_PID or F_OWNER_PGRP (EINVAL
// otherwise). A pid of zero clears the owner. Otherwise pid is looked up in
// t's PID namespace and ESRCH is returned if nothing matches.
func setAsyncOwner(t *kernel.Task, fd int, file *vfs.FileDescription, ownerType, pid int32) error {
	// Validate the owner type before touching the file's async handler.
	if ownerType != linux.F_OWNER_TID && ownerType != linux.F_OWNER_PID && ownerType != linux.F_OWNER_PGRP {
		return linuxerr.EINVAL
	}

	handler, err := file.SetAsyncHandler(fasync.New(fd))
	if err != nil {
		return err
	}
	async := handler.(*fasync.FileAsync)

	if pid == 0 {
		async.ClearOwner()
		return nil
	}

	pidns := t.PIDNamespace()
	switch ownerType {
	case linux.F_OWNER_TID:
		if task := pidns.TaskWithID(kernel.ThreadID(pid)); task != nil {
			async.SetOwnerTask(t, task)
			return nil
		}
		return linuxerr.ESRCH
	case linux.F_OWNER_PID:
		if tg := pidns.ThreadGroupWithID(kernel.ThreadID(pid)); tg != nil {
			async.SetOwnerThreadGroup(t, tg)
			return nil
		}
		return linuxerr.ESRCH
	case linux.F_OWNER_PGRP:
		if pg := pidns.ProcessGroupWithID(kernel.ProcessGroupID(pid)); pg != nil {
			async.SetOwnerProcessGroup(t, pg)
			return nil
		}
		return linuxerr.ESRCH
	default:
		return linuxerr.EINVAL
	}
}
// posixTestLock implements the F_GETLK and F_OFD_GETLK commands of Fcntl.
//
// It copies in the flock structure at args[2], tests the requested read or
// write lock against file, and copies the result back out to the same
// address. ofd selects the open file description, rather than the task's FD
// table, as the lock owner identity.
func posixTestLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, ofd bool) error {
	// Copy in the lock request.
	flockAddr := args[2].Pointer()
	var req linux.Flock
	if _, err := req.CopyIn(t, flockAddr); err != nil {
		return err
	}

	// Only read and write locks can be tested.
	var lockType lock.LockType
	switch req.Type {
	case linux.F_RDLCK:
		lockType = lock.ReadLock
	case linux.F_WRLCK:
		lockType = lock.WriteLock
	default:
		return linuxerr.EINVAL
	}

	lockRange, err := file.ComputeLockRange(t, uint64(req.Start), uint64(req.Len), req.Whence)
	if err != nil {
		return err
	}

	owner := lock.UniqueID(t.FDTable())
	if ofd {
		owner = lock.UniqueID(file)
	}

	result, err := file.TestPOSIX(t, owner, lockType, lockRange)
	if err != nil {
		return err
	}
	if !ofd {
		// The PID is translated from the root PID namespace into the
		// caller's namespace.
		result.PID = translatePID(t.PIDNamespace().Root(), t.PIDNamespace(), result.PID)
	}

	_, err = result.CopyOut(t, flockAddr)
	return err
}
// translatePID converts pid, which is interpreted in the namespace old, to the
// ID of the same task in the namespace new.
//
// The translation may race with task termination/creation, so the task that
// pid originally named may no longer exist by the time it is looked up. This
// is used to implement the F_GETLK fcntl, and Linux has the same potential
// race (i.e., there is no synchronization between retrieving the lock PID and
// translating it). See fs/locks.c:posix_lock_to_flock.
func translatePID(old, new *kernel.PIDNamespace, pid int32) int32 {
	task := old.TaskWithID(kernel.ThreadID(pid))
	return int32(new.IDOfTask(task))
}
// posixLock implements the F_SETLK, F_SETLKW, F_OFD_SETLK and F_OFD_SETLKW
// commands of Fcntl.
//
// It copies in the flock structure at args[2] and acquires or releases the
// described lock on file. ofd selects the open file description, rather than
// the task's FD table, as the lock owner identity; block is passed through to
// LockPOSIX. Acquiring a read lock requires a readable file and a write lock
// a writable one (EBADF otherwise).
func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, ofd bool, block bool) error {
	// Copy in the lock request.
	var req linux.Flock
	if _, err := req.CopyIn(t, args[2].Pointer()); err != nil {
		return err
	}
	// Open-file-description lock requests must leave the PID field zero.
	if ofd && req.PID != 0 {
		return linuxerr.EINVAL
	}

	owner := lock.UniqueID(t.FDTable())
	ownerPID := int32(t.TGIDInRoot())
	if ofd {
		owner = lock.UniqueID(file)
		ownerPID = -1
	}

	lockRange, err := file.ComputeLockRange(t, uint64(req.Start), uint64(req.Len), req.Whence)
	if err != nil {
		return err
	}

	switch req.Type {
	case linux.F_UNLCK:
		return file.UnlockPOSIX(t, owner, lockRange)
	case linux.F_RDLCK:
		if file.IsReadable() {
			return file.LockPOSIX(t, owner, ownerPID, lock.ReadLock, lockRange, block)
		}
		return linuxerr.EBADF
	case linux.F_WRLCK:
		if file.IsWritable() {
			return file.LockPOSIX(t, owner, ownerPID, lock.WriteLock, lockRange, block)
		}
		return linuxerr.EBADF
	default:
		return linuxerr.EINVAL
	}
}
// Fadvise64 implements fadvise64(2).
//
// The arguments are validated but the advice itself is currently ignored.
// args[0] is the FD, args[2] the length and args[3] the advice; the offset is
// never read.
func Fadvise64(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	length := args[2].Int64()
	advice := args[3].Int()

	// Note: offset is allowed to be negative.
	if length < 0 {
		return 0, nil, linuxerr.EINVAL
	}

	file := t.GetFile(fd)
	if file == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer file.DecRef(t)

	if file.StatusFlags()&linux.O_PATH != 0 {
		return 0, nil, linuxerr.EBADF
	}

	// If the FD refers to a pipe or FIFO, return error.
	if _, ok := file.Impl().(*pipe.VFSPipeFD); ok {
		return 0, nil, linuxerr.ESPIPE
	}

	switch advice {
	case linux.POSIX_FADV_NORMAL,
		linux.POSIX_FADV_RANDOM,
		linux.POSIX_FADV_SEQUENTIAL,
		linux.POSIX_FADV_WILLNEED,
		linux.POSIX_FADV_DONTNEED,
		linux.POSIX_FADV_NOREUSE:
		// Recognized advice: accepted, but nothing is done with it.
		return 0, nil, nil
	default:
		return 0, nil, linuxerr.EINVAL
	}
}
// Mkdir implements Linux syscall mkdir(2).
//
// The path at args[0] is created relative to AT_FDCWD with the mode given in
// args[1].
func Mkdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pathAddr := args[0].Pointer()
	dirMode := args[1].ModeT()
	err := mkdirat(t, linux.AT_FDCWD, pathAddr, dirMode)
	return 0, nil, err
}
// Mkdirat implements Linux syscall mkdirat(2).
//
// args[0] is the directory FD, args[1] the path address and args[2] the mode.
func Mkdirat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirFD := args[0].Int()
	pathAddr := args[1].Pointer()
	dirMode := args[2].ModeT()
	err := mkdirat(t, dirFD, pathAddr, dirMode)
	return 0, nil, err
}
// mkdirat is the shared implementation of Mkdir and Mkdirat. It creates a
// directory at the path read from addr, resolved relative to dirfd.
//
// mode is restricted to the permission and sticky bits before the task's
// umask is applied.
func mkdirat(t *kernel.Task, dirfd int32, addr hostarch.Addr, mode uint) error {
	path, err := copyInPath(t, addr)
	if err != nil {
		return err
	}
	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return err
	}
	defer tpop.Release(t)

	opts := vfs.MkdirOptions{
		Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()),
	}
	return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &opts)
}
// Rmdir implements Linux syscall rmdir(2).
//
// The path at args[0] is removed relative to AT_FDCWD.
func Rmdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := rmdirat(t, linux.AT_FDCWD, args[0].Pointer())
	return 0, nil, err
}
// rmdirat removes the directory named by the path at pathAddr, resolved
// relative to dirfd without following a final symlink. It backs Rmdir and the
// AT_REMOVEDIR case of Unlinkat.
func rmdirat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr) error {
	dirPath, err := copyInPath(t, pathAddr)
	if err != nil {
		return err
	}
	tpop, err := getTaskPathOperation(t, dirfd, dirPath, disallowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return err
	}
	defer tpop.Release(t)

	vfsObj := t.Kernel().VFS()
	return vfsObj.RmdirAt(t, t.Credentials(), &tpop.pop)
}
// Symlink implements Linux syscall symlink(2).
//
// args[0] is the address of the link target and args[1] the address of the
// link path, which is resolved relative to AT_FDCWD.
func Symlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := symlinkat(t, args[0].Pointer(), linux.AT_FDCWD, args[1].Pointer())
	return 0, nil, err
}
// Symlinkat implements Linux syscall symlinkat(2).
//
// args[0] is the address of the link target, args[1] the directory FD for the
// link path and args[2] the address of the link path.
func Symlinkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := symlinkat(t, args[0].Pointer(), args[1].Int(), args[2].Pointer())
	return 0, nil, err
}
// symlinkat is the shared implementation of Symlink and Symlinkat. It creates
// a symbolic link at the path read from linkpathAddr, resolved relative to
// newdirfd, pointing at the string read from targetAddr.
//
// The target is limited to PATH_MAX bytes, and an empty target yields ENOENT.
func symlinkat(t *kernel.Task, targetAddr hostarch.Addr, newdirfd int32, linkpathAddr hostarch.Addr) error {
	// The target is copied in as a plain string, not as a path.
	target, err := t.CopyInString(targetAddr, linux.PATH_MAX)
	if err != nil {
		return err
	}
	if target == "" {
		return linuxerr.ENOENT
	}

	linkPath, err := copyInPath(t, linkpathAddr)
	if err != nil {
		return err
	}
	tpop, err := getTaskPathOperation(t, newdirfd, linkPath, disallowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return err
	}
	defer tpop.Release(t)

	vfsObj := t.Kernel().VFS()
	return vfsObj.SymlinkAt(t, t.Credentials(), &tpop.pop, target)
}
// Link implements Linux syscall link(2).
//
// args[0] is the address of the existing path and args[1] the address of the
// new path; both are resolved relative to AT_FDCWD with no flags.
func Link(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldAddr, newAddr := args[0].Pointer(), args[1].Pointer()
	err := linkat(t, linux.AT_FDCWD, oldAddr, linux.AT_FDCWD, newAddr, 0 /* flags */)
	return 0, nil, err
}
// Linkat implements Linux syscall linkat(2).
//
// args[0] and args[1] are the directory FD and path address of the existing
// file, args[2] and args[3] those of the new link, and args[4] the AT_* flags.
func Linkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldDirFD, oldAddr := args[0].Int(), args[1].Pointer()
	newDirFD, newAddr := args[2].Int(), args[3].Pointer()
	atFlags := args[4].Int()
	err := linkat(t, oldDirFD, oldAddr, newDirFD, newAddr, atFlags)
	return 0, nil, err
}
// linkat is the shared implementation of Link and Linkat. It creates a hard
// link at the new path that refers to the file named by the old path.
//
// Only AT_EMPTY_PATH and AT_SYMLINK_FOLLOW are accepted in flags (EINVAL
// otherwise). AT_EMPTY_PATH requires CAP_DAC_READ_SEARCH, and ENOENT is
// returned without it.
func linkat(t *kernel.Task, olddirfd int32, oldpathAddr hostarch.Addr, newdirfd int32, newpathAddr hostarch.Addr, flags int32) error {
	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 {
		return linuxerr.EINVAL
	}
	emptyPath := flags&linux.AT_EMPTY_PATH != 0
	if emptyPath && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) {
		return linuxerr.ENOENT
	}

	// Resolve the existing file.
	oldpath, err := copyInPath(t, oldpathAddr)
	if err != nil {
		return err
	}
	followOld := shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_FOLLOW != 0)
	oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, shouldAllowEmptyPath(emptyPath), followOld)
	if err != nil {
		return err
	}
	defer oldtpop.Release(t)

	// Resolve the location of the new link.
	newpath, err := copyInPath(t, newpathAddr)
	if err != nil {
		return err
	}
	newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return err
	}
	defer newtpop.Release(t)

	vfsObj := t.Kernel().VFS()
	return vfsObj.LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop)
}
// Readlinkat implements Linux syscall readlinkat(2).
//
// args[0] is the directory FD, args[1] the path address, args[2] the
// destination buffer address and args[3] the buffer size.
func Readlinkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return readlinkat(t, args[0].Int(), args[1].Pointer(), args[2].Pointer(), args[3].SizeT())
}
// Readlink implements Linux syscall readlink(2).
//
// args[0] is the path address, args[1] the destination buffer address and
// args[2] the buffer size; the path is resolved relative to AT_FDCWD.
func Readlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return readlinkat(t, linux.AT_FDCWD, args[0].Pointer(), args[1].Pointer(), args[2].SizeT())
}
// readlinkat is the shared implementation of Readlink and Readlinkat. It
// reads the target of the symbolic link named by pathAddr relative to dirfd
// and copies at most size bytes of it to bufAddr, returning the number of
// bytes copied.
//
// EINVAL is returned if size is not positive when viewed as an int. A copy-out
// error is reported only when no bytes were written.
func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr hostarch.Addr, size uint) (uintptr, *kernel.SyscallControl, error) {
	if int(size) <= 0 {
		return 0, nil, linuxerr.EINVAL
	}

	linkPath, err := copyInPath(t, pathAddr)
	if err != nil {
		return 0, nil, err
	}
	// "Since Linux 2.6.39, pathname can be an empty string, in which case the
	// call operates on the symbolic link referred to by dirfd ..." -
	// readlinkat(2)
	tpop, err := getTaskPathOperation(t, dirfd, linkPath, allowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return 0, nil, err
	}
	defer tpop.Release(t)

	target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop)
	if err != nil {
		return 0, nil, err
	}

	// Truncate the target to the size of the caller's buffer.
	if int(size) < len(target) {
		target = target[:size]
	}

	copied, err := t.CopyOutBytes(bufAddr, gohacks.ImmutableBytesFromString(target))
	if copied != 0 {
		return uintptr(copied), nil, nil
	}
	return 0, nil, err
}
// Unlink implements Linux syscall unlink(2).
//
// The path at args[0] is unlinked relative to AT_FDCWD.
func Unlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := unlinkat(t, linux.AT_FDCWD, args[0].Pointer())
	return 0, nil, err
}
// unlinkat unlinks the file named by the path at pathAddr, resolved relative
// to dirfd without following a final symlink. It backs Unlink and the
// non-AT_REMOVEDIR case of Unlinkat.
func unlinkat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr) error {
	filePath, err := copyInPath(t, pathAddr)
	if err != nil {
		return err
	}
	tpop, err := getTaskPathOperation(t, dirfd, filePath, disallowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return err
	}
	defer tpop.Release(t)

	vfsObj := t.Kernel().VFS()
	return vfsObj.UnlinkAt(t, t.Credentials(), &tpop.pop)
}
// Unlinkat implements Linux syscall unlinkat(2).
//
// args[0] is the directory FD, args[1] the path address and args[2] the
// flags. AT_REMOVEDIR is the only accepted flag (EINVAL otherwise); when set,
// the call removes a directory instead of unlinking a file.
func Unlinkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirFD := args[0].Int()
	pathAddr := args[1].Pointer()
	atFlags := args[2].Int()

	switch {
	case atFlags&^linux.AT_REMOVEDIR != 0:
		return 0, nil, linuxerr.EINVAL
	case atFlags&linux.AT_REMOVEDIR != 0:
		return 0, nil, rmdirat(t, dirFD, pathAddr)
	default:
		return 0, nil, unlinkat(t, dirFD, pathAddr)
	}
}
// setstatat applies opts to the file identified by path, resolved relative to
// dirfd, via VirtualFilesystem.SetStatAt() (or FileDescription.SetStat() for
// the empty-path case described below).
//
// Absolute paths start at the task's root directory. Relative paths start at
// the task's working directory when dirfd is AT_FDCWD, otherwise at the
// dentry of the file referred to by dirfd.
//
// It returns ENOENT for an empty path when shouldAllowEmptyPath is false, and
// EBADF when dirfd does not refer to an open file.
func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error {
root := t.FSContext().RootDirectory()
defer root.DecRef(t)
// Default to starting at root; only relative paths pick a different start.
start := root
if !path.Absolute {
if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
return linuxerr.ENOENT
}
if dirfd == linux.AT_FDCWD {
start = t.FSContext().WorkingDirectory()
defer start.DecRef(t)
} else {
dirfile := t.GetFile(dirfd)
if dirfile == nil {
return linuxerr.EBADF
}
if !path.HasComponents() && dirfile.StatusFlags()&linux.O_PATH == 0 {
// For empty path, use FileDescription.SetStat() instead of
// VirtualFilesystem.SetStatAt(), since the former may be able to use
// opened file state to expedite the SetStat. Skip this optimization
// for FDs with O_PATH, since the FD impl always returns EBADF.
err := dirfile.SetStat(t, *opts)
dirfile.DecRef(t)
return err
}
// Take a reference on the file's dentry before releasing the file
// reference; the dentry reference is dropped when this function returns.
start = dirfile.VirtualDentry()
start.IncRef()
defer start.DecRef(t)
dirfile.DecRef(t)
}
}
return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{
Root: root,
Start: start,
Path: path,
FollowFinalSymlink: bool(shouldFollowFinalSymlink),
}, opts)
}
// handleSetSizeError post-processes the error from a size-changing SetStat.
// ErrExceedsFileSizeLimit is turned into EFBIG after sending SIGXFSZ to t; any
// other value (including nil) is returned unchanged.
func handleSetSizeError(t *kernel.Task, err error) error {
	if err != linuxerr.ErrExceedsFileSizeLimit {
		return err
	}
	// Convert error to EFBIG and send a SIGXFSZ per setrlimit(2).
	sig := kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)
	t.SendSignal(sig)
	return linuxerr.EFBIG
}
// Truncate implements Linux syscall truncate(2).
//
// A negative length yields EINVAL. The size change is requested with
// NeedWritePerm set, and a file-size-limit failure is converted by
// handleSetSizeError.
func Truncate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pathAddr := args[0].Pointer()
	newSize := args[1].Int64()
	if newSize < 0 {
		return 0, nil, linuxerr.EINVAL
	}

	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return 0, nil, err
	}

	opts := vfs.SetStatOptions{
		Stat: linux.Statx{
			Mask: linux.STATX_SIZE,
			Size: uint64(newSize),
		},
		NeedWritePerm: true,
	}
	err = setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
	return 0, nil, handleSetSizeError(t, err)
}
// Ftruncate implements Linux syscall ftruncate(2).
//
// Returns EINVAL for a negative length or a file that is not writable, and
// EBADF when fd is not an open file.
func Ftruncate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	newSize := args[1].Int64()
	if newSize < 0 {
		return 0, nil, linuxerr.EINVAL
	}

	file := t.GetFile(fd)
	if file == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer file.DecRef(t)

	if !file.IsWritable() {
		return 0, nil, linuxerr.EINVAL
	}

	var opts vfs.SetStatOptions
	opts.Stat.Mask = linux.STATX_SIZE
	opts.Stat.Size = uint64(newSize)
	return 0, nil, handleSetSizeError(t, file.SetStat(t, opts))
}
// Umask implements linux syscall umask(2).
//
// Only the low nine permission bits of the argument are installed; the value
// returned by SwapUmask is the syscall result.
func Umask(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	requested := args[0].ModeT() & 0777
	swapped := t.FSContext().SwapUmask(requested)
	return uintptr(swapped), nil, nil
}
// Chown implements Linux syscall chown(2).
//
// Equivalent to fchownat with AT_FDCWD and no flags, so a trailing symlink is
// followed.
func Chown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	const noFlags = 0
	var (
		pathAddr = args[0].Pointer()
		owner    = args[1].Int()
		group    = args[2].Int()
	)
	err := fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, noFlags)
	return 0, nil, err
}
// Lchown implements Linux syscall lchown(2).
//
// Equivalent to fchownat with AT_FDCWD and AT_SYMLINK_NOFOLLOW, so a trailing
// symlink is itself the target.
func Lchown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		pathAddr = args[0].Pointer()
		owner    = args[1].Int()
		group    = args[2].Int()
	)
	err := fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, linux.AT_SYMLINK_NOFOLLOW)
	return 0, nil, err
}
// Fchownat implements Linux syscall fchownat(2).
//
// All five arguments are decoded and forwarded to fchownat, which performs
// the flag validation.
func Fchownat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		dirfd    = args[0].Int()
		pathAddr = args[1].Pointer()
		owner    = args[2].Int()
		group    = args[3].Int()
		flags    = args[4].Int()
	)
	err := fchownat(t, dirfd, pathAddr, owner, group, flags)
	return 0, nil, err
}
// fchownat is the shared implementation of Chown, Lchown and Fchownat. It
// changes the owner and/or group of the file named by the path at pathAddr,
// resolved relative to dirfd.
//
// Only AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW are accepted in flags; any other
// bit yields EINVAL.
func fchownat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, owner, group, flags int32) error {
	const validFlags = linux.AT_EMPTY_PATH | linux.AT_SYMLINK_NOFOLLOW
	if flags&^validFlags != 0 {
		return linuxerr.EINVAL
	}

	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return err
	}

	var opts vfs.SetStatOptions
	if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
		return err
	}

	allowEmpty := shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0)
	followFinal := shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0)
	return setstatat(t, dirfd, path, allowEmpty, followFinal, &opts)
}
// populateSetStatOptionsForChown fills opts with the ownership change
// described by owner and group. An ID of -1 leaves the corresponding field
// and mask bit untouched. Each remaining ID is mapped through t's user
// namespace; an ID for which the mapping is not Ok yields EINVAL.
func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vfs.SetStatOptions) error {
	ns := t.UserNamespace()
	stat := &opts.Stat

	if owner != -1 {
		mappedUID := ns.MapToKUID(auth.UID(owner))
		if !mappedUID.Ok() {
			return linuxerr.EINVAL
		}
		stat.Mask |= linux.STATX_UID
		stat.UID = uint32(mappedUID)
	}

	if group != -1 {
		mappedGID := ns.MapToKGID(auth.GID(group))
		if !mappedGID.Ok() {
			return linuxerr.EINVAL
		}
		stat.Mask |= linux.STATX_GID
		stat.GID = uint32(mappedGID)
	}

	return nil
}
// Fchown implements Linux syscall fchown(2).
//
// The ownership change is applied directly to the open file via SetStat.
// Returns EBADF when fd is not an open file.
func Fchown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		fd    = args[0].Int()
		owner = args[1].Int()
		group = args[2].Int()
	)

	file := t.GetFile(fd)
	if file == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer file.DecRef(t)

	var opts vfs.SetStatOptions
	err := populateSetStatOptionsForChown(t, owner, group, &opts)
	if err != nil {
		return 0, nil, err
	}
	return 0, nil, file.SetStat(t, opts)
}
// chmodMask is the set of mode bits that the chmod family of syscalls keeps
// from the caller-supplied mode: the nine permission bits plus S_ISUID,
// S_ISGID and S_ISVTX.
const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX
// Chmod implements Linux syscall chmod(2).
//
// Equivalent to fchmodat with AT_FDCWD as the directory FD.
func Chmod(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := fchmodat(t, linux.AT_FDCWD, args[0].Pointer(), args[1].ModeT())
	return 0, nil, err
}
// Fchmodat implements Linux syscall fchmodat(2).
//
// Only dirfd, the path and the mode are read from the syscall arguments.
func Fchmodat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		dirfd    = args[0].Int()
		pathAddr = args[1].Pointer()
		mode     = args[2].ModeT()
	)
	err := fchmodat(t, dirfd, pathAddr, mode)
	return 0, nil, err
}
// fchmodat is the shared implementation of Chmod and Fchmodat. It sets the
// mode of the file named by the path at pathAddr, resolved relative to dirfd.
// Empty paths are rejected, a trailing symlink is followed, and mode bits
// outside chmodMask are discarded.
func fchmodat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, mode uint) error {
	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return err
	}

	opts := vfs.SetStatOptions{
		Stat: linux.Statx{
			Mask: linux.STATX_MODE,
			Mode: uint16(mode & chmodMask),
		},
	}
	return setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &opts)
}
// Fchmod implements Linux syscall fchmod(2).
//
// The mode, masked by chmodMask, is applied directly to the open file via
// SetStat. Returns EBADF when fd is not an open file.
func Fchmod(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	mode := args[1].ModeT()

	file := t.GetFile(fd)
	if file == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer file.DecRef(t)

	var opts vfs.SetStatOptions
	opts.Stat.Mask = linux.STATX_MODE
	opts.Stat.Mode = uint16(mode & chmodMask)
	return 0, nil, file.SetStat(t, opts)
}
// Utime implements Linux syscall utime(2).
//
// A null times pointer sets both timestamps to UTIME_NOW. Otherwise the
// struct at timesAddr is copied in and supplies the seconds for the access
// and modification times.
func Utime(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pathAddr := args[0].Pointer()
	timesAddr := args[1].Pointer()

	var opts vfs.SetStatOptions
	opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME

	if timesAddr != 0 {
		var times linux.Utime
		if _, err := times.CopyIn(t, timesAddr); err != nil {
			return 0, nil, err
		}
		opts.Stat.Atime.Sec = times.Actime
		opts.Stat.Mtime.Sec = times.Modtime
	} else {
		opts.Stat.Atime.Nsec = linux.UTIME_NOW
		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
	}

	err := utimes(t, linux.AT_FDCWD, pathAddr, followFinalSymlink, &opts)
	return 0, nil, err
}
// Utimes implements Linux syscall utimes(2).
//
// The timestamps are decoded by populateSetStatOptionsForUtimes and applied
// to the path resolved from AT_FDCWD, following a trailing symlink.
func Utimes(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pathAddr, timesAddr := args[0].Pointer(), args[1].Pointer()

	var opts vfs.SetStatOptions
	err := populateSetStatOptionsForUtimes(t, timesAddr, &opts)
	if err != nil {
		return 0, nil, err
	}
	return 0, nil, utimes(t, linux.AT_FDCWD, pathAddr, followFinalSymlink, &opts)
}
// Futimesat implements Linux syscall futimesat(2).
//
// Like Utimes, but the path is resolved relative to the dirfd argument.
func Futimesat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		dirfd     = args[0].Int()
		pathAddr  = args[1].Pointer()
		timesAddr = args[2].Pointer()
	)

	var opts vfs.SetStatOptions
	err := populateSetStatOptionsForUtimes(t, timesAddr, &opts)
	if err != nil {
		return 0, nil, err
	}
	return 0, nil, utimes(t, dirfd, pathAddr, followFinalSymlink, &opts)
}
// populateSetStatOptionsForUtimes fills opts with the access and modification
// times described by the two-element timeval array at timesAddr.
//
// A null timesAddr requests UTIME_NOW for both timestamps. Otherwise both
// entries are copied in, each Usec must lie in [0, 999999] (else EINVAL), and
// the values are converted to nanosecond-resolution statx timestamps.
func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr hostarch.Addr, opts *vfs.SetStatOptions) error {
	const bothTimes = linux.STATX_ATIME | linux.STATX_MTIME

	if timesAddr == 0 {
		opts.Stat.Mask = bothTimes
		opts.Stat.Atime.Nsec = linux.UTIME_NOW
		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
		return nil
	}

	var times [2]linux.Timeval
	if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil {
		return err
	}

	// Validate both entries before touching opts.
	for i := range times {
		if usec := times[i].Usec; usec < 0 || usec > 999999 {
			return linuxerr.EINVAL
		}
	}

	// toStatx converts a validated timeval into a statx timestamp.
	toStatx := func(tv linux.Timeval) linux.StatxTimestamp {
		return linux.StatxTimestamp{
			Sec:  tv.Sec,
			Nsec: uint32(tv.Usec * 1000),
		}
	}

	opts.Stat.Mask = bothTimes
	opts.Stat.Atime = toStatx(times[0])
	opts.Stat.Mtime = toStatx(times[1])
	return nil
}
// Utimensat implements Linux syscall utimensat(2).
//
// If neither timestamp is to be changed (the populated mask is empty), the
// call succeeds without validating flags or touching the path. Otherwise
// AT_SYMLINK_NOFOLLOW is the only accepted flag.
func Utimensat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		dirfd     = args[0].Int()
		pathAddr  = args[1].Pointer()
		timesAddr = args[2].Pointer()
		flags     = args[3].Int()
	)

	// Linux requires that the UTIME_OMIT check occur before flags.
	var opts vfs.SetStatOptions
	if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil {
		return 0, nil, err
	}

	switch {
	case opts.Stat.Mask == 0:
		// Nothing to update.
		return 0, nil, nil
	case flags&^linux.AT_SYMLINK_NOFOLLOW != 0:
		return 0, nil, linuxerr.EINVAL
	}

	followFinal := shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0)
	return 0, nil, utimes(t, dirfd, pathAddr, followFinal, &opts)
}
// populateSetStatOptionsForUtimens fills opts with the access and
// modification times described by the two-element timespec array at
// timesAddr.
//
// A null timesAddr requests UTIME_NOW for both timestamps. Otherwise an entry
// whose Nsec is UTIME_OMIT is skipped (its mask bit stays clear), UTIME_NOW is
// passed through, and any other Nsec must lie in [0, 999999999] or EINVAL is
// returned. The access time is processed, and possibly stored, before the
// modification time is validated.
func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr hostarch.Addr, opts *vfs.SetStatOptions) error {
	if timesAddr == 0 {
		opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
		opts.Stat.Atime.Nsec = linux.UTIME_NOW
		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
		return nil
	}

	var times [2]linux.Timespec
	if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil {
		return err
	}
	atime, mtime := times[0], times[1]

	if atime.Nsec != linux.UTIME_OMIT {
		outOfRange := atime.Nsec < 0 || atime.Nsec > 999999999
		if outOfRange && atime.Nsec != linux.UTIME_NOW {
			return linuxerr.EINVAL
		}
		opts.Stat.Mask |= linux.STATX_ATIME
		opts.Stat.Atime = linux.StatxTimestamp{
			Sec:  atime.Sec,
			Nsec: uint32(atime.Nsec),
		}
	}

	if mtime.Nsec != linux.UTIME_OMIT {
		outOfRange := mtime.Nsec < 0 || mtime.Nsec > 999999999
		if outOfRange && mtime.Nsec != linux.UTIME_NOW {
			return linuxerr.EINVAL
		}
		opts.Stat.Mask |= linux.STATX_MTIME
		opts.Stat.Mtime = linux.StatxTimestamp{
			Sec:  mtime.Sec,
			Nsec: uint32(mtime.Nsec),
		}
	}

	return nil
}
// utimes applies the timestamp changes in opts either to the open file
// referred to by dirfd (when no path is given) or to the file named by the
// path at pathAddr, resolved relative to dirfd.
//
// Analogous to fs/utimes.c:do_utimes().
func utimes(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error {
	// "If filename is NULL and dfd refers to an open file, then operate on the
	// file. Otherwise look up filename, possibly using dfd as a starting
	// point." - fs/utimes.c:do_utimes()
	if pathAddr == 0 && dirfd != linux.AT_FDCWD {
		file := t.GetFile(dirfd)
		if file == nil {
			return linuxerr.EBADF
		}
		defer file.DecRef(t)
		return file.SetStat(t, *opts)
	}

	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return err
	}
	return setstatat(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink, opts)
}
// Rename implements Linux syscall rename(2).
//
// Both paths are resolved from AT_FDCWD and no rename flags are passed.
func Rename(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	const noFlags = 0
	oldpathAddr, newpathAddr := args[0].Pointer(), args[1].Pointer()
	err := renameat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, noFlags)
	return 0, nil, err
}
// Renameat implements Linux syscall renameat(2).
//
// Each path is resolved relative to its own directory FD; no rename flags are
// passed.
func Renameat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	const noFlags = 0
	var (
		olddirfd    = args[0].Int()
		oldpathAddr = args[1].Pointer()
		newdirfd    = args[2].Int()
		newpathAddr = args[3].Pointer()
	)
	err := renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, noFlags)
	return 0, nil, err
}
// Renameat2 implements Linux syscall renameat2(2).
//
// Identical to Renameat except that the fifth argument is forwarded as the
// rename flags.
func Renameat2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		olddirfd    = args[0].Int()
		oldpathAddr = args[1].Pointer()
		newdirfd    = args[2].Int()
		newpathAddr = args[3].Pointer()
		flags       = args[4].Uint()
	)
	err := renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
	return 0, nil, err
}
// renameat is the shared implementation of Rename, Renameat and Renameat2.
// The old path is copied in and resolved before the new path is read. Neither
// path may be empty and neither lookup follows a trailing symlink. flags is
// passed through in vfs.RenameOptions.
func renameat(t *kernel.Task, olddirfd int32, oldpathAddr hostarch.Addr, newdirfd int32, newpathAddr hostarch.Addr, flags uint32) error {
	oldpath, err := copyInPath(t, oldpathAddr)
	if err != nil {
		return err
	}
	// "If oldpath refers to a symbolic link, the link is renamed" - rename(2)
	oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, disallowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return err
	}
	defer oldtpop.Release(t)

	newpath, err := copyInPath(t, newpathAddr)
	if err != nil {
		return err
	}
	newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return err
	}
	defer newtpop.Release(t)

	renameOpts := vfs.RenameOptions{Flags: flags}
	vfsObj := t.Kernel().VFS()
	return vfsObj.RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &renameOpts)
}
// Fallocate implements linux system call fallocate(2).
//
// Errors, in the order they are checked:
//   - EBADF: fd is not an open file, or the file is not writable.
//   - ENOTSUP: mode is non-zero.
//   - EINVAL: offset is negative or length is not positive.
//   - EFBIG: offset+length overflows, or reaches the task's current file size
//     limit (in which case SIGXFSZ is also sent).
func Fallocate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		fd     = args[0].Int()
		mode   = args[1].Uint64()
		offset = args[2].Int64()
		length = args[3].Int64()
	)

	file := t.GetFile(fd)
	if file == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer file.DecRef(t)

	switch {
	case !file.IsWritable():
		return 0, nil, linuxerr.EBADF
	case mode != 0:
		return 0, nil, linuxerr.ENOTSUP
	case offset < 0 || length <= 0:
		return 0, nil, linuxerr.EINVAL
	}

	// A negative sum means offset+length wrapped around.
	end := offset + length
	if end < 0 {
		return 0, nil, linuxerr.EFBIG
	}

	if limit := limits.FromContext(t).Get(limits.FileSize).Cur; uint64(end) >= limit {
		sig := linux.SignalInfo{
			Signo: int32(linux.SIGXFSZ),
			Code:  linux.SI_USER,
		}
		t.SendSignal(&sig)
		return 0, nil, linuxerr.EFBIG
	}

	return 0, nil, file.Allocate(t, mode, uint64(offset), uint64(length))
}
// Flock implements linux syscall flock(2).
//
// LOCK_NB is stripped from the operation and selects non-blocking
// acquisition. The remaining value must be exactly LOCK_EX, LOCK_SH or
// LOCK_UN.
func Flock(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	operation := args[1].Int()

	file := t.GetFile(fd)
	if file == nil {
		// flock(2): EBADF fd is not an open file descriptor.
		return 0, nil, linuxerr.EBADF
	}
	defer file.DecRef(t)

	block := operation&linux.LOCK_NB == 0

	var err error
	switch operation &^ linux.LOCK_NB {
	case linux.LOCK_EX:
		err = file.LockBSD(t, int32(t.TGIDInRoot()), lock.WriteLock, block)
	case linux.LOCK_SH:
		err = file.LockBSD(t, int32(t.TGIDInRoot()), lock.ReadLock, block)
	case linux.LOCK_UN:
		err = file.UnlockBSD(t)
	default:
		// flock(2): EINVAL operation is invalid.
		err = linuxerr.EINVAL
	}
	return 0, nil, err
}
// Constants used by MemfdCreate.
const (
// memfdPrefix is prepended to the user-supplied name of every memfd.
memfdPrefix = "memfd:"
// memfdMaxNameLen bounds the user-supplied name so that the prefixed name
// does not exceed NAME_MAX.
memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix)
// memfdAllFlags is the set of flag bits MemfdCreate accepts.
memfdAllFlags = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING)
)
// MemfdCreate implements the linux syscall memfd_create(2).
//
// Flag bits outside memfdAllFlags yield EINVAL. The user-supplied name is
// read with a limit of memfdMaxNameLen, prefixed with memfdPrefix, and used to
// create a memfd on the kernel's shm mount. The syscall result is the new
// file descriptor.
func MemfdCreate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	nameAddr := args[0].Pointer()
	flags := args[1].Uint()

	if flags&^memfdAllFlags != 0 {
		// Unknown bits in flags.
		return 0, nil, linuxerr.EINVAL
	}

	name, err := t.CopyInString(nameAddr, memfdMaxNameLen)
	if err != nil {
		return 0, nil, err
	}

	sealable := flags&linux.MFD_ALLOW_SEALING != 0
	file, err := tmpfs.NewMemfd(t, t.Credentials(), t.Kernel().ShmMount(), sealable, memfdPrefix+name)
	if err != nil {
		return 0, nil, err
	}
	// The reference returned by NewMemfd is dropped on return regardless of
	// whether FD installation succeeds.
	defer file.DecRef(t)

	fdFlags := kernel.FDFlags{
		CloseOnExec: flags&linux.MFD_CLOEXEC != 0,
	}
	fd, err := t.NewFDFrom(0, file, fdFlags)
	if err != nil {
		return 0, nil, err
	}
	return uintptr(fd), nil, nil
}