mirror of
https://github.com/netbirdio/gvisor.git
synced 2026-05-22 17:12:49 -07:00
8b7b69c978
When e.g. an application thread takes a page fault on an mmapped file, MM calls `memmap.Mappable.Translate()` to obtain the corresponding host FD range that should be mapped into the application's address space. It passes both the range that *must* be mapped (e.g. the faulting page) as `required`, and the maximum range that *may* be mapped (the previously-unfaulted part of the corresponding VMA) as `optional`, such that file implementations can map more than `required` to avoid future page faults. Prior to this CL, `tmpfs.regularFile.Translate()` always returned translations up to `optional`, under the assumption that allocating larger ranges from `pgalloc.MemoryFile` has negligible incremental cost. This behavior dates to the introduction of `memmap.Mappable.Translate()` (cl/182882705) and thus predates the implementation of tmpfs size limits (cl/442686814). Now that the latter exists, unconditionally translating - and therefore allocating pages - up to `optional` can result in hitting tmpfs size limits prematurely. Thus: Constrain optional translations returned by `tmpfs.regularFile.Translate()`, applying the same logic as `gofer.maxFillRange()`. PiperOrigin-RevId: 713134287
908 lines
28 KiB
Go
908 lines
28 KiB
Go
// Copyright 2019 The gVisor Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package tmpfs
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"math"
|
|
|
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
|
"gvisor.dev/gvisor/pkg/atomicbitops"
|
|
"gvisor.dev/gvisor/pkg/context"
|
|
"gvisor.dev/gvisor/pkg/errors/linuxerr"
|
|
"gvisor.dev/gvisor/pkg/hostarch"
|
|
"gvisor.dev/gvisor/pkg/safemem"
|
|
"gvisor.dev/gvisor/pkg/sentry/fsmetric"
|
|
"gvisor.dev/gvisor/pkg/sentry/fsutil"
|
|
"gvisor.dev/gvisor/pkg/sentry/hostfd"
|
|
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
|
|
"gvisor.dev/gvisor/pkg/sentry/memmap"
|
|
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
|
|
"gvisor.dev/gvisor/pkg/sentry/usage"
|
|
"gvisor.dev/gvisor/pkg/sentry/vfs"
|
|
"gvisor.dev/gvisor/pkg/sync"
|
|
"gvisor.dev/gvisor/pkg/usermem"
|
|
)
|
|
|
|
// regularFile is a regular (=S_IFREG) tmpfs file.
|
|
//
|
|
// +stateify savable
|
|
type regularFile struct {
|
|
inode inode
|
|
|
|
// memoryUsageKind is the memory accounting category under which pages backing
|
|
// this regularFile's contents are accounted.
|
|
memoryUsageKind usage.MemoryKind
|
|
|
|
// mapsMu protects mappings.
|
|
mapsMu sync.Mutex `state:"nosave"`
|
|
|
|
// mappings tracks mappings of the file into memmap.MappingSpaces.
|
|
//
|
|
// Protected by mapsMu.
|
|
mappings memmap.MappingSet
|
|
|
|
// writableMappingPages tracks how many pages of virtual memory are mapped
|
|
// as potentially writable from this file. If a page has multiple mappings,
|
|
// each mapping is counted separately.
|
|
//
|
|
// This counter is susceptible to overflow as we can potentially count
|
|
// mappings from many VMAs. We count pages rather than bytes to slightly
|
|
// mitigate this.
|
|
//
|
|
// Protected by mapsMu.
|
|
writableMappingPages uint64
|
|
|
|
// dataMu protects the fields below.
|
|
dataMu sync.RWMutex `state:"nosave"`
|
|
|
|
// data maps offsets into the file to offsets into memFile that store
|
|
// the file's data.
|
|
//
|
|
// Protected by dataMu.
|
|
data fsutil.FileRangeSet
|
|
|
|
// seals represents file seals on this inode.
|
|
//
|
|
// Protected by dataMu.
|
|
seals uint32
|
|
|
|
// initiallyUnlinked is true if this file was created using NewZeroFile or
|
|
// NewMemfd => newUnlinkedRegularFileDescription. initiallyUnlinked should
|
|
// be true when the equivalent shmem file in Linux would use
|
|
// shmem_anon_vm_ops rather than shmem_vm_ops.
|
|
//
|
|
// initiallyUnlinked is immutable, but stored here since it fits into
|
|
// alignment padding.
|
|
initiallyUnlinked bool
|
|
|
|
// size is the size of data.
|
|
//
|
|
// Protected by both dataMu and inode.mu; reading it requires holding
|
|
// either mutex, while writing requires holding both AND using atomics.
|
|
// Readers that do not require consistency (like Stat) may read the
|
|
// value atomically without holding either lock.
|
|
size atomicbitops.Uint64
|
|
}
|
|
|
|
func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) *inode {
|
|
file := ®ularFile{
|
|
memoryUsageKind: fs.usage,
|
|
seals: linux.F_SEAL_SEAL,
|
|
}
|
|
file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode, parentDir)
|
|
file.inode.nlink = atomicbitops.FromUint32(1) // from parent directory
|
|
return &file.inode
|
|
}
|
|
|
|
// newUnlinkedRegularFileDescription creates a regular file on the tmpfs
|
|
// filesystem represented by mount and returns an FD representing that file.
|
|
// The new file is not reachable by path traversal from any other file.
|
|
//
|
|
// newUnlinkedRegularFileDescription is analogous to Linux's
|
|
// mm/shmem.c:__shmem_file_setup().
|
|
//
|
|
// Preconditions: mount must be a tmpfs mount.
|
|
func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) {
|
|
fs, ok := mount.Filesystem().Impl().(*filesystem)
|
|
if !ok {
|
|
panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount")
|
|
}
|
|
|
|
inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777, nil /* parentDir */)
|
|
inode.impl.(*regularFile).initiallyUnlinked = true
|
|
d := fs.newDentry(inode)
|
|
defer d.DecRef(ctx)
|
|
d.name = name
|
|
|
|
fd := ®ularFileFD{}
|
|
fd.Init(&inode.locks)
|
|
flags := uint32(linux.O_RDWR)
|
|
if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
|
|
return nil, err
|
|
}
|
|
return fd, nil
|
|
}
|
|
|
|
// NewZeroFile creates a new regular file and file description as for
|
|
// mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is
|
|
// initially (implicitly) filled with zeroes.
|
|
//
|
|
// Preconditions: mount must be a tmpfs mount.
|
|
func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) {
|
|
// Compare mm/shmem.c:shmem_zero_setup().
|
|
fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
rf := fd.inode().impl.(*regularFile)
|
|
rf.memoryUsageKind = usage.Anonymous
|
|
rf.size.Store(size)
|
|
return &fd.vfsfd, err
|
|
}
|
|
|
|
// NewMemfd creates a new regular file and file description as for
|
|
// memfd_create.
|
|
//
|
|
// Preconditions: mount must be a tmpfs mount.
|
|
func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
|
|
fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if allowSeals {
|
|
fd.inode().impl.(*regularFile).seals = 0
|
|
}
|
|
return &fd.vfsfd, nil
|
|
}
|
|
|
|
// truncate grows or shrinks the file to the given size. It returns true if the
|
|
// file size was updated.
|
|
func (rf *regularFile) truncate(newSize uint64) (bool, error) {
|
|
rf.inode.mu.Lock()
|
|
defer rf.inode.mu.Unlock()
|
|
return rf.truncateLocked(newSize)
|
|
}
|
|
|
|
// Preconditions:
|
|
// - rf.inode.mu must be held.
|
|
// - rf.dataMu must be locked for writing.
|
|
// - newSize > rf.size.
|
|
func (rf *regularFile) growLocked(newSize uint64) error {
|
|
// Can we grow the file?
|
|
if rf.seals&linux.F_SEAL_GROW != 0 {
|
|
return linuxerr.EPERM
|
|
}
|
|
rf.size.Store(newSize)
|
|
return nil
|
|
}
|
|
|
|
// Preconditions: rf.inode.mu must be held.
|
|
func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) {
|
|
oldSize := rf.size.RacyLoad()
|
|
if newSize == oldSize {
|
|
// Nothing to do.
|
|
return false, nil
|
|
}
|
|
|
|
// Need to hold inode.mu and dataMu while modifying size.
|
|
rf.dataMu.Lock()
|
|
if newSize > oldSize {
|
|
err := rf.growLocked(newSize)
|
|
rf.dataMu.Unlock()
|
|
return err == nil, err
|
|
}
|
|
|
|
// We are shrinking the file. First check if this is allowed.
|
|
if rf.seals&linux.F_SEAL_SHRINK != 0 {
|
|
rf.dataMu.Unlock()
|
|
return false, linuxerr.EPERM
|
|
}
|
|
|
|
rf.size.Store(newSize)
|
|
rf.dataMu.Unlock()
|
|
|
|
// Invalidate past translations of truncated pages.
|
|
oldpgend := offsetPageEnd(int64(oldSize))
|
|
newpgend := offsetPageEnd(int64(newSize))
|
|
if newpgend < oldpgend {
|
|
rf.mapsMu.Lock()
|
|
rf.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
|
|
// Compare Linux's mm/shmem.c:shmem_setattr() =>
|
|
// mm/memory.c:unmap_mapping_range(evencows=1).
|
|
InvalidatePrivate: true,
|
|
})
|
|
rf.mapsMu.Unlock()
|
|
}
|
|
|
|
// We are now guaranteed that there are no translations of truncated pages,
|
|
// and can remove them.
|
|
rf.dataMu.Lock()
|
|
decPages := rf.data.Truncate(newSize, rf.inode.fs.mf)
|
|
rf.dataMu.Unlock()
|
|
rf.inode.fs.unaccountPages(decPages)
|
|
return true, nil
|
|
}
|
|
|
|
// AddMapping implements memmap.Mappable.AddMapping.
|
|
func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
|
|
rf.mapsMu.Lock()
|
|
defer rf.mapsMu.Unlock()
|
|
rf.dataMu.RLock()
|
|
defer rf.dataMu.RUnlock()
|
|
|
|
// Reject writable mapping if F_SEAL_WRITE is set.
|
|
if rf.seals&linux.F_SEAL_WRITE != 0 && writable {
|
|
return linuxerr.EPERM
|
|
}
|
|
|
|
rf.mappings.AddMapping(ms, ar, offset, writable)
|
|
if writable {
|
|
pagesBefore := rf.writableMappingPages
|
|
|
|
// ar is guaranteed to be page aligned per memmap.Mappable.
|
|
rf.writableMappingPages += uint64(ar.Length() / hostarch.PageSize)
|
|
|
|
if rf.writableMappingPages < pagesBefore {
|
|
panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// RemoveMapping implements memmap.Mappable.RemoveMapping.
|
|
func (rf *regularFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
|
|
rf.mapsMu.Lock()
|
|
defer rf.mapsMu.Unlock()
|
|
|
|
rf.mappings.RemoveMapping(ms, ar, offset, writable)
|
|
|
|
if writable {
|
|
pagesBefore := rf.writableMappingPages
|
|
|
|
// ar is guaranteed to be page aligned per memmap.Mappable.
|
|
rf.writableMappingPages -= uint64(ar.Length() / hostarch.PageSize)
|
|
|
|
if rf.writableMappingPages > pagesBefore {
|
|
panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
|
|
}
|
|
}
|
|
}
|
|
|
|
// CopyMapping implements memmap.Mappable.CopyMapping.
|
|
func (rf *regularFile) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
|
|
return rf.AddMapping(ctx, ms, dstAR, offset, writable)
|
|
}
|
|
|
|
// Translate implements memmap.Mappable.Translate.
|
|
func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
|
|
memCgID := pgalloc.MemoryCgroupIDFromContext(ctx)
|
|
|
|
rf.dataMu.Lock()
|
|
defer rf.dataMu.Unlock()
|
|
|
|
// Constrain translations to f.attr.Size (rounded up) to prevent
|
|
// translation to pages that may be concurrently truncated.
|
|
pgend := offsetPageEnd(int64(rf.size.RacyLoad()))
|
|
var beyondEOF bool
|
|
if required.End > pgend {
|
|
if required.Start >= pgend {
|
|
return nil, &memmap.BusError{io.EOF}
|
|
}
|
|
beyondEOF = true
|
|
required.End = pgend
|
|
}
|
|
if optional.End > pgend {
|
|
optional.End = pgend
|
|
}
|
|
// Constrain allocation to at most maxOptionalBytes or required.Length(),
|
|
// whichever is greater.
|
|
const maxOptionalBytes = 64 << 10 // 64 KB, arbitrarily matches Linux's default fault_around_pages
|
|
if required.Length() >= maxOptionalBytes {
|
|
optional = required
|
|
} else {
|
|
if optional.Length() > maxOptionalBytes {
|
|
optional.Start = required.Start
|
|
if optional.Length() > maxOptionalBytes {
|
|
optional.End = optional.Start + maxOptionalBytes
|
|
}
|
|
}
|
|
}
|
|
pagesToFill := rf.data.PagesToFill(required, optional)
|
|
if !rf.inode.fs.accountPages(pagesToFill) {
|
|
// If we can not accommodate pagesToFill pages, then retry with just
|
|
// the required range. Because optional may be larger than required.
|
|
// Only error out if even the required range can not be allocated for.
|
|
pagesToFill = rf.data.PagesToFill(required, required)
|
|
if !rf.inode.fs.accountPages(pagesToFill) {
|
|
return nil, &memmap.BusError{linuxerr.ENOSPC}
|
|
}
|
|
optional = required
|
|
}
|
|
pagesAlloced, cerr := rf.data.Fill(ctx, required, optional, rf.size.RacyLoad(), rf.inode.fs.mf, pgalloc.AllocOpts{
|
|
Kind: rf.memoryUsageKind,
|
|
MemCgID: memCgID,
|
|
}, nil)
|
|
// rf.data.Fill() may fail mid-way. We still want to account any pages that
|
|
// were allocated, irrespective of an error.
|
|
rf.inode.fs.adjustPageAcct(pagesToFill, pagesAlloced)
|
|
|
|
var ts []memmap.Translation
|
|
var translatedEnd uint64
|
|
for seg := rf.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
|
|
segMR := seg.Range().Intersect(optional)
|
|
ts = append(ts, memmap.Translation{
|
|
Source: segMR,
|
|
File: rf.inode.fs.mf,
|
|
Offset: seg.FileRangeOf(segMR).Start,
|
|
Perms: hostarch.AnyAccess,
|
|
})
|
|
translatedEnd = segMR.End
|
|
}
|
|
|
|
// Don't return the error returned by f.data.Fill if it occurred outside of
|
|
// required.
|
|
if translatedEnd < required.End && cerr != nil {
|
|
return ts, &memmap.BusError{cerr}
|
|
}
|
|
if beyondEOF {
|
|
return ts, &memmap.BusError{io.EOF}
|
|
}
|
|
return ts, nil
|
|
}
|
|
|
|
// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
|
|
func (*regularFile) InvalidateUnsavable(context.Context) error {
|
|
return nil
|
|
}
|
|
|
|
// +stateify savable
|
|
type regularFileFD struct {
|
|
fileDescription
|
|
|
|
// off is the file offset. off is accessed using atomic memory operations.
|
|
// offMu serializes operations that may mutate off.
|
|
off int64
|
|
offMu sync.Mutex `state:"nosave"`
|
|
}
|
|
|
|
// Release implements vfs.FileDescriptionImpl.Release.
|
|
func (fd *regularFileFD) Release(context.Context) {
|
|
// noop
|
|
}
|
|
|
|
// Allocate implements vfs.FileDescriptionImpl.Allocate.
|
|
func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
|
|
f := fd.inode().impl.(*regularFile)
|
|
memCgID := pgalloc.MemoryCgroupIDFromContext(ctx)
|
|
|
|
// To be consistent with Linux, inode.mu must be locked throughout.
|
|
f.inode.mu.Lock()
|
|
defer f.inode.mu.Unlock()
|
|
end := offset + length
|
|
pgEnd, ok := hostarch.PageRoundUp(end)
|
|
if !ok {
|
|
return linuxerr.EFBIG
|
|
}
|
|
// Allocate in chunks for the following reasons:
|
|
// 1. Size limit may permit really large fallocate, which can take a long
|
|
// time to execute on the host. This can cause watchdog to timeout and
|
|
// crash the system. Watchdog needs petting.
|
|
// 2. Linux allocates folios iteratively while checking for interrupts. In
|
|
// gVisor, we need to manually check for interrupts between chunks.
|
|
const chunkSize = 4 << 30 // 4 GiB
|
|
for curPgStart := hostarch.PageRoundDown(offset); curPgStart < pgEnd; {
|
|
curPgEnd := pgEnd
|
|
newSize := end
|
|
if curPgEnd-curPgStart > chunkSize {
|
|
curPgEnd = curPgStart + chunkSize
|
|
newSize = curPgEnd
|
|
}
|
|
required := memmap.MappableRange{Start: curPgStart, End: curPgEnd}
|
|
if err := f.allocateLocked(ctx, mode, newSize, required, memCgID); err != nil {
|
|
return err
|
|
}
|
|
// This loop can take a long time to process, so periodically check for
|
|
// interrupts. This also pets the watchdog.
|
|
if ctx.Interrupted() {
|
|
return linuxerr.EINTR
|
|
}
|
|
// Advance curPgStart.
|
|
curPgStart = curPgEnd
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Preconditions:
|
|
// - rf.inode.mu is locked.
|
|
// - required must be page-aligned.
|
|
// - required.Start < newSize <= required.End.
|
|
func (rf *regularFile) allocateLocked(ctx context.Context, mode, newSize uint64, required memmap.MappableRange, memCgID uint32) error {
|
|
rf.dataMu.Lock()
|
|
defer rf.dataMu.Unlock()
|
|
|
|
// We must allocate pages in the range specified by offset and length.
|
|
// Even if newSize <= oldSize, there might not be actual memory backing this
|
|
// range, so any gaps must be filled by calling f.data.Fill().
|
|
// "After a successful call, subsequent writes into the range
|
|
// specified by offset and len are guaranteed not to fail because of
|
|
// lack of disk space." - fallocate(2)
|
|
pagesToFill := rf.data.PagesToFill(required, required)
|
|
if !rf.inode.fs.accountPages(pagesToFill) {
|
|
return linuxerr.ENOSPC
|
|
}
|
|
// Given our definitions in pgalloc, fallocate(2) semantics imply that pages
|
|
// in the MemoryFile must be committed, in addition to being allocated.
|
|
allocMode := pgalloc.AllocateAndCommit
|
|
if !rf.inode.fs.mf.IsDiskBacked() {
|
|
// Upgrade to AllocateAndWritePopulate for memory(shmem)-backed files. We
|
|
// take a more aggressive approach in populating pages for memory-backed
|
|
// MemoryFiles. shmem pages are subject to swap rather than disk writeback.
|
|
// They are not likely to be swapped before they are written to. Hence it
|
|
// is beneficial to populate (in addition to commit) shmem pages to avoid
|
|
// faulting page-by-page when these pages are written to in the future.
|
|
allocMode = pgalloc.AllocateAndWritePopulate
|
|
}
|
|
pagesAlloced, err := rf.data.Fill(ctx, required, required, newSize, rf.inode.fs.mf, pgalloc.AllocOpts{
|
|
Kind: rf.memoryUsageKind,
|
|
MemCgID: memCgID,
|
|
Mode: allocMode,
|
|
}, nil /* r */)
|
|
// f.data.Fill() may fail mid-way. We still want to account any pages that
|
|
// were allocated, irrespective of an error.
|
|
rf.inode.fs.adjustPageAcct(pagesToFill, pagesAlloced)
|
|
if err != nil && err != io.EOF {
|
|
return err
|
|
}
|
|
|
|
oldSize := rf.size.Load()
|
|
if oldSize >= newSize {
|
|
return nil
|
|
}
|
|
return rf.growLocked(newSize)
|
|
}
|
|
|
|
// PRead implements vfs.FileDescriptionImpl.PRead.
|
|
func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
|
|
start := fsmetric.StartReadWait()
|
|
defer fsmetric.FinishReadWait(fsmetric.TmpfsReadWait, start)
|
|
fsmetric.TmpfsReads.Increment()
|
|
|
|
if offset < 0 {
|
|
return 0, linuxerr.EINVAL
|
|
}
|
|
|
|
// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
|
|
// all state is in-memory.
|
|
//
|
|
// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
|
|
if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
|
|
return 0, linuxerr.EOPNOTSUPP
|
|
}
|
|
|
|
if dst.NumBytes() == 0 {
|
|
return 0, nil
|
|
}
|
|
f := fd.inode().impl.(*regularFile)
|
|
// memCgID can be 0 here because regularFileReadWriter.ReadToBlocks() never
|
|
// allocates from pgalloc.
|
|
rw := getRegularFileReadWriter(f, offset, 0)
|
|
n, err := dst.CopyOutFrom(ctx, rw)
|
|
putRegularFileReadWriter(rw)
|
|
fd.inode().touchAtime(fd.vfsfd.Mount())
|
|
return n, err
|
|
}
|
|
|
|
// Read implements vfs.FileDescriptionImpl.Read.
|
|
func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
|
|
fd.offMu.Lock()
|
|
n, err := fd.PRead(ctx, dst, fd.off, opts)
|
|
fd.off += n
|
|
fd.offMu.Unlock()
|
|
return n, err
|
|
}
|
|
|
|
// PWrite implements vfs.FileDescriptionImpl.PWrite.
|
|
func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
|
|
n, _, err := fd.pwrite(ctx, src, offset, opts)
|
|
return n, err
|
|
}
|
|
|
|
// pwrite returns the number of bytes written, final offset and error. The
|
|
// final offset should be ignored by PWrite.
|
|
func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
|
|
if offset < 0 {
|
|
return 0, offset, linuxerr.EINVAL
|
|
}
|
|
|
|
// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
|
|
// all state is in-memory.
|
|
//
|
|
// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
|
|
if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
|
|
return 0, offset, linuxerr.EOPNOTSUPP
|
|
}
|
|
|
|
srclen := src.NumBytes()
|
|
if srclen == 0 {
|
|
return 0, offset, nil
|
|
}
|
|
f := fd.inode().impl.(*regularFile)
|
|
f.inode.mu.Lock()
|
|
defer f.inode.mu.Unlock()
|
|
// If the file is opened with O_APPEND, update offset to file size.
|
|
if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
|
|
// Locking f.inode.mu is sufficient for reading f.size.
|
|
offset = int64(f.size.RacyLoad())
|
|
}
|
|
end := offset + srclen
|
|
if end < offset {
|
|
// Overflow.
|
|
return 0, offset, linuxerr.EINVAL
|
|
}
|
|
|
|
srclen, err = vfs.CheckLimit(ctx, offset, srclen)
|
|
if err != nil {
|
|
return 0, offset, err
|
|
}
|
|
src = src.TakeFirst64(srclen)
|
|
|
|
// Perform the write.
|
|
rw := getRegularFileReadWriter(f, offset, pgalloc.MemoryCgroupIDFromContext(ctx))
|
|
n, err := src.CopyInTo(ctx, rw)
|
|
|
|
f.inode.touchCMtimeLocked()
|
|
for {
|
|
old := f.inode.mode.Load()
|
|
new := vfs.ClearSUIDAndSGID(old)
|
|
if swapped := f.inode.mode.CompareAndSwap(old, new); swapped {
|
|
break
|
|
}
|
|
}
|
|
putRegularFileReadWriter(rw)
|
|
return n, n + offset, err
|
|
}
|
|
|
|
// Write implements vfs.FileDescriptionImpl.Write.
|
|
func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
|
|
fd.offMu.Lock()
|
|
n, off, err := fd.pwrite(ctx, src, fd.off, opts)
|
|
fd.off = off
|
|
fd.offMu.Unlock()
|
|
return n, err
|
|
}
|
|
|
|
// Seek implements vfs.FileDescriptionImpl.Seek.
|
|
func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
|
|
fd.offMu.Lock()
|
|
defer fd.offMu.Unlock()
|
|
switch whence {
|
|
case linux.SEEK_SET:
|
|
// use offset as specified
|
|
case linux.SEEK_CUR:
|
|
offset += fd.off
|
|
case linux.SEEK_END:
|
|
offset += int64(fd.inode().impl.(*regularFile).size.Load())
|
|
default:
|
|
return 0, linuxerr.EINVAL
|
|
}
|
|
if offset < 0 {
|
|
return 0, linuxerr.EINVAL
|
|
}
|
|
fd.off = offset
|
|
return offset, nil
|
|
}
|
|
|
|
// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
|
|
func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
|
|
file := fd.inode().impl.(*regularFile)
|
|
opts.SentryOwnedContent = true
|
|
if file.initiallyUnlinked {
|
|
opts.NameMut = memmap.NameMutAnonShmem
|
|
}
|
|
return vfs.GenericConfigureMMap(&fd.vfsfd, file, opts)
|
|
}
|
|
|
|
// offsetPageEnd returns the file offset rounded up to the nearest
|
|
// page boundary. offsetPageEnd panics if rounding up causes overflow,
|
|
// which shouldn't be possible given that offset is an int64.
|
|
func offsetPageEnd(offset int64) uint64 {
|
|
end, ok := hostarch.Addr(offset).RoundUp()
|
|
if !ok {
|
|
panic("impossible overflow")
|
|
}
|
|
return uint64(end)
|
|
}
|
|
|
|
// regularFileReadWriter implements safemem.Reader and Safemem.Writer.
|
|
type regularFileReadWriter struct {
|
|
file *regularFile
|
|
|
|
// Offset into the file to read/write at. Note that this may be
|
|
// different from the FD offset if PRead/PWrite is used.
|
|
off uint64
|
|
|
|
// memCgID is the memory cgroup ID used for accounting the allocated
|
|
// pages.
|
|
memCgID uint32
|
|
}
|
|
|
|
var regularFileReadWriterPool = sync.Pool{
|
|
New: func() any {
|
|
return ®ularFileReadWriter{}
|
|
},
|
|
}
|
|
|
|
func getRegularFileReadWriter(file *regularFile, offset int64, memCgID uint32) *regularFileReadWriter {
|
|
rw := regularFileReadWriterPool.Get().(*regularFileReadWriter)
|
|
rw.file = file
|
|
rw.off = uint64(offset)
|
|
rw.memCgID = memCgID
|
|
return rw
|
|
}
|
|
|
|
func putRegularFileReadWriter(rw *regularFileReadWriter) {
|
|
rw.file = nil
|
|
regularFileReadWriterPool.Put(rw)
|
|
}
|
|
|
|
// ReadToBlocks implements safemem.Reader.ReadToBlocks.
|
|
func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
|
|
rw.file.dataMu.RLock()
|
|
defer rw.file.dataMu.RUnlock()
|
|
size := rw.file.size.RacyLoad()
|
|
|
|
// Compute the range to read (limited by file size and overflow-checked).
|
|
if rw.off >= size {
|
|
return 0, io.EOF
|
|
}
|
|
end := size
|
|
if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
|
|
end = rend
|
|
}
|
|
|
|
var done uint64
|
|
seg, gap := rw.file.data.Find(uint64(rw.off))
|
|
for rw.off < end {
|
|
mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
|
|
switch {
|
|
case seg.Ok():
|
|
// Get internal mappings.
|
|
ims, err := rw.file.inode.fs.mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read)
|
|
if err != nil {
|
|
return done, err
|
|
}
|
|
|
|
// Copy from internal mappings.
|
|
n, err := safemem.CopySeq(dsts, ims)
|
|
done += n
|
|
rw.off += uint64(n)
|
|
dsts = dsts.DropFirst64(n)
|
|
if err != nil {
|
|
return done, err
|
|
}
|
|
|
|
// Continue.
|
|
seg, gap = seg.NextNonEmpty()
|
|
|
|
case gap.Ok():
|
|
// Tmpfs holes are zero-filled.
|
|
gapmr := gap.Range().Intersect(mr)
|
|
dst := dsts.TakeFirst64(gapmr.Length())
|
|
n, err := safemem.ZeroSeq(dst)
|
|
done += n
|
|
rw.off += uint64(n)
|
|
dsts = dsts.DropFirst64(n)
|
|
if err != nil {
|
|
return done, err
|
|
}
|
|
|
|
// Continue.
|
|
seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
|
|
}
|
|
}
|
|
return done, nil
|
|
}
|
|
|
|
// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
|
|
//
|
|
// Preconditions: rw.file.inode.mu must be held.
|
|
func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
|
|
// Hold dataMu so we can modify size.
|
|
rw.file.dataMu.Lock()
|
|
defer rw.file.dataMu.Unlock()
|
|
|
|
// Compute the range to write (overflow-checked).
|
|
end := rw.off + srcs.NumBytes()
|
|
if end <= rw.off {
|
|
end = math.MaxInt64
|
|
}
|
|
|
|
// Check if seals prevent either file growth or all writes.
|
|
switch {
|
|
case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed
|
|
return 0, linuxerr.EPERM
|
|
case end > rw.file.size.RacyLoad() && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed
|
|
// When growth is sealed, Linux effectively allows writes which would
|
|
// normally grow the file to partially succeed up to the current EOF,
|
|
// rounded down to the page boundary before the EOF.
|
|
//
|
|
// This happens because writes (and thus the growth check) for tmpfs
|
|
// files proceed page-by-page on Linux, and the final write to the page
|
|
// containing EOF fails, resulting in a partial write up to the start of
|
|
// that page.
|
|
//
|
|
// To emulate this behaviour, artificially truncate the write to the
|
|
// start of the page containing the current EOF.
|
|
//
|
|
// See Linux, mm/filemap.c:generic_perform_write() and
|
|
// mm/shmem.c:shmem_write_begin().
|
|
if pgstart := uint64(hostarch.Addr(rw.file.size.RacyLoad()).RoundDown()); end > pgstart {
|
|
end = pgstart
|
|
}
|
|
if end <= rw.off {
|
|
// Truncation would result in no data being written.
|
|
return 0, linuxerr.EPERM
|
|
}
|
|
}
|
|
|
|
// Page-aligned mr for when we need to allocate memory. RoundUp can't
|
|
// overflow since end is an int64.
|
|
pgstartaddr := hostarch.Addr(rw.off).RoundDown()
|
|
pgendaddr, _ := hostarch.Addr(end).RoundUp()
|
|
pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)}
|
|
|
|
var (
|
|
done uint64
|
|
retErr error
|
|
)
|
|
seg, gap := rw.file.data.Find(uint64(rw.off))
|
|
for rw.off < end {
|
|
mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
|
|
switch {
|
|
case seg.Ok():
|
|
n, err := rw.writeToMF(seg.FileRangeOf(seg.Range().Intersect(mr)), srcs)
|
|
done += n
|
|
rw.off += uint64(n)
|
|
srcs = srcs.DropFirst64(n)
|
|
if err != nil {
|
|
retErr = err
|
|
goto exitLoop
|
|
}
|
|
|
|
// Continue.
|
|
seg, gap = seg.NextNonEmpty()
|
|
|
|
case gap.Ok():
|
|
// Allocate memory for the write.
|
|
gapMR := gap.Range().Intersect(pgMR)
|
|
pagesToFill := gapMR.Length() / hostarch.PageSize
|
|
pagesReserved := rw.file.inode.fs.accountPagesPartial(pagesToFill)
|
|
if pagesReserved == 0 {
|
|
if done == 0 {
|
|
retErr = linuxerr.ENOSPC
|
|
goto exitLoop
|
|
}
|
|
retErr = nil
|
|
goto exitLoop
|
|
}
|
|
gapMR.End = gapMR.Start + (hostarch.PageSize * pagesReserved)
|
|
allocMode := pgalloc.AllocateAndWritePopulate
|
|
if rw.file.inode.fs.mf.IsDiskBacked() {
|
|
// Don't populate pages for disk-backed files. Benchmarking showed that
|
|
// disk-backed pages are likely to be written back to disk before we
|
|
// can write to them. The pages fault again on write anyways. In total,
|
|
// prepopulating disk-backed pages deteriorates performance as it fails
|
|
// to eliminate future page faults and we also additionally incur
|
|
// useless disk writebacks.
|
|
allocMode = pgalloc.AllocateCallerIndirectCommit
|
|
}
|
|
fr, err := rw.file.inode.fs.mf.Allocate(gapMR.Length(), pgalloc.AllocOpts{
|
|
Kind: rw.file.memoryUsageKind,
|
|
MemCgID: rw.memCgID,
|
|
Mode: allocMode,
|
|
})
|
|
if err != nil {
|
|
retErr = err
|
|
rw.file.inode.fs.unaccountPages(pagesReserved)
|
|
goto exitLoop
|
|
}
|
|
|
|
// Write to that memory as usual.
|
|
seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{}
|
|
default:
|
|
panic("unreachable")
|
|
}
|
|
}
|
|
exitLoop:
|
|
// If the write ends beyond the file's previous size, it causes the
|
|
// file to grow.
|
|
if rw.off > rw.file.size.RacyLoad() {
|
|
rw.file.size.Store(rw.off)
|
|
}
|
|
|
|
return done, retErr
|
|
}
|
|
|
|
func (rw *regularFileReadWriter) writeToMF(fr memmap.FileRange, srcs safemem.BlockSeq) (uint64, error) {
|
|
if rw.file.inode.fs.mf.IsDiskBacked() {
|
|
// Disk-backed files are not prepopulated. The safemem.CopySeq() approach
|
|
// used below incurs a lot of page faults without page prepopulation, which
|
|
// causes a lot of context switching. Use write(2) host syscall instead,
|
|
// which makes one context switch and faults all the pages that are touched
|
|
// during the write.
|
|
fd, err := rw.file.inode.fs.mf.DataFD(fr)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
return hostfd.Pwritev2(
|
|
int32(fd), // fd
|
|
srcs.TakeFirst64(fr.Length()), // srcs
|
|
int64(fr.Start), // offset
|
|
0, // flags
|
|
)
|
|
}
|
|
// Get internal mappings.
|
|
ims, err := rw.file.inode.fs.mf.MapInternal(fr, hostarch.Write)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
// Copy to internal mappings.
|
|
return safemem.CopySeq(ims, srcs)
|
|
}
|
|
|
|
// GetSeals returns the current set of seals on a memfd inode.
|
|
func GetSeals(fd *vfs.FileDescription) (uint32, error) {
|
|
f, ok := fd.Impl().(*regularFileFD)
|
|
if !ok {
|
|
return 0, linuxerr.EINVAL
|
|
}
|
|
rf := f.inode().impl.(*regularFile)
|
|
rf.dataMu.RLock()
|
|
defer rf.dataMu.RUnlock()
|
|
return rf.seals, nil
|
|
}
|
|
|
|
// AddSeals adds new file seals to a memfd inode.
|
|
func AddSeals(fd *vfs.FileDescription, val uint32) error {
|
|
f, ok := fd.Impl().(*regularFileFD)
|
|
if !ok {
|
|
return linuxerr.EINVAL
|
|
}
|
|
rf := f.inode().impl.(*regularFile)
|
|
rf.mapsMu.Lock()
|
|
defer rf.mapsMu.Unlock()
|
|
rf.dataMu.Lock()
|
|
defer rf.dataMu.Unlock()
|
|
|
|
if rf.seals&linux.F_SEAL_SEAL != 0 {
|
|
// Seal applied which prevents addition of any new seals.
|
|
return linuxerr.EPERM
|
|
}
|
|
|
|
// F_SEAL_WRITE can only be added if there are no active writable maps.
|
|
if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 {
|
|
if rf.writableMappingPages > 0 {
|
|
return linuxerr.EBUSY
|
|
}
|
|
}
|
|
|
|
// Seals can only be added, never removed.
|
|
rf.seals |= val
|
|
return nil
|
|
}
|