Files
Jamie Liu 8b7b69c978 tmpfs: limit regularFile.Translate() fill range
When e.g. an application thread takes a page fault on an mmapped file, MM calls
`memmap.Mappable.Translate()` to obtain the corresponding host FD range that
should be mapped into the application's address space. It passes both the range
that *must* be mapped (e.g. the faulting page) as `required`, and the maximum
range that *may* be mapped (the previously-unfaulted part of the corresponding
VMA) as `optional`, such that file implementations can map more than `required`
to avoid future page faults.

Prior to this CL, `tmpfs.regularFile.Translate()` always returned translations
up to `optional`, under the assumption that allocating larger ranges from
`pgalloc.MemoryFile` has negligible incremental cost. This behavior dates to
the introduction of `memmap.Mappable.Translate()` (cl/182882705) and thus
predates the implementation of tmpfs size limits (cl/442686814). Now that the
latter exists, unconditionally translating - and therefore allocating pages -
up to `optional` can result in hitting tmpfs size limits prematurely.

Thus: Constrain optional translations returned by
`tmpfs.regularFile.Translate()`, applying the same logic as
`gofer.maxFillRange()`.
PiperOrigin-RevId: 713134287
2025-01-07 20:25:15 -08:00

908 lines
28 KiB
Go

// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tmpfs
import (
"fmt"
"io"
"math"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/atomicbitops"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/fsmetric"
"gvisor.dev/gvisor/pkg/sentry/fsutil"
"gvisor.dev/gvisor/pkg/sentry/hostfd"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
)
// regularFile is a regular (=S_IFREG) tmpfs file.
//
// +stateify savable
type regularFile struct {
inode inode
// memoryUsageKind is the memory accounting category under which pages backing
// this regularFile's contents are accounted.
memoryUsageKind usage.MemoryKind
// mapsMu protects mappings.
mapsMu sync.Mutex `state:"nosave"`
// mappings tracks mappings of the file into memmap.MappingSpaces.
//
// Protected by mapsMu.
mappings memmap.MappingSet
// writableMappingPages tracks how many pages of virtual memory are mapped
// as potentially writable from this file. If a page has multiple mappings,
// each mapping is counted separately.
//
// This counter is susceptible to overflow as we can potentially count
// mappings from many VMAs. We count pages rather than bytes to slightly
// mitigate this.
//
// Protected by mapsMu.
writableMappingPages uint64
// dataMu protects the fields below.
dataMu sync.RWMutex `state:"nosave"`
// data maps offsets into the file to offsets into memFile that store
// the file's data.
//
// Protected by dataMu.
data fsutil.FileRangeSet
// seals represents file seals on this inode.
//
// Protected by dataMu.
seals uint32
// initiallyUnlinked is true if this file was created using NewZeroFile or
// NewMemfd => newUnlinkedRegularFileDescription. initiallyUnlinked should
// be true when the equivalent shmem file in Linux would use
// shmem_anon_vm_ops rather than shmem_vm_ops.
//
// initiallyUnlinked is immutable, but stored here since it fits into
// alignment padding.
initiallyUnlinked bool
// size is the size of data.
//
// Protected by both dataMu and inode.mu; reading it requires holding
// either mutex, while writing requires holding both AND using atomics.
// Readers that do not require consistency (like Stat) may read the
// value atomically without holding either lock.
size atomicbitops.Uint64
}
func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) *inode {
file := &regularFile{
memoryUsageKind: fs.usage,
seals: linux.F_SEAL_SEAL,
}
file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode, parentDir)
file.inode.nlink = atomicbitops.FromUint32(1) // from parent directory
return &file.inode
}
// newUnlinkedRegularFileDescription creates a regular file on the tmpfs
// filesystem represented by mount and returns an FD representing that file.
// The new file is not reachable by path traversal from any other file.
//
// newUnlinkedRegularFileDescription is analogous to Linux's
// mm/shmem.c:__shmem_file_setup().
//
// Preconditions: mount must be a tmpfs mount.
func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) {
fs, ok := mount.Filesystem().Impl().(*filesystem)
if !ok {
panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount")
}
inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777, nil /* parentDir */)
inode.impl.(*regularFile).initiallyUnlinked = true
d := fs.newDentry(inode)
defer d.DecRef(ctx)
d.name = name
fd := &regularFileFD{}
fd.Init(&inode.locks)
flags := uint32(linux.O_RDWR)
if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
return fd, nil
}
// NewZeroFile creates a new regular file and file description as for
// mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is
// initially (implicitly) filled with zeroes.
//
// Preconditions: mount must be a tmpfs mount.
func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) {
// Compare mm/shmem.c:shmem_zero_setup().
fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero")
if err != nil {
return nil, err
}
rf := fd.inode().impl.(*regularFile)
rf.memoryUsageKind = usage.Anonymous
rf.size.Store(size)
return &fd.vfsfd, err
}
// NewMemfd creates a new regular file and file description as for
// memfd_create.
//
// Preconditions: mount must be a tmpfs mount.
func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name)
if err != nil {
return nil, err
}
if allowSeals {
fd.inode().impl.(*regularFile).seals = 0
}
return &fd.vfsfd, nil
}
// truncate grows or shrinks the file to the given size. It returns true if the
// file size was updated.
func (rf *regularFile) truncate(newSize uint64) (bool, error) {
rf.inode.mu.Lock()
defer rf.inode.mu.Unlock()
return rf.truncateLocked(newSize)
}
// Preconditions:
// - rf.inode.mu must be held.
// - rf.dataMu must be locked for writing.
// - newSize > rf.size.
func (rf *regularFile) growLocked(newSize uint64) error {
// Can we grow the file?
if rf.seals&linux.F_SEAL_GROW != 0 {
return linuxerr.EPERM
}
rf.size.Store(newSize)
return nil
}
// Preconditions: rf.inode.mu must be held.
func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) {
oldSize := rf.size.RacyLoad()
if newSize == oldSize {
// Nothing to do.
return false, nil
}
// Need to hold inode.mu and dataMu while modifying size.
rf.dataMu.Lock()
if newSize > oldSize {
err := rf.growLocked(newSize)
rf.dataMu.Unlock()
return err == nil, err
}
// We are shrinking the file. First check if this is allowed.
if rf.seals&linux.F_SEAL_SHRINK != 0 {
rf.dataMu.Unlock()
return false, linuxerr.EPERM
}
rf.size.Store(newSize)
rf.dataMu.Unlock()
// Invalidate past translations of truncated pages.
oldpgend := offsetPageEnd(int64(oldSize))
newpgend := offsetPageEnd(int64(newSize))
if newpgend < oldpgend {
rf.mapsMu.Lock()
rf.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
// Compare Linux's mm/shmem.c:shmem_setattr() =>
// mm/memory.c:unmap_mapping_range(evencows=1).
InvalidatePrivate: true,
})
rf.mapsMu.Unlock()
}
// We are now guaranteed that there are no translations of truncated pages,
// and can remove them.
rf.dataMu.Lock()
decPages := rf.data.Truncate(newSize, rf.inode.fs.mf)
rf.dataMu.Unlock()
rf.inode.fs.unaccountPages(decPages)
return true, nil
}
// AddMapping implements memmap.Mappable.AddMapping.
func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
rf.mapsMu.Lock()
defer rf.mapsMu.Unlock()
rf.dataMu.RLock()
defer rf.dataMu.RUnlock()
// Reject writable mapping if F_SEAL_WRITE is set.
if rf.seals&linux.F_SEAL_WRITE != 0 && writable {
return linuxerr.EPERM
}
rf.mappings.AddMapping(ms, ar, offset, writable)
if writable {
pagesBefore := rf.writableMappingPages
// ar is guaranteed to be page aligned per memmap.Mappable.
rf.writableMappingPages += uint64(ar.Length() / hostarch.PageSize)
if rf.writableMappingPages < pagesBefore {
panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
}
}
return nil
}
// RemoveMapping implements memmap.Mappable.RemoveMapping.
func (rf *regularFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
rf.mapsMu.Lock()
defer rf.mapsMu.Unlock()
rf.mappings.RemoveMapping(ms, ar, offset, writable)
if writable {
pagesBefore := rf.writableMappingPages
// ar is guaranteed to be page aligned per memmap.Mappable.
rf.writableMappingPages -= uint64(ar.Length() / hostarch.PageSize)
if rf.writableMappingPages > pagesBefore {
panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
}
}
}
// CopyMapping implements memmap.Mappable.CopyMapping.
func (rf *regularFile) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
return rf.AddMapping(ctx, ms, dstAR, offset, writable)
}
// Translate implements memmap.Mappable.Translate.
func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
memCgID := pgalloc.MemoryCgroupIDFromContext(ctx)
rf.dataMu.Lock()
defer rf.dataMu.Unlock()
// Constrain translations to f.attr.Size (rounded up) to prevent
// translation to pages that may be concurrently truncated.
pgend := offsetPageEnd(int64(rf.size.RacyLoad()))
var beyondEOF bool
if required.End > pgend {
if required.Start >= pgend {
return nil, &memmap.BusError{io.EOF}
}
beyondEOF = true
required.End = pgend
}
if optional.End > pgend {
optional.End = pgend
}
// Constrain allocation to at most maxOptionalBytes or required.Length(),
// whichever is greater.
const maxOptionalBytes = 64 << 10 // 64 KB, arbitrarily matches Linux's default fault_around_pages
if required.Length() >= maxOptionalBytes {
optional = required
} else {
if optional.Length() > maxOptionalBytes {
optional.Start = required.Start
if optional.Length() > maxOptionalBytes {
optional.End = optional.Start + maxOptionalBytes
}
}
}
pagesToFill := rf.data.PagesToFill(required, optional)
if !rf.inode.fs.accountPages(pagesToFill) {
// If we can not accommodate pagesToFill pages, then retry with just
// the required range. Because optional may be larger than required.
// Only error out if even the required range can not be allocated for.
pagesToFill = rf.data.PagesToFill(required, required)
if !rf.inode.fs.accountPages(pagesToFill) {
return nil, &memmap.BusError{linuxerr.ENOSPC}
}
optional = required
}
pagesAlloced, cerr := rf.data.Fill(ctx, required, optional, rf.size.RacyLoad(), rf.inode.fs.mf, pgalloc.AllocOpts{
Kind: rf.memoryUsageKind,
MemCgID: memCgID,
}, nil)
// rf.data.Fill() may fail mid-way. We still want to account any pages that
// were allocated, irrespective of an error.
rf.inode.fs.adjustPageAcct(pagesToFill, pagesAlloced)
var ts []memmap.Translation
var translatedEnd uint64
for seg := rf.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
segMR := seg.Range().Intersect(optional)
ts = append(ts, memmap.Translation{
Source: segMR,
File: rf.inode.fs.mf,
Offset: seg.FileRangeOf(segMR).Start,
Perms: hostarch.AnyAccess,
})
translatedEnd = segMR.End
}
// Don't return the error returned by f.data.Fill if it occurred outside of
// required.
if translatedEnd < required.End && cerr != nil {
return ts, &memmap.BusError{cerr}
}
if beyondEOF {
return ts, &memmap.BusError{io.EOF}
}
return ts, nil
}
// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
func (*regularFile) InvalidateUnsavable(context.Context) error {
return nil
}
// +stateify savable
type regularFileFD struct {
fileDescription
// off is the file offset. off is accessed using atomic memory operations.
// offMu serializes operations that may mutate off.
off int64
offMu sync.Mutex `state:"nosave"`
}
// Release implements vfs.FileDescriptionImpl.Release.
func (fd *regularFileFD) Release(context.Context) {
// noop
}
// Allocate implements vfs.FileDescriptionImpl.Allocate.
func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
f := fd.inode().impl.(*regularFile)
memCgID := pgalloc.MemoryCgroupIDFromContext(ctx)
// To be consistent with Linux, inode.mu must be locked throughout.
f.inode.mu.Lock()
defer f.inode.mu.Unlock()
end := offset + length
pgEnd, ok := hostarch.PageRoundUp(end)
if !ok {
return linuxerr.EFBIG
}
// Allocate in chunks for the following reasons:
// 1. Size limit may permit really large fallocate, which can take a long
// time to execute on the host. This can cause watchdog to timeout and
// crash the system. Watchdog needs petting.
// 2. Linux allocates folios iteratively while checking for interrupts. In
// gVisor, we need to manually check for interrupts between chunks.
const chunkSize = 4 << 30 // 4 GiB
for curPgStart := hostarch.PageRoundDown(offset); curPgStart < pgEnd; {
curPgEnd := pgEnd
newSize := end
if curPgEnd-curPgStart > chunkSize {
curPgEnd = curPgStart + chunkSize
newSize = curPgEnd
}
required := memmap.MappableRange{Start: curPgStart, End: curPgEnd}
if err := f.allocateLocked(ctx, mode, newSize, required, memCgID); err != nil {
return err
}
// This loop can take a long time to process, so periodically check for
// interrupts. This also pets the watchdog.
if ctx.Interrupted() {
return linuxerr.EINTR
}
// Advance curPgStart.
curPgStart = curPgEnd
}
return nil
}
// Preconditions:
// - rf.inode.mu is locked.
// - required must be page-aligned.
// - required.Start < newSize <= required.End.
func (rf *regularFile) allocateLocked(ctx context.Context, mode, newSize uint64, required memmap.MappableRange, memCgID uint32) error {
rf.dataMu.Lock()
defer rf.dataMu.Unlock()
// We must allocate pages in the range specified by offset and length.
// Even if newSize <= oldSize, there might not be actual memory backing this
// range, so any gaps must be filled by calling f.data.Fill().
// "After a successful call, subsequent writes into the range
// specified by offset and len are guaranteed not to fail because of
// lack of disk space." - fallocate(2)
pagesToFill := rf.data.PagesToFill(required, required)
if !rf.inode.fs.accountPages(pagesToFill) {
return linuxerr.ENOSPC
}
// Given our definitions in pgalloc, fallocate(2) semantics imply that pages
// in the MemoryFile must be committed, in addition to being allocated.
allocMode := pgalloc.AllocateAndCommit
if !rf.inode.fs.mf.IsDiskBacked() {
// Upgrade to AllocateAndWritePopulate for memory(shmem)-backed files. We
// take a more aggressive approach in populating pages for memory-backed
// MemoryFiles. shmem pages are subject to swap rather than disk writeback.
// They are not likely to be swapped before they are written to. Hence it
// is beneficial to populate (in addition to commit) shmem pages to avoid
// faulting page-by-page when these pages are written to in the future.
allocMode = pgalloc.AllocateAndWritePopulate
}
pagesAlloced, err := rf.data.Fill(ctx, required, required, newSize, rf.inode.fs.mf, pgalloc.AllocOpts{
Kind: rf.memoryUsageKind,
MemCgID: memCgID,
Mode: allocMode,
}, nil /* r */)
// f.data.Fill() may fail mid-way. We still want to account any pages that
// were allocated, irrespective of an error.
rf.inode.fs.adjustPageAcct(pagesToFill, pagesAlloced)
if err != nil && err != io.EOF {
return err
}
oldSize := rf.size.Load()
if oldSize >= newSize {
return nil
}
return rf.growLocked(newSize)
}
// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
start := fsmetric.StartReadWait()
defer fsmetric.FinishReadWait(fsmetric.TmpfsReadWait, start)
fsmetric.TmpfsReads.Increment()
if offset < 0 {
return 0, linuxerr.EINVAL
}
// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
// all state is in-memory.
//
// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
return 0, linuxerr.EOPNOTSUPP
}
if dst.NumBytes() == 0 {
return 0, nil
}
f := fd.inode().impl.(*regularFile)
// memCgID can be 0 here because regularFileReadWriter.ReadToBlocks() never
// allocates from pgalloc.
rw := getRegularFileReadWriter(f, offset, 0)
n, err := dst.CopyOutFrom(ctx, rw)
putRegularFileReadWriter(rw)
fd.inode().touchAtime(fd.vfsfd.Mount())
return n, err
}
// Read implements vfs.FileDescriptionImpl.Read.
func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
fd.offMu.Lock()
n, err := fd.PRead(ctx, dst, fd.off, opts)
fd.off += n
fd.offMu.Unlock()
return n, err
}
// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
n, _, err := fd.pwrite(ctx, src, offset, opts)
return n, err
}
// pwrite returns the number of bytes written, final offset and error. The
// final offset should be ignored by PWrite.
func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
if offset < 0 {
return 0, offset, linuxerr.EINVAL
}
// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
// all state is in-memory.
//
// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
return 0, offset, linuxerr.EOPNOTSUPP
}
srclen := src.NumBytes()
if srclen == 0 {
return 0, offset, nil
}
f := fd.inode().impl.(*regularFile)
f.inode.mu.Lock()
defer f.inode.mu.Unlock()
// If the file is opened with O_APPEND, update offset to file size.
if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
// Locking f.inode.mu is sufficient for reading f.size.
offset = int64(f.size.RacyLoad())
}
end := offset + srclen
if end < offset {
// Overflow.
return 0, offset, linuxerr.EINVAL
}
srclen, err = vfs.CheckLimit(ctx, offset, srclen)
if err != nil {
return 0, offset, err
}
src = src.TakeFirst64(srclen)
// Perform the write.
rw := getRegularFileReadWriter(f, offset, pgalloc.MemoryCgroupIDFromContext(ctx))
n, err := src.CopyInTo(ctx, rw)
f.inode.touchCMtimeLocked()
for {
old := f.inode.mode.Load()
new := vfs.ClearSUIDAndSGID(old)
if swapped := f.inode.mode.CompareAndSwap(old, new); swapped {
break
}
}
putRegularFileReadWriter(rw)
return n, n + offset, err
}
// Write implements vfs.FileDescriptionImpl.Write.
func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
fd.offMu.Lock()
n, off, err := fd.pwrite(ctx, src, fd.off, opts)
fd.off = off
fd.offMu.Unlock()
return n, err
}
// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
fd.offMu.Lock()
defer fd.offMu.Unlock()
switch whence {
case linux.SEEK_SET:
// use offset as specified
case linux.SEEK_CUR:
offset += fd.off
case linux.SEEK_END:
offset += int64(fd.inode().impl.(*regularFile).size.Load())
default:
return 0, linuxerr.EINVAL
}
if offset < 0 {
return 0, linuxerr.EINVAL
}
fd.off = offset
return offset, nil
}
// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
file := fd.inode().impl.(*regularFile)
opts.SentryOwnedContent = true
if file.initiallyUnlinked {
opts.NameMut = memmap.NameMutAnonShmem
}
return vfs.GenericConfigureMMap(&fd.vfsfd, file, opts)
}
// offsetPageEnd returns the file offset rounded up to the nearest
// page boundary. offsetPageEnd panics if rounding up causes overflow,
// which shouldn't be possible given that offset is an int64.
func offsetPageEnd(offset int64) uint64 {
end, ok := hostarch.Addr(offset).RoundUp()
if !ok {
panic("impossible overflow")
}
return uint64(end)
}
// regularFileReadWriter implements safemem.Reader and Safemem.Writer.
type regularFileReadWriter struct {
file *regularFile
// Offset into the file to read/write at. Note that this may be
// different from the FD offset if PRead/PWrite is used.
off uint64
// memCgID is the memory cgroup ID used for accounting the allocated
// pages.
memCgID uint32
}
var regularFileReadWriterPool = sync.Pool{
New: func() any {
return &regularFileReadWriter{}
},
}
func getRegularFileReadWriter(file *regularFile, offset int64, memCgID uint32) *regularFileReadWriter {
rw := regularFileReadWriterPool.Get().(*regularFileReadWriter)
rw.file = file
rw.off = uint64(offset)
rw.memCgID = memCgID
return rw
}
func putRegularFileReadWriter(rw *regularFileReadWriter) {
rw.file = nil
regularFileReadWriterPool.Put(rw)
}
// ReadToBlocks implements safemem.Reader.ReadToBlocks.
func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
rw.file.dataMu.RLock()
defer rw.file.dataMu.RUnlock()
size := rw.file.size.RacyLoad()
// Compute the range to read (limited by file size and overflow-checked).
if rw.off >= size {
return 0, io.EOF
}
end := size
if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
end = rend
}
var done uint64
seg, gap := rw.file.data.Find(uint64(rw.off))
for rw.off < end {
mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
switch {
case seg.Ok():
// Get internal mappings.
ims, err := rw.file.inode.fs.mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read)
if err != nil {
return done, err
}
// Copy from internal mappings.
n, err := safemem.CopySeq(dsts, ims)
done += n
rw.off += uint64(n)
dsts = dsts.DropFirst64(n)
if err != nil {
return done, err
}
// Continue.
seg, gap = seg.NextNonEmpty()
case gap.Ok():
// Tmpfs holes are zero-filled.
gapmr := gap.Range().Intersect(mr)
dst := dsts.TakeFirst64(gapmr.Length())
n, err := safemem.ZeroSeq(dst)
done += n
rw.off += uint64(n)
dsts = dsts.DropFirst64(n)
if err != nil {
return done, err
}
// Continue.
seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
}
}
return done, nil
}
// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
//
// Preconditions: rw.file.inode.mu must be held.
func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
// Hold dataMu so we can modify size.
rw.file.dataMu.Lock()
defer rw.file.dataMu.Unlock()
// Compute the range to write (overflow-checked).
end := rw.off + srcs.NumBytes()
if end <= rw.off {
end = math.MaxInt64
}
// Check if seals prevent either file growth or all writes.
switch {
case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed
return 0, linuxerr.EPERM
case end > rw.file.size.RacyLoad() && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed
// When growth is sealed, Linux effectively allows writes which would
// normally grow the file to partially succeed up to the current EOF,
// rounded down to the page boundary before the EOF.
//
// This happens because writes (and thus the growth check) for tmpfs
// files proceed page-by-page on Linux, and the final write to the page
// containing EOF fails, resulting in a partial write up to the start of
// that page.
//
// To emulate this behaviour, artificially truncate the write to the
// start of the page containing the current EOF.
//
// See Linux, mm/filemap.c:generic_perform_write() and
// mm/shmem.c:shmem_write_begin().
if pgstart := uint64(hostarch.Addr(rw.file.size.RacyLoad()).RoundDown()); end > pgstart {
end = pgstart
}
if end <= rw.off {
// Truncation would result in no data being written.
return 0, linuxerr.EPERM
}
}
// Page-aligned mr for when we need to allocate memory. RoundUp can't
// overflow since end is an int64.
pgstartaddr := hostarch.Addr(rw.off).RoundDown()
pgendaddr, _ := hostarch.Addr(end).RoundUp()
pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)}
var (
done uint64
retErr error
)
seg, gap := rw.file.data.Find(uint64(rw.off))
for rw.off < end {
mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
switch {
case seg.Ok():
n, err := rw.writeToMF(seg.FileRangeOf(seg.Range().Intersect(mr)), srcs)
done += n
rw.off += uint64(n)
srcs = srcs.DropFirst64(n)
if err != nil {
retErr = err
goto exitLoop
}
// Continue.
seg, gap = seg.NextNonEmpty()
case gap.Ok():
// Allocate memory for the write.
gapMR := gap.Range().Intersect(pgMR)
pagesToFill := gapMR.Length() / hostarch.PageSize
pagesReserved := rw.file.inode.fs.accountPagesPartial(pagesToFill)
if pagesReserved == 0 {
if done == 0 {
retErr = linuxerr.ENOSPC
goto exitLoop
}
retErr = nil
goto exitLoop
}
gapMR.End = gapMR.Start + (hostarch.PageSize * pagesReserved)
allocMode := pgalloc.AllocateAndWritePopulate
if rw.file.inode.fs.mf.IsDiskBacked() {
// Don't populate pages for disk-backed files. Benchmarking showed that
// disk-backed pages are likely to be written back to disk before we
// can write to them. The pages fault again on write anyways. In total,
// prepopulating disk-backed pages deteriorates performance as it fails
// to eliminate future page faults and we also additionally incur
// useless disk writebacks.
allocMode = pgalloc.AllocateCallerIndirectCommit
}
fr, err := rw.file.inode.fs.mf.Allocate(gapMR.Length(), pgalloc.AllocOpts{
Kind: rw.file.memoryUsageKind,
MemCgID: rw.memCgID,
Mode: allocMode,
})
if err != nil {
retErr = err
rw.file.inode.fs.unaccountPages(pagesReserved)
goto exitLoop
}
// Write to that memory as usual.
seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{}
default:
panic("unreachable")
}
}
exitLoop:
// If the write ends beyond the file's previous size, it causes the
// file to grow.
if rw.off > rw.file.size.RacyLoad() {
rw.file.size.Store(rw.off)
}
return done, retErr
}
func (rw *regularFileReadWriter) writeToMF(fr memmap.FileRange, srcs safemem.BlockSeq) (uint64, error) {
if rw.file.inode.fs.mf.IsDiskBacked() {
// Disk-backed files are not prepopulated. The safemem.CopySeq() approach
// used below incurs a lot of page faults without page prepopulation, which
// causes a lot of context switching. Use write(2) host syscall instead,
// which makes one context switch and faults all the pages that are touched
// during the write.
fd, err := rw.file.inode.fs.mf.DataFD(fr)
if err != nil {
return 0, err
}
return hostfd.Pwritev2(
int32(fd), // fd
srcs.TakeFirst64(fr.Length()), // srcs
int64(fr.Start), // offset
0, // flags
)
}
// Get internal mappings.
ims, err := rw.file.inode.fs.mf.MapInternal(fr, hostarch.Write)
if err != nil {
return 0, err
}
// Copy to internal mappings.
return safemem.CopySeq(ims, srcs)
}
// GetSeals returns the current set of seals on a memfd inode.
func GetSeals(fd *vfs.FileDescription) (uint32, error) {
f, ok := fd.Impl().(*regularFileFD)
if !ok {
return 0, linuxerr.EINVAL
}
rf := f.inode().impl.(*regularFile)
rf.dataMu.RLock()
defer rf.dataMu.RUnlock()
return rf.seals, nil
}
// AddSeals adds new file seals to a memfd inode.
func AddSeals(fd *vfs.FileDescription, val uint32) error {
f, ok := fd.Impl().(*regularFileFD)
if !ok {
return linuxerr.EINVAL
}
rf := f.inode().impl.(*regularFile)
rf.mapsMu.Lock()
defer rf.mapsMu.Unlock()
rf.dataMu.Lock()
defer rf.dataMu.Unlock()
if rf.seals&linux.F_SEAL_SEAL != 0 {
// Seal applied which prevents addition of any new seals.
return linuxerr.EPERM
}
// F_SEAL_WRITE can only be added if there are no active writable maps.
if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 {
if rf.writableMappingPages > 0 {
return linuxerr.EBUSY
}
}
// Seals can only be added, never removed.
rf.seals |= val
return nil
}