mirror of
https://github.com/netbirdio/gvisor.git
synced 2026-05-22 17:12:49 -07:00
b01944883b
This has no effect (outside of debug logging) until cl/723723715. Updates #11436 PiperOrigin-RevId: 736686635
2004 lines
65 KiB
Go
2004 lines
65 KiB
Go
// Copyright 2018 The gVisor Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// Package pgalloc contains the page allocator subsystem, which provides
|
|
// allocatable memory that may be mapped into application address spaces.
|
|
package pgalloc
|
|
|
|
import (
|
|
"fmt"
|
|
"math"
|
|
"os"
|
|
"strings"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"golang.org/x/sys/unix"
|
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
|
"gvisor.dev/gvisor/pkg/atomicbitops"
|
|
"gvisor.dev/gvisor/pkg/context"
|
|
"gvisor.dev/gvisor/pkg/errors/linuxerr"
|
|
"gvisor.dev/gvisor/pkg/hostarch"
|
|
"gvisor.dev/gvisor/pkg/log"
|
|
"gvisor.dev/gvisor/pkg/safemem"
|
|
"gvisor.dev/gvisor/pkg/sentry/hostmm"
|
|
"gvisor.dev/gvisor/pkg/sentry/memmap"
|
|
"gvisor.dev/gvisor/pkg/sentry/usage"
|
|
"gvisor.dev/gvisor/pkg/sync"
|
|
)
|
|
|
|
// pagesPerHugePage is the number of small pages (hostarch.PageSize) spanned by
// a single huge page (hostarch.HugePageSize).
const pagesPerHugePage = hostarch.HugePageSize / hostarch.PageSize
|
|
|
|
// MemoryFile is a memmap.File whose pages may be allocated to arbitrary
// users.
type MemoryFile struct {
	memmap.DefaultMemoryType
	memmap.NoBufferedIOFallback

	// MemoryFile owns a single backing file. Each page in the backing file is
	// considered "committed" or "uncommitted". A page is committed if the host
	// kernel is spending resources to store its contents and uncommitted
	// otherwise. This definition includes pages that the host kernel has
	// swapped. This is intentional; it means that committed pages can only
	// become uncommitted as a result of MemoryFile's actions, such that page
	// commitment does not change even if host kernel swapping behavior changes.
	//
	// Each page in the MemoryFile is in one of the following logical states,
	// protected by mu:
	//
	// - Void: Pages beyond the backing file's current size cannot store data.
	//   Void pages are uncommitted. Extending the file's size transitions pages
	//   between the old and new sizes from void to free.
	//
	// - Free: Free pages are immediately allocatable. Free pages are
	//   uncommitted, and implicitly zeroed. Free pages become used when they are
	//   allocated.
	//
	// - Used: Used pages have been allocated and currently have a non-zero
	//   reference count. Used pages may transition from uncommitted to committed
	//   outside of MemoryFile's control, but can only transition from committed
	//   to uncommitted via MemoryFile.Decommit(). The content of used pages is
	//   unknown. Used pages become waste when their reference count becomes
	//   zero.
	//
	// - Waste: Waste pages have no users, but cannot be immediately
	//   reallocated since their commitment state and content is unknown. Waste
	//   pages may be uncommitted or committed, but cannot transition between the
	//   two. MemoryFile's releaser goroutine transitions pages from waste to
	//   releasing. Allocations that may return committed pages can transition
	//   pages from waste to used (referred to as "recycling").
	//
	// - Releasing: Releasing pages are waste pages that the releaser goroutine
	//   has removed from waste-tracking, making them ineligible for recycling.
	//   The releaser decommits releasing pages without holding mu, then
	//   transitions them back to free or sub-released with mu locked.
	//
	// - Sub-release: Sub-released pages are released small pages within a
	//   huge-page-backed allocation where the containing huge page as a whole
	//   has not yet been released, which can arise because references are still
	//   counted at page granularity within huge-page-backed ranges. Sub-released
	//   pages cannot be used for allocations until release of the whole
	//   containing huge page causes them to transition to free. We assume that
	//   sub-released pages are uncommitted; this isn't necessarily true (see
	//   discussion of khugepaged elsewhere in this file), but the assumption is
	//   consistent with legacy behavior.

	mu memoryFileMutex

	// unwasteSmall and unwasteHuge track waste ranges backed by small/huge pages
	// respectively. Both sets are "inverted"; segments exist for all ranges that
	// are *not* waste, allowing use of segment.Set gap-tracking to efficiently
	// find ranges for both release and recycling allocations.
	//
	// unwasteSmall and unwasteHuge are protected by mu.
	unwasteSmall unwasteSet
	unwasteHuge  unwasteSet

	// haveWaste is true if there may be at least one waste page in the
	// MemoryFile.
	//
	// haveWaste is protected by mu.
	haveWaste bool

	// releaseCond is signaled (with mu locked) when haveWaste or destroyed
	// transitions from false to true.
	releaseCond sync.Cond

	// unfreeSmall and unfreeHuge track information for non-free ranges backed
	// by small/huge pages respectively. Each unfreeSet also contains segments
	// representing chunks that are backed by a different page size. Gaps in
	// the sets therefore represent free ranges backed by small/huge pages,
	// allowing use of segment.Set gap-tracking to efficiently find free ranges
	// for allocation.
	//
	// unfreeSmall and unfreeHuge are protected by mu.
	unfreeSmall unfreeSet
	unfreeHuge  unfreeSet

	// subreleased maps hugepage-aligned file offsets to the number of
	// sub-released small pages within the hugepage beginning at that offset.
	// subreleased is protected by mu.
	subreleased map[uint64]uint64

	// These fields are used for memory accounting.
	//
	// Memory accounting is based on identifying the set of committed pages.
	// Since we do not have direct access to application page tables (on most
	// platforms), tracking application accesses to uncommitted pages to detect
	// commitment would introduce additional page faults, which would be
	// prohibitively expensive. Instead, we query the host kernel to determine
	// which pages are committed.
	//
	// memAcct tracks memory accounting state, including commitment status, for
	// each page. Non-empty gaps in memAcct represent pages known to be
	// uncommitted (void, free, and sub-released pages).
	//
	// knownCommittedBytes is the number of bytes in the file known to be
	// committed, i.e. the span of all segments in memAcct for which
	// knownCommitted is true.
	//
	// commitSeq is a sequence counter used to detect races between scans for
	// committed pages and concurrent decommitment.
	//
	// nextCommitScan is the next time at which UpdateUsage() may scan the
	// backing file for commitment information.
	//
	// isSaving is non-zero during f.SaveTo() to prevent concurrent calls to
	// f.UpdateUsage() from marking pages as committed.
	//
	// All of these fields are protected by mu.
	memAcct             memAcctSet
	knownCommittedBytes uint64
	commitSeq           uint64
	nextCommitScan      time.Time
	isSaving            uint

	// evictable maps EvictableMemoryUsers to eviction state.
	//
	// evictable is protected by mu.
	evictable map[EvictableMemoryUser]*evictableMemoryUserInfo

	// evictionWG counts the number of goroutines currently performing evictions.
	evictionWG sync.WaitGroup

	// opts holds options passed to NewMemoryFile. opts is immutable.
	opts MemoryFileOpts

	// savable is true if this MemoryFile will be saved via SaveTo() during
	// the kernel's SaveTo operation. savable is protected by mu.
	savable bool

	// destroyed is set by Destroy to instruct the releaser goroutine to
	// release all MemoryFile resources and exit. destroyed is protected by mu.
	destroyed bool

	// stopNotifyPressure stops memory cgroup pressure level
	// notifications used to drive eviction. stopNotifyPressure is
	// immutable.
	stopNotifyPressure func()

	// If asyncPageLoad is non-nil, it tracks the state of in-progress or
	// failed async page loading.
	asyncPageLoad atomic.Pointer[aplShared]

	// file is the backing file. The file pointer is immutable.
	file *os.File

	// chunks holds metadata for each usable chunk in the backing file.
	//
	// chunks is at the end of MemoryFile in hopes of placing it on a relatively
	// quiet cache line, since MapInternal() is by far the hottest path through
	// pgalloc.
	//
	// chunks is protected by mu. chunks slices are immutable.
	chunks atomic.Pointer[[]chunkInfo]
}
|
|
|
|
// The backing file is managed in fixed-size chunks; these constants define
// the chunk geometry.
const (
	// chunkShift is log2 of chunkSize.
	chunkShift = 30
	// chunkSize is the size in bytes of each chunk.
	chunkSize = 1 << chunkShift // 1 GB
	// chunkMask masks the offset-within-chunk bits of a file offset.
	chunkMask = chunkSize - 1
	// maxChunks is the largest number of chunks whose total size still fits
	// in a file size.
	maxChunks = math.MaxInt64 / chunkSize // because file size is int64
)
|
|
|
|
// chunkInfo is the value type of MemoryFile.chunks.
//
// +stateify savable
type chunkInfo struct {
	// mapping is the start address of a mapping of the chunk.
	//
	// mapping is immutable.
	mapping uintptr `state:"nosave"`

	// huge is true if this chunk is expected to be hugepage-backed and false if
	// this chunk is expected to be smallpage-backed.
	//
	// huge is immutable.
	huge bool
}
|
|
|
|
func (f *MemoryFile) chunksLoad() []chunkInfo {
|
|
return *f.chunks.Load()
|
|
}
|
|
|
|
// forEachChunk invokes fn on a sequence of chunks that collectively span all
|
|
// bytes in fr. In each call, chunkFR is the subset of fr that falls within
|
|
// chunk. If any call to f returns false, forEachChunk stops iteration and
|
|
// returns.
|
|
func (f *MemoryFile) forEachChunk(fr memmap.FileRange, fn func(chunk *chunkInfo, chunkFR memmap.FileRange) bool) {
|
|
chunks := f.chunksLoad()
|
|
chunkStart := fr.Start &^ chunkMask
|
|
i := int(fr.Start / chunkSize)
|
|
for chunkStart < fr.End {
|
|
chunkEnd := chunkStart + chunkSize
|
|
if !fn(&chunks[i], fr.Intersect(memmap.FileRange{chunkStart, chunkEnd})) {
|
|
return
|
|
}
|
|
chunkStart = chunkEnd
|
|
i++
|
|
}
|
|
}
|
|
|
|
// unwasteInfo is the value type of MemoryFile.unwasteSmall/Huge. It carries no
// data; only the presence or absence of a segment is meaningful.
//
// +stateify savable
type unwasteInfo struct{}
|
|
|
|
// unfreeInfo is the value type of MemoryFile.unfreeSmall/Huge.
//
// +stateify savable
type unfreeInfo struct {
	// refs is the per-page reference count. refs is non-zero for used pages,
	// and zero for void, waste, releasing, and sub-released pages, as well as
	// pages backed by a different page size.
	refs uint64
}
|
|
|
|
// memAcctInfo is the value type of MemoryFile.memAcct.
//
// +stateify savable
type memAcctInfo struct {
	// kind is the memory accounting type. kind is allocation-dependent for
	// used pages, and usage.System for void, waste, releasing, and
	// sub-released pages.
	kind usage.MemoryKind

	// memCgID is the memory cgroup ID to which represented pages are accounted.
	memCgID uint32

	// knownCommitted is true if represented pages are definitely committed.
	// (If knownCommitted is false, represented pages may or may not be
	// committed; pages that are definitely not committed are represented by
	// gaps in MemoryFile.memAcct.)
	knownCommitted bool

	// If true, represented pages are waste or releasing pages.
	wasteOrReleasing bool

	// If knownCommitted is false, commitSeq was the value of
	// MemoryFile.commitSeq when knownCommitted last transitioned to false.
	// Otherwise, commitSeq is 0.
	commitSeq uint64
}
|
|
|
|
// An EvictableMemoryUser represents a user of MemoryFile-allocated memory that
// may be asked to deallocate that memory in the presence of memory pressure.
type EvictableMemoryUser interface {
	// Evict requests that the EvictableMemoryUser deallocate memory used by
	// er, which was registered as evictable by a previous call to
	// MemoryFile.MarkEvictable.
	//
	// Evict is not required to deallocate memory. In particular, since pgalloc
	// must call Evict without holding locks to avoid circular lock ordering,
	// it is possible that the passed range has already been marked as
	// unevictable by a racing call to MemoryFile.MarkUnevictable.
	// Implementations of EvictableMemoryUser must detect such races and handle
	// them by making Evict have no effect on unevictable ranges.
	//
	// After a call to Evict, the MemoryFile will consider the evicted range
	// unevictable (i.e. it will not call Evict on the same range again) until
	// informed otherwise by a subsequent call to MarkEvictable.
	Evict(ctx context.Context, er EvictableRange)
}
|
|
|
|
// An EvictableRange represents a range of uint64 offsets in an
|
|
// EvictableMemoryUser.
|
|
//
|
|
// In practice, most EvictableMemoryUsers will probably be implementations of
|
|
// memmap.Mappable, and EvictableRange therefore corresponds to
|
|
// memmap.MappableRange. However, this package cannot depend on the memmap
|
|
// package, since doing so would create a circular dependency.
|
|
//
|
|
// type EvictableRange <generated using go_generics>
|
|
|
|
// evictableMemoryUserInfo is the value type of MemoryFile.evictable.
type evictableMemoryUserInfo struct {
	// ranges tracks all evictable ranges for the given user.
	ranges evictableRangeSet

	// If evicting is true, there is a goroutine currently evicting all
	// evictable ranges for this user.
	evicting bool
}
|
|
|
|
// MemoryFileOpts provides options to NewMemoryFile.
type MemoryFileOpts struct {
	// DelayedEviction controls the extent to which the MemoryFile may delay
	// eviction of evictable allocations.
	DelayedEviction DelayedEvictionType

	// If UseHostMemcgPressure is true, use host memory cgroup pressure level
	// notifications to determine when eviction is necessary. This option has
	// no effect unless DelayedEviction is DelayedEvictionEnabled.
	UseHostMemcgPressure bool

	// DecommitOnDestroy indicates whether the entire host file should be
	// decommitted on destruction. This is appropriate for host filesystem based
	// files that need to be explicitly cleaned up to release disk space.
	DecommitOnDestroy bool

	// If DisableIMAWorkAround is true, NewMemoryFile will not call
	// IMAWorkAroundForMemFile().
	DisableIMAWorkAround bool

	// DiskBackedFile indicates that the MemoryFile is backed by a file on disk.
	DiskBackedFile bool

	// RestoreID is an opaque string used to reassociate the MemoryFile with its
	// replacement during restore.
	RestoreID string

	// If ExpectHugepages is true, MemoryFile will expect that the host will
	// attempt to back AllocOpts.Huge == true allocations with huge pages. If
	// ExpectHugepages is false, MemoryFile will expect that the host will back
	// all allocations with small pages.
	ExpectHugepages bool

	// If AdviseHugepage is true, MemoryFile will request that the host back
	// AllocOpts.Huge == true allocations with huge pages using MADV_HUGEPAGE.
	AdviseHugepage bool

	// If AdviseNoHugepage is true, MemoryFile will request that the host back
	// AllocOpts.Huge == false allocations with small pages using
	// MADV_NOHUGEPAGE.
	AdviseNoHugepage bool

	// If DisableMemoryAccounting is true, memory usage observed by the
	// MemoryFile will not be reported in usage.MemoryAccounting.
	DisableMemoryAccounting bool
}
|
|
|
|
// DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction.
type DelayedEvictionType uint8
|
|
|
|
// Valid values of MemoryFileOpts.DelayedEviction.
const (
	// DelayedEvictionDefault has unspecified behavior.
	DelayedEvictionDefault DelayedEvictionType = iota

	// DelayedEvictionDisabled requires that evictable allocations are evicted
	// as soon as possible.
	DelayedEvictionDisabled

	// DelayedEvictionEnabled requests that the MemoryFile delay eviction of
	// evictable allocations until doing so is considered necessary to avoid
	// performance degradation due to host memory pressure, or OOM kills.
	//
	// As of this writing, the behavior of DelayedEvictionEnabled depends on
	// whether or not MemoryFileOpts.UseHostMemcgPressure is enabled:
	//
	// - If UseHostMemcgPressure is true, evictions are delayed until memory
	//   pressure is indicated.
	//
	// - Otherwise, evictions are only delayed until the releaser goroutine is
	//   out of work (pages to release).
	DelayedEvictionEnabled

	// DelayedEvictionManual requires that evictable allocations are only
	// evicted when MemoryFile.StartEvictions() is called. This is extremely
	// dangerous outside of tests.
	DelayedEvictionManual
)
|
|
|
|
// NewMemoryFile creates a MemoryFile backed by the given file. If
|
|
// NewMemoryFile succeeds, ownership of file is transferred to the returned
|
|
// MemoryFile.
|
|
func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
|
|
switch opts.DelayedEviction {
|
|
case DelayedEvictionDefault:
|
|
opts.DelayedEviction = DelayedEvictionEnabled
|
|
case DelayedEvictionDisabled, DelayedEvictionManual:
|
|
opts.UseHostMemcgPressure = false
|
|
case DelayedEvictionEnabled:
|
|
// ok
|
|
default:
|
|
return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction)
|
|
}
|
|
|
|
// Truncate the file to 0 bytes first to ensure that it's empty.
|
|
if err := file.Truncate(0); err != nil {
|
|
return nil, err
|
|
}
|
|
f := &MemoryFile{
|
|
opts: opts,
|
|
file: file,
|
|
}
|
|
f.initFields()
|
|
|
|
if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure {
|
|
stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() {
|
|
f.mu.Lock()
|
|
startedAny := f.startEvictionsLocked()
|
|
f.mu.Unlock()
|
|
if startedAny {
|
|
log.Debugf("pgalloc.MemoryFile performing evictions due to memcg pressure")
|
|
}
|
|
}, "low")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to configure memcg pressure level notifications: %v", err)
|
|
}
|
|
f.stopNotifyPressure = stop
|
|
}
|
|
|
|
go f.releaserMain() // S/R-SAFE: f.mu
|
|
|
|
if !opts.DisableIMAWorkAround {
|
|
IMAWorkAroundForMemFile(file.Fd())
|
|
}
|
|
return f, nil
|
|
}
|
|
|
|
func (f *MemoryFile) initFields() {
|
|
// Initially, all pages are void.
|
|
fullFR := memmap.FileRange{0, math.MaxUint64}
|
|
f.unwasteSmall.InsertRange(fullFR, unwasteInfo{})
|
|
f.unwasteHuge.InsertRange(fullFR, unwasteInfo{})
|
|
f.releaseCond.L = &f.mu
|
|
f.unfreeSmall.InsertRange(fullFR, unfreeInfo{})
|
|
f.unfreeHuge.InsertRange(fullFR, unfreeInfo{})
|
|
f.subreleased = make(map[uint64]uint64)
|
|
f.evictable = make(map[EvictableMemoryUser]*evictableMemoryUserInfo)
|
|
chunks := []chunkInfo(nil)
|
|
f.chunks.Store(&chunks)
|
|
}
|
|
|
|
// IMAWorkAroundForMemFile works around IMA by immediately creating a temporary
|
|
// PROT_EXEC mapping, while the backing file is still small. IMA will ignore
|
|
// any future mappings.
|
|
//
|
|
// The Linux kernel contains an optional feature called "Integrity
|
|
// Measurement Architecture" (IMA). If IMA is enabled, it will checksum
|
|
// binaries the first time they are mapped PROT_EXEC. This is bad news for
|
|
// executable pages mapped from our backing file, which can grow to
|
|
// terabytes in (sparse) size. If IMA attempts to checksum a file that
|
|
// large, it will allocate all of the sparse pages and quickly exhaust all
|
|
// memory.
|
|
func IMAWorkAroundForMemFile(fd uintptr) {
|
|
m, _, errno := unix.Syscall6(
|
|
unix.SYS_MMAP,
|
|
0,
|
|
hostarch.PageSize,
|
|
unix.PROT_EXEC,
|
|
unix.MAP_SHARED,
|
|
fd,
|
|
0)
|
|
if errno != 0 {
|
|
// This isn't fatal (IMA may not even be in use). Log the error, but
|
|
// don't return it.
|
|
log.Warningf("Failed to pre-map MemoryFile PROT_EXEC: %v", errno)
|
|
} else {
|
|
if _, _, errno := unix.Syscall(
|
|
unix.SYS_MUNMAP,
|
|
m,
|
|
hostarch.PageSize,
|
|
0); errno != 0 {
|
|
panic(fmt.Sprintf("failed to unmap PROT_EXEC MemoryFile mapping: %v", errno))
|
|
}
|
|
}
|
|
}
|
|
|
|
// Destroy releases all resources used by f.
|
|
//
|
|
// Preconditions: All pages allocated by f have been freed.
|
|
//
|
|
// Postconditions: None of f's methods may be called after Destroy.
|
|
func (f *MemoryFile) Destroy() {
|
|
f.mu.Lock()
|
|
defer f.mu.Unlock()
|
|
f.destroyed = true
|
|
f.releaseCond.Signal()
|
|
}
|
|
|
|
// Preconditions: f.mu must be locked.
|
|
func (f *MemoryFile) releaserDestroyLocked() {
|
|
if !f.destroyed {
|
|
panic("destroyed is no longer set")
|
|
}
|
|
|
|
if f.opts.DecommitOnDestroy {
|
|
if chunks := f.chunksLoad(); len(chunks) != 0 {
|
|
if err := f.decommitFile(memmap.FileRange{0, uint64(len(chunks)) * chunkSize}); err != nil {
|
|
panic(fmt.Sprintf("failed to decommit entire memory file during destruction: %v", err))
|
|
}
|
|
}
|
|
}
|
|
|
|
f.file.Close()
|
|
// Ensure that any attempts to use f.file.Fd() fail instead of getting a fd
|
|
// that has possibly been reassigned.
|
|
f.file = nil
|
|
chunks := f.chunksLoad()
|
|
for i := range chunks {
|
|
chunk := &chunks[i]
|
|
_, _, errno := unix.Syscall(unix.SYS_MUNMAP, chunk.mapping, chunkSize, 0)
|
|
if errno != 0 {
|
|
log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", chunk.mapping, i, errno)
|
|
}
|
|
chunk.mapping = 0
|
|
}
|
|
}
|
|
|
|
// AllocOpts are options used in MemoryFile.Allocate.
type AllocOpts struct {
	// Kind is the allocation's memory accounting type.
	Kind usage.MemoryKind

	// MemCgID is the memory cgroup ID and the zero value indicates that
	// the memory will not be accounted to any cgroup.
	MemCgID uint32

	// Mode controls the commitment status of returned pages.
	Mode AllocationMode

	// If Huge is true, the allocation should be hugepage-backed if possible.
	Huge bool

	// Dir indicates the direction in which offsets are allocated.
	Dir Direction

	// If ReaderFunc is provided, the allocated memory is filled by calling it
	// repeatedly until either length bytes are read or a non-nil error is
	// returned. It returns the allocated memory, truncated down to the nearest
	// page. If this is shorter than length bytes due to an error returned by
	// ReaderFunc, it returns the partially filled fr and error.
	ReaderFunc safemem.ReaderFunc
}
|
|
|
|
// Direction is the type of AllocOpts.Dir.
type Direction uint8

const (
	// BottomUp allocates offsets in increasing offsets.
	BottomUp Direction = iota
	// TopDown allocates offsets in decreasing offsets.
	TopDown
)

// String implements fmt.Stringer. It panics if d is not one of the defined
// Direction values.
func (d Direction) String() string {
	if d == BottomUp {
		return "up"
	}
	if d == TopDown {
		return "down"
	}
	panic(fmt.Sprintf("invalid direction: %d", d))
}
|
|
|
|
// AllocationMode is the type of AllocOpts.Mode.
type AllocationMode int
|
|
|
|
// Valid values of AllocOpts.Mode.
const (
	// AllocateUncommitted indicates that MemoryFile.Allocate() must return
	// uncommitted pages.
	AllocateUncommitted AllocationMode = iota

	// AllocateCallerIndirectCommit indicates that the caller of
	// MemoryFile.Allocate() intends to commit all allocated pages, without
	// using our page tables. Thus, Allocate() may return committed or
	// uncommitted pages.
	AllocateCallerIndirectCommit

	// AllocateAndCommit indicates that MemoryFile.Allocate() must return
	// committed pages.
	AllocateAndCommit

	// AllocateAndWritePopulate indicates that the caller of
	// MemoryFile.Allocate() intends to commit all allocated pages, using our
	// page tables. Thus, Allocate() may return committed or uncommitted pages,
	// and should pre-populate page table entries permitting writing for
	// mappings of those pages returned by MapInternal().
	AllocateAndWritePopulate
)
|
|
|
|
// allocState holds the state of a call to MemoryFile.Allocate().
type allocState struct {
	// length is the requested allocation length in bytes.
	length uint64

	// opts is the caller's AllocOpts. Allocate may overwrite opts.Mode on
	// this copy.
	opts AllocOpts

	// willCommit is true if the allocated pages are expected to be committed,
	// either by us or by our caller (i.e. opts.Mode was not
	// AllocateUncommitted).
	willCommit bool

	// recycled is set to true when the allocation was satisfied by recycling
	// waste pages rather than by taking free pages.
	recycled bool

	// huge is true if the allocation is taken from hugepage-backed ranges,
	// i.e. the caller requested AllocOpts.Huge and the MemoryFile was created
	// with MemoryFileOpts.ExpectHugepages.
	huge bool
}
|
|
|
|
// Allocate returns a range of initially-zeroed pages of the given length, with
// a single reference on each page held by the caller. When the last reference
// on an allocated page is released, ownership of the page is returned to the
// MemoryFile, allowing it to be returned by a future call to Allocate.
//
// If opts.ReaderFunc is non-nil, the allocation is filled from it; if fewer
// than length bytes (rounded down to a page) are read, the unused tail is
// released and the shortened range is returned, together with the reader's
// error if there was one.
//
// Allocate panics if a precondition on length is violated or if opts.Mode is
// not a known AllocationMode.
//
// Preconditions:
//   - length > 0.
//   - length must be page-aligned.
//   - If opts.Huge == true, length must be hugepage-aligned.
func (f *MemoryFile) Allocate(length uint64, opts AllocOpts) (memmap.FileRange, error) {
	if length == 0 || !hostarch.IsPageAligned(length) || (opts.Huge && !hostarch.IsHugePageAligned(length)) {
		panic(fmt.Sprintf("invalid allocation length: %#x", length))
	}

	alloc := allocState{
		length:     length,
		opts:       opts,
		willCommit: opts.Mode != AllocateUncommitted,
		huge:       opts.Huge && f.opts.ExpectHugepages,
	}

	// Pick the range and take the reference; everything after this point
	// must DecRef(fr) on failure.
	fr, err := f.findAllocatableAndMarkUsed(&alloc)
	if err != nil {
		return fr, err
	}

	var dsts safemem.BlockSeq
	if alloc.willCommit {
		needHugeTouch := false
		if alloc.recycled {
			// We will need writable page table entries in our address space to
			// zero these pages.
			alloc.opts.Mode = AllocateAndWritePopulate
		} else if alloc.opts.Mode != AllocateAndWritePopulate && ((alloc.huge && f.opts.AdviseHugepage) || (!alloc.huge && f.opts.AdviseNoHugepage)) {
			// If Mode is AllocateCallerIndirectCommit and we do nothing, the
			// first access to the allocation may be by the application,
			// through a platform.AddressSpace, which may not have
			// MADV_HUGEPAGE (=> vma flag VM_HUGEPAGE) set. Consequently,
			// shmem_fault() => shmem_get_folio_gfp() will commit a small page.
			//
			// If Mode is AllocateAndCommit and we do nothing, the first access
			// to the allocation is via fallocate(2), which has the same
			// problem: shmem_fallocate() => shmem_get_folio() =>
			// shmem_get_folio_gfp(vma=NULL).
			//
			// khugepaged may eventually collapse the containing
			// hugepage-aligned region into a huge page when it scans our
			// mapping (khugepaged_scan_mm_slot() => khugepaged_scan_file()),
			// but this depends on khugepaged_max_ptes_none, and in addition to
			// the latency and overhead of doing so, this will incur another
			// round of page faults.
			//
			// If write-populating through our mappings succeeds, then it will
			// avoid this problem. Otherwise, we need to touch each huge page
			// through our mappings.
			//
			// An analogous problem applies if MADV_NOHUGEPAGE is required
			// rather than MADV_HUGEPAGE; MADV_NOHUGEPAGE is only enabled if
			// the file defaults to huge pages, so populating or touching
			// through our mappings is needed to ensure that the allocation is
			// small-page-backed. In this case, we only need to force
			// commitment of one small page per huge page to prevent future
			// page faults within the huge page from faulting a huge page,
			// though there's nothing we can do about khugepaged.
			alloc.opts.Mode = AllocateAndWritePopulate
			needHugeTouch = true
		}

		switch alloc.opts.Mode {
		case AllocateUncommitted, AllocateCallerIndirectCommit:
			// Nothing for us to do.
		case AllocateAndCommit:
			if err := f.commitFile(fr); err != nil {
				f.DecRef(fr)
				return memmap.FileRange{}, err
			}
		case AllocateAndWritePopulate:
			dsts, err = f.MapInternal(fr, hostarch.Write)
			if err != nil {
				f.DecRef(fr)
				return memmap.FileRange{}, err
			}
			if canPopulate() {
				// Populate block by block, stopping at the first failure.
				// Touching is only skipped if every block was populated.
				rem := dsts
				for {
					if !tryPopulate(rem.Head()) {
						break
					}
					rem = rem.Tail()
					if rem.IsEmpty() {
						needHugeTouch = false
						break
					}
				}
			}
			if alloc.recycled {
				// The contents of recycled waste pages are initially unknown, so we
				// need to zero them.
				f.manuallyZero(fr)
			} else if needHugeTouch {
				// We only need to touch a single byte in each huge page.
				f.forEachMappingSlice(fr, func(bs []byte) {
					for i := 0; i < len(bs); i += hostarch.HugePageSize {
						bs[i] = 0
					}
				})
			}
		default:
			panic(fmt.Sprintf("unknown AllocOpts.Mode %d", alloc.opts.Mode))
		}
	}
	if alloc.opts.ReaderFunc != nil {
		// Reuse the mapping obtained above if there is one.
		if dsts.IsEmpty() {
			dsts, err = f.MapInternal(fr, hostarch.Write)
			if err != nil {
				f.DecRef(fr)
				return memmap.FileRange{}, err
			}
		}
		n, err := safemem.ReadFullToBlocks(alloc.opts.ReaderFunc, dsts)
		// Only whole pages that were read into are kept.
		un := uint64(hostarch.Addr(n).RoundDown())
		if un < length {
			// Free unused memory and update fr to contain only the memory that is
			// still allocated.
			f.DecRef(memmap.FileRange{fr.Start + un, fr.End})
			fr.End = fr.Start + un
		}
		if err != nil {
			return fr, err
		}
	}

	return fr, nil
}
|
|
|
|
func (f *MemoryFile) findAllocatableAndMarkUsed(alloc *allocState) (fr memmap.FileRange, err error) {
|
|
unwaste := &f.unwasteSmall
|
|
unfree := &f.unfreeSmall
|
|
if alloc.huge {
|
|
unwaste = &f.unwasteHuge
|
|
unfree = &f.unfreeHuge
|
|
}
|
|
|
|
f.mu.Lock()
|
|
defer f.mu.Unlock()
|
|
|
|
if alloc.willCommit {
|
|
// Try to recycle waste pages, since this avoids the overhead of
|
|
// decommitting and then committing them again.
|
|
var uwgap unwasteGapIterator
|
|
if alloc.opts.Dir == BottomUp {
|
|
uwgap = unwaste.FirstLargeEnoughGap(alloc.length)
|
|
} else {
|
|
uwgap = unwaste.LastLargeEnoughGap(alloc.length)
|
|
}
|
|
if uwgap.Ok() {
|
|
alloc.recycled = true
|
|
if alloc.opts.Dir == BottomUp {
|
|
fr = memmap.FileRange{
|
|
Start: uwgap.Start(),
|
|
End: uwgap.Start() + alloc.length,
|
|
}
|
|
} else {
|
|
fr = memmap.FileRange{
|
|
Start: uwgap.End() - alloc.length,
|
|
End: uwgap.End(),
|
|
}
|
|
}
|
|
unwaste.Insert(uwgap, fr, unwasteInfo{})
|
|
// Update reference count for these pages from 0 to 1.
|
|
unfree.MutateFullRange(fr, func(ufseg unfreeIterator) bool {
|
|
uf := ufseg.ValuePtr()
|
|
if uf.refs != 0 {
|
|
panic(fmt.Sprintf("waste pages %v have unexpected refcount %d during recycling of %v\n%s", ufseg.Range(), uf.refs, fr, f.stringLocked()))
|
|
}
|
|
uf.refs = 1
|
|
return true
|
|
})
|
|
// These pages should all be unknown-commitment or known-committed;
|
|
// mark them unknown-commitment, for consistency with non-recycling
|
|
// allocations (below).
|
|
f.memAcct.MutateFullRange(fr, func(maseg memAcctIterator) bool {
|
|
ma := maseg.ValuePtr()
|
|
malen := maseg.Range().Length()
|
|
if ma.knownCommitted {
|
|
if ma.kind != usage.System {
|
|
panic(fmt.Sprintf("waste pages %v have unexpected kind %v\n%s", maseg.Range(), ma.kind, f.stringLocked()))
|
|
}
|
|
ma.knownCommitted = false
|
|
ma.commitSeq = 0
|
|
f.knownCommittedBytes -= malen
|
|
if !f.opts.DisableMemoryAccounting {
|
|
usage.MemoryAccounting.Dec(malen, usage.System, ma.memCgID)
|
|
}
|
|
}
|
|
ma.kind = alloc.opts.Kind
|
|
ma.memCgID = alloc.opts.MemCgID
|
|
ma.wasteOrReleasing = false
|
|
return true
|
|
})
|
|
return
|
|
}
|
|
}
|
|
|
|
// No suitable waste pages or we can't use them.
|
|
retryFree:
|
|
// Try to allocate free pages from existing chunks.
|
|
var ufgap unfreeGapIterator
|
|
if alloc.opts.Dir == BottomUp {
|
|
ufgap = unfree.FirstLargeEnoughGap(alloc.length)
|
|
} else {
|
|
ufgap = unfree.LastLargeEnoughGap(alloc.length)
|
|
}
|
|
if !ufgap.Ok() {
|
|
// Extend the file to create more chunks.
|
|
err = f.extendChunksLocked(alloc)
|
|
if err != nil {
|
|
return
|
|
}
|
|
// Retry the allocation using new chunks.
|
|
goto retryFree
|
|
}
|
|
if alloc.opts.Dir == BottomUp {
|
|
fr = memmap.FileRange{
|
|
Start: ufgap.Start(),
|
|
End: ufgap.Start() + alloc.length,
|
|
}
|
|
} else {
|
|
fr = memmap.FileRange{
|
|
Start: ufgap.End() - alloc.length,
|
|
End: ufgap.End(),
|
|
}
|
|
}
|
|
unfree.Insert(ufgap, fr, unfreeInfo{refs: 1})
|
|
// These pages should all be known-decommitted; mark them
|
|
// unknown-commitment, since they can be concurrently committed by the
|
|
// allocation's users at any time until deallocation.
|
|
//
|
|
// If alloc.willCommit is true, we expect these pages to become committed
|
|
// in the near future; mark them unknown-commitment anyway, since marking
|
|
// them committed prematurely makes them more likely to be saved even if
|
|
// zeroed, unless SaveOpts.ExcludeCommittedZeroPages is enabled.
|
|
f.memAcct.InsertRange(fr, memAcctInfo{
|
|
kind: alloc.opts.Kind,
|
|
memCgID: alloc.opts.MemCgID,
|
|
knownCommitted: false,
|
|
commitSeq: f.commitSeq,
|
|
})
|
|
return
|
|
}
|
|
|
|
// Preconditions: f.mu must be locked.
|
|
func (f *MemoryFile) extendChunksLocked(alloc *allocState) error {
|
|
unfree := &f.unfreeSmall
|
|
if alloc.huge {
|
|
unfree = &f.unfreeHuge
|
|
}
|
|
|
|
oldChunks := f.chunksLoad()
|
|
oldNrChunks := uint64(len(oldChunks))
|
|
oldFileSize := oldNrChunks * chunkSize
|
|
|
|
// Determine how many chunks we need to satisfy alloc.
|
|
tail := uint64(0)
|
|
if oldNrChunks != 0 {
|
|
if lastChunk := oldChunks[oldNrChunks-1]; lastChunk.huge == alloc.huge {
|
|
// We can use free pages at the end of the current last chunk.
|
|
if ufgap := unfree.FindGap(oldFileSize - 1); ufgap.Ok() {
|
|
tail = ufgap.Range().Length()
|
|
}
|
|
}
|
|
}
|
|
incNrChunks := (alloc.length + chunkMask - tail) / chunkSize
|
|
incFileSize := incNrChunks * chunkSize
|
|
newNrChunks := oldNrChunks + incNrChunks
|
|
if newNrChunks > maxChunks || newNrChunks < oldNrChunks /* overflow */ {
|
|
return linuxerr.ENOMEM
|
|
}
|
|
newFileSize := newNrChunks * chunkSize
|
|
|
|
// Extend the backing file and obtain mappings for the new chunks. If the
|
|
// backing file is memory-backed, and THP is enabled, Linux will align our
|
|
// mapping to a hugepage boundary; see
|
|
// mm/shmem.c:shmem_get_unmapped_area().
|
|
//
|
|
// In tests, f.file may be nil.
|
|
var mapStart uintptr
|
|
if f.file != nil {
|
|
if err := f.file.Truncate(int64(newFileSize)); err != nil {
|
|
return err
|
|
}
|
|
m, _, errno := unix.Syscall6(
|
|
unix.SYS_MMAP,
|
|
0,
|
|
uintptr(incFileSize),
|
|
unix.PROT_READ|unix.PROT_WRITE,
|
|
unix.MAP_SHARED,
|
|
f.file.Fd(),
|
|
uintptr(oldFileSize))
|
|
if errno != 0 {
|
|
return errno
|
|
}
|
|
mapStart = m
|
|
f.madviseChunkMapping(mapStart, uintptr(incFileSize), alloc.huge)
|
|
}
|
|
|
|
// Update chunk state.
|
|
newChunks := make([]chunkInfo, newNrChunks, newNrChunks)
|
|
copy(newChunks, oldChunks)
|
|
m := mapStart
|
|
for i := oldNrChunks; i < newNrChunks; i++ {
|
|
newChunks[i].huge = alloc.huge
|
|
if f.file != nil {
|
|
newChunks[i].mapping = m
|
|
m += chunkSize
|
|
}
|
|
}
|
|
f.chunks.Store(&newChunks)
|
|
|
|
// Mark void pages free.
|
|
unfree.RemoveFullRange(memmap.FileRange{
|
|
Start: oldNrChunks * chunkSize,
|
|
End: newNrChunks * chunkSize,
|
|
})
|
|
|
|
return nil
|
|
}
|
|
|
|
func (f *MemoryFile) madviseChunkMapping(addr, len uintptr, huge bool) {
|
|
if huge {
|
|
if f.opts.AdviseHugepage {
|
|
_, _, errno := unix.Syscall(unix.SYS_MADVISE, addr, len, unix.MADV_HUGEPAGE)
|
|
if errno != 0 {
|
|
// Log this failure but continue.
|
|
log.Warningf("madvise(%#x, %d, MADV_HUGEPAGE) failed: %s", addr, len, errno)
|
|
}
|
|
}
|
|
} else {
|
|
if f.opts.AdviseNoHugepage {
|
|
_, _, errno := unix.Syscall(unix.SYS_MADVISE, addr, len, unix.MADV_NOHUGEPAGE)
|
|
if errno != 0 {
|
|
// Log this failure but continue.
|
|
log.Warningf("madvise(%#x, %d, MADV_NOHUGEPAGE) failed: %s", addr, len, errno)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
var mlockDisabled atomicbitops.Uint32
|
|
var madvPopulateWriteDisabled atomicbitops.Uint32
|
|
|
|
func canPopulate() bool {
|
|
return mlockDisabled.Load() == 0 || madvPopulateWriteDisabled.Load() == 0
|
|
}
|
|
|
|
func tryPopulateMadv(b safemem.Block) bool {
|
|
if madvPopulateWriteDisabled.Load() != 0 {
|
|
return false
|
|
}
|
|
// Only call madvise(MADV_POPULATE_WRITE) if >=2 pages are being populated.
|
|
// 1 syscall overhead >= 1 page fault overhead. This is because syscalls are
|
|
// susceptible to additional overheads like seccomp-bpf filters and auditing.
|
|
if b.Len() <= hostarch.PageSize {
|
|
return true
|
|
}
|
|
_, _, errno := unix.Syscall(unix.SYS_MADVISE, b.Addr(), uintptr(b.Len()), unix.MADV_POPULATE_WRITE)
|
|
if errno != 0 {
|
|
if errno == unix.EINVAL {
|
|
// EINVAL is expected if MADV_POPULATE_WRITE is not supported (Linux <5.14).
|
|
log.Infof("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: madvise failed: %s", errno)
|
|
} else {
|
|
log.Warningf("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: madvise failed: %s", errno)
|
|
}
|
|
madvPopulateWriteDisabled.Store(1)
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
func tryPopulateMlock(b safemem.Block) bool {
|
|
if mlockDisabled.Load() != 0 {
|
|
return false
|
|
}
|
|
// Call mlock to populate pages, then munlock to cancel the mlock (but keep
|
|
// the pages populated). Only do so for hugepage-aligned address ranges to
|
|
// ensure that splitting the VMA in mlock doesn't split any existing
|
|
// hugepages. This assumes that two host syscalls, plus the MM overhead of
|
|
// mlock + munlock, is faster on average than trapping for
|
|
// HugePageSize/PageSize small page faults.
|
|
start, ok := hostarch.Addr(b.Addr()).HugeRoundUp()
|
|
if !ok {
|
|
return true
|
|
}
|
|
end := hostarch.Addr(b.Addr() + uintptr(b.Len())).HugeRoundDown()
|
|
if start >= end {
|
|
return true
|
|
}
|
|
_, _, errno := unix.Syscall(unix.SYS_MLOCK, uintptr(start), uintptr(end-start), 0)
|
|
unix.RawSyscall(unix.SYS_MUNLOCK, uintptr(start), uintptr(end-start), 0)
|
|
if errno != 0 {
|
|
if errno == unix.ENOMEM || errno == unix.EPERM {
|
|
// These errors are expected from hitting non-zero RLIMIT_MEMLOCK, or
|
|
// hitting zero RLIMIT_MEMLOCK without CAP_IPC_LOCK, respectively.
|
|
log.Infof("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: mlock failed: %s", errno)
|
|
} else {
|
|
log.Warningf("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: mlock failed: %s", errno)
|
|
}
|
|
mlockDisabled.Store(1)
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
func tryPopulate(b safemem.Block) bool {
|
|
// There are two approaches for populating writable pages:
|
|
// 1. madvise(MADV_POPULATE_WRITE). It has the desired effect: "Populate
|
|
// (prefault) page tables writable, faulting in all pages in the range
|
|
// just as if manually writing to each each page".
|
|
// 2. Call mlock to populate pages, then munlock to cancel the mlock (but
|
|
// keep the pages populated).
|
|
//
|
|
// Prefer the madvise(MADV_POPULATE_WRITE) approach because:
|
|
// - Only requires 1 syscall, as opposed to 2 syscalls with mlock approach.
|
|
// - It is faster because it doesn't have to modify vmas like mlock does.
|
|
// - It works for disk-backed memory mappings too. The mlock approach doesn't
|
|
// work for disk-backed filesystems (e.g. ext4). This is because
|
|
// mlock(2) => mm/gup.c:__mm_populate() emulates a read fault on writable
|
|
// MAP_SHARED mappings. For memory-backed (shmem) files,
|
|
// mm/mmap.c:vma_set_page_prot() => vma_wants_writenotify() is false, so
|
|
// the page table entries populated by a read fault are writable. For
|
|
// disk-backed files, vma_set_page_prot() => vma_wants_writenotify() is
|
|
// true, so the page table entries populated by a read fault are read-only.
|
|
if tryPopulateMadv(b) {
|
|
return true
|
|
}
|
|
return tryPopulateMlock(b)
|
|
}
|
|
|
|
// Decommit uncommits the given pages, causing them to become zeroed.
|
|
//
|
|
// Preconditions:
|
|
// - fr.Start and fr.End must be page-aligned.
|
|
// - fr.Length() > 0.
|
|
// - At least one reference must be held on all pages in fr.
|
|
func (f *MemoryFile) Decommit(fr memmap.FileRange) {
|
|
if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
|
|
panic(fmt.Sprintf("invalid range: %v", fr))
|
|
}
|
|
|
|
f.decommitOrManuallyZero(fr)
|
|
|
|
f.mu.Lock()
|
|
defer f.mu.Unlock()
|
|
f.memAcct.MutateFullRange(fr, func(maseg memAcctIterator) bool {
|
|
ma := maseg.ValuePtr()
|
|
if ma.knownCommitted {
|
|
ma.knownCommitted = false
|
|
malen := maseg.Range().Length()
|
|
f.knownCommittedBytes -= malen
|
|
if !f.opts.DisableMemoryAccounting {
|
|
usage.MemoryAccounting.Dec(malen, ma.kind, ma.memCgID)
|
|
}
|
|
}
|
|
// Update commitSeq to invalidate any observations made by
|
|
// concurrent calls to f.updateUsageLocked().
|
|
ma.commitSeq = f.commitSeq
|
|
return true
|
|
})
|
|
}
|
|
|
|
func (f *MemoryFile) commitFile(fr memmap.FileRange) error {
|
|
// "The default operation (i.e., mode is zero) of fallocate() allocates the
|
|
// disk space within the range specified by offset and len." - fallocate(2)
|
|
return unix.Fallocate(
|
|
int(f.file.Fd()),
|
|
0, // mode
|
|
int64(fr.Start),
|
|
int64(fr.Length()))
|
|
}
|
|
|
|
func (f *MemoryFile) decommitFile(fr memmap.FileRange) error {
|
|
// "After a successful call, subsequent reads from this range will
|
|
// return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with
|
|
// FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2)
|
|
return unix.Fallocate(
|
|
int(f.file.Fd()),
|
|
unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE,
|
|
int64(fr.Start),
|
|
int64(fr.Length()))
|
|
}
|
|
|
|
func (f *MemoryFile) manuallyZero(fr memmap.FileRange) {
|
|
f.forEachMappingSlice(fr, func(bs []byte) {
|
|
clear(bs)
|
|
})
|
|
}
|
|
|
|
func (f *MemoryFile) decommitOrManuallyZero(fr memmap.FileRange) {
|
|
if err := f.decommitFile(fr); err != nil {
|
|
log.Warningf("Failed to decommit %v: %v", fr, err)
|
|
// Zero the pages manually. This won't reduce memory usage, but at
|
|
// least ensures that the pages will be zeroed when reallocated.
|
|
f.manuallyZero(fr)
|
|
}
|
|
}
|
|
|
|
// HasUniqueRef returns true if all pages in the given range have exactly one
|
|
// reference. A return value of false is inherently racy, but if the caller
|
|
// holds a reference on the given range and is preventing other goroutines from
|
|
// copying it, then a return value of true is not racy.
|
|
//
|
|
// Preconditions: At least one reference must be held on all pages in fr.
|
|
func (f *MemoryFile) HasUniqueRef(fr memmap.FileRange) bool {
|
|
hasUniqueRef := true
|
|
f.mu.Lock()
|
|
defer f.mu.Unlock()
|
|
f.forEachChunk(fr, func(chunk *chunkInfo, chunkFR memmap.FileRange) bool {
|
|
unfree := &f.unfreeSmall
|
|
if chunk.huge {
|
|
unfree = &f.unfreeHuge
|
|
}
|
|
unfree.VisitFullRange(fr, func(ufseg unfreeIterator) bool {
|
|
if ufseg.ValuePtr().refs != 1 {
|
|
hasUniqueRef = false
|
|
return false
|
|
}
|
|
return true
|
|
})
|
|
return hasUniqueRef
|
|
})
|
|
return hasUniqueRef
|
|
}
|
|
|
|
// IncRef implements memmap.File.IncRef.
|
|
func (f *MemoryFile) IncRef(fr memmap.FileRange, memCgID uint32) {
|
|
if !fr.WellFormed() || fr.Length() == 0 || !hostarch.IsPageAligned(fr.Start) || !hostarch.IsPageAligned(fr.End) {
|
|
panic(fmt.Sprintf("invalid range: %v", fr))
|
|
}
|
|
|
|
f.mu.Lock()
|
|
defer f.mu.Unlock()
|
|
f.incRefLocked(fr)
|
|
}
|
|
|
|
// Preconditions: f.mu must be locked.
|
|
func (f *MemoryFile) incRefLocked(fr memmap.FileRange) {
|
|
f.forEachChunk(fr, func(chunk *chunkInfo, chunkFR memmap.FileRange) bool {
|
|
unfree := &f.unfreeSmall
|
|
if chunk.huge {
|
|
unfree = &f.unfreeHuge
|
|
}
|
|
unfree.MutateFullRange(chunkFR, func(ufseg unfreeIterator) bool {
|
|
uf := ufseg.ValuePtr()
|
|
if uf.refs <= 0 {
|
|
panic(fmt.Sprintf("IncRef(%v) called with %d references on pages %v", fr, uf.refs, ufseg.Range()))
|
|
}
|
|
uf.refs++
|
|
return true
|
|
})
|
|
return true
|
|
})
|
|
}
|
|
|
|
// DecRef implements memmap.File.DecRef.
|
|
func (f *MemoryFile) DecRef(fr memmap.FileRange) {
|
|
if !fr.WellFormed() || fr.Length() == 0 || !hostarch.IsPageAligned(fr.Start) || !hostarch.IsPageAligned(fr.End) {
|
|
panic(fmt.Sprintf("invalid range: %v", fr))
|
|
}
|
|
|
|
f.mu.Lock()
|
|
defer f.mu.Unlock()
|
|
|
|
haveWaste := false
|
|
f.forEachChunk(fr, func(chunk *chunkInfo, chunkFR memmap.FileRange) bool {
|
|
unwaste := &f.unwasteSmall
|
|
unfree := &f.unfreeSmall
|
|
if chunk.huge {
|
|
unwaste = &f.unwasteHuge
|
|
unfree = &f.unfreeHuge
|
|
}
|
|
unfree.MutateFullRange(chunkFR, func(ufseg unfreeIterator) bool {
|
|
uf := ufseg.ValuePtr()
|
|
if uf.refs <= 0 {
|
|
panic(fmt.Sprintf("DecRef(%v) called with %d references on pages %v", fr, uf.refs, ufseg.Range()))
|
|
}
|
|
uf.refs--
|
|
if uf.refs == 0 {
|
|
// Mark these pages as waste.
|
|
wasteFR := ufseg.Range()
|
|
unwaste.RemoveFullRange(wasteFR)
|
|
haveWaste = true
|
|
// Reclassify waste memory as System until it's recycled or
|
|
// released.
|
|
f.memAcct.MutateFullRange(wasteFR, func(maseg memAcctIterator) bool {
|
|
ma := maseg.ValuePtr()
|
|
if !f.opts.DisableMemoryAccounting && ma.knownCommitted {
|
|
usage.MemoryAccounting.Move(maseg.Range().Length(), usage.System, ma.kind, ma.memCgID)
|
|
}
|
|
ma.kind = usage.System
|
|
ma.wasteOrReleasing = true
|
|
return true
|
|
})
|
|
// Cancel any pending async load on waste pages.
|
|
if apl := f.asyncPageLoad.Load(); apl != nil {
|
|
apl.cancelWasteLoad(wasteFR)
|
|
}
|
|
}
|
|
return true
|
|
})
|
|
return true
|
|
})
|
|
|
|
// Wake the releaser if we marked any pages as waste. Leave this until just
|
|
// before unlocking f.mu.
|
|
if haveWaste && !f.haveWaste {
|
|
f.haveWaste = true
|
|
f.releaseCond.Signal()
|
|
}
|
|
}
|
|
|
|
// releaserMain implements the releaser goroutine.
|
|
func (f *MemoryFile) releaserMain() {
|
|
f.mu.Lock()
|
|
MainLoop:
|
|
for {
|
|
for {
|
|
if f.destroyed {
|
|
f.releaserDestroyLocked()
|
|
f.mu.Unlock()
|
|
// This must be called without holding f.mu to avoid circular lock
|
|
// ordering.
|
|
if f.stopNotifyPressure != nil {
|
|
f.stopNotifyPressure()
|
|
}
|
|
return
|
|
}
|
|
if f.haveWaste {
|
|
break
|
|
}
|
|
if f.opts.DelayedEviction == DelayedEvictionEnabled && !f.opts.UseHostMemcgPressure {
|
|
// No work to do. Evict any pending evictable allocations to
|
|
// get more waste pages before going to sleep.
|
|
f.startEvictionsLocked()
|
|
}
|
|
f.releaseCond.Wait() // releases f.mu while waiting
|
|
}
|
|
// Huge pages are relatively rare and expensive due to fragmentation
|
|
// and the cost of compaction. Fragmentation is expected to increase
|
|
// over time. Most allocations are done upwards, with the main
|
|
// exception being thread stacks. So we expect lower offsets to weakly
|
|
// correlate with older allocations, which are more likely to actually
|
|
// be hugepage-backed. Thus, release from unwasteSmall before
|
|
// unwasteHuge, and higher offsets before lower ones.
|
|
for i, unwaste := range []*unwasteSet{&f.unwasteSmall, &f.unwasteHuge} {
|
|
if uwgap := unwaste.LastLargeEnoughGap(1); uwgap.Ok() {
|
|
fr := uwgap.Range()
|
|
// Linux serializes fallocate()s on shmem files, so limit the amount we
|
|
// release at once to avoid starving Decommit().
|
|
const maxReleasingBytes = 128 << 20 // 128 MB
|
|
if fr.Length() > maxReleasingBytes {
|
|
fr.Start = fr.End - maxReleasingBytes
|
|
}
|
|
unwaste.Insert(uwgap, fr, unwasteInfo{})
|
|
f.releaseLocked(fr, i == 1)
|
|
continue MainLoop
|
|
}
|
|
}
|
|
f.haveWaste = false
|
|
}
|
|
}
|
|
|
|
// Preconditions: f.mu must be locked; it may be unlocked and reacquired.
|
|
func (f *MemoryFile) releaseLocked(fr memmap.FileRange, huge bool) {
|
|
defer func() {
|
|
maseg := f.memAcct.LowerBoundSegmentSplitBefore(fr.Start)
|
|
for maseg.Ok() && maseg.Start() < fr.End {
|
|
maseg = f.memAcct.SplitAfter(maseg, fr.End)
|
|
ma := maseg.ValuePtr()
|
|
if ma.kind != usage.System {
|
|
panic(fmt.Sprintf("waste pages %v have unexpected kind %v\n%s", maseg.Range(), ma.kind, f.stringLocked()))
|
|
}
|
|
if ma.knownCommitted {
|
|
malen := maseg.Range().Length()
|
|
f.knownCommittedBytes -= malen
|
|
if !f.opts.DisableMemoryAccounting {
|
|
usage.MemoryAccounting.Dec(malen, ma.kind, ma.memCgID)
|
|
}
|
|
}
|
|
maseg = f.memAcct.Remove(maseg).NextSegment()
|
|
}
|
|
}()
|
|
|
|
if !huge {
|
|
// Decommit the range being released, then mark the released range as
|
|
// freed.
|
|
f.mu.Unlock()
|
|
f.decommitOrManuallyZero(fr)
|
|
f.mu.Lock()
|
|
f.unfreeSmall.RemoveFullRange(fr)
|
|
return
|
|
}
|
|
|
|
// Handle huge pages and sub-release.
|
|
|
|
firstHugeStart := hostarch.HugePageRoundDown(fr.Start)
|
|
lastHugeStart := hostarch.HugePageRoundDown(fr.End - 1)
|
|
firstHugeEnd := firstHugeStart + hostarch.HugePageSize
|
|
lastHugeEnd := lastHugeStart + hostarch.HugePageSize
|
|
if firstHugeStart == lastHugeStart {
|
|
// All of fr falls within a single huge page.
|
|
oldSubrel := f.subreleased[firstHugeStart]
|
|
incSubrel := fr.Length() / hostarch.PageSize
|
|
newSubrel := oldSubrel + incSubrel
|
|
if newSubrel == pagesPerHugePage {
|
|
// Free this huge page.
|
|
//
|
|
// When a small page within a hugepage-backed allocation is
|
|
// individually deallocated (becomes waste), we decommit it to
|
|
// reduce memory usage (and for consistency with legacy behavior).
|
|
// This requires the host to split the containing huge page, if one
|
|
// exists. khugepaged may later re-assemble the containing huge
|
|
// page, implicitly re-committing previously-decommitted small
|
|
// pages as a result.
|
|
//
|
|
// Thus: When a huge page is freed, ensure that the whole huge page
|
|
// is decommitted rather than just the final small page(s), to
|
|
// ensure that we leave behind an uncommitted hugepage-sized range
|
|
// with no re-committed small pages.
|
|
if oldSubrel != 0 {
|
|
delete(f.subreleased, firstHugeStart)
|
|
}
|
|
hugeFR := memmap.FileRange{firstHugeStart, firstHugeEnd}
|
|
f.mu.Unlock()
|
|
f.decommitOrManuallyZero(hugeFR)
|
|
f.mu.Lock()
|
|
f.unfreeHuge.RemoveFullRange(hugeFR)
|
|
} else {
|
|
f.subreleased[firstHugeStart] = newSubrel
|
|
f.mu.Unlock()
|
|
f.decommitOrManuallyZero(fr)
|
|
f.mu.Lock()
|
|
}
|
|
return
|
|
}
|
|
|
|
// fr spans at least two huge pages. Resolve sub-release in the first and
|
|
// last huge pages; any huge pages in between are decommitted/freed in
|
|
// full.
|
|
var (
|
|
decommitFR memmap.FileRange
|
|
freeFR memmap.FileRange
|
|
)
|
|
if fr.Start == firstHugeStart {
|
|
decommitFR.Start = firstHugeStart
|
|
freeFR.Start = firstHugeStart
|
|
} else {
|
|
oldSubrel := f.subreleased[firstHugeStart]
|
|
incSubrel := (firstHugeEnd - fr.Start) / hostarch.PageSize
|
|
newSubrel := oldSubrel + incSubrel
|
|
if newSubrel == pagesPerHugePage {
|
|
if oldSubrel != 0 {
|
|
delete(f.subreleased, firstHugeStart)
|
|
}
|
|
decommitFR.Start = firstHugeStart
|
|
freeFR.Start = firstHugeStart
|
|
} else {
|
|
decommitFR.Start = fr.Start
|
|
freeFR.Start = firstHugeEnd
|
|
}
|
|
}
|
|
if fr.End == lastHugeEnd {
|
|
decommitFR.End = lastHugeEnd
|
|
freeFR.End = lastHugeEnd
|
|
} else {
|
|
oldSubrel := f.subreleased[lastHugeStart]
|
|
incSubrel := (fr.End - lastHugeStart) / hostarch.PageSize
|
|
newSubrel := oldSubrel + incSubrel
|
|
if newSubrel == pagesPerHugePage {
|
|
if oldSubrel != 0 {
|
|
delete(f.subreleased, lastHugeStart)
|
|
}
|
|
decommitFR.End = lastHugeEnd
|
|
freeFR.End = lastHugeEnd
|
|
} else {
|
|
decommitFR.End = fr.End
|
|
freeFR.End = lastHugeStart
|
|
}
|
|
}
|
|
f.mu.Unlock()
|
|
f.decommitOrManuallyZero(decommitFR)
|
|
f.mu.Lock()
|
|
if freeFR.Length() != 0 {
|
|
f.unfreeHuge.RemoveFullRange(freeFR)
|
|
}
|
|
}
|
|
|
|
// MapInternal implements memmap.File.MapInternal.
|
|
func (f *MemoryFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) {
|
|
if !fr.WellFormed() || fr.Length() == 0 {
|
|
panic(fmt.Sprintf("invalid range: %v", fr))
|
|
}
|
|
if at.Execute {
|
|
return safemem.BlockSeq{}, linuxerr.EACCES
|
|
}
|
|
|
|
if apl := f.asyncPageLoad.Load(); apl != nil {
|
|
if err := apl.awaitLoad(f, fr); err != nil {
|
|
return safemem.BlockSeq{}, err
|
|
}
|
|
}
|
|
|
|
chunks := ((fr.End + chunkMask) / chunkSize) - (fr.Start / chunkSize)
|
|
if chunks == 1 {
|
|
// Avoid an unnecessary slice allocation.
|
|
var seq safemem.BlockSeq
|
|
f.forEachMappingSlice(fr, func(bs []byte) {
|
|
seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs))
|
|
})
|
|
return seq, nil
|
|
}
|
|
blocks := make([]safemem.Block, 0, chunks)
|
|
f.forEachMappingSlice(fr, func(bs []byte) {
|
|
blocks = append(blocks, safemem.BlockFromSafeSlice(bs))
|
|
})
|
|
return safemem.BlockSeqFromSlice(blocks), nil
|
|
}
|
|
|
|
// forEachMappingSlice invokes fn on a sequence of byte slices that
|
|
// collectively map all bytes in fr.
|
|
func (f *MemoryFile) forEachMappingSlice(fr memmap.FileRange, fn func([]byte)) {
|
|
f.forEachChunk(fr, func(chunk *chunkInfo, chunkFR memmap.FileRange) bool {
|
|
fn(chunk.sliceAt(chunkFR))
|
|
return true
|
|
})
|
|
}
|
|
|
|
// MarkEvictable allows f to request memory deallocation by calling
|
|
// user.Evict(er) in the future.
|
|
//
|
|
// Redundantly marking an already-evictable range as evictable has no effect.
|
|
func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) {
|
|
f.mu.Lock()
|
|
defer f.mu.Unlock()
|
|
info, ok := f.evictable[user]
|
|
if !ok {
|
|
info = &evictableMemoryUserInfo{}
|
|
f.evictable[user] = info
|
|
}
|
|
gap := info.ranges.LowerBoundGap(er.Start)
|
|
for gap.Ok() && gap.Start() < er.End {
|
|
gapER := gap.Range().Intersect(er)
|
|
if gapER.Length() == 0 {
|
|
gap = gap.NextGap()
|
|
continue
|
|
}
|
|
gap = info.ranges.Insert(gap, gapER, evictableRangeSetValue{}).NextGap()
|
|
}
|
|
if !info.evicting {
|
|
switch f.opts.DelayedEviction {
|
|
case DelayedEvictionDisabled:
|
|
// Kick off eviction immediately.
|
|
f.startEvictionGoroutineLocked(user, info)
|
|
case DelayedEvictionEnabled:
|
|
if !f.opts.UseHostMemcgPressure {
|
|
// Ensure that the releaser goroutine is running, so that it
|
|
// can start eviction when necessary.
|
|
f.releaseCond.Signal()
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// MarkUnevictable informs f that user no longer considers er to be evictable,
|
|
// so the MemoryFile should no longer call user.Evict(er). Note that, per
|
|
// EvictableMemoryUser.Evict's documentation, user.Evict(er) may still be
|
|
// called even after MarkUnevictable returns due to race conditions, and
|
|
// implementations of EvictableMemoryUser must handle this possibility.
|
|
//
|
|
// Redundantly marking an already-unevictable range as unevictable has no
|
|
// effect.
|
|
func (f *MemoryFile) MarkUnevictable(user EvictableMemoryUser, er EvictableRange) {
|
|
f.mu.Lock()
|
|
defer f.mu.Unlock()
|
|
info, ok := f.evictable[user]
|
|
if !ok {
|
|
return
|
|
}
|
|
info.ranges.RemoveRange(er)
|
|
// We can only remove info if there's no eviction goroutine running on its
|
|
// behalf.
|
|
if !info.evicting && info.ranges.IsEmpty() {
|
|
delete(f.evictable, user)
|
|
}
|
|
}
|
|
|
|
// MarkAllUnevictable informs f that user no longer considers any offsets to be
|
|
// evictable. It otherwise has the same semantics as MarkUnevictable.
|
|
func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) {
|
|
f.mu.Lock()
|
|
defer f.mu.Unlock()
|
|
info, ok := f.evictable[user]
|
|
if !ok {
|
|
return
|
|
}
|
|
info.ranges.RemoveAll()
|
|
// We can only remove info if there's no eviction goroutine running on its
|
|
// behalf.
|
|
if !info.evicting {
|
|
delete(f.evictable, user)
|
|
}
|
|
}
|
|
|
|
// ShouldCacheEvictable returns true if f is meaningfully delaying evictions of
|
|
// evictable memory, such that it may be advantageous to cache data in
|
|
// evictable memory. The value returned by ShouldCacheEvictable may change
|
|
// between calls.
|
|
func (f *MemoryFile) ShouldCacheEvictable() bool {
|
|
return f.opts.DelayedEviction == DelayedEvictionManual || f.opts.UseHostMemcgPressure
|
|
}
|
|
|
|
// UpdateUsage ensures that the memory usage statistics in
|
|
// usage.MemoryAccounting are up to date. If memCgIDs is nil, all the pages
|
|
// will be scanned. Else only the pages which belong to the memory cgroup ids
|
|
// in memCgIDs will be scanned and the memory usage will be updated.
|
|
func (f *MemoryFile) UpdateUsage(memCgIDs map[uint32]struct{}) error {
|
|
// If we already know of every committed page, skip scanning.
|
|
currentUsage, err := f.TotalUsage()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
f.mu.Lock()
|
|
defer f.mu.Unlock()
|
|
if currentUsage == f.knownCommittedBytes {
|
|
return nil
|
|
}
|
|
|
|
if f.isSaving != 0 {
|
|
log.Debugf("pgalloc.MemoryFile.UpdateUsage() inhibited during MemoryFile save")
|
|
return nil
|
|
}
|
|
|
|
// Linux updates usage values at CONFIG_HZ; throttle our scans to the same
|
|
// frequency.
|
|
startTime := time.Now()
|
|
if startTime.Before(f.nextCommitScan) {
|
|
return nil
|
|
}
|
|
if memCgIDs == nil {
|
|
f.nextCommitScan = startTime.Add(time.Second / linux.CLOCKS_PER_SEC)
|
|
}
|
|
|
|
err = f.updateUsageLocked(memCgIDs, false /* alsoScanCommitted */, false /* callerIsSaveTo */, mincore)
|
|
if _, ok := err.(updateUsageDuringSaveErr); ok {
|
|
log.Debugf("pgalloc.MemoryFile.UpdateUsage() inhibited during MemoryFile save")
|
|
return nil
|
|
}
|
|
if log.IsLogging(log.Debug) {
|
|
log.Debugf("UpdateUsage: took %v, currentUsage=%d knownCommittedBytes=%d",
|
|
time.Since(startTime), currentUsage, f.knownCommittedBytes)
|
|
}
|
|
return err
|
|
}
|
|
|
|
// updateUsageLocked attempts to detect commitment of previously-uncommitted
|
|
// pages by invoking checkCommitted, and updates memory accounting to reflect
|
|
// newly-committed pages. If alsoScanCommitted is true, updateUsageLocked also
|
|
// attempts to detect decommitment of previously-committed pages; this is only
|
|
// used by save/restore, which optionally temporarily treats zeroed pages as
|
|
// decommitted in order to skip saving them.
|
|
//
|
|
// For each page i in bs, checkCommitted must set committed[i] to 1 if the page
|
|
// is committed and 0 otherwise. off is the offset at which bs begins.
|
|
// wasCommitted is true if the page was known-committed before the call to
|
|
// checkCommitted and false otherwise; wasCommitted can only be true if
|
|
// alsoScanCommitted is true.
|
|
//
|
|
// callerIsSaveTo is true if the caller is f.SaveTo() and false if the caller
|
|
// is f.UpdateUsage().
|
|
//
|
|
// Precondition: f.mu must be held; it may be unlocked and reacquired.
|
|
// +checklocks:f.mu
|
|
func (f *MemoryFile) updateUsageLocked(memCgIDs map[uint32]struct{}, alsoScanCommitted, callerIsSaveTo bool, checkCommitted func(bs []byte, committed []byte, off uint64, wasCommitted bool) error) error {
|
|
// Track if anything changed to elide the merge.
|
|
changedAny := false
|
|
defer func() {
|
|
if changedAny {
|
|
f.memAcct.MergeAll()
|
|
}
|
|
}()
|
|
|
|
// Reused mincore buffer.
|
|
var buf []byte
|
|
|
|
maseg := f.memAcct.FirstSegment()
|
|
unscannedStart := uint64(0)
|
|
for maseg.Ok() {
|
|
ma := maseg.ValuePtr()
|
|
if ma.wasteOrReleasing {
|
|
// Skip scanning of waste and releasing pages. This isn't
|
|
// necessarily correct, since !knownCommitted may have become
|
|
// committed after the last call to updateUsageLocked(), then
|
|
// transitioned from used to waste. However, this is consistent
|
|
// with legacy behavior.
|
|
maseg = maseg.NextSegment()
|
|
continue
|
|
}
|
|
wasCommitted := ma.knownCommitted
|
|
if !alsoScanCommitted && wasCommitted {
|
|
maseg = maseg.NextSegment()
|
|
continue
|
|
}
|
|
|
|
// Scan the pages of the given memCgID only. This will avoid scanning
|
|
// the whole memory file when the memory usage is required only for a
|
|
// specific cgroup. The total memory usage of all cgroups can be
|
|
// obtained when memCgIDs is nil.
|
|
if memCgIDs != nil {
|
|
if _, ok := memCgIDs[ma.memCgID]; !ok {
|
|
maseg = maseg.NextSegment()
|
|
continue
|
|
}
|
|
}
|
|
|
|
fr := maseg.Range()
|
|
if fr.Start < unscannedStart {
|
|
fr.Start = unscannedStart
|
|
}
|
|
var checkErr error
|
|
f.forEachChunk(fr, func(chunk *chunkInfo, chunkFR memmap.FileRange) bool {
|
|
s := chunk.sliceAt(chunkFR)
|
|
|
|
// Ensure that we have sufficient buffer for the call (one byte per
|
|
// page). The length of s must be page-aligned.
|
|
bufLen := len(s) / hostarch.PageSize
|
|
if len(buf) < bufLen {
|
|
buf = make([]byte, bufLen)
|
|
}
|
|
|
|
// Query for new pages in core.
|
|
// NOTE(b/165896008): mincore (which is passed as checkCommitted by
|
|
// f.UpdateUsage()) might take a really long time. So unlock f.mu while
|
|
// checkCommitted runs.
|
|
lastCommitSeq := f.commitSeq
|
|
f.commitSeq++
|
|
f.mu.Unlock() // +checklocksforce
|
|
err := checkCommitted(s, buf, chunkFR.Start, wasCommitted)
|
|
f.mu.Lock()
|
|
if err != nil {
|
|
checkErr = err
|
|
return false
|
|
}
|
|
|
|
// Reconcile internal state with buf. Since we temporarily dropped
|
|
// f.mu, f.isSaving and f.memAcct may have changed, and maseg/ma
|
|
// are no longer valid. If wasCommitted is false, then we are
|
|
// marking ranges that are now committed; otherwise, we are marking
|
|
// ranges that are now uncommitted.
|
|
if !callerIsSaveTo && f.isSaving != 0 {
|
|
checkErr = updateUsageDuringSaveErr{}
|
|
return false
|
|
}
|
|
unchangedVal := byte(0)
|
|
if wasCommitted {
|
|
unchangedVal = 1
|
|
}
|
|
maseg = f.memAcct.LowerBoundSegment(chunkFR.Start)
|
|
for i := 0; i < bufLen; {
|
|
if buf[i]&0x1 == unchangedVal {
|
|
i++
|
|
continue
|
|
}
|
|
// Scan to the end of this changed range.
|
|
j := i + 1
|
|
for ; j < bufLen; j++ {
|
|
if buf[j]&0x1 == unchangedVal {
|
|
break
|
|
}
|
|
}
|
|
changedFR := memmap.FileRange{
|
|
Start: chunkFR.Start + uint64(i*hostarch.PageSize),
|
|
End: chunkFR.Start + uint64(j*hostarch.PageSize),
|
|
}
|
|
// Advance maseg to changedFR.Start.
|
|
for maseg.Ok() && maseg.End() <= changedFR.Start {
|
|
maseg = maseg.NextSegment()
|
|
}
|
|
// Update pages overlapping changedFR, but don't mark ranges as
|
|
// committed if they might have raced with decommit.
|
|
for maseg.Ok() && maseg.Start() < changedFR.End {
|
|
if !maseg.ValuePtr().wasteOrReleasing &&
|
|
((!wasCommitted && !maseg.ValuePtr().knownCommitted && ma.commitSeq <= lastCommitSeq) ||
|
|
(wasCommitted && maseg.ValuePtr().knownCommitted)) {
|
|
maseg = f.memAcct.Isolate(maseg, changedFR)
|
|
ma := maseg.ValuePtr()
|
|
amount := maseg.Range().Length()
|
|
if wasCommitted {
|
|
ma.knownCommitted = false
|
|
ma.commitSeq = f.commitSeq
|
|
f.knownCommittedBytes -= amount
|
|
if !f.opts.DisableMemoryAccounting {
|
|
usage.MemoryAccounting.Dec(amount, ma.kind, ma.memCgID)
|
|
}
|
|
} else {
|
|
ma.knownCommitted = true
|
|
ma.commitSeq = 0
|
|
f.knownCommittedBytes += amount
|
|
if !f.opts.DisableMemoryAccounting {
|
|
usage.MemoryAccounting.Inc(amount, ma.kind, ma.memCgID)
|
|
}
|
|
}
|
|
changedAny = true
|
|
}
|
|
maseg = maseg.NextSegment()
|
|
}
|
|
// Continue scanning for changed pages.
|
|
i = j + 1
|
|
}
|
|
|
|
// Don't continue to the next chunk, since while f.mu was unlocked
|
|
// its memory accounting state could have changed completely.
|
|
// Instead, continue the outer loop with the first segment after
|
|
// chunkFR.End.
|
|
maseg = f.memAcct.LowerBoundSegment(chunkFR.End)
|
|
unscannedStart = chunkFR.End
|
|
return false
|
|
})
|
|
if checkErr != nil {
|
|
return checkErr
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// updateUsageDuringSaveErr is a stateless error type whose message reports
// that MemoryFile.UpdateUsage() was called during MemoryFile save.
type updateUsageDuringSaveErr struct{}

// Error implements error.Error.
func (updateUsageDuringSaveErr) Error() string {
	const msg = "pgalloc.MemoryFile.UpdateUsage() called during MemoryFile save"
	return msg
}
|
|
|
|
// TotalUsage returns an aggregate usage for all memory statistics except
|
|
// Mapped (which is external to MemoryFile). This is generally much cheaper
|
|
// than UpdateUsage, but will not provide a fine-grained breakdown.
|
|
func (f *MemoryFile) TotalUsage() (uint64, error) {
|
|
// Stat the underlying file to discover the underlying usage. stat(2)
|
|
// always reports the allocated block count in units of 512 bytes. This
|
|
// includes pages in the page cache and swapped pages.
|
|
var stat unix.Stat_t
|
|
if err := unix.Fstat(int(f.file.Fd()), &stat); err != nil {
|
|
return 0, err
|
|
}
|
|
return uint64(stat.Blocks * 512), nil
|
|
}
|
|
|
|
// TotalSize returns the current size of the backing file in bytes, which is an
|
|
// upper bound on the amount of memory that can currently be allocated from the
|
|
// MemoryFile. The value returned by TotalSize is permitted to change.
|
|
func (f *MemoryFile) TotalSize() uint64 {
|
|
return uint64(len(f.chunksLoad())) * chunkSize
|
|
}
|
|
|
|
// File returns the backing file.
|
|
func (f *MemoryFile) File() *os.File {
|
|
return f.file
|
|
}
|
|
|
|
// DataFD implements memmap.File.DataFD.
|
|
func (f *MemoryFile) DataFD(fr memmap.FileRange) (int, error) {
|
|
if apl := f.asyncPageLoad.Load(); apl != nil {
|
|
if err := apl.awaitLoad(f, fr); err != nil {
|
|
return -1, err
|
|
}
|
|
}
|
|
return f.FD(), nil
|
|
}
|
|
|
|
// FD implements memmap.File.FD.
|
|
func (f *MemoryFile) FD() int {
|
|
return int(f.file.Fd())
|
|
}
|
|
|
|
// IsDiskBacked returns true if f is backed by a file on disk.
|
|
func (f *MemoryFile) IsDiskBacked() bool {
|
|
return f.opts.DiskBackedFile
|
|
}
|
|
|
|
// HugepagesEnabled returns true if the MemoryFile expects to back allocations
|
|
// for which AllocOpts.Huge == true with huge pages.
|
|
func (f *MemoryFile) HugepagesEnabled() bool {
|
|
return f.opts.ExpectHugepages
|
|
}
|
|
|
|
// String implements fmt.Stringer.String.
|
|
func (f *MemoryFile) String() string {
|
|
f.mu.Lock()
|
|
defer f.mu.Unlock()
|
|
return f.stringLocked()
|
|
}
|
|
|
|
// Preconditions: f.mu must be locked.
|
|
func (f *MemoryFile) stringLocked() string {
|
|
var b strings.Builder
|
|
fmt.Fprintf(&b, "unwasteSmall:\n%s", &f.unwasteSmall)
|
|
if f.opts.ExpectHugepages {
|
|
fmt.Fprintf(&b, "unwasteHuge:\n%s", &f.unwasteHuge)
|
|
}
|
|
fmt.Fprintf(&b, "unfreeSmall:\n%s", &f.unfreeSmall)
|
|
if f.opts.ExpectHugepages {
|
|
fmt.Fprintf(&b, "unfreeHuge:\n%s", &f.unfreeHuge)
|
|
fmt.Fprintf(&b, "subreleased:\n")
|
|
for off, pgs := range f.subreleased {
|
|
fmt.Fprintf(&b, "- %#x: %d\n", off, pgs)
|
|
}
|
|
}
|
|
fmt.Fprintf(&b, "memAcct:\n%s", &f.memAcct)
|
|
return b.String()
|
|
}
|
|
|
|
// StartEvictions requests that f evict all evictable allocations. It does not
|
|
// wait for eviction to complete; for this, see MemoryFile.WaitForEvictions.
|
|
func (f *MemoryFile) StartEvictions() {
|
|
f.mu.Lock()
|
|
defer f.mu.Unlock()
|
|
f.startEvictionsLocked()
|
|
}
|
|
|
|
// Preconditions: f.mu must be locked.
|
|
func (f *MemoryFile) startEvictionsLocked() bool {
|
|
startedAny := false
|
|
for user, info := range f.evictable {
|
|
// Don't start multiple goroutines to evict the same user's
|
|
// allocations.
|
|
if !info.evicting {
|
|
f.startEvictionGoroutineLocked(user, info)
|
|
startedAny = true
|
|
}
|
|
}
|
|
return startedAny
|
|
}
|
|
|
|
// Preconditions:
|
|
// - info == f.evictable[user].
|
|
// - !info.evicting.
|
|
// - f.mu must be locked.
|
|
func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) {
|
|
info.evicting = true
|
|
f.evictionWG.Add(1)
|
|
go func() { // S/R-SAFE: f.evictionWG
|
|
defer f.evictionWG.Done()
|
|
for {
|
|
f.mu.Lock()
|
|
info, ok := f.evictable[user]
|
|
if !ok {
|
|
// This shouldn't happen: only this goroutine is permitted
|
|
// to delete this entry.
|
|
f.mu.Unlock()
|
|
panic(fmt.Sprintf("evictableMemoryUserInfo for EvictableMemoryUser %v deleted while eviction goroutine running", user))
|
|
}
|
|
if info.ranges.IsEmpty() {
|
|
delete(f.evictable, user)
|
|
f.mu.Unlock()
|
|
return
|
|
}
|
|
// Evict from the end of info.ranges, under the assumption that
|
|
// if ranges in user start being used again (and are
|
|
// consequently marked unevictable), such uses are more likely
|
|
// to start from the beginning of user.
|
|
seg := info.ranges.LastSegment()
|
|
er := seg.Range()
|
|
info.ranges.Remove(seg)
|
|
// user.Evict() must be called without holding f.mu to avoid
|
|
// circular lock ordering.
|
|
f.mu.Unlock()
|
|
user.Evict(context.Background(), er)
|
|
}
|
|
}()
|
|
}
|
|
|
|
// WaitForEvictions blocks until f is no longer evicting any evictable
|
|
// allocations.
|
|
func (f *MemoryFile) WaitForEvictions() {
|
|
f.evictionWG.Wait()
|
|
}
|
|
|
|
type unwasteSetFunctions struct{}
|
|
|
|
func (unwasteSetFunctions) MinKey() uint64 {
|
|
return 0
|
|
}
|
|
|
|
func (unwasteSetFunctions) MaxKey() uint64 {
|
|
return math.MaxUint64
|
|
}
|
|
|
|
func (unwasteSetFunctions) ClearValue(val *unwasteInfo) {
|
|
}
|
|
|
|
func (unwasteSetFunctions) Merge(_ memmap.FileRange, val1 unwasteInfo, _ memmap.FileRange, val2 unwasteInfo) (unwasteInfo, bool) {
|
|
return val1, val1 == val2
|
|
}
|
|
|
|
func (unwasteSetFunctions) Split(_ memmap.FileRange, val unwasteInfo, _ uint64) (unwasteInfo, unwasteInfo) {
|
|
return val, val
|
|
}
|
|
|
|
type unfreeSetFunctions struct{}
|
|
|
|
func (unfreeSetFunctions) MinKey() uint64 {
|
|
return 0
|
|
}
|
|
|
|
func (unfreeSetFunctions) MaxKey() uint64 {
|
|
return math.MaxUint64
|
|
}
|
|
|
|
func (unfreeSetFunctions) ClearValue(val *unfreeInfo) {
|
|
}
|
|
|
|
func (unfreeSetFunctions) Merge(_ memmap.FileRange, val1 unfreeInfo, _ memmap.FileRange, val2 unfreeInfo) (unfreeInfo, bool) {
|
|
return val1, val1 == val2
|
|
}
|
|
|
|
func (unfreeSetFunctions) Split(_ memmap.FileRange, val unfreeInfo, _ uint64) (unfreeInfo, unfreeInfo) {
|
|
return val, val
|
|
}
|
|
|
|
type memAcctSetFunctions struct{}
|
|
|
|
func (memAcctSetFunctions) MinKey() uint64 {
|
|
return 0
|
|
}
|
|
|
|
func (memAcctSetFunctions) MaxKey() uint64 {
|
|
return math.MaxUint64
|
|
}
|
|
|
|
func (memAcctSetFunctions) ClearValue(val *memAcctInfo) {
|
|
}
|
|
|
|
func (memAcctSetFunctions) Merge(_ memmap.FileRange, val1 memAcctInfo, _ memmap.FileRange, val2 memAcctInfo) (memAcctInfo, bool) {
|
|
return val1, val1 == val2
|
|
}
|
|
|
|
func (memAcctSetFunctions) Split(_ memmap.FileRange, val memAcctInfo, _ uint64) (memAcctInfo, memAcctInfo) {
|
|
return val, val
|
|
}
|
|
|
|
// evictableRangeSetValue is the value type of evictableRangeSet.
|
|
type evictableRangeSetValue struct{}
|
|
|
|
type evictableRangeSetFunctions struct{}
|
|
|
|
func (evictableRangeSetFunctions) MinKey() uint64 {
|
|
return 0
|
|
}
|
|
|
|
func (evictableRangeSetFunctions) MaxKey() uint64 {
|
|
return math.MaxUint64
|
|
}
|
|
|
|
func (evictableRangeSetFunctions) ClearValue(val *evictableRangeSetValue) {
|
|
}
|
|
|
|
func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetValue, _ EvictableRange, _ evictableRangeSetValue) (evictableRangeSetValue, bool) {
|
|
return evictableRangeSetValue{}, true
|
|
}
|
|
|
|
func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) {
|
|
return evictableRangeSetValue{}, evictableRangeSetValue{}
|
|
}
|