You've already forked linux-apfs
mirror of
https://github.com/linux-apfs/linux-apfs.git
synced 2026-05-01 15:00:59 -07:00
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton: - a few misc bits - ocfs2 - most(?) of MM * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (125 commits) thp: fix comments of __pmd_trans_huge_lock() cgroup: remove unnecessary 0 check from css_from_id() cgroup: fix idr leak for the first cgroup root mm: memcontrol: fix documentation for compound parameter mm: memcontrol: remove BUG_ON in uncharge_list mm: fix build warnings in <linux/compaction.h> mm, thp: convert from optimistic swapin collapsing to conservative mm, thp: fix comment inconsistency for swapin readahead functions thp: update Documentation/{vm/transhuge,filesystems/proc}.txt shmem: split huge pages beyond i_size under memory pressure thp: introduce CONFIG_TRANSPARENT_HUGE_PAGECACHE khugepaged: add support of collapse for tmpfs/shmem pages shmem: make shmem_inode_info::lock irq-safe khugepaged: move up_read(mmap_sem) out of khugepaged_alloc_page() thp: extract khugepaged from mm/huge_memory.c shmem, thp: respect MADV_{NO,}HUGEPAGE for file mappings shmem: add huge pages support shmem: get_unmapped_area align huge page shmem: prepare huge= mount option and sysfs knob mm, rmap: account shmem thp pages ...
This commit is contained in:
@@ -59,23 +59,23 @@ num_devices parameter is optional and tells zram how many devices should be
|
||||
pre-created. Default: 1.
|
||||
|
||||
2) Set max number of compression streams
|
||||
Regardless of the value passed to this attribute, ZRAM will always
|
||||
allocate multiple compression streams - one per online CPUs - thus
|
||||
allowing several concurrent compression operations. The number of
|
||||
allocated compression streams goes down when some of the CPUs
|
||||
become offline. There is no single-compression-stream mode anymore,
|
||||
unless you are running a UP system or has only 1 CPU online.
|
||||
Regardless of the value passed to this attribute, ZRAM will always
|
||||
allocate multiple compression streams - one per online CPUs - thus
|
||||
allowing several concurrent compression operations. The number of
|
||||
allocated compression streams goes down when some of the CPUs
|
||||
become offline. There is no single-compression-stream mode anymore,
|
||||
unless you are running a UP system or has only 1 CPU online.
|
||||
|
||||
To find out how many streams are currently available:
|
||||
To find out how many streams are currently available:
|
||||
cat /sys/block/zram0/max_comp_streams
|
||||
|
||||
3) Select compression algorithm
|
||||
Using comp_algorithm device attribute one can see available and
|
||||
currently selected (shown in square brackets) compression algorithms,
|
||||
change selected compression algorithm (once the device is initialised
|
||||
there is no way to change compression algorithm).
|
||||
Using comp_algorithm device attribute one can see available and
|
||||
currently selected (shown in square brackets) compression algorithms,
|
||||
change selected compression algorithm (once the device is initialised
|
||||
there is no way to change compression algorithm).
|
||||
|
||||
Examples:
|
||||
Examples:
|
||||
#show supported compression algorithms
|
||||
cat /sys/block/zram0/comp_algorithm
|
||||
lzo [lz4]
|
||||
@@ -83,17 +83,27 @@ pre-created. Default: 1.
|
||||
#select lzo compression algorithm
|
||||
echo lzo > /sys/block/zram0/comp_algorithm
|
||||
|
||||
4) Set Disksize
|
||||
Set disk size by writing the value to sysfs node 'disksize'.
|
||||
The value can be either in bytes or you can use mem suffixes.
|
||||
Examples:
|
||||
# Initialize /dev/zram0 with 50MB disksize
|
||||
echo $((50*1024*1024)) > /sys/block/zram0/disksize
|
||||
For the time being, the `comp_algorithm' content does not necessarily
|
||||
show every compression algorithm supported by the kernel. We keep this
|
||||
list primarily to simplify device configuration and one can configure
|
||||
a new device with a compression algorithm that is not listed in
|
||||
`comp_algorithm'. The thing is that, internally, ZRAM uses Crypto API
|
||||
and, if some of the algorithms were built as modules, it's impossible
|
||||
to list all of them using, for instance, /proc/crypto or any other
|
||||
method. This, however, has an advantage of permitting the usage of
|
||||
custom crypto compression modules (implementing S/W or H/W compression).
|
||||
|
||||
# Using mem suffixes
|
||||
echo 256K > /sys/block/zram0/disksize
|
||||
echo 512M > /sys/block/zram0/disksize
|
||||
echo 1G > /sys/block/zram0/disksize
|
||||
4) Set Disksize
|
||||
Set disk size by writing the value to sysfs node 'disksize'.
|
||||
The value can be either in bytes or you can use mem suffixes.
|
||||
Examples:
|
||||
# Initialize /dev/zram0 with 50MB disksize
|
||||
echo $((50*1024*1024)) > /sys/block/zram0/disksize
|
||||
|
||||
# Using mem suffixes
|
||||
echo 256K > /sys/block/zram0/disksize
|
||||
echo 512M > /sys/block/zram0/disksize
|
||||
echo 1G > /sys/block/zram0/disksize
|
||||
|
||||
Note:
|
||||
There is little point creating a zram of greater than twice the size of memory
|
||||
@@ -101,20 +111,20 @@ since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the
|
||||
size of the disk when not in use so a huge zram is wasteful.
|
||||
|
||||
5) Set memory limit: Optional
|
||||
Set memory limit by writing the value to sysfs node 'mem_limit'.
|
||||
The value can be either in bytes or you can use mem suffixes.
|
||||
In addition, you could change the value in runtime.
|
||||
Examples:
|
||||
# limit /dev/zram0 with 50MB memory
|
||||
echo $((50*1024*1024)) > /sys/block/zram0/mem_limit
|
||||
Set memory limit by writing the value to sysfs node 'mem_limit'.
|
||||
The value can be either in bytes or you can use mem suffixes.
|
||||
In addition, you could change the value in runtime.
|
||||
Examples:
|
||||
# limit /dev/zram0 with 50MB memory
|
||||
echo $((50*1024*1024)) > /sys/block/zram0/mem_limit
|
||||
|
||||
# Using mem suffixes
|
||||
echo 256K > /sys/block/zram0/mem_limit
|
||||
echo 512M > /sys/block/zram0/mem_limit
|
||||
echo 1G > /sys/block/zram0/mem_limit
|
||||
# Using mem suffixes
|
||||
echo 256K > /sys/block/zram0/mem_limit
|
||||
echo 512M > /sys/block/zram0/mem_limit
|
||||
echo 1G > /sys/block/zram0/mem_limit
|
||||
|
||||
# To disable memory limit
|
||||
echo 0 > /sys/block/zram0/mem_limit
|
||||
# To disable memory limit
|
||||
echo 0 > /sys/block/zram0/mem_limit
|
||||
|
||||
6) Activate:
|
||||
mkswap /dev/zram0
|
||||
|
||||
@@ -195,7 +195,9 @@ prototypes:
|
||||
int (*releasepage) (struct page *, int);
|
||||
void (*freepage)(struct page *);
|
||||
int (*direct_IO)(struct kiocb *, struct iov_iter *iter);
|
||||
bool (*isolate_page) (struct page *, isolate_mode_t);
|
||||
int (*migratepage)(struct address_space *, struct page *, struct page *);
|
||||
void (*putback_page) (struct page *);
|
||||
int (*launder_page)(struct page *);
|
||||
int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long);
|
||||
int (*error_remove_page)(struct address_space *, struct page *);
|
||||
@@ -219,7 +221,9 @@ invalidatepage: yes
|
||||
releasepage: yes
|
||||
freepage: yes
|
||||
direct_IO:
|
||||
isolate_page: yes
|
||||
migratepage: yes (both)
|
||||
putback_page: yes
|
||||
launder_page: yes
|
||||
is_partially_uptodate: yes
|
||||
error_remove_page: yes
|
||||
@@ -544,13 +548,13 @@ subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
|
||||
locked. The VM will unlock the page.
|
||||
|
||||
->map_pages() is called when VM asks to map easy accessible pages.
|
||||
Filesystem should find and map pages associated with offsets from "pgoff"
|
||||
till "max_pgoff". ->map_pages() is called with page table locked and must
|
||||
Filesystem should find and map pages associated with offsets from "start_pgoff"
|
||||
till "end_pgoff". ->map_pages() is called with page table locked and must
|
||||
not block. If it's not possible to reach a page without blocking,
|
||||
filesystem should skip it. Filesystem should use do_set_pte() to setup
|
||||
page table entry. Pointer to entry associated with offset "pgoff" is
|
||||
passed in "pte" field in vm_fault structure. Pointers to entries for other
|
||||
offsets should be calculated relative to "pte".
|
||||
page table entry. Pointer to entry associated with the page is passed in
|
||||
"pte" field in fault_env structure. Pointers to entries for other offsets
|
||||
should be calculated relative to "pte".
|
||||
|
||||
->page_mkwrite() is called when a previously read-only pte is
|
||||
about to become writeable. The filesystem again must ensure that there are
|
||||
|
||||
@@ -49,6 +49,7 @@ These block devices may be used for inspiration:
|
||||
- axonram: Axon DDR2 device driver
|
||||
- brd: RAM backed block device driver
|
||||
- dcssblk: s390 dcss block device driver
|
||||
- pmem: NVDIMM persistent memory driver
|
||||
|
||||
|
||||
Implementation Tips for Filesystem Writers
|
||||
@@ -75,8 +76,9 @@ calls to get_block() (for example by a page-fault racing with a read()
|
||||
or a write()) work correctly.
|
||||
|
||||
These filesystems may be used for inspiration:
|
||||
- ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt
|
||||
- ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt
|
||||
- ext2: see Documentation/filesystems/ext2.txt
|
||||
- ext4: see Documentation/filesystems/ext4.txt
|
||||
- xfs: see Documentation/filesystems/xfs.txt
|
||||
|
||||
|
||||
Handling Media Errors
|
||||
|
||||
@@ -436,6 +436,7 @@ Private_Dirty: 0 kB
|
||||
Referenced: 892 kB
|
||||
Anonymous: 0 kB
|
||||
AnonHugePages: 0 kB
|
||||
ShmemPmdMapped: 0 kB
|
||||
Shared_Hugetlb: 0 kB
|
||||
Private_Hugetlb: 0 kB
|
||||
Swap: 0 kB
|
||||
@@ -464,6 +465,8 @@ accessed.
|
||||
a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
|
||||
and a page is modified, the file page is replaced by a private anonymous copy.
|
||||
"AnonHugePages" shows the amount of memory backed by transparent hugepage.
|
||||
"ShmemPmdMapped" shows the amount of shared (shmem/tmpfs) memory backed by
|
||||
huge pages.
|
||||
"Shared_Hugetlb" and "Private_Hugetlb" show the amounts of memory backed by
|
||||
hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical
|
||||
reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field.
|
||||
@@ -868,6 +871,9 @@ VmallocTotal: 112216 kB
|
||||
VmallocUsed: 428 kB
|
||||
VmallocChunk: 111088 kB
|
||||
AnonHugePages: 49152 kB
|
||||
ShmemHugePages: 0 kB
|
||||
ShmemPmdMapped: 0 kB
|
||||
|
||||
|
||||
MemTotal: Total usable ram (i.e. physical ram minus a few reserved
|
||||
bits and the kernel binary code)
|
||||
@@ -912,6 +918,9 @@ MemAvailable: An estimate of how much memory is available for starting new
|
||||
AnonHugePages: Non-file backed huge pages mapped into userspace page tables
|
||||
Mapped: files which have been mmaped, such as libraries
|
||||
Shmem: Total memory used by shared memory (shmem) and tmpfs
|
||||
ShmemHugePages: Memory used by shared memory (shmem) and tmpfs allocated
|
||||
with huge pages
|
||||
ShmemPmdMapped: Shared memory mapped into userspace with huge pages
|
||||
Slab: in-kernel data structures cache
|
||||
SReclaimable: Part of Slab, that might be reclaimed, such as caches
|
||||
SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure
|
||||
|
||||
@@ -592,9 +592,14 @@ struct address_space_operations {
|
||||
int (*releasepage) (struct page *, int);
|
||||
void (*freepage)(struct page *);
|
||||
ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
|
||||
/* isolate a page for migration */
|
||||
bool (*isolate_page) (struct page *, isolate_mode_t);
|
||||
/* migrate the contents of a page to the specified target */
|
||||
int (*migratepage) (struct page *, struct page *);
|
||||
/* put migration-failed page back to right list */
|
||||
void (*putback_page) (struct page *);
|
||||
int (*launder_page) (struct page *);
|
||||
|
||||
int (*is_partially_uptodate) (struct page *, unsigned long,
|
||||
unsigned long);
|
||||
void (*is_dirty_writeback) (struct page *, bool *, bool *);
|
||||
@@ -747,6 +752,10 @@ struct address_space_operations {
|
||||
and transfer data directly between the storage and the
|
||||
application's address space.
|
||||
|
||||
isolate_page: Called by the VM when isolating a movable non-lru page.
|
||||
If page is successfully isolated, VM marks the page as PG_isolated
|
||||
via __SetPageIsolated.
|
||||
|
||||
migrate_page: This is used to compact the physical memory usage.
|
||||
If the VM wants to relocate a page (maybe off a memory card
|
||||
that is signalling imminent failure) it will pass a new page
|
||||
@@ -754,6 +763,8 @@ struct address_space_operations {
|
||||
transfer any private data across and update any references
|
||||
that it has to the page.
|
||||
|
||||
putback_page: Called by the VM when isolated page's migration fails.
|
||||
|
||||
launder_page: Called before freeing a page - it writes back the dirty page. To
|
||||
prevent redirtying the page, it is kept locked during the whole
|
||||
operation.
|
||||
|
||||
@@ -142,5 +142,111 @@ Steps:
|
||||
20. The new page is moved to the LRU and can be scanned by the swapper
|
||||
etc again.
|
||||
|
||||
Christoph Lameter, May 8, 2006.
|
||||
C. Non-LRU page migration
|
||||
-------------------------
|
||||
|
||||
Although original migration aimed for reducing the latency of memory access
|
||||
for NUMA, compaction, which wants to create high-order pages, is also a main customer.
|
||||
|
||||
Current problem of the implementation is that it is designed to migrate only
|
||||
*LRU* pages. However, there are potential non-lru pages which can be migrated
|
||||
in drivers, for example, zsmalloc, virtio-balloon pages.
|
||||
|
||||
For virtio-balloon pages, some parts of migration code path have been hooked
|
||||
up and added virtio-balloon specific functions to intercept migration logics.
|
||||
It's too specific to a driver so other drivers who want to make their pages
|
||||
movable would have to add their own specific hooks in the migration path.
|
||||
|
||||
To overcome the problem, VM supports non-LRU page migration which provides
|
||||
generic functions for non-LRU movable pages without driver specific hooks
|
||||
migration path.
|
||||
|
||||
If a driver wants to make its own pages movable, it should define three functions
|
||||
which are function pointers of struct address_space_operations.
|
||||
|
||||
1. bool (*isolate_page) (struct page *page, isolate_mode_t mode);
|
||||
|
||||
What VM expects on isolate_page function of driver is to return *true*
|
||||
if driver isolates page successfully. On returning true, VM marks the page
|
||||
as PG_isolated so concurrent isolation in several CPUs skip the page
|
||||
for isolation. If a driver cannot isolate the page, it should return *false*.
|
||||
|
||||
Once page is successfully isolated, VM uses page.lru fields so driver
|
||||
shouldn't expect to preserve values in those fields.
|
||||
|
||||
2. int (*migratepage) (struct address_space *mapping,
|
||||
struct page *newpage, struct page *oldpage, enum migrate_mode);
|
||||
|
||||
After isolation, VM calls migratepage of driver with isolated page.
|
||||
The function of migratepage is to move content of the old page to new page
|
||||
and set up fields of struct page newpage. Keep in mind that you should
|
||||
indicate to the VM the oldpage is no longer movable via __ClearPageMovable()
|
||||
under page_lock if you migrated the oldpage successfully and returns
|
||||
MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver
|
||||
can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time
|
||||
because VM interprets -EAGAIN as "temporal migration failure". On returning
|
||||
any error except -EAGAIN, VM will give up the page migration without retrying
|
||||
in this time.
|
||||
|
||||
Driver shouldn't touch page.lru field VM using in the functions.
|
||||
|
||||
3. void (*putback_page)(struct page *);
|
||||
|
||||
If migration fails on isolated page, VM should return the isolated page
|
||||
to the driver so VM calls driver's putback_page with migration failed page.
|
||||
In this function, driver should put the isolated page back to the own data
|
||||
structure.
|
||||
|
||||
4. non-lru movable page flags
|
||||
|
||||
There are two page flags for supporting non-lru movable page.
|
||||
|
||||
* PG_movable
|
||||
|
||||
Driver should use the below function to make page movable under page_lock.
|
||||
|
||||
void __SetPageMovable(struct page *page, struct address_space *mapping)
|
||||
|
||||
It needs argument of address_space for registering migration family functions
|
||||
which will be called by VM. Exactly speaking, PG_movable is not a real flag of
|
||||
struct page. Rather than, VM reuses page->mapping's lower bits to represent it.
|
||||
|
||||
#define PAGE_MAPPING_MOVABLE 0x2
|
||||
page->mapping = page->mapping | PAGE_MAPPING_MOVABLE;
|
||||
|
||||
so driver shouldn't access page->mapping directly. Instead, driver should
|
||||
use page_mapping which mask off the low two bits of page->mapping under
|
||||
page lock so it can get right struct address_space.
|
||||
|
||||
For testing of non-lru movable page, VM supports __PageMovable function.
|
||||
However, it doesn't guarantee to identify non-lru movable page because
|
||||
page->mapping field is unified with other variables in struct page.
|
||||
As well, if driver releases the page after isolation by VM, page->mapping
|
||||
doesn't have stable value although it has PAGE_MAPPING_MOVABLE
|
||||
(Look at __ClearPageMovable). But __PageMovable is cheap to catch whether
|
||||
page is LRU or non-lru movable once the page has been isolated. Because
|
||||
LRU pages never can have PAGE_MAPPING_MOVABLE in page->mapping. It is also
|
||||
good for just peeking to test non-lru movable pages before more expensive
|
||||
checking with lock_page in pfn scanning to select victim.
|
||||
|
||||
For guaranteeing non-lru movable page, VM provides PageMovable function.
|
||||
Unlike __PageMovable, PageMovable functions validates page->mapping and
|
||||
mapping->a_ops->isolate_page under lock_page. The lock_page prevents sudden
|
||||
destroying of page->mapping.
|
||||
|
||||
Driver using __SetPageMovable should clear the flag via __ClearPageMovable
|
||||
under page_lock before the releasing the page.
|
||||
|
||||
* PG_isolated
|
||||
|
||||
To prevent concurrent isolation among several CPUs, VM marks isolated page
|
||||
as PG_isolated under lock_page. So if a CPU encounters PG_isolated non-lru
|
||||
movable page, it can skip it. Driver doesn't need to manipulate the flag
|
||||
because VM will set/clear it automatically. Keep in mind that if driver
|
||||
sees PG_isolated page, it means the page have been isolated by VM so it
|
||||
shouldn't touch page.lru field.
|
||||
PG_isolated is alias with PG_reclaim flag so driver shouldn't use the flag
|
||||
for own purpose.
|
||||
|
||||
Christoph Lameter, May 8, 2006.
|
||||
Minchan Kim, Mar 28, 2016.
|
||||
|
||||
@@ -9,8 +9,8 @@ using huge pages for the backing of virtual memory with huge pages
|
||||
that supports the automatic promotion and demotion of page sizes and
|
||||
without the shortcomings of hugetlbfs.
|
||||
|
||||
Currently it only works for anonymous memory mappings but in the
|
||||
future it can expand over the pagecache layer starting with tmpfs.
|
||||
Currently it only works for anonymous memory mappings and tmpfs/shmem.
|
||||
But in the future it can expand to other filesystems.
|
||||
|
||||
The reason applications are running faster is because of two
|
||||
factors. The first factor is almost completely irrelevant and it's not
|
||||
@@ -57,10 +57,6 @@ miss is going to run faster.
|
||||
feature that applies to all dynamic high order allocations in the
|
||||
kernel)
|
||||
|
||||
- this initial support only offers the feature in the anonymous memory
|
||||
regions but it'd be ideal to move it to tmpfs and the pagecache
|
||||
later
|
||||
|
||||
Transparent Hugepage Support maximizes the usefulness of free memory
|
||||
if compared to the reservation approach of hugetlbfs by allowing all
|
||||
unused memory to be used as cache or other movable (or even unmovable
|
||||
@@ -94,21 +90,21 @@ madvise(MADV_HUGEPAGE) on their critical mmapped regions.
|
||||
|
||||
== sysfs ==
|
||||
|
||||
Transparent Hugepage Support can be entirely disabled (mostly for
|
||||
debugging purposes) or only enabled inside MADV_HUGEPAGE regions (to
|
||||
avoid the risk of consuming more memory resources) or enabled system
|
||||
wide. This can be achieved with one of:
|
||||
Transparent Hugepage Support for anonymous memory can be entirely disabled
|
||||
(mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE
|
||||
regions (to avoid the risk of consuming more memory resources) or enabled
|
||||
system wide. This can be achieved with one of:
|
||||
|
||||
echo always >/sys/kernel/mm/transparent_hugepage/enabled
|
||||
echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
|
||||
echo never >/sys/kernel/mm/transparent_hugepage/enabled
|
||||
|
||||
It's also possible to limit defrag efforts in the VM to generate
|
||||
hugepages in case they're not immediately free to madvise regions or
|
||||
to never try to defrag memory and simply fallback to regular pages
|
||||
unless hugepages are immediately available. Clearly if we spend CPU
|
||||
time to defrag memory, we would expect to gain even more by the fact
|
||||
we use hugepages later instead of regular pages. This isn't always
|
||||
anonymous hugepages in case they're not immediately free to madvise
|
||||
regions or to never try to defrag memory and simply fallback to regular
|
||||
pages unless hugepages are immediately available. Clearly if we spend CPU
|
||||
time to defrag memory, we would expect to gain even more by the fact we
|
||||
use hugepages later instead of regular pages. This isn't always
|
||||
guaranteed, but it may be more likely in case the allocation is for a
|
||||
MADV_HUGEPAGE region.
|
||||
|
||||
@@ -133,9 +129,9 @@ that are have used madvise(MADV_HUGEPAGE). This is the default behaviour.
|
||||
|
||||
"never" should be self-explanatory.
|
||||
|
||||
By default kernel tries to use huge zero page on read page fault.
|
||||
It's possible to disable huge zero page by writing 0 or enable it
|
||||
back by writing 1:
|
||||
By default kernel tries to use huge zero page on read page fault to
|
||||
anonymous mapping. It's possible to disable huge zero page by writing 0
|
||||
or enable it back by writing 1:
|
||||
|
||||
echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page
|
||||
echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page
|
||||
@@ -204,21 +200,67 @@ Support by passing the parameter "transparent_hugepage=always" or
|
||||
"transparent_hugepage=madvise" or "transparent_hugepage=never"
|
||||
(without "") to the kernel command line.
|
||||
|
||||
== Hugepages in tmpfs/shmem ==
|
||||
|
||||
You can control hugepage allocation policy in tmpfs with mount option
|
||||
"huge=". It can have following values:
|
||||
|
||||
- "always":
|
||||
Attempt to allocate huge pages every time we need a new page;
|
||||
|
||||
- "never":
|
||||
Do not allocate huge pages;
|
||||
|
||||
- "within_size":
|
||||
Only allocate huge page if it will be fully within i_size.
|
||||
Also respect fadvise()/madvise() hints;
|
||||
|
||||
  - "advise":
|
||||
Only allocate huge pages if requested with fadvise()/madvise();
|
||||
|
||||
The default policy is "never".
|
||||
|
||||
"mount -o remount,huge= /mountpoint" works fine after mount: remounting
|
||||
huge=never will not attempt to break up huge pages at all, just stop more
|
||||
from being allocated.
|
||||
|
||||
There's also sysfs knob to control hugepage allocation policy for internal
|
||||
shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount
|
||||
is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or
|
||||
MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem.
|
||||
|
||||
In addition to policies listed above, shmem_enabled allows two further
|
||||
values:
|
||||
|
||||
- "deny":
|
||||
For use in emergencies, to force the huge option off from
|
||||
all mounts;
|
||||
- "force":
|
||||
Force the huge option on for all - very useful for testing;
|
||||
|
||||
== Need of application restart ==
|
||||
|
||||
The transparent_hugepage/enabled values only affect future
|
||||
behavior. So to make them effective you need to restart any
|
||||
application that could have been using hugepages. This also applies to
|
||||
the regions registered in khugepaged.
|
||||
The transparent_hugepage/enabled values and tmpfs mount option only affect
|
||||
future behavior. So to make them effective you need to restart any
|
||||
application that could have been using hugepages. This also applies to the
|
||||
regions registered in khugepaged.
|
||||
|
||||
== Monitoring usage ==
|
||||
|
||||
The number of transparent huge pages currently used by the system is
|
||||
available by reading the AnonHugePages field in /proc/meminfo. To
|
||||
identify what applications are using transparent huge pages, it is
|
||||
necessary to read /proc/PID/smaps and count the AnonHugePages fields
|
||||
for each mapping. Note that reading the smaps file is expensive and
|
||||
reading it frequently will incur overhead.
|
||||
The number of anonymous transparent huge pages currently used by the
|
||||
system is available by reading the AnonHugePages field in /proc/meminfo.
|
||||
To identify what applications are using anonymous transparent huge pages,
|
||||
it is necessary to read /proc/PID/smaps and count the AnonHugePages fields
|
||||
for each mapping.
|
||||
|
||||
The number of file transparent huge pages mapped to userspace is available
|
||||
by reading ShmemPmdMapped and ShmemHugePages fields in /proc/meminfo.
|
||||
To identify what applications are mapping file transparent huge pages, it
|
||||
is necessary to read /proc/PID/smaps and count the FileHugeMapped fields
|
||||
for each mapping.
|
||||
|
||||
Note that reading the smaps file is expensive and reading it
|
||||
frequently will incur overhead.
|
||||
|
||||
There are a number of counters in /proc/vmstat that may be used to
|
||||
monitor how successfully the system is providing huge pages for use.
|
||||
@@ -238,6 +280,12 @@ thp_collapse_alloc_failed is incremented if khugepaged found a range
|
||||
of pages that should be collapsed into one huge page but failed
|
||||
the allocation.
|
||||
|
||||
thp_file_alloc is incremented every time a file huge page is successfully
|
||||
  allocated.
|
||||
|
||||
thp_file_mapped is incremented every time a file huge page is mapped into
|
||||
user address space.
|
||||
|
||||
thp_split_page is incremented every time a huge page is split into base
|
||||
pages. This can happen for a variety of reasons but a common
|
||||
reason is that a huge page is old and is being reclaimed.
|
||||
@@ -403,19 +451,27 @@ pages:
|
||||
on relevant sub-page of the compound page.
|
||||
|
||||
- map/unmap of the whole compound page accounted in compound_mapcount
|
||||
(stored in first tail page).
|
||||
(stored in first tail page). For file huge pages, we also increment
|
||||
->_mapcount of all sub-pages in order to have race-free detection of
|
||||
last unmap of subpages.
|
||||
|
||||
PageDoubleMap() indicates that ->_mapcount in all subpages is offset up by one.
|
||||
This additional reference is required to get race-free detection of unmap of
|
||||
subpages when we have them mapped with both PMDs and PTEs.
|
||||
PageDoubleMap() indicates that the page is *possibly* mapped with PTEs.
|
||||
|
||||
For anonymous pages PageDoubleMap() also indicates ->_mapcount in all
|
||||
subpages is offset up by one. This additional reference is required to
|
||||
get race-free detection of unmap of subpages when we have them mapped with
|
||||
both PMDs and PTEs.
|
||||
|
||||
This is optimization required to lower overhead of per-subpage mapcount
|
||||
tracking. The alternative is alter ->_mapcount in all subpages on each
|
||||
map/unmap of the whole compound page.
|
||||
|
||||
We set PG_double_map when a PMD of the page got split for the first time,
|
||||
but still have PMD mapping. The additional references go away with last
|
||||
compound_mapcount.
|
||||
For anonymous pages, we set PG_double_map when a PMD of the page got split
|
||||
for the first time, but still have PMD mapping. The additional references
|
||||
go away with last compound_mapcount.
|
||||
|
||||
File pages get PG_double_map set on first map of the page with PTE and
|
||||
goes away when the page gets evicted from page cache.
|
||||
|
||||
split_huge_page internally has to distribute the refcounts in the head
|
||||
page to the tail pages before clearing all PG_head/tail bits from the page
|
||||
@@ -427,7 +483,7 @@ sum of mapcount of all sub-pages plus one (split_huge_page caller must
|
||||
have reference for head page).
|
||||
|
||||
split_huge_page uses migration entries to stabilize page->_refcount and
|
||||
page->_mapcount.
|
||||
page->_mapcount of anonymous pages. File pages just got unmapped.
|
||||
|
||||
We are safe against physical memory scanners too: the only legitimate way
|
||||
scanner can get reference to a page is get_page_unless_zero().
|
||||
|
||||
@@ -461,6 +461,27 @@ unevictable LRU is enabled, the work of compaction is mostly handled by
|
||||
the page migration code and the same work flow as described in MIGRATING
|
||||
MLOCKED PAGES will apply.
|
||||
|
||||
MLOCKING TRANSPARENT HUGE PAGES
|
||||
-------------------------------
|
||||
|
||||
A transparent huge page is represented by a single entry on an LRU list.
|
||||
Therefore, we can only make unevictable an entire compound page, not
|
||||
individual subpages.
|
||||
|
||||
If a user tries to mlock() part of a huge page, we want the rest of the
|
||||
page to be reclaimable.
|
||||
|
||||
We cannot just split the page on partial mlock() as split_huge_page() can
|
||||
fail and new intermittent failure mode for the syscall is undesirable.
|
||||
|
||||
We handle this by keeping PTE-mapped huge pages on normal LRU lists: the
|
||||
PMD on border of VM_LOCKED VMA will be split into PTE table.
|
||||
|
||||
This way the huge page is accessible for vmscan. Under memory pressure the
|
||||
page will be split, subpages which belong to VM_LOCKED VMAs will be moved
|
||||
to unevictable LRU and the rest can be reclaimed.
|
||||
|
||||
See also comment in follow_trans_huge_pmd().
|
||||
|
||||
mmap(MAP_LOCKED) SYSTEM CALL HANDLING
|
||||
-------------------------------------
|
||||
|
||||
@@ -647,41 +647,28 @@ ifneq ($(CONFIG_FRAME_WARN),0)
|
||||
KBUILD_CFLAGS += $(call cc-option,-Wframe-larger-than=${CONFIG_FRAME_WARN})
|
||||
endif
|
||||
|
||||
# Handle stack protector mode.
|
||||
#
|
||||
# Since kbuild can potentially perform two passes (first with the old
|
||||
# .config values and then with updated .config values), we cannot error out
|
||||
# if a desired compiler option is unsupported. If we were to error, kbuild
|
||||
# could never get to the second pass and actually notice that we changed
|
||||
# the option to something that was supported.
|
||||
#
|
||||
# Additionally, we don't want to fallback and/or silently change which compiler
|
||||
# flags will be used, since that leads to producing kernels with different
|
||||
# security feature characteristics depending on the compiler used. ("But I
|
||||
# selected CC_STACKPROTECTOR_STRONG! Why did it build with _REGULAR?!")
|
||||
#
|
||||
# The middle ground is to warn here so that the failed option is obvious, but
|
||||
# to let the build fail with bad compiler flags so that we can't produce a
|
||||
# kernel when there is a CONFIG and compiler mismatch.
|
||||
#
|
||||
# This selects the stack protector compiler flag. Testing it is delayed
|
||||
# until after .config has been reprocessed, in the prepare-compiler-check
|
||||
# target.
|
||||
ifdef CONFIG_CC_STACKPROTECTOR_REGULAR
|
||||
stackp-flag := -fstack-protector
|
||||
ifeq ($(call cc-option, $(stackp-flag)),)
|
||||
$(warning Cannot use CONFIG_CC_STACKPROTECTOR_REGULAR: \
|
||||
-fstack-protector not supported by compiler)
|
||||
endif
|
||||
stackp-name := REGULAR
|
||||
else
|
||||
ifdef CONFIG_CC_STACKPROTECTOR_STRONG
|
||||
stackp-flag := -fstack-protector-strong
|
||||
ifeq ($(call cc-option, $(stackp-flag)),)
|
||||
$(warning Cannot use CONFIG_CC_STACKPROTECTOR_STRONG: \
|
||||
-fstack-protector-strong not supported by compiler)
|
||||
endif
|
||||
stackp-name := STRONG
|
||||
else
|
||||
# Force off for distro compilers that enable stack protector by default.
|
||||
stackp-flag := $(call cc-option, -fno-stack-protector)
|
||||
endif
|
||||
endif
|
||||
# Find arch-specific stack protector compiler sanity-checking script.
|
||||
ifdef CONFIG_CC_STACKPROTECTOR
|
||||
stackp-path := $(srctree)/scripts/gcc-$(ARCH)_$(BITS)-has-stack-protector.sh
|
||||
ifneq ($(wildcard $(stackp-path)),)
|
||||
stackp-check := $(stackp-path)
|
||||
endif
|
||||
endif
|
||||
KBUILD_CFLAGS += $(stackp-flag)
|
||||
|
||||
ifdef CONFIG_KCOV
|
||||
@@ -1017,8 +1004,10 @@ ifneq ($(KBUILD_SRC),)
|
||||
fi;
|
||||
endif
|
||||
|
||||
# prepare2 creates a makefile if using a separate output directory
|
||||
prepare2: prepare3 outputmakefile asm-generic
|
||||
# prepare2 creates a makefile if using a separate output directory.
|
||||
# From this point forward, .config has been reprocessed, so any rules
|
||||
# that need to depend on updated CONFIG_* values can be checked here.
|
||||
prepare2: prepare3 prepare-compiler-check outputmakefile asm-generic
|
||||
|
||||
prepare1: prepare2 $(version_h) include/generated/utsrelease.h \
|
||||
include/config/auto.conf
|
||||
@@ -1049,6 +1038,32 @@ endif
|
||||
PHONY += prepare-objtool
|
||||
prepare-objtool: $(objtool_target)
|
||||
|
||||
# Check for CONFIG flags that require compiler support. Abort the build
|
||||
# after .config has been processed, but before the kernel build starts.
|
||||
#
|
||||
# For security-sensitive CONFIG options, we don't want to fallback and/or
|
||||
# silently change which compiler flags will be used, since that leads to
|
||||
# producing kernels with different security feature characteristics
|
||||
# depending on the compiler used. (For example, "But I selected
|
||||
# CC_STACKPROTECTOR_STRONG! Why did it build with _REGULAR?!")
|
||||
PHONY += prepare-compiler-check
|
||||
prepare-compiler-check: FORCE
|
||||
# Make sure compiler supports requested stack protector flag.
|
||||
ifdef stackp-name
|
||||
ifeq ($(call cc-option, $(stackp-flag)),)
|
||||
@echo Cannot use CONFIG_CC_STACKPROTECTOR_$(stackp-name): \
|
||||
$(stackp-flag) not supported by compiler >&2 && exit 1
|
||||
endif
|
||||
endif
|
||||
# Make sure compiler does not have buggy stack-protector support.
|
||||
ifdef stackp-check
|
||||
ifneq ($(shell $(CONFIG_SHELL) $(stackp-check) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y)
|
||||
@echo Cannot use CONFIG_CC_STACKPROTECTOR_$(stackp-name): \
|
||||
$(stackp-flag) available but compiler is broken >&2 && exit 1
|
||||
endif
|
||||
endif
|
||||
@:
|
||||
|
||||
# Generate some files
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@@ -147,7 +147,7 @@ retry:
|
||||
/* If for any reason at all we couldn't handle the fault,
|
||||
make sure we exit gracefully rather than endlessly redo
|
||||
the fault. */
|
||||
fault = handle_mm_fault(mm, vma, address, flags);
|
||||
fault = handle_mm_fault(vma, address, flags);
|
||||
|
||||
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
|
||||
return;
|
||||
|
||||
+1
-1
@@ -137,7 +137,7 @@ good_area:
|
||||
* make sure we exit gracefully rather than endlessly redo
|
||||
* the fault.
|
||||
*/
|
||||
fault = handle_mm_fault(mm, vma, address, flags);
|
||||
fault = handle_mm_fault(vma, address, flags);
|
||||
|
||||
/* If Pagefault was interrupted by SIGKILL, exit page fault "early" */
|
||||
if (unlikely(fatal_signal_pending(current))) {
|
||||
|
||||
@@ -57,7 +57,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
|
||||
extern pgd_t *pgd_alloc(struct mm_struct *mm);
|
||||
extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
|
||||
|
||||
#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
|
||||
#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
|
||||
|
||||
static inline void clean_pte_table(pte_t *pte)
|
||||
{
|
||||
|
||||
@@ -209,17 +209,38 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
|
||||
tlb_flush(tlb);
|
||||
}
|
||||
|
||||
static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
|
||||
static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
|
||||
{
|
||||
if (tlb->nr == tlb->max)
|
||||
return true;
|
||||
tlb->pages[tlb->nr++] = page;
|
||||
VM_BUG_ON(tlb->nr > tlb->max);
|
||||
return tlb->max - tlb->nr;
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
|
||||
{
|
||||
if (!__tlb_remove_page(tlb, page))
|
||||
if (__tlb_remove_page(tlb, page)) {
|
||||
tlb_flush_mmu(tlb);
|
||||
__tlb_remove_page(tlb, page);
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
|
||||
struct page *page, int page_size)
|
||||
{
|
||||
return __tlb_remove_page(tlb, page);
|
||||
}
|
||||
|
||||
static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
|
||||
struct page *page)
|
||||
{
|
||||
return __tlb_remove_page(tlb, page);
|
||||
}
|
||||
|
||||
static inline void tlb_remove_page_size(struct mmu_gather *tlb,
|
||||
struct page *page, int page_size)
|
||||
{
|
||||
return tlb_remove_page(tlb, page);
|
||||
}
|
||||
|
||||
static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
|
||||
|
||||
+1
-1
@@ -243,7 +243,7 @@ good_area:
|
||||
goto out;
|
||||
}
|
||||
|
||||
return handle_mm_fault(mm, vma, addr & PAGE_MASK, flags);
|
||||
return handle_mm_fault(vma, addr & PAGE_MASK, flags);
|
||||
|
||||
check_stack:
|
||||
/* Don't allow expansion below FIRST_USER_ADDRESS */
|
||||
|
||||
+1
-1
@@ -23,7 +23,7 @@
|
||||
#define __pgd_alloc() kmalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL)
|
||||
#define __pgd_free(pgd) kfree(pgd)
|
||||
#else
|
||||
#define __pgd_alloc() (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_REPEAT, 2)
|
||||
#define __pgd_alloc() (pgd_t *)__get_free_pages(GFP_KERNEL, 2)
|
||||
#define __pgd_free(pgd) free_pages((unsigned long)pgd, 2)
|
||||
#endif
|
||||
|
||||
|
||||
@@ -233,7 +233,7 @@ good_area:
|
||||
goto out;
|
||||
}
|
||||
|
||||
return handle_mm_fault(mm, vma, addr & PAGE_MASK, mm_flags);
|
||||
return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags);
|
||||
|
||||
check_stack:
|
||||
if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr))
|
||||
|
||||
@@ -134,7 +134,7 @@ good_area:
|
||||
* sure we exit gracefully rather than endlessly redo the
|
||||
* fault.
|
||||
*/
|
||||
fault = handle_mm_fault(mm, vma, address, flags);
|
||||
fault = handle_mm_fault(vma, address, flags);
|
||||
|
||||
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
|
||||
return;
|
||||
|
||||
@@ -168,7 +168,7 @@ retry:
|
||||
* the fault.
|
||||
*/
|
||||
|
||||
fault = handle_mm_fault(mm, vma, address, flags);
|
||||
fault = handle_mm_fault(vma, address, flags);
|
||||
|
||||
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
|
||||
return;
|
||||
|
||||
+1
-1
@@ -164,7 +164,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
|
||||
* make sure we exit gracefully rather than endlessly redo
|
||||
* the fault.
|
||||
*/
|
||||
fault = handle_mm_fault(mm, vma, ear0, flags);
|
||||
fault = handle_mm_fault(vma, ear0, flags);
|
||||
if (unlikely(fault & VM_FAULT_ERROR)) {
|
||||
if (fault & VM_FAULT_OOM)
|
||||
goto out_of_memory;
|
||||
|
||||
@@ -101,7 +101,7 @@ good_area:
|
||||
break;
|
||||
}
|
||||
|
||||
fault = handle_mm_fault(mm, vma, address, flags);
|
||||
fault = handle_mm_fault(vma, address, flags);
|
||||
|
||||
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
|
||||
return;
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user