You've already forked linux-apfs
mirror of
https://github.com/linux-apfs/linux-apfs.git
synced 2026-05-01 15:00:59 -07:00
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton: - various misc bits - DAX updates - OCFS2 - most of MM * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (119 commits) mm,fork: introduce MADV_WIPEONFORK x86,mpx: make mpx depend on x86-64 to free up VMA flag mm: add /proc/pid/smaps_rollup mm: hugetlb: clear target sub-page last when clearing huge page mm: oom: let oom_reap_task and exit_mmap run concurrently swap: choose swap device according to numa node mm: replace TIF_MEMDIE checks by tsk_is_oom_victim mm, oom: do not rely on TIF_MEMDIE for memory reserves access z3fold: use per-cpu unbuddied lists mm, swap: don't use VMA based swap readahead if HDD is used as swap mm, swap: add sysfs interface for VMA based swap readahead mm, swap: VMA based swap readahead mm, swap: fix swap readahead marking mm, swap: add swap readahead hit statistics mm/vmalloc.c: don't reinvent the wheel but use existing llist API mm/vmstat.c: fix wrong comment selftests/memfd: add memfd_create hugetlbfs selftest mm/shmem: add hugetlbfs support to memfd_create() mm, devm_memremap_pages: use multi-order radix for ZONE_DEVICE lookups mm/vmalloc.c: halve the number of comparisons performed in pcpu_get_vm_areas() ...
This commit is contained in:
@@ -0,0 +1,31 @@
|
||||
What: /proc/pid/smaps_rollup
|
||||
Date: August 2017
|
||||
Contact: Daniel Colascione <dancol@google.com>
|
||||
Description:
|
||||
This file provides pre-summed memory information for a
|
||||
process. The format is identical to /proc/pid/smaps,
|
||||
except instead of an entry for each VMA in a process,
|
||||
smaps_rollup has a single entry (tagged "[rollup]")
|
||||
for which each field is the sum of the corresponding
|
||||
fields from all the maps in /proc/pid/smaps.
|
||||
For more details, see the procfs man page.
|
||||
|
||||
Typical output looks like this:
|
||||
|
||||
00100000-ff709000 ---p 00000000 00:00 0 [rollup]
|
||||
Rss: 884 kB
|
||||
Pss: 385 kB
|
||||
Shared_Clean: 696 kB
|
||||
Shared_Dirty: 0 kB
|
||||
Private_Clean: 120 kB
|
||||
Private_Dirty: 68 kB
|
||||
Referenced: 884 kB
|
||||
Anonymous: 68 kB
|
||||
LazyFree: 0 kB
|
||||
AnonHugePages: 0 kB
|
||||
ShmemPmdMapped: 0 kB
|
||||
Shared_Hugetlb: 0 kB
|
||||
Private_Hugetlb: 0 kB
|
||||
Swap: 0 kB
|
||||
SwapPss: 0 kB
|
||||
Locked: 385 kB
|
||||
@@ -90,3 +90,11 @@ Description:
|
||||
device's debugging info useful for kernel developers. Its
|
||||
format is not documented intentionally and may change
|
||||
anytime without any notice.
|
||||
|
||||
What: /sys/block/zram<id>/backing_dev
|
||||
Date: June 2017
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The backing_dev file is read-write and sets up the backing
|
||||
device for zram to write incompressible pages.
|
||||
To use it, the user should enable CONFIG_ZRAM_WRITEBACK.
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
What: /sys/kernel/mm/swap/
|
||||
Date: August 2017
|
||||
Contact: Linux memory management mailing list <linux-mm@kvack.org>
|
||||
Description: Interface for swapping
|
||||
|
||||
What: /sys/kernel/mm/swap/vma_ra_enabled
|
||||
Date: August 2017
|
||||
Contact: Linux memory management mailing list <linux-mm@kvack.org>
|
||||
Description: Enable/disable VMA based swap readahead.
|
||||
|
||||
If set to true, the VMA based swap readahead algorithm
|
||||
will be used for swappable anonymous pages mapped in a
|
||||
VMA, and the global swap readahead algorithm will be
|
||||
still used for other users such as tmpfs. If set to
|
||||
false, the global swap readahead algorithm will be
|
||||
used for all swappable pages.
|
||||
|
||||
What: /sys/kernel/mm/swap/vma_ra_max_order
|
||||
Date: August 2017
|
||||
Contact: Linux memory management mailing list <linux-mm@kvack.org>
|
||||
Description: The max readahead size in order for VMA based swap readahead
|
||||
|
||||
The VMA based swap readahead algorithm will read ahead at
|
||||
most 1 << max_order pages for each readahead. The
|
||||
real readahead size for each readahead will be scaled
|
||||
according to the estimation algorithm.
|
||||
@@ -2783,7 +2783,7 @@
|
||||
Allowed values are enable and disable
|
||||
|
||||
numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
|
||||
one of ['zone', 'node', 'default'] can be specified
|
||||
'node', 'default' can be specified
|
||||
This can be set from sysctl after boot.
|
||||
See Documentation/sysctl/vm.txt for details.
|
||||
|
||||
|
||||
@@ -168,6 +168,7 @@ max_comp_streams RW the number of possible concurrent compress operations
|
||||
comp_algorithm RW show and change the compression algorithm
|
||||
compact WO trigger memory compaction
|
||||
debug_stat RO this file is used for zram debugging purposes
|
||||
backing_dev RW set up backend storage for zram to write out
|
||||
|
||||
|
||||
User space is advised to use the following files to read the device statistics.
|
||||
@@ -231,5 +232,15 @@ line of text and contains the following stats separated by whitespace:
|
||||
resets the disksize to zero. You must set the disksize again
|
||||
before reusing the device.
|
||||
|
||||
* Optional Feature
|
||||
|
||||
= writeback
|
||||
|
||||
With incompressible pages, there is no memory saving with zram.
|
||||
Instead, with CONFIG_ZRAM_WRITEBACK, zram can write an incompressible page
|
||||
to backing storage rather than keeping it in memory.
|
||||
User should set up backing device via /sys/block/zramX/backing_dev
|
||||
before disksize setting.
|
||||
|
||||
Nitin Gupta
|
||||
ngupta@vflare.org
|
||||
|
||||
@@ -151,8 +151,6 @@ To define an object, a structure of the following type should be filled out:
|
||||
void (*mark_pages_cached)(void *cookie_netfs_data,
|
||||
struct address_space *mapping,
|
||||
struct pagevec *cached_pvec);
|
||||
|
||||
void (*now_uncached)(void *cookie_netfs_data);
|
||||
};
|
||||
|
||||
This has the following fields:
|
||||
|
||||
@@ -63,9 +63,8 @@ Filesystem support consists of
|
||||
- implementing an mmap file operation for DAX files which sets the
|
||||
VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
|
||||
include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These
|
||||
handlers should probably call dax_iomap_fault() (for fault and page_mkwrite
|
||||
handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate
|
||||
iomap operations.
|
||||
handlers should probably call dax_iomap_fault() passing the appropriate
|
||||
fault size and iomap operations.
|
||||
- calling iomap_zero_range() passing appropriate iomap operations instead of
|
||||
block_truncate_page() for DAX files
|
||||
- ensuring that there is sufficient locking between reads, writes,
|
||||
|
||||
@@ -572,7 +572,9 @@ See Documentation/nommu-mmap.txt for more information.
|
||||
|
||||
numa_zonelist_order
|
||||
|
||||
This sysctl is only for NUMA.
|
||||
This sysctl is only for NUMA and it is deprecated. Anything but
|
||||
Node order will fail!
|
||||
|
||||
'where the memory is allocated from' is controlled by zonelists.
|
||||
(This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation.
|
||||
you may be able to read ZONE_DMA as ZONE_DMA32...)
|
||||
|
||||
@@ -79,11 +79,8 @@ memory, Linux must decide whether to order the zonelists such that allocations
|
||||
fall back to the same zone type on a different node, or to a different zone
|
||||
type on the same node. This is an important consideration because some zones,
|
||||
such as DMA or DMA32, represent relatively scarce resources. Linux chooses
|
||||
a default zonelist order based on the sizes of the various zone types relative
|
||||
to the total memory of the node and the total memory of the system. The
|
||||
default zonelist order may be overridden using the numa_zonelist_order kernel
|
||||
boot parameter or sysctl. [see Documentation/admin-guide/kernel-parameters.rst and
|
||||
Documentation/sysctl/vm.txt]
|
||||
a default Node ordered zonelist. This means it tries to fall back to other zones
|
||||
from the same node before using remote nodes which are ordered by NUMA distance.
|
||||
|
||||
By default, Linux will attempt to satisfy memory allocation requests from the
|
||||
node to which the CPU that executes the request is assigned. Specifically,
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
Automatically bind swap device to numa node
|
||||
-------------------------------------------
|
||||
|
||||
If the system has more than one swap device and the swap devices have node
|
||||
information, we can make use of this information to decide which swap
|
||||
device to use in get_swap_pages() to get better performance.
|
||||
|
||||
|
||||
How to use this feature
|
||||
-----------------------
|
||||
|
||||
Each swap device has a priority, which decides the order in which devices are used. To make
|
||||
use of automatically binding, there is no need to manipulate priority settings
|
||||
for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and
|
||||
swapB, with swapA attached to node 0 and swapB attached to node 1, are going
|
||||
to be swapped on. Simply swapping them on by doing:
|
||||
# swapon /dev/swapA
|
||||
# swapon /dev/swapB
|
||||
|
||||
Then node 0 will use the two swap devices in the order of swapA then swapB and
|
||||
node 1 will use the two swap devices in the order of swapB then swapA. Note
|
||||
that the order of them being swapped on doesn't matter.
|
||||
|
||||
A more complex example on a 4 node machine. Assume 6 swap devices are going to
|
||||
be swapped on: swapA and swapB are attached to node 0, swapC is attached to
|
||||
node 1, swapD and swapE are attached to node 2 and swapF is attached to node 3.
|
||||
The way to swap them on is the same as above:
|
||||
# swapon /dev/swapA
|
||||
# swapon /dev/swapB
|
||||
# swapon /dev/swapC
|
||||
# swapon /dev/swapD
|
||||
# swapon /dev/swapE
|
||||
# swapon /dev/swapF
|
||||
|
||||
Then node 0 will use them in the order of:
|
||||
swapA/swapB -> swapC -> swapD -> swapE -> swapF
|
||||
swapA and swapB will be used in a round robin mode before any other swap device.
|
||||
|
||||
node 1 will use them in the order of:
|
||||
swapC -> swapA -> swapB -> swapD -> swapE -> swapF
|
||||
|
||||
node 2 will use them in the order of:
|
||||
swapD/swapE -> swapA -> swapB -> swapC -> swapF
|
||||
Similarly, swapD and swapE will be used in a round robin mode before any
|
||||
other swap devices.
|
||||
|
||||
node 3 will use them in the order of:
|
||||
swapF -> swapA -> swapB -> swapC -> swapD -> swapE
|
||||
|
||||
|
||||
Implementation details
|
||||
----------------------
|
||||
|
||||
The current code uses a priority based list, swap_avail_list, to decide
|
||||
which swap device to use and if multiple swap devices share the same
|
||||
priority, they are used round robin. This change here replaces the single
|
||||
global swap_avail_list with a per-numa-node list, i.e. for each numa node,
|
||||
it sees its own priority based list of available swap devices. Swap
|
||||
device's priority can be promoted on its matching node's swap_avail_list.
|
||||
|
||||
The current swap device's priority is set as: user can set a >=0 value,
|
||||
or the system will pick one starting from -1 then downwards. The priority
|
||||
value in the swap_avail_list is the negated value of the swap device's
|
||||
due to plist being sorted from low to high. The new policy doesn't change
|
||||
the semantics for priority >=0 cases, the previous starting from -1 then
|
||||
downwards now becomes starting from -2 then downwards and -1 is reserved
|
||||
as the promoted value. So if multiple swap devices are attached to the same
|
||||
node, they will all be promoted to priority -1 on that node's plist and will
|
||||
be used round robin before any other swap devices.
|
||||
@@ -64,20 +64,12 @@
|
||||
overrides the coredump filter bits */
|
||||
#define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */
|
||||
|
||||
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
|
||||
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
|
||||
|
||||
/* compatibility flags */
|
||||
#define MAP_FILE 0
|
||||
|
||||
/*
|
||||
* When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
|
||||
* This gives us 6 bits, which is enough until someone invents 128 bit address
|
||||
* spaces.
|
||||
*
|
||||
* Assume these are all power of twos.
|
||||
* When 0 use the default page size.
|
||||
*/
|
||||
#define MAP_HUGE_SHIFT 26
|
||||
#define MAP_HUGE_MASK 0x3f
|
||||
|
||||
#define PKEY_DISABLE_ACCESS 0x1
|
||||
#define PKEY_DISABLE_WRITE 0x2
|
||||
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
#ifdef CONFIG_NUMA
|
||||
|
||||
#define cpu_to_node(cpu) ((void)(cpu), 0)
|
||||
#define parent_node(node) ((void)(node), 0)
|
||||
|
||||
#define cpumask_of_node(node) ((void)node, cpu_online_mask)
|
||||
|
||||
|
||||
@@ -91,20 +91,12 @@
|
||||
overrides the coredump filter bits */
|
||||
#define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */
|
||||
|
||||
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
|
||||
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
|
||||
|
||||
/* compatibility flags */
|
||||
#define MAP_FILE 0
|
||||
|
||||
/*
|
||||
* When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
|
||||
* This gives us 6 bits, which is enough until someone invents 128 bit address
|
||||
* spaces.
|
||||
*
|
||||
* Assume these are all power of twos.
|
||||
* When 0 use the default page size.
|
||||
*/
|
||||
#define MAP_HUGE_SHIFT 26
|
||||
#define MAP_HUGE_MASK 0x3f
|
||||
|
||||
#define PKEY_DISABLE_ACCESS 0x1
|
||||
#define PKEY_DISABLE_WRITE 0x2
|
||||
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
|
||||
|
||||
@@ -57,6 +57,9 @@
|
||||
overrides the coredump filter bits */
|
||||
#define MADV_DODUMP 70 /* Clear the MADV_NODUMP flag */
|
||||
|
||||
#define MADV_WIPEONFORK 71 /* Zero memory on fork, child only */
|
||||
#define MADV_KEEPONFORK 72 /* Undo MADV_WIPEONFORK */
|
||||
|
||||
#define MADV_HWPOISON 100 /* poison a page for testing */
|
||||
#define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */
|
||||
|
||||
@@ -64,17 +67,6 @@
|
||||
#define MAP_FILE 0
|
||||
#define MAP_VARIABLE 0
|
||||
|
||||
/*
|
||||
* When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
|
||||
* This gives us 6 bits, which is enough until someone invents 128 bit address
|
||||
* spaces.
|
||||
*
|
||||
* Assume these are all power of twos.
|
||||
* When 0 use the default page size.
|
||||
*/
|
||||
#define MAP_HUGE_SHIFT 26
|
||||
#define MAP_HUGE_MASK 0x3f
|
||||
|
||||
#define PKEY_DISABLE_ACCESS 0x1
|
||||
#define PKEY_DISABLE_WRITE 0x2
|
||||
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
|
||||
|
||||
@@ -29,20 +29,4 @@
|
||||
#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
|
||||
#define MAP_HUGETLB 0x40000 /* create a huge page mapping */
|
||||
|
||||
/*
|
||||
* When MAP_HUGETLB is set, bits [26:31] of the flags argument to mmap(2),
|
||||
* encode the log2 of the huge page size. A value of zero indicates that the
|
||||
* default huge page size should be used. To use a non-default huge page size,
|
||||
* one of these defines can be used, or the size can be encoded by hand. Note
|
||||
* that on most systems only a subset, or possibly none, of these sizes will be
|
||||
* available.
|
||||
*/
|
||||
#define MAP_HUGE_512KB (19 << MAP_HUGE_SHIFT) /* 512KB HugeTLB Page */
|
||||
#define MAP_HUGE_1MB (20 << MAP_HUGE_SHIFT) /* 1MB HugeTLB Page */
|
||||
#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) /* 2MB HugeTLB Page */
|
||||
#define MAP_HUGE_8MB (23 << MAP_HUGE_SHIFT) /* 8MB HugeTLB Page */
|
||||
#define MAP_HUGE_16MB (24 << MAP_HUGE_SHIFT) /* 16MB HugeTLB Page */
|
||||
#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) /* 1GB HugeTLB Page */
|
||||
#define MAP_HUGE_16GB (34 << MAP_HUGE_SHIFT) /* 16GB HugeTLB Page */
|
||||
|
||||
#endif /* _UAPI_ASM_POWERPC_MMAN_H */
|
||||
|
||||
+3
-1
@@ -1806,7 +1806,9 @@ config X86_SMAP
|
||||
config X86_INTEL_MPX
|
||||
prompt "Intel MPX (Memory Protection Extensions)"
|
||||
def_bool n
|
||||
depends on CPU_SUP_INTEL
|
||||
# Note: only available in 64-bit mode due to VMA flags shortage
|
||||
depends on CPU_SUP_INTEL && X86_64
|
||||
select ARCH_USES_HIGH_VMA_FLAGS
|
||||
---help---
|
||||
MPX provides hardware features that can be used in
|
||||
conjunction with compiler-instrumented code to check
|
||||
|
||||
@@ -3,9 +3,6 @@
|
||||
|
||||
#define MAP_32BIT 0x40 /* only give out 32bit addresses */
|
||||
|
||||
#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
|
||||
#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
|
||||
|
||||
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
|
||||
/*
|
||||
* Take the 4 protection key bits out of the vma->vm_flags
|
||||
|
||||
@@ -103,20 +103,12 @@
|
||||
overrides the coredump filter bits */
|
||||
#define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */
|
||||
|
||||
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
|
||||
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
|
||||
|
||||
/* compatibility flags */
|
||||
#define MAP_FILE 0
|
||||
|
||||
/*
|
||||
* When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
|
||||
* This gives us 6 bits, which is enough until someone invents 128 bit address
|
||||
* spaces.
|
||||
*
|
||||
* Assume these are all power of twos.
|
||||
* When 0 use the default page size.
|
||||
*/
|
||||
#define MAP_HUGE_SHIFT 26
|
||||
#define MAP_HUGE_MASK 0x3f
|
||||
|
||||
#define PKEY_DISABLE_ACCESS 0x1
|
||||
#define PKEY_DISABLE_WRITE 0x2
|
||||
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
|
||||
|
||||
+20
-10
@@ -388,6 +388,19 @@ static ssize_t show_phys_device(struct device *dev,
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTREMOVE
|
||||
static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn,
|
||||
unsigned long nr_pages, int online_type,
|
||||
struct zone *default_zone)
|
||||
{
|
||||
struct zone *zone;
|
||||
|
||||
zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
|
||||
if (zone != default_zone) {
|
||||
strcat(buf, " ");
|
||||
strcat(buf, zone->name);
|
||||
}
|
||||
}
|
||||
|
||||
static ssize_t show_valid_zones(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
@@ -395,7 +408,7 @@ static ssize_t show_valid_zones(struct device *dev,
|
||||
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
|
||||
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
|
||||
unsigned long valid_start_pfn, valid_end_pfn;
|
||||
bool append = false;
|
||||
struct zone *default_zone;
|
||||
int nid;
|
||||
|
||||
/*
|
||||
@@ -418,16 +431,13 @@ static ssize_t show_valid_zones(struct device *dev,
|
||||
}
|
||||
|
||||
nid = pfn_to_nid(start_pfn);
|
||||
if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) {
|
||||
strcat(buf, default_zone_for_pfn(nid, start_pfn, nr_pages)->name);
|
||||
append = true;
|
||||
}
|
||||
default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages);
|
||||
strcat(buf, default_zone->name);
|
||||
|
||||
if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) {
|
||||
if (append)
|
||||
strcat(buf, " ");
|
||||
strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name);
|
||||
}
|
||||
print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL,
|
||||
default_zone);
|
||||
print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE,
|
||||
default_zone);
|
||||
out:
|
||||
strcat(buf, "\n");
|
||||
|
||||
|
||||
+5
-1
@@ -326,7 +326,11 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
|
||||
struct page *page, bool is_write)
|
||||
{
|
||||
struct brd_device *brd = bdev->bd_disk->private_data;
|
||||
int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
|
||||
int err;
|
||||
|
||||
if (PageTransHuge(page))
|
||||
return -ENOTSUPP;
|
||||
err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
|
||||
page_endio(page, is_write, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user