You've already forked linux-apfs
mirror of
https://github.com/linux-apfs/linux-apfs.git
synced 2026-05-01 15:00:59 -07:00
Linux-2.6.12-rc2
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
This commit is contained in:
+20
@@ -0,0 +1,20 @@
|
||||
#
|
||||
# Makefile for the linux memory manager.
|
||||
#
|
||||
|
||||
mmu-y := nommu.o
|
||||
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
|
||||
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
|
||||
vmalloc.o
|
||||
|
||||
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
|
||||
page_alloc.o page-writeback.o pdflush.o \
|
||||
readahead.o slab.o swap.o truncate.o vmscan.o \
|
||||
prio_tree.o $(mmu-y)
|
||||
|
||||
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
|
||||
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
|
||||
obj-$(CONFIG_NUMA) += mempolicy.o
|
||||
obj-$(CONFIG_SHMEM) += shmem.o
|
||||
obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
|
||||
|
||||
+400
@@ -0,0 +1,400 @@
|
||||
/*
|
||||
* linux/mm/bootmem.c
|
||||
*
|
||||
* Copyright (C) 1999 Ingo Molnar
|
||||
* Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
|
||||
*
|
||||
* simple boot-time physical memory area allocator and
|
||||
* free memory collector. It's used to deal with reserved
|
||||
* system memory and memory holes as well.
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/kernel_stat.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/bootmem.h>
|
||||
#include <linux/mmzone.h>
|
||||
#include <linux/module.h>
|
||||
#include <asm/dma.h>
|
||||
#include <asm/io.h>
|
||||
#include "internal.h"
|
||||
|
||||
/*
|
||||
* Access to this subsystem has to be serialized externally. (this is
|
||||
* true for the boot process anyway)
|
||||
*/
|
||||
unsigned long max_low_pfn;
|
||||
unsigned long min_low_pfn;
|
||||
unsigned long max_pfn;
|
||||
|
||||
EXPORT_SYMBOL(max_pfn); /* This is exported so
|
||||
* dma_get_required_mask(), which uses
|
||||
* it, can be an inline function */
|
||||
|
||||
/* return the number of _pages_ that will be allocated for the boot bitmap */
|
||||
unsigned long __init bootmem_bootmap_pages (unsigned long pages)
|
||||
{
|
||||
unsigned long mapsize;
|
||||
|
||||
mapsize = (pages+7)/8;
|
||||
mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK;
|
||||
mapsize >>= PAGE_SHIFT;
|
||||
|
||||
return mapsize;
|
||||
}
|
||||
|
||||
/*
|
||||
* Called once to set up the allocator itself.
|
||||
*/
|
||||
static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
|
||||
unsigned long mapstart, unsigned long start, unsigned long end)
|
||||
{
|
||||
bootmem_data_t *bdata = pgdat->bdata;
|
||||
unsigned long mapsize = ((end - start)+7)/8;
|
||||
|
||||
pgdat->pgdat_next = pgdat_list;
|
||||
pgdat_list = pgdat;
|
||||
|
||||
mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL);
|
||||
bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
|
||||
bdata->node_boot_start = (start << PAGE_SHIFT);
|
||||
bdata->node_low_pfn = end;
|
||||
|
||||
/*
|
||||
* Initially all pages are reserved - setup_arch() has to
|
||||
* register free RAM areas explicitly.
|
||||
*/
|
||||
memset(bdata->node_bootmem_map, 0xff, mapsize);
|
||||
|
||||
return mapsize;
|
||||
}
|
||||
|
||||
/*
|
||||
* Marks a particular physical memory range as unallocatable. Usable RAM
|
||||
* might be used for boot-time allocations - or it might get added
|
||||
* to the free page pool later on.
|
||||
*/
|
||||
static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size)
|
||||
{
|
||||
unsigned long i;
|
||||
/*
|
||||
* round up, partially reserved pages are considered
|
||||
* fully reserved.
|
||||
*/
|
||||
unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE;
|
||||
unsigned long eidx = (addr + size - bdata->node_boot_start +
|
||||
PAGE_SIZE-1)/PAGE_SIZE;
|
||||
unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE;
|
||||
|
||||
BUG_ON(!size);
|
||||
BUG_ON(sidx >= eidx);
|
||||
BUG_ON((addr >> PAGE_SHIFT) >= bdata->node_low_pfn);
|
||||
BUG_ON(end > bdata->node_low_pfn);
|
||||
|
||||
for (i = sidx; i < eidx; i++)
|
||||
if (test_and_set_bit(i, bdata->node_bootmem_map)) {
|
||||
#ifdef CONFIG_DEBUG_BOOTMEM
|
||||
printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size)
|
||||
{
|
||||
unsigned long i;
|
||||
unsigned long start;
|
||||
/*
|
||||
* round down end of usable mem, partially free pages are
|
||||
* considered reserved.
|
||||
*/
|
||||
unsigned long sidx;
|
||||
unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE;
|
||||
unsigned long end = (addr + size)/PAGE_SIZE;
|
||||
|
||||
BUG_ON(!size);
|
||||
BUG_ON(end > bdata->node_low_pfn);
|
||||
|
||||
if (addr < bdata->last_success)
|
||||
bdata->last_success = addr;
|
||||
|
||||
/*
|
||||
* Round up the beginning of the address.
|
||||
*/
|
||||
start = (addr + PAGE_SIZE-1) / PAGE_SIZE;
|
||||
sidx = start - (bdata->node_boot_start/PAGE_SIZE);
|
||||
|
||||
for (i = sidx; i < eidx; i++) {
|
||||
if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map)))
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* We 'merge' subsequent allocations to save space. We might 'lose'
|
||||
* some fraction of a page if allocations cannot be satisfied due to
|
||||
* size constraints on boxes where there is physical RAM space
|
||||
* fragmentation - in these cases (mostly large memory boxes) this
|
||||
* is not a problem.
|
||||
*
|
||||
* On low memory boxes we get it right in 100% of the cases.
|
||||
*
|
||||
* alignment has to be a power of 2 value.
|
||||
*
|
||||
* NOTE: This function is _not_ reentrant.
|
||||
*/
|
||||
static void * __init
|
||||
__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
|
||||
unsigned long align, unsigned long goal)
|
||||
{
|
||||
unsigned long offset, remaining_size, areasize, preferred;
|
||||
unsigned long i, start = 0, incr, eidx;
|
||||
void *ret;
|
||||
|
||||
if(!size) {
|
||||
printk("__alloc_bootmem_core(): zero-sized request\n");
|
||||
BUG();
|
||||
}
|
||||
BUG_ON(align & (align-1));
|
||||
|
||||
eidx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
|
||||
offset = 0;
|
||||
if (align &&
|
||||
(bdata->node_boot_start & (align - 1UL)) != 0)
|
||||
offset = (align - (bdata->node_boot_start & (align - 1UL)));
|
||||
offset >>= PAGE_SHIFT;
|
||||
|
||||
/*
|
||||
* We try to allocate bootmem pages above 'goal'
|
||||
* first, then we try to allocate lower pages.
|
||||
*/
|
||||
if (goal && (goal >= bdata->node_boot_start) &&
|
||||
((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) {
|
||||
preferred = goal - bdata->node_boot_start;
|
||||
|
||||
if (bdata->last_success >= preferred)
|
||||
preferred = bdata->last_success;
|
||||
} else
|
||||
preferred = 0;
|
||||
|
||||
preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT;
|
||||
preferred += offset;
|
||||
areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
|
||||
incr = align >> PAGE_SHIFT ? : 1;
|
||||
|
||||
restart_scan:
|
||||
for (i = preferred; i < eidx; i += incr) {
|
||||
unsigned long j;
|
||||
i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i);
|
||||
i = ALIGN(i, incr);
|
||||
if (test_bit(i, bdata->node_bootmem_map))
|
||||
continue;
|
||||
for (j = i + 1; j < i + areasize; ++j) {
|
||||
if (j >= eidx)
|
||||
goto fail_block;
|
||||
if (test_bit (j, bdata->node_bootmem_map))
|
||||
goto fail_block;
|
||||
}
|
||||
start = i;
|
||||
goto found;
|
||||
fail_block:
|
||||
i = ALIGN(j, incr);
|
||||
}
|
||||
|
||||
if (preferred > offset) {
|
||||
preferred = offset;
|
||||
goto restart_scan;
|
||||
}
|
||||
return NULL;
|
||||
|
||||
found:
|
||||
bdata->last_success = start << PAGE_SHIFT;
|
||||
BUG_ON(start >= eidx);
|
||||
|
||||
/*
|
||||
* Is the next page of the previous allocation-end the start
|
||||
* of this allocation's buffer? If yes then we can 'merge'
|
||||
* the previous partial page with this allocation.
|
||||
*/
|
||||
if (align < PAGE_SIZE &&
|
||||
bdata->last_offset && bdata->last_pos+1 == start) {
|
||||
offset = (bdata->last_offset+align-1) & ~(align-1);
|
||||
BUG_ON(offset > PAGE_SIZE);
|
||||
remaining_size = PAGE_SIZE-offset;
|
||||
if (size < remaining_size) {
|
||||
areasize = 0;
|
||||
/* last_pos unchanged */
|
||||
bdata->last_offset = offset+size;
|
||||
ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
|
||||
bdata->node_boot_start);
|
||||
} else {
|
||||
remaining_size = size - remaining_size;
|
||||
areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE;
|
||||
ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
|
||||
bdata->node_boot_start);
|
||||
bdata->last_pos = start+areasize-1;
|
||||
bdata->last_offset = remaining_size;
|
||||
}
|
||||
bdata->last_offset &= ~PAGE_MASK;
|
||||
} else {
|
||||
bdata->last_pos = start + areasize - 1;
|
||||
bdata->last_offset = size & ~PAGE_MASK;
|
||||
ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start);
|
||||
}
|
||||
|
||||
/*
|
||||
* Reserve the area now:
|
||||
*/
|
||||
for (i = start; i < start+areasize; i++)
|
||||
if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))
|
||||
BUG();
|
||||
memset(ret, 0, size);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
|
||||
{
|
||||
struct page *page;
|
||||
bootmem_data_t *bdata = pgdat->bdata;
|
||||
unsigned long i, count, total = 0;
|
||||
unsigned long idx;
|
||||
unsigned long *map;
|
||||
int gofast = 0;
|
||||
|
||||
BUG_ON(!bdata->node_bootmem_map);
|
||||
|
||||
count = 0;
|
||||
/* first extant page of the node */
|
||||
page = virt_to_page(phys_to_virt(bdata->node_boot_start));
|
||||
idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
|
||||
map = bdata->node_bootmem_map;
|
||||
/* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
|
||||
if (bdata->node_boot_start == 0 ||
|
||||
ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
|
||||
gofast = 1;
|
||||
for (i = 0; i < idx; ) {
|
||||
unsigned long v = ~map[i / BITS_PER_LONG];
|
||||
if (gofast && v == ~0UL) {
|
||||
int j, order;
|
||||
|
||||
count += BITS_PER_LONG;
|
||||
__ClearPageReserved(page);
|
||||
order = ffs(BITS_PER_LONG) - 1;
|
||||
set_page_refs(page, order);
|
||||
for (j = 1; j < BITS_PER_LONG; j++) {
|
||||
if (j + 16 < BITS_PER_LONG)
|
||||
prefetchw(page + j + 16);
|
||||
__ClearPageReserved(page + j);
|
||||
}
|
||||
__free_pages(page, order);
|
||||
i += BITS_PER_LONG;
|
||||
page += BITS_PER_LONG;
|
||||
} else if (v) {
|
||||
unsigned long m;
|
||||
for (m = 1; m && i < idx; m<<=1, page++, i++) {
|
||||
if (v & m) {
|
||||
count++;
|
||||
__ClearPageReserved(page);
|
||||
set_page_refs(page, 0);
|
||||
__free_page(page);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
i+=BITS_PER_LONG;
|
||||
page += BITS_PER_LONG;
|
||||
}
|
||||
}
|
||||
total += count;
|
||||
|
||||
/*
|
||||
* Now free the allocator bitmap itself, it's not
|
||||
* needed anymore:
|
||||
*/
|
||||
page = virt_to_page(bdata->node_bootmem_map);
|
||||
count = 0;
|
||||
for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
|
||||
count++;
|
||||
__ClearPageReserved(page);
|
||||
set_page_count(page, 1);
|
||||
__free_page(page);
|
||||
}
|
||||
total += count;
|
||||
bdata->node_bootmem_map = NULL;
|
||||
|
||||
return total;
|
||||
}
|
||||
|
||||
unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn)
|
||||
{
|
||||
return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn));
|
||||
}
|
||||
|
||||
void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size)
|
||||
{
|
||||
reserve_bootmem_core(pgdat->bdata, physaddr, size);
|
||||
}
|
||||
|
||||
void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size)
|
||||
{
|
||||
free_bootmem_core(pgdat->bdata, physaddr, size);
|
||||
}
|
||||
|
||||
unsigned long __init free_all_bootmem_node (pg_data_t *pgdat)
|
||||
{
|
||||
return(free_all_bootmem_core(pgdat));
|
||||
}
|
||||
|
||||
unsigned long __init init_bootmem (unsigned long start, unsigned long pages)
|
||||
{
|
||||
max_low_pfn = pages;
|
||||
min_low_pfn = start;
|
||||
return(init_bootmem_core(NODE_DATA(0), start, 0, pages));
|
||||
}
|
||||
|
||||
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
|
||||
void __init reserve_bootmem (unsigned long addr, unsigned long size)
|
||||
{
|
||||
reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);
|
||||
}
|
||||
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
|
||||
|
||||
void __init free_bootmem (unsigned long addr, unsigned long size)
|
||||
{
|
||||
free_bootmem_core(NODE_DATA(0)->bdata, addr, size);
|
||||
}
|
||||
|
||||
unsigned long __init free_all_bootmem (void)
|
||||
{
|
||||
return(free_all_bootmem_core(NODE_DATA(0)));
|
||||
}
|
||||
|
||||
void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal)
|
||||
{
|
||||
pg_data_t *pgdat = pgdat_list;
|
||||
void *ptr;
|
||||
|
||||
for_each_pgdat(pgdat)
|
||||
if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
|
||||
align, goal)))
|
||||
return(ptr);
|
||||
|
||||
/*
|
||||
* Whoops, we cannot satisfy the allocation request.
|
||||
*/
|
||||
printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
|
||||
panic("Out of memory");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal)
|
||||
{
|
||||
void *ptr;
|
||||
|
||||
ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal);
|
||||
if (ptr)
|
||||
return (ptr);
|
||||
|
||||
return __alloc_bootmem(size, align, goal);
|
||||
}
|
||||
|
||||
+111
@@ -0,0 +1,111 @@
|
||||
/*
|
||||
* mm/fadvise.c
|
||||
*
|
||||
* Copyright (C) 2002, Linus Torvalds
|
||||
*
|
||||
* 11Jan2003 akpm@digeo.com
|
||||
* Initial version.
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/fadvise.h>
|
||||
#include <linux/syscalls.h>
|
||||
|
||||
#include <asm/unistd.h>
|
||||
|
||||
/*
|
||||
* POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
|
||||
* deactivate the pages and clear PG_Referenced.
|
||||
*/
|
||||
asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
|
||||
{
|
||||
struct file *file = fget(fd);
|
||||
struct address_space *mapping;
|
||||
struct backing_dev_info *bdi;
|
||||
loff_t endbyte;
|
||||
pgoff_t start_index;
|
||||
pgoff_t end_index;
|
||||
unsigned long nrpages;
|
||||
int ret = 0;
|
||||
|
||||
if (!file)
|
||||
return -EBADF;
|
||||
|
||||
mapping = file->f_mapping;
|
||||
if (!mapping || len < 0) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Careful about overflows. Len == 0 means "as much as possible" */
|
||||
endbyte = offset + len;
|
||||
if (!len || endbyte < len)
|
||||
endbyte = -1;
|
||||
|
||||
bdi = mapping->backing_dev_info;
|
||||
|
||||
switch (advice) {
|
||||
case POSIX_FADV_NORMAL:
|
||||
file->f_ra.ra_pages = bdi->ra_pages;
|
||||
break;
|
||||
case POSIX_FADV_RANDOM:
|
||||
file->f_ra.ra_pages = 0;
|
||||
break;
|
||||
case POSIX_FADV_SEQUENTIAL:
|
||||
file->f_ra.ra_pages = bdi->ra_pages * 2;
|
||||
break;
|
||||
case POSIX_FADV_WILLNEED:
|
||||
case POSIX_FADV_NOREUSE:
|
||||
if (!mapping->a_ops->readpage) {
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
}
|
||||
|
||||
/* First and last PARTIAL page! */
|
||||
start_index = offset >> PAGE_CACHE_SHIFT;
|
||||
end_index = (endbyte-1) >> PAGE_CACHE_SHIFT;
|
||||
|
||||
/* Careful about overflow on the "+1" */
|
||||
nrpages = end_index - start_index + 1;
|
||||
if (!nrpages)
|
||||
nrpages = ~0UL;
|
||||
|
||||
ret = force_page_cache_readahead(mapping, file,
|
||||
start_index,
|
||||
max_sane_readahead(nrpages));
|
||||
if (ret > 0)
|
||||
ret = 0;
|
||||
break;
|
||||
case POSIX_FADV_DONTNEED:
|
||||
if (!bdi_write_congested(mapping->backing_dev_info))
|
||||
filemap_flush(mapping);
|
||||
|
||||
/* First and last FULL page! */
|
||||
start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
|
||||
end_index = (endbyte >> PAGE_CACHE_SHIFT);
|
||||
|
||||
if (end_index > start_index)
|
||||
invalidate_mapping_pages(mapping, start_index, end_index-1);
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
}
|
||||
out:
|
||||
fput(file);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef __ARCH_WANT_SYS_FADVISE64
|
||||
|
||||
asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice)
|
||||
{
|
||||
return sys_fadvise64_64(fd, offset, len, advice);
|
||||
}
|
||||
|
||||
#endif
|
||||
+2306
File diff suppressed because it is too large
Load Diff
+256
@@ -0,0 +1,256 @@
|
||||
/*
|
||||
* linux/mm/fremap.c
|
||||
*
|
||||
* Explicit pagetable population and nonlinear (random) mappings support.
|
||||
*
|
||||
* started by Ingo Molnar, Copyright (C) 2002, 2003
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/rmap.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/syscalls.h>
|
||||
|
||||
#include <asm/mmu_context.h>
|
||||
#include <asm/cacheflush.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
{
|
||||
pte_t pte = *ptep;
|
||||
|
||||
if (pte_none(pte))
|
||||
return;
|
||||
if (pte_present(pte)) {
|
||||
unsigned long pfn = pte_pfn(pte);
|
||||
|
||||
flush_cache_page(vma, addr, pfn);
|
||||
pte = ptep_clear_flush(vma, addr, ptep);
|
||||
if (pfn_valid(pfn)) {
|
||||
struct page *page = pfn_to_page(pfn);
|
||||
if (!PageReserved(page)) {
|
||||
if (pte_dirty(pte))
|
||||
set_page_dirty(page);
|
||||
page_remove_rmap(page);
|
||||
page_cache_release(page);
|
||||
dec_mm_counter(mm, rss);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (!pte_file(pte))
|
||||
free_swap_and_cache(pte_to_swp_entry(pte));
|
||||
pte_clear(mm, addr, ptep);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Install a file page to a given virtual memory address, release any
|
||||
* previously existing mapping.
|
||||
*/
|
||||
int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long addr, struct page *page, pgprot_t prot)
|
||||
{
|
||||
struct inode *inode;
|
||||
pgoff_t size;
|
||||
int err = -ENOMEM;
|
||||
pte_t *pte;
|
||||
pmd_t *pmd;
|
||||
pud_t *pud;
|
||||
pgd_t *pgd;
|
||||
pte_t pte_val;
|
||||
|
||||
pgd = pgd_offset(mm, addr);
|
||||
spin_lock(&mm->page_table_lock);
|
||||
|
||||
pud = pud_alloc(mm, pgd, addr);
|
||||
if (!pud)
|
||||
goto err_unlock;
|
||||
|
||||
pmd = pmd_alloc(mm, pud, addr);
|
||||
if (!pmd)
|
||||
goto err_unlock;
|
||||
|
||||
pte = pte_alloc_map(mm, pmd, addr);
|
||||
if (!pte)
|
||||
goto err_unlock;
|
||||
|
||||
/*
|
||||
* This page may have been truncated. Tell the
|
||||
* caller about it.
|
||||
*/
|
||||
err = -EINVAL;
|
||||
inode = vma->vm_file->f_mapping->host;
|
||||
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
|
||||
if (!page->mapping || page->index >= size)
|
||||
goto err_unlock;
|
||||
|
||||
zap_pte(mm, vma, addr, pte);
|
||||
|
||||
inc_mm_counter(mm,rss);
|
||||
flush_icache_page(vma, page);
|
||||
set_pte_at(mm, addr, pte, mk_pte(page, prot));
|
||||
page_add_file_rmap(page);
|
||||
pte_val = *pte;
|
||||
pte_unmap(pte);
|
||||
update_mmu_cache(vma, addr, pte_val);
|
||||
|
||||
err = 0;
|
||||
err_unlock:
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(install_page);
|
||||
|
||||
|
||||
/*
|
||||
* Install a file pte to a given virtual memory address, release any
|
||||
* previously existing mapping.
|
||||
*/
|
||||
int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long pgoff, pgprot_t prot)
|
||||
{
|
||||
int err = -ENOMEM;
|
||||
pte_t *pte;
|
||||
pmd_t *pmd;
|
||||
pud_t *pud;
|
||||
pgd_t *pgd;
|
||||
pte_t pte_val;
|
||||
|
||||
pgd = pgd_offset(mm, addr);
|
||||
spin_lock(&mm->page_table_lock);
|
||||
|
||||
pud = pud_alloc(mm, pgd, addr);
|
||||
if (!pud)
|
||||
goto err_unlock;
|
||||
|
||||
pmd = pmd_alloc(mm, pud, addr);
|
||||
if (!pmd)
|
||||
goto err_unlock;
|
||||
|
||||
pte = pte_alloc_map(mm, pmd, addr);
|
||||
if (!pte)
|
||||
goto err_unlock;
|
||||
|
||||
zap_pte(mm, vma, addr, pte);
|
||||
|
||||
set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
|
||||
pte_val = *pte;
|
||||
pte_unmap(pte);
|
||||
update_mmu_cache(vma, addr, pte_val);
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
return 0;
|
||||
|
||||
err_unlock:
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
/***
|
||||
* sys_remap_file_pages - remap arbitrary pages of a shared backing store
|
||||
* file within an existing vma.
|
||||
* @start: start of the remapped virtual memory range
|
||||
* @size: size of the remapped virtual memory range
|
||||
* @prot: new protection bits of the range
|
||||
* @pgoff: to be mapped page of the backing store file
|
||||
* @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
|
||||
*
|
||||
* this syscall works purely via pagetables, so it's the most efficient
|
||||
* way to map the same (large) file into a given virtual window. Unlike
|
||||
* mmap()/mremap() it does not create any new vmas. The new mappings are
|
||||
* also safe across swapout.
|
||||
*
|
||||
* NOTE: the 'prot' parameter right now is ignored, and the vma's default
|
||||
* protection is used. Arbitrary protections might be implemented in the
|
||||
* future.
|
||||
*/
|
||||
asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
|
||||
unsigned long __prot, unsigned long pgoff, unsigned long flags)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct address_space *mapping;
|
||||
unsigned long end = start + size;
|
||||
struct vm_area_struct *vma;
|
||||
int err = -EINVAL;
|
||||
int has_write_lock = 0;
|
||||
|
||||
if (__prot)
|
||||
return err;
|
||||
/*
|
||||
* Sanitize the syscall parameters:
|
||||
*/
|
||||
start = start & PAGE_MASK;
|
||||
size = size & PAGE_MASK;
|
||||
|
||||
/* Does the address range wrap, or is the span zero-sized? */
|
||||
if (start + size <= start)
|
||||
return err;
|
||||
|
||||
/* Can we represent this offset inside this architecture's pte's? */
|
||||
#if PTE_FILE_MAX_BITS < BITS_PER_LONG
|
||||
if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
|
||||
return err;
|
||||
#endif
|
||||
|
||||
/* We need down_write() to change vma->vm_flags. */
|
||||
down_read(&mm->mmap_sem);
|
||||
retry:
|
||||
vma = find_vma(mm, start);
|
||||
|
||||
/*
|
||||
* Make sure the vma is shared, that it supports prefaulting,
|
||||
* and that the remapped range is valid and fully within
|
||||
* the single existing vma. vm_private_data is used as a
|
||||
* swapout cursor in a VM_NONLINEAR vma (unless VM_RESERVED
|
||||
* or VM_LOCKED, but VM_LOCKED could be revoked later on).
|
||||
*/
|
||||
if (vma && (vma->vm_flags & VM_SHARED) &&
|
||||
(!vma->vm_private_data ||
|
||||
(vma->vm_flags & (VM_NONLINEAR|VM_RESERVED))) &&
|
||||
vma->vm_ops && vma->vm_ops->populate &&
|
||||
end > start && start >= vma->vm_start &&
|
||||
end <= vma->vm_end) {
|
||||
|
||||
/* Must set VM_NONLINEAR before any pages are populated. */
|
||||
if (pgoff != linear_page_index(vma, start) &&
|
||||
!(vma->vm_flags & VM_NONLINEAR)) {
|
||||
if (!has_write_lock) {
|
||||
up_read(&mm->mmap_sem);
|
||||
down_write(&mm->mmap_sem);
|
||||
has_write_lock = 1;
|
||||
goto retry;
|
||||
}
|
||||
mapping = vma->vm_file->f_mapping;
|
||||
spin_lock(&mapping->i_mmap_lock);
|
||||
flush_dcache_mmap_lock(mapping);
|
||||
vma->vm_flags |= VM_NONLINEAR;
|
||||
vma_prio_tree_remove(vma, &mapping->i_mmap);
|
||||
vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
|
||||
flush_dcache_mmap_unlock(mapping);
|
||||
spin_unlock(&mapping->i_mmap_lock);
|
||||
}
|
||||
|
||||
err = vma->vm_ops->populate(vma, start, size,
|
||||
vma->vm_page_prot,
|
||||
pgoff, flags & MAP_NONBLOCK);
|
||||
|
||||
/*
|
||||
* We can't clear VM_NONLINEAR because we'd have to do
|
||||
* it after ->populate completes, and that would prevent
|
||||
* downgrading the lock. (Locks can't be upgraded).
|
||||
*/
|
||||
}
|
||||
if (likely(!has_write_lock))
|
||||
up_read(&mm->mmap_sem);
|
||||
else
|
||||
up_write(&mm->mmap_sem);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
+607
File diff suppressed because it is too large
Load Diff
+260
@@ -0,0 +1,260 @@
|
||||
/*
|
||||
* Generic hugetlb support.
|
||||
* (C) William Irwin, April 2004
|
||||
*/
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/nodemask.h>
|
||||
|
||||
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
|
||||
static unsigned long nr_huge_pages, free_huge_pages;
|
||||
unsigned long max_huge_pages;
|
||||
static struct list_head hugepage_freelists[MAX_NUMNODES];
|
||||
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
|
||||
static unsigned int free_huge_pages_node[MAX_NUMNODES];
|
||||
static DEFINE_SPINLOCK(hugetlb_lock);
|
||||
|
||||
static void enqueue_huge_page(struct page *page)
|
||||
{
|
||||
int nid = page_to_nid(page);
|
||||
list_add(&page->lru, &hugepage_freelists[nid]);
|
||||
free_huge_pages++;
|
||||
free_huge_pages_node[nid]++;
|
||||
}
|
||||
|
||||
static struct page *dequeue_huge_page(void)
|
||||
{
|
||||
int nid = numa_node_id();
|
||||
struct page *page = NULL;
|
||||
|
||||
if (list_empty(&hugepage_freelists[nid])) {
|
||||
for (nid = 0; nid < MAX_NUMNODES; ++nid)
|
||||
if (!list_empty(&hugepage_freelists[nid]))
|
||||
break;
|
||||
}
|
||||
if (nid >= 0 && nid < MAX_NUMNODES &&
|
||||
!list_empty(&hugepage_freelists[nid])) {
|
||||
page = list_entry(hugepage_freelists[nid].next,
|
||||
struct page, lru);
|
||||
list_del(&page->lru);
|
||||
free_huge_pages--;
|
||||
free_huge_pages_node[nid]--;
|
||||
}
|
||||
return page;
|
||||
}
|
||||
|
||||
static struct page *alloc_fresh_huge_page(void)
|
||||
{
|
||||
static int nid = 0;
|
||||
struct page *page;
|
||||
page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
|
||||
HUGETLB_PAGE_ORDER);
|
||||
nid = (nid + 1) % num_online_nodes();
|
||||
if (page) {
|
||||
nr_huge_pages++;
|
||||
nr_huge_pages_node[page_to_nid(page)]++;
|
||||
}
|
||||
return page;
|
||||
}
|
||||
|
||||
void free_huge_page(struct page *page)
|
||||
{
|
||||
BUG_ON(page_count(page));
|
||||
|
||||
INIT_LIST_HEAD(&page->lru);
|
||||
page[1].mapping = NULL;
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
enqueue_huge_page(page);
|
||||
spin_unlock(&hugetlb_lock);
|
||||
}
|
||||
|
||||
struct page *alloc_huge_page(void)
|
||||
{
|
||||
struct page *page;
|
||||
int i;
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
page = dequeue_huge_page();
|
||||
if (!page) {
|
||||
spin_unlock(&hugetlb_lock);
|
||||
return NULL;
|
||||
}
|
||||
spin_unlock(&hugetlb_lock);
|
||||
set_page_count(page, 1);
|
||||
page[1].mapping = (void *)free_huge_page;
|
||||
for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
|
||||
clear_highpage(&page[i]);
|
||||
return page;
|
||||
}
|
||||
|
||||
static int __init hugetlb_init(void)
|
||||
{
|
||||
unsigned long i;
|
||||
struct page *page;
|
||||
|
||||
for (i = 0; i < MAX_NUMNODES; ++i)
|
||||
INIT_LIST_HEAD(&hugepage_freelists[i]);
|
||||
|
||||
for (i = 0; i < max_huge_pages; ++i) {
|
||||
page = alloc_fresh_huge_page();
|
||||
if (!page)
|
||||
break;
|
||||
spin_lock(&hugetlb_lock);
|
||||
enqueue_huge_page(page);
|
||||
spin_unlock(&hugetlb_lock);
|
||||
}
|
||||
max_huge_pages = free_huge_pages = nr_huge_pages = i;
|
||||
printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
|
||||
return 0;
|
||||
}
|
||||
module_init(hugetlb_init);
|
||||
|
||||
static int __init hugetlb_setup(char *s)
|
||||
{
|
||||
if (sscanf(s, "%lu", &max_huge_pages) <= 0)
|
||||
max_huge_pages = 0;
|
||||
return 1;
|
||||
}
|
||||
__setup("hugepages=", hugetlb_setup);
|
||||
|
||||
#ifdef CONFIG_SYSCTL
|
||||
static void update_and_free_page(struct page *page)
|
||||
{
|
||||
int i;
|
||||
nr_huge_pages--;
|
||||
nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
|
||||
for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
|
||||
page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
|
||||
1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
|
||||
1 << PG_private | 1<< PG_writeback);
|
||||
set_page_count(&page[i], 0);
|
||||
}
|
||||
set_page_count(page, 1);
|
||||
__free_pages(page, HUGETLB_PAGE_ORDER);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
static void try_to_free_low(unsigned long count)
|
||||
{
|
||||
int i, nid;
|
||||
for (i = 0; i < MAX_NUMNODES; ++i) {
|
||||
struct page *page, *next;
|
||||
list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
|
||||
if (PageHighMem(page))
|
||||
continue;
|
||||
list_del(&page->lru);
|
||||
update_and_free_page(page);
|
||||
nid = page_zone(page)->zone_pgdat->node_id;
|
||||
free_huge_pages--;
|
||||
free_huge_pages_node[nid]--;
|
||||
if (count >= nr_huge_pages)
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
static inline void try_to_free_low(unsigned long count)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
static unsigned long set_max_huge_pages(unsigned long count)
|
||||
{
|
||||
while (count > nr_huge_pages) {
|
||||
struct page *page = alloc_fresh_huge_page();
|
||||
if (!page)
|
||||
return nr_huge_pages;
|
||||
spin_lock(&hugetlb_lock);
|
||||
enqueue_huge_page(page);
|
||||
spin_unlock(&hugetlb_lock);
|
||||
}
|
||||
if (count >= nr_huge_pages)
|
||||
return nr_huge_pages;
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
try_to_free_low(count);
|
||||
while (count < nr_huge_pages) {
|
||||
struct page *page = dequeue_huge_page();
|
||||
if (!page)
|
||||
break;
|
||||
update_and_free_page(page);
|
||||
}
|
||||
spin_unlock(&hugetlb_lock);
|
||||
return nr_huge_pages;
|
||||
}
|
||||
|
||||
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
|
||||
struct file *file, void __user *buffer,
|
||||
size_t *length, loff_t *ppos)
|
||||
{
|
||||
proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
|
||||
max_huge_pages = set_max_huge_pages(max_huge_pages);
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_SYSCTL */
|
||||
|
||||
int hugetlb_report_meminfo(char *buf)
|
||||
{
|
||||
return sprintf(buf,
|
||||
"HugePages_Total: %5lu\n"
|
||||
"HugePages_Free: %5lu\n"
|
||||
"Hugepagesize: %5lu kB\n",
|
||||
nr_huge_pages,
|
||||
free_huge_pages,
|
||||
HPAGE_SIZE/1024);
|
||||
}
|
||||
|
||||
int hugetlb_report_node_meminfo(int nid, char *buf)
|
||||
{
|
||||
return sprintf(buf,
|
||||
"Node %d HugePages_Total: %5u\n"
|
||||
"Node %d HugePages_Free: %5u\n",
|
||||
nid, nr_huge_pages_node[nid],
|
||||
nid, free_huge_pages_node[nid]);
|
||||
}
|
||||
|
||||
int is_hugepage_mem_enough(size_t size)
|
||||
{
|
||||
return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
|
||||
}
|
||||
|
||||
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
|
||||
unsigned long hugetlb_total_pages(void)
|
||||
{
|
||||
return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
|
||||
}
|
||||
EXPORT_SYMBOL(hugetlb_total_pages);
|
||||
|
||||
/*
|
||||
* We cannot handle pagefaults against hugetlb pages at all. They cause
|
||||
* handle_mm_fault() to try to instantiate regular-sized pages in the
|
||||
* hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get
|
||||
* this far.
|
||||
*/
|
||||
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
|
||||
unsigned long address, int *unused)
|
||||
{
|
||||
BUG();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct vm_operations_struct hugetlb_vm_ops = {
|
||||
.nopage = hugetlb_nopage,
|
||||
};
|
||||
|
||||
void zap_hugepage_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long length)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
|
||||
spin_lock(&mm->page_table_lock);
|
||||
unmap_hugepage_range(vma, start, start + length);
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
/* internal.h: mm/ internal definitions
|
||||
*
|
||||
* Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
|
||||
* Written by David Howells (dhowells@redhat.com)
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
|
||||
/* page_alloc.c */
|
||||
extern void set_page_refs(struct page *page, int order);
|
||||
+242
@@ -0,0 +1,242 @@
|
||||
/*
|
||||
* linux/mm/madvise.c
|
||||
*
|
||||
* Copyright (C) 1999 Linus Torvalds
|
||||
* Copyright (C) 2002 Christoph Hellwig
|
||||
*/
|
||||
|
||||
#include <linux/mman.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/hugetlb.h>
|
||||
|
||||
/*
|
||||
* We can potentially split a vm area into separate
|
||||
* areas, each area with its own behavior.
|
||||
*/
|
||||
static long madvise_behavior(struct vm_area_struct * vma, unsigned long start,
|
||||
unsigned long end, int behavior)
|
||||
{
|
||||
struct mm_struct * mm = vma->vm_mm;
|
||||
int error = 0;
|
||||
|
||||
if (start != vma->vm_start) {
|
||||
error = split_vma(mm, vma, start, 1);
|
||||
if (error)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (end != vma->vm_end) {
|
||||
error = split_vma(mm, vma, end, 0);
|
||||
if (error)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* vm_flags is protected by the mmap_sem held in write mode.
|
||||
*/
|
||||
VM_ClearReadHint(vma);
|
||||
|
||||
switch (behavior) {
|
||||
case MADV_SEQUENTIAL:
|
||||
vma->vm_flags |= VM_SEQ_READ;
|
||||
break;
|
||||
case MADV_RANDOM:
|
||||
vma->vm_flags |= VM_RAND_READ;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
out:
|
||||
if (error == -ENOMEM)
|
||||
error = -EAGAIN;
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Schedule all required I/O operations. Do not wait for completion.
|
||||
*/
|
||||
static long madvise_willneed(struct vm_area_struct * vma,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
struct file *file = vma->vm_file;
|
||||
|
||||
if (!file)
|
||||
return -EBADF;
|
||||
|
||||
start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
|
||||
if (end > vma->vm_end)
|
||||
end = vma->vm_end;
|
||||
end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
|
||||
|
||||
force_page_cache_readahead(file->f_mapping,
|
||||
file, start, max_sane_readahead(end - start));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Application no longer needs these pages. If the pages are dirty,
|
||||
* it's OK to just throw them away. The app will be more careful about
|
||||
* data it wants to keep. Be sure to free swap resources too. The
|
||||
* zap_page_range call sets things up for refill_inactive to actually free
|
||||
* these pages later if no one else has touched them in the meantime,
|
||||
* although we could add these pages to a global reuse list for
|
||||
* refill_inactive to pick up before reclaiming other pages.
|
||||
*
|
||||
* NB: This interface discards data rather than pushes it out to swap,
|
||||
* as some implementations do. This has performance implications for
|
||||
* applications like large transactional databases which want to discard
|
||||
* pages in anonymous maps after committing to backing store the data
|
||||
* that was kept in them. There is no reason to write this data out to
|
||||
* the swap area if the application is discarding it.
|
||||
*
|
||||
* An interface that causes the system to free clean pages and flush
|
||||
* dirty pages is already available as msync(MS_INVALIDATE).
|
||||
*/
|
||||
static long madvise_dontneed(struct vm_area_struct * vma,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
|
||||
return -EINVAL;
|
||||
|
||||
if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
|
||||
struct zap_details details = {
|
||||
.nonlinear_vma = vma,
|
||||
.last_index = ULONG_MAX,
|
||||
};
|
||||
zap_page_range(vma, start, end - start, &details);
|
||||
} else
|
||||
zap_page_range(vma, start, end - start, NULL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
|
||||
unsigned long end, int behavior)
|
||||
{
|
||||
long error = -EBADF;
|
||||
|
||||
switch (behavior) {
|
||||
case MADV_NORMAL:
|
||||
case MADV_SEQUENTIAL:
|
||||
case MADV_RANDOM:
|
||||
error = madvise_behavior(vma, start, end, behavior);
|
||||
break;
|
||||
|
||||
case MADV_WILLNEED:
|
||||
error = madvise_willneed(vma, start, end);
|
||||
break;
|
||||
|
||||
case MADV_DONTNEED:
|
||||
error = madvise_dontneed(vma, start, end);
|
||||
break;
|
||||
|
||||
default:
|
||||
error = -EINVAL;
|
||||
break;
|
||||
}
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* The madvise(2) system call.
|
||||
*
|
||||
* Applications can use madvise() to advise the kernel how it should
|
||||
* handle paging I/O in this VM area. The idea is to help the kernel
|
||||
* use appropriate read-ahead and caching techniques. The information
|
||||
* provided is advisory only, and can be safely disregarded by the
|
||||
* kernel without affecting the correct operation of the application.
|
||||
*
|
||||
* behavior values:
|
||||
* MADV_NORMAL - the default behavior is to read clusters. This
|
||||
* results in some read-ahead and read-behind.
|
||||
* MADV_RANDOM - the system should read the minimum amount of data
|
||||
* on any access, since it is unlikely that the appli-
|
||||
* cation will need more than what it asks for.
|
||||
* MADV_SEQUENTIAL - pages in the given range will probably be accessed
|
||||
* once, so they can be aggressively read ahead, and
|
||||
* can be freed soon after they are accessed.
|
||||
* MADV_WILLNEED - the application is notifying the system to read
|
||||
* some pages ahead.
|
||||
* MADV_DONTNEED - the application is finished with the given range,
|
||||
* so the kernel can free resources associated with it.
|
||||
*
|
||||
* return values:
|
||||
* zero - success
|
||||
* -EINVAL - start + len < 0, start is not page-aligned,
|
||||
* "behavior" is not a valid value, or application
|
||||
* is attempting to release locked or shared pages.
|
||||
* -ENOMEM - addresses in the specified range are not currently
|
||||
* mapped, or are outside the AS of the process.
|
||||
* -EIO - an I/O error occurred while paging in data.
|
||||
* -EBADF - map exists, but area maps something that isn't a file.
|
||||
* -EAGAIN - a kernel resource was temporarily unavailable.
|
||||
*/
|
||||
asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
|
||||
{
|
||||
unsigned long end;
|
||||
struct vm_area_struct * vma;
|
||||
int unmapped_error = 0;
|
||||
int error = -EINVAL;
|
||||
size_t len;
|
||||
|
||||
down_write(¤t->mm->mmap_sem);
|
||||
|
||||
if (start & ~PAGE_MASK)
|
||||
goto out;
|
||||
len = (len_in + ~PAGE_MASK) & PAGE_MASK;
|
||||
|
||||
/* Check to see whether len was rounded up from small -ve to zero */
|
||||
if (len_in && !len)
|
||||
goto out;
|
||||
|
||||
end = start + len;
|
||||
if (end < start)
|
||||
goto out;
|
||||
|
||||
error = 0;
|
||||
if (end == start)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* If the interval [start,end) covers some unmapped address
|
||||
* ranges, just ignore them, but return -ENOMEM at the end.
|
||||
*/
|
||||
vma = find_vma(current->mm, start);
|
||||
for (;;) {
|
||||
/* Still start < end. */
|
||||
error = -ENOMEM;
|
||||
if (!vma)
|
||||
goto out;
|
||||
|
||||
/* Here start < vma->vm_end. */
|
||||
if (start < vma->vm_start) {
|
||||
unmapped_error = -ENOMEM;
|
||||
start = vma->vm_start;
|
||||
}
|
||||
|
||||
/* Here vma->vm_start <= start < vma->vm_end. */
|
||||
if (end <= vma->vm_end) {
|
||||
if (start < end) {
|
||||
error = madvise_vma(vma, start, end,
|
||||
behavior);
|
||||
if (error)
|
||||
goto out;
|
||||
}
|
||||
error = unmapped_error;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Here vma->vm_start <= start < vma->vm_end < end. */
|
||||
error = madvise_vma(vma, start, vma->vm_end, behavior);
|
||||
if (error)
|
||||
goto out;
|
||||
start = vma->vm_end;
|
||||
vma = vma->vm_next;
|
||||
}
|
||||
|
||||
out:
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
return error;
|
||||
}
|
||||
+2165
File diff suppressed because it is too large
Load Diff
+1138
File diff suppressed because it is too large
Load Diff
+290
@@ -0,0 +1,290 @@
|
||||
/*
|
||||
* linux/mm/mempool.c
|
||||
*
|
||||
* memory buffer pool support. Such pools are mostly used
|
||||
* for guaranteed, deadlock-free memory allocations during
|
||||
* extreme VM load.
|
||||
*
|
||||
* started by Ingo Molnar, Copyright (C) 2001
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/mempool.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/writeback.h>
|
||||
|
||||
static void add_element(mempool_t *pool, void *element)
|
||||
{
|
||||
BUG_ON(pool->curr_nr >= pool->min_nr);
|
||||
pool->elements[pool->curr_nr++] = element;
|
||||
}
|
||||
|
||||
static void *remove_element(mempool_t *pool)
|
||||
{
|
||||
BUG_ON(pool->curr_nr <= 0);
|
||||
return pool->elements[--pool->curr_nr];
|
||||
}
|
||||
|
||||
static void free_pool(mempool_t *pool)
|
||||
{
|
||||
while (pool->curr_nr) {
|
||||
void *element = remove_element(pool);
|
||||
pool->free(element, pool->pool_data);
|
||||
}
|
||||
kfree(pool->elements);
|
||||
kfree(pool);
|
||||
}
|
||||
|
||||
/**
|
||||
* mempool_create - create a memory pool
|
||||
* @min_nr: the minimum number of elements guaranteed to be
|
||||
* allocated for this pool.
|
||||
* @alloc_fn: user-defined element-allocation function.
|
||||
* @free_fn: user-defined element-freeing function.
|
||||
* @pool_data: optional private data available to the user-defined functions.
|
||||
*
|
||||
* this function creates and allocates a guaranteed size, preallocated
|
||||
* memory pool. The pool can be used from the mempool_alloc and mempool_free
|
||||
* functions. This function might sleep. Both the alloc_fn() and the free_fn()
|
||||
* functions might sleep - as long as the mempool_alloc function is not called
|
||||
* from IRQ contexts.
|
||||
*/
|
||||
mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
|
||||
mempool_free_t *free_fn, void *pool_data)
|
||||
{
|
||||
mempool_t *pool;
|
||||
|
||||
pool = kmalloc(sizeof(*pool), GFP_KERNEL);
|
||||
if (!pool)
|
||||
return NULL;
|
||||
memset(pool, 0, sizeof(*pool));
|
||||
pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL);
|
||||
if (!pool->elements) {
|
||||
kfree(pool);
|
||||
return NULL;
|
||||
}
|
||||
spin_lock_init(&pool->lock);
|
||||
pool->min_nr = min_nr;
|
||||
pool->pool_data = pool_data;
|
||||
init_waitqueue_head(&pool->wait);
|
||||
pool->alloc = alloc_fn;
|
||||
pool->free = free_fn;
|
||||
|
||||
/*
|
||||
* First pre-allocate the guaranteed number of buffers.
|
||||
*/
|
||||
while (pool->curr_nr < pool->min_nr) {
|
||||
void *element;
|
||||
|
||||
element = pool->alloc(GFP_KERNEL, pool->pool_data);
|
||||
if (unlikely(!element)) {
|
||||
free_pool(pool);
|
||||
return NULL;
|
||||
}
|
||||
add_element(pool, element);
|
||||
}
|
||||
return pool;
|
||||
}
|
||||
EXPORT_SYMBOL(mempool_create);
|
||||
|
||||
/**
|
||||
* mempool_resize - resize an existing memory pool
|
||||
* @pool: pointer to the memory pool which was allocated via
|
||||
* mempool_create().
|
||||
* @new_min_nr: the new minimum number of elements guaranteed to be
|
||||
* allocated for this pool.
|
||||
* @gfp_mask: the usual allocation bitmask.
|
||||
*
|
||||
* This function shrinks/grows the pool. In the case of growing,
|
||||
* it cannot be guaranteed that the pool will be grown to the new
|
||||
* size immediately, but new mempool_free() calls will refill it.
|
||||
*
|
||||
* Note, the caller must guarantee that no mempool_destroy is called
|
||||
* while this function is running. mempool_alloc() & mempool_free()
|
||||
* might be called (eg. from IRQ contexts) while this function executes.
|
||||
*/
|
||||
int mempool_resize(mempool_t *pool, int new_min_nr, unsigned int __nocast gfp_mask)
|
||||
{
|
||||
void *element;
|
||||
void **new_elements;
|
||||
unsigned long flags;
|
||||
|
||||
BUG_ON(new_min_nr <= 0);
|
||||
|
||||
spin_lock_irqsave(&pool->lock, flags);
|
||||
if (new_min_nr <= pool->min_nr) {
|
||||
while (new_min_nr < pool->curr_nr) {
|
||||
element = remove_element(pool);
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
pool->free(element, pool->pool_data);
|
||||
spin_lock_irqsave(&pool->lock, flags);
|
||||
}
|
||||
pool->min_nr = new_min_nr;
|
||||
goto out_unlock;
|
||||
}
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
|
||||
/* Grow the pool */
|
||||
new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask);
|
||||
if (!new_elements)
|
||||
return -ENOMEM;
|
||||
|
||||
spin_lock_irqsave(&pool->lock, flags);
|
||||
if (unlikely(new_min_nr <= pool->min_nr)) {
|
||||
/* Raced, other resize will do our work */
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
kfree(new_elements);
|
||||
goto out;
|
||||
}
|
||||
memcpy(new_elements, pool->elements,
|
||||
pool->curr_nr * sizeof(*new_elements));
|
||||
kfree(pool->elements);
|
||||
pool->elements = new_elements;
|
||||
pool->min_nr = new_min_nr;
|
||||
|
||||
while (pool->curr_nr < pool->min_nr) {
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
element = pool->alloc(gfp_mask, pool->pool_data);
|
||||
if (!element)
|
||||
goto out;
|
||||
spin_lock_irqsave(&pool->lock, flags);
|
||||
if (pool->curr_nr < pool->min_nr) {
|
||||
add_element(pool, element);
|
||||
} else {
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
pool->free(element, pool->pool_data); /* Raced */
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
out_unlock:
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
out:
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(mempool_resize);
|
||||
|
||||
/**
|
||||
* mempool_destroy - deallocate a memory pool
|
||||
* @pool: pointer to the memory pool which was allocated via
|
||||
* mempool_create().
|
||||
*
|
||||
* this function only sleeps if the free_fn() function sleeps. The caller
|
||||
* has to guarantee that all elements have been returned to the pool (ie:
|
||||
* freed) prior to calling mempool_destroy().
|
||||
*/
|
||||
void mempool_destroy(mempool_t *pool)
|
||||
{
|
||||
if (pool->curr_nr != pool->min_nr)
|
||||
BUG(); /* There were outstanding elements */
|
||||
free_pool(pool);
|
||||
}
|
||||
EXPORT_SYMBOL(mempool_destroy);
|
||||
|
||||
/**
|
||||
* mempool_alloc - allocate an element from a specific memory pool
|
||||
* @pool: pointer to the memory pool which was allocated via
|
||||
* mempool_create().
|
||||
* @gfp_mask: the usual allocation bitmask.
|
||||
*
|
||||
* this function only sleeps if the alloc_fn function sleeps or
|
||||
* returns NULL. Note that due to preallocation, this function
|
||||
* *never* fails when called from process contexts. (it might
|
||||
* fail if called from an IRQ context.)
|
||||
*/
|
||||
void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask)
|
||||
{
|
||||
void *element;
|
||||
unsigned long flags;
|
||||
DEFINE_WAIT(wait);
|
||||
int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
|
||||
|
||||
might_sleep_if(gfp_mask & __GFP_WAIT);
|
||||
repeat_alloc:
|
||||
element = pool->alloc(gfp_nowait|__GFP_NOWARN, pool->pool_data);
|
||||
if (likely(element != NULL))
|
||||
return element;
|
||||
|
||||
/*
|
||||
* If the pool is less than 50% full and we can perform effective
|
||||
* page reclaim then try harder to allocate an element.
|
||||
*/
|
||||
mb();
|
||||
if ((gfp_mask & __GFP_FS) && (gfp_mask != gfp_nowait) &&
|
||||
(pool->curr_nr <= pool->min_nr/2)) {
|
||||
element = pool->alloc(gfp_mask, pool->pool_data);
|
||||
if (likely(element != NULL))
|
||||
return element;
|
||||
}
|
||||
|
||||
/*
|
||||
* Kick the VM at this point.
|
||||
*/
|
||||
wakeup_bdflush(0);
|
||||
|
||||
spin_lock_irqsave(&pool->lock, flags);
|
||||
if (likely(pool->curr_nr)) {
|
||||
element = remove_element(pool);
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
return element;
|
||||
}
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
|
||||
/* We must not sleep in the GFP_ATOMIC case */
|
||||
if (!(gfp_mask & __GFP_WAIT))
|
||||
return NULL;
|
||||
|
||||
prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
|
||||
mb();
|
||||
if (!pool->curr_nr)
|
||||
io_schedule();
|
||||
finish_wait(&pool->wait, &wait);
|
||||
|
||||
goto repeat_alloc;
|
||||
}
|
||||
EXPORT_SYMBOL(mempool_alloc);
|
||||
|
||||
/**
|
||||
* mempool_free - return an element to the pool.
|
||||
* @element: pool element pointer.
|
||||
* @pool: pointer to the memory pool which was allocated via
|
||||
* mempool_create().
|
||||
*
|
||||
* this function only sleeps if the free_fn() function sleeps.
|
||||
*/
|
||||
void mempool_free(void *element, mempool_t *pool)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
mb();
|
||||
if (pool->curr_nr < pool->min_nr) {
|
||||
spin_lock_irqsave(&pool->lock, flags);
|
||||
if (pool->curr_nr < pool->min_nr) {
|
||||
add_element(pool, element);
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
wake_up(&pool->wait);
|
||||
return;
|
||||
}
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
}
|
||||
pool->free(element, pool->pool_data);
|
||||
}
|
||||
EXPORT_SYMBOL(mempool_free);
|
||||
|
||||
/*
|
||||
* A commonly used alloc and free fn.
|
||||
*/
|
||||
void *mempool_alloc_slab(unsigned int __nocast gfp_mask, void *pool_data)
|
||||
{
|
||||
kmem_cache_t *mem = (kmem_cache_t *) pool_data;
|
||||
return kmem_cache_alloc(mem, gfp_mask);
|
||||
}
|
||||
EXPORT_SYMBOL(mempool_alloc_slab);
|
||||
|
||||
void mempool_free_slab(void *element, void *pool_data)
|
||||
{
|
||||
kmem_cache_t *mem = (kmem_cache_t *) pool_data;
|
||||
kmem_cache_free(mem, element);
|
||||
}
|
||||
EXPORT_SYMBOL(mempool_free_slab);
|
||||
+191
@@ -0,0 +1,191 @@
|
||||
/*
|
||||
* linux/mm/mincore.c
|
||||
*
|
||||
* Copyright (C) 1994-1999 Linus Torvalds
|
||||
*/
|
||||
|
||||
/*
|
||||
* The mincore() system call.
|
||||
*/
|
||||
#include <linux/slab.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/syscalls.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/pgtable.h>
|
||||
|
||||
/*
|
||||
* Later we can get more picky about what "in core" means precisely.
|
||||
* For now, simply check to see if the page is in the page cache,
|
||||
* and is up to date; i.e. that no page-in operation would be required
|
||||
* at this time if an application were to map and access this page.
|
||||
*/
|
||||
static unsigned char mincore_page(struct vm_area_struct * vma,
|
||||
unsigned long pgoff)
|
||||
{
|
||||
unsigned char present = 0;
|
||||
struct address_space * as = vma->vm_file->f_mapping;
|
||||
struct page * page;
|
||||
|
||||
page = find_get_page(as, pgoff);
|
||||
if (page) {
|
||||
present = PageUptodate(page);
|
||||
page_cache_release(page);
|
||||
}
|
||||
|
||||
return present;
|
||||
}
|
||||
|
||||
static long mincore_vma(struct vm_area_struct * vma,
|
||||
unsigned long start, unsigned long end, unsigned char __user * vec)
|
||||
{
|
||||
long error, i, remaining;
|
||||
unsigned char * tmp;
|
||||
|
||||
error = -ENOMEM;
|
||||
if (!vma->vm_file)
|
||||
return error;
|
||||
|
||||
start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
|
||||
if (end > vma->vm_end)
|
||||
end = vma->vm_end;
|
||||
end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
|
||||
|
||||
error = -EAGAIN;
|
||||
tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
|
||||
if (!tmp)
|
||||
return error;
|
||||
|
||||
/* (end - start) is # of pages, and also # of bytes in "vec */
|
||||
remaining = (end - start),
|
||||
|
||||
error = 0;
|
||||
for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
|
||||
int j = 0;
|
||||
long thispiece = (remaining < PAGE_SIZE) ?
|
||||
remaining : PAGE_SIZE;
|
||||
|
||||
while (j < thispiece)
|
||||
tmp[j++] = mincore_page(vma, start++);
|
||||
|
||||
if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
|
||||
error = -EFAULT;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
free_page((unsigned long) tmp);
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* The mincore(2) system call.
|
||||
*
|
||||
* mincore() returns the memory residency status of the pages in the
|
||||
* current process's address space specified by [addr, addr + len).
|
||||
* The status is returned in a vector of bytes. The least significant
|
||||
* bit of each byte is 1 if the referenced page is in memory, otherwise
|
||||
* it is zero.
|
||||
*
|
||||
* Because the status of a page can change after mincore() checks it
|
||||
* but before it returns to the application, the returned vector may
|
||||
* contain stale information. Only locked pages are guaranteed to
|
||||
* remain in memory.
|
||||
*
|
||||
* return values:
|
||||
* zero - success
|
||||
* -EFAULT - vec points to an illegal address
|
||||
* -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE
|
||||
* -ENOMEM - Addresses in the range [addr, addr + len] are
|
||||
* invalid for the address space of this process, or
|
||||
* specify one or more pages which are not currently
|
||||
* mapped
|
||||
* -EAGAIN - A kernel resource was temporarily unavailable.
|
||||
*/
|
||||
asmlinkage long sys_mincore(unsigned long start, size_t len,
|
||||
unsigned char __user * vec)
|
||||
{
|
||||
int index = 0;
|
||||
unsigned long end, limit;
|
||||
struct vm_area_struct * vma;
|
||||
size_t max;
|
||||
int unmapped_error = 0;
|
||||
long error;
|
||||
|
||||
/* check the arguments */
|
||||
if (start & ~PAGE_CACHE_MASK)
|
||||
goto einval;
|
||||
|
||||
if (start < FIRST_USER_PGD_NR * PGDIR_SIZE)
|
||||
goto enomem;
|
||||
|
||||
limit = TASK_SIZE;
|
||||
if (start >= limit)
|
||||
goto enomem;
|
||||
|
||||
if (!len)
|
||||
return 0;
|
||||
|
||||
max = limit - start;
|
||||
len = PAGE_CACHE_ALIGN(len);
|
||||
if (len > max || !len)
|
||||
goto enomem;
|
||||
|
||||
end = start + len;
|
||||
|
||||
/* check the output buffer whilst holding the lock */
|
||||
error = -EFAULT;
|
||||
down_read(¤t->mm->mmap_sem);
|
||||
|
||||
if (!access_ok(VERIFY_WRITE, vec, len >> PAGE_SHIFT))
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* If the interval [start,end) covers some unmapped address
|
||||
* ranges, just ignore them, but return -ENOMEM at the end.
|
||||
*/
|
||||
error = 0;
|
||||
|
||||
vma = find_vma(current->mm, start);
|
||||
while (vma) {
|
||||
/* Here start < vma->vm_end. */
|
||||
if (start < vma->vm_start) {
|
||||
unmapped_error = -ENOMEM;
|
||||
start = vma->vm_start;
|
||||
}
|
||||
|
||||
/* Here vma->vm_start <= start < vma->vm_end. */
|
||||
if (end <= vma->vm_end) {
|
||||
if (start < end) {
|
||||
error = mincore_vma(vma, start, end,
|
||||
&vec[index]);
|
||||
if (error)
|
||||
goto out;
|
||||
}
|
||||
error = unmapped_error;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Here vma->vm_start <= start < vma->vm_end < end. */
|
||||
error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
|
||||
if (error)
|
||||
goto out;
|
||||
index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
|
||||
start = vma->vm_end;
|
||||
vma = vma->vm_next;
|
||||
}
|
||||
|
||||
/* we found a hole in the area queried if we arrive here */
|
||||
error = -ENOMEM;
|
||||
|
||||
out:
|
||||
up_read(¤t->mm->mmap_sem);
|
||||
return error;
|
||||
|
||||
einval:
|
||||
return -EINVAL;
|
||||
enomem:
|
||||
return -ENOMEM;
|
||||
}
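For illustration only (not part of this commit): a minimal userspace sketch of calling mincore(2) as implemented above. The 16-page anonymous mapping and the touched first page are arbitrary assumptions.

/* Illustrative userspace caller of mincore(2); not part of the commit. */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
    size_t page = (size_t)sysconf(_SC_PAGESIZE);
    size_t len = 16 * page;             /* assumed mapping size */
    size_t npages = len / page;
    unsigned char *buf, *vec;
    size_t i;

    buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    vec = malloc(npages);               /* one status byte per page */
    if (buf == MAP_FAILED || !vec)
        return 1;

    buf[0] = 1;                         /* fault in the first page */
    if (mincore(buf, len, vec) == 0)
        for (i = 0; i < npages; i++)
            printf("page %zu: %s\n", i,
                   (vec[i] & 1) ? "resident" : "not resident");

    free(vec);
    munmap(buf, len);
    return 0;
}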
+253
@@ -0,0 +1,253 @@
|
||||
/*
|
||||
* linux/mm/mlock.c
|
||||
*
|
||||
* (C) Copyright 1995 Linus Torvalds
|
||||
* (C) Copyright 2002 Christoph Hellwig
|
||||
*/
|
||||
|
||||
#include <linux/mman.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/mempolicy.h>
|
||||
#include <linux/syscalls.h>
|
||||
|
||||
|
||||
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
|
||||
unsigned long start, unsigned long end, unsigned int newflags)
|
||||
{
|
||||
struct mm_struct * mm = vma->vm_mm;
|
||||
pgoff_t pgoff;
|
||||
int pages;
|
||||
int ret = 0;
|
||||
|
||||
if (newflags == vma->vm_flags) {
|
||||
*prev = vma;
|
||||
goto out;
|
||||
}
|
||||
|
||||
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
|
||||
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
|
||||
vma->vm_file, pgoff, vma_policy(vma));
|
||||
if (*prev) {
|
||||
vma = *prev;
|
||||
goto success;
|
||||
}
|
||||
|
||||
*prev = vma;
|
||||
|
||||
if (start != vma->vm_start) {
|
||||
ret = split_vma(mm, vma, start, 1);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (end != vma->vm_end) {
|
||||
ret = split_vma(mm, vma, end, 0);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
|
||||
success:
|
||||
/*
|
||||
* vm_flags is protected by the mmap_sem held in write mode.
|
||||
* It's okay if try_to_unmap_one unmaps a page just after we
|
||||
* set VM_LOCKED, make_pages_present below will bring it back.
|
||||
*/
|
||||
vma->vm_flags = newflags;
|
||||
|
||||
/*
|
||||
* Keep track of amount of locked VM.
|
||||
*/
|
||||
pages = (end - start) >> PAGE_SHIFT;
|
||||
if (newflags & VM_LOCKED) {
|
||||
pages = -pages;
|
||||
if (!(newflags & VM_IO))
|
||||
ret = make_pages_present(start, end);
|
||||
}
|
||||
|
||||
vma->vm_mm->locked_vm -= pages;
|
||||
out:
|
||||
if (ret == -ENOMEM)
|
||||
ret = -EAGAIN;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int do_mlock(unsigned long start, size_t len, int on)
|
||||
{
|
||||
unsigned long nstart, end, tmp;
|
||||
struct vm_area_struct * vma, * prev;
|
||||
int error;
|
||||
|
||||
len = PAGE_ALIGN(len);
|
||||
end = start + len;
|
||||
if (end < start)
|
||||
return -EINVAL;
|
||||
if (end == start)
|
||||
return 0;
|
||||
vma = find_vma_prev(current->mm, start, &prev);
|
||||
if (!vma || vma->vm_start > start)
|
||||
return -ENOMEM;
|
||||
|
||||
if (start > vma->vm_start)
|
||||
prev = vma;
|
||||
|
||||
for (nstart = start ; ; ) {
|
||||
unsigned int newflags;
|
||||
|
||||
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
|
||||
|
||||
newflags = vma->vm_flags | VM_LOCKED;
|
||||
if (!on)
|
||||
newflags &= ~VM_LOCKED;
|
||||
|
||||
tmp = vma->vm_end;
|
||||
if (tmp > end)
|
||||
tmp = end;
|
||||
error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
|
||||
if (error)
|
||||
break;
|
||||
nstart = tmp;
|
||||
if (nstart < prev->vm_end)
|
||||
nstart = prev->vm_end;
|
||||
if (nstart >= end)
|
||||
break;
|
||||
|
||||
vma = prev->vm_next;
|
||||
if (!vma || vma->vm_start != nstart) {
|
||||
error = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
asmlinkage long sys_mlock(unsigned long start, size_t len)
|
||||
{
|
||||
unsigned long locked;
|
||||
unsigned long lock_limit;
|
||||
int error = -ENOMEM;
|
||||
|
||||
if (!can_do_mlock())
|
||||
return -EPERM;
|
||||
|
||||
down_write(¤t->mm->mmap_sem);
|
||||
len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
|
||||
start &= PAGE_MASK;
|
||||
|
||||
locked = len >> PAGE_SHIFT;
|
||||
locked += current->mm->locked_vm;
|
||||
|
||||
lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
|
||||
lock_limit >>= PAGE_SHIFT;
|
||||
|
||||
/* check against resource limits */
|
||||
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
|
||||
error = do_mlock(start, len, 1);
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
return error;
|
||||
}
|
||||
|
||||
asmlinkage long sys_munlock(unsigned long start, size_t len)
|
||||
{
|
||||
int ret;
|
||||
|
||||
down_write(¤t->mm->mmap_sem);
|
||||
len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
|
||||
start &= PAGE_MASK;
|
||||
ret = do_mlock(start, len, 0);
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int do_mlockall(int flags)
|
||||
{
|
||||
struct vm_area_struct * vma, * prev = NULL;
|
||||
unsigned int def_flags = 0;
|
||||
|
||||
if (flags & MCL_FUTURE)
|
||||
def_flags = VM_LOCKED;
|
||||
current->mm->def_flags = def_flags;
|
||||
if (flags == MCL_FUTURE)
|
||||
goto out;
|
||||
|
||||
for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
|
||||
unsigned int newflags;
|
||||
|
||||
newflags = vma->vm_flags | VM_LOCKED;
|
||||
if (!(flags & MCL_CURRENT))
|
||||
newflags &= ~VM_LOCKED;
|
||||
|
||||
/* Ignore errors */
|
||||
mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
|
||||
}
|
||||
out:
|
||||
return 0;
|
||||
}
|
||||
|
||||
asmlinkage long sys_mlockall(int flags)
|
||||
{
|
||||
unsigned long lock_limit;
|
||||
int ret = -EINVAL;
|
||||
|
||||
if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
|
||||
goto out;
|
||||
|
||||
ret = -EPERM;
|
||||
if (!can_do_mlock())
|
||||
goto out;
|
||||
|
||||
down_write(¤t->mm->mmap_sem);
|
||||
|
||||
lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
|
||||
lock_limit >>= PAGE_SHIFT;
|
||||
|
||||
ret = -ENOMEM;
|
||||
if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
|
||||
capable(CAP_IPC_LOCK))
|
||||
ret = do_mlockall(flags);
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
asmlinkage long sys_munlockall(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
down_write(¤t->mm->mmap_sem);
|
||||
ret = do_mlockall(0);
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
|
||||
* shm segments) get accounted against the user_struct instead.
|
||||
*/
|
||||
static DEFINE_SPINLOCK(shmlock_user_lock);
|
||||
|
||||
int user_shm_lock(size_t size, struct user_struct *user)
|
||||
{
|
||||
unsigned long lock_limit, locked;
|
||||
int allowed = 0;
|
||||
|
||||
locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||
lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
|
||||
lock_limit >>= PAGE_SHIFT;
|
||||
spin_lock(&shmlock_user_lock);
|
||||
if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
|
||||
goto out;
|
||||
get_uid(user);
|
||||
user->locked_shm += locked;
|
||||
allowed = 1;
|
||||
out:
|
||||
spin_unlock(&shmlock_user_lock);
|
||||
return allowed;
|
||||
}
|
||||
|
||||
void user_shm_unlock(size_t size, struct user_struct *user)
|
||||
{
|
||||
spin_lock(&shmlock_user_lock);
|
||||
user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||
spin_unlock(&shmlock_user_lock);
|
||||
free_uid(user);
|
||||
}
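For illustration only (not part of this commit): a minimal userspace sketch of mlock(2)/munlock(2), the call path handled by sys_mlock() above. The 4 KB "secret" buffer is an assumption.

/* Illustrative userspace caller of mlock(2)/munlock(2); not part of the commit. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    static char secret[4096];           /* assumed sensitive buffer */

    /*
     * Pin the buffer so it is never written to swap.  Subject to
     * RLIMIT_MEMLOCK unless the caller has CAP_IPC_LOCK, as checked
     * in sys_mlock() above.
     */
    if (mlock(secret, sizeof(secret)) != 0) {
        perror("mlock");
        return 1;
    }
    memset(secret, 0x41, sizeof(secret));   /* ... use the locked memory ... */
    memset(secret, 0, sizeof(secret));
    munlock(secret, sizeof(secret));
    return 0;
}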
+282
@@ -0,0 +1,282 @@
|
||||
/*
|
||||
* mm/mprotect.c
|
||||
*
|
||||
* (C) Copyright 1994 Linus Torvalds
|
||||
* (C) Copyright 2002 Christoph Hellwig
|
||||
*
|
||||
* Address space accounting code <alan@redhat.com>
|
||||
* (C) Copyright 2002 Red Hat Inc, All Rights Reserved
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/shm.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/mempolicy.h>
|
||||
#include <linux/personality.h>
|
||||
#include <linux/syscalls.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/cacheflush.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
|
||||
unsigned long addr, unsigned long end, pgprot_t newprot)
|
||||
{
|
||||
pte_t *pte;
|
||||
|
||||
pte = pte_offset_map(pmd, addr);
|
||||
do {
|
||||
if (pte_present(*pte)) {
|
||||
pte_t ptent;
|
||||
|
||||
/* Avoid an SMP race with hardware updated dirty/clean
|
||||
* bits by wiping the pte and then setting the new pte
|
||||
* into place.
|
||||
*/
|
||||
ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot);
|
||||
set_pte_at(mm, addr, pte, ptent);
|
||||
lazy_mmu_prot_update(ptent);
|
||||
}
|
||||
} while (pte++, addr += PAGE_SIZE, addr != end);
|
||||
pte_unmap(pte - 1);
|
||||
}
|
||||
|
||||
static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
|
||||
unsigned long addr, unsigned long end, pgprot_t newprot)
|
||||
{
|
||||
pmd_t *pmd;
|
||||
unsigned long next;
|
||||
|
||||
pmd = pmd_offset(pud, addr);
|
||||
do {
|
||||
next = pmd_addr_end(addr, end);
|
||||
if (pmd_none_or_clear_bad(pmd))
|
||||
continue;
|
||||
change_pte_range(mm, pmd, addr, next, newprot);
|
||||
} while (pmd++, addr = next, addr != end);
|
||||
}
|
||||
|
||||
static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
|
||||
unsigned long addr, unsigned long end, pgprot_t newprot)
|
||||
{
|
||||
pud_t *pud;
|
||||
unsigned long next;
|
||||
|
||||
pud = pud_offset(pgd, addr);
|
||||
do {
|
||||
next = pud_addr_end(addr, end);
|
||||
if (pud_none_or_clear_bad(pud))
|
||||
continue;
|
||||
change_pmd_range(mm, pud, addr, next, newprot);
|
||||
} while (pud++, addr = next, addr != end);
|
||||
}
|
||||
|
||||
static void change_protection(struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long end, pgprot_t newprot)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
pgd_t *pgd;
|
||||
unsigned long next;
|
||||
unsigned long start = addr;
|
||||
|
||||
BUG_ON(addr >= end);
|
||||
pgd = pgd_offset(mm, addr);
|
||||
flush_cache_range(vma, addr, end);
|
||||
spin_lock(&mm->page_table_lock);
|
||||
do {
|
||||
next = pgd_addr_end(addr, end);
|
||||
if (pgd_none_or_clear_bad(pgd))
|
||||
continue;
|
||||
change_pud_range(mm, pgd, addr, next, newprot);
|
||||
} while (pgd++, addr = next, addr != end);
|
||||
flush_tlb_range(vma, start, end);
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
}
|
||||
|
||||
static int
|
||||
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
|
||||
unsigned long start, unsigned long end, unsigned long newflags)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
unsigned long oldflags = vma->vm_flags;
|
||||
long nrpages = (end - start) >> PAGE_SHIFT;
|
||||
unsigned long charged = 0;
|
||||
pgprot_t newprot;
|
||||
pgoff_t pgoff;
|
||||
int error;
|
||||
|
||||
if (newflags == oldflags) {
|
||||
*pprev = vma;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we make a private mapping writable we increase our commit;
|
||||
* but (without finer accounting) cannot reduce our commit if we
|
||||
* make it unwritable again.
|
||||
*
|
||||
* FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
|
||||
* a MAP_NORESERVE private mapping to writable will now reserve.
|
||||
*/
|
||||
if (newflags & VM_WRITE) {
|
||||
if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
|
||||
charged = nrpages;
|
||||
if (security_vm_enough_memory(charged))
|
||||
return -ENOMEM;
|
||||
newflags |= VM_ACCOUNT;
|
||||
}
|
||||
}
|
||||
|
||||
newprot = protection_map[newflags & 0xf];
|
||||
|
||||
/*
|
||||
* First try to merge with previous and/or next vma.
|
||||
*/
|
||||
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
|
||||
*pprev = vma_merge(mm, *pprev, start, end, newflags,
|
||||
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
|
||||
if (*pprev) {
|
||||
vma = *pprev;
|
||||
goto success;
|
||||
}
|
||||
|
||||
*pprev = vma;
|
||||
|
||||
if (start != vma->vm_start) {
|
||||
error = split_vma(mm, vma, start, 1);
|
||||
if (error)
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (end != vma->vm_end) {
|
||||
error = split_vma(mm, vma, end, 0);
|
||||
if (error)
|
||||
goto fail;
|
||||
}
|
||||
|
||||
success:
|
||||
/*
|
||||
* vm_flags and vm_page_prot are protected by the mmap_sem
|
||||
* held in write mode.
|
||||
*/
|
||||
vma->vm_flags = newflags;
|
||||
vma->vm_page_prot = newprot;
|
||||
change_protection(vma, start, end, newprot);
|
||||
__vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
|
||||
__vm_stat_account(mm, newflags, vma->vm_file, nrpages);
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
vm_unacct_memory(charged);
|
||||
return error;
|
||||
}
|
||||
|
||||
asmlinkage long
|
||||
sys_mprotect(unsigned long start, size_t len, unsigned long prot)
|
||||
{
|
||||
unsigned long vm_flags, nstart, end, tmp, reqprot;
|
||||
struct vm_area_struct *vma, *prev;
|
||||
int error = -EINVAL;
|
||||
const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
|
||||
prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
|
||||
if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
|
||||
return -EINVAL;
|
||||
|
||||
if (start & ~PAGE_MASK)
|
||||
return -EINVAL;
|
||||
if (!len)
|
||||
return 0;
|
||||
len = PAGE_ALIGN(len);
|
||||
end = start + len;
|
||||
if (end <= start)
|
||||
return -ENOMEM;
|
||||
if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM))
|
||||
return -EINVAL;
|
||||
|
||||
reqprot = prot;
|
||||
/*
|
||||
* Does the application expect PROT_READ to imply PROT_EXEC:
|
||||
*/
|
||||
if (unlikely((prot & PROT_READ) &&
|
||||
(current->personality & READ_IMPLIES_EXEC)))
|
||||
prot |= PROT_EXEC;
|
||||
|
||||
vm_flags = calc_vm_prot_bits(prot);
|
||||
|
||||
down_write(¤t->mm->mmap_sem);
|
||||
|
||||
vma = find_vma_prev(current->mm, start, &prev);
|
||||
error = -ENOMEM;
|
||||
if (!vma)
|
||||
goto out;
|
||||
if (unlikely(grows & PROT_GROWSDOWN)) {
|
||||
if (vma->vm_start >= end)
|
||||
goto out;
|
||||
start = vma->vm_start;
|
||||
error = -EINVAL;
|
||||
if (!(vma->vm_flags & VM_GROWSDOWN))
|
||||
goto out;
|
||||
}
|
||||
else {
|
||||
if (vma->vm_start > start)
|
||||
goto out;
|
||||
if (unlikely(grows & PROT_GROWSUP)) {
|
||||
end = vma->vm_end;
|
||||
error = -EINVAL;
|
||||
if (!(vma->vm_flags & VM_GROWSUP))
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
if (start > vma->vm_start)
|
||||
prev = vma;
|
||||
|
||||
for (nstart = start ; ; ) {
|
||||
unsigned long newflags;
|
||||
|
||||
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
|
||||
|
||||
if (is_vm_hugetlb_page(vma)) {
|
||||
error = -EACCES;
|
||||
goto out;
|
||||
}
|
||||
|
||||
newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
|
||||
|
||||
if ((newflags & ~(newflags >> 4)) & 0xf) {
|
||||
error = -EACCES;
|
||||
goto out;
|
||||
}
|
||||
|
||||
error = security_file_mprotect(vma, reqprot, prot);
|
||||
if (error)
|
||||
goto out;
|
||||
|
||||
tmp = vma->vm_end;
|
||||
if (tmp > end)
|
||||
tmp = end;
|
||||
error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
|
||||
if (error)
|
||||
goto out;
|
||||
nstart = tmp;
|
||||
|
||||
if (nstart < prev->vm_end)
|
||||
nstart = prev->vm_end;
|
||||
if (nstart >= end)
|
||||
goto out;
|
||||
|
||||
vma = prev->vm_next;
|
||||
if (!vma || vma->vm_start != nstart) {
|
||||
error = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
out:
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
return error;
|
||||
}
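For illustration only (not part of this commit): a minimal userspace sketch of mprotect(2), the entry point implemented above; a single anonymous page is an assumed setup.

/* Illustrative userspace caller of mprotect(2); not part of the commit. */
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
    size_t page = (size_t)sysconf(_SC_PAGESIZE);
    char *buf = mmap(NULL, page, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED)
        return 1;

    buf[0] = 'x';
    /* Drop write permission; a later store to buf would now fault. */
    if (mprotect(buf, page, PROT_READ) != 0) {
        perror("mprotect");
        return 1;
    }
    printf("first byte: %c\n", buf[0]); /* reads are still allowed */
    munmap(buf, page);
    return 0;
}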
+426
@@ -0,0 +1,426 @@
|
||||
/*
|
||||
* mm/mremap.c
|
||||
*
|
||||
* (C) Copyright 1996 Linus Torvalds
|
||||
*
|
||||
* Address space accounting code <alan@redhat.com>
|
||||
* (C) Copyright 2002 Red Hat Inc, All Rights Reserved
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/shm.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/syscalls.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/cacheflush.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pte_t *pte = NULL;
|
||||
|
||||
pgd = pgd_offset(mm, addr);
|
||||
if (pgd_none_or_clear_bad(pgd))
|
||||
goto end;
|
||||
|
||||
pud = pud_offset(pgd, addr);
|
||||
if (pud_none_or_clear_bad(pud))
|
||||
goto end;
|
||||
|
||||
pmd = pmd_offset(pud, addr);
|
||||
if (pmd_none_or_clear_bad(pmd))
|
||||
goto end;
|
||||
|
||||
pte = pte_offset_map_nested(pmd, addr);
|
||||
if (pte_none(*pte)) {
|
||||
pte_unmap_nested(pte);
|
||||
pte = NULL;
|
||||
}
|
||||
end:
|
||||
return pte;
|
||||
}
|
||||
|
||||
static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
|
||||
pgd = pgd_offset(mm, addr);
|
||||
if (pgd_none_or_clear_bad(pgd))
|
||||
return NULL;
|
||||
|
||||
pud = pud_offset(pgd, addr);
|
||||
if (pud_none_or_clear_bad(pud))
|
||||
return NULL;
|
||||
|
||||
pmd = pmd_offset(pud, addr);
|
||||
if (pmd_none_or_clear_bad(pmd))
|
||||
return NULL;
|
||||
|
||||
return pte_offset_map(pmd, addr);
|
||||
}
|
||||
|
||||
static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pte_t *pte = NULL;
|
||||
|
||||
pgd = pgd_offset(mm, addr);
|
||||
|
||||
pud = pud_alloc(mm, pgd, addr);
|
||||
if (!pud)
|
||||
return NULL;
|
||||
pmd = pmd_alloc(mm, pud, addr);
|
||||
if (pmd)
|
||||
pte = pte_alloc_map(mm, pmd, addr);
|
||||
return pte;
|
||||
}
|
||||
|
||||
static int
|
||||
move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
|
||||
struct vm_area_struct *new_vma, unsigned long new_addr)
|
||||
{
|
||||
struct address_space *mapping = NULL;
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
int error = 0;
|
||||
pte_t *src, *dst;
|
||||
|
||||
if (vma->vm_file) {
|
||||
/*
|
||||
* Subtle point from Rajesh Venkatasubramanian: before
|
||||
* moving file-based ptes, we must lock vmtruncate out,
|
||||
* since it might clean the dst vma before the src vma,
|
||||
* and we propagate stale pages into the dst afterward.
|
||||
*/
|
||||
mapping = vma->vm_file->f_mapping;
|
||||
spin_lock(&mapping->i_mmap_lock);
|
||||
if (new_vma->vm_truncate_count &&
|
||||
new_vma->vm_truncate_count != vma->vm_truncate_count)
|
||||
new_vma->vm_truncate_count = 0;
|
||||
}
|
||||
spin_lock(&mm->page_table_lock);
|
||||
|
||||
src = get_one_pte_map_nested(mm, old_addr);
|
||||
if (src) {
|
||||
/*
|
||||
* Look to see whether alloc_one_pte_map needs to perform a
|
||||
* memory allocation. If it does then we need to drop the
|
||||
* atomic kmap
|
||||
*/
|
||||
dst = get_one_pte_map(mm, new_addr);
|
||||
if (unlikely(!dst)) {
|
||||
pte_unmap_nested(src);
|
||||
if (mapping)
|
||||
spin_unlock(&mapping->i_mmap_lock);
|
||||
dst = alloc_one_pte_map(mm, new_addr);
|
||||
if (mapping && !spin_trylock(&mapping->i_mmap_lock)) {
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
spin_lock(&mapping->i_mmap_lock);
|
||||
spin_lock(&mm->page_table_lock);
|
||||
}
|
||||
src = get_one_pte_map_nested(mm, old_addr);
|
||||
}
|
||||
/*
|
||||
* Since alloc_one_pte_map can drop and re-acquire
|
||||
* page_table_lock, we should re-check the src entry...
|
||||
*/
|
||||
if (src) {
|
||||
if (dst) {
|
||||
pte_t pte;
|
||||
pte = ptep_clear_flush(vma, old_addr, src);
|
||||
set_pte_at(mm, new_addr, dst, pte);
|
||||
} else
|
||||
error = -ENOMEM;
|
||||
pte_unmap_nested(src);
|
||||
}
|
||||
if (dst)
|
||||
pte_unmap(dst);
|
||||
}
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
if (mapping)
|
||||
spin_unlock(&mapping->i_mmap_lock);
|
||||
return error;
|
||||
}
|
||||
|
||||
static unsigned long move_page_tables(struct vm_area_struct *vma,
|
||||
unsigned long old_addr, struct vm_area_struct *new_vma,
|
||||
unsigned long new_addr, unsigned long len)
|
||||
{
|
||||
unsigned long offset;
|
||||
|
||||
flush_cache_range(vma, old_addr, old_addr + len);
|
||||
|
||||
/*
|
||||
* This is not the clever way to do this, but we're taking the
|
||||
* easy way out on the assumption that most remappings will be
|
||||
* only a few pages.. This also makes error recovery easier.
|
||||
*/
|
||||
for (offset = 0; offset < len; offset += PAGE_SIZE) {
|
||||
if (move_one_page(vma, old_addr + offset,
|
||||
new_vma, new_addr + offset) < 0)
|
||||
break;
|
||||
cond_resched();
|
||||
}
|
||||
return offset;
|
||||
}
|
||||
|
||||
static unsigned long move_vma(struct vm_area_struct *vma,
|
||||
unsigned long old_addr, unsigned long old_len,
|
||||
unsigned long new_len, unsigned long new_addr)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct vm_area_struct *new_vma;
|
||||
unsigned long vm_flags = vma->vm_flags;
|
||||
unsigned long new_pgoff;
|
||||
unsigned long moved_len;
|
||||
unsigned long excess = 0;
|
||||
int split = 0;
|
||||
|
||||
/*
|
||||
* We'd prefer to avoid failure later on in do_munmap:
|
||||
* which may split one vma into three before unmapping.
|
||||
*/
|
||||
if (mm->map_count >= sysctl_max_map_count - 3)
|
||||
return -ENOMEM;
|
||||
|
||||
new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
|
||||
new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
|
||||
if (!new_vma)
|
||||
return -ENOMEM;
|
||||
|
||||
moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
|
||||
if (moved_len < old_len) {
|
||||
/*
|
||||
* On error, move entries back from new area to old,
|
||||
* which will succeed since page tables still there,
|
||||
* and then proceed to unmap new area instead of old.
|
||||
*/
|
||||
move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
|
||||
vma = new_vma;
|
||||
old_len = new_len;
|
||||
old_addr = new_addr;
|
||||
new_addr = -ENOMEM;
|
||||
}
|
||||
|
||||
/* Conceal VM_ACCOUNT so old reservation is not undone */
|
||||
if (vm_flags & VM_ACCOUNT) {
|
||||
vma->vm_flags &= ~VM_ACCOUNT;
|
||||
excess = vma->vm_end - vma->vm_start - old_len;
|
||||
if (old_addr > vma->vm_start &&
|
||||
old_addr + old_len < vma->vm_end)
|
||||
split = 1;
|
||||
}
|
||||
|
||||
if (do_munmap(mm, old_addr, old_len) < 0) {
|
||||
/* OOM: unable to split vma, just get accounts right */
|
||||
vm_unacct_memory(excess >> PAGE_SHIFT);
|
||||
excess = 0;
|
||||
}
|
||||
|
||||
/* Restore VM_ACCOUNT if one or two pieces of vma left */
|
||||
if (excess) {
|
||||
vma->vm_flags |= VM_ACCOUNT;
|
||||
if (split)
|
||||
vma->vm_next->vm_flags |= VM_ACCOUNT;
|
||||
}
|
||||
|
||||
mm->total_vm += new_len >> PAGE_SHIFT;
|
||||
__vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
|
||||
if (vm_flags & VM_LOCKED) {
|
||||
mm->locked_vm += new_len >> PAGE_SHIFT;
|
||||
if (new_len > old_len)
|
||||
make_pages_present(new_addr + old_len,
|
||||
new_addr + new_len);
|
||||
}
|
||||
|
||||
return new_addr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Expand (or shrink) an existing mapping, potentially moving it at the
|
||||
* same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
|
||||
*
|
||||
* MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
|
||||
* This option implies MREMAP_MAYMOVE.
|
||||
*/
|
||||
unsigned long do_mremap(unsigned long addr,
|
||||
unsigned long old_len, unsigned long new_len,
|
||||
unsigned long flags, unsigned long new_addr)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
unsigned long ret = -EINVAL;
|
||||
unsigned long charged = 0;
|
||||
|
||||
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
|
||||
goto out;
|
||||
|
||||
if (addr & ~PAGE_MASK)
|
||||
goto out;
|
||||
|
||||
old_len = PAGE_ALIGN(old_len);
|
||||
new_len = PAGE_ALIGN(new_len);
|
||||
|
||||
/*
|
||||
* We allow a zero old-len as a special case
|
||||
* for DOS-emu "duplicate shm area" thing. But
|
||||
* a zero new-len is nonsensical.
|
||||
*/
|
||||
if (!new_len)
|
||||
goto out;
|
||||
|
||||
/* new_addr is only valid if MREMAP_FIXED is specified */
|
||||
if (flags & MREMAP_FIXED) {
|
||||
if (new_addr & ~PAGE_MASK)
|
||||
goto out;
|
||||
if (!(flags & MREMAP_MAYMOVE))
|
||||
goto out;
|
||||
|
||||
if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
|
||||
goto out;
|
||||
|
||||
/* Check if the location we're moving into overlaps the
|
||||
* old location at all, and fail if it does.
|
||||
*/
|
||||
if ((new_addr <= addr) && (new_addr+new_len) > addr)
|
||||
goto out;
|
||||
|
||||
if ((addr <= new_addr) && (addr+old_len) > new_addr)
|
||||
goto out;
|
||||
|
||||
ret = do_munmap(current->mm, new_addr, new_len);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Always allow a shrinking remap: that just unmaps
|
||||
* the unnecessary pages..
|
||||
* do_munmap does all the needed commit accounting
|
||||
*/
|
||||
if (old_len >= new_len) {
|
||||
ret = do_munmap(current->mm, addr+new_len, old_len - new_len);
|
||||
if (ret && old_len != new_len)
|
||||
goto out;
|
||||
ret = addr;
|
||||
if (!(flags & MREMAP_FIXED) || (new_addr == addr))
|
||||
goto out;
|
||||
old_len = new_len;
|
||||
}
|
||||
|
||||
/*
|
||||
* Ok, we need to grow.. or relocate.
|
||||
*/
|
||||
ret = -EFAULT;
|
||||
vma = find_vma(current->mm, addr);
|
||||
if (!vma || vma->vm_start > addr)
|
||||
goto out;
|
||||
if (is_vm_hugetlb_page(vma)) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
/* We can't remap across vm area boundaries */
|
||||
if (old_len > vma->vm_end - addr)
|
||||
goto out;
|
||||
if (vma->vm_flags & VM_DONTEXPAND) {
|
||||
if (new_len > old_len)
|
||||
goto out;
|
||||
}
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
unsigned long locked, lock_limit;
|
||||
locked = current->mm->locked_vm << PAGE_SHIFT;
|
||||
lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
|
||||
locked += new_len - old_len;
|
||||
ret = -EAGAIN;
|
||||
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
|
||||
goto out;
|
||||
}
|
||||
ret = -ENOMEM;
|
||||
if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len)
|
||||
> current->signal->rlim[RLIMIT_AS].rlim_cur)
|
||||
goto out;
|
||||
|
||||
if (vma->vm_flags & VM_ACCOUNT) {
|
||||
charged = (new_len - old_len) >> PAGE_SHIFT;
|
||||
if (security_vm_enough_memory(charged))
|
||||
goto out_nc;
|
||||
}
|
||||
|
||||
/* old_len exactly to the end of the area..
|
||||
* And we're not relocating the area.
|
||||
*/
|
||||
if (old_len == vma->vm_end - addr &&
|
||||
!((flags & MREMAP_FIXED) && (addr != new_addr)) &&
|
||||
(old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
|
||||
unsigned long max_addr = TASK_SIZE;
|
||||
if (vma->vm_next)
|
||||
max_addr = vma->vm_next->vm_start;
|
||||
/* can we just expand the current mapping? */
|
||||
if (max_addr - addr >= new_len) {
|
||||
int pages = (new_len - old_len) >> PAGE_SHIFT;
|
||||
|
||||
vma_adjust(vma, vma->vm_start,
|
||||
addr + new_len, vma->vm_pgoff, NULL);
|
||||
|
||||
current->mm->total_vm += pages;
|
||||
__vm_stat_account(vma->vm_mm, vma->vm_flags,
|
||||
vma->vm_file, pages);
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
current->mm->locked_vm += pages;
|
||||
make_pages_present(addr + old_len,
|
||||
addr + new_len);
|
||||
}
|
||||
ret = addr;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* We weren't able to just expand or shrink the area,
|
||||
* we need to create a new one and move it..
|
||||
*/
|
||||
ret = -ENOMEM;
|
||||
if (flags & MREMAP_MAYMOVE) {
|
||||
if (!(flags & MREMAP_FIXED)) {
|
||||
unsigned long map_flags = 0;
|
||||
if (vma->vm_flags & VM_MAYSHARE)
|
||||
map_flags |= MAP_SHARED;
|
||||
|
||||
new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
|
||||
vma->vm_pgoff, map_flags);
|
||||
ret = new_addr;
|
||||
if (new_addr & ~PAGE_MASK)
|
||||
goto out;
|
||||
}
|
||||
ret = move_vma(vma, addr, old_len, new_len, new_addr);
|
||||
}
|
||||
out:
|
||||
if (ret & ~PAGE_MASK)
|
||||
vm_unacct_memory(charged);
|
||||
out_nc:
|
||||
return ret;
|
||||
}
|
||||
|
||||
asmlinkage unsigned long sys_mremap(unsigned long addr,
|
||||
unsigned long old_len, unsigned long new_len,
|
||||
unsigned long flags, unsigned long new_addr)
|
||||
{
|
||||
unsigned long ret;
|
||||
|
||||
down_write(¤t->mm->mmap_sem);
|
||||
ret = do_mremap(addr, old_len, new_len, flags, new_addr);
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
return ret;
|
||||
}
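For illustration only (not part of this commit): a minimal userspace sketch of mremap(2) growing a mapping, which exercises the expand-in-place and move_vma() paths above. The hard-coded 4 KB page size is an assumption.

/* Illustrative userspace caller of mremap(2); not part of the commit. */
#define _GNU_SOURCE                     /* mremap() is Linux-specific */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    size_t old_len = 4096, new_len = 8 * 4096;  /* assumes 4K pages */
    void *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED)
        return 1;

    /*
     * Grow the mapping; MREMAP_MAYMOVE lets the kernel relocate it
     * (the move_vma() path above) if it cannot be expanded in place.
     */
    void *q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
    if (q == MAP_FAILED) {
        perror("mremap");
        return 1;
    }
    printf("mapping %s\n", q == p ? "grew in place" : "was moved");
    munmap(q, new_len);
    return 0;
}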
+236
@@ -0,0 +1,236 @@
|
||||
/*
|
||||
* linux/mm/msync.c
|
||||
*
|
||||
* Copyright (C) 1994-1999 Linus Torvalds
|
||||
*/
|
||||
|
||||
/*
|
||||
* The msync() system call.
|
||||
*/
|
||||
#include <linux/slab.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/syscalls.h>
|
||||
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
/*
|
||||
* Called with mm->page_table_lock held to protect against other
|
||||
* threads/the swapper from ripping pte's out from under us.
|
||||
*/
|
||||
|
||||
static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
unsigned long addr, unsigned long end)
|
||||
{
|
||||
pte_t *pte;
|
||||
|
||||
pte = pte_offset_map(pmd, addr);
|
||||
do {
|
||||
unsigned long pfn;
|
||||
struct page *page;
|
||||
|
||||
if (!pte_present(*pte))
|
||||
continue;
|
||||
pfn = pte_pfn(*pte);
|
||||
if (!pfn_valid(pfn))
|
||||
continue;
|
||||
page = pfn_to_page(pfn);
|
||||
if (PageReserved(page))
|
||||
continue;
|
||||
|
||||
if (ptep_clear_flush_dirty(vma, addr, pte) ||
|
||||
page_test_and_clear_dirty(page))
|
||||
set_page_dirty(page);
|
||||
} while (pte++, addr += PAGE_SIZE, addr != end);
|
||||
pte_unmap(pte - 1);
|
||||
}
|
||||
|
||||
static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
|
||||
unsigned long addr, unsigned long end)
|
||||
{
|
||||
pmd_t *pmd;
|
||||
unsigned long next;
|
||||
|
||||
pmd = pmd_offset(pud, addr);
|
||||
do {
|
||||
next = pmd_addr_end(addr, end);
|
||||
if (pmd_none_or_clear_bad(pmd))
|
||||
continue;
|
||||
sync_pte_range(vma, pmd, addr, next);
|
||||
} while (pmd++, addr = next, addr != end);
|
||||
}
|
||||
|
||||
static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
|
||||
unsigned long addr, unsigned long end)
|
||||
{
|
||||
pud_t *pud;
|
||||
unsigned long next;
|
||||
|
||||
pud = pud_offset(pgd, addr);
|
||||
do {
|
||||
next = pud_addr_end(addr, end);
|
||||
if (pud_none_or_clear_bad(pud))
|
||||
continue;
|
||||
sync_pmd_range(vma, pud, addr, next);
|
||||
} while (pud++, addr = next, addr != end);
|
||||
}
|
||||
|
||||
static void sync_page_range(struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long end)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
pgd_t *pgd;
|
||||
unsigned long next;
|
||||
|
||||
/* For hugepages we can't go walking the page table normally,
|
||||
* but that's ok, hugetlbfs is memory based, so we don't need
|
||||
* to do anything more on an msync() */
|
||||
if (is_vm_hugetlb_page(vma))
|
||||
return;
|
||||
|
||||
BUG_ON(addr >= end);
|
||||
pgd = pgd_offset(mm, addr);
|
||||
flush_cache_range(vma, addr, end);
|
||||
spin_lock(&mm->page_table_lock);
|
||||
do {
|
||||
next = pgd_addr_end(addr, end);
|
||||
if (pgd_none_or_clear_bad(pgd))
|
||||
continue;
|
||||
sync_pud_range(vma, pgd, addr, next);
|
||||
} while (pgd++, addr = next, addr != end);
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PREEMPT
|
||||
static inline void filemap_sync(struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long end)
|
||||
{
|
||||
const size_t chunk = 64 * 1024; /* bytes */
|
||||
unsigned long next;
|
||||
|
||||
do {
|
||||
next = addr + chunk;
|
||||
if (next > end || next < addr)
|
||||
next = end;
|
||||
sync_page_range(vma, addr, next);
|
||||
cond_resched();
|
||||
} while (addr = next, addr != end);
|
||||
}
|
||||
#else
|
||||
static inline void filemap_sync(struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long end)
|
||||
{
|
||||
sync_page_range(vma, addr, end);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
* MS_SYNC syncs the entire file - including mappings.
*
* MS_ASYNC does not start I/O (it used to, up to 2.5.67). Instead, it just
* marks the relevant pages dirty. The application may now run fsync() to
* write out the dirty pages and wait on the writeout and check the result.
* Or the application may run fadvise(FADV_DONTNEED) against the fd to start
* async writeout immediately.
* So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
* applications.
*/
|
||||
static int msync_interval(struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long end, int flags)
|
||||
{
|
||||
int ret = 0;
|
||||
struct file *file = vma->vm_file;
|
||||
|
||||
if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
|
||||
return -EBUSY;
|
||||
|
||||
if (file && (vma->vm_flags & VM_SHARED)) {
|
||||
filemap_sync(vma, addr, end);
|
||||
|
||||
if (flags & MS_SYNC) {
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
int err;
|
||||
|
||||
ret = filemap_fdatawrite(mapping);
|
||||
if (file->f_op && file->f_op->fsync) {
|
||||
/*
|
||||
* We don't take i_sem here because mmap_sem
|
||||
* is already held.
|
||||
*/
|
||||
err = file->f_op->fsync(file,file->f_dentry,1);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
}
|
||||
err = filemap_fdatawait(mapping);
|
||||
if (!ret)
|
||||
ret = err;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
|
||||
{
|
||||
unsigned long end;
|
||||
struct vm_area_struct *vma;
|
||||
int unmapped_error, error = -EINVAL;
|
||||
|
||||
if (flags & MS_SYNC)
|
||||
current->flags |= PF_SYNCWRITE;
|
||||
|
||||
down_read(¤t->mm->mmap_sem);
|
||||
if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
|
||||
goto out;
|
||||
if (start & ~PAGE_MASK)
|
||||
goto out;
|
||||
if ((flags & MS_ASYNC) && (flags & MS_SYNC))
|
||||
goto out;
|
||||
error = -ENOMEM;
|
||||
len = (len + ~PAGE_MASK) & PAGE_MASK;
|
||||
end = start + len;
|
||||
if (end < start)
|
||||
goto out;
|
||||
error = 0;
|
||||
if (end == start)
|
||||
goto out;
|
||||
/*
|
||||
* If the interval [start,end) covers some unmapped address ranges,
|
||||
* just ignore them, but return -ENOMEM at the end.
|
||||
*/
|
||||
vma = find_vma(current->mm, start);
|
||||
unmapped_error = 0;
|
||||
for (;;) {
|
||||
/* Still start < end. */
|
||||
error = -ENOMEM;
|
||||
if (!vma)
|
||||
goto out;
|
||||
/* Here start < vma->vm_end. */
|
||||
if (start < vma->vm_start) {
|
||||
unmapped_error = -ENOMEM;
|
||||
start = vma->vm_start;
|
||||
}
|
||||
/* Here vma->vm_start <= start < vma->vm_end. */
|
||||
if (end <= vma->vm_end) {
|
||||
if (start < end) {
|
||||
error = msync_interval(vma, start, end, flags);
|
||||
if (error)
|
||||
goto out;
|
||||
}
|
||||
error = unmapped_error;
|
||||
goto out;
|
||||
}
|
||||
/* Here vma->vm_start <= start < vma->vm_end < end. */
|
||||
error = msync_interval(vma, start, vma->vm_end, flags);
|
||||
if (error)
|
||||
goto out;
|
||||
start = vma->vm_end;
|
||||
vma = vma->vm_next;
|
||||
}
|
||||
out:
|
||||
up_read(¤t->mm->mmap_sem);
|
||||
current->flags &= ~PF_SYNCWRITE;
|
||||
return error;
|
||||
}
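For illustration only (not part of this commit): a minimal userspace sketch of msync(2) on a shared file mapping, matching the MS_SYNC/MS_ASYNC behaviour described in the comment above. The file name "data.bin" and the 4096-byte length are assumptions (the file must already exist and be at least that long).

/* Illustrative userspace caller of msync(2); not part of the commit. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
    int fd = open("data.bin", O_RDWR);  /* assumed pre-existing file */
    if (fd < 0)
        return 1;
    char *map = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                     MAP_SHARED, fd, 0);
    if (map == MAP_FAILED)
        return 1;

    memcpy(map, "hello", 5);
    /*
     * MS_SYNC writes the dirty pages back and waits for completion;
     * MS_ASYNC would only mark them dirty, per the comment above.
     */
    msync(map, 4096, MS_SYNC);
    munmap(map, 4096);
    close(fd);
    return 0;
}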
+1180
File diff suppressed because it is too large
+292
@@ -0,0 +1,292 @@
|
||||
/*
|
||||
* linux/mm/oom_kill.c
|
||||
*
|
||||
* Copyright (C) 1998,2000 Rik van Riel
|
||||
* Thanks go out to Claus Fischer for some serious inspiration and
|
||||
* for goading me into coding this file...
|
||||
*
|
||||
* The routines in this file are used to kill a process when
|
||||
* we're seriously out of memory. This gets called from kswapd()
|
||||
* in linux/mm/vmscan.c when we really run out of memory.
|
||||
*
|
||||
* Since we won't call these routines often (on a well-configured
|
||||
* machine) this file will double as a 'coding guide' and a signpost
|
||||
* for newbie kernel hackers. It features several pointers to major
|
||||
* kernel subsystems and hints as to where to find out what things do.
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/timex.h>
|
||||
#include <linux/jiffies.h>
|
||||
|
||||
/* #define DEBUG */
|
||||
|
||||
/**
* badness - calculate a numeric value for how bad this task has been
* @p: task struct of the task whose badness we should calculate
* @uptime: current uptime in seconds
*
* The formula used is relatively simple and documented inline in the
* function. The main rationale is that we want to select a good task
* to kill when we run out of memory.
*
* Good in this context means that:
* 1) we lose the minimum amount of work done
* 2) we recover a large amount of memory
* 3) we don't kill anything innocent of eating tons of memory
* 4) we want to kill the minimum number of processes (one)
* 5) we try to kill the process the user expects us to kill, this
*    algorithm has been meticulously tuned to meet the principle
*    of least surprise ... (be careful when you change it)
*/
|
||||
|
||||
unsigned long badness(struct task_struct *p, unsigned long uptime)
|
||||
{
|
||||
unsigned long points, cpu_time, run_time, s;
|
||||
struct list_head *tsk;
|
||||
|
||||
if (!p->mm)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* The memory size of the process is the basis for the badness.
|
||||
*/
|
||||
points = p->mm->total_vm;
|
||||
|
||||
/*
* Processes which fork a lot of child processes are likely
* a good choice. We add the vmsize of the children if they
* have their own mm. This prevents forking servers from flooding
* the machine with an endless number of children.
*/
|
||||
list_for_each(tsk, &p->children) {
|
||||
struct task_struct *chld;
|
||||
chld = list_entry(tsk, struct task_struct, sibling);
|
||||
if (chld->mm != p->mm && chld->mm)
|
||||
points += chld->mm->total_vm;
|
||||
}
|
||||
|
||||
/*
|
||||
* CPU time is in tens of seconds and run time is in thousands
|
||||
* of seconds. There is no particular reason for this other than
|
||||
* that it turned out to work very well in practice.
|
||||
*/
|
||||
cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
|
||||
>> (SHIFT_HZ + 3);
|
||||
|
||||
if (uptime >= p->start_time.tv_sec)
|
||||
run_time = (uptime - p->start_time.tv_sec) >> 10;
|
||||
else
|
||||
run_time = 0;
|
||||
|
||||
s = int_sqrt(cpu_time);
|
||||
if (s)
|
||||
points /= s;
|
||||
s = int_sqrt(int_sqrt(run_time));
|
||||
if (s)
|
||||
points /= s;
|
||||
|
||||
/*
|
||||
* Niced processes are most likely less important, so double
|
||||
* their badness points.
|
||||
*/
|
||||
if (task_nice(p) > 0)
|
||||
points *= 2;
|
||||
|
||||
/*
|
||||
* Superuser processes are usually more important, so we make it
|
||||
* less likely that we kill those.
|
||||
*/
|
||||
if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||
|
||||
p->uid == 0 || p->euid == 0)
|
||||
points /= 4;
|
||||
|
||||
/*
|
||||
* We don't want to kill a process with direct hardware access.
|
||||
* Not only could that mess up the hardware, but usually users
|
||||
* tend to only have this flag set on applications they think
|
||||
* of as important.
|
||||
*/
|
||||
if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
|
||||
points /= 4;
|
||||
|
||||
/*
|
||||
* Adjust the score by oomkilladj.
|
||||
*/
|
||||
if (p->oomkilladj) {
|
||||
if (p->oomkilladj > 0)
|
||||
points <<= p->oomkilladj;
|
||||
else
|
||||
points >>= -(p->oomkilladj);
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
|
||||
p->pid, p->comm, points);
|
||||
#endif
|
||||
return points;
|
||||
}
|
||||
|
||||
/*
|
||||
* Simple selection loop. We chose the process with the highest
|
||||
* number of 'points'. We expect the caller will lock the tasklist.
|
||||
*
|
||||
* (not docbooked, we don't want this one cluttering up the manual)
|
||||
*/
|
||||
static struct task_struct * select_bad_process(void)
|
||||
{
|
||||
unsigned long maxpoints = 0;
|
||||
struct task_struct *g, *p;
|
||||
struct task_struct *chosen = NULL;
|
||||
struct timespec uptime;
|
||||
|
||||
do_posix_clock_monotonic_gettime(&uptime);
|
||||
do_each_thread(g, p)
|
||||
/* skip the init task with pid == 1 */
|
||||
if (p->pid > 1) {
|
||||
unsigned long points;
|
||||
|
||||
/*
* This task is in the process of releasing memory, so wait for
* it to finish before killing some other task by mistake.
*/
|
||||
if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) || (p->flags & PF_EXITING)) &&
|
||||
!(p->flags & PF_DEAD))
|
||||
return ERR_PTR(-1UL);
|
||||
if (p->flags & PF_SWAPOFF)
|
||||
return p;
|
||||
|
||||
points = badness(p, uptime.tv_sec);
|
||||
if (points > maxpoints || !chosen) {
|
||||
chosen = p;
|
||||
maxpoints = points;
|
||||
}
|
||||
}
|
||||
while_each_thread(g, p);
|
||||
return chosen;
|
||||
}
|
||||
|
||||
/*
* We must be careful to never send SIGKILL to a process with
* CAP_SYS_RAW_IO set; send SIGTERM instead (but it's unlikely that
* we select a process with CAP_SYS_RAW_IO set).
*/
|
||||
static void __oom_kill_task(task_t *p)
|
||||
{
|
||||
if (p->pid == 1) {
|
||||
WARN_ON(1);
|
||||
printk(KERN_WARNING "tried to kill init!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
task_lock(p);
|
||||
if (!p->mm || p->mm == &init_mm) {
|
||||
WARN_ON(1);
|
||||
printk(KERN_WARNING "tried to kill an mm-less task!\n");
|
||||
task_unlock(p);
|
||||
return;
|
||||
}
|
||||
task_unlock(p);
|
||||
printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm);
|
||||
|
||||
/*
|
||||
* We give our sacrificial lamb high priority and access to
|
||||
* all the memory it needs. That way it should be able to
|
||||
* exit() and clear out its resources quickly...
|
||||
*/
|
||||
p->time_slice = HZ;
|
||||
set_tsk_thread_flag(p, TIF_MEMDIE);
|
||||
|
||||
force_sig(SIGKILL, p);
|
||||
}
|
||||
|
||||
static struct mm_struct *oom_kill_task(task_t *p)
|
||||
{
|
||||
struct mm_struct *mm = get_task_mm(p);
|
||||
task_t * g, * q;
|
||||
|
||||
if (!mm)
|
||||
return NULL;
|
||||
if (mm == &init_mm) {
|
||||
mmput(mm);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
__oom_kill_task(p);
|
||||
/*
|
||||
* kill all processes that share the ->mm (i.e. all threads),
|
||||
* but are in a different thread group
|
||||
*/
|
||||
do_each_thread(g, q)
|
||||
if (q->mm == mm && q->tgid != p->tgid)
|
||||
__oom_kill_task(q);
|
||||
while_each_thread(g, q);
|
||||
|
||||
return mm;
|
||||
}
|
||||
|
||||
static struct mm_struct *oom_kill_process(struct task_struct *p)
|
||||
{
|
||||
struct mm_struct *mm;
|
||||
struct task_struct *c;
|
||||
struct list_head *tsk;
|
||||
|
||||
/* Try to kill a child first */
|
||||
list_for_each(tsk, &p->children) {
|
||||
c = list_entry(tsk, struct task_struct, sibling);
|
||||
if (c->mm == p->mm)
|
||||
continue;
|
||||
mm = oom_kill_task(c);
|
||||
if (mm)
|
||||
return mm;
|
||||
}
|
||||
return oom_kill_task(p);
|
||||
}
|
||||
|
||||
/**
* out_of_memory - kill the "best" process when we run out of memory
*
* If we run out of memory, we have the choice between killing a
* random task (bad), letting the system crash (worse), or trying to
* be smart about which process to kill. Note that we don't have to
* be perfect here, we just have to be good.
*/
|
||||
void out_of_memory(unsigned int __nocast gfp_mask)
|
||||
{
|
||||
struct mm_struct *mm = NULL;
|
||||
task_t * p;
|
||||
|
||||
read_lock(&tasklist_lock);
|
||||
retry:
|
||||
p = select_bad_process();
|
||||
|
||||
if (PTR_ERR(p) == -1UL)
|
||||
goto out;
|
||||
|
||||
/* Found nothing?!?! Either we hang forever, or we panic. */
|
||||
if (!p) {
|
||||
read_unlock(&tasklist_lock);
|
||||
show_free_areas();
|
||||
panic("Out of memory and no killable processes...\n");
|
||||
}
|
||||
|
||||
printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
|
||||
show_free_areas();
|
||||
mm = oom_kill_process(p);
|
||||
if (!mm)
|
||||
goto retry;
|
||||
|
||||
out:
|
||||
read_unlock(&tasklist_lock);
|
||||
if (mm)
|
||||
mmput(mm);
|
||||
|
||||
/*
|
||||
* Give "p" a good chance of killing itself before we
|
||||
* retry to allocate memory.
|
||||
*/
|
||||
__set_current_state(TASK_INTERRUPTIBLE);
|
||||
schedule_timeout(1);
|
||||
}
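For intuition only (not part of this commit): a tiny userspace model of the badness() scoring above. The input numbers are invented, and it mirrors only the divide-by-square-root shaping and the nice doubling, not the capability or oomkilladj adjustments. Compile with -lm.

/* Simplified model of the badness() heuristic; invented numbers, not kernel code. */
#include <math.h>
#include <stdio.h>

int main(void)
{
    unsigned long points = 200000;      /* total_vm, in pages (assumed) */
    unsigned long cpu_time = 50;        /* tens of seconds of CPU (assumed) */
    unsigned long run_time = 3;         /* thousands of seconds alive (assumed) */
    unsigned long s;

    s = (unsigned long)sqrt((double)cpu_time);
    if (s)
        points /= s;                    /* heavy CPU use lowers the score */
    s = (unsigned long)sqrt(sqrt((double)run_time));
    if (s)
        points /= s;                    /* long lifetime lowers the score */

    points *= 2;                        /* positively niced tasks score double */
    printf("badness ~ %lu (higher is killed first)\n", points);
    return 0;
}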
Some files were not shown because too many files have changed in this diff.