Merge branch 'akpm'

* akpm: (182 commits)
  fbdev: bf54x-lq043fb: use kzalloc over kmalloc/memset
  fbdev: *bfin*: fix __dev{init,exit} markings
  fbdev: *bfin*: drop unnecessary calls to memset
  fbdev: bfin-t350mcqb-fb: drop unused local variables
  fbdev: blackfin has __raw I/O accessors, so use them in fb.h
  fbdev: s1d13xxxfb: add accelerated bitblt functions
  tcx: use standard fields for framebuffer physical address and length
  fbdev: add support for handoff from firmware to hw framebuffers
  intelfb: fix a bug when changing video timing
  fbdev: use framebuffer_release() for freeing fb_info structures
  radeon: P2G2CLK_ALWAYS_ONb tested twice, should 2nd be P2G2CLK_DAC_ALWAYS_ONb?
  s3c-fb: CPUFREQ frequency scaling support
  s3c-fb: fix resource releasing on error during probing
  carminefb: fix possible access beyond end of carmine_modedb[]
  acornfb: remove fb_mmap function
  mb862xxfb: use CONFIG_OF instead of CONFIG_PPC_OF
  mb862xxfb: restrict compilation of platform driver to PPC
  Samsung SoC Framebuffer driver: add Alpha Channel support
  atmel-lcdc: fix pixclock upper bound detection
  offb: use framebuffer_alloc() to allocate fb_info struct
  ...

Manually fix up conflicts due to kmemcheck in mm/slab.c
Linus Torvalds
2009-06-16 19:50:13 -07:00
247 changed files with 5676 additions and 2693 deletions
+1 -13
@@ -203,25 +203,13 @@ config VIRT_TO_BUS
def_bool y
depends on !ARCH_NO_VIRT_TO_BUS
config UNEVICTABLE_LRU
bool "Add LRU list to track non-evictable pages"
default y
help
Keeps unevictable pages off of the active and inactive pageout
lists, so kswapd will not waste CPU time or have its balancing
algorithms thrown off by scanning these pages. Selecting this
will use one page flag and increase the code size a little,
say Y unless you know what you are doing.
See Documentation/vm/unevictable-lru.txt for more information.
config HAVE_MLOCK
bool
default y if MMU=y
config HAVE_MLOCKED_PAGE_BIT
bool
default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y
default y if HAVE_MLOCK=y
config MMU_NOTIFIER
bool
+1
@@ -12,6 +12,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
page_isolation.o mm_init.o $(mmu-y)
obj-y += init-mm.o
obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
obj-$(CONFIG_BOUNCE) += bounce.o
+1 -1
@@ -101,7 +101,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
ret = force_page_cache_readahead(mapping, file,
start_index,
max_sane_readahead(nrpages));
nrpages);
if (ret > 0)
ret = 0;
break;
+96 -73
@@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
{
if (cpuset_do_page_mem_spread()) {
int n = cpuset_mem_spread_node();
return alloc_pages_node(n, gfp, 0);
return alloc_pages_exact_node(n, gfp, 0);
}
return alloc_pages(gfp, 0);
}
@@ -1004,9 +1004,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait);
static void shrink_readahead_size_eio(struct file *filp,
struct file_ra_state *ra)
{
if (!ra->ra_pages)
return;
ra->ra_pages /= 4;
}
@@ -1390,8 +1387,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
return -EINVAL;
force_page_cache_readahead(mapping, filp, index,
max_sane_readahead(nr));
force_page_cache_readahead(mapping, filp, index, nr);
return 0;
}
@@ -1457,6 +1453,73 @@ static int page_cache_read(struct file *file, pgoff_t offset)
#define MMAP_LOTSAMISS (100)
/*
* Synchronous readahead happens when we don't even find
* a page in the page cache at all.
*/
static void do_sync_mmap_readahead(struct vm_area_struct *vma,
struct file_ra_state *ra,
struct file *file,
pgoff_t offset)
{
unsigned long ra_pages;
struct address_space *mapping = file->f_mapping;
/* If we don't want any read-ahead, don't bother */
if (VM_RandomReadHint(vma))
return;
if (VM_SequentialReadHint(vma) ||
offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
page_cache_sync_readahead(mapping, ra, file, offset,
ra->ra_pages);
return;
}
if (ra->mmap_miss < INT_MAX)
ra->mmap_miss++;
/*
* Do we miss much more than hit in this file? If so,
* stop bothering with read-ahead. It will only hurt.
*/
if (ra->mmap_miss > MMAP_LOTSAMISS)
return;
/*
* mmap read-around
*/
ra_pages = max_sane_readahead(ra->ra_pages);
if (ra_pages) {
ra->start = max_t(long, 0, offset - ra_pages/2);
ra->size = ra_pages;
ra->async_size = 0;
ra_submit(ra, mapping, file);
}
}
/*
* Asynchronous readahead happens when we find the page and PG_readahead,
* so we want to possibly extend the readahead further..
*/
static void do_async_mmap_readahead(struct vm_area_struct *vma,
struct file_ra_state *ra,
struct file *file,
struct page *page,
pgoff_t offset)
{
struct address_space *mapping = file->f_mapping;
/* If we don't want any read-ahead, don't bother */
if (VM_RandomReadHint(vma))
return;
if (ra->mmap_miss > 0)
ra->mmap_miss--;
if (PageReadahead(page))
page_cache_async_readahead(mapping, ra, file,
page, offset, ra->ra_pages);
}
/**
* filemap_fault - read in file data for page fault handling
* @vma: vma in which the fault was taken
@@ -1476,78 +1539,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct address_space *mapping = file->f_mapping;
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
struct page *page;
pgoff_t size;
int did_readaround = 0;
int ret = 0;
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (vmf->pgoff >= size)
if (offset >= size)
return VM_FAULT_SIGBUS;
/* If we don't want any read-ahead, don't bother */
if (VM_RandomReadHint(vma))
goto no_cached_page;
/*
* Do we have something in the page cache already?
*/
retry_find:
page = find_lock_page(mapping, vmf->pgoff);
/*
* For sequential accesses, we use the generic readahead logic.
*/
if (VM_SequentialReadHint(vma)) {
if (!page) {
page_cache_sync_readahead(mapping, ra, file,
vmf->pgoff, 1);
page = find_lock_page(mapping, vmf->pgoff);
if (!page)
goto no_cached_page;
}
if (PageReadahead(page)) {
page_cache_async_readahead(mapping, ra, file, page,
vmf->pgoff, 1);
}
}
if (!page) {
unsigned long ra_pages;
ra->mmap_miss++;
page = find_get_page(mapping, offset);
if (likely(page)) {
/*
* Do we miss much more than hit in this file? If so,
* stop bothering with read-ahead. It will only hurt.
* We found the page, so try async readahead before
* waiting for the lock.
*/
if (ra->mmap_miss > MMAP_LOTSAMISS)
do_async_mmap_readahead(vma, ra, file, page, offset);
lock_page(page);
/* Did it get truncated? */
if (unlikely(page->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto no_cached_page;
/*
* To keep the pgmajfault counter straight, we need to
* check did_readaround, as this is an inner loop.
*/
if (!did_readaround) {
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
}
did_readaround = 1;
ra_pages = max_sane_readahead(file->f_ra.ra_pages);
if (ra_pages) {
pgoff_t start = 0;
if (vmf->pgoff > ra_pages / 2)
start = vmf->pgoff - ra_pages / 2;
do_page_cache_readahead(mapping, file, start, ra_pages);
}
page = find_lock_page(mapping, vmf->pgoff);
} else {
/* No page in the page cache at all */
do_sync_mmap_readahead(vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
ret = VM_FAULT_MAJOR;
retry_find:
page = find_lock_page(mapping, offset);
if (!page)
goto no_cached_page;
}
if (!did_readaround)
ra->mmap_miss--;
/*
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
@@ -1555,18 +1584,18 @@ retry_find:
if (unlikely(!PageUptodate(page)))
goto page_not_uptodate;
/* Must recheck i_size under page lock */
/*
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (unlikely(vmf->pgoff >= size)) {
if (unlikely(offset >= size)) {
unlock_page(page);
page_cache_release(page);
return VM_FAULT_SIGBUS;
}
/*
* Found the page and have a reference on it.
*/
ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
vmf->page = page;
return ret | VM_FAULT_LOCKED;
@@ -1575,7 +1604,7 @@ no_cached_page:
* We're only likely to ever get here if MADV_RANDOM is in
* effect.
*/
error = page_cache_read(file, vmf->pgoff);
error = page_cache_read(file, offset);
/*
* The page we want has now been added to the page cache.
@@ -1595,12 +1624,6 @@ no_cached_page:
return VM_FAULT_SIGBUS;
page_not_uptodate:
/* IO error path */
if (!did_readaround) {
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
}
/*
* Umm, take care of errors if the page isn't up-to-date.
* Try to re-read it _once_. We do this synchronously,
+67 -39
@@ -578,41 +578,6 @@ static void free_huge_page(struct page *page)
hugetlb_put_quota(mapping, 1);
}
/*
* Increment or decrement surplus_huge_pages. Keep node-specific counters
* balanced by operating on them in a round-robin fashion.
* Returns 1 if an adjustment was made.
*/
static int adjust_pool_surplus(struct hstate *h, int delta)
{
static int prev_nid;
int nid = prev_nid;
int ret = 0;
VM_BUG_ON(delta != -1 && delta != 1);
do {
nid = next_node(nid, node_online_map);
if (nid == MAX_NUMNODES)
nid = first_node(node_online_map);
/* To shrink on this node, there must be a surplus page */
if (delta < 0 && !h->surplus_huge_pages_node[nid])
continue;
/* Surplus cannot exceed the total number of pages */
if (delta > 0 && h->surplus_huge_pages_node[nid] >=
h->nr_huge_pages_node[nid])
continue;
h->surplus_huge_pages += delta;
h->surplus_huge_pages_node[nid] += delta;
ret = 1;
break;
} while (nid != prev_nid);
prev_nid = nid;
return ret;
}
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
set_compound_page_dtor(page, free_huge_page);
@@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
put_page(page); /* free it into the hugepage allocator */
}
static void prep_compound_gigantic_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
struct page *p = page + 1;
/* we rely on prep_new_huge_page to set the destructor */
set_compound_order(page, order);
__SetPageHead(page);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
__SetPageTail(p);
p->first_page = page;
}
}
int PageHuge(struct page *page)
{
compound_page_dtor *dtor;
if (!PageCompound(page))
return 0;
page = compound_head(page);
dtor = get_compound_page_dtor(page);
return dtor == free_huge_page;
}
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
@@ -630,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
if (h->order >= MAX_ORDER)
return NULL;
page = alloc_pages_node(nid,
page = alloc_pages_exact_node(nid,
htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
@@ -649,7 +642,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
* Use a helper variable to find the next node and then
* copy it back to hugetlb_next_nid afterwards:
* otherwise there's a window in which a racer might
* pass invalid nid MAX_NUMNODES to alloc_pages_node.
* pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
* But we don't need to use a spin_lock here: it really
* doesn't matter if occasionally a racer chooses the
* same nid as we do. Move nid forward in the mask even
@@ -875,7 +868,7 @@ static void return_unused_surplus_pages(struct hstate *h,
* can no longer free unreserved surplus pages. This occurs when
* the nodes with surplus pages have no free pages.
*/
unsigned long remaining_iterations = num_online_nodes();
unsigned long remaining_iterations = nr_online_nodes;
/* Uncommit the reservation */
h->resv_huge_pages -= unused_resv_pages;
@@ -904,7 +897,7 @@ static void return_unused_surplus_pages(struct hstate *h,
h->surplus_huge_pages--;
h->surplus_huge_pages_node[nid]--;
nr_pages--;
remaining_iterations = num_online_nodes();
remaining_iterations = nr_online_nodes;
}
}
}
@@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
}
#endif
/*
* Increment or decrement surplus_huge_pages. Keep node-specific counters
* balanced by operating on them in a round-robin fashion.
* Returns 1 if an adjustment was made.
*/
static int adjust_pool_surplus(struct hstate *h, int delta)
{
static int prev_nid;
int nid = prev_nid;
int ret = 0;
VM_BUG_ON(delta != -1 && delta != 1);
do {
nid = next_node(nid, node_online_map);
if (nid == MAX_NUMNODES)
nid = first_node(node_online_map);
/* To shrink on this node, there must be a surplus page */
if (delta < 0 && !h->surplus_huge_pages_node[nid])
continue;
/* Surplus cannot exceed the total number of pages */
if (delta > 0 && h->surplus_huge_pages_node[nid] >=
h->nr_huge_pages_node[nid])
continue;
h->surplus_huge_pages += delta;
h->surplus_huge_pages_node[nid] += delta;
ret = 1;
break;
} while (nid != prev_nid);
prev_nid = nid;
return ret;
}
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
{
+20
@@ -0,0 +1,20 @@
#include <linux/mm_types.h>
#include <linux/rbtree.h>
#include <linux/rwsem.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/cpumask.h>
#include <asm/atomic.h>
#include <asm/pgtable.h>
struct mm_struct init_mm = {
.mm_rb = RB_ROOT,
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.cpu_vm_mask = CPU_MASK_ALL,
};
+6 -27
@@ -16,9 +16,6 @@
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
extern void prep_compound_page(struct page *page, unsigned long order);
extern void prep_compound_gigantic_page(struct page *page, unsigned long order);
static inline void set_page_count(struct page *page, int v)
{
atomic_set(&page->_count, v);
@@ -51,6 +48,8 @@ extern void putback_lru_page(struct page *page);
*/
extern unsigned long highest_memmap_pfn;
extern void __free_pages_bootmem(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned long order);
/*
* function for dealing with page's order in buddy system.
@@ -74,7 +73,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
}
#endif
#ifdef CONFIG_UNEVICTABLE_LRU
/*
* unevictable_migrate_page() called only from migrate_page_copy() to
* migrate unevictable flag to new page.
@@ -86,11 +84,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
if (TestClearPageUnevictable(old))
SetPageUnevictable(new);
}
#else
static inline void unevictable_migrate_page(struct page *new, struct page *old)
{
}
#endif
#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
/*
@@ -150,23 +143,6 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
}
}
/*
* free_page_mlock() -- clean up attempts to free and mlocked() page.
* Page should not be on lru, so no need to fix that up.
* free_pages_check() will verify...
*/
static inline void free_page_mlock(struct page *page)
{
if (unlikely(TestClearPageMlocked(page))) {
unsigned long flags;
local_irq_save(flags);
__dec_zone_page_state(page, NR_MLOCK);
__count_vm_event(UNEVICTABLE_MLOCKFREED);
local_irq_restore(flags);
}
}
#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
{
@@ -175,7 +151,6 @@ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
static inline void mlock_migrate_page(struct page *new, struct page *old) { }
static inline void free_page_mlock(struct page *page) { }
#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
@@ -284,4 +259,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int flags,
struct page **pages, struct vm_area_struct **vmas);
#define ZONE_RECLAIM_NOSCAN -2
#define ZONE_RECLAIM_FULL -1
#define ZONE_RECLAIM_SOME 0
#define ZONE_RECLAIM_SUCCESS 1
#endif
+23 -3
@@ -123,8 +123,7 @@ static long madvise_willneed(struct vm_area_struct * vma,
end = vma->vm_end;
end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
force_page_cache_readahead(file->f_mapping,
file, start, max_sane_readahead(end - start));
force_page_cache_readahead(file->f_mapping, file, start, end - start);
return 0;
}
@@ -239,12 +238,30 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
break;
default:
error = -EINVAL;
BUG();
break;
}
return error;
}
static int
madvise_behavior_valid(int behavior)
{
switch (behavior) {
case MADV_DOFORK:
case MADV_DONTFORK:
case MADV_NORMAL:
case MADV_SEQUENTIAL:
case MADV_RANDOM:
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
return 1;
default:
return 0;
}
}
/*
* The madvise(2) system call.
*
@@ -290,6 +307,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
int write;
size_t len;
if (!madvise_behavior_valid(behavior))
return error;
write = madvise_need_mmap_write(behavior);
if (write)
down_write(&current->mm->mmap_sem);
+11
@@ -570,6 +570,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
return 0;
}
int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
{
unsigned long active;
unsigned long inactive;
inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
return (active > inactive);
}
unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
struct zone *zone,
enum lru_list lru)
+110 -20
@@ -1360,6 +1360,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
return i;
}
/**
* get_user_pages() - pin user pages in memory
* @tsk: task_struct of target task
* @mm: mm_struct of target mm
* @start: starting user address
* @len: number of pages from start to pin
* @write: whether pages will be written to by the caller
* @force: whether to force write access even if user mapping is
* readonly. This will result in the page being COWed even
* in MAP_SHARED mappings. You do not want this.
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If len is 0 or negative, returns 0. If no pages
* were pinned, returns -errno. Each page returned must be released
* with a put_page() call when it is finished with. vmas will only
* remain valid while mmap_sem is held.
*
* Must be called with mmap_sem held for read or write.
*
* get_user_pages walks a process's page tables and takes a reference to
* each struct page that each user address corresponds to at a given
* instant. That is, it takes the page that would be accessed if a user
* thread accesses the given user virtual address at that instant.
*
* This does not guarantee that the page exists in the user mappings when
* get_user_pages returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
* and subsequently re faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
* contains data that was valid *at some point in time*. Typically, an IO
* or similar operation cannot guarantee anything stronger anyway because
* locks can't be held over the syscall boundary.
*
* If write=0, the page must not be written to. If the page is written to,
* set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
* after the page is finished with, and before put_page is called.
*
* get_user_pages is typically used for fewer-copy IO operations, to get a
* handle on the memory by some means other than accesses via the user virtual
* addresses. The pages may be submitted for DMA to devices or accessed via
* their kernel linear mapping (via the kmap APIs). Care should be taken to
* use the correct cache flushing APIs.
*
* See also get_user_pages_fast, for performance critical applications.
*/
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
struct page **pages, struct vm_area_struct **vmas)
@@ -3053,22 +3103,13 @@ int in_gate_area_no_task(unsigned long addr)
#endif /* __HAVE_ARCH_GATE_AREA */
#ifdef CONFIG_HAVE_IOREMAP_PROT
int follow_phys(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
unsigned long *prot, resource_size_t *phys)
static int follow_pte(struct mm_struct *mm, unsigned long address,
pte_t **ptepp, spinlock_t **ptlp)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep, pte;
spinlock_t *ptl;
resource_size_t phys_addr = 0;
struct mm_struct *mm = vma->vm_mm;
int ret = -EINVAL;
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
goto out;
pte_t *ptep;
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
@@ -3086,22 +3127,71 @@ int follow_phys(struct vm_area_struct *vma,
if (pmd_huge(*pmd))
goto out;
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!ptep)
goto out;
pte = *ptep;
if (!pte_present(pte))
if (!pte_present(*ptep))
goto unlock;
*ptepp = ptep;
return 0;
unlock:
pte_unmap_unlock(ptep, *ptlp);
out:
return -EINVAL;
}
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
* @address: user virtual address
* @pfn: location to store found PFN
*
* Only IO mappings and raw PFN mappings are allowed.
*
* Returns zero and the pfn at @pfn on success, -ve otherwise.
*/
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn)
{
int ret = -EINVAL;
spinlock_t *ptl;
pte_t *ptep;
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
return ret;
ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
if (ret)
return ret;
*pfn = pte_pfn(*ptep);
pte_unmap_unlock(ptep, ptl);
return 0;
}
EXPORT_SYMBOL(follow_pfn);
#ifdef CONFIG_HAVE_IOREMAP_PROT
int follow_phys(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
unsigned long *prot, resource_size_t *phys)
{
int ret = -EINVAL;
pte_t *ptep, pte;
spinlock_t *ptl;
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
goto out;
if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
goto out;
pte = *ptep;
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
phys_addr = pte_pfn(pte);
phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
*prot = pgprot_val(pte_pgprot(pte));
*phys = phys_addr;
ret = 0;
*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
ret = 0;
unlock:
pte_unmap_unlock(ptep, ptl);
out:
+5 -1
@@ -422,7 +422,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
zone->present_pages += onlined_pages;
zone->zone_pgdat->node_present_pages += onlined_pages;
setup_per_zone_pages_min();
setup_per_zone_wmarks();
calculate_zone_inactive_ratio(zone);
if (onlined_pages) {
kswapd_run(zone_to_nid(zone));
node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
@@ -832,6 +833,9 @@ repeat:
totalram_pages -= offlined_pages;
num_physpages -= offlined_pages;
setup_per_zone_wmarks();
calculate_zone_inactive_ratio(zone);
vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
+104 -41
@@ -182,13 +182,54 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
return 0;
}
/* Create a new policy */
/*
* mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
* any, for the new policy. mpol_new() has already validated the nodes
* parameter with respect to the policy mode and flags. But, we need to
* handle an empty nodemask with MPOL_PREFERRED here.
*
* Must be called holding task's alloc_lock to protect task's mems_allowed
* and mempolicy. May also be called holding the mmap_semaphore for write.
*/
static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
nodemask_t cpuset_context_nmask;
int ret;
/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
if (pol == NULL)
return 0;
VM_BUG_ON(!nodes);
if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
nodes = NULL; /* explicit local allocation */
else {
if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&cpuset_context_nmask, nodes,
&cpuset_current_mems_allowed);
else
nodes_and(cpuset_context_nmask, *nodes,
cpuset_current_mems_allowed);
if (mpol_store_user_nodemask(pol))
pol->w.user_nodemask = *nodes;
else
pol->w.cpuset_mems_allowed =
cpuset_current_mems_allowed;
}
ret = mpol_ops[pol->mode].create(pol,
nodes ? &cpuset_context_nmask : NULL);
return ret;
}
/*
* This function just creates a new policy, does some check and simple
* initialization. You must invoke mpol_set_nodemask() to set nodes.
*/
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
struct mempolicy *policy;
nodemask_t cpuset_context_nmask;
int ret;
pr_debug("setting mode %d flags %d nodes[0] %lx\n",
mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
@@ -210,7 +251,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
if (((flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES)))
return ERR_PTR(-EINVAL);
nodes = NULL; /* flag local alloc */
}
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
@@ -221,30 +261,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
policy->mode = mode;
policy->flags = flags;
if (nodes) {
/*
* cpuset related setup doesn't apply to local allocation
*/
cpuset_update_task_memory_state();
if (flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&cpuset_context_nmask, nodes,
&cpuset_current_mems_allowed);
else
nodes_and(cpuset_context_nmask, *nodes,
cpuset_current_mems_allowed);
if (mpol_store_user_nodemask(policy))
policy->w.user_nodemask = *nodes;
else
policy->w.cpuset_mems_allowed =
cpuset_mems_allowed(current);
}
ret = mpol_ops[mode].create(policy,
nodes ? &cpuset_context_nmask : NULL);
if (ret < 0) {
kmem_cache_free(policy_cache, policy);
return ERR_PTR(ret);
}
return policy;
}
@@ -324,6 +340,8 @@ static void mpol_rebind_policy(struct mempolicy *pol,
/*
* Wrapper for mpol_rebind_policy() that just requires task
* pointer, and updates task mempolicy.
*
* Called with task's alloc_lock held.
*/
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
@@ -600,8 +618,9 @@ static void mpol_set_task_struct_flag(void)
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
struct mempolicy *new;
struct mempolicy *new, *old;
struct mm_struct *mm = current->mm;
int ret;
new = mpol_new(mode, flags, nodes);
if (IS_ERR(new))
@@ -615,20 +634,33 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
*/
if (mm)
down_write(&mm->mmap_sem);
mpol_put(current->mempolicy);
task_lock(current);
ret = mpol_set_nodemask(new, nodes);
if (ret) {
task_unlock(current);
if (mm)
up_write(&mm->mmap_sem);
mpol_put(new);
return ret;
}
old = current->mempolicy;
current->mempolicy = new;
mpol_set_task_struct_flag();
if (new && new->mode == MPOL_INTERLEAVE &&
nodes_weight(new->v.nodes))
current->il_next = first_node(new->v.nodes);
task_unlock(current);
if (mm)
up_write(&mm->mmap_sem);
mpol_put(old);
return 0;
}
/*
* Return nodemask for policy for get_mempolicy() query
*
* Called with task's alloc_lock held
*/
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
@@ -674,7 +706,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy;
cpuset_update_task_memory_state();
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
return -EINVAL;
@@ -683,7 +714,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
*policy = 0; /* just so it's initialized */
task_lock(current);
*nmask = cpuset_current_mems_allowed;
task_unlock(current);
return 0;
}
@@ -738,8 +771,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
}
err = 0;
if (nmask)
if (nmask) {
task_lock(current);
get_policy_nodemask(pol, nmask);
task_unlock(current);
}
out:
mpol_cond_put(pol);
@@ -767,7 +803,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
}
/*
@@ -979,6 +1015,14 @@ static long do_mbind(unsigned long start, unsigned long len,
return err;
}
down_write(&mm->mmap_sem);
task_lock(current);
err = mpol_set_nodemask(new, nmask);
task_unlock(current);
if (err) {
up_write(&mm->mmap_sem);
mpol_put(new);
return err;
}
vma = check_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
@@ -1545,8 +1589,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
struct mempolicy *pol = get_vma_policy(current, vma, addr);
struct zonelist *zl;
cpuset_update_task_memory_state();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
@@ -1593,8 +1635,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = current->mempolicy;
if ((gfp & __GFP_WAIT) && !in_interrupt())
cpuset_update_task_memory_state();
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
@@ -1854,6 +1894,8 @@ restart:
*/
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
spin_lock_init(&sp->lock);
@@ -1863,9 +1905,19 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
/* contextualize the tmpfs mount point mempolicy */
new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
mpol_put(mpol); /* drop our ref on sb mpol */
if (IS_ERR(new))
if (IS_ERR(new)) {
mpol_put(mpol); /* drop our ref on sb mpol */
return; /* no valid nodemask intersection */
}
task_lock(current);
ret = mpol_set_nodemask(new, &mpol->w.user_nodemask);
task_unlock(current);
mpol_put(mpol); /* drop our ref on sb mpol */
if (ret) {
mpol_put(new);
return;
}
/* Create pseudo-vma that contains just the policy */
memset(&pvma, 0, sizeof(struct vm_area_struct));
@@ -2086,8 +2138,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
new = mpol_new(mode, mode_flags, &nodes);
if (IS_ERR(new))
err = 1;
else if (no_context)
new->w.user_nodemask = nodes; /* save for contextualization */
else {
int ret;
task_lock(current);
ret = mpol_set_nodemask(new, &nodes);
task_unlock(current);
if (ret)
err = 1;
else if (no_context) {
/* save for contextualization */
new->w.user_nodemask = nodes;
}
}
out:
/* Restore string for error message */
+4 -2
@@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
*result = &pm->status;
return alloc_pages_node(pm->node,
return alloc_pages_exact_node(pm->node,
GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
}
@@ -820,7 +820,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
struct page_to_node *pp;
LIST_HEAD(pagelist);
migrate_prep();
down_read(&mm->mmap_sem);
/*
@@ -907,6 +906,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
if (!pm)
goto out;
migrate_prep();
/*
* Store a chunk of page_to_node array in a page,
* but keep the last one as a marker
-22
@@ -31,7 +31,6 @@ int can_do_mlock(void)
}
EXPORT_SYMBOL(can_do_mlock);
#ifdef CONFIG_UNEVICTABLE_LRU
/*
* Mlocked pages are marked with PageMlocked() flag for efficient testing
* in vmscan and, possibly, the fault path; and to support semi-accurate
@@ -261,27 +260,6 @@ static int __mlock_posix_error_return(long retval)
return retval;
}
#else /* CONFIG_UNEVICTABLE_LRU */
/*
* Just make pages present if VM_LOCKED. No-op if unlocking.
*/
static long __mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
int mlock)
{
if (mlock && (vma->vm_flags & VM_LOCKED))
return make_pages_present(start, end);
return 0;
}
static inline int __mlock_posix_error_return(long retval)
{
return 0;
}
#endif /* CONFIG_UNEVICTABLE_LRU */
/**
* mlock_vma_pages_range() - mlock pages in specified vma range.
* @vma - the vma containing the specfied address range
+25 -39
@@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
unsigned long points, cpu_time, run_time;
struct mm_struct *mm;
struct task_struct *child;
int oom_adj;
task_lock(p);
mm = p->mm;
@@ -65,6 +66,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
task_unlock(p);
return 0;
}
oom_adj = mm->oom_adj;
if (oom_adj == OOM_DISABLE) {
task_unlock(p);
return 0;
}
/*
* The memory size of the process is the basis for the badness.
@@ -148,15 +154,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
points /= 8;
/*
* Adjust the score by oomkilladj.
* Adjust the score by oom_adj.
*/
if (p->oomkilladj) {
if (p->oomkilladj > 0) {
if (oom_adj) {
if (oom_adj > 0) {
if (!points)
points = 1;
points <<= p->oomkilladj;
points <<= oom_adj;
} else
points >>= -(p->oomkilladj);
points >>= -(oom_adj);
}
#ifdef DEBUG
@@ -251,11 +257,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
*ppoints = ULONG_MAX;
}
if (p->oomkilladj == OOM_DISABLE)
continue;
points = badness(p, uptime.tv_sec);
if (points > *ppoints || !chosen) {
if (points > *ppoints) {
chosen = p;
*ppoints = points;
}
@@ -304,8 +307,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
}
printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj,
p->comm);
get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm);
task_unlock(p);
} while_each_thread(g, p);
}
@@ -323,11 +325,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
return;
}
if (!p->mm) {
WARN_ON(1);
printk(KERN_WARNING "tried to kill an mm-less task!\n");
if (!p->mm)
return;
}
if (verbose)
printk(KERN_ERR "Killed process %d (%s)\n",
@@ -349,28 +348,13 @@ static int oom_kill_task(struct task_struct *p)
struct mm_struct *mm;
struct task_struct *g, *q;
task_lock(p);
mm = p->mm;
/* WARNING: mm may not be dereferenced since we did not obtain its
* value from get_task_mm(p). This is OK since all we need to do is
* compare mm to q->mm below.
*
* Furthermore, even if mm contains a non-NULL value, p->mm may
* change to NULL at any time since we do not hold task_lock(p).
* However, this is of no concern to us.
*/
if (mm == NULL)
if (!mm || mm->oom_adj == OOM_DISABLE) {
task_unlock(p);
return 1;
/*
* Don't kill the process if any threads are set to OOM_DISABLE
*/
do_each_thread(g, q) {
if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
return 1;
} while_each_thread(g, q);
}
task_unlock(p);
__oom_kill_task(p, 1);
/*
@@ -393,10 +377,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
struct task_struct *c;
if (printk_ratelimit()) {
printk(KERN_WARNING "%s invoked oom-killer: "
"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
current->comm, gfp_mask, order, current->oomkilladj);
task_lock(current);
printk(KERN_WARNING "%s invoked oom-killer: "
"gfp_mask=0x%x, order=%d, oom_adj=%d\n",
current->comm, gfp_mask, order,
current->mm ? current->mm->oom_adj : OOM_DISABLE);
cpuset_print_task_mems_allowed(current);
task_unlock(current);
dump_stack();
@@ -409,8 +394,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/*
* If the task is already exiting, don't alarm the sysadmin or kill
* its children or threads, just set TIF_MEMDIE so it can die quickly
* if its mm is still attached.
*/
if (p->flags & PF_EXITING) {
if (p->mm && (p->flags & PF_EXITING)) {
__oom_kill_task(p, 0);
return 0;
}
+10 -9
@@ -265,18 +265,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
* This avoids exceeding the total dirty_limit when the floating averages
* fluctuate too quickly.
*/
static void
clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
unsigned long dirty, unsigned long *pbdi_dirty)
{
long avail_dirty;
unsigned long avail_dirty;
avail_dirty = dirty -
(global_page_state(NR_FILE_DIRTY) +
avail_dirty = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_WRITEBACK) +
global_page_state(NR_UNSTABLE_NFS) +
global_page_state(NR_WRITEBACK_TEMP));
global_page_state(NR_WRITEBACK_TEMP);
if (avail_dirty < 0)
if (avail_dirty < dirty)
avail_dirty = dirty - avail_dirty;
else
avail_dirty = 0;
avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
@@ -299,10 +300,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
*
* dirty -= (dirty/8) * p_{t}
*/
static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
{
long numerator, denominator;
long dirty = *pdirty;
unsigned long dirty = *pdirty;
u64 inv = dirty >> 3;
task_dirties_fraction(tsk, &numerator, &denominator);
+483 -287
File diff suppressed because it is too large
+1 -1
@@ -120,7 +120,7 @@ out:
return ret;
}
int swap_readpage(struct file *file, struct page *page)
int swap_readpage(struct page *page)
{
struct bio *bio;
int ret = 0;
+101 -44
@@ -133,15 +133,12 @@ out:
}
/*
* do_page_cache_readahead actually reads a chunk of disk. It allocates all
* __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
* the pages first, then submits them all for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that.
*
* Returns the number of pages requested, or the maximum amount of I/O allowed.
*
* do_page_cache_readahead() returns -1 if it encountered request queue
* congestion.
*/
static int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
@@ -210,6 +207,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
return -EINVAL;
nr_to_read = max_sane_readahead(nr_to_read);
while (nr_to_read) {
int err;
@@ -230,22 +228,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
return ret;
}
/*
* This version skips the IO if the queue is read-congested, and will tell the
* block layer to abandon the readahead if request allocation would block.
*
* force_page_cache_readahead() will ignore queue congestion and will block on
* request queues.
*/
int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read)
{
if (bdi_read_congested(mapping->backing_dev_info))
return -1;
return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
}
/*
* Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
* sensible upper limit.
@@ -259,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr)
/*
* Submit IO for the read-ahead request in file_ra_state.
*/
static unsigned long ra_submit(struct file_ra_state *ra,
unsigned long ra_submit(struct file_ra_state *ra,
struct address_space *mapping, struct file *filp)
{
int actual;
@@ -347,6 +329,59 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
* it approaches max_readhead.
*/
/*
* Count contiguously cached pages from @offset-1 to @offset-@max,
* this count is a conservative estimation of
* - length of the sequential read sequence, or
* - thrashing threshold in memory tight systems
*/
static pgoff_t count_history_pages(struct address_space *mapping,
struct file_ra_state *ra,
pgoff_t offset, unsigned long max)
{
pgoff_t head;
rcu_read_lock();
head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
rcu_read_unlock();
return offset - 1 - head;
}
/*
* page cache context based read-ahead
*/
static int try_context_readahead(struct address_space *mapping,
struct file_ra_state *ra,
pgoff_t offset,
unsigned long req_size,
unsigned long max)
{
pgoff_t size;
size = count_history_pages(mapping, ra, offset, max);
/*
* no history pages:
* it could be a random read
*/
if (!size)
return 0;
/*
* starts from beginning of file:
* it is a strong indication of long-run stream (or whole-file-read)
*/
if (size >= offset)
size *= 2;
ra->start = offset;
ra->size = get_init_ra_size(size + req_size, max);
ra->async_size = ra->size;
return 1;
}
/*
* A minimal readahead algorithm for trivial sequential/random reads.
*/
@@ -356,34 +391,26 @@ ondemand_readahead(struct address_space *mapping,
bool hit_readahead_marker, pgoff_t offset,
unsigned long req_size)
{
int max = ra->ra_pages; /* max readahead pages */
pgoff_t prev_offset;
int sequential;
unsigned long max = max_sane_readahead(ra->ra_pages);
/*
* start of file
*/
if (!offset)
goto initial_readahead;
/*
* It's the expected callback offset, assume sequential access.
* Ramp up sizes, and push forward the readahead window.
*/
if (offset && (offset == (ra->start + ra->size - ra->async_size) ||
offset == (ra->start + ra->size))) {
if ((offset == (ra->start + ra->size - ra->async_size) ||
offset == (ra->start + ra->size))) {
ra->start += ra->size;
ra->size = get_next_ra_size(ra, max);
ra->async_size = ra->size;
goto readit;
}
prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
sequential = offset - prev_offset <= 1UL || req_size > max;
/*
* Standalone, small read.
* Read as is, and do not pollute the readahead state.
*/
if (!hit_readahead_marker && !sequential) {
return __do_page_cache_readahead(mapping, filp,
offset, req_size, 0);
}
/*
* Hit a marked page without valid readahead state.
* E.g. interleaved reads.
@@ -394,7 +421,7 @@ ondemand_readahead(struct address_space *mapping,
pgoff_t start;
rcu_read_lock();
start = radix_tree_next_hole(&mapping->page_tree, offset,max+1);
start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
rcu_read_unlock();
if (!start || start - offset > max)
@@ -402,23 +429,53 @@ ondemand_readahead(struct address_space *mapping,
ra->start = start;
ra->size = start - offset; /* old async_size */
ra->size += req_size;
ra->size = get_next_ra_size(ra, max);
ra->async_size = ra->size;
goto readit;
}
/*
* It may be one of
* - first read on start of file
* - sequential cache miss
* - oversize random read
* Start readahead for it.
* oversize read
*/
if (req_size > max)
goto initial_readahead;
/*
* sequential cache miss
*/
if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
goto initial_readahead;
/*
* Query the page cache and look for the traces(cached history pages)
* that a sequential stream would leave behind.
*/
if (try_context_readahead(mapping, ra, offset, req_size, max))
goto readit;
/*
* standalone, small random read
* Read as is, and do not pollute the readahead state.
*/
return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
initial_readahead:
ra->start = offset;
ra->size = get_init_ra_size(req_size, max);
ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
readit:
/*
* Will this read hit the readahead marker made by itself?
* If so, trigger the readahead marker hit now, and merge
* the resulted next readahead window into the current one.
*/
if (offset == ra->start && ra->size == ra->async_size) {
ra->async_size = get_next_ra_size(ra, max);
ra->size += ra->async_size;
}
return ra_submit(ra, mapping, filp);
}
+27 -13
@@ -333,7 +333,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
* repeatedly from either page_referenced_anon or page_referenced_file.
*/
static int page_referenced_one(struct page *page,
struct vm_area_struct *vma, unsigned int *mapcount)
struct vm_area_struct *vma,
unsigned int *mapcount,
unsigned long *vm_flags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
@@ -381,11 +383,14 @@ out_unmap:
(*mapcount)--;
pte_unmap_unlock(pte, ptl);
out:
if (referenced)
*vm_flags |= vma->vm_flags;
return referenced;
}
static int page_referenced_anon(struct page *page,
struct mem_cgroup *mem_cont)
struct mem_cgroup *mem_cont,
unsigned long *vm_flags)
{
unsigned int mapcount;
struct anon_vma *anon_vma;
@@ -405,7 +410,8 @@ static int page_referenced_anon(struct page *page,
*/
if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
continue;
referenced += page_referenced_one(page, vma, &mapcount);
referenced += page_referenced_one(page, vma,
&mapcount, vm_flags);
if (!mapcount)
break;
}
@@ -418,6 +424,7 @@ static int page_referenced_anon(struct page *page,
* page_referenced_file - referenced check for object-based rmap
* @page: the page we're checking references on.
* @mem_cont: target memory controller
* @vm_flags: collect encountered vma->vm_flags who actually referenced the page
*
* For an object-based mapped page, find all the places it is mapped and
* check/clear the referenced flag. This is done by following the page->mapping
@@ -427,7 +434,8 @@ static int page_referenced_anon(struct page *page,
* This function is only called from page_referenced for object-based pages.
*/
static int page_referenced_file(struct page *page,
struct mem_cgroup *mem_cont)
struct mem_cgroup *mem_cont,
unsigned long *vm_flags)
{
unsigned int mapcount;
struct address_space *mapping = page->mapping;
@@ -467,7 +475,8 @@ static int page_referenced_file(struct page *page,
*/
if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
continue;
referenced += page_referenced_one(page, vma, &mapcount);
referenced += page_referenced_one(page, vma,
&mapcount, vm_flags);
if (!mapcount)
break;
}
@@ -481,29 +490,35 @@ static int page_referenced_file(struct page *page,
* @page: the page to test
* @is_locked: caller holds lock on the page
* @mem_cont: target memory controller
* @vm_flags: collect encountered vma->vm_flags who actually referenced the page
*
* Quick test_and_clear_referenced for all mappings to a page,
* returns the number of ptes which referenced the page.
*/
int page_referenced(struct page *page, int is_locked,
struct mem_cgroup *mem_cont)
int page_referenced(struct page *page,
int is_locked,
struct mem_cgroup *mem_cont,
unsigned long *vm_flags)
{
int referenced = 0;
if (TestClearPageReferenced(page))
referenced++;
*vm_flags = 0;
if (page_mapped(page) && page->mapping) {
if (PageAnon(page))
referenced += page_referenced_anon(page, mem_cont);
referenced += page_referenced_anon(page, mem_cont,
vm_flags);
else if (is_locked)
referenced += page_referenced_file(page, mem_cont);
referenced += page_referenced_file(page, mem_cont,
vm_flags);
else if (!trylock_page(page))
referenced++;
else {
if (page->mapping)
referenced +=
page_referenced_file(page, mem_cont);
referenced += page_referenced_file(page,
mem_cont, vm_flags);
unlock_page(page);
}
}
@@ -1202,7 +1217,6 @@ int try_to_unmap(struct page *page, int migration)
return ret;
}
#ifdef CONFIG_UNEVICTABLE_LRU
/**
* try_to_munlock - try to munlock a page
* @page: the page to be munlocked
@@ -1226,4 +1240,4 @@ int try_to_munlock(struct page *page)
else
return try_to_unmap_file(page, 1, 0);
}
#endif

Some files were not shown because too many files have changed in this diff