/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *		Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *		(Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/kallsyms.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>

#include <asm/io.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include "internal.h"
#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
struct page *mem_map;

EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif

unsigned long num_physpages;
/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
void *high_memory;

EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);
/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);
unsigned long zero_pfn __read_mostly;
unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);
#if defined(SPLIT_RSS_COUNTING)

void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif /* SPLIT_RSS_COUNTING */
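
/*
 * Example of the batching above: a task that faults in a few hundred
 * anonymous pages updates only its private current->rss_stat counters;
 * the shared mm counters are folded in by check_sync_rss_stat() roughly
 * once every TASK_RSS_EVENTS_THRESH (64) events, or explicitly via
 * sync_mm_rss().  Readers of the mm counters may therefore see values
 * that lag slightly behind the per-task state.
 */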
#ifdef HAVE_GENERIC_MMU_GATHER

static int tlb_next_batch(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	batch = tlb->active;
	if (batch->next) {
		tlb->active = batch->next;
		return 1;
	}

	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
	if (!batch)
		return 0;

	batch->next = NULL;
	batch->nr   = 0;
	batch->max  = MAX_GATHER_BATCH;

	tlb->active->next = batch;
	tlb->active = batch;

	return 1;
}

/* tlb_gather_mmu
 *	Called to initialize an (on-stack) mmu_gather structure for page-table
 *	tear-down from @mm. The @fullmm argument is used when @mm is without
 *	users and we're going to destroy the full address space (exit/execve).
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
{
	tlb->mm = mm;

	tlb->fullmm     = fullmm;
	tlb->start	= -1UL;
	tlb->end	= 0;
	tlb->need_flush = 0;
	tlb->fast_mode  = (num_possible_cpus() == 1);
	tlb->local.next = NULL;
	tlb->local.nr   = 0;
	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
	tlb->active     = &tlb->local;

#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb->batch = NULL;
#endif
}

void tlb_flush_mmu(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	if (!tlb->need_flush)
		return;
	tlb->need_flush = 0;
	tlb_flush(tlb);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb_table_flush(tlb);
#endif

	if (tlb_fast_mode(tlb))
		return;

	for (batch = &tlb->local; batch; batch = batch->next) {
		free_pages_and_swap_cache(batch->pages, batch->nr);
		batch->nr = 0;
	}
	tlb->active = &tlb->local;
}

/* tlb_finish_mmu
 *	Called at the end of the shootdown operation to free up any resources
 *	that were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
{
	struct mmu_gather_batch *batch, *next;

	tlb->start = start;
	tlb->end   = end;
	tlb_flush_mmu(tlb);

	/* keep the page table cache within bounds */
	check_pgt_cache();

	for (batch = tlb->local.next; batch; batch = next) {
		next = batch->next;
		free_pages((unsigned long)batch, 0);
	}
	tlb->local.next = NULL;
}

/* __tlb_remove_page
 *	Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
 *	handling the additional races in SMP caused by other CPUs caching valid
 *	mappings in their TLBs. Returns the number of free page slots left.
 *	When out of page slots we must call tlb_flush_mmu().
 */
int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
	struct mmu_gather_batch *batch;

	VM_BUG_ON(!tlb->need_flush);

	if (tlb_fast_mode(tlb)) {
		free_page_and_swap_cache(page);
		return 1; /* avoid calling tlb_flush_mmu() */
	}

	batch = tlb->active;
	batch->pages[batch->nr++] = page;
	if (batch->nr == batch->max) {
		if (!tlb_next_batch(tlb))
			return 0;
		batch = tlb->active;
	}
	VM_BUG_ON(batch->nr > batch->max);

	return batch->max - batch->nr;
}

#endif /* HAVE_GENERIC_MMU_GATHER */
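
/*
 * Typical use of the generic mmu_gather above (see zap_page_range() and
 * zap_page_range_single() below for the real callers):
 *
 *	struct mmu_gather tlb;
 *
 *	tlb_gather_mmu(&tlb, mm, 0);
 *	update_hiwater_rss(mm);
 *	unmap_single_vma(&tlb, vma, start, end, NULL);
 *	tlb_finish_mmu(&tlb, start, end);
 *
 * __tlb_remove_page() batches the pages freed in between; when a batch
 * fills up (or at tlb_finish_mmu() time) tlb_flush_mmu() performs one
 * TLB shootdown and then frees the whole batch.
 */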
#ifdef CONFIG_HAVE_RCU_TABLE_FREE

/*
 * See the comment near struct mmu_table_batch.
 */

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely on
	 * IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->need_flush = 1;

	/*
	 * When there are fewer than two users of this mm there cannot be a
	 * concurrent page-table walk.
	 */
	if (atomic_read(&tlb->mm->mm_users) < 2) {
		__tlb_remove_table(table);
		return;
	}

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
/*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none.  Usually (but
 * very seldom) called out from the p?d_none_or_clear_bad macros.
 */

void pgd_clear_bad(pgd_t *pgd)
{
	pgd_ERROR(*pgd);
	pgd_clear(pgd);
}

void pud_clear_bad(pud_t *pud)
{
	pud_ERROR(*pud);
	pud_clear(pud);
}

void pmd_clear_bad(pmd_t *pmd)
{
	pmd_ERROR(*pmd);
	pmd_clear(pmd);
}

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	tlb->mm->nr_ptes--;
}
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
}

static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}
/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below?  No, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;

	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		}
		vma = next;
	}
}
int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long address)
{
	pgtable_t new = pte_alloc_one(mm, address);
	int wait_split_huge_page;
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_read_barrier_depends() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

	spin_lock(&mm->page_table_lock);
	wait_split_huge_page = 0;
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		mm->nr_ptes++;
		pmd_populate(mm, pmd, new);
		new = NULL;
	} else if (unlikely(pmd_trans_splitting(*pmd)))
		wait_split_huge_page = 1;
	spin_unlock(&mm->page_table_lock);
	if (new)
		pte_free(mm, new);
	if (wait_split_huge_page)
		wait_split_huge_page(vma->anon_vma, pmd);
	return 0;
}
int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	} else
		VM_BUG_ON(pmd_trans_splitting(*pmd));
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}
static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
	int i;

	if (current->mm == mm)
		sync_mm_rss(mm);
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
}
/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	pud_t *pud = pud_offset(pgd, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			printk(KERN_ALERT
				"BUG: Bad page map: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	printk(KERN_ALERT
		"BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
		current->comm,
		(long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page);
	printk(KERN_ALERT
		"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
		(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	/*
	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
	 */
	if (vma->vm_ops)
		print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
				(unsigned long)vma->vm_ops->fault);
	if (vma->vm_file && vma->vm_file->f_op)
		print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
				(unsigned long)vma->vm_file->f_op->mmap);
	dump_stack();
	add_taint(TAINT_BAD_PAGE);
}
static inline bool is_cow_mapping(vm_flags_t flags)
{
	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 *
 */
#ifdef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 1
#else
# define HAVE_PTE_SPECIAL 0
#endif
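
/*
 * Worked example of the linearity rule above: if a driver calls
 * remap_pfn_range(vma, vma->vm_start, 0x1000, size, prot), then
 * vma->vm_pgoff is 0x1000 and the pte for vma->vm_start + PAGE_SIZE
 * holds pfn 0x1001, so the equality holds and the pte is treated as
 * special.  If a COWable pte in that PFNMAP is later replaced by an
 * anonymous copy, the new pfn will (in practice) no longer satisfy the
 * equality, which is how the copied page is recognised as normal.
 */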
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
				pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (HAVE_PTE_SPECIAL) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (!is_zero_pfn(pfn))
			print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !HAVE_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;
check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}
/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */

static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {
		if (!pte_file(pte)) {
			swp_entry_t entry = pte_to_swp_entry(pte);

			if (swap_duplicate(entry) < 0)
				return entry.val;

			/* make sure dst_mm is on swapoff's mmlist. */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
						 &src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			if (likely(!non_swap_entry(entry)))
				rss[MM_SWAPENTS]++;
			else if (is_migration_entry(entry)) {
				page = migration_entry_to_page(entry);

				if (PageAnon(page))
					rss[MM_ANONPAGES]++;
				else
					rss[MM_FILEPAGES]++;

				if (is_write_migration_entry(entry) &&
				    is_cow_mapping(vm_flags)) {
					/*
					 * COW mappings require pages in both
					 * parent and child to be set to read.
					 */
					make_migration_entry_read(&entry);
					pte = swp_entry_to_pte(entry);
					set_pte_at(src_mm, addr, src_pte, pte);
				}
			}
		}
		goto out_set_pte;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page);
		if (PageAnon(page))
			rss[MM_ANONPAGES]++;
		else
			rss[MM_FILEPAGES]++;
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}
int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		   unsigned long addr, unsigned long end)
{
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};

again:
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
							vma, addr, rss);
		if (entry.val)
			break;
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	if (entry.val) {
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
		progress = 0;
	}
	if (addr != end)
		goto again;
	return 0;
}
static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*src_pmd)) {
			int err;
			VM_BUG_ON(next - addr != HPAGE_PMD_SIZE);
			err = copy_huge_pmd(dst_mm, src_mm,
					    dst_pmd, src_pmd, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */
	bool is_cow;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
			       VM_PFNMAP | VM_MIXEDMAP))) {
		if (!vma->anon_vma)
			return 0;
	}

	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
		ret = track_pfn_copy(vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	is_cow = is_cow_mapping(vma->vm_flags);
	mmun_start = addr;
	mmun_end   = end;
	if (is_cow)
		mmu_notifier_invalidate_range_start(src_mm, mmun_start,
						    mmun_end);

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	if (is_cow)
		mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
	return ret;
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	int force_flush = 0;
	int rss[NR_MM_COUNTERS];
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;

again:
	init_rss_vec(rss);
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent)) {
			continue;
		}

		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page->mapping)
					continue;
				/*
				 * Each page->index must be checked when
				 * invalidating or truncating nonlinear.
				 */
				if (details->nonlinear_vma &&
				    (page->index < details->first_index ||
				     page->index > details->last_index))
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;
			if (unlikely(details) && details->nonlinear_vma
			    && linear_page_index(details->nonlinear_vma,
						addr) != page->index)
				set_pte_at(mm, addr, pte,
					   pgoff_to_pte(page->index));
			if (PageAnon(page))
				rss[MM_ANONPAGES]--;
			else {
				if (pte_dirty(ptent))
					set_page_dirty(page);
				if (pte_young(ptent) &&
				    likely(!VM_SequentialReadHint(vma)))
					mark_page_accessed(page);
				rss[MM_FILEPAGES]--;
			}
			page_remove_rmap(page);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			force_flush = !__tlb_remove_page(tlb, page);
			if (force_flush)
				break;
			continue;
		}
		/*
		 * If details->check_mapping, we leave swap entries;
		 * if details->nonlinear_vma, we leave file entries.
		 */
		if (unlikely(details))
			continue;
		if (pte_file(ptent)) {
			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
				print_bad_pte(vma, addr, ptent, NULL);
		} else {
			swp_entry_t entry = pte_to_swp_entry(ptent);

			if (!non_swap_entry(entry))
				rss[MM_SWAPENTS]--;
			else if (is_migration_entry(entry)) {
				struct page *page;

				page = migration_entry_to_page(entry);

				if (PageAnon(page))
					rss[MM_ANONPAGES]--;
				else
					rss[MM_FILEPAGES]--;
			}
			if (unlikely(!free_swap_and_cache(entry)))
				print_bad_pte(vma, addr, ptent, NULL);
		}
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(start_pte, ptl);

	/*
	 * mmu_gather ran out of room to batch pages, we break out of
	 * the PTE lock to avoid doing the potential expensive TLB invalidate
	 * and page-free while holding it.
	 */
	if (force_flush) {
		force_flush = 0;

#ifdef HAVE_GENERIC_MMU_GATHER
		tlb->start = addr;
		tlb->end = end;
#endif
		tlb_flush_mmu(tlb);
		if (addr != end)
			goto again;
	}

	return addr;
}
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE) {
#ifdef CONFIG_DEBUG_VM
				if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
					pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
						__func__, addr, end,
						vma->vm_start,
						vma->vm_end);
					BUG();
				}
#endif
				split_huge_page_pmd(vma, addr, pmd);
			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
				goto next;
			/* fall through */
		}
		/*
		 * Here there can be other concurrent MADV_DONTNEED or
		 * trans huge page faults running, and if the pmd is
		 * none or trans huge it can change under us. This is
		 * because MADV_DONTNEED holds the mmap_sem in read
		 * mode.
		 */
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			goto next;
		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	return addr;
}
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
	} while (pud++, addr = next, addr != end);

	return addr;
}
static void unmap_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end,
			     struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

	if (details && !details->check_mapping && !details->nonlinear_vma)
		details = NULL;

	BUG_ON(addr >= end);
	mem_cgroup_uncharge_start();
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = zap_pud_range(tlb, vma, pgd, addr, next, details);
	} while (pgd++, addr = next, addr != end);
	tlb_end_vma(tlb, vma);
	mem_cgroup_uncharge_end();
}
static void unmap_single_vma(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long start = max(vma->vm_start, start_addr);
	unsigned long end;

	if (start >= vma->vm_end)
		return;
	end = min(vma->vm_end, end_addr);
	if (end <= vma->vm_start)
		return;

	if (vma->vm_file)
		uprobe_munmap(vma, start, end);

	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn(vma, 0, 0);

	if (start != end) {
		if (unlikely(is_vm_hugetlb_page(vma))) {
			/*
			 * It is undesirable to test vma->vm_file as it
			 * should be non-null for valid hugetlb area.
			 * However, vm_file will be NULL in the error
			 * cleanup path of do_mmap_pgoff. When
			 * hugetlbfs ->mmap method fails,
			 * do_mmap_pgoff() nullifies vma->vm_file
			 * before calling this function to clean up.
			 * Since no pte has actually been setup, it is
			 * safe to do nothing in this case.
			 */
			if (vma->vm_file) {
				mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
				mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
			}
		} else
			unmap_page_range(tlb, vma, start, end, details);
	}
}
/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
void unmap_vmas(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;

	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
}
/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of nonlinear truncation or shared cache invalidation
 *
 * Caller must protect the VMA list
 */
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = start + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, 0);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, start, end);
	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
		unmap_single_vma(&tlb, vma, start, end, details);
	mmu_notifier_invalidate_range_end(mm, start, end);
	tlb_finish_mmu(&tlb, start, end);
}
/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of nonlinear truncation or shared cache invalidation
 *
 * The range must fit into one VMA.
 */
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = address + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, 0);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, address, end);
	unmap_single_vma(&tlb, vma, address, end, details);
	mmu_notifier_invalidate_range_end(mm, address, end);
	tlb_finish_mmu(&tlb, address, end);
}
/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 *
 * Returns 0 if successful.
 */
int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
		unsigned long size)
{
	if (address < vma->vm_start || address + size > vma->vm_end ||
	    		!(vma->vm_flags & VM_PFNMAP))
		return -1;
	zap_page_range_single(vma, address, size, NULL);
	return 0;
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
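
/*
 * Example use (hypothetical driver code): a driver that earlier mapped
 * device memory into a VM_PFNMAP vma with remap_pfn_range() can tear the
 * ptes down again before the vma itself goes away, e.g. on device unplug:
 *
 *	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
 *
 * Later user accesses to the range will fault and can then be refused or
 * re-populated by the driver's fault handler, if it provides one.
 */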
2010-05-24 14:32:39 -07:00
/**
* follow_page - look up a page descriptor from a user-virtual address
* @vma: vm_area_struct mapping @address
* @address: virtual address to look up
* @flags: flags modifying lookup behaviour
*
* @flags can have FOLL_ flags set, defined in <linux/mm.h>
*
* Returns the mapped (struct page *), %NULL if no mapping exists, or
* an error pointer if there is a mapping to something not represented
* by a page descriptor (see also vm_normal_page()).
2005-04-16 15:20:36 -07:00
*/
2005-11-28 14:34:23 -08:00
struct page * follow_page ( struct vm_area_struct * vma , unsigned long address ,
2005-10-29 18:16:33 -07:00
unsigned int flags )
2005-04-16 15:20:36 -07:00
{
pgd_t * pgd ;
pud_t * pud ;
pmd_t * pmd ;
pte_t * ptep , pte ;
2005-10-29 18:16:33 -07:00
spinlock_t * ptl ;
2005-04-16 15:20:36 -07:00
struct page * page ;
2005-11-28 14:34:23 -08:00
struct mm_struct * mm = vma - > vm_mm ;
2005-04-16 15:20:36 -07:00
2005-10-29 18:16:33 -07:00
page = follow_huge_addr ( mm , address , flags & FOLL_WRITE ) ;
if ( ! IS_ERR ( page ) ) {
BUG_ON ( flags & FOLL_GET ) ;
goto out ;
}
2005-04-16 15:20:36 -07:00
2005-10-29 18:16:33 -07:00
page = NULL ;
2005-04-16 15:20:36 -07:00
pgd = pgd_offset ( mm , address ) ;
if ( pgd_none ( * pgd ) | | unlikely ( pgd_bad ( * pgd ) ) )
2005-10-29 18:16:33 -07:00
goto no_page_table ;
2005-04-16 15:20:36 -07:00
pud = pud_offset ( pgd , address ) ;
2008-07-23 21:27:50 -07:00
if ( pud_none ( * pud ) )
2005-10-29 18:16:33 -07:00
goto no_page_table ;
2011-01-13 15:46:52 -08:00
if ( pud_huge ( * pud ) & & vma - > vm_flags & VM_HUGETLB ) {
2008-07-23 21:27:50 -07:00
BUG_ON ( flags & FOLL_GET ) ;
page = follow_huge_pud ( mm , address , pud , flags & FOLL_WRITE ) ;
goto out ;
}
if ( unlikely ( pud_bad ( * pud ) ) )
goto no_page_table ;
2005-04-16 15:20:36 -07:00
pmd = pmd_offset ( pud , address ) ;
2008-05-06 20:49:23 +01:00
if ( pmd_none ( * pmd ) )
2005-10-29 18:16:33 -07:00
goto no_page_table ;
2011-01-13 15:46:52 -08:00
if ( pmd_huge ( * pmd ) & & vma - > vm_flags & VM_HUGETLB ) {
2005-10-29 18:16:33 -07:00
BUG_ON ( flags & FOLL_GET ) ;
page = follow_huge_pmd ( mm , address , pmd , flags & FOLL_WRITE ) ;
goto out ;
}
2012-10-05 21:36:27 +02:00
if ( ( flags & FOLL_NUMA ) & & pmd_numa ( * pmd ) )
goto no_page_table ;
2011-01-13 15:46:52 -08:00
if ( pmd_trans_huge ( * pmd ) ) {
2011-01-13 15:46:55 -08:00
if ( flags & FOLL_SPLIT ) {
2012-12-12 13:50:59 -08:00
split_huge_page_pmd ( vma , address , pmd ) ;
2011-01-13 15:46:55 -08:00
goto split_fallthrough ;
}
2011-01-13 15:46:52 -08:00
spin_lock ( & mm - > page_table_lock ) ;
if ( likely ( pmd_trans_huge ( * pmd ) ) ) {
if ( unlikely ( pmd_trans_splitting ( * pmd ) ) ) {
spin_unlock ( & mm - > page_table_lock ) ;
wait_split_huge_page ( vma - > anon_vma , pmd ) ;
} else {
2012-10-08 16:34:03 -07:00
page = follow_trans_huge_pmd ( vma , address ,
2011-01-13 15:46:52 -08:00
pmd , flags ) ;
spin_unlock ( & mm - > page_table_lock ) ;
goto out ;
}
} else
spin_unlock ( & mm - > page_table_lock ) ;
/* fall through */
}
2011-01-13 15:46:55 -08:00
split_fallthrough :
2008-05-06 20:49:23 +01:00
if ( unlikely ( pmd_bad ( * pmd ) ) )
goto no_page_table ;
2005-10-29 18:16:33 -07:00
ptep = pte_offset_map_lock ( mm , pmd , address , & ptl ) ;
2005-04-16 15:20:36 -07:00
pte = * ptep ;
2005-10-29 18:16:33 -07:00
if ( ! pte_present ( pte ) )
2008-06-20 11:18:25 -07:00
goto no_page ;
2012-10-05 21:36:27 +02:00
if ( ( flags & FOLL_NUMA ) & & pte_numa ( pte ) )
goto no_page ;
2005-10-29 18:16:33 -07:00
if ( ( flags & FOLL_WRITE ) & & ! pte_write ( pte ) )
goto unlock ;
2009-09-21 17:03:30 -07:00
2005-11-28 14:34:23 -08:00
page = vm_normal_page ( vma , address , pte ) ;
2009-09-21 17:03:30 -07:00
if ( unlikely ( ! page ) ) {
if ( ( flags & FOLL_DUMP ) | |
2009-09-21 17:03:34 -07:00
! is_zero_pfn ( pte_pfn ( pte ) ) )
2009-09-21 17:03:30 -07:00
goto bad_page ;
page = pte_page ( pte ) ;
}
2005-10-29 18:16:33 -07:00
if ( flags & FOLL_GET )
2011-11-02 13:36:59 -07:00
get_page_foll ( page ) ;
2005-10-29 18:16:33 -07:00
if ( flags & FOLL_TOUCH ) {
if ( ( flags & FOLL_WRITE ) & &
! pte_dirty ( pte ) & & ! PageDirty ( page ) )
set_page_dirty ( page ) ;
2009-03-31 15:19:37 -07:00
/*
* pte_mkyoung() would be more correct here, but atomic care
* is needed to avoid losing the dirty bit: it is easier to use
* mark_page_accessed().
*/
2005-10-29 18:16:33 -07:00
mark_page_accessed ( page ) ;
2005-04-16 15:20:36 -07:00
}
2011-05-04 21:30:28 -07:00
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/*
		 * The preliminary mapping check is mainly to avoid the
		 * pointless overhead of lock_page on the ZERO_PAGE
		 * which might bounce very badly if there is contention.
		 *
		 * If the page is already locked, we don't need to
		 * handle it now - vmscan will handle it later if and
		 * when it attempts to reclaim the page.
		 */
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();	/* push cached pages to LRU */
			/*
			 * Because we lock page here, and migration is
			 * blocked by the pte's page reference, and we
			 * know the page is still mapped, we don't even
			 * need to check for file-cache page truncation.
			 */
			mlock_vma_page(page);
			unlock_page(page);
		}
	}
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return page;

bad_page:
	pte_unmap_unlock(ptep, ptl);
	return ERR_PTR(-EFAULT);

no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return page;

no_page_table:
	/*
	 * When core dumping an enormous anonymous area that nobody
	 * has touched so far, we don't want to allocate unnecessary pages or
	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
	 * then get_dump_page() will return NULL to leave a hole in the dump.
	 * But we can only make this optimization where a hole would surely
	 * be zero-filled if handle_mm_fault() actually did handle it.
	 */
	if ((flags & FOLL_DUMP) &&
	    (!vma->vm_ops || !vma->vm_ops->fault))
		return ERR_PTR(-EFAULT);
	return page;
}
2011-04-12 14:15:51 -07:00
static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
{
	return stack_guard_page_start(vma, addr) ||
	       stack_guard_page_end(vma, addr + PAGE_SIZE);
}
2011-01-30 11:15:47 +08:00
/**
* __get_user_pages() - pin user pages in memory
* @tsk: task_struct of target task
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
* @nonblocking: whether waiting for disk IO or mmap_sem contention
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno. Each page returned must be released
* with a put_page() call when it is finished with. vmas will only
* remain valid while mmap_sem is held.
*
* Must be called with mmap_sem held for read or write.
*
* __get_user_pages walks a process's page tables and takes a reference to
* each struct page that each user address corresponds to at a given
* instant. That is, it takes the page that would be accessed if a user
* thread accesses the given user virtual address at that instant.
*
* This does not guarantee that the page exists in the user mappings when
* __get_user_pages returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
* and subsequently re faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
* contains data that was valid *at some point in time*. Typically, an IO
* or similar operation cannot guarantee anything stronger anyway because
* locks can't be held over the syscall boundary.
*
* If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
* the page is written to, set_page_dirty (or set_page_dirty_lock, as
* appropriate) must be called after the page is finished with, and
* before put_page is called.
*
* If @nonblocking != NULL, __get_user_pages will not wait for disk IO
* or mmap_sem contention, and if waiting is needed to pin all pages,
* *@nonblocking will be set to 0.
*
* In most cases, get_user_pages or get_user_pages_fast should be used
* instead of __get_user_pages. __get_user_pages should be used only if
* you need some special @gup_flags.
*/
2008-10-18 20:26:44 -07:00
int __get_user_pages ( struct task_struct * tsk , struct mm_struct * mm ,
2009-09-21 17:03:31 -07:00
unsigned long start , int nr_pages , unsigned int gup_flags ,
2011-01-13 15:46:14 -08:00
struct page * * pages , struct vm_area_struct * * vmas ,
int * nonblocking )
2005-04-16 15:20:36 -07:00
{
int i ;
2009-09-21 17:03:31 -07:00
unsigned long vm_flags ;
2005-04-16 15:20:36 -07:00
2009-06-25 11:58:55 +02:00
if ( nr_pages < = 0 )
2008-02-11 16:17:33 -07:00
return 0 ;
2009-09-21 17:03:31 -07:00
VM_BUG_ON ( ! ! pages ! = ! ! ( gup_flags & FOLL_GET ) ) ;
	/*
	 * Require read or write permissions.
	 * If FOLL_FORCE is set, we only require the "MAY" flags.
	 */
	vm_flags  = (gup_flags & FOLL_WRITE) ?
			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= (gup_flags & FOLL_FORCE) ?
			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);

	/*
	 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
	 * would be called on PROT_NONE ranges. We must never invoke
	 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
	 * page faults would unprotect the PROT_NONE ranges if
	 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
	 * bitflag. So to avoid that, don't set FOLL_NUMA if
	 * FOLL_FORCE is set.
	 */
	if (!(gup_flags & FOLL_FORCE))
		gup_flags |= FOLL_NUMA;

i = 0 ;
do {
2005-10-29 18:16:33 -07:00
struct vm_area_struct * vma ;
2005-04-16 15:20:36 -07:00
vma = find_extend_vma ( mm , start ) ;
2011-03-13 15:49:18 -04:00
if ( ! vma & & in_gate_area ( mm , start ) ) {
2005-04-16 15:20:36 -07:00
unsigned long pg = start & PAGE_MASK ;
pgd_t * pgd ;
pud_t * pud ;
pmd_t * pmd ;
pte_t * pte ;
2008-10-18 20:26:44 -07:00
/* user gate pages are read-only */
2009-09-21 17:03:31 -07:00
if ( gup_flags & FOLL_WRITE )
2005-04-16 15:20:36 -07:00
return i ? : - EFAULT ;
if ( pg > TASK_SIZE )
pgd = pgd_offset_k ( pg ) ;
else
pgd = pgd_offset_gate ( mm , pg ) ;
BUG_ON ( pgd_none ( * pgd ) ) ;
pud = pud_offset ( pgd , pg ) ;
BUG_ON ( pud_none ( * pud ) ) ;
pmd = pmd_offset ( pud , pg ) ;
2005-08-01 21:11:42 -07:00
if ( pmd_none ( * pmd ) )
return i ? : - EFAULT ;
2011-01-13 15:46:54 -08:00
VM_BUG_ON ( pmd_trans_huge ( * pmd ) ) ;
2005-04-16 15:20:36 -07:00
pte = pte_offset_map ( pmd , pg ) ;
2005-08-01 21:11:42 -07:00
if ( pte_none ( * pte ) ) {
pte_unmap ( pte ) ;
return i ? : - EFAULT ;
}
2011-04-12 14:15:51 -07:00
vma = get_gate_vma ( mm ) ;
2005-04-16 15:20:36 -07:00
if ( pages ) {
2010-07-30 10:58:26 -07:00
struct page * page ;
2011-04-12 14:15:51 -07:00
page = vm_normal_page ( vma , start , * pte ) ;
2010-07-30 10:58:26 -07:00
if ( ! page ) {
if ( ! ( gup_flags & FOLL_DUMP ) & &
is_zero_pfn ( pte_pfn ( * pte ) ) )
page = pte_page ( * pte ) ;
else {
pte_unmap ( pte ) ;
return i ? : - EFAULT ;
}
}
2005-11-28 14:34:23 -08:00
pages [ i ] = page ;
2010-07-30 10:58:26 -07:00
get_page ( page ) ;
2005-04-16 15:20:36 -07:00
}
pte_unmap ( pte ) ;
2011-04-12 14:15:51 -07:00
goto next_page ;
2005-04-16 15:20:36 -07:00
}
		if (!vma ||
		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
		    !(vm_flags & vma->vm_flags))
			return i ? : -EFAULT;

		if (is_vm_hugetlb_page(vma)) {
			i = follow_hugetlb_page(mm, vma, pages, vmas,
					&start, &nr_pages, i, gup_flags);
			continue;
		}
2005-10-29 18:16:33 -07:00
2005-04-16 15:20:36 -07:00
do {
2005-06-21 17:15:10 -07:00
struct page * page ;
2009-09-21 17:03:31 -07:00
unsigned int foll_flags = gup_flags ;
2005-04-16 15:20:36 -07:00
2007-07-15 23:38:16 -07:00
/*
2009-01-06 14:40:18 -08:00
* If we have a pending SIGKILL, don't keep faulting
2009-09-21 17:03:24 -07:00
* pages and potentially allocating memory.
2007-07-15 23:38:16 -07:00
*/
2009-09-21 17:03:24 -07:00
if ( unlikely ( fatal_signal_pending ( current ) ) )
2009-01-06 14:40:18 -08:00
return i ? i : - ERESTARTSYS ;
2007-07-15 23:38:16 -07:00
2005-10-29 18:16:33 -07:00
cond_resched ( ) ;
2005-11-28 14:34:23 -08:00
while ( ! ( page = follow_page ( vma , start , foll_flags ) ) ) {
2005-08-03 10:07:09 -07:00
int ret ;
2011-01-13 15:46:14 -08:00
unsigned int fault_flags = 0 ;
2011-05-09 13:01:09 +02:00
/* For mlock, just skip the stack guard page. */
if ( foll_flags & FOLL_MLOCK ) {
if ( stack_guard_page ( vma , start ) )
goto next_page ;
}
2011-01-13 15:46:14 -08:00
if ( foll_flags & FOLL_WRITE )
fault_flags | = FAULT_FLAG_WRITE ;
if ( nonblocking )
fault_flags | = FAULT_FLAG_ALLOW_RETRY ;
2011-03-22 16:30:51 -07:00
if ( foll_flags & FOLL_NOWAIT )
fault_flags | = ( FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT ) ;
2009-04-10 09:01:23 -07:00
				ret = handle_mm_fault(mm, vma, start,
							fault_flags);

				if (ret & VM_FAULT_ERROR) {
					if (ret & VM_FAULT_OOM)
						return i ? i : -ENOMEM;
					if (ret & (VM_FAULT_HWPOISON |
						   VM_FAULT_HWPOISON_LARGE)) {
						if (i)
							return i;
						else if (gup_flags & FOLL_HWPOISON)
							return -EHWPOISON;
						else
							return -EFAULT;
					}
					if (ret & VM_FAULT_SIGBUS)
						return i ? i : -EFAULT;
					BUG();
				}
2011-03-13 15:49:18 -04:00
if ( tsk ) {
if ( ret & VM_FAULT_MAJOR )
tsk - > maj_flt + + ;
else
tsk - > min_flt + + ;
}
2007-07-19 01:47:05 -07:00
2011-01-13 15:46:14 -08:00
if ( ret & VM_FAULT_RETRY ) {
2011-03-22 16:30:51 -07:00
if ( nonblocking )
* nonblocking = 0 ;
2011-01-13 15:46:14 -08:00
return i ;
}
				/*
				 * The VM_FAULT_WRITE bit tells us that
				 * do_wp_page has broken COW when necessary,
				 * even if maybe_mkwrite decided not to set
				 * pte_write. We can thus safely do subsequent
				 * page lookups as if they were reads. But only
				 * do so when looping for pte_write is futile:
				 * in some cases userspace may also be wanting
				 * to write to the gotten user page, which a
				 * read fault here might prevent (a readonly
				 * page might get reCOWed by userspace write).
				 */
				if ((ret & VM_FAULT_WRITE) &&
				    !(vma->vm_flags & VM_WRITE))
					foll_flags &= ~FOLL_WRITE;

				cond_resched();
}
2008-06-20 11:18:25 -07:00
if ( IS_ERR ( page ) )
return i ? i : PTR_ERR ( page ) ;
2005-04-16 15:20:36 -07:00
if ( pages ) {
2005-06-21 17:15:10 -07:00
pages [ i ] = page ;
2006-03-26 01:36:57 -08:00
2006-12-30 22:24:19 +00:00
flush_anon_page ( vma , page , start ) ;
2005-06-21 17:15:10 -07:00
flush_dcache_page ( page ) ;
2005-04-16 15:20:36 -07:00
}
2011-04-12 14:15:51 -07:00
next_page :
2005-04-16 15:20:36 -07:00
if ( vmas )
vmas [ i ] = vma ;
i + + ;
start + = PAGE_SIZE ;
2009-06-25 11:58:55 +02:00
nr_pages - - ;
} while ( nr_pages & & start < vma - > vm_end ) ;
} while ( nr_pages ) ;
2005-04-16 15:20:36 -07:00
return i ;
}
2011-01-30 11:15:47 +08:00
EXPORT_SYMBOL ( __get_user_pages ) ;
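/*
 * Illustrative sketch, not part of this file: how a caller that needs
 * special gup_flags might drive __get_user_pages() directly, here with
 * FOLL_TOUCH|FOLL_GET.  The function name and its callers are invented
 * for the example; a real caller must hold mmap_sem for read around the
 * call and put_page() every page it was handed.  Passing a non-NULL
 * @nonblocking changes the mmap_sem rules on VM_FAULT_RETRY, so the
 * sketch deliberately passes NULL there.
 */
static int example_pin_user_range(struct mm_struct *mm, unsigned long start,
				  int nr_pages, struct page **pages)
{
	int pinned;

	down_read(&mm->mmap_sem);
	pinned = __get_user_pages(current, mm, start, nr_pages,
				  FOLL_TOUCH | FOLL_GET,
				  pages, NULL, NULL);
	up_read(&mm->mmap_sem);

	/* pinned may be fewer than nr_pages; the caller must cope. */
	return pinned;
}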
2008-10-18 20:26:44 -07:00
2011-07-25 17:12:32 -07:00
/*
 * fixup_user_fault() - manually resolve a user page fault
 * @tsk: the task_struct to use for page fault accounting, or
 * NULL if faults are not to be recorded.
 * @mm: mm_struct of target mm
 * @address: user address
 * @fault_flags: flags to pass down to handle_mm_fault()
 *
 * This is meant to be called in the specific scenario where for locking
 * reasons we try to access user memory in atomic context (within a
 * pagefault_disable() section); that access fails with -EFAULT, and we
 * want to resolve the user fault before trying again.
 *
 * Typically this is meant to be used by the futex code.
 *
 * The main difference with get_user_pages() is that this function will
 * unconditionally call handle_mm_fault() which will in turn perform all the
 * necessary SW fixup of the dirty and young bits in the PTE, while
 * get_user_pages() only guarantees to update these in the struct page.
 *
 * This is important for some architectures where those bits also gate the
 * access permission to the page because they are maintained in software.  On
 * such architectures, gup() will not be enough to make a subsequent access
 * succeed.
 *
 * This should be called with the mmap_sem held for read.
 */
int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
		     unsigned long address, unsigned int fault_flags)
{
	struct vm_area_struct *vma;
	int ret;

	vma = find_extend_vma(mm, address);
	if (!vma || address < vma->vm_start)
		return -EFAULT;

	ret = handle_mm_fault(mm, vma, address, fault_flags);
	if (ret & VM_FAULT_ERROR) {
		if (ret & VM_FAULT_OOM)
			return -ENOMEM;
		if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
			return -EHWPOISON;
		if (ret & VM_FAULT_SIGBUS)
			return -EFAULT;
		BUG();
	}
	if (tsk) {
		if (ret & VM_FAULT_MAJOR)
			tsk->maj_flt++;
		else
			tsk->min_flt++;
	}
	return 0;
}
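/*
 * Illustrative sketch, not part of this file: the futex-style pattern the
 * comment above describes.  A fast path tries the access with page faults
 * disabled; on failure the slow path resolves the fault with
 * fixup_user_fault() under mmap_sem and retries.  The function name and
 * the unbounded retry are simplifications invented for the example.
 */
static int example_atomic_user_write(u32 __user *uaddr, u32 val)
{
	struct mm_struct *mm = current->mm;
	int ret;

retry:
	pagefault_disable();
	ret = __put_user(val, uaddr);
	pagefault_enable();
	if (!ret)
		return 0;

	/* Slow path: fault the page in writably, then try again. */
	down_read(&mm->mmap_sem);
	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE);
	up_read(&mm->mmap_sem);
	if (ret)
		return ret;
	goto retry;
}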
/*
2009-06-16 15:31:39 -07:00
* get_user_pages() - pin user pages in memory
2011-03-13 15:49:18 -04:00
* @tsk: the task_struct to use for page fault accounting, or
* NULL if faults are not to be recorded.
2009-06-16 15:31:39 -07:00
* @mm: mm_struct of target mm
* @start: starting user address
2009-06-25 11:58:55 +02:00
* @nr_pages: number of pages from start to pin
2009-06-16 15:31:39 -07:00
* @write: whether pages will be written to by the caller
* @force: whether to force write access even if user mapping is
* readonly. This will result in the page being COWed even
* in MAP_SHARED mappings. You do not want this.
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
* with a put_page() call when it is finished with. vmas will only
* remain valid while mmap_sem is held.
*
* Must be called with mmap_sem held for read or write.
*
* get_user_pages walks a process's page tables and takes a reference to
* each struct page that each user address corresponds to at a given
* instant. That is, it takes the page that would be accessed if a user
* thread accesses the given user virtual address at that instant.
*
* This does not guarantee that the page exists in the user mappings when
* get_user_pages returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
* and subsequently re faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
* contains data that was valid *at some point in time*. Typically, an IO
* or similar operation cannot guarantee anything stronger anyway because
* locks can't be held over the syscall boundary.
*
* If write=0, the page must not be written to. If the page is written to,
* set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
* after the page is finished with, and before put_page is called.
*
* get_user_pages is typically used for fewer-copy IO operations, to get a
* handle on the memory by some means other than accesses via the user virtual
* addresses. The pages may be submitted for DMA to devices or accessed via
* their kernel linear mapping (via the kmap APIs). Care should be taken to
* use the correct cache flushing APIs.
*
* See also get_user_pages_fast, for performance critical applications.
*/
2008-10-18 20:26:44 -07:00
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, int nr_pages, int write, int force,
		struct page **pages, struct vm_area_struct **vmas)
{
	int flags = FOLL_TOUCH;

	if (pages)
		flags |= FOLL_GET;
	if (write)
		flags |= FOLL_WRITE;
	if (force)
		flags |= FOLL_FORCE;

	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
				NULL);
}
EXPORT_SYMBOL(get_user_pages);
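/*
 * Illustrative sketch, not part of this file: the usual pin-for-IO pattern
 * the comment above describes.  The function name and error handling are
 * simplified for the example; the put_page()/set_page_dirty_lock() rules
 * follow the documentation above (write=1, so the pages are dirtied).
 */
static int example_pin_for_io(unsigned long uaddr, int npages,
			      struct page **pages)
{
	int got, i;

	down_read(&current->mm->mmap_sem);
	got = get_user_pages(current, current->mm, uaddr, npages,
			     1 /* write */, 0 /* force */, pages, NULL);
	up_read(&current->mm->mmap_sem);
	if (got <= 0)
		return got ? got : -EFAULT;

	/* ... hand pages[0..got-1] to the device or kmap() them ... */

	for (i = 0; i < got; i++) {
		set_page_dirty_lock(pages[i]);	/* we wrote to them */
		put_page(pages[i]);
	}
	return got;
}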
2009-09-21 17:03:25 -07:00
/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
*
* Returns struct page pointer of user page pinned for dump,
* to be freed afterwards by page_cache_release() or put_page().
*
* Returns NULL on any kind of failure - a hole must then be inserted into
* the corefile, to preserve alignment with its headers; and also returns
* NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
* allowing a hole to be left in the corefile to save diskspace.
*
* Called without mmap_sem, but after all other threads have been killed.
*/
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr)
{
	struct vm_area_struct *vma;
	struct page *page;

	if (__get_user_pages(current, current->mm, addr, 1,
			FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
			NULL) < 1)
		return NULL;
	flush_cache_page(vma, addr, page_to_pfn(page));
	return page;
}
#endif /* CONFIG_ELF_CORE */
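/*
 * Illustrative sketch, not part of this file (and only meaningful under
 * CONFIG_ELF_CORE): how a core-dump writer might use get_dump_page().
 * The emit/skip steps are left as comments because the real output helpers
 * live in the binfmt code; the point is only the page-or-hole decision and
 * the page_cache_release() after use.
 */
static int example_dump_range(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		struct page *page = get_dump_page(addr);

		if (page) {
			/* write PAGE_SIZE bytes of this page to the corefile */
			page_cache_release(page);
		} else {
			/* emit a PAGE_SIZE hole in the corefile instead */
		}
	}
	return 0;
}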
2010-10-26 14:21:59 -07:00
pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
			spinlock_t **ptl)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	pud_t *pud = pud_alloc(mm, pgd, addr);
	if (pud) {
		pmd_t *pmd = pmd_alloc(mm, pud, addr);
		if (pmd) {
			VM_BUG_ON(pmd_trans_huge(*pmd));
			return pte_alloc_map_lock(mm, pmd, addr, ptl);
		}
	}
	return NULL;
}
2005-11-29 13:01:56 -08:00
/*
* This is the old fallback for page remapping.
*
* For historical reasons, it only allows reserved pages. Only
* old drivers should use this, and they needed to mark their
* pages reserved for the old functions anyway.
*/
2008-04-28 02:13:01 -07:00
static int insert_page ( struct vm_area_struct * vma , unsigned long addr ,
struct page * page , pgprot_t prot )
2005-11-29 13:01:56 -08:00
{
2008-04-28 02:13:01 -07:00
struct mm_struct * mm = vma - > vm_mm ;
2005-11-29 13:01:56 -08:00
int retval ;
2005-11-29 14:03:14 -08:00
pte_t * pte ;
2008-02-07 00:13:53 -08:00
spinlock_t * ptl ;
2005-11-29 13:01:56 -08:00
retval = - EINVAL ;
2005-11-30 09:35:19 -08:00
if ( PageAnon ( page ) )
2008-10-18 20:28:10 -07:00
goto out ;
2005-11-29 13:01:56 -08:00
retval = - ENOMEM ;
flush_dcache_page ( page ) ;
2005-11-29 14:03:14 -08:00
pte = get_locked_pte ( mm , addr , & ptl ) ;
2005-11-29 13:01:56 -08:00
if ( ! pte )
2008-10-18 20:28:10 -07:00
goto out ;
2005-11-29 13:01:56 -08:00
retval = - EBUSY ;
if ( ! pte_none ( * pte ) )
goto out_unlock ;
/* Ok, finally just insert the thing.. */
get_page ( page ) ;
2010-03-05 13:41:40 -08:00
inc_mm_counter_fast ( mm , MM_FILEPAGES ) ;
2005-11-29 13:01:56 -08:00
page_add_file_rmap ( page ) ;
set_pte_at ( mm , addr , pte , mk_pte ( page , prot ) ) ;
retval = 0 ;
2008-02-07 00:13:53 -08:00
pte_unmap_unlock ( pte , ptl ) ;
return retval ;
2005-11-29 13:01:56 -08:00
out_unlock :
pte_unmap_unlock ( pte , ptl ) ;
out :
return retval ;
}
2006-09-25 23:31:22 -07:00
/**
* vm_insert_page - insert single page into user vma
* @vma: user vma to map to
* @addr: target user address of this page
* @page: source kernel page
*
2005-11-30 09:35:19 -08:00
* This allows drivers to insert individual pages they've allocated
* into a user vma.
*
* The page has to be a nice clean _individual_ kernel allocation.
* If you allocate a compound page, you need to have marked it as
* such (__GFP_COMP), or manually just split the page up yourself
2006-03-22 00:08:05 -08:00
* (see split_page()).
2005-11-30 09:35:19 -08:00
*
* NOTE! Traditionally this was done with "remap_pfn_range()" which
* took an arbitrary page protection parameter. This doesn't allow
* that. Your vma protection will have to be set up correctly, which
* means that if you want a shared writable mapping, you'd better
* ask for a shared writable mapping!
*
* The page does not need to be reserved.
2012-10-08 16:28:40 -07:00
*
* Usually this function is called from f_op->mmap() handler
* under mm->mmap_sem write-lock, so it can change vma->vm_flags.
* Caller must set VM_MIXEDMAP on vma if it wants to call this
* function from other places, for example from page-fault handler.
2005-11-30 09:35:19 -08:00
*/
2008-04-28 02:13:01 -07:00
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
{
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (!page_count(page))
		return -EINVAL;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vma->vm_flags |= VM_MIXEDMAP;
	}
	return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);
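/*
 * Illustrative sketch, not part of this file: a driver ->mmap() handler
 * inserting its own order-0, non-compound pages with vm_insert_page(),
 * relying on the mmap_sem write-lock noted above so the function may set
 * VM_MIXEDMAP itself.  "struct example_buf" and example_mmap() are invented
 * for the example.
 */
struct example_buf {
	unsigned long	npages;
	struct page	**pages;	/* driver-allocated, !PageAnon */
};

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct example_buf *buf = file->private_data;
	unsigned long uaddr = vma->vm_start;
	unsigned long i, npages = vma_pages(vma);
	int err;

	if (npages > buf->npages)
		return -EINVAL;

	for (i = 0; i < npages; i++) {
		err = vm_insert_page(vma, uaddr, buf->pages[i]);
		if (err)
			return err;
		uaddr += PAGE_SIZE;
	}
	return 0;
}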
2005-11-30 09:35:19 -08:00
2008-04-28 02:13:01 -07:00
static int insert_pfn ( struct vm_area_struct * vma , unsigned long addr ,
unsigned long pfn , pgprot_t prot )
{
struct mm_struct * mm = vma - > vm_mm ;
int retval ;
pte_t * pte , entry ;
spinlock_t * ptl ;
retval = - ENOMEM ;
pte = get_locked_pte ( mm , addr , & ptl ) ;
if ( ! pte )
goto out ;
retval = - EBUSY ;
if ( ! pte_none ( * pte ) )
goto out_unlock ;
/* Ok, finally just insert the thing.. */
entry = pte_mkspecial ( pfn_pte ( pfn , prot ) ) ;
set_pte_at ( mm , addr , pte , entry ) ;
2009-12-18 16:40:18 +00:00
update_mmu_cache ( vma , addr , pte ) ; /* XXX: why not for insert_page? */
2008-04-28 02:13:01 -07:00
retval = 0 ;
out_unlock :
pte_unmap_unlock ( pte , ptl ) ;
out :
return retval ;
}
2007-02-12 00:51:36 -08:00
/**
* vm_insert_pfn - insert single pfn into user vma
* @vma: user vma to map to
* @addr: target user address of this page
* @pfn: source kernel pfn
*
2012-10-08 16:33:43 -07:00
* Similar to vm_insert_page, this allows drivers to insert individual pages
2007-02-12 00:51:36 -08:00
* they've allocated into a user vma. Same comments apply.
*
* This function should only be called from a vm_ops->fault handler, and
* in that case the handler should return NULL.
2008-07-23 21:27:05 -07:00
*
* vma cannot be a COW mapping.
*
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
2007-02-12 00:51:36 -08:00
*/
int vm_insert_pfn ( struct vm_area_struct * vma , unsigned long addr ,
2008-04-28 02:13:01 -07:00
unsigned long pfn )
2007-02-12 00:51:36 -08:00
{
2008-12-18 11:41:29 -08:00
int ret ;
2009-01-09 16:13:11 -08:00
pgprot_t pgprot = vma - > vm_page_prot ;
2008-04-28 02:13:00 -07:00
/*
* Technically, architectures with pte_special can avoid all these
* restrictions (same for remap_pfn_range). However we would like
* consistency in testing and feature parity among all, so we should
* try to keep these invariants in place for everybody.
*/
2008-04-28 02:12:58 -07:00
BUG_ON ( ! ( vma - > vm_flags & ( VM_PFNMAP | VM_MIXEDMAP ) ) ) ;
BUG_ON ( ( vma - > vm_flags & ( VM_PFNMAP | VM_MIXEDMAP ) ) = =
( VM_PFNMAP | VM_MIXEDMAP ) ) ;
BUG_ON ( ( vma - > vm_flags & VM_PFNMAP ) & & is_cow_mapping ( vma - > vm_flags ) ) ;
BUG_ON ( ( vma - > vm_flags & VM_MIXEDMAP ) & & pfn_valid ( pfn ) ) ;
2007-02-12 00:51:36 -08:00
2008-04-28 02:13:01 -07:00
if ( addr < vma - > vm_start | | addr > = vma - > vm_end )
return - EFAULT ;
2012-10-08 16:28:29 -07:00
if ( track_pfn_insert ( vma , & pgprot , pfn ) )
2008-12-18 11:41:29 -08:00
return - EINVAL ;
2009-01-09 16:13:11 -08:00
ret = insert_pfn ( vma , addr , pfn , pgprot ) ;
2008-12-18 11:41:29 -08:00
return ret ;
2007-02-12 00:51:36 -08:00
}
EXPORT_SYMBOL ( vm_insert_pfn ) ;
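/*
 * Illustrative sketch, not part of this file: a VM_PFNMAP driver's ->fault
 * handler mapping device memory with vm_insert_pfn() and returning
 * VM_FAULT_NOPAGE, as the comment above suggests.  It assumes the driver's
 * ->mmap() already set VM_PFNMAP|VM_IO and vm_private_data; "struct
 * example_dev" and its fields are invented for the example.
 */
struct example_dev {
	phys_addr_t	phys_base;
	unsigned long	nr_pages;
};

static int example_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct example_dev *dev = vma->vm_private_data;
	unsigned long pfn;
	int err;

	if (vmf->pgoff >= dev->nr_pages)
		return VM_FAULT_SIGBUS;

	pfn = (dev->phys_base >> PAGE_SHIFT) + vmf->pgoff;
	err = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn);
	if (err == -ENOMEM)
		return VM_FAULT_OOM;
	if (err < 0 && err != -EBUSY)	/* -EBUSY: raced, pte already there */
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}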
2008-04-28 02:13:01 -07:00
int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn)
{
	BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;

	/*
	 * If we don't have pte special, then we have to use the pfn_valid()
	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
	 * refcount the page if pfn_valid is true (hence insert_page rather
	 * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
	 * without pte special, it would there be refcounted as a normal page.
	 */
	if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
		struct page *page;

		page = pfn_to_page(pfn);
		return insert_page(vma, addr, page, vma->vm_page_prot);
	}
	return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_mixed);
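/*
 * Illustrative sketch, not part of this file: the VM_MIXEDMAP variant of
 * the fault handler above, for a backing store whose pfns sometimes do and
 * sometimes do not have a struct page; vm_insert_mixed() picks the right
 * insertion path either way.  It assumes the vma has VM_MIXEDMAP set;
 * "example_base_pfn" is an invented stand-in for the driver's pfn lookup.
 */
static unsigned long example_base_pfn;	/* invented: set up by the driver */

static int example_mixed_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long pfn = example_base_pfn + vmf->pgoff;
	int err;

	err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, pfn);
	if (err && err != -EBUSY)
		return VM_FAULT_SIGBUS;
	return VM_FAULT_NOPAGE;
}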
2005-04-16 15:20:36 -07:00
/*
 * Maps a range of physical memory into the requested pages. The old
 * mappings are removed. Any references to nonexistent pages result
 * in null mappings (currently treated as "copy-on-access").
 */
static int remap_pte_range ( struct mm_struct * mm , pmd_t * pmd ,
unsigned long addr , unsigned long end ,
unsigned long pfn , pgprot_t prot )
{
pte_t * pte ;
2005-10-29 18:16:23 -07:00
spinlock_t * ptl ;
2005-04-16 15:20:36 -07:00
2005-10-29 18:16:23 -07:00
pte = pte_alloc_map_lock ( mm , pmd , addr , & ptl ) ;
2005-04-16 15:20:36 -07:00
if ( ! pte )
return - ENOMEM ;
2006-09-30 23:29:33 -07:00
arch_enter_lazy_mmu_mode ( ) ;
2005-04-16 15:20:36 -07:00
do {
BUG_ON ( ! pte_none ( * pte ) ) ;
2008-04-28 02:13:00 -07:00
set_pte_at ( mm , addr , pte , pte_mkspecial ( pfn_pte ( pfn , prot ) ) ) ;
2005-04-16 15:20:36 -07:00
pfn + + ;
} while ( pte + + , addr + = PAGE_SIZE , addr ! = end ) ;
2006-09-30 23:29:33 -07:00
arch_leave_lazy_mmu_mode ( ) ;
2005-10-29 18:16:23 -07:00
pte_unmap_unlock ( pte - 1 , ptl ) ;
2005-04-16 15:20:36 -07:00
return 0 ;
}
static inline int remap_pmd_range ( struct mm_struct * mm , pud_t * pud ,
unsigned long addr , unsigned long end ,
unsigned long pfn , pgprot_t prot )
{
pmd_t * pmd ;
unsigned long next ;
pfn - = addr > > PAGE_SHIFT ;
pmd = pmd_alloc ( mm , pud , addr ) ;
if ( ! pmd )
return - ENOMEM ;
2011-01-13 15:46:54 -08:00
VM_BUG_ON ( pmd_trans_huge ( * pmd ) ) ;
2005-04-16 15:20:36 -07:00
do {
next = pmd_addr_end ( addr , end ) ;
if ( remap_pte_range ( mm , pmd , addr , next ,
pfn + ( addr > > PAGE_SHIFT ) , prot ) )
return - ENOMEM ;
} while ( pmd + + , addr = next , addr ! = end ) ;
return 0 ;
}
static inline int remap_pud_range ( struct mm_struct * mm , pgd_t * pgd ,
unsigned long addr , unsigned long end ,
unsigned long pfn , pgprot_t prot )
{
pud_t * pud ;
unsigned long next ;
pfn - = addr > > PAGE_SHIFT ;
pud = pud_alloc ( mm , pgd , addr ) ;
if ( ! pud )
return - ENOMEM ;
do {
next = pud_addr_end ( addr , end ) ;
if ( remap_pmd_range ( mm , pud , addr , next ,
pfn + ( addr > > PAGE_SHIFT ) , prot ) )
return - ENOMEM ;
} while ( pud + + , addr = next , addr ! = end ) ;
return 0 ;
}
2006-09-25 23:31:22 -07:00
/**
* remap_pfn_range - remap kernel memory to userspace
* @vma: user vma to map to
* @addr: target user address to start at
* @pfn: physical address of kernel memory
* @size: size of map area
* @prot: page protection flags for this mapping
*
* Note: this is only safe if the mm semaphore is held when called.
*/
2005-04-16 15:20:36 -07:00
int remap_pfn_range ( struct vm_area_struct * vma , unsigned long addr ,
unsigned long pfn , unsigned long size , pgprot_t prot )
{
pgd_t * pgd ;
unsigned long next ;
2005-06-25 14:54:33 -07:00
unsigned long end = addr + PAGE_ALIGN ( size ) ;
2005-04-16 15:20:36 -07:00
struct mm_struct * mm = vma - > vm_mm ;
int err ;
/*
* Physically remapped pages are special. Tell the
* rest of the world about it:
* VM_IO tells people not to look at these pages
* (accesses can have side effects).
2005-11-28 14:34:23 -08:00
* VM_PFNMAP tells the core MM that the base pages are just
* raw PFN mappings, and do not have a "struct page" associated
* with them.
2012-10-08 16:29:02 -07:00
* VM_DONTEXPAND
* Disable vma merging and expanding with mremap().
* VM_DONTDUMP
* Omit vma from core dump, even when VM_IO turned off.
2005-12-11 19:46:02 -08:00
*
* There's a horrible special case to handle copy-on-write
* behaviour that some programs depend on. We mark the "original"
* un-COW'ed pages by matching them up with "vma->vm_pgoff".
2012-10-08 16:28:34 -07:00
* See vm_normal_page() for details.
2005-04-16 15:20:36 -07:00
*/
2012-10-08 16:28:34 -07:00
if ( is_cow_mapping ( vma - > vm_flags ) ) {
if ( addr ! = vma - > vm_start | | end ! = vma - > vm_end )
return - EINVAL ;
2005-12-11 19:46:02 -08:00
vma - > vm_pgoff = pfn ;
2012-10-08 16:28:34 -07:00
}
err = track_pfn_remap ( vma , & prot , pfn , addr , PAGE_ALIGN ( size ) ) ;
if ( err )
2008-12-18 11:41:27 -08:00
return - EINVAL ;
2005-12-11 19:46:02 -08:00
2012-10-08 16:29:02 -07:00
vma - > vm_flags | = VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP ;
2005-04-16 15:20:36 -07:00
BUG_ON ( addr > = end ) ;
pfn - = addr > > PAGE_SHIFT ;
pgd = pgd_offset ( mm , addr ) ;
flush_cache_range ( vma , addr , end ) ;
do {
next = pgd_addr_end ( addr , end ) ;
err = remap_pud_range ( mm , pgd , addr , next ,
pfn + ( addr > > PAGE_SHIFT ) , prot ) ;
if ( err )
break ;
} while ( pgd + + , addr = next , addr ! = end ) ;
2008-12-18 11:41:29 -08:00
if ( err )
2012-10-08 16:28:29 -07:00
untrack_pfn ( vma , pfn , PAGE_ALIGN ( size ) ) ;
2008-12-18 11:41:29 -08:00
2005-04-16 15:20:36 -07:00
return err ;
}
EXPORT_SYMBOL ( remap_pfn_range ) ;
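/*
 * Illustrative sketch, not part of this file: the classic use of
 * remap_pfn_range() from a character device ->mmap(), mapping a physical
 * register window uncached into the caller's vma.  EXAMPLE_PHYS_BASE and
 * EXAMPLE_PHYS_SIZE are invented placeholders; pgprot_noncached() is the
 * usual choice for register space on most architectures.
 */
#define EXAMPLE_PHYS_BASE	0xfe000000UL	/* invented register window */
#define EXAMPLE_PHYS_SIZE	0x00100000UL

static int example_io_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	if ((vma->vm_pgoff << PAGE_SHIFT) + size > EXAMPLE_PHYS_SIZE)
		return -EINVAL;

	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	return remap_pfn_range(vma, vma->vm_start,
			       (EXAMPLE_PHYS_BASE >> PAGE_SHIFT) + vma->vm_pgoff,
			       size, vma->vm_page_prot);
}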
2007-05-06 14:48:54 -07:00
static int apply_to_pte_range ( struct mm_struct * mm , pmd_t * pmd ,
unsigned long addr , unsigned long end ,
pte_fn_t fn , void * data )
{
pte_t * pte ;
int err ;
2008-02-08 04:22:04 -08:00
pgtable_t token ;
2007-05-06 14:49:17 -07:00
spinlock_t * uninitialized_var ( ptl ) ;
2007-05-06 14:48:54 -07:00
pte = ( mm = = & init_mm ) ?
pte_alloc_kernel ( pmd , addr ) :
pte_alloc_map_lock ( mm , pmd , addr , & ptl ) ;
if ( ! pte )
return - ENOMEM ;
BUG_ON ( pmd_huge ( * pmd ) ) ;
2009-01-06 14:39:21 -08:00
arch_enter_lazy_mmu_mode ( ) ;
2008-02-08 04:22:04 -08:00
token = pmd_pgtable ( * pmd ) ;
2007-05-06 14:48:54 -07:00
do {
2009-10-26 16:50:23 -07:00
err = fn ( pte + + , token , addr , data ) ;
2007-05-06 14:48:54 -07:00
if ( err )
break ;
2009-10-26 16:50:23 -07:00
} while ( addr + = PAGE_SIZE , addr ! = end ) ;
2007-05-06 14:48:54 -07:00
2009-01-06 14:39:21 -08:00
arch_leave_lazy_mmu_mode ( ) ;
2007-05-06 14:48:54 -07:00
if ( mm ! = & init_mm )
pte_unmap_unlock ( pte - 1 , ptl ) ;
return err ;
}
static int apply_to_pmd_range ( struct mm_struct * mm , pud_t * pud ,
unsigned long addr , unsigned long end ,
pte_fn_t fn , void * data )
{
pmd_t * pmd ;
unsigned long next ;
int err ;
2008-07-23 21:27:50 -07:00
BUG_ON ( pud_huge ( * pud ) ) ;
2007-05-06 14:48:54 -07:00
pmd = pmd_alloc ( mm , pud , addr ) ;
if ( ! pmd )
return - ENOMEM ;
do {
next = pmd_addr_end ( addr , end ) ;
err = apply_to_pte_range ( mm , pmd , addr , next , fn , data ) ;
if ( err )
break ;
} while ( pmd + + , addr = next , addr ! = end ) ;
return err ;
}
static int apply_to_pud_range ( struct mm_struct * mm , pgd_t * pgd ,
unsigned long addr , unsigned long end ,
pte_fn_t fn , void * data )
{
pud_t * pud ;
unsigned long next ;
int err ;
pud = pud_alloc ( mm , pgd , addr ) ;
if ( ! pud )
return - ENOMEM ;
do {
next = pud_addr_end ( addr , end ) ;
err = apply_to_pmd_range ( mm , pud , addr , next , fn , data ) ;
if ( err )
break ;
} while ( pud + + , addr = next , addr ! = end ) ;
return err ;
}
/*
* Scan a region of virtual memory, filling in page tables as necessary
* and calling a provided function on each leaf page table.
*/
int apply_to_page_range ( struct mm_struct * mm , unsigned long addr ,
unsigned long size , pte_fn_t fn , void * data )
{
pgd_t * pgd ;
unsigned long next ;
2010-08-09 17:19:52 -07:00
unsigned long end = addr + size ;
2007-05-06 14:48:54 -07:00
int err ;
BUG_ON ( addr > = end ) ;
pgd = pgd_offset ( mm , addr ) ;
do {
next = pgd_addr_end ( addr , end ) ;
err = apply_to_pud_range ( mm , pgd , addr , next , fn , data ) ;
if ( err )
break ;
} while ( pgd + + , addr = next , addr ! = end ) ;
2010-08-09 17:19:52 -07:00
2007-05-06 14:48:54 -07:00
return err ;
}
EXPORT_SYMBOL_GPL ( apply_to_page_range ) ;
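/*
 * Illustrative sketch, not part of this file: a minimal pte_fn_t callback
 * for apply_to_page_range().  This one only counts how many leaf ptes in
 * the range are currently present; note that the walk allocates any missing
 * page tables on the way down, it does not populate the ptes themselves.
 * Callers operating on a user mm would normally hold mmap_sem; the names
 * below are invented for the example.
 */
static int example_count_present(pte_t *pte, pgtable_t token,
				 unsigned long addr, void *data)
{
	unsigned long *count = data;

	if (pte_present(*pte))
		(*count)++;
	return 0;
}

static unsigned long example_present_pages(struct mm_struct *mm,
					   unsigned long addr,
					   unsigned long size)
{
	unsigned long count = 0;

	if (apply_to_page_range(mm, addr, size, example_count_present, &count))
		return 0;	/* ran out of memory building page tables */
	return count;
}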
/*
 * handle_pte_fault chooses page fault handler according to an entry
 * which was read non-atomically.  Before making any commitment, on
 * those architectures or configurations (e.g. i386 with PAE) which
 * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
 * must check under lock before unmapping the pte and proceeding
 * (but do_wp_page is only called after already making such a check;
 * and do_anonymous_page can safely check later on).
 */
2005-10-29 18:16:40 -07:00
static inline int pte_unmap_same ( struct mm_struct * mm , pmd_t * pmd ,
2005-10-29 18:16:26 -07:00
pte_t * page_table , pte_t orig_pte )
{
int same = 1 ;
# if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
if ( sizeof ( pte_t ) > sizeof ( unsigned long ) ) {
2005-10-29 18:16:40 -07:00
spinlock_t * ptl = pte_lockptr ( mm , pmd ) ;
spin_lock ( ptl ) ;
2005-10-29 18:16:26 -07:00
same = pte_same ( * page_table , orig_pte ) ;
2005-10-29 18:16:40 -07:00
spin_unlock ( ptl ) ;
2005-10-29 18:16:26 -07:00
}
# endif
pte_unmap ( page_table ) ;
return same ;
}
2006-12-12 17:14:55 +00:00
static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
	/*
	 * If the source page was a PFN mapping, we don't have
	 * a "struct page" for it. We do a best-effort copy by
	 * just copying from the original user address. If that
	 * fails, we just zero-fill it. Live with it.
	 */
	if (unlikely(!src)) {
		void *kaddr = kmap_atomic(dst);
		void __user *uaddr = (void __user *)(va & PAGE_MASK);

		/*
		 * This really shouldn't fail, because the page is there
		 * in the page tables. But it might just be unreadable,
		 * in which case we just give up and fill the result with
		 * zeroes.
		 */
		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
			clear_page(kaddr);
		kunmap_atomic(kaddr);
		flush_dcache_page(dst);
	} else
		copy_user_highpage(dst, src, va, vma);
}
2005-04-16 15:20:36 -07:00
/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
2005-10-29 18:15:59 -07:00
static int do_wp_page ( struct mm_struct * mm , struct vm_area_struct * vma ,
unsigned long address , pte_t * page_table , pmd_t * pmd ,
2005-10-29 18:16:26 -07:00
spinlock_t * ptl , pte_t orig_pte )
2010-10-26 14:22:00 -07:00
__releases ( ptl )
2005-04-16 15:20:36 -07:00
{
2012-10-08 16:33:33 -07:00
struct page * old_page , * new_page = NULL ;
2005-04-16 15:20:36 -07:00
pte_t entry ;
2011-01-13 15:46:07 -08:00
int ret = 0 ;
2007-10-08 18:54:37 +02:00
int page_mkwrite = 0 ;
2006-09-25 23:30:57 -07:00
struct page * dirty_page = NULL ;
2012-11-16 14:14:48 -08:00
unsigned long mmun_start = 0 ; /* For mmu_notifiers */
unsigned long mmun_end = 0 ; /* For mmu_notifiers */
2005-04-16 15:20:36 -07:00
2005-11-28 14:34:23 -08:00
old_page = vm_normal_page ( vma , address , orig_pte ) ;
2008-07-04 09:59:24 -07:00
if ( ! old_page ) {
/*
* VM_MIXEDMAP !pfn_valid() case
*
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable as we can't do any dirty
* accounting on raw pfn maps.
*/
if ( ( vma - > vm_flags & ( VM_WRITE | VM_SHARED ) ) = =
( VM_WRITE | VM_SHARED ) )
goto reuse ;
2005-11-28 14:34:23 -08:00
goto gotten ;
2008-07-04 09:59:24 -07:00
}
2005-04-16 15:20:36 -07:00
2006-09-25 23:30:57 -07:00
	/*
	 * Take out anonymous pages first; anonymous shared vmas are
	 * not dirty accountable.
	 */
	if (PageAnon(old_page) && !PageKsm(old_page)) {
2009-01-06 14:39:33 -08:00
if ( ! trylock_page ( old_page ) ) {
page_cache_get ( old_page ) ;
pte_unmap_unlock ( page_table , ptl ) ;
lock_page ( old_page ) ;
page_table = pte_offset_map_lock ( mm , pmd , address ,
& ptl ) ;
if ( ! pte_same ( * page_table , orig_pte ) ) {
unlock_page ( old_page ) ;
goto unlock ;
}
page_cache_release ( old_page ) ;
2006-09-25 23:31:00 -07:00
}
2011-01-13 15:46:07 -08:00
if ( reuse_swap_page ( old_page ) ) {
2010-03-05 13:42:09 -08:00
/*
* The page is all ours. Move it to our anon_vma so
* the rmap code will not search our parent or siblings.
* Protected against the rmap code by the page lock.
*/
page_move_anon_rmap ( old_page , vma , address ) ;
2011-01-13 15:46:07 -08:00
unlock_page ( old_page ) ;
goto reuse ;
}
2009-01-06 14:39:33 -08:00
unlock_page ( old_page ) ;
2006-09-25 23:31:00 -07:00
} else if ( unlikely ( ( vma - > vm_flags & ( VM_WRITE | VM_SHARED ) ) = =
2006-09-25 23:30:57 -07:00
( VM_WRITE | VM_SHARED ) ) ) {
2006-09-25 23:31:00 -07:00
/*
* Only catch write-faults on shared writable pages,
* read-only shared pages can get COWed by
* get_user_pages(.write=1, .force=1).
*/
2006-06-23 02:03:43 -07:00
if ( vma - > vm_ops & & vma - > vm_ops - > page_mkwrite ) {
2009-03-31 15:23:21 -07:00
struct vm_fault vmf ;
int tmp ;
vmf . virtual_address = ( void __user * ) ( address &
PAGE_MASK ) ;
vmf . pgoff = old_page - > index ;
vmf . flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE ;
vmf . page = old_page ;
2006-06-23 02:03:43 -07:00
/*
* Notify the address space that the page is about to
* become writable so that it can prohibit this or wait
* for the page to get into an appropriate state.
*
* We do this without the lock held, so that it can
* sleep if it needs to.
*/
page_cache_get ( old_page ) ;
pte_unmap_unlock ( page_table , ptl ) ;
2009-03-31 15:23:21 -07:00
tmp = vma - > vm_ops - > page_mkwrite ( vma , & vmf ) ;
if ( unlikely ( tmp &
( VM_FAULT_ERROR | VM_FAULT_NOPAGE ) ) ) {
ret = tmp ;
2006-06-23 02:03:43 -07:00
goto unwritable_page ;
2009-03-31 15:23:21 -07:00
}
2009-04-30 15:08:16 -07:00
if ( unlikely ( ! ( tmp & VM_FAULT_LOCKED ) ) ) {
lock_page ( old_page ) ;
if ( ! old_page - > mapping ) {
ret = 0 ; /* retry the fault */
unlock_page ( old_page ) ;
goto unwritable_page ;
}
} else
VM_BUG_ON ( ! PageLocked ( old_page ) ) ;
2006-06-23 02:03:43 -07:00
/*
* Since we dropped the lock we need to revalidate
* the PTE as someone else may have changed it. If
* they did, we just return, as we can count on the
* MMU to tell us if they didn't also make it writable.
*/
page_table = pte_offset_map_lock ( mm , pmd , address ,
& ptl ) ;
2009-04-30 15:08:16 -07:00
if ( ! pte_same ( * page_table , orig_pte ) ) {
unlock_page ( old_page ) ;
2006-06-23 02:03:43 -07:00
goto unlock ;
2009-04-30 15:08:16 -07:00
}
2007-10-08 18:54:37 +02:00
page_mkwrite = 1 ;
2005-04-16 15:20:36 -07:00
}
2006-09-25 23:30:57 -07:00
dirty_page = old_page ;
get_page ( dirty_page ) ;
2006-06-23 02:03:43 -07:00
2008-07-04 09:59:24 -07:00
reuse :
2006-06-23 02:03:43 -07:00
flush_cache_page ( vma , address , pte_pfn ( orig_pte ) ) ;
entry = pte_mkyoung ( orig_pte ) ;
entry = maybe_mkwrite ( pte_mkdirty ( entry ) , vma ) ;
2007-10-16 01:25:44 -07:00
if ( ptep_set_access_flags ( vma , address , page_table , entry , 1 ) )
2009-12-18 16:40:18 +00:00
update_mmu_cache ( vma , address , page_table ) ;
2011-01-13 15:46:08 -08:00
pte_unmap_unlock ( page_table , ptl ) ;
2006-06-23 02:03:43 -07:00
ret | = VM_FAULT_WRITE ;
2011-01-13 15:46:08 -08:00
if ( ! dirty_page )
return ret ;
	/*
	 * Yes, Virginia, this is actually required to prevent a race
	 * with clear_page_dirty_for_io() from clearing the page dirty
	 * bit after it has cleared all dirty ptes, but before a racing
	 * do_wp_page installs a dirty pte.
	 *
	 * __do_fault is protected similarly.
	 */
if ( ! page_mkwrite ) {
wait_on_page_locked ( dirty_page ) ;
set_page_dirty_balance ( dirty_page , page_mkwrite ) ;
2012-06-12 16:20:28 +02:00
/* file_update_time outside page_lock */
if ( vma - > vm_file )
file_update_time ( vma - > vm_file ) ;
2011-01-13 15:46:08 -08:00
}
put_page ( dirty_page ) ;
if ( page_mkwrite ) {
struct address_space * mapping = dirty_page - > mapping ;
set_page_dirty ( dirty_page ) ;
unlock_page ( dirty_page ) ;
page_cache_release ( dirty_page ) ;
if ( mapping ) {
/*
* Some device drivers do not set page.mapping
* but still dirty their pages
*/
balance_dirty_pages_ratelimited ( mapping ) ;
}
}
return ret ;
2005-04-16 15:20:36 -07:00
}
/*
* Ok, we need to copy. Oh, well..
*/
2005-10-29 18:16:12 -07:00
page_cache_get ( old_page ) ;
2005-11-21 21:32:17 -08:00
gotten :
2005-10-29 18:16:26 -07:00
pte_unmap_unlock ( page_table , ptl ) ;
2005-04-16 15:20:36 -07:00
if ( unlikely ( anon_vma_prepare ( vma ) ) )
2005-10-29 18:15:59 -07:00
goto oom ;
2009-09-21 17:03:30 -07:00
2009-09-21 17:03:34 -07:00
if ( is_zero_pfn ( pte_pfn ( orig_pte ) ) ) {
2009-09-21 17:03:30 -07:00
new_page = alloc_zeroed_user_highpage_movable ( vma , address ) ;
if ( ! new_page )
goto oom ;
} else {
new_page = alloc_page_vma ( GFP_HIGHUSER_MOVABLE , vma , address ) ;
if ( ! new_page )
goto oom ;
cow_user_page ( new_page , old_page , address , vma ) ;
}
__SetPageUptodate ( new_page ) ;
2009-01-07 18:08:10 -08:00
if ( mem_cgroup_newpage_charge ( new_page , mm , GFP_KERNEL ) )
2008-02-07 00:13:53 -08:00
goto oom_free_new ;
2012-10-08 16:33:35 -07:00
mmun_start = address & PAGE_MASK ;
2012-11-16 14:14:48 -08:00
mmun_end = mmun_start + PAGE_SIZE ;
2012-10-08 16:33:35 -07:00
mmu_notifier_invalidate_range_start ( mm , mmun_start , mmun_end ) ;
2005-04-16 15:20:36 -07:00
/*
* Re-check the pte - we dropped the lock
*/
2005-10-29 18:16:26 -07:00
page_table = pte_offset_map_lock ( mm , pmd , address , & ptl ) ;
2005-10-29 18:15:59 -07:00
if ( likely ( pte_same ( * page_table , orig_pte ) ) ) {
2005-11-21 21:32:17 -08:00
if ( old_page ) {
if ( ! PageAnon ( old_page ) ) {
2010-03-05 13:41:40 -08:00
dec_mm_counter_fast ( mm , MM_FILEPAGES ) ;
inc_mm_counter_fast ( mm , MM_ANONPAGES ) ;
2005-11-21 21:32:17 -08:00
}
} else
2010-03-05 13:41:40 -08:00
inc_mm_counter_fast ( mm , MM_ANONPAGES ) ;
2005-11-29 11:45:26 -08:00
flush_cache_page ( vma , address , pte_pfn ( orig_pte ) ) ;
2005-10-29 18:15:59 -07:00
entry = mk_pte ( new_page , vma - > vm_page_prot ) ;
entry = maybe_mkwrite ( pte_mkdirty ( entry ) , vma ) ;
2006-09-29 01:58:42 -07:00
/*
* Clear the pte entry and flush it first, before updating the
* pte with the new entry. This will avoid a race condition
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
2009-09-21 17:01:51 -07:00
ptep_clear_flush ( vma , address , page_table ) ;
2006-01-06 00:11:12 -08:00
page_add_new_anon_rmap ( new_page , vma , address ) ;
2009-09-21 17:01:51 -07:00
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
* new page to be mapped directly into the secondary page table.
*/
set_pte_at_notify ( mm , address , page_table , entry ) ;
2009-12-18 16:40:18 +00:00
update_mmu_cache ( vma , address , page_table ) ;
2008-06-23 14:30:30 +02:00
if ( old_page ) {
/*
* Only after switching the pte to the new page may
* we remove the mapcount here. Otherwise another
* process may come and find the rmap count decremented
* before the pte is switched to the new page, and
* "reuse" the old page writing into it while our pte
* here still points into it and can be read by other
* threads.
*
* The critical issue is to order this
* page_remove_rmap with the ptp_clear_flush above.
* Those stores are ordered by (if nothing else,)
* the barrier present in the atomic_add_negative
* in page_remove_rmap.
*
* Then the TLB flush in ptep_clear_flush ensures that
* no process can access the old page before the
* decremented mapcount is visible. And the old page
* cannot be reused until after the decremented
* mapcount is visible. So transitively, TLBs to
* old page will be flushed before it can be reused.
*/
2009-01-06 14:40:11 -08:00
page_remove_rmap ( old_page ) ;
2008-06-23 14:30:30 +02:00
}
2005-04-16 15:20:36 -07:00
/* Free the old page.. */
new_page = old_page ;
2005-08-03 20:24:01 +10:00
ret | = VM_FAULT_WRITE ;
2008-02-07 00:13:53 -08:00
} else
mem_cgroup_uncharge_page ( new_page ) ;
2012-10-08 16:33:35 -07:00
if ( new_page )
page_cache_release ( new_page ) ;
2005-10-29 18:15:59 -07:00
unlock :
2005-10-29 18:16:26 -07:00
pte_unmap_unlock ( page_table , ptl ) ;
2012-11-16 14:14:48 -08:00
if ( mmun_end > mmun_start )
2012-10-08 16:33:35 -07:00
mmu_notifier_invalidate_range_end ( mm , mmun_start , mmun_end ) ;
2011-02-10 15:01:32 -08:00
if ( old_page ) {
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
if ( ( ret & VM_FAULT_WRITE ) & & ( vma - > vm_flags & VM_LOCKED ) ) {
lock_page ( old_page ) ; /* LRU manipulation */
munlock_vma_page ( old_page ) ;
unlock_page ( old_page ) ;
}
page_cache_release ( old_page ) ;
}
2005-08-03 20:24:01 +10:00
return ret ;
2008-02-07 00:13:53 -08:00
oom_free_new :
2008-03-04 14:29:04 -08:00
page_cache_release ( new_page ) ;
2005-10-29 18:15:59 -07:00
oom :
2012-12-12 13:52:37 -08:00
if ( old_page )
2005-11-21 21:32:17 -08:00
page_cache_release ( old_page ) ;
2005-04-16 15:20:36 -07:00
return VM_FAULT_OOM ;
2006-06-23 02:03:43 -07:00
unwritable_page :
page_cache_release ( old_page ) ;
2009-03-31 15:23:21 -07:00
return ret ;
2005-04-16 15:20:36 -07:00
}
2011-05-24 17:12:04 -07:00
static void unmap_mapping_range_vma ( struct vm_area_struct * vma ,
2005-04-16 15:20:36 -07:00
unsigned long start_addr , unsigned long end_addr ,
struct zap_details * details )
{
2012-03-05 14:14:20 -05:00
zap_page_range_single ( vma , start_addr , end_addr - start_addr , details ) ;
2005-04-16 15:20:36 -07:00
}
2012-10-08 16:31:25 -07:00
static inline void unmap_mapping_range_tree ( struct rb_root * root ,
2005-04-16 15:20:36 -07:00
struct zap_details * details )
{
struct vm_area_struct * vma ;
pgoff_t vba , vea , zba , zea ;
2012-10-08 16:31:25 -07:00
vma_interval_tree_foreach ( vma , root ,
2005-04-16 15:20:36 -07:00
details - > first_index , details - > last_index ) {
vba = vma - > vm_pgoff ;
vea = vba + ( ( vma - > vm_end - vma - > vm_start ) > > PAGE_SHIFT ) - 1 ;
/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
zba = details - > first_index ;
if ( zba < vba )
zba = vba ;
zea = details - > last_index ;
if ( zea > vea )
zea = vea ;
2011-05-24 17:12:04 -07:00
unmap_mapping_range_vma ( vma ,
2005-04-16 15:20:36 -07:00
( ( zba - vba ) < < PAGE_SHIFT ) + vma - > vm_start ,
( ( zea - vba + 1 ) < < PAGE_SHIFT ) + vma - > vm_start ,
2011-05-24 17:12:04 -07:00
details ) ;
2005-04-16 15:20:36 -07:00
}
}
static inline void unmap_mapping_range_list ( struct list_head * head ,
struct zap_details * details )
{
struct vm_area_struct * vma ;
/*
* In nonlinear VMAs there is no correspondence between virtual address
* offset and file offset. So we must perform an exhaustive search
* across *all* the pages in each nonlinear VMA, not just the pages
* whose virtual address lies outside the file truncation point.
*/
2012-10-08 16:31:25 -07:00
list_for_each_entry ( vma , head , shared . nonlinear ) {
2005-04-16 15:20:36 -07:00
details - > nonlinear_vma = vma ;
2011-05-24 17:12:04 -07:00
unmap_mapping_range_vma ( vma , vma - > vm_start , vma - > vm_end , details ) ;
2005-04-16 15:20:36 -07:00
}
}
/**
2007-02-10 01:45:59 -08:00
* unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
2005-06-23 22:05:21 -07:00
* @mapping: the address space containing mmaps to be unmapped.
2005-04-16 15:20:36 -07:00
* @holebegin: byte in first page to unmap, relative to the start of
* the underlying file. This will be rounded down to a PAGE_SIZE
2009-08-21 02:35:05 +10:00
* boundary. Note that this is different from truncate_pagecache(), which
2005-04-16 15:20:36 -07:00
* must keep the partial page. In contrast, we must get rid of
* partial pages.
* @holelen: size of prospective hole in bytes. This will be rounded
* up to a PAGE_SIZE boundary. A holelen of zero truncates to the
* end of the file.
* @even_cows: 1 when truncating a file, unmap even private COWed pages;
* but 0 when invalidating pagecache, don't throw away private data.
*/
void unmap_mapping_range ( struct address_space * mapping ,
loff_t const holebegin , loff_t const holelen , int even_cows )
{
struct zap_details details ;
pgoff_t hba = holebegin > > PAGE_SHIFT ;
pgoff_t hlen = ( holelen + PAGE_SIZE - 1 ) > > PAGE_SHIFT ;
/* Check for overflow. */
if ( sizeof ( holelen ) > sizeof ( hlen ) ) {
long long holeend =
( holebegin + holelen + PAGE_SIZE - 1 ) > > PAGE_SHIFT ;
if ( holeend & ~ ( long long ) ULONG_MAX )
hlen = ULONG_MAX - hba + 1 ;
}
details . check_mapping = even_cows ? NULL : mapping ;
details . nonlinear_vma = NULL ;
details . first_index = hba ;
details . last_index = hba + hlen - 1 ;
if ( details . last_index < details . first_index )
details . last_index = ULONG_MAX ;
2011-05-24 17:12:04 -07:00
2011-05-24 17:12:06 -07:00
mutex_lock ( & mapping - > i_mmap_mutex ) ;
2012-10-08 16:31:25 -07:00
if ( unlikely ( ! RB_EMPTY_ROOT ( & mapping - > i_mmap ) ) )
2005-04-16 15:20:36 -07:00
unmap_mapping_range_tree ( & mapping - > i_mmap , & details ) ;
if ( unlikely ( ! list_empty ( & mapping - > i_mmap_nonlinear ) ) )
unmap_mapping_range_list ( & mapping - > i_mmap_nonlinear , & details ) ;
2011-05-24 17:12:06 -07:00
mutex_unlock ( & mapping - > i_mmap_mutex ) ;
2005-04-16 15:20:36 -07:00
}
EXPORT_SYMBOL ( unmap_mapping_range ) ;
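/*
 * Illustrative sketch, not part of this file: the hole-punch/truncation
 * pattern a filesystem might use, unmapping the affected byte range from
 * every mmap of the file before dropping the page cache, with even_cows=1
 * per the truncation rule documented above.  example_punch_hole() is an
 * invented name, and real callers are careful about page alignment of the
 * range.
 */
static void example_punch_hole(struct inode *inode, loff_t start, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;

	unmap_mapping_range(mapping, start, len, 1);
	truncate_inode_pages_range(mapping, start, start + len - 1);
}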
/*
2005-10-29 18:16:26 -07:00
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
2005-04-16 15:20:36 -07:00
*/
2005-10-29 18:15:59 -07:00
static int do_swap_page ( struct mm_struct * mm , struct vm_area_struct * vma ,
unsigned long address , pte_t * page_table , pmd_t * pmd ,
2009-04-10 08:43:11 -07:00
unsigned int flags , pte_t orig_pte )
2005-04-16 15:20:36 -07:00
{
2005-10-29 18:16:26 -07:00
spinlock_t * ptl ;
2010-09-09 16:37:52 -07:00
struct page * page , * swapcache = NULL ;
2005-10-29 18:15:59 -07:00
swp_entry_t entry ;
2005-04-16 15:20:36 -07:00
pte_t pte ;
2010-10-26 14:21:57 -07:00
int locked ;
2011-03-23 16:42:19 -07:00
struct mem_cgroup * ptr ;
2010-08-09 17:19:48 -07:00
int exclusive = 0 ;
2007-07-19 01:47:05 -07:00
int ret = 0 ;
2005-04-16 15:20:36 -07:00
2005-10-29 18:16:40 -07:00
if ( ! pte_unmap_same ( mm , pmd , page_table , orig_pte ) )
2005-10-29 18:16:26 -07:00
goto out ;
2005-10-29 18:15:59 -07:00
entry = pte_to_swp_entry ( orig_pte ) ;
2009-09-16 11:50:06 +02:00
if ( unlikely ( non_swap_entry ( entry ) ) ) {
if ( is_migration_entry ( entry ) ) {
migration_entry_wait ( mm , pmd , address ) ;
} else if ( is_hwpoison_entry ( entry ) ) {
ret = VM_FAULT_HWPOISON ;
} else {
print_bad_pte ( vma , address , orig_pte , NULL ) ;
2009-12-14 17:59:04 -08:00
ret = VM_FAULT_SIGBUS ;
2009-09-16 11:50:06 +02:00
}
2006-06-23 02:03:35 -07:00
goto out ;
}
2006-07-14 00:24:37 -07:00
delayacct_set_flag ( DELAYACCT_PF_SWAPIN ) ;
2005-04-16 15:20:36 -07:00
page = lookup_swap_cache ( entry ) ;
if ( ! page ) {
2008-02-04 22:28:42 -08:00
page = swapin_readahead ( entry ,
GFP_HIGHUSER_MOVABLE , vma , address ) ;
2005-04-16 15:20:36 -07:00
if ( ! page ) {
/*
2005-10-29 18:16:26 -07:00
* Back out if somebody else faulted in this pte
* while we released the pte lock.
2005-04-16 15:20:36 -07:00
*/
2005-10-29 18:16:26 -07:00
page_table = pte_offset_map_lock ( mm , pmd , address , & ptl ) ;
2005-04-16 15:20:36 -07:00
if ( likely ( pte_same ( * page_table , orig_pte ) ) )
ret = VM_FAULT_OOM ;
2006-07-14 00:24:37 -07:00
delayacct_clear_flag ( DELAYACCT_PF_SWAPIN ) ;
2005-10-29 18:15:59 -07:00
goto unlock ;
2005-04-16 15:20:36 -07:00
}
/* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR ;
2006-06-30 01:55:45 -07:00
count_vm_event ( PGMAJFAULT ) ;
2011-05-26 16:25:38 -07:00
mem_cgroup_count_vm_event ( mm , PGMAJFAULT ) ;
2009-09-16 11:50:06 +02:00
} else if ( PageHWPoison ( page ) ) {
2009-12-16 12:19:58 +01:00
/*
* hwpoisoned dirty swapcache pages are kept for killing
* owner processes (which may be unknown at hwpoison time)
*/
2009-09-16 11:50:06 +02:00
ret = VM_FAULT_HWPOISON ;
delayacct_clear_flag ( DELAYACCT_PF_SWAPIN ) ;
2009-10-14 01:51:41 +02:00
goto out_release ;
2005-04-16 15:20:36 -07:00
}
2010-10-26 14:21:57 -07:00
locked = lock_page_or_retry ( page , mm , flags ) ;
2012-05-29 15:06:18 -07:00
2007-11-14 17:00:33 -08:00
delayacct_clear_flag ( DELAYACCT_PF_SWAPIN ) ;
2010-10-26 14:21:57 -07:00
if ( ! locked ) {
ret | = VM_FAULT_RETRY ;
goto out_release ;
}
2005-04-16 15:20:36 -07:00
2010-09-09 16:37:52 -07:00
/*
2010-09-19 19:40:22 -07:00
* Make sure try_to_free_swap or reuse_swap_page or swapoff did not
* release the swapcache from under us. The page pin, and pte_same
* test below, are not enough to exclude that. Even if it is still
* swapcache, we need to check that the page's swap has not changed.
2010-09-09 16:37:52 -07:00
*/
2010-09-19 19:40:22 -07:00
if ( unlikely ( ! PageSwapCache ( page ) | | page_private ( page ) ! = entry . val ) )
2010-09-09 16:37:52 -07:00
goto out_page ;
if ( ksm_might_need_to_copy ( page , vma , address ) ) {
swapcache = page ;
page = ksm_does_need_to_copy ( page , vma , address ) ;
if ( unlikely ( ! page ) ) {
ret = VM_FAULT_OOM ;
page = swapcache ;
swapcache = NULL ;
goto out_page ;
}
2009-12-14 17:59:24 -08:00
}
2009-01-07 18:08:10 -08:00
if ( mem_cgroup_try_charge_swapin ( mm , page , GFP_KERNEL , & ptr ) ) {
2008-10-18 20:28:08 -07:00
ret = VM_FAULT_OOM ;
2009-04-30 15:08:08 -07:00
goto out_page ;
2008-10-18 20:28:08 -07:00
}
2005-04-16 15:20:36 -07:00
/*
2005-10-29 18:16:26 -07:00
* Back out if somebody else already faulted in this pte.
2005-04-16 15:20:36 -07:00
*/
2005-10-29 18:16:26 -07:00
page_table = pte_offset_map_lock ( mm , pmd , address , & ptl ) ;
2005-10-29 18:16:15 -07:00
if ( unlikely ( ! pte_same ( * page_table , orig_pte ) ) )
2005-05-16 21:53:50 -07:00
goto out_nomap ;
if ( unlikely ( ! PageUptodate ( page ) ) ) {
ret = VM_FAULT_SIGBUS ;
goto out_nomap ;
2005-04-16 15:20:36 -07:00
}
2009-01-07 18:08:00 -08:00
/*
* The page isn't present yet, go ahead with the fault.
*
* Be careful about the sequence of operations here.
* To get its accounting right, reuse_swap_page() must be called
* while the page is counted on swap but not yet in mapcount i.e.
* before page_add_anon_rmap() and swap_free(); try_to_free_swap()
* must be called after the swap_free(), or it will never succeed.
2009-01-07 18:08:31 -08:00
* Because delete_from_swap_page() may be called by reuse_swap_page(),
* mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
* in page->private. In this case, a record in swap_cgroup is silently
* discarded at swap_free().
2009-01-07 18:08:00 -08:00
*/
2005-04-16 15:20:36 -07:00
2010-03-05 13:41:40 -08:00
inc_mm_counter_fast ( mm , MM_ANONPAGES ) ;
2010-03-05 13:41:42 -08:00
dec_mm_counter_fast ( mm , MM_SWAPENTS ) ;
2005-04-16 15:20:36 -07:00
pte = mk_pte ( page , vma - > vm_page_prot ) ;
2009-04-10 08:43:11 -07:00
if ( ( flags & FAULT_FLAG_WRITE ) & & reuse_swap_page ( page ) ) {
2005-04-16 15:20:36 -07:00
pte = maybe_mkwrite ( pte_mkdirty ( pte ) , vma ) ;
2009-04-10 08:43:11 -07:00
flags & = ~ FAULT_FLAG_WRITE ;
2010-08-09 17:19:49 -07:00
ret | = VM_FAULT_WRITE ;
2010-08-09 17:19:48 -07:00
exclusive = 1 ;
2005-04-16 15:20:36 -07:00
}
flush_icache_page ( vma , page ) ;
set_pte_at ( mm , address , page_table , pte ) ;
2010-08-09 17:19:48 -07:00
do_page_add_anon_rmap ( page , vma , address , exclusive ) ;
2009-01-07 18:08:31 -08:00
/* It's better to call commit-charge after rmap is established */
mem_cgroup_commit_charge_swapin ( page , ptr ) ;
2005-04-16 15:20:36 -07:00
2005-06-21 17:15:12 -07:00
swap_free ( entry ) ;
2008-10-18 20:26:44 -07:00
if ( vm_swap_full ( ) | | ( vma - > vm_flags & VM_LOCKED ) | | PageMlocked ( page ) )
2009-01-06 14:39:36 -08:00
try_to_free_swap ( page ) ;
2005-06-21 17:15:12 -07:00
unlock_page ( page ) ;
2010-09-09 16:37:52 -07:00
if ( swapcache ) {
/*
* Hold the lock to prevent the swap entry from being reused
* until we take the PT lock for the pte_same() check
* (to avoid false positives from pte_same). For further
* safety, release the lock after the swap_free() so that the
* swap count won't change under a parallel locked swapcache.
*/
unlock_page ( swapcache ) ;
page_cache_release ( swapcache ) ;
}
2005-06-21 17:15:12 -07:00
2009-04-10 08:43:11 -07:00
if ( flags & FAULT_FLAG_WRITE ) {
2008-03-04 14:29:04 -08:00
ret | = do_wp_page ( mm , vma , address , page_table , pmd , ptl , pte ) ;
if ( ret & VM_FAULT_ERROR )
ret & = VM_FAULT_ERROR ;
2005-04-16 15:20:36 -07:00
goto out ;
}
/* No need to invalidate - it was non-present before */
2009-12-18 16:40:18 +00:00
update_mmu_cache ( vma , address , page_table ) ;
2005-10-29 18:15:59 -07:00
unlock :
2005-10-29 18:16:26 -07:00
pte_unmap_unlock ( page_table , ptl ) ;
2005-04-16 15:20:36 -07:00
out :
return ret ;
2005-05-16 21:53:50 -07:00
out_nomap :
2009-01-07 18:07:48 -08:00
mem_cgroup_cancel_charge_swapin ( ptr ) ;
2005-10-29 18:16:26 -07:00
pte_unmap_unlock ( page_table , ptl ) ;
2009-04-30 15:08:08 -07:00
out_page :
2005-05-16 21:53:50 -07:00
unlock_page ( page ) ;
2009-10-14 01:51:41 +02:00
out_release :
2005-05-16 21:53:50 -07:00
page_cache_release ( page ) ;
2010-09-09 16:37:52 -07:00
if ( swapcache ) {
unlock_page ( swapcache ) ;
page_cache_release ( swapcache ) ;
}
2005-10-29 18:15:59 -07:00
return ret ;
2005-04-16 15:20:36 -07:00
}
2010-08-12 17:54:33 -07:00
/*
2010-08-24 11:44:18 -07:00
* This is like a special single-page "expand_{down|up}wards()",
* except we must first make sure that 'address{-|+}PAGE_SIZE'
2010-08-12 17:54:33 -07:00
* doesn't hit another vma.
*/
static inline int check_stack_guard_page ( struct vm_area_struct * vma , unsigned long address )
{
address & = PAGE_MASK ;
if ( ( vma - > vm_flags & VM_GROWSDOWN ) & & address = = vma - > vm_start ) {
2010-08-20 16:49:40 -07:00
struct vm_area_struct * prev = vma - > vm_prev ;
2010-08-12 17:54:33 -07:00
2010-08-20 16:49:40 -07:00
/*
* Is there a mapping abutting this one below?
*
* That's only OK if it's the same stack mapping
* that has been split.
*/
if ( prev & & prev - > vm_end = = address )
return prev - > vm_flags & VM_GROWSDOWN ? 0 : - ENOMEM ;
2011-05-24 17:11:44 -07:00
expand_downwards ( vma , address - PAGE_SIZE ) ;
2010-08-12 17:54:33 -07:00
}
2010-08-24 11:44:18 -07:00
if ( ( vma - > vm_flags & VM_GROWSUP ) & & address + PAGE_SIZE = = vma - > vm_end ) {
struct vm_area_struct * next = vma - > vm_next ;
/* As VM_GROWSDOWN but s/below/above/ */
if ( next & & next - > vm_start = = address + PAGE_SIZE )
return next - > vm_flags & VM_GROWSUP ? 0 : - ENOMEM ;
expand_upwards ( vma , address + PAGE_SIZE ) ;
}
2010-08-12 17:54:33 -07:00
return 0 ;
}
2005-04-16 15:20:36 -07:00
/*
2005-10-29 18:16:26 -07:00
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
2005-04-16 15:20:36 -07:00
*/
2005-10-29 18:15:59 -07:00
static int do_anonymous_page ( struct mm_struct * mm , struct vm_area_struct * vma ,
unsigned long address , pte_t * page_table , pmd_t * pmd ,
2009-04-10 08:43:11 -07:00
unsigned int flags )
2005-04-16 15:20:36 -07:00
{
2005-10-29 18:16:26 -07:00
struct page * page ;
spinlock_t * ptl ;
2005-04-16 15:20:36 -07:00
pte_t entry ;
2010-08-14 11:44:56 -07:00
pte_unmap ( page_table ) ;
2010-08-12 17:54:33 -07:00
2010-08-14 11:44:56 -07:00
/* Check if we need to add a guard page to the stack */
if ( check_stack_guard_page ( vma , address ) < 0 )
return VM_FAULT_SIGBUS ;
/* Use the zero-page for reads */
2009-09-21 17:03:34 -07:00
if ( ! ( flags & FAULT_FLAG_WRITE ) ) {
entry = pte_mkspecial ( pfn_pte ( my_zero_pfn ( address ) ,
vma - > vm_page_prot ) ) ;
2010-08-14 11:44:56 -07:00
page_table = pte_offset_map_lock ( mm , pmd , address , & ptl ) ;
2009-09-21 17:03:30 -07:00
if ( ! pte_none ( * page_table ) )
goto unlock ;
goto setpte ;
}
2007-10-16 01:24:40 -07:00
/* Allocate our own private page. */
if ( unlikely ( anon_vma_prepare ( vma ) ) )
goto oom ;
page = alloc_zeroed_user_highpage_movable ( vma , address ) ;
if ( ! page )
goto oom ;
2008-02-04 22:29:34 -08:00
__SetPageUptodate ( page ) ;
2005-04-16 15:20:36 -07:00
2009-01-07 18:08:10 -08:00
if ( mem_cgroup_newpage_charge ( page , mm , GFP_KERNEL ) )
2008-02-07 00:13:53 -08:00
goto oom_free_page ;
2007-10-16 01:24:40 -07:00
entry = mk_pte ( page , vma - > vm_page_prot ) ;
2009-09-21 17:03:29 -07:00
if ( vma - > vm_flags & VM_WRITE )
entry = pte_mkwrite ( pte_mkdirty ( entry ) ) ;
2005-04-16 15:20:36 -07:00
2007-10-16 01:24:40 -07:00
page_table = pte_offset_map_lock ( mm , pmd , address , & ptl ) ;
2009-09-21 17:02:22 -07:00
if ( ! pte_none ( * page_table ) )
2007-10-16 01:24:40 -07:00
goto release ;
2009-09-21 17:02:20 -07:00
2010-03-05 13:41:40 -08:00
inc_mm_counter_fast ( mm , MM_ANONPAGES ) ;
2007-10-16 01:24:40 -07:00
page_add_new_anon_rmap ( page , vma , address ) ;
2009-09-21 17:03:30 -07:00
setpte :
2005-10-29 18:15:59 -07:00
set_pte_at ( mm , address , page_table , entry ) ;
2005-04-16 15:20:36 -07:00
/* No need to invalidate - it was non-present before */
2009-12-18 16:40:18 +00:00
update_mmu_cache ( vma , address , page_table ) ;
2005-10-29 18:15:59 -07:00
unlock :
2005-10-29 18:16:26 -07:00
pte_unmap_unlock ( page_table , ptl ) ;
2007-07-19 01:47:05 -07:00
return 0 ;
2005-10-29 18:16:26 -07:00
release :
2008-02-07 00:13:53 -08:00
mem_cgroup_uncharge_page ( page ) ;
2005-10-29 18:16:26 -07:00
page_cache_release ( page ) ;
goto unlock ;
2008-02-07 00:13:53 -08:00
oom_free_page :
2008-03-04 14:29:04 -08:00
page_cache_release ( page ) ;
2005-10-29 18:15:59 -07:00
oom :
2005-04-16 15:20:36 -07:00
return VM_FAULT_OOM ;
}
/*
2007-07-19 01:46:59 -07:00
* __do_fault() tries to create a new page mapping. It aggressively
2005-04-16 15:20:36 -07:00
* tries to share with existing pages, but makes a separate copy if
2007-07-19 01:46:59 -07:00
* FAULT_FLAG_WRITE is set in the flags parameter, in order to avoid
* the next page fault.
2005-04-16 15:20:36 -07:00
*
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
*
2005-10-29 18:16:26 -07:00
* We enter with non-exclusive mmap_sem (to exclude vma changes,
2007-10-04 16:56:06 +01:00
* but allow concurrent faults), and pte neither mapped nor locked.
2005-10-29 18:16:26 -07:00
* We return with mmap_sem still held, but pte unmapped and unlocked.
2005-04-16 15:20:36 -07:00
*/
2007-07-19 01:46:59 -07:00
static int __do_fault ( struct mm_struct * mm , struct vm_area_struct * vma ,
2007-10-04 16:56:06 +01:00
unsigned long address , pmd_t * pmd ,
2007-07-19 01:46:59 -07:00
pgoff_t pgoff , unsigned int flags , pte_t orig_pte )
2005-04-16 15:20:36 -07:00
{
2007-10-04 16:56:06 +01:00
pte_t * page_table ;
2005-10-29 18:16:26 -07:00
spinlock_t * ptl ;
2007-07-19 01:47:03 -07:00
struct page * page ;
2011-07-25 17:12:27 -07:00
struct page * cow_page ;
2005-04-16 15:20:36 -07:00
pte_t entry ;
int anon = 0 ;
2006-09-25 23:30:57 -07:00
struct page * dirty_page = NULL ;
2007-07-19 01:47:03 -07:00
struct vm_fault vmf ;
int ret ;
2007-10-08 18:54:37 +02:00
int page_mkwrite = 0 ;
2007-07-19 01:46:59 -07:00
2011-07-25 17:12:27 -07:00
/*
* If we will COW later, allocate the page before taking lock_page()
* on the file cache page. This reduces lock holding time.
*/
if ( ( flags & FAULT_FLAG_WRITE ) & & ! ( vma - > vm_flags & VM_SHARED ) ) {
if ( unlikely ( anon_vma_prepare ( vma ) ) )
return VM_FAULT_OOM ;
cow_page = alloc_page_vma ( GFP_HIGHUSER_MOVABLE , vma , address ) ;
if ( ! cow_page )
return VM_FAULT_OOM ;
if ( mem_cgroup_newpage_charge ( cow_page , mm , GFP_KERNEL ) ) {
page_cache_release ( cow_page ) ;
return VM_FAULT_OOM ;
}
} else
cow_page = NULL ;
2007-07-19 01:47:03 -07:00
vmf . virtual_address = ( void __user * ) ( address & PAGE_MASK ) ;
vmf . pgoff = pgoff ;
vmf . flags = flags ;
vmf . page = NULL ;
2005-04-16 15:20:36 -07:00
2008-04-28 02:12:10 -07:00
ret = vma - > vm_ops - > fault ( vma , & vmf ) ;
2010-10-26 14:21:57 -07:00
if ( unlikely ( ret & ( VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY ) ) )
2011-07-25 17:12:27 -07:00
goto uncharge_out ;
2005-04-16 15:20:36 -07:00
2009-09-16 11:50:08 +02:00
if ( unlikely ( PageHWPoison ( vmf . page ) ) ) {
if ( ret & VM_FAULT_LOCKED )
unlock_page ( vmf . page ) ;
2011-07-25 17:12:27 -07:00
ret = VM_FAULT_HWPOISON ;
goto uncharge_out ;
2009-09-16 11:50:08 +02:00
}
2007-07-19 01:46:57 -07:00
/*
2007-07-19 01:47:03 -07:00
* For consistency in subsequent calls, make the faulted page always
2007-07-19 01:46:57 -07:00
* locked.
*/
2007-07-19 01:47:05 -07:00
if ( unlikely ( ! ( ret & VM_FAULT_LOCKED ) ) )
2007-07-19 01:47:03 -07:00
lock_page ( vmf . page ) ;
2007-07-19 01:46:59 -07:00
else
2007-07-19 01:47:03 -07:00
VM_BUG_ON ( ! PageLocked ( vmf . page ) ) ;
2007-07-19 01:46:57 -07:00
2005-04-16 15:20:36 -07:00
/*
* Should we do an early C-O-W break?
*/
2007-07-19 01:47:03 -07:00
page = vmf . page ;
2007-07-19 01:46:59 -07:00
if ( flags & FAULT_FLAG_WRITE ) {
2006-06-23 02:03:43 -07:00
if ( ! ( vma - > vm_flags & VM_SHARED ) ) {
2011-07-25 17:12:27 -07:00
page = cow_page ;
2007-07-19 01:46:59 -07:00
anon = 1 ;
2007-07-19 01:47:03 -07:00
copy_user_highpage ( page , vmf . page , address , vma ) ;
2008-02-04 22:29:34 -08:00
__SetPageUptodate ( page ) ;
2006-06-23 02:03:43 -07:00
} else {
2007-07-19 01:46:59 -07:00
/*
* If the page will be shareable, see if the backing
2006-06-23 02:03:43 -07:00
* address space wants to know that the page is about
2007-07-19 01:46:59 -07:00
* to become writable
*/
2007-07-19 01:47:00 -07:00
if ( vma - > vm_ops - > page_mkwrite ) {
2009-03-31 15:23:21 -07:00
int tmp ;
2007-07-19 01:47:00 -07:00
unlock_page ( page ) ;
2009-04-30 15:08:16 -07:00
vmf . flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE ;
2009-03-31 15:23:21 -07:00
tmp = vma - > vm_ops - > page_mkwrite ( vma , & vmf ) ;
if ( unlikely ( tmp &
( VM_FAULT_ERROR | VM_FAULT_NOPAGE ) ) ) {
ret = tmp ;
2009-04-30 15:08:16 -07:00
goto unwritable_page ;
2007-07-19 01:47:03 -07:00
}
2009-04-30 15:08:16 -07:00
if ( unlikely ( ! ( tmp & VM_FAULT_LOCKED ) ) ) {
lock_page ( page ) ;
if ( ! page - > mapping ) {
ret = 0 ; /* retry the fault */
unlock_page ( page ) ;
goto unwritable_page ;
}
} else
VM_BUG_ON ( ! PageLocked ( page ) ) ;
2007-10-08 18:54:37 +02:00
page_mkwrite = 1 ;
2006-06-23 02:03:43 -07:00
}
}
2007-07-19 01:46:59 -07:00
2005-04-16 15:20:36 -07:00
}
2005-10-29 18:16:26 -07:00
page_table = pte_offset_map_lock ( mm , pmd , address , & ptl ) ;
2005-04-16 15:20:36 -07:00
/*
* This silly early PAGE_DIRTY setting removes a race
* due to the bad i386 page protection. But it's valid
* for other architectures too.
*
2009-04-10 08:43:11 -07:00
* Note that if FAULT_FLAG_WRITE is set, we either now have
2005-04-16 15:20:36 -07:00
* an exclusive copy of the page, or this is a shared mapping,
* so we can make it writable and dirty to avoid having to
* handle that later.
*/
/* Only go through if we didn't race with anybody else... */
2009-09-21 17:02:22 -07:00
if ( likely ( pte_same ( * page_table , orig_pte ) ) ) {
2007-07-19 01:46:57 -07:00
flush_icache_page ( vma , page ) ;
entry = mk_pte ( page , vma - > vm_page_prot ) ;
2007-07-19 01:46:59 -07:00
if ( flags & FAULT_FLAG_WRITE )
2005-04-16 15:20:36 -07:00
entry = maybe_mkwrite ( pte_mkdirty ( entry ) , vma ) ;
if ( anon ) {
2010-03-05 13:41:40 -08:00
inc_mm_counter_fast ( mm , MM_ANONPAGES ) ;
2008-10-18 20:26:52 -07:00
page_add_new_anon_rmap ( page , vma , address ) ;
2005-11-21 21:32:19 -08:00
} else {
2010-03-05 13:41:40 -08:00
inc_mm_counter_fast ( mm , MM_FILEPAGES ) ;
2007-07-19 01:46:57 -07:00
page_add_file_rmap ( page ) ;
2007-07-19 01:46:59 -07:00
if ( flags & FAULT_FLAG_WRITE ) {
2007-07-19 01:46:57 -07:00
dirty_page = page ;
2006-09-25 23:30:57 -07:00
get_page ( dirty_page ) ;
}
2005-10-29 18:16:05 -07:00
}
2008-10-18 20:26:52 -07:00
set_pte_at ( mm , address , page_table , entry ) ;
2007-07-19 01:46:57 -07:00
/* no need to invalidate: a not-present page won't be cached */
2009-12-18 16:40:18 +00:00
update_mmu_cache ( vma , address , page_table ) ;
2005-04-16 15:20:36 -07:00
} else {
2011-07-25 17:12:27 -07:00
if ( cow_page )
mem_cgroup_uncharge_page ( cow_page ) ;
2007-07-19 01:46:57 -07:00
if ( anon )
page_cache_release ( page ) ;
else
2007-07-19 01:46:59 -07:00
anon = 1 ; /* not anon, but make sure the faulted page is released below */
2005-04-16 15:20:36 -07:00
}
2005-10-29 18:16:26 -07:00
pte_unmap_unlock ( page_table , ptl ) ;
2007-07-19 01:46:57 -07:00
2009-04-30 15:08:16 -07:00
if ( dirty_page ) {
struct address_space * mapping = page - > mapping ;
2012-06-12 16:20:28 +02:00
int dirtied = 0 ;
2009-04-30 15:08:16 -07:00
if ( set_page_dirty ( dirty_page ) )
2012-06-12 16:20:28 +02:00
dirtied = 1 ;
2009-04-30 15:08:16 -07:00
unlock_page ( dirty_page ) ;
put_page ( dirty_page ) ;
2012-06-12 16:20:28 +02:00
if ( ( dirtied | | page_mkwrite ) & & mapping ) {
2009-04-30 15:08:16 -07:00
/*
* Some device drivers do not set page->mapping but still
* dirty their pages
*/
balance_dirty_pages_ratelimited ( mapping ) ;
}
/* file_update_time outside page_lock */
2012-06-12 16:20:28 +02:00
if ( vma - > vm_file & & ! page_mkwrite )
2008-01-23 02:21:18 +03:00
file_update_time ( vma - > vm_file ) ;
2009-04-30 15:08:16 -07:00
} else {
unlock_page ( vmf . page ) ;
if ( anon )
page_cache_release ( vmf . page ) ;
2006-09-25 23:30:57 -07:00
}
2007-07-19 01:46:57 -07:00
2007-07-19 01:47:05 -07:00
return ret ;
2009-04-30 15:08:16 -07:00
unwritable_page :
page_cache_release ( page ) ;
return ret ;
2011-07-25 17:12:27 -07:00
uncharge_out :
/* the filesystem's fault handler returned an error */
if ( cow_page ) {
mem_cgroup_uncharge_page ( cow_page ) ;
page_cache_release ( cow_page ) ;
}
return ret ;
2007-07-19 01:46:59 -07:00
}
2007-07-19 01:46:57 -07:00
2007-07-19 01:46:59 -07:00
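/*
* Linear file-backed fault: compute the page offset within the file
* from the faulting address and the vma, then let __do_fault() do the
* actual work of bringing the page in.
*/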
static int do_linear_fault ( struct mm_struct * mm , struct vm_area_struct * vma ,
unsigned long address , pte_t * page_table , pmd_t * pmd ,
2009-04-10 08:43:11 -07:00
unsigned int flags , pte_t orig_pte )
2007-07-19 01:46:59 -07:00
{
pgoff_t pgoff = ( ( ( address & PAGE_MASK )
2007-10-16 01:24:45 -07:00
- vma - > vm_start ) > > PAGE_SHIFT ) + vma - > vm_pgoff ;
2007-07-19 01:46:59 -07:00
2007-10-04 16:56:06 +01:00
pte_unmap ( page_table ) ;
return __do_fault ( mm , vma , address , pmd , pgoff , flags , orig_pte ) ;
2007-07-19 01:46:59 -07:00
}
2005-04-16 15:20:36 -07:00
/*
* Fault of a previously existing named mapping. Repopulate the pte
* from the encoded file_pte if possible. This enables swappable
* nonlinear vmas.
2005-10-29 18:16:26 -07:00
*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
2005-04-16 15:20:36 -07:00
*/
2007-07-19 01:47:03 -07:00
static int do_nonlinear_fault ( struct mm_struct * mm , struct vm_area_struct * vma ,
2005-10-29 18:15:59 -07:00
unsigned long address , pte_t * page_table , pmd_t * pmd ,
2009-04-10 08:43:11 -07:00
unsigned int flags , pte_t orig_pte )
2005-04-16 15:20:36 -07:00
{
2005-10-29 18:15:59 -07:00
pgoff_t pgoff ;
2005-04-16 15:20:36 -07:00
2009-04-10 08:43:11 -07:00
flags | = FAULT_FLAG_NONLINEAR ;
2005-10-29 18:16:40 -07:00
if ( ! pte_unmap_same ( mm , pmd , page_table , orig_pte ) )
2007-07-19 01:47:05 -07:00
return 0 ;
2005-04-16 15:20:36 -07:00
2009-01-06 14:40:10 -08:00
if ( unlikely ( ! ( vma - > vm_flags & VM_NONLINEAR ) ) ) {
2005-10-29 18:15:59 -07:00
/*
* Page table corrupted: show pte and kill process.
*/
2009-01-06 14:40:08 -08:00
print_bad_pte ( vma , address , orig_pte , NULL ) ;
2009-12-14 17:59:04 -08:00
return VM_FAULT_SIGBUS ;
2005-10-29 18:15:59 -07:00
}
pgoff = pte_to_pgoff ( orig_pte ) ;
2007-10-04 16:56:06 +01:00
return __do_fault ( mm , vma , address , pmd , pgoff , flags , orig_pte ) ;
2005-04-16 15:20:36 -07:00
}
2012-11-15 01:24:32 +00:00
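/*
* Common preparation for a NUMA hinting fault on @page: take a
* reference on the page, account the hinting fault (and whether it
* was local to this node), and ask the memory policy layer whether
* the page is misplaced. Returns the node to migrate the page to,
* or -1 if it should stay where it is.
*/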
int numa_migrate_prep ( struct page * page , struct vm_area_struct * vma ,
unsigned long addr , int current_nid )
{
get_page ( page ) ;
count_vm_numa_event ( NUMA_HINT_FAULTS ) ;
if ( current_nid = = numa_node_id ( ) )
count_vm_numa_event ( NUMA_HINT_FAULTS_LOCAL ) ;
return mpol_misplaced ( page , vma , addr ) ;
}
2012-10-25 14:16:31 +02:00
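/*
* Handle a NUMA hinting fault on a single pte: clear the NUMA bit so
* the access can proceed, then, if the policy says the page is
* misplaced, try to migrate it to the preferred node and account the
* fault against the node it ends up on.
*/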
int do_numa_page ( struct mm_struct * mm , struct vm_area_struct * vma ,
unsigned long addr , pte_t pte , pte_t * ptep , pmd_t * pmd )
{
2012-11-02 11:33:45 +00:00
struct page * page = NULL ;
2012-10-25 14:16:31 +02:00
spinlock_t * ptl ;
2012-10-25 14:16:43 +02:00
int current_nid = - 1 ;
int target_nid ;
2012-11-21 01:18:23 +00:00
bool migrated = false ;
2012-10-25 14:16:31 +02:00
/*
* The "pte" at this point cannot be used safely without
* validation through pte_unmap_same(). It's of NUMA type but
* the pfn may be garbage if the read is not atomic.
*
* ptep_modify_prot_start is not called as this is clearing
* the _PAGE_NUMA bit and it is not really expected that there
* would be concurrent hardware modifications to the PTE.
*/
ptl = pte_lockptr ( mm , pmd ) ;
spin_lock ( ptl ) ;
2012-11-02 11:33:45 +00:00
if ( unlikely ( ! pte_same ( * ptep , pte ) ) ) {
pte_unmap_unlock ( ptep , ptl ) ;
goto out ;
}
2012-10-25 14:16:31 +02:00
pte = pte_mknonnuma ( pte ) ;
set_pte_at ( mm , addr , ptep , pte ) ;
update_mmu_cache ( vma , addr , ptep ) ;
page = vm_normal_page ( vma , addr , pte ) ;
if ( ! page ) {
pte_unmap_unlock ( ptep , ptl ) ;
return 0 ;
}
2012-11-02 11:33:45 +00:00
current_nid = page_to_nid ( page ) ;
2012-11-15 01:24:32 +00:00
target_nid = numa_migrate_prep ( page , vma , addr , current_nid ) ;
2012-10-25 14:16:31 +02:00
pte_unmap_unlock ( ptep , ptl ) ;
2012-11-02 11:33:45 +00:00
if ( target_nid = = - 1 ) {
/*
* Account the fault against the current node if the page is not
* being replaced, regardless of where the page is located.
*/
current_nid = numa_node_id ( ) ;
put_page ( page ) ;
goto out ;
}
/* Migrate to the requested node */
2012-11-21 01:18:23 +00:00
migrated = migrate_misplaced_page ( page , target_nid ) ;
if ( migrated )
2012-11-02 11:33:45 +00:00
current_nid = target_nid ;
out :
2012-11-15 01:24:32 +00:00
if ( current_nid ! = - 1 )
2012-11-21 01:18:23 +00:00
task_numa_fault ( current_nid , 1 , migrated ) ;
2012-10-25 14:16:31 +02:00
return 0 ;
}
/* NUMA hinting page fault entry point for regular pmds */
# ifdef CONFIG_NUMA_BALANCING
static int do_pmd_numa_page ( struct mm_struct * mm , struct vm_area_struct * vma ,
unsigned long addr , pmd_t * pmdp )
{
pmd_t pmd ;
pte_t * pte , * orig_pte ;
unsigned long _addr = addr & PMD_MASK ;
unsigned long offset ;
spinlock_t * ptl ;
bool numa = false ;
2012-11-02 14:52:48 +00:00
int local_nid = numa_node_id ( ) ;
2012-10-25 14:16:31 +02:00
spin_lock ( & mm - > page_table_lock ) ;
pmd = * pmdp ;
if ( pmd_numa ( pmd ) ) {
set_pmd_at ( mm , _addr , pmdp , pmd_mknonnuma ( pmd ) ) ;
numa = true ;
}
spin_unlock ( & mm - > page_table_lock ) ;
if ( ! numa )
return 0 ;
/* we're in a page fault so some vma must be in the range */
BUG_ON ( ! vma ) ;
BUG_ON ( vma - > vm_start > = _addr + PMD_SIZE ) ;
offset = max ( _addr , vma - > vm_start ) & ~ PMD_MASK ;
VM_BUG_ON ( offset > = PMD_SIZE ) ;
orig_pte = pte = pte_offset_map_lock ( mm , pmdp , _addr , & ptl ) ;
pte + = offset > > PAGE_SHIFT ;
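/*
* Scan every present NUMA pte under this pmd: clear the NUMA bit,
* and for pages mapped only once, ask the policy whether they are
* misplaced and migrate them if so, dropping and retaking the pte
* lock around each migration.
*/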
for ( addr = _addr + offset ; addr < _addr + PMD_SIZE ; pte + + , addr + = PAGE_SIZE ) {
pte_t pteval = * pte ;
struct page * page ;
2012-11-15 01:24:32 +00:00
int curr_nid = local_nid ;
int target_nid ;
2012-11-21 01:18:23 +00:00
bool migrated ;
2012-10-25 14:16:31 +02:00
if ( ! pte_present ( pteval ) )
continue ;
if ( ! pte_numa ( pteval ) )
continue ;
if ( addr > = vma - > vm_end ) {
vma = find_vma ( mm , addr ) ;
/* there's a pte present so there must be a vma */
BUG_ON ( ! vma ) ;
BUG_ON ( addr < vma - > vm_start ) ;
}
if ( pte_numa ( pteval ) ) {
pteval = pte_mknonnuma ( pteval ) ;
set_pte_at ( mm , addr , pte , pteval ) ;
}
page = vm_normal_page ( vma , addr , pteval ) ;
if ( unlikely ( ! page ) )
continue ;
2012-10-25 14:16:43 +02:00
/* only check non-shared pages */
if ( unlikely ( page_mapcount ( page ) ! = 1 ) )
continue ;
2012-11-15 01:24:32 +00:00
/*
* Note that the NUMA fault is later accounted either to the
* node that is currently running or to the node the page is
* migrated to.
*/
curr_nid = local_nid ;
target_nid = numa_migrate_prep ( page , vma , addr ,
page_to_nid ( page ) ) ;
if ( target_nid = = - 1 ) {
put_page ( page ) ;
continue ;
}
/* Migrate to the requested node */
2012-10-25 14:16:43 +02:00
pte_unmap_unlock ( pte , ptl ) ;
2012-11-21 01:18:23 +00:00
migrated = migrate_misplaced_page ( page , target_nid ) ;
if ( migrated )
2012-11-15 01:24:32 +00:00
curr_nid = target_nid ;
2012-11-21 01:18:23 +00:00
task_numa_fault ( curr_nid , 1 , migrated ) ;
2012-10-25 14:16:43 +02:00
pte = pte_offset_map_lock ( mm , pmdp , addr , & ptl ) ;
2012-10-25 14:16:31 +02:00
}
pte_unmap_unlock ( orig_pte , ptl ) ;
return 0 ;
}
# else
static int do_pmd_numa_page ( struct mm_struct * mm , struct vm_area_struct * vma ,
unsigned long addr , pmd_t * pmdp )
{
BUG ( ) ;
2012-12-17 15:59:24 -08:00
return 0 ;
2012-10-25 14:16:31 +02:00
}
# endif /* CONFIG_NUMA_BALANCING */
2005-04-16 15:20:36 -07:00
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
* RISC architectures). The early dirtying is also good on the i386.
*
* There is also a hook called "update_mmu_cache()" that architectures
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
2005-10-29 18:16:23 -07:00
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
2005-04-16 15:20:36 -07:00
*/
2011-01-13 15:46:52 -08:00
int handle_pte_fault ( struct mm_struct * mm ,
struct vm_area_struct * vma , unsigned long address ,
pte_t * pte , pmd_t * pmd , unsigned int flags )
2005-04-16 15:20:36 -07:00
{
pte_t entry ;
2005-10-29 18:16:26 -07:00
spinlock_t * ptl ;
2005-04-16 15:20:36 -07:00
2007-06-16 10:16:12 -07:00
entry = * pte ;
2005-04-16 15:20:36 -07:00
if ( ! pte_present ( entry ) ) {
2005-10-29 18:15:59 -07:00
if ( pte_none ( entry ) ) {
2006-09-27 01:50:10 -07:00
if ( vma - > vm_ops ) {
2008-04-28 02:12:10 -07:00
if ( likely ( vma - > vm_ops - > fault ) )
2007-07-19 01:46:59 -07:00
return do_linear_fault ( mm , vma , address ,
2009-04-10 08:43:11 -07:00
pte , pmd , flags , entry ) ;
2006-09-27 01:50:10 -07:00
}
return do_anonymous_page ( mm , vma , address ,
2009-04-10 08:43:11 -07:00
pte , pmd , flags ) ;
2005-10-29 18:15:59 -07:00
}
2005-04-16 15:20:36 -07:00
if ( pte_file ( entry ) )
2007-07-19 01:47:03 -07:00
return do_nonlinear_fault ( mm , vma , address ,
2009-04-10 08:43:11 -07:00
pte , pmd , flags , entry ) ;
2005-10-29 18:15:59 -07:00
return do_swap_page ( mm , vma , address ,
2009-04-10 08:43:11 -07:00
pte , pmd , flags , entry ) ;
2005-04-16 15:20:36 -07:00
}
2012-10-25 14:16:31 +02:00
if ( pte_numa ( entry ) )
return do_numa_page ( mm , vma , address , entry , pte , pmd ) ;
2005-10-29 18:16:40 -07:00
ptl = pte_lockptr ( mm , pmd ) ;
2005-10-29 18:16:26 -07:00
spin_lock ( ptl ) ;
if ( unlikely ( ! pte_same ( * pte , entry ) ) )
goto unlock ;
2009-04-10 08:43:11 -07:00
if ( flags & FAULT_FLAG_WRITE ) {
2005-04-16 15:20:36 -07:00
if ( ! pte_write ( entry ) )
2005-10-29 18:16:26 -07:00
return do_wp_page ( mm , vma , address ,
pte , pmd , ptl , entry ) ;
2005-04-16 15:20:36 -07:00
entry = pte_mkdirty ( entry ) ;
}
entry = pte_mkyoung ( entry ) ;
2009-04-10 08:43:11 -07:00
if ( ptep_set_access_flags ( vma , address , pte , entry , flags & FAULT_FLAG_WRITE ) ) {
2009-12-18 16:40:18 +00:00
update_mmu_cache ( vma , address , pte ) ;
2005-10-29 18:16:48 -07:00
} else {
/*
* This is needed only for protection faults but the arch code
* is not yet telling us if this is a protection fault or not.
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
2009-04-10 08:43:11 -07:00
if ( flags & FAULT_FLAG_WRITE )
2010-08-16 09:16:55 +08:00
flush_tlb_fix_spurious_fault ( vma , address ) ;
2005-10-29 18:16:48 -07:00
}
2005-10-29 18:16:26 -07:00
unlock :
pte_unmap_unlock ( pte , ptl ) ;
2007-07-19 01:47:05 -07:00
return 0 ;
2005-04-16 15:20:36 -07:00
}
/*
* By the time we get here, we already hold the mm semaphore
*/
2007-07-19 01:47:05 -07:00
int handle_mm_fault ( struct mm_struct * mm , struct vm_area_struct * vma ,
2009-04-10 09:01:23 -07:00
unsigned long address , unsigned int flags )
2005-04-16 15:20:36 -07:00
{
pgd_t * pgd ;
pud_t * pud ;
pmd_t * pmd ;
pte_t * pte ;
__set_current_state ( TASK_RUNNING ) ;
2006-06-30 01:55:45 -07:00
count_vm_event ( PGFAULT ) ;
2011-05-26 16:25:38 -07:00
mem_cgroup_count_vm_event ( mm , PGFAULT ) ;
2005-04-16 15:20:36 -07:00
2010-03-05 13:41:40 -08:00
/* do counter updates before entering really critical section. */
check_sync_rss_stat ( current ) ;
2005-10-20 16:24:28 +01:00
if ( unlikely ( is_vm_hugetlb_page ( vma ) ) )
2009-04-10 08:43:11 -07:00
return hugetlb_fault ( mm , vma , address , flags ) ;
2005-04-16 15:20:36 -07:00
2012-05-29 15:06:23 -07:00
retry :
2005-04-16 15:20:36 -07:00
pgd = pgd_offset ( mm , address ) ;
pud = pud_alloc ( mm , pgd , address ) ;
if ( ! pud )
2005-10-29 18:16:23 -07:00
return VM_FAULT_OOM ;
2005-04-16 15:20:36 -07:00
pmd = pmd_alloc ( mm , pud , address ) ;
if ( ! pmd )
2005-10-29 18:16:23 -07:00
return VM_FAULT_OOM ;
2011-01-13 15:46:52 -08:00
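/*
* Handle the fault at the pmd level first: an empty pmd in an
* anonymous vma may be populated with a transparent huge page, and an
* existing huge pmd is serviced here (NUMA hinting, write protection
* or the accessed bit) without falling through to the pte level.
*/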
if ( pmd_none ( * pmd ) & & transparent_hugepage_enabled ( vma ) ) {
if ( ! vma - > vm_ops )
return do_huge_pmd_anonymous_page ( mm , vma , address ,
pmd , flags ) ;
} else {
pmd_t orig_pmd = * pmd ;
2012-05-29 15:06:23 -07:00
int ret ;
2011-01-13 15:46:52 -08:00
barrier ( ) ;
if ( pmd_trans_huge ( orig_pmd ) ) {
2012-12-11 16:01:27 -08:00
unsigned int dirty = flags & FAULT_FLAG_WRITE ;
2012-12-16 14:33:25 -08:00
if ( pmd_numa ( orig_pmd ) )
2012-11-02 11:33:45 +00:00
return do_huge_pmd_numa_page ( mm , vma , address ,
2012-10-25 14:16:31 +02:00
orig_pmd , pmd ) ;
2012-12-16 14:33:25 -08:00
if ( dirty & & ! pmd_write ( orig_pmd ) ) {
2012-05-29 15:06:23 -07:00
ret = do_huge_pmd_wp_page ( mm , vma , address , pmd ,
orig_pmd ) ;
/*
* If COW results in an oom, the huge pmd will
* have been split, so retry the fault on the
* pte for a smaller charge.
*/
if ( unlikely ( ret & VM_FAULT_OOM ) )
goto retry ;
return ret ;
2012-12-11 16:01:27 -08:00
} else {
huge_pmd_set_accessed ( mm , vma , address , pmd ,
orig_pmd , dirty ) ;
2012-05-29 15:06:23 -07:00
}
2012-10-25 14:16:31 +02:00
2011-01-13 15:46:52 -08:00
return 0 ;
}
}
2012-10-25 14:16:31 +02:00
if ( pmd_numa ( * pmd ) )
return do_pmd_numa_page ( mm , vma , address , pmd ) ;
2011-01-13 15:46:52 -08:00
/*
* Use __pte_alloc instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if a huge pmd could
* materialize from under us from a different thread.
*/
2011-10-12 21:06:51 +02:00
if ( unlikely ( pmd_none ( * pmd ) ) & &
unlikely ( __pte_alloc ( mm , vma , pmd , address ) ) )
2005-10-29 18:16:23 -07:00
return VM_FAULT_OOM ;
2011-01-13 15:46:52 -08:00
/* if a huge pmd materialized from under us, just retry later */
if ( unlikely ( pmd_trans_huge ( * pmd ) ) )
return 0 ;
/*
* A regular pmd is established and it can't morph into a huge pmd
* from under us anymore at this point because we hold the mmap_sem
* in read mode and khugepaged takes it in write mode. So now it's
* safe to run pte_offset_map().
*/
pte = pte_offset_map ( pmd , address ) ;
2005-04-16 15:20:36 -07:00
2009-04-10 08:43:11 -07:00
return handle_pte_fault ( mm , vma , address , pte , pmd , flags ) ;
2005-04-16 15:20:36 -07:00
}
# ifndef __PAGETABLE_PUD_FOLDED
/*
* Allocate page upper directory.
2005-10-29 18:16:21 -07:00
* We've already handled the fast-path in-line.
2005-04-16 15:20:36 -07:00
*/
2005-10-29 18:16:22 -07:00
int __pud_alloc ( struct mm_struct * mm , pgd_t * pgd , unsigned long address )
2005-04-16 15:20:36 -07:00
{
2005-10-29 18:16:23 -07:00
pud_t * new = pud_alloc_one ( mm , address ) ;
if ( ! new )
2005-10-29 18:16:22 -07:00
return - ENOMEM ;
2005-04-16 15:20:36 -07:00
2008-05-14 06:37:36 +02:00
smp_wmb ( ) ; /* See comment in __pte_alloc */
2005-10-29 18:16:21 -07:00
spin_lock ( & mm - > page_table_lock ) ;
2005-10-29 18:16:22 -07:00
if ( pgd_present ( * pgd ) ) /* Another has populated it */
2008-02-04 22:29:14 -08:00
pud_free ( mm , new ) ;
2005-10-29 18:16:22 -07:00
else
pgd_populate ( mm , pgd , new ) ;
2005-10-29 18:16:23 -07:00
spin_unlock ( & mm - > page_table_lock ) ;
2005-10-29 18:16:22 -07:00
return 0 ;
2005-04-16 15:20:36 -07:00
}
# endif /* __PAGETABLE_PUD_FOLDED */
# ifndef __PAGETABLE_PMD_FOLDED
/*
* Allocate page middle directory.
2005-10-29 18:16:21 -07:00
* We've already handled the fast-path in-line.
2005-04-16 15:20:36 -07:00
*/
2005-10-29 18:16:22 -07:00
int __pmd_alloc ( struct mm_struct * mm , pud_t * pud , unsigned long address )
2005-04-16 15:20:36 -07:00
{
2005-10-29 18:16:23 -07:00
pmd_t * new = pmd_alloc_one ( mm , address ) ;
if ( ! new )
2005-10-29 18:16:22 -07:00
return - ENOMEM ;
2005-04-16 15:20:36 -07:00
2008-05-14 06:37:36 +02:00
smp_wmb ( ) ; /* See comment in __pte_alloc */
2005-10-29 18:16:21 -07:00
spin_lock ( & mm - > page_table_lock ) ;
2005-04-16 15:20:36 -07:00
# ifndef __ARCH_HAS_4LEVEL_HACK
2005-10-29 18:16:22 -07:00
if ( pud_present ( * pud ) ) /* Another has populated it */
2008-02-04 22:29:14 -08:00
pmd_free ( mm , new ) ;
2005-10-29 18:16:22 -07:00
else
pud_populate ( mm , pud , new ) ;
2005-04-16 15:20:36 -07:00
# else
2005-10-29 18:16:22 -07:00
if ( pgd_present ( * pud ) ) /* Another has populated it */
2008-02-04 22:29:14 -08:00
pmd_free ( mm , new ) ;
2005-10-29 18:16:22 -07:00
else
pgd_populate ( mm , pud , new ) ;
2005-04-16 15:20:36 -07:00
# endif /* __ARCH_HAS_4LEVEL_HACK */
2005-10-29 18:16:23 -07:00
spin_unlock ( & mm - > page_table_lock ) ;
2005-10-29 18:16:22 -07:00
return 0 ;
2005-11-28 13:43:44 -08:00
}
2005-04-16 15:20:36 -07:00
# endif /* __PAGETABLE_PMD_FOLDED */
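/*
* Fault in every page in the range [addr, end) via get_user_pages(),
* using write faults on private writable mappings so that COW is
* broken up front. Returns 0 when the whole range was made present,
* or a negative error otherwise.
*/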
int make_pages_present ( unsigned long addr , unsigned long end )
{
int ret , len , write ;
struct vm_area_struct * vma ;
vma = find_vma ( current - > mm , addr ) ;
if ( ! vma )
2008-08-04 13:41:14 -07:00
return - ENOMEM ;
2011-01-13 15:46:09 -08:00
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
* and we would not want to dirty them for nothing.
*/
write = ( vma - > vm_flags & ( VM_WRITE | VM_SHARED ) ) = = VM_WRITE ;
2006-03-26 18:30:52 +02:00
BUG_ON ( addr > = end ) ;
BUG_ON ( end > vma - > vm_end ) ;
2007-07-15 23:38:03 -07:00
len = DIV_ROUND_UP ( end , PAGE_SIZE ) - addr / PAGE_SIZE ;
2005-04-16 15:20:36 -07:00
ret = get_user_pages ( current , current - > mm , addr ,
len , write , 0 , NULL , NULL ) ;
2008-10-18 20:26:56 -07:00
if ( ret < 0 )
2005-04-16 15:20:36 -07:00
return ret ;
2008-10-18 20:26:56 -07:00
return ret = = len ? 0 : - EFAULT ;
2005-04-16 15:20:36 -07:00
}
# if !defined(__HAVE_ARCH_GATE_AREA)
# if defined(AT_SYSINFO_EHDR)
2005-09-10 00:26:28 -07:00
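/*
* When the architecture does not provide its own gate area but does
* define AT_SYSINFO_EHDR, describe the fixed user-visible range
* (FIXADDR_USER_START..FIXADDR_USER_END) with a single static,
* mm-less vma so that callers of get_gate_vma() can find it.
*/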
static struct vm_area_struct gate_vma ;
2005-04-16 15:20:36 -07:00
static int __init gate_vma_init ( void )
{
gate_vma . vm_mm = NULL ;
gate_vma . vm_start = FIXADDR_USER_START ;
gate_vma . vm_end = FIXADDR_USER_END ;
2007-01-26 00:56:47 -08:00
gate_vma . vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC ;
gate_vma . vm_page_prot = __P101 ;
2012-03-23 15:02:51 -07:00
2005-04-16 15:20:36 -07:00
return 0 ;
}
__initcall ( gate_vma_init ) ;
# endif
2011-03-13 15:49:15 -04:00
struct vm_area_struct * get_gate_vma ( struct mm_struct * mm )
2005-04-16 15:20:36 -07:00
{
# ifdef AT_SYSINFO_EHDR
return & gate_vma ;
# else
return NULL ;
# endif
}
2011-03-13 15:49:17 -04:00
int in_gate_area_no_mm ( unsigned long addr )
2005-04-16 15:20:36 -07:00
{
# ifdef AT_SYSINFO_EHDR
if ( ( addr > = FIXADDR_USER_START ) & & ( addr < FIXADDR_USER_END ) )
return 1 ;
# endif
return 0 ;
}
# endif /* __HAVE_ARCH_GATE_AREA */
2006-09-27 01:50:15 -07:00
2010-10-26 14:22:00 -07:00
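/*
* Walk the page tables for @address and, if a present pte is found,
* return 0 with *ptepp pointing at the mapped pte and *ptlp holding
* its lock; the caller must drop both with pte_unmap_unlock().
* Huge pages are not handled here.
*/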
static int __follow_pte ( struct mm_struct * mm , unsigned long address ,
2009-06-16 15:32:33 -07:00
pte_t * * ptepp , spinlock_t * * ptlp )
{
pgd_t * pgd ;
pud_t * pud ;
pmd_t * pmd ;
pte_t * ptep ;
pgd = pgd_offset ( mm , address ) ;
if ( pgd_none ( * pgd ) | | unlikely ( pgd_bad ( * pgd ) ) )
goto out ;
pud = pud_offset ( pgd , address ) ;
if ( pud_none ( * pud ) | | unlikely ( pud_bad ( * pud ) ) )
goto out ;
pmd = pmd_offset ( pud , address ) ;
2011-01-13 15:46:54 -08:00
VM_BUG_ON ( pmd_trans_huge ( * pmd ) ) ;
2009-06-16 15:32:33 -07:00
if ( pmd_none ( * pmd ) | | unlikely ( pmd_bad ( * pmd ) ) )
goto out ;
/* We cannot handle huge page PFN maps. Luckily they don't exist. */
if ( pmd_huge ( * pmd ) )
goto out ;
ptep = pte_offset_map_lock ( mm , pmd , address , ptlp ) ;
if ( ! ptep )
goto out ;
if ( ! pte_present ( * ptep ) )
goto unlock ;
* ptepp = ptep ;
return 0 ;
unlock :
pte_unmap_unlock ( ptep , * ptlp ) ;
out :
return - EINVAL ;
}
2010-10-26 14:22:00 -07:00
static inline int follow_pte ( struct mm_struct * mm , unsigned long address ,
pte_t * * ptepp , spinlock_t * * ptlp )
{
int res ;
/* (void) is needed to make gcc happy */
( void ) __cond_lock ( * ptlp ,
! ( res = __follow_pte ( mm , address , ptepp , ptlp ) ) ) ;
return res ;
}
2009-06-16 15:32:35 -07:00
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
* @address: user virtual address
* @pfn: location to store found PFN
*
* Only IO mappings and raw PFN mappings are allowed.
*
* Returns zero and stores the PFN at @pfn on success, or a negative error otherwise.
*/
int follow_pfn ( struct vm_area_struct * vma , unsigned long address ,
unsigned long * pfn )
{
int ret = - EINVAL ;
spinlock_t * ptl ;
pte_t * ptep ;
if ( ! ( vma - > vm_flags & ( VM_IO | VM_PFNMAP ) ) )
return ret ;
ret = follow_pte ( vma - > vm_mm , address , & ptep , & ptl ) ;
if ( ret )
return ret ;
* pfn = pte_pfn ( * ptep ) ;
pte_unmap_unlock ( ptep , ptl ) ;
return 0 ;
}
EXPORT_SYMBOL ( follow_pfn ) ;
2008-07-23 21:27:05 -07:00
# ifdef CONFIG_HAVE_IOREMAP_PROT
2008-12-19 13:47:27 -08:00
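/*
* Look up the physical address and protection bits behind @address in
* an IO or raw PFN mapping. Fails with -EINVAL if the vma is not
* VM_IO/VM_PFNMAP, the pte is not present, or a write is requested on
* a read-only pte.
*/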
int follow_phys ( struct vm_area_struct * vma ,
unsigned long address , unsigned int flags ,
unsigned long * prot , resource_size_t * phys )
2008-07-23 21:27:05 -07:00
{
2009-06-16 15:32:34 -07:00
int ret = - EINVAL ;
2008-07-23 21:27:05 -07:00
pte_t * ptep , pte ;
spinlock_t * ptl ;
2008-12-19 13:47:27 -08:00
if ( ! ( vma - > vm_flags & ( VM_IO | VM_PFNMAP ) ) )
goto out ;
2008-07-23 21:27:05 -07:00
2009-06-16 15:32:34 -07:00
if ( follow_pte ( vma - > vm_mm , address , & ptep , & ptl ) )
2008-12-19 13:47:27 -08:00
goto out ;
2008-07-23 21:27:05 -07:00
pte = * ptep ;
2009-06-16 15:32:34 -07:00
2008-07-23 21:27:05 -07:00
if ( ( flags & FOLL_WRITE ) & & ! pte_write ( pte ) )
goto unlock ;
* prot = pgprot_val ( pte_pgprot ( pte ) ) ;
2009-06-16 15:32:34 -07:00
* phys = ( resource_size_t ) pte_pfn ( pte ) < < PAGE_SHIFT ;
2008-07-23 21:27:05 -07:00
2009-06-16 15:32:34 -07:00
ret = 0 ;
2008-07-23 21:27:05 -07:00
unlock :
pte_unmap_unlock ( ptep , ptl ) ;
out :
2008-12-19 13:47:27 -08:00
return ret ;
2008-07-23 21:27:05 -07:00
}
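/*
* Generic helper for the vm_operations ->access() hook on VM_IO /
* VM_PFNMAP mappings: resolve the physical address behind @addr,
* ioremap it with the mapping's protection bits, and copy @len bytes
* to or from @buf.
*/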
int generic_access_phys ( struct vm_area_struct * vma , unsigned long addr ,
void * buf , int len , int write )
{
resource_size_t phys_addr ;
unsigned long prot = 0 ;
2009-01-06 14:39:43 -08:00
void __iomem * maddr ;
2008-07-23 21:27:05 -07:00
int offset = addr & ( PAGE_SIZE - 1 ) ;
2008-12-19 13:47:27 -08:00
if ( follow_phys ( vma , addr , write , & prot , & phys_addr ) )
2008-07-23 21:27:05 -07:00
return - EINVAL ;
maddr = ioremap_prot ( phys_addr , PAGE_SIZE , prot ) ;
if ( write )
memcpy_toio ( maddr + offset , buf , len ) ;
else
memcpy_fromio ( buf , maddr + offset , len ) ;
iounmap ( maddr ) ;
return len ;
}
# endif
2006-09-27 01:50:15 -07:00
/*
2011-03-13 15:49:19 -04:00
* Access another process' address space as given in mm. If non-NULL, use the
* given task for page fault accounting.
2006-09-27 01:50:15 -07:00
*/
2011-03-13 15:49:19 -04:00
static int __access_remote_vm ( struct task_struct * tsk , struct mm_struct * mm ,
unsigned long addr , void * buf , int len , int write )
2006-09-27 01:50:15 -07:00
{
struct vm_area_struct * vma ;
void * old_buf = buf ;
down_read ( & mm - > mmap_sem ) ;
2007-10-20 01:27:18 +02:00
/* ignore errors, just check how much was successfully transferred */
2006-09-27 01:50:15 -07:00
while ( len ) {
int bytes , ret , offset ;
void * maddr ;
2008-07-23 21:27:05 -07:00
struct page * page = NULL ;
2006-09-27 01:50:15 -07:00
ret = get_user_pages ( tsk , mm , addr , 1 ,
write , 1 , & page , & vma ) ;
2008-07-23 21:27:05 -07:00
if ( ret < = 0 ) {
/*
* Check if this is a VM_IO | VM_PFNMAP VMA, which
* we can access using slightly different code.
*/
# ifdef CONFIG_HAVE_IOREMAP_PROT
vma = find_vma ( mm , addr ) ;
2011-04-14 15:22:10 -07:00
if ( ! vma | | vma - > vm_start > addr )
2008-07-23 21:27:05 -07:00
break ;
if ( vma - > vm_ops & & vma - > vm_ops - > access )
ret = vma - > vm_ops - > access ( vma , addr , buf ,
len , write ) ;
if ( ret < = 0 )
# endif
break ;
bytes = ret ;
2006-09-27 01:50:15 -07:00
} else {
2008-07-23 21:27:05 -07:00
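/*
* Got a normal page: copy through a temporary kernel
* mapping, one page (or less) at a time.
*/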
bytes = len ;
offset = addr & ( PAGE_SIZE - 1 ) ;
if ( bytes > PAGE_SIZE - offset )
bytes = PAGE_SIZE - offset ;
maddr = kmap ( page ) ;
if ( write ) {
copy_to_user_page ( vma , page , addr ,
maddr + offset , buf , bytes ) ;
set_page_dirty_lock ( page ) ;
} else {
copy_from_user_page ( vma , page , addr ,
buf , maddr + offset , bytes ) ;
}
kunmap ( page ) ;
page_cache_release ( page ) ;
2006-09-27 01:50:15 -07:00
}
len - = bytes ;
buf + = bytes ;
addr + = bytes ;
}
up_read ( & mm - > mmap_sem ) ;
return buf - old_buf ;
}
2008-01-30 13:33:18 +01:00
2011-03-13 15:49:20 -04:00
/**
2011-03-26 13:27:01 -07:00
* access_remote_vm - access another process' address space
2011-03-13 15:49:20 -04:00
* @mm: the mm_struct of the target address space
* @addr: start address to access
* @buf: source or destination buffer
* @len: number of bytes to transfer
* @write: whether the access is a write
*
* The caller must hold a reference on @mm.
*/
int access_remote_vm ( struct mm_struct * mm , unsigned long addr ,
void * buf , int len , int write )
{
return __access_remote_vm ( NULL , mm , addr , buf , len , write ) ;
}
2011-03-13 15:49:19 -04:00
/*
* Access another process' address space.
* The source/target buffer must be in kernel space.
* Do not walk the page tables directly; use get_user_pages().
*/
int access_process_vm ( struct task_struct * tsk , unsigned long addr ,
void * buf , int len , int write )
{
struct mm_struct * mm ;
int ret ;
mm = get_task_mm ( tsk ) ;
if ( ! mm )
return 0 ;
ret = __access_remote_vm ( tsk , mm , addr , buf , len , write ) ;
mmput ( mm ) ;
return ret ;
}
2008-01-30 13:33:18 +01:00
/*
* Print the name of a VMA.
*/
void print_vma_addr ( char * prefix , unsigned long ip )
{
struct mm_struct * mm = current - > mm ;
struct vm_area_struct * vma ;
2008-02-13 20:21:06 +01:00
/*
* Do not print if we are in atomic
* contexts (in exception stacks, etc.):
*/
if ( preempt_count ( ) )
return ;
2008-01-30 13:33:18 +01:00
down_read ( & mm - > mmap_sem ) ;
vma = find_vma ( mm , ip ) ;
if ( vma & & vma - > vm_file ) {
struct file * f = vma - > vm_file ;
char * buf = ( char * ) __get_free_page ( GFP_KERNEL ) ;
if ( buf ) {
char * p , * s ;
2008-02-14 19:38:44 -08:00
p = d_path ( & f - > f_path , buf , PAGE_SIZE ) ;
2008-01-30 13:33:18 +01:00
if ( IS_ERR ( p ) )
p = " ? " ;
s = strrchr ( p , ' / ' ) ;
if ( s )
p = s + 1 ;
printk ( " %s%s[%lx+%lx] " , prefix , p ,
vma - > vm_start ,
vma - > vm_end - vma - > vm_start ) ;
free_page ( ( unsigned long ) buf ) ;
}
}
2012-07-31 16:43:18 -07:00
up_read ( & mm - > mmap_sem ) ;
2008-01-30 13:33:18 +01:00
}
2008-09-10 13:37:17 +02:00
# ifdef CONFIG_PROVE_LOCKING
void might_fault ( void )
{
2009-01-12 13:02:11 +01:00
/*
* Some code (nfs/sunrpc) uses socket ops on kernel memory while
* holding the mmap_sem. This is safe because kernel memory doesn't
* get paged out, therefore we'll never actually fault, and the
* annotations below would only generate false positives.
*/
if ( segment_eq ( get_fs ( ) , KERNEL_DS ) )
return ;
2008-09-10 13:37:17 +02:00
might_sleep ( ) ;
/*
* It would be nicer only to annotate paths which are not under
* pagefault_disable; however, that requires a larger audit and
* providing helpers like get_user_atomic.
*/
if ( ! in_atomic ( ) & & current - > mm )
might_lock_read ( & current - > mm - > mmap_sem ) ;
}
EXPORT_SYMBOL ( might_fault ) ;
# endif
2011-01-13 15:46:47 -08:00
# if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
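/*
* Helpers for clearing and copying huge pages one base page at a
* time, calling cond_resched() between pages. Pages spanning more
* than MAX_ORDER_NR_PAGES are walked with mem_map_next(), since their
* struct pages may not be contiguous in the mem_map.
*/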
static void clear_gigantic_page ( struct page * page ,
unsigned long addr ,
unsigned int pages_per_huge_page )
{
int i ;
struct page * p = page ;
might_sleep ( ) ;
for ( i = 0 ; i < pages_per_huge_page ;
i + + , p = mem_map_next ( p , page , i ) ) {
cond_resched ( ) ;
clear_user_highpage ( p , addr + i * PAGE_SIZE ) ;
}
}
void clear_huge_page ( struct page * page ,
unsigned long addr , unsigned int pages_per_huge_page )
{
int i ;
if ( unlikely ( pages_per_huge_page > MAX_ORDER_NR_PAGES ) ) {
clear_gigantic_page ( page , addr , pages_per_huge_page ) ;
return ;
}
might_sleep ( ) ;
for ( i = 0 ; i < pages_per_huge_page ; i + + ) {
cond_resched ( ) ;
clear_user_highpage ( page + i , addr + i * PAGE_SIZE ) ;
}
}
static void copy_user_gigantic_page ( struct page * dst , struct page * src ,
unsigned long addr ,
struct vm_area_struct * vma ,
unsigned int pages_per_huge_page )
{
int i ;
struct page * dst_base = dst ;
struct page * src_base = src ;
for ( i = 0 ; i < pages_per_huge_page ; ) {
cond_resched ( ) ;
copy_user_highpage ( dst , src , addr + i * PAGE_SIZE , vma ) ;
i + + ;
dst = mem_map_next ( dst , dst_base , i ) ;
src = mem_map_next ( src , src_base , i ) ;
}
}
void copy_user_huge_page ( struct page * dst , struct page * src ,
unsigned long addr , struct vm_area_struct * vma ,
unsigned int pages_per_huge_page )
{
int i ;
if ( unlikely ( pages_per_huge_page > MAX_ORDER_NR_PAGES ) ) {
copy_user_gigantic_page ( dst , src , addr , vma ,
pages_per_huge_page ) ;
return ;
}
might_sleep ( ) ;
for ( i = 0 ; i < pages_per_huge_page ; i + + ) {
cond_resched ( ) ;
copy_user_highpage ( dst + i , src + i , addr + i * PAGE_SIZE , vma ) ;
}
}
# endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */