/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *		Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *		(Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>

#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include <linux/swapops.h>
#include <linux/elf.h>

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
struct page *mem_map;

EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif

unsigned long num_physpages;
/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
void *high_memory;
unsigned long vmalloc_earlyreserve;

EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);
EXPORT_SYMBOL(vmalloc_earlyreserve);

int randomize_va_space __read_mostly = 1;

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

/*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none.  Usually (but
 * very seldom) called out from the p?d_none_or_clear_bad macros.
 */

void pgd_clear_bad(pgd_t *pgd)
{
	pgd_ERROR(*pgd);
	pgd_clear(pgd);
}

void pud_clear_bad(pud_t *pud)
{
	pud_ERROR(*pud);
	pud_clear(pud);
}

void pmd_clear_bad(pmd_t *pmd)
{
	pmd_ERROR(*pmd);
	pmd_clear(pmd);
}

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
{
	struct page *page = pmd_page(*pmd);
	pmd_clear(pmd);
	pte_lock_deinit(page);
	pte_free_tlb(tlb, page);
	dec_zone_page_state(page, NR_PAGETABLE);
	tlb->mm->nr_ptes--;
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd);
}

static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void free_pgd_range(struct mmu_gather **tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long start;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below?  no, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;

	start = addr;
	pgd = pgd_offset((*tlb)->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);

	if (!(*tlb)->fullmm)
		flush_tlb_pgtables((*tlb)->mm, start, end);
}

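/*
 * Worked illustration of the "- 1" rule above: addr, end, floor and
 * ceiling are unsigned longs, so a value of 0 for end or ceiling stands
 * for the very top of the address space, and 0 - 1 wraps around to
 * ULONG_MAX.  The comparison "end - 1 > ceiling - 1" therefore orders 0
 * above every other value, which is exactly what the rule requires.
 */
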
void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and vmtruncate before freeing pgtables
		 */
		anon_vma_unlink(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				anon_vma_unlink(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		}
		vma = next;
	}
}

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
	struct page *new = pte_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	pte_lock_init(new);
	spin_lock(&mm->page_table_lock);
	if (pmd_present(*pmd)) {	/* Another has populated it */
		pte_lock_deinit(new);
		pte_free(new);
	} else {
		mm->nr_ptes++;
		inc_zone_page_state(new, NR_PAGETABLE);
		pmd_populate(mm, pmd, new);
	}
	spin_unlock(&mm->page_table_lock);
	return 0;
}

int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
	if (!new)
		return -ENOMEM;

	spin_lock(&init_mm.page_table_lock);
	if (pmd_present(*pmd))		/* Another has populated it */
		pte_free_kernel(new);
	else
		pmd_populate_kernel(&init_mm, pmd, new);
	spin_unlock(&init_mm.page_table_lock);
	return 0;
}

static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
{
	if (file_rss)
		add_mm_counter(mm, file_rss, file_rss);
	if (anon_rss)
		add_mm_counter(mm, anon_rss, anon_rss);
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
{
	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
			"vm_flags = %lx, vaddr = %lx\n",
		(long long)pte_val(pte),
		(vma->vm_mm == current->mm ? current->comm : "???"),
		vma->vm_flags, vaddr);
	dump_stack();
}

static inline int is_cow_mapping(unsigned int flags)
{
	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}

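/*
 * Illustration: a MAP_PRIVATE mapping has VM_MAYWRITE set (it can
 * always be mprotected to allow writes) but VM_SHARED clear, so
 * is_cow_mapping() returns true for it; any MAP_SHARED mapping has
 * VM_SHARED set, and so is never treated as copy-on-write.
 */
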
/*
 * This function gets the "struct page" associated with a pte.
 *
 * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
 * will have each page table entry just pointing to a raw page frame
 * number, and as far as the VM layer is concerned, those do not have
 * pages associated with them - even if the PFN might point to memory
 * that otherwise is perfectly fine and has a "struct page".
 *
 * The way we recognize those mappings is through the rules set up
 * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
 * and the vm_pgoff will point to the first PFN mapped: thus every
 * page that is a raw mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * and if that isn't true, the page has been COW'ed (in which case it
 * _does_ have a "struct page" associated with it even if it is in a
 * VM_PFNMAP range).
 */
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
		unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
		if (pfn == vma->vm_pgoff + off)
			return NULL;
		if (!is_cow_mapping(vma->vm_flags))
			return NULL;
	}

	/*
	 * Add some anal sanity checks for now. Eventually,
	 * we should just do "return pfn_to_page(pfn)", but
	 * in the meantime we check that we get a valid pfn,
	 * and that the resulting page looks ok.
	 */
	if (unlikely(!pfn_valid(pfn))) {
		print_bad_pte(vma, pte, addr);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page
	 * tables.
	 *
	 * The ZERO_PAGE() pages and various VDSO mappings can
	 * cause them to exist.
	 */
	return pfn_to_page(pfn);
}

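/*
 * Worked example of the rule above, assuming 4K pages: in a VM_PFNMAP
 * vma with vm_start == 0x10000000 and vm_pgoff == 0x800, the pte at
 * addr 0x10003000 is a raw mapping iff its pfn is 0x800 + 3; a pte
 * holding any other pfn there must have been COW'ed, and so does have
 * a usable "struct page".
 */
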
/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */

static inline void
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {
		if (!pte_file(pte)) {
			swp_entry_t entry = pte_to_swp_entry(pte);

			swap_duplicate(entry);
			/* make sure dst_mm is on swapoff's mmlist. */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
						 &src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			if (is_write_migration_entry(entry) &&
					is_cow_mapping(vm_flags)) {
				/*
				 * COW mappings require pages in both parent
				 * and child to be set to read.
				 */
				make_migration_entry_read(&entry);
				pte = swp_entry_to_pte(entry);
				set_pte_at(src_mm, addr, src_pte, pte);
			}
		}
		goto out_set_pte;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page);
		rss[!!PageAnon(page)]++;
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
}

static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[2];

again:
	rss[1] = rss[0] = 0;
	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map_nested(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    need_lockbreak(src_ptl) ||
			    need_lockbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap_nested(src_pte - 1);
	add_mm_rss(dst_mm, rss[0], rss[1]);
	pte_unmap_unlock(dst_pte - 1, dst_ptl);
	cond_resched();
	if (addr != end)
		goto again;
	return 0;
}

static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
		if (!vma->anon_vma)
			return 0;
	}

	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
	return 0;
}

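/*
 * Example of the shortcut above: after fork() of a process with a big
 * read-only file mapping (no anon_vma, and none of the flags tested),
 * no ptes are copied at all; the child simply takes minor faults to
 * fill them in on first touch.
 */
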
static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	pte_t *pte;
	spinlock_t *ptl;
	int file_rss = 0;
	int anon_rss = 0;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent)) {
			(*zap_work)--;
			continue;
		}

		(*zap_work) -= PAGE_SIZE;

		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page->mapping)
					continue;
				/*
				 * Each page->index must be checked when
				 * invalidating or truncating nonlinear.
				 */
				if (details->nonlinear_vma &&
				    (page->index < details->first_index ||
				     page->index > details->last_index))
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;
			if (unlikely(details) && details->nonlinear_vma
			    && linear_page_index(details->nonlinear_vma,
						addr) != page->index)
				set_pte_at(mm, addr, pte,
					   pgoff_to_pte(page->index));
			if (PageAnon(page))
				anon_rss--;
			else {
				if (pte_dirty(ptent))
					set_page_dirty(page);
				if (pte_young(ptent))
					SetPageReferenced(page);
				file_rss--;
			}
			page_remove_rmap(page, vma);
			tlb_remove_page(tlb, page);
			continue;
		}
		/*
		 * If details->check_mapping, we leave swap entries;
		 * if details->nonlinear_vma, we leave file entries.
		 */
		if (unlikely(details))
			continue;
		if (!pte_file(ptent))
			free_swap_and_cache(pte_to_swp_entry(ptent));
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));

	add_mm_rss(mm, file_rss, anon_rss);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);

	return addr;
}

static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd)) {
			(*zap_work)--;
			continue;
		}
		next = zap_pte_range(tlb, vma, pmd, addr, next,
						zap_work, details);
	} while (pmd++, addr = next, (addr != end && *zap_work > 0));

	return addr;
}

static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud)) {
			(*zap_work)--;
			continue;
		}
		next = zap_pmd_range(tlb, vma, pud, addr, next,
						zap_work, details);
	} while (pud++, addr = next, (addr != end && *zap_work > 0));

	return addr;
}

static unsigned long unmap_page_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

	if (details && !details->check_mapping && !details->nonlinear_vma)
		details = NULL;

	BUG_ON(addr >= end);
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			(*zap_work)--;
			continue;
		}
		next = zap_pud_range(tlb, vma, pgd, addr, next,
						zap_work, details);
	} while (pgd++, addr = next, (addr != end && *zap_work > 0));
	tlb_end_vma(tlb, vma);

	return addr;
}

#ifdef CONFIG_PREEMPT
# define ZAP_BLOCK_SIZE	(8 * PAGE_SIZE)
#else
/* No preempt: go for improved straight-line efficiency */
# define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE)
#endif

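/*
 * For scale, assuming 4K pages: a preemptible kernel zaps at most
 * 8 * 4096 = 32KiB between lock breaks, while a non-preemptible kernel
 * works through 1024 * 4096 = 4MiB at a time.
 */
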
/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlbp: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
 * @details: details of nonlinear truncation or shared cache invalidation
 *
 * Returns the end address of the unmapping (restart addr if interrupted).
 *
 * Unmap all pages in the vma list.
 *
 * We aim to not hold locks for too long (for scheduling latency reasons).
 * So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
 * return the ending mmu_gather to the caller.
 *
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
unsigned long unmap_vmas(struct mmu_gather **tlbp,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr, unsigned long *nr_accounted,
		struct zap_details *details)
{
	long zap_work = ZAP_BLOCK_SIZE;
	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
	int tlb_start_valid = 0;
	unsigned long start = start_addr;
	spinlock_t *i_mmap_lock = details ? details->i_mmap_lock : NULL;
	int fullmm = (*tlbp)->fullmm;

	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
		unsigned long end;

		start = max(vma->vm_start, start_addr);
		if (start >= vma->vm_end)
			continue;
		end = min(vma->vm_end, end_addr);
		if (end <= vma->vm_start)
			continue;

		if (vma->vm_flags & VM_ACCOUNT)
			*nr_accounted += (end - start) >> PAGE_SHIFT;

		while (start != end) {
			if (!tlb_start_valid) {
				tlb_start = start;
				tlb_start_valid = 1;
			}

			if (unlikely(is_vm_hugetlb_page(vma))) {
				unmap_hugepage_range(vma, start, end);
				zap_work -= (end - start) /
						(HPAGE_SIZE / PAGE_SIZE);
				start = end;
			} else
				start = unmap_page_range(*tlbp, vma,
						start, end, &zap_work, details);

			if (zap_work > 0) {
				BUG_ON(start != end);
				break;
			}

			tlb_finish_mmu(*tlbp, tlb_start, start);

			if (need_resched() ||
				(i_mmap_lock && need_lockbreak(i_mmap_lock))) {
				if (i_mmap_lock) {
					*tlbp = NULL;
					goto out;
				}
				cond_resched();
			}

			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
			tlb_start_valid = 0;
			zap_work = ZAP_BLOCK_SIZE;
		}
	}
out:
	return start;	/* which is now the end (or restart) address */
}

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of nonlinear truncation or shared cache invalidation
 */
unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather *tlb;
	unsigned long end = address + size;
	unsigned long nr_accounted = 0;

	lru_add_drain();
	tlb = tlb_gather_mmu(mm, 0);
	update_hiwater_rss(mm);
	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
	if (tlb)
		tlb_finish_mmu(tlb, address, end);
	return end;
}

/*
 * Do a quick page-table lookup for a single page.
 */
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			unsigned int flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		BUG_ON(flags & FOLL_GET);
		goto out;
	}

	page = NULL;
	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto no_page_table;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
		goto no_page_table;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
		goto no_page_table;

	if (pmd_huge(*pmd)) {
		BUG_ON(flags & FOLL_GET);
		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
		goto out;
	}

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (!ptep)
		goto out;

	pte = *ptep;
	if (!pte_present(pte))
		goto unlock;
	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;
	page = vm_normal_page(vma, address, pte);
	if (unlikely(!page))
		goto unlock;

	if (flags & FOLL_GET)
		get_page(page);
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		mark_page_accessed(page);
	}
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return page;

no_page_table:
	/*
	 * When core dumping an enormous anonymous area that nobody
	 * has touched so far, we don't want to allocate page tables.
	 */
	if (flags & FOLL_ANON) {
		page = ZERO_PAGE(address);
		if (flags & FOLL_GET)
			get_page(page);
		BUG_ON(flags & FOLL_WRITE);
	}
	return page;
}

int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, int len, int write, int force,
		struct page **pages, struct vm_area_struct **vmas)
{
	int i;
	unsigned int vm_flags;

	/*
	 * Require read or write permissions.
	 * If 'force' is set, we only require the "MAY" flags.
	 */
	vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
	i = 0;

	do {
		struct vm_area_struct *vma;
		unsigned int foll_flags;

		vma = find_extend_vma(mm, start);
		if (!vma && in_gate_area(tsk, start)) {
			unsigned long pg = start & PAGE_MASK;
			struct vm_area_struct *gate_vma = get_gate_vma(tsk);
			pgd_t *pgd;
			pud_t *pud;
			pmd_t *pmd;
			pte_t *pte;
			if (write) /* user gate pages are read-only */
				return i ? : -EFAULT;
			if (pg > TASK_SIZE)
				pgd = pgd_offset_k(pg);
			else
				pgd = pgd_offset_gate(mm, pg);
			BUG_ON(pgd_none(*pgd));
			pud = pud_offset(pgd, pg);
			BUG_ON(pud_none(*pud));
			pmd = pmd_offset(pud, pg);
			if (pmd_none(*pmd))
				return i ? : -EFAULT;
			pte = pte_offset_map(pmd, pg);
			if (pte_none(*pte)) {
				pte_unmap(pte);
				return i ? : -EFAULT;
			}
			if (pages) {
				struct page *page = vm_normal_page(gate_vma, start, *pte);
				pages[i] = page;
				if (page)
					get_page(page);
			}
			pte_unmap(pte);
			if (vmas)
				vmas[i] = gate_vma;
			i++;
			start += PAGE_SIZE;
			len--;
			continue;
		}

		if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
				|| !(vm_flags & vma->vm_flags))
			return i ? : -EFAULT;

		if (is_vm_hugetlb_page(vma)) {
			i = follow_hugetlb_page(mm, vma, pages, vmas,
						&start, &len, i);
			continue;
		}

		foll_flags = FOLL_TOUCH;
		if (pages)
			foll_flags |= FOLL_GET;
		if (!write && !(vma->vm_flags & VM_LOCKED) &&
		    (!vma->vm_ops || !vma->vm_ops->nopage))
			foll_flags |= FOLL_ANON;

		do {
			struct page *page;

			if (write)
				foll_flags |= FOLL_WRITE;

			cond_resched();
			while (!(page = follow_page(vma, start, foll_flags))) {
				int ret;
				ret = __handle_mm_fault(mm, vma, start,
						foll_flags & FOLL_WRITE);
				/*
				 * The VM_FAULT_WRITE bit tells us that
				 * do_wp_page has broken COW when necessary,
				 * even if maybe_mkwrite decided not to set
				 * pte_write. We can thus safely do subsequent
				 * page lookups as if they were reads.
				 */
				if (ret & VM_FAULT_WRITE)
					foll_flags &= ~FOLL_WRITE;

				switch (ret & ~VM_FAULT_WRITE) {
				case VM_FAULT_MINOR:
					tsk->min_flt++;
					break;
				case VM_FAULT_MAJOR:
					tsk->maj_flt++;
					break;
				case VM_FAULT_SIGBUS:
					return i ? i : -EFAULT;
				case VM_FAULT_OOM:
					return i ? i : -ENOMEM;
				default:
					BUG();
				}
				cond_resched();
			}
			if (pages) {
				pages[i] = page;

				flush_anon_page(vma, page, start);
				flush_dcache_page(page);
			}
			if (vmas)
				vmas[i] = vma;
			i++;
			start += PAGE_SIZE;
			len--;
		} while (len && start < vma->vm_end);
	} while (len);
	return i;
}
EXPORT_SYMBOL(get_user_pages);

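/*
 * Illustrative use (a sketch, not code from this file): a driver
 * pinning user pages for I/O might do, with addr and npages its own,
 *
 *	down_read(&current->mm->mmap_sem);
 *	ret = get_user_pages(current, current->mm, addr, npages,
 *			     1, 0, pages, NULL);
 *	up_read(&current->mm->mmap_sem);
 *
 * passing write = 1 and force = 0, and later releasing each returned
 * page with page_cache_release().
 */
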
static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, unsigned long end, pgprot_t prot)
{
	pte_t *pte;
	spinlock_t *ptl;
	int err = 0;

	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -EAGAIN;
	arch_enter_lazy_mmu_mode();
	do {
		struct page *page = ZERO_PAGE(addr);
		pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));

		if (unlikely(!pte_none(*pte))) {
			err = -EEXIST;
			pte++;
			break;
		}
		page_cache_get(page);
		page_add_file_rmap(page);
		inc_mm_counter(mm, file_rss);
		set_pte_at(mm, addr, pte, zero_pte);
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);
	return err;
}

static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
			unsigned long addr, unsigned long end, pgprot_t prot)
{
	pmd_t *pmd;
	unsigned long next;
	int err;

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -EAGAIN;
	do {
		next = pmd_addr_end(addr, end);
		err = zeromap_pte_range(mm, pmd, addr, next, prot);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);
	return err;
}

static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
			unsigned long addr, unsigned long end, pgprot_t prot)
{
	pud_t *pud;
	unsigned long next;
	int err;

	pud = pud_alloc(mm, pgd, addr);
	if (!pud)
		return -EAGAIN;
	do {
		next = pud_addr_end(addr, end);
		err = zeromap_pmd_range(mm, pud, addr, next, prot);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);
	return err;
}

int zeromap_page_range(struct vm_area_struct *vma,
			unsigned long addr, unsigned long size, pgprot_t prot)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + size;
	struct mm_struct *mm = vma->vm_mm;
	int err;

	BUG_ON(addr >= end);
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	do {
		next = pgd_addr_end(addr, end);
		err = zeromap_pud_range(mm, pgd, addr, next, prot);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);
	return err;
}

pte_t *fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	pud_t *pud = pud_alloc(mm, pgd, addr);
	if (pud) {
		pmd_t *pmd = pmd_alloc(mm, pud, addr);
		if (pmd)
			return pte_alloc_map_lock(mm, pmd, addr, ptl);
	}
	return NULL;
}

/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */
static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
{
	int retval;
	pte_t *pte;
	spinlock_t *ptl;

	retval = -EINVAL;
	if (PageAnon(page))
		goto out;
	retval = -ENOMEM;
	flush_dcache_page(page);
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = -EBUSY;
	if (!pte_none(*pte))
		goto out_unlock;

	/* Ok, finally just insert the thing.. */
	get_page(page);
	inc_mm_counter(mm, file_rss);
	page_add_file_rmap(page);
	set_pte_at(mm, addr, pte, mk_pte(page, prot));

	retval = 0;
out_unlock:
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}

/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
{
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (!page_count(page))
		return -EINVAL;
	vma->vm_flags |= VM_INSERTPAGE;
	return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);

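/*
 * Illustrative sketch (hypothetical driver, not code from this file):
 * an mmap handler exporting a single kernel page "mydrv_page" could do
 *
 *	static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return vm_insert_page(vma, vma->vm_start, mydrv_page);
 *	}
 *
 * relying on vma->vm_page_prot as set up by the caller, per the NOTE
 * above.
 */
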
/**
 * vm_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual
 * pages they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return NULL.
 */
int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
		unsigned long pfn)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte, entry;
	spinlock_t *ptl;

	BUG_ON(!(vma->vm_flags & VM_PFNMAP));
	BUG_ON(is_cow_mapping(vma->vm_flags));

	retval = -ENOMEM;
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = -EBUSY;
	if (!pte_none(*pte))
		goto out_unlock;

	/* Ok, finally just insert the thing.. */
	entry = pfn_pte(pfn, vma->vm_page_prot);
	set_pte_at(mm, addr, pte, entry);
	update_mmu_cache(vma, addr, entry);

	retval = 0;
out_unlock:
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}
EXPORT_SYMBOL(vm_insert_pfn);

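/*
 * Contrast with vm_insert_page() above: here a raw pfn is installed
 * with no struct page refcounting or rmap at all, which is why the
 * BUG_ON()s insist on a VM_PFNMAP, non-COW vma.
 */
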
/*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages results
 * in null mappings (currently treated as "copy-on-access")
 */
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pte_t *pte;
	spinlock_t *ptl;

	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	arch_enter_lazy_mmu_mode();
	do {
		BUG_ON(!pte_none(*pte));
		set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);
	return 0;
}

static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pmd_t *pmd;
	unsigned long next;

	pfn -= addr >> PAGE_SHIFT;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		if (remap_pte_range(mm, pmd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pud_t *pud;
	unsigned long next;

	pfn -= addr >> PAGE_SHIFT;
	pud = pud_alloc(mm, pgd, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (remap_pmd_range(mm, pud, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target user address to start at
 * @pfn: physical address of kernel memory
 * @size: size of map area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
		    unsigned long pfn, unsigned long size, pgprot_t prot)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + PAGE_ALIGN(size);
	struct mm_struct *mm = vma->vm_mm;
	int err;

	/*
	 * Physically remapped pages are special. Tell the
	 * rest of the world about it:
	 *   VM_IO tells people not to look at these pages
	 *	(accesses can have side effects).
	 *   VM_RESERVED is specified all over the place, because
	 *	in 2.4 it kept swapout's vma scan off this vma; but
	 *	in 2.6 the LRU scan won't even find its pages, so this
	 *	flag means no more than count its pages in reserved_vm,
	 *	and omit it from core dump, even when VM_IO turned off.
	 *   VM_PFNMAP tells the core MM that the base pages are just
	 *	raw PFN mappings, and do not have a "struct page"
	 *	associated with them.
	 *
	 * There's a horrible special case to handle copy-on-write
	 * behaviour that some programs depend on. We mark the "original"
	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
	 */
	if (is_cow_mapping(vma->vm_flags)) {
		if (addr != vma->vm_start || end != vma->vm_end)
			return -EINVAL;
		vma->vm_pgoff = pfn;
	}

	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;

	BUG_ON(addr >= end);
	pfn -= addr >> PAGE_SHIFT;
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	do {
		next = pgd_addr_end(addr, end);
		err = remap_pud_range(mm, pgd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);
	return err;
}
EXPORT_SYMBOL(remap_pfn_range);

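/*
 * Typical use, sketched under assumptions (the handler name and
 * phys_base are hypothetical, not from this file): a driver's ->mmap
 * method exporting a physically contiguous buffer might do
 *
 *	static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		unsigned long size = vma->vm_end - vma->vm_start;
 *		return remap_pfn_range(vma, vma->vm_start,
 *				phys_base >> PAGE_SHIFT, size,
 *				vma->vm_page_prot);
 *	}
 *
 * ->mmap is invoked with the mm semaphore held, satisfying the note
 * above.
 */
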
/*
 * handle_pte_fault chooses page fault handler according to an entry
 * which was read non-atomically. Before making any commitment, on
 * those architectures or configurations (e.g. i386 with PAE) which
 * might give a mix of unmatched parts, do_swap_page and do_file_page
 * must check under lock before unmapping the pte and proceeding
 * (but do_wp_page is only called after already making such a check;
 * and do_anonymous_page and do_no_page can safely check later on).
 */
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
				pte_t *page_table, pte_t orig_pte)
{
	int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
	if (sizeof(pte_t) > sizeof(unsigned long)) {
		spinlock_t *ptl = pte_lockptr(mm, pmd);
		spin_lock(ptl);
		same = pte_same(*page_table, orig_pte);
		spin_unlock(ptl);
	}
#endif
	pte_unmap(page_table);
	return same;
}

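/*
 * Concrete case of the "unmatched parts" above: with i386 PAE a pte_t
 * is 64 bits while unsigned long is 32, so an unlocked read of the pte
 * can be torn across a concurrent update; re-checking pte_same() under
 * the pte lock catches such a mix before it is acted upon.
 */
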
/*
 * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
 * servicing faults for write access.  In the normal case, we do always
 * want pte_mkwrite.  But get_user_pages can cause write faults for
 * mappings that do not have writing enabled, when used by
 * access_process_vm.
 */
static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pte = pte_mkwrite(pte);
	return pte;
}

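/*
 * Example of the exception above: when ptrace pokes data into a
 * read-only private mapping via access_process_vm(), get_user_pages()
 * forces a write fault to break COW, but VM_WRITE is clear on the vma,
 * so the new pte must not be made writable.
 */
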
static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
	/*
	 * If the source page was a PFN mapping, we don't have
	 * a "struct page" for it. We do a best-effort copy by
	 * just copying from the original user address. If that
	 * fails, we just zero-fill it. Live with it.
	 */
	if (unlikely(!src)) {
		void *kaddr = kmap_atomic(dst, KM_USER0);
		void __user *uaddr = (void __user *)(va & PAGE_MASK);

		/*
		 * This really shouldn't fail, because the page is there
		 * in the page tables. But it might just be unreadable,
		 * in which case we just give up and fill the result with
		 * zeroes.
		 */
		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
			memset(kaddr, 0, PAGE_SIZE);
		kunmap_atomic(kaddr, KM_USER0);
		flush_dcache_page(dst);
		return;
	}
	copy_user_highpage(dst, src, va, vma);
}

2005-04-16 15:20:36 -07:00
/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
* and decrementing the shared-page counter for the old page.
*
* Note that this routine assumes that the protection checks have been
* done by the caller (the low-level page fault routine in most cases).
* Thus we can safely just mark it writable once we've done any necessary
* COW.
*
* We also mark the page dirty at this point even though the page will
* change only once the write actually happens. This avoids a few races,
* and potentially makes it more efficient.
*
2005-10-29 18:16:26 -07:00
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), with pte both mapped and locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
2005-04-16 15:20:36 -07:00
*/
2005-10-29 18:15:59 -07:00
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		spinlock_t *ptl, pte_t orig_pte)
{
	struct page *old_page, *new_page;
	pte_t entry;
	int reuse = 0, ret = VM_FAULT_MINOR;
	struct page *dirty_page = NULL;

	old_page = vm_normal_page(vma, address, orig_pte);
	if (!old_page)
		goto gotten;

	/*
	 * Take out anonymous pages first, anonymous shared vmas are
	 * not dirty accountable.
	 */
	if (PageAnon(old_page)) {
		if (!TestSetPageLocked(old_page)) {
			reuse = can_share_swap_page(old_page);
			unlock_page(old_page);
		}
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		/*
		 * Only catch write-faults on shared writable pages,
		 * read-only shared pages can get COWed by
		 * get_user_pages(.write=1, .force=1).
		 */
		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
			/*
			 * Notify the address space that the page is about to
			 * become writable so that it can prohibit this or wait
			 * for the page to get into an appropriate state.
			 *
			 * We do this without the lock held, so that it can
			 * sleep if it needs to.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);

			if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
				goto unwritable_page;

			/*
			 * Since we dropped the lock we need to revalidate
			 * the PTE as someone else may have changed it.  If
			 * they did, we just return, as we can count on the
			 * MMU to tell us if they didn't also make it writable.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			page_cache_release(old_page);
			if (!pte_same(*page_table, orig_pte))
				goto unlock;
		}
		dirty_page = old_page;
		get_page(dirty_page);
		reuse = 1;
	}

	if (reuse) {
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = pte_mkyoung(orig_pte);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		ptep_set_access_flags(vma, address, page_table, entry, 1);
		update_mmu_cache(vma, address, entry);
		lazy_mmu_prot_update(entry);
		ret |= VM_FAULT_WRITE;
		goto unlock;
	}

	/*
	 * Ok, we need to copy. Oh, well..
	 */
	page_cache_get(old_page);
gotten:
	pte_unmap_unlock(page_table, ptl);

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;
	if (old_page == ZERO_PAGE(address)) {
		new_page = alloc_zeroed_user_highpage(vma, address);
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
		if (!new_page)
			goto oom;
		cow_user_page(new_page, old_page, address, vma);
	}

	/*
	 * Re-check the pte - we dropped the lock
	 */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (likely(pte_same(*page_table, orig_pte))) {
		if (old_page) {
			page_remove_rmap(old_page, vma);
			if (!PageAnon(old_page)) {
				dec_mm_counter(mm, file_rss);
				inc_mm_counter(mm, anon_rss);
			}
		} else
			inc_mm_counter(mm, anon_rss);
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		lazy_mmu_prot_update(entry);
		/*
		 * Clear the pte entry and flush it first, before updating the
		 * pte with the new entry. This will avoid a race condition
		 * seen in the presence of one thread doing SMC and another
		 * thread doing COW.
		 */
		ptep_clear_flush(vma, address, page_table);
		set_pte_at(mm, address, page_table, entry);
		update_mmu_cache(vma, address, entry);
		lru_cache_add_active(new_page);
		page_add_new_anon_rmap(new_page, vma, address);

		/* Free the old page.. */
		new_page = old_page;
		ret |= VM_FAULT_WRITE;
	}
	if (new_page)
		page_cache_release(new_page);
	if (old_page)
		page_cache_release(old_page);
unlock:
	pte_unmap_unlock(page_table, ptl);
	if (dirty_page) {
		set_page_dirty_balance(dirty_page);
		put_page(dirty_page);
	}
	return ret;
oom:
	if (old_page)
		page_cache_release(old_page);
	return VM_FAULT_OOM;

unwritable_page:
	page_cache_release(old_page);
	return VM_FAULT_SIGBUS;
}
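
/*
 * For illustration, the common COW case above: after fork(), a child
 * writing to a private file-backed page reaches do_wp_page() with
 * old_page being a pagecache page.  PageAnon() is false and the vma
 * lacks VM_SHARED, so reuse stays 0 and we take the copy path:
 * allocate new_page, cow_user_page() the contents, switch the pte
 * over (flipping file_rss to anon_rss), and the child keeps a private
 * anonymous copy while the pagecache page serves everyone else.
 */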
/*
 * Helper functions for unmap_mapping_range().
 *
 * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
 *
 * We have to restart searching the prio_tree whenever we drop the lock,
 * since the iterator is only valid while the lock is held, and anyway
 * a later vma might be split and reinserted earlier while lock dropped.
 *
 * The list of nonlinear vmas could be handled more efficiently, using
 * a placeholder, but handle it in the same way until a need is shown.
 * It is important to search the prio_tree before nonlinear list: a vma
 * may become nonlinear and be shifted from prio_tree to nonlinear list
 * while the lock is dropped; but never shifted from list to prio_tree.
 *
 * In order to make forward progress despite restarting the search,
 * vm_truncate_count is used to mark a vma as now dealt with, so we can
 * quickly skip it next time around.  Since the prio_tree search only
 * shows us those vmas affected by unmapping the range in question, we
 * can't efficiently keep all vmas in step with mapping->truncate_count:
 * so instead reset them all whenever it wraps back to 0 (then go to 1).
 * mapping->truncate_count and vma->vm_truncate_count are protected by
 * i_mmap_lock.
 *
 * In order to make forward progress despite repeatedly restarting some
 * large vma, note the restart_addr from unmap_vmas when it breaks out:
 * and restart from that address when we reach that vma again.  It might
 * have been split or merged, shrunk or extended, but never shifted: so
 * restart_addr remains valid so long as it remains in the vma's range.
 * unmap_mapping_range forces truncate_count to leap over page-aligned
 * values so we can save vma's restart_addr in its truncate_count field.
 */
#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
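
/*
 * For illustration, with 4K pages (so ~PAGE_MASK == 0xfff): restart
 * addresses saved in vm_truncate_count are page-aligned, hence
 * is_restart_addr(0x1000) is true, while an ordinary truncate_count
 * value such as 0x1001 is not mistaken for one.  unmap_mapping_range()
 * below keeps the two distinguishable by bumping truncate_count a
 * second time whenever the increment lands on a page-aligned value.
 */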

static void reset_vma_truncate_counts(struct address_space *mapping)
{
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;

	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
		vma->vm_truncate_count = 0;
	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
		vma->vm_truncate_count = 0;
}

static int unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long restart_addr;
	int need_break;

again:
	restart_addr = vma->vm_truncate_count;
	if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
		start_addr = restart_addr;
		if (start_addr >= end_addr) {
			/* Top of vma has been split off since last time */
			vma->vm_truncate_count = details->truncate_count;
			return 0;
		}
	}

	restart_addr = zap_page_range(vma, start_addr,
					end_addr - start_addr, details);
	need_break = need_resched() ||
			need_lockbreak(details->i_mmap_lock);

	if (restart_addr >= end_addr) {
		/* We have now completed this vma: mark it so */
		vma->vm_truncate_count = details->truncate_count;
		if (!need_break)
			return 0;
	} else {
		/* Note restart_addr in vma's truncate_count field */
		vma->vm_truncate_count = restart_addr;
		if (!need_break)
			goto again;
	}

	spin_unlock(details->i_mmap_lock);
	cond_resched();
	spin_lock(details->i_mmap_lock);
	return -EINTR;
}

static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
					    struct zap_details *details)
{
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	pgoff_t vba, vea, zba, zea;

restart:
	vma_prio_tree_foreach(vma, &iter, root,
			details->first_index, details->last_index) {
		/* Skip quickly over those we have already dealt with */
		if (vma->vm_truncate_count == details->truncate_count)
			continue;

		vba = vma->vm_pgoff;
		vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
		/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
		zba = details->first_index;
		if (zba < vba)
			zba = vba;
		zea = details->last_index;
		if (zea > vea)
			zea = vea;

		if (unmap_mapping_range_vma(vma,
			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
				details) < 0)
			goto restart;
	}
}

static inline void unmap_mapping_range_list(struct list_head *head,
					    struct zap_details *details)
{
	struct vm_area_struct *vma;

	/*
	 * In nonlinear VMAs there is no correspondence between virtual address
	 * offset and file offset.  So we must perform an exhaustive search
	 * across *all* the pages in each nonlinear VMA, not just the pages
	 * whose virtual address lies outside the file truncation point.
	 */
restart:
	list_for_each_entry(vma, head, shared.vm_set.list) {
		/* Skip quickly over those we have already dealt with */
		if (vma->vm_truncate_count == details->truncate_count)
			continue;
		details->nonlinear_vma = vma;
		if (unmap_mapping_range_vma(vma, vma->vm_start,
					vma->vm_end, details) < 0)
			goto restart;
	}
}

/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from vmtruncate(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
void unmap_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen,
		int even_cows)
{
	struct zap_details details;
	pgoff_t hba = holebegin >> PAGE_SHIFT;
	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;

	/* Check for overflow. */
	if (sizeof(holelen) > sizeof(hlen)) {
		long long holeend =
			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (holeend & ~(long long)ULONG_MAX)
			hlen = ULONG_MAX - hba + 1;
	}

	details.check_mapping = even_cows ? NULL : mapping;
	details.nonlinear_vma = NULL;
	details.first_index = hba;
	details.last_index = hba + hlen - 1;
	if (details.last_index < details.first_index)
		details.last_index = ULONG_MAX;
	details.i_mmap_lock = &mapping->i_mmap_lock;

	spin_lock(&mapping->i_mmap_lock);

	/* serialize i_size write against truncate_count write */
	smp_wmb();
	/* Protect against page faults, and endless unmapping loops */
	mapping->truncate_count++;
	/*
	 * For archs where spin_lock has inclusive semantics like ia64
	 * this smp_mb() prevents reading pagetable contents before the
	 * truncate_count increment is visible to other cpus.
	 */
	smp_mb();
	if (unlikely(is_restart_addr(mapping->truncate_count))) {
		if (mapping->truncate_count == 0)
			reset_vma_truncate_counts(mapping);
		mapping->truncate_count++;
	}
	details.truncate_count = mapping->truncate_count;

	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
		unmap_mapping_range_tree(&mapping->i_mmap, &details);
	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
	spin_unlock(&mapping->i_mmap_lock);
}
EXPORT_SYMBOL(unmap_mapping_range);
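
/*
 * Typical usage (a sketch; vmtruncate() and vmtruncate_range() below
 * are the real callers in this file): a filesystem invalidating the
 * mmapped view of bytes [start, start + len) before punching a hole
 * would do something like
 *
 *	unmap_mapping_range(inode->i_mapping, start, len, 1);
 *	truncate_inode_pages_range(inode->i_mapping, start,
 *				   start + len - 1);
 *
 * passing even_cows == 0 instead when merely invalidating pagecache,
 * so that private COWed copies survive.
 */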

/**
 * vmtruncate - unmap mappings "freed" by truncate() syscall
 * @inode: inode of the file used
 * @offset: file offset to start truncating
 *
 * NOTE! We have to be ready to update the memory sharing
 * between the file and the memory map for a potential last
 * incomplete page.  Ugly, but necessary.
 */
int vmtruncate(struct inode *inode, loff_t offset)
{
	struct address_space *mapping = inode->i_mapping;
	unsigned long limit;

	if (inode->i_size < offset)
		goto do_expand;
	/*
	 * truncation of in-use swapfiles is disallowed - it would cause
	 * subsequent swapout to scribble on the now-freed blocks.
	 */
	if (IS_SWAPFILE(inode))
		goto out_busy;
	i_size_write(inode, offset);
	unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
	truncate_inode_pages(mapping, offset);
	goto out_truncate;

do_expand:
	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
	if (limit != RLIM_INFINITY && offset > limit)
		goto out_sig;
	if (offset > inode->i_sb->s_maxbytes)
		goto out_big;
	i_size_write(inode, offset);

out_truncate:
	if (inode->i_op && inode->i_op->truncate)
		inode->i_op->truncate(inode);
	return 0;
out_sig:
	send_sig(SIGXFSZ, current, 0);
out_big:
	return -EFBIG;
out_busy:
	return -ETXTBSY;
}
EXPORT_SYMBOL(vmtruncate);

int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
{
	struct address_space *mapping = inode->i_mapping;

	/*
	 * If the underlying filesystem is not going to provide
	 * a way to truncate a range of blocks (punch a hole) -
	 * we should return failure right now.
	 */
	if (!inode->i_op || !inode->i_op->truncate_range)
		return -ENOSYS;

	mutex_lock(&inode->i_mutex);
	down_write(&inode->i_alloc_sem);
	unmap_mapping_range(mapping, offset, (end - offset), 1);
	truncate_inode_pages_range(mapping, offset, end);
	inode->i_op->truncate_range(inode, offset, end);
	up_write(&inode->i_alloc_sem);
	mutex_unlock(&inode->i_mutex);

	return 0;
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @addr: address to start
 * @vma: user vma this address belongs to
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
 */
void swapin_readahead(swp_entry_t entry, unsigned long addr, struct vm_area_struct *vma)
{
#ifdef CONFIG_NUMA
	struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
#endif
	int i, num;
	struct page *new_page;
	unsigned long offset;

	/*
	 * Get the number of handles we should do readahead io to.
	 */
	num = valid_swaphandles(entry, &offset);
	for (i = 0; i < num; offset++, i++) {
		/* Ok, do the async read-ahead now */
		new_page = read_swap_cache_async(swp_entry(swp_type(entry),
							   offset), vma, addr);
		if (!new_page)
			break;
		page_cache_release(new_page);
#ifdef CONFIG_NUMA
		/*
		 * Find the next applicable VMA for the NUMA policy.
		 */
		addr += PAGE_SIZE;
		if (addr == 0)
			vma = NULL;
		if (vma) {
			if (addr >= vma->vm_end) {
				vma = next_vma;
				next_vma = vma ? vma->vm_next : NULL;
			}
			if (vma && addr < vma->vm_start)
				vma = NULL;
		} else {
			if (next_vma && addr >= next_vma->vm_start) {
				vma = next_vma;
				next_vma = vma->vm_next;
			}
		}
#endif
	}
	lru_add_drain();	/* Push any new pages onto the LRU now */
}
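
/*
 * Readahead arithmetic, for illustration (assuming the default
 * page_cluster of 3, i.e. aligned blocks of 1 << 3 = 8 entries, and
 * that valid_swaphandles() aligns the start down as described above):
 * a fault on swap offset 100 yields a block starting at offset 96, so
 * the loop queues async reads for offsets 96..103 (fewer if clipped
 * at the end of the swap area), including the faulting entry itself.
 */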

/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		int write_access, pte_t orig_pte)
{
	spinlock_t *ptl;
	struct page *page;
	swp_entry_t entry;
	pte_t pte;
	int ret = VM_FAULT_MINOR;

	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
		goto out;

	entry = pte_to_swp_entry(orig_pte);
	if (is_migration_entry(entry)) {
		migration_entry_wait(mm, pmd, address);
		goto out;
	}
	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
	page = lookup_swap_cache(entry);
	if (!page) {
		grab_swap_token(); /* Contend for token _before_ read-in */
		swapin_readahead(entry, address, vma);
		page = read_swap_cache_async(entry, vma, address);
		if (!page) {
			/*
			 * Back out if somebody else faulted in this pte
			 * while we released the pte lock.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
			if (likely(pte_same(*page_table, orig_pte)))
				ret = VM_FAULT_OOM;
			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
			goto unlock;
		}

		/* Had to read the page from swap area: Major fault */
		ret = VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
	}

	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
	mark_page_accessed(page);
	lock_page(page);

	/*
	 * Back out if somebody else already faulted in this pte.
	 */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*page_table, orig_pte)))
		goto out_nomap;

	if (unlikely(!PageUptodate(page))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}

	/* The page isn't present yet, go ahead with the fault. */

	inc_mm_counter(mm, anon_rss);
	pte = mk_pte(page, vma->vm_page_prot);
	if (write_access && can_share_swap_page(page)) {
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
		write_access = 0;
	}

	flush_icache_page(vma, page);
	set_pte_at(mm, address, page_table, pte);
	page_add_anon_rmap(page, vma, address);

	swap_free(entry);
	if (vm_swap_full())
		remove_exclusive_swap_page(page);
	unlock_page(page);

	if (write_access) {
		if (do_wp_page(mm, vma, address,
				page_table, pmd, ptl, pte) == VM_FAULT_OOM)
			ret = VM_FAULT_OOM;
		goto out;
	}

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, pte);
	lazy_mmu_prot_update(pte);
unlock:
	pte_unmap_unlock(page_table, ptl);
out:
	return ret;
out_nomap:
	pte_unmap_unlock(page_table, ptl);
	unlock_page(page);
	page_cache_release(page);
	return ret;
}

/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		int write_access)
{
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	if (write_access) {
		/* Allocate our own private page. */
		pte_unmap(page_table);

		if (unlikely(anon_vma_prepare(vma)))
			goto oom;
		page = alloc_zeroed_user_highpage(vma, address);
		if (!page)
			goto oom;

		entry = mk_pte(page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);

		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
		if (!pte_none(*page_table))
			goto release;
		inc_mm_counter(mm, anon_rss);
		lru_cache_add_active(page);
		page_add_new_anon_rmap(page, vma, address);
	} else {
		/* Map the ZERO_PAGE - vm_page_prot is readonly */
		page = ZERO_PAGE(address);
		page_cache_get(page);
		entry = mk_pte(page, vma->vm_page_prot);

		ptl = pte_lockptr(mm, pmd);
		spin_lock(ptl);
		if (!pte_none(*page_table))
			goto release;
		inc_mm_counter(mm, file_rss);
		page_add_file_rmap(page);
	}

	set_pte_at(mm, address, page_table, entry);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, entry);
	lazy_mmu_prot_update(entry);
unlock:
	pte_unmap_unlock(page_table, ptl);
	return VM_FAULT_MINOR;
release:
	page_cache_release(page);
	goto unlock;
oom:
	return VM_FAULT_OOM;
}

/*
 * do_no_page() tries to create a new page mapping. It aggressively
 * tries to share with existing pages, but makes a separate copy if
 * the "write_access" parameter is true in order to avoid the next
 * page fault.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		int write_access)
{
	spinlock_t *ptl;
	struct page *new_page;
	struct address_space *mapping = NULL;
	pte_t entry;
	unsigned int sequence = 0;
	int ret = VM_FAULT_MINOR;
	int anon = 0;
	struct page *dirty_page = NULL;

	pte_unmap(page_table);
	BUG_ON(vma->vm_flags & VM_PFNMAP);

	if (vma->vm_file) {
		mapping = vma->vm_file->f_mapping;
		sequence = mapping->truncate_count;
		smp_rmb(); /* serializes i_size against truncate_count */
	}
retry:
	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
	/*
	 * No smp_rmb is needed here as long as there's a full
	 * spin_lock/unlock sequence inside the ->nopage callback
	 * (for the pagecache lookup) that acts as an implicit
	 * smp_mb() and prevents the i_size read to happen
	 * after the next truncate_count read.
	 */

	/* no page was available -- either SIGBUS, OOM or REFAULT */
	if (unlikely(new_page == NOPAGE_SIGBUS))
		return VM_FAULT_SIGBUS;
	else if (unlikely(new_page == NOPAGE_OOM))
		return VM_FAULT_OOM;
	else if (unlikely(new_page == NOPAGE_REFAULT))
		return VM_FAULT_MINOR;

	/*
	 * Should we do an early C-O-W break?
	 */
	if (write_access) {
		if (!(vma->vm_flags & VM_SHARED)) {
			struct page *page;

			if (unlikely(anon_vma_prepare(vma)))
				goto oom;
			page = alloc_page_vma(GFP_HIGHUSER, vma, address);
			if (!page)
				goto oom;
			copy_user_highpage(page, new_page, address, vma);
			page_cache_release(new_page);
			new_page = page;
			anon = 1;
		} else {
			/* if the page will be shareable, see if the backing
			 * address space wants to know that the page is about
			 * to become writable */
			if (vma->vm_ops->page_mkwrite &&
			    vma->vm_ops->page_mkwrite(vma, new_page) < 0) {
				page_cache_release(new_page);
				return VM_FAULT_SIGBUS;
			}
		}
	}

	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	/*
	 * For a file-backed vma, someone could have truncated or otherwise
	 * invalidated this page.  If unmap_mapping_range got called,
	 * retry getting the page.
	 */
	if (mapping && unlikely(sequence != mapping->truncate_count)) {
		pte_unmap_unlock(page_table, ptl);
		page_cache_release(new_page);
		cond_resched();
		sequence = mapping->truncate_count;
		smp_rmb();
		goto retry;
	}

	/*
	 * This silly early PAGE_DIRTY setting removes a race
	 * due to the bad i386 page protection. But it's valid
	 * for other architectures too.
	 *
	 * Note that if write_access is true, we either now have
	 * an exclusive copy of the page, or this is a shared mapping,
	 * so we can make it writable and dirty to avoid having to
	 * handle that later.
	 */
	/* Only go through if we didn't race with anybody else... */
	if (pte_none(*page_table)) {
		flush_icache_page(vma, new_page);
		entry = mk_pte(new_page, vma->vm_page_prot);
		if (write_access)
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		set_pte_at(mm, address, page_table, entry);
		if (anon) {
			inc_mm_counter(mm, anon_rss);
			lru_cache_add_active(new_page);
			page_add_new_anon_rmap(new_page, vma, address);
		} else {
			inc_mm_counter(mm, file_rss);
			page_add_file_rmap(new_page);
			if (write_access) {
				dirty_page = new_page;
				get_page(dirty_page);
			}
		}
	} else {
		/* One of our sibling threads was faster, back out. */
		page_cache_release(new_page);
		goto unlock;
	}

	/* no need to invalidate: a not-present page shouldn't be cached */
	update_mmu_cache(vma, address, entry);
	lazy_mmu_prot_update(entry);
unlock:
	pte_unmap_unlock(page_table, ptl);
	if (dirty_page) {
		set_page_dirty_balance(dirty_page);
		put_page(dirty_page);
	}
	return ret;
oom:
	page_cache_release(new_page);
	return VM_FAULT_OOM;
}
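
/*
 * For reference, a minimal ->nopage handler honouring the contract
 * do_no_page() relies on might look like the following sketch (names
 * here are hypothetical; filemap_nopage() is the canonical in-tree
 * implementation):
 *
 *	static struct page *example_nopage(struct vm_area_struct *vma,
 *			unsigned long address, int *type)
 *	{
 *		struct page *page;
 *
 *		page = example_lookup(vma, address); // ref held on success
 *		if (!page)
 *			return NOPAGE_SIGBUS;
 *		if (type)
 *			*type = VM_FAULT_MINOR;
 *		return page;
 *	}
 */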

/*
 * do_no_pfn() tries to create a new page mapping for a page without
 * a struct page backing it
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 *
 * It is expected that the ->nopfn handler always returns the same pfn
 * for a given virtual mapping.
 *
 * Mark this `noinline' to prevent it from bloating the main pagefault code.
 */
static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
		     unsigned long address, pte_t *page_table, pmd_t *pmd,
		     int write_access)
{
	spinlock_t *ptl;
	pte_t entry;
	unsigned long pfn;
	int ret = VM_FAULT_MINOR;

	pte_unmap(page_table);
	BUG_ON(!(vma->vm_flags & VM_PFNMAP));
	BUG_ON(is_cow_mapping(vma->vm_flags));

	pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
	if (unlikely(pfn == NOPFN_OOM))
		return VM_FAULT_OOM;
	else if (unlikely(pfn == NOPFN_SIGBUS))
		return VM_FAULT_SIGBUS;
	else if (unlikely(pfn == NOPFN_REFAULT))
		return VM_FAULT_MINOR;

	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);

	/* Only go through if we didn't race with anybody else... */
	if (pte_none(*page_table)) {
		entry = pfn_pte(pfn, vma->vm_page_prot);
		if (write_access)
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		set_pte_at(mm, address, page_table, entry);
	}
	pte_unmap_unlock(page_table, ptl);
	return ret;
}

/*
 * Fault of a previously existing named mapping. Repopulate the pte
 * from the encoded file_pte if possible. This enables swappable
 * nonlinear vmas.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		int write_access, pte_t orig_pte)
{
	pgoff_t pgoff;
	int err;

	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
		return VM_FAULT_MINOR;

	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
		/*
		 * Page table corrupted: show pte and kill process.
		 */
		print_bad_pte(vma, orig_pte, address);
		return VM_FAULT_OOM;
	}
	/* We can then assume vma->vm_ops && vma->vm_ops->populate */

	pgoff = pte_to_pgoff(orig_pte);
	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
					vma->vm_page_prot, pgoff, 0);
	if (err == -ENOMEM)
		return VM_FAULT_OOM;
	if (err)
		return VM_FAULT_SIGBUS;
	return VM_FAULT_MAJOR;
}

/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
static inline int handle_pte_fault(struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, pmd_t *pmd, int write_access)
{
	pte_t entry;
	pte_t old_entry;
	spinlock_t *ptl;

	old_entry = entry = *pte;
	if (!pte_present(entry)) {
		if (pte_none(entry)) {
			if (vma->vm_ops) {
				if (vma->vm_ops->nopage)
					return do_no_page(mm, vma, address,
							  pte, pmd,
							  write_access);
				if (unlikely(vma->vm_ops->nopfn))
					return do_no_pfn(mm, vma, address, pte,
							 pmd, write_access);
			}
			return do_anonymous_page(mm, vma, address,
						 pte, pmd, write_access);
		}
		if (pte_file(entry))
			return do_file_page(mm, vma, address,
					pte, pmd, write_access, entry);
		return do_swap_page(mm, vma, address,
					pte, pmd, write_access, entry);
	}

	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	if (unlikely(!pte_same(*pte, entry)))
		goto unlock;
	if (write_access) {
		if (!pte_write(entry))
			return do_wp_page(mm, vma, address,
					pte, pmd, ptl, entry);
		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	if (!pte_same(old_entry, entry)) {
		ptep_set_access_flags(vma, address, pte, entry, write_access);
		update_mmu_cache(vma, address, entry);
		lazy_mmu_prot_update(entry);
	} else {
		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */
		if (write_access)
			flush_tlb_page(vma, address);
	}
unlock:
	pte_unmap_unlock(pte, ptl);
	return VM_FAULT_MINOR;
}

/*
 * By the time we get here, we already hold the mm semaphore
 */
int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, int write_access)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	__set_current_state(TASK_RUNNING);

	count_vm_event(PGFAULT);

	if (unlikely(is_vm_hugetlb_page(vma)))
		return hugetlb_fault(mm, vma, address, write_access);

	pgd = pgd_offset(mm, address);
	pud = pud_alloc(mm, pgd, address);
	if (!pud)
		return VM_FAULT_OOM;
	pmd = pmd_alloc(mm, pud, address);
	if (!pmd)
		return VM_FAULT_OOM;
	pte = pte_alloc_map(mm, pmd, address);
	if (!pte)
		return VM_FAULT_OOM;

	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
}

EXPORT_SYMBOL_GPL(__handle_mm_fault);

#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate page upper directory.
 * We've already handled the fast-path in-line.
 */
int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
	pud_t *new = pud_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	if (pgd_present(*pgd))		/* Another has populated it */
		pud_free(new);
	else
		pgd_populate(mm, pgd, new);
	spin_unlock(&mm->page_table_lock);
	return 0;
}
#else
/* Workaround for gcc 2.96 */
int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
	return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */

#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate page middle directory.
 * We've already handled the fast-path in-line.
 */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
	pmd_t *new = pmd_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
#ifndef __ARCH_HAS_4LEVEL_HACK
	if (pud_present(*pud))		/* Another has populated it */
		pmd_free(new);
	else
		pud_populate(mm, pud, new);
#else
	if (pgd_present(*pud))		/* Another has populated it */
		pmd_free(new);
	else
		pgd_populate(mm, pud, new);
#endif /* __ARCH_HAS_4LEVEL_HACK */
	spin_unlock(&mm->page_table_lock);
	return 0;
}
#else
/* Workaround for gcc 2.96 */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
	return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */

int make_pages_present(unsigned long addr, unsigned long end)
{
	int ret, len, write;
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	if (!vma)
		return -1;
	write = (vma->vm_flags & VM_WRITE) != 0;
	BUG_ON(addr >= end);
	BUG_ON(end > vma->vm_end);
	len = (end + PAGE_SIZE - 1) / PAGE_SIZE - addr / PAGE_SIZE;
	ret = get_user_pages(current, current->mm, addr,
			len, write, 0, NULL, NULL);
	if (ret < 0)
		return ret;
	return ret == len ? 0 : -1;
}
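
/*
 * Usage sketch (mlock() is the main caller): with mmap_sem held on
 * current->mm, pre-faulting a just-locked region reduces to
 *
 *	if (make_pages_present(start, end))
 *		// some page in [start, end) could not be faulted in
 *
 * since the -1 return covers both a missing vma and a short
 * get_user_pages() result.
 */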

/*
 * Map a vmalloc()-space virtual address to the physical page.
 */
struct page *vmalloc_to_page(void *vmalloc_addr)
{
	unsigned long addr = (unsigned long)vmalloc_addr;
	struct page *page = NULL;
	pgd_t *pgd = pgd_offset_k(addr);
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;

	if (!pgd_none(*pgd)) {
		pud = pud_offset(pgd, addr);
		if (!pud_none(*pud)) {
			pmd = pmd_offset(pud, addr);
			if (!pmd_none(*pmd)) {
				ptep = pte_offset_map(pmd, addr);
				pte = *ptep;
				if (pte_present(pte))
					page = pte_page(pte);
				pte_unmap(ptep);
			}
		}
	}
	return page;
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 */
unsigned long vmalloc_to_pfn(void *vmalloc_addr)
{
	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);
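
/*
 * Usage sketch (hypothetical driver code): vmalloc() memory is only
 * virtually contiguous, so a driver building a scatter-gather list
 * must translate one page at a time:
 *
 *	char *buf = vmalloc(4 * PAGE_SIZE);
 *	struct page *pg = vmalloc_to_page(buf + PAGE_SIZE);   // 2nd page
 *	unsigned long pfn = vmalloc_to_pfn(buf + PAGE_SIZE);  // its frame
 */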

#if !defined(__HAVE_ARCH_GATE_AREA)

#if defined(AT_SYSINFO_EHDR)
static struct vm_area_struct gate_vma;

static int __init gate_vma_init(void)
{
	gate_vma.vm_mm = NULL;
	gate_vma.vm_start = FIXADDR_USER_START;
	gate_vma.vm_end = FIXADDR_USER_END;
	gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
	gate_vma.vm_page_prot = __P101;
	/*
	 * Make sure the vDSO gets into every core dump.
	 * Dumping its contents makes post-mortem fully interpretable later
	 * without matching up the same kernel and hardware config to see
	 * what PC values meant.
	 */
	gate_vma.vm_flags |= VM_ALWAYSDUMP;
	return 0;
}
__initcall(gate_vma_init);
#endif

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef AT_SYSINFO_EHDR
	return &gate_vma;
#else
	return NULL;
#endif
}

int in_gate_area_no_task(unsigned long addr)
{
#ifdef AT_SYSINFO_EHDR
	if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
		return 1;
#endif
	return 0;
}

#endif	/* __HAVE_ARCH_GATE_AREA */

/*
 * Access another process' address space.
 * Source/target buffer must be in kernel space.
 * Do not walk the page table directly; use get_user_pages.
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
{
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	struct page *page;
	void *old_buf = buf;

	mm = get_task_mm(tsk);
	if (!mm)
		return 0;

	down_read(&mm->mmap_sem);
	/* ignore errors, just check how much was successfully transferred */
	while (len) {
		int bytes, ret, offset;
		void *maddr;

		ret = get_user_pages(tsk, mm, addr, 1,
				write, 1, &page, &vma);
		if (ret <= 0)
			break;

		bytes = len;
		offset = addr & (PAGE_SIZE - 1);
		if (bytes > PAGE_SIZE - offset)
			bytes = PAGE_SIZE - offset;

		maddr = kmap(page);
		if (write) {
			copy_to_user_page(vma, page, addr,
					  maddr + offset, buf, bytes);
			set_page_dirty_lock(page);
		} else {
			copy_from_user_page(vma, page, addr,
					    buf, maddr + offset, bytes);
		}
		kunmap(page);
		page_cache_release(page);
		len -= bytes;
		buf += bytes;
		addr += bytes;
	}
	up_read(&mm->mmap_sem);
	mmput(mm);

	return buf - old_buf;
}
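
/*
 * Usage sketch: this is the workhorse behind ptrace() data access.
 * Reading one word from a traced child, for example, reduces to
 * something like
 *
 *	unsigned long word;
 *	if (access_process_vm(child, addr, &word, sizeof(word), 0)
 *			!= sizeof(word))
 *		return -EIO;
 *
 * since the return value is the number of bytes actually transferred.
 */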