You've already forked linux-apfs
mirror of
https://github.com/linux-apfs/linux-apfs.git
synced 2026-05-01 15:00:59 -07:00
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
"The main changes in this cycle were:
- Continued work to add support for 5-level paging provided by future
Intel CPUs. In particular we switch the x86 GUP code to the generic
implementation. (Kirill A. Shutemov)
- Continued work to add PCID CPU support to native kernels as well.
In this round most of the focus is on reworking/refreshing the TLB
flush infrastructure for the upcoming PCID changes. (Andy
Lutomirski)"
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (34 commits)
x86/mm: Delete a big outdated comment about TLB flushing
x86/mm: Don't reenter flush_tlb_func_common()
x86/KASLR: Fix detection 32/64 bit bootloaders for 5-level paging
x86/ftrace: Exclude functions in head64.c from function-tracing
x86/mmap, ASLR: Do not treat unlimited-stack tasks as legacy mmap
x86/mm: Remove reset_lazy_tlbstate()
x86/ldt: Simplify the LDT switching logic
x86/boot/64: Put __startup_64() into .head.text
x86/mm: Add support for 5-level paging for KASLR
x86/mm: Make kernel_physical_mapping_init() support 5-level paging
x86/mm: Add sync_global_pgds() for configuration with 5-level paging
x86/boot/64: Add support of additional page table level during early boot
x86/boot/64: Rename init_level4_pgt and early_level4_pgt
x86/boot/64: Rewrite startup_64() in C
x86/boot/compressed: Enable 5-level paging during decompression stage
x86/boot/efi: Define __KERNEL32_CS GDT on 64-bit configurations
x86/boot/efi: Fix __KERNEL_CS definition of GDT entry on 64-bit configurations
x86/boot/efi: Cleanup initialization of GDT entries
x86/asm: Fix comment in return_from_SYSCALL_64()
x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation
...
This commit is contained in:
+1
-1
@@ -1638,7 +1638,7 @@ config ARCH_SELECT_MEMORY_MODEL
|
||||
config HAVE_ARCH_PFN_VALID
|
||||
def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM
|
||||
|
||||
config HAVE_GENERIC_RCU_GUP
|
||||
config HAVE_GENERIC_GUP
|
||||
def_bool y
|
||||
depends on ARM_LPAE
|
||||
|
||||
|
||||
+1
-1
@@ -205,7 +205,7 @@ config GENERIC_CALIBRATE_DELAY
|
||||
config ZONE_DMA
|
||||
def_bool y
|
||||
|
||||
config HAVE_GENERIC_RCU_GUP
|
||||
config HAVE_GENERIC_GUP
|
||||
def_bool y
|
||||
|
||||
config ARCH_DMA_ADDR_T_64BIT
|
||||
|
||||
@@ -184,7 +184,7 @@ config PPC
|
||||
select HAVE_FUNCTION_GRAPH_TRACER
|
||||
select HAVE_FUNCTION_TRACER
|
||||
select HAVE_GCC_PLUGINS
|
||||
select HAVE_GENERIC_RCU_GUP
|
||||
select HAVE_GENERIC_GUP
|
||||
select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx)
|
||||
select HAVE_IDE
|
||||
select HAVE_IOREMAP_PROT
|
||||
|
||||
+4
-1
@@ -69,7 +69,7 @@ config X86
|
||||
select ARCH_USE_BUILTIN_BSWAP
|
||||
select ARCH_USE_QUEUED_RWLOCKS
|
||||
select ARCH_USE_QUEUED_SPINLOCKS
|
||||
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
|
||||
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
|
||||
select ARCH_WANT_FRAME_POINTERS
|
||||
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
|
||||
select BUILDTIME_EXTABLE_SORT
|
||||
@@ -2793,6 +2793,9 @@ config X86_DMA_REMAP
|
||||
bool
|
||||
depends on STA2X11
|
||||
|
||||
config HAVE_GENERIC_GUP
|
||||
def_bool y
|
||||
|
||||
source "net/Kconfig"
|
||||
|
||||
source "drivers/Kconfig"
|
||||
|
||||
@@ -1046,9 +1046,31 @@ struct boot_params *efi_main(struct efi_config *c,
|
||||
memset((char *)gdt->address, 0x0, gdt->size);
|
||||
desc = (struct desc_struct *)gdt->address;
|
||||
|
||||
/* The first GDT is a dummy and the second is unused. */
|
||||
desc += 2;
|
||||
/* The first GDT is a dummy. */
|
||||
desc++;
|
||||
|
||||
if (IS_ENABLED(CONFIG_X86_64)) {
|
||||
/* __KERNEL32_CS */
|
||||
desc->limit0 = 0xffff;
|
||||
desc->base0 = 0x0000;
|
||||
desc->base1 = 0x0000;
|
||||
desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
|
||||
desc->s = DESC_TYPE_CODE_DATA;
|
||||
desc->dpl = 0;
|
||||
desc->p = 1;
|
||||
desc->limit = 0xf;
|
||||
desc->avl = 0;
|
||||
desc->l = 0;
|
||||
desc->d = SEG_OP_SIZE_32BIT;
|
||||
desc->g = SEG_GRANULARITY_4KB;
|
||||
desc->base2 = 0x00;
|
||||
desc++;
|
||||
} else {
|
||||
/* Second entry is unused on 32-bit */
|
||||
desc++;
|
||||
}
|
||||
|
||||
/* __KERNEL_CS */
|
||||
desc->limit0 = 0xffff;
|
||||
desc->base0 = 0x0000;
|
||||
desc->base1 = 0x0000;
|
||||
@@ -1058,12 +1080,18 @@ struct boot_params *efi_main(struct efi_config *c,
|
||||
desc->p = 1;
|
||||
desc->limit = 0xf;
|
||||
desc->avl = 0;
|
||||
desc->l = 0;
|
||||
desc->d = SEG_OP_SIZE_32BIT;
|
||||
if (IS_ENABLED(CONFIG_X86_64)) {
|
||||
desc->l = 1;
|
||||
desc->d = 0;
|
||||
} else {
|
||||
desc->l = 0;
|
||||
desc->d = SEG_OP_SIZE_32BIT;
|
||||
}
|
||||
desc->g = SEG_GRANULARITY_4KB;
|
||||
desc->base2 = 0x00;
|
||||
|
||||
desc++;
|
||||
|
||||
/* __KERNEL_DS */
|
||||
desc->limit0 = 0xffff;
|
||||
desc->base0 = 0x0000;
|
||||
desc->base1 = 0x0000;
|
||||
@@ -1077,24 +1105,25 @@ struct boot_params *efi_main(struct efi_config *c,
|
||||
desc->d = SEG_OP_SIZE_32BIT;
|
||||
desc->g = SEG_GRANULARITY_4KB;
|
||||
desc->base2 = 0x00;
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/* Task segment value */
|
||||
desc++;
|
||||
desc->limit0 = 0x0000;
|
||||
desc->base0 = 0x0000;
|
||||
desc->base1 = 0x0000;
|
||||
desc->type = SEG_TYPE_TSS;
|
||||
desc->s = 0;
|
||||
desc->dpl = 0;
|
||||
desc->p = 1;
|
||||
desc->limit = 0x0;
|
||||
desc->avl = 0;
|
||||
desc->l = 0;
|
||||
desc->d = 0;
|
||||
desc->g = SEG_GRANULARITY_4KB;
|
||||
desc->base2 = 0x00;
|
||||
#endif /* CONFIG_X86_64 */
|
||||
|
||||
if (IS_ENABLED(CONFIG_X86_64)) {
|
||||
/* Task segment value */
|
||||
desc->limit0 = 0x0000;
|
||||
desc->base0 = 0x0000;
|
||||
desc->base1 = 0x0000;
|
||||
desc->type = SEG_TYPE_TSS;
|
||||
desc->s = 0;
|
||||
desc->dpl = 0;
|
||||
desc->p = 1;
|
||||
desc->limit = 0x0;
|
||||
desc->avl = 0;
|
||||
desc->l = 0;
|
||||
desc->d = 0;
|
||||
desc->g = SEG_GRANULARITY_4KB;
|
||||
desc->base2 = 0x00;
|
||||
desc++;
|
||||
}
|
||||
|
||||
asm volatile("cli");
|
||||
asm volatile ("lgdt %0" : : "m" (*gdt));
|
||||
|
||||
@@ -346,6 +346,48 @@ preferred_addr:
|
||||
/* Set up the stack */
|
||||
leaq boot_stack_end(%rbx), %rsp
|
||||
|
||||
#ifdef CONFIG_X86_5LEVEL
|
||||
/* Check if 5-level paging has already enabled */
|
||||
movq %cr4, %rax
|
||||
testl $X86_CR4_LA57, %eax
|
||||
jnz lvl5
|
||||
|
||||
/*
|
||||
* At this point we are in long mode with 4-level paging enabled,
|
||||
* but we want to enable 5-level paging.
|
||||
*
|
||||
* The problem is that we cannot do it directly. Setting LA57 in
|
||||
* long mode would trigger #GP. So we need to switch off long mode
|
||||
* first.
|
||||
*
|
||||
* NOTE: This is not going to work if bootloader put us above 4G
|
||||
* limit.
|
||||
*
|
||||
* The first step is go into compatibility mode.
|
||||
*/
|
||||
|
||||
/* Clear additional page table */
|
||||
leaq lvl5_pgtable(%rbx), %rdi
|
||||
xorq %rax, %rax
|
||||
movq $(PAGE_SIZE/8), %rcx
|
||||
rep stosq
|
||||
|
||||
/*
|
||||
* Setup current CR3 as the first and only entry in a new top level
|
||||
* page table.
|
||||
*/
|
||||
movq %cr3, %rdi
|
||||
leaq 0x7 (%rdi), %rax
|
||||
movq %rax, lvl5_pgtable(%rbx)
|
||||
|
||||
/* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
|
||||
pushq $__KERNEL32_CS
|
||||
leaq compatible_mode(%rip), %rax
|
||||
pushq %rax
|
||||
lretq
|
||||
lvl5:
|
||||
#endif
|
||||
|
||||
/* Zero EFLAGS */
|
||||
pushq $0
|
||||
popfq
|
||||
@@ -429,6 +471,44 @@ relocated:
|
||||
jmp *%rax
|
||||
|
||||
.code32
|
||||
#ifdef CONFIG_X86_5LEVEL
|
||||
compatible_mode:
|
||||
/* Setup data and stack segments */
|
||||
movl $__KERNEL_DS, %eax
|
||||
movl %eax, %ds
|
||||
movl %eax, %ss
|
||||
|
||||
/* Disable paging */
|
||||
movl %cr0, %eax
|
||||
btrl $X86_CR0_PG_BIT, %eax
|
||||
movl %eax, %cr0
|
||||
|
||||
/* Point CR3 to 5-level paging */
|
||||
leal lvl5_pgtable(%ebx), %eax
|
||||
movl %eax, %cr3
|
||||
|
||||
/* Enable PAE and LA57 mode */
|
||||
movl %cr4, %eax
|
||||
orl $(X86_CR4_PAE | X86_CR4_LA57), %eax
|
||||
movl %eax, %cr4
|
||||
|
||||
/* Calculate address we are running at */
|
||||
call 1f
|
||||
1: popl %edi
|
||||
subl $1b, %edi
|
||||
|
||||
/* Prepare stack for far return to Long Mode */
|
||||
pushl $__KERNEL_CS
|
||||
leal lvl5(%edi), %eax
|
||||
push %eax
|
||||
|
||||
/* Enable paging back */
|
||||
movl $(X86_CR0_PG | X86_CR0_PE), %eax
|
||||
movl %eax, %cr0
|
||||
|
||||
lret
|
||||
#endif
|
||||
|
||||
no_longmode:
|
||||
/* This isn't an x86-64 CPU so hang */
|
||||
1:
|
||||
@@ -442,7 +522,7 @@ gdt:
|
||||
.word gdt_end - gdt
|
||||
.long gdt
|
||||
.word 0
|
||||
.quad 0x0000000000000000 /* NULL descriptor */
|
||||
.quad 0x00cf9a000000ffff /* __KERNEL32_CS */
|
||||
.quad 0x00af9a000000ffff /* __KERNEL_CS */
|
||||
.quad 0x00cf92000000ffff /* __KERNEL_DS */
|
||||
.quad 0x0080890000000000 /* TS descriptor */
|
||||
@@ -486,3 +566,7 @@ boot_stack_end:
|
||||
.balign 4096
|
||||
pgtable:
|
||||
.fill BOOT_PGT_SIZE, 1, 0
|
||||
#ifdef CONFIG_X86_5LEVEL
|
||||
lvl5_pgtable:
|
||||
.fill PAGE_SIZE, 1, 0
|
||||
#endif
|
||||
|
||||
@@ -63,7 +63,7 @@ static void *alloc_pgt_page(void *context)
|
||||
static struct alloc_pgt_data pgt_data;
|
||||
|
||||
/* The top level page table entry pointer. */
|
||||
static unsigned long level4p;
|
||||
static unsigned long top_level_pgt;
|
||||
|
||||
/*
|
||||
* Mapping information structure passed to kernel_ident_mapping_init().
|
||||
@@ -91,9 +91,15 @@ void initialize_identity_maps(void)
|
||||
* If we came here via startup_32(), cr3 will be _pgtable already
|
||||
* and we must append to the existing area instead of entirely
|
||||
* overwriting it.
|
||||
*
|
||||
* With 5-level paging, we use '_pgtable' to allocate the p4d page table,
|
||||
* the top-level page table is allocated separately.
|
||||
*
|
||||
* p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
|
||||
* cases. On 4-level paging it's equal to 'top_level_pgt'.
|
||||
*/
|
||||
level4p = read_cr3();
|
||||
if (level4p == (unsigned long)_pgtable) {
|
||||
top_level_pgt = read_cr3_pa();
|
||||
if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
|
||||
debug_putstr("booted via startup_32()\n");
|
||||
pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
|
||||
pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
|
||||
@@ -103,7 +109,7 @@ void initialize_identity_maps(void)
|
||||
pgt_data.pgt_buf = _pgtable;
|
||||
pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
|
||||
memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
|
||||
level4p = (unsigned long)alloc_pgt_page(&pgt_data);
|
||||
top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -123,7 +129,7 @@ void add_identity_map(unsigned long start, unsigned long size)
|
||||
return;
|
||||
|
||||
/* Build the mapping. */
|
||||
kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p,
|
||||
kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt,
|
||||
start, end);
|
||||
}
|
||||
|
||||
@@ -134,5 +140,5 @@ void add_identity_map(unsigned long start, unsigned long size)
|
||||
*/
|
||||
void finalize_identity_maps(void)
|
||||
{
|
||||
write_cr3(level4p);
|
||||
write_cr3(top_level_pgt);
|
||||
}
|
||||
|
||||
@@ -265,7 +265,8 @@ return_from_SYSCALL_64:
|
||||
* If width of "canonical tail" ever becomes variable, this will need
|
||||
* to be updated to remain correct on both old and new CPUs.
|
||||
*
|
||||
* Change top 16 bits to be the sign-extension of 47th bit
|
||||
* Change top bits to match most significant bit (47th or 56th bit
|
||||
* depending on paging mode) in the address.
|
||||
*/
|
||||
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
|
||||
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
|
||||
|
||||
@@ -2111,8 +2111,7 @@ static int x86_pmu_event_init(struct perf_event *event)
|
||||
|
||||
static void refresh_pce(void *ignored)
|
||||
{
|
||||
if (current->active_mm)
|
||||
load_mm_cr4(current->active_mm);
|
||||
load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
|
||||
}
|
||||
|
||||
static void x86_pmu_event_mapped(struct perf_event *event)
|
||||
@@ -2344,7 +2343,7 @@ static unsigned long get_segment_base(unsigned int segment)
|
||||
|
||||
/* IRQs are off, so this synchronizes with smp_store_release */
|
||||
ldt = lockless_dereference(current->active_mm->context.ldt);
|
||||
if (!ldt || idx > ldt->size)
|
||||
if (!ldt || idx > ldt->nr_entries)
|
||||
return 0;
|
||||
|
||||
desc = &ldt->entries[idx];
|
||||
|
||||
@@ -74,7 +74,7 @@ struct efi_scratch {
|
||||
__kernel_fpu_begin(); \
|
||||
\
|
||||
if (efi_scratch.use_pgd) { \
|
||||
efi_scratch.prev_cr3 = read_cr3(); \
|
||||
efi_scratch.prev_cr3 = __read_cr3(); \
|
||||
write_cr3((unsigned long)efi_scratch.efi_pgt); \
|
||||
__flush_tlb_all(); \
|
||||
} \
|
||||
|
||||
@@ -22,8 +22,8 @@ typedef struct {
|
||||
#ifdef CONFIG_SMP
|
||||
unsigned int irq_resched_count;
|
||||
unsigned int irq_call_count;
|
||||
unsigned int irq_tlb_count;
|
||||
#endif
|
||||
unsigned int irq_tlb_count;
|
||||
#ifdef CONFIG_X86_THERMAL_VECTOR
|
||||
unsigned int irq_thermal_count;
|
||||
#endif
|
||||
|
||||
@@ -37,12 +37,6 @@ typedef struct {
|
||||
#endif
|
||||
} mm_context_t;
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
void leave_mm(int cpu);
|
||||
#else
|
||||
static inline void leave_mm(int cpu)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _ASM_X86_MMU_H */
|
||||
|
||||
@@ -47,7 +47,7 @@ struct ldt_struct {
|
||||
* allocations, but it's not worth trying to optimize.
|
||||
*/
|
||||
struct desc_struct *entries;
|
||||
unsigned int size;
|
||||
unsigned int nr_entries;
|
||||
};
|
||||
|
||||
/*
|
||||
@@ -87,22 +87,46 @@ static inline void load_mm_ldt(struct mm_struct *mm)
|
||||
*/
|
||||
|
||||
if (unlikely(ldt))
|
||||
set_ldt(ldt->entries, ldt->size);
|
||||
set_ldt(ldt->entries, ldt->nr_entries);
|
||||
else
|
||||
clear_LDT();
|
||||
#else
|
||||
clear_LDT();
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
|
||||
{
|
||||
#ifdef CONFIG_MODIFY_LDT_SYSCALL
|
||||
/*
|
||||
* Load the LDT if either the old or new mm had an LDT.
|
||||
*
|
||||
* An mm will never go from having an LDT to not having an LDT. Two
|
||||
* mms never share an LDT, so we don't gain anything by checking to
|
||||
* see whether the LDT changed. There's also no guarantee that
|
||||
* prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
|
||||
* then prev->context.ldt will also be non-NULL.
|
||||
*
|
||||
* If we really cared, we could optimize the case where prev == next
|
||||
* and we're exiting lazy mode. Most of the time, if this happens,
|
||||
* we don't actually need to reload LDTR, but modify_ldt() is mostly
|
||||
* used by legacy code and emulators where we don't need this level of
|
||||
* performance.
|
||||
*
|
||||
* This uses | instead of || because it generates better code.
|
||||
*/
|
||||
if (unlikely((unsigned long)prev->context.ldt |
|
||||
(unsigned long)next->context.ldt))
|
||||
load_mm_ldt(next);
|
||||
#endif
|
||||
|
||||
DEBUG_LOCKS_WARN_ON(preemptible());
|
||||
}
|
||||
|
||||
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
|
||||
{
|
||||
#ifdef CONFIG_SMP
|
||||
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
|
||||
this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline int init_new_context(struct task_struct *tsk,
|
||||
@@ -220,18 +244,6 @@ static inline int vma_pkey(struct vm_area_struct *vma)
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline bool __pkru_allows_pkey(u16 pkey, bool write)
|
||||
{
|
||||
u32 pkru = read_pkru();
|
||||
|
||||
if (!__pkru_allows_read(pkru, pkey))
|
||||
return false;
|
||||
if (write && !__pkru_allows_write(pkru, pkey))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* We only want to enforce protection keys on the current process
|
||||
* because we effectively have no access to PKRU for other
|
||||
@@ -268,4 +280,23 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
|
||||
return __pkru_allows_pkey(vma_pkey(vma), write);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* This can be used from process context to figure out what the value of
|
||||
* CR3 is without needing to do a (slow) __read_cr3().
|
||||
*
|
||||
* It's intended to be used for code like KVM that sneakily changes CR3
|
||||
* and needs to restore it. It needs to be used very carefully.
|
||||
*/
|
||||
static inline unsigned long __get_current_cr3_fast(void)
|
||||
{
|
||||
unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
|
||||
|
||||
/* For now, be very restrictive about when this can be called. */
|
||||
VM_WARN_ON(in_nmi() || !in_atomic());
|
||||
|
||||
VM_BUG_ON(cr3 != __read_cr3());
|
||||
return cr3;
|
||||
}
|
||||
|
||||
#endif /* _ASM_X86_MMU_CONTEXT_H */
|
||||
|
||||
@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
|
||||
PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
|
||||
}
|
||||
|
||||
static inline unsigned long read_cr3(void)
|
||||
static inline unsigned long __read_cr3(void)
|
||||
{
|
||||
return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
|
||||
}
|
||||
@@ -312,11 +312,9 @@ static inline void __flush_tlb_single(unsigned long addr)
|
||||
}
|
||||
|
||||
static inline void flush_tlb_others(const struct cpumask *cpumask,
|
||||
struct mm_struct *mm,
|
||||
unsigned long start,
|
||||
unsigned long end)
|
||||
const struct flush_tlb_info *info)
|
||||
{
|
||||
PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);
|
||||
PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info);
|
||||
}
|
||||
|
||||
static inline int paravirt_pgd_alloc(struct mm_struct *mm)
|
||||
|
||||
@@ -51,6 +51,7 @@ struct mm_struct;
|
||||
struct desc_struct;
|
||||
struct task_struct;
|
||||
struct cpumask;
|
||||
struct flush_tlb_info;
|
||||
|
||||
/*
|
||||
* Wrapper type for pointers to code which uses the non-standard
|
||||
@@ -223,9 +224,7 @@ struct pv_mmu_ops {
|
||||
void (*flush_tlb_kernel)(void);
|
||||
void (*flush_tlb_single)(unsigned long addr);
|
||||
void (*flush_tlb_others)(const struct cpumask *cpus,
|
||||
struct mm_struct *mm,
|
||||
unsigned long start,
|
||||
unsigned long end);
|
||||
const struct flush_tlb_info *info);
|
||||
|
||||
/* Hooks for allocating and freeing a pagetable top-level */
|
||||
int (*pgd_alloc)(struct mm_struct *mm);
|
||||
|
||||
@@ -212,4 +212,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
|
||||
#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
|
||||
#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
|
||||
|
||||
#define gup_get_pte gup_get_pte
|
||||
/*
|
||||
* WARNING: only to be used in the get_user_pages_fast() implementation.
|
||||
*
|
||||
* With get_user_pages_fast(), we walk down the pagetables without taking
|
||||
* any locks. For this we would like to load the pointers atomically,
|
||||
* but that is not possible (without expensive cmpxchg8b) on PAE. What
|
||||
* we do have is the guarantee that a PTE will only either go from not
|
||||
* present to present, or present to not present or both -- it will not
|
||||
* switch to a completely different present page without a TLB flush in
|
||||
* between; something that we are blocking by holding interrupts off.
|
||||
*
|
||||
* Setting ptes from not present to present goes:
|
||||
*
|
||||
* ptep->pte_high = h;
|
||||
* smp_wmb();
|
||||
* ptep->pte_low = l;
|
||||
*
|
||||
* And present to not present goes:
|
||||
*
|
||||
* ptep->pte_low = 0;
|
||||
* smp_wmb();
|
||||
* ptep->pte_high = 0;
|
||||
*
|
||||
* We must ensure here that the load of pte_low sees 'l' iff pte_high
|
||||
* sees 'h'. We load pte_high *after* loading pte_low, which ensures we
|
||||
* don't see an older value of pte_high. *Then* we recheck pte_low,
|
||||
* which ensures that we haven't picked up a changed pte high. We might
|
||||
* have gotten rubbish values from pte_low and pte_high, but we are
|
||||
* guaranteed that pte_low will not have the present bit set *unless*
|
||||
* it is 'l'. Because get_user_pages_fast() only operates on present ptes
|
||||
* we're safe.
|
||||
*/
|
||||
static inline pte_t gup_get_pte(pte_t *ptep)
|
||||
{
|
||||
pte_t pte;
|
||||
|
||||
do {
|
||||
pte.pte_low = ptep->pte_low;
|
||||
smp_rmb();
|
||||
pte.pte_high = ptep->pte_high;
|
||||
smp_rmb();
|
||||
} while (unlikely(pte.pte_low != ptep->pte_low));
|
||||
|
||||
return pte;
|
||||
}
|
||||
|
||||
#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
|
||||
|
||||
@@ -244,6 +244,11 @@ static inline int pud_devmap(pud_t pud)
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline int pgd_devmap(pgd_t pgd)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
|
||||
@@ -917,7 +922,7 @@ extern pgd_t trampoline_pgd_entry;
|
||||
static inline void __meminit init_trampoline_default(void)
|
||||
{
|
||||
/* Default trampoline pgd value */
|
||||
trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)];
|
||||
trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
|
||||
}
|
||||
# ifdef CONFIG_RANDOMIZE_MEMORY
|
||||
void __meminit init_trampoline(void);
|
||||
@@ -1185,6 +1190,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags)
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool __pkru_allows_pkey(u16 pkey, bool write)
|
||||
{
|
||||
u32 pkru = read_pkru();
|
||||
|
||||
if (!__pkru_allows_read(pkru, pkey))
|
||||
return false;
|
||||
if (write && !__pkru_allows_write(pkru, pkey))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* 'pteval' can come from a PTE, PMD or PUD. We only check
|
||||
* _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
|
||||
* same value on all 3 types.
|
||||
*/
|
||||
static inline bool __pte_access_permitted(unsigned long pteval, bool write)
|
||||
{
|
||||
unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
|
||||
|
||||
if (write)
|
||||
need_pte_bits |= _PAGE_RW;
|
||||
|
||||
if ((pteval & need_pte_bits) != need_pte_bits)
|
||||
return 0;
|
||||
|
||||
return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
|
||||
}
|
||||
|
||||
#define pte_access_permitted pte_access_permitted
|
||||
static inline bool pte_access_permitted(pte_t pte, bool write)
|
||||
{
|
||||
return __pte_access_permitted(pte_val(pte), write);
|
||||
}
|
||||
|
||||
#define pmd_access_permitted pmd_access_permitted
|
||||
static inline bool pmd_access_permitted(pmd_t pmd, bool write)
|
||||
{
|
||||
return __pte_access_permitted(pmd_val(pmd), write);
|
||||
}
|
||||
|
||||
#define pud_access_permitted pud_access_permitted
|
||||
static inline bool pud_access_permitted(pud_t pud, bool write)
|
||||
{
|
||||
return __pte_access_permitted(pud_val(pud), write);
|
||||
}
|
||||
|
||||
#include <asm-generic/pgtable.h>
|
||||
#endif /* __ASSEMBLY__ */
|
||||
|
||||
|
||||
@@ -14,15 +14,17 @@
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/threads.h>
|
||||
|
||||
extern p4d_t level4_kernel_pgt[512];
|
||||
extern p4d_t level4_ident_pgt[512];
|
||||
extern pud_t level3_kernel_pgt[512];
|
||||
extern pud_t level3_ident_pgt[512];
|
||||
extern pmd_t level2_kernel_pgt[512];
|
||||
extern pmd_t level2_fixmap_pgt[512];
|
||||
extern pmd_t level2_ident_pgt[512];
|
||||
extern pte_t level1_fixmap_pgt[512];
|
||||
extern pgd_t init_level4_pgt[];
|
||||
extern pgd_t init_top_pgt[];
|
||||
|
||||
#define swapper_pg_dir init_level4_pgt
|
||||
#define swapper_pg_dir init_top_pgt
|
||||
|
||||
extern void paging_init(void);
|
||||
|
||||
@@ -227,6 +229,20 @@ extern void cleanup_highmap(void);
|
||||
extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
|
||||
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
|
||||
|
||||
#endif /* !__ASSEMBLY__ */
|
||||
#define gup_fast_permitted gup_fast_permitted
|
||||
static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
|
||||
int write)
|
||||
{
|
||||
unsigned long len, end;
|
||||
|
||||
len = (unsigned long)nr_pages << PAGE_SHIFT;
|
||||
end = start + len;
|
||||
if (end < start)
|
||||
return false;
|
||||
if (end >> __VIRTUAL_MASK_SHIFT)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif /* !__ASSEMBLY__ */
|
||||
#endif /* _ASM_X86_PGTABLE_64_H */
|
||||
|
||||
@@ -8,4 +8,40 @@
|
||||
#else
|
||||
#define X86_VM_MASK 0 /* No VM86 support */
|
||||
#endif
|
||||
|
||||
/*
|
||||
* CR3's layout varies depending on several things.
|
||||
*
|
||||
* If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
|
||||
* If PAE is enabled, then CR3[11:5] is part of the PDPT address
|
||||
* (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
|
||||
* Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
|
||||
* CR3[2:0] and CR3[11:5] are ignored.
|
||||
*
|
||||
* In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
|
||||
*
|
||||
* CR3[63] is always read as zero. If CR4.PCIDE is set, then CR3[63] may be
|
||||
* written as 1 to prevent the write to CR3 from flushing the TLB.
|
||||
*
|
||||
* On systems with SME, one bit (in a variable position!) is stolen to indicate
|
||||
* that the top-level paging structure is encrypted.
|
||||
*
|
||||
* All of the remaining bits indicate the physical address of the top-level
|
||||
* paging structure.
|
||||
*
|
||||
* CR3_ADDR_MASK is the mask used by read_cr3_pa().
|
||||
*/
|
||||
#ifdef CONFIG_X86_64
|
||||
/* Mask off the address space ID bits. */
|
||||
#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
|
||||
#define CR3_PCID_MASK 0xFFFull
|
||||
#else
|
||||
/*
|
||||
* CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
|
||||
* a tiny bit of code size by setting all the bits.
|
||||
*/
|
||||
#define CR3_ADDR_MASK 0xFFFFFFFFull
|
||||
#define CR3_PCID_MASK 0ull
|
||||
#endif
|
||||
|
||||
#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
|
||||
|
||||
@@ -231,6 +231,14 @@ native_cpuid_reg(ebx)
|
||||
native_cpuid_reg(ecx)
|
||||
native_cpuid_reg(edx)
|
||||
|
||||
/*
|
||||
* Friendlier CR3 helpers.
|
||||
*/
|
||||
static inline unsigned long read_cr3_pa(void)
|
||||
{
|
||||
return __read_cr3() & CR3_ADDR_MASK;
|
||||
}
|
||||
|
||||
static inline void load_cr3(pgd_t *pgdir)
|
||||
{
|
||||
write_cr3(__pa(pgdir));
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user