You've already forked linux-apfs
mirror of
https://github.com/linux-apfs/linux-apfs.git
synced 2026-05-01 15:00:59 -07:00
Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 page table isolation updates from Thomas Gleixner:
"This is the final set of enabling page table isolation on x86:
- Infrastructure patches for handling the extra page tables.
- Patches which map the various bits and pieces which are required to
get in and out of user space into the user space visible page
tables.
- The required changes to have CR3 switching in the entry/exit code.
- Optimizations for the CR3 switching along with documentation how
the ASID/PCID mechanism works.
- Updates to dump pagetables to cover the user space page tables for
W+X scans and extra debugfs files to analyze both the kernel and
the user space visible page tables
The whole functionality is compile time controlled via a config switch
and can be turned on/off on the command line as well"
* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
x86/ldt: Make the LDT mapping RO
x86/mm/dump_pagetables: Allow dumping current pagetables
x86/mm/dump_pagetables: Check user space page table for WX pages
x86/mm/dump_pagetables: Add page table directory to the debugfs VFS hierarchy
x86/mm/pti: Add Kconfig
x86/dumpstack: Indicate in Oops whether PTI is configured and enabled
x86/mm: Clarify the whole ASID/kernel PCID/user PCID naming
x86/mm: Use INVPCID for __native_flush_tlb_single()
x86/mm: Optimize RESTORE_CR3
x86/mm: Use/Fix PCID to optimize user/kernel switches
x86/mm: Abstract switching CR3
x86/mm: Allow flushing for future ASID switches
x86/pti: Map the vsyscall page if needed
x86/pti: Put the LDT in its own PGD if PTI is on
x86/mm/64: Make a full PGD-entry size hole in the memory map
x86/events/intel/ds: Map debug buffers in cpu_entry_area
x86/cpu_entry_area: Add debugstore entries to cpu_entry_area
x86/mm/pti: Map ESPFIX into user space
x86/mm/pti: Share entry text PMD
x86/entry: Align entry text section to PMD boundary
...
This commit is contained in:
@@ -2708,6 +2708,8 @@
|
||||
steal time is computed, but won't influence scheduler
|
||||
behaviour
|
||||
|
||||
nopti [X86-64] Disable kernel page table isolation
|
||||
|
||||
nolapic [X86-32,APIC] Do not enable or use the local APIC.
|
||||
|
||||
nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
|
||||
@@ -3282,6 +3284,12 @@
|
||||
pt. [PARIDE]
|
||||
See Documentation/blockdev/paride.txt.
|
||||
|
||||
pti= [X86_64]
|
||||
Control user/kernel address space isolation:
|
||||
on - enable
|
||||
off - disable
|
||||
auto - default setting
|
||||
|
||||
pty.legacy_count=
|
||||
[KNL] Number of legacy pty's. Overwrites compiled-in
|
||||
default number.
|
||||
|
||||
@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
|
||||
... unused hole ...
|
||||
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
|
||||
... unused hole ...
|
||||
fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
|
||||
fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
|
||||
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
|
||||
... unused hole ...
|
||||
@@ -29,8 +30,8 @@ Virtual memory map with 5 level page tables:
|
||||
hole caused by [56:63] sign extension
|
||||
ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
|
||||
ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
|
||||
ff90000000000000 - ff91ffffffffffff (=49 bits) hole
|
||||
ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
|
||||
ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
|
||||
ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
|
||||
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
|
||||
ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
|
||||
... unused hole ...
|
||||
|
||||
@@ -23,6 +23,9 @@
|
||||
*/
|
||||
#undef CONFIG_AMD_MEM_ENCRYPT
|
||||
|
||||
/* No PAGE_TABLE_ISOLATION support needed either: */
|
||||
#undef CONFIG_PAGE_TABLE_ISOLATION
|
||||
|
||||
#include "misc.h"
|
||||
|
||||
/* These actually do the work of building the kernel identity maps. */
|
||||
|
||||
@@ -1,6 +1,11 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#include <linux/jump_label.h>
|
||||
#include <asm/unwind_hints.h>
|
||||
#include <asm/cpufeatures.h>
|
||||
#include <asm/page_types.h>
|
||||
#include <asm/percpu.h>
|
||||
#include <asm/asm-offsets.h>
|
||||
#include <asm/processor-flags.h>
|
||||
|
||||
/*
|
||||
|
||||
@@ -187,6 +192,146 @@ For 32-bit we have the following conventions - kernel is built with
|
||||
#endif
|
||||
.endm
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
|
||||
/*
|
||||
* PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
|
||||
* halves:
|
||||
*/
|
||||
#define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
|
||||
#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
|
||||
|
||||
.macro SET_NOFLUSH_BIT reg:req
|
||||
bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
|
||||
.endm
|
||||
|
||||
.macro ADJUST_KERNEL_CR3 reg:req
|
||||
ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
|
||||
/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
|
||||
andq $(~PTI_SWITCH_MASK), \reg
|
||||
.endm
|
||||
|
||||
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
|
||||
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
|
||||
mov %cr3, \scratch_reg
|
||||
ADJUST_KERNEL_CR3 \scratch_reg
|
||||
mov \scratch_reg, %cr3
|
||||
.Lend_\@:
|
||||
.endm
|
||||
|
||||
#define THIS_CPU_user_pcid_flush_mask \
|
||||
PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
|
||||
|
||||
.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
|
||||
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
|
||||
mov %cr3, \scratch_reg
|
||||
|
||||
ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
|
||||
|
||||
/*
|
||||
* Test if the ASID needs a flush.
|
||||
*/
|
||||
movq \scratch_reg, \scratch_reg2
|
||||
andq $(0x7FF), \scratch_reg /* mask ASID */
|
||||
bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
|
||||
jnc .Lnoflush_\@
|
||||
|
||||
/* Flush needed, clear the bit */
|
||||
btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
|
||||
movq \scratch_reg2, \scratch_reg
|
||||
jmp .Lwrcr3_\@
|
||||
|
||||
.Lnoflush_\@:
|
||||
movq \scratch_reg2, \scratch_reg
|
||||
SET_NOFLUSH_BIT \scratch_reg
|
||||
|
||||
.Lwrcr3_\@:
|
||||
/* Flip the PGD and ASID to the user version */
|
||||
orq $(PTI_SWITCH_MASK), \scratch_reg
|
||||
mov \scratch_reg, %cr3
|
||||
.Lend_\@:
|
||||
.endm
|
||||
|
||||
.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
|
||||
pushq %rax
|
||||
SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
|
||||
popq %rax
|
||||
.endm
|
||||
|
||||
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
|
||||
ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
|
||||
movq %cr3, \scratch_reg
|
||||
movq \scratch_reg, \save_reg
|
||||
/*
|
||||
* Is the "switch mask" all zero? That means that both of
|
||||
* these are zero:
|
||||
*
|
||||
* 1. The user/kernel PCID bit, and
|
||||
* 2. The user/kernel "bit" that points CR3 to the
|
||||
* bottom half of the 8k PGD
|
||||
*
|
||||
* That indicates a kernel CR3 value, not a user CR3.
|
||||
*/
|
||||
testq $(PTI_SWITCH_MASK), \scratch_reg
|
||||
jz .Ldone_\@
|
||||
|
||||
ADJUST_KERNEL_CR3 \scratch_reg
|
||||
movq \scratch_reg, %cr3
|
||||
|
||||
.Ldone_\@:
|
||||
.endm
|
||||
|
||||
.macro RESTORE_CR3 scratch_reg:req save_reg:req
|
||||
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
|
||||
|
||||
ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
|
||||
|
||||
/*
|
||||
* KERNEL pages can always resume with NOFLUSH as we do
|
||||
* explicit flushes.
|
||||
*/
|
||||
bt $X86_CR3_PTI_SWITCH_BIT, \save_reg
|
||||
jnc .Lnoflush_\@
|
||||
|
||||
/*
|
||||
* Check if there's a pending flush for the user ASID we're
|
||||
* about to set.
|
||||
*/
|
||||
movq \save_reg, \scratch_reg
|
||||
andq $(0x7FF), \scratch_reg
|
||||
bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
|
||||
jnc .Lnoflush_\@
|
||||
|
||||
btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
|
||||
jmp .Lwrcr3_\@
|
||||
|
||||
.Lnoflush_\@:
|
||||
SET_NOFLUSH_BIT \save_reg
|
||||
|
||||
.Lwrcr3_\@:
|
||||
/*
|
||||
* The CR3 write could be avoided when not changing its value,
|
||||
* but would require a CR3 read *and* a scratch register.
|
||||
*/
|
||||
movq \save_reg, %cr3
|
||||
.Lend_\@:
|
||||
.endm
|
||||
|
||||
#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
|
||||
|
||||
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
|
||||
.endm
|
||||
.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
|
||||
.endm
|
||||
.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
|
||||
.endm
|
||||
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
|
||||
.endm
|
||||
.macro RESTORE_CR3 scratch_reg:req save_reg:req
|
||||
.endm
|
||||
|
||||
#endif
|
||||
|
||||
#endif /* CONFIG_X86_64 */
|
||||
|
||||
/*
|
||||
|
||||
@@ -23,7 +23,6 @@
|
||||
#include <asm/segment.h>
|
||||
#include <asm/cache.h>
|
||||
#include <asm/errno.h>
|
||||
#include "calling.h"
|
||||
#include <asm/asm-offsets.h>
|
||||
#include <asm/msr.h>
|
||||
#include <asm/unistd.h>
|
||||
@@ -40,6 +39,8 @@
|
||||
#include <asm/frame.h>
|
||||
#include <linux/err.h>
|
||||
|
||||
#include "calling.h"
|
||||
|
||||
.code64
|
||||
.section .entry.text, "ax"
|
||||
|
||||
@@ -168,6 +169,9 @@ ENTRY(entry_SYSCALL_64_trampoline)
|
||||
/* Stash the user RSP. */
|
||||
movq %rsp, RSP_SCRATCH
|
||||
|
||||
/* Note: using %rsp as a scratch reg. */
|
||||
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
|
||||
|
||||
/* Load the top of the task stack into RSP */
|
||||
movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
|
||||
|
||||
@@ -207,6 +211,10 @@ ENTRY(entry_SYSCALL_64)
|
||||
*/
|
||||
|
||||
swapgs
|
||||
/*
|
||||
* This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
|
||||
* is not required to switch CR3.
|
||||
*/
|
||||
movq %rsp, PER_CPU_VAR(rsp_scratch)
|
||||
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
|
||||
|
||||
@@ -403,6 +411,7 @@ syscall_return_via_sysret:
|
||||
* We are on the trampoline stack. All regs except RDI are live.
|
||||
* We can do future final exit work right here.
|
||||
*/
|
||||
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
|
||||
|
||||
popq %rdi
|
||||
popq %rsp
|
||||
@@ -740,6 +749,8 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
|
||||
* We can do future final exit work right here.
|
||||
*/
|
||||
|
||||
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
|
||||
|
||||
/* Restore RDI. */
|
||||
popq %rdi
|
||||
SWAPGS
|
||||
@@ -822,7 +833,9 @@ native_irq_return_ldt:
|
||||
*/
|
||||
|
||||
pushq %rdi /* Stash user RDI */
|
||||
SWAPGS
|
||||
SWAPGS /* to kernel GS */
|
||||
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
|
||||
|
||||
movq PER_CPU_VAR(espfix_waddr), %rdi
|
||||
movq %rax, (0*8)(%rdi) /* user RAX */
|
||||
movq (1*8)(%rsp), %rax /* user RIP */
|
||||
@@ -838,7 +851,6 @@ native_irq_return_ldt:
|
||||
/* Now RAX == RSP. */
|
||||
|
||||
andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */
|
||||
popq %rdi /* Restore user RDI */
|
||||
|
||||
/*
|
||||
* espfix_stack[31:16] == 0. The page tables are set up such that
|
||||
@@ -849,7 +861,11 @@ native_irq_return_ldt:
|
||||
* still points to an RO alias of the ESPFIX stack.
|
||||
*/
|
||||
orq PER_CPU_VAR(espfix_stack), %rax
|
||||
SWAPGS
|
||||
|
||||
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
|
||||
SWAPGS /* to user GS */
|
||||
popq %rdi /* Restore user RDI */
|
||||
|
||||
movq %rax, %rsp
|
||||
UNWIND_HINT_IRET_REGS offset=8
|
||||
|
||||
@@ -949,6 +965,8 @@ ENTRY(switch_to_thread_stack)
|
||||
UNWIND_HINT_FUNC
|
||||
|
||||
pushq %rdi
|
||||
/* Need to switch before accessing the thread stack. */
|
||||
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
|
||||
movq %rsp, %rdi
|
||||
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
|
||||
UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
|
||||
@@ -1250,7 +1268,11 @@ ENTRY(paranoid_entry)
|
||||
js 1f /* negative -> in kernel */
|
||||
SWAPGS
|
||||
xorl %ebx, %ebx
|
||||
1: ret
|
||||
|
||||
1:
|
||||
SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
|
||||
|
||||
ret
|
||||
END(paranoid_entry)
|
||||
|
||||
/*
|
||||
@@ -1272,6 +1294,7 @@ ENTRY(paranoid_exit)
|
||||
testl %ebx, %ebx /* swapgs needed? */
|
||||
jnz .Lparanoid_exit_no_swapgs
|
||||
TRACE_IRQS_IRETQ
|
||||
RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
|
||||
SWAPGS_UNSAFE_STACK
|
||||
jmp .Lparanoid_exit_restore
|
||||
.Lparanoid_exit_no_swapgs:
|
||||
@@ -1299,6 +1322,8 @@ ENTRY(error_entry)
|
||||
* from user mode due to an IRET fault.
|
||||
*/
|
||||
SWAPGS
|
||||
/* We have user CR3. Change to kernel CR3. */
|
||||
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
|
||||
|
||||
.Lerror_entry_from_usermode_after_swapgs:
|
||||
/* Put us onto the real thread stack. */
|
||||
@@ -1345,6 +1370,7 @@ ENTRY(error_entry)
|
||||
* .Lgs_change's error handler with kernel gsbase.
|
||||
*/
|
||||
SWAPGS
|
||||
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
|
||||
jmp .Lerror_entry_done
|
||||
|
||||
.Lbstep_iret:
|
||||
@@ -1354,10 +1380,11 @@ ENTRY(error_entry)
|
||||
|
||||
.Lerror_bad_iret:
|
||||
/*
|
||||
* We came from an IRET to user mode, so we have user gsbase.
|
||||
* Switch to kernel gsbase:
|
||||
* We came from an IRET to user mode, so we have user
|
||||
* gsbase and CR3. Switch to kernel gsbase and CR3:
|
||||
*/
|
||||
SWAPGS
|
||||
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
|
||||
|
||||
/*
|
||||
* Pretend that the exception came from user mode: set up pt_regs
|
||||
@@ -1389,6 +1416,10 @@ END(error_exit)
|
||||
/*
|
||||
* Runs on exception stack. Xen PV does not go through this path at all,
|
||||
* so we can use real assembly here.
|
||||
*
|
||||
* Registers:
|
||||
* %r14: Used to save/restore the CR3 of the interrupted context
|
||||
* when PAGE_TABLE_ISOLATION is in use. Do not clobber.
|
||||
*/
|
||||
ENTRY(nmi)
|
||||
UNWIND_HINT_IRET_REGS
|
||||
@@ -1452,6 +1483,7 @@ ENTRY(nmi)
|
||||
|
||||
swapgs
|
||||
cld
|
||||
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
|
||||
movq %rsp, %rdx
|
||||
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
|
||||
UNWIND_HINT_IRET_REGS base=%rdx offset=8
|
||||
@@ -1704,6 +1736,8 @@ end_repeat_nmi:
|
||||
movq $-1, %rsi
|
||||
call do_nmi
|
||||
|
||||
RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
|
||||
|
||||
testl %ebx, %ebx /* swapgs needed? */
|
||||
jnz nmi_restore
|
||||
nmi_swapgs:
|
||||
|
||||
@@ -49,6 +49,10 @@
|
||||
ENTRY(entry_SYSENTER_compat)
|
||||
/* Interrupts are off on entry. */
|
||||
SWAPGS
|
||||
|
||||
/* We are about to clobber %rsp anyway, clobbering here is OK */
|
||||
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
|
||||
|
||||
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
|
||||
|
||||
/*
|
||||
@@ -215,6 +219,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
|
||||
pushq $0 /* pt_regs->r14 = 0 */
|
||||
pushq $0 /* pt_regs->r15 = 0 */
|
||||
|
||||
/*
|
||||
* We just saved %rdi so it is safe to clobber. It is not
|
||||
* preserved during the C calls inside TRACE_IRQS_OFF anyway.
|
||||
*/
|
||||
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
|
||||
|
||||
/*
|
||||
* User mode is traced as though IRQs are on, and SYSENTER
|
||||
* turned them off.
|
||||
@@ -256,10 +266,22 @@ sysret32_from_system_call:
|
||||
* when the system call started, which is already known to user
|
||||
* code. We zero R8-R10 to avoid info leaks.
|
||||
*/
|
||||
movq RSP-ORIG_RAX(%rsp), %rsp
|
||||
|
||||
/*
|
||||
* The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
|
||||
* on the process stack which is not mapped to userspace and
|
||||
* not readable after we SWITCH_TO_USER_CR3. Delay the CR3
|
||||
* switch until after after the last reference to the process
|
||||
* stack.
|
||||
*
|
||||
* %r8/%r9 are zeroed before the sysret, thus safe to clobber.
|
||||
*/
|
||||
SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
|
||||
|
||||
xorq %r8, %r8
|
||||
xorq %r9, %r9
|
||||
xorq %r10, %r10
|
||||
movq RSP-ORIG_RAX(%rsp), %rsp
|
||||
swapgs
|
||||
sysretl
|
||||
END(entry_SYSCALL_compat)
|
||||
|
||||
@@ -344,14 +344,14 @@ int in_gate_area_no_mm(unsigned long addr)
|
||||
* vsyscalls but leave the page not present. If so, we skip calling
|
||||
* this.
|
||||
*/
|
||||
static void __init set_vsyscall_pgtable_user_bits(void)
|
||||
void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
p4d_t *p4d;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
|
||||
pgd = pgd_offset_k(VSYSCALL_ADDR);
|
||||
pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
|
||||
set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
|
||||
p4d = p4d_offset(pgd, VSYSCALL_ADDR);
|
||||
#if CONFIG_PGTABLE_LEVELS >= 5
|
||||
@@ -373,7 +373,7 @@ void __init map_vsyscall(void)
|
||||
vsyscall_mode == NATIVE
|
||||
? PAGE_KERNEL_VSYSCALL
|
||||
: PAGE_KERNEL_VVAR);
|
||||
set_vsyscall_pgtable_user_bits();
|
||||
set_vsyscall_pgtable_user_bits(swapper_pg_dir);
|
||||
}
|
||||
|
||||
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
|
||||
|
||||
+83
-47
@@ -3,16 +3,18 @@
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include <asm/cpu_entry_area.h>
|
||||
#include <asm/perf_event.h>
|
||||
#include <asm/insn.h>
|
||||
|
||||
#include "../perf_event.h"
|
||||
|
||||
/* Waste a full page so it can be mapped into the cpu_entry_area */
|
||||
DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
|
||||
|
||||
/* The size of a BTS record in bytes: */
|
||||
#define BTS_RECORD_SIZE 24
|
||||
|
||||
#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
|
||||
#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
|
||||
#define PEBS_FIXUP_SIZE PAGE_SIZE
|
||||
|
||||
/*
|
||||
@@ -279,17 +281,52 @@ void fini_debug_store_on_cpu(int cpu)
|
||||
|
||||
static DEFINE_PER_CPU(void *, insn_buffer);
|
||||
|
||||
static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
|
||||
{
|
||||
phys_addr_t pa;
|
||||
size_t msz = 0;
|
||||
|
||||
pa = virt_to_phys(addr);
|
||||
for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
|
||||
cea_set_pte(cea, pa, prot);
|
||||
}
|
||||
|
||||
static void ds_clear_cea(void *cea, size_t size)
|
||||
{
|
||||
size_t msz = 0;
|
||||
|
||||
for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
|
||||
cea_set_pte(cea, 0, PAGE_NONE);
|
||||
}
|
||||
|
||||
static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
|
||||
{
|
||||
unsigned int order = get_order(size);
|
||||
int node = cpu_to_node(cpu);
|
||||
struct page *page;
|
||||
|
||||
page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
|
||||
return page ? page_address(page) : NULL;
|
||||
}
|
||||
|
||||
static void dsfree_pages(const void *buffer, size_t size)
|
||||
{
|
||||
if (buffer)
|
||||
free_pages((unsigned long)buffer, get_order(size));
|
||||
}
|
||||
|
||||
static int alloc_pebs_buffer(int cpu)
|
||||
{
|
||||
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
|
||||
int node = cpu_to_node(cpu);
|
||||
int max;
|
||||
void *buffer, *ibuffer;
|
||||
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
|
||||
struct debug_store *ds = hwev->ds;
|
||||
size_t bsiz = x86_pmu.pebs_buffer_size;
|
||||
int max, node = cpu_to_node(cpu);
|
||||
void *buffer, *ibuffer, *cea;
|
||||
|
||||
if (!x86_pmu.pebs)
|
||||
return 0;
|
||||
|
||||
buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
|
||||
buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
|
||||
if (unlikely(!buffer))
|
||||
return -ENOMEM;
|
||||
|
||||
@@ -300,25 +337,27 @@ static int alloc_pebs_buffer(int cpu)
|
||||
if (x86_pmu.intel_cap.pebs_format < 2) {
|
||||
ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
|
||||
if (!ibuffer) {
|
||||
kfree(buffer);
|
||||
dsfree_pages(buffer, bsiz);
|
||||
return -ENOMEM;
|
||||
}
|
||||
per_cpu(insn_buffer, cpu) = ibuffer;
|
||||
}
|
||||
|
||||
max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
|
||||
|
||||
ds->pebs_buffer_base = (u64)(unsigned long)buffer;
|
||||
hwev->ds_pebs_vaddr = buffer;
|
||||
/* Update the cpu entry area mapping */
|
||||
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
|
||||
ds->pebs_buffer_base = (unsigned long) cea;
|
||||
ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
|
||||
ds->pebs_index = ds->pebs_buffer_base;
|
||||
ds->pebs_absolute_maximum = ds->pebs_buffer_base +
|
||||
max * x86_pmu.pebs_record_size;
|
||||
|
||||
max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
|
||||
ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void release_pebs_buffer(int cpu)
|
||||
{
|
||||
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
|
||||
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
|
||||
struct debug_store *ds = hwev->ds;
|
||||
void *cea;
|
||||
|
||||
if (!ds || !x86_pmu.pebs)
|
||||
return;
|
||||
@@ -326,73 +365,70 @@ static void release_pebs_buffer(int cpu)
|
||||
kfree(per_cpu(insn_buffer, cpu));
|
||||
per_cpu(insn_buffer, cpu) = NULL;
|
||||
|
||||
kfree((void *)(unsigned long)ds->pebs_buffer_base);
|
||||
/* Clear the fixmap */
|
||||
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
|
||||
ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
|
||||
ds->pebs_buffer_base = 0;
|
||||
dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
|
||||
hwev->ds_pebs_vaddr = NULL;
|
||||
}
|
||||
|
||||
static int alloc_bts_buffer(int cpu)
|
||||
{
|
||||
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
|
||||
int node = cpu_to_node(cpu);
|
||||
int max, thresh;
|
||||
void *buffer;
|
||||
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
|
||||
struct debug_store *ds = hwev->ds;
|
||||
void *buffer, *cea;
|
||||
int max;
|
||||
|
||||
if (!x86_pmu.bts)
|
||||
return 0;
|
||||
|
||||
buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
|
||||
buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
|
||||
if (unlikely(!buffer)) {
|
||||
WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
|
||||
thresh = max / 16;
|
||||
|
||||
ds->bts_buffer_base = (u64)(unsigned long)buffer;
|
||||
hwev->ds_bts_vaddr = buffer;
|
||||
/* Update the fixmap */
|
||||
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
|
||||
ds->bts_buffer_base = (unsigned long) cea;
|
||||
ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
|
||||
ds->bts_index = ds->bts_buffer_base;
|
||||
ds->bts_absolute_maximum = ds->bts_buffer_base +
|
||||
max * BTS_RECORD_SIZE;
|
||||
ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
|
||||
thresh * BTS_RECORD_SIZE;
|
||||
|
||||
max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
|
||||
ds->bts_absolute_maximum = ds->bts_buffer_base + max;
|
||||
ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void release_bts_buffer(int cpu)
|
||||
{
|
||||
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
|
||||
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
|
||||
struct debug_store *ds = hwev->ds;
|
||||
void *cea;
|
||||
|
||||
if (!ds || !x86_pmu.bts)
|
||||
return;
|
||||
|
||||
kfree((void *)(unsigned long)ds->bts_buffer_base);
|
||||
/* Clear the fixmap */
|
||||
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
|
||||
ds_clear_cea(cea, BTS_BUFFER_SIZE);
|
||||
ds->bts_buffer_base = 0;
|
||||
dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
|
||||
hwev->ds_bts_vaddr = NULL;
|
||||
}
|
||||
|
||||
static int alloc_ds_buffer(int cpu)
|
||||
{
|
||||
int node = cpu_to_node(cpu);
|
||||
struct debug_store *ds;
|
||||
|
||||
ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
|
||||
if (unlikely(!ds))
|
||||
return -ENOMEM;
|
||||
struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
|
||||
|
||||
memset(ds, 0, sizeof(*ds));
|
||||
per_cpu(cpu_hw_events, cpu).ds = ds;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void release_ds_buffer(int cpu)
|
||||
{
|
||||
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
|
||||
|
||||
if (!ds)
|
||||
return;
|
||||
|
||||
per_cpu(cpu_hw_events, cpu).ds = NULL;
|
||||
kfree(ds);
|
||||
}
|
||||
|
||||
void release_ds_buffers(void)
|
||||
|
||||
@@ -14,6 +14,8 @@
|
||||
|
||||
#include <linux/perf_event.h>
|
||||
|
||||
#include <asm/intel_ds.h>
|
||||
|
||||
/* To enable MSR tracing please use the generic trace points. */
|
||||
|
||||
/*
|
||||
@@ -77,8 +79,6 @@ struct amd_nb {
|
||||
struct event_constraint event_constraints[X86_PMC_IDX_MAX];
|
||||
};
|
||||
|
||||
/* The maximal number of PEBS events: */
|
||||
#define MAX_PEBS_EVENTS 8
|
||||
#define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1)
|
||||
|
||||
/*
|
||||
@@ -95,23 +95,6 @@ struct amd_nb {
|
||||
PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
|
||||
PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
|
||||
|
||||
/*
|
||||
* A debug store configuration.
|
||||
*
|
||||
* We only support architectures that use 64bit fields.
|
||||
*/
|
||||
struct debug_store {
|
||||
u64 bts_buffer_base;
|
||||
u64 bts_index;
|
||||
u64 bts_absolute_maximum;
|
||||
u64 bts_interrupt_threshold;
|
||||
u64 pebs_buffer_base;
|
||||
u64 pebs_index;
|
||||
u64 pebs_absolute_maximum;
|
||||
u64 pebs_interrupt_threshold;
|
||||
u64 pebs_event_reset[MAX_PEBS_EVENTS];
|
||||
};
|
||||
|
||||
#define PEBS_REGS \
|
||||
(PERF_REG_X86_AX | \
|
||||
PERF_REG_X86_BX | \
|
||||
@@ -216,6 +199,8 @@ struct cpu_hw_events {
|
||||
* Intel DebugStore bits
|
||||
*/
|
||||
struct debug_store *ds;
|
||||
void *ds_pebs_vaddr;
|
||||
void *ds_bts_vaddr;
|
||||
u64 pebs_enabled;
|
||||
int n_pebs;
|
||||
int n_large_pebs;
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
|
||||
#include <linux/percpu-defs.h>
|
||||
#include <asm/processor.h>
|
||||
#include <asm/intel_ds.h>
|
||||
|
||||
/*
|
||||
* cpu_entry_area is a percpu region that contains things needed by the CPU
|
||||
@@ -40,6 +41,18 @@ struct cpu_entry_area {
|
||||
*/
|
||||
char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
|
||||
#endif
|
||||
#ifdef CONFIG_CPU_SUP_INTEL
|
||||
/*
|
||||
* Per CPU debug store for Intel performance monitoring. Wastes a
|
||||
* full page at the moment.
|
||||
*/
|
||||
struct debug_store cpu_debug_store;
|
||||
/*
|
||||
* The actual PEBS/BTS buffers must be mapped to user space
|
||||
* Reserve enough fixmap PTEs.
|
||||
*/
|
||||
struct debug_store_buffers cpu_debug_buffers;
|
||||
#endif
|
||||
};
|
||||
|
||||
#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
|
||||
|
||||
@@ -197,11 +197,12 @@
|
||||
#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
|
||||
#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
|
||||
#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
|
||||
#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
|
||||
|
||||
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
|
||||
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
|
||||
#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
|
||||
|
||||
#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
|
||||
#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
|
||||
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
|
||||
#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
|
||||
@@ -340,5 +341,6 @@
|
||||
#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
|
||||
#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
|
||||
#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
|
||||
#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */
|
||||
|
||||
#endif /* _ASM_X86_CPUFEATURES_H */
|
||||
|
||||
@@ -21,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
|
||||
|
||||
desc->type = (info->read_exec_only ^ 1) << 1;
|
||||
desc->type |= info->contents << 2;
|
||||
/* Set the ACCESS bit so it can be mapped RO */
|
||||
desc->type |= 1;
|
||||
|
||||
desc->s = 1;
|
||||
desc->dpl = 0x3;
|
||||
|
||||
@@ -50,6 +50,12 @@
|
||||
# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
# define DISABLE_PTI 0
|
||||
#else
|
||||
# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Make sure to add features to the correct mask
|
||||
*/
|
||||
@@ -60,7 +66,7 @@
|
||||
#define DISABLED_MASK4 (DISABLE_PCID)
|
||||
#define DISABLED_MASK5 0
|
||||
#define DISABLED_MASK6 0
|
||||
#define DISABLED_MASK7 0
|
||||
#define DISABLED_MASK7 (DISABLE_PTI)
|
||||
#define DISABLED_MASK8 0
|
||||
#define DISABLED_MASK9 (DISABLE_MPX)
|
||||
#define DISABLED_MASK10 0
|
||||
|
||||
@@ -0,0 +1,36 @@
|
||||
#ifndef _ASM_INTEL_DS_H
|
||||
#define _ASM_INTEL_DS_H
|
||||
|
||||
#include <linux/percpu-defs.h>
|
||||
|
||||
#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
|
||||
#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
|
||||
|
||||
/* The maximal number of PEBS events: */
|
||||
#define MAX_PEBS_EVENTS 8
|
||||
|
||||
/*
|
||||
* A debug store configuration.
|
||||
*
|
||||
* We only support architectures that use 64bit fields.
|
||||
*/
|
||||
struct debug_store {
|
||||
u64 bts_buffer_base;
|
||||
u64 bts_index;
|
||||
u64 bts_absolute_maximum;
|
||||
u64 bts_interrupt_threshold;
|
||||
u64 pebs_buffer_base;
|
||||
u64 pebs_index;
|
||||
u64 pebs_absolute_maximum;
|
||||
u64 pebs_interrupt_threshold;
|
||||
u64 pebs_event_reset[MAX_PEBS_EVENTS];
|
||||
} __aligned(PAGE_SIZE);
|
||||
|
||||
DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
|
||||
|
||||
struct debug_store_buffers {
|
||||
char bts_buffer[BTS_BUFFER_SIZE];
|
||||
char pebs_buffer[PEBS_BUFFER_SIZE];
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -50,10 +50,33 @@ struct ldt_struct {
|
||||
* call gates. On native, we could merge the ldt_struct and LDT
|
||||
* allocations, but it's not worth trying to optimize.
|
||||
*/
|
||||
struct desc_struct *entries;
|
||||
unsigned int nr_entries;
|
||||
struct desc_struct *entries;
|
||||
unsigned int nr_entries;
|
||||
|
||||
/*
|
||||
* If PTI is in use, then the entries array is not mapped while we're
|
||||
* in user mode. The whole array will be aliased at the addressed
|
||||
* given by ldt_slot_va(slot). We use two slots so that we can allocate
|
||||
* and map, and enable a new LDT without invalidating the mapping
|
||||
* of an older, still-in-use LDT.
|
||||
*
|
||||
* slot will be -1 if this LDT doesn't have an alias mapping.
|
||||
*/
|
||||
int slot;
|
||||
};
|
||||
|
||||
/* This is a multiple of PAGE_SIZE. */
|
||||
#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
|
||||
|
||||
static inline void *ldt_slot_va(int slot)
|
||||
{
|
||||
#ifdef CONFIG_X86_64
|
||||
return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
|
||||
#else
|
||||
BUG();
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Used for LDT copy/destruction.
|
||||
*/
|
||||
@@ -64,6 +87,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
|
||||
}
|
||||
int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
|
||||
void destroy_context_ldt(struct mm_struct *mm);
|
||||
void ldt_arch_exit_mmap(struct mm_struct *mm);
|
||||
#else /* CONFIG_MODIFY_LDT_SYSCALL */
|
||||
static inline void init_new_context_ldt(struct mm_struct *mm) { }
|
||||
static inline int ldt_dup_context(struct mm_struct *oldmm,
|
||||
@@ -71,7 +95,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm,
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static inline void destroy_context_ldt(struct mm_struct *mm) {}
|
||||
static inline void destroy_context_ldt(struct mm_struct *mm) { }
|
||||
static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
|
||||
#endif
|
||||
|
||||
static inline void load_mm_ldt(struct mm_struct *mm)
|
||||
@@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
|
||||
* that we can see.
|
||||
*/
|
||||
|
||||
if (unlikely(ldt))
|
||||
set_ldt(ldt->entries, ldt->nr_entries);
|
||||
else
|
||||
if (unlikely(ldt)) {
|
||||
if (static_cpu_has(X86_FEATURE_PTI)) {
|
||||
if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
|
||||
/*
|
||||
* Whoops -- either the new LDT isn't mapped
|
||||
* (if slot == -1) or is mapped into a bogus
|
||||
* slot (if slot > 1).
|
||||
*/
|
||||
clear_LDT();
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* If page table isolation is enabled, ldt->entries
|
||||
* will not be mapped in the userspace pagetables.
|
||||
* Tell the CPU to access the LDT through the alias
|
||||
* at ldt_slot_va(ldt->slot).
|
||||
*/
|
||||
set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
|
||||
} else {
|
||||
set_ldt(ldt->entries, ldt->nr_entries);
|
||||
}
|
||||
} else {
|
||||
clear_LDT();
|
||||
}
|
||||
#else
|
||||
clear_LDT();
|
||||
#endif
|
||||
@@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
|
||||
static inline void arch_exit_mmap(struct mm_struct *mm)
|
||||
{
|
||||
paravirt_arch_exit_mmap(mm);
|
||||
ldt_arch_exit_mmap(mm);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
|
||||
@@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {}
|
||||
*/
|
||||
extern gfp_t __userpte_alloc_gfp;
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/*
|
||||
* Instead of one PGD, we acquire two PGDs. Being order-1, it is
|
||||
* both 8k in size and 8k-aligned. That lets us just flip bit 12
|
||||
* in a pointer to swap between the two 4k halves.
|
||||
*/
|
||||
#define PGD_ALLOCATION_ORDER 1
|
||||
#else
|
||||
#define PGD_ALLOCATION_ORDER 0
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Allocate and free page tables.
|
||||
*/
|
||||
|
||||
@@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD];
|
||||
int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
|
||||
|
||||
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
|
||||
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
|
||||
void ptdump_walk_pgd_level_checkwx(void);
|
||||
|
||||
#ifdef CONFIG_DEBUG_WX
|
||||
@@ -841,7 +842,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
|
||||
|
||||
static inline int p4d_bad(p4d_t p4d)
|
||||
{
|
||||
return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
|
||||
unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
|
||||
|
||||
if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
|
||||
ignore_flags |= _PAGE_NX;
|
||||
|
||||
return (p4d_flags(p4d) & ~ignore_flags) != 0;
|
||||
}
|
||||
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
|
||||
|
||||
@@ -875,7 +881,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
|
||||
|
||||
static inline int pgd_bad(pgd_t pgd)
|
||||
{
|
||||
return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
|
||||
unsigned long ignore_flags = _PAGE_USER;
|
||||
|
||||
if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
|
||||
ignore_flags |= _PAGE_NX;
|
||||
|
||||
return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
|
||||
}
|
||||
|
||||
static inline int pgd_none(pgd_t pgd)
|
||||
@@ -904,7 +915,11 @@ static inline int pgd_none(pgd_t pgd)
|
||||
* pgd_offset() returns a (pgd_t *)
|
||||
* pgd_index() is used get the offset into the pgd page's array of pgd_t's;
|
||||
*/
|
||||
#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
|
||||
#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
|
||||
/*
|
||||
* a shortcut to get a pgd_t in a given mm
|
||||
*/
|
||||
#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
|
||||
/*
|
||||
* a shortcut which implies the use of the kernel's pgd, instead
|
||||
* of a process's
|
||||
@@ -1106,7 +1121,14 @@ static inline int pud_write(pud_t pud)
|
||||
*/
|
||||
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
|
||||
{
|
||||
memcpy(dst, src, count * sizeof(pgd_t));
|
||||
memcpy(dst, src, count * sizeof(pgd_t));
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
if (!static_cpu_has(X86_FEATURE_PTI))
|
||||
return;
|
||||
/* Clone the user space pgd as well */
|
||||
memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
|
||||
count * sizeof(pgd_t));
|
||||
#endif
|
||||
}
|
||||
|
||||
#define PTE_SHIFT ilog2(PTRS_PER_PTE)
|
||||
|
||||
@@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/*
|
||||
* All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
|
||||
* (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
|
||||
* the user one is in the last 4k. To switch between them, you
|
||||
* just need to flip the 12th bit in their addresses.
|
||||
*/
|
||||
#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
|
||||
|
||||
/*
|
||||
* This generates better code than the inline assembly in
|
||||
* __set_bit().
|
||||
*/
|
||||
static inline void *ptr_set_bit(void *ptr, int bit)
|
||||
{
|
||||
unsigned long __ptr = (unsigned long)ptr;
|
||||
|
||||
__ptr |= BIT(bit);
|
||||
return (void *)__ptr;
|
||||
}
|
||||
static inline void *ptr_clear_bit(void *ptr, int bit)
|
||||
{
|
||||
unsigned long __ptr = (unsigned long)ptr;
|
||||
|
||||
__ptr &= ~BIT(bit);
|
||||
return (void *)__ptr;
|
||||
}
|
||||
|
||||
static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
|
||||
{
|
||||
return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
|
||||
}
|
||||
|
||||
static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
|
||||
{
|
||||
return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
|
||||
}
|
||||
|
||||
static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
|
||||
{
|
||||
return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
|
||||
}
|
||||
|
||||
static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
|
||||
{
|
||||
return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
|
||||
}
|
||||
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
|
||||
|
||||
/*
|
||||
* Page table pages are page-aligned. The lower half of the top
|
||||
* level is used for userspace and the top half for the kernel.
|
||||
*
|
||||
* Returns true for parts of the PGD that map userspace and
|
||||
* false for the parts that map the kernel.
|
||||
*/
|
||||
static inline bool pgdp_maps_userspace(void *__ptr)
|
||||
{
|
||||
unsigned long ptr = (unsigned long)__ptr;
|
||||
|
||||
return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
|
||||
|
||||
/*
|
||||
* Take a PGD location (pgdp) and a pgd value that needs to be set there.
|
||||
* Populates the user and returns the resulting PGD that must be set in
|
||||
* the kernel copy of the page tables.
|
||||
*/
|
||||
static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
|
||||
{
|
||||
if (!static_cpu_has(X86_FEATURE_PTI))
|
||||
return pgd;
|
||||
return __pti_set_user_pgd(pgdp, pgd);
|
||||
}
|
||||
#else
|
||||
static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
|
||||
{
|
||||
return pgd;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
|
||||
{
|
||||
#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
|
||||
p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
|
||||
#else
|
||||
*p4dp = p4d;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void native_p4d_clear(p4d_t *p4d)
|
||||
@@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d)
|
||||
|
||||
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
|
||||
{
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
*pgdp = pti_set_user_pgd(pgdp, pgd);
|
||||
#else
|
||||
*pgdp = pgd;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void native_pgd_clear(pgd_t *pgd)
|
||||
|
||||
@@ -79,13 +79,17 @@ typedef struct { pteval_t pte; } pte_t;
|
||||
#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
|
||||
|
||||
#ifdef CONFIG_X86_5LEVEL
|
||||
# define VMALLOC_SIZE_TB _AC(16384, UL)
|
||||
# define __VMALLOC_BASE _AC(0xff92000000000000, UL)
|
||||
# define VMALLOC_SIZE_TB _AC(12800, UL)
|
||||
# define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
|
||||
# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
|
||||
# define LDT_PGD_ENTRY _AC(-112, UL)
|
||||
# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
|
||||
#else
|
||||
# define VMALLOC_SIZE_TB _AC(32, UL)
|
||||
# define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
|
||||
# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
|
||||
# define LDT_PGD_ENTRY _AC(-4, UL)
|
||||
# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_RANDOMIZE_MEMORY
|
||||
|
||||
@@ -38,6 +38,11 @@
|
||||
#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
|
||||
#define CR3_PCID_MASK 0xFFFull
|
||||
#define CR3_NOFLUSH BIT_ULL(63)
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
# define X86_CR3_PTI_SWITCH_BIT 11
|
||||
#endif
|
||||
|
||||
#else
|
||||
/*
|
||||
* CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user