From: Hugh Dickins
Date: Tue, 12 Dec 2017 01:59:50 +0000 (-0800)
Subject: KAISER: Kernel Address Isolation
X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?p=pandora-kernel.git;a=commitdiff_plain;h=a4f588df14fb393b1c8f37c997dbab95afc2eb54

KAISER: Kernel Address Isolation

This patch introduces our implementation of KAISER (Kernel Address
Isolation to have Side-channels Efficiently Removed), a kernel isolation
technique for closing hardware side channels on kernel address
information.

More information about the original patch can be found at:
https://github.com/IAIK/KAISER
http://marc.info/?l=linux-kernel&m=149390087310405&w=2

Daniel Gruss
Richard Fellner
Michael Schwarz

That original was then developed further by Dave Hansen, Hugh Dickins,
and others after this snapshot.

This combined patch for 3.2.96 was derived from hughd's patches below
for 3.18.72, in 2017-12-04's kaiser-3.18.72.tar; except for the last,
which was sent in 2017-12-09's nokaiser-3.18.72.tar.  They have been
combined in order to minimize the effort of rebasing: most of the
patches in the 3.18.72 series were small fixes, cleanups and
enhancements to three large patches.  About the only new work in this
backport is a simple reimplementation of kaiser_remove_mapping():
mm/pageattr.c changed a lot between 3.2 and 3.18, and the modifications
made there for Kaiser never seemed necessary.

KAISER: Kernel Address Isolation
kaiser: merged update
kaiser: do not set _PAGE_NX on pgd_none
kaiser: stack map PAGE_SIZE at THREAD_SIZE-PAGE_SIZE
kaiser: fix build and FIXME in alloc_ldt_struct()
kaiser: KAISER depends on SMP
kaiser: fix regs to do_nmi() ifndef CONFIG_KAISER
kaiser: fix perf crashes
kaiser: ENOMEM if kaiser_pagetable_walk() NULL
kaiser: tidied up asm/kaiser.h somewhat
kaiser: tidied up kaiser_add/remove_mapping slightly
kaiser: kaiser_remove_mapping() move along the pgd
kaiser: align addition to x86/mm/Makefile
kaiser: cleanups while trying for gold link
kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET
kaiser: delete KAISER_REAL_SWITCH option
kaiser: vmstat show NR_KAISERTABLE as nr_overhead
kaiser: enhanced by kernel and user PCIDs
kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user
kaiser: PCID 0 for kernel and 128 for user
kaiser: x86_cr3_pcid_noflush and x86_cr3_pcid_user
kaiser: paranoid_entry pass cr3 need to paranoid_exit
kaiser: _pgd_alloc() without __GFP_REPEAT to avoid stalls
kaiser: fix unlikely error in alloc_ldt_struct()
kaiser: drop is_atomic arg to kaiser_pagetable_walk()

Signed-off-by: Hugh Dickins
[bwh: - Fixed the #undef in arch/x86/boot/compressed/misc.h
      - Added the missing #include in arch/x86/mm/kaiser.c]
Signed-off-by: Ben Hutchings
---
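Before the diff itself, a rough model of what the patch arranges may
help: every process keeps two page-table roots in one 8k allocation, a
normal pgd that maps everything and a shadow pgd that maps user memory
plus a minimal kernel island, and the entry/exit code moves CR3 between
them.  A compilable C model follows; all names here (mm_model,
cr3_model, the *_model helpers) are illustrative and not part of the
patch.

    struct mm_model {
            unsigned long kernel_pgd;   /* maps user space and the whole kernel */
            unsigned long shadow_pgd;   /* maps user space + minimal kernel island */
    };

    static unsigned long cr3_model;     /* stand-in for the CR3 register */

    static void kernel_entry_model(const struct mm_model *mm)
    {
            cr3_model = mm->kernel_pgd; /* what SWITCH_KERNEL_CR3 achieves */
    }

    static void return_to_user_model(const struct mm_model *mm)
    {
            cr3_model = mm->shadow_pgd; /* kernel mappings now hidden */
    }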
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 3f19c81a6203..2fa2635ee539 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -7,6 +7,7 @@
  * we just keep it from happening
  */
 #undef CONFIG_PARAVIRT
+#undef CONFIG_KAISER
 #ifdef CONFIG_X86_32
 #define _ASM_X86_DESC_H 1
 #endif
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 2b5527726ae1..7eb0d4792800 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -12,6 +12,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 
@@ -120,6 +122,7 @@ ENTRY(ia32_sysenter_target)
        CFI_DEF_CFA     rsp,0
        CFI_REGISTER    rsp,rbp
        SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
        movq    PER_CPU_VAR(kernel_stack), %rsp
        addq    $(KERNEL_STACK_OFFSET),%rsp
        /*
@@ -183,6 +186,7 @@ sysexit_from_sys_call:
        popq_cfi %rcx                           /* User %esp */
        CFI_REGISTER rsp,rcx
        TRACE_IRQS_ON
+       SWITCH_USER_CR3
        ENABLE_INTERRUPTS_SYSEXIT32
 
 #ifdef CONFIG_AUDITSYSCALL
@@ -281,6 +285,7 @@ ENTRY(ia32_cstar_target)
        CFI_REGISTER    rip,rcx
        /*CFI_REGISTER  rflags,r11*/
        SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
        movl    %esp,%r8d
        CFI_REGISTER    rsp,r8
        movq    PER_CPU_VAR(kernel_stack),%rsp
@@ -337,6 +342,7 @@ sysretl_from_sys_call:
        xorq    %r9,%r9
        xorq    %r8,%r8
        TRACE_IRQS_ON
+       SWITCH_USER_CR3
        movl    RSP-ARGOFFSET(%rsp),%esp
        CFI_RESTORE rsp
        USERGS_SYSRET32
@@ -409,6 +415,7 @@ ENTRY(ia32_syscall)
        CFI_REL_OFFSET  rip,RIP-RIP
        PARAVIRT_ADJUST_EXCEPTION_FRAME
        SWAPGS
+       SWITCH_KERNEL_CR3_NO_STACK
        /*
         * No need to follow this irqs on/off section: the syscall
         * disabled irqs and here we enable it straight after entry:
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 6f254f2fcd40..736272670870 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -176,6 +176,7 @@
 #define X86_FEATURE_PLN         (7*32+ 5) /* Intel Power Limit Notification */
 #define X86_FEATURE_PTS         (7*32+ 6) /* Intel Package Thermal Status */
 #define X86_FEATURE_DTHERM      (7*32+ 7) /* Digital Thermal Sensor */
+#define X86_FEATURE_INVPCID_SINGLE (7*32+ 8) /* Effectively INVPCID && CR4.PCIDE=1 */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  (8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 382ce8a9fd62..7f1ead938ec1 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -40,7 +40,7 @@ struct gdt_page {
        struct desc_struct gdt[GDT_ENTRIES];
 } __attribute__((aligned(PAGE_SIZE)));
 
-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
 
 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
 {
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index eb92a6ed2be7..3354a390cc71 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -164,7 +164,7 @@ extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *);
 extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
 
 typedef int vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
 
 extern void setup_vector_irq(int cpu);
 
 #ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
new file mode 100644
index 000000000000..6f4c8ef46881
--- /dev/null
+++ b/arch/x86/include/asm/kaiser.h
@@ -0,0 +1,126 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+
+#include        /* For PCID constants */
+
+/*
+ * This file includes the definitions for the KAISER feature.
+ * KAISER is a countermeasure against x86_64 side channel attacks on
+ * the kernel virtual memory.  It has a shadow pgd for every process: the
+ * shadow pgd has a minimalistic kernel-set mapped, but includes the whole
+ * user memory.  Within a kernel context switch, or when an interrupt is
+ * handled, the pgd is switched to the normal one.  When the system switches
+ * to user mode, the shadow pgd is enabled.  By this, the virtual memory
+ * caches are freed, and the user may not attack the whole kernel memory.
+ *
+ * A minimalistic kernel mapping holds the parts needed to be mapped in user
+ * mode, such as the entry/exit functions of the user space, or the stacks.
+ */
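The assembly macros that follow implement the switch described in the
comment above.  As a compilable C model of their arithmetic (constants
mirror the patch; the function names are illustrative only): switching
to the kernel pgd clears the shadow offset and the PCID and then ORs in
the memory-held kernel CR3 bits, while switching to the user pgd only
needs a single OR, because the kernel CR3 always has those bits clear.

    #include <stdint.h>

    #define KAISER_SHADOW_PGD_OFFSET 0x1000UL
    #define X86_CR3_PCID_ASID_MASK   0xfffUL

    static inline uint64_t to_kernel_cr3(uint64_t cr3, uint64_t x86_cr3_pcid_noflush)
    {
            cr3 &= ~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET);
            return cr3 | x86_cr3_pcid_noflush;  /* PCID 0, NOFLUSH if PCIDs on */
    }

    static inline uint64_t to_user_cr3(uint64_t kernel_cr3, uint64_t x86_cr3_pcid_user)
    {
            /* x86_cr3_pcid_user carries 0x1000, PCID 0x80 and maybe NOFLUSH */
            return kernel_cr3 | x86_cr3_pcid_user;
    }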
+
+#define KAISER_SHADOW_PGD_OFFSET 0x1000
+
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_KAISER
+
+.macro _SWITCH_TO_KERNEL_CR3 reg
+movq %cr3, \reg
+andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
+orq  x86_cr3_pcid_noflush, \reg
+movq \reg, %cr3
+.endm
+
+.macro _SWITCH_TO_USER_CR3 reg regb
+/*
+ * regb must be the low byte portion of reg: because we have arranged
+ * for the low byte of the user PCID to serve as the high byte of NOFLUSH
+ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
+ * not enabled): so that the one register can update both memory and cr3.
+ */
+movq %cr3, \reg
+orq  PER_CPU_VAR(x86_cr3_pcid_user), \reg
+js   9f
+/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */
+movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
+9:
+movq \reg, %cr3
+.endm
+
+.macro SWITCH_KERNEL_CR3
+pushq %rax
+_SWITCH_TO_KERNEL_CR3 %rax
+popq %rax
+.endm
+
+.macro SWITCH_USER_CR3
+pushq %rax
+_SWITCH_TO_USER_CR3 %rax %al
+popq %rax
+.endm
+
+.macro SWITCH_KERNEL_CR3_NO_STACK
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_KERNEL_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+.endm
+
+#else /* CONFIG_KAISER */
+
+.macro SWITCH_KERNEL_CR3 reg
+.endm
+.macro SWITCH_USER_CR3 reg regb
+.endm
+.macro SWITCH_KERNEL_CR3_NO_STACK
+.endm
+
+#endif /* CONFIG_KAISER */
+
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_KAISER
+/*
+ * Upon kernel/user mode switch, it may happen that the address
+ * space has to be switched before the registers have been
+ * stored.  To change the address space, another register is
+ * needed.  A register therefore has to be stored/restored.
+ */
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+extern unsigned long x86_cr3_pcid_noflush;
+DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+
+/**
+ *  kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
+ *  @addr: the start address of the range
+ *  @size: the size of the range
+ *  @flags: the mapping flags of the pages
+ *
+ *  The mapping is done on a global scope, so no bigger
+ *  synchronization has to be done.  The pages have to be
+ *  manually unmapped again when they are not needed any longer.
+ */
+extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+
+/**
+ *  kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
+ *  @start: the start address of the range
+ *  @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+/**
+ *  kaiser_init - Initialize the shadow mapping
+ *
+ *  Most parts of the shadow mapping can be mapped upon boot
+ *  time.  Only per-process things like the thread stacks
+ *  or a new LDT have to be mapped at runtime.  These boot-
+ *  time mappings are permanent and never unmapped.
+ */
+extern void kaiser_init(void);
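A typical use of the interface declared above, modeled on the
dsalloc()/dsfree() and LDT changes later in this patch (the function
names in this sketch are hypothetical):

    static void *shadow_mapped_alloc_sketch(unsigned long size, unsigned int order)
    {
            unsigned long addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);

            if (!addr)
                    return NULL;
            if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
                    free_pages(addr, order);        /* shadow mapping failed */
                    return NULL;
            }
            return (void *)addr;
    }

    static void shadow_mapped_free_sketch(void *buf, unsigned long size,
                                          unsigned int order)
    {
            kaiser_remove_mapping((unsigned long)buf, size);
            free_pages((unsigned long)buf, order);
    }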
+
+#endif /* CONFIG_KAISER */
+
+#endif /* __ASSEMBLY */
+
+#endif /* _ASM_X86_KAISER_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 6be990922d4b..b1c8b8d3b02a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -570,7 +570,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
 
 static inline int pgd_bad(pgd_t pgd)
 {
-       return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+       pgdval_t ignore_flags = _PAGE_USER;
+       /*
+        * We set NX on KAISER pgds that map userspace memory so
+        * that userspace can not meaningfully use the kernel
+        * page table by accident; it will fault on the first
+        * instruction it tries to run.  See native_set_pgd().
+        */
+       if (IS_ENABLED(CONFIG_KAISER))
+               ignore_flags |= _PAGE_NX;
+
+       return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
 }
 
 static inline int pgd_none(pgd_t pgd)
@@ -771,6 +781,12 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
        memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_KAISER
+       /* Clone the shadow pgd part as well */
+       memcpy(native_get_shadow_pgd(dst),
+              native_get_shadow_pgd(src),
+              count * sizeof(pgd_t));
+#endif
 }
 
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 975f709e09ae..a3bf3de9893b 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -105,9 +105,36 @@ static inline void native_pud_clear(pud_t *pud)
        native_set_pud(pud, native_make_pud(0));
 }
 
+#ifdef CONFIG_KAISER
+extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
+
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+       return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
+}
+
+static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
+{
+       return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE);
+}
+#else
+static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+       return pgd;
+}
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+       return NULL;
+}
+static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
+{
+       return pgdp;
+}
+#endif /* CONFIG_KAISER */
+
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
-       *pgdp = pgd;
+       *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
 }
 
 static inline void native_pgd_clear(pgd_t *pgd)
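The shadow-pgd accessors above rely on the pgd pair being one order-1,
8k-aligned allocation (arranged in pgtable.c later in this patch), so
bit 12 selects the half.  A self-contained check of that arithmetic
(the address used is hypothetical):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uintptr_t page_size  = 4096;
            uintptr_t kernel_pgd = 0xffff880012340000UL;   /* 8k-aligned example */
            uintptr_t shadow     = kernel_pgd | page_size; /* native_get_shadow_pgd() */
            uintptr_t normal     = shadow & ~page_size;    /* native_get_normal_pgd() */

            assert(normal == kernel_pgd);
            assert(shadow - normal == page_size);
            return 0;
    }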
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 013286a10c2c..6e1315068a62 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -39,7 +39,11 @@
 #define _PAGE_ACCESSED  (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
 #define _PAGE_DIRTY     (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
 #define _PAGE_PSE       (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
+#ifdef CONFIG_KAISER
+#define _PAGE_GLOBAL    (_AT(pteval_t, 0))
+#else
 #define _PAGE_GLOBAL    (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#endif
 #define _PAGE_UNUSED1   (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
 #define _PAGE_IOMAP     (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
 #define _PAGE_PAT       (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
@@ -62,7 +66,7 @@
 #endif
 
 #define _PAGE_FILE      (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
-#define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
 #define _PAGE_TABLE     (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |       \
                         _PAGE_ACCESSED | _PAGE_DIRTY)
@@ -74,6 +78,33 @@
                         _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
 
+/* The ASID is the lower 12 bits of CR3 */
+#define X86_CR3_PCID_ASID_MASK  (_AC((1<<12)-1,UL))
+
+/* Mask for all the PCID-related bits in CR3: */
+#define X86_CR3_PCID_MASK       (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
+#define X86_CR3_PCID_ASID_KERN  (_AC(0x0,UL))
+
+#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64)
+/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
+#define X86_CR3_PCID_ASID_USER  (_AC(0x80,UL))
+
+#define X86_CR3_PCID_KERN_FLUSH         (X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_FLUSH         (X86_CR3_PCID_ASID_USER)
+#define X86_CR3_PCID_KERN_NOFLUSH       (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_NOFLUSH       (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
+#else
+#define X86_CR3_PCID_ASID_USER  (_AC(0x0,UL))
+/*
+ * PCIDs are unsupported on 32-bit and none of these bits can be
+ * set in CR3:
+ */
+#define X86_CR3_PCID_KERN_FLUSH         (0)
+#define X86_CR3_PCID_USER_FLUSH         (0)
+#define X86_CR3_PCID_KERN_NOFLUSH       (0)
+#define X86_CR3_PCID_USER_NOFLUSH       (0)
+#endif
+
 #define _PAGE_CACHE_MASK        (_PAGE_PCD | _PAGE_PWT)
 #define _PAGE_CACHE_WB          (0)
 #define _PAGE_CACHE_WC          (_PAGE_PWT)
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index a9e14a52385f..360e80d0d217 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -43,6 +43,8 @@
  */
 #define X86_CR3_PWT     0x00000008 /* Page Write Through */
 #define X86_CR3_PCD     0x00000010 /* Page Cache Disable */
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH (_AC(1,ULL) << X86_CR3_PCID_NOFLUSH_BIT)
 
 /*
  * Intel CPU features in CR4
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f7c89e231c6c..048249e983ca 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -266,7 +266,7 @@ struct tss_struct {
 
 } ____cacheline_aligned;
 
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss);
 
 /*
  * Save the original ist values for checking stack pointers during debugging
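The PCID constants above encode four CR3 "flavors".  One detail worth
spelling out: the low byte of the user value (0x80) is also the top
byte of the NOFLUSH bit (bit 63), which is what lets _SWITCH_TO_USER_CR3
re-arm NOFLUSH with a single movb into byte 7 of the per-cpu variable.
A little-endian demonstration:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
            uint64_t user_cr3_bits = 0x1000 | 0x80; /* shadow offset | user PCID */
            unsigned char low = (unsigned char)user_cr3_bits;       /* 0x80 */

            /* the movb in _SWITCH_TO_USER_CR3: write 0x80 at byte offset 7 */
            memcpy((unsigned char *)&user_cr3_bits + 7, &low, 1);
            assert(user_cr3_bits >> 63);            /* NOFLUSH is now set */
            return 0;
    }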
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index e04cbc550424..288195901c8a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -64,27 +64,59 @@ static inline void invpcid_flush_all_nonglobals(void)
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+/*
+ * Declare a couple of kaiser interfaces here for convenience,
+ * to avoid the need for asm/kaiser.h in unexpected places.
+ */
+#ifdef CONFIG_KAISER
+extern void kaiser_setup_pcid(void);
+extern void kaiser_flush_tlb_on_return_to_user(void);
+#else
+static inline void kaiser_setup_pcid(void)
+{
+}
+static inline void kaiser_flush_tlb_on_return_to_user(void)
+{
+}
+#endif
+
 static inline void __native_flush_tlb(void)
 {
+       if (this_cpu_has(X86_FEATURE_INVPCID)) {
+               /*
+                * Note, this works with CR4.PCIDE=0 or 1.
+                */
+               invpcid_flush_all_nonglobals();
+               return;
+       }
+
        /*
         * If current->mm == NULL then we borrow a mm which may change during a
         * task switch and therefore we must not be preempted while we write CR3
         * back:
         */
        preempt_disable();
+       if (this_cpu_has(X86_FEATURE_PCID))
+               kaiser_flush_tlb_on_return_to_user();
        native_write_cr3(native_read_cr3());
        preempt_enable();
 }
 
 static inline void __native_flush_tlb_global(void)
 {
+#ifdef CONFIG_KAISER
+       /* Globals are not used at all */
+       __native_flush_tlb();
+#else
        unsigned long flags;
        unsigned long cr4;
 
-       if (static_cpu_has(X86_FEATURE_INVPCID)) {
+       if (this_cpu_has(X86_FEATURE_INVPCID)) {
                /*
                 * Using INVPCID is considerably faster than a pair of writes
                 * to CR4 sandwiched inside an IRQ flag save/restore.
+                *
+                * Note, this works with CR4.PCIDE=0 or 1.
                 */
                invpcid_flush_all();
                return;
@@ -104,11 +136,39 @@ static inline void __native_flush_tlb_global(void)
        native_write_cr4(cr4);
 
        raw_local_irq_restore(flags);
+#endif
 }
 
 static inline void __native_flush_tlb_single(unsigned long addr)
 {
-       asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+       /*
+        * SIMICS #GP's if you run INVPCID with type 2/3
+        * and X86_CR4_PCIDE clear.  Shame!
+        *
+        * The ASIDs used below are hard-coded.  But, we must not
+        * call invpcid(type=1/2) before CR4.PCIDE=1.  Just call
+        * invlpg in the case we are called early.
+        */
+
+       if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+               if (this_cpu_has(X86_FEATURE_PCID))
+                       kaiser_flush_tlb_on_return_to_user();
+               asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+               return;
+       }
+       /* Flush the address out of both PCIDs. */
+       /*
+        * An optimization here might be to determine addresses
+        * that are only kernel-mapped and only flush the kernel
+        * ASID.  But, userspace flushes are probably much more
+        * important performance-wise.
+        *
+        * Make sure to do only a single invpcid when KAISER is
+        * disabled and we have only a single ASID.
+        */
+       if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER)
+               invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
+       invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
 }
 
 static inline void __flush_tlb_all(void)
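kaiser_flush_tlb_on_return_to_user(), defined in arch/x86/mm/kaiser.c
below, defers a user-PCID flush by rewriting the per-cpu CR3 bits
without NOFLUSH; the next SWITCH_USER_CR3 then loads CR3 with bit 63
clear and flushes, after which the asm re-arms NOFLUSH.  A minimal
model of that state flip (names suffixed _model are illustrative):

    #include <stdint.h>

    #define NOFLUSH_MODEL   (1ULL << 63)
    #define USER_PCID       0x80ULL
    #define SHADOW_OFFSET   0x1000ULL

    static uint64_t x86_cr3_pcid_user_model =
            NOFLUSH_MODEL | USER_PCID | SHADOW_OFFSET;

    static void flush_tlb_on_return_to_user_model(void)
    {
            /* next return to user space loads CR3 without NOFLUSH: TLB flushed */
            x86_cr3_pcid_user_model = USER_PCID | SHADOW_OFFSET;
    }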
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 895e4b88469c..b567c89fc628 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -84,7 +84,7 @@ static const struct cpu_dev __cpuinitconst default_cpu = {
 
 static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
 
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
        /*
         * We need valid kernel segments for data and code in long mode too
@@ -319,6 +319,19 @@ static void setup_pcid(struct cpuinfo_x86 *c)
                 * SDM says that it can't be enabled in 32-bit mode.
                 */
                set_in_cr4(X86_CR4_PCIDE);
+               /*
+                * INVPCID has two "groups" of types:
+                * 1/2: Invalidate an individual address
+                * 3/4: Invalidate all contexts
+                *
+                * 1/2 take a PCID, but 3/4 do not.  So, 3/4
+                * ignore the PCID argument in the descriptor.
+                * But, we have to be careful not to call 1/2
+                * with an actual non-zero PCID in them before
+                * we do the above set_in_cr4().
+                */
+               if (cpu_has(c, X86_FEATURE_INVPCID))
+                       set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
        } else {
                /*
                 * flush_tlb_all(), as currently implemented, won't
@@ -331,6 +344,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
                        clear_cpu_cap(c, X86_FEATURE_PCID);
                }
        }
+       kaiser_setup_pcid();
 }
 
 /*
@@ -1115,7 +1129,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
          [DEBUG_STACK - 1] = DEBUG_STKSZ
 };
 
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
        [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 
 /* May not be marked __init: used by software suspend */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 2d4e76ba2b5c..fb933cdca184 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -2,10 +2,14 @@
 
 #include
 #include
+#include
 
 #include
 
 #include "perf_event.h"
 
+static
+DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
+
 /* The size of a BTS record in bytes: */
 #define BTS_RECORD_SIZE         24
 
@@ -60,6 +64,39 @@ void fini_debug_store_on_cpu(int cpu)
        wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
 }
 
+static void *dsalloc(size_t size, gfp_t flags, int node)
+{
+#ifdef CONFIG_KAISER
+       unsigned int order = get_order(size);
+       struct page *page;
+       unsigned long addr;
+
+       page = alloc_pages_node(node, flags | __GFP_ZERO, order);
+       if (!page)
+               return NULL;
+       addr = (unsigned long)page_address(page);
+       if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
+               __free_pages(page, order);
+               addr = 0;
+       }
+       return (void *)addr;
+#else
+       return kmalloc_node(size, flags | __GFP_ZERO, node);
+#endif
+}
+
+static void dsfree(const void *buffer, size_t size)
+{
+#ifdef CONFIG_KAISER
+       if (!buffer)
+               return;
+       kaiser_remove_mapping((unsigned long)buffer, size);
+       free_pages((unsigned long)buffer, get_order(size));
+#else
+       kfree(buffer);
+#endif
+}
+
 static int alloc_pebs_buffer(int cpu)
 {
        struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -70,7 +107,7 @@ static int alloc_pebs_buffer(int cpu)
        if (!x86_pmu.pebs)
                return 0;
 
-       buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
+       buffer = dsalloc(PEBS_BUFFER_SIZE, GFP_KERNEL, node);
        if (unlikely(!buffer))
                return -ENOMEM;
 
@@ -94,7 +131,7 @@ static void release_pebs_buffer(int cpu)
        if (!ds || !x86_pmu.pebs)
                return;
 
-       kfree((void *)(unsigned long)ds->pebs_buffer_base);
+       dsfree((void *)(unsigned long)ds->pebs_buffer_base, PEBS_BUFFER_SIZE);
        ds->pebs_buffer_base = 0;
 }
 
@@ -108,7 +145,7 @@ static int alloc_bts_buffer(int cpu)
        if (!x86_pmu.bts)
                return 0;
 
-       buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
+       buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL, node);
        if (unlikely(!buffer))
                return -ENOMEM;
 
@@ -132,19 +169,15 @@ static void release_bts_buffer(int cpu)
        if (!ds || !x86_pmu.bts)
                return;
 
-       kfree((void *)(unsigned long)ds->bts_buffer_base);
+       dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
        ds->bts_buffer_base = 0;
 }
 
 static int alloc_ds_buffer(int cpu)
 {
-       int node = cpu_to_node(cpu);
-       struct debug_store *ds;
-
-       ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node);
-       if (unlikely(!ds))
-               return -ENOMEM;
+       struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
 
+       memset(ds, 0, sizeof(*ds));
        per_cpu(cpu_hw_events, cpu).ds = ds;
 
        return 0;
@@ -158,7 +191,6 @@ static void release_ds_buffer(int cpu)
                return;
 
        per_cpu(cpu_hw_events, cpu).ds = NULL;
-       kfree(ds);
 }
 
 void release_ds_buffers(void)
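Data that the CPU or very early entry code touches while CR3 may still
hold the shadow pgd, such as the debug store that PEBS hardware writes
during NMIs, must live in the per-cpu window that kaiser_init() maps
for every possible CPU.  The _USER_MAPPED per-cpu variants place such
variables into the .data..percpu..user_mapped sections; a hypothetical
example, not part of the patch:

    DEFINE_PER_CPU_USER_MAPPED(unsigned long, entry_scratch_example);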
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index f6daf3cdb878..3a4356a2f156 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -56,6 +56,7 @@
 #include
 #include
 #include
+#include
 
 /* Avoid __ASSEMBLER__'ifying just for this.  */
 #include
@@ -323,6 +324,7 @@ ENDPROC(native_usergs_sysret64)
        testl $3, CS(%rdi)
        je 1f
        SWAPGS
+       SWITCH_KERNEL_CR3
        /*
         * irq_count is used to check if a CPU is already on an interrupt stack
         * or not. While this is essentially redundant with preempt_count it is
@@ -362,6 +364,12 @@ END(save_rest)
 
 /* save complete stack frame */
        .pushsection .kprobes.text, "ax"
+/*
+ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
+ *         ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
+ *         ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
+ *         ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
+ */
 ENTRY(save_paranoid)
        XCPT_FRAME 1 RDI+8
        cld
@@ -387,7 +395,25 @@ ENTRY(save_paranoid)
        js 1f   /* negative -> in kernel */
        SWAPGS
        xorl %ebx,%ebx
-1:     ret
+1:
+#ifdef CONFIG_KAISER
+       /*
+        * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
+        * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
+        * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
+        * unconditionally, but we need to find out whether the reverse
+        * should be done on return (conveyed to paranoid_exit in %ebx).
+        */
+       movq    %cr3, %rax
+       testl   $KAISER_SHADOW_PGD_OFFSET, %eax
+       jz      2f
+       orl     $2, %ebx
+       andq    $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+       orq     x86_cr3_pcid_noflush, %rax
+       movq    %rax, %cr3
+2:
+#endif
+       ret
        CFI_ENDPROC
 END(save_paranoid)
        .popsection
@@ -464,6 +490,7 @@ ENTRY(system_call)
        CFI_REGISTER    rip,rcx
        /*CFI_REGISTER  rflags,r11*/
        SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
        /*
         * A hypervisor implementation might want to use a label
         * after the swapgs, so that it can do the swapgs
@@ -515,6 +542,14 @@ sysret_check:
        CFI_REGISTER    rip,rcx
        RESTORE_ARGS 1,-ARG_SKIP,0
        /*CFI_REGISTER  rflags,r11*/
+       /*
+        * This opens a window where we have a user CR3, but are
+        * running in the kernel.  This makes using the CS
+        * register useless for telling whether or not we need to
+        * switch CR3 in NMIs.  Normal interrupts are OK because
+        * they are off here.
+        */
+       SWITCH_USER_CR3
        movq    PER_CPU_VAR(old_rsp), %rsp
        USERGS_SYSRET64
 
@@ -851,6 +886,14 @@ retint_swapgs:         /* return to user-space */
         */
        DISABLE_INTERRUPTS(CLBR_ANY)
        TRACE_IRQS_IRETQ
+       /*
+        * This opens a window where we have a user CR3, but are
+        * running in the kernel.  This makes using the CS
+        * register useless for telling whether or not we need to
+        * switch CR3 in NMIs.  Normal interrupts are OK because
+        * they are off here.
+        */
+       SWITCH_USER_CR3
        SWAPGS
        jmp     restore_args
 
@@ -891,6 +934,7 @@ native_irq_return_ldt:
        pushq_cfi %rax
        pushq_cfi %rdi
        SWAPGS
+       SWITCH_KERNEL_CR3
        movq    PER_CPU_VAR(espfix_waddr),%rdi
        movq    %rax,(0*8)(%rdi)        /* RAX */
        movq    (2*8)(%rsp),%rax        /* RIP */
@@ -906,6 +950,7 @@ native_irq_return_ldt:
        andl    $0xffff0000,%eax
        popq_cfi %rdi
        orq     PER_CPU_VAR(espfix_stack),%rax
+       SWITCH_USER_CR3
        SWAPGS
        movq    %rax,%rsp
        popq_cfi %rax
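The %ebx value that save_paranoid hands to paranoid_exit (and that the
NMI path reuses below) is a two-bit flag word: bit 0 set means "skip
swapgs", bit 1 set means "do SWITCH_USER_CR3".  A compilable model of
the exit-side decision, matching the four cases in the comments (all
names here are illustrative):

    #include <stdio.h>

    static void switch_user_cr3_model(void) { puts("SWITCH_USER_CR3"); }
    static void swapgs_model(void)          { puts("SWAPGS"); }

    static void paranoid_exit_model(int ebx)
    {
            if (ebx & 2)
                    switch_user_cr3_model();
            if (!(ebx & 1))
                    swapgs_model();
    }

    int main(void)
    {
            for (int ebx = 0; ebx <= 3; ebx++) {
                    printf("ebx=%d:\n", ebx);
                    paranoid_exit_model(ebx);
            }
            return 0;
    }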
@@ -1366,30 +1411,40 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
  * is fundamentally NMI-unsafe.  (we cannot change the soft and
  * hard flags at once, atomically)
  */
-
-       /* ebx: no swapgs flag */
+/*
+ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
+ *           ebx=1: needs neither swapgs nor SWITCH_USER_CR3
+ *           ebx=2: needs both swapgs and SWITCH_USER_CR3
+ *           ebx=3: needs SWITCH_USER_CR3 but not swapgs
+ */
 ENTRY(paranoid_exit)
        DEFAULT_FRAME
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
-       testl %ebx,%ebx                         /* swapgs needed? */
-       jnz paranoid_restore
-       testl $3,CS(%rsp)
-       jnz   paranoid_userspace
-paranoid_swapgs:
+       movq    %rbx, %r12              /* paranoid_userspace uses %ebx */
+       testl   $3, CS(%rsp)
+       jnz     paranoid_userspace
+paranoid_kernel:
+       movq    %r12, %rbx              /* restore after paranoid_userspace */
        TRACE_IRQS_IRETQ 0
+#ifdef CONFIG_KAISER
+       testl   $2, %ebx                /* SWITCH_USER_CR3 needed? */
+       jz      paranoid_exit_no_switch
+       SWITCH_USER_CR3
+paranoid_exit_no_switch:
+#endif
+       testl   $1, %ebx                /* swapgs needed? */
+       jnz     paranoid_exit_no_swapgs
        SWAPGS_UNSAFE_STACK
+paranoid_exit_no_swapgs:
        RESTORE_ALL 8
-       jmp irq_return
-paranoid_restore:
-       TRACE_IRQS_IRETQ 0
-       RESTORE_ALL 8
-       jmp irq_return
+       jmp     irq_return
+
 paranoid_userspace:
        GET_THREAD_INFO(%rcx)
        movl TI_flags(%rcx),%ebx
        andl $_TIF_WORK_MASK,%ebx
-       jz paranoid_swapgs
+       jz      paranoid_kernel
        movq %rsp,%rdi                  /* &pt_regs */
        call sync_regs
        movq %rax,%rsp                  /* switch stack for scheduling */
@@ -1438,6 +1493,13 @@ ENTRY(error_entry)
        movq_cfi r13, R13+8
        movq_cfi r14, R14+8
        movq_cfi r15, R15+8
+       /*
+        * error_entry() always returns with a kernel gsbase and
+        * CR3.  We must also have a kernel CR3/gsbase before
+        * calling TRACE_IRQS_*.  Just unconditionally switch to
+        * the kernel CR3 here.
+        */
+       SWITCH_KERNEL_CR3
        xorl %ebx,%ebx
        testl $3,CS+8(%rsp)
        je error_kernelspace
@@ -1527,22 +1589,31 @@ ENTRY(nmi)
        call do_nmi
 #ifdef CONFIG_TRACE_IRQFLAGS
        /* paranoidexit; without TRACE_IRQS_OFF */
-       /* ebx: no swapgs flag */
+       /* ebx: no-swapgs and kaiser-switch-cr3 flag */
        DISABLE_INTERRUPTS(CLBR_NONE)
-       testl %ebx,%ebx                         /* swapgs needed? */
-       jnz nmi_restore
-       testl $3,CS(%rsp)
-       jnz nmi_userspace
-nmi_swapgs:
+       movq    %rbx, %r12              /* nmi_userspace uses %ebx */
+       testl   $3, CS(%rsp)
+       jnz     nmi_userspace
+nmi_kernel:
+       movq    %r12, %rbx              /* restore after nmi_userspace */
+#ifdef CONFIG_KAISER
+       testl   $2, %ebx                /* SWITCH_USER_CR3 needed? */
+       jz      nmi_exit_no_switch
+       SWITCH_USER_CR3
+nmi_exit_no_switch:
+#endif
+       testl   $1, %ebx                /* swapgs needed? */
+       jnz     nmi_exit_no_swapgs
        SWAPGS_UNSAFE_STACK
-nmi_restore:
+nmi_exit_no_swapgs:
        RESTORE_ALL 8
-       jmp irq_return
+       jmp     irq_return
+
 nmi_userspace:
        GET_THREAD_INFO(%rcx)
        movl TI_flags(%rcx),%ebx
        andl $_TIF_WORK_MASK,%ebx
-       jz nmi_swapgs
+       jz      nmi_kernel
        movq %rsp,%rdi                  /* &pt_regs */
        call sync_regs
        movq %rax,%rsp                  /* switch stack for scheduling */
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 94d857fb1033..14cd73b0e634 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -41,6 +41,7 @@
 #include
 #include
 #include
+#include
 
 /*
  * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@@ -129,6 +130,14 @@ void __init init_espfix_bsp(void)
        /* Install the espfix pud into the kernel page directory */
        pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
        pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+       /*
+        * Just copy the top-level PGD that is mapping the espfix
+        * area to ensure it is mapped into the shadow user page
+        * tables.
+        */
+       if (IS_ENABLED(CONFIG_KAISER))
+               set_pgd(native_get_shadow_pgd(pgd_p),
+                       __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
 
        /* Randomize the locations */
        init_espfix_random();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 0f8ebf78253a..6e697ac3fb54 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -338,6 +338,27 @@ early_idt_ripmsg:
        .balign PAGE_SIZE; \
 ENTRY(name)
 
+#ifdef CONFIG_KAISER
+/*
+ * Each PGD needs to be 8k long and 8k aligned.  We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define KAISER_USER_PGD_FILL   512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+       .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define KAISER_USER_PGD_FILL   0
+#endif
+
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)               \
        i = 0 ;                                 \
@@ -353,13 +374,14 @@ ENTRY(name)
  * 0xffffffff80000000 to physical address 0x000000. (always using
  * 2Mbyte large pages provided by PAE mode)
  */
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
        .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
        .org    init_level4_pgt + L4_PAGE_OFFSET*8, 0
        .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
        .org    init_level4_pgt + L4_START_KERNEL*8, 0
        /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
        .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .fill   KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(level3_ident_pgt)
        .quad   level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -385,6 +407,7 @@ NEXT_PAGE(level2_ident_pgt)
         * Don't set NX because code runs from these pages.
         */
        PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+       .fill   KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(level2_kernel_pgt)
        /*
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 43e9ccf44947..f00e6e734fbd 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -38,5 +38,5 @@ EXPORT_SYMBOL(init_task);
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss) = INIT_TSS;
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index e328f691eeef..990f743e21b8 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -85,7 +85,7 @@ static struct irqaction irq2 = {
        .flags = IRQF_NO_THREAD,
 };
 
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
        [0 ... NR_VECTORS - 1] = -1,
 };
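The head_64.S arithmetic above is easy to verify: a pgd holds 512
entries of 8 bytes each, so .fill KAISER_USER_PGD_FILL,8,0 appends
exactly one 4k page, the shadow half of the 8k pair.

    /* one pgd entry is 8 bytes; 512 of them are exactly one 4k page */
    _Static_assert(512 * 8 == 4096, "KAISER_USER_PGD_FILL pads one full page");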
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 1dd32307a494..836a4c2d5ceb 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
        set_ldt(pc->ldt->entries, pc->ldt->size);
 }
 
+static void __free_ldt_struct(struct ldt_struct *ldt)
+{
+       if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+               vfree(ldt->entries);
+       else
+               free_page((unsigned long)ldt->entries);
+       kfree(ldt);
+}
+
 /* The caller must call finalize_ldt_struct on the result.  LDT starts zeroed. */
 static struct ldt_struct *alloc_ldt_struct(int size)
 {
        struct ldt_struct *new_ldt;
        int alloc_size;
+       int ret;
 
        if (size > LDT_ENTRIES)
                return NULL;
@@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
                return NULL;
        }
 
+       ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
+                                __PAGE_KERNEL);
        new_ldt->size = size;
+       if (ret) {
+               __free_ldt_struct(new_ldt);
+               return NULL;
+       }
        return new_ldt;
 }
 
@@ -97,12 +114,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
        if (likely(!ldt))
                return;
 
+       kaiser_remove_mapping((unsigned long)ldt->entries,
+                             ldt->size * LDT_ENTRY_SIZE);
        paravirt_free_ldt(ldt->entries, ldt->size);
-       if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
-               vfree(ldt->entries);
-       else
-               kfree(ldt->entries);
-       kfree(ldt);
+       __free_ldt_struct(ldt);
 }
 
 /*
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 557eb3757edb..d2ce2a33d15b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -57,7 +57,7 @@
 asmlinkage extern void ret_from_fork(void);
 
-DEFINE_PER_CPU(unsigned long, old_rsp);
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, old_rsp);
 static DEFINE_PER_CPU(unsigned char, is_idle);
 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
 
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index cf2a84031dfd..c9a00a5e0b87 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_NUMA_EMU)         += numa_emulation.o
 
 obj-$(CONFIG_HAVE_MEMBLOCK)    += memblock.o
 obj-$(CONFIG_MEMTEST)          += memtest.o
+obj-$(CONFIG_KAISER)           += kaiser.o
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
new file mode 100644
index 000000000000..79b0222ffa74
--- /dev/null
+++ b/arch/x86/mm/kaiser.c
@@ -0,0 +1,382 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+extern struct mm_struct init_mm;
+
+#include
+#include        /* to verify its kaiser declarations */
+#include
+#include
+#include
+
+#ifdef CONFIG_KAISER
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+/*
+ * These can have bit 63 set, so we can not just use a plain "or"
+ * instruction to get their value or'd into CR3.  It would take
+ * another register.  So, we use a memory reference to these instead.
+ *
+ * This is also handy because systems that do not support PCIDs
+ * just end up or'ing a 0 into their CR3, which does no harm.
+ */
+unsigned long x86_cr3_pcid_noflush __read_mostly;
+DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+/*
+ * At runtime, the only things we map are some things for CPU
+ * hotplug, and stacks for new processes.  No two CPUs will ever
+ * be populating the same addresses, so we only need to ensure
+ * that we protect between two CPUs trying to allocate and
+ * populate the same page table page.
+ *
+ * Only take this lock when doing a set_p[4um]d(), but it is not
+ * needed for doing a set_pte().  We assume that only the *owner*
+ * of a given allocation will be doing this for _their_
+ * allocation.
+ *
+ * This ensures that once a system has been running for a while
+ * and there have been stacks all over and these page tables
+ * are fully populated, there will be no further acquisitions of
+ * this lock.
+ */
+static DEFINE_SPINLOCK(shadow_table_allocation_lock);
+
+/*
+ * Returns -1 on error.
+ */
+static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pgd = pgd_offset_k(vaddr);
+       /*
+        * We made all the kernel PGDs present in kaiser_init().
+        * We expect them to stay that way.
+        */
+       BUG_ON(pgd_none(*pgd));
+       /*
+        * PGDs are either 512GB or 128TB on all x86_64
+        * configurations.  We don't handle these.
+        */
+       BUG_ON(pgd_large(*pgd));
+
+       pud = pud_offset(pgd, vaddr);
+       if (pud_none(*pud)) {
+               WARN_ON_ONCE(1);
+               return -1;
+       }
+
+       if (pud_large(*pud))
+               return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
+
+       pmd = pmd_offset(pud, vaddr);
+       if (pmd_none(*pmd)) {
+               WARN_ON_ONCE(1);
+               return -1;
+       }
+
+       if (pmd_large(*pmd))
+               return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
+
+       pte = pte_offset_kernel(pmd, vaddr);
+       if (pte_none(*pte)) {
+               WARN_ON_ONCE(1);
+               return -1;
+       }
+
+       return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
+}
+
+/*
+ * This is a relatively normal page table walk, except that it
+ * also tries to allocate page tables pages along the way.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static pte_t *kaiser_pagetable_walk(unsigned long address)
+{
+       pmd_t *pmd;
+       pud_t *pud;
+       pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
+       gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+       if (pgd_none(*pgd)) {
+               WARN_ONCE(1, "All shadow pgds should have been populated");
+               return NULL;
+       }
+       BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+       pud = pud_offset(pgd, address);
+       /* The shadow page tables do not use large mappings: */
+       if (pud_large(*pud)) {
+               WARN_ON(1);
+               return NULL;
+       }
+       if (pud_none(*pud)) {
+               unsigned long new_pmd_page = __get_free_page(gfp);
+               if (!new_pmd_page)
+                       return NULL;
+               spin_lock(&shadow_table_allocation_lock);
+               if (pud_none(*pud)) {
+                       set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+                       __inc_zone_page_state(virt_to_page((void *)
+                                               new_pmd_page), NR_KAISERTABLE);
+               } else
+                       free_page(new_pmd_page);
+               spin_unlock(&shadow_table_allocation_lock);
+       }
+
+       pmd = pmd_offset(pud, address);
+       /* The shadow page tables do not use large mappings: */
+       if (pmd_large(*pmd)) {
+               WARN_ON(1);
+               return NULL;
+       }
+       if (pmd_none(*pmd)) {
+               unsigned long new_pte_page = __get_free_page(gfp);
+               if (!new_pte_page)
+                       return NULL;
+               spin_lock(&shadow_table_allocation_lock);
+               if (pmd_none(*pmd)) {
+                       set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+                       __inc_zone_page_state(virt_to_page((void *)
+                                               new_pte_page), NR_KAISERTABLE);
+               } else
+                       free_page(new_pte_page);
+               spin_unlock(&shadow_table_allocation_lock);
+       }
+
+       return pte_offset_kernel(pmd, address);
+}
+
+int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+                       unsigned long flags)
+{
+       int ret = 0;
+       pte_t *pte;
+       unsigned long start_addr = (unsigned long)__start_addr;
+       unsigned long address = start_addr & PAGE_MASK;
+       unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+       unsigned long target_address;
+
+       for (; address < end_addr; address += PAGE_SIZE) {
+               target_address = get_pa_from_mapping(address);
+               if (target_address == -1) {
+                       ret = -EIO;
+                       break;
+               }
+               pte = kaiser_pagetable_walk(address);
+               if (!pte) {
+                       ret = -ENOMEM;
+                       break;
+               }
+               if (pte_none(*pte)) {
+                       set_pte(pte, __pte(flags | target_address));
+               } else {
+                       pte_t tmp;
+                       set_pte(&tmp, __pte(flags | target_address));
+                       WARN_ON_ONCE(!pte_same(*pte, tmp));
+               }
+       }
+       return ret;
+}
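kaiser_pagetable_walk() above uses an allocate-then-recheck pattern
under shadow_table_allocation_lock, as the locking comment earlier in
the file describes: allocate outside the lock, then either install the
page or free it if a racing CPU got there first.  Reduced to its core
(a generic sketch, not the patch's code):

    static unsigned long *populate_slot_sketch(unsigned long *slot, gfp_t gfp)
    {
            unsigned long page = __get_free_page(gfp);

            if (!page)
                    return NULL;
            spin_lock(&shadow_table_allocation_lock);
            if (!*slot)
                    *slot = page;           /* we won the race */
            else
                    free_page(page);        /* a racing CPU populated it first */
            spin_unlock(&shadow_table_allocation_lock);
            return slot;
    }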
+
+static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
+{
+       unsigned long size = end - start;
+
+       return kaiser_add_user_map(start, size, flags);
+}
+
+/*
+ * Ensure that the top level of the (shadow) page tables are
+ * entirely populated.  This ensures that all processes that get
+ * forked have the same entries.  This way, we do not have to
+ * ever go set up new entries in older processes.
+ *
+ * Note: we never free these, so there are no updates to them
+ * after this.
+ */
+static void __init kaiser_init_all_pgds(void)
+{
+       pgd_t *pgd;
+       int i = 0;
+
+       pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0));
+       for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
+               pgd_t new_pgd;
+               pud_t *pud = pud_alloc_one(&init_mm,
+                                          PAGE_OFFSET + i * PGDIR_SIZE);
+               if (!pud) {
+                       WARN_ON(1);
+                       break;
+               }
+               inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
+               new_pgd = __pgd(_KERNPG_TABLE | __pa(pud));
+               /*
+                * Make sure not to stomp on some other pgd entry.
+                */
+               if (!pgd_none(pgd[i])) {
+                       WARN_ON(1);
+                       continue;
+               }
+               set_pgd(pgd + i, new_pgd);
+       }
+}
+
+#define kaiser_add_user_map_early(start, size, flags) do {     \
+       int __ret = kaiser_add_user_map(start, size, flags);    \
+       WARN_ON(__ret);                                         \
+} while (0)
+
+#define kaiser_add_user_map_ptrs_early(start, end, flags) do {         \
+       int __ret = kaiser_add_user_map_ptrs(start, end, flags);        \
+       WARN_ON(__ret);                                                 \
+} while (0)
+
+/*
+ * If anything in here fails, we will likely die on one of the
+ * first kernel->user transitions and init will die.  But, we
+ * will have most of the kernel up by then and should be able to
+ * get a clean warning out of it.  If we BUG_ON() here, we run
+ * the risk of being before we have good console output.
+ */
+void __init kaiser_init(void)
+{
+       int cpu;
+
+       kaiser_init_all_pgds();
+
+       for_each_possible_cpu(cpu) {
+               void *percpu_vaddr = __per_cpu_user_mapped_start +
+                                    per_cpu_offset(cpu);
+               unsigned long percpu_sz = __per_cpu_user_mapped_end -
+                                         __per_cpu_user_mapped_start;
+               kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
+                                         __PAGE_KERNEL);
+       }
+
+       /*
+        * Map the entry/exit text section, which is needed at
+        * switches from user to and from kernel.
+        */
+       kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
+                                      __PAGE_KERNEL_RX);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       kaiser_add_user_map_ptrs_early(__irqentry_text_start,
+                                      __irqentry_text_end,
+                                      __PAGE_KERNEL_RX);
+#endif
+       kaiser_add_user_map_early((void *)idt_descr.address,
+                                 sizeof(gate_desc) * NR_VECTORS,
+                                 __PAGE_KERNEL_RO);
+       kaiser_add_user_map_early(&x86_cr3_pcid_noflush,
+                                 sizeof(x86_cr3_pcid_noflush),
+                                 __PAGE_KERNEL);
+}
+
+/* Add a mapping to the shadow mapping, and synchronize the mappings */
+int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+       return kaiser_add_user_map((const void *)addr, size, flags);
+}
+
+void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+       unsigned long end = start + size;
+       unsigned long addr;
+       pte_t *pte;
+
+       for (addr = start; addr < end; addr += PAGE_SIZE) {
+               pte = kaiser_pagetable_walk(addr);
+               if (pte)
+                       set_pte(pte, __pte(0));
+       }
+}
+
+/*
+ * Page table pages are page-aligned.  The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ * This returns true for user pages that need to get copied into
+ * both the user and kernel copies of the page tables, and false
+ * for kernel pages that should only be in the kernel copy.
+ */
+static inline bool is_userspace_pgd(pgd_t *pgdp)
+{
+       return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
+}
+
+pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+       /*
+        * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
+        * skip cases like kexec and EFI which make temporary low mappings.
+        */
+       if (pgd.pgd & _PAGE_USER) {
+               if (is_userspace_pgd(pgdp)) {
+                       native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+                       /*
+                        * Even if the entry is *mapping* userspace, ensure
+                        * that userspace can not use it.  This way, if we
+                        * get out to userspace running on the kernel CR3,
+                        * userspace will crash instead of running.
+                        */
+                       pgd.pgd |= _PAGE_NX;
+               }
+       } else if (!pgd.pgd) {
+               /*
+                * pgd_clear() cannot check _PAGE_USER, and is even used to
+                * clear corrupted pgd entries: so just rely on cases like
+                * kexec and EFI never to be using pgd_clear().
+                */
+               if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
+                   is_userspace_pgd(pgdp))
+                       native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+       }
+       return pgd;
+}
+
+void kaiser_setup_pcid(void)
+{
+       unsigned long kern_cr3 = 0;
+       unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
+
+       if (this_cpu_has(X86_FEATURE_PCID)) {
+               kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH;
+               user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
+       }
+       /*
+        * These variables are used by the entry/exit
+        * code to change PCID and pgd and TLB flushing.
+        */
+       x86_cr3_pcid_noflush = kern_cr3;
+       this_cpu_write(x86_cr3_pcid_user, user_cr3);
+}
+
+/*
+ * Make a note that this cpu will need to flush USER tlb on return to user.
+ * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling:
+ * if cpu does not, then the NOFLUSH bit will never have been set.
+ */
+void kaiser_flush_tlb_on_return_to_user(void)
+{
+       this_cpu_write(x86_cr3_pcid_user,
+                      X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
+}
+EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
+#endif /* CONFIG_KAISER */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8573b83a63d0..73285602c93f 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -5,7 +5,7 @@
 #include
 #include
 
-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
 
 #ifdef CONFIG_HIGHPTE
 #define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -253,12 +253,35 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
        }
 }
 
+#ifdef CONFIG_KAISER
+/*
+ * Instead of one pgd, we acquire two pgds.  Being order-1, it is
+ * both 8k in size and 8k-aligned.  That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER   1
+#else
+#define PGD_ALLOCATION_ORDER   0
+#endif
+
+static inline pgd_t *_pgd_alloc(void)
+{
+       /* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */
+       return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT,
+                                        PGD_ALLOCATION_ORDER);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+       free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
+}
+
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
        pgd_t *pgd;
        pmd_t *pmds[PREALLOCATED_PMDS];
 
-       pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
+       pgd = _pgd_alloc();
 
        if (pgd == NULL)
                goto out;
@@ -288,7 +311,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 out_free_pmds:
        free_pmds(pmds);
 out_free_pgd:
-       free_page((unsigned long)pgd);
+       _pgd_free(pgd);
 out:
        return NULL;
 }
@@ -298,7 +321,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
        pgd_mop_up_pmds(mm, pgd);
        pgd_dtor(pgd);
        paravirt_pgd_free(mm, pgd);
-       free_page((unsigned long)pgd);
+       _pgd_free(pgd);
 }
 
 int ptep_set_access_flags(struct vm_area_struct *vma,
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 4f5ca8f04c0d..4078e3092b16 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -12,10 +12,43 @@
 #include
 #include
 #include
+#include
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
                        = { &init_mm, 0, };
 
+static void load_new_mm_cr3(pgd_t *pgdir)
+{
+       unsigned long new_mm_cr3 = __pa(pgdir);
+
+#ifdef CONFIG_KAISER
+       if (this_cpu_has(X86_FEATURE_PCID)) {
+               /*
+                * We reuse the same PCID for different tasks, so we must
+                * flush all the entries for the PCID out when we change tasks.
+                * Flush KERN below, flush USER when returning to userspace in
+                * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
+                *
+                * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
+                * do it here, but can only be used if X86_FEATURE_INVPCID is
+                * available - and many machines support pcid without invpcid.
+                *
+                * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0;
+                * but keep that line in there in case something changes.
+                */
+               new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
+               kaiser_flush_tlb_on_return_to_user();
+       }
+#endif /* CONFIG_KAISER */
+
+       /*
+        * Caution: many callers of this function expect
+        * that load_new_mm_cr3() is serializing and orders TLB
+        * fills with respect to the mm_cpumask writes.
+        */
+       write_cr3(new_mm_cr3);
+}
+
 /*
  *     TLB flushing, formerly SMP-only
  *             c/o Linus Torvalds.
@@ -65,7 +98,7 @@ void leave_mm(int cpu)
                BUG();
        cpumask_clear_cpu(cpu,
                          mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
-       load_cr3(swapper_pg_dir);
+       load_new_mm_cr3(swapper_pg_dir);
 }
 EXPORT_SYMBOL_GPL(leave_mm);
 
@@ -113,11 +146,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                 * from next->pgd.  TLB fills are special and can happen
                 * due to instruction fetches or for no reason at all,
                 * and neither LOCK nor MFENCE orders them.
-                * Fortunately, load_cr3() is serializing and gives the
-                * ordering guarantee we need.
-                *
+                * Fortunately, load_new_mm_cr3() is serializing
+                * and gives the ordering guarantee we need.
                 */
-               load_cr3(next->pgd);
+               load_new_mm_cr3(next->pgd);
 
                /* stop flush ipis for the previous mm */
                cpumask_clear_cpu(cpu, mm_cpumask(prev));
@@ -136,10 +168,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                         * tlb flush IPI delivery. We must reload CR3
                         * to make sure to use no freed page tables.
                         *
-                        * As above, load_cr3() is serializing and orders TLB
-                        * fills with respect to the mm_cpumask write.
+                        * As above, load_new_mm_cr3() is serializing and orders
+                        * TLB fills with respect to the mm_cpumask write.
                         */
-                       load_cr3(next->pgd);
+                       load_new_mm_cr3(next->pgd);
                        load_mm_ldt(next);
                }
        }
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index b5e2e4c6b017..01c8155dd613 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -692,7 +692,14 @@
  */
 #define PERCPU_INPUT(cacheline)                                        \
        VMLINUX_SYMBOL(__per_cpu_start) = .;                            \
+       VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .;                \
        *(.data..percpu..first)                                         \
+       . = ALIGN(cacheline);                                           \
+       *(.data..percpu..user_mapped)                                   \
+       *(.data..percpu..user_mapped..shared_aligned)                   \
+       . = ALIGN(PAGE_SIZE);                                           \
+       *(.data..percpu..user_mapped..page_aligned)                     \
+       VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .;                  \
        . = ALIGN(PAGE_SIZE);                                           \
        *(.data..percpu..page_aligned)                                  \
        . = ALIGN(cacheline);                                           \
diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
new file mode 100644
index 000000000000..4a4d6d911a14
--- /dev/null
+++ b/include/linux/kaiser.h
@@ -0,0 +1,52 @@
+#ifndef _LINUX_KAISER_H
+#define _LINUX_KAISER_H
+
+#ifdef CONFIG_KAISER
+#include
+
+static inline int kaiser_map_thread_stack(void *stack)
+{
+       /*
+        * Map that page of kernel stack on which we enter from user context.
+        */
+       return kaiser_add_mapping((unsigned long)stack +
+                                 THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
+}
+
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+       /*
+        * Note: may be called even when kaiser_map_thread_stack() failed.
+        */
+       kaiser_remove_mapping((unsigned long)stack +
+                             THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
+}
+#else
+
+/*
+ * These stubs are used whenever CONFIG_KAISER is off, which
+ * includes architectures that support KAISER, but have it disabled.
+ */
+
+static inline void kaiser_init(void)
+{
+}
+static inline int kaiser_add_mapping(unsigned long addr,
+                                    unsigned long size, unsigned long flags)
+{
+       return 0;
+}
+static inline void kaiser_remove_mapping(unsigned long start,
+                                        unsigned long size)
+{
+}
+static inline int kaiser_map_thread_stack(void *stack)
+{
+       return 0;
+}
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+}
+
+#endif /* !CONFIG_KAISER */
+#endif /* _LINUX_KAISER_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 25842b6e72e1..a0b4422a116a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -95,8 +95,9 @@ enum zone_stat_item {
        NR_SLAB_RECLAIMABLE,
        NR_SLAB_UNRECLAIMABLE,
        NR_PAGETABLE,           /* used for pagetables */
-       NR_KERNEL_STACK,
        /* Second 128 byte cacheline */
+       NR_KERNEL_STACK,
+       NR_KAISERTABLE,
        NR_UNSTABLE_NFS,        /* NFS unstable pages */
        NR_BOUNCE,
        NR_VMSCAN_WRITE,
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 27ef6b190ea6..56f5eeb78d1d 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -28,6 +28,12 @@
        (void)__vpp_verify;                                             \
 } while (0)
 
+#ifdef CONFIG_KAISER
+#define USER_MAPPED_SECTION "..user_mapped"
+#else
+#define USER_MAPPED_SECTION ""
+#endif
+
 /*
  * s390 and alpha modules require percpu variables to be defined as
  * weak to force the compiler to generate GOT based external
@@ -90,6 +96,12 @@
 #define DEFINE_PER_CPU(type, name)                                     \
        DEFINE_PER_CPU_SECTION(type, name, "")
 
+#define DECLARE_PER_CPU_USER_MAPPED(type, name)                        \
+       DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
+#define DEFINE_PER_CPU_USER_MAPPED(type, name)                         \
+       DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
 /*
  * Declaration/definition used for per-CPU variables that must come first in
  * the set of variables.
  */
@@ -119,6 +131,14 @@
        DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
        ____cacheline_aligned_in_smp
 
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)         \
+       DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+       ____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)          \
+       DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+       ____cacheline_aligned_in_smp
+
 #define DECLARE_PER_CPU_ALIGNED(type, name)                            \
        DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION)    \
        ____cacheline_aligned
@@ -137,11 +157,21 @@
 #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)                        \
        DEFINE_PER_CPU_SECTION(type, name, "..page_aligned")            \
        __aligned(PAGE_SIZE)
+/*
+ * Declaration/definition used for per-CPU variables that must be page
+ * aligned and need to be mapped in user mode.
+ */
+#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)           \
+       DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+       __aligned(PAGE_SIZE)
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)            \
+       DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+       __aligned(PAGE_SIZE)
 
 /*
  * Declaration/definition used for per-CPU variables that must be read mostly.
  */
-#define DECLARE_PER_CPU_READ_MOSTLY(type, name)                        \
+#define DECLARE_PER_CPU_READ_MOSTLY(type, name)                        \
        DECLARE_PER_CPU_SECTION(type, name, "..readmostly")
 
 #define DEFINE_PER_CPU_READ_MOSTLY(type, name)                         \
diff --git a/init/main.c b/init/main.c
index e937d9bda0f8..558a9fdd566d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -69,6 +69,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -463,6 +464,7 @@ static void __init mm_init(void)
        percpu_init_late();
        pgtable_cache_init();
        vmalloc_init();
+       kaiser_init();
 }
 
 asmlinkage void __init start_kernel(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index 29b460431c12..511131a15a75 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -55,6 +55,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -133,6 +134,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 
 static inline void free_thread_info(struct thread_info *ti)
 {
+       kaiser_unmap_thread_stack(ti);
        free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
 }
 #endif
@@ -275,6 +277,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 
        tsk->stack = ti;
 
+       err = kaiser_map_thread_stack(tsk->stack);
+       if (err)
+               goto out;
+
        setup_thread_stack(tsk, orig);
        clear_user_return_notifier(tsk);
        clear_tsk_need_resched(tsk);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index ff9060919c4b..eaf3db038652 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -699,6 +699,7 @@ const char * const vmstat_text[] = {
        "nr_slab_unreclaimable",
        "nr_page_table_pages",
        "nr_kernel_stack",
+       "nr_overhead",
        "nr_unstable",
        "nr_bounce",
        "nr_vmscan_write",
diff --git a/security/Kconfig b/security/Kconfig
index 51bd5a0b69ae..19f83193e7ab 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -96,6 +96,16 @@ config SECURITY
 
          If you are unsure how to answer this question, answer N.
 
+config KAISER
+       bool "Remove the kernel mapping in user mode"
+       default y
+       depends on X86_64 && SMP && !PARAVIRT
+       help
+         This enforces a strict kernel and user space isolation, in order
+         to close hardware side channels on kernel address information.
+
+         If you are unsure how to answer this question, answer Y.
+
 config SECURITYFS
        bool "Enable the securityfs filesystem"
        help