KAISER: Kernel Address Isolation
author  Hugh Dickins <hughd@google.com>
Tue, 12 Dec 2017 01:59:50 +0000 (17:59 -0800)
committer  Ben Hutchings <ben@decadent.org.uk>
Sun, 7 Jan 2018 01:46:49 +0000 (01:46 +0000)
This patch introduces our implementation of KAISER (Kernel Address
Isolation to have Side-channels Efficiently Removed), a kernel isolation
technique to close hardware side channels on kernel address information.
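
As a rough sketch of the idea (user-space C for illustration only; the
table contents and addresses below are invented): each process ends up
with two top-level page tables.  The normal pgd maps everything; the
shadow pgd maps all of userspace but only a minimal slice of the kernel,
and it is the one in use while running in user mode.

    #include <stdio.h>

    #define PTRS_PER_PGD 512                  /* x86_64 top-level entries  */
    #define KERNEL_HALF  (PTRS_PER_PGD / 2)   /* entries 256..511 = kernel */

    int main(void)
    {
        unsigned long kernel_pgd[PTRS_PER_PGD] = { 0 };
        unsigned long shadow_pgd[PTRS_PER_PGD] = { 0 };
        int i;

        /* Pretend the kernel half of the normal pgd is fully populated. */
        for (i = KERNEL_HALF; i < PTRS_PER_PGD; i++)
            kernel_pgd[i] = 0x1000UL * i;

        /* User mappings are entered into both roots. */
        kernel_pgd[0] = shadow_pgd[0] = 0xabc000UL;

        /* The shadow root gets only a minimal kernel entry: enough for
         * the entry/exit code, the stacks and a few per-cpu structures. */
        shadow_pgd[KERNEL_HALF] = 0xdef000UL;

        printf("user slot 0:     normal=%lx shadow=%lx\n",
               kernel_pgd[0], shadow_pgd[0]);
        printf("kernel slot 300: normal=%lx shadow=%lx (not mapped)\n",
               kernel_pgd[300], shadow_pgd[300]);
        return 0;
    }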

More information about the original patch can be found at:
https://github.com/IAIK/KAISER
http://marc.info/?l=linux-kernel&m=149390087310405&w=2

Daniel Gruss <daniel.gruss@iaik.tugraz.at>
Richard Fellner <richard.fellner@student.tugraz.at>
Michael Schwarz <michael.schwarz@iaik.tugraz.at>
Clementine Maurice <clementine.maurice@iaik.tugraz.at>
Moritz Lipp <moritz.lipp@iaik.tugraz.at>

That original was then developed further by
Dave Hansen <dave.hansen@intel.com>
Hugh Dickins <hughd@google.com>
and then by others after this snapshot.

This combined patch for 3.2.96 was derived from hughd's patches below
for 3.18.72, in 2017-12-04's kaiser-3.18.72.tar; except for the last,
which was sent in 2017-12-09's nokaiser-3.18.72.tar.  They have been
combined in order to minimize the effort of rebasing: most of the
patches in the 3.18.72 series were small fixes and cleanups and
enhancements to three large patches.  About the only new work in this
backport is a simple reimplementation of kaiser_remove_mapping():
mm/pageattr.c changed a lot between 3.2 and 3.18, and the
modifications made there for Kaiser never seemed necessary.

KAISER: Kernel Address Isolation
kaiser: merged update
kaiser: do not set _PAGE_NX on pgd_none
kaiser: stack map PAGE_SIZE at THREAD_SIZE-PAGE_SIZE
kaiser: fix build and FIXME in alloc_ldt_struct()
kaiser: KAISER depends on SMP
kaiser: fix regs to do_nmi() ifndef CONFIG_KAISER
kaiser: fix perf crashes
kaiser: ENOMEM if kaiser_pagetable_walk() NULL
kaiser: tidied up asm/kaiser.h somewhat
kaiser: tidied up kaiser_add/remove_mapping slightly
kaiser: kaiser_remove_mapping() move along the pgd
kaiser: align addition to x86/mm/Makefile
kaiser: cleanups while trying for gold link
kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET
kaiser: delete KAISER_REAL_SWITCH option
kaiser: vmstat show NR_KAISERTABLE as nr_overhead
kaiser: enhanced by kernel and user PCIDs
kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user
kaiser: PCID 0 for kernel and 128 for user
kaiser: x86_cr3_pcid_noflush and x86_cr3_pcid_user
kaiser: paranoid_entry pass cr3 need to paranoid_exit
kaiser: _pgd_alloc() without __GFP_REPEAT to avoid stalls
kaiser: fix unlikely error in alloc_ldt_struct()
kaiser: drop is_atomic arg to kaiser_pagetable_walk()

Signed-off-by: Hugh Dickins <hughd@google.com>
[bwh:
 - Fixed the #undef in arch/x86/boot/compressed/misc.h
 - Added missing #include in arch/x86/mm/kaiser.c]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
33 files changed:
arch/x86/boot/compressed/misc.h
arch/x86/ia32/ia32entry.S
arch/x86/include/asm/cpufeature.h
arch/x86/include/asm/desc.h
arch/x86/include/asm/hw_irq.h
arch/x86/include/asm/kaiser.h [new file with mode: 0644]
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_64.h
arch/x86/include/asm/pgtable_types.h
arch/x86/include/asm/processor-flags.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/tlbflush.h
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/perf_event_intel_ds.c
arch/x86/kernel/entry_64.S
arch/x86/kernel/espfix_64.c
arch/x86/kernel/head_64.S
arch/x86/kernel/init_task.c
arch/x86/kernel/irqinit.c
arch/x86/kernel/ldt.c
arch/x86/kernel/process_64.c
arch/x86/mm/Makefile
arch/x86/mm/kaiser.c [new file with mode: 0644]
arch/x86/mm/pgtable.c
arch/x86/mm/tlb.c
include/asm-generic/vmlinux.lds.h
include/linux/kaiser.h [new file with mode: 0644]
include/linux/mmzone.h
include/linux/percpu-defs.h
init/main.c
kernel/fork.c
mm/vmstat.c
security/Kconfig

index 3f19c81..2fa2635 100644 (file)
@@ -7,6 +7,7 @@
  * we just keep it from happening
  */
 #undef CONFIG_PARAVIRT
+#undef CONFIG_KAISER
 #ifdef CONFIG_X86_32
 #define _ASM_X86_DESC_H 1
 #endif
index 2b55277..7eb0d47 100644 (file)
@@ -12,6 +12,8 @@
 #include <asm/ia32_unistd.h>   
 #include <asm/thread_info.h>   
 #include <asm/segment.h>
+#include <asm/pgtable_types.h>
+#include <asm/kaiser.h>
 #include <asm/irqflags.h>
 #include <linux/linkage.h>
 
@@ -120,6 +122,7 @@ ENTRY(ia32_sysenter_target)
        CFI_DEF_CFA     rsp,0
        CFI_REGISTER    rsp,rbp
        SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
        movq    PER_CPU_VAR(kernel_stack), %rsp
        addq    $(KERNEL_STACK_OFFSET),%rsp
        /*
@@ -183,6 +186,7 @@ sysexit_from_sys_call:
        popq_cfi %rcx                           /* User %esp */
        CFI_REGISTER rsp,rcx
        TRACE_IRQS_ON
+       SWITCH_USER_CR3
        ENABLE_INTERRUPTS_SYSEXIT32
 
 #ifdef CONFIG_AUDITSYSCALL
@@ -281,6 +285,7 @@ ENTRY(ia32_cstar_target)
        CFI_REGISTER    rip,rcx
        /*CFI_REGISTER  rflags,r11*/
        SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
        movl    %esp,%r8d
        CFI_REGISTER    rsp,r8
        movq    PER_CPU_VAR(kernel_stack),%rsp
@@ -337,6 +342,7 @@ sysretl_from_sys_call:
        xorq    %r9,%r9
        xorq    %r8,%r8
        TRACE_IRQS_ON
+       SWITCH_USER_CR3
        movl RSP-ARGOFFSET(%rsp),%esp
        CFI_RESTORE rsp
        USERGS_SYSRET32
@@ -409,6 +415,7 @@ ENTRY(ia32_syscall)
        CFI_REL_OFFSET  rip,RIP-RIP
        PARAVIRT_ADJUST_EXCEPTION_FRAME
        SWAPGS
+       SWITCH_KERNEL_CR3_NO_STACK
        /*
         * No need to follow this irqs on/off section: the syscall
         * disabled irqs and here we enable it straight after entry:
index 6f254f2..7362726 100644 (file)
 #define X86_FEATURE_PLN                (7*32+ 5) /* Intel Power Limit Notification */
 #define X86_FEATURE_PTS                (7*32+ 6) /* Intel Package Thermal Status */
 #define X86_FEATURE_DTHERM     (7*32+ 7) /* Digital Thermal Sensor */
+#define X86_FEATURE_INVPCID_SINGLE (7*32+ 8) /* Effectively INVPCID && CR4.PCIDE=1 */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  (8*32+ 0) /* Intel TPR Shadow */
index 382ce8a..7f1ead9 100644 (file)
@@ -40,7 +40,7 @@ struct gdt_page {
        struct desc_struct gdt[GDT_ENTRIES];
 } __attribute__((aligned(PAGE_SIZE)));
 
-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
 
 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
 {
index eb92a6e..3354a39 100644 (file)
@@ -164,7 +164,7 @@ extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *);
 extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
 
 typedef int vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
 extern void setup_vector_irq(int cpu);
 
 #ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
new file mode 100644 (file)
index 0000000..6f4c8ef
--- /dev/null
@@ -0,0 +1,126 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+
+#include <asm/processor-flags.h> /* For PCID constants */
+
+/*
+ * This file includes the definitions for the KAISER feature.
+ * KAISER is a countermeasure against x86_64 side-channel attacks on
+ * kernel virtual memory.  It keeps a shadow pgd for every process: the
+ * shadow pgd maps only a minimal set of kernel code and data, but includes
+ * the whole of user memory.  On a switch into the kernel, or when an
+ * interrupt is handled, the pgd is switched to the normal one.  When the
+ * system returns to user mode, the shadow pgd is loaded instead, so most
+ * of the kernel address space is simply not mapped there, and userspace
+ * cannot mount side-channel attacks against the whole of kernel memory.
+ *
+ * The minimal kernel mapping holds the parts that must remain mapped in
+ * user mode, such as the entry/exit functions and the kernel stacks.
+ */
+
+#define KAISER_SHADOW_PGD_OFFSET 0x1000
+
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_KAISER
+
+.macro _SWITCH_TO_KERNEL_CR3 reg
+movq %cr3, \reg
+andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
+orq  x86_cr3_pcid_noflush, \reg
+movq \reg, %cr3
+.endm
+
+.macro _SWITCH_TO_USER_CR3 reg regb
+/*
+ * regb must be the low byte portion of reg: because we have arranged
+ * for the low byte of the user PCID to serve as the high byte of NOFLUSH
+ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
+ * not enabled): so that the one register can update both memory and cr3.
+ */
+movq %cr3, \reg
+orq  PER_CPU_VAR(x86_cr3_pcid_user), \reg
+js   9f
+/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */
+movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
+9:
+movq \reg, %cr3
+.endm
+
+.macro SWITCH_KERNEL_CR3
+pushq %rax
+_SWITCH_TO_KERNEL_CR3 %rax
+popq %rax
+.endm
+
+.macro SWITCH_USER_CR3
+pushq %rax
+_SWITCH_TO_USER_CR3 %rax %al
+popq %rax
+.endm
+
+.macro SWITCH_KERNEL_CR3_NO_STACK
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_KERNEL_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+.endm
+
+#else /* CONFIG_KAISER */
+
+.macro SWITCH_KERNEL_CR3 reg
+.endm
+.macro SWITCH_USER_CR3 reg regb
+.endm
+.macro SWITCH_KERNEL_CR3_NO_STACK
+.endm
+
+#endif /* CONFIG_KAISER */
+
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_KAISER
+/*
+ * On a kernel/user mode switch, the address space may have to be
+ * switched before the registers have been stored.  Changing the
+ * address space needs a scratch register, so one register has to
+ * be saved and restored across the switch.
+ */
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+extern unsigned long x86_cr3_pcid_noflush;
+DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+
+/**
+ *  kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
+ *  @addr: the start address of the range
+ *  @size: the size of the range
+ *  @flags: the mapping flags of the pages
+ *
+ *  The mapping is global, so no further synchronization is
+ *  required.  The pages have to be unmapped again manually
+ *  when they are no longer needed.
+ */
+extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+
+/**
+ *  kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
+ *  @start: the start address of the range
+ *  @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+/**
+ *  kaiser_init - Initialize the shadow mapping
+ *
+ *  Most parts of the shadow mapping can be mapped at boot
+ *  time.  Only per-process things like the thread stacks
+ *  or a new LDT have to be mapped at runtime.  These boot-
+ *  time mappings are permanent and never unmapped.
+ */
+extern void kaiser_init(void);
+
+#endif /* CONFIG_KAISER */
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_KAISER_H */
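
For illustration, a stand-alone C sketch of the arithmetic behind the
_SWITCH_TO_KERNEL_CR3/_SWITCH_TO_USER_CR3 macros above.  The constants are
the ones added by this patch; the starting values chosen for
x86_cr3_pcid_noflush and x86_cr3_pcid_user assume PCIDs are in use (they
are really set up elsewhere in the series), and the pgd address is
invented:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    #define KAISER_SHADOW_PGD_OFFSET 0x1000UL
    #define X86_CR3_PCID_ASID_MASK   0xfffUL
    #define X86_CR3_PCID_ASID_USER   0x80UL
    #define X86_CR3_PCID_NOFLUSH     (1UL << 63)

    int main(void)
    {
        uint64_t x86_cr3_pcid_noflush = X86_CR3_PCID_NOFLUSH; /* kernel: never flush */
        uint64_t x86_cr3_pcid_user = KAISER_SHADOW_PGD_OFFSET |
                                     X86_CR3_PCID_ASID_USER;  /* user: flush once    */
        uint64_t cr3 = 0x1234000UL;           /* made-up kernel pgd, PCID 0 */

        /* _SWITCH_TO_USER_CR3: or in the shadow offset and the user PCID. */
        cr3 |= x86_cr3_pcid_user;
        if (!(cr3 & X86_CR3_PCID_NOFLUSH)) {
            /* Flush this time; arrange NOFLUSH for next time by storing
             * the low byte (0x80) into byte 7 of the per-cpu word, just
             * as the movb in the macro does (little-endian x86).        */
            unsigned char low = (unsigned char)cr3;
            memcpy((unsigned char *)&x86_cr3_pcid_user + 7, &low, 1);
        }
        printf("user CR3        = %#lx\n", (unsigned long)cr3);
        printf("next user value = %#lx (NOFLUSH bit now set)\n",
               (unsigned long)x86_cr3_pcid_user);

        /* _SWITCH_TO_KERNEL_CR3: clear the PCID bits and the shadow
         * offset, then or in the kernel NOFLUSH value.                  */
        cr3 &= ~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET);
        cr3 |= x86_cr3_pcid_noflush;
        printf("kernel CR3      = %#lx\n", (unsigned long)cr3);
        return 0;
    }
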
index 6be9909..b1c8b8d 100644 (file)
@@ -570,7 +570,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
 
 static inline int pgd_bad(pgd_t pgd)
 {
-       return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+       pgdval_t ignore_flags = _PAGE_USER;
+       /*
+        * We set NX on KAISER pgds that map userspace memory so
+        * that userspace can not meaningfully use the kernel
+        * page table by accident; it will fault on the first
+        * instruction it tries to run.  See native_set_pgd().
+        */
+       if (IS_ENABLED(CONFIG_KAISER))
+               ignore_flags |= _PAGE_NX;
+
+       return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
 }
 
 static inline int pgd_none(pgd_t pgd)
@@ -771,6 +781,12 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
        memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_KAISER
+       /* Clone the shadow pgd part as well */
+       memcpy(native_get_shadow_pgd(dst),
+              native_get_shadow_pgd(src),
+              count * sizeof(pgd_t));
+#endif
 }
 
 
index 975f709..a3bf3de 100644 (file)
@@ -105,9 +105,36 @@ static inline void native_pud_clear(pud_t *pud)
        native_set_pud(pud, native_make_pud(0));
 }
 
+#ifdef CONFIG_KAISER
+extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
+
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+       return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
+}
+
+static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
+{
+       return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE);
+}
+#else
+static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+       return pgd;
+}
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+       return NULL;
+}
+static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
+{
+       return pgdp;
+}
+#endif /* CONFIG_KAISER */
+
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
-       *pgdp = pgd;
+       *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
 }
 
 static inline void native_pgd_clear(pgd_t *pgd)
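
The normal and shadow pgd sit one page apart inside a single 8kB,
8kB-aligned allocation, which is what lets native_get_shadow_pgd() and
native_get_normal_pgd() above convert between them with a plain or/and of
PAGE_SIZE.  A small user-space sketch of the same pointer trick, with
aligned_alloc() standing in for the kernel's pgd allocation:

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE 4096UL

    /* Mirror native_get_shadow_pgd()/native_get_normal_pgd() above. */
    static unsigned long *get_shadow_pgd(unsigned long *pgdp)
    {
        return (unsigned long *)((unsigned long)pgdp | PAGE_SIZE);
    }

    static unsigned long *get_normal_pgd(unsigned long *pgdp)
    {
        return (unsigned long *)((unsigned long)pgdp & ~PAGE_SIZE);
    }

    int main(void)
    {
        /* Stand-in for the kernel's 8kB, 8kB-aligned pgd pair. */
        unsigned long *pgd = aligned_alloc(2 * PAGE_SIZE, 2 * PAGE_SIZE);
        unsigned long *shadow;

        if (!pgd)
            return 1;
        shadow = get_shadow_pgd(pgd);

        printf("normal pgd %p, shadow pgd %p (one page apart)\n",
               (void *)pgd, (void *)shadow);
        printf("round trip ok: %s\n",
               get_normal_pgd(shadow) == pgd ? "yes" : "no");

        free(pgd);
        return 0;
    }
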
index 013286a..6e13150 100644 (file)
 #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
 #define _PAGE_DIRTY    (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
 #define _PAGE_PSE      (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
+#ifdef CONFIG_KAISER
+#define _PAGE_GLOBAL   (_AT(pteval_t, 0))
+#else
 #define _PAGE_GLOBAL   (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#endif
 #define _PAGE_UNUSED1  (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
 #define _PAGE_IOMAP    (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
 #define _PAGE_PAT      (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
@@ -62,7 +66,7 @@
 #endif
 
 #define _PAGE_FILE     (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
 #define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |        \
                         _PAGE_ACCESSED | _PAGE_DIRTY)
                         _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
 
+/* The ASID is the lower 12 bits of CR3 */
+#define X86_CR3_PCID_ASID_MASK  (_AC((1<<12)-1,UL))
+
+/* Mask for all the PCID-related bits in CR3: */
+#define X86_CR3_PCID_MASK       (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
+#define X86_CR3_PCID_ASID_KERN  (_AC(0x0,UL))
+
+#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64)
+/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
+#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
+
+#define X86_CR3_PCID_KERN_FLUSH                (X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_FLUSH                (X86_CR3_PCID_ASID_USER)
+#define X86_CR3_PCID_KERN_NOFLUSH      (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_NOFLUSH      (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
+#else
+#define X86_CR3_PCID_ASID_USER  (_AC(0x0,UL))
+/*
+ * PCIDs are unsupported on 32-bit and none of these bits can be
+ * set in CR3:
+ */
+#define X86_CR3_PCID_KERN_FLUSH                (0)
+#define X86_CR3_PCID_USER_FLUSH                (0)
+#define X86_CR3_PCID_KERN_NOFLUSH      (0)
+#define X86_CR3_PCID_USER_NOFLUSH      (0)
+#endif
+
 #define _PAGE_CACHE_MASK       (_PAGE_PCD | _PAGE_PWT)
 #define _PAGE_CACHE_WB         (0)
 #define _PAGE_CACHE_WC         (_PAGE_PWT)
index a9e14a5..360e80d 100644 (file)
@@ -43,6 +43,8 @@
  */
 #define X86_CR3_PWT    0x00000008 /* Page Write Through */
 #define X86_CR3_PCD    0x00000010 /* Page Cache Disable */
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH (_AC(1,ULL) << X86_CR3_PCID_NOFLUSH_BIT)
 
 /*
  * Intel CPU features in CR4
index f7c89e2..048249e 100644 (file)
@@ -266,7 +266,7 @@ struct tss_struct {
 
 } ____cacheline_aligned;
 
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss);
 
 /*
  * Save the original ist values for checking stack pointers during debugging
index e04cbc5..2881959 100644 (file)
@@ -64,27 +64,59 @@ static inline void invpcid_flush_all_nonglobals(void)
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+/*
+ * Declare a couple of kaiser interfaces here for convenience,
+ * to avoid the need for asm/kaiser.h in unexpected places.
+ */
+#ifdef CONFIG_KAISER
+extern void kaiser_setup_pcid(void);
+extern void kaiser_flush_tlb_on_return_to_user(void);
+#else
+static inline void kaiser_setup_pcid(void)
+{
+}
+static inline void kaiser_flush_tlb_on_return_to_user(void)
+{
+}
+#endif
+
 static inline void __native_flush_tlb(void)
 {
+       if (this_cpu_has(X86_FEATURE_INVPCID)) {
+               /*
+                * Note, this works with CR4.PCIDE=0 or 1.
+                */
+               invpcid_flush_all_nonglobals();
+               return;
+       }
+
        /*
         * If current->mm == NULL then we borrow a mm which may change during a
         * task switch and therefore we must not be preempted while we write CR3
         * back:
         */
        preempt_disable();
+       if (this_cpu_has(X86_FEATURE_PCID))
+               kaiser_flush_tlb_on_return_to_user();
        native_write_cr3(native_read_cr3());
        preempt_enable();
 }
 
 static inline void __native_flush_tlb_global(void)
 {
+#ifdef CONFIG_KAISER
+       /* Globals are not used at all */
+       __native_flush_tlb();
+#else
        unsigned long flags;
        unsigned long cr4;
 
-       if (static_cpu_has(X86_FEATURE_INVPCID)) {
+       if (this_cpu_has(X86_FEATURE_INVPCID)) {
                /*
                 * Using INVPCID is considerably faster than a pair of writes
                 * to CR4 sandwiched inside an IRQ flag save/restore.
+                *
+                * Note, this works with CR4.PCIDE=0 or 1.
                 */
                invpcid_flush_all();
                return;
@@ -104,11 +136,39 @@ static inline void __native_flush_tlb_global(void)
        native_write_cr4(cr4);
 
        raw_local_irq_restore(flags);
+#endif
 }
 
 static inline void __native_flush_tlb_single(unsigned long addr)
 {
-       asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+       /*
+        * SIMICS #GP's if you run INVPCID with type 2/3
+        * and X86_CR4_PCIDE clear.  Shame!
+        *
+        * The ASIDs used below are hard-coded.  But, we must not
+        * call invpcid(type=1/2) before CR4.PCIDE=1.  Just call
+        * invlpg in the case we are called early.
+        */
+
+       if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+               if (this_cpu_has(X86_FEATURE_PCID))
+                       kaiser_flush_tlb_on_return_to_user();
+               asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+               return;
+       }
+       /* Flush the address out of both PCIDs. */
+       /*
+        * An optimization here might be to determine addresses
+        * that are only kernel-mapped and only flush the kernel
+        * ASID.  But, userspace flushes are probably much more
+        * important performance-wise.
+        *
+        * Make sure to do only a single invpcid when KAISER is
+        * disabled and we have only a single ASID.
+        */
+       if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER)
+               invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
+       invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
 }
 
 static inline void __flush_tlb_all(void)
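
Roughly, the single-address flush above either falls back to INVLPG and
defers the user-side flush to the next return to user space, or issues one
INVPCID per address-space ID.  A user-space model of that decision, with
the hardware operations reduced to printouts (the names mirror those in
this patch, but nothing here touches a real TLB):

    #include <stdio.h>
    #include <stdbool.h>

    #define X86_CR3_PCID_ASID_KERN 0x0UL
    #define X86_CR3_PCID_ASID_USER 0x80UL  /* 0 when KAISER/PCID is not in use */

    /* Stand-ins for the CPU instructions and the kaiser hook. */
    static void invlpg(unsigned long addr)
    {
        printf("invlpg  %#lx\n", addr);
    }

    static void invpcid_flush_one(unsigned long pcid, unsigned long addr)
    {
        printf("invpcid pcid=%#lx addr=%#lx\n", pcid, addr);
    }

    static void kaiser_flush_tlb_on_return_to_user(void)
    {
        printf("defer full user-PCID flush to the next exit to user space\n");
    }

    static void flush_tlb_single(unsigned long addr,
                                 bool has_invpcid_single, bool has_pcid)
    {
        if (!has_invpcid_single) {
            if (has_pcid)
                kaiser_flush_tlb_on_return_to_user();
            invlpg(addr);
            return;
        }
        /* Flush the address out of both the kernel and the user PCID. */
        if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER)
            invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
        invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
    }

    int main(void)
    {
        flush_tlb_single(0xffff880000001000UL, false, true);
        flush_tlb_single(0xffff880000001000UL, true, true);
        return 0;
    }
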
index 895e4b8..b567c89 100644 (file)
@@ -84,7 +84,7 @@ static const struct cpu_dev __cpuinitconst default_cpu = {
 
 static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
 
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
        /*
         * We need valid kernel segments for data and code in long mode too
@@ -319,6 +319,19 @@ static void setup_pcid(struct cpuinfo_x86 *c)
                         * SDM says that it can't be enabled in 32-bit mode.
                         */
                        set_in_cr4(X86_CR4_PCIDE);
+                       /*
+                        * INVPCID has two "groups" of types:
+                        * 1/2: Invalidate an individual address
+                        * 3/4: Invalidate all contexts
+                        *
+                        * 1/2 take a PCID, but 3/4 do not.  So, 3/4
+                        * ignore the PCID argument in the descriptor.
+                        * But, we have to be careful not to call 1/2
+                        * with an actual non-zero PCID in them before
+                        * we do the above set_in_cr4().
+                        */
+                       if (cpu_has(c, X86_FEATURE_INVPCID))
+                               set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
                } else {
                        /*
                         * flush_tlb_all(), as currently implemented, won't
@@ -331,6 +344,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
                        clear_cpu_cap(c, X86_FEATURE_PCID);
                }
        }
+       kaiser_setup_pcid();
 }
 
 /*
@@ -1115,7 +1129,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
          [DEBUG_STACK - 1]                     = DEBUG_STKSZ
 };
 
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
        [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 
 /* May not be marked __init: used by software suspend */
index 2d4e76b..fb933cd 100644 (file)
@@ -2,10 +2,14 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 
+#include <asm/kaiser.h>
 #include <asm/perf_event.h>
 
 #include "perf_event.h"
 
+static
+DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
+
 /* The size of a BTS record in bytes: */
 #define BTS_RECORD_SIZE                24
 
@@ -60,6 +64,39 @@ void fini_debug_store_on_cpu(int cpu)
        wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
 }
 
+static void *dsalloc(size_t size, gfp_t flags, int node)
+{
+#ifdef CONFIG_KAISER
+       unsigned int order = get_order(size);
+       struct page *page;
+       unsigned long addr;
+
+       page = alloc_pages_node(node, flags | __GFP_ZERO, order);
+       if (!page)
+               return NULL;
+       addr = (unsigned long)page_address(page);
+       if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
+               __free_pages(page, order);
+               addr = 0;
+       }
+       return (void *)addr;
+#else
+       return kmalloc_node(size, flags | __GFP_ZERO, node);
+#endif
+}
+
+static void dsfree(const void *buffer, size_t size)
+{
+#ifdef CONFIG_KAISER
+       if (!buffer)
+               return;
+       kaiser_remove_mapping((unsigned long)buffer, size);
+       free_pages((unsigned long)buffer, get_order(size));
+#else
+       kfree(buffer);
+#endif
+}
+
 static int alloc_pebs_buffer(int cpu)
 {
        struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -70,7 +107,7 @@ static int alloc_pebs_buffer(int cpu)
        if (!x86_pmu.pebs)
                return 0;
 
-       buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
+       buffer = dsalloc(PEBS_BUFFER_SIZE, GFP_KERNEL, node);
        if (unlikely(!buffer))
                return -ENOMEM;
 
@@ -94,7 +131,7 @@ static void release_pebs_buffer(int cpu)
        if (!ds || !x86_pmu.pebs)
                return;
 
-       kfree((void *)(unsigned long)ds->pebs_buffer_base);
+       dsfree((void *)(unsigned long)ds->pebs_buffer_base, PEBS_BUFFER_SIZE);
        ds->pebs_buffer_base = 0;
 }
 
@@ -108,7 +145,7 @@ static int alloc_bts_buffer(int cpu)
        if (!x86_pmu.bts)
                return 0;
 
-       buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
+       buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL, node);
        if (unlikely(!buffer))
                return -ENOMEM;
 
@@ -132,19 +169,15 @@ static void release_bts_buffer(int cpu)
        if (!ds || !x86_pmu.bts)
                return;
 
-       kfree((void *)(unsigned long)ds->bts_buffer_base);
+       dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
        ds->bts_buffer_base = 0;
 }
 
 static int alloc_ds_buffer(int cpu)
 {
-       int node = cpu_to_node(cpu);
-       struct debug_store *ds;
-
-       ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node);
-       if (unlikely(!ds))
-               return -ENOMEM;
+       struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
 
+       memset(ds, 0, sizeof(*ds));
        per_cpu(cpu_hw_events, cpu).ds = ds;
 
        return 0;
@@ -158,7 +191,6 @@ static void release_ds_buffer(int cpu)
                return;
 
        per_cpu(cpu_hw_events, cpu).ds = NULL;
-       kfree(ds);
 }
 
 void release_ds_buffers(void)
index f6daf3c..3a4356a 100644 (file)
@@ -56,6 +56,7 @@
 #include <asm/ftrace.h>
 #include <asm/percpu.h>
 #include <asm/pgtable_types.h>
+#include <asm/kaiser.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -323,6 +324,7 @@ ENDPROC(native_usergs_sysret64)
        testl $3, CS(%rdi)
        je 1f
        SWAPGS
+       SWITCH_KERNEL_CR3
        /*
         * irq_count is used to check if a CPU is already on an interrupt stack
         * or not. While this is essentially redundant with preempt_count it is
@@ -362,6 +364,12 @@ END(save_rest)
 
 /* save complete stack frame */
        .pushsection .kprobes.text, "ax"
+/*
+ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
+ *         ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
+ *         ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
+ *         ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
+ */
 ENTRY(save_paranoid)
        XCPT_FRAME 1 RDI+8
        cld
@@ -387,7 +395,25 @@ ENTRY(save_paranoid)
        js 1f   /* negative -> in kernel */
        SWAPGS
        xorl %ebx,%ebx
-1:     ret
+1:
+#ifdef CONFIG_KAISER
+       /*
+        * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
+        * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
+        * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
+        * unconditionally, but we need to find out whether the reverse
+        * should be done on return (conveyed to paranoid_exit in %ebx).
+        */
+       movq    %cr3, %rax
+       testl   $KAISER_SHADOW_PGD_OFFSET, %eax
+       jz      2f
+       orl     $2, %ebx
+       andq    $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+       orq     x86_cr3_pcid_noflush, %rax
+       movq    %rax, %cr3
+2:
+#endif
+       ret
        CFI_ENDPROC
 END(save_paranoid)
        .popsection
@@ -464,6 +490,7 @@ ENTRY(system_call)
        CFI_REGISTER    rip,rcx
        /*CFI_REGISTER  rflags,r11*/
        SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
        /*
         * A hypervisor implementation might want to use a label
         * after the swapgs, so that it can do the swapgs
@@ -515,6 +542,14 @@ sysret_check:
        CFI_REGISTER    rip,rcx
        RESTORE_ARGS 1,-ARG_SKIP,0
        /*CFI_REGISTER  rflags,r11*/
+       /*
+        * This opens a window where we have a user CR3, but are
+        * running in the kernel.  This makes using the CS
+        * register useless for telling whether or not we need to
+        * switch CR3 in NMIs.  Normal interrupts are OK because
+        * they are off here.
+        */
+       SWITCH_USER_CR3
        movq    PER_CPU_VAR(old_rsp), %rsp
        USERGS_SYSRET64
 
@@ -851,6 +886,14 @@ retint_swapgs:             /* return to user-space */
         */
        DISABLE_INTERRUPTS(CLBR_ANY)
        TRACE_IRQS_IRETQ
+       /*
+        * This opens a window where we have a user CR3, but are
+        * running in the kernel.  This makes using the CS
+        * register useless for telling whether or not we need to
+        * switch CR3 in NMIs.  Normal interrupts are OK because
+        * they are off here.
+        */
+       SWITCH_USER_CR3
        SWAPGS
        jmp restore_args
 
@@ -891,6 +934,7 @@ native_irq_return_ldt:
        pushq_cfi %rax
        pushq_cfi %rdi
        SWAPGS
+       SWITCH_KERNEL_CR3
        movq PER_CPU_VAR(espfix_waddr),%rdi
        movq %rax,(0*8)(%rdi)   /* RAX */
        movq (2*8)(%rsp),%rax   /* RIP */
@@ -906,6 +950,7 @@ native_irq_return_ldt:
        andl $0xffff0000,%eax
        popq_cfi %rdi
        orq PER_CPU_VAR(espfix_stack),%rax
+       SWITCH_USER_CR3
        SWAPGS
        movq %rax,%rsp
        popq_cfi %rax
@@ -1366,30 +1411,40 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
         * is fundamentally NMI-unsafe. (we cannot change the soft and
         * hard flags at once, atomically)
         */
-
-       /* ebx: no swapgs flag */
+/*
+ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
+ *           ebx=1: needs neither swapgs nor SWITCH_USER_CR3
+ *           ebx=2: needs both swapgs and SWITCH_USER_CR3
+ *           ebx=3: needs SWITCH_USER_CR3 but not swapgs
+ */
 ENTRY(paranoid_exit)
        DEFAULT_FRAME
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
-       testl %ebx,%ebx                         /* swapgs needed? */
-       jnz paranoid_restore
-       testl $3,CS(%rsp)
-       jnz   paranoid_userspace
-paranoid_swapgs:
+       movq    %rbx, %r12              /* paranoid_userspace uses %ebx */
+       testl   $3, CS(%rsp)
+       jnz     paranoid_userspace
+paranoid_kernel:
+       movq    %r12, %rbx              /* restore after paranoid_userspace */
        TRACE_IRQS_IRETQ 0
+#ifdef CONFIG_KAISER
+       testl   $2, %ebx                /* SWITCH_USER_CR3 needed? */
+       jz      paranoid_exit_no_switch
+       SWITCH_USER_CR3
+paranoid_exit_no_switch:
+#endif
+       testl   $1, %ebx                /* swapgs needed? */
+       jnz     paranoid_exit_no_swapgs
        SWAPGS_UNSAFE_STACK
+paranoid_exit_no_swapgs:
        RESTORE_ALL 8
-       jmp irq_return
-paranoid_restore:
-       TRACE_IRQS_IRETQ 0
-       RESTORE_ALL 8
-       jmp irq_return
+       jmp     irq_return
+
 paranoid_userspace:
        GET_THREAD_INFO(%rcx)
        movl TI_flags(%rcx),%ebx
        andl $_TIF_WORK_MASK,%ebx
-       jz paranoid_swapgs
+       jz paranoid_kernel
        movq %rsp,%rdi                  /* &pt_regs */
        call sync_regs
        movq %rax,%rsp                  /* switch stack for scheduling */
@@ -1438,6 +1493,13 @@ ENTRY(error_entry)
        movq_cfi r13, R13+8
        movq_cfi r14, R14+8
        movq_cfi r15, R15+8
+       /*
+        * error_entry() always returns with a kernel gsbase and
+        * CR3.  We must also have a kernel CR3/gsbase before
+        * calling TRACE_IRQS_*.  Just unconditionally switch to
+        * the kernel CR3 here.
+        */
+       SWITCH_KERNEL_CR3
        xorl %ebx,%ebx
        testl $3,CS+8(%rsp)
        je error_kernelspace
@@ -1527,22 +1589,31 @@ ENTRY(nmi)
        call do_nmi
 #ifdef CONFIG_TRACE_IRQFLAGS
        /* paranoidexit; without TRACE_IRQS_OFF */
-       /* ebx: no swapgs flag */
+       /* ebx: no-swapgs and kaiser-switch-cr3 flag */
        DISABLE_INTERRUPTS(CLBR_NONE)
-       testl %ebx,%ebx                         /* swapgs needed? */
-       jnz nmi_restore
-       testl $3,CS(%rsp)
-       jnz nmi_userspace
-nmi_swapgs:
+       movq    %rbx, %r12              /* nmi_userspace uses %ebx */
+       testl   $3, CS(%rsp)
+       jnz     nmi_userspace
+nmi_kernel:
+       movq    %r12, %rbx              /* restore after nmi_userspace */
+#ifdef CONFIG_KAISER
+       testl   $2, %ebx                /* SWITCH_USER_CR3 needed? */
+       jz      nmi_exit_no_switch
+       SWITCH_USER_CR3
+nmi_exit_no_switch:
+#endif
+       testl   $1, %ebx                /* swapgs needed? */
+       jnz     nmi_exit_no_swapgs
        SWAPGS_UNSAFE_STACK
-nmi_restore:
+nmi_exit_no_swapgs:
        RESTORE_ALL 8
-       jmp irq_return
+       jmp     irq_return
+
 nmi_userspace:
        GET_THREAD_INFO(%rcx)
        movl TI_flags(%rcx),%ebx
        andl $_TIF_WORK_MASK,%ebx
-       jz nmi_swapgs
+       jz nmi_kernel
        movq %rsp,%rdi                  /* &pt_regs */
        call sync_regs
        movq %rax,%rsp                  /* switch stack for scheduling */
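
The %ebx convention that save_paranoid hands to paranoid_exit above (and
that the NMI exit path reuses) boils down to two bits: bit 0 set means
SWAPGS is not needed on the way out, bit 1 set means SWITCH_USER_CR3 is.
A tiny C sketch decoding the four documented cases:

    #include <stdio.h>
    #include <stdbool.h>

    /* Decode the value save_paranoid leaves in %ebx, per the comments above:
     * bit 0 set -> swapgs is NOT needed on exit
     * bit 1 set -> SWITCH_USER_CR3 IS needed on exit
     */
    static void decode_paranoid_ebx(unsigned int ebx,
                                    bool *need_swapgs, bool *need_cr3_switch)
    {
        *need_swapgs = !(ebx & 1);
        *need_cr3_switch = (ebx & 2) != 0;
    }

    int main(void)
    {
        unsigned int ebx;

        for (ebx = 0; ebx <= 3; ebx++) {
            bool swapgs, cr3;

            decode_paranoid_ebx(ebx, &swapgs, &cr3);
            printf("ebx=%u: swapgs=%-3s switch_user_cr3=%s\n",
                   ebx, swapgs ? "yes" : "no", cr3 ? "yes" : "no");
        }
        return 0;
    }
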
index 94d857f..14cd73b 100644 (file)
@@ -41,6 +41,7 @@
 #include <asm/pgalloc.h>
 #include <asm/setup.h>
 #include <asm/espfix.h>
+#include <asm/kaiser.h>
 
 /*
  * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@@ -129,6 +130,14 @@ void __init init_espfix_bsp(void)
        /* Install the espfix pud into the kernel page directory */
        pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
        pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+       /*
+        * Just copy the top-level PGD that is mapping the espfix
+        * area to ensure it is mapped into the shadow user page
+        * tables.
+        */
+       if (IS_ENABLED(CONFIG_KAISER))
+               set_pgd(native_get_shadow_pgd(pgd_p),
+                       __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
 
        /* Randomize the locations */
        init_espfix_random();
index 0f8ebf7..6e697ac 100644 (file)
@@ -338,6 +338,27 @@ early_idt_ripmsg:
        .balign PAGE_SIZE; \
 ENTRY(name)
 
+#ifdef CONFIG_KAISER
+/*
+ * Each PGD needs to be 8k long and 8k aligned.  We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define KAISER_USER_PGD_FILL   512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+       .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define KAISER_USER_PGD_FILL   0
+#endif
+
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)                       \
        i = 0 ;                                         \
@@ -353,13 +374,14 @@ ENTRY(name)
         * 0xffffffff80000000 to physical address 0x000000. (always using
         * 2Mbyte large pages provided by PAE mode)
         */
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
        .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
        .org    init_level4_pgt + L4_PAGE_OFFSET*8, 0
        .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
        .org    init_level4_pgt + L4_START_KERNEL*8, 0
        /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
        .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .fill   KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(level3_ident_pgt)
        .quad   level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -385,6 +407,7 @@ NEXT_PAGE(level2_ident_pgt)
         * Don't set NX because code runs from these pages.
         */
        PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+       .fill   KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(level2_kernel_pgt)
        /*
index 43e9ccf..f00e6e7 100644 (file)
@@ -38,5 +38,5 @@ EXPORT_SYMBOL(init_task);
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss) = INIT_TSS;
 
index e328f69..990f743 100644 (file)
@@ -85,7 +85,7 @@ static struct irqaction irq2 = {
        .flags = IRQF_NO_THREAD,
 };
 
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
        [0 ... NR_VECTORS - 1] = -1,
 };
 
index 1dd3230..836a4c2 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
+#include <linux/kaiser.h>
 
 #include <asm/system.h>
 #include <asm/ldt.h>
@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
        set_ldt(pc->ldt->entries, pc->ldt->size);
 }
 
+static void __free_ldt_struct(struct ldt_struct *ldt)
+{
+       if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+               vfree(ldt->entries);
+       else
+               free_page((unsigned long)ldt->entries);
+       kfree(ldt);
+}
+
 /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
 static struct ldt_struct *alloc_ldt_struct(int size)
 {
        struct ldt_struct *new_ldt;
        int alloc_size;
+       int ret;
 
        if (size > LDT_ENTRIES)
                return NULL;
@@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
                return NULL;
        }
 
+       ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
+                                __PAGE_KERNEL);
        new_ldt->size = size;
+       if (ret) {
+               __free_ldt_struct(new_ldt);
+               return NULL;
+       }
        return new_ldt;
 }
 
@@ -97,12 +114,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
        if (likely(!ldt))
                return;
 
+       kaiser_remove_mapping((unsigned long)ldt->entries,
+                             ldt->size * LDT_ENTRY_SIZE);
        paravirt_free_ldt(ldt->entries, ldt->size);
-       if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
-               vfree(ldt->entries);
-       else
-               kfree(ldt->entries);
-       kfree(ldt);
+       __free_ldt_struct(ldt);
 }
 
 /*
index 557eb37..d2ce2a3 100644 (file)
@@ -57,7 +57,7 @@
 
 asmlinkage extern void ret_from_fork(void);
 
-DEFINE_PER_CPU(unsigned long, old_rsp);
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, old_rsp);
 static DEFINE_PER_CPU(unsigned char, is_idle);
 
 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
index cf2a840..c9a00a5 100644 (file)
@@ -29,3 +29,4 @@ obj-$(CONFIG_NUMA_EMU)                += numa_emulation.o
 obj-$(CONFIG_HAVE_MEMBLOCK)            += memblock.o
 
 obj-$(CONFIG_MEMTEST)          += memtest.o
+obj-$(CONFIG_KAISER)           += kaiser.o
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
new file mode 100644 (file)
index 0000000..79b0222
--- /dev/null
@@ -0,0 +1,382 @@
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+
+extern struct mm_struct init_mm;
+
+#include <asm/kaiser.h>
+#include <asm/tlbflush.h>      /* to verify its kaiser declarations */
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/desc.h>
+
+#ifdef CONFIG_KAISER
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+/*
+ * These can have bit 63 set, so we can not just use a plain "or"
+ * instruction to get their value or'd into CR3.  It would take
+ * another register.  So, we use a memory reference to these instead.
+ *
+ * This is also handy because systems that do not support PCIDs
+ * just end up or'ing a 0 into their CR3, which does no harm.
+ */
+unsigned long x86_cr3_pcid_noflush __read_mostly;
+DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+/*
+ * At runtime, the only things we map are some things for CPU
+ * hotplug, and stacks for new processes.  No two CPUs will ever
+ * be populating the same addresses, so we only need to ensure
+ * that we protect between two CPUs trying to allocate and
+ * populate the same page table page.
+ *
+ * Only take this lock when doing a set_p[4um]d(), but it is not
+ * needed for doing a set_pte().  We assume that only the *owner*
+ * of a given allocation will be doing this for _their_
+ * allocation.
+ *
+ * This ensures that once a system has been running for a while
+ * and there have been stacks all over and these page tables
+ * are fully populated, there will be no further acquisitions of
+ * this lock.
+ */
+static DEFINE_SPINLOCK(shadow_table_allocation_lock);
+
+/*
+ * Returns -1 on error.
+ */
+static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pgd = pgd_offset_k(vaddr);
+       /*
+        * We made all the kernel PGDs present in kaiser_init().
+        * We expect them to stay that way.
+        */
+       BUG_ON(pgd_none(*pgd));
+       /*
+        * PGDs are either 512GB or 128TB on all x86_64
+        * configurations.  We don't handle these.
+        */
+       BUG_ON(pgd_large(*pgd));
+
+       pud = pud_offset(pgd, vaddr);
+       if (pud_none(*pud)) {
+               WARN_ON_ONCE(1);
+               return -1;
+       }
+
+       if (pud_large(*pud))
+               return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
+
+       pmd = pmd_offset(pud, vaddr);
+       if (pmd_none(*pmd)) {
+               WARN_ON_ONCE(1);
+               return -1;
+       }
+
+       if (pmd_large(*pmd))
+               return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
+
+       pte = pte_offset_kernel(pmd, vaddr);
+       if (pte_none(*pte)) {
+               WARN_ON_ONCE(1);
+               return -1;
+       }
+
+       return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
+}
+
+/*
+ * This is a relatively normal page table walk, except that it
+ * also tries to allocate page tables pages along the way.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static pte_t *kaiser_pagetable_walk(unsigned long address)
+{
+       pmd_t *pmd;
+       pud_t *pud;
+       pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
+       gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+       if (pgd_none(*pgd)) {
+               WARN_ONCE(1, "All shadow pgds should have been populated");
+               return NULL;
+       }
+       BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+       pud = pud_offset(pgd, address);
+       /* The shadow page tables do not use large mappings: */
+       if (pud_large(*pud)) {
+               WARN_ON(1);
+               return NULL;
+       }
+       if (pud_none(*pud)) {
+               unsigned long new_pmd_page = __get_free_page(gfp);
+               if (!new_pmd_page)
+                       return NULL;
+               spin_lock(&shadow_table_allocation_lock);
+               if (pud_none(*pud)) {
+                       set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+                       __inc_zone_page_state(virt_to_page((void *)
+                                               new_pmd_page), NR_KAISERTABLE);
+               } else
+                       free_page(new_pmd_page);
+               spin_unlock(&shadow_table_allocation_lock);
+       }
+
+       pmd = pmd_offset(pud, address);
+       /* The shadow page tables do not use large mappings: */
+       if (pmd_large(*pmd)) {
+               WARN_ON(1);
+               return NULL;
+       }
+       if (pmd_none(*pmd)) {
+               unsigned long new_pte_page = __get_free_page(gfp);
+               if (!new_pte_page)
+                       return NULL;
+               spin_lock(&shadow_table_allocation_lock);
+               if (pmd_none(*pmd)) {
+                       set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+                       __inc_zone_page_state(virt_to_page((void *)
+                                               new_pte_page), NR_KAISERTABLE);
+               } else
+                       free_page(new_pte_page);
+               spin_unlock(&shadow_table_allocation_lock);
+       }
+
+       return pte_offset_kernel(pmd, address);
+}
+
+int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+                       unsigned long flags)
+{
+       int ret = 0;
+       pte_t *pte;
+       unsigned long start_addr = (unsigned long )__start_addr;
+       unsigned long address = start_addr & PAGE_MASK;
+       unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+       unsigned long target_address;
+
+       for (; address < end_addr; address += PAGE_SIZE) {
+               target_address = get_pa_from_mapping(address);
+               if (target_address == -1) {
+                       ret = -EIO;
+                       break;
+               }
+               pte = kaiser_pagetable_walk(address);
+               if (!pte) {
+                       ret = -ENOMEM;
+                       break;
+               }
+               if (pte_none(*pte)) {
+                       set_pte(pte, __pte(flags | target_address));
+               } else {
+                       pte_t tmp;
+                       set_pte(&tmp, __pte(flags | target_address));
+                       WARN_ON_ONCE(!pte_same(*pte, tmp));
+               }
+       }
+       return ret;
+}
+
+static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
+{
+       unsigned long size = end - start;
+
+       return kaiser_add_user_map(start, size, flags);
+}
+
+/*
+ * Ensure that the top level of the (shadow) page tables are
+ * entirely populated.  This ensures that all processes that get
+ * forked have the same entries.  This way, we do not have to
+ * ever go set up new entries in older processes.
+ *
+ * Note: we never free these, so there are no updates to them
+ * after this.
+ */
+static void __init kaiser_init_all_pgds(void)
+{
+       pgd_t *pgd;
+       int i = 0;
+
+       pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0));
+       for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
+               pgd_t new_pgd;
+               pud_t *pud = pud_alloc_one(&init_mm,
+                                          PAGE_OFFSET + i * PGDIR_SIZE);
+               if (!pud) {
+                       WARN_ON(1);
+                       break;
+               }
+               inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
+               new_pgd = __pgd(_KERNPG_TABLE | __pa(pud));
+               /*
+                * Make sure not to stomp on some other pgd entry.
+                */
+               if (!pgd_none(pgd[i])) {
+                       WARN_ON(1);
+                       continue;
+               }
+               set_pgd(pgd + i, new_pgd);
+       }
+}
+
+#define kaiser_add_user_map_early(start, size, flags) do {     \
+       int __ret = kaiser_add_user_map(start, size, flags);    \
+       WARN_ON(__ret);                                         \
+} while (0)
+
+#define kaiser_add_user_map_ptrs_early(start, end, flags) do {         \
+       int __ret = kaiser_add_user_map_ptrs(start, end, flags);        \
+       WARN_ON(__ret);                                                 \
+} while (0)
+
+/*
+ * If anything in here fails, we will likely die on one of the
+ * first kernel->user transitions and init will die.  But, we
+ * will have most of the kernel up by then and should be able to
+ * get a clean warning out of it.  If we BUG_ON() here, we run
+ * the risk of doing so before we have good console output.
+ */
+void __init kaiser_init(void)
+{
+       int cpu;
+
+       kaiser_init_all_pgds();
+
+       for_each_possible_cpu(cpu) {
+               void *percpu_vaddr = __per_cpu_user_mapped_start +
+                                    per_cpu_offset(cpu);
+               unsigned long percpu_sz = __per_cpu_user_mapped_end -
+                                         __per_cpu_user_mapped_start;
+               kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
+                                         __PAGE_KERNEL);
+       }
+
+       /*
+        * Map the entry/exit text section, which is needed at
+        * switches from user to and from kernel.
+        */
+       kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
+                                      __PAGE_KERNEL_RX);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       kaiser_add_user_map_ptrs_early(__irqentry_text_start,
+                                      __irqentry_text_end,
+                                      __PAGE_KERNEL_RX);
+#endif
+       kaiser_add_user_map_early((void *)idt_descr.address,
+                                 sizeof(gate_desc) * NR_VECTORS,
+                                 __PAGE_KERNEL_RO);
+       kaiser_add_user_map_early(&x86_cr3_pcid_noflush,
+                                 sizeof(x86_cr3_pcid_noflush),
+                                 __PAGE_KERNEL);
+}
+
+/* Add a mapping to the shadow mapping, and synchronize the mappings */
+int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+       return kaiser_add_user_map((const void *)addr, size, flags);
+}
+
+void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+       unsigned long end = start + size;
+       unsigned long addr;
+       pte_t *pte;
+
+       for (addr = start; addr < end; addr += PAGE_SIZE) {
+               pte = kaiser_pagetable_walk(addr);
+               if (pte)
+                       set_pte(pte, __pte(0));
+       }
+}
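
kaiser_add_mapping() and kaiser_remove_mapping() are intended to be used as a
matched pair around objects that must stay visible on the user CR3, as the
thread-stack helpers further down do.  A hedged sketch of such a pairing,
with invented helper names:

    /* Hypothetical: allocate a buffer that must stay mapped on the user CR3. */
    static void *alloc_user_visible(unsigned long size)
    {
            void *buf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
                                                 get_order(size));

            if (!buf)
                    return NULL;
            if (kaiser_add_mapping((unsigned long)buf, size, __PAGE_KERNEL)) {
                    free_pages((unsigned long)buf, get_order(size));
                    return NULL;
            }
            return buf;
    }

    static void free_user_visible(void *buf, unsigned long size)
    {
            kaiser_remove_mapping((unsigned long)buf, size);
            free_pages((unsigned long)buf, get_order(size));
    }
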
+
+/*
+ * Page table pages are page-aligned.  The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ * This returns true for user pages that need to get copied into
+ * both the user and kernel copies of the page tables, and false
+ * for kernel pages that should only be in the kernel copy.
+ */
+static inline bool is_userspace_pgd(pgd_t *pgdp)
+{
+       return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
+}
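
A quick worked example of that test, assuming 4k pages and 8-byte pgd entries
(so PTRS_PER_PGD == 512), with pgd pointing at the page-aligned base of a pgd
page:

    is_userspace_pgd(pgd + 255);   /* byte offset 2040 < 2048 -> true  (user half)   */
    is_userspace_pgd(pgd + 256);   /* byte offset 2048        -> false (kernel half) */
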
+
+pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+       /*
+        * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
+        * skip cases like kexec and EFI which make temporary low mappings.
+        */
+       if (pgd.pgd & _PAGE_USER) {
+               if (is_userspace_pgd(pgdp)) {
+                       native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+                       /*
+                        * Even if the entry is *mapping* userspace, ensure
+                        * that userspace can not use it.  This way, if we
+                        * get out to userspace running on the kernel CR3,
+                        * userspace will crash instead of running.
+                        */
+                       pgd.pgd |= _PAGE_NX;
+               }
+       } else if (!pgd.pgd) {
+               /*
+                * pgd_clear() cannot check _PAGE_USER, and is even used to
+                * clear corrupted pgd entries: so just rely on cases like
+                * kexec and EFI never to be using pgd_clear().
+                */
+               if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
+                   is_userspace_pgd(pgdp))
+                       native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+       }
+       return pgd;
+}
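
kaiser_set_shadow_pgd() is not called directly by generic code; it is meant to
be hooked into pgd writes, via native_set_pgd() in pgtable_64.h elsewhere in
this series.  A sketch of that wiring, not the verbatim header change:

    static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
    {
    #ifdef CONFIG_KAISER
            /* mirror userspace entries into the shadow pgd, set NX on the kernel copy */
            *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
    #else
            *pgdp = pgd;
    #endif
    }
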
+
+void kaiser_setup_pcid(void)
+{
+       unsigned long kern_cr3 = 0;
+       unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
+
+       if (this_cpu_has(X86_FEATURE_PCID)) {
+               kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH;
+               user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
+       }
+       /*
+        * These variables are used by the entry/exit
+        * code to change PCID and pgd and TLB flushing.
+        */
+       x86_cr3_pcid_noflush = kern_cr3;
+       this_cpu_write(x86_cr3_pcid_user, user_cr3);
+}
+
+/*
+ * Make a note that this cpu will need to flush USER tlb on return to user.
+ * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling:
+ * if the cpu lacks PCID, then the NOFLUSH bit will never have been set.
+ */
+void kaiser_flush_tlb_on_return_to_user(void)
+{
+       this_cpu_write(x86_cr3_pcid_user,
+                       X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
+}
+EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
+#endif /* CONFIG_KAISER */
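
For concreteness, here is what the values set up by kaiser_setup_pcid() and
kaiser_flush_tlb_on_return_to_user() work out to, assuming (as in the rest of
this series) KAISER_SHADOW_PGD_OFFSET == 0x1000, a user ASID of 0x80 and
X86_CR3_PCID_NOFLUSH being CR3 bit 63; those constants are defined outside
this hunk, so treat the numbers as illustrative:

    /*
     * With PCID support, after kaiser_setup_pcid():
     *   x86_cr3_pcid_noflush         = 0x8000000000000000  (kernel PCID 0, NOFLUSH)
     *   this cpu's x86_cr3_pcid_user = 0x8000000000001080  (user PCID 0x80, shadow
     *                                                       pgd at +0x1000, NOFLUSH)
     * After kaiser_flush_tlb_on_return_to_user():
     *   this cpu's x86_cr3_pcid_user = 0x0000000000001080  (NOFLUSH clear, so the
     *                                                       user PCID gets flushed)
     * The entry/exit assembly combines these with the physical address of the
     * kernel pgd to build the CR3 values it loads on kernel entry and on
     * return to userspace.
     */
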
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8573b83..7328560 100644
@@ -5,7 +5,7 @@
 #include <asm/tlb.h>
 #include <asm/fixmap.h>
 
-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
 
 #ifdef CONFIG_HIGHPTE
 #define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -253,12 +253,35 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
        }
 }
 
+#ifdef CONFIG_KAISER
+/*
+ * Instead of one pgd, we acquire two pgds.  Being order-1, it is
+ * both 8k in size and 8k-aligned.  That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
+#else
+#define PGD_ALLOCATION_ORDER 0
+#endif
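
Given that layout, a pgd and its shadow differ only in bit 12 of their
addresses, which is how native_get_shadow_pgd(), used throughout kaiser.c
above and defined elsewhere in this patch, can reach the shadow half.
Approximately:

    /* sketch: the shadow pgd occupies the second 4k page of the order-1 pair */
    static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
    {
            return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
    }
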
+
+static inline pgd_t *_pgd_alloc(void)
+{
+       /* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */
+       return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT,
+                                        PGD_ALLOCATION_ORDER);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+       free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
+}
+
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
        pgd_t *pgd;
        pmd_t *pmds[PREALLOCATED_PMDS];
 
-       pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
+       pgd = _pgd_alloc();
 
        if (pgd == NULL)
                goto out;
@@ -288,7 +311,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 out_free_pmds:
        free_pmds(pmds);
 out_free_pgd:
-       free_page((unsigned long)pgd);
+       _pgd_free(pgd);
 out:
        return NULL;
 }
@@ -298,7 +321,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
        pgd_mop_up_pmds(mm, pgd);
        pgd_dtor(pgd);
        paravirt_pgd_free(mm, pgd);
-       free_page((unsigned long)pgd);
+       _pgd_free(pgd);
 }
 
 int ptep_set_access_flags(struct vm_area_struct *vma,
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 4f5ca8f..4078e30 100644
 #include <asm/cache.h>
 #include <asm/apic.h>
 #include <asm/uv/uv.h>
+#include <asm/kaiser.h>
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
                        = { &init_mm, 0, };
 
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
                        = { &init_mm, 0, };
 
+static void load_new_mm_cr3(pgd_t *pgdir)
+{
+       unsigned long new_mm_cr3 = __pa(pgdir);
+
+#ifdef CONFIG_KAISER
+       if (this_cpu_has(X86_FEATURE_PCID)) {
+               /*
+                * We reuse the same PCID for different tasks, so we must
+                * flush all the entries for the PCID out when we change tasks.
+                * Flush KERN below, flush USER when returning to userspace in
+                * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
+                *
+                * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
+                * do it here, but can only be used if X86_FEATURE_INVPCID is
+                * available - and many machines support pcid without invpcid.
+                *
+                * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0;
+                * but keep that line in there in case something changes.
+                */
+               new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
+               kaiser_flush_tlb_on_return_to_user();
+       }
+#endif /* CONFIG_KAISER */
+
+       /*
+        * Caution: many callers of this function expect
+        * that load_new_mm_cr3() is serializing and orders TLB
+        * fills with respect to the mm_cpumask writes.
+        */
+       write_cr3(new_mm_cr3);
+}
+
 /*
  *     TLB flushing, formerly SMP-only
  *             c/o Linus Torvalds.
@@ -65,7 +98,7 @@ void leave_mm(int cpu)
                BUG();
        cpumask_clear_cpu(cpu,
                          mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
-       load_cr3(swapper_pg_dir);
+       load_new_mm_cr3(swapper_pg_dir);
 }
 EXPORT_SYMBOL_GPL(leave_mm);
 
@@ -113,11 +146,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                 * from next->pgd.  TLB fills are special and can happen
                 * due to instruction fetches or for no reason at all,
                 * and neither LOCK nor MFENCE orders them.
-                * Fortunately, load_cr3() is serializing and gives the
-                * ordering guarantee we need.
-                *
+                * Fortunately, load_new_mm_cr3() is serializing
+                * and gives the  ordering guarantee we need.
                 */
-               load_cr3(next->pgd);
+               load_new_mm_cr3(next->pgd);
 
                /* stop flush ipis for the previous mm */
                cpumask_clear_cpu(cpu, mm_cpumask(prev));
@@ -136,10 +168,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                         * tlb flush IPI delivery. We must reload CR3
                         * to make sure to use no freed page tables.
                         *
-                        * As above, load_cr3() is serializing and orders TLB
-                        * fills with respect to the mm_cpumask write.
+                        * As above, load_new_mm_cr3() is serializing and orders
+                        * TLB fills with respect to the mm_cpumask write.
                         */
-                       load_cr3(next->pgd);
+                       load_new_mm_cr3(next->pgd);
                        load_mm_ldt(next);
                }
        }
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index b5e2e4c..01c8155 100644
  */
 #define PERCPU_INPUT(cacheline)                                                \
        VMLINUX_SYMBOL(__per_cpu_start) = .;                            \
+       VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .;                \
        *(.data..percpu..first)                                         \
+       . = ALIGN(cacheline);                                           \
+       *(.data..percpu..user_mapped)                                   \
+       *(.data..percpu..user_mapped..shared_aligned)                   \
+       . = ALIGN(PAGE_SIZE);                                           \
+       *(.data..percpu..user_mapped..page_aligned)                     \
+       VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .;                  \
        . = ALIGN(PAGE_SIZE);                                           \
        *(.data..percpu..page_aligned)                                  \
        . = ALIGN(cacheline);                                           \
diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
new file mode 100644
index 0000000..4a4d6d9
--- /dev/null
@@ -0,0 +1,52 @@
+#ifndef _LINUX_KAISER_H
+#define _LINUX_KAISER_H
+
+#ifdef CONFIG_KAISER
+#include <asm/kaiser.h>
+
+static inline int kaiser_map_thread_stack(void *stack)
+{
+       /*
+        * Map that page of kernel stack on which we enter from user context.
+        */
+       return kaiser_add_mapping((unsigned long)stack +
+                       THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
+}
+
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+       /*
+        * Note: may be called even when kaiser_map_thread_stack() failed.
+        */
+       kaiser_remove_mapping((unsigned long)stack +
+                       THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
+}
+#else
+
+/*
+ * These stubs are used whenever CONFIG_KAISER is off, which
+ * includes architectures that support KAISER but have it disabled.
+ */
+
+static inline void kaiser_init(void)
+{
+}
+static inline int kaiser_add_mapping(unsigned long addr,
+                                    unsigned long size, unsigned long flags)
+{
+       return 0;
+}
+static inline void kaiser_remove_mapping(unsigned long start,
+                                        unsigned long size)
+{
+}
+static inline int kaiser_map_thread_stack(void *stack)
+{
+       return 0;
+}
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+}
+
+#endif /* !CONFIG_KAISER */
+#endif /* _LINUX_KAISER_H */
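
A worked example of the stack arithmetic above, assuming 8k kernel stacks
(THREAD_SIZE == 2 * PAGE_SIZE; the address is made up for illustration):

    /*
     * For a stack allocated at 0xffff8800c5ac8000, only the page at
     * 0xffff8800c5ac9000, the one holding the pt_regs/iret frame used on
     * entry from userspace, is added to and removed from the shadow tables
     * by kaiser_map_thread_stack()/kaiser_unmap_thread_stack().
     */
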
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 25842b6..a0b4422 100644
@@ -95,8 +95,9 @@ enum zone_stat_item {
        NR_SLAB_RECLAIMABLE,
        NR_SLAB_UNRECLAIMABLE,
        NR_PAGETABLE,           /* used for pagetables */
-       NR_KERNEL_STACK,
        /* Second 128 byte cacheline */
+       NR_KERNEL_STACK,
+       NR_KAISERTABLE,
        NR_UNSTABLE_NFS,        /* NFS unstable pages */
        NR_BOUNCE,
        NR_VMSCAN_WRITE,
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 27ef6b1..56f5eeb 100644
        (void)__vpp_verify;                                             \
 } while (0)
 
+#ifdef CONFIG_KAISER
+#define USER_MAPPED_SECTION "..user_mapped"
+#else
+#define USER_MAPPED_SECTION ""
+#endif
+
 /*
  * s390 and alpha modules require percpu variables to be defined as
  * weak to force the compiler to generate GOT based external
 #define DEFINE_PER_CPU(type, name)                                     \
        DEFINE_PER_CPU_SECTION(type, name, "")
 
+#define DECLARE_PER_CPU_USER_MAPPED(type, name)                                \
+       DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
+#define DEFINE_PER_CPU_USER_MAPPED(type, name)                         \
+       DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
 /*
  * Declaration/definition used for per-CPU variables that must come first in
  * the set of variables.
        DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
        ____cacheline_aligned_in_smp
 
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)         \
+       DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+       ____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)          \
+       DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+       ____cacheline_aligned_in_smp
+
 #define DECLARE_PER_CPU_ALIGNED(type, name)                            \
        DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION)    \
        ____cacheline_aligned
 #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)                                \
        DEFINE_PER_CPU_SECTION(type, name, "..page_aligned")            \
        __aligned(PAGE_SIZE)
+/*
+ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
+ */
+#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)           \
+       DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+       __aligned(PAGE_SIZE)
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)            \
+       DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+       __aligned(PAGE_SIZE)
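
These _USER_MAPPED variants are what place a per-cpu variable inside the
__per_cpu_user_mapped_start/__per_cpu_user_mapped_end window defined in the
linker script above, which kaiser_init() maps into the shadow page tables.
A hypothetical use (my_entry_scratch is an invented name):

    /* lands in .data..percpu..user_mapped, so it stays visible on the user CR3 */
    DEFINE_PER_CPU_USER_MAPPED(unsigned long, my_entry_scratch);
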
 
 /*
  * Declaration/definition used for per-CPU variables that must be read mostly.
  */
-#define DECLARE_PER_CPU_READ_MOSTLY(type, name)                        \
+#define DECLARE_PER_CPU_READ_MOSTLY(type, name)                                \
        DECLARE_PER_CPU_SECTION(type, name, "..readmostly")
 
 #define DEFINE_PER_CPU_READ_MOSTLY(type, name)                         \
diff --git a/init/main.c b/init/main.c
index e937d9b..558a9fd 100644
@@ -69,6 +69,7 @@
 #include <linux/slab.h>
 #include <linux/perf_event.h>
 #include <linux/random.h>
+#include <linux/kaiser.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -463,6 +464,7 @@ static void __init mm_init(void)
        percpu_init_late();
        pgtable_cache_init();
        vmalloc_init();
+       kaiser_init();
 }
 
 asmlinkage void __init start_kernel(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index 29b4604..511131a 100644
@@ -55,6 +55,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/freezer.h>
+#include <linux/kaiser.h>
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
@@ -133,6 +134,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 
 static inline void free_thread_info(struct thread_info *ti)
 {
+       kaiser_unmap_thread_stack(ti);
        free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
 }
 #endif
@@ -275,6 +277,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 
        tsk->stack = ti;
 
+       err = kaiser_map_thread_stack(tsk->stack);
+       if (err)
+               goto out;
+
        setup_thread_stack(tsk, orig);
        clear_user_return_notifier(tsk);
        clear_tsk_need_resched(tsk);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index ff90609..eaf3db0 100644
@@ -699,6 +699,7 @@ const char * const vmstat_text[] = {
        "nr_slab_unreclaimable",
        "nr_page_table_pages",
        "nr_kernel_stack",
+       "nr_overhead",
        "nr_unstable",
        "nr_bounce",
        "nr_vmscan_write",
diff --git a/security/Kconfig b/security/Kconfig
index 51bd5a0..19f8319 100644
@@ -96,6 +96,16 @@ config SECURITY
 
          If you are unsure how to answer this question, answer N.
 
+config KAISER
+       bool "Remove the kernel mapping in user mode"
+       default y
+       depends on X86_64 && SMP && !PARAVIRT
+       help
+         This enforces a strict kernel and user space isolation, in order
+         to close hardware side channels on kernel address information.
+
+         If you are unsure how to answer this question, answer Y.
+
 config SECURITYFS
        bool "Enable the securityfs filesystem"
        help