KAISER: Kernel Address Isolation
author  Hugh Dickins <hughd@google.com>
Tue, 12 Dec 2017 01:59:50 +0000 (17:59 -0800)
committer  Ben Hutchings <ben@decadent.org.uk>
Sun, 7 Jan 2018 01:46:49 +0000 (01:46 +0000)
This patch introduces our implementation of KAISER (Kernel Address
Isolation to have Side-channels Efficiently Removed), a kernel isolation
technique to close hardware side channels on kernel address information.
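
As a rough sketch of the idea (user-space C for illustration only; the
table contents and addresses below are invented): each process ends up
with two top-level page tables.  The normal pgd maps everything; the
shadow pgd maps all of userspace but only a minimal slice of the kernel,
and it is the one in use while running in user mode.

    #include <stdio.h>

    #define PTRS_PER_PGD 512                  /* x86_64 top-level entries  */
    #define KERNEL_HALF  (PTRS_PER_PGD / 2)   /* entries 256..511 = kernel */

    int main(void)
    {
        unsigned long kernel_pgd[PTRS_PER_PGD] = { 0 };
        unsigned long shadow_pgd[PTRS_PER_PGD] = { 0 };
        int i;

        /* Pretend the kernel half of the normal pgd is fully populated. */
        for (i = KERNEL_HALF; i < PTRS_PER_PGD; i++)
            kernel_pgd[i] = 0x1000UL * i;

        /* User mappings are entered into both roots. */
        kernel_pgd[0] = shadow_pgd[0] = 0xabc000UL;

        /* The shadow root gets only a minimal kernel entry: enough for
         * the entry/exit code, the stacks and a few per-cpu structures. */
        shadow_pgd[KERNEL_HALF] = 0xdef000UL;

        printf("user slot 0:     normal=%lx shadow=%lx\n",
               kernel_pgd[0], shadow_pgd[0]);
        printf("kernel slot 300: normal=%lx shadow=%lx (not mapped)\n",
               kernel_pgd[300], shadow_pgd[300]);
        return 0;
    }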

More information about the original patch can be found at:
https://github.com/IAIK/KAISER
http://marc.info/?l=linux-kernel&m=149390087310405&w=2

Daniel Gruss <daniel.gruss@iaik.tugraz.at>
Richard Fellner <richard.fellner@student.tugraz.at>
Michael Schwarz <michael.schwarz@iaik.tugraz.at>
Clementine Maurice <clementine.maurice@iaik.tugraz.at>
Moritz Lipp <moritz.lipp@iaik.tugraz.at>

That original was then developed further by
Dave Hansen <dave.hansen@intel.com>
Hugh Dickins <hughd@google.com>
and then by others after this snapshot.

This combined patch for 3.2.96 was derived from hughd's patches below
for 3.18.72, in 2017-12-04's kaiser-3.18.72.tar; except for the last,
which was sent in 2017-12-09's nokaiser-3.18.72.tar.  They have been
combined in order to minimize the effort of rebasing: most of the
patches in the 3.18.72 series were small fixes and cleanups and
enhancements to three large patches.  About the only new work in this
backport is a simple reimplementation of kaiser_remove_mapping():
mm/pageattr.c changed a lot between 3.2 and 3.18, and the
modifications made there for Kaiser never seemed necessary.

KAISER: Kernel Address Isolation
kaiser: merged update
kaiser: do not set _PAGE_NX on pgd_none
kaiser: stack map PAGE_SIZE at THREAD_SIZE-PAGE_SIZE
kaiser: fix build and FIXME in alloc_ldt_struct()
kaiser: KAISER depends on SMP
kaiser: fix regs to do_nmi() ifndef CONFIG_KAISER
kaiser: fix perf crashes
kaiser: ENOMEM if kaiser_pagetable_walk() NULL
kaiser: tidied up asm/kaiser.h somewhat
kaiser: tidied up kaiser_add/remove_mapping slightly
kaiser: kaiser_remove_mapping() move along the pgd
kaiser: align addition to x86/mm/Makefile
kaiser: cleanups while trying for gold link
kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET
kaiser: delete KAISER_REAL_SWITCH option
kaiser: vmstat show NR_KAISERTABLE as nr_overhead
kaiser: enhanced by kernel and user PCIDs
kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user
kaiser: PCID 0 for kernel and 128 for user
kaiser: x86_cr3_pcid_noflush and x86_cr3_pcid_user
kaiser: paranoid_entry pass cr3 need to paranoid_exit
kaiser: _pgd_alloc() without __GFP_REPEAT to avoid stalls
kaiser: fix unlikely error in alloc_ldt_struct()
kaiser: drop is_atomic arg to kaiser_pagetable_walk()

Signed-off-by: Hugh Dickins <hughd@google.com>
[bwh:
 - Fixed the #undef in arch/x86/boot/compressed/misc.h
 - Added missing #include in arch/x86/mm/kaiser.c]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
33 files changed:
arch/x86/boot/compressed/misc.h
arch/x86/ia32/ia32entry.S
arch/x86/include/asm/cpufeature.h
arch/x86/include/asm/desc.h
arch/x86/include/asm/hw_irq.h
arch/x86/include/asm/kaiser.h [new file with mode: 0644]
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_64.h
arch/x86/include/asm/pgtable_types.h
arch/x86/include/asm/processor-flags.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/tlbflush.h
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/perf_event_intel_ds.c
arch/x86/kernel/entry_64.S
arch/x86/kernel/espfix_64.c
arch/x86/kernel/head_64.S
arch/x86/kernel/init_task.c
arch/x86/kernel/irqinit.c
arch/x86/kernel/ldt.c
arch/x86/kernel/process_64.c
arch/x86/mm/Makefile
arch/x86/mm/kaiser.c [new file with mode: 0644]
arch/x86/mm/pgtable.c
arch/x86/mm/tlb.c
include/asm-generic/vmlinux.lds.h
include/linux/kaiser.h [new file with mode: 0644]
include/linux/mmzone.h
include/linux/percpu-defs.h
init/main.c
kernel/fork.c
mm/vmstat.c
security/Kconfig

index 3f19c81..2fa2635 100644 (file)
@@ -7,6 +7,7 @@
  * we just keep it from happening
  */
 #undef CONFIG_PARAVIRT
+#undef CONFIG_KAISER
 #ifdef CONFIG_X86_32
 #define _ASM_X86_DESC_H 1
 #endif
index 2b55277..7eb0d47 100644 (file)
@@ -12,6 +12,8 @@
 #include <asm/ia32_unistd.h>   
 #include <asm/thread_info.h>   
 #include <asm/segment.h>
+#include <asm/pgtable_types.h>
+#include <asm/kaiser.h>
 #include <asm/irqflags.h>
 #include <linux/linkage.h>
 
@@ -120,6 +122,7 @@ ENTRY(ia32_sysenter_target)
        CFI_DEF_CFA     rsp,0
        CFI_REGISTER    rsp,rbp
        SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
        movq    PER_CPU_VAR(kernel_stack), %rsp
        addq    $(KERNEL_STACK_OFFSET),%rsp
        /*
@@ -183,6 +186,7 @@ sysexit_from_sys_call:
        popq_cfi %rcx                           /* User %esp */
        CFI_REGISTER rsp,rcx
        TRACE_IRQS_ON
+       SWITCH_USER_CR3
        ENABLE_INTERRUPTS_SYSEXIT32
 
 #ifdef CONFIG_AUDITSYSCALL
@@ -281,6 +285,7 @@ ENTRY(ia32_cstar_target)
        CFI_REGISTER    rip,rcx
        /*CFI_REGISTER  rflags,r11*/
        SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
        movl    %esp,%r8d
        CFI_REGISTER    rsp,r8
        movq    PER_CPU_VAR(kernel_stack),%rsp
@@ -337,6 +342,7 @@ sysretl_from_sys_call:
        xorq    %r9,%r9
        xorq    %r8,%r8
        TRACE_IRQS_ON
+       SWITCH_USER_CR3
        movl RSP-ARGOFFSET(%rsp),%esp
        CFI_RESTORE rsp
        USERGS_SYSRET32
@@ -409,6 +415,7 @@ ENTRY(ia32_syscall)
        CFI_REL_OFFSET  rip,RIP-RIP
        PARAVIRT_ADJUST_EXCEPTION_FRAME
        SWAPGS
+       SWITCH_KERNEL_CR3_NO_STACK
        /*
         * No need to follow this irqs on/off section: the syscall
         * disabled irqs and here we enable it straight after entry:
index 6f254f2..7362726 100644 (file)
 #define X86_FEATURE_PLN                (7*32+ 5) /* Intel Power Limit Notification */
 #define X86_FEATURE_PTS                (7*32+ 6) /* Intel Package Thermal Status */
 #define X86_FEATURE_DTHERM     (7*32+ 7) /* Digital Thermal Sensor */
+#define X86_FEATURE_INVPCID_SINGLE (7*32+ 8) /* Effectively INVPCID && CR4.PCIDE=1 */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  (8*32+ 0) /* Intel TPR Shadow */
index 382ce8a..7f1ead9 100644 (file)
@@ -40,7 +40,7 @@ struct gdt_page {
        struct desc_struct gdt[GDT_ENTRIES];
 } __attribute__((aligned(PAGE_SIZE)));
 
-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
 
 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
 {
index eb92a6e..3354a39 100644 (file)
@@ -164,7 +164,7 @@ extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *);
 extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
 
 typedef int vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
 extern void setup_vector_irq(int cpu);
 
 #ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
new file mode 100644 (file)
index 0000000..6f4c8ef
--- /dev/null
@@ -0,0 +1,126 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+
+#include <asm/processor-flags.h> /* For PCID constants */
+
+/*
+ * This file includes the definitions for the KAISER feature.
+ * KAISER is a countermeasure against x86_64 side-channel attacks on
+ * kernel virtual memory.  It keeps a shadow pgd for every process: the
+ * shadow pgd maps only a minimal set of kernel code and data, but includes
+ * the whole of user memory.  On a switch into the kernel, or when an
+ * interrupt is handled, the pgd is switched to the normal one.  When the
+ * system returns to user mode, the shadow pgd is loaded instead, so most
+ * of the kernel address space is simply not mapped there, and userspace
+ * cannot mount side-channel attacks against the whole of kernel memory.
+ *
+ * The minimal kernel mapping holds the parts that must remain mapped in
+ * user mode, such as the entry/exit functions and the kernel stacks.
+ */
+
+#define KAISER_SHADOW_PGD_OFFSET 0x1000
+
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_KAISER
+
+.macro _SWITCH_TO_KERNEL_CR3 reg
+movq %cr3, \reg
+andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
+orq  x86_cr3_pcid_noflush, \reg
+movq \reg, %cr3
+.endm
+
+.macro _SWITCH_TO_USER_CR3 reg regb
+/*
+ * regb must be the low byte portion of reg: because we have arranged
+ * for the low byte of the user PCID to serve as the high byte of NOFLUSH
+ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
+ * not enabled): so that the one register can update both memory and cr3.
+ */
+movq %cr3, \reg
+orq  PER_CPU_VAR(x86_cr3_pcid_user), \reg
+js   9f
+/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */
+movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
+9:
+movq \reg, %cr3
+.endm
+
+.macro SWITCH_KERNEL_CR3
+pushq %rax
+_SWITCH_TO_KERNEL_CR3 %rax
+popq %rax
+.endm
+
+.macro SWITCH_USER_CR3
+pushq %rax
+_SWITCH_TO_USER_CR3 %rax %al
+popq %rax
+.endm
+
+.macro SWITCH_KERNEL_CR3_NO_STACK
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_KERNEL_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+.endm
+
+#else /* CONFIG_KAISER */
+
+.macro SWITCH_KERNEL_CR3 reg
+.endm
+.macro SWITCH_USER_CR3 reg regb
+.endm
+.macro SWITCH_KERNEL_CR3_NO_STACK
+.endm
+
+#endif /* CONFIG_KAISER */
+
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_KAISER
+/*
+ * On a kernel/user mode switch, the address space may have to be
+ * switched before the registers have been stored.  Changing the
+ * address space needs a scratch register, so one register has to
+ * be saved and restored across the switch.
+ */
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+extern unsigned long x86_cr3_pcid_noflush;
+DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+
+/**
+ *  kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
+ *  @addr: the start address of the range
+ *  @size: the size of the range
+ *  @flags: the mapping flags of the pages
+ *
+ *  The mapping is global, so no further synchronization is
+ *  required.  The pages have to be unmapped again manually
+ *  when they are no longer needed.
+ */
+extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+
+/**
+ *  kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
+ *  @start: the start address of the range
+ *  @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+/**
+ *  kaiser_init - Initialize the shadow mapping
+ *
+ *  Most parts of the shadow mapping can be mapped at boot
+ *  time.  Only per-process things like the thread stacks
+ *  or a new LDT have to be mapped at runtime.  These boot-
+ *  time mappings are permanent and never unmapped.
+ */
+extern void kaiser_init(void);
+
+#endif /* CONFIG_KAISER */
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_KAISER_H */
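
For illustration, a stand-alone C sketch of the arithmetic behind the
_SWITCH_TO_KERNEL_CR3/_SWITCH_TO_USER_CR3 macros above.  The constants are
the ones added by this patch; the starting values chosen for
x86_cr3_pcid_noflush and x86_cr3_pcid_user assume PCIDs are in use (they
are really set up elsewhere in the series), and the pgd address is
invented:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    #define KAISER_SHADOW_PGD_OFFSET 0x1000UL
    #define X86_CR3_PCID_ASID_MASK   0xfffUL
    #define X86_CR3_PCID_ASID_USER   0x80UL
    #define X86_CR3_PCID_NOFLUSH     (1UL << 63)

    int main(void)
    {
        uint64_t x86_cr3_pcid_noflush = X86_CR3_PCID_NOFLUSH; /* kernel: never flush */
        uint64_t x86_cr3_pcid_user = KAISER_SHADOW_PGD_OFFSET |
                                     X86_CR3_PCID_ASID_USER;  /* user: flush once    */
        uint64_t cr3 = 0x1234000UL;           /* made-up kernel pgd, PCID 0 */

        /* _SWITCH_TO_USER_CR3: or in the shadow offset and the user PCID. */
        cr3 |= x86_cr3_pcid_user;
        if (!(cr3 & X86_CR3_PCID_NOFLUSH)) {
            /* Flush this time; arrange NOFLUSH for next time by storing
             * the low byte (0x80) into byte 7 of the per-cpu word, just
             * as the movb in the macro does (little-endian x86).        */
            unsigned char low = (unsigned char)cr3;
            memcpy((unsigned char *)&x86_cr3_pcid_user + 7, &low, 1);
        }
        printf("user CR3        = %#lx\n", (unsigned long)cr3);
        printf("next user value = %#lx (NOFLUSH bit now set)\n",
               (unsigned long)x86_cr3_pcid_user);

        /* _SWITCH_TO_KERNEL_CR3: clear the PCID bits and the shadow
         * offset, then or in the kernel NOFLUSH value.                  */
        cr3 &= ~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET);
        cr3 |= x86_cr3_pcid_noflush;
        printf("kernel CR3      = %#lx\n", (unsigned long)cr3);
        return 0;
    }
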
index 6be9909..b1c8b8d 100644 (file)
@@ -570,7 +570,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
 
 static inline int pgd_bad(pgd_t pgd)
 {
-       return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+       pgdval_t ignore_flags = _PAGE_USER;
+       /*
+        * We set NX on KAISER pgds that map userspace memory so
+        * that userspace can not meaningfully use the kernel
+        * page table by accident; it will fault on the first
+        * instruction it tries to run.  See native_set_pgd().
+        */
+       if (IS_ENABLED(CONFIG_KAISER))
+               ignore_flags |= _PAGE_NX;
+
+       return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
 }
 
 static inline int pgd_none(pgd_t pgd)
@@ -771,6 +781,12 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
        memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_KAISER
+       /* Clone the shadow pgd part as well */
+       memcpy(native_get_shadow_pgd(dst),
+              native_get_shadow_pgd(src),
+              count * sizeof(pgd_t));
+#endif
 }
 
 
index 975f709..a3bf3de 100644 (file)
@@ -105,9 +105,36 @@ static inline void native_pud_clear(pud_t *pud)
        native_set_pud(pud, native_make_pud(0));
 }
 
+#ifdef CONFIG_KAISER
+extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
+
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+       return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
+}
+
+static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
+{
+       return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE);
+}
+#else
+static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+       return pgd;
+}
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+       return NULL;
+}
+static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
+{
+       return pgdp;
+}
+#endif /* CONFIG_KAISER */
+
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
-       *pgdp = pgd;
+       *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
 }
 
 static inline void native_pgd_clear(pgd_t *pgd)
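
The normal and shadow pgd sit one page apart inside a single 8kB,
8kB-aligned allocation, which is what lets native_get_shadow_pgd() and
native_get_normal_pgd() above convert between them with a plain or/and of
PAGE_SIZE.  A small user-space sketch of the same pointer trick, with
aligned_alloc() standing in for the kernel's pgd allocation:

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE 4096UL

    /* Mirror native_get_shadow_pgd()/native_get_normal_pgd() above. */
    static unsigned long *get_shadow_pgd(unsigned long *pgdp)
    {
        return (unsigned long *)((unsigned long)pgdp | PAGE_SIZE);
    }

    static unsigned long *get_normal_pgd(unsigned long *pgdp)
    {
        return (unsigned long *)((unsigned long)pgdp & ~PAGE_SIZE);
    }

    int main(void)
    {
        /* Stand-in for the kernel's 8kB, 8kB-aligned pgd pair. */
        unsigned long *pgd = aligned_alloc(2 * PAGE_SIZE, 2 * PAGE_SIZE);
        unsigned long *shadow;

        if (!pgd)
            return 1;
        shadow = get_shadow_pgd(pgd);

        printf("normal pgd %p, shadow pgd %p (one page apart)\n",
               (void *)pgd, (void *)shadow);
        printf("round trip ok: %s\n",
               get_normal_pgd(shadow) == pgd ? "yes" : "no");

        free(pgd);
        return 0;
    }
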
index 013286a..6e13150 100644 (file)
 #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
 #define _PAGE_DIRTY    (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
 #define _PAGE_PSE      (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
+#ifdef CONFIG_KAISER
+#define _PAGE_GLOBAL   (_AT(pteval_t, 0))
+#else
 #define _PAGE_GLOBAL   (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#endif
 #define _PAGE_UNUSED1  (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
 #define _PAGE_IOMAP    (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
 #define _PAGE_PAT      (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
@@ -62,7 +66,7 @@
 #endif
 
 #define _PAGE_FILE     (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
 #define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |        \
                         _PAGE_ACCESSED | _PAGE_DIRTY)
                         _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
 
+/* The ASID is the lower 12 bits of CR3 */
+#define X86_CR3_PCID_ASID_MASK  (_AC((1<<12)-1,UL))
+
+/* Mask for all the PCID-related bits in CR3: */
+#define X86_CR3_PCID_MASK       (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
+#define X86_CR3_PCID_ASID_KERN  (_AC(0x0,UL))
+
+#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64)
+/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
+#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
+
+#define X86_CR3_PCID_KERN_FLUSH                (X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_FLUSH                (X86_CR3_PCID_ASID_USER)
+#define X86_CR3_PCID_KERN_NOFLUSH      (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_NOFLUSH      (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
+#else
+#define X86_CR3_PCID_ASID_USER  (_AC(0x0,UL))
+/*
+ * PCIDs are unsupported on 32-bit and none of these bits can be
+ * set in CR3:
+ */
+#define X86_CR3_PCID_KERN_FLUSH                (0)
+#define X86_CR3_PCID_USER_FLUSH                (0)
+#define X86_CR3_PCID_KERN_NOFLUSH      (0)
+#define X86_CR3_PCID_USER_NOFLUSH      (0)
+#endif
+
 #define _PAGE_CACHE_MASK       (_PAGE_PCD | _PAGE_PWT)
 #define _PAGE_CACHE_WB         (0)
 #define _PAGE_CACHE_WC         (_PAGE_PWT)
index a9e14a5..360e80d 100644 (file)
@@ -43,6 +43,8 @@
  */
 #define X86_CR3_PWT    0x00000008 /* Page Write Through */
 #define X86_CR3_PCD    0x00000010 /* Page Cache Disable */
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH (_AC(1,ULL) << X86_CR3_PCID_NOFLUSH_BIT)
 
 /*
  * Intel CPU features in CR4
index f7c89e2..048249e 100644 (file)
@@ -266,7 +266,7 @@ struct tss_struct {
 
 } ____cacheline_aligned;
 
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss);
 
 /*
  * Save the original ist values for checking stack pointers during debugging
index e04cbc5..2881959 100644 (file)
@@ -64,27 +64,59 @@ static inline void invpcid_flush_all_nonglobals(void)
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+/*
+ * Declare a couple of kaiser interfaces here for convenience,
+ * to avoid the need for asm/kaiser.h in unexpected places.
+ */
+#ifdef CONFIG_KAISER
+extern void kaiser_setup_pcid(void);
+extern void kaiser_flush_tlb_on_return_to_user(void);
+#else
+static inline void kaiser_setup_pcid(void)
+{
+}
+static inline void kaiser_flush_tlb_on_return_to_user(void)
+{
+}
+#endif
+
 static inline void __native_flush_tlb(void)
 {
+       if (this_cpu_has(X86_FEATURE_INVPCID)) {
+               /*
+                * Note, this works with CR4.PCIDE=0 or 1.
+                */
+               invpcid_flush_all_nonglobals();
+               return;
+       }
+
        /*
         * If current->mm == NULL then we borrow a mm which may change during a
         * task switch and therefore we must not be preempted while we write CR3
         * back:
         */
        preempt_disable();
+       if (this_cpu_has(X86_FEATURE_PCID))
+               kaiser_flush_tlb_on_return_to_user();
        native_write_cr3(native_read_cr3());
        preempt_enable();
 }
 
 static inline void __native_flush_tlb_global(void)
 {
+#ifdef CONFIG_KAISER
+       /* Globals are not used at all */
+       __native_flush_tlb();
+#else
        unsigned long flags;
        unsigned long cr4;
 
-       if (static_cpu_has(X86_FEATURE_INVPCID)) {
+       if (this_cpu_has(X86_FEATURE_INVPCID)) {
                /*
                 * Using INVPCID is considerably faster than a pair of writes
                 * to CR4 sandwiched inside an IRQ flag save/restore.
+                *
+                * Note, this works with CR4.PCIDE=0 or 1.
                 */
                invpcid_flush_all();
                return;
@@ -104,11 +136,39 @@ static inline void __native_flush_tlb_global(void)
        native_write_cr4(cr4);
 
        raw_local_irq_restore(flags);
+#endif
 }
 
 static inline void __native_flush_tlb_single(unsigned long addr)
 {
-       asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+       /*
+        * SIMICS #GP's if you run INVPCID with type 2/3
+        * and X86_CR4_PCIDE clear.  Shame!
+        *
+        * The ASIDs used below are hard-coded.  But, we must not
+        * call invpcid(type=1/2) before CR4.PCIDE=1.  Just call
+        * invlpg in the case we are called early.
+        */
+
+       if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+               if (this_cpu_has(X86_FEATURE_PCID))
+                       kaiser_flush_tlb_on_return_to_user();
+               asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+               return;
+       }
+       /* Flush the address out of both PCIDs. */
+       /*
+        * An optimization here might be to determine addresses
+        * that are only kernel-mapped and only flush the kernel
+        * ASID.  But, userspace flushes are probably much more
+        * important performance-wise.
+        *
+        * Make sure to do only a single invpcid when KAISER is
+        * disabled and we have only a single ASID.
+        */
+       if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER)
+               invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
+       invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
 }
 
 static inline void __flush_tlb_all(void)
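
Roughly, the single-address flush above either falls back to INVLPG and
defers the user-side flush to the next return to user space, or issues one
INVPCID per address-space ID.  A user-space model of that decision, with
the hardware operations reduced to printouts (the names mirror those in
this patch, but nothing here touches a real TLB):

    #include <stdio.h>
    #include <stdbool.h>

    #define X86_CR3_PCID_ASID_KERN 0x0UL
    #define X86_CR3_PCID_ASID_USER 0x80UL  /* 0 when KAISER/PCID is not in use */

    /* Stand-ins for the CPU instructions and the kaiser hook. */
    static void invlpg(unsigned long addr)
    {
        printf("invlpg  %#lx\n", addr);
    }

    static void invpcid_flush_one(unsigned long pcid, unsigned long addr)
    {
        printf("invpcid pcid=%#lx addr=%#lx\n", pcid, addr);
    }

    static void kaiser_flush_tlb_on_return_to_user(void)
    {
        printf("defer full user-PCID flush to the next exit to user space\n");
    }

    static void flush_tlb_single(unsigned long addr,
                                 bool has_invpcid_single, bool has_pcid)
    {
        if (!has_invpcid_single) {
            if (has_pcid)
                kaiser_flush_tlb_on_return_to_user();
            invlpg(addr);
            return;
        }
        /* Flush the address out of both the kernel and the user PCID. */
        if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER)
            invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
        invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
    }

    int main(void)
    {
        flush_tlb_single(0xffff880000001000UL, false, true);
        flush_tlb_single(0xffff880000001000UL, true, true);
        return 0;
    }
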
index 895e4b8..b567c89 100644 (file)
@@ -84,7 +84,7 @@ static const struct cpu_dev __cpuinitconst default_cpu = {
 
 static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
 
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
        /*
         * We need valid kernel segments for data and code in long mode too
@@ -319,6 +319,19 @@ static void setup_pcid(struct cpuinfo_x86 *c)
                         * SDM says that it can't be enabled in 32-bit mode.
                         */
                        set_in_cr4(X86_CR4_PCIDE);
+                       /*
+                        * INVPCID has two "groups" of types:
+                        * 1/2: Invalidate an individual address
+                        * 3/4: Invalidate all contexts
+                        *
+                        * 1/2 take a PCID, but 3/4 do not.  So, 3/4
+                        * ignore the PCID argument in the descriptor.
+                        * But, we have to be careful not to call 1/2
+                        * with an actual non-zero PCID in them before
+                        * we do the above set_in_cr4().
+                        */
+                       if (cpu_has(c, X86_FEATURE_INVPCID))
+                               set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
                } else {
                        /*
                         * flush_tlb_all(), as currently implemented, won't
@@ -331,6 +344,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
                        clear_cpu_cap(c, X86_FEATURE_PCID);
                }
        }
+       kaiser_setup_pcid();
 }
 
 /*
@@ -1115,7 +1129,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
          [DEBUG_STACK - 1]                     = DEBUG_STKSZ
 };
 
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
        [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 
 /* May not be marked __init: used by software suspend */
index 2d4e76b..fb933cd 100644 (file)
@@ -2,10 +2,14 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 
+#include <asm/kaiser.h>
 #include <asm/perf_event.h>
 
 #include "perf_event.h"
 
+static
+DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
+
 /* The size of a BTS record in bytes: */
 #define BTS_RECORD_SIZE                24
 
@@ -60,6 +64,39 @@ void fini_debug_store_on_cpu(int cpu)
        wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
 }
 
+static void *dsalloc(size_t size, gfp_t flags, int node)
+{
+#ifdef CONFIG_KAISER
+       unsigned int order = get_order(size);
+       struct page *page;
+       unsigned long addr;
+
+       page = alloc_pages_node(node, flags | __GFP_ZERO, order);
+       if (!page)
+               return NULL;
+       addr = (unsigned long)page_address(page);
+       if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
+               __free_pages(page, order);
+               addr = 0;
+       }
+       return (void *)addr;
+#else
+       return kmalloc_node(size, flags | __GFP_ZERO, node);
+#endif
+}
+
+static void dsfree(const void *buffer, size_t size)
+{
+#ifdef CONFIG_KAISER
+       if (!buffer)
+               return;
+       kaiser_remove_mapping((unsigned long)buffer, size);
+       free_pages((unsigned long)buffer, get_order(size));
+#else
+       kfree(buffer);
+#endif
+}
+
 static int alloc_pebs_buffer(int cpu)
 {
        struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -70,7 +107,7 @@ static int alloc_pebs_buffer(int cpu)
        if (!x86_pmu.pebs)
                return 0;
 
-       buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
+       buffer = dsalloc(PEBS_BUFFER_SIZE, GFP_KERNEL, node);
        if (unlikely(!buffer))
                return -ENOMEM;
 
@@ -94,7 +131,7 @@ static void release_pebs_buffer(int cpu)
        if (!ds || !x86_pmu.pebs)
                return;
 
-       kfree((void *)(unsigned long)ds->pebs_buffer_base);
+       dsfree((void *)(unsigned long)ds->pebs_buffer_base, PEBS_BUFFER_SIZE);
        ds->pebs_buffer_base = 0;
 }
 
@@ -108,7 +145,7 @@ static int alloc_bts_buffer(int cpu)
        if (!x86_pmu.bts)
                return 0;
 
-       buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
+       buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL, node);
        if (unlikely(!buffer))
                return -ENOMEM;
 
@@ -132,19 +169,15 @@ static void release_bts_buffer(int cpu)
        if (!ds || !x86_pmu.bts)
                return;
 
-       kfree((void *)(unsigned long)ds->bts_buffer_base);
+       dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
        ds->bts_buffer_base = 0;
 }
 
 static int alloc_ds_buffer(int cpu)
 {
-       int node = cpu_to_node(cpu);
-       struct debug_store *ds;
-
-       ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node);
-       if (unlikely(!ds))
-               return -ENOMEM;
+       struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
 
+       memset(ds, 0, sizeof(*ds));
        per_cpu(cpu_hw_events, cpu).ds = ds;
 
        return 0;
@@ -158,7 +191,6 @@ static void release_ds_buffer(int cpu)
                return;
 
        per_cpu(cpu_hw_events, cpu).ds = NULL;
-       kfree(ds);
 }
 
 void release_ds_buffers(void)
index f6daf3c..3a4356a 100644 (file)
@@ -56,6 +56,7 @@
 #include <asm/ftrace.h>
 #include <asm/percpu.h>
 #include <asm/pgtable_types.h>
+#include <asm/kaiser.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -323,6 +324,7 @@ ENDPROC(native_usergs_sysret64)
        testl $3, CS(%rdi)
        je 1f
        SWAPGS
+       SWITCH_KERNEL_CR3
        /*
         * irq_count is used to check if a CPU is already on an interrupt stack
         * or not. While this is essentially redundant with preempt_count it is
@@ -362,6 +364,12 @@ END(save_rest)
 
 /* save complete stack frame */
        .pushsection .kprobes.text, "ax"
+/*
+ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
+ *         ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
+ *         ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
+ *         ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
+ */
 ENTRY(save_paranoid)
        XCPT_FRAME 1 RDI+8
        cld
@@ -387,7 +395,25 @@ ENTRY(save_paranoid)
        js 1f   /* negative -> in kernel */
        SWAPGS
        xorl %ebx,%ebx
-1:     ret
+1:
+#ifdef CONFIG_KAISER
+       /*
+        * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
+        * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
+        * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
+        * unconditionally, but we need to find out whether the reverse
+        * should be done on return (conveyed to paranoid_exit in %ebx).
+        */
+       movq    %cr3, %rax
+       testl   $KAISER_SHADOW_PGD_OFFSET, %eax
+       jz      2f
+       orl     $2, %ebx
+       andq    $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+       orq     x86_cr3_pcid_noflush, %rax
+       movq    %rax, %cr3
+2:
+#endif
+       ret
        CFI_ENDPROC
 END(save_paranoid)
        .popsection
@@ -464,6 +490,7 @@ ENTRY(system_call)
        CFI_REGISTER    rip,rcx
        /*CFI_REGISTER  rflags,r11*/
        SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
        /*
         * A hypervisor implementation might want to use a label
         * after the swapgs, so that it can do the swapgs
@@ -515,6 +542,14 @@ sysret_check:
        CFI_REGISTER    rip,rcx
        RESTORE_ARGS 1,-ARG_SKIP,0
        /*CFI_REGISTER  rflags,r11*/
+       /*
+        * This opens a window where we have a user CR3, but are
+        * running in the kernel.  This makes using the CS
+        * register useless for telling whether or not we need to
+        * switch CR3 in NMIs.  Normal interrupts are OK because
+        * they are off here.
+        */
+       SWITCH_USER_CR3
        movq    PER_CPU_VAR(old_rsp), %rsp
        USERGS_SYSRET64
 
@@ -851,6 +886,14 @@ retint_swapgs:             /* return to user-space */
         */
        DISABLE_INTERRUPTS(CLBR_ANY)
        TRACE_IRQS_IRETQ
+       /*
+        * This opens a window where we have a user CR3, but are
+        * running in the kernel.  This makes using the CS
+        * register useless for telling whether or not we need to
+        * switch CR3 in NMIs.  Normal interrupts are OK because
+        * they are off here.
+        */
+       SWITCH_USER_CR3
        SWAPGS
        jmp restore_args
 
@@ -891,6 +934,7 @@ native_irq_return_ldt:
        pushq_cfi %rax
        pushq_cfi %rdi
        SWAPGS
+       SWITCH_KERNEL_CR3
        movq PER_CPU_VAR(espfix_waddr),%rdi
        movq %rax,(0*8)(%rdi)   /* RAX */
        movq (2*8)(%rsp),%rax   /* RIP */
@@ -906,6 +950,7 @@ native_irq_return_ldt:
        andl $0xffff0000,%eax
        popq_cfi %rdi
        orq PER_CPU_VAR(espfix_stack),%rax
+       SWITCH_USER_CR3
        SWAPGS
        movq %rax,%rsp
        popq_cfi %rax
@@ -1366,30 +1411,40 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
         * is fundamentally NMI-unsafe. (we cannot change the soft and
         * hard flags at once, atomically)
         */
-
-       /* ebx: no swapgs flag */
+/*
+ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
+ *           ebx=1: needs neither swapgs nor SWITCH_USER_CR3
+ *           ebx=2: needs both swapgs and SWITCH_USER_CR3
+ *           ebx=3: needs SWITCH_USER_CR3 but not swapgs
+ */
 ENTRY(paranoid_exit)
        DEFAULT_FRAME
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
-       testl %ebx,%ebx                         /* swapgs needed? */
-       jnz paranoid_restore
-       testl $3,CS(%rsp)
-       jnz   paranoid_userspace
-paranoid_swapgs:
+       movq    %rbx, %r12              /* paranoid_userspace uses %ebx */
+       testl   $3, CS(%rsp)
+       jnz     paranoid_userspace
+paranoid_kernel:
+       movq    %r12, %rbx              /* restore after paranoid_userspace */
        TRACE_IRQS_IRETQ 0
+#ifdef CONFIG_KAISER
+       testl   $2, %ebx                /* SWITCH_USER_CR3 needed? */
+       jz      paranoid_exit_no_switch
+       SWITCH_USER_CR3
+paranoid_exit_no_switch:
+#endif
+       testl   $1, %ebx                /* swapgs needed? */
+       jnz     paranoid_exit_no_swapgs
        SWAPGS_UNSAFE_STACK
+paranoid_exit_no_swapgs:
        RESTORE_ALL 8
-       jmp irq_return
-paranoid_restore:
-       TRACE_IRQS_IRETQ 0
-       RESTORE_ALL 8
-       jmp irq_return
+       jmp     irq_return
+
 paranoid_userspace:
        GET_THREAD_INFO(%rcx)
        movl TI_flags(%rcx),%ebx
        andl $_TIF_WORK_MASK,%ebx
-       jz paranoid_swapgs
+       jz paranoid_kernel
        movq %rsp,%rdi                  /* &pt_regs */
        call sync_regs
        movq %rax,%rsp                  /* switch stack for scheduling */
@@ -1438,6 +1493,13 @@ ENTRY(error_entry)
        movq_cfi r13, R13+8
        movq_cfi r14, R14+8
        movq_cfi r15, R15+8
+       /*
+        * error_entry() always returns with a kernel gsbase and
+        * CR3.  We must also have a kernel CR3/gsbase before
+        * calling TRACE_IRQS_*.  Just unconditionally switch to
+        * the kernel CR3 here.
+        */
+       SWITCH_KERNEL_CR3
        xorl %ebx,%ebx
        testl $3,CS+8(%rsp)
        je error_kernelspace
@@ -1527,22 +1589,31 @@ ENTRY(nmi)
        call do_nmi
 #ifdef CONFIG_TRACE_IRQFLAGS
        /* paranoidexit; without TRACE_IRQS_OFF */
-       /* ebx: no swapgs flag */
+       /* ebx: no-swapgs and kaiser-switch-cr3 flag */
        DISABLE_INTERRUPTS(CLBR_NONE)
-       testl %ebx,%ebx                         /* swapgs needed? */
-       jnz nmi_restore
-       testl $3,CS(%rsp)
-       jnz nmi_userspace
-nmi_swapgs:
+       movq    %rbx, %r12              /* nmi_userspace uses %ebx */
+       testl   $3, CS(%rsp)
+       jnz     nmi_userspace
+nmi_kernel:
+       movq    %r12, %rbx              /* restore after nmi_userspace */
+#ifdef CONFIG_KAISER
+       testl   $2, %ebx                /* SWITCH_USER_CR3 needed? */
+       jz      nmi_exit_no_switch
+       SWITCH_USER_CR3
+nmi_exit_no_switch:
+#endif
+       testl   $1, %ebx                /* swapgs needed? */
+       jnz     nmi_exit_no_swapgs
        SWAPGS_UNSAFE_STACK
-nmi_restore:
+nmi_exit_no_swapgs:
        RESTORE_ALL 8
-       jmp irq_return
+       jmp     irq_return
+
 nmi_userspace:
        GET_THREAD_INFO(%rcx)
        movl TI_flags(%rcx),%ebx
        andl $_TIF_WORK_MASK,%ebx
-       jz nmi_swapgs
+       jz nmi_kernel
        movq %rsp,%rdi                  /* &pt_regs */
        call sync_regs
        movq %rax,%rsp                  /* switch stack for scheduling */
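
The %ebx convention that save_paranoid hands to paranoid_exit above (and
that the NMI exit path reuses) boils down to two bits: bit 0 set means
SWAPGS is not needed on the way out, bit 1 set means SWITCH_USER_CR3 is.
A tiny C sketch decoding the four documented cases:

    #include <stdio.h>
    #include <stdbool.h>

    /* Decode the value save_paranoid leaves in %ebx, per the comments above:
     * bit 0 set -> swapgs is NOT needed on exit
     * bit 1 set -> SWITCH_USER_CR3 IS needed on exit
     */
    static void decode_paranoid_ebx(unsigned int ebx,
                                    bool *need_swapgs, bool *need_cr3_switch)
    {
        *need_swapgs = !(ebx & 1);
        *need_cr3_switch = (ebx & 2) != 0;
    }

    int main(void)
    {
        unsigned int ebx;

        for (ebx = 0; ebx <= 3; ebx++) {
            bool swapgs, cr3;

            decode_paranoid_ebx(ebx, &swapgs, &cr3);
            printf("ebx=%u: swapgs=%-3s switch_user_cr3=%s\n",
                   ebx, swapgs ? "yes" : "no", cr3 ? "yes" : "no");
        }
        return 0;
    }
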
index 94d857f..14cd73b 100644 (file)
@@ -41,6 +41,7 @@
 #include <asm/pgalloc.h>
 #include <asm/setup.h>
 #include <asm/espfix.h>
+#include <asm/kaiser.h>
 
 /*
  * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@@ -129,6 +130,14 @@ void __init init_espfix_bsp(void)
        /* Install the espfix pud into the kernel page directory */
        pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
        pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+       /*
+        * Just copy the top-level PGD that is mapping the espfix
+        * area to ensure it is mapped into the shadow user page
+        * tables.
+        */
+       if (IS_ENABLED(CONFIG_KAISER))
+               set_pgd(native_get_shadow_pgd(pgd_p),
+                       __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
 
        /* Randomize the locations */
        init_espfix_random();
index 0f8ebf7..6e697ac 100644 (file)
@@ -338,6 +338,27 @@ early_idt_ripmsg:
        .balign PAGE_SIZE; \
 ENTRY(name)
 
+#ifdef CONFIG_KAISER
+/*
+ * Each PGD needs to be 8k long and 8k aligned.  We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define KAISER_USER_PGD_FILL   512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+       .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define KAISER_USER_PGD_FILL   0
+#endif
+
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)                       \
        i = 0 ;                                         \
@@ -353,13 +374,14 @@ ENTRY(name)
         * 0xffffffff80000000 to physical address 0x000000. (always using
         * 2Mbyte large pages provided by PAE mode)
         */
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
        .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
        .org    init_level4_pgt + L4_PAGE_OFFSET*8, 0
        .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
        .org    init_level4_pgt + L4_START_KERNEL*8, 0
        /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
        .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .fill   KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(level3_ident_pgt)
        .quad   level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -385,6 +407,7 @@ NEXT_PAGE(level2_ident_pgt)
         * Don't set NX because code runs from these pages.
         */
        PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+       .fill   KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(level2_kernel_pgt)
        /*
index 43e9ccf..f00e6e7 100644 (file)
@@ -38,5 +38,5 @@ EXPORT_SYMBOL(init_task);
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss) = INIT_TSS;
 
index e328f69..990f743 100644 (file)
@@ -85,7 +85,7 @@ static struct irqaction irq2 = {
        .flags = IRQF_NO_THREAD,
 };
 
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
        [0 ... NR_VECTORS - 1] = -1,
 };
 
index 1dd3230..836a4c2 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
+#include <linux/kaiser.h>
 
 #include <asm/system.h>
 #include <asm/ldt.h>
@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
        set_ldt(pc->ldt->entries, pc->ldt->size);
 }
 
+static void __free_ldt_struct(struct ldt_struct *ldt)
+{
+       if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+               vfree(ldt->entries);
+       else
+               free_page((unsigned long)ldt->entries);
+       kfree(ldt);
+}
+
 /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
 static struct ldt_struct *alloc_ldt_struct(int size)
 {
        struct ldt_struct *new_ldt;
        int alloc_size;
+       int ret;
 
        if (size > LDT_ENTRIES)
                return NULL;
@@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
                return NULL;
        }
 
+       ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
+                                __PAGE_KERNEL);
        new_ldt->size = size;
+       if (ret) {
+               __free_ldt_struct(new_ldt);
+               return NULL;
+       }
        return new_ldt;
 }
 
@@ -97,12 +114,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
        if (likely(!ldt))
                return;
 
+       kaiser_remove_mapping((unsigned long)ldt->entries,
+                             ldt->size * LDT_ENTRY_SIZE);
        paravirt_free_ldt(ldt->entries, ldt->size);
-       if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
-               vfree(ldt->entries);
-       else
-               kfree(ldt->entries);
-       kfree(ldt);
+       __free_ldt_struct(ldt);
 }
 
 /*
index 557eb37..d2ce2a3 100644 (file)
@@ -57,7 +57,7 @@
 
 asmlinkage extern void ret_from_fork(void);
 
-DEFINE_PER_CPU(unsigned long, old_rsp);
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, old_rsp);
 static DEFINE_PER_CPU(unsigned char, is_idle);
 
 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
index cf2a840..c9a00a5 100644 (file)
@@ -29,3 +29,4 @@ obj-$(CONFIG_NUMA_EMU)                += numa_emulation.o
 obj-$(CONFIG_HAVE_MEMBLOCK)            += memblock.o
 
 obj-$(CONFIG_MEMTEST)          += memtest.o
+obj-$(CONFIG_KAISER)           += kaiser.o
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
new file mode 100644 (file)
index 0000000..79b0222
--- /dev/null
@@ -0,0 +1,382 @@
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+
+extern struct mm_struct init_mm;
+
+#include <asm/kaiser.h>
+#include <asm/tlbflush.h>      /* to verify its kaiser declarations */
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/desc.h>
+
+#ifdef CONFIG_KAISER
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+/*
+ * These can have bit 63 set, so we can not just use a plain "or"
+ * instruction to get their value or'd into CR3.  It would take
+ * another register.  So, we use a memory reference to these instead.
+ *
+ * This is also handy because systems that do not support PCIDs
+ * just end up or'ing a 0 into their CR3, which does no harm.
+ */
+unsigned long x86_cr3_pcid_noflush __read_mostly;
+DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+/*
+ * At runtime, the only things we map are some things for CPU
+ * hotplug, and stacks for new processes.  No two CPUs will ever
+ * be populating the same addresses, so we only need to ensure
+ * that we protect between two CPUs trying to allocate and
+ * populate the same page table page.
+ *
+ * Only take this lock when doing a set_p[4um]d(), but it is not
+ * needed for doing a set_pte().  We assume that only the *owner*
+ * of a given allocation will be doing this for _their_
+ * allocation.
+ *
+ * This ensures that once a system has been running for a while
+ * and there have been stacks all over and these page tables
+ * are fully populated, there will be no further acquisitions of
+ * this lock.
+ */
+static DEFINE_SPINLOCK(shadow_table_allocation_lock);
+
+/*
+ * Returns -1 on error.
+ */
+static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pgd = pgd_offset_k(vaddr);
+       /*
+        * We made all the kernel PGDs present in kaiser_init().
+        * We expect them to stay that way.
+        */
+       BUG_ON(pgd_none(*pgd));
+       /*
+        * PGDs are either 512GB or 128TB on all x86_64
+        * configurations.  We don't handle these.
+        */
+       BUG_ON(pgd_large(*pgd));
+
+       pud = pud_offset(pgd, vaddr);
+       if (pud_none(*pud)) {
+               WARN_ON_ONCE(1);
+               return -1;
+       }
+
+       if (pud_large(*pud))
+               return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
+
+       pmd = pmd_offset(pud, vaddr);
+       if (pmd_none(*pmd)) {
+               WARN_ON_ONCE(1);
+               return -1;
+       }
+
+       if (pmd_large(*pmd))
+               return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
+
+       pte = pte_offset_kernel(pmd, vaddr);
+       if (pte_none(*pte)) {
+               WARN_ON_ONCE(1);
+               return -1;
+       }
+
+       return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
+}
+
+/*
+ * This is a relatively normal page table walk, except that it
+ * also tries to allocate page tables pages along the way.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static pte_t *kaiser_pagetable_walk(unsigned long address)
+{
+       pmd_t *pmd;
+       pud_t *pud;
+       pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
+       gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+       if (pgd_none(*pgd)) {
+               WARN_ONCE(1, "All shadow pgds should have been populated");
+               return NULL;
+       }
+       BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+       pud = pud_offset(pgd, address);
+       /* The shadow page tables do not use large mappings: */
+       if (pud_large(*pud)) {
+               WARN_ON(1);
+               return NULL;
+       }
+       if (pud_none(*pud)) {
+               unsigned long new_pmd_page = __get_free_page(gfp);
+               if (!new_pmd_page)
+                       return NULL;
+               spin_lock(&shadow_table_allocation_lock);
+               if (pud_none(*pud)) {
+                       set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+                       __inc_zone_page_state(virt_to_page((void *)
+                                               new_pmd_page), NR_KAISERTABLE);
+               } else
+                       free_page(new_pmd_page);
+               spin_unlock(&shadow_table_allocation_lock);
+       }
+
+       pmd = pmd_offset(pud, address);
+       /* The shadow page tables do not use large mappings: */
+       if (pmd_large(*pmd)) {
+               WARN_ON(1);
+               return NULL;
+       }
+       if (pmd_none(*pmd)) {
+               unsigned long new_pte_page = __get_free_page(gfp);
+               if (!new_pte_page)
+                       return NULL;
+               spin_lock(&shadow_table_allocation_lock);
+               if (pmd_none(*pmd)) {
+                       set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+                       __inc_zone_page_state(virt_to_page((void *)
+                                               new_pte_page), NR_KAISERTABLE);
+               } else
+                       free_page(new_pte_page);
+               spin_unlock(&shadow_table_allocation_lock);
+       }
+
+       return pte_offset_kernel(pmd, address);
+}
+
+int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+                       unsigned long flags)
+{
+       int ret = 0;
+       pte_t *pte;
+       unsigned long start_addr = (unsigned long )__start_addr;
+       unsigned long address = start_addr & PAGE_MASK;
+       unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+       unsigned long target_address;
+
+       for (; address < end_addr; address += PAGE_SIZE) {
+               target_address = get_pa_from_mapping(address);
+               if (target_address == -1) {
+                       ret = -EIO;
+                       break;
+               }
+               pte = kaiser_pagetable_walk(address);
+               if (!pte) {
+                       ret = -ENOMEM;
+                       break;
+               }
+               if (pte_none(*pte)) {
+                       set_pte(pte, __pte(flags | target_address));
+               } else {
+                       pte_t tmp;
+                       set_pte(&tmp, __pte(flags | target_address));
+                       WARN_ON_ONCE(!pte_same(*pte, tmp));
+               }
+       }
+       return ret;
+}
+
+static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
+{
+       unsigned long size = end - start;
+
+       return kaiser_add_user_map(start, size, flags);
+}
+
+/*
+ * Ensure that the top level of the (shadow) page tables are
+ * entirely populated.  This ensures that all processes that get
+ * forked have the same entries.  This way, we do not have to
+ * ever go set up new entries in older processes.
+ *
+ * Note: we never free these, so there are no updates to them
+ * after this.
+ */
+static void __init kaiser_init_all_pgds(void)
+{
+       pgd_t *pgd;
+       int i = 0;
+
+       pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0));
+       for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
+               pgd_t new_pgd;
+               pud_t *pud = pud_alloc_one(&init_mm,
+                                          PAGE_OFFSET + i * PGDIR_SIZE);
+               if (!pud) {
+                       WARN_ON(1);
+                       break;
+               }
+               inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
+               new_pgd = __pgd(_KERNPG_TABLE | __pa(pud));
+               /*
+                * Make sure not to stomp on some other pgd entry.
+                */
+               if (!pgd_none(pgd[i])) {
+                       WARN_ON(1);
+                       continue;
+               }
+               set_pgd(pgd + i, new_pgd);
+       }
+}
+
+#define kaiser_add_user_map_early(start, size, flags) do {     \
+       int __ret = kaiser_add_user_map(start, size, flags);    \
+       WARN_ON(__ret);                                         \
+} while (0)
+
+#define kaiser_add_user_map_ptrs_early(start, end, flags) do {         \
+       int __ret = kaiser_add_user_map_ptrs(start, end, flags);        \
+       WARN_ON(__ret);                                                 \
+} while (0)
+
+/*
+ * If anything in here fails, we will likely die on one of the
+ * first kernel->user transitions and init will die.  But, we
+ * will have most of the kernel up by then and should be able to
+ * get a clean warning out of it.  If we BUG_ON() here, we run
+ * the risk of doing so before we have good console output.
+ */
+void __init kaiser_init(void)
+{
+       int cpu;
+
+       kaiser_init_all_pgds();
+
+       for_each_possible_cpu(cpu) {
+               void *percpu_vaddr = __per_cpu_user_mapped_start +
+                                    per_cpu_offset(cpu);
+               unsigned long percpu_sz = __per_cpu_user_mapped_end -
+                                         __per_cpu_user_mapped_start;
+               kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
+                                         __PAGE_KERNEL);
+       }
+
+       /*
+        * Map the entry/exit text section, which is needed at
+        * switches from user to and from kernel.
+        */
+       kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
+                                      __PAGE_KERNEL_RX);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       kaiser_add_user_map_ptrs_early(__irqentry_text_start,
+                                      __irqentry_text_end,
+                                      __PAGE_KERNEL_RX);
+#endif
+       kaiser_add_user_map_early((void *)idt_descr.address,
+                                 sizeof(gate_desc) * NR_VECTORS,
+                                 __PAGE_KERNEL_RO);
+       kaiser_add_user_map_early(&x86_cr3_pcid_noflush,
+                                 sizeof(x86_cr3_pcid_noflush),
+                                 __PAGE_KERNEL);
+}
+
+/* Add a mapping to the shadow mapping, and synchronize the mappings */
+int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+       return kaiser_add_user_map((const void *)addr, size, flags);
+}
+
+void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+       unsigned long end = start + size;
+       unsigned long addr;
+       pte_t *pte;
+
+       for (addr = start; addr < end; addr += PAGE_SIZE) {
+               pte = kaiser_pagetable_walk(addr);
+               if (pte)
+                       set_pte(pte, __pte(0));
+       }
+}
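
kaiser_add_mapping() and kaiser_remove_mapping() are intended to be used as a
matched pair around objects that must stay visible on the user CR3, as the
thread-stack helpers further down do.  A hedged sketch of such a pairing,
with invented helper names:

    /* Hypothetical: allocate a buffer that must stay mapped on the user CR3. */
    static void *alloc_user_visible(unsigned long size)
    {
            void *buf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
                                                 get_order(size));

            if (!buf)
                    return NULL;
            if (kaiser_add_mapping((unsigned long)buf, size, __PAGE_KERNEL)) {
                    free_pages((unsigned long)buf, get_order(size));
                    return NULL;
            }
            return buf;
    }

    static void free_user_visible(void *buf, unsigned long size)
    {
            kaiser_remove_mapping((unsigned long)buf, size);
            free_pages((unsigned long)buf, get_order(size));
    }
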
+
+/*
+ * Page table pages are page-aligned.  The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ * This returns true for user pages that need to get copied into
+ * both the user and kernel copies of the page tables, and false
+ * for kernel pages that should only be in the kernel copy.
+ */
+static inline bool is_userspace_pgd(pgd_t *pgdp)
+{
+       return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
+}
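
A quick worked example of that test, assuming 4k pages and 8-byte pgd entries
(so PTRS_PER_PGD == 512), with pgd pointing at the page-aligned base of a pgd
page:

    is_userspace_pgd(pgd + 255);   /* byte offset 2040 < 2048 -> true  (user half)   */
    is_userspace_pgd(pgd + 256);   /* byte offset 2048        -> false (kernel half) */
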
+
+pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+       /*
+        * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
+        * skip cases like kexec and EFI which make temporary low mappings.
+        */
+       if (pgd.pgd & _PAGE_USER) {
+               if (is_userspace_pgd(pgdp)) {
+                       native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+                       /*
+                        * Even if the entry is *mapping* userspace, ensure
+                        * that userspace can not use it.  This way, if we
+                        * get out to userspace running on the kernel CR3,
+                        * userspace will crash instead of running.
+                        */
+                       pgd.pgd |= _PAGE_NX;
+               }
+       } else if (!pgd.pgd) {
+               /*
+                * pgd_clear() cannot check _PAGE_USER, and is even used to
+                * clear corrupted pgd entries: so just rely on cases like
+                * kexec and EFI never to be using pgd_clear().
+                */
+               if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
+                   is_userspace_pgd(pgdp))
+                       native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+       }
+       return pgd;
+}
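
kaiser_set_shadow_pgd() is not called directly by generic code; it is meant to
be hooked into pgd writes, via native_set_pgd() in pgtable_64.h elsewhere in
this series.  A sketch of that wiring, not the verbatim header change:

    static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
    {
    #ifdef CONFIG_KAISER
            /* mirror userspace entries into the shadow pgd, set NX on the kernel copy */
            *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
    #else
            *pgdp = pgd;
    #endif
    }
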
+
+void kaiser_setup_pcid(void)
+{
+       unsigned long kern_cr3 = 0;
+       unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
+
+       if (this_cpu_has(X86_FEATURE_PCID)) {
+               kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH;
+               user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
+       }
+       /*
+        * These variables are used by the entry/exit
+        * code to change PCID and pgd and TLB flushing.
+        */
+       x86_cr3_pcid_noflush = kern_cr3;
+       this_cpu_write(x86_cr3_pcid_user, user_cr3);
+}
+
+/*
+ * Make a note that this cpu will need to flush USER tlb on return to user.
+ * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling:
+ * if the cpu lacks PCID, then the NOFLUSH bit will never have been set.
+ */
+void kaiser_flush_tlb_on_return_to_user(void)
+{
+       this_cpu_write(x86_cr3_pcid_user,
+                       X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
+}
+EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
+#endif /* CONFIG_KAISER */
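
For concreteness, here is what the values set up by kaiser_setup_pcid() and
kaiser_flush_tlb_on_return_to_user() work out to, assuming (as in the rest of
this series) KAISER_SHADOW_PGD_OFFSET == 0x1000, a user ASID of 0x80 and
X86_CR3_PCID_NOFLUSH being CR3 bit 63; those constants are defined outside
this hunk, so treat the numbers as illustrative:

    /*
     * With PCID support, after kaiser_setup_pcid():
     *   x86_cr3_pcid_noflush         = 0x8000000000000000  (kernel PCID 0, NOFLUSH)
     *   this cpu's x86_cr3_pcid_user = 0x8000000000001080  (user PCID 0x80, shadow
     *                                                       pgd at +0x1000, NOFLUSH)
     * After kaiser_flush_tlb_on_return_to_user():
     *   this cpu's x86_cr3_pcid_user = 0x0000000000001080  (NOFLUSH clear, so the
     *                                                       user PCID gets flushed)
     * The entry/exit assembly combines these with the physical address of the
     * kernel pgd to build the CR3 values it loads on kernel entry and on
     * return to userspace.
     */
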
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8573b83..7328560 100644
@@ -5,7 +5,7 @@
 #include <asm/tlb.h>
 #include <asm/fixmap.h>
 
-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
 
 #ifdef CONFIG_HIGHPTE
 #define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -253,12 +253,35 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
        }
 }
 
+#ifdef CONFIG_KAISER
+/*
+ * Instead of one pgd, we acquire two pgds.  Being order-1, it is
+ * both 8k in size and 8k-aligned.  That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
+#else
+#define PGD_ALLOCATION_ORDER 0
+#endif
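
Given that layout, a pgd and its shadow differ only in bit 12 of their
addresses, which is how native_get_shadow_pgd(), used throughout kaiser.c
above and defined elsewhere in this patch, can reach the shadow half.
Approximately:

    /* sketch: the shadow pgd occupies the second 4k page of the order-1 pair */
    static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
    {
            return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
    }
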
+
+static inline pgd_t *_pgd_alloc(void)
+{
+       /* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */
+       return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT,
+                                        PGD_ALLOCATION_ORDER);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+       free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
+}
+
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
        pgd_t *pgd;
        pmd_t *pmds[PREALLOCATED_PMDS];
 
-       pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
+       pgd = _pgd_alloc();
 
        if (pgd == NULL)
                goto out;
@@ -288,7 +311,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 out_free_pmds:
        free_pmds(pmds);
 out_free_pgd:
-       free_page((unsigned long)pgd);
+       _pgd_free(pgd);
 out:
        return NULL;
 }
@@ -298,7 +321,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
        pgd_mop_up_pmds(mm, pgd);
        pgd_dtor(pgd);
        paravirt_pgd_free(mm, pgd);
-       free_page((unsigned long)pgd);
+       _pgd_free(pgd);
 }
 
 int ptep_set_access_flags(struct vm_area_struct *vma,
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 4f5ca8f..4078e30 100644
 #include <asm/cache.h>
 #include <asm/apic.h>
 #include <asm/uv/uv.h>
+#include <asm/kaiser.h>
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
                        = { &init_mm, 0, };
 
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
                        = { &init_mm, 0, };
 
+static void load_new_mm_cr3(pgd_t *pgdir)
+{
+       unsigned long new_mm_cr3 = __pa(pgdir);
+
+#ifdef CONFIG_KAISER
+       if (this_cpu_has(X86_FEATURE_PCID)) {
+               /*
+                * We reuse the same PCID for different tasks, so we must
+                * flush all the entries for the PCID out when we change tasks.
+                * Flush KERN below, flush USER when returning to userspace in
+                * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
+                *
+                * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
+                * do it here, but can only be used if X86_FEATURE_INVPCID is
+                * available - and many machines support pcid without invpcid.
+                *
+                * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0;
+                * but keep that line in there in case something changes.
+                */
+               new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
+               kaiser_flush_tlb_on_return_to_user();
+       }
+#endif /* CONFIG_KAISER */
+
+       /*
+        * Caution: many callers of this function expect
+        * that load_new_mm_cr3() is serializing and orders TLB
+        * fills with respect to the mm_cpumask writes.
+        */
+       write_cr3(new_mm_cr3);
+}
+
 /*
  *     TLB flushing, formerly SMP-only
  *             c/o Linus Torvalds.
@@ -65,7 +98,7 @@ void leave_mm(int cpu)
                BUG();
        cpumask_clear_cpu(cpu,
                          mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
-       load_cr3(swapper_pg_dir);
+       load_new_mm_cr3(swapper_pg_dir);
 }
 EXPORT_SYMBOL_GPL(leave_mm);
 
@@ -113,11 +146,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                 * from next->pgd.  TLB fills are special and can happen
                 * due to instruction fetches or for no reason at all,
                 * and neither LOCK nor MFENCE orders them.
-                * Fortunately, load_cr3() is serializing and gives the
-                * ordering guarantee we need.
-                *
+                * Fortunately, load_new_mm_cr3() is serializing
+                * and gives the  ordering guarantee we need.
                 */
-               load_cr3(next->pgd);
+               load_new_mm_cr3(next->pgd);
 
                /* stop flush ipis for the previous mm */
                cpumask_clear_cpu(cpu, mm_cpumask(prev));
@@ -136,10 +168,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                         * tlb flush IPI delivery. We must reload CR3
                         * to make sure to use no freed page tables.
                         *
-                        * As above, load_cr3() is serializing and orders TLB
-                        * fills with respect to the mm_cpumask write.
+                        * As above, load_new_mm_cr3() is serializing and orders
+                        * TLB fills with respect to the mm_cpumask write.
                         */
-                       load_cr3(next->pgd);
+                       load_new_mm_cr3(next->pgd);
                        load_mm_ldt(next);
                }
        }
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index b5e2e4c..01c8155 100644
  */
 #define PERCPU_INPUT(cacheline)                                                \
        VMLINUX_SYMBOL(__per_cpu_start) = .;                            \
+       VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .;                \
        *(.data..percpu..first)                                         \
+       . = ALIGN(cacheline);                                           \
+       *(.data..percpu..user_mapped)                                   \
+       *(.data..percpu..user_mapped..shared_aligned)                   \
+       . = ALIGN(PAGE_SIZE);                                           \
+       *(.data..percpu..user_mapped..page_aligned)                     \
+       VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .;                  \
        . = ALIGN(PAGE_SIZE);                                           \
        *(.data..percpu..page_aligned)                                  \
        . = ALIGN(cacheline);                                           \
diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
new file mode 100644
index 0000000..4a4d6d9
--- /dev/null
@@ -0,0 +1,52 @@
+#ifndef _LINUX_KAISER_H
+#define _LINUX_KAISER_H
+
+#ifdef CONFIG_KAISER
+#include <asm/kaiser.h>
+
+static inline int kaiser_map_thread_stack(void *stack)
+{
+       /*
+        * Map that page of kernel stack on which we enter from user context.
+        */
+       return kaiser_add_mapping((unsigned long)stack +
+                       THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
+}
+
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+       /*
+        * Note: may be called even when kaiser_map_thread_stack() failed.
+        */
+       kaiser_remove_mapping((unsigned long)stack +
+                       THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
+}
+#else
+
+/*
+ * These stubs are used whenever CONFIG_KAISER is off, which
+ * includes architectures that support KAISER but have it disabled.
+ */
+
+static inline void kaiser_init(void)
+{
+}
+static inline int kaiser_add_mapping(unsigned long addr,
+                                    unsigned long size, unsigned long flags)
+{
+       return 0;
+}
+static inline void kaiser_remove_mapping(unsigned long start,
+                                        unsigned long size)
+{
+}
+static inline int kaiser_map_thread_stack(void *stack)
+{
+       return 0;
+}
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+}
+
+#endif /* !CONFIG_KAISER */
+#endif /* _LINUX_KAISER_H */
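
A worked example of the stack arithmetic above, assuming 8k kernel stacks
(THREAD_SIZE == 2 * PAGE_SIZE; the address is made up for illustration):

    /*
     * For a stack allocated at 0xffff8800c5ac8000, only the page at
     * 0xffff8800c5ac9000, the one holding the pt_regs/iret frame used on
     * entry from userspace, is added to and removed from the shadow tables
     * by kaiser_map_thread_stack()/kaiser_unmap_thread_stack().
     */
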
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 25842b6..a0b4422 100644
@@ -95,8 +95,9 @@ enum zone_stat_item {
        NR_SLAB_RECLAIMABLE,
        NR_SLAB_UNRECLAIMABLE,
        NR_PAGETABLE,           /* used for pagetables */
-       NR_KERNEL_STACK,
        /* Second 128 byte cacheline */
+       NR_KERNEL_STACK,
+       NR_KAISERTABLE,
        NR_UNSTABLE_NFS,        /* NFS unstable pages */
        NR_BOUNCE,
        NR_VMSCAN_WRITE,
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 27ef6b1..56f5eeb 100644
        (void)__vpp_verify;                                             \
 } while (0)
 
+#ifdef CONFIG_KAISER
+#define USER_MAPPED_SECTION "..user_mapped"
+#else
+#define USER_MAPPED_SECTION ""
+#endif
+
 /*
  * s390 and alpha modules require percpu variables to be defined as
  * weak to force the compiler to generate GOT based external
 #define DEFINE_PER_CPU(type, name)                                     \
        DEFINE_PER_CPU_SECTION(type, name, "")
 
+#define DECLARE_PER_CPU_USER_MAPPED(type, name)                                \
+       DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
+#define DEFINE_PER_CPU_USER_MAPPED(type, name)                         \
+       DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
 /*
  * Declaration/definition used for per-CPU variables that must come first in
  * the set of variables.
        DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
        ____cacheline_aligned_in_smp
 
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)         \
+       DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+       ____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)          \
+       DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+       ____cacheline_aligned_in_smp
+
 #define DECLARE_PER_CPU_ALIGNED(type, name)                            \
        DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION)    \
        ____cacheline_aligned
 #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)                                \
        DEFINE_PER_CPU_SECTION(type, name, "..page_aligned")            \
        __aligned(PAGE_SIZE)
+/*
+ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
+ */
+#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)           \
+       DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+       __aligned(PAGE_SIZE)
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)            \
+       DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+       __aligned(PAGE_SIZE)
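
These _USER_MAPPED variants are what place a per-cpu variable inside the
__per_cpu_user_mapped_start/__per_cpu_user_mapped_end window defined in the
linker script above, which kaiser_init() maps into the shadow page tables.
A hypothetical use (my_entry_scratch is an invented name):

    /* lands in .data..percpu..user_mapped, so it stays visible on the user CR3 */
    DEFINE_PER_CPU_USER_MAPPED(unsigned long, my_entry_scratch);
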
 
 /*
  * Declaration/definition used for per-CPU variables that must be read mostly.
  */
-#define DECLARE_PER_CPU_READ_MOSTLY(type, name)                        \
+#define DECLARE_PER_CPU_READ_MOSTLY(type, name)                                \
        DECLARE_PER_CPU_SECTION(type, name, "..readmostly")
 
 #define DEFINE_PER_CPU_READ_MOSTLY(type, name)                         \
diff --git a/init/main.c b/init/main.c
index e937d9b..558a9fd 100644
@@ -69,6 +69,7 @@
 #include <linux/slab.h>
 #include <linux/perf_event.h>
 #include <linux/random.h>
+#include <linux/kaiser.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -463,6 +464,7 @@ static void __init mm_init(void)
        percpu_init_late();
        pgtable_cache_init();
        vmalloc_init();
+       kaiser_init();
 }
 
 asmlinkage void __init start_kernel(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index 29b4604..511131a 100644
@@ -55,6 +55,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/freezer.h>
+#include <linux/kaiser.h>
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
@@ -133,6 +134,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 
 static inline void free_thread_info(struct thread_info *ti)
 {
+       kaiser_unmap_thread_stack(ti);
        free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
 }
 #endif
@@ -275,6 +277,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 
        tsk->stack = ti;
 
+       err = kaiser_map_thread_stack(tsk->stack);
+       if (err)
+               goto out;
+
        setup_thread_stack(tsk, orig);
        clear_user_return_notifier(tsk);
        clear_tsk_need_resched(tsk);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index ff90609..eaf3db0 100644
@@ -699,6 +699,7 @@ const char * const vmstat_text[] = {
        "nr_slab_unreclaimable",
        "nr_page_table_pages",
        "nr_kernel_stack",
+       "nr_overhead",
        "nr_unstable",
        "nr_bounce",
        "nr_vmscan_write",
diff --git a/security/Kconfig b/security/Kconfig
index 51bd5a0..19f8319 100644
@@ -96,6 +96,16 @@ config SECURITY
 
          If you are unsure how to answer this question, answer N.
 
+config KAISER
+       bool "Remove the kernel mapping in user mode"
+       default y
+       depends on X86_64 && SMP && !PARAVIRT
+       help
+         This enforces a strict kernel and user space isolation, in order
+         to close hardware side channels on kernel address information.
+
+         If you are unsure how to answer this question, answer Y.
+
 config SECURITYFS
        bool "Enable the securityfs filesystem"
        help