arch/x86/mm/kaiser.c

   1 #include <linux/bug.h>
   2 #include <linux/kernel.h>
   3 #include <linux/errno.h>
   4 #include <linux/string.h>
   5 #include <linux/types.h>
   6 #include <linux/bug.h>
   7 #include <linux/init.h>
   8 #include <linux/interrupt.h>
   9 #include <linux/spinlock.h>
  10 #include <linux/mm.h>
  11 #include <linux/module.h>
  12 #include <linux/uaccess.h>
  13 #include <linux/ftrace.h>
  14
  15 extern struct mm_struct init_mm;
  16
  17 #include <asm/kaiser.h>
  18 #include <asm/tlbflush.h>       /* to verify its kaiser declarations */
  19 #include <asm/vsyscall.h>
  20 #include <asm/pgtable.h>
  21 #include <asm/pgalloc.h>
  22 #include <asm/desc.h>
  23
  24 int kaiser_enabled __read_mostly = 1;
  25 EXPORT_SYMBOL(kaiser_enabled);  /* for inlined TLB flush functions */
  26
  27 DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
  28
  29 /*
  30  * These can have bit 63 set, so we can not just use a plain "or"
  31  * instruction to get their value or'd into CR3.  It would take
  32  * another register.  So, we use a memory reference to these instead.
  33  *
  34  * This is also handy because systems that do not support PCIDs
  35  * just end up or'ing a 0 into their CR3, which does no harm.
  36  */
  37 unsigned long x86_cr3_pcid_noflush __read_mostly;
  38 DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
  39
  /*
   * Serializes installation of new shadow page-table pages.
   *
   * The only things mapped at runtime are a few items for CPU hotplug
   * and the stacks of new processes.  Two CPUs never populate the same
   * addresses, so the single race to guard against is two CPUs both
   * trying to allocate and populate the same page table page.
   *
   * Hold the lock around set_p[4um]d() only; set_pte() does not need it,
   * on the assumption that just the *owner* of a given allocation ever
   * does this for _their_ allocation.
   *
   * As a consequence, once the system has been up for a while, stacks
   * have lived all over and these page tables are fully populated, the
   * lock is never acquired again.
   */
  static DEFINE_SPINLOCK(shadow_table_allocation_lock);
  58
  59 /*
  60  * Returns -1 on error.
  61  */
  62 static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
  63 {
  64         pgd_t *pgd;
  65         pud_t *pud;
  66         pmd_t *pmd;
  67         pte_t *pte;
  68
  69         pgd = pgd_offset_k(vaddr);
  70         /*
  71          * We made all the kernel PGDs present in kaiser_init().
  72          * We expect them to stay that way.
  73          */
  74         BUG_ON(pgd_none(*pgd));
  75         /*
  76          * PGDs are either 512GB or 128TB on all x86_64
  77          * configurations.  We don't handle these.
  78          */
  79         BUG_ON(pgd_large(*pgd));
  80
  81         pud = pud_offset(pgd, vaddr);
  82         if (pud_none(*pud)) {
  83                 WARN_ON_ONCE(1);
  84                 return -1;
  85         }
  86
  87         if (pud_large(*pud))
  88                 return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
  89
  90         pmd = pmd_offset(pud, vaddr);
  91         if (pmd_none(*pmd)) {
  92                 WARN_ON_ONCE(1);
  93                 return -1;
  94         }
  95
  96         if (pmd_large(*pmd))
  97                 return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
  98
  99         pte = pte_offset_kernel(pmd, vaddr);
 100         if (pte_none(*pte)) {
 101                 WARN_ON_ONCE(1);
 102                 return -1;
 103         }
 104
 105         return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
 106 }
 107
 108 /*
 109  * This is a relatively normal page table walk, except that it
 110  * also tries to allocate page tables pages along the way.
 111  *
 112  * Returns a pointer to a PTE on success, or NULL on failure.
 113  */
 114 static pte_t *kaiser_pagetable_walk(unsigned long address)
 115 {
 116         pmd_t *pmd;
 117         pud_t *pud;
 118         pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
 119         gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
 120
 121         if (pgd_none(*pgd)) {
 122                 WARN_ONCE(1, "All shadow pgds should have been populated");
 123                 return NULL;
 124         }
 125         BUILD_BUG_ON(pgd_large(*pgd) != 0);
 126
 127         pud = pud_offset(pgd, address);
 128         /* The shadow page tables do not use large mappings: */
 129         if (pud_large(*pud)) {
 130                 WARN_ON(1);
 131                 return NULL;
 132         }
 133         if (pud_none(*pud)) {
 134                 unsigned long new_pmd_page = __get_free_page(gfp);
 135                 if (!new_pmd_page)
 136                         return NULL;
 137                 spin_lock(&shadow_table_allocation_lock);
 138                 if (pud_none(*pud)) {
 139                         set_pud(pud, __pud(_PAGE_TABLE | __pa(new_pmd_page)));
 140                         __inc_zone_page_state(virt_to_page((void *)
 141                                                 new_pmd_page), NR_KAISERTABLE);
 142                 } else
 143                         free_page(new_pmd_page);
 144                 spin_unlock(&shadow_table_allocation_lock);
 145         }
 146
 147         pmd = pmd_offset(pud, address);
 148         /* The shadow page tables do not use large mappings: */
 149         if (pmd_large(*pmd)) {
 150                 WARN_ON(1);
 151                 return NULL;
 152         }
 153         if (pmd_none(*pmd)) {
 154                 unsigned long new_pte_page = __get_free_page(gfp);
 155                 if (!new_pte_page)
 156                         return NULL;
 157                 spin_lock(&shadow_table_allocation_lock);
 158                 if (pmd_none(*pmd)) {
 159                         set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(new_pte_page)));
 160                         __inc_zone_page_state(virt_to_page((void *)
 161                                                 new_pte_page), NR_KAISERTABLE);
 162                 } else
 163                         free_page(new_pte_page);
 164                 spin_unlock(&shadow_table_allocation_lock);
 165         }
 166
 167         return pte_offset_kernel(pmd, address);
 168 }
 169
 170 static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
 171                                unsigned long flags)
 172 {
 173         int ret = 0;
 174         pte_t *pte;
 175         unsigned long start_addr = (unsigned long )__start_addr;
 176         unsigned long address = start_addr & PAGE_MASK;
 177         unsigned long end_addr = PAGE_ALIGN(start_addr + size);
 178         unsigned long target_address;
 179
 180         /*
 181          * It is convenient for callers to pass in __PAGE_KERNEL etc,
 182          * and there is no actual harm from setting _PAGE_GLOBAL, so
 183          * long as CR4.PGE is not set.  But it is nonetheless troubling
 184          * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
 185          * requires that not to be #defined to 0): so mask it off here.
 186          */
 187         flags &= ~_PAGE_GLOBAL;
 188
 189         if (flags & _PAGE_USER)
 190                 BUG_ON(address < FIXADDR_START || end_addr >= FIXADDR_TOP);
 191
 192         for (; address < end_addr; address += PAGE_SIZE) {
 193                 target_address = get_pa_from_mapping(address);
 194                 if (target_address == -1) {
 195                         ret = -EIO;
 196                         break;
 197                 }
 198                 pte = kaiser_pagetable_walk(address);
 199                 if (!pte) {
 200                         ret = -ENOMEM;
 201                         break;
 202                 }
 203                 if (pte_none(*pte)) {
 204                         set_pte(pte, __pte(flags | target_address));
 205                 } else {
 206                         pte_t tmp;
 207                         set_pte(&tmp, __pte(flags | target_address));
 208                         WARN_ON_ONCE(!pte_same(*pte, tmp));
 209                 }
 210         }
 211         return ret;
 212 }
 213
 /*
  * Pointer-pair front end for kaiser_add_user_map(): the length handed
  * on is the byte distance from @start to @end.  Returns whatever
  * kaiser_add_user_map() returns.
  */
 static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
 {
         return kaiser_add_user_map(start, end - start, flags);
 }
 220
 221 /*
 222  * Ensure that the top level of the (shadow) page tables are
 223  * entirely populated.  This ensures that all processes that get
 224  * forked have the same entries.  This way, we do not have to
 225  * ever go set up new entries in older processes.
 226  *
 227  * Note: we never free these, so there are no updates to them
 228  * after this.
 229  */
 230 static void __init kaiser_init_all_pgds(void)
 231 {
 232         pgd_t *pgd;
 233         int i = 0;
 234
 235         pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
 236         for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
 237                 pgd_t new_pgd;
 238                 pud_t *pud = pud_alloc_one(&init_mm,
 239                                            PAGE_OFFSET + i * PGDIR_SIZE);
 240                 if (!pud) {
 241                         WARN_ON(1);
 242                         break;
 243                 }
 244                 inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
 245                 new_pgd = __pgd(_PAGE_TABLE |__pa(pud));
 246                 /*
 247                  * Make sure not to stomp on some other pgd entry.
 248                  */
 249                 if (!pgd_none(pgd[i])) {
 250                         WARN_ON(1);
 251                         continue;
 252                 }
 253                 set_pgd(pgd + i, new_pgd);
 254         }
 255 }
 256
 /*
  * Boot-time wrappers: perform the mapping and WARN_ON() a non-zero
  * return value instead of handing it back to the caller.
  */
 #define kaiser_add_user_map_early(start, size, flags) do {              \
         WARN_ON(kaiser_add_user_map(start, size, flags));               \
 } while (0)

 #define kaiser_add_user_map_ptrs_early(start, end, flags) do {          \
         WARN_ON(kaiser_add_user_map_ptrs(start, end, flags));           \
 } while (0)
 266
 267 /*
 268  * If anything in here fails, we will likely die on one of the
 269  * first kernel->user transitions and init will die.  But, we
 270  * will have most of the kernel up by then and should be able to
 271  * get a clean warning out of it.  If we BUG_ON() here, we run
 272  * the risk of being before we have good console output.
 273  */
 274 void __init kaiser_init(void)
 275 {
 276         int cpu;
 277
 278         if (!kaiser_enabled)
 279                 return;
 280         kaiser_init_all_pgds();
 281
 282         for_each_possible_cpu(cpu) {
 283                 void *percpu_vaddr = __per_cpu_user_mapped_start +
 284                                      per_cpu_offset(cpu);
 285                 unsigned long percpu_sz = __per_cpu_user_mapped_end -
 286                                           __per_cpu_user_mapped_start;
 287                 kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
 288                                           __PAGE_KERNEL);
 289         }
 290
 291         /*
 292          * Map the entry/exit text section, which is needed at
 293          * switches from user to and from kernel.
 294          */
 295         kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
 296                                        __PAGE_KERNEL_RX);
 297         kaiser_add_user_map_ptrs_early(__kprobes_text_start, __kprobes_text_end,
 298                                        __PAGE_KERNEL_RX);
 299 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 300         kaiser_add_user_map_ptrs_early(__irqentry_text_start,
 301                                        __irqentry_text_end,
 302                                        __PAGE_KERNEL_RX);
 303 #endif
 304         kaiser_add_user_map_early((void *)idt_descr.address,
 305                                   sizeof(gate_desc) * NR_VECTORS,
 306                                   __PAGE_KERNEL_RO);
 307         kaiser_add_user_map_early((void *)VVAR_ADDRESS, PAGE_SIZE,
 308                                   __PAGE_KERNEL_VVAR);
 309         kaiser_add_user_map_early((void *)VSYSCALL_START, PAGE_SIZE,
 310                                   vsyscall_pgprot);
 311         kaiser_add_user_map_early(&x86_cr3_pcid_noflush,
 312                                   sizeof(x86_cr3_pcid_noflush),
 313                                   __PAGE_KERNEL);
 314 }
 315
 316 /* Add a mapping to the shadow mapping, and synchronize the mappings */
 317 int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
 318 {
 319         if (!kaiser_enabled)
 320                 return 0;
 321         return kaiser_add_user_map((const void *)addr, size, flags);
 322 }
 323
 324 void kaiser_remove_mapping(unsigned long start, unsigned long size)
 325 {
 326         unsigned long end = start + size;
 327         unsigned long addr;
 328         pte_t *pte;
 329
 330         if (!kaiser_enabled)
 331                 return;
 332         for (addr = start; addr < end; addr += PAGE_SIZE) {
 333                 pte = kaiser_pagetable_walk(addr);
 334                 if (pte)
 335                         set_pte(pte, __pte(0));
 336         }
 337 }
 338
 339 /*
 340  * Page table pages are page-aligned.  The lower half of the top
 341  * level is used for userspace and the top half for the kernel.
 342  * This returns true for user pages that need to get copied into
 343  * both the user and kernel copies of the page tables, and false
 344  * for kernel pages that should only be in the kernel copy.
 345  */
 346 static inline bool is_userspace_pgd(pgd_t *pgdp)
 347 {
 348         return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
 349 }
 350
 351 pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
 352 {
 353         if (!kaiser_enabled)
 354                 return pgd;
 355         /*
 356          * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
 357          * skip cases like kexec and EFI which make temporary low mappings.
 358          */
 359         if (pgd.pgd & _PAGE_USER) {
 360                 if (is_userspace_pgd(pgdp)) {
 361                         native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
 362                         /*
 363                          * Even if the entry is *mapping* userspace, ensure
 364                          * that userspace can not use it.  This way, if we
 365                          * get out to userspace running on the kernel CR3,
 366                          * userspace will crash instead of running.
 367                          */
 368                         pgd.pgd |= _PAGE_NX;
 369                 }
 370         } else if (!pgd.pgd) {
 371                 /*
 372                  * pgd_clear() cannot check _PAGE_USER, and is even used to
 373                  * clear corrupted pgd entries: so just rely on cases like
 374                  * kexec and EFI never to be using pgd_clear().
 375                  */
 376                 if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
 377                     is_userspace_pgd(pgdp))
 378                         native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
 379         }
 380         return pgd;
 381 }
 382
 383 void kaiser_setup_pcid(void)
 384 {
 385         unsigned long kern_cr3 = 0;
 386         unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
 387
 388         if (this_cpu_has(X86_FEATURE_PCID)) {
 389                 kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH;
 390                 user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
 391         }
 392         /*
 393          * These variables are used by the entry/exit
 394          * code to change PCID and pgd and TLB flushing.
 395          */
 396         x86_cr3_pcid_noflush = kern_cr3;
 397         this_cpu_write(x86_cr3_pcid_user, user_cr3);
 398 }
 399
 400 /*
 401  * Make a note that this cpu will need to flush USER tlb on return to user.
 402  * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling:
 403  * if cpu does not, then the NOFLUSH bit will never have been set.
 404  */
 405 void kaiser_flush_tlb_on_return_to_user(void)
 406 {
 407         this_cpu_write(x86_cr3_pcid_user,
 408                         X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
 409 }
 410 EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);