arch/x86/mm/kaiser.c

   1 #include <linux/bug.h>
   2 #include <linux/kernel.h>
   3 #include <linux/errno.h>
   4 #include <linux/string.h>
   5 #include <linux/types.h>
   6 #include <linux/bug.h>
   7 #include <linux/init.h>
   8 #include <linux/interrupt.h>
   9 #include <linux/spinlock.h>
  10 #include <linux/mm.h>
  11 #include <linux/module.h>
  12 #include <linux/uaccess.h>
  13 #include <linux/ftrace.h>
  14
  15 extern struct mm_struct init_mm;
  16
  17 #include <asm/kaiser.h>
  18 #include <asm/tlbflush.h>       /* to verify its kaiser declarations */
  19 #include <asm/pgtable.h>
  20 #include <asm/pgalloc.h>
  21 #include <asm/desc.h>
  22
  23 #ifdef CONFIG_KAISER
  24 DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
  25
  26 /*
  27  * These can have bit 63 set, so we can not just use a plain "or"
  28  * instruction to get their value or'd into CR3.  It would take
  29  * another register.  So, we use a memory reference to these instead.
  30  *
  31  * This is also handy because systems that do not support PCIDs
  32  * just end up or'ing a 0 into their CR3, which does no harm.
  33  */
  34 unsigned long x86_cr3_pcid_noflush __read_mostly;
  35 DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
  36
  37 /*
  38  * At runtime, the only things we map are some things for CPU
  39  * hotplug, and stacks for new processes.  No two CPUs will ever
  40  * be populating the same addresses, so we only need to ensure
  41  * that we protect between two CPUs trying to allocate and
  42  * populate the same page table page.
  43  *
  44  * Only take this lock when doing a set_p[4um]d(), but it is not
  45  * needed for doing a set_pte().  We assume that only the *owner*
  46  * of a given allocation will be doing this for _their_
  47  * allocation.
  48  *
  49  * This ensures that once a system has been running for a while
  50  * and there have been stacks all over and these page tables
  51  * are fully populated, there will be no further acquisitions of
  52  * this lock.
  53  */
  54 static DEFINE_SPINLOCK(shadow_table_allocation_lock);
  55
  56 /*
  57  * Returns -1 on error.
  58  */
  59 static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
  60 {
  61         pgd_t *pgd;
  62         pud_t *pud;
  63         pmd_t *pmd;
  64         pte_t *pte;
  65
  66         pgd = pgd_offset_k(vaddr);
  67         /*
  68          * We made all the kernel PGDs present in kaiser_init().
  69          * We expect them to stay that way.
  70          */
  71         BUG_ON(pgd_none(*pgd));
  72         /*
  73          * PGDs are either 512GB or 128TB on all x86_64
  74          * configurations.  We don't handle these.
  75          */
  76         BUG_ON(pgd_large(*pgd));
  77
  78         pud = pud_offset(pgd, vaddr);
  79         if (pud_none(*pud)) {
  80                 WARN_ON_ONCE(1);
  81                 return -1;
  82         }
  83
  84         if (pud_large(*pud))
  85                 return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
  86
  87         pmd = pmd_offset(pud, vaddr);
  88         if (pmd_none(*pmd)) {
  89                 WARN_ON_ONCE(1);
  90                 return -1;
  91         }
  92
  93         if (pmd_large(*pmd))
  94                 return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
  95
  96         pte = pte_offset_kernel(pmd, vaddr);
  97         if (pte_none(*pte)) {
  98                 WARN_ON_ONCE(1);
  99                 return -1;
 100         }
 101
 102         return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
 103 }
 104
 105 /*
 106  * This is a relatively normal page table walk, except that it
 107  * also tries to allocate page tables pages along the way.
 108  *
 109  * Returns a pointer to a PTE on success, or NULL on failure.
 110  */
 111 static pte_t *kaiser_pagetable_walk(unsigned long address)
 112 {
 113         pmd_t *pmd;
 114         pud_t *pud;
 115         pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
 116         gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
 117
 118         if (pgd_none(*pgd)) {
 119                 WARN_ONCE(1, "All shadow pgds should have been populated");
 120                 return NULL;
 121         }
 122         BUILD_BUG_ON(pgd_large(*pgd) != 0);
 123
 124         pud = pud_offset(pgd, address);
 125         /* The shadow page tables do not use large mappings: */
 126         if (pud_large(*pud)) {
 127                 WARN_ON(1);
 128                 return NULL;
 129         }
 130         if (pud_none(*pud)) {
 131                 unsigned long new_pmd_page = __get_free_page(gfp);
 132                 if (!new_pmd_page)
 133                         return NULL;
 134                 spin_lock(&shadow_table_allocation_lock);
 135                 if (pud_none(*pud)) {
 136                         set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
 137                         __inc_zone_page_state(virt_to_page((void *)
 138                                                 new_pmd_page), NR_KAISERTABLE);
 139                 } else
 140                         free_page(new_pmd_page);
 141                 spin_unlock(&shadow_table_allocation_lock);
 142         }
 143
 144         pmd = pmd_offset(pud, address);
 145         /* The shadow page tables do not use large mappings: */
 146         if (pmd_large(*pmd)) {
 147                 WARN_ON(1);
 148                 return NULL;
 149         }
 150         if (pmd_none(*pmd)) {
 151                 unsigned long new_pte_page = __get_free_page(gfp);
 152                 if (!new_pte_page)
 153                         return NULL;
 154                 spin_lock(&shadow_table_allocation_lock);
 155                 if (pmd_none(*pmd)) {
 156                         set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
 157                         __inc_zone_page_state(virt_to_page((void *)
 158                                                 new_pte_page), NR_KAISERTABLE);
 159                 } else
 160                         free_page(new_pte_page);
 161                 spin_unlock(&shadow_table_allocation_lock);
 162         }
 163
 164         return pte_offset_kernel(pmd, address);
 165 }
 166
 167 int kaiser_add_user_map(const void *__start_addr, unsigned long size,
 168                         unsigned long flags)
 169 {
 170         int ret = 0;
 171         pte_t *pte;
 172         unsigned long start_addr = (unsigned long )__start_addr;
 173         unsigned long address = start_addr & PAGE_MASK;
 174         unsigned long end_addr = PAGE_ALIGN(start_addr + size);
 175         unsigned long target_address;
 176
 177         for (; address < end_addr; address += PAGE_SIZE) {
 178                 target_address = get_pa_from_mapping(address);
 179                 if (target_address == -1) {
 180                         ret = -EIO;
 181                         break;
 182                 }
 183                 pte = kaiser_pagetable_walk(address);
 184                 if (!pte) {
 185                         ret = -ENOMEM;
 186                         break;
 187                 }
 188                 if (pte_none(*pte)) {
 189                         set_pte(pte, __pte(flags | target_address));
 190                 } else {
 191                         pte_t tmp;
 192                         set_pte(&tmp, __pte(flags | target_address));
 193                         WARN_ON_ONCE(!pte_same(*pte, tmp));
 194                 }
 195         }
 196         return ret;
 197 }
 198
 199 static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
 200 {
 201         unsigned long size = end - start;
 202
 203         return kaiser_add_user_map(start, size, flags);
 204 }
 205
 206 /*
 207  * Ensure that the top level of the (shadow) page tables are
 208  * entirely populated.  This ensures that all processes that get
 209  * forked have the same entries.  This way, we do not have to
 210  * ever go set up new entries in older processes.
 211  *
 212  * Note: we never free these, so there are no updates to them
 213  * after this.
 214  */
 215 static void __init kaiser_init_all_pgds(void)
 216 {
 217         pgd_t *pgd;
 218         int i = 0;
 219
 220         pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
 221         for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
 222                 pgd_t new_pgd;
 223                 pud_t *pud = pud_alloc_one(&init_mm,
 224                                            PAGE_OFFSET + i * PGDIR_SIZE);
 225                 if (!pud) {
 226                         WARN_ON(1);
 227                         break;
 228                 }
 229                 inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
 230                 new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
 231                 /*
 232                  * Make sure not to stomp on some other pgd entry.
 233                  */
 234                 if (!pgd_none(pgd[i])) {
 235                         WARN_ON(1);
 236                         continue;
 237                 }
 238                 set_pgd(pgd + i, new_pgd);
 239         }
 240 }
 241
 242 #define kaiser_add_user_map_early(start, size, flags) do {      \
 243         int __ret = kaiser_add_user_map(start, size, flags);    \
 244         WARN_ON(__ret);                                         \
 245 } while (0)
 246
 247 #define kaiser_add_user_map_ptrs_early(start, end, flags) do {          \
 248         int __ret = kaiser_add_user_map_ptrs(start, end, flags);        \
 249         WARN_ON(__ret);                                                 \
 250 } while (0)
 251
 252 /*
 253  * If anything in here fails, we will likely die on one of the
 254  * first kernel->user transitions and init will die.  But, we
 255  * will have most of the kernel up by then and should be able to
 256  * get a clean warning out of it.  If we BUG_ON() here, we run
 257  * the risk of being before we have good console output.
 258  */
 259 void __init kaiser_init(void)
 260 {
 261         int cpu;
 262
 263         kaiser_init_all_pgds();
 264
 265         for_each_possible_cpu(cpu) {
 266                 void *percpu_vaddr = __per_cpu_user_mapped_start +
 267                                      per_cpu_offset(cpu);
 268                 unsigned long percpu_sz = __per_cpu_user_mapped_end -
 269                                           __per_cpu_user_mapped_start;
 270                 kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
 271                                           __PAGE_KERNEL);
 272         }
 273
 274         /*
 275          * Map the entry/exit text section, which is needed at
 276          * switches from user to and from kernel.
 277          */
 278         kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
 279                                        __PAGE_KERNEL_RX);
 280 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 281         kaiser_add_user_map_ptrs_early(__irqentry_text_start,
 282                                        __irqentry_text_end,
 283                                        __PAGE_KERNEL_RX);
 284 #endif
 285         kaiser_add_user_map_early((void *)idt_descr.address,
 286                                   sizeof(gate_desc) * NR_VECTORS,
 287                                   __PAGE_KERNEL_RO);
 288         kaiser_add_user_map_early(&x86_cr3_pcid_noflush,
 289                                   sizeof(x86_cr3_pcid_noflush),
 290                                   __PAGE_KERNEL);
 291 }
 292
 293 /* Add a mapping to the shadow mapping, and synchronize the mappings */
 294 int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
 295 {
 296         return kaiser_add_user_map((const void *)addr, size, flags);
 297 }
 298
 299 void kaiser_remove_mapping(unsigned long start, unsigned long size)
 300 {
 301         unsigned long end = start + size;
 302         unsigned long addr;
 303         pte_t *pte;
 304
 305         for (addr = start; addr < end; addr += PAGE_SIZE) {
 306                 pte = kaiser_pagetable_walk(addr);
 307                 if (pte)
 308                         set_pte(pte, __pte(0));
 309         }
 310 }
 311
 312 /*
 313  * Page table pages are page-aligned.  The lower half of the top
 314  * level is used for userspace and the top half for the kernel.
 315  * This returns true for user pages that need to get copied into
 316  * both the user and kernel copies of the page tables, and false
 317  * for kernel pages that should only be in the kernel copy.
 318  */
 319 static inline bool is_userspace_pgd(pgd_t *pgdp)
 320 {
 321         return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
 322 }
 323
 324 pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
 325 {
 326         /*
 327          * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
 328          * skip cases like kexec and EFI which make temporary low mappings.
 329          */
 330         if (pgd.pgd & _PAGE_USER) {
 331                 if (is_userspace_pgd(pgdp)) {
 332                         native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
 333                         /*
 334                          * Even if the entry is *mapping* userspace, ensure
 335                          * that userspace can not use it.  This way, if we
 336                          * get out to userspace running on the kernel CR3,
 337                          * userspace will crash instead of running.
 338                          */
 339                         pgd.pgd |= _PAGE_NX;
 340                 }
 341         } else if (!pgd.pgd) {
 342                 /*
 343                  * pgd_clear() cannot check _PAGE_USER, and is even used to
 344                  * clear corrupted pgd entries: so just rely on cases like
 345                  * kexec and EFI never to be using pgd_clear().
 346                  */
 347                 if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
 348                     is_userspace_pgd(pgdp))
 349                         native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
 350         }
 351         return pgd;
 352 }
 353
 354 void kaiser_setup_pcid(void)
 355 {
 356         unsigned long kern_cr3 = 0;
 357         unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
 358
 359         if (this_cpu_has(X86_FEATURE_PCID)) {
 360                 kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH;
 361                 user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
 362         }
 363         /*
 364          * These variables are used by the entry/exit
 365          * code to change PCID and pgd and TLB flushing.
 366          */
 367         x86_cr3_pcid_noflush = kern_cr3;
 368         this_cpu_write(x86_cr3_pcid_user, user_cr3);
 369 }
 370
 371 /*
 372  * Make a note that this cpu will need to flush USER tlb on return to user.
 373  * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling:
 374  * if cpu does not, then the NOFLUSH bit will never have been set.
 375  */
 376 void kaiser_flush_tlb_on_return_to_user(void)
 377 {
 378         this_cpu_write(x86_cr3_pcid_user,
 379                         X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
 380 }
 381 EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
 382 #endif /* CONFIG_KAISER */