#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>

#undef pr_fmt
#define pr_fmt(fmt)	"Kernel/User page tables isolation: " fmt
extern struct mm_struct init_mm;

#include <asm/kaiser.h>
#include <asm/tlbflush.h>	/* to verify its kaiser declarations */
#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
#include <asm/cmdline.h>
int kaiser_enabled __read_mostly = 1;
EXPORT_SYMBOL(kaiser_enabled);	/* for inlined TLB flush functions */
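/*
 * Scratch slot used by the entry/exit code while switching CR3 on paths
 * that cannot use the stack; it must live in the user-mapped per-cpu
 * area so it is reachable with either page table loaded.
 */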
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/*
 * These can have bit 63 set, so we can not just use a plain "or"
 * instruction to get their value or'd into CR3.  It would take
 * another register.  So, we use a memory reference to these instead.
 *
 * This is also handy because systems that do not support PCIDs
 * just end up or'ing a 0 into their CR3, which does no harm.
 */
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
/*
 * At runtime, the only things we map are some things for CPU
 * hotplug, and stacks for new processes.  No two CPUs will ever
 * be populating the same addresses, so we only need to ensure
 * that we protect between two CPUs trying to allocate and
 * populate the same page table page.
 *
 * Only take this lock when doing a set_p[4um]d(), but it is not
 * needed for doing a set_pte().  We assume that only the *owner*
 * of a given allocation will be doing this for _their_
 * allocation.
 *
 * This ensures that once a system has been running for a while
 * and there have been stacks all over and these page tables
 * are fully populated, there will be no further acquisitions of
 * this lock.
 */
static DEFINE_SPINLOCK(shadow_table_allocation_lock);
/*
 * Returns -1 on error.
 */
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(vaddr);
	/*
	 * We made all the kernel PGDs present in kaiser_init().
	 * We expect them to stay that way.
	 */
	BUG_ON(pgd_none(*pgd));
	/*
	 * PGDs are either 512GB or 128TB on all x86_64
	 * configurations.  We don't handle these.
	 */
	BUG_ON(pgd_large(*pgd));

	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud))
		return -1;
	if (pud_large(*pud))
		return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);

	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd))
		return -1;
	if (pmd_large(*pmd))
		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);

	pte = pte_offset_kernel(pmd, vaddr);
	if (pte_none(*pte))
		return -1;

	return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}
/*
 * This is a relatively normal page table walk, except that it
 * also tries to allocate page table pages along the way.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static pte_t *kaiser_pagetable_walk(unsigned long address)
{
	pmd_t *pmd;
	pud_t *pud;
	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);

	if (pgd_none(*pgd)) {
		WARN_ONCE(1, "All shadow pgds should have been populated");
		return NULL;
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);

	pud = pud_offset(pgd, address);
	/* The shadow page tables do not use large mappings: */
	if (pud_large(*pud)) {
		WARN_ON(1);
		return NULL;
	}
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (!new_pmd_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		/* Re-check under the lock: the page may have been installed meanwhile. */
		if (pud_none(*pud)) {
			set_pud(pud, __pud(_PAGE_TABLE | __pa(new_pmd_page)));
			__inc_zone_page_state(virt_to_page((void *)
						new_pmd_page), NR_KAISERTABLE);
		} else
			free_page(new_pmd_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	pmd = pmd_offset(pud, address);
	/* The shadow page tables do not use large mappings: */
	if (pmd_large(*pmd)) {
		WARN_ON(1);
		return NULL;
	}
	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pmd_none(*pmd)) {
			set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(new_pte_page)));
			__inc_zone_page_state(virt_to_page((void *)
						new_pte_page), NR_KAISERTABLE);
		} else
			free_page(new_pte_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	return pte_offset_kernel(pmd, address);
}
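/*
 * Map the kernel virtual range starting at @__start_addr into the shadow
 * (user) page tables with the given protection @flags.  Any entry that
 * already exists must match the mapping being requested.
 */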
static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
			       unsigned long flags)
{
	int ret = 0;
	pte_t *pte;
	unsigned long start_addr = (unsigned long)__start_addr;
	unsigned long address = start_addr & PAGE_MASK;
	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
	unsigned long target_address;

	/*
	 * It is convenient for callers to pass in __PAGE_KERNEL etc,
	 * and there is no actual harm from setting _PAGE_GLOBAL, so
	 * long as CR4.PGE is not set.  But it is nonetheless troubling
	 * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
	 * requires that not to be #defined to 0): so mask it off here.
	 */
	flags &= ~_PAGE_GLOBAL;
	if (!(__supported_pte_mask & _PAGE_NX))
		flags &= ~_PAGE_NX;

	if (flags & _PAGE_USER)
		BUG_ON(address < FIXADDR_START || end_addr >= FIXADDR_TOP);

	for (; address < end_addr; address += PAGE_SIZE) {
		target_address = get_pa_from_mapping(address);
		if (target_address == -1) {
			ret = -EIO;
			break;
		}
		pte = kaiser_pagetable_walk(address);
		if (!pte) {
			ret = -ENOMEM;
			break;
		}
		if (pte_none(*pte)) {
			set_pte(pte, __pte(flags | target_address));
		} else {
			pte_t tmp;
			set_pte(&tmp, __pte(flags | target_address));
			WARN_ON_ONCE(!pte_same(*pte, tmp));
		}
	}
	return ret;
}
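/* Convenience wrapper for mapping the range between two kernel symbols. */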
static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
	unsigned long size = end - start;

	return kaiser_add_user_map(start, size, flags);
}
/*
 * Ensure that the top level of the (shadow) page tables are
 * entirely populated.  This ensures that all processes that get
 * forked have the same entries.  This way, we do not have to
 * ever go set up new entries in older processes.
 *
 * Note: we never free these, so there are no updates to them
 * after this.
 */
static void __init kaiser_init_all_pgds(void)
{
	pgd_t *pgd;
	int i;

	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0));
	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
		pgd_t new_pgd;
		pud_t *pud = pud_alloc_one(&init_mm,
					   PAGE_OFFSET + i * PGDIR_SIZE);
		if (!pud) {
			WARN_ON(1);
			break;
		}
		inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
		new_pgd = __pgd(_PAGE_TABLE | __pa(pud));
		/*
		 * Make sure not to stomp on some other pgd entry.
		 */
		if (!pgd_none(pgd[i])) {
			WARN_ON(1);
			continue;
		}
		set_pgd(pgd + i, new_pgd);
	}
}
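/*
 * Boot-time wrappers: a mapping failure this early is not fatal,
 * so it is only worth a warning.
 */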
#define kaiser_add_user_map_early(start, size, flags) do {	\
	int __ret = kaiser_add_user_map(start, size, flags);	\
	WARN_ON(__ret);						\
} while (0)

#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
	int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
	WARN_ON(__ret);							\
} while (0)
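/*
 * Parse "pti=" and "nopti" on the kernel command line and decide whether
 * kernel/user page table isolation stays enabled on this machine.
 */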
void __init kaiser_check_boottime_disable(void)
{
	char arg[5];
	int ret;

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0) {
		if (!strncmp(arg, "on", 2))
			goto enable;
		if (!strncmp(arg, "off", 3))
			goto disable;
		if (!strncmp(arg, "auto", 4))
			goto skip;
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti"))
		goto disable;

skip:
	/* AMD processors are not subject to Meltdown, so default to off. */
	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		goto disable;

enable:
	setup_force_cpu_cap(X86_FEATURE_KAISER);
	return;

disable:
	pr_info("disabled\n");
	kaiser_enabled = 0;
	setup_clear_cpu_cap(X86_FEATURE_KAISER);
}
/*
 * If anything in here fails, we will likely die on one of the
 * first kernel->user transitions and init will die.  But, we
 * will have most of the kernel up by then and should be able to
 * get a clean warning out of it.  If we BUG_ON() here, we run
 * the risk of being before we have good console output.
 */
void __init kaiser_init(void)
{
	int cpu;

	if (!kaiser_enabled)
		return;

	kaiser_init_all_pgds();

	for_each_possible_cpu(cpu) {
		void *percpu_vaddr = __per_cpu_user_mapped_start +
				     per_cpu_offset(cpu);
		unsigned long percpu_sz = __per_cpu_user_mapped_end -
					  __per_cpu_user_mapped_start;
		kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
					  __PAGE_KERNEL);
	}

	/*
	 * Map the entry/exit text section, which is needed at
	 * switches from user to and from kernel.
	 */
	kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
				       __PAGE_KERNEL_RX);
	kaiser_add_user_map_ptrs_early(__kprobes_text_start, __kprobes_text_end,
				       __PAGE_KERNEL_RX);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
	kaiser_add_user_map_ptrs_early(__irqentry_text_start,
				       __irqentry_text_end,
				       __PAGE_KERNEL_RX);
#endif
	kaiser_add_user_map_early((void *)idt_descr.address,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL_RO);
	kaiser_add_user_map_early((void *)VVAR_ADDRESS, PAGE_SIZE,
				  __PAGE_KERNEL_VVAR);
	kaiser_add_user_map_early((void *)VSYSCALL_START, PAGE_SIZE,
				  vsyscall_pgprot);

	pr_info("enabled\n");
}
/* Add a mapping to the shadow mapping, and synchronize the mappings */
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
	if (!kaiser_enabled)
		return 0;
	return kaiser_add_user_map((const void *)addr, size, flags);
}
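/* Remove a mapping from the shadow mapping, and synchronize the mappings */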
void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
	unsigned long end = start + size;
	unsigned long addr;
	pte_t *pte;

	if (!kaiser_enabled)
		return;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		pte = kaiser_pagetable_walk(addr);
		if (pte)
			set_pte(pte, __pte(0));
	}
}
/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 * This returns true for user pages that need to get copied into
 * both the user and kernel copies of the page tables, and false
 * for kernel pages that should only be in the kernel copy.
 */
static inline bool is_userspace_pgd(pgd_t *pgdp)
{
	return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
}
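/*
 * Called whenever a top-level (pgd) entry is written, so that entries
 * mapping userspace are mirrored into the shadow page tables.
 */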
pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
	if (!kaiser_enabled)
		return pgd;
	/*
	 * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
	 * skip cases like kexec and EFI which make temporary low mappings.
	 */
	if (pgd.pgd & _PAGE_USER) {
		if (is_userspace_pgd(pgdp)) {
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
			/*
			 * Even if the entry is *mapping* userspace, ensure
			 * that userspace can not use it.  This way, if we
			 * get out to userspace running on the kernel CR3,
			 * userspace will crash instead of running.
			 */
			if (__supported_pte_mask & _PAGE_NX)
				pgd.pgd |= _PAGE_NX;
		}
	} else if (!pgd.pgd) {
		/*
		 * pgd_clear() cannot check _PAGE_USER, and is even used to
		 * clear corrupted pgd entries: so just rely on cases like
		 * kexec and EFI never to be using pgd_clear().
		 */
		if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
		    is_userspace_pgd(pgdp))
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
	}
	return pgd;
}
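/*
 * Set up this cpu's user-mode CR3 value: the shadow pgd offset plus,
 * when PCIDs are available, the user PCID with the NOFLUSH bit set.
 */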
void kaiser_setup_pcid(void)
{
	unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;

	if (this_cpu_has(X86_FEATURE_PCID))
		user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
	/*
	 * These variables are used by the entry/exit
	 * code to change PCID and pgd and TLB flushing.
	 */
	this_cpu_write(x86_cr3_pcid_user, user_cr3);
}
/*
 * Make a note that this cpu will need to flush the USER TLB on return to
 * user.  If the cpu does not have PCID, then the NOFLUSH bit will never
 * have been set.
 */
void kaiser_flush_tlb_on_return_to_user(void)
{
	if (this_cpu_has(X86_FEATURE_PCID))
		this_cpu_write(x86_cr3_pcid_user,
			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);