2 * Kernel-based Virtual Machine driver for Linux
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
13 * Yaniv Kamay <yaniv@qumranet.com>
14 * Avi Kivity <avi@qumranet.com>
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
24 #include "kvm_cache_regs.h"
27 #include <linux/kvm_host.h>
28 #include <linux/types.h>
29 #include <linux/string.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/swap.h>
34 #include <linux/hugetlb.h>
35 #include <linux/compiler.h>
36 #include <linux/srcu.h>
37 #include <linux/slab.h>
38 #include <linux/uaccess.h>
41 #include <asm/cmpxchg.h>
46 * Setting this variable to true enables Two-Dimensional Paging (TDP),
47 * where the hardware walks two page tables:
48 * 1. the guest-virtual to guest-physical translation
49 * 2. while doing 1., the guest-physical to host-physical translation
50 * If the hardware supports this, we don't need to do shadow paging.
52 bool tdp_enabled = false;
56 AUDIT_POST_PAGE_FAULT,
63 char *audit_point_name[] = {
76 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
77 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
81 #define pgprintk(x...) do { } while (0)
82 #define rmap_printk(x...) do { } while (0)
88 module_param(dbg, bool, 0644);
91 static int oos_shadow = 1;
92 module_param(oos_shadow, bool, 0644);
95 #define ASSERT(x) do { } while (0)
99 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
100 __FILE__, __LINE__, #x); \
104 #define PTE_PREFETCH_NUM 8
106 #define PT_FIRST_AVAIL_BITS_SHIFT 9
107 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
109 #define PT64_LEVEL_BITS 9
111 #define PT64_LEVEL_SHIFT(level) \
112 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
114 #define PT64_INDEX(address, level)\
115 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
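/*
 * Illustrative example: with PAGE_SHIFT == 12 and PT64_LEVEL_BITS == 9,
 * PT64_INDEX(addr, 1) extracts bits 12..20 of addr, PT64_INDEX(addr, 2)
 * bits 21..29, PT64_INDEX(addr, 3) bits 30..38 and PT64_INDEX(addr, 4)
 * bits 39..47 - the usual four-level x86-64 walk.
 */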
118 #define PT32_LEVEL_BITS 10
120 #define PT32_LEVEL_SHIFT(level) \
121 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
123 #define PT32_LVL_OFFSET_MASK(level) \
124 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
125 * PT32_LEVEL_BITS))) - 1))
127 #define PT32_INDEX(address, level)\
128 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
131 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
132 #define PT64_DIR_BASE_ADDR_MASK \
133 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
134 #define PT64_LVL_ADDR_MASK(level) \
135 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
136 * PT64_LEVEL_BITS))) - 1))
137 #define PT64_LVL_OFFSET_MASK(level) \
138 (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
139 * PT64_LEVEL_BITS))) - 1))
141 #define PT32_BASE_ADDR_MASK PAGE_MASK
142 #define PT32_DIR_BASE_ADDR_MASK \
143 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
144 #define PT32_LVL_ADDR_MASK(level) \
145 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
146 * PT32_LEVEL_BITS))) - 1))
148 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
153 #define ACC_EXEC_MASK 1
154 #define ACC_WRITE_MASK PT_WRITABLE_MASK
155 #define ACC_USER_MASK PT_USER_MASK
156 #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
158 #include <trace/events/kvm.h>
160 #define CREATE_TRACE_POINTS
161 #include "mmutrace.h"
163 #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
165 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
167 struct kvm_rmap_desc {
168 u64 *sptes[RMAP_EXT];
169 struct kvm_rmap_desc *more;
172 struct kvm_shadow_walk_iterator {
180 #define for_each_shadow_entry(_vcpu, _addr, _walker) \
181 for (shadow_walk_init(&(_walker), _vcpu, _addr); \
182 shadow_walk_okay(&(_walker)); \
183 shadow_walk_next(&(_walker)))
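/*
 * Illustrative usage, modelled on the callers below (e.g. __direct_map):
 *
 *	struct kvm_shadow_walk_iterator iterator;
 *
 *	for_each_shadow_entry(vcpu, addr, iterator) {
 *		... iterator.level counts down from the shadow root and
 *		    iterator.sptep points at the shadow pte for addr ...
 *	}
 */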
185 typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
187 static struct kmem_cache *pte_chain_cache;
188 static struct kmem_cache *rmap_desc_cache;
189 static struct kmem_cache *mmu_page_header_cache;
190 static struct percpu_counter kvm_total_used_mmu_pages;
192 static u64 __read_mostly shadow_trap_nonpresent_pte;
193 static u64 __read_mostly shadow_notrap_nonpresent_pte;
194 static u64 __read_mostly shadow_nx_mask;
195 static u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */
196 static u64 __read_mostly shadow_user_mask;
197 static u64 __read_mostly shadow_accessed_mask;
198 static u64 __read_mostly shadow_dirty_mask;
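/*
 * Build a mask with bits s..e (inclusive) set; used for reserved-bit
 * checks.  For example, rsvd_bits(52, 62) covers physical address bits
 * 52-62.
 */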
200 static inline u64 rsvd_bits(int s, int e)
202 return ((1ULL << (e - s + 1)) - 1) << s;
205 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
207 shadow_trap_nonpresent_pte = trap_pte;
208 shadow_notrap_nonpresent_pte = notrap_pte;
210 EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
212 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
213 u64 dirty_mask, u64 nx_mask, u64 x_mask)
215 shadow_user_mask = user_mask;
216 shadow_accessed_mask = accessed_mask;
217 shadow_dirty_mask = dirty_mask;
218 shadow_nx_mask = nx_mask;
219 shadow_x_mask = x_mask;
221 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
223 static bool is_write_protection(struct kvm_vcpu *vcpu)
225 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
228 static int is_cpuid_PSE36(void)
233 static int is_nx(struct kvm_vcpu *vcpu)
235 return vcpu->arch.efer & EFER_NX;
238 static int is_shadow_present_pte(u64 pte)
240 return pte != shadow_trap_nonpresent_pte
241 && pte != shadow_notrap_nonpresent_pte;
244 static int is_large_pte(u64 pte)
246 return pte & PT_PAGE_SIZE_MASK;
249 static int is_writable_pte(unsigned long pte)
251 return pte & PT_WRITABLE_MASK;
254 static int is_dirty_gpte(unsigned long pte)
256 return pte & PT_DIRTY_MASK;
259 static int is_rmap_spte(u64 pte)
261 return is_shadow_present_pte(pte);
264 static int is_last_spte(u64 pte, int level)
266 if (level == PT_PAGE_TABLE_LEVEL)
268 if (is_large_pte(pte))
273 static pfn_t spte_to_pfn(u64 pte)
275 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
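/*
 * With 32-bit PSE-36 paging, bits 13-16 of a large-page PDE supply
 * physical address bits 32-35.  The shift works out to
 * 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT = 7, so those PDE bits land at
 * gfn bits 20-23.
 */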
278 static gfn_t pse36_gfn_delta(u32 gpte)
280 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
282 return (gpte & PT32_DIR_PSE36_MASK) << shift;
285 static void __set_spte(u64 *sptep, u64 spte)
287 set_64bit(sptep, spte);
290 static u64 __xchg_spte(u64 *sptep, u64 new_spte)
293 return xchg(sptep, new_spte);
299 } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
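/*
 * A shadow pte has "volatile" bits when hardware may still set its
 * accessed or dirty bit asynchronously: the pte is present and either
 * the accessed bit is not yet set, or it is writable but not yet dirty
 * (assuming the mmu provides accessed/dirty masks at all).  Such sptes
 * must be updated with an atomic exchange so a concurrent hardware
 * update is not lost.
 */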
305 static bool spte_has_volatile_bits(u64 spte)
307 if (!shadow_accessed_mask)
310 if (!is_shadow_present_pte(spte))
313 if ((spte & shadow_accessed_mask) &&
314 (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
320 static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
322 return (old_spte & bit_mask) && !(new_spte & bit_mask);
325 static void update_spte(u64 *sptep, u64 new_spte)
327 u64 mask, old_spte = *sptep;
329 WARN_ON(!is_rmap_spte(new_spte));
331 new_spte |= old_spte & shadow_dirty_mask;
333 mask = shadow_accessed_mask;
334 if (is_writable_pte(old_spte))
335 mask |= shadow_dirty_mask;
337 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
338 __set_spte(sptep, new_spte);
340 old_spte = __xchg_spte(sptep, new_spte);
342 if (!shadow_accessed_mask)
345 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
346 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
347 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
348 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
351 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
352 struct kmem_cache *base_cache, int min)
356 if (cache->nobjs >= min)
358 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
359 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
362 cache->objects[cache->nobjs++] = obj;
367 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
368 struct kmem_cache *cache)
371 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
374 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
379 if (cache->nobjs >= min)
381 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
382 page = (void *)__get_free_page(GFP_KERNEL);
385 cache->objects[cache->nobjs++] = page;
390 static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
393 free_page((unsigned long)mc->objects[--mc->nobjs]);
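/*
 * Pre-fill the per-vcpu object caches before mmu_lock is taken: the
 * allocations happen here, where sleeping is allowed, so the fault path
 * can later pull objects out of the caches without sleeping under the
 * spinlock.
 */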
396 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
400 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
404 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
405 rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
408 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
411 r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
412 mmu_page_header_cache, 4);
417 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
419 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
420 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
421 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
422 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
423 mmu_page_header_cache);
426 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
432 p = mc->objects[--mc->nobjs];
436 static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
438 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
439 sizeof(struct kvm_pte_chain));
442 static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
444 kmem_cache_free(pte_chain_cache, pc);
447 static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
449 return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
450 sizeof(struct kvm_rmap_desc));
453 static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
455 kmem_cache_free(rmap_desc_cache, rd);
458 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
460 if (!sp->role.direct)
461 return sp->gfns[index];
463 return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
466 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
469 BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
471 sp->gfns[index] = gfn;
475 * Return the pointer to the large page information for a given gfn,
476 * handling slots that are not large page aligned.
478 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
479 struct kvm_memory_slot *slot,
484 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
485 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
486 return &slot->lpage_info[level - 2][idx];
489 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
491 struct kvm_memory_slot *slot;
492 struct kvm_lpage_info *linfo;
495 slot = gfn_to_memslot(kvm, gfn);
496 for (i = PT_DIRECTORY_LEVEL;
497 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
498 linfo = lpage_info_slot(gfn, slot, i);
499 linfo->write_count += 1;
503 static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
505 struct kvm_memory_slot *slot;
506 struct kvm_lpage_info *linfo;
509 slot = gfn_to_memslot(kvm, gfn);
510 for (i = PT_DIRECTORY_LEVEL;
511 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
512 linfo = lpage_info_slot(gfn, slot, i);
513 linfo->write_count -= 1;
514 WARN_ON(linfo->write_count < 0);
518 static int has_wrprotected_page(struct kvm *kvm,
522 struct kvm_memory_slot *slot;
523 struct kvm_lpage_info *linfo;
525 slot = gfn_to_memslot(kvm, gfn);
527 linfo = lpage_info_slot(gfn, slot, level);
528 return linfo->write_count;
534 static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
536 unsigned long page_size;
539 page_size = kvm_host_page_size(kvm, gfn);
541 for (i = PT_PAGE_TABLE_LEVEL;
542 i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
543 if (page_size >= KVM_HPAGE_SIZE(i))
552 static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
554 struct kvm_memory_slot *slot;
555 slot = gfn_to_memslot(vcpu->kvm, large_gfn);
556 if (slot && slot->dirty_bitmap)
561 static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
563 int host_level, level, max_level;
565 host_level = host_mapping_level(vcpu->kvm, large_gfn);
567 if (host_level == PT_PAGE_TABLE_LEVEL)
570 max_level = kvm_x86_ops->get_lpage_level() < host_level ?
571 kvm_x86_ops->get_lpage_level() : host_level;
573 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
574 if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
581 * Take gfn and return the reverse mapping to it.
584 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
586 struct kvm_memory_slot *slot;
587 struct kvm_lpage_info *linfo;
589 slot = gfn_to_memslot(kvm, gfn);
590 if (likely(level == PT_PAGE_TABLE_LEVEL))
591 return &slot->rmap[gfn - slot->base_gfn];
593 linfo = lpage_info_slot(gfn, slot, level);
595 return &linfo->rmap_pde;
599 * Reverse mapping data structures:
601 * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
602 * that points to page_address(page).
604 * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
605 * containing more mappings.
607 * Returns the number of rmap entries before the spte was added or zero if
608 * the spte was not added.
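 * For example (illustrative): an empty rmap is 0; after the first
 * rmap_add() it holds the spte pointer itself (bit zero clear); after
 * the second it holds a kvm_rmap_desc pointer with bit zero set, and
 * both sptes live in desc->sptes[0] and desc->sptes[1].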
611 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
613 struct kvm_mmu_page *sp;
614 struct kvm_rmap_desc *desc;
615 unsigned long *rmapp;
618 if (!is_rmap_spte(*spte))
620 sp = page_header(__pa(spte));
621 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
622 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
624 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
625 *rmapp = (unsigned long)spte;
626 } else if (!(*rmapp & 1)) {
627 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
628 desc = mmu_alloc_rmap_desc(vcpu);
629 desc->sptes[0] = (u64 *)*rmapp;
630 desc->sptes[1] = spte;
631 *rmapp = (unsigned long)desc | 1;
634 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
635 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
636 while (desc->sptes[RMAP_EXT-1] && desc->more) {
640 if (desc->sptes[RMAP_EXT-1]) {
641 desc->more = mmu_alloc_rmap_desc(vcpu);
644 for (i = 0; desc->sptes[i]; ++i)
646 desc->sptes[i] = spte;
651 static void rmap_desc_remove_entry(unsigned long *rmapp,
652 struct kvm_rmap_desc *desc,
654 struct kvm_rmap_desc *prev_desc)
658 for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
660 desc->sptes[i] = desc->sptes[j];
661 desc->sptes[j] = NULL;
664 if (!prev_desc && !desc->more)
665 *rmapp = (unsigned long)desc->sptes[0];
668 prev_desc->more = desc->more;
670 *rmapp = (unsigned long)desc->more | 1;
671 mmu_free_rmap_desc(desc);
674 static void rmap_remove(struct kvm *kvm, u64 *spte)
676 struct kvm_rmap_desc *desc;
677 struct kvm_rmap_desc *prev_desc;
678 struct kvm_mmu_page *sp;
680 unsigned long *rmapp;
683 sp = page_header(__pa(spte));
684 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
685 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
687 printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
689 } else if (!(*rmapp & 1)) {
690 rmap_printk("rmap_remove: %p 1->0\n", spte);
691 if ((u64 *)*rmapp != spte) {
692 printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte);
697 rmap_printk("rmap_remove: %p many->many\n", spte);
698 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
701 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
702 if (desc->sptes[i] == spte) {
703 rmap_desc_remove_entry(rmapp,
711 pr_err("rmap_remove: %p many->many\n", spte);
716 static int set_spte_track_bits(u64 *sptep, u64 new_spte)
719 u64 old_spte = *sptep;
721 if (!spte_has_volatile_bits(old_spte))
722 __set_spte(sptep, new_spte);
724 old_spte = __xchg_spte(sptep, new_spte);
726 if (!is_rmap_spte(old_spte))
729 pfn = spte_to_pfn(old_spte);
730 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
731 kvm_set_pfn_accessed(pfn);
732 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
733 kvm_set_pfn_dirty(pfn);
737 static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
739 if (set_spte_track_bits(sptep, new_spte))
740 rmap_remove(kvm, sptep);
743 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
745 struct kvm_rmap_desc *desc;
751 else if (!(*rmapp & 1)) {
753 return (u64 *)*rmapp;
756 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
759 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
760 if (prev_spte == spte)
761 return desc->sptes[i];
762 prev_spte = desc->sptes[i];
769 static int rmap_write_protect(struct kvm *kvm, u64 gfn)
771 unsigned long *rmapp;
773 int i, write_protected = 0;
775 rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
777 spte = rmap_next(kvm, rmapp, NULL);
780 BUG_ON(!(*spte & PT_PRESENT_MASK));
781 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
782 if (is_writable_pte(*spte)) {
783 update_spte(spte, *spte & ~PT_WRITABLE_MASK);
786 spte = rmap_next(kvm, rmapp, spte);
789 /* check for huge page mappings */
790 for (i = PT_DIRECTORY_LEVEL;
791 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
792 rmapp = gfn_to_rmap(kvm, gfn, i);
793 spte = rmap_next(kvm, rmapp, NULL);
796 BUG_ON(!(*spte & PT_PRESENT_MASK));
797 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
798 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
799 if (is_writable_pte(*spte)) {
801 shadow_trap_nonpresent_pte);
806 spte = rmap_next(kvm, rmapp, spte);
810 return write_protected;
813 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
817 int need_tlb_flush = 0;
819 while ((spte = rmap_next(kvm, rmapp, NULL))) {
820 BUG_ON(!(*spte & PT_PRESENT_MASK));
821 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
822 drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
825 return need_tlb_flush;
828 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
833 pte_t *ptep = (pte_t *)data;
836 WARN_ON(pte_huge(*ptep));
837 new_pfn = pte_pfn(*ptep);
838 spte = rmap_next(kvm, rmapp, NULL);
840 BUG_ON(!is_shadow_present_pte(*spte));
841 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
843 if (pte_write(*ptep)) {
844 drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
845 spte = rmap_next(kvm, rmapp, NULL);
847 new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
848 new_spte |= (u64)new_pfn << PAGE_SHIFT;
850 new_spte &= ~PT_WRITABLE_MASK;
851 new_spte &= ~SPTE_HOST_WRITEABLE;
852 new_spte &= ~shadow_accessed_mask;
853 set_spte_track_bits(spte, new_spte);
854 spte = rmap_next(kvm, rmapp, spte);
858 kvm_flush_remote_tlbs(kvm);
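/*
 * Walk every memslot containing the given host virtual address,
 * translate it back to a gfn, and apply the handler to the rmap chain
 * for that gfn at each supported page size.  This backs the mmu
 * notifier operations (unmap, change_pte, age, test_age) below.
 */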
863 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
865 int (*handler)(struct kvm *kvm, unsigned long *rmapp,
871 struct kvm_memslots *slots;
873 slots = kvm_memslots(kvm);
875 for (i = 0; i < slots->nmemslots; i++) {
876 struct kvm_memory_slot *memslot = &slots->memslots[i];
877 unsigned long start = memslot->userspace_addr;
880 end = start + (memslot->npages << PAGE_SHIFT);
881 if (hva >= start && hva < end) {
882 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
883 gfn_t gfn = memslot->base_gfn + gfn_offset;
885 ret = handler(kvm, &memslot->rmap[gfn_offset], data);
887 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
888 struct kvm_lpage_info *linfo;
890 linfo = lpage_info_slot(gfn, memslot,
891 PT_DIRECTORY_LEVEL + j);
892 ret |= handler(kvm, &linfo->rmap_pde, data);
894 trace_kvm_age_page(hva, memslot, ret);
902 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
904 return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
907 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
909 kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
912 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
919 * Emulate the accessed bit for EPT, by checking if this page has
920 * an EPT mapping, and clearing it if it does. On the next access,
921 * a new EPT mapping will be established.
922 * This has some overhead, but not as much as the cost of swapping
923 * out actively used pages or breaking up actively used hugepages.
925 if (!shadow_accessed_mask)
926 return kvm_unmap_rmapp(kvm, rmapp, data);
928 spte = rmap_next(kvm, rmapp, NULL);
932 BUG_ON(!(_spte & PT_PRESENT_MASK));
933 _young = _spte & PT_ACCESSED_MASK;
936 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
938 spte = rmap_next(kvm, rmapp, spte);
943 static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
950 * If there's no access bit in the secondary pte set by the
951 * hardware, it's up to gup-fast/gup to set the access bit in
952 * the primary pte or in the page structure.
954 if (!shadow_accessed_mask)
957 spte = rmap_next(kvm, rmapp, NULL);
960 BUG_ON(!(_spte & PT_PRESENT_MASK));
961 young = _spte & PT_ACCESSED_MASK;
966 spte = rmap_next(kvm, rmapp, spte);
972 #define RMAP_RECYCLE_THRESHOLD 1000
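/*
 * If a single gfn accumulates more than RMAP_RECYCLE_THRESHOLD sptes,
 * zap its existing mappings and flush TLBs so the rmap chain stays
 * bounded; the mappings are simply re-faulted on demand.
 */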
974 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
976 unsigned long *rmapp;
977 struct kvm_mmu_page *sp;
979 sp = page_header(__pa(spte));
981 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
983 kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
984 kvm_flush_remote_tlbs(vcpu->kvm);
987 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
989 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
992 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
994 return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
998 static int is_empty_shadow_page(u64 *spt)
1003 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
1004 if (is_shadow_present_pte(*pos)) {
1005 printk(KERN_ERR "%s: %p %llx\n", __func__,
1014 * This value is the sum of all of the kvm instances'
1015 * kvm->arch.n_used_mmu_pages values. We need a global,
1016 * aggregate version in order to make the slab shrinker
1019 static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
1021 kvm->arch.n_used_mmu_pages += nr;
1022 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1025 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1027 ASSERT(is_empty_shadow_page(sp->spt));
1028 hlist_del(&sp->hash_link);
1029 list_del(&sp->link);
1030 free_page((unsigned long)sp->spt);
1031 if (!sp->role.direct)
1032 free_page((unsigned long)sp->gfns);
1033 kmem_cache_free(mmu_page_header_cache, sp);
1034 kvm_mod_used_mmu_pages(kvm, -1);
1037 static unsigned kvm_page_table_hashfn(gfn_t gfn)
1039 return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
1042 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1043 u64 *parent_pte, int direct)
1045 struct kvm_mmu_page *sp;
1047 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
1048 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
1050 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
1052 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1053 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1054 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
1055 sp->multimapped = 0;
1056 sp->parent_pte = parent_pte;
1057 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1061 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
1062 struct kvm_mmu_page *sp, u64 *parent_pte)
1064 struct kvm_pte_chain *pte_chain;
1065 struct hlist_node *node;
1070 if (!sp->multimapped) {
1071 u64 *old = sp->parent_pte;
1074 sp->parent_pte = parent_pte;
1077 sp->multimapped = 1;
1078 pte_chain = mmu_alloc_pte_chain(vcpu);
1079 INIT_HLIST_HEAD(&sp->parent_ptes);
1080 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1081 pte_chain->parent_ptes[0] = old;
1083 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
1084 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
1086 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
1087 if (!pte_chain->parent_ptes[i]) {
1088 pte_chain->parent_ptes[i] = parent_pte;
1092 pte_chain = mmu_alloc_pte_chain(vcpu);
1094 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1095 pte_chain->parent_ptes[0] = parent_pte;
1098 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1101 struct kvm_pte_chain *pte_chain;
1102 struct hlist_node *node;
1105 if (!sp->multimapped) {
1106 BUG_ON(sp->parent_pte != parent_pte);
1107 sp->parent_pte = NULL;
1110 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1111 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1112 if (!pte_chain->parent_ptes[i])
1114 if (pte_chain->parent_ptes[i] != parent_pte)
1116 while (i + 1 < NR_PTE_CHAIN_ENTRIES
1117 && pte_chain->parent_ptes[i + 1]) {
1118 pte_chain->parent_ptes[i]
1119 = pte_chain->parent_ptes[i + 1];
1122 pte_chain->parent_ptes[i] = NULL;
1124 hlist_del(&pte_chain->link);
1125 mmu_free_pte_chain(pte_chain);
1126 if (hlist_empty(&sp->parent_ptes)) {
1127 sp->multimapped = 0;
1128 sp->parent_pte = NULL;
1136 static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1138 struct kvm_pte_chain *pte_chain;
1139 struct hlist_node *node;
1140 struct kvm_mmu_page *parent_sp;
1143 if (!sp->multimapped && sp->parent_pte) {
1144 parent_sp = page_header(__pa(sp->parent_pte));
1145 fn(parent_sp, sp->parent_pte);
1149 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1150 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1151 u64 *spte = pte_chain->parent_ptes[i];
1155 parent_sp = page_header(__pa(spte));
1156 fn(parent_sp, spte);
1160 static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
1161 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1163 mmu_parent_walk(sp, mark_unsync);
1166 static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
1170 index = spte - sp->spt;
1171 if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1173 if (sp->unsync_children++)
1175 kvm_mmu_mark_parents_unsync(sp);
1178 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1179 struct kvm_mmu_page *sp)
1183 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1184 sp->spt[i] = shadow_trap_nonpresent_pte;
1187 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1188 struct kvm_mmu_page *sp)
1193 static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
1197 #define KVM_PAGE_ARRAY_NR 16
1199 struct kvm_mmu_pages {
1200 struct mmu_page_and_offset {
1201 struct kvm_mmu_page *sp;
1203 } page[KVM_PAGE_ARRAY_NR];
1207 #define for_each_unsync_children(bitmap, idx) \
1208 for (idx = find_first_bit(bitmap, 512); \
1210 idx = find_next_bit(bitmap, 512, idx+1))
1212 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1218 for (i = 0; i < pvec->nr; i++)
1219 if (pvec->page[i].sp == sp)
1222 pvec->page[pvec->nr].sp = sp;
1223 pvec->page[pvec->nr].idx = idx;
1225 return (pvec->nr == KVM_PAGE_ARRAY_NR);
1228 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1229 struct kvm_mmu_pages *pvec)
1231 int i, ret, nr_unsync_leaf = 0;
1233 for_each_unsync_children(sp->unsync_child_bitmap, i) {
1234 struct kvm_mmu_page *child;
1235 u64 ent = sp->spt[i];
1237 if (!is_shadow_present_pte(ent) || is_large_pte(ent))
1238 goto clear_child_bitmap;
1240 child = page_header(ent & PT64_BASE_ADDR_MASK);
1242 if (child->unsync_children) {
1243 if (mmu_pages_add(pvec, child, i))
1246 ret = __mmu_unsync_walk(child, pvec);
1248 goto clear_child_bitmap;
1250 nr_unsync_leaf += ret;
1253 } else if (child->unsync) {
1255 if (mmu_pages_add(pvec, child, i))
1258 goto clear_child_bitmap;
1263 __clear_bit(i, sp->unsync_child_bitmap);
1264 sp->unsync_children--;
1265 WARN_ON((int)sp->unsync_children < 0);
1269 return nr_unsync_leaf;
1272 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1273 struct kvm_mmu_pages *pvec)
1275 if (!sp->unsync_children)
1278 mmu_pages_add(pvec, sp, 0);
1279 return __mmu_unsync_walk(sp, pvec);
1282 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1284 WARN_ON(!sp->unsync);
1285 trace_kvm_mmu_sync_page(sp);
1287 --kvm->stat.mmu_unsync;
1290 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1291 struct list_head *invalid_list);
1292 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1293 struct list_head *invalid_list);
1295 #define for_each_gfn_sp(kvm, sp, gfn, pos) \
1296 hlist_for_each_entry(sp, pos, \
1297 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
1298 if ((sp)->gfn != (gfn)) {} else
1300 #define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \
1301 hlist_for_each_entry(sp, pos, \
1302 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
1303 if ((sp)->gfn != (gfn) || (sp)->role.direct || \
1304 (sp)->role.invalid) {} else
1306 /* @sp->gfn should be write-protected at the call site */
1307 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1308 struct list_head *invalid_list, bool clear_unsync)
1310 if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1311 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1316 kvm_unlink_unsync_page(vcpu->kvm, sp);
1318 if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1319 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1323 kvm_mmu_flush_tlb(vcpu);
1327 static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
1328 struct kvm_mmu_page *sp)
1330 LIST_HEAD(invalid_list);
1333 ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
1335 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1340 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1341 struct list_head *invalid_list)
1343 return __kvm_sync_page(vcpu, sp, invalid_list, true);
1346 /* @gfn should be write-protected at the call site */
1347 static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1349 struct kvm_mmu_page *s;
1350 struct hlist_node *node;
1351 LIST_HEAD(invalid_list);
1354 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1358 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1359 kvm_unlink_unsync_page(vcpu->kvm, s);
1360 if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1361 (vcpu->arch.mmu.sync_page(vcpu, s))) {
1362 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1368 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1370 kvm_mmu_flush_tlb(vcpu);
1373 struct mmu_page_path {
1374 struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
1375 unsigned int idx[PT64_ROOT_LEVEL-1];
1378 #define for_each_sp(pvec, sp, parents, i) \
1379 for (i = mmu_pages_next(&pvec, &parents, -1), \
1380 sp = pvec.page[i].sp; \
1381 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
1382 i = mmu_pages_next(&pvec, &parents, i))
1384 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
1385 struct mmu_page_path *parents,
1390 for (n = i+1; n < pvec->nr; n++) {
1391 struct kvm_mmu_page *sp = pvec->page[n].sp;
1393 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
1394 parents->idx[0] = pvec->page[n].idx;
1398 parents->parent[sp->role.level-2] = sp;
1399 parents->idx[sp->role.level-1] = pvec->page[n].idx;
1405 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1407 struct kvm_mmu_page *sp;
1408 unsigned int level = 0;
1411 unsigned int idx = parents->idx[level];
1413 sp = parents->parent[level];
1417 --sp->unsync_children;
1418 WARN_ON((int)sp->unsync_children < 0);
1419 __clear_bit(idx, sp->unsync_child_bitmap);
1421 } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
1424 static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
1425 struct mmu_page_path *parents,
1426 struct kvm_mmu_pages *pvec)
1428 parents->parent[parent->role.level-1] = NULL;
1432 static void mmu_sync_children(struct kvm_vcpu *vcpu,
1433 struct kvm_mmu_page *parent)
1436 struct kvm_mmu_page *sp;
1437 struct mmu_page_path parents;
1438 struct kvm_mmu_pages pages;
1439 LIST_HEAD(invalid_list);
1441 kvm_mmu_pages_init(parent, &parents, &pages);
1442 while (mmu_unsync_walk(parent, &pages)) {
1445 for_each_sp(pages, sp, parents, i)
1446 protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
1449 kvm_flush_remote_tlbs(vcpu->kvm);
1451 for_each_sp(pages, sp, parents, i) {
1452 kvm_sync_page(vcpu, sp, &invalid_list);
1453 mmu_pages_clear_parents(&parents);
1455 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1456 cond_resched_lock(&vcpu->kvm->mmu_lock);
1457 kvm_mmu_pages_init(parent, &parents, &pages);
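/*
 * Find a shadow page for the given gfn and role in the hash table,
 * reusing it if possible (syncing an unsync page first if needed);
 * otherwise allocate a new one, hash it, write-protect the gfn for
 * indirect pages and account it as shadowed.
 */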
1461 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1469 union kvm_mmu_page_role role;
1471 struct kvm_mmu_page *sp;
1472 struct hlist_node *node;
1473 bool need_sync = false;
1475 role = vcpu->arch.mmu.base_role;
1477 role.direct = direct;
1480 role.access = access;
1481 if (!vcpu->arch.mmu.direct_map
1482 && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1483 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1484 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1485 role.quadrant = quadrant;
1487 for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
1488 if (!need_sync && sp->unsync)
1491 if (sp->role.word != role.word)
1494 if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
1497 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1498 if (sp->unsync_children) {
1499 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1500 kvm_mmu_mark_parents_unsync(sp);
1501 } else if (sp->unsync)
1502 kvm_mmu_mark_parents_unsync(sp);
1504 trace_kvm_mmu_get_page(sp, false);
1507 ++vcpu->kvm->stat.mmu_cache_miss;
1508 sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
1513 hlist_add_head(&sp->hash_link,
1514 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
1516 if (rmap_write_protect(vcpu->kvm, gfn))
1517 kvm_flush_remote_tlbs(vcpu->kvm);
1518 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
1519 kvm_sync_pages(vcpu, gfn);
1521 account_shadowed(vcpu->kvm, gfn);
1523 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
1524 vcpu->arch.mmu.prefetch_page(vcpu, sp);
1526 nonpaging_prefetch_page(vcpu, sp);
1527 trace_kvm_mmu_get_page(sp, true);
1531 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
1532 struct kvm_vcpu *vcpu, u64 addr)
1534 iterator->addr = addr;
1535 iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
1536 iterator->level = vcpu->arch.mmu.shadow_root_level;
1538 if (iterator->level == PT64_ROOT_LEVEL &&
1539 vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
1540 !vcpu->arch.mmu.direct_map)
1543 if (iterator->level == PT32E_ROOT_LEVEL) {
1544 iterator->shadow_addr
1545 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1546 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
1548 if (!iterator->shadow_addr)
1549 iterator->level = 0;
1553 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
1555 if (iterator->level < PT_PAGE_TABLE_LEVEL)
1558 if (iterator->level == PT_PAGE_TABLE_LEVEL)
1559 if (is_large_pte(*iterator->sptep))
1562 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1563 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
1567 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1569 iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
1573 static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1577 spte = __pa(sp->spt)
1578 | PT_PRESENT_MASK | PT_ACCESSED_MASK
1579 | PT_WRITABLE_MASK | PT_USER_MASK;
1580 __set_spte(sptep, spte);
1583 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1585 if (is_large_pte(*sptep)) {
1586 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1587 kvm_flush_remote_tlbs(vcpu->kvm);
1591 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1592 unsigned direct_access)
1594 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
1595 struct kvm_mmu_page *child;
1598 * For a direct sp, if the guest pte's dirty bit
1599 * changed from clean to dirty, it would corrupt the
1600 * sp's access: writes would be allowed through a read-only sp,
1601 * so we should update the spte at this point to get
1602 * a new sp with the correct access.
1604 child = page_header(*sptep & PT64_BASE_ADDR_MASK);
1605 if (child->role.access == direct_access)
1608 mmu_page_remove_parent_pte(child, sptep);
1609 __set_spte(sptep, shadow_trap_nonpresent_pte);
1610 kvm_flush_remote_tlbs(vcpu->kvm);
1614 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1615 struct kvm_mmu_page *sp)
1623 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1626 if (is_shadow_present_pte(ent)) {
1627 if (!is_last_spte(ent, sp->role.level)) {
1628 ent &= PT64_BASE_ADDR_MASK;
1629 mmu_page_remove_parent_pte(page_header(ent),
1632 if (is_large_pte(ent))
1634 drop_spte(kvm, &pt[i],
1635 shadow_trap_nonpresent_pte);
1638 pt[i] = shadow_trap_nonpresent_pte;
1642 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1644 mmu_page_remove_parent_pte(sp, parent_pte);
1647 static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
1650 struct kvm_vcpu *vcpu;
1652 kvm_for_each_vcpu(i, vcpu, kvm)
1653 vcpu->arch.last_pte_updated = NULL;
1656 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1660 while (sp->multimapped || sp->parent_pte) {
1661 if (!sp->multimapped)
1662 parent_pte = sp->parent_pte;
1664 struct kvm_pte_chain *chain;
1666 chain = container_of(sp->parent_ptes.first,
1667 struct kvm_pte_chain, link);
1668 parent_pte = chain->parent_ptes[0];
1670 BUG_ON(!parent_pte);
1671 kvm_mmu_put_page(sp, parent_pte);
1672 __set_spte(parent_pte, shadow_trap_nonpresent_pte);
1676 static int mmu_zap_unsync_children(struct kvm *kvm,
1677 struct kvm_mmu_page *parent,
1678 struct list_head *invalid_list)
1681 struct mmu_page_path parents;
1682 struct kvm_mmu_pages pages;
1684 if (parent->role.level == PT_PAGE_TABLE_LEVEL)
1687 kvm_mmu_pages_init(parent, &parents, &pages);
1688 while (mmu_unsync_walk(parent, &pages)) {
1689 struct kvm_mmu_page *sp;
1691 for_each_sp(pages, sp, parents, i) {
1692 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
1693 mmu_pages_clear_parents(&parents);
1696 kvm_mmu_pages_init(parent, &parents, &pages);
1702 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1703 struct list_head *invalid_list)
1707 trace_kvm_mmu_prepare_zap_page(sp);
1708 ++kvm->stat.mmu_shadow_zapped;
1709 ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
1710 kvm_mmu_page_unlink_children(kvm, sp);
1711 kvm_mmu_unlink_parents(kvm, sp);
1712 if (!sp->role.invalid && !sp->role.direct)
1713 unaccount_shadowed(kvm, sp->gfn);
1715 kvm_unlink_unsync_page(kvm, sp);
1716 if (!sp->root_count) {
1719 list_move(&sp->link, invalid_list);
1721 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1722 kvm_reload_remote_mmus(kvm);
1725 sp->role.invalid = 1;
1726 kvm_mmu_reset_last_pte_updated(kvm);
1730 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1731 struct list_head *invalid_list)
1733 struct kvm_mmu_page *sp;
1735 if (list_empty(invalid_list))
1738 kvm_flush_remote_tlbs(kvm);
1741 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1742 WARN_ON(!sp->role.invalid || sp->root_count);
1743 kvm_mmu_free_page(kvm, sp);
1744 } while (!list_empty(invalid_list));
1749 * Changing the number of mmu pages allocated to the vm
1750 * Note: if goal_nr_mmu_pages is too small, you will get a deadlock
1752 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
1754 LIST_HEAD(invalid_list);
1756 * If we set the number of mmu pages to be smaller than the
1757 * number of active pages, we must free some mmu pages before we
1761 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
1762 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
1763 !list_empty(&kvm->arch.active_mmu_pages)) {
1764 struct kvm_mmu_page *page;
1766 page = container_of(kvm->arch.active_mmu_pages.prev,
1767 struct kvm_mmu_page, link);
1768 kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
1769 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1771 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
1774 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
1777 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1779 struct kvm_mmu_page *sp;
1780 struct hlist_node *node;
1781 LIST_HEAD(invalid_list);
1784 pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
1787 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1788 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
1791 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1793 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1797 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1799 struct kvm_mmu_page *sp;
1800 struct hlist_node *node;
1801 LIST_HEAD(invalid_list);
1803 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1804 pgprintk("%s: zap %llx %x\n",
1805 __func__, gfn, sp->role.word);
1806 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1808 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1811 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1813 int slot = memslot_id(kvm, gfn);
1814 struct kvm_mmu_page *sp = page_header(__pa(pte));
1816 __set_bit(slot, sp->slot_bitmap);
1819 static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1824 if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1827 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1828 if (pt[i] == shadow_notrap_nonpresent_pte)
1829 __set_spte(&pt[i], shadow_trap_nonpresent_pte);
1834 * The function is based on mtrr_type_lookup() in
1835 * arch/x86/kernel/cpu/mtrr/generic.c
1837 static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
1842 u8 prev_match, curr_match;
1843 int num_var_ranges = KVM_NR_VAR_MTRR;
1845 if (!mtrr_state->enabled)
1848 /* Make end inclusive instead of exclusive */
1851 /* Look in fixed ranges. Just return the type as per start */
1852 if (mtrr_state->have_fixed && (start < 0x100000)) {
1855 if (start < 0x80000) {
1857 idx += (start >> 16);
1858 return mtrr_state->fixed_ranges[idx];
1859 } else if (start < 0xC0000) {
1861 idx += ((start - 0x80000) >> 14);
1862 return mtrr_state->fixed_ranges[idx];
1863 } else if (start < 0x1000000) {
1865 idx += ((start - 0xC0000) >> 12);
1866 return mtrr_state->fixed_ranges[idx];
1871 * Look in variable ranges.
1872 * Look for multiple ranges matching this address and pick the type
1873 * as per MTRR precedence.
1875 if (!(mtrr_state->enabled & 2))
1876 return mtrr_state->def_type;
1879 for (i = 0; i < num_var_ranges; ++i) {
1880 unsigned short start_state, end_state;
1882 if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
1885 base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
1886 (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
1887 mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
1888 (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
1890 start_state = ((start & mask) == (base & mask));
1891 end_state = ((end & mask) == (base & mask));
1892 if (start_state != end_state)
1895 if ((start & mask) != (base & mask))
1898 curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
1899 if (prev_match == 0xFF) {
1900 prev_match = curr_match;
1904 if (prev_match == MTRR_TYPE_UNCACHABLE ||
1905 curr_match == MTRR_TYPE_UNCACHABLE)
1906 return MTRR_TYPE_UNCACHABLE;
1908 if ((prev_match == MTRR_TYPE_WRBACK &&
1909 curr_match == MTRR_TYPE_WRTHROUGH) ||
1910 (prev_match == MTRR_TYPE_WRTHROUGH &&
1911 curr_match == MTRR_TYPE_WRBACK)) {
1912 prev_match = MTRR_TYPE_WRTHROUGH;
1913 curr_match = MTRR_TYPE_WRTHROUGH;
1916 if (prev_match != curr_match)
1917 return MTRR_TYPE_UNCACHABLE;
1920 if (prev_match != 0xFF)
1923 return mtrr_state->def_type;
1926 u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1930 mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
1931 (gfn << PAGE_SHIFT) + PAGE_SIZE);
1932 if (mtrr == 0xfe || mtrr == 0xff)
1933 mtrr = MTRR_TYPE_WRBACK;
1936 EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
1938 static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1940 trace_kvm_mmu_unsync_page(sp);
1941 ++vcpu->kvm->stat.mmu_unsync;
1944 kvm_mmu_mark_parents_unsync(sp);
1945 mmu_convert_notrap(sp);
1948 static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1950 struct kvm_mmu_page *s;
1951 struct hlist_node *node;
1953 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1956 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1957 __kvm_unsync_page(vcpu, s);
1961 static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1964 struct kvm_mmu_page *s;
1965 struct hlist_node *node;
1966 bool need_unsync = false;
1968 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1972 if (s->role.level != PT_PAGE_TABLE_LEVEL)
1975 if (!need_unsync && !s->unsync) {
1982 kvm_unsync_pages(vcpu, gfn);
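/*
 * Compose a shadow pte from the guest access rights and the host pfn,
 * deciding whether the mapping may be writable or must be
 * write-protected (e.g. because the gfn is itself shadowed as a page
 * table), then install it with update_spte().
 */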
1986 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1987 unsigned pte_access, int user_fault,
1988 int write_fault, int dirty, int level,
1989 gfn_t gfn, pfn_t pfn, bool speculative,
1990 bool can_unsync, bool host_writable)
1992 u64 spte, entry = *sptep;
1996 * We don't set the accessed bit, since we sometimes want to see
1997 * whether the guest actually used the pte (in order to detect
2000 spte = PT_PRESENT_MASK;
2002 spte |= shadow_accessed_mask;
2004 pte_access &= ~ACC_WRITE_MASK;
2005 if (pte_access & ACC_EXEC_MASK)
2006 spte |= shadow_x_mask;
2008 spte |= shadow_nx_mask;
2009 if (pte_access & ACC_USER_MASK)
2010 spte |= shadow_user_mask;
2011 if (level > PT_PAGE_TABLE_LEVEL)
2012 spte |= PT_PAGE_SIZE_MASK;
2014 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
2015 kvm_is_mmio_pfn(pfn));
2018 spte |= SPTE_HOST_WRITEABLE;
2020 pte_access &= ~ACC_WRITE_MASK;
2022 spte |= (u64)pfn << PAGE_SHIFT;
2024 if ((pte_access & ACC_WRITE_MASK)
2025 || (!vcpu->arch.mmu.direct_map && write_fault
2026 && !is_write_protection(vcpu) && !user_fault)) {
2028 if (level > PT_PAGE_TABLE_LEVEL &&
2029 has_wrprotected_page(vcpu->kvm, gfn, level)) {
2031 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
2035 spte |= PT_WRITABLE_MASK;
2037 if (!vcpu->arch.mmu.direct_map
2038 && !(pte_access & ACC_WRITE_MASK))
2039 spte &= ~PT_USER_MASK;
2042 * Optimization: for pte sync, if spte was writable the hash
2043 * lookup is unnecessary (and expensive). Write protection
2044 * is the responsibility of mmu_get_page / kvm_sync_page.
2045 * The same reasoning can be applied to dirty page accounting.
2047 if (!can_unsync && is_writable_pte(*sptep))
2050 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
2051 pgprintk("%s: found shadow page for %llx, marking ro\n",
2054 pte_access &= ~ACC_WRITE_MASK;
2055 if (is_writable_pte(spte))
2056 spte &= ~PT_WRITABLE_MASK;
2060 if (pte_access & ACC_WRITE_MASK)
2061 mark_page_dirty(vcpu->kvm, gfn);
2064 update_spte(sptep, spte);
2066 * If we overwrite a writable spte with a read-only one we
2067 * should flush remote TLBs. Otherwise rmap_write_protect
2068 * will find a read-only spte, even though the writable spte
2069 * might be cached on a CPU's TLB.
2071 if (is_writable_pte(entry) && !is_writable_pte(*sptep))
2072 kvm_flush_remote_tlbs(vcpu->kvm);
2077 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2078 unsigned pt_access, unsigned pte_access,
2079 int user_fault, int write_fault, int dirty,
2080 int *ptwrite, int level, gfn_t gfn,
2081 pfn_t pfn, bool speculative,
2084 int was_rmapped = 0;
2087 pgprintk("%s: spte %llx access %x write_fault %d"
2088 " user_fault %d gfn %llx\n",
2089 __func__, *sptep, pt_access,
2090 write_fault, user_fault, gfn);
2092 if (is_rmap_spte(*sptep)) {
2094 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
2095 * the parent of the now unreachable PTE.
2097 if (level > PT_PAGE_TABLE_LEVEL &&
2098 !is_large_pte(*sptep)) {
2099 struct kvm_mmu_page *child;
2102 child = page_header(pte & PT64_BASE_ADDR_MASK);
2103 mmu_page_remove_parent_pte(child, sptep);
2104 __set_spte(sptep, shadow_trap_nonpresent_pte);
2105 kvm_flush_remote_tlbs(vcpu->kvm);
2106 } else if (pfn != spte_to_pfn(*sptep)) {
2107 pgprintk("hfn old %llx new %llx\n",
2108 spte_to_pfn(*sptep), pfn);
2109 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
2110 kvm_flush_remote_tlbs(vcpu->kvm);
2115 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
2116 dirty, level, gfn, pfn, speculative, true,
2120 kvm_mmu_flush_tlb(vcpu);
2123 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2124 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
2125 is_large_pte(*sptep) ? "2MB" : "4kB",
2126 *sptep & PT_PRESENT_MASK ? "RW" : "R", gfn,
2128 if (!was_rmapped && is_large_pte(*sptep))
2129 ++vcpu->kvm->stat.lpages;
2131 page_header_update_slot(vcpu->kvm, sptep, gfn);
2133 rmap_count = rmap_add(vcpu, sptep, gfn);
2134 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
2135 rmap_recycle(vcpu, sptep, gfn);
2137 kvm_release_pfn_clean(pfn);
2139 vcpu->arch.last_pte_updated = sptep;
2140 vcpu->arch.last_pte_gfn = gfn;
2144 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2148 static struct kvm_memory_slot *
2149 pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
2151 struct kvm_memory_slot *slot;
2153 slot = gfn_to_memslot(vcpu->kvm, gfn);
2154 if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
2155 (no_dirty_log && slot->dirty_bitmap))
2161 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2164 struct kvm_memory_slot *slot;
2167 slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
2170 return page_to_pfn(bad_page);
2173 hva = gfn_to_hva_memslot(slot, gfn);
2175 return hva_to_pfn_atomic(vcpu->kvm, hva);
2178 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2179 struct kvm_mmu_page *sp,
2180 u64 *start, u64 *end)
2182 struct page *pages[PTE_PREFETCH_NUM];
2183 unsigned access = sp->role.access;
2187 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2188 if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK))
2191 ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
2195 for (i = 0; i < ret; i++, gfn++, start++)
2196 mmu_set_spte(vcpu, start, ACC_ALL,
2197 access, 0, 0, 1, NULL,
2198 sp->role.level, gfn,
2199 page_to_pfn(pages[i]), true, true);
2204 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2205 struct kvm_mmu_page *sp, u64 *sptep)
2207 u64 *spte, *start = NULL;
2210 WARN_ON(!sp->role.direct);
2212 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
2215 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2216 if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
2219 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
2227 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2229 struct kvm_mmu_page *sp;
2232 * Since there is no accessed bit on EPT, there is no way to
2233 * distinguish between actually accessed translations
2234 * and prefetched ones, so disable pte prefetch if EPT is
2237 if (!shadow_accessed_mask)
2240 sp = page_header(__pa(sptep));
2241 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
2244 __direct_pte_prefetch(vcpu, sp, sptep);
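/*
 * Map gfn at the requested level for tdp/nonpaging faults: walk the
 * shadow page table, allocating intermediate shadow pages as needed,
 * and install the final spte with mmu_set_spte() once the target level
 * is reached.
 */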
2247 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2248 int map_writable, int level, gfn_t gfn, pfn_t pfn,
2251 struct kvm_shadow_walk_iterator iterator;
2252 struct kvm_mmu_page *sp;
2256 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2257 if (iterator.level == level) {
2258 unsigned pte_access = ACC_ALL;
2260 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
2261 0, write, 1, &pt_write,
2262 level, gfn, pfn, prefault, map_writable);
2263 direct_pte_prefetch(vcpu, iterator.sptep);
2264 ++vcpu->stat.pf_fixed;
2268 if (*iterator.sptep == shadow_trap_nonpresent_pte) {
2269 u64 base_addr = iterator.addr;
2271 base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
2272 pseudo_gfn = base_addr >> PAGE_SHIFT;
2273 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
2275 1, ACC_ALL, iterator.sptep);
2277 pgprintk("nonpaging_map: ENOMEM\n");
2278 kvm_release_pfn_clean(pfn);
2282 __set_spte(iterator.sptep,
2284 | PT_PRESENT_MASK | PT_WRITABLE_MASK
2285 | shadow_user_mask | shadow_x_mask
2286 | shadow_accessed_mask);
2292 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2296 info.si_signo = SIGBUS;
2298 info.si_code = BUS_MCEERR_AR;
2299 info.si_addr = (void __user *)address;
2300 info.si_addr_lsb = PAGE_SHIFT;
2302 send_sig_info(SIGBUS, &info, tsk);
2305 static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2307 kvm_release_pfn_clean(pfn);
2308 if (is_hwpoison_pfn(pfn)) {
2309 kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
2311 } else if (is_fault_pfn(pfn))
2317 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2318 gfn_t *gfnp, pfn_t *pfnp, int *levelp)
2322 int level = *levelp;
2325 * Check if it's a transparent hugepage. If this were a
2326 * hugetlbfs page, level wouldn't be set to
2327 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
2330 if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
2331 level == PT_PAGE_TABLE_LEVEL &&
2332 PageTransCompound(pfn_to_page(pfn)) &&
2333 !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
2336 * mmu_notifier_retry was successful and we hold the
2337 * mmu_lock here, so the pmd can't start splitting
2338 * under us, and in turn
2339 * __split_huge_page_refcount() can't run under
2340 * us, so we can safely transfer the refcount from
2341 * PG_tail to PG_head as we switch the pfn from tail to
2344 *levelp = level = PT_DIRECTORY_LEVEL;
2345 mask = KVM_PAGES_PER_HPAGE(level) - 1;
2346 VM_BUG_ON((gfn & mask) != (pfn & mask));
2350 kvm_release_pfn_clean(pfn);
2352 if (!get_page_unless_zero(pfn_to_page(pfn)))
2359 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2360 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2362 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2369 unsigned long mmu_seq;
2372 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2373 if (likely(!force_pt_level)) {
2374 level = mapping_level(vcpu, gfn);
2376 * This path builds a PAE page table - so we can map
2377 * 2MB pages at maximum. Therefore check if the level
2378 * is larger than that.
2380 if (level > PT_DIRECTORY_LEVEL)
2381 level = PT_DIRECTORY_LEVEL;
2383 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2385 level = PT_PAGE_TABLE_LEVEL;
2387 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2390 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
2394 if (is_error_pfn(pfn))
2395 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2397 spin_lock(&vcpu->kvm->mmu_lock);
2398 if (mmu_notifier_retry(vcpu, mmu_seq))
2400 kvm_mmu_free_some_pages(vcpu);
2401 if (likely(!force_pt_level))
2402 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2403 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
2405 spin_unlock(&vcpu->kvm->mmu_lock);
2411 spin_unlock(&vcpu->kvm->mmu_lock);
2412 kvm_release_pfn_clean(pfn);
2417 static void mmu_free_roots(struct kvm_vcpu *vcpu)
2420 struct kvm_mmu_page *sp;
2421 LIST_HEAD(invalid_list);
2423 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2425 spin_lock(&vcpu->kvm->mmu_lock);
2426 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
2427 (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
2428 vcpu->arch.mmu.direct_map)) {
2429 hpa_t root = vcpu->arch.mmu.root_hpa;
2431 sp = page_header(root);
2433 if (!sp->root_count && sp->role.invalid) {
2434 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2435 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2437 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2438 spin_unlock(&vcpu->kvm->mmu_lock);
2441 for (i = 0; i < 4; ++i) {
2442 hpa_t root = vcpu->arch.mmu.pae_root[i];
2445 root &= PT64_BASE_ADDR_MASK;
2446 sp = page_header(root);
2448 if (!sp->root_count && sp->role.invalid)
2449 kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2452 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2454 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2455 spin_unlock(&vcpu->kvm->mmu_lock);
2456 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2459 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2463 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
2464 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2471 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2473 struct kvm_mmu_page *sp;
2476 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2477 spin_lock(&vcpu->kvm->mmu_lock);
2478 kvm_mmu_free_some_pages(vcpu);
2479 sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
2482 spin_unlock(&vcpu->kvm->mmu_lock);
2483 vcpu->arch.mmu.root_hpa = __pa(sp->spt);
2484 } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
2485 for (i = 0; i < 4; ++i) {
2486 hpa_t root = vcpu->arch.mmu.pae_root[i];
2488 ASSERT(!VALID_PAGE(root));
2489 spin_lock(&vcpu->kvm->mmu_lock);
2490 kvm_mmu_free_some_pages(vcpu);
2491 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
2493 PT32_ROOT_LEVEL, 1, ACC_ALL,
2495 root = __pa(sp->spt);
2497 spin_unlock(&vcpu->kvm->mmu_lock);
2498 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2500 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2507 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
2509 struct kvm_mmu_page *sp;
2514 root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
2516 if (mmu_check_root(vcpu, root_gfn))
2520 * Do we shadow a long mode page table? If so, we need to
2521 * write-protect the guest's page table root.
2523 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2524 hpa_t root = vcpu->arch.mmu.root_hpa;
2526 ASSERT(!VALID_PAGE(root));
2528 spin_lock(&vcpu->kvm->mmu_lock);
2529 kvm_mmu_free_some_pages(vcpu);
2530 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
2532 root = __pa(sp->spt);
2534 spin_unlock(&vcpu->kvm->mmu_lock);
2535 vcpu->arch.mmu.root_hpa = root;
2540 * We shadow a 32 bit page table. This may be a legacy 2-level
2541 * or a PAE 3-level page table. In either case we need to be aware that
2542 * the shadow page table may be a PAE or a long mode page table.
2544 pm_mask = PT_PRESENT_MASK;
2545 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
2546 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
2548 for (i = 0; i < 4; ++i) {
2549 hpa_t root = vcpu->arch.mmu.pae_root[i];
2551 ASSERT(!VALID_PAGE(root));
2552 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
2553 pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
2554 if (!is_present_gpte(pdptr)) {
2555 vcpu->arch.mmu.pae_root[i] = 0;
2558 root_gfn = pdptr >> PAGE_SHIFT;
2559 if (mmu_check_root(vcpu, root_gfn))
2562 spin_lock(&vcpu->kvm->mmu_lock);
2563 kvm_mmu_free_some_pages(vcpu);
2564 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2567 root = __pa(sp->spt);
2569 spin_unlock(&vcpu->kvm->mmu_lock);
2571 vcpu->arch.mmu.pae_root[i] = root | pm_mask;
2573 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2576 * If we shadow a 32 bit page table with a long mode page
2577 * table we enter this path.
2579 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2580 if (vcpu->arch.mmu.lm_root == NULL) {
2582 * The additional page necessary for this is only
2583 * allocated on demand.
2588 lm_root = (void*)get_zeroed_page(GFP_KERNEL);
2589 if (lm_root == NULL)
2592 lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
2594 vcpu->arch.mmu.lm_root = lm_root;
2597 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
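/*
 * Note on pm_mask and lm_root: when a 32-bit or PAE guest is shadowed by
 * a 4-level page table (shadow_root_level == PT64_ROOT_LEVEL), the
 * pae_root entries are walked by hardware as ordinary directory entries
 * rather than architectural PDPTEs, so they also need the user, writable
 * and accessed bits (pm_mask).  The single lm_root page acts as a
 * synthetic top level whose first entry points at pae_root; it is
 * allocated lazily here.
 */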
2603 static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2605 if (vcpu->arch.mmu.direct_map)
2606 return mmu_alloc_direct_roots(vcpu);
2608 return mmu_alloc_shadow_roots(vcpu);
2611 static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2614 struct kvm_mmu_page *sp;
2616 if (vcpu->arch.mmu.direct_map)
2619 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2622 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
2623 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2624 hpa_t root = vcpu->arch.mmu.root_hpa;
2625 sp = page_header(root);
2626 mmu_sync_children(vcpu, sp);
2627 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2630 for (i = 0; i < 4; ++i) {
2631 hpa_t root = vcpu->arch.mmu.pae_root[i];
2633 if (root && VALID_PAGE(root)) {
2634 root &= PT64_BASE_ADDR_MASK;
2635 sp = page_header(root);
2636 mmu_sync_children(vcpu, sp);
2639 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2642 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2644 spin_lock(&vcpu->kvm->mmu_lock);
2645 mmu_sync_roots(vcpu);
2646 spin_unlock(&vcpu->kvm->mmu_lock);
2649 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2650 u32 access, struct x86_exception *exception)
2653 exception->error_code = 0;
2657 static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2659 struct x86_exception *exception)
2662 exception->error_code = 0;
2663 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2666 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2667 u32 error_code, bool prefault)
2672 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
2673 r = mmu_topup_memory_caches(vcpu);
2678 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2680 gfn = gva >> PAGE_SHIFT;
2682 return nonpaging_map(vcpu, gva & PAGE_MASK,
2683 error_code & PFERR_WRITE_MASK, gfn, prefault);
2686 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
2688 struct kvm_arch_async_pf arch;
2690 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
2692 arch.direct_map = vcpu->arch.mmu.direct_map;
2693 arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
2695 return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
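/*
 * The token above packs an increasing per-vcpu id with the vcpu number,
 * e.g. apf.id 5 on vcpu 3 yields (5 << 12) | 3 == 0x5003.  The same token
 * is later used to match the "page ready" notification with the original
 * not-present fault.
 */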
2698 static bool can_do_async_pf(struct kvm_vcpu *vcpu)
2700 if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
2701 kvm_event_needs_reinjection(vcpu)))
2704 return kvm_x86_ops->interrupt_allowed(vcpu);
2707 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2708 gva_t gva, pfn_t *pfn, bool write, bool *writable)
2712 *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
2715 return false; /* *pfn has correct page already */
2717 put_page(pfn_to_page(*pfn));
2719 if (!prefault && can_do_async_pf(vcpu)) {
2720 trace_kvm_try_async_get_page(gva, gfn);
2721 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
2722 trace_kvm_async_pf_doublefault(gva, gfn);
2723 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
2725 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
2729 *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
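/*
 * Three outcomes are possible here: the pfn was resolved without
 * sleeping (return false, *pfn is valid), an async page fault was queued
 * or the vcpu was told to halt (return true, the fault is retried once
 * the page is ready), or async handling was not possible and
 * gfn_to_pfn_prot() is used as the synchronous fallback.
 */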
2734 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2741 gfn_t gfn = gpa >> PAGE_SHIFT;
2742 unsigned long mmu_seq;
2743 int write = error_code & PFERR_WRITE_MASK;
2747 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2749 r = mmu_topup_memory_caches(vcpu);
2753 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2754 if (likely(!force_pt_level)) {
2755 level = mapping_level(vcpu, gfn);
2756 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2758 level = PT_PAGE_TABLE_LEVEL;
2760 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2763 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
2767 if (is_error_pfn(pfn))
2768 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2769 spin_lock(&vcpu->kvm->mmu_lock);
2770 if (mmu_notifier_retry(vcpu, mmu_seq))
2772 kvm_mmu_free_some_pages(vcpu);
2773 if (likely(!force_pt_level))
2774 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2775 r = __direct_map(vcpu, gpa, write, map_writable,
2776 level, gfn, pfn, prefault);
2777 spin_unlock(&vcpu->kvm->mmu_lock);
2782 spin_unlock(&vcpu->kvm->mmu_lock);
2783 kvm_release_pfn_clean(pfn);
2787 static void nonpaging_free(struct kvm_vcpu *vcpu)
2789 mmu_free_roots(vcpu);
2792 static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2793 struct kvm_mmu *context)
2795 context->new_cr3 = nonpaging_new_cr3;
2796 context->page_fault = nonpaging_page_fault;
2797 context->gva_to_gpa = nonpaging_gva_to_gpa;
2798 context->free = nonpaging_free;
2799 context->prefetch_page = nonpaging_prefetch_page;
2800 context->sync_page = nonpaging_sync_page;
2801 context->invlpg = nonpaging_invlpg;
2802 context->root_level = 0;
2803 context->shadow_root_level = PT32E_ROOT_LEVEL;
2804 context->root_hpa = INVALID_PAGE;
2805 context->direct_map = true;
2806 context->nx = false;
2810 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2812 ++vcpu->stat.tlb_flush;
2813 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2816 static void paging_new_cr3(struct kvm_vcpu *vcpu)
2818 pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
2819 mmu_free_roots(vcpu);
2822 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
2824 return kvm_read_cr3(vcpu);
2827 static void inject_page_fault(struct kvm_vcpu *vcpu,
2828 struct x86_exception *fault)
2830 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
2833 static void paging_free(struct kvm_vcpu *vcpu)
2835 nonpaging_free(vcpu);
2838 static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2842 bit7 = (gpte >> 7) & 1;
2843 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2847 #include "paging_tmpl.h"
2851 #include "paging_tmpl.h"
2854 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
2855 struct kvm_mmu *context,
2858 int maxphyaddr = cpuid_maxphyaddr(vcpu);
2859 u64 exb_bit_rsvd = 0;
2862 exb_bit_rsvd = rsvd_bits(63, 63);
2864 case PT32_ROOT_LEVEL:
2865 /* no rsvd bits for 2 level 4K page table entries */
2866 context->rsvd_bits_mask[0][1] = 0;
2867 context->rsvd_bits_mask[0][0] = 0;
2868 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2870 if (!is_pse(vcpu)) {
2871 context->rsvd_bits_mask[1][1] = 0;
2875 if (is_cpuid_PSE36())
2876 /* 36-bit PSE 4MB page */
2877 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2879 /* 32-bit PSE 4MB page */
2880 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2882 case PT32E_ROOT_LEVEL:
2883 context->rsvd_bits_mask[0][2] =
2884 rsvd_bits(maxphyaddr, 63) |
2885 rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
2886 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2887 rsvd_bits(maxphyaddr, 62); /* PDE */
2888 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2889 rsvd_bits(maxphyaddr, 62); /* PTE */
2890 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2891 rsvd_bits(maxphyaddr, 62) |
2892 rsvd_bits(13, 20); /* large page */
2893 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2895 case PT64_ROOT_LEVEL:
2896 context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
2897 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2898 context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
2899 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2900 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2901 rsvd_bits(maxphyaddr, 51);
2902 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2903 rsvd_bits(maxphyaddr, 51);
2904 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
2905 context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
2906 rsvd_bits(maxphyaddr, 51) |
2908 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2909 rsvd_bits(maxphyaddr, 51) |
2910 rsvd_bits(13, 20); /* large page */
2911 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
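/*
 * Worked example for the 4-level case: with maxphyaddr == 36 and NX
 * disabled, a 4K PTE has bits 51:36 reserved plus bit 63 (exb_bit_rsvd),
 * i.e. rsvd_bits(36, 51) | rsvd_bits(63, 63).  A 2MB PDE additionally
 * reserves bits 20:13, which is what rsvd_bits(13, 20) encodes above.
 */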
2916 static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2917 struct kvm_mmu *context,
2920 context->nx = is_nx(vcpu);
2922 reset_rsvds_bits_mask(vcpu, context, level);
2924 ASSERT(is_pae(vcpu));
2925 context->new_cr3 = paging_new_cr3;
2926 context->page_fault = paging64_page_fault;
2927 context->gva_to_gpa = paging64_gva_to_gpa;
2928 context->prefetch_page = paging64_prefetch_page;
2929 context->sync_page = paging64_sync_page;
2930 context->invlpg = paging64_invlpg;
2931 context->free = paging_free;
2932 context->root_level = level;
2933 context->shadow_root_level = level;
2934 context->root_hpa = INVALID_PAGE;
2935 context->direct_map = false;
2939 static int paging64_init_context(struct kvm_vcpu *vcpu,
2940 struct kvm_mmu *context)
2942 return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
2945 static int paging32_init_context(struct kvm_vcpu *vcpu,
2946 struct kvm_mmu *context)
2948 context->nx = false;
2950 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2952 context->new_cr3 = paging_new_cr3;
2953 context->page_fault = paging32_page_fault;
2954 context->gva_to_gpa = paging32_gva_to_gpa;
2955 context->free = paging_free;
2956 context->prefetch_page = paging32_prefetch_page;
2957 context->sync_page = paging32_sync_page;
2958 context->invlpg = paging32_invlpg;
2959 context->root_level = PT32_ROOT_LEVEL;
2960 context->shadow_root_level = PT32E_ROOT_LEVEL;
2961 context->root_hpa = INVALID_PAGE;
2962 context->direct_map = false;
2966 static int paging32E_init_context(struct kvm_vcpu *vcpu,
2967 struct kvm_mmu *context)
2969 return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
2972 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2974 struct kvm_mmu *context = vcpu->arch.walk_mmu;
2976 context->base_role.word = 0;
2977 context->new_cr3 = nonpaging_new_cr3;
2978 context->page_fault = tdp_page_fault;
2979 context->free = nonpaging_free;
2980 context->prefetch_page = nonpaging_prefetch_page;
2981 context->sync_page = nonpaging_sync_page;
2982 context->invlpg = nonpaging_invlpg;
2983 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2984 context->root_hpa = INVALID_PAGE;
2985 context->direct_map = true;
2986 context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
2987 context->get_cr3 = get_cr3;
2988 context->inject_page_fault = kvm_inject_page_fault;
2989 context->nx = is_nx(vcpu);
2991 if (!is_paging(vcpu)) {
2992 context->nx = false;
2993 context->gva_to_gpa = nonpaging_gva_to_gpa;
2994 context->root_level = 0;
2995 } else if (is_long_mode(vcpu)) {
2996 context->nx = is_nx(vcpu);
2997 reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
2998 context->gva_to_gpa = paging64_gva_to_gpa;
2999 context->root_level = PT64_ROOT_LEVEL;
3000 } else if (is_pae(vcpu)) {
3001 context->nx = is_nx(vcpu);
3002 reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
3003 context->gva_to_gpa = paging64_gva_to_gpa;
3004 context->root_level = PT32E_ROOT_LEVEL;
3006 context->nx = false;
3007 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
3008 context->gva_to_gpa = paging32_gva_to_gpa;
3009 context->root_level = PT32_ROOT_LEVEL;
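/*
 * With TDP the hardware walks the guest page tables itself, so the
 * gva_to_gpa callbacks chosen above are only used for software accesses
 * to guest virtual memory (instruction emulation and similar); page
 * faults arrive as EPT violations / nested page faults that already
 * carry a guest-physical address, which is why page_fault points at
 * tdp_page_fault.
 */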
3015 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3019 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3021 if (!is_paging(vcpu))
3022 r = nonpaging_init_context(vcpu, context);
3023 else if (is_long_mode(vcpu))
3024 r = paging64_init_context(vcpu, context);
3025 else if (is_pae(vcpu))
3026 r = paging32E_init_context(vcpu, context);
3028 r = paging32_init_context(vcpu, context);
3030 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
3031 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
3035 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
3037 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
3039 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
3041 vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
3042 vcpu->arch.walk_mmu->get_cr3 = get_cr3;
3043 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3048 static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3050 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
3052 g_context->get_cr3 = get_cr3;
3053 g_context->inject_page_fault = kvm_inject_page_fault;
3056 * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
3057 * translation of l2_gpa to l1_gpa addresses is done using the
3058 * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
3059 * functions between mmu and nested_mmu are swapped.
3061 if (!is_paging(vcpu)) {
3062 g_context->nx = false;
3063 g_context->root_level = 0;
3064 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
3065 } else if (is_long_mode(vcpu)) {
3066 g_context->nx = is_nx(vcpu);
3067 reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
3068 g_context->root_level = PT64_ROOT_LEVEL;
3069 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
3070 } else if (is_pae(vcpu)) {
3071 g_context->nx = is_nx(vcpu);
3072 reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
3073 g_context->root_level = PT32E_ROOT_LEVEL;
3074 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
3076 g_context->nx = false;
3077 reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
3078 g_context->root_level = PT32_ROOT_LEVEL;
3079 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
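/*
 * Put differently: when L1 runs L2 with nested paging, resolving an L2
 * virtual address involves two guest-controlled translations, the L2
 * page tables (walked via the nested_mmu callbacks chosen here) and the
 * L1 nested page tables (translate_gpa), before KVM's memslots map the
 * resulting L1 gpa to host memory.
 */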
3085 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
3087 vcpu->arch.update_pte.pfn = bad_pfn;
3089 if (mmu_is_nested(vcpu))
3090 return init_kvm_nested_mmu(vcpu);
3091 else if (tdp_enabled)
3092 return init_kvm_tdp_mmu(vcpu);
3094 return init_kvm_softmmu(vcpu);
3097 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
3100 if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
3101 /* mmu.free() should set root_hpa = INVALID_PAGE */
3102 vcpu->arch.mmu.free(vcpu);
3105 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
3107 destroy_kvm_mmu(vcpu);
3108 return init_kvm_mmu(vcpu);
3110 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
3112 int kvm_mmu_load(struct kvm_vcpu *vcpu)
3116 r = mmu_topup_memory_caches(vcpu);
3119 r = mmu_alloc_roots(vcpu);
3120 spin_lock(&vcpu->kvm->mmu_lock);
3121 mmu_sync_roots(vcpu);
3122 spin_unlock(&vcpu->kvm->mmu_lock);
3125 /* set_cr3() should ensure TLB has been flushed */
3126 vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
3130 EXPORT_SYMBOL_GPL(kvm_mmu_load);
3132 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
3134 mmu_free_roots(vcpu);
3136 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
3138 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
3139 struct kvm_mmu_page *sp,
3143 struct kvm_mmu_page *child;
3146 if (is_shadow_present_pte(pte)) {
3147 if (is_last_spte(pte, sp->role.level))
3148 drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
3150 child = page_header(pte & PT64_BASE_ADDR_MASK);
3151 mmu_page_remove_parent_pte(child, spte);
3154 __set_spte(spte, shadow_trap_nonpresent_pte);
3155 if (is_large_pte(pte))
3156 --vcpu->kvm->stat.lpages;
3159 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3160 struct kvm_mmu_page *sp,
3164 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
3165 ++vcpu->kvm->stat.mmu_pde_zapped;
3169 ++vcpu->kvm->stat.mmu_pte_updated;
3170 if (!sp->role.cr4_pae)
3171 paging32_update_pte(vcpu, sp, spte, new);
3173 paging64_update_pte(vcpu, sp, spte, new);
3176 static bool need_remote_flush(u64 old, u64 new)
3178 if (!is_shadow_present_pte(old))
3180 if (!is_shadow_present_pte(new))
3182 if ((old ^ new) & PT64_BASE_ADDR_MASK)
3184 old ^= PT64_NX_MASK;
3185 new ^= PT64_NX_MASK;
3186 return (old & ~new & PT64_PERM_MASK) != 0;
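/*
 * The XOR with PT64_NX_MASK above turns NX (restrictive when set) into a
 * positive "may execute" permission, so the final test reads: a remote
 * TLB flush is only needed when the new spte removes a permission or
 * changes the target frame; pure permission grants can rely on the
 * eventual fault to refresh other CPUs.
 */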
3189 static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
3190 bool remote_flush, bool local_flush)
3196 kvm_flush_remote_tlbs(vcpu->kvm);
3197 else if (local_flush)
3198 kvm_mmu_flush_tlb(vcpu);
3201 static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
3203 u64 *spte = vcpu->arch.last_pte_updated;
3205 return !!(spte && (*spte & shadow_accessed_mask));
3208 static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3214 if (!is_present_gpte(gpte))
3216 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
3218 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
3220 pfn = gfn_to_pfn(vcpu->kvm, gfn);
3222 if (is_error_pfn(pfn)) {
3223 kvm_release_pfn_clean(pfn);
3226 vcpu->arch.update_pte.pfn = pfn;
3229 static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
3231 u64 *spte = vcpu->arch.last_pte_updated;
3234 && vcpu->arch.last_pte_gfn == gfn
3235 && shadow_accessed_mask
3236 && !(*spte & shadow_accessed_mask)
3237 && is_shadow_present_pte(*spte))
3238 set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
3241 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3242 const u8 *new, int bytes,
3243 bool guest_initiated)
3245 gfn_t gfn = gpa >> PAGE_SHIFT;
3246 union kvm_mmu_page_role mask = { .word = 0 };
3247 struct kvm_mmu_page *sp;
3248 struct hlist_node *node;
3249 LIST_HEAD(invalid_list);
3252 unsigned offset = offset_in_page(gpa);
3254 unsigned page_offset;
3255 unsigned misaligned;
3262 bool remote_flush, local_flush, zap_page;
3264 zap_page = remote_flush = local_flush = false;
3266 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
3268 invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
3271 * Assume that the pte write is on a page table of the same type
3272 * as the current vcpu paging mode, since we update the sptes only
3273 * when they have the same mode.
3275 if ((is_pae(vcpu) && bytes == 4) || !new) {
3276 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
3281 r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
3284 new = (const u8 *)&gentry;
3289 gentry = *(const u32 *)new;
3292 gentry = *(const u64 *)new;
3299 mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
3300 spin_lock(&vcpu->kvm->mmu_lock);
3301 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
3303 kvm_mmu_free_some_pages(vcpu);
3304 ++vcpu->kvm->stat.mmu_pte_write;
3305 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
3306 if (guest_initiated) {
3307 kvm_mmu_access_page(vcpu, gfn);
3308 if (gfn == vcpu->arch.last_pt_write_gfn
3309 && !last_updated_pte_accessed(vcpu)) {
3310 ++vcpu->arch.last_pt_write_count;
3311 if (vcpu->arch.last_pt_write_count >= 3)
3314 vcpu->arch.last_pt_write_gfn = gfn;
3315 vcpu->arch.last_pt_write_count = 1;
3316 vcpu->arch.last_pte_updated = NULL;
3320 mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
3321 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
3322 pte_size = sp->role.cr4_pae ? 8 : 4;
3323 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
3324 misaligned |= bytes < 4;
3325 if (misaligned || flooded) {
3327 * Misaligned accesses are too much trouble to fix
3328 * up; also, they usually indicate a page is not used
3329 * as a page table.
3331 * If we're seeing too many writes to a page,
3332 * it may no longer be a page table, or we may be
3333 * forking, in which case it is better to unmap the
3334 * page.
3336 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
3337 gpa, bytes, sp->role.word);
3338 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
3340 ++vcpu->kvm->stat.mmu_flooded;
3343 page_offset = offset;
3344 level = sp->role.level;
3346 if (!sp->role.cr4_pae) {
3347 page_offset <<= 1; /* 32->64 */
3349 * A 32-bit pde maps 4MB while the shadow pdes map
3350 * only 2MB. So we need to double the offset again
3351 * and zap two pdes instead of one.
3353 if (level == PT32_ROOT_LEVEL) {
3354 page_offset &= ~7; /* kill rounding error */
3358 quadrant = page_offset >> PAGE_SHIFT;
3359 page_offset &= ~PAGE_MASK;
3360 if (quadrant != sp->role.quadrant)
3364 spte = &sp->spt[page_offset / sizeof(*spte)];
3367 mmu_pte_write_zap_pte(vcpu, sp, spte);
3369 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
3371 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
3372 if (!remote_flush && need_remote_flush(entry, *spte))
3373 remote_flush = true;
3377 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
3378 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3379 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
3380 spin_unlock(&vcpu->kvm->mmu_lock);
3381 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
3382 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
3383 vcpu->arch.update_pte.pfn = bad_pfn;
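/*
 * Two heuristics above decide when to give up on shadowing a page:
 * write flooding (three or more consecutive writes to the same gfn
 * without the last updated spte being accessed in between) and
 * misaligned or undersized writes that cannot be a single gpte update.
 * Example of the alignment test: a 4-byte write at offset 6 into a page
 * of 8-byte ptes gives (6 ^ 9) & ~7 == 8, i.e. the write straddles two
 * ptes, so the shadow page is zapped instead of patched.
 */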
3387 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
3392 if (vcpu->arch.mmu.direct_map)
3395 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
3397 spin_lock(&vcpu->kvm->mmu_lock);
3398 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
3399 spin_unlock(&vcpu->kvm->mmu_lock);
3402 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
3404 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
3406 LIST_HEAD(invalid_list);
3408 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
3409 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
3410 struct kvm_mmu_page *sp;
3412 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
3413 struct kvm_mmu_page, link);
3414 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
3415 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3416 ++vcpu->kvm->stat.mmu_recycled;
3420 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
3421 void *insn, int insn_len)
3424 enum emulation_result er;
3426 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
3435 r = mmu_topup_memory_caches(vcpu);
3439 er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
3444 case EMULATE_DO_MMIO:
3445 ++vcpu->stat.mmio_exits;
3455 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
3457 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
3459 vcpu->arch.mmu.invlpg(vcpu, gva);
3460 kvm_mmu_flush_tlb(vcpu);
3461 ++vcpu->stat.invlpg;
3463 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
3465 void kvm_enable_tdp(void)
3469 EXPORT_SYMBOL_GPL(kvm_enable_tdp);
3471 void kvm_disable_tdp(void)
3473 tdp_enabled = false;
3475 EXPORT_SYMBOL_GPL(kvm_disable_tdp);
3477 static void free_mmu_pages(struct kvm_vcpu *vcpu)
3479 free_page((unsigned long)vcpu->arch.mmu.pae_root);
3480 if (vcpu->arch.mmu.lm_root != NULL)
3481 free_page((unsigned long)vcpu->arch.mmu.lm_root);
3484 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
3492 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
3493 * Therefore we need to allocate shadow page tables in the first
3494 * 4GB of memory, which happens to fit the DMA32 zone.
3496 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
3500 vcpu->arch.mmu.pae_root = page_address(page);
3501 for (i = 0; i < 4; ++i)
3502 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
3507 int kvm_mmu_create(struct kvm_vcpu *vcpu)
3510 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3512 return alloc_mmu_pages(vcpu);
3515 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
3518 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3520 return init_kvm_mmu(vcpu);
3523 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3525 struct kvm_mmu_page *sp;
3527 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
3531 if (!test_bit(slot, sp->slot_bitmap))
3535 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3536 if (!is_shadow_present_pte(pt[i]) ||
3537 !is_last_spte(pt[i], sp->role.level))
3540 if (is_large_pte(pt[i])) {
3541 drop_spte(kvm, &pt[i],
3542 shadow_trap_nonpresent_pte);
3548 if (is_writable_pte(pt[i]))
3549 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
3552 kvm_flush_remote_tlbs(kvm);
3555 void kvm_mmu_zap_all(struct kvm *kvm)
3557 struct kvm_mmu_page *sp, *node;
3558 LIST_HEAD(invalid_list);
3560 spin_lock(&kvm->mmu_lock);
3562 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
3563 if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
3566 kvm_mmu_commit_zap_page(kvm, &invalid_list);
3567 spin_unlock(&kvm->mmu_lock);
3570 static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3571 struct list_head *invalid_list)
3573 struct kvm_mmu_page *page;
3575 page = container_of(kvm->arch.active_mmu_pages.prev,
3576 struct kvm_mmu_page, link);
3577 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
3580 static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3583 struct kvm *kvm_freed = NULL;
3585 if (nr_to_scan == 0)
3588 raw_spin_lock(&kvm_lock);
3590 list_for_each_entry(kvm, &vm_list, vm_list) {
3591 int idx, freed_pages;
3592 LIST_HEAD(invalid_list);
3594 idx = srcu_read_lock(&kvm->srcu);
3595 spin_lock(&kvm->mmu_lock);
3596 if (!kvm_freed && nr_to_scan > 0 &&
3597 kvm->arch.n_used_mmu_pages > 0) {
3598 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3604 kvm_mmu_commit_zap_page(kvm, &invalid_list);
3605 spin_unlock(&kvm->mmu_lock);
3606 srcu_read_unlock(&kvm->srcu, idx);
3609 list_move_tail(&kvm_freed->vm_list, &vm_list);
3611 raw_spin_unlock(&kvm_lock);
3614 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
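/*
 * The shrinker reclaims from a single VM per invocation and then rotates
 * that VM to the tail of vm_list, so repeated memory pressure is spread
 * across guests instead of repeatedly penalising the first one.  The
 * value returned is the global count of shadow pages in use, which the
 * VM subsystem uses to size subsequent shrink requests.
 */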
3617 static struct shrinker mmu_shrinker = {
3618 .shrink = mmu_shrink,
3619 .seeks = DEFAULT_SEEKS * 10,
3622 static void mmu_destroy_caches(void)
3624 if (pte_chain_cache)
3625 kmem_cache_destroy(pte_chain_cache);
3626 if (rmap_desc_cache)
3627 kmem_cache_destroy(rmap_desc_cache);
3628 if (mmu_page_header_cache)
3629 kmem_cache_destroy(mmu_page_header_cache);
3632 int kvm_mmu_module_init(void)
3634 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
3635 sizeof(struct kvm_pte_chain),
3637 if (!pte_chain_cache)
3639 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
3640 sizeof(struct kvm_rmap_desc),
3642 if (!rmap_desc_cache)
3645 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
3646 sizeof(struct kvm_mmu_page),
3648 if (!mmu_page_header_cache)
3651 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
3654 register_shrinker(&mmu_shrinker);
3659 mmu_destroy_caches();
3664 * Calculate mmu pages needed for kvm.
3666 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3669 unsigned int nr_mmu_pages;
3670 unsigned int nr_pages = 0;
3671 struct kvm_memslots *slots;
3673 slots = kvm_memslots(kvm);
3675 for (i = 0; i < slots->nmemslots; i++)
3676 nr_pages += slots->memslots[i].npages;
3678 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
3679 nr_mmu_pages = max(nr_mmu_pages,
3680 (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
3682 return nr_mmu_pages;
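/*
 * Rough sizing example, assuming the usual KVM_PERMILLE_MMU_PAGES of 20
 * and KVM_MIN_ALLOC_MMU_PAGES of 64: a guest with 4GB of memory has
 * about one million guest pages, so it is allowed on the order of
 * 20,000 shadow pages; very small guests still get the 64-page minimum.
 */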
3685 static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3688 if (len > buffer->len)
3693 static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3698 ret = pv_mmu_peek_buffer(buffer, len);
3703 buffer->processed += len;
3707 static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3708 gpa_t addr, gpa_t value)
3713 if (!is_long_mode(vcpu) && !is_pae(vcpu))
3716 r = mmu_topup_memory_caches(vcpu);
3720 if (!emulator_write_phys(vcpu, addr, &value, bytes))
3726 static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3728 (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
3732 static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
3734 spin_lock(&vcpu->kvm->mmu_lock);
3735 mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
3736 spin_unlock(&vcpu->kvm->mmu_lock);
3740 static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
3741 struct kvm_pv_mmu_op_buffer *buffer)
3743 struct kvm_mmu_op_header *header;
3745 header = pv_mmu_peek_buffer(buffer, sizeof *header);
3748 switch (header->op) {
3749 case KVM_MMU_OP_WRITE_PTE: {
3750 struct kvm_mmu_op_write_pte *wpte;
3752 wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
3755 return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
3758 case KVM_MMU_OP_FLUSH_TLB: {
3759 struct kvm_mmu_op_flush_tlb *ftlb;
3761 ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
3764 return kvm_pv_mmu_flush_tlb(vcpu);
3766 case KVM_MMU_OP_RELEASE_PT: {
3767 struct kvm_mmu_op_release_pt *rpt;
3769 rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
3772 return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
3778 int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
3779 gpa_t addr, unsigned long *ret)
3782 struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
3784 buffer->ptr = buffer->buf;
3785 buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
3786 buffer->processed = 0;
3788 r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
3792 while (buffer->len) {
3793 r = kvm_pv_mmu_op_one(vcpu, buffer);
3802 *ret = buffer->processed;
3806 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3808 struct kvm_shadow_walk_iterator iterator;
3811 spin_lock(&vcpu->kvm->mmu_lock);
3812 for_each_shadow_entry(vcpu, addr, iterator) {
3813 sptes[iterator.level-1] = *iterator.sptep;
3815 if (!is_shadow_present_pte(*iterator.sptep))
3818 spin_unlock(&vcpu->kvm->mmu_lock);
3822 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3824 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3828 destroy_kvm_mmu(vcpu);
3829 free_mmu_pages(vcpu);
3830 mmu_free_memory_caches(vcpu);
3833 #ifdef CONFIG_KVM_MMU_AUDIT
3834 #include "mmu_audit.c"
3836 static void mmu_audit_disable(void) { }
3839 void kvm_mmu_module_exit(void)
3841 mmu_destroy_caches();
3842 percpu_counter_destroy(&kvm_total_used_mmu_pages);
3843 unregister_shrinker(&mmu_shrinker);
3844 mmu_audit_disable();