struct kvm_shadow_walk_iterator {
u64 addr;
hpa_t shadow_addr;
- int level;
u64 *sptep;
+ int level;
unsigned index;
};
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
+#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
+ for (shadow_walk_init(&(_walker), _vcpu, _addr); \
+ shadow_walk_okay(&(_walker)) && \
+ ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
+ __shadow_walk_next(&(_walker), spte))
+
static struct kmem_cache *pte_list_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;
-static u64 __read_mostly shadow_trap_nonpresent_pte;
-static u64 __read_mostly shadow_notrap_nonpresent_pte;
static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mmio_mask;
-static inline u64 rsvd_bits(int s, int e)
+static void mmu_spte_set(u64 *sptep, u64 spte);
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
{
- return ((1ULL << (e - s + 1)) - 1) << s;
+ shadow_mmio_mask = mmio_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+
+static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+{
+ access &= ACC_WRITE_MASK | ACC_USER_MASK;
+
+ trace_mark_mmio_spte(sptep, gfn, access);
+ mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
}
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
+static bool is_mmio_spte(u64 spte)
{
- shadow_trap_nonpresent_pte = trap_pte;
- shadow_notrap_nonpresent_pte = notrap_pte;
+ return (spte & shadow_mmio_mask) == shadow_mmio_mask;
+}
+
+static gfn_t get_mmio_spte_gfn(u64 spte)
+{
+ return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+}
+
+static unsigned get_mmio_spte_access(u64 spte)
+{
+ return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+}
+
+static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+{
+ if (unlikely(is_noslot_pfn(pfn))) {
+ mark_mmio_spte(sptep, gfn, access);
+ return true;
+ }
+
+ return false;
+}
+
+static inline u64 rsvd_bits(int s, int e)
+{
+ return ((1ULL << (e - s + 1)) - 1) << s;
}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask)
static int is_shadow_present_pte(u64 pte)
{
- return pte != shadow_trap_nonpresent_pte
- && pte != shadow_notrap_nonpresent_pte;
+ return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
}
static int is_large_pte(u64 pte)
return (gpte & PT32_DIR_PSE36_MASK) << shift;
}
+#ifdef CONFIG_X86_64
static void __set_spte(u64 *sptep, u64 spte)
{
- set_64bit(sptep, spte);
+ *sptep = spte;
}
-static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
-#ifdef CONFIG_X86_64
- return xchg(sptep, new_spte);
+ *sptep = spte;
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+ return xchg(sptep, spte);
+}
+
+static u64 __get_spte_lockless(u64 *sptep)
+{
+ return ACCESS_ONCE(*sptep);
+}
+
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+ /* It is valid if the spte is zapped. */
+ return spte == 0ull;
+}
#else
- u64 old_spte;
+union split_spte {
+ struct {
+ u32 spte_low;
+ u32 spte_high;
+ };
+ u64 spte;
+};
- do {
- old_spte = *sptep;
- } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+static void count_spte_clear(u64 *sptep, u64 spte)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
- return old_spte;
-#endif
+ if (is_shadow_present_pte(spte))
+ return;
+
+ /* Ensure the spte is completely set before we increase the count */
+ smp_wmb();
+ sp->clear_spte_count++;
+}
+
+static void __set_spte(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ ssptep->spte_high = sspte.spte_high;
+
+ /*
+ * If we map the spte from nonpresent to present, We should store
+ * the high bits firstly, then set present bit, so cpu can not
+ * fetch this spte while we are setting the spte.
+ */
+ smp_wmb();
+
+ ssptep->spte_low = sspte.spte_low;
+}
+
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ ssptep->spte_low = sspte.spte_low;
+
+ /*
+ * If we map the spte from present to nonpresent, we should clear
+ * present bit firstly to avoid vcpu fetch the old high bits.
+ */
+ smp_wmb();
+
+ ssptep->spte_high = sspte.spte_high;
+ count_spte_clear(sptep, spte);
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte, orig;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ /* xchg acts as a barrier before the setting of the high bits */
+ orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
+ orig.spte_high = ssptep->spte_high = sspte.spte_high;
+ count_spte_clear(sptep, spte);
+
+ return orig.spte;
+}
+
+/*
+ * The idea using the light way get the spte on x86_32 guest is from
+ * gup_get_pte(arch/x86/mm/gup.c).
+ * The difference is we can not catch the spte tlb flush if we leave
+ * guest mode, so we emulate it by increase clear_spte_count when spte
+ * is cleared.
+ */
+static u64 __get_spte_lockless(u64 *sptep)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ union split_spte spte, *orig = (union split_spte *)sptep;
+ int count;
+
+retry:
+ count = sp->clear_spte_count;
+ smp_rmb();
+
+ spte.spte_low = orig->spte_low;
+ smp_rmb();
+
+ spte.spte_high = orig->spte_high;
+ smp_rmb();
+
+ if (unlikely(spte.spte_low != orig->spte_low ||
+ count != sp->clear_spte_count))
+ goto retry;
+
+ return spte.spte;
}
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+ union split_spte sspte = (union split_spte)spte;
+ u32 high_mmio_mask = shadow_mmio_mask >> 32;
+
+ /* It is valid if the spte is zapped. */
+ if (spte == 0ull)
+ return true;
+
+ /* It is valid if the spte is being zapped. */
+ if (sspte.spte_low == 0ull &&
+ (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
+ return true;
+
+ return false;
+}
+#endif
+
static bool spte_has_volatile_bits(u64 spte)
{
if (!shadow_accessed_mask)
return (old_spte & bit_mask) && !(new_spte & bit_mask);
}
-static void update_spte(u64 *sptep, u64 new_spte)
+/* Rules for using mmu_spte_set:
+ * Set the sptep from nonpresent to present.
+ * Note: the sptep being assigned *must* be either not present
+ * or in a state where the hardware will not attempt to update
+ * the spte.
+ */
+static void mmu_spte_set(u64 *sptep, u64 new_spte)
+{
+ WARN_ON(is_shadow_present_pte(*sptep));
+ __set_spte(sptep, new_spte);
+}
+
+/* Rules for using mmu_spte_update:
+ * Update the state bits, it means the mapped pfn is not changged.
+ */
+static void mmu_spte_update(u64 *sptep, u64 new_spte)
{
u64 mask, old_spte = *sptep;
WARN_ON(!is_rmap_spte(new_spte));
+ if (!is_shadow_present_pte(old_spte))
+ return mmu_spte_set(sptep, new_spte);
+
new_spte |= old_spte & shadow_dirty_mask;
mask = shadow_accessed_mask;
mask |= shadow_dirty_mask;
if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
- __set_spte(sptep, new_spte);
+ __update_clear_spte_fast(sptep, new_spte);
else
- old_spte = __xchg_spte(sptep, new_spte);
+ old_spte = __update_clear_spte_slow(sptep, new_spte);
if (!shadow_accessed_mask)
return;
kvm_set_pfn_dirty(spte_to_pfn(old_spte));
}
+/*
+ * Rules for using mmu_spte_clear_track_bits:
+ * It sets the sptep from present to nonpresent, and track the
+ * state bits, it is used to clear the last level sptep.
+ */
+static int mmu_spte_clear_track_bits(u64 *sptep)
+{
+ pfn_t pfn;
+ u64 old_spte = *sptep;
+
+ if (!spte_has_volatile_bits(old_spte))
+ __update_clear_spte_fast(sptep, 0ull);
+ else
+ old_spte = __update_clear_spte_slow(sptep, 0ull);
+
+ if (!is_rmap_spte(old_spte))
+ return 0;
+
+ pfn = spte_to_pfn(old_spte);
+ if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
+ kvm_set_pfn_accessed(pfn);
+ if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
+ kvm_set_pfn_dirty(pfn);
+ return 1;
+}
+
+/*
+ * Rules for using mmu_spte_clear_no_track:
+ * Directly clear spte without caring the state bits of sptep,
+ * it is used to set the upper level spte.
+ */
+static void mmu_spte_clear_no_track(u64 *sptep)
+{
+ __update_clear_spte_fast(sptep, 0ull);
+}
+
+static u64 mmu_spte_get_lockless(u64 *sptep)
+{
+ return __get_spte_lockless(sptep);
+}
+
+static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
+{
+ rcu_read_lock();
+ atomic_inc(&vcpu->kvm->arch.reader_counter);
+
+ /* Increase the counter before walking shadow page table */
+ smp_mb__after_atomic_inc();
+}
+
+static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
+{
+ /* Decrease the counter after walking shadow page table finished */
+ smp_mb__before_atomic_dec();
+ atomic_dec(&vcpu->kvm->arch.reader_counter);
+ rcu_read_unlock();
+}
+
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
struct kmem_cache *base_cache, int min)
{
pte_list_remove(spte, rmapp);
}
-static int set_spte_track_bits(u64 *sptep, u64 new_spte)
-{
- pfn_t pfn;
- u64 old_spte = *sptep;
-
- if (!spte_has_volatile_bits(old_spte))
- __set_spte(sptep, new_spte);
- else
- old_spte = __xchg_spte(sptep, new_spte);
-
- if (!is_rmap_spte(old_spte))
- return 0;
-
- pfn = spte_to_pfn(old_spte);
- if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
- kvm_set_pfn_accessed(pfn);
- if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
- kvm_set_pfn_dirty(pfn);
- return 1;
-}
-
-static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+static void drop_spte(struct kvm *kvm, u64 *sptep)
{
- if (set_spte_track_bits(sptep, new_spte))
+ if (mmu_spte_clear_track_bits(sptep))
rmap_remove(kvm, sptep);
}
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
if (is_writable_pte(*spte)) {
- update_spte(spte, *spte & ~PT_WRITABLE_MASK);
+ mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
if (is_writable_pte(*spte)) {
- drop_spte(kvm, spte,
- shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte);
--kvm->stat.lpages;
spte = NULL;
write_protected = 1;
while ((spte = rmap_next(kvm, rmapp, NULL))) {
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
- drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte);
need_tlb_flush = 1;
}
return need_tlb_flush;
rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
need_flush = 1;
if (pte_write(*ptep)) {
- drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte);
spte = rmap_next(kvm, rmapp, NULL);
} else {
new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~SPTE_HOST_WRITEABLE;
new_spte &= ~shadow_accessed_mask;
- set_spte_track_bits(spte, new_spte);
+ mmu_spte_clear_track_bits(spte);
+ mmu_spte_set(spte, new_spte);
spte = rmap_next(kvm, rmapp, spte);
}
}
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}
-static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+/*
+ * Remove the sp from shadow page cache, after call it,
+ * we can not find this sp from the cache, and the shadow
+ * page table is still valid.
+ * It should be under the protection of mmu lock.
+ */
+static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
{
ASSERT(is_empty_shadow_page(sp->spt));
hlist_del(&sp->hash_link);
- list_del(&sp->link);
- free_page((unsigned long)sp->spt);
if (!sp->role.direct)
free_page((unsigned long)sp->gfns);
+}
+
+/*
+ * Free the shadow page table and the sp, we can do it
+ * out of the protection of mmu lock.
+ */
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
+{
+ list_del(&sp->link);
+ free_page((unsigned long)sp->spt);
kmem_cache_free(mmu_page_header_cache, sp);
- kvm_mod_used_mmu_pages(kvm, -1);
}
static unsigned kvm_page_table_hashfn(gfn_t gfn)
u64 *parent_pte)
{
mmu_page_remove_parent_pte(sp, parent_pte);
- __set_spte(parent_pte, shadow_trap_nonpresent_pte);
+ mmu_spte_clear_no_track(parent_pte);
}
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
kvm_mmu_mark_parents_unsync(sp);
}
-static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
- sp->spt[i] = shadow_trap_nonpresent_pte;
-}
-
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
}
}
+static void init_shadow_page_table(struct kvm_mmu_page *sp)
+{
+ int i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+ sp->spt[i] = 0ull;
+}
+
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
account_shadowed(vcpu->kvm, gfn);
}
- if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
- vcpu->arch.mmu.prefetch_page(vcpu, sp);
- else
- nonpaging_prefetch_page(vcpu, sp);
+ init_shadow_page_table(sp);
trace_kvm_mmu_get_page(sp, true);
return sp;
}
return true;
}
-static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
+ u64 spte)
{
- if (is_last_spte(*iterator->sptep, iterator->level)) {
+ if (is_last_spte(spte, iterator->level)) {
iterator->level = 0;
return;
}
- iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
+ iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
--iterator->level;
}
+static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+{
+ return __shadow_walk_next(iterator, *iterator->sptep);
+}
+
static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
{
u64 spte;
spte = __pa(sp->spt)
| PT_PRESENT_MASK | PT_ACCESSED_MASK
| PT_WRITABLE_MASK | PT_USER_MASK;
- __set_spte(sptep, spte);
+ mmu_spte_set(sptep, spte);
}
static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
{
if (is_large_pte(*sptep)) {
- drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep);
kvm_flush_remote_tlbs(vcpu->kvm);
}
}
pte = *spte;
if (is_shadow_present_pte(pte)) {
if (is_last_spte(pte, sp->role.level))
- drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
}
- }
- __set_spte(spte, shadow_trap_nonpresent_pte);
+ } else if (is_mmio_spte(pte))
+ mmu_spte_clear_no_track(spte);
+
if (is_large_pte(pte))
--kvm->stat.lpages;
}
/* Count self */
ret++;
list_move(&sp->link, invalid_list);
+ kvm_mod_used_mmu_pages(kvm, -1);
} else {
list_move(&sp->link, &kvm->arch.active_mmu_pages);
kvm_reload_remote_mmus(kvm);
return ret;
}
+static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp;
+
+ list_for_each_entry(sp, invalid_list, link)
+ kvm_mmu_isolate_page(sp);
+}
+
+static void free_pages_rcu(struct rcu_head *head)
+{
+ struct kvm_mmu_page *next, *sp;
+
+ sp = container_of(head, struct kvm_mmu_page, rcu);
+ while (sp) {
+ if (!list_empty(&sp->link))
+ next = list_first_entry(&sp->link,
+ struct kvm_mmu_page, link);
+ else
+ next = NULL;
+ kvm_mmu_free_page(sp);
+ sp = next;
+ }
+}
+
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list)
{
kvm_flush_remote_tlbs(kvm);
+ if (atomic_read(&kvm->arch.reader_counter)) {
+ kvm_mmu_isolate_pages(invalid_list);
+ sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+ list_del_init(invalid_list);
+
+ trace_kvm_mmu_delay_free_pages(sp);
+ call_rcu(&sp->rcu, free_pages_rcu);
+ return;
+ }
+
do {
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
WARN_ON(!sp->role.invalid || sp->root_count);
- kvm_mmu_free_page(kvm, sp);
+ kvm_mmu_isolate_page(sp);
+ kvm_mmu_free_page(sp);
} while (!list_empty(invalid_list));
}
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
}
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
}
__set_bit(slot, sp->slot_bitmap);
}
-static void mmu_convert_notrap(struct kvm_mmu_page *sp)
-{
- int i;
- u64 *pt = sp->spt;
-
- if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
- return;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (pt[i] == shadow_notrap_nonpresent_pte)
- __set_spte(&pt[i], shadow_trap_nonpresent_pte);
- }
-}
-
/*
* The function is based on mtrr_type_lookup() in
* arch/x86/kernel/cpu/mtrr/generic.c
sp->unsync = 1;
kvm_mmu_mark_parents_unsync(sp);
- mmu_convert_notrap(sp);
}
static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
- int write_fault, int dirty, int level,
+ int write_fault, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
u64 spte, entry = *sptep;
int ret = 0;
+ if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+ return 0;
+
/*
* We don't set the accessed bit, since we sometimes want to see
* whether the guest actually used the pte (in order to detect
spte = PT_PRESENT_MASK;
if (!speculative)
spte |= shadow_accessed_mask;
- if (!dirty)
- pte_access &= ~ACC_WRITE_MASK;
+
if (pte_access & ACC_EXEC_MASK)
spte |= shadow_x_mask;
else
if (level > PT_PAGE_TABLE_LEVEL &&
has_wrprotected_page(vcpu->kvm, gfn, level)) {
ret = 1;
- drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep);
goto done;
}
mark_page_dirty(vcpu->kvm, gfn);
set_pte:
- update_spte(sptep, spte);
+ mmu_spte_update(sptep, spte);
/*
* If we overwrite a writable spte with a read-only one we
* should flush remote TLBs. Otherwise rmap_write_protect
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pt_access, unsigned pte_access,
- int user_fault, int write_fault, int dirty,
- int *ptwrite, int level, gfn_t gfn,
+ int user_fault, int write_fault,
+ int *emulate, int level, gfn_t gfn,
pfn_t pfn, bool speculative,
bool host_writable)
{
} else if (pfn != spte_to_pfn(*sptep)) {
pgprintk("hfn old %llx new %llx\n",
spte_to_pfn(*sptep), pfn);
- drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep);
kvm_flush_remote_tlbs(vcpu->kvm);
} else
was_rmapped = 1;
}
if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
- dirty, level, gfn, pfn, speculative, true,
+ level, gfn, pfn, speculative, true,
host_writable)) {
if (write_fault)
- *ptwrite = 1;
+ *emulate = 1;
kvm_mmu_flush_tlb(vcpu);
}
+ if (unlikely(is_mmio_spte(*sptep) && emulate))
+ *emulate = 1;
+
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
is_large_pte(*sptep)? "2MB" : "4kB",
slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
if (!slot) {
- get_page(bad_page);
- return page_to_pfn(bad_page);
+ get_page(fault_page);
+ return page_to_pfn(fault_page);
}
hva = gfn_to_hva_memslot(slot, gfn);
for (i = 0; i < ret; i++, gfn++, start++)
mmu_set_spte(vcpu, start, ACC_ALL,
- access, 0, 0, 1, NULL,
+ access, 0, 0, NULL,
sp->role.level, gfn,
page_to_pfn(pages[i]), true, true);
spte = sp->spt + i;
for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
- if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
+ if (is_shadow_present_pte(*spte) || spte == sptep) {
if (!start)
continue;
if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
- int pt_write = 0;
+ int emulate = 0;
gfn_t pseudo_gfn;
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
unsigned pte_access = ACC_ALL;
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
- 0, write, 1, &pt_write,
+ 0, write, &emulate,
level, gfn, pfn, prefault, map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
}
- if (*iterator.sptep == shadow_trap_nonpresent_pte) {
+ if (!is_shadow_present_pte(*iterator.sptep)) {
u64 base_addr = iterator.addr;
base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
return -ENOMEM;
}
- __set_spte(iterator.sptep,
- __pa(sp->spt)
- | PT_PRESENT_MASK | PT_WRITABLE_MASK
- | shadow_user_mask | shadow_x_mask
- | shadow_accessed_mask);
+ mmu_spte_set(iterator.sptep,
+ __pa(sp->spt)
+ | PT_PRESENT_MASK | PT_WRITABLE_MASK
+ | shadow_user_mask | shadow_x_mask
+ | shadow_accessed_mask);
}
}
- return pt_write;
+ return emulate;
}
static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
send_sig_info(SIGBUS, &info, tsk);
}
-static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gva_t gva,
- unsigned access, gfn_t gfn, pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
{
kvm_release_pfn_clean(pfn);
if (is_hwpoison_pfn(pfn)) {
kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
return 0;
- } else if (is_fault_pfn(pfn))
- return -EFAULT;
+ }
- vcpu_cache_mmio_info(vcpu, gva, gfn, access);
- return 1;
+ return -EFAULT;
}
static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
}
}
+static bool mmu_invalid_pfn(pfn_t pfn)
+{
+ return unlikely(is_invalid_pfn(pfn));
+}
+
+static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+ pfn_t pfn, unsigned access, int *ret_val)
+{
+ bool ret = true;
+
+ /* The pfn is invalid, report the error! */
+ if (unlikely(is_invalid_pfn(pfn))) {
+ *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
+ goto exit;
+ }
+
+ if (unlikely(is_noslot_pfn(pfn)))
+ vcpu_cache_mmio_info(vcpu, gva, gfn, access);
+
+ ret = false;
+exit:
+ return ret;
+}
+
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable);
if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
return 0;
- /* mmio */
- if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu, v, ACC_ALL, gfn, pfn);
+ if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
+ return r;
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
}
+static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ if (direct)
+ return vcpu_match_mmio_gpa(vcpu, addr);
+
+ return vcpu_match_mmio_gva(vcpu, addr);
+}
+
+
+/*
+ * On direct hosts, the last spte is only allows two states
+ * for mmio page fault:
+ * - It is the mmio spte
+ * - It is zapped or it is being zapped.
+ *
+ * This function completely checks the spte when the last spte
+ * is not the mmio spte.
+ */
+static bool check_direct_spte_mmio_pf(u64 spte)
+{
+ return __check_direct_spte_mmio_pf(spte);
+}
+
+static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ u64 spte = 0ull;
+
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
+ if (!is_shadow_present_pte(spte))
+ break;
+ walk_shadow_page_lockless_end(vcpu);
+
+ return spte;
+}
+
+/*
+ * If it is a real mmio page fault, return 1 and emulat the instruction
+ * directly, return 0 to let CPU fault again on the address, -1 is
+ * returned if bug is detected.
+ */
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ u64 spte;
+
+ if (quickly_check_mmio_pf(vcpu, addr, direct))
+ return 1;
+
+ spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
+
+ if (is_mmio_spte(spte)) {
+ gfn_t gfn = get_mmio_spte_gfn(spte);
+ unsigned access = get_mmio_spte_access(spte);
+
+ if (direct)
+ addr = 0;
+
+ trace_handle_mmio_page_fault(addr, gfn, access);
+ vcpu_cache_mmio_info(vcpu, addr, gfn, access);
+ return 1;
+ }
+
+ /*
+ * It's ok if the gva is remapped by other cpus on shadow guest,
+ * it's a BUG if the gfn is not a mmio page.
+ */
+ if (direct && !check_direct_spte_mmio_pf(spte))
+ return -1;
+
+ /*
+ * If the page table is zapped by other cpus, let CPU fault again on
+ * the address.
+ */
+ return 0;
+}
+EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
+
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
+ u32 error_code, bool direct)
+{
+ int ret;
+
+ ret = handle_mmio_page_fault_common(vcpu, addr, direct);
+ WARN_ON(ret < 0);
+ return ret;
+}
+
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code, bool prefault)
{
int r;
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
+
+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return handle_mmio_page_fault(vcpu, gva, error_code, true);
+
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return handle_mmio_page_fault(vcpu, gpa, error_code, true);
+
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
return 0;
- /* mmio */
- if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu, 0, 0, gfn, pfn);
+ if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
+ return r;
+
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
context->page_fault = nonpaging_page_fault;
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->free = nonpaging_free;
- context->prefetch_page = nonpaging_prefetch_page;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
context->update_pte = nonpaging_update_pte;
return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
}
+static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
+ int *nr_present)
+{
+ if (unlikely(is_mmio_spte(*sptep))) {
+ if (gfn != get_mmio_spte_gfn(*sptep)) {
+ mmu_spte_clear_no_track(sptep);
+ return true;
+ }
+
+ (*nr_present)++;
+ mark_mmio_spte(sptep, gfn, access);
+ return true;
+ }
+
+ return false;
+}
+
#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE
context->new_cr3 = paging_new_cr3;
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
- context->prefetch_page = paging64_prefetch_page;
context->sync_page = paging64_sync_page;
context->invlpg = paging64_invlpg;
context->update_pte = paging64_update_pte;
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
context->free = paging_free;
- context->prefetch_page = paging32_prefetch_page;
context->sync_page = paging32_sync_page;
context->invlpg = paging32_invlpg;
context->update_pte = paging32_update_pte;
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = tdp_page_fault;
context->free = nonpaging_free;
- context->prefetch_page = nonpaging_prefetch_page;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
context->update_pte = nonpaging_update_pte;
sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
++vcpu->kvm->stat.mmu_recycled;
}
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
continue;
if (is_large_pte(pt[i])) {
- drop_spte(kvm, &pt[i],
- shadow_trap_nonpresent_pte);
+ drop_spte(kvm, &pt[i]);
--kvm->stat.lpages;
continue;
}
/* avoid RMW */
if (is_writable_pte(pt[i]))
- update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
+ mmu_spte_update(&pt[i],
+ pt[i] & ~PT_WRITABLE_MASK);
}
}
kvm_flush_remote_tlbs(kvm);
int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
{
struct kvm_shadow_walk_iterator iterator;
+ u64 spte;
int nr_sptes = 0;
- spin_lock(&vcpu->kvm->mmu_lock);
- for_each_shadow_entry(vcpu, addr, iterator) {
- sptes[iterator.level-1] = *iterator.sptep;
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+ sptes[iterator.level-1] = spte;
nr_sptes++;
- if (!is_shadow_present_pte(*iterator.sptep))
+ if (!is_shadow_present_pte(spte))
break;
}
- spin_unlock(&vcpu->kvm->mmu_lock);
+ walk_shadow_page_lockless_end(vcpu);
return nr_sptes;
}