Merge branch 'for-linus' of git://git390.marist.edu/pub/scm/linux-2.6

[pandora-kernel.git] / arch / x86 / kvm / mmu.c
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c

index d1986b7..9335e1b 100644 (file)
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -172,8 +172,8 @@ struct pte_list_desc {
  struct kvm_shadow_walk_iterator {
         u64 addr;
         hpa_t shadow_addr;
-       int level;
         u64 *sptep;
+       int level;
         unsigned index;
  };
  
@@ -182,29 +182,68 @@ struct kvm_shadow_walk_iterator {
              shadow_walk_okay(&(_walker));                      \
              shadow_walk_next(&(_walker)))
  
+#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)    \
+       for (shadow_walk_init(&(_walker), _vcpu, _addr);                \
+            shadow_walk_okay(&(_walker)) &&                            \
+               ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });  \
+            __shadow_walk_next(&(_walker), spte))
+
  static struct kmem_cache *pte_list_desc_cache;
  static struct kmem_cache *mmu_page_header_cache;
  static struct percpu_counter kvm_total_used_mmu_pages;
  
-static u64 __read_mostly shadow_trap_nonpresent_pte;
-static u64 __read_mostly shadow_notrap_nonpresent_pte;
  static u64 __read_mostly shadow_nx_mask;
  static u64 __read_mostly shadow_x_mask;        /* mutual exclusive with nx_mask */
  static u64 __read_mostly shadow_user_mask;
  static u64 __read_mostly shadow_accessed_mask;
  static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mmio_mask;
  
-static inline u64 rsvd_bits(int s, int e)
+static void mmu_spte_set(u64 *sptep, u64 spte);
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
  {
-       return ((1ULL << (e - s + 1)) - 1) << s;
+       shadow_mmio_mask = mmio_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+
+static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+{
+       access &= ACC_WRITE_MASK | ACC_USER_MASK;
+
+       trace_mark_mmio_spte(sptep, gfn, access);
+       mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
  }
  
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
+static bool is_mmio_spte(u64 spte)
  {
-       shadow_trap_nonpresent_pte = trap_pte;
-       shadow_notrap_nonpresent_pte = notrap_pte;
+       return (spte & shadow_mmio_mask) == shadow_mmio_mask;
+}
+
+static gfn_t get_mmio_spte_gfn(u64 spte)
+{
+       return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+}
+
+static unsigned get_mmio_spte_access(u64 spte)
+{
+       return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+}
+
+static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+{
+       if (unlikely(is_noslot_pfn(pfn))) {
+               mark_mmio_spte(sptep, gfn, access);
+               return true;
+       }
+
+       return false;
+}
+
+static inline u64 rsvd_bits(int s, int e)
+{
+       return ((1ULL << (e - s + 1)) - 1) << s;
  }
-EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
  
  void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
                 u64 dirty_mask, u64 nx_mask, u64 x_mask)
@@ -229,8 +268,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
  
  static int is_shadow_present_pte(u64 pte)
  {
-       return pte != shadow_trap_nonpresent_pte
-               && pte != shadow_notrap_nonpresent_pte;
+       return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
  }
  
  static int is_large_pte(u64 pte)
@@ -269,26 +307,154 @@ static gfn_t pse36_gfn_delta(u32 gpte)
         return (gpte & PT32_DIR_PSE36_MASK) << shift;
  }
  
+#ifdef CONFIG_X86_64
  static void __set_spte(u64 *sptep, u64 spte)
  {
-       set_64bit(sptep, spte);
+       *sptep = spte;
  }
  
-static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
  {
-#ifdef CONFIG_X86_64
-       return xchg(sptep, new_spte);
+       *sptep = spte;
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+       return xchg(sptep, spte);
+}
+
+static u64 __get_spte_lockless(u64 *sptep)
+{
+       return ACCESS_ONCE(*sptep);
+}
+
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+       /* It is valid if the spte is zapped. */
+       return spte == 0ull;
+}
  #else
-       u64 old_spte;
+union split_spte {
+       struct {
+               u32 spte_low;
+               u32 spte_high;
+       };
+       u64 spte;
+};
  
-       do {
-               old_spte = *sptep;
-       } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+static void count_spte_clear(u64 *sptep, u64 spte)
+{
+       struct kvm_mmu_page *sp =  page_header(__pa(sptep));
  
-       return old_spte;
-#endif
+       if (is_shadow_present_pte(spte))
+               return;
+
+       /* Ensure the spte is completely set before we increase the count */
+       smp_wmb();
+       sp->clear_spte_count++;
+}
+
+static void __set_spte(u64 *sptep, u64 spte)
+{
+       union split_spte *ssptep, sspte;
+
+       ssptep = (union split_spte *)sptep;
+       sspte = (union split_spte)spte;
+
+       ssptep->spte_high = sspte.spte_high;
+
+       /*
+        * If we map the spte from nonpresent to present, we should store
+        * the high bits first, then set the present bit, so the cpu cannot
+        * fetch this spte while we are setting it.
+        */
+       smp_wmb();
+
+       ssptep->spte_low = sspte.spte_low;
+}
+
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
+{
+       union split_spte *ssptep, sspte;
+
+       ssptep = (union split_spte *)sptep;
+       sspte = (union split_spte)spte;
+
+       ssptep->spte_low = sspte.spte_low;
+
+       /*
+        * If we map the spte from present to nonpresent, we should clear
+        * the present bit first so the vcpu cannot fetch the old high bits.
+        */
+       smp_wmb();
+
+       ssptep->spte_high = sspte.spte_high;
+       count_spte_clear(sptep, spte);
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+       union split_spte *ssptep, sspte, orig;
+
+       ssptep = (union split_spte *)sptep;
+       sspte = (union split_spte)spte;
+
+       /* xchg acts as a barrier before the setting of the high bits */
+       orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
+       orig.spte_high = ssptep->spte_high = sspte.spte_high;
+       count_spte_clear(sptep, spte);
+
+       return orig.spte;
+}
+
+/*
+ * The idea of getting the spte in this lightweight way for an x86_32
+ * guest comes from gup_get_pte (arch/x86/mm/gup.c).
+ * The difference is that we cannot catch the spte tlb flush if we leave
+ * guest mode, so we emulate it by increasing clear_spte_count when the
+ * spte is cleared.
+ */
+static u64 __get_spte_lockless(u64 *sptep)
+{
+       struct kvm_mmu_page *sp =  page_header(__pa(sptep));
+       union split_spte spte, *orig = (union split_spte *)sptep;
+       int count;
+
+retry:
+       count = sp->clear_spte_count;
+       smp_rmb();
+
+       spte.spte_low = orig->spte_low;
+       smp_rmb();
+
+       spte.spte_high = orig->spte_high;
+       smp_rmb();
+
+       if (unlikely(spte.spte_low != orig->spte_low ||
+             count != sp->clear_spte_count))
+               goto retry;
+
+       return spte.spte;
  }
  
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+       union split_spte sspte = (union split_spte)spte;
+       u32 high_mmio_mask = shadow_mmio_mask >> 32;
+
+       /* It is valid if the spte is zapped. */
+       if (spte == 0ull)
+               return true;
+
+       /* It is valid if the spte is being zapped. */
+       if (sspte.spte_low == 0ull &&
+           (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
+               return true;
+
+       return false;
+}
+#endif
+
  static bool spte_has_volatile_bits(u64 spte)
  {
         if (!shadow_accessed_mask)
@@ -309,12 +475,30 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
         return (old_spte & bit_mask) && !(new_spte & bit_mask);
  }
  
-static void update_spte(u64 *sptep, u64 new_spte)
+/* Rules for using mmu_spte_set:
+ * Set the sptep from nonpresent to present.
+ * Note: the sptep being assigned *must* be either not present
+ * or in a state where the hardware will not attempt to update
+ * the spte.
+ */
+static void mmu_spte_set(u64 *sptep, u64 new_spte)
+{
+       WARN_ON(is_shadow_present_pte(*sptep));
+       __set_spte(sptep, new_spte);
+}
+
+/* Rules for using mmu_spte_update:
+ * Update the state bits; this means the mapped pfn is not changed.
+ */
+static void mmu_spte_update(u64 *sptep, u64 new_spte)
  {
         u64 mask, old_spte = *sptep;
  
         WARN_ON(!is_rmap_spte(new_spte));
  
+       if (!is_shadow_present_pte(old_spte))
+               return mmu_spte_set(sptep, new_spte);
+
         new_spte |= old_spte & shadow_dirty_mask;
  
         mask = shadow_accessed_mask;
@@ -322,9 +506,9 @@ static void update_spte(u64 *sptep, u64 new_spte)
                 mask |= shadow_dirty_mask;
  
         if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
-               __set_spte(sptep, new_spte);
+               __update_clear_spte_fast(sptep, new_spte);
         else
-               old_spte = __xchg_spte(sptep, new_spte);
+               old_spte = __update_clear_spte_slow(sptep, new_spte);
  
         if (!shadow_accessed_mask)
                 return;
@@ -335,6 +519,64 @@ static void update_spte(u64 *sptep, u64 new_spte)
                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
  }
  
+/*
+ * Rules for using mmu_spte_clear_track_bits:
+ * It sets the sptep from present to nonpresent and tracks the
+ * state bits; it is used to clear the last-level sptep.
+ */
+static int mmu_spte_clear_track_bits(u64 *sptep)
+{
+       pfn_t pfn;
+       u64 old_spte = *sptep;
+
+       if (!spte_has_volatile_bits(old_spte))
+               __update_clear_spte_fast(sptep, 0ull);
+       else
+               old_spte = __update_clear_spte_slow(sptep, 0ull);
+
+       if (!is_rmap_spte(old_spte))
+               return 0;
+
+       pfn = spte_to_pfn(old_spte);
+       if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
+               kvm_set_pfn_accessed(pfn);
+       if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
+               kvm_set_pfn_dirty(pfn);
+       return 1;
+}
+
+/*
+ * Rules for using mmu_spte_clear_no_track:
+ * Directly clear the spte without caring about the state bits of the
+ * sptep; it is used to set the upper level spte.
+ */
+static void mmu_spte_clear_no_track(u64 *sptep)
+{
+       __update_clear_spte_fast(sptep, 0ull);
+}
+
+static u64 mmu_spte_get_lockless(u64 *sptep)
+{
+       return __get_spte_lockless(sptep);
+}
+
+static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
+{
+       rcu_read_lock();
+       atomic_inc(&vcpu->kvm->arch.reader_counter);
+
+       /* Increase the counter before walking shadow page table */
+       smp_mb__after_atomic_inc();
+}
+
+static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
+{
+       /* Decrease the counter after walking shadow page table finished */
+       smp_mb__before_atomic_dec();
+       atomic_dec(&vcpu->kvm->arch.reader_counter);
+       rcu_read_unlock();
+}
+
  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
                                   struct kmem_cache *base_cache, int min)
  {
@@ -756,30 +998,9 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
         pte_list_remove(spte, rmapp);
  }
  
-static int set_spte_track_bits(u64 *sptep, u64 new_spte)
-{
-       pfn_t pfn;
-       u64 old_spte = *sptep;
-
-       if (!spte_has_volatile_bits(old_spte))
-               __set_spte(sptep, new_spte);
-       else
-               old_spte = __xchg_spte(sptep, new_spte);
-
-       if (!is_rmap_spte(old_spte))
-               return 0;
-
-       pfn = spte_to_pfn(old_spte);
-       if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
-               kvm_set_pfn_accessed(pfn);
-       if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
-               kvm_set_pfn_dirty(pfn);
-       return 1;
-}
-
-static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+static void drop_spte(struct kvm *kvm, u64 *sptep)
  {
-       if (set_spte_track_bits(sptep, new_spte))
+       if (mmu_spte_clear_track_bits(sptep))
                 rmap_remove(kvm, sptep);
  }
  
@@ -797,7 +1018,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
                 BUG_ON(!(*spte & PT_PRESENT_MASK));
                 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
                 if (is_writable_pte(*spte)) {
-                       update_spte(spte, *spte & ~PT_WRITABLE_MASK);
+                       mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
                         write_protected = 1;
                 }
                 spte = rmap_next(kvm, rmapp, spte);
@@ -814,8 +1035,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
                         BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
                         pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
                         if (is_writable_pte(*spte)) {
-                               drop_spte(kvm, spte,
-                                         shadow_trap_nonpresent_pte);
+                               drop_spte(kvm, spte);
                                 --kvm->stat.lpages;
                                 spte = NULL;
                                 write_protected = 1;
@@ -836,7 +1056,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
         while ((spte = rmap_next(kvm, rmapp, NULL))) {
                 BUG_ON(!(*spte & PT_PRESENT_MASK));
                 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
-               drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+               drop_spte(kvm, spte);
                 need_tlb_flush = 1;
         }
         return need_tlb_flush;
@@ -858,7 +1078,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
                 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
                 need_flush = 1;
                 if (pte_write(*ptep)) {
-                       drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+                       drop_spte(kvm, spte);
                         spte = rmap_next(kvm, rmapp, NULL);
                 } else {
                         new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -867,7 +1087,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
                         new_spte &= ~PT_WRITABLE_MASK;
                         new_spte &= ~SPTE_HOST_WRITEABLE;
                         new_spte &= ~shadow_accessed_mask;
-                       set_spte_track_bits(spte, new_spte);
+                       mmu_spte_clear_track_bits(spte);
+                       mmu_spte_set(spte, new_spte);
                         spte = rmap_next(kvm, rmapp, spte);
                 }
         }
@@ -1039,16 +1260,29 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
         percpu_counter_add(&kvm_total_used_mmu_pages, nr);
  }
  
-static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+/*
+ * Remove the sp from the shadow page cache. After calling it, the sp
+ * can no longer be found in the cache, but the shadow page table is
+ * still valid.
+ * It should be called under the protection of mmu lock.
+ */
+static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
  {
         ASSERT(is_empty_shadow_page(sp->spt));
         hlist_del(&sp->hash_link);
-       list_del(&sp->link);
-       free_page((unsigned long)sp->spt);
         if (!sp->role.direct)
                 free_page((unsigned long)sp->gfns);
+}
+
+/*
+ * Free the shadow page table and the sp; this can be done
+ * outside the protection of mmu lock.
+ */
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
+{
+       list_del(&sp->link);
+       free_page((unsigned long)sp->spt);
         kmem_cache_free(mmu_page_header_cache, sp);
-       kvm_mod_used_mmu_pages(kvm, -1);
  }
  
  static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -1075,7 +1309,7 @@ static void drop_parent_pte(struct kvm_mmu_page *sp,
                             u64 *parent_pte)
  {
         mmu_page_remove_parent_pte(sp, parent_pte);
-       __set_spte(parent_pte, shadow_trap_nonpresent_pte);
+       mmu_spte_clear_no_track(parent_pte);
  }
  
  static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
@@ -1117,15 +1351,6 @@ static void mark_unsync(u64 *spte)
         kvm_mmu_mark_parents_unsync(sp);
  }
  
-static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
-                                   struct kvm_mmu_page *sp)
-{
-       int i;
-
-       for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
-               sp->spt[i] = shadow_trap_nonpresent_pte;
-}
-
  static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
                                struct kvm_mmu_page *sp)
  {
@@ -1407,6 +1632,14 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
         }
  }
  
+static void init_shadow_page_table(struct kvm_mmu_page *sp)
+{
+       int i;
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+               sp->spt[i] = 0ull;
+}
+
  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                                              gfn_t gfn,
                                              gva_t gaddr,
@@ -1469,10 +1702,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
  
                 account_shadowed(vcpu->kvm, gfn);
         }
-       if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
-               vcpu->arch.mmu.prefetch_page(vcpu, sp);
-       else
-               nonpaging_prefetch_page(vcpu, sp);
+       init_shadow_page_table(sp);
         trace_kvm_mmu_get_page(sp, true);
         return sp;
  }
@@ -1509,17 +1739,23 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
         return true;
  }
  
-static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
+                              u64 spte)
  {
-       if (is_last_spte(*iterator->sptep, iterator->level)) {
+       if (is_last_spte(spte, iterator->level)) {
                 iterator->level = 0;
                 return;
         }
  
-       iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
+       iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
         --iterator->level;
  }
  
+static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+{
+       return __shadow_walk_next(iterator, *iterator->sptep);
+}
+
  static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
  {
         u64 spte;
@@ -1527,13 +1763,13 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
         spte = __pa(sp->spt)
                 | PT_PRESENT_MASK | PT_ACCESSED_MASK
                 | PT_WRITABLE_MASK | PT_USER_MASK;
-       __set_spte(sptep, spte);
+       mmu_spte_set(sptep, spte);
  }
  
  static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
  {
         if (is_large_pte(*sptep)) {
-               drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+               drop_spte(vcpu->kvm, sptep);
                 kvm_flush_remote_tlbs(vcpu->kvm);
         }
  }
@@ -1569,13 +1805,14 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
         pte = *spte;
         if (is_shadow_present_pte(pte)) {
                 if (is_last_spte(pte, sp->role.level))
-                       drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+                       drop_spte(kvm, spte);
                 else {
                         child = page_header(pte & PT64_BASE_ADDR_MASK);
                         drop_parent_pte(child, spte);
                 }
-       }
-       __set_spte(spte, shadow_trap_nonpresent_pte);
+       } else if (is_mmio_spte(pte))
+               mmu_spte_clear_no_track(spte);
+
         if (is_large_pte(pte))
                 --kvm->stat.lpages;
  }
@@ -1655,6 +1892,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
                 /* Count self */
                 ret++;
                 list_move(&sp->link, invalid_list);
+               kvm_mod_used_mmu_pages(kvm, -1);
         } else {
                 list_move(&sp->link, &kvm->arch.active_mmu_pages);
                 kvm_reload_remote_mmus(kvm);
@@ -1665,6 +1903,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
         return ret;
  }
  
+static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
+{
+       struct kvm_mmu_page *sp;
+
+       list_for_each_entry(sp, invalid_list, link)
+               kvm_mmu_isolate_page(sp);
+}
+
+static void free_pages_rcu(struct rcu_head *head)
+{
+       struct kvm_mmu_page *next, *sp;
+
+       sp = container_of(head, struct kvm_mmu_page, rcu);
+       while (sp) {
+               if (!list_empty(&sp->link))
+                       next = list_first_entry(&sp->link,
+                                     struct kvm_mmu_page, link);
+               else
+                       next = NULL;
+               kvm_mmu_free_page(sp);
+               sp = next;
+       }
+}
+
  static void kvm_mmu_commit_zap_page(struct kvm *kvm,
                                     struct list_head *invalid_list)
  {
@@ -1675,10 +1937,21 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
  
         kvm_flush_remote_tlbs(kvm);
  
+       if (atomic_read(&kvm->arch.reader_counter)) {
+               kvm_mmu_isolate_pages(invalid_list);
+               sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+               list_del_init(invalid_list);
+
+               trace_kvm_mmu_delay_free_pages(sp);
+               call_rcu(&sp->rcu, free_pages_rcu);
+               return;
+       }
+
         do {
                 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
                 WARN_ON(!sp->role.invalid || sp->root_count);
-               kvm_mmu_free_page(kvm, sp);
+               kvm_mmu_isolate_page(sp);
+               kvm_mmu_free_page(sp);
         } while (!list_empty(invalid_list));
  
  }
@@ -1704,8 +1977,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
                         page = container_of(kvm->arch.active_mmu_pages.prev,
                                             struct kvm_mmu_page, link);
                         kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
-                       kvm_mmu_commit_zap_page(kvm, &invalid_list);
                 }
+               kvm_mmu_commit_zap_page(kvm, &invalid_list);
                 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
         }
  
@@ -1754,20 +2027,6 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
         __set_bit(slot, sp->slot_bitmap);
  }
  
-static void mmu_convert_notrap(struct kvm_mmu_page *sp)
-{
-       int i;
-       u64 *pt = sp->spt;
-
-       if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
-               return;
-
-       for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-               if (pt[i] == shadow_notrap_nonpresent_pte)
-                       __set_spte(&pt[i], shadow_trap_nonpresent_pte);
-       }
-}
-
  /*
   * The function is based on mtrr_type_lookup() in
   * arch/x86/kernel/cpu/mtrr/generic.c
@@ -1880,7 +2139,6 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
         sp->unsync = 1;
  
         kvm_mmu_mark_parents_unsync(sp);
-       mmu_convert_notrap(sp);
  }
  
  static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
@@ -1923,13 +2181,16 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
  
  static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                     unsigned pte_access, int user_fault,
-                   int write_fault, int dirty, int level,
+                   int write_fault, int level,
                     gfn_t gfn, pfn_t pfn, bool speculative,
                     bool can_unsync, bool host_writable)
  {
         u64 spte, entry = *sptep;
         int ret = 0;
  
+       if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+               return 0;
+
         /*
          * We don't set the accessed bit, since we sometimes want to see
          * whether the guest actually used the pte (in order to detect
@@ -1938,8 +2199,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
         spte = PT_PRESENT_MASK;
         if (!speculative)
                 spte |= shadow_accessed_mask;
-       if (!dirty)
-               pte_access &= ~ACC_WRITE_MASK;
+
         if (pte_access & ACC_EXEC_MASK)
                 spte |= shadow_x_mask;
         else
@@ -1966,7 +2226,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                 if (level > PT_PAGE_TABLE_LEVEL &&
                     has_wrprotected_page(vcpu->kvm, gfn, level)) {
                         ret = 1;
-                       drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+                       drop_spte(vcpu->kvm, sptep);
                         goto done;
                 }
  
@@ -2008,7 +2268,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                 mark_page_dirty(vcpu->kvm, gfn);
  
  set_pte:
-       update_spte(sptep, spte);
+       mmu_spte_update(sptep, spte);
         /*
          * If we overwrite a writable spte with a read-only one we
          * should flush remote TLBs. Otherwise rmap_write_protect
@@ -2023,8 +2283,8 @@ done:
  
  static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                          unsigned pt_access, unsigned pte_access,
-                        int user_fault, int write_fault, int dirty,
-                        int *ptwrite, int level, gfn_t gfn,
+                        int user_fault, int write_fault,
+                        int *emulate, int level, gfn_t gfn,
                          pfn_t pfn, bool speculative,
                          bool host_writable)
  {
@@ -2052,20 +2312,23 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                 } else if (pfn != spte_to_pfn(*sptep)) {
                         pgprintk("hfn old %llx new %llx\n",
                                  spte_to_pfn(*sptep), pfn);
-                       drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+                       drop_spte(vcpu->kvm, sptep);
                         kvm_flush_remote_tlbs(vcpu->kvm);
                 } else
                         was_rmapped = 1;
         }
  
         if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
-                     dirty, level, gfn, pfn, speculative, true,
+                     level, gfn, pfn, speculative, true,
                       host_writable)) {
                 if (write_fault)
-                       *ptwrite = 1;
+                       *emulate = 1;
                 kvm_mmu_flush_tlb(vcpu);
         }
  
+       if (unlikely(is_mmio_spte(*sptep) && emulate))
+               *emulate = 1;
+
         pgprintk("%s: setting spte %llx\n", __func__, *sptep);
         pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
                  is_large_pte(*sptep)? "2MB" : "4kB",
@@ -2101,8 +2364,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
  
         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
         if (!slot) {
-               get_page(bad_page);
-               return page_to_pfn(bad_page);
+               get_page(fault_page);
+               return page_to_pfn(fault_page);
         }
  
         hva = gfn_to_hva_memslot(slot, gfn);
@@ -2129,7 +2392,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
  
         for (i = 0; i < ret; i++, gfn++, start++)
                 mmu_set_spte(vcpu, start, ACC_ALL,
-                            access, 0, 0, 1, NULL,
+                            access, 0, 0, NULL,
                              sp->role.level, gfn,
                              page_to_pfn(pages[i]), true, true);
  
@@ -2148,7 +2411,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
         spte = sp->spt + i;
  
         for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
-               if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
+               if (is_shadow_present_pte(*spte) || spte == sptep) {
                         if (!start)
                                 continue;
                         if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
@@ -2185,7 +2448,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
  {
         struct kvm_shadow_walk_iterator iterator;
         struct kvm_mmu_page *sp;
-       int pt_write = 0;
+       int emulate = 0;
         gfn_t pseudo_gfn;
  
         for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
@@ -2193,14 +2456,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
                         unsigned pte_access = ACC_ALL;
  
                         mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
-                                    0, write, 1, &pt_write,
+                                    0, write, &emulate,
                                      level, gfn, pfn, prefault, map_writable);
                         direct_pte_prefetch(vcpu, iterator.sptep);
                         ++vcpu->stat.pf_fixed;
                         break;
                 }
  
-               if (*iterator.sptep == shadow_trap_nonpresent_pte) {
+               if (!is_shadow_present_pte(*iterator.sptep)) {
                         u64 base_addr = iterator.addr;
  
                         base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
@@ -2214,14 +2477,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
                                 return -ENOMEM;
                         }
  
-                       __set_spte(iterator.sptep,
-                                  __pa(sp->spt)
-                                  | PT_PRESENT_MASK | PT_WRITABLE_MASK
-                                  | shadow_user_mask | shadow_x_mask
-                                  | shadow_accessed_mask);
+                       mmu_spte_set(iterator.sptep,
+                                    __pa(sp->spt)
+                                    | PT_PRESENT_MASK | PT_WRITABLE_MASK
+                                    | shadow_user_mask | shadow_x_mask
+                                    | shadow_accessed_mask);
                 }
         }
-       return pt_write;
+       return emulate;
  }
  
  static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
@@ -2237,18 +2500,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
         send_sig_info(SIGBUS, &info, tsk);
  }
  
-static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gva_t gva,
-                              unsigned access, gfn_t gfn, pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
  {
         kvm_release_pfn_clean(pfn);
         if (is_hwpoison_pfn(pfn)) {
                 kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
                 return 0;
-       } else if (is_fault_pfn(pfn))
-               return -EFAULT;
+       }
  
-       vcpu_cache_mmio_info(vcpu, gva, gfn, access);
-       return 1;
+       return -EFAULT;
  }
  
  static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
@@ -2293,6 +2553,30 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
         }
  }
  
+static bool mmu_invalid_pfn(pfn_t pfn)
+{ /* Wrapper: is_invalid_pfn() with an unlikely() branch hint. */
+       return unlikely(is_invalid_pfn(pfn));
+}
+
+static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+                               pfn_t pfn, unsigned access, int *ret_val)
+{ /* Returns true when the caller should stop and return *ret_val. */
+       bool ret = true; /* default: stop; cleared below once pfn is usable */
+
+       /* The pfn is invalid, report the error! */
+       if (unlikely(is_invalid_pfn(pfn))) {
+               *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); /* only path that writes *ret_val */
+               goto exit;
+       }
+
+       if (unlikely(is_noslot_pfn(pfn))) /* noslot pfn: remember gva/gfn/access */
+               vcpu_cache_mmio_info(vcpu, gva, gfn, access);
+
+       ret = false; /* caller continues, including in the noslot case */
+exit:
+       return ret;
+}
+
  static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                          gva_t gva, pfn_t *pfn, bool write, bool *writable);
  
@@ -2327,9 +2611,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
         if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
                 return 0;
  
-       /* mmio */
-       if (is_error_pfn(pfn))
-               return kvm_handle_bad_page(vcpu, v, ACC_ALL, gfn, pfn);
+       if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
+               return r;
  
         spin_lock(&vcpu->kvm->mmu_lock);
         if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2601,6 +2884,94 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
         return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
  }
  
+static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{ /* Picks the matcher by mode: addr is treated as a gpa when direct, else as a gva. */
+       if (direct)
+               return vcpu_match_mmio_gpa(vcpu, addr);
+
+       return vcpu_match_mmio_gva(vcpu, addr); /* presumably checks info stored by vcpu_cache_mmio_info() - confirm */
+}
+
+
+/*
+ * On direct hosts, the last spte only allows two states
+ * for an mmio page fault:
+ *   - It is the mmio spte
+ *   - It is zapped or it is being zapped.
+ *
+ * This function completely checks the spte when the last spte
+ * is not the mmio spte.
+ */
+static bool check_direct_spte_mmio_pf(u64 spte)
+{ /* Thin wrapper; the actual check lives in __check_direct_spte_mmio_pf(). */
+       return __check_direct_spte_mmio_pf(spte);
+}
+
+static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
+{ /* Walks the shadow entries for addr and returns the last spte value read. */
+       struct kvm_shadow_walk_iterator iterator;
+       u64 spte = 0ull; /* stays 0 if the walk never reads an entry */
+
+       walk_shadow_page_lockless_begin(vcpu);
+       for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) /* macro loads spte via mmu_spte_get_lockless() */
+               if (!is_shadow_present_pte(spte)) /* stop at the first non-present entry */
+                       break;
+       walk_shadow_page_lockless_end(vcpu);
+
+       return spte;
+}
+
+/*
+ * If it is a real mmio page fault, return 1 and emulate the instruction
+ * directly, return 0 to let CPU fault again on the address, -1 is
+ * returned if a bug is detected.
+ */
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{ /* Result is 1, 0 or -1; exported below for use outside this file. */
+       u64 spte;
+
+       if (quickly_check_mmio_pf(vcpu, addr, direct)) /* fast path: no shadow walk needed */
+               return 1;
+
+       spte = walk_shadow_page_get_mmio_spte(vcpu, addr); /* slow path: lockless walk */
+
+       if (is_mmio_spte(spte)) {
+               gfn_t gfn = get_mmio_spte_gfn(spte);
+               unsigned access = get_mmio_spte_access(spte);
+
+               if (direct) /* direct mode: 0 is traced and cached instead of addr */
+                       addr = 0;
+
+               trace_handle_mmio_page_fault(addr, gfn, access);
+               vcpu_cache_mmio_info(vcpu, addr, gfn, access); /* lets later faults hit the quick check - presumably; confirm */
+               return 1;
+       }
+
+       /*
+        * It's ok if the gva is remapped by other cpus on shadow guest,
+        * it's a BUG if the gfn is not a mmio page.
+        */
+       if (direct && !check_direct_spte_mmio_pf(spte)) /* check is skipped entirely when !direct */
+               return -1;
+
+       /*
+        * If the page table is zapped by other cpus, let CPU fault again on
+        * the address.
+        */
+       return 0;
+}
+EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
+
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
+                                 u32 error_code, bool direct)
+{ /* Wrapper around handle_mmio_page_fault_common(); error_code is not used in this body. */
+       int ret;
+
+       ret = handle_mmio_page_fault_common(vcpu, addr, direct);
+       WARN_ON(ret < 0); /* warn only; the negative value is still returned */
+       return ret;
+}
+
  static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
                                 u32 error_code, bool prefault)
  {
@@ -2608,6 +2979,10 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
         int r;
  
         pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
+
+       if (unlikely(error_code & PFERR_RSVD_MASK))
+               return handle_mmio_page_fault(vcpu, gva, error_code, true);
+
         r = mmu_topup_memory_caches(vcpu);
         if (r)
                 return r;
@@ -2684,6 +3059,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
         ASSERT(vcpu);
         ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
  
+       if (unlikely(error_code & PFERR_RSVD_MASK))
+               return handle_mmio_page_fault(vcpu, gpa, error_code, true);
+
         r = mmu_topup_memory_caches(vcpu);
         if (r)
                 return r;
@@ -2701,9 +3079,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
         if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
                 return 0;
  
-       /* mmio */
-       if (is_error_pfn(pfn))
-               return kvm_handle_bad_page(vcpu, 0, 0, gfn, pfn);
+       if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
+               return r;
+
         spin_lock(&vcpu->kvm->mmu_lock);
         if (mmu_notifier_retry(vcpu, mmu_seq))
                 goto out_unlock;
@@ -2734,7 +3112,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
         context->page_fault = nonpaging_page_fault;
         context->gva_to_gpa = nonpaging_gva_to_gpa;
         context->free = nonpaging_free;
-       context->prefetch_page = nonpaging_prefetch_page;
         context->sync_page = nonpaging_sync_page;
         context->invlpg = nonpaging_invlpg;
         context->update_pte = nonpaging_update_pte;
@@ -2782,6 +3159,23 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
         return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
  }
  
+static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
+                          int *nr_present)
+{ /* Returns true when *sptep is an mmio spte (handled here), false otherwise. */
+       if (unlikely(is_mmio_spte(*sptep))) {
+               if (gfn != get_mmio_spte_gfn(*sptep)) { /* gfn changed: clear it, do not count it */
+                       mmu_spte_clear_no_track(sptep);
+                       return true;
+               }
+
+               (*nr_present)++; /* same gfn: counted in *nr_present */
+               mark_mmio_spte(sptep, gfn, access); /* re-marked with the access passed in */
+               return true;
+       }
+
+       return false; /* not an mmio spte: *sptep and *nr_present untouched */
+}
+
  #define PTTYPE 64
  #include "paging_tmpl.h"
  #undef PTTYPE
@@ -2864,7 +3258,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
         context->new_cr3 = paging_new_cr3;
         context->page_fault = paging64_page_fault;
         context->gva_to_gpa = paging64_gva_to_gpa;
-       context->prefetch_page = paging64_prefetch_page;
         context->sync_page = paging64_sync_page;
         context->invlpg = paging64_invlpg;
         context->update_pte = paging64_update_pte;
@@ -2893,7 +3286,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
         context->page_fault = paging32_page_fault;
         context->gva_to_gpa = paging32_gva_to_gpa;
         context->free = paging_free;
-       context->prefetch_page = paging32_prefetch_page;
         context->sync_page = paging32_sync_page;
         context->invlpg = paging32_invlpg;
         context->update_pte = paging32_update_pte;
@@ -2918,7 +3310,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
         context->new_cr3 = nonpaging_new_cr3;
         context->page_fault = tdp_page_fault;
         context->free = nonpaging_free;
-       context->prefetch_page = nonpaging_prefetch_page;
         context->sync_page = nonpaging_sync_page;
         context->invlpg = nonpaging_invlpg;
         context->update_pte = nonpaging_update_pte;
@@ -3303,9 +3694,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
                 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
                                   struct kvm_mmu_page, link);
                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
-               kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
                 ++vcpu->kvm->stat.mmu_recycled;
         }
+       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
  }
  
  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
@@ -3429,15 +3820,15 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
                                 continue;
  
                         if (is_large_pte(pt[i])) {
-                               drop_spte(kvm, &pt[i],
-                                         shadow_trap_nonpresent_pte);
+                               drop_spte(kvm, &pt[i]);
                                 --kvm->stat.lpages;
                                 continue;
                         }
  
                         /* avoid RMW */
                         if (is_writable_pte(pt[i]))
-                               update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
+                               mmu_spte_update(&pt[i],
+                                               pt[i] & ~PT_WRITABLE_MASK);
                 }
         }
         kvm_flush_remote_tlbs(kvm);
@@ -3691,16 +4082,17 @@ out:
  int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
  { /* Stores the sptes seen while walking addr; returns how many were stored. */
         struct kvm_shadow_walk_iterator iterator;
+       u64 spte; /* assigned by the lockless iterator macro on each step */
         int nr_sptes = 0;
  
-       spin_lock(&vcpu->kvm->mmu_lock);
-       for_each_shadow_entry(vcpu, addr, iterator) {
-               sptes[iterator.level-1] = *iterator.sptep;
+       walk_shadow_page_lockless_begin(vcpu); /* replaces the mmu_lock critical section */
+       for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+               sptes[iterator.level-1] = spte; /* slot index is level - 1 */
                 nr_sptes++; /* the entry that ends the walk is counted too */
-               if (!is_shadow_present_pte(*iterator.sptep))
+               if (!is_shadow_present_pte(spte))
                         break;
         }
-       spin_unlock(&vcpu->kvm->mmu_lock);
+       walk_shadow_page_lockless_end(vcpu);
  
         return nr_sptes;
  }