static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
access &= ACC_WRITE_MASK | ACC_USER_MASK;
+ sp->mmio_cached = true;
trace_mark_mmio_spte(sptep, gfn, access);
mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
}
u64 *parent_pte, int direct)
{
struct kvm_mmu_page *sp;
+
sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
if (!direct)
sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+ /*
+ * The active_mmu_pages list is the FIFO list, do not move the
+ * page until it is zapped. kvm_zap_obsolete_pages depends on
+ * this feature. See the comments in kvm_zap_obsolete_pages().
+ */
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
sp->parent_ptes = 0;
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list);
-#define for_each_gfn_sp(kvm, sp, gfn) \
- hlist_for_each_entry(sp, \
- &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
- if ((sp)->gfn != (gfn)) {} else
+#define for_each_gfn_sp(_kvm, _sp, _gfn) \
+ hlist_for_each_entry(_sp, \
+ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
+ if ((_sp)->gfn != (_gfn)) {} else
-#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn) \
- hlist_for_each_entry(sp, \
- &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
- if ((sp)->gfn != (gfn) || (sp)->role.direct || \
- (sp)->role.invalid) {} else
+#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
+ for_each_gfn_sp(_kvm, _sp, _gfn) \
+ if ((_sp)->role.direct || (_sp)->role.invalid) {} else
/* @sp->gfn should be write-protected at the call site */
static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
__clear_sp_write_flooding_count(sp);
}
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
account_shadowed(vcpu->kvm, gfn);
}
+ sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
init_shadow_page_table(sp);
trace_kvm_mmu_get_page(sp, true);
return sp;
ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp);
+
if (!sp->role.invalid && !sp->role.direct)
unaccount_shadowed(kvm, sp->gfn);
+
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list)
{
- struct kvm_mmu_page *sp;
+ struct kvm_mmu_page *sp, *nsp;
if (list_empty(invalid_list))
return;
*/
kvm_flush_remote_tlbs(kvm);
- do {
- sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+ list_for_each_entry_safe(sp, nsp, invalid_list, link) {
WARN_ON(!sp->role.invalid || sp->root_count);
kvm_mmu_free_page(sp);
- } while (!list_empty(invalid_list));
+ }
+}
+
+static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp;
+
+ if (list_empty(&kvm->arch.active_mmu_pages))
+ return false;
+
+ sp = list_entry(kvm->arch.active_mmu_pages.prev,
+ struct kvm_mmu_page, link);
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+
+ return true;
}
/*
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
{
LIST_HEAD(invalid_list);
- /*
- * If we set the number of mmu pages to be smaller be than the
- * number of actived pages , we must to free some mmu pages before we
- * change the value
- */
spin_lock(&kvm->mmu_lock);
if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
- while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
- !list_empty(&kvm->arch.active_mmu_pages)) {
- struct kvm_mmu_page *page;
+ /* Need to free some mmu pages to achieve the goal. */
+ while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
+ if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
+ break;
- page = container_of(kvm->arch.active_mmu_pages.prev,
- struct kvm_mmu_page, link);
- kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
- }
kvm_mmu_commit_zap_page(kvm, &invalid_list);
goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable);
+static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
gfn_t gfn, bool prefault)
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
- kvm_mmu_free_some_pages(vcpu);
+ make_mmu_pages_available(vcpu);
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
- spin_lock(&vcpu->kvm->mmu_lock);
+
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
(vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
vcpu->arch.mmu.direct_map)) {
hpa_t root = vcpu->arch.mmu.root_hpa;
+ spin_lock(&vcpu->kvm->mmu_lock);
sp = page_header(root);
--sp->root_count;
if (!sp->root_count && sp->role.invalid) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
return;
}
+
+ spin_lock(&vcpu->kvm->mmu_lock);
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
+ make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
1, ACC_ALL, NULL);
++sp->root_count;
ASSERT(!VALID_PAGE(root));
spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
+ make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
i << 30,
PT32_ROOT_LEVEL, 1, ACC_ALL,
ASSERT(!VALID_PAGE(root));
spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
+ make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
0, ACC_ALL, NULL);
root = __pa(sp->spt);
return 1;
}
spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
+ make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, 0,
ACC_ALL, NULL);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
- kvm_mmu_free_some_pages(vcpu);
+ make_mmu_pages_available(vcpu);
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
r = __direct_map(vcpu, gpa, write, map_writable,
if (r)
goto out;
r = mmu_alloc_roots(vcpu);
- spin_lock(&vcpu->kvm->mmu_lock);
- mmu_sync_roots(vcpu);
- spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_sync_roots(vcpu);
if (r)
goto out;
/* set_cr3() should ensure TLB has been flushed */
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
{
LIST_HEAD(invalid_list);
- while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
- !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
- struct kvm_mmu_page *sp;
+ if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
+ return;
+
+ while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
+ if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
+ break;
- sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
- struct kvm_mmu_page, link);
- kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
++vcpu->kvm->stat.mmu_recycled;
}
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
}
-void kvm_mmu_zap_all(struct kvm *kvm)
+static void kvm_zap_obsolete_pages(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list);
- spin_lock(&kvm->mmu_lock);
restart:
- list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
+ list_for_each_entry_safe_reverse(sp, node,
+ &kvm->arch.active_mmu_pages, link) {
+ /*
+ * No obsolete page exists before new created page since
+ * active_mmu_pages is the FIFO list.
+ */
+ if (!is_obsolete_sp(kvm, sp))
+ break;
+
+ /*
+ * Do not repeatedly zap a root page to avoid unnecessary
+ * KVM_REQ_MMU_RELOAD, otherwise we may not be able to
+ * progress:
+ * vcpu 0 vcpu 1
+ * call vcpu_enter_guest():
+ * 1): handle KVM_REQ_MMU_RELOAD
+ * and require mmu-lock to
+ * load mmu
+ * repeat:
+ * 1): zap root page and
+ * send KVM_REQ_MMU_RELOAD
+ *
+ * 2): if (cond_resched_lock(mmu-lock))
+ *
+ * 2): hold mmu-lock and load mmu
+ *
+ * 3): see KVM_REQ_MMU_RELOAD bit
+ * on vcpu->requests is set
+ * then return 1 to call
+ * vcpu_enter_guest() again.
+ * goto repeat;
+ *
+ * Since we are reversely walking the list and the invalid
+ * list will be moved to the head, skip the invalid page
+ * can help us to avoid the infinity list walking.
+ */
+ if (sp->role.invalid)
+ continue;
+
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ cond_resched_lock(&kvm->mmu_lock);
+ goto restart;
+ }
+
if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
goto restart;
+ }
kvm_mmu_commit_zap_page(kvm, &invalid_list);
+}
+
+/*
+ * Fast invalidate all shadow pages and use lock-break technique
+ * to zap obsolete pages.
+ *
+ * It's required when memslot is being deleted or VM is being
+ * destroyed, in these cases, we should ensure that KVM MMU does
+ * not use any resource of the being-deleted slot or all slots
+ * after calling the function.
+ */
+void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
+{
+ spin_lock(&kvm->mmu_lock);
+ trace_kvm_mmu_invalidate_zap_all_pages(kvm);
+ kvm->arch.mmu_valid_gen++;
+
+ kvm_zap_obsolete_pages(kvm);
spin_unlock(&kvm->mmu_lock);
}
-static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
- struct list_head *invalid_list)
+void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
{
- struct kvm_mmu_page *page;
+ struct kvm_mmu_page *sp, *node;
+ LIST_HEAD(invalid_list);
- if (list_empty(&kvm->arch.active_mmu_pages))
- return;
+ spin_lock(&kvm->mmu_lock);
+restart:
+ list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
+ if (!sp->mmio_cached)
+ continue;
+ if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+ goto restart;
+ }
- page = container_of(kvm->arch.active_mmu_pages.prev,
- struct kvm_mmu_page, link);
- kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ spin_unlock(&kvm->mmu_lock);
}
static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
- kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list);
+ prepare_zap_oldest_mmu_page(kvm, &invalid_list);
kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);