KVM: MMU: unify slots_lock usage
pandora-kernel.git: arch/x86/kvm/mmu.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * MMU support
8  *
9  * Copyright (C) 2006 Qumranet, Inc.
10  *
11  * Authors:
12  *   Yaniv Kamay  <yaniv@qumranet.com>
13  *   Avi Kivity   <avi@qumranet.com>
14  *
15  * This work is licensed under the terms of the GNU GPL, version 2.  See
16  * the COPYING file in the top-level directory.
17  *
18  */
19
20 #include "vmx.h"
21 #include "mmu.h"
22
23 #include <linux/kvm_host.h>
24 #include <linux/types.h>
25 #include <linux/string.h>
26 #include <linux/mm.h>
27 #include <linux/highmem.h>
28 #include <linux/module.h>
29 #include <linux/swap.h>
30 #include <linux/hugetlb.h>
31 #include <linux/compiler.h>
32
33 #include <asm/page.h>
34 #include <asm/cmpxchg.h>
35 #include <asm/io.h>
36
37 /*
38  * When this variable is set to true it enables Two-Dimensional Paging,
39  * where the hardware walks two page tables:
40  * 1. the guest-virtual to guest-physical translation
41  * 2. while doing 1. it walks the guest-physical to host-physical translation
42  * If the hardware supports this, we don't need to do shadow paging.
43  */
44 bool tdp_enabled = false;
45
46 #undef MMU_DEBUG
47
48 #undef AUDIT
49
50 #ifdef AUDIT
51 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
52 #else
53 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
54 #endif
55
56 #ifdef MMU_DEBUG
57
58 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
59 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
60
61 #else
62
63 #define pgprintk(x...) do { } while (0)
64 #define rmap_printk(x...) do { } while (0)
65
66 #endif
67
68 #if defined(MMU_DEBUG) || defined(AUDIT)
69 static int dbg = 1;
70 #endif
71
72 #ifndef MMU_DEBUG
73 #define ASSERT(x) do { } while (0)
74 #else
75 #define ASSERT(x)                                                       \
76         if (!(x)) {                                                     \
77                 printk(KERN_WARNING "assertion failed %s:%d: %s\n",     \
78                        __FILE__, __LINE__, #x);                         \
79         }
80 #endif
81
82 #define PT64_PT_BITS 9
83 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
84 #define PT32_PT_BITS 10
85 #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
86
87 #define PT_WRITABLE_SHIFT 1
88
89 #define PT_PRESENT_MASK (1ULL << 0)
90 #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
91 #define PT_USER_MASK (1ULL << 2)
92 #define PT_PWT_MASK (1ULL << 3)
93 #define PT_PCD_MASK (1ULL << 4)
94 #define PT_ACCESSED_MASK (1ULL << 5)
95 #define PT_DIRTY_MASK (1ULL << 6)
96 #define PT_PAGE_SIZE_MASK (1ULL << 7)
97 #define PT_PAT_MASK (1ULL << 7)
98 #define PT_GLOBAL_MASK (1ULL << 8)
99 #define PT64_NX_SHIFT 63
100 #define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
101
102 #define PT_PAT_SHIFT 7
103 #define PT_DIR_PAT_SHIFT 12
104 #define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
105
106 #define PT32_DIR_PSE36_SIZE 4
107 #define PT32_DIR_PSE36_SHIFT 13
108 #define PT32_DIR_PSE36_MASK \
109         (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
110
111
112 #define PT_FIRST_AVAIL_BITS_SHIFT 9
113 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
114
115 #define VALID_PAGE(x) ((x) != INVALID_PAGE)
116
117 #define PT64_LEVEL_BITS 9
118
119 #define PT64_LEVEL_SHIFT(level) \
120                 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
121
122 #define PT64_LEVEL_MASK(level) \
123                 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
124
125 #define PT64_INDEX(address, level)\
126         (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
127
128
129 #define PT32_LEVEL_BITS 10
130
131 #define PT32_LEVEL_SHIFT(level) \
132                 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
133
134 #define PT32_LEVEL_MASK(level) \
135                 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
136
137 #define PT32_INDEX(address, level)\
138         (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
139
140
141 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
142 #define PT64_DIR_BASE_ADDR_MASK \
143         (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
144
145 #define PT32_BASE_ADDR_MASK PAGE_MASK
146 #define PT32_DIR_BASE_ADDR_MASK \
147         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
148
149 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
150                         | PT64_NX_MASK)
151
152 #define PFERR_PRESENT_MASK (1U << 0)
153 #define PFERR_WRITE_MASK (1U << 1)
154 #define PFERR_USER_MASK (1U << 2)
155 #define PFERR_FETCH_MASK (1U << 4)
156
157 #define PT64_ROOT_LEVEL 4
158 #define PT32_ROOT_LEVEL 2
159 #define PT32E_ROOT_LEVEL 3
160
161 #define PT_DIRECTORY_LEVEL 2
162 #define PT_PAGE_TABLE_LEVEL 1
163
164 #define RMAP_EXT 4
165
166 #define ACC_EXEC_MASK    1
167 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
168 #define ACC_USER_MASK    PT_USER_MASK
169 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
170
171 struct kvm_pv_mmu_op_buffer {
172         void *ptr;
173         unsigned len;
174         unsigned processed;
175         char buf[512] __aligned(sizeof(long));
176 };
177
178 struct kvm_rmap_desc {
179         u64 *shadow_ptes[RMAP_EXT];
180         struct kvm_rmap_desc *more;
181 };
182
183 static struct kmem_cache *pte_chain_cache;
184 static struct kmem_cache *rmap_desc_cache;
185 static struct kmem_cache *mmu_page_header_cache;
186
187 static u64 __read_mostly shadow_trap_nonpresent_pte;
188 static u64 __read_mostly shadow_notrap_nonpresent_pte;
189
190 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
191 {
192         shadow_trap_nonpresent_pte = trap_pte;
193         shadow_notrap_nonpresent_pte = notrap_pte;
194 }
195 EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
196
197 static int is_write_protection(struct kvm_vcpu *vcpu)
198 {
199         return vcpu->arch.cr0 & X86_CR0_WP;
200 }
201
202 static int is_cpuid_PSE36(void)
203 {
204         return 1;
205 }
206
207 static int is_nx(struct kvm_vcpu *vcpu)
208 {
209         return vcpu->arch.shadow_efer & EFER_NX;
210 }
211
212 static int is_present_pte(unsigned long pte)
213 {
214         return pte & PT_PRESENT_MASK;
215 }
216
217 static int is_shadow_present_pte(u64 pte)
218 {
219         return pte != shadow_trap_nonpresent_pte
220                 && pte != shadow_notrap_nonpresent_pte;
221 }
222
223 static int is_large_pte(u64 pte)
224 {
225         return pte & PT_PAGE_SIZE_MASK;
226 }
227
228 static int is_writeble_pte(unsigned long pte)
229 {
230         return pte & PT_WRITABLE_MASK;
231 }
232
233 static int is_dirty_pte(unsigned long pte)
234 {
235         return pte & PT_DIRTY_MASK;
236 }
237
238 static int is_rmap_pte(u64 pte)
239 {
240         return is_shadow_present_pte(pte);
241 }
242
243 static struct page *spte_to_page(u64 pte)
244 {
245         hfn_t hfn = (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
246
247         return pfn_to_page(hfn);
248 }
249
250 static gfn_t pse36_gfn_delta(u32 gpte)
251 {
252         int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
253
254         return (gpte & PT32_DIR_PSE36_MASK) << shift;
255 }
256
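/*
 * Update a shadow pte with a single 64-bit store, so that a concurrent
 * hardware page-table walk (or another vcpu) never sees a torn entry on
 * 32-bit hosts.
 */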
257 static void set_shadow_pte(u64 *sptep, u64 spte)
258 {
259 #ifdef CONFIG_X86_64
260         set_64bit((unsigned long *)sptep, spte);
261 #else
262         set_64bit((unsigned long long *)sptep, spte);
263 #endif
264 }
265
266 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
267                                   struct kmem_cache *base_cache, int min)
268 {
269         void *obj;
270
271         if (cache->nobjs >= min)
272                 return 0;
273         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
274                 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
275                 if (!obj)
276                         return -ENOMEM;
277                 cache->objects[cache->nobjs++] = obj;
278         }
279         return 0;
280 }
281
282 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
283 {
284         while (mc->nobjs)
285                 kfree(mc->objects[--mc->nobjs]);
286 }
287
288 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
289                                        int min)
290 {
291         struct page *page;
292
293         if (cache->nobjs >= min)
294                 return 0;
295         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
296                 page = alloc_page(GFP_KERNEL);
297                 if (!page)
298                         return -ENOMEM;
299                 set_page_private(page, 0);
300                 cache->objects[cache->nobjs++] = page_address(page);
301         }
302         return 0;
303 }
304
305 static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
306 {
307         while (mc->nobjs)
308                 free_page((unsigned long)mc->objects[--mc->nobjs]);
309 }
310
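/*
 * Fill the per-vcpu object caches before mmu_lock is taken, so that the
 * fault paths can allocate pte chains, rmap descriptors and shadow pages
 * under the spinlock without sleeping.
 */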
311 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
312 {
313         int r;
314
315         r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
316                                    pte_chain_cache, 4);
317         if (r)
318                 goto out;
319         r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
320                                    rmap_desc_cache, 1);
321         if (r)
322                 goto out;
323         r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
324         if (r)
325                 goto out;
326         r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
327                                    mmu_page_header_cache, 4);
328 out:
329         return r;
330 }
331
332 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
333 {
334         mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
335         mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
336         mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
337         mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
338 }
339
340 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
341                                     size_t size)
342 {
343         void *p;
344
345         BUG_ON(!mc->nobjs);
346         p = mc->objects[--mc->nobjs];
347         memset(p, 0, size);
348         return p;
349 }
350
351 static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
352 {
353         return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
354                                       sizeof(struct kvm_pte_chain));
355 }
356
357 static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
358 {
359         kfree(pc);
360 }
361
362 static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
363 {
364         return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
365                                       sizeof(struct kvm_rmap_desc));
366 }
367
368 static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
369 {
370         kfree(rd);
371 }
372
373 /*
374  * Return the pointer to the largepage write count for a given
375  * gfn, handling slots that are not large page aligned.
376  */
377 static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
378 {
379         unsigned long idx;
380
381         idx = (gfn / KVM_PAGES_PER_HPAGE) -
382               (slot->base_gfn / KVM_PAGES_PER_HPAGE);
383         return &slot->lpage_info[idx].write_count;
384 }
385
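/*
 * Record that a 4K page inside this large-page frame is shadowed (and thus
 * write protected); a nonzero count prevents the frame from being mapped
 * by a large spte (see has_wrprotected_page()).
 */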
386 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
387 {
388         int *write_count;
389
390         write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
391         *write_count += 1;
392         WARN_ON(*write_count > KVM_PAGES_PER_HPAGE);
393 }
394
395 static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
396 {
397         int *write_count;
398
399         write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
400         *write_count -= 1;
401         WARN_ON(*write_count < 0);
402 }
403
404 static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
405 {
406         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
407         int *largepage_idx;
408
409         if (slot) {
410                 largepage_idx = slot_largepage_idx(gfn, slot);
411                 return *largepage_idx;
412         }
413
414         return 1;
415 }
416
417 static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
418 {
419         struct vm_area_struct *vma;
420         unsigned long addr;
421
422         addr = gfn_to_hva(kvm, gfn);
423         if (kvm_is_error_hva(addr))
424                 return 0;
425
426         vma = find_vma(current->mm, addr);
427         if (vma && is_vm_hugetlb_page(vma))
428                 return 1;
429
430         return 0;
431 }
432
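/*
 * A gfn may be mapped with a large spte only if no page in its frame is
 * write protected, the host backs it with a huge page, and the memslot is
 * not being dirty logged.
 */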
433 static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
434 {
435         struct kvm_memory_slot *slot;
436
437         if (has_wrprotected_page(vcpu->kvm, large_gfn))
438                 return 0;
439
440         if (!host_largepage_backed(vcpu->kvm, large_gfn))
441                 return 0;
442
443         slot = gfn_to_memslot(vcpu->kvm, large_gfn);
444         if (slot && slot->dirty_bitmap)
445                 return 0;
446
447         return 1;
448 }
449
450 /*
451  * Take gfn and return the reverse mapping to it.
452  * Note: gfn must be unaliased before this function gets called.
453  */
454
455 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
456 {
457         struct kvm_memory_slot *slot;
458         unsigned long idx;
459
460         slot = gfn_to_memslot(kvm, gfn);
461         if (!lpage)
462                 return &slot->rmap[gfn - slot->base_gfn];
463
464         idx = (gfn / KVM_PAGES_PER_HPAGE) -
465               (slot->base_gfn / KVM_PAGES_PER_HPAGE);
466
467         return &slot->lpage_info[idx].rmap_pde;
468 }
469
470 /*
471  * Reverse mapping data structures:
472  *
473  * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
474  * that points to page_address(page).
475  *
476  * If rmapp bit zero is one, then (*rmapp & ~1) points to a struct kvm_rmap_desc
477  * containing more mappings.
478  */
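/*
 * Illustrative encoding of a single rmap slot:
 *
 *   no mappings:  *rmapp == 0
 *   one spte:     *rmapp == (unsigned long)spte          (bit zero clear)
 *   many sptes:   *rmapp == (unsigned long)desc | 1      (bit zero set),
 *                 where each kvm_rmap_desc holds up to RMAP_EXT sptes and
 *                 chains to the next descriptor through desc->more.
 */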
479 static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
480 {
481         struct kvm_mmu_page *sp;
482         struct kvm_rmap_desc *desc;
483         unsigned long *rmapp;
484         int i;
485
486         if (!is_rmap_pte(*spte))
487                 return;
488         gfn = unalias_gfn(vcpu->kvm, gfn);
489         sp = page_header(__pa(spte));
490         sp->gfns[spte - sp->spt] = gfn;
491         rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
492         if (!*rmapp) {
493                 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
494                 *rmapp = (unsigned long)spte;
495         } else if (!(*rmapp & 1)) {
496                 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
497                 desc = mmu_alloc_rmap_desc(vcpu);
498                 desc->shadow_ptes[0] = (u64 *)*rmapp;
499                 desc->shadow_ptes[1] = spte;
500                 *rmapp = (unsigned long)desc | 1;
501         } else {
502                 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
503                 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
504                 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
505                         desc = desc->more;
506                 if (desc->shadow_ptes[RMAP_EXT-1]) {
507                         desc->more = mmu_alloc_rmap_desc(vcpu);
508                         desc = desc->more;
509                 }
510                 for (i = 0; desc->shadow_ptes[i]; ++i)
511                         ;
512                 desc->shadow_ptes[i] = spte;
513         }
514 }
515
516 static void rmap_desc_remove_entry(unsigned long *rmapp,
517                                    struct kvm_rmap_desc *desc,
518                                    int i,
519                                    struct kvm_rmap_desc *prev_desc)
520 {
521         int j;
522
523         for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
524                 ;
525         desc->shadow_ptes[i] = desc->shadow_ptes[j];
526         desc->shadow_ptes[j] = NULL;
527         if (j != 0)
528                 return;
529         if (!prev_desc && !desc->more)
530                 *rmapp = (unsigned long)desc->shadow_ptes[0];
531         else
532                 if (prev_desc)
533                         prev_desc->more = desc->more;
534                 else
535                         *rmapp = (unsigned long)desc->more | 1;
536         mmu_free_rmap_desc(desc);
537 }
538
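/*
 * Drop the reverse mapping for one spte: release the backing page (dirty
 * if the spte was writable) and remove the spte from the rmap chain,
 * freeing descriptors that become empty.
 */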
539 static void rmap_remove(struct kvm *kvm, u64 *spte)
540 {
541         struct kvm_rmap_desc *desc;
542         struct kvm_rmap_desc *prev_desc;
543         struct kvm_mmu_page *sp;
544         struct page *page;
545         unsigned long *rmapp;
546         int i;
547
548         if (!is_rmap_pte(*spte))
549                 return;
550         sp = page_header(__pa(spte));
551         page = spte_to_page(*spte);
552         mark_page_accessed(page);
553         if (is_writeble_pte(*spte))
554                 kvm_release_page_dirty(page);
555         else
556                 kvm_release_page_clean(page);
557         rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
558         if (!*rmapp) {
559                 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
560                 BUG();
561         } else if (!(*rmapp & 1)) {
562                 rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
563                 if ((u64 *)*rmapp != spte) {
564                         printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
565                                spte, *spte);
566                         BUG();
567                 }
568                 *rmapp = 0;
569         } else {
570                 rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
571                 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
572                 prev_desc = NULL;
573                 while (desc) {
574                         for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
575                                 if (desc->shadow_ptes[i] == spte) {
576                                         rmap_desc_remove_entry(rmapp,
577                                                                desc, i,
578                                                                prev_desc);
579                                         return;
580                                 }
581                         prev_desc = desc;
582                         desc = desc->more;
583                 }
584                 BUG();
585         }
586 }
587
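/*
 * Iterate over the sptes that map a gfn: pass spte == NULL to get the
 * first mapping, then pass the previous return value to get the next;
 * returns NULL when the chain is exhausted.  Typical usage:
 *
 *   for (spte = rmap_next(kvm, rmapp, NULL); spte;
 *        spte = rmap_next(kvm, rmapp, spte))
 *           ...
 */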
588 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
589 {
590         struct kvm_rmap_desc *desc;
591         struct kvm_rmap_desc *prev_desc;
592         u64 *prev_spte;
593         int i;
594
595         if (!*rmapp)
596                 return NULL;
597         else if (!(*rmapp & 1)) {
598                 if (!spte)
599                         return (u64 *)*rmapp;
600                 return NULL;
601         }
602         desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
603         prev_desc = NULL;
604         prev_spte = NULL;
605         while (desc) {
606                 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
607                         if (prev_spte == spte)
608                                 return desc->shadow_ptes[i];
609                         prev_spte = desc->shadow_ptes[i];
610                 }
611                 desc = desc->more;
612         }
613         return NULL;
614 }
615
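/*
 * Remove write access from every spte mapping this gfn.  4K mappings are
 * made read-only; large mappings are zapped, since a write-protected gfn
 * must not be reachable through a writable large spte.  Remote TLBs are
 * flushed if any spte was changed, and the gfn is accounted as shadowed.
 */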
616 static void rmap_write_protect(struct kvm *kvm, u64 gfn)
617 {
618         unsigned long *rmapp;
619         u64 *spte;
620         int write_protected = 0;
621
622         gfn = unalias_gfn(kvm, gfn);
623         rmapp = gfn_to_rmap(kvm, gfn, 0);
624
625         spte = rmap_next(kvm, rmapp, NULL);
626         while (spte) {
627                 BUG_ON(!spte);
628                 BUG_ON(!(*spte & PT_PRESENT_MASK));
629                 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
630                 if (is_writeble_pte(*spte)) {
631                         set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
632                         write_protected = 1;
633                 }
634                 spte = rmap_next(kvm, rmapp, spte);
635         }
636         if (write_protected) {
637                 struct page *page;
638
639                 spte = rmap_next(kvm, rmapp, NULL);
640                 page = spte_to_page(*spte);
641                 SetPageDirty(page);
642         }
643
644         /* check for huge page mappings */
645         rmapp = gfn_to_rmap(kvm, gfn, 1);
646         spte = rmap_next(kvm, rmapp, NULL);
647         while (spte) {
648                 BUG_ON(!spte);
649                 BUG_ON(!(*spte & PT_PRESENT_MASK));
650                 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
651                 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
652                 if (is_writeble_pte(*spte)) {
653                         rmap_remove(kvm, spte);
654                         --kvm->stat.lpages;
655                         set_shadow_pte(spte, shadow_trap_nonpresent_pte);
656                         write_protected = 1;
657                 }
658                 spte = rmap_next(kvm, rmapp, spte);
659         }
660
661         if (write_protected)
662                 kvm_flush_remote_tlbs(kvm);
663
664         account_shadowed(kvm, gfn);
665 }
666
667 #ifdef MMU_DEBUG
668 static int is_empty_shadow_page(u64 *spt)
669 {
670         u64 *pos;
671         u64 *end;
672
673         for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
674                 if (*pos != shadow_trap_nonpresent_pte) {
675                         printk(KERN_ERR "%s: %p %llx\n", __func__,
676                                pos, *pos);
677                         return 0;
678                 }
679         return 1;
680 }
681 #endif
682
683 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
684 {
685         ASSERT(is_empty_shadow_page(sp->spt));
686         list_del(&sp->link);
687         __free_page(virt_to_page(sp->spt));
688         __free_page(virt_to_page(sp->gfns));
689         kfree(sp);
690         ++kvm->arch.n_free_mmu_pages;
691 }
692
693 static unsigned kvm_page_table_hashfn(gfn_t gfn)
694 {
695         return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
696 }
697
698 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
699                                                u64 *parent_pte)
700 {
701         struct kvm_mmu_page *sp;
702
703         sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
704         sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
705         sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
706         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
707         list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
708         ASSERT(is_empty_shadow_page(sp->spt));
709         sp->slot_bitmap = 0;
710         sp->multimapped = 0;
711         sp->parent_pte = parent_pte;
712         --vcpu->kvm->arch.n_free_mmu_pages;
713         return sp;
714 }
715
716 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
717                                     struct kvm_mmu_page *sp, u64 *parent_pte)
718 {
719         struct kvm_pte_chain *pte_chain;
720         struct hlist_node *node;
721         int i;
722
723         if (!parent_pte)
724                 return;
725         if (!sp->multimapped) {
726                 u64 *old = sp->parent_pte;
727
728                 if (!old) {
729                         sp->parent_pte = parent_pte;
730                         return;
731                 }
732                 sp->multimapped = 1;
733                 pte_chain = mmu_alloc_pte_chain(vcpu);
734                 INIT_HLIST_HEAD(&sp->parent_ptes);
735                 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
736                 pte_chain->parent_ptes[0] = old;
737         }
738         hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
739                 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
740                         continue;
741                 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
742                         if (!pte_chain->parent_ptes[i]) {
743                                 pte_chain->parent_ptes[i] = parent_pte;
744                                 return;
745                         }
746         }
747         pte_chain = mmu_alloc_pte_chain(vcpu);
748         BUG_ON(!pte_chain);
749         hlist_add_head(&pte_chain->link, &sp->parent_ptes);
750         pte_chain->parent_ptes[0] = parent_pte;
751 }
752
753 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
754                                        u64 *parent_pte)
755 {
756         struct kvm_pte_chain *pte_chain;
757         struct hlist_node *node;
758         int i;
759
760         if (!sp->multimapped) {
761                 BUG_ON(sp->parent_pte != parent_pte);
762                 sp->parent_pte = NULL;
763                 return;
764         }
765         hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
766                 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
767                         if (!pte_chain->parent_ptes[i])
768                                 break;
769                         if (pte_chain->parent_ptes[i] != parent_pte)
770                                 continue;
771                         while (i + 1 < NR_PTE_CHAIN_ENTRIES
772                                 && pte_chain->parent_ptes[i + 1]) {
773                                 pte_chain->parent_ptes[i]
774                                         = pte_chain->parent_ptes[i + 1];
775                                 ++i;
776                         }
777                         pte_chain->parent_ptes[i] = NULL;
778                         if (i == 0) {
779                                 hlist_del(&pte_chain->link);
780                                 mmu_free_pte_chain(pte_chain);
781                                 if (hlist_empty(&sp->parent_ptes)) {
782                                         sp->multimapped = 0;
783                                         sp->parent_pte = NULL;
784                                 }
785                         }
786                         return;
787                 }
788         BUG();
789 }
790
791 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
792 {
793         unsigned index;
794         struct hlist_head *bucket;
795         struct kvm_mmu_page *sp;
796         struct hlist_node *node;
797
798         pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
799         index = kvm_page_table_hashfn(gfn);
800         bucket = &kvm->arch.mmu_page_hash[index];
801         hlist_for_each_entry(sp, node, bucket, hash_link)
802                 if (sp->gfn == gfn && !sp->role.metaphysical
803                     && !sp->role.invalid) {
804                         pgprintk("%s: found role %x\n",
805                                  __func__, sp->role.word);
806                         return sp;
807                 }
808         return NULL;
809 }
810
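/*
 * Find the shadow page for (gfn, role) in the hash table, or create one if
 * it does not exist yet.  For 32-bit guests, role.quadrant records which
 * part of the guest table this shadow page maps, since a 1024-entry guest
 * table needs more than one 512-entry shadow table.  A newly created
 * non-metaphysical page write-protects the guest page it shadows.
 */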
811 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
812                                              gfn_t gfn,
813                                              gva_t gaddr,
814                                              unsigned level,
815                                              int metaphysical,
816                                              unsigned access,
817                                              u64 *parent_pte)
818 {
819         union kvm_mmu_page_role role;
820         unsigned index;
821         unsigned quadrant;
822         struct hlist_head *bucket;
823         struct kvm_mmu_page *sp;
824         struct hlist_node *node;
825
826         role.word = 0;
827         role.glevels = vcpu->arch.mmu.root_level;
828         role.level = level;
829         role.metaphysical = metaphysical;
830         role.access = access;
831         if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
832                 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
833                 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
834                 role.quadrant = quadrant;
835         }
836         pgprintk("%s: looking gfn %lx role %x\n", __func__,
837                  gfn, role.word);
838         index = kvm_page_table_hashfn(gfn);
839         bucket = &vcpu->kvm->arch.mmu_page_hash[index];
840         hlist_for_each_entry(sp, node, bucket, hash_link)
841                 if (sp->gfn == gfn && sp->role.word == role.word) {
842                         mmu_page_add_parent_pte(vcpu, sp, parent_pte);
843                         pgprintk("%s: found\n", __func__);
844                         return sp;
845                 }
846         ++vcpu->kvm->stat.mmu_cache_miss;
847         sp = kvm_mmu_alloc_page(vcpu, parent_pte);
848         if (!sp)
849                 return sp;
850         pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
851         sp->gfn = gfn;
852         sp->role = role;
853         hlist_add_head(&sp->hash_link, bucket);
854         vcpu->arch.mmu.prefetch_page(vcpu, sp);
855         if (!metaphysical)
856                 rmap_write_protect(vcpu->kvm, gfn);
857         return sp;
858 }
859
860 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
861                                          struct kvm_mmu_page *sp)
862 {
863         unsigned i;
864         u64 *pt;
865         u64 ent;
866
867         pt = sp->spt;
868
869         if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
870                 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
871                         if (is_shadow_present_pte(pt[i]))
872                                 rmap_remove(kvm, &pt[i]);
873                         pt[i] = shadow_trap_nonpresent_pte;
874                 }
875                 kvm_flush_remote_tlbs(kvm);
876                 return;
877         }
878
879         for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
880                 ent = pt[i];
881
882                 if (is_shadow_present_pte(ent)) {
883                         if (!is_large_pte(ent)) {
884                                 ent &= PT64_BASE_ADDR_MASK;
885                                 mmu_page_remove_parent_pte(page_header(ent),
886                                                            &pt[i]);
887                         } else {
888                                 --kvm->stat.lpages;
889                                 rmap_remove(kvm, &pt[i]);
890                         }
891                 }
892                 pt[i] = shadow_trap_nonpresent_pte;
893         }
894         kvm_flush_remote_tlbs(kvm);
895 }
896
897 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
898 {
899         mmu_page_remove_parent_pte(sp, parent_pte);
900 }
901
902 static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
903 {
904         int i;
905
906         for (i = 0; i < KVM_MAX_VCPUS; ++i)
907                 if (kvm->vcpus[i])
908                         kvm->vcpus[i]->arch.last_pte_updated = NULL;
909 }
910
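/*
 * Tear down a shadow page: detach it from all parent sptes, unlink its
 * children, and either free it immediately or, if it is still in use as a
 * root, mark it invalid and make remote vcpus reload their mmu.
 */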
911 static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
912 {
913         u64 *parent_pte;
914
915         ++kvm->stat.mmu_shadow_zapped;
916         while (sp->multimapped || sp->parent_pte) {
917                 if (!sp->multimapped)
918                         parent_pte = sp->parent_pte;
919                 else {
920                         struct kvm_pte_chain *chain;
921
922                         chain = container_of(sp->parent_ptes.first,
923                                              struct kvm_pte_chain, link);
924                         parent_pte = chain->parent_ptes[0];
925                 }
926                 BUG_ON(!parent_pte);
927                 kvm_mmu_put_page(sp, parent_pte);
928                 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
929         }
930         kvm_mmu_page_unlink_children(kvm, sp);
931         if (!sp->root_count) {
932                 if (!sp->role.metaphysical)
933                         unaccount_shadowed(kvm, sp->gfn);
934                 hlist_del(&sp->hash_link);
935                 kvm_mmu_free_page(kvm, sp);
936         } else {
937                 list_move(&sp->link, &kvm->arch.active_mmu_pages);
938                 sp->role.invalid = 1;
939                 kvm_reload_remote_mmus(kvm);
940         }
941         kvm_mmu_reset_last_pte_updated(kvm);
942 }
943
944 /*
945  * Changing the number of mmu pages allocated to the vm
946  * Note: if kvm_nr_mmu_pages is too small, a deadlock will occur.
947  */
948 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
949 {
950         /*
951          * If we set the number of mmu pages to be smaller than the
952          * number of active pages, we must free some mmu pages before we
953          * change the value.
954          */
955
956         if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
957             kvm_nr_mmu_pages) {
958                 int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
959                                        - kvm->arch.n_free_mmu_pages;
960
961                 while (n_used_mmu_pages > kvm_nr_mmu_pages) {
962                         struct kvm_mmu_page *page;
963
964                         page = container_of(kvm->arch.active_mmu_pages.prev,
965                                             struct kvm_mmu_page, link);
966                         kvm_mmu_zap_page(kvm, page);
967                         n_used_mmu_pages--;
968                 }
969                 kvm->arch.n_free_mmu_pages = 0;
970         }
971         else
972                 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
973                                          - kvm->arch.n_alloc_mmu_pages;
974
975         kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
976 }
977
978 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
979 {
980         unsigned index;
981         struct hlist_head *bucket;
982         struct kvm_mmu_page *sp;
983         struct hlist_node *node, *n;
984         int r;
985
986         pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
987         r = 0;
988         index = kvm_page_table_hashfn(gfn);
989         bucket = &kvm->arch.mmu_page_hash[index];
990         hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
991                 if (sp->gfn == gfn && !sp->role.metaphysical) {
992                         pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
993                                  sp->role.word);
994                         kvm_mmu_zap_page(kvm, sp);
995                         r = 1;
996                 }
997         return r;
998 }
999
1000 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1001 {
1002         struct kvm_mmu_page *sp;
1003
1004         while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
1005                 pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word);
1006                 kvm_mmu_zap_page(kvm, sp);
1007         }
1008 }
1009
1010 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1011 {
1012         int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
1013         struct kvm_mmu_page *sp = page_header(__pa(pte));
1014
1015         __set_bit(slot, &sp->slot_bitmap);
1016 }
1017
1018 struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1019 {
1020         struct page *page;
1021
1022         gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
1023
1024         if (gpa == UNMAPPED_GVA)
1025                 return NULL;
1026
1027         down_read(&current->mm->mmap_sem);
1028         page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1029         up_read(&current->mm->mmap_sem);
1030
1031         return page;
1032 }
1033
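/*
 * Install a shadow pte: handle an existing mapping being overwritten,
 * build the new spte from the guest access bits, and drop write access
 * again if the target gfn is itself shadowed as a page table (so guest
 * page-table writes keep trapping).  Updates the rmap and the page
 * reference counts.
 */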
1034 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1035                          unsigned pt_access, unsigned pte_access,
1036                          int user_fault, int write_fault, int dirty,
1037                          int *ptwrite, int largepage, gfn_t gfn,
1038                          struct page *page, bool speculative)
1039 {
1040         u64 spte;
1041         int was_rmapped = 0;
1042         int was_writeble = is_writeble_pte(*shadow_pte);
1043
1044         pgprintk("%s: spte %llx access %x write_fault %d"
1045                  " user_fault %d gfn %lx\n",
1046                  __func__, *shadow_pte, pt_access,
1047                  write_fault, user_fault, gfn);
1048
1049         if (is_rmap_pte(*shadow_pte)) {
1050                 /*
1051                  * If we overwrite a PTE page pointer with a 2MB PMD, unlink
1052                  * the parent of the now unreachable PTE.
1053                  */
1054                 if (largepage && !is_large_pte(*shadow_pte)) {
1055                         struct kvm_mmu_page *child;
1056                         u64 pte = *shadow_pte;
1057
1058                         child = page_header(pte & PT64_BASE_ADDR_MASK);
1059                         mmu_page_remove_parent_pte(child, shadow_pte);
1060                 } else if (page != spte_to_page(*shadow_pte)) {
1061                         pgprintk("hfn old %lx new %lx\n",
1062                                  page_to_pfn(spte_to_page(*shadow_pte)),
1063                                  page_to_pfn(page));
1064                         rmap_remove(vcpu->kvm, shadow_pte);
1065                 } else {
1066                         if (largepage)
1067                                 was_rmapped = is_large_pte(*shadow_pte);
1068                         else
1069                                 was_rmapped = 1;
1070                 }
1071         }
1072
1073         /*
1074          * We don't set the accessed bit on speculatively instantiated
1075          * sptes, since we want to see whether the guest actually used the
1076          * pte (in order to detect demand paging).
1077          */
1078         spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
1079         if (!speculative)
1080                 spte |= PT_ACCESSED_MASK;
1081         if (!dirty)
1082                 pte_access &= ~ACC_WRITE_MASK;
1083         if (!(pte_access & ACC_EXEC_MASK))
1084                 spte |= PT64_NX_MASK;
1085
1086         spte |= PT_PRESENT_MASK;
1087         if (pte_access & ACC_USER_MASK)
1088                 spte |= PT_USER_MASK;
1089         if (largepage)
1090                 spte |= PT_PAGE_SIZE_MASK;
1091
1092         spte |= page_to_phys(page);
1093
1094         if ((pte_access & ACC_WRITE_MASK)
1095             || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
1096                 struct kvm_mmu_page *shadow;
1097
1098                 spte |= PT_WRITABLE_MASK;
1099                 if (user_fault) {
1100                         mmu_unshadow(vcpu->kvm, gfn);
1101                         goto unshadowed;
1102                 }
1103
1104                 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
1105                 if (shadow ||
1106                    (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
1107                         pgprintk("%s: found shadow page for %lx, marking ro\n",
1108                                  __func__, gfn);
1109                         pte_access &= ~ACC_WRITE_MASK;
1110                         if (is_writeble_pte(spte)) {
1111                                 spte &= ~PT_WRITABLE_MASK;
1112                                 kvm_x86_ops->tlb_flush(vcpu);
1113                         }
1114                         if (write_fault)
1115                                 *ptwrite = 1;
1116                 }
1117         }
1118
1119 unshadowed:
1120
1121         if (pte_access & ACC_WRITE_MASK)
1122                 mark_page_dirty(vcpu->kvm, gfn);
1123
1124         pgprintk("%s: setting spte %llx\n", __func__, spte);
1125         pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n",
1126                  (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
1127                  (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
1128         set_shadow_pte(shadow_pte, spte);
1129         if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
1130             && (spte & PT_PRESENT_MASK))
1131                 ++vcpu->kvm->stat.lpages;
1132
1133         page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
1134         if (!was_rmapped) {
1135                 rmap_add(vcpu, shadow_pte, gfn, largepage);
1136                 if (!is_rmap_pte(*shadow_pte))
1137                         kvm_release_page_clean(page);
1138         } else {
1139                 if (was_writeble)
1140                         kvm_release_page_dirty(page);
1141                 else
1142                         kvm_release_page_clean(page);
1143         }
1144         if (!ptwrite || !*ptwrite)
1145                 vcpu->arch.last_pte_updated = shadow_pte;
1146 }
1147
1148 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
1149 {
1150 }
1151
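/*
 * Build a direct (real-mode or tdp) mapping: walk the shadow table from
 * the root, allocating intermediate shadow pages as needed, and install
 * the final spte at level 1 (or at level 2 for a large page).
 */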
1152 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1153                            int largepage, gfn_t gfn, struct page *page,
1154                            int level)
1155 {
1156         hpa_t table_addr = vcpu->arch.mmu.root_hpa;
1157         int pt_write = 0;
1158
1159         for (; ; level--) {
1160                 u32 index = PT64_INDEX(v, level);
1161                 u64 *table;
1162
1163                 ASSERT(VALID_PAGE(table_addr));
1164                 table = __va(table_addr);
1165
1166                 if (level == 1) {
1167                         mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
1168                                      0, write, 1, &pt_write, 0, gfn, page, false);
1169                         return pt_write;
1170                 }
1171
1172                 if (largepage && level == 2) {
1173                         mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
1174                                      0, write, 1, &pt_write, 1, gfn, page, false);
1175                         return pt_write;
1176                 }
1177
1178                 if (table[index] == shadow_trap_nonpresent_pte) {
1179                         struct kvm_mmu_page *new_table;
1180                         gfn_t pseudo_gfn;
1181
1182                         pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
1183                                 >> PAGE_SHIFT;
1184                         new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
1185                                                      v, level - 1,
1186                                                      1, ACC_ALL, &table[index]);
1187                         if (!new_table) {
1188                                 pgprintk("nonpaging_map: ENOMEM\n");
1189                                 kvm_release_page_clean(page);
1190                                 return -ENOMEM;
1191                         }
1192
1193                         table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
1194                                 | PT_WRITABLE_MASK | PT_USER_MASK;
1195                 }
1196                 table_addr = table[index] & PT64_BASE_ADDR_MASK;
1197         }
1198 }
1199
1200 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1201 {
1202         int r;
1203         int largepage = 0;
1204
1205         struct page *page;
1206
1207         down_read(&current->mm->mmap_sem);
1208         if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
1209                 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1210                 largepage = 1;
1211         }
1212
1213         page = gfn_to_page(vcpu->kvm, gfn);
1214         up_read(&current->mm->mmap_sem);
1215
1216         /* mmio */
1217         if (is_error_page(page)) {
1218                 kvm_release_page_clean(page);
1219                 return 1;
1220         }
1221
1222         spin_lock(&vcpu->kvm->mmu_lock);
1223         kvm_mmu_free_some_pages(vcpu);
1224         r = __direct_map(vcpu, v, write, largepage, gfn, page,
1225                          PT32E_ROOT_LEVEL);
1226         spin_unlock(&vcpu->kvm->mmu_lock);
1227
1228
1229         return r;
1230 }
1231
1232
1233 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1234                                     struct kvm_mmu_page *sp)
1235 {
1236         int i;
1237
1238         for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1239                 sp->spt[i] = shadow_trap_nonpresent_pte;
1240 }
1241
1242 static void mmu_free_roots(struct kvm_vcpu *vcpu)
1243 {
1244         int i;
1245         struct kvm_mmu_page *sp;
1246
1247         if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1248                 return;
1249         spin_lock(&vcpu->kvm->mmu_lock);
1250 #ifdef CONFIG_X86_64
1251         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1252                 hpa_t root = vcpu->arch.mmu.root_hpa;
1253
1254                 sp = page_header(root);
1255                 --sp->root_count;
1256                 if (!sp->root_count && sp->role.invalid)
1257                         kvm_mmu_zap_page(vcpu->kvm, sp);
1258                 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1259                 spin_unlock(&vcpu->kvm->mmu_lock);
1260                 return;
1261         }
1262 #endif
1263         for (i = 0; i < 4; ++i) {
1264                 hpa_t root = vcpu->arch.mmu.pae_root[i];
1265
1266                 if (root) {
1267                         root &= PT64_BASE_ADDR_MASK;
1268                         sp = page_header(root);
1269                         --sp->root_count;
1270                         if (!sp->root_count && sp->role.invalid)
1271                                 kvm_mmu_zap_page(vcpu->kvm, sp);
1272                 }
1273                 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1274         }
1275         spin_unlock(&vcpu->kvm->mmu_lock);
1276         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1277 }
1278
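/*
 * Allocate the root shadow page(s) for the current mode: one level-4 root
 * in long mode, otherwise four PAE roots, one per guest pdptr.  With tdp
 * enabled the roots are metaphysical, i.e. not backed by a guest page
 * table.
 */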
1279 static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1280 {
1281         int i;
1282         gfn_t root_gfn;
1283         struct kvm_mmu_page *sp;
1284         int metaphysical = 0;
1285
1286         root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1287
1288 #ifdef CONFIG_X86_64
1289         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1290                 hpa_t root = vcpu->arch.mmu.root_hpa;
1291
1292                 ASSERT(!VALID_PAGE(root));
1293                 if (tdp_enabled)
1294                         metaphysical = 1;
1295                 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1296                                       PT64_ROOT_LEVEL, metaphysical,
1297                                       ACC_ALL, NULL);
1298                 root = __pa(sp->spt);
1299                 ++sp->root_count;
1300                 vcpu->arch.mmu.root_hpa = root;
1301                 return;
1302         }
1303 #endif
1304         metaphysical = !is_paging(vcpu);
1305         if (tdp_enabled)
1306                 metaphysical = 1;
1307         for (i = 0; i < 4; ++i) {
1308                 hpa_t root = vcpu->arch.mmu.pae_root[i];
1309
1310                 ASSERT(!VALID_PAGE(root));
1311                 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
1312                         if (!is_present_pte(vcpu->arch.pdptrs[i])) {
1313                                 vcpu->arch.mmu.pae_root[i] = 0;
1314                                 continue;
1315                         }
1316                         root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
1317                 } else if (vcpu->arch.mmu.root_level == 0)
1318                         root_gfn = 0;
1319                 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1320                                       PT32_ROOT_LEVEL, metaphysical,
1321                                       ACC_ALL, NULL);
1322                 root = __pa(sp->spt);
1323                 ++sp->root_count;
1324                 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
1325         }
1326         vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1327 }
1328
1329 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
1330 {
1331         return vaddr;
1332 }
1333
1334 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
1335                                 u32 error_code)
1336 {
1337         gfn_t gfn;
1338         int r;
1339
1340         pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
1341         r = mmu_topup_memory_caches(vcpu);
1342         if (r)
1343                 return r;
1344
1345         ASSERT(vcpu);
1346         ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
1347
1348         gfn = gva >> PAGE_SHIFT;
1349
1350         return nonpaging_map(vcpu, gva & PAGE_MASK,
1351                              error_code & PFERR_WRITE_MASK, gfn);
1352 }
1353
1354 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1355                                 u32 error_code)
1356 {
1357         struct page *page;
1358         int r;
1359         int largepage = 0;
1360         gfn_t gfn = gpa >> PAGE_SHIFT;
1361
1362         ASSERT(vcpu);
1363         ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
1364
1365         r = mmu_topup_memory_caches(vcpu);
1366         if (r)
1367                 return r;
1368
1369         down_read(&current->mm->mmap_sem);
1370         if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
1371                 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1372                 largepage = 1;
1373         }
1374         page = gfn_to_page(vcpu->kvm, gfn);
1375         up_read(&current->mm->mmap_sem);
1376         if (is_error_page(page)) {
1377                 kvm_release_page_clean(page);
1378                 return 1;
1379         }
1380         spin_lock(&vcpu->kvm->mmu_lock);
1381         kvm_mmu_free_some_pages(vcpu);
1382         r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
1383                          largepage, gfn, page, TDP_ROOT_LEVEL);
1384         spin_unlock(&vcpu->kvm->mmu_lock);
1385
1386         return r;
1387 }
1388
1389 static void nonpaging_free(struct kvm_vcpu *vcpu)
1390 {
1391         mmu_free_roots(vcpu);
1392 }
1393
1394 static int nonpaging_init_context(struct kvm_vcpu *vcpu)
1395 {
1396         struct kvm_mmu *context = &vcpu->arch.mmu;
1397
1398         context->new_cr3 = nonpaging_new_cr3;
1399         context->page_fault = nonpaging_page_fault;
1400         context->gva_to_gpa = nonpaging_gva_to_gpa;
1401         context->free = nonpaging_free;
1402         context->prefetch_page = nonpaging_prefetch_page;
1403         context->root_level = 0;
1404         context->shadow_root_level = PT32E_ROOT_LEVEL;
1405         context->root_hpa = INVALID_PAGE;
1406         return 0;
1407 }
1408
1409 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
1410 {
1411         ++vcpu->stat.tlb_flush;
1412         kvm_x86_ops->tlb_flush(vcpu);
1413 }
1414
1415 static void paging_new_cr3(struct kvm_vcpu *vcpu)
1416 {
1417         pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
1418         mmu_free_roots(vcpu);
1419 }
1420
1421 static void inject_page_fault(struct kvm_vcpu *vcpu,
1422                               u64 addr,
1423                               u32 err_code)
1424 {
1425         kvm_inject_page_fault(vcpu, addr, err_code);
1426 }
1427
1428 static void paging_free(struct kvm_vcpu *vcpu)
1429 {
1430         nonpaging_free(vcpu);
1431 }
1432
1433 #define PTTYPE 64
1434 #include "paging_tmpl.h"
1435 #undef PTTYPE
1436
1437 #define PTTYPE 32
1438 #include "paging_tmpl.h"
1439 #undef PTTYPE
1440
1441 static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1442 {
1443         struct kvm_mmu *context = &vcpu->arch.mmu;
1444
1445         ASSERT(is_pae(vcpu));
1446         context->new_cr3 = paging_new_cr3;
1447         context->page_fault = paging64_page_fault;
1448         context->gva_to_gpa = paging64_gva_to_gpa;
1449         context->prefetch_page = paging64_prefetch_page;
1450         context->free = paging_free;
1451         context->root_level = level;
1452         context->shadow_root_level = level;
1453         context->root_hpa = INVALID_PAGE;
1454         return 0;
1455 }
1456
1457 static int paging64_init_context(struct kvm_vcpu *vcpu)
1458 {
1459         return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1460 }
1461
1462 static int paging32_init_context(struct kvm_vcpu *vcpu)
1463 {
1464         struct kvm_mmu *context = &vcpu->arch.mmu;
1465
1466         context->new_cr3 = paging_new_cr3;
1467         context->page_fault = paging32_page_fault;
1468         context->gva_to_gpa = paging32_gva_to_gpa;
1469         context->free = paging_free;
1470         context->prefetch_page = paging32_prefetch_page;
1471         context->root_level = PT32_ROOT_LEVEL;
1472         context->shadow_root_level = PT32E_ROOT_LEVEL;
1473         context->root_hpa = INVALID_PAGE;
1474         return 0;
1475 }
1476
1477 static int paging32E_init_context(struct kvm_vcpu *vcpu)
1478 {
1479         return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1480 }
1481
1482 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
1483 {
1484         struct kvm_mmu *context = &vcpu->arch.mmu;
1485
1486         context->new_cr3 = nonpaging_new_cr3;
1487         context->page_fault = tdp_page_fault;
1488         context->free = nonpaging_free;
1489         context->prefetch_page = nonpaging_prefetch_page;
1490         context->shadow_root_level = TDP_ROOT_LEVEL;
1491         context->root_hpa = INVALID_PAGE;
1492
1493         if (!is_paging(vcpu)) {
1494                 context->gva_to_gpa = nonpaging_gva_to_gpa;
1495                 context->root_level = 0;
1496         } else if (is_long_mode(vcpu)) {
1497                 context->gva_to_gpa = paging64_gva_to_gpa;
1498                 context->root_level = PT64_ROOT_LEVEL;
1499         } else if (is_pae(vcpu)) {
1500                 context->gva_to_gpa = paging64_gva_to_gpa;
1501                 context->root_level = PT32E_ROOT_LEVEL;
1502         } else {
1503                 context->gva_to_gpa = paging32_gva_to_gpa;
1504                 context->root_level = PT32_ROOT_LEVEL;
1505         }
1506
1507         return 0;
1508 }
1509
1510 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
1511 {
1512         ASSERT(vcpu);
1513         ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1514
1515         if (!is_paging(vcpu))
1516                 return nonpaging_init_context(vcpu);
1517         else if (is_long_mode(vcpu))
1518                 return paging64_init_context(vcpu);
1519         else if (is_pae(vcpu))
1520                 return paging32E_init_context(vcpu);
1521         else
1522                 return paging32_init_context(vcpu);
1523 }
1524
1525 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1526 {
1527         if (tdp_enabled)
1528                 return init_kvm_tdp_mmu(vcpu);
1529         else
1530                 return init_kvm_softmmu(vcpu);
1531 }
1532
1533 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1534 {
1535         ASSERT(vcpu);
1536         if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
1537                 vcpu->arch.mmu.free(vcpu);
1538                 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1539         }
1540 }
1541
1542 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1543 {
1544         destroy_kvm_mmu(vcpu);
1545         return init_kvm_mmu(vcpu);
1546 }
1547 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
1548
1549 int kvm_mmu_load(struct kvm_vcpu *vcpu)
1550 {
1551         int r;
1552
1553         r = mmu_topup_memory_caches(vcpu);
1554         if (r)
1555                 goto out;
1556         spin_lock(&vcpu->kvm->mmu_lock);
1557         kvm_mmu_free_some_pages(vcpu);
1558         mmu_alloc_roots(vcpu);
1559         spin_unlock(&vcpu->kvm->mmu_lock);
1560         kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
1561         kvm_mmu_flush_tlb(vcpu);
1562 out:
1563         return r;
1564 }
1565 EXPORT_SYMBOL_GPL(kvm_mmu_load);
1566
1567 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
1568 {
1569         mmu_free_roots(vcpu);
1570 }
1571
1572 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1573                                   struct kvm_mmu_page *sp,
1574                                   u64 *spte)
1575 {
1576         u64 pte;
1577         struct kvm_mmu_page *child;
1578
1579         pte = *spte;
1580         if (is_shadow_present_pte(pte)) {
1581                 if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
1582                     is_large_pte(pte))
1583                         rmap_remove(vcpu->kvm, spte);
1584                 else {
1585                         child = page_header(pte & PT64_BASE_ADDR_MASK);
1586                         mmu_page_remove_parent_pte(child, spte);
1587                 }
1588         }
1589         set_shadow_pte(spte, shadow_trap_nonpresent_pte);
1590         if (is_large_pte(pte))
1591                 --vcpu->kvm->stat.lpages;
1592 }
1593
1594 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1595                                   struct kvm_mmu_page *sp,
1596                                   u64 *spte,
1597                                   const void *new)
1598 {
1599         if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
1600             && !vcpu->arch.update_pte.largepage) {
1601                 ++vcpu->kvm->stat.mmu_pde_zapped;
1602                 return;
1603         }
1604
1605         ++vcpu->kvm->stat.mmu_pte_updated;
1606         if (sp->role.glevels == PT32_ROOT_LEVEL)
1607                 paging32_update_pte(vcpu, sp, spte, new);
1608         else
1609                 paging64_update_pte(vcpu, sp, spte, new);
1610 }
1611
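/*
 * Decide whether other vcpus must flush their TLBs after a shadow pte
 * update.  A remote flush is only needed when the update takes something
 * away: the pte was present and is now gone, points somewhere else, or
 * loses a permission bit.  NX is inverted first so that *setting* NX
 * counts as removing a permission.  For example, present+writable ->
 * present+read-only requires a remote flush, while read-only -> writable
 * does not (the local flush done by the caller is enough).
 */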
1612 static bool need_remote_flush(u64 old, u64 new)
1613 {
1614         if (!is_shadow_present_pte(old))
1615                 return false;
1616         if (!is_shadow_present_pte(new))
1617                 return true;
1618         if ((old ^ new) & PT64_BASE_ADDR_MASK)
1619                 return true;
1620         old ^= PT64_NX_MASK;
1621         new ^= PT64_NX_MASK;
1622         return (old & ~new & PT64_PERM_MASK) != 0;
1623 }
1624
1625 static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
1626 {
1627         if (need_remote_flush(old, new))
1628                 kvm_flush_remote_tlbs(vcpu->kvm);
1629         else
1630                 kvm_mmu_flush_tlb(vcpu);
1631 }
1632
1633 static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
1634 {
1635         u64 *spte = vcpu->arch.last_pte_updated;
1636
1637         return !!(spte && (*spte & PT_ACCESSED_MASK));
1638 }
1639
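/*
 * Peek at the guest pte being written and, if it looks valid, pin the
 * page it points to ahead of time so the real update (done later under
 * mmu_lock) does not have to fault it in.  A 32-bit write covering half
 * of a PAE/64-bit pte is merged with the other half read back from guest
 * memory, e.g. a 4-byte write at gpa % 8 == 4 only replaces the high
 * half of the 8-byte gpte.
 */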
1640 static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1641                                           const u8 *new, int bytes)
1642 {
1643         gfn_t gfn;
1644         int r;
1645         u64 gpte = 0;
1646         struct page *page;
1647
1648         vcpu->arch.update_pte.largepage = 0;
1649
1650         if (bytes != 4 && bytes != 8)
1651                 return;
1652
1653         /*
1654          * Assume that the pte write is on a page table of the same type
1655          * as the current vcpu paging mode.  This is nearly always true
1656          * (might be false while changing modes).  Note it is verified later
1657          * by update_pte().
1658          */
1659         if (is_pae(vcpu)) {
1660                 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
1661                 if ((bytes == 4) && (gpa % 4 == 0)) {
1662                         r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
1663                         if (r)
1664                                 return;
1665                         memcpy((void *)&gpte + (gpa % 8), new, 4);
1666                 } else if ((bytes == 8) && (gpa % 8 == 0)) {
1667                         memcpy((void *)&gpte, new, 8);
1668                 }
1669         } else {
1670                 if ((bytes == 4) && (gpa % 4 == 0))
1671                         memcpy((void *)&gpte, new, 4);
1672         }
1673         if (!is_present_pte(gpte))
1674                 return;
1675         gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
1676
1677         down_read(&current->mm->mmap_sem);
1678         if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
1679                 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1680                 vcpu->arch.update_pte.largepage = 1;
1681         }
1682         page = gfn_to_page(vcpu->kvm, gfn);
1683         up_read(&current->mm->mmap_sem);
1684
1685         if (is_error_page(page)) {
1686                 kvm_release_page_clean(page);
1687                 return;
1688         }
1689         vcpu->arch.update_pte.gfn = gfn;
1690         vcpu->arch.update_pte.page = page;
1691 }
1692
1693 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1694                        const u8 *new, int bytes)
1695 {
1696         gfn_t gfn = gpa >> PAGE_SHIFT;
1697         struct kvm_mmu_page *sp;
1698         struct hlist_node *node, *n;
1699         struct hlist_head *bucket;
1700         unsigned index;
1701         u64 entry, gentry;
1702         u64 *spte;
1703         unsigned offset = offset_in_page(gpa);
1704         unsigned pte_size;
1705         unsigned page_offset;
1706         unsigned misaligned;
1707         unsigned quadrant;
1708         int level;
1709         int flooded = 0;
1710         int npte;
1711         int r;
1712
1713         pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
1714         mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
1715         spin_lock(&vcpu->kvm->mmu_lock);
1716         kvm_mmu_free_some_pages(vcpu);
1717         ++vcpu->kvm->stat.mmu_pte_write;
1718         kvm_mmu_audit(vcpu, "pre pte write");
1719         if (gfn == vcpu->arch.last_pt_write_gfn
1720             && !last_updated_pte_accessed(vcpu)) {
1721                 ++vcpu->arch.last_pt_write_count;
1722                 if (vcpu->arch.last_pt_write_count >= 3)
1723                         flooded = 1;
1724         } else {
1725                 vcpu->arch.last_pt_write_gfn = gfn;
1726                 vcpu->arch.last_pt_write_count = 1;
1727                 vcpu->arch.last_pte_updated = NULL;
1728         }
1729         index = kvm_page_table_hashfn(gfn);
1730         bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1731         hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
1732                 if (sp->gfn != gfn || sp->role.metaphysical)
1733                         continue;
1734                 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1735                 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1736                 misaligned |= bytes < 4;
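                /*
                 * e.g. with pte_size == 8, a 4-byte write at offset 6
                 * straddles two ptes ((6 ^ 9) & ~7 == 8) and is treated
                 * as misaligned, while a 4-byte write at offset 4 stays
                 * inside one pte ((4 ^ 7) & ~7 == 0).
                 */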
1737                 if (misaligned || flooded) {
1738                         /*
1739                          * Misaligned accesses are too much trouble to fix
1740                          * up; also, they usually indicate a page is not used
1741                          * as a page table.
1742                          *
1743                          * If we're seeing too many writes to a page,
1744                          * it may no longer be a page table, or we may be
1745                          * forking, in which case it is better to unmap the
1746                          * page.
1747                          */
1748                         pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1749                                  gpa, bytes, sp->role.word);
1750                         kvm_mmu_zap_page(vcpu->kvm, sp);
1751                         ++vcpu->kvm->stat.mmu_flooded;
1752                         continue;
1753                 }
1754                 page_offset = offset;
1755                 level = sp->role.level;
1756                 npte = 1;
1757                 if (sp->role.glevels == PT32_ROOT_LEVEL) {
1758                         page_offset <<= 1;      /* 32->64 */
1759                         /*
1760                          * A 32-bit pde maps 4MB while the shadow pdes map
1761                          * only 2MB.  So we need to double the offset again
1762                          * and zap two pdes instead of one.
1763                          */
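                        /*
                         * e.g. a write to the guest pde at offset 0x18
                         * (entry 6, one 4MB mapping) ends up with
                         * page_offset 0x60 after doubling twice, i.e.
                         * shadow entries 12 and 13, two 2MB mappings,
                         * hence npte = 2.
                         */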
1764                         if (level == PT32_ROOT_LEVEL) {
1765                                 page_offset &= ~7; /* kill rounding error */
1766                                 page_offset <<= 1;
1767                                 npte = 2;
1768                         }
1769                         quadrant = page_offset >> PAGE_SHIFT;
1770                         page_offset &= ~PAGE_MASK;
1771                         if (quadrant != sp->role.quadrant)
1772                                 continue;
1773                 }
1774                 spte = &sp->spt[page_offset / sizeof(*spte)];
1775                 if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
1776                         gentry = 0;
1777                         r = kvm_read_guest_atomic(vcpu->kvm,
1778                                                   gpa & ~(u64)(pte_size - 1),
1779                                                   &gentry, pte_size);
1780                         new = (const void *)&gentry;
1781                         if (r < 0)
1782                                 new = NULL;
1783                 }
1784                 while (npte--) {
1785                         entry = *spte;
1786                         mmu_pte_write_zap_pte(vcpu, sp, spte);
1787                         if (new)
1788                                 mmu_pte_write_new_pte(vcpu, sp, spte, new);
1789                         mmu_pte_write_flush_tlb(vcpu, entry, *spte);
1790                         ++spte;
1791                 }
1792         }
1793         kvm_mmu_audit(vcpu, "post pte write");
1794         spin_unlock(&vcpu->kvm->mmu_lock);
1795         if (vcpu->arch.update_pte.page) {
1796                 kvm_release_page_clean(vcpu->arch.update_pte.page);
1797                 vcpu->arch.update_pte.page = NULL;
1798         }
1799 }
1800
1801 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1802 {
1803         gpa_t gpa;
1804         int r;
1805
1806         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
1807
1808         spin_lock(&vcpu->kvm->mmu_lock);
1809         r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1810         spin_unlock(&vcpu->kvm->mmu_lock);
1811         return r;
1812 }
1813
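/*
 * Recycle shadow pages when the free pool runs low: zap the oldest
 * entries (taken from the tail of active_mmu_pages) until at least
 * KVM_REFILL_PAGES are free again.
 */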
1814 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1815 {
1816         while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
1817                 struct kvm_mmu_page *sp;
1818
1819                 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
1820                                   struct kvm_mmu_page, link);
1821                 kvm_mmu_zap_page(vcpu->kvm, sp);
1822                 ++vcpu->kvm->stat.mmu_recycled;
1823         }
1824 }
1825
1826 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
1827 {
1828         int r;
1829         enum emulation_result er;
1830
1831         r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
1832         if (r < 0)
1833                 goto out;
1834
1835         if (!r) {
1836                 r = 1;
1837                 goto out;
1838         }
1839
1840         r = mmu_topup_memory_caches(vcpu);
1841         if (r)
1842                 goto out;
1843
1844         er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
1845
1846         switch (er) {
1847         case EMULATE_DONE:
1848                 return 1;
1849         case EMULATE_DO_MMIO:
1850                 ++vcpu->stat.mmio_exits;
1851                 return 0;
1852         case EMULATE_FAIL:
1853                 kvm_report_emulation_failure(vcpu, "pagetable");
1854                 return 1;
1855         default:
1856                 BUG();
1857         }
1858 out:
1859         return r;
1860 }
1861 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
1862
1863 void kvm_enable_tdp(void)
1864 {
1865         tdp_enabled = true;
1866 }
1867 EXPORT_SYMBOL_GPL(kvm_enable_tdp);
1868
1869 static void free_mmu_pages(struct kvm_vcpu *vcpu)
1870 {
1871         struct kvm_mmu_page *sp;
1872
1873         while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
1874                 sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
1875                                   struct kvm_mmu_page, link);
1876                 kvm_mmu_zap_page(vcpu->kvm, sp);
1877         }
1878         free_page((unsigned long)vcpu->arch.mmu.pae_root);
1879 }
1880
1881 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
1882 {
1883         struct page *page;
1884         int i;
1885
1886         ASSERT(vcpu);
1887
1888         if (vcpu->kvm->arch.n_requested_mmu_pages)
1889                 vcpu->kvm->arch.n_free_mmu_pages =
1890                                         vcpu->kvm->arch.n_requested_mmu_pages;
1891         else
1892                 vcpu->kvm->arch.n_free_mmu_pages =
1893                                         vcpu->kvm->arch.n_alloc_mmu_pages;
1894         /*
1895          * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
1896          * Therefore the pae_root page, which cr3 will point at, must be
1897          * allocated in the first 4GB of memory; this fits the DMA32 zone.
1898          */
1899         page = alloc_page(GFP_KERNEL | __GFP_DMA32);
1900         if (!page)
1901                 goto error_1;
1902         vcpu->arch.mmu.pae_root = page_address(page);
1903         for (i = 0; i < 4; ++i)
1904                 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1905
1906         return 0;
1907
1908 error_1:
1909         free_mmu_pages(vcpu);
1910         return -ENOMEM;
1911 }
1912
1913 int kvm_mmu_create(struct kvm_vcpu *vcpu)
1914 {
1915         ASSERT(vcpu);
1916         ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1917
1918         return alloc_mmu_pages(vcpu);
1919 }
1920
1921 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
1922 {
1923         ASSERT(vcpu);
1924         ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1925
1926         return init_kvm_mmu(vcpu);
1927 }
1928
1929 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
1930 {
1931         ASSERT(vcpu);
1932
1933         destroy_kvm_mmu(vcpu);
1934         free_mmu_pages(vcpu);
1935         mmu_free_memory_caches(vcpu);
1936 }
1937
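/*
 * Strip the writable bit from every shadow pte that maps memory in the
 * given slot so that subsequent guest writes fault and can be tracked,
 * typically when dirty logging is enabled for the slot.
 */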
1938 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
1939 {
1940         struct kvm_mmu_page *sp;
1941
1942         list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
1943                 int i;
1944                 u64 *pt;
1945
1946                 if (!test_bit(slot, &sp->slot_bitmap))
1947                         continue;
1948
1949                 pt = sp->spt;
1950                 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1951                         /* avoid RMW */
1952                         if (pt[i] & PT_WRITABLE_MASK)
1953                                 pt[i] &= ~PT_WRITABLE_MASK;
1954         }
1955 }
1956
1957 void kvm_mmu_zap_all(struct kvm *kvm)
1958 {
1959         struct kvm_mmu_page *sp, *node;
1960
1961         spin_lock(&kvm->mmu_lock);
1962         list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
1963                 kvm_mmu_zap_page(kvm, sp);
1964         spin_unlock(&kvm->mmu_lock);
1965
1966         kvm_flush_remote_tlbs(kvm);
1967 }
1968
1969 void kvm_mmu_module_exit(void)
1970 {
1971         if (pte_chain_cache)
1972                 kmem_cache_destroy(pte_chain_cache);
1973         if (rmap_desc_cache)
1974                 kmem_cache_destroy(rmap_desc_cache);
1975         if (mmu_page_header_cache)
1976                 kmem_cache_destroy(mmu_page_header_cache);
1977 }
1978
1979 int kvm_mmu_module_init(void)
1980 {
1981         pte_chain_cache = kmem_cache_create("kvm_pte_chain",
1982                                             sizeof(struct kvm_pte_chain),
1983                                             0, 0, NULL);
1984         if (!pte_chain_cache)
1985                 goto nomem;
1986         rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
1987                                             sizeof(struct kvm_rmap_desc),
1988                                             0, 0, NULL);
1989         if (!rmap_desc_cache)
1990                 goto nomem;
1991
1992         mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
1993                                                   sizeof(struct kvm_mmu_page),
1994                                                   0, 0, NULL);
1995         if (!mmu_page_header_cache)
1996                 goto nomem;
1997
1998         return 0;
1999
2000 nomem:
2001         kvm_mmu_module_exit();
2002         return -ENOMEM;
2003 }
2004
2005 /*
2006  * Calculate mmu pages needed for kvm.
2007  */
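/*
 * The budget scales with guest memory: nr_pages * KVM_PERMILLE_MMU_PAGES
 * / 1000, but never less than KVM_MIN_ALLOC_MMU_PAGES.  For instance,
 * assuming the usual definitions of 20 permille and a 64-page floor, a
 * 1GB guest (262144 pages) gets about 5242 shadow pages while a very
 * small guest is rounded up to 64.
 */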
2008 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
2009 {
2010         int i;
2011         unsigned int nr_mmu_pages;
2012         unsigned int nr_pages = 0;
2013
2014         for (i = 0; i < kvm->nmemslots; i++)
2015                 nr_pages += kvm->memslots[i].npages;
2016
2017         nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
2018         nr_mmu_pages = max(nr_mmu_pages,
2019                         (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
2020
2021         return nr_mmu_pages;
2022 }
2023
2024 static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
2025                                 unsigned len)
2026 {
2027         if (len > buffer->len)
2028                 return NULL;
2029         return buffer->ptr;
2030 }
2031
2032 static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
2033                                 unsigned len)
2034 {
2035         void *ret;
2036
2037         ret = pv_mmu_peek_buffer(buffer, len);
2038         if (!ret)
2039                 return ret;
2040         buffer->ptr += len;
2041         buffer->len -= len;
2042         buffer->processed += len;
2043         return ret;
2044 }
2045
2046 static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
2047                              gpa_t addr, gpa_t value)
2048 {
2049         int bytes = 8;
2050         int r;
2051
2052         if (!is_long_mode(vcpu) && !is_pae(vcpu))
2053                 bytes = 4;
2054
2055         r = mmu_topup_memory_caches(vcpu);
2056         if (r)
2057                 return r;
2058
2059         if (!emulator_write_phys(vcpu, addr, &value, bytes))
2060                 return -EFAULT;
2061
2062         return 1;
2063 }
2064
2065 static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2066 {
2067         kvm_x86_ops->tlb_flush(vcpu);
2068         return 1;
2069 }
2070
2071 static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
2072 {
2073         spin_lock(&vcpu->kvm->mmu_lock);
2074         mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
2075         spin_unlock(&vcpu->kvm->mmu_lock);
2076         return 1;
2077 }
2078
2079 static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
2080                              struct kvm_pv_mmu_op_buffer *buffer)
2081 {
2082         struct kvm_mmu_op_header *header;
2083
2084         header = pv_mmu_peek_buffer(buffer, sizeof *header);
2085         if (!header)
2086                 return 0;
2087         switch (header->op) {
2088         case KVM_MMU_OP_WRITE_PTE: {
2089                 struct kvm_mmu_op_write_pte *wpte;
2090
2091                 wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
2092                 if (!wpte)
2093                         return 0;
2094                 return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
2095                                         wpte->pte_val);
2096         }
2097         case KVM_MMU_OP_FLUSH_TLB: {
2098                 struct kvm_mmu_op_flush_tlb *ftlb;
2099
2100                 ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
2101                 if (!ftlb)
2102                         return 0;
2103                 return kvm_pv_mmu_flush_tlb(vcpu);
2104         }
2105         case KVM_MMU_OP_RELEASE_PT: {
2106                 struct kvm_mmu_op_release_pt *rpt;
2107
2108                 rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
2109                 if (!rpt)
2110                         return 0;
2111                 return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
2112         }
2113         default: return 0;
2114         }
2115 }
2116
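/*
 * Process a batch of paravirtual mmu operations supplied by the guest:
 * copy up to sizeof buffer.buf bytes from guest memory and handle the
 * packed ops one by one; each op starts with a kvm_mmu_op_header and
 * processing stops at the first op that cannot be handled.  *ret reports
 * how many bytes were consumed.
 */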
2117 int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
2118                   gpa_t addr, unsigned long *ret)
2119 {
2120         int r;
2121         struct kvm_pv_mmu_op_buffer buffer;
2122
2123         down_read(&current->mm->mmap_sem);
2124
2125         buffer.ptr = buffer.buf;
2126         buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf);
2127         buffer.processed = 0;
2128
2129         r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len);
2130         if (r)
2131                 goto out;
2132
2133         while (buffer.len) {
2134                 r = kvm_pv_mmu_op_one(vcpu, &buffer);
2135                 if (r < 0)
2136                         goto out;
2137                 if (r == 0)
2138                         break;
2139         }
2140
2141         r = 1;
2142 out:
2143         *ret = buffer.processed;
2144         up_read(&current->mm->mmap_sem);
2145         return r;
2146 }
2147
2148 #ifdef AUDIT
2149
2150 static const char *audit_msg;
2151
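/*
 * On x86_64 only the low 48 bits of a virtual address are significant;
 * shifting left and then arithmetically right by 16 sign-extends bit 47
 * so the audited gva is in canonical form, e.g. 0x0000800000000000
 * becomes 0xffff800000000000.
 */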
2152 static gva_t canonicalize(gva_t gva)
2153 {
2154 #ifdef CONFIG_X86_64
2155         gva = (long long)(gva << 16) >> 16;
2156 #endif
2157         return gva;
2158 }
2159
2160 static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
2161                                 gva_t va, int level)
2162 {
2163         u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
2164         int i;
2165         gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
2166
2167         for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
2168                 u64 ent = pt[i];
2169
2170                 if (ent == shadow_trap_nonpresent_pte)
2171                         continue;
2172
2173                 va = canonicalize(va);
2174                 if (level > 1) {
2175                         if (ent == shadow_notrap_nonpresent_pte)
2176                                 printk(KERN_ERR "audit: (%s) nontrapping pte"
2177                                        " in nonleaf level: levels %d gva %lx"
2178                                        " level %d pte %llx\n", audit_msg,
2179                                        vcpu->arch.mmu.root_level, va, level, ent);
2180
2181                         audit_mappings_page(vcpu, ent, va, level - 1);
2182                 } else {
2183                         gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
2184                         struct page *page = gpa_to_page(vcpu, gpa);
2185                         hpa_t hpa = page_to_phys(page);
2186
2187                         if (is_shadow_present_pte(ent)
2188                             && (ent & PT64_BASE_ADDR_MASK) != hpa)
2189                                 printk(KERN_ERR "xx audit error: (%s) levels %d"
2190                                        " gva %lx gpa %llx hpa %llx ent %llx %d\n",
2191                                        audit_msg, vcpu->arch.mmu.root_level,
2192                                        va, gpa, hpa, ent,
2193                                        is_shadow_present_pte(ent));
2194                         else if (ent == shadow_notrap_nonpresent_pte
2195                                  && !is_error_hpa(hpa))
2196                                 printk(KERN_ERR "audit: (%s) notrap shadow,"
2197                                        " valid guest gva %lx\n", audit_msg, va);
2198                         kvm_release_page_clean(page);
2199
2200                 }
2201         }
2202 }
2203
2204 static void audit_mappings(struct kvm_vcpu *vcpu)
2205 {
2206         unsigned i;
2207
2208         if (vcpu->arch.mmu.root_level == 4)
2209                 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
2210         else
2211                 for (i = 0; i < 4; ++i)
2212                         if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
2213                                 audit_mappings_page(vcpu,
2214                                                     vcpu->arch.mmu.pae_root[i],
2215                                                     i << 30,
2216                                                     2);
2217 }
2218
2219 static int count_rmaps(struct kvm_vcpu *vcpu)
2220 {
2221         int nmaps = 0;
2222         int i, j, k;
2223
2224         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
2225                 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
2226                 struct kvm_rmap_desc *d;
2227
2228                 for (j = 0; j < m->npages; ++j) {
2229                         unsigned long *rmapp = &m->rmap[j];
2230
2231                         if (!*rmapp)
2232                                 continue;
2233                         if (!(*rmapp & 1)) {
2234                                 ++nmaps;
2235                                 continue;
2236                         }
2237                         d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
2238                         while (d) {
2239                                 for (k = 0; k < RMAP_EXT; ++k)
2240                                         if (d->shadow_ptes[k])
2241                                                 ++nmaps;
2242                                         else
2243                                                 break;
2244                                 d = d->more;
2245                         }
2246                 }
2247         }
2248         return nmaps;
2249 }
2250
2251 static int count_writable_mappings(struct kvm_vcpu *vcpu)
2252 {
2253         int nmaps = 0;
2254         struct kvm_mmu_page *sp;
2255         int i;
2256
2257         list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
2258                 u64 *pt = sp->spt;
2259
2260                 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
2261                         continue;
2262
2263                 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
2264                         u64 ent = pt[i];
2265
2266                         if (!(ent & PT_PRESENT_MASK))
2267                                 continue;
2268                         if (!(ent & PT_WRITABLE_MASK))
2269                                 continue;
2270                         ++nmaps;
2271                 }
2272         }
2273         return nmaps;
2274 }
2275
2276 static void audit_rmap(struct kvm_vcpu *vcpu)
2277 {
2278         int n_rmap = count_rmaps(vcpu);
2279         int n_actual = count_writable_mappings(vcpu);
2280
2281         if (n_rmap != n_actual)
2282                 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
2283                        __func__, audit_msg, n_rmap, n_actual);
2284 }
2285
2286 static void audit_write_protection(struct kvm_vcpu *vcpu)
2287 {
2288         struct kvm_mmu_page *sp;
2289         struct kvm_memory_slot *slot;
2290         unsigned long *rmapp;
2291         gfn_t gfn;
2292
2293         list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
2294                 if (sp->role.metaphysical)
2295                         continue;
2296
2297                 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
2298                 gfn = unalias_gfn(vcpu->kvm, sp->gfn);
2299                 rmapp = &slot->rmap[gfn - slot->base_gfn];
2300                 if (*rmapp)
2301                         printk(KERN_ERR "%s: (%s) shadow page has writable"
2302                                " mappings: gfn %lx role %x\n",
2303                                __func__, audit_msg, sp->gfn,
2304                                sp->role.word);
2305         }
2306 }
2307
2308 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
2309 {
2310         int olddbg = dbg;
2311
2312         dbg = 0;
2313         audit_msg = msg;
2314         audit_rmap(vcpu);
2315         audit_write_protection(vcpu);
2316         audit_mappings(vcpu);
2317         dbg = olddbg;
2318 }
2319
2320 #endif