Merge branch 'tip/perf/jump-label-2' of git://git.kernel.org/pub/scm/linux/kernel...
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 51ef909..cd7a833 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -7,7 +7,7 @@
  * MMU support
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
@@ -67,6 +67,7 @@ struct guest_walker {
        int level;
        gfn_t table_gfn[PT_MAX_FULL_LEVELS];
        pt_element_t ptes[PT_MAX_FULL_LEVELS];
+       pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
        gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
        unsigned pt_access;
        unsigned pte_access;
@@ -104,7 +105,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
 
        access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
 #if PTTYPE == 64
-       if (is_nx(vcpu))
+       if (vcpu->arch.mmu.nx)
                access &= ~(gpte >> PT64_NX_SHIFT);
 #endif
        return access;
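
The NX handling above packs a small trick: shifting bit 63 of the gpte down to bit 0 lines it up with KVM's exec-permission bit, so a single AND strips exec rights. A minimal userspace sketch, assuming the architectural x86 PTE bit positions and KVM's ACC_EXEC_MASK encoding in bit 0:

    #include <stdint.h>
    #include <stdio.h>

    /* Architectural x86 PTE bits; ACC_EXEC_MASK mirrors KVM's
     * encoding of exec permission in bit 0. */
    #define PT_WRITABLE_MASK (1ULL << 1)
    #define PT_USER_MASK     (1ULL << 2)
    #define PT64_NX_SHIFT    63
    #define ACC_EXEC_MASK    1u

    static unsigned gpte_access(int nx_enabled, uint64_t gpte)
    {
        unsigned access;

        /* NX (bit 63) shifted down to bit 0 aligns with ACC_EXEC_MASK,
         * so one AND clears exec permission when NX is set. */
        access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
        if (nx_enabled)
            access &= ~(unsigned)(gpte >> PT64_NX_SHIFT);
        return access;
    }

    int main(void)
    {
        uint64_t nx_pte = (1ULL << 63) | PT_WRITABLE_MASK | PT_USER_MASK;

        printf("nx on:  %#x\n", gpte_access(1, nx_pte)); /* 0x6, exec gone */
        printf("nx off: %#x\n", gpte_access(0, nx_pte)); /* 0x7, exec kept */
        return 0;
    }
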
@@ -113,26 +114,32 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
 /*
  * Fetch a guest pte for a guest virtual address
  */
-static int FNAME(walk_addr)(struct guest_walker *walker,
-                           struct kvm_vcpu *vcpu, gva_t addr,
-                           int write_fault, int user_fault, int fetch_fault)
+static int FNAME(walk_addr_generic)(struct guest_walker *walker,
+                                   struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+                                   gva_t addr, u32 access)
 {
        pt_element_t pte;
        gfn_t table_gfn;
        unsigned index, pt_access, uninitialized_var(pte_access);
        gpa_t pte_gpa;
        bool eperm, present, rsvd_fault;
+       int offset, write_fault, user_fault, fetch_fault;
+
+       write_fault = access & PFERR_WRITE_MASK;
+       user_fault = access & PFERR_USER_MASK;
+       fetch_fault = access & PFERR_FETCH_MASK;
 
        trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
                                     fetch_fault);
 walk:
        present = true;
        eperm = rsvd_fault = false;
-       walker->level = vcpu->arch.mmu.root_level;
-       pte = vcpu->arch.cr3;
+       walker->level = mmu->root_level;
+       pte           = mmu->get_cr3(vcpu);
+
 #if PTTYPE == 64
-       if (!is_long_mode(vcpu)) {
-               pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
+       if (walker->level == PT32E_ROOT_LEVEL) {
+               pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
                trace_kvm_mmu_paging_element(pte, walker->level);
                if (!is_present_gpte(pte)) {
                        present = false;
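
Folding the three boolean parameters into a single u32 access word works because the PFERR_* masks already occupy distinct bits of the x86 page-fault error code (present = bit 0, write = bit 1, user = bit 2, rsvd = bit 3, fetch = bit 4). A standalone sketch of the decomposition at the top of walk_addr_generic():

    #include <stdint.h>
    #include <stdio.h>

    /* x86 page-fault error-code bits (architectural layout). */
    #define PFERR_PRESENT_MASK (1u << 0)
    #define PFERR_WRITE_MASK   (1u << 1)
    #define PFERR_USER_MASK    (1u << 2)
    #define PFERR_RSVD_MASK    (1u << 3)
    #define PFERR_FETCH_MASK   (1u << 4)

    /* One u32 carries what used to be three int parameters. Note the
     * extracted values keep their bit positions; they are masks, not
     * 0/1 booleans. */
    static void decompose(uint32_t access)
    {
        uint32_t write_fault = access & PFERR_WRITE_MASK;
        uint32_t user_fault  = access & PFERR_USER_MASK;
        uint32_t fetch_fault = access & PFERR_FETCH_MASK;

        printf("write=%d user=%d fetch=%d\n",
               !!write_fault, !!user_fault, !!fetch_fault);
    }

    int main(void)
    {
        decompose(PFERR_WRITE_MASK | PFERR_USER_MASK); /* user-mode write */
        return 0;
    }
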
@@ -142,7 +149,7 @@ walk:
        }
 #endif
        ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
-              (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
+              (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
 
        pt_access = ACC_ALL;
 
@@ -150,12 +157,14 @@ walk:
                index = PT_INDEX(addr, walker->level);
 
                table_gfn = gpte_to_gfn(pte);
-               pte_gpa = gfn_to_gpa(table_gfn);
-               pte_gpa += index * sizeof(pt_element_t);
+               offset    = index * sizeof(pt_element_t);
+               pte_gpa   = gfn_to_gpa(table_gfn) + offset;
                walker->table_gfn[walker->level - 1] = table_gfn;
                walker->pte_gpa[walker->level - 1] = pte_gpa;
 
-               if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) {
+               if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte,
+                                           offset, sizeof(pte),
+                                           PFERR_USER_MASK|PFERR_WRITE_MASK)) {
                        present = false;
                        break;
                }
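
Splitting pte_gpa into a table_gfn plus byte offset is what lets the new kvm_read_guest_page_mmu() helper take a (gfn, offset) pair and translate the table frame through the right MMU before reading. The arithmetic itself is small; a worked sketch assuming 8-byte 64-bit PTEs and 4 KiB pages:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t pt_element_t;       /* 8-byte PTE (PTTYPE == 64) */
    #define PAGE_SHIFT 12

    int main(void)
    {
        uint64_t table_gfn = 0x1234;     /* guest frame holding the table */
        unsigned index = 5;              /* entry selected by the address */

        /* Same arithmetic as the hunk above: gpa split into page + offset. */
        int offset = index * sizeof(pt_element_t);            /* 40 */
        uint64_t pte_gpa = (table_gfn << PAGE_SHIFT) + offset;

        printf("offset=%d pte_gpa=%#llx\n", offset,
               (unsigned long long)pte_gpa);                  /* 0x1234028 */
        return 0;
    }
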
@@ -167,7 +176,7 @@ walk:
                        break;
                }
 
-               if (is_rsvd_bits_set(vcpu, pte, walker->level)) {
+               if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) {
                        rsvd_fault = true;
                        break;
                }
@@ -204,17 +213,28 @@ walk:
                                (PTTYPE == 64 || is_pse(vcpu))) ||
                    ((walker->level == PT_PDPE_LEVEL) &&
                                is_large_pte(pte) &&
-                               is_long_mode(vcpu))) {
+                               mmu->root_level == PT64_ROOT_LEVEL)) {
                        int lvl = walker->level;
+                       gpa_t real_gpa;
+                       gfn_t gfn;
+                       u32 ac;
 
-                       walker->gfn = gpte_to_gfn_lvl(pte, lvl);
-                       walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl))
-                                       >> PAGE_SHIFT;
+                       gfn = gpte_to_gfn_lvl(pte, lvl);
+                       gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
 
                        if (PTTYPE == 32 &&
                            walker->level == PT_DIRECTORY_LEVEL &&
                            is_cpuid_PSE36())
-                               walker->gfn += pse36_gfn_delta(pte);
+                               gfn += pse36_gfn_delta(pte);
+
+                       ac = write_fault | fetch_fault | user_fault;
+
+                       real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
+                                                     ac);
+                       if (real_gpa == UNMAPPED_GVA)
+                               return 0;
+
+                       walker->gfn = real_gpa >> PAGE_SHIFT;
 
                        break;
                }
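
For a large mapping, the final frame number is the entry's base frame plus the 4 KiB page index of the address within the region, after which the new mmu->translate_gpa() hook gives nested setups a chance to remap (or refuse) the result. A sketch for the 2 MiB (PT_DIRECTORY_LEVEL) case, with translate_gpa() stubbed as the identity it is for a non-nested guest; masks follow the standard x86 layout:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT       12
    #define LVL2_OFFSET_MASK ((1ULL << 21) - 1)       /* bits below 2 MiB */
    #define LVL2_ADDR_MASK   (0x000FFFFFFFFFF000ULL & ~LVL2_OFFSET_MASK)

    /* Stand-in for mmu->translate_gpa(): identity without a nested L1;
     * the nested case walks another set of tables here and may return
     * UNMAPPED_GVA instead. */
    static uint64_t translate_gpa(uint64_t gpa)
    {
        return gpa;
    }

    int main(void)
    {
        uint64_t pte  = 0x40200000ULL | 0x83;  /* 2 MiB page: base|PS|W|P */
        uint64_t addr = 0x112345ULL;           /* faulting guest address */

        /* gpte_to_gfn_lvl(): frame number of the 2 MiB region... */
        uint64_t gfn = (pte & LVL2_ADDR_MASK) >> PAGE_SHIFT;

        /* ...plus the 4 KiB page index of addr inside that region. */
        gfn += (addr & LVL2_OFFSET_MASK) >> PAGE_SHIFT;

        uint64_t real_gpa = translate_gpa(gfn << PAGE_SHIFT);
        printf("gfn=%#llx real_gpa=%#llx\n",
               (unsigned long long)gfn,
               (unsigned long long)real_gpa);  /* gfn=0x40312 */
        return 0;
    }
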
@@ -249,18 +269,36 @@ error:
        walker->error_code = 0;
        if (present)
                walker->error_code |= PFERR_PRESENT_MASK;
-       if (write_fault)
-               walker->error_code |= PFERR_WRITE_MASK;
-       if (user_fault)
-               walker->error_code |= PFERR_USER_MASK;
-       if (fetch_fault && is_nx(vcpu))
+
+       walker->error_code |= write_fault | user_fault;
+
+       if (fetch_fault && mmu->nx)
                walker->error_code |= PFERR_FETCH_MASK;
        if (rsvd_fault)
                walker->error_code |= PFERR_RSVD_MASK;
+
+       vcpu->arch.fault.address    = addr;
+       vcpu->arch.fault.error_code = walker->error_code;
+
        trace_kvm_mmu_walker_error(walker->error_code);
        return 0;
 }
 
+static int FNAME(walk_addr)(struct guest_walker *walker,
+                           struct kvm_vcpu *vcpu, gva_t addr, u32 access)
+{
+       return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
+                                       access);
+}
+
+static int FNAME(walk_addr_nested)(struct guest_walker *walker,
+                                  struct kvm_vcpu *vcpu, gva_t addr,
+                                  u32 access)
+{
+       return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
+                                       addr, access);
+}
+
 static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                              u64 *spte, const void *pte)
 {
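
A subtlety in the error path above: walker->error_code |= write_fault | user_fault is only correct because write_fault and user_fault now hold the masked PFERR bits themselves, not the 0/1 flags of the old parameters, so they can be OR-ed in without re-encoding. A minimal demonstration:

    #include <stdint.h>
    #include <stdio.h>

    #define PFERR_WRITE_MASK (1u << 1)
    #define PFERR_USER_MASK  (1u << 2)

    int main(void)
    {
        uint32_t access = PFERR_WRITE_MASK | PFERR_USER_MASK;

        /* New style: the extracted values keep their bit positions,
         * so they OR straight into the error code. */
        uint32_t write_fault = access & PFERR_WRITE_MASK;  /* 0 or 2 */
        uint32_t user_fault  = access & PFERR_USER_MASK;   /* 0 or 4 */
        uint32_t error_code  = write_fault | user_fault;

        /* The old style needed explicit re-encoding of 0/1 booleans:
         *   if (write_fault) error_code |= PFERR_WRITE_MASK; ... */
        printf("error_code=%#x\n", error_code);            /* 0x6 */
        return 0;
    }
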
@@ -302,14 +340,87 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
                                struct guest_walker *gw, int level)
 {
-       int r;
        pt_element_t curr_pte;
-
-       r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
+       gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
+       u64 mask;
+       int r, index;
+
+       if (level == PT_PAGE_TABLE_LEVEL) {
+               mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
+               base_gpa = pte_gpa & ~mask;
+               index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
+
+               r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
+                               gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
+               curr_pte = gw->prefetch_ptes[index];
+       } else
+               r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
                                  &curr_pte, sizeof(curr_pte));
+
        return r || curr_pte != gw->ptes[level - 1];
 }
 
+static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
+                               u64 *sptep)
+{
+       struct kvm_mmu_page *sp;
+       struct kvm_mmu *mmu = &vcpu->arch.mmu;
+       pt_element_t *gptep = gw->prefetch_ptes;
+       u64 *spte;
+       int i;
+
+       sp = page_header(__pa(sptep));
+
+       if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+               return;
+
+       if (sp->role.direct)
+               return __direct_pte_prefetch(vcpu, sp, sptep);
+
+       i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+       spte = sp->spt + i;
+
+       for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+               pt_element_t gpte;
+               unsigned pte_access;
+               gfn_t gfn;
+               pfn_t pfn;
+               bool dirty;
+
+               if (spte == sptep)
+                       continue;
+
+               if (*spte != shadow_trap_nonpresent_pte)
+                       continue;
+
+               gpte = gptep[i];
+
+               if (!is_present_gpte(gpte) ||
+                     is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
+                       if (!sp->unsync)
+                               __set_spte(spte, shadow_notrap_nonpresent_pte);
+                       continue;
+               }
+
+               if (!(gpte & PT_ACCESSED_MASK))
+                       continue;
+
+               pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+               gfn = gpte_to_gfn(gpte);
+               dirty = is_dirty_gpte(gpte);
+               pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+                                     (pte_access & ACC_WRITE_MASK) && dirty);
+               if (is_error_pfn(pfn)) {
+                       kvm_release_pfn_clean(pfn);
+                       break;
+               }
+
+               mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
+                            dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
+                            pfn, true, true);
+       }
+}
+
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  */
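
Both gpte_changed() and pte_prefetch() lean on the same power-of-two alignment trick: round the faulting entry down to its naturally aligned group of PTE_PREFETCH_NUM entries, read (or scan) the whole group at once, and index back into it. A sketch of the gpa-side math, assuming PTE_PREFETCH_NUM is 8 as defined in mmu.c at this point in the series:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t pt_element_t;
    #define PTE_PREFETCH_NUM 8    /* power of two; value from mmu.c */

    int main(void)
    {
        uint64_t pte_gpa = 0x1234568ULL;  /* gpa of the faulting gpte */

        /* Round down to the naturally aligned group of 8 entries
         * (64 bytes here), as gpte_changed() does before pulling the
         * whole group in with one kvm_read_guest_atomic() call. The
         * spte side uses the same idea on table indexes:
         *   i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1). */
        uint64_t mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
        uint64_t base_gpa = pte_gpa & ~mask;
        int index = (pte_gpa - base_gpa) / sizeof(pt_element_t);

        printf("base=%#llx index=%d\n",
               (unsigned long long)base_gpa, index); /* 0x1234540, 5 */
        return 0;
    }
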
@@ -391,6 +502,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
        mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
                     user_fault, write_fault, dirty, ptwrite, it.level,
                     gw->gfn, pfn, false, true);
+       FNAME(pte_prefetch)(vcpu, gw, it.sptep);
 
        return it.sptep;
 
@@ -420,7 +532,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 {
        int write_fault = error_code & PFERR_WRITE_MASK;
        int user_fault = error_code & PFERR_USER_MASK;
-       int fetch_fault = error_code & PFERR_FETCH_MASK;
        struct guest_walker walker;
        u64 *sptep;
        int write_pt = 0;
@@ -430,7 +541,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        unsigned long mmu_seq;
 
        pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
-       kvm_mmu_audit(vcpu, "pre page fault");
 
        r = mmu_topup_memory_caches(vcpu);
        if (r)
@@ -439,15 +549,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        /*
         * Look up the guest pte for the faulting address.
         */
-       r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
-                            fetch_fault);
+       r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
 
        /*
         * The page is not mapped by the guest.  Let the guest handle it.
         */
        if (!r) {
                pgprintk("%s: guest page fault\n", __func__);
-               inject_page_fault(vcpu, addr, walker.error_code);
+               inject_page_fault(vcpu);
                vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
                return 0;
        }
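
inject_page_fault() can drop its arguments because the walker's error path (the vcpu->arch.fault stores earlier in this patch) now records the faulting address and error code on the vcpu before returning. A simplified standalone sketch of that handoff, with hypothetical scaled-down types:

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for the vcpu fault bookkeeping. */
    struct fault_info {
        uint64_t address;
        uint32_t error_code;
    };

    struct vcpu {
        struct fault_info fault;
    };

    /* Walker error path: stash the details on the vcpu, as the
     * "vcpu->arch.fault.address = addr" hunk does. */
    static void record_fault(struct vcpu *v, uint64_t addr, uint32_t ec)
    {
        v->fault.address    = addr;
        v->fault.error_code = ec;
    }

    /* Injection no longer needs parameters; it reads the stashed state. */
    static void inject_page_fault(struct vcpu *v)
    {
        printf("inject #PF addr=%#llx ec=%#x\n",
               (unsigned long long)v->fault.address, v->fault.error_code);
    }

    int main(void)
    {
        struct vcpu v;

        record_fault(&v, 0xdeadb000ULL, 0x6);
        inject_page_fault(&v);
        return 0;
    }
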
@@ -468,6 +577,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu, mmu_seq))
                goto out_unlock;
+
+       trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
        kvm_mmu_free_some_pages(vcpu);
        sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
                             level, &write_pt, pfn);
@@ -479,7 +590,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
                vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
 
        ++vcpu->stat.pf_fixed;
-       kvm_mmu_audit(vcpu, "post page fault (fixed)");
+       trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
        spin_unlock(&vcpu->kvm->mmu_lock);
 
        return write_pt;
@@ -556,10 +667,25 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
        gpa_t gpa = UNMAPPED_GVA;
        int r;
 
-       r = FNAME(walk_addr)(&walker, vcpu, vaddr,
-                            !!(access & PFERR_WRITE_MASK),
-                            !!(access & PFERR_USER_MASK),
-                            !!(access & PFERR_FETCH_MASK));
+       r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
+
+       if (r) {
+               gpa = gfn_to_gpa(walker.gfn);
+               gpa |= vaddr & ~PAGE_MASK;
+       } else if (error)
+               *error = walker.error_code;
+
+       return gpa;
+}
+
+static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
+                                     u32 access, u32 *error)
+{
+       struct guest_walker walker;
+       gpa_t gpa = UNMAPPED_GVA;
+       int r;
+
+       r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
 
        if (r) {
                gpa = gfn_to_gpa(walker.gfn);
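
Both gva_to_gpa variants now hand the caller's PFERR access mask straight to the walker and compose the result the same way: frame number from the walk, page offset from the original virtual address. The composition in isolation:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_MASK  (~((1ULL << PAGE_SHIFT) - 1))

    int main(void)
    {
        uint64_t vaddr = 0x7f00c0ffeeULL;
        uint64_t gfn   = 0x40312ULL;     /* result of a successful walk */

        /* Same composition as both gva_to_gpa variants: frame number
         * from the walker, page offset from the original VA. */
        uint64_t gpa = (gfn << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);

        printf("gpa=%#llx\n", (unsigned long long)gpa); /* 0x40312fee */
        return 0;
    }
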
@@ -638,7 +764,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                        return -EINVAL;
 
                gfn = gpte_to_gfn(gpte);
-               if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)
+               if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
                      || gfn != sp->gfns[i] || !is_present_gpte(gpte)
                      || !(gpte & PT_ACCESSED_MASK)) {
                        u64 nonpresent;