KVM: VMX: fix use after free of vmx->loaded_vmcs
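
This blobdiff view of arch/x86/kvm/vmx.c spans considerably more than the one-line
subject: besides the use-after-free fix it folds in related nested-VMX work
(shadowed VM_ENTRY/VM_EXIT control fields, nested preemption-timer save/adjust,
nested unrestricted-guest and 2MB EPT page support, reworked exception reflection
to L1, and the nested_run_pending bookkeeping moved from vmx_handle_exit() to
vmx_vcpu_run()). The subject change itself is the reordering at the end of
vmx_free_vcpu(); a minimal sketch of the resulting function follows, with the
reason stated as a presumption since this view carries no commit body:

/* Teardown order after this diff, reconstructed from the vmx_free_vcpu() hunk
 * near the end.  Presumed rationale (not spelled out in this view): when the
 * vCPU is destroyed while in guest mode, free_nested() releases the vmcs02
 * that vmx->loaded_vmcs still points to, so the old order (free_nested()
 * first) let free_loaded_vmcs() touch freed memory. */
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	free_vpid(vmx);
	free_loaded_vmcs(vmx->loaded_vmcs);	/* now runs before free_nested() */
	free_nested(vmx);
	kfree(vmx->guest_msrs);
	kvm_vcpu_uninit(vcpu);
	kmem_cache_free(kvm_vcpu_cache, vmx);
}
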
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2b2fce1..7661eb1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -418,6 +418,8 @@ struct vcpu_vmx {
        u64                   msr_host_kernel_gs_base;
        u64                   msr_guest_kernel_gs_base;
 #endif
+       u32 vm_entry_controls_shadow;
+       u32 vm_exit_controls_shadow;
        /*
         * loaded_vmcs points to the VMCS currently used in this vcpu. For a
         * non-nested (L1) guest, it always points to vmcs01. For a nested
@@ -1326,6 +1328,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask)
        vmcs_writel(field, vmcs_readl(field) | mask);
 }
 
+static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
+{
+       vmcs_write32(VM_ENTRY_CONTROLS, val);
+       vmx->vm_entry_controls_shadow = val;
+}
+
+static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
+{
+       if (vmx->vm_entry_controls_shadow != val)
+               vm_entry_controls_init(vmx, val);
+}
+
+static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
+{
+       return vmx->vm_entry_controls_shadow;
+}
+
+
+static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
+{
+       vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
+}
+
+static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
+{
+       vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
+}
+
+static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
+{
+       vmcs_write32(VM_EXIT_CONTROLS, val);
+       vmx->vm_exit_controls_shadow = val;
+}
+
+static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
+{
+       if (vmx->vm_exit_controls_shadow != val)
+               vm_exit_controls_init(vmx, val);
+}
+
+static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
+{
+       return vmx->vm_exit_controls_shadow;
+}
+
+
+static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
+{
+       vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
+}
+
+static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
+{
+       vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
+}
+
 static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
 {
        vmx->segment_cache.bitmask = 0;
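
The vm_{entry,exit}_controls_* helpers added in the hunk above keep a software
copy of VM_ENTRY_CONTROLS and VM_EXIT_CONTROLS in the two shadow fields added
to struct vcpu_vmx at the top of this diff: reads come from the shadow with no
VMREAD, and the VMWRITE in vm_*_controls_set() is skipped when the value is
unchanged.  After a nested VM-exit switches back to vmcs01, the shadows are
re-seeded from the hardware values (last hunk of this diff).  A hypothetical
call site, only to show the intended usage pattern (example_set_ia32e_mode()
is not part of the patch):

/* Hypothetical caller: flip one VM-entry control bit through the cached
 * shadow instead of a VMREAD/VMWRITE pair, mirroring the vmx_set_efer() and
 * exit_lmode() hunks later in this diff. */
static void example_set_ia32e_mode(struct vcpu_vmx *vmx, bool on)
{
	if (on)
		vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
	else
		vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
	/* vm_entry_controls_set() only issues the VMWRITE if the new value
	 * differs from vmx->vm_entry_controls_shadow. */
}
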
@@ -1410,11 +1468,11 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
        vmcs_write32(EXCEPTION_BITMAP, eb);
 }
 
-static void clear_atomic_switch_msr_special(unsigned long entry,
-               unsigned long exit)
+static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+               unsigned long entry, unsigned long exit)
 {
-       vmcs_clear_bits(VM_ENTRY_CONTROLS, entry);
-       vmcs_clear_bits(VM_EXIT_CONTROLS, exit);
+       vm_entry_controls_clearbit(vmx, entry);
+       vm_exit_controls_clearbit(vmx, exit);
 }
 
 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
@@ -1425,14 +1483,15 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
        switch (msr) {
        case MSR_EFER:
                if (cpu_has_load_ia32_efer) {
-                       clear_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER,
+                       clear_atomic_switch_msr_special(vmx,
+                                       VM_ENTRY_LOAD_IA32_EFER,
                                        VM_EXIT_LOAD_IA32_EFER);
                        return;
                }
                break;
        case MSR_CORE_PERF_GLOBAL_CTRL:
                if (cpu_has_load_perf_global_ctrl) {
-                       clear_atomic_switch_msr_special(
+                       clear_atomic_switch_msr_special(vmx,
                                        VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
                                        VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
                        return;
@@ -1453,14 +1512,15 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
 }
 
-static void add_atomic_switch_msr_special(unsigned long entry,
-               unsigned long exit, unsigned long guest_val_vmcs,
-               unsigned long host_val_vmcs, u64 guest_val, u64 host_val)
+static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+               unsigned long entry, unsigned long exit,
+               unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
+               u64 guest_val, u64 host_val)
 {
        vmcs_write64(guest_val_vmcs, guest_val);
        vmcs_write64(host_val_vmcs, host_val);
-       vmcs_set_bits(VM_ENTRY_CONTROLS, entry);
-       vmcs_set_bits(VM_EXIT_CONTROLS, exit);
+       vm_entry_controls_setbit(vmx, entry);
+       vm_exit_controls_setbit(vmx, exit);
 }
 
 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
@@ -1472,7 +1532,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
        switch (msr) {
        case MSR_EFER:
                if (cpu_has_load_ia32_efer) {
-                       add_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER,
+                       add_atomic_switch_msr_special(vmx,
+                                       VM_ENTRY_LOAD_IA32_EFER,
                                        VM_EXIT_LOAD_IA32_EFER,
                                        GUEST_IA32_EFER,
                                        HOST_IA32_EFER,
@@ -1482,7 +1543,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
                break;
        case MSR_CORE_PERF_GLOBAL_CTRL:
                if (cpu_has_load_perf_global_ctrl) {
-                       add_atomic_switch_msr_special(
+                       add_atomic_switch_msr_special(vmx,
                                        VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
                                        VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
                                        GUEST_IA32_PERF_GLOBAL_CTRL,
@@ -1498,7 +1559,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
                        break;
 
        if (i == NR_AUTOLOAD_MSRS) {
-               printk_once(KERN_WARNING"Not enough mst switch entries. "
+               printk_once(KERN_WARNING "Not enough msr switch entries. "
                                "Can't add msr %x\n", msr);
                return;
        } else if (i == m->nr) {
@@ -1898,16 +1959,12 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 /*
  * KVM wants to inject page-faults which it got to the guest. This function
  * checks whether in a nested guest, we need to inject them to L1 or L2.
- * This function assumes it is called with the exit reason in vmcs02 being
- * a #PF exception (this is the only case in which KVM injects a #PF when L2
- * is running).
  */
-static int nested_pf_handled(struct kvm_vcpu *vcpu)
+static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
-       /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
-       if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
+       if (!(vmcs12->exception_bitmap & (1u << nr)))
                return 0;
 
        nested_vmx_vmexit(vcpu);
@@ -1921,8 +1978,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
-       if (nr == PF_VECTOR && is_guest_mode(vcpu) &&
-           !vmx->nested.nested_run_pending && nested_pf_handled(vcpu))
+       if (!reinject && is_guest_mode(vcpu) &&
+           nested_vmx_check_exception(vcpu, nr))
                return;
 
        if (has_error_code) {
@@ -2204,9 +2261,15 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #ifdef CONFIG_X86_64
                VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
-               VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+               VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+               VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+       if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) ||
+           !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
+               nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+               nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       }
        nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
-                                     VM_EXIT_LOAD_IA32_EFER);
+               VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER);
 
        /* entry controls */
        rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2226,7 +2289,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
                nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
        nested_vmx_procbased_ctls_low = 0;
        nested_vmx_procbased_ctls_high &=
-               CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING |
+               CPU_BASED_VIRTUAL_INTR_PENDING |
+               CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
                CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
                CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
                CPU_BASED_CR3_STORE_EXITING |
@@ -2252,13 +2316,15 @@ static __init void nested_vmx_setup_ctls_msrs(void)
        nested_vmx_secondary_ctls_low = 0;
        nested_vmx_secondary_ctls_high &=
                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+               SECONDARY_EXEC_UNRESTRICTED_GUEST |
                SECONDARY_EXEC_WBINVD_EXITING;
 
        if (enable_ept) {
                /* nested EPT: emulate EPT also to L1 */
                nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
                nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
-                        VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
+                        VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
+                        VMX_EPT_INVEPT_BIT;
                nested_vmx_ept_caps &= vmx_capability.ept;
                /*
                 * Since invept is completely emulated we support both global
@@ -2274,6 +2340,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
        rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
        nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
                VMX_MISC_SAVE_EFER_LMA;
+       nested_vmx_misc_low |= VMX_MISC_ACTIVITY_HLT;
        nested_vmx_misc_high = 0;
 }
 
@@ -3177,14 +3244,10 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
        vmx_load_host_state(to_vmx(vcpu));
        vcpu->arch.efer = efer;
        if (efer & EFER_LMA) {
-               vmcs_write32(VM_ENTRY_CONTROLS,
-                            vmcs_read32(VM_ENTRY_CONTROLS) |
-                            VM_ENTRY_IA32E_MODE);
+               vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
                msr->data = efer;
        } else {
-               vmcs_write32(VM_ENTRY_CONTROLS,
-                            vmcs_read32(VM_ENTRY_CONTROLS) &
-                            ~VM_ENTRY_IA32E_MODE);
+               vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
 
                msr->data = efer & ~EFER_LME;
        }
@@ -3212,9 +3275,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
 
 static void exit_lmode(struct kvm_vcpu *vcpu)
 {
-       vmcs_write32(VM_ENTRY_CONTROLS,
-                    vmcs_read32(VM_ENTRY_CONTROLS)
-                    & ~VM_ENTRY_IA32E_MODE);
+       vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
        vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
 }
 
@@ -3380,8 +3441,10 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
        if (enable_ept) {
                eptp = construct_eptp(cr3);
                vmcs_write64(EPT_POINTER, eptp);
-               guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
-                       vcpu->kvm->arch.ept_identity_map_addr;
+               if (is_paging(vcpu) || is_guest_mode(vcpu))
+                       guest_cr3 = kvm_read_cr3(vcpu);
+               else
+                       guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr;
                ept_load_pdptrs(vcpu);
        }
 
@@ -4339,10 +4402,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
                ++vmx->nmsrs;
        }
 
-       vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+       vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
 
        /* 22.2.1, 20.8.1 */
-       vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
+       vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
 
        vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
        set_cr4_guest_host_mask(vmx);
@@ -4879,6 +4943,17 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
        hypercall[2] = 0xc1;
 }
 
+static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val)
+{
+       unsigned long always_on = VMXON_CR0_ALWAYSON;
+
+       if (nested_vmx_secondary_ctls_high &
+               SECONDARY_EXEC_UNRESTRICTED_GUEST &&
+           nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
+               always_on &= ~(X86_CR0_PE | X86_CR0_PG);
+       return (val & always_on) == always_on;
+}
+
 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
 {
@@ -4897,9 +4972,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
                val = (val & ~vmcs12->cr0_guest_host_mask) |
                        (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
 
-               /* TODO: will have to take unrestricted guest mode into
-                * account */
-               if ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)
+               if (!nested_cr0_valid(vmcs12, val))
                        return 1;
 
                if (kvm_set_cr0(vcpu, val))
@@ -5064,10 +5137,14 @@ static int handle_dr(struct kvm_vcpu *vcpu)
        reg = DEBUG_REG_ACCESS_REG(exit_qualification);
        if (exit_qualification & TYPE_MOV_FROM_DR) {
                unsigned long val;
-               if (!kvm_get_dr(vcpu, dr, &val))
-                       kvm_register_write(vcpu, reg, val);
+
+               if (kvm_get_dr(vcpu, dr, &val))
+                       return 1;
+               kvm_register_write(vcpu, reg, val);
        } else
-               kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
+               if (kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]))
+                       return 1;
+
        skip_emulated_instruction(vcpu);
        return 1;
 }
@@ -6444,11 +6521,8 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
        int size;
        u8 b;
 
-       if (nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING))
-               return 1;
-
        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
-               return 0;
+               return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
 
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
@@ -6627,6 +6701,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                        return 0;
                else if (is_page_fault(intr_info))
                        return enable_ept;
+               else if (is_no_device(intr_info) &&
+                        !(nested_read_cr0(vmcs12) & X86_CR0_TS))
+                       return 0;
                return vmcs12->exception_bitmap &
                                (1u << (intr_info & INTR_INFO_VECTOR_MASK));
        case EXIT_REASON_EXTERNAL_INTERRUPT:
@@ -6722,6 +6799,27 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
        *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
 }
 
+static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
+{
+       u64 delta_tsc_l1;
+       u32 preempt_val_l1, preempt_val_l2, preempt_scale;
+
+       if (!(get_vmcs12(vcpu)->pin_based_vm_exec_control &
+                       PIN_BASED_VMX_PREEMPTION_TIMER))
+               return;
+       preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
+                       MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
+       preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
+       delta_tsc_l1 = vmx_read_l1_tsc(vcpu, native_read_tsc())
+               - vcpu->arch.last_guest_tsc;
+       preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
+       if (preempt_val_l2 <= preempt_val_l1)
+               preempt_val_l2 = 0;
+       else
+               preempt_val_l2 -= preempt_val_l1;
+       vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
+}
+
 /*
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
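
On nested_adjust_preemption_timer() in the hunk above: the VMX preemption
timer ticks roughly once every 2^scale TSC cycles, where scale comes from the
low bits of MSR_IA32_VMX_MISC, so the function converts the TSC time that
elapsed since L2 last ran into timer ticks and subtracts it from L2's
remaining count, clamping at zero.  A hypothetical standalone version of that
arithmetic, with made-up numbers in the comment (not part of the patch):

/* Hypothetical helper showing the same computation in isolation.
 * Example: scale = 5 -> one tick per 2^5 = 32 TSC cycles; if 64000 TSC
 * cycles passed outside L2, that is 64000 >> 5 = 2000 ticks.  With 1500
 * ticks left the result is 0 (no time left for L2); with 2500 ticks left
 * the result is 500. */
static u32 example_remaining_l2_ticks(u32 l2_ticks, u64 delta_tsc, u32 scale)
{
	u32 l1_ticks = delta_tsc >> scale;

	return l2_ticks <= l1_ticks ? 0 : l2_ticks - l1_ticks;
}
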
@@ -6736,20 +6834,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
        if (vmx->emulation_required)
                return handle_invalid_guest_state(vcpu);
 
-       /*
-        * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
-        * we did not inject a still-pending event to L1 now because of
-        * nested_run_pending, we need to re-enable this bit.
-        */
-       if (vmx->nested.nested_run_pending)
-               kvm_make_request(KVM_REQ_EVENT, vcpu);
-
-       if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH ||
-           exit_reason == EXIT_REASON_VMRESUME))
-               vmx->nested.nested_run_pending = 1;
-       else
-               vmx->nested.nested_run_pending = 0;
-
        if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
                nested_vmx_vmexit(vcpu);
                return 1;
@@ -7061,9 +7145,9 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
        case INTR_TYPE_HARD_EXCEPTION:
                if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
                        u32 err = vmcs_read32(error_code_field);
-                       kvm_queue_exception_e(vcpu, vector, err);
+                       kvm_requeue_exception_e(vcpu, vector, err);
                } else
-                       kvm_queue_exception(vcpu, vector);
+                       kvm_requeue_exception(vcpu, vector);
                break;
        case INTR_TYPE_SOFT_INTR:
                vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
@@ -7146,6 +7230,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        atomic_switch_perf_msrs(vmx);
        debugctlmsr = get_debugctlmsr();
 
+       if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending)
+               nested_adjust_preemption_timer(vcpu);
        vmx->__launched = vmx->loaded_vmcs->launched;
        asm(
                /* Store host registers */
@@ -7284,6 +7370,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
        trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
 
+       /*
+        * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
+        * we did not inject a still-pending event to L1 now because of
+        * nested_run_pending, we need to re-enable this bit.
+        */
+       if (vmx->nested.nested_run_pending)
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+       vmx->nested.nested_run_pending = 0;
+
        vmx_complete_atomic_exit(vmx);
        vmx_recover_nmi_blocking(vmx);
        vmx_complete_interrupts(vmx);
@@ -7294,8 +7390,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
        free_vpid(vmx);
-       free_nested(vmx);
        free_loaded_vmcs(vmx->loaded_vmcs);
+       free_nested(vmx);
        kfree(vmx->guest_msrs);
        kvm_vcpu_uninit(vcpu);
        kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -7410,8 +7506,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
         */
        if (is_mmio)
                ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
-       else if (vcpu->kvm->arch.iommu_domain &&
-               !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
+       else if (kvm_arch_has_noncoherent_dma(vcpu->kvm))
                ret = kvm_get_guest_memory_type(vcpu, gfn) <<
                      VMX_EPT_MT_EPTE_SHIFT;
        else
@@ -7501,9 +7596,9 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
        return get_vmcs12(vcpu)->ept_pointer;
 }
 
-static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 {
-       int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
+       kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
                        nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
 
        vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
@@ -7511,8 +7606,6 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
        vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
 
        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
-
-       return r;
 }
 
 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
@@ -7520,6 +7613,20 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
        vcpu->arch.walk_mmu = &vcpu->arch.mmu;
 }
 
+static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
+               struct x86_exception *fault)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+       WARN_ON(!is_guest_mode(vcpu));
+
+       /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
+       if (vmcs12->exception_bitmap & (1u << PF_VECTOR))
+               nested_vmx_vmexit(vcpu);
+       else
+               kvm_inject_page_fault(vcpu, fault);
+}
+
 /*
  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -7533,6 +7640,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 exec_control;
+       u32 exit_control;
 
        vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
        vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -7656,6 +7764,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                        else
                                vmcs_write64(APIC_ACCESS_ADDR,
                                  page_to_phys(vmx->nested.apic_access_page));
+               } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) {
+                       exec_control |=
+                               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+                       vmcs_write64(APIC_ACCESS_ADDR,
+                               page_to_phys(vcpu->kvm->arch.apic_access_page));
                }
 
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
@@ -7706,12 +7819,15 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
         * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
         * bits are further modified by vmx_set_efer() below.
         */
-       vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+       exit_control = vmcs_config.vmexit_ctrl;
+       if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
+               exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+       vm_exit_controls_init(vmx, exit_control);
 
        /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
         * emulated by vmx_set_efer(), below.
         */
-       vmcs_write32(VM_ENTRY_CONTROLS,
+       vm_entry_controls_init(vmx, 
                (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
                        ~VM_ENTRY_IA32E_MODE) |
                (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
@@ -7773,6 +7889,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        kvm_set_cr3(vcpu, vmcs12->guest_cr3);
        kvm_mmu_reset_context(vcpu);
 
+       if (!enable_ept)
+               vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
+
        /*
         * L1 may access the L2's PDPTR, so save them to construct vmcs12
         */
@@ -7826,7 +7945,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
                return 1;
        }
 
-       if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE) {
+       if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
+           vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) {
                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
                return 1;
        }
@@ -7876,7 +7996,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
                return 1;
        }
 
-       if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
+       if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) ||
            ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
                nested_vmx_entry_failure(vcpu, vmcs12,
                        EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
@@ -7938,6 +8058,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 
        enter_guest_mode(vcpu);
 
+       vmx->nested.nested_run_pending = 1;
+
        vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
 
        cpu = get_cpu();
@@ -7953,6 +8075,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 
        prepare_vmcs02(vcpu, vmcs12);
 
+       if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
+               return kvm_emulate_halt(vcpu);
+
        /*
         * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
         * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
@@ -8005,7 +8130,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
        u32 idt_vectoring;
        unsigned int nr;
 
-       if (vcpu->arch.exception.pending) {
+       if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) {
                nr = vcpu->arch.exception.nr;
                idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
 
@@ -8023,7 +8148,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
                }
 
                vmcs12->idt_vectoring_info_field = idt_vectoring;
-       } else if (vcpu->arch.nmi_pending) {
+       } else if (vcpu->arch.nmi_injected) {
                vmcs12->idt_vectoring_info_field =
                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
        } else if (vcpu->arch.interrupt.pending) {
@@ -8105,6 +8230,11 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        vmcs12->guest_pending_dbg_exceptions =
                vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 
+       if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
+           (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
+               vmcs12->vmx_preemption_timer_value =
+                       vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
+
        /*
         * In some cases (usually, nested EPT), L2 is allowed to change its
         * own CR3 without exiting. If it has changed it, we must keep it.
@@ -8123,13 +8253,15 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
        vmcs12->vm_entry_controls =
                (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
-               (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
+               (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
 
        /* TODO: These cannot have changed unless we have MSR bitmaps and
         * the relevant bit asks not to trap the change */
        vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
        if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
                vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+       if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
+               vmcs12->guest_ia32_efer = vcpu->arch.efer;
        vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
        vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
        vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
@@ -8201,7 +8333,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
         * fpu_active (which may have changed).
         * Note that vmx_set_cr0 refers to efer set above.
         */
-       kvm_set_cr0(vcpu, vmcs12->host_cr0);
+       vmx_set_cr0(vcpu, vmcs12->host_cr0);
        /*
         * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
         * to apply the same changes to L1's vmcs. We just set cr0 correctly,
@@ -8224,6 +8356,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
        kvm_set_cr3(vcpu, vmcs12->host_cr3);
        kvm_mmu_reset_context(vcpu);
 
+       if (!enable_ept)
+               vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
+
        if (enable_vpid) {
                /*
                 * Trivially support vpid by letting L2s share their parent
@@ -8322,6 +8457,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
        vcpu->cpu = cpu;
        put_cpu();
 
+       vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
+       vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
        vmx_segment_cache_clear(vmx);
 
        /* if no vmcs02 cache requested, remove the one we used */