kvm: nVMX: Allow L1 to intercept software exceptions (#BP and #OF)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 579a0b5..0fb33a0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -390,6 +390,7 @@ struct vcpu_vmx {
                u16           fs_sel, gs_sel, ldt_sel;
                int           gs_ldt_reload_needed;
                int           fs_reload_needed;
+               unsigned long vmcs_host_cr4;    /* May not match real cr4 */
        } host_state;
        struct {
                int vm86_active;
@@ -875,10 +876,10 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
        return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
-static inline bool is_exception(u32 intr_info)
+static inline bool is_nmi(u32 intr_info)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
-               == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+               == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
 }
 
 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
@@ -1170,7 +1171,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
        u32 eb;
 
        eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
-            (1u << NM_VECTOR) | (1u << DB_VECTOR);
+            (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR);
        if ((vcpu->guest_debug &
             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
            (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -1456,7 +1457,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 #ifdef CONFIG_X86_64
        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 #endif
-       if (current_thread_info()->status & TS_USEDFPU)
+       if (__thread_has_fpu(current))
                clts();
        load_gdt(&__get_cpu_var(host_gdt));
 }
@@ -1677,7 +1678,7 @@ static int nested_pf_handled(struct kvm_vcpu *vcpu)
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
        /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
-       if (!(vmcs12->exception_bitmap & PF_VECTOR))
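+       /* exception_bitmap has one bit per vector, so test bit PF_VECTOR, not the raw vector number. */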
+       if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
                return 0;
 
        nested_vmx_vmexit(vcpu);
@@ -1956,6 +1957,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #endif
                CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
                CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
+               CPU_BASED_RDPMC_EXITING |
                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
        /*
         * We can allow some features even when not supported by the
@@ -3030,8 +3032,16 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-       unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
-                   KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+       /*
+        * Pass through host's Machine Check Enable value to hw_cr4, which
+        * is in force while we are in guest mode.  Do not let guests control
+        * this bit, even if host CR4.MCE == 0.
+        */
+       unsigned long hw_cr4 =
+               (read_cr4() & X86_CR4_MCE) |
+               (cr4 & ~X86_CR4_MCE) |
+               (to_vmx(vcpu)->rmode.vm86_active ?
+                KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
 
        if (cr4 & X86_CR4_VMXE) {
                /*
@@ -3628,16 +3638,21 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
  * Note that host-state that does change is set elsewhere. E.g., host-state
  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
  */
-static void vmx_set_constant_host_state(void)
+static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 {
        u32 low32, high32;
        unsigned long tmpl;
        struct desc_ptr dt;
+       unsigned long cr4;
 
        vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS);  /* 22.2.3 */
-       vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
        vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
 
+       /* Save the most likely value for this task's CR4 in the VMCS. */
+       cr4 = read_cr4();
+       vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
+       vmx->host_state.vmcs_host_cr4 = cr4;
+
        vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
        vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
        vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
@@ -3759,7 +3774,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
        vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
        vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
-       vmx_set_constant_host_state();
+       vmx_set_constant_host_state(vmx);
 #ifdef CONFIG_X86_64
        rdmsrl(MSR_FS_BASE, a);
        vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
@@ -3915,7 +3930,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 
        vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
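+       /*
+        * vmx_set_cr0() can end up reading the memslots, so it must run under
+        * the SRCU read lock.
+        */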
+       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
        vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
+       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
        vmx_set_cr4(&vmx->vcpu, 0);
        vmx_set_efer(&vmx->vcpu, 0);
        vmx_fpu_activate(&vmx->vcpu);
@@ -4208,7 +4225,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
                return 0;
        }
 
-       if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
+       if (is_nmi(intr_info))
                return 1;  /* already handled by vmx_vcpu_run() */
 
        if (is_no_device(intr_info)) {
@@ -4249,6 +4266,9 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 
        ex_no = intr_info & INTR_INFO_VECTOR_MASK;
        switch (ex_no) {
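+       /* Alignment-check faults are simply reflected back into the guest. */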
+       case AC_VECTOR:
+               kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
+               return 1;
        case DB_VECTOR:
                dr6 = vmcs_readl(EXIT_QUALIFICATION);
                if (!(vcpu->guest_debug &
@@ -4541,7 +4561,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
        u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
                | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
 
-       if (vmx_set_msr(vcpu, ecx, data) != 0) {
+       if (kvm_set_msr(vcpu, ecx, data) != 0) {
                trace_kvm_msr_write_ex(ecx, data);
                kvm_inject_gp(vcpu, 0);
                return 1;
@@ -4879,6 +4899,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
                if (err != EMULATE_DONE)
                        return 0;
 
+               if (vcpu->arch.halt_request) {
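+                       /*
+                        * The instruction emulator signals a HLT by setting
+                        * halt_request rather than halting the vcpu itself.
+                        */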
+                       vcpu->arch.halt_request = 0;
+                       ret = kvm_emulate_halt(vcpu);
+                       goto out;
+               }
+
                if (signal_pending(current))
                        goto out;
                if (need_resched())
@@ -4973,22 +4999,27 @@ static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
 
 /*
  * Free all VMCSs saved for this vcpu, except the one pointed by
- * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one
- * currently used, if running L2), and vmcs01 when running L2.
+ * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
+ * must be &vmx->vmcs01.
  */
 static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
 {
        struct vmcs02_list *item, *n;
+
+       WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
        list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
-               if (vmx->loaded_vmcs != &item->vmcs02)
-                       free_loaded_vmcs(&item->vmcs02);
+               /*
+                * Something will leak if the above WARN triggers.  Better than
+                * a use-after-free.
+                */
+               if (vmx->loaded_vmcs == &item->vmcs02)
+                       continue;
+
+               free_loaded_vmcs(&item->vmcs02);
                list_del(&item->list);
                kfree(item);
+               vmx->nested.vmcs02_num--;
        }
-       vmx->nested.vmcs02_num = 0;
-
-       if (vmx->loaded_vmcs != &vmx->vmcs01)
-               free_loaded_vmcs(&vmx->vmcs01);
 }
 
 /*
@@ -5541,6 +5572,18 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
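+       /* Nested EPT is not exposed to the guest, so INVEPT raises #UD. */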
+       kvm_queue_exception(vcpu, UD_VECTOR);
+       return 1;
+}
+
+static int handle_invvpid(struct kvm_vcpu *vcpu)
+{
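+       /* Likewise, INVVPID is not exposed to the guest and raises #UD. */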
+       kvm_queue_exception(vcpu, UD_VECTOR);
+       return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -5582,6 +5625,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
        [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_invalid_op,
        [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
+       [EXIT_REASON_INVEPT]                  = handle_invept,
+       [EXIT_REASON_INVVPID]                 = handle_invvpid,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -5728,7 +5773,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 
        switch (exit_reason) {
        case EXIT_REASON_EXCEPTION_NMI:
-               if (!is_exception(intr_info))
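+               /*
+                * Only NMIs stay with L0.  Any other exception, including the
+                * software exceptions #BP and #OF, may now be reflected to L1
+                * according to the checks below.
+                */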
+               if (is_nmi(intr_info))
                        return 0;
                else if (is_page_fault(intr_info))
                        return enable_ept;
@@ -5766,6 +5811,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
        case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
        case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
        case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+       case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
                /*
                 * VMX instructions trap unconditionally. This allows L1 to
                 * emulate them for its L2 guest, i.e., allows 3-level nesting!
@@ -5895,10 +5941,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
            && kvm_vmx_exit_handlers[exit_reason])
                return kvm_vmx_exit_handlers[exit_reason](vcpu);
        else {
-               vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
-               vcpu->run->hw.hardware_exit_reason = exit_reason;
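+               /*
+                * No handler for this exit reason: warn once and inject #UD
+                * into the guest instead of returning KVM_EXIT_UNKNOWN to
+                * userspace.
+                */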
+               WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason);
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
        }
-       return 0;
 }
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
@@ -5927,8 +5973,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
                kvm_machine_check();
 
        /* We need to handle NMIs before interrupts are enabled */
-       if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
-           (exit_intr_info & INTR_INFO_VALID_MASK)) {
+       if (is_nmi(exit_intr_info)) {
                kvm_before_handle_nmi(&vmx->vcpu);
                asm("int $2");
                kvm_after_handle_nmi(&vmx->vcpu);
@@ -6084,6 +6129,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long cr4;
 
        if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
                struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -6114,6 +6160,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
                vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
 
+       cr4 = read_cr4();
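+       /*
+        * Host CR4 is not constant (for example, CR4.TSD and CR4.PCE can
+        * differ per task), so resync HOST_CR4 when the cached value is stale.
+        */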
+       if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
+               vmcs_writel(HOST_CR4, cr4);
+               vmx->host_state.vmcs_host_cr4 = cr4;
+       }
+
        /* When single-stepping over STI and MOV SS, we must clear the
         * corresponding interruptibility bits in the guest state. Otherwise
         * vmentry fails as it then expects bit 14 (BS) in pending debug
@@ -6259,12 +6311,43 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 #undef R
 #undef Q
 
+static void vmx_load_vmcs01(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int cpu;
+
+       if (vmx->loaded_vmcs == &vmx->vmcs01)
+               return;
+
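+       /*
+        * A full vcpu_put/vcpu_load cycle against vmcs01 re-establishes the
+        * per-cpu VMCS state on this CPU.
+        */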
+       cpu = get_cpu();
+       vmx->loaded_vmcs = &vmx->vmcs01;
+       vmx_vcpu_put(vcpu);
+       vmx_vcpu_load(vcpu, cpu);
+       vcpu->cpu = cpu;
+       put_cpu();
+}
+
+/*
+ * Ensure that the current vmcs of the logical processor is the
+ * vmcs01 of the vcpu before calling free_nested().
+ */
+static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       vcpu_load(vcpu);
+       vmx_load_vmcs01(vcpu);
+       free_nested(vmx);
+       vcpu_put(vcpu);
+}
+
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
        free_vpid(vmx);
-       free_nested(vmx);
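+       /* Switch back to vmcs01 and free all nested state before releasing the vcpu's own VMCS. */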
+       leave_guest_mode(vcpu);
+       vmx_free_vcpu_nested(vcpu);
        free_loaded_vmcs(vmx->loaded_vmcs);
        kfree(vmx->guest_msrs);
        kvm_vcpu_uninit(vcpu);
@@ -6572,7 +6655,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
         * Other fields are different per CPU, and will be set later when
         * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
         */
-       vmx_set_constant_host_state();
+       vmx_set_constant_host_state(vmx);
 
        /*
         * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
@@ -7011,18 +7094,12 @@ void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       int cpu;
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
        leave_guest_mode(vcpu);
        prepare_vmcs12(vcpu, vmcs12);
 
-       cpu = get_cpu();
-       vmx->loaded_vmcs = &vmx->vmcs01;
-       vmx_vcpu_put(vcpu);
-       vmx_vcpu_load(vcpu, cpu);
-       vcpu->cpu = cpu;
-       put_cpu();
+       vmx_load_vmcs01(vcpu);
 
        /* if no vmcs02 cache requested, remove the one we used */
        if (VMCS02_POOL_SIZE == 0)