kvm: nVMX: Allow L1 to intercept software exceptions (#BP and #OF)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 579a0b5..0fb33a0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -390,6 +390,7 @@ struct vcpu_vmx {
                u16           fs_sel, gs_sel, ldt_sel;
                int           gs_ldt_reload_needed;
                int           fs_reload_needed;
+               unsigned long vmcs_host_cr4;    /* May not match real cr4 */
        } host_state;
        struct {
                int vm86_active;
@@ -875,10 +876,10 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
        return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
-static inline bool is_exception(u32 intr_info)
+static inline bool is_nmi(u32 intr_info)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
-               == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+               == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
 }
 
 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
@@ -1170,7 +1171,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
        u32 eb;
 
        eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
-            (1u << NM_VECTOR) | (1u << DB_VECTOR);
+            (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR);
        if ((vcpu->guest_debug &
             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
            (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -1456,7 +1457,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 #ifdef CONFIG_X86_64
        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 #endif
-       if (current_thread_info()->status & TS_USEDFPU)
+       if (__thread_has_fpu(current))
                clts();
        load_gdt(&__get_cpu_var(host_gdt));
 }
@@ -1677,7 +1678,7 @@ static int nested_pf_handled(struct kvm_vcpu *vcpu)
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
        /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
-       if (!(vmcs12->exception_bitmap & PF_VECTOR))
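+       /* exception_bitmap has one bit per vector, so test bit PF_VECTOR, not the raw vector number. */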
+       if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
                return 0;
 
        nested_vmx_vmexit(vcpu);
@@ -1956,6 +1957,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #endif
                CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
                CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
+               CPU_BASED_RDPMC_EXITING |
                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
        /*
         * We can allow some features even when not supported by the
@@ -3030,8 +3032,16 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-       unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
-                   KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+       /*
+        * Pass through host's Machine Check Enable value to hw_cr4, which
+        * is in force while we are in guest mode.  Do not let guests control
+        * this bit, even if host CR4.MCE == 0.
+        */
+       unsigned long hw_cr4 =
+               (read_cr4() & X86_CR4_MCE) |
+               (cr4 & ~X86_CR4_MCE) |
+               (to_vmx(vcpu)->rmode.vm86_active ?
+                KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
 
        if (cr4 & X86_CR4_VMXE) {
                /*
@@ -3628,16 +3638,21 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
  * Note that host-state that does change is set elsewhere. E.g., host-state
  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
  */
-static void vmx_set_constant_host_state(void)
+static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 {
        u32 low32, high32;
        unsigned long tmpl;
        struct desc_ptr dt;
+       unsigned long cr4;
 
        vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS);  /* 22.2.3 */
-       vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
        vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
 
+       /* Save the most likely value for this task's CR4 in the VMCS. */
+       cr4 = read_cr4();
+       vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
+       vmx->host_state.vmcs_host_cr4 = cr4;
+
        vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
        vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
        vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
@@ -3759,7 +3774,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
        vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
        vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
-       vmx_set_constant_host_state();
+       vmx_set_constant_host_state(vmx);
 #ifdef CONFIG_X86_64
        rdmsrl(MSR_FS_BASE, a);
        vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
@@ -3915,7 +3930,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 
        vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
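+       /*
+        * vmx_set_cr0() can end up reading the memslots, so it must run under
+        * the SRCU read lock.
+        */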
+       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
        vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
+       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
        vmx_set_cr4(&vmx->vcpu, 0);
        vmx_set_efer(&vmx->vcpu, 0);
        vmx_fpu_activate(&vmx->vcpu);
@@ -4208,7 +4225,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
                return 0;
        }
 
-       if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
+       if (is_nmi(intr_info))
                return 1;  /* already handled by vmx_vcpu_run() */
 
        if (is_no_device(intr_info)) {
@@ -4249,6 +4266,9 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 
        ex_no = intr_info & INTR_INFO_VECTOR_MASK;
        switch (ex_no) {
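+       /* Alignment-check faults are simply reflected back into the guest. */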
+       case AC_VECTOR:
+               kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
+               return 1;
        case DB_VECTOR:
                dr6 = vmcs_readl(EXIT_QUALIFICATION);
                if (!(vcpu->guest_debug &
@@ -4541,7 +4561,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
        u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
                | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
 
-       if (vmx_set_msr(vcpu, ecx, data) != 0) {
+       if (kvm_set_msr(vcpu, ecx, data) != 0) {
                trace_kvm_msr_write_ex(ecx, data);
                kvm_inject_gp(vcpu, 0);
                return 1;
@@ -4879,6 +4899,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
                if (err != EMULATE_DONE)
                        return 0;
 
+               if (vcpu->arch.halt_request) {
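+                       /*
+                        * The instruction emulator signals a HLT by setting
+                        * halt_request rather than halting the vcpu itself.
+                        */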
+                       vcpu->arch.halt_request = 0;
+                       ret = kvm_emulate_halt(vcpu);
+                       goto out;
+               }
+
                if (signal_pending(current))
                        goto out;
                if (need_resched())
@@ -4973,22 +4999,27 @@ static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
 
 /*
  * Free all VMCSs saved for this vcpu, except the one pointed by
- * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one
- * currently used, if running L2), and vmcs01 when running L2.
+ * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
+ * must be &vmx->vmcs01.
  */
 static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
 {
        struct vmcs02_list *item, *n;
+
+       WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
        list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
-               if (vmx->loaded_vmcs != &item->vmcs02)
-                       free_loaded_vmcs(&item->vmcs02);
+               /*
+                * Something will leak if the above WARN triggers.  Better than
+                * a use-after-free.
+                */
+               if (vmx->loaded_vmcs == &item->vmcs02)
+                       continue;
+
+               free_loaded_vmcs(&item->vmcs02);
                list_del(&item->list);
                kfree(item);
+               vmx->nested.vmcs02_num--;
        }
-       vmx->nested.vmcs02_num = 0;
-
-       if (vmx->loaded_vmcs != &vmx->vmcs01)
-               free_loaded_vmcs(&vmx->vmcs01);
 }
 
 /*
@@ -5541,6 +5572,18 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
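+       /* Nested EPT is not exposed to the guest, so INVEPT raises #UD. */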
+       kvm_queue_exception(vcpu, UD_VECTOR);
+       return 1;
+}
+
+static int handle_invvpid(struct kvm_vcpu *vcpu)
+{
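+       /* Likewise, INVVPID is not exposed to the guest and raises #UD. */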
+       kvm_queue_exception(vcpu, UD_VECTOR);
+       return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -5582,6 +5625,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
        [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_invalid_op,
        [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
+       [EXIT_REASON_INVEPT]                  = handle_invept,
+       [EXIT_REASON_INVVPID]                 = handle_invvpid,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -5728,7 +5773,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 
        switch (exit_reason) {
        case EXIT_REASON_EXCEPTION_NMI:
-               if (!is_exception(intr_info))
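+               /*
+                * Only NMIs stay with L0.  Any other exception, including the
+                * software exceptions #BP and #OF, may now be reflected to L1
+                * according to the checks below.
+                */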
+               if (is_nmi(intr_info))
                        return 0;
                else if (is_page_fault(intr_info))
                        return enable_ept;
@@ -5766,6 +5811,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
        case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
        case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
        case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+       case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
                /*
                 * VMX instructions trap unconditionally. This allows L1 to
                 * emulate them for its L2 guest, i.e., allows 3-level nesting!
@@ -5895,10 +5941,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
            && kvm_vmx_exit_handlers[exit_reason])
                return kvm_vmx_exit_handlers[exit_reason](vcpu);
        else {
-               vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
-               vcpu->run->hw.hardware_exit_reason = exit_reason;
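+               /*
+                * No handler for this exit reason: warn once and inject #UD
+                * into the guest instead of returning KVM_EXIT_UNKNOWN to
+                * userspace.
+                */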
+               WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason);
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
        }
-       return 0;
 }
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
@@ -5927,8 +5973,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
                kvm_machine_check();
 
        /* We need to handle NMIs before interrupts are enabled */
-       if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
-           (exit_intr_info & INTR_INFO_VALID_MASK)) {
+       if (is_nmi(exit_intr_info)) {
                kvm_before_handle_nmi(&vmx->vcpu);
                asm("int $2");
                kvm_after_handle_nmi(&vmx->vcpu);
@@ -6084,6 +6129,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long cr4;
 
        if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
                struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -6114,6 +6160,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
                vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
 
+       cr4 = read_cr4();
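+       /*
+        * Host CR4 is not constant (for example, CR4.TSD and CR4.PCE can
+        * differ per task), so resync HOST_CR4 when the cached value is stale.
+        */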
+       if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
+               vmcs_writel(HOST_CR4, cr4);
+               vmx->host_state.vmcs_host_cr4 = cr4;
+       }
+
        /* When single-stepping over STI and MOV SS, we must clear the
         * corresponding interruptibility bits in the guest state. Otherwise
         * vmentry fails as it then expects bit 14 (BS) in pending debug
@@ -6259,12 +6311,43 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 #undef R
 #undef Q
 
+static void vmx_load_vmcs01(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int cpu;
+
+       if (vmx->loaded_vmcs == &vmx->vmcs01)
+               return;
+
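+       /*
+        * A full vcpu_put/vcpu_load cycle against vmcs01 re-establishes the
+        * per-cpu VMCS state on this CPU.
+        */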
+       cpu = get_cpu();
+       vmx->loaded_vmcs = &vmx->vmcs01;
+       vmx_vcpu_put(vcpu);
+       vmx_vcpu_load(vcpu, cpu);
+       vcpu->cpu = cpu;
+       put_cpu();
+}
+
+/*
+ * Ensure that the current vmcs of the logical processor is the
+ * vmcs01 of the vcpu before calling free_nested().
+ */
+static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       vcpu_load(vcpu);
+       vmx_load_vmcs01(vcpu);
+       free_nested(vmx);
+       vcpu_put(vcpu);
+}
+
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
        free_vpid(vmx);
-       free_nested(vmx);
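+       /* Switch back to vmcs01 and free all nested state before releasing the vcpu's own VMCS. */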
+       leave_guest_mode(vcpu);
+       vmx_free_vcpu_nested(vcpu);
        free_loaded_vmcs(vmx->loaded_vmcs);
        kfree(vmx->guest_msrs);
        kvm_vcpu_uninit(vcpu);
@@ -6572,7 +6655,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
         * Other fields are different per CPU, and will be set later when
         * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
         */
-       vmx_set_constant_host_state();
+       vmx_set_constant_host_state(vmx);
 
        /*
         * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
@@ -7011,18 +7094,12 @@ void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       int cpu;
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
        leave_guest_mode(vcpu);
        prepare_vmcs12(vcpu, vmcs12);
 
-       cpu = get_cpu();
-       vmx->loaded_vmcs = &vmx->vmcs01;
-       vmx_vcpu_put(vcpu);
-       vmx_vcpu_load(vcpu, cpu);
-       vcpu->cpu = cpu;
-       put_cpu();
+       vmx_load_vmcs01(vcpu);
 
        /* if no vmcs02 cache requested, remove the one we used */
        if (VMCS02_POOL_SIZE == 0)