Merge branch 'for-upstream' of git://openrisc.net/jonas/linux

[pandora-kernel.git] / arch / x86 / kvm / vmx.c
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c

index 7f62dc3..e65a158 100644 (file)
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -49,9 +49,6 @@
  MODULE_AUTHOR("Qumranet");
  MODULE_LICENSE("GPL");
  
-static int __read_mostly bypass_guest_pf = 1;
-module_param(bypass_guest_pf, bool, S_IRUGO);
-
  static int __read_mostly enable_vpid = 1;
  module_param_named(vpid, enable_vpid, bool, 0444);
  
@@ -959,7 +956,7 @@ static void vmcs_load(struct vmcs *vmcs)
                         : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
                         : "cc", "memory");
         if (error)
-               printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
+               printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
                        vmcs, phys_addr);
  }
  
@@ -1179,6 +1176,15 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
                 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
         if (vcpu->fpu_active)
                 eb &= ~(1u << NM_VECTOR);
+
+       /* When we are running a nested L2 guest and L1 specified for it a
+        * certain exception bitmap, we must trap the same exceptions and pass
+        * them to L1. When running L2, we will only handle the exceptions
+        * specified above if L1 did not want them.
+        */
+       if (is_guest_mode(vcpu))
+               eb |= get_vmcs12(vcpu)->exception_bitmap;
+
         vmcs_write32(EXCEPTION_BITMAP, eb);
  }
  
@@ -1473,6 +1479,9 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
         vmcs_writel(GUEST_CR0, cr0);
         update_exception_bitmap(vcpu);
         vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+       if (is_guest_mode(vcpu))
+               vcpu->arch.cr0_guest_owned_bits &=
+                       ~get_vmcs12(vcpu)->cr0_guest_host_mask;
         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
  }
  
@@ -1496,12 +1505,29 @@ static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
  
  static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
  {
+       /* Note that there is no vcpu->fpu_active = 0 here. The caller must
+        * set this *before* calling this function.
+        */
         vmx_decache_cr0_guest_bits(vcpu);
         vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
         update_exception_bitmap(vcpu);
         vcpu->arch.cr0_guest_owned_bits = 0;
         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
-       vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
+       if (is_guest_mode(vcpu)) {
+               /*
+                * L1's specified read shadow might not contain the TS bit,
+                * so now that we turned on shadowing of this bit, we need to
+                * set this bit of the shadow. Like in nested_vmx_run we need
+                * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
+                * up-to-date here because we just decached cr0.TS (and we'll
+                * only update vmcs12->guest_cr0 on nested exit).
+                */
+               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+               vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
+                       (vcpu->arch.cr0 & X86_CR0_TS);
+               vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
+       } else
+               vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
  }
  
  static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -1585,6 +1611,25 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
                 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
  }
  
+/*
+ * KVM wants to inject a page fault it got into the guest. While L2 runs,
+ * decide whether it goes to L1 or L2: if L1 set the #PF bit in vmcs12's
+ * exception bitmap, exit to L1 and return 1; otherwise return 0 so the
+ * caller injects into L2. Assumes the vmcs02 exit reason is a #PF exception
+ * (the only case in which KVM injects a #PF when L2 is running).
+ */
+static int nested_pf_handled(struct kvm_vcpu *vcpu)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+       /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
+       if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
+               return 0;
+
+       nested_vmx_vmexit(vcpu);
+       return 1;
+}
+
  static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
                                 bool has_error_code, u32 error_code,
                                 bool reinject)
@@ -1592,6 +1637,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
         struct vcpu_vmx *vmx = to_vmx(vcpu);
         u32 intr_info = nr | INTR_INFO_VALID_MASK;
  
+       if (nr == PF_VECTOR && is_guest_mode(vcpu) &&
+               nested_pf_handled(vcpu))
+               return;
+
         if (has_error_code) {
                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -1714,12 +1763,24 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
  static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
  {
         vmcs_write64(TSC_OFFSET, offset);
+       if (is_guest_mode(vcpu))
+               /*
+                * We're here if L1 chose not to trap the TSC MSR. Since
+                * prepare_vmcs12() does not copy tsc_offset, we need to also
+                * set the vmcs12 field here.
+                */
+               get_vmcs12(vcpu)->tsc_offset = offset -
+                       to_vmx(vcpu)->nested.vmcs01_tsc_offset;
  }
  
  static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
  {
         u64 offset = vmcs_read64(TSC_OFFSET);
         vmcs_write64(TSC_OFFSET, offset + adjustment);
+       if (is_guest_mode(vcpu)) {
+               /* Even when running L2, the adjustment needs to apply to L1 */
+               to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
+       }
  }
  
  static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
@@ -1790,6 +1851,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
  
         /* exit controls */
         nested_vmx_exit_ctls_low = 0;
+       /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
  #ifdef CONFIG_X86_64
         nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
  #else
@@ -3532,12 +3594,25 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
         return exec_control;
  }
  
+static void ept_set_mmio_spte_mask(void)
+{
+       /*
+        * EPT Misconfigurations can be generated if the value of bits 2:0
+        * of an EPT paging-structure entry is 110b (write/execute).
+        * Also, magic bits (0xffull << 49) are set to quickly identify mmio
+        * sptes.
+        */
+       kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
+}
+
  /*
   * Sets up the vmcs for emulated real mode.
   */
  static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
  {
+#ifdef CONFIG_X86_64
         unsigned long a;
+#endif
         int i;
  
         /* I/O */
@@ -3565,8 +3640,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
                 vmcs_write32(PLE_WINDOW, ple_window);
         }
  
-       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
-       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
  
         vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
@@ -3744,9 +3819,25 @@ out:
         return ret;
  }
  
+/*
+ * In nested virtualization, check if L1 asked to exit on external interrupts.
+ * For most existing hypervisors, this will always return true.
+ */
+static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
+{
+       return get_vmcs12(vcpu)->pin_based_vm_exec_control &
+               PIN_BASED_EXT_INTR_MASK;
+}
+
  static void enable_irq_window(struct kvm_vcpu *vcpu)
  {
         u32 cpu_based_vm_exec_control;
+       if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+               /* We can get here when nested_run_pending caused
+                * vmx_interrupt_allowed() to return false. In this case, do
+                * nothing - the interrupt will be injected later.
+                */
+               return;
  
         cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
         cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
@@ -3803,6 +3894,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
  {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
  
+       if (is_guest_mode(vcpu))
+               return;
+
         if (!cpu_has_virtual_nmis()) {
                 /*
                  * Tracking the NMI-blocked state in software is built upon
@@ -3869,6 +3963,17 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
  
  static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
  {
+       if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
+               struct vmcs12 *vmcs12;
+               if (to_vmx(vcpu)->nested.nested_run_pending)
+                       return 0;
+               nested_vmx_vmexit(vcpu);
+               vmcs12 = get_vmcs12(vcpu);
+               vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
+               vmcs12->vm_exit_intr_info = 0;
+               /* fall through to normal code, but now in L1, not L2 */
+       }
+
         return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
                 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                         (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
@@ -4110,6 +4215,58 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
         hypercall[2] = 0xc1;
  }
  
+/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
+static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
+{
+       if (to_vmx(vcpu)->nested.vmxon &&
+           ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
+               return 1;
+
+       if (is_guest_mode(vcpu)) {
+               /*
+                * We get here when L2 changed cr0 in a way that did not change
+                * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
+                * but did change L0 shadowed bits. This can currently happen
+                * with the TS bit: L0 may want to leave TS on (for lazy fpu
+                * loading) while pretending to allow the guest to change it.
+                */
+               if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) |
+                        (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits)))
+                       return 1;
+               vmcs_writel(CR0_READ_SHADOW, val);
+               return 0;
+       } else
+               return kvm_set_cr0(vcpu, val);
+}
+
+static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
+{
+       if (is_guest_mode(vcpu)) {
+               if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) |
+                        (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits)))
+                       return 1;
+               vmcs_writel(CR4_READ_SHADOW, val);
+               return 0;
+       } else
+               return kvm_set_cr4(vcpu, val);
+}
+
+/* called to set cr0 as appropriate for clts instruction exit. */
+static void handle_clts(struct kvm_vcpu *vcpu)
+{
+       if (is_guest_mode(vcpu)) {
+               /*
+                * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
+                * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
+                * just pretend it's off (also in arch.cr0 for fpu_activate).
+                */
+               vmcs_writel(CR0_READ_SHADOW,
+                       vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
+               vcpu->arch.cr0 &= ~X86_CR0_TS;
+       } else
+               vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+}
+
  static int handle_cr(struct kvm_vcpu *vcpu)
  {
         unsigned long exit_qualification, val;
@@ -4126,7 +4283,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
                 trace_kvm_cr_write(cr, val);
                 switch (cr) {
                 case 0:
-                       err = kvm_set_cr0(vcpu, val);
+                       err = handle_set_cr0(vcpu, val);
                         kvm_complete_insn_gp(vcpu, err);
                         return 1;
                 case 3:
@@ -4134,7 +4291,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
                         kvm_complete_insn_gp(vcpu, err);
                         return 1;
                 case 4:
-                       err = kvm_set_cr4(vcpu, val);
+                       err = handle_set_cr4(vcpu, val);
                         kvm_complete_insn_gp(vcpu, err);
                         return 1;
                 case 8: {
@@ -4152,7 +4309,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
                 };
                 break;
         case 2: /* clts */
-               vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+               handle_clts(vcpu);
                 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
                 skip_emulated_instruction(vcpu);
                 vmx_fpu_activate(vcpu);
@@ -4525,11 +4682,19 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
  static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
  {
         u64 sptes[4];
-       int nr_sptes, i;
+       int nr_sptes, i, ret;
         gpa_t gpa;
  
         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
  
+       ret = handle_mmio_page_fault_common(vcpu, gpa, true);
+       if (likely(ret == 1))
+               return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
+                                             EMULATE_DONE;
+       if (unlikely(!ret))
+               return 1;
+
+       /* This is a real EPT misconfiguration */
         printk(KERN_ERR "EPT: Misconfiguration.\n");
         printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
  
@@ -5537,8 +5702,16 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
         if (vmx->emulation_required && emulate_invalid_guest_state)
                 return handle_invalid_guest_state(vcpu);
  
-       if (exit_reason == EXIT_REASON_VMLAUNCH ||
-           exit_reason == EXIT_REASON_VMRESUME)
+       /*
+        * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
+        * we did not inject a still-pending event to L1 now because of
+        * nested_run_pending, we need to re-enable this bit.
+        */
+       if (vmx->nested.nested_run_pending)
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+       if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH ||
+           exit_reason == EXIT_REASON_VMRESUME))
                 vmx->nested.nested_run_pending = 1;
         else
                 vmx->nested.nested_run_pending = 0;
@@ -5735,6 +5908,8 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
  
  static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
  {
+       if (is_guest_mode(&vmx->vcpu))
+               return;
         __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
                                   VM_EXIT_INSTRUCTION_LEN,
                                   IDT_VECTORING_ERROR_CODE);
@@ -5742,6 +5917,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
  
  static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
  {
+       if (is_guest_mode(vcpu))
+               return;
         __vmx_complete_interrupts(to_vmx(vcpu),
                                   vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
                                   VM_ENTRY_INSTRUCTION_LEN,
@@ -5762,6 +5939,21 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
  {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
  
+       if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
+               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+               if (vmcs12->idt_vectoring_info_field &
+                               VECTORING_INFO_VALID_MASK) {
+                       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+                               vmcs12->idt_vectoring_info_field);
+                       vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+                               vmcs12->vm_exit_instruction_len);
+                       if (vmcs12->idt_vectoring_info_field &
+                                       VECTORING_INFO_DELIVER_CODE_MASK)
+                               vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+                                       vmcs12->idt_vectoring_error_code);
+               }
+       }
+
         /* Record the guest's net vcpu time for enforced NMI injections. */
         if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
                 vmx->entry_time = ktime_get();
@@ -5894,6 +6086,17 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
  
         vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
  
+       if (is_guest_mode(vcpu)) {
+               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+               vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
+               if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
+                       vmcs12->idt_vectoring_error_code =
+                               vmcs_read32(IDT_VECTORING_ERROR_CODE);
+                       vmcs12->vm_exit_instruction_len =
+                               vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+               }
+       }
+
         asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
         vmx->loaded_vmcs->launched = 1;
  
@@ -6114,6 +6317,8 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
  
  static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
  {
+       if (func == 1 && nested)
+               entry->ecx |= bit(X86_FEATURE_VMX);
  }
  
  /*
@@ -6914,16 +7119,13 @@ static int __init vmx_init(void)
         vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
  
         if (enable_ept) {
-               bypass_guest_pf = 0;
                 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
                                 VMX_EPT_EXECUTABLE_MASK);
+               ept_set_mmio_spte_mask();
                 kvm_enable_tdp();
         } else
                 kvm_disable_tdp();
  
-       if (bypass_guest_pf)
-               kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
-
         return 0;
  
  out3: