KVM: x86: fix wbinvd_dirty_mask use-after-free

[pandora-kernel.git] / arch / x86 / kvm / x86.c
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index 84a28ea..fc73c12 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -44,6 +44,7 @@
  #include <linux/perf_event.h>
  #include <linux/uaccess.h>
  #include <linux/hash.h>
+#include <linux/pci.h>
  #include <trace/events/kvm.h>
  
  #define CREATE_TRACE_POINTS
@@ -83,6 +84,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
  static void update_cr8_intercept(struct kvm_vcpu *vcpu);
  static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
                                     struct kvm_cpuid_entry2 __user *entries);
+static void process_nmi(struct kvm_vcpu *vcpu);
  
  struct kvm_x86_ops *kvm_x86_ops;
  EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -90,6 +92,9 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
  int ignore_msrs = 0;
  module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
  
+unsigned int min_timer_period_us = 500;
+module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+
  bool kvm_has_tsc_control;
  EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
  u32  kvm_max_guest_tsc_khz;
@@ -359,8 +364,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
  
  void kvm_inject_nmi(struct kvm_vcpu *vcpu)
  {
-       kvm_make_request(KVM_REQ_EVENT, vcpu);
-       vcpu->arch.nmi_pending = 1;
+       atomic_inc(&vcpu->arch.nmi_queued);
+       kvm_make_request(KVM_REQ_NMI, vcpu);
  }
  EXPORT_SYMBOL_GPL(kvm_inject_nmi);
  
@@ -549,8 +554,6 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
         if (index != XCR_XFEATURE_ENABLED_MASK)
                 return 1;
         xcr0 = xcr;
-       if (kvm_x86_ops->get_cpl(vcpu) != 0)
-               return 1;
         if (!(xcr0 & XSTATE_FP))
                 return 1;
         if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
@@ -564,7 +567,8 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
  
  int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
  {
-       if (__kvm_set_xcr(vcpu, index, xcr)) {
+       if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
+           __kvm_set_xcr(vcpu, index, xcr)) {
                 kvm_inject_gp(vcpu, 0);
                 return 1;
         }
@@ -576,6 +580,9 @@ static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
  {
         struct kvm_cpuid_entry2 *best;
  
+       if (!static_cpu_has(X86_FEATURE_XSAVE))
+               return 0;
+
         best = kvm_find_cpuid_entry(vcpu, 1, 0);
         return best && (best->ecx & bit(X86_FEATURE_XSAVE));
  }
@@ -599,6 +606,7 @@ static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
  static void update_cpuid(struct kvm_vcpu *vcpu)
  {
         struct kvm_cpuid_entry2 *best;
+       struct kvm_lapic *apic = vcpu->arch.apic;
  
         best = kvm_find_cpuid_entry(vcpu, 1, 0);
         if (!best)
@@ -610,6 +618,13 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
                 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
                         best->ecx |= bit(X86_FEATURE_OSXSAVE);
         }
+
+       if (apic) {
+               if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER))
+                       apic->lapic_timer.timer_mode_mask = 3 << 17;
+               else
+                       apic->lapic_timer.timer_mode_mask = 1 << 17;
+       }
  }
  
  int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -819,12 +834,14 @@ static u32 msrs_to_save[] = {
  #ifdef CONFIG_X86_64
         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
  #endif
-       MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+       MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+       MSR_TSC_AUX,
  };
  
  static unsigned num_msrs_to_save;
  
  static u32 emulated_msrs[] = {
+       MSR_IA32_TSCDEADLINE,
         MSR_IA32_MISC_ENABLE,
         MSR_IA32_MCG_STATUS,
         MSR_IA32_MCG_CTL,
@@ -877,7 +894,6 @@ void kvm_enable_efer_bits(u64 mask)
  }
  EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
  
-
  /*
   * Writes msr value into into the appropriate "register".
   * Returns 0 on success, non-0 otherwise.
@@ -885,8 +901,34 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
   */
  int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
  {
+       switch (msr_index) {
+       case MSR_FS_BASE:
+       case MSR_GS_BASE:
+       case MSR_KERNEL_GS_BASE:
+       case MSR_CSTAR:
+       case MSR_LSTAR:
+               if (is_noncanonical_address(data))
+                       return 1;
+               break;
+       case MSR_IA32_SYSENTER_EIP:
+       case MSR_IA32_SYSENTER_ESP:
+               /*
+                * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
+                * a non-canonical address is written on Intel but not on
+                * AMD (which ignores the top 32 bits, because it does
+                * not implement 64-bit SYSENTER).
+                *
+                * 64-bit code should hence be able to write a non-canonical
+                * value on AMD.  Making the address canonical ensures that
+                * vmentry does not fail on Intel after writing a non-canonical
+                * value, and that something deterministic happens if the guest
+                * invokes 64-bit SYSENTER.
+                */
+               data = get_canonical(data);
+       }
         return kvm_x86_ops->set_msr(vcpu, msr_index, data);
  }
+EXPORT_SYMBOL_GPL(kvm_set_msr);
  
  /*
   * Adapt set_msr() to msr_io()'s calling convention
@@ -1000,7 +1042,7 @@ static inline int kvm_tsc_changes_freq(void)
         return ret;
  }
  
-static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
+u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
  {
         if (vcpu->arch.virtual_tsc_khz)
                 return vcpu->arch.virtual_tsc_khz;
@@ -1091,14 +1133,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
  {
         unsigned long flags;
         struct kvm_vcpu_arch *vcpu = &v->arch;
-       void *shared_kaddr;
         unsigned long this_tsc_khz;
         s64 kernel_ns, max_kernel_ns;
         u64 tsc_timestamp;
  
         /* Keep irq disabled to prevent changes to the clock */
         local_irq_save(flags);
-       kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
+       tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
         kernel_ns = get_kernel_ns();
         this_tsc_khz = vcpu_tsc_khz(v);
         if (unlikely(this_tsc_khz == 0)) {
@@ -1127,7 +1168,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
  
         local_irq_restore(flags);
  
-       if (!vcpu->time_page)
+       if (!vcpu->pv_time_enabled)
                 return 0;
  
         /*
@@ -1185,14 +1226,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
          */
         vcpu->hv_clock.version += 2;
  
-       shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
-
-       memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
-              sizeof(vcpu->hv_clock));
-
-       kunmap_atomic(shared_kaddr, KM_USER0);
-
-       mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
+       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+                               &vcpu->hv_clock,
+                               sizeof(vcpu->hv_clock));
         return 0;
  }
  
@@ -1472,7 +1508,8 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
                 return 0;
         }
  
-       if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
+       if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
+                                       sizeof(u32)))
                 return 1;
  
         vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
@@ -1482,10 +1519,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
  
  static void kvmclock_reset(struct kvm_vcpu *vcpu)
  {
-       if (vcpu->arch.time_page) {
-               kvm_release_page_dirty(vcpu->arch.time_page);
-               vcpu->arch.time_page = NULL;
-       }
+       vcpu->arch.pv_time_enabled = false;
  }
  
  static void accumulate_steal_time(struct kvm_vcpu *vcpu)
@@ -1564,6 +1598,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                 break;
         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
                 return kvm_x2apic_msr_write(vcpu, msr, data);
+       case MSR_IA32_TSCDEADLINE:
+               kvm_set_lapic_tscdeadline_msr(vcpu, data);
+               break;
         case MSR_IA32_MISC_ENABLE:
                 vcpu->arch.ia32_misc_enable_msr = data;
                 break;
@@ -1574,6 +1611,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                 break;
         case MSR_KVM_SYSTEM_TIME_NEW:
         case MSR_KVM_SYSTEM_TIME: {
+               u64 gpa_offset;
                 kvmclock_reset(vcpu);
  
                 vcpu->arch.time = data;
@@ -1583,16 +1621,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                 if (!(data & 1))
                         break;
  
-               /* ...but clean it before doing the actual write */
-               vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
+               gpa_offset = data & ~(PAGE_MASK | 1);
  
-               vcpu->arch.time_page =
-                               gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
-
-               if (is_error_page(vcpu->arch.time_page)) {
-                       kvm_release_page_clean(vcpu->arch.time_page);
-                       vcpu->arch.time_page = NULL;
-               }
+               if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+                    &vcpu->arch.pv_time, data & ~1ULL,
+                    sizeof(struct pvclock_vcpu_time_info)))
+                       vcpu->arch.pv_time_enabled = false;
+               else
+                       vcpu->arch.pv_time_enabled = true;
                 break;
         }
         case MSR_KVM_ASYNC_PF_EN:
@@ -1608,7 +1644,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                         return 1;
  
                 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
-                                                       data & KVM_STEAL_VALID_BITS))
+                                               data & KVM_STEAL_VALID_BITS,
+                                               sizeof(struct kvm_steal_time)))
                         return 1;
  
                 vcpu->arch.st.msr_val = data;
@@ -1825,6 +1862,9 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
                 return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
         case HV_X64_MSR_TPR:
                 return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
+       case HV_X64_MSR_APIC_ASSIST_PAGE:
+               data = vcpu->arch.hv_vapic;
+               break;
         default:
                 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
                 return 1;
@@ -1839,7 +1879,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
  
         switch (msr) {
         case MSR_IA32_PLATFORM_ID:
-       case MSR_IA32_UCODE_REV:
         case MSR_IA32_EBL_CR_POWERON:
         case MSR_IA32_DEBUGCTLMSR:
         case MSR_IA32_LASTBRANCHFROMIP:
@@ -1847,6 +1886,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
         case MSR_IA32_LASTINTFROMIP:
         case MSR_IA32_LASTINTTOIP:
         case MSR_K8_SYSCFG:
+       case MSR_K8_TSEG_ADDR:
+       case MSR_K8_TSEG_MASK:
         case MSR_K7_HWCR:
         case MSR_VM_HSAVE_PA:
         case MSR_P6_PERFCTR0:
@@ -1860,6 +1901,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
         case MSR_FAM10H_MMIO_CONF_BASE:
                 data = 0;
                 break;
+       case MSR_IA32_UCODE_REV:
+               data = 0x100000000ULL;
+               break;
         case MSR_MTRRcap:
                 data = 0x500 | KVM_NR_VAR_MTRR;
                 break;
@@ -1888,6 +1932,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
                 return kvm_x2apic_msr_read(vcpu, msr, pdata);
                 break;
+       case MSR_IA32_TSCDEADLINE:
+               data = kvm_get_lapic_tscdeadline_msr(vcpu);
+               break;
         case MSR_IA32_MISC_ENABLE:
                 data = vcpu->arch.ia32_misc_enable_msr;
                 break;
@@ -2086,6 +2133,9 @@ int kvm_dev_ioctl_check_extension(long ext)
                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
                 break;
         case KVM_CAP_NR_VCPUS:
+               r = KVM_SOFT_MAX_VCPUS;
+               break;
+       case KVM_CAP_MAX_VCPUS:
                 r = KVM_MAX_VCPUS;
                 break;
         case KVM_CAP_NR_MEMSLOTS:
@@ -2095,7 +2145,7 @@ int kvm_dev_ioctl_check_extension(long ext)
                 r = 0;
                 break;
         case KVM_CAP_IOMMU:
-               r = iommu_found();
+               r = iommu_present(&pci_bus_type);
                 break;
         case KVM_CAP_MCE:
                 r = KVM_MAX_MCE_BANKS;
@@ -2106,6 +2156,9 @@ int kvm_dev_ioctl_check_extension(long ext)
         case KVM_CAP_TSC_CONTROL:
                 r = kvm_has_tsc_control;
                 break;
+       case KVM_CAP_TSC_DEADLINE_TIMER:
+               r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
+               break;
         default:
                 r = 0;
                 break;
@@ -2210,7 +2263,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                 s64 tsc_delta;
                 u64 tsc;
  
-               kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc);
+               tsc = kvm_x86_ops->read_l1_tsc(vcpu);
                 tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
                              tsc - vcpu->arch.last_guest_tsc;
  
@@ -2234,7 +2287,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
  {
         kvm_x86_ops->vcpu_put(vcpu);
         kvm_put_guest_fpu(vcpu);
-       kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+       vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
  }
  
  static int is_efer_nx(void)
@@ -2819,6 +2872,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
  static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
                                                struct kvm_vcpu_events *events)
  {
+       process_nmi(vcpu);
         events->exception.injected =
                 vcpu->arch.exception.pending &&
                 !kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2836,7 +2890,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
                         KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
  
         events->nmi.injected = vcpu->arch.nmi_injected;
-       events->nmi.pending = vcpu->arch.nmi_pending;
+       events->nmi.pending = vcpu->arch.nmi_pending != 0;
         events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
         events->nmi.pad = 0;
  
@@ -2856,6 +2910,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                               | KVM_VCPUEVENT_VALID_SHADOW))
                 return -EINVAL;
  
+       process_nmi(vcpu);
         vcpu->arch.exception.pending = events->exception.injected;
         vcpu->arch.exception.nr = events->exception.nr;
         vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -2897,6 +2952,11 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
         if (dbgregs->flags)
                 return -EINVAL;
  
+       if (dbgregs->dr6 & ~0xffffffffull)
+               return -EINVAL;
+       if (dbgregs->dr7 & ~0xffffffffull)
+               return -EINVAL;
+
         memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
         vcpu->arch.dr6 = dbgregs->dr6;
         vcpu->arch.dr7 = dbgregs->dr7;
@@ -3116,8 +3176,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                 r = -EFAULT;
                 if (copy_from_user(&va, argp, sizeof va))
                         goto out;
-               r = 0;
-               kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
+               r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
                 break;
         }
         case KVM_X86_SETUP_MCE: {
@@ -3381,10 +3440,11 @@ static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
  static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
  {
         int r = 0;
-
+       int i;
         mutex_lock(&kvm->arch.vpit->pit_state.lock);
         memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
-       kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
+       for (i = 0; i < 3; i++)
+               kvm_pit_load_count(kvm, i, ps->channels[i].count, 0);
         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
         return r;
  }
@@ -3405,6 +3465,7 @@ static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
  static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
  {
         int r = 0, start = 0;
+       int i;
         u32 prev_legacy, cur_legacy;
         mutex_lock(&kvm->arch.vpit->pit_state.lock);
         prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
@@ -3414,7 +3475,9 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
         memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
                sizeof(kvm->arch.vpit->pit_state.channels));
         kvm->arch.vpit->pit_state.flags = ps->flags;
-       kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
+       for (i = 0; i < 3; i++)
+               kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count,
+                                  start && i == 0);
         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
         return r;
  }
@@ -3549,6 +3612,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
                 r = -EEXIST;
                 if (kvm->arch.vpic)
                         goto create_irqchip_unlock;
+               r = -EINVAL;
+               if (atomic_read(&kvm->online_vcpus))
+                       goto create_irqchip_unlock;
                 r = -ENOMEM;
                 vpic = kvm_create_pic(kvm);
                 if (vpic) {
@@ -3556,7 +3622,11 @@ long kvm_arch_vm_ioctl(struct file *filp,
                         if (r) {
                                 mutex_lock(&kvm->slots_lock);
                                 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
-                                                         &vpic->dev);
+                                                         &vpic->dev_master);
+                               kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+                                                         &vpic->dev_slave);
+                               kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+                                                         &vpic->dev_eclr);
                                 mutex_unlock(&kvm->slots_lock);
                                 kfree(vpic);
                                 goto create_irqchip_unlock;
@@ -3803,6 +3873,20 @@ static void kvm_init_msr_list(void)
         for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
                         continue;
+
+               /*
+                * Even MSRs that are valid in the host may not be exposed
+                * to the guests in some cases.
+                */
+               switch (msrs_to_save[i]) {
+               case MSR_TSC_AUX:
+                       if (!kvm_x86_ops->rdtscp_supported())
+                               continue;
+                       break;
+               default:
+                       break;
+               }
+
                 if (j < i)
                         msrs_to_save[j] = msrs_to_save[i];
                 j++;
@@ -4045,84 +4129,105 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
         return 0;
  }
  
-static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
-                                 unsigned long addr,
-                                 void *val,
-                                 unsigned int bytes,
-                                 struct x86_exception *exception)
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+                       const void *val, int bytes)
  {
-       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-       gpa_t gpa;
-       int handled, ret;
+       int ret;
+
+       ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
+       if (ret < 0)
+               return 0;
+       kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
+       return 1;
+}
  
+struct read_write_emulator_ops {
+       int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
+                                 int bytes);
+       int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
+                                 void *val, int bytes);
+       int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
+                              int bytes, void *val);
+       int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
+                                   void *val, int bytes);
+       bool write;
+};
+
+static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
+{
         if (vcpu->mmio_read_completed) {
                 memcpy(val, vcpu->mmio_data, bytes);
                 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
                                vcpu->mmio_phys_addr, *(u64 *)val);
                 vcpu->mmio_read_completed = 0;
-               return X86EMUL_CONTINUE;
+               return 1;
         }
  
-       ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false);
-
-       if (ret < 0)
-               return X86EMUL_PROPAGATE_FAULT;
-
-       if (ret)
-               goto mmio;
-
-       if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
-           == X86EMUL_CONTINUE)
-               return X86EMUL_CONTINUE;
+       return 0;
+}
  
-mmio:
-       /*
-        * Is this MMIO handled locally?
-        */
-       handled = vcpu_mmio_read(vcpu, gpa, bytes, val);
+static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
+                       void *val, int bytes)
+{
+       return !kvm_read_guest(vcpu->kvm, gpa, val, bytes);
+}
  
-       if (handled == bytes)
-               return X86EMUL_CONTINUE;
+static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
+                        void *val, int bytes)
+{
+       return emulator_write_phys(vcpu, gpa, val, bytes);
+}
  
-       gpa += handled;
-       bytes -= handled;
-       val += handled;
+static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
+{
+       trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+       return vcpu_mmio_write(vcpu, gpa, bytes, val);
+}
  
+static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+                         void *val, int bytes)
+{
         trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
-
-       vcpu->mmio_needed = 1;
-       vcpu->run->exit_reason = KVM_EXIT_MMIO;
-       vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
-       vcpu->mmio_size = bytes;
-       vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
-       vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
-       vcpu->mmio_index = 0;
-
         return X86EMUL_IO_NEEDED;
  }
  
-int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
-                       const void *val, int bytes)
+static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+                          void *val, int bytes)
  {
-       int ret;
-
-       ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
-       if (ret < 0)
-               return 0;
-       kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
-       return 1;
+       memcpy(vcpu->mmio_data, val, bytes);
+       memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
+       return X86EMUL_CONTINUE;
  }
  
-static int emulator_write_emulated_onepage(unsigned long addr,
-                                          const void *val,
-                                          unsigned int bytes,
-                                          struct x86_exception *exception,
-                                          struct kvm_vcpu *vcpu)
+static struct read_write_emulator_ops read_emultor = {
+       .read_write_prepare = read_prepare,
+       .read_write_emulate = read_emulate,
+       .read_write_mmio = vcpu_mmio_read,
+       .read_write_exit_mmio = read_exit_mmio,
+};
+
+static struct read_write_emulator_ops write_emultor = {
+       .read_write_emulate = write_emulate,
+       .read_write_mmio = write_mmio,
+       .read_write_exit_mmio = write_exit_mmio,
+       .write = true,
+};
+
+static int emulator_read_write_onepage(unsigned long addr, void *val,
+                                      unsigned int bytes,
+                                      struct x86_exception *exception,
+                                      struct kvm_vcpu *vcpu,
+                                      struct read_write_emulator_ops *ops)
  {
         gpa_t gpa;
         int handled, ret;
+       bool write = ops->write;
  
-       ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true);
+       if (ops->read_write_prepare &&
+                 ops->read_write_prepare(vcpu, val, bytes))
+               return X86EMUL_CONTINUE;
+
+       ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
  
         if (ret < 0)
                 return X86EMUL_PROPAGATE_FAULT;
@@ -4131,15 +4236,14 @@ static int emulator_write_emulated_onepage(unsigned long addr,
         if (ret)
                 goto mmio;
  
-       if (emulator_write_phys(vcpu, gpa, val, bytes))
+       if (ops->read_write_emulate(vcpu, gpa, val, bytes))
                 return X86EMUL_CONTINUE;
  
  mmio:
-       trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
         /*
          * Is this MMIO handled locally?
          */
-       handled = vcpu_mmio_write(vcpu, gpa, bytes, val);
+       handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
         if (handled == bytes)
                 return X86EMUL_CONTINUE;
  
@@ -4148,23 +4252,20 @@ mmio:
         val += handled;
  
         vcpu->mmio_needed = 1;
-       memcpy(vcpu->mmio_data, val, bytes);
         vcpu->run->exit_reason = KVM_EXIT_MMIO;
         vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
         vcpu->mmio_size = bytes;
         vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
-       vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
-       memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
+       vcpu->run->mmio.is_write = vcpu->mmio_is_write = write;
         vcpu->mmio_index = 0;
  
-       return X86EMUL_CONTINUE;
+       return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
  }
  
-int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
-                           unsigned long addr,
-                           const void *val,
-                           unsigned int bytes,
-                           struct x86_exception *exception)
+int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+                       void *val, unsigned int bytes,
+                       struct x86_exception *exception,
+                       struct read_write_emulator_ops *ops)
  {
         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
  
@@ -4173,16 +4274,38 @@ int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
                 int rc, now;
  
                 now = -addr & ~PAGE_MASK;
-               rc = emulator_write_emulated_onepage(addr, val, now, exception,
-                                                    vcpu);
+               rc = emulator_read_write_onepage(addr, val, now, exception,
+                                                vcpu, ops);
+
                 if (rc != X86EMUL_CONTINUE)
                         return rc;
                 addr += now;
                 val += now;
                 bytes -= now;
         }
-       return emulator_write_emulated_onepage(addr, val, bytes, exception,
-                                              vcpu);
+
+       return emulator_read_write_onepage(addr, val, bytes, exception,
+                                          vcpu, ops);
+}
+
+static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
+                                 unsigned long addr,
+                                 void *val,
+                                 unsigned int bytes,
+                                 struct x86_exception *exception)
+{
+       return emulator_read_write(ctxt, addr, val, bytes,
+                                  exception, &read_emultor);
+}
+
+int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
+                           unsigned long addr,
+                           const void *val,
+                           unsigned int bytes,
+                           struct x86_exception *exception)
+{
+       return emulator_read_write(ctxt, addr, (void *)val, bytes,
+                                  exception, &write_emultor);
  }
  
  #define CMPXCHG_TYPE(t, ptr, old, new) \
@@ -4582,6 +4705,28 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
         return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
  }
  
+static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
+                              u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
+{
+       struct kvm_cpuid_entry2 *cpuid = NULL;
+
+       if (eax && ecx)
+               cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt),
+                                           *eax, *ecx);
+
+       if (cpuid) {
+               *eax = cpuid->eax;
+               *ecx = cpuid->ecx;
+               if (ebx)
+                       *ebx = cpuid->ebx;
+               if (edx)
+                       *edx = cpuid->edx;
+               return true;
+       }
+
+       return false;
+}
+
  static struct x86_emulate_ops emulate_ops = {
         .read_std            = kvm_read_guest_virt_system,
         .write_std           = kvm_write_guest_virt_system,
@@ -4612,6 +4757,7 @@ static struct x86_emulate_ops emulate_ops = {
         .get_fpu             = emulator_get_fpu,
         .put_fpu             = emulator_put_fpu,
         .intercept           = emulator_intercept,
+       .get_cpuid           = emulator_get_cpuid,
  };
  
  static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -4712,7 +4858,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
         kvm_set_rflags(vcpu, ctxt->eflags);
  
         if (irq == NMI_VECTOR)
-               vcpu->arch.nmi_pending = false;
+               vcpu->arch.nmi_pending = 0;
         else
                 vcpu->arch.interrupt.pending = false;
  
@@ -4726,7 +4872,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
  
         ++vcpu->stat.insn_emulation_fail;
         trace_kvm_emulate_insn_failed(vcpu);
-       if (!is_guest_mode(vcpu)) {
+       if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
                 vcpu->run->internal.ndata = 0;
@@ -4788,7 +4934,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
  
                 trace_kvm_emulate_insn_start(vcpu);
                 ++vcpu->stat.insn_emulation;
-               if (r)  {
+               if (r != EMULATION_OK)  {
                         if (emulation_type & EMULTYPE_TRAP_UD)
                                 return EMULATE_FAIL;
                         if (reexecute_instruction(vcpu, cr2))
@@ -5116,9 +5262,10 @@ int kvm_arch_init(void *opaque)
                 goto out;
  
         kvm_set_mmio_spte_mask();
-       kvm_init_msr_list();
  
         kvm_x86_ops = ops;
+       kvm_init_msr_list();
+
         kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
                         PT_DIRTY_MASK, PT64_NX_MASK, 0);
  
@@ -5444,33 +5591,6 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
                         !kvm_event_needs_reinjection(vcpu);
  }
  
-static void vapic_enter(struct kvm_vcpu *vcpu)
-{
-       struct kvm_lapic *apic = vcpu->arch.apic;
-       struct page *page;
-
-       if (!apic || !apic->vapic_addr)
-               return;
-
-       page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
-
-       vcpu->arch.apic->vapic_page = page;
-}
-
-static void vapic_exit(struct kvm_vcpu *vcpu)
-{
-       struct kvm_lapic *apic = vcpu->arch.apic;
-       int idx;
-
-       if (!apic || !apic->vapic_addr)
-               return;
-
-       idx = srcu_read_lock(&vcpu->kvm->srcu);
-       kvm_release_page_dirty(apic->vapic_page);
-       mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
-       srcu_read_unlock(&vcpu->kvm->srcu, idx);
-}
-
  static void update_cr8_intercept(struct kvm_vcpu *vcpu)
  {
         int max_irr, tpr;
@@ -5519,12 +5639,10 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
         }
  
         /* try to inject new event if pending */
-       if (vcpu->arch.nmi_pending) {
-               if (kvm_x86_ops->nmi_allowed(vcpu)) {
-                       vcpu->arch.nmi_pending = false;
-                       vcpu->arch.nmi_injected = true;
-                       kvm_x86_ops->set_nmi(vcpu);
-               }
+       if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
+               --vcpu->arch.nmi_pending;
+               vcpu->arch.nmi_injected = true;
+               kvm_x86_ops->set_nmi(vcpu);
         } else if (kvm_cpu_has_interrupt(vcpu)) {
                 if (kvm_x86_ops->interrupt_allowed(vcpu)) {
                         kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
@@ -5553,10 +5671,26 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
         }
  }
  
+static void process_nmi(struct kvm_vcpu *vcpu)
+{
+       unsigned limit = 2;
+
+       /*
+        * Fold newly queued NMIs into nmi_pending.  x86 is limited to one
+        * NMI running and one pending after it: if an NMI is already in
+        * progress cap at one, else at two (the first is injected at once).
+        */
+       if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
+               limit = 1;
+
+       vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
+       vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
+       kvm_make_request(KVM_REQ_EVENT, vcpu); /* run inject_pending_event() */
+}
+
  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
  {
         int r;
-       bool nmi_pending;
         bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
                 vcpu->run->request_interrupt_window;
  
@@ -5596,6 +5730,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                 }
                 if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
                         record_steal_time(vcpu);
+               if (kvm_check_request(KVM_REQ_NMI, vcpu))
+                       process_nmi(vcpu);
  
         }
  
@@ -5603,21 +5739,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
         if (unlikely(r))
                 goto out;
  
-       /*
-        * An NMI can be injected between local nmi_pending read and
-        * vcpu->arch.nmi_pending read inside inject_pending_event().
-        * But in that case, KVM_REQ_EVENT will be set, which makes
-        * the race described above benign.
-        */
-       nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending);
-
         if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
                 inject_pending_event(vcpu);
  
                 /* enable NMI/IRQ window open exits if needed */
-               if (nmi_pending)
+               if (vcpu->arch.nmi_pending)
                         kvm_x86_ops->enable_nmi_window(vcpu);
-               else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+               if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
                         kvm_x86_ops->enable_irq_window(vcpu);
  
                 if (kvm_lapic_enabled(vcpu)) {
@@ -5631,8 +5759,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
         kvm_x86_ops->prepare_guest_switch(vcpu);
         if (vcpu->fpu_active)
                 kvm_load_guest_fpu(vcpu);
-       kvm_load_guest_xcr0(vcpu);
-
         vcpu->mode = IN_GUEST_MODE;
  
         /* We should set ->mode before check ->requests,
@@ -5653,6 +5779,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                 goto out;
         }
  
+       kvm_load_guest_xcr0(vcpu);
+
         srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
  
         kvm_guest_enter();
@@ -5678,10 +5806,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
         if (hw_breakpoint_active())
                 hw_breakpoint_restore();
  
-       kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+       vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
  
         vcpu->mode = OUTSIDE_GUEST_MODE;
         smp_wmb();
+
+       kvm_put_guest_xcr0(vcpu);
+
         local_irq_enable();
  
         ++vcpu->stat.exits;
@@ -5733,7 +5864,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
         }
  
         vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
-       vapic_enter(vcpu);
  
         r = 1;
         while (r > 0) {
@@ -5790,8 +5920,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
  
         srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
  
-       vapic_exit(vcpu);
-
         return r;
  }
  
@@ -6040,6 +6168,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
         int pending_vec, max_bits, idx;
         struct desc_ptr dt;
  
+       if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE))
+               return -EINVAL;
+
         dt.size = sregs->idt.limit;
         dt.address = sregs->idt.base;
         kvm_x86_ops->set_idt(vcpu, &dt);
@@ -6255,7 +6386,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
          * and assume host would use all available bits.
          * Guest xcr0 would be loaded later.
          */
-       kvm_put_guest_xcr0(vcpu);
         vcpu->guest_fpu_loaded = 1;
         unlazy_fpu(current);
         fpu_restore_checking(&vcpu->arch.guest_fpu);
@@ -6264,8 +6394,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
  
  void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
  {
-       kvm_put_guest_xcr0(vcpu);
-
         if (!vcpu->guest_fpu_loaded)
                 return;
  
@@ -6278,11 +6406,13 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
  
  void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
  {
+       void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
+
         kvmclock_reset(vcpu);
  
-       free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
         fx_free(vcpu);
         kvm_x86_ops->vcpu_free(vcpu);
+       free_cpumask_var(wbinvd_dirty_mask); /* saved copy; vcpu is not read after vcpu_free() */
  }
  
  struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
@@ -6323,7 +6453,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
  
  int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
  {
-       vcpu->arch.nmi_pending = false;
+       atomic_set(&vcpu->arch.nmi_queued, 0);
+       vcpu->arch.nmi_pending = 0;
         vcpu->arch.nmi_injected = false;
  
         vcpu->arch.switch_db_regs = 0;
@@ -6379,6 +6510,11 @@ void kvm_arch_check_processor_compat(void *rtn)
         kvm_x86_ops->check_processor_compatibility(rtn);
  }
  
+bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
+{      /* true iff irqchip_in_kernel() for the VM agrees with this vcpu having arch.apic set */
+       return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
+}
+
  int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
  {
         struct page *page;
@@ -6428,6 +6564,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
         if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
                 goto fail_free_mce_banks;
  
+       vcpu->arch.pv_time_enabled = false;
         kvm_async_pf_hash_reset(vcpu);
  
         return 0;
@@ -6598,7 +6735,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
                 !vcpu->arch.apf.halted)
                 || !list_empty_careful(&vcpu->async_pf.done)
                 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
-               || vcpu->arch.nmi_pending ||
+               || atomic_read(&vcpu->arch.nmi_queued) ||
                 (kvm_arch_interrupt_allowed(vcpu) &&
                  kvm_cpu_has_interrupt(vcpu));
  }