KVM: Don't save/restore MSR_IA32_PERF_STATUS
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7fa89c3..1c97238 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,6 +6,7 @@
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2008 Qumranet, Inc.
  * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
 #include <linux/srcu.h>
 #include <linux/slab.h>
 #include <linux/perf_event.h>
+#include <linux/uaccess.h>
 #include <trace/events/kvm.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
 #include <asm/debugreg.h>
-#include <asm/uaccess.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
+#include <asm/pvclock.h>
+#include <asm/div64.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS                                              \
@@ -62,6 +67,7 @@
        (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
                          | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
                          | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
+                         | X86_CR4_OSXSAVE \
                          | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -147,6 +153,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { NULL }
 };
 
+u64 __read_mostly host_xcr0;
+
+static inline u32 bit(int bitno)
+{
+       return 1 << (bitno & 31);
+}
+
 static void kvm_on_user_return(struct user_return_notifier *urn)
 {
        unsigned slot;
@@ -285,7 +298,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
        prev_nr = vcpu->arch.exception.nr;
        if (prev_nr == DF_VECTOR) {
                /* triple fault -> shutdown */
-               set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+               kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
                return;
        }
        class1 = exception_class(prev_nr);
@@ -414,121 +427,163 @@ out:
        return changed;
 }
 
-void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
+       unsigned long old_cr0 = kvm_read_cr0(vcpu);
+       unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
+                                   X86_CR0_CD | X86_CR0_NW;
+
        cr0 |= X86_CR0_ET;
 
 #ifdef CONFIG_X86_64
-       if (cr0 & 0xffffffff00000000UL) {
-               kvm_inject_gp(vcpu, 0);
-               return;
-       }
+       if (cr0 & 0xffffffff00000000UL)
+               return 1;
 #endif
 
        cr0 &= ~CR0_RESERVED_BITS;
 
-       if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
-               kvm_inject_gp(vcpu, 0);
-               return;
-       }
+       if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
+               return 1;
 
-       if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
-               kvm_inject_gp(vcpu, 0);
-               return;
-       }
+       if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
+               return 1;
 
        if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 #ifdef CONFIG_X86_64
                if ((vcpu->arch.efer & EFER_LME)) {
                        int cs_db, cs_l;
 
-                       if (!is_pae(vcpu)) {
-                               kvm_inject_gp(vcpu, 0);
-                               return;
-                       }
+                       if (!is_pae(vcpu))
+                               return 1;
                        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-                       if (cs_l) {
-                               kvm_inject_gp(vcpu, 0);
-                               return;
-
-                       }
+                       if (cs_l)
+                               return 1;
                } else
 #endif
-               if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
-                       kvm_inject_gp(vcpu, 0);
-                       return;
-               }
-
+               if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3))
+                       return 1;
        }
 
        kvm_x86_ops->set_cr0(vcpu, cr0);
 
-       kvm_mmu_reset_context(vcpu);
-       return;
+       if ((cr0 ^ old_cr0) & update_bits)
+               kvm_mmu_reset_context(vcpu);
+       return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr0);
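
With this patch, kvm_set_cr0() reports failure through its return value instead of injecting #GP itself, leaving the injection to the caller. A minimal caller sketch (hypothetical exit-handler code; the kvm_set_xcr() wrapper added below follows exactly this pattern):

        if (kvm_set_cr0(vcpu, new_cr0)) {
                kvm_inject_gp(vcpu, 0);
                return 1;               /* exit handled, #GP queued */
        }
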
 
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
-       kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
+       (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
 }
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 
-void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 {
-       unsigned long old_cr4 = kvm_read_cr4(vcpu);
-       unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+       u64 xcr0;
+
+       /* Only XCR_XFEATURE_ENABLED_MASK (i.e. XCR0) is supported for now */
+       if (index != XCR_XFEATURE_ENABLED_MASK)
+               return 1;
+       xcr0 = xcr;
+       if (kvm_x86_ops->get_cpl(vcpu) != 0)
+               return 1;
+       if (!(xcr0 & XSTATE_FP))
+               return 1;
+       if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
+               return 1;
+       if (xcr0 & ~host_xcr0)
+               return 1;
+       vcpu->arch.xcr0 = xcr0;
+       vcpu->guest_xcr0_loaded = 0;
+       return 0;
+}
 
-       if (cr4 & CR4_RESERVED_BITS) {
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+       if (__kvm_set_xcr(vcpu, index, xcr)) {
                kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_xcr);
+
+static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
+{
+       struct kvm_cpuid_entry2 *best;
+
+       best = kvm_find_cpuid_entry(vcpu, 1, 0);
+       return best && (best->ecx & bit(X86_FEATURE_XSAVE));
+}
+
+static void update_cpuid(struct kvm_vcpu *vcpu)
+{
+       struct kvm_cpuid_entry2 *best;
+
+       best = kvm_find_cpuid_entry(vcpu, 1, 0);
+       if (!best)
                return;
+
+       /* Update OSXSAVE bit */
+       if (cpu_has_xsave && best->function == 0x1) {
+               best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
+               if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
+                       best->ecx |= bit(X86_FEATURE_OSXSAVE);
        }
+}
+
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+       unsigned long old_cr4 = kvm_read_cr4(vcpu);
+       unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+
+       if (cr4 & CR4_RESERVED_BITS)
+               return 1;
+
+       if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
+               return 1;
 
        if (is_long_mode(vcpu)) {
-               if (!(cr4 & X86_CR4_PAE)) {
-                       kvm_inject_gp(vcpu, 0);
-                       return;
-               }
+               if (!(cr4 & X86_CR4_PAE))
+                       return 1;
        } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
                   && ((cr4 ^ old_cr4) & pdptr_bits)
-                  && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
-               kvm_inject_gp(vcpu, 0);
-               return;
-       }
+                  && !load_pdptrs(vcpu, vcpu->arch.cr3))
+               return 1;
+
+       if (cr4 & X86_CR4_VMXE)
+               return 1;
 
-       if (cr4 & X86_CR4_VMXE) {
-               kvm_inject_gp(vcpu, 0);
-               return;
-       }
        kvm_x86_ops->set_cr4(vcpu, cr4);
-       vcpu->arch.cr4 = cr4;
-       kvm_mmu_reset_context(vcpu);
+
+       if ((cr4 ^ old_cr4) & pdptr_bits)
+               kvm_mmu_reset_context(vcpu);
+
+       if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
+               update_cpuid(vcpu);
+
+       return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
 
-void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
        if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
                kvm_mmu_sync_roots(vcpu);
                kvm_mmu_flush_tlb(vcpu);
-               return;
+               return 0;
        }
 
        if (is_long_mode(vcpu)) {
-               if (cr3 & CR3_L_MODE_RESERVED_BITS) {
-                       kvm_inject_gp(vcpu, 0);
-                       return;
-               }
+               if (cr3 & CR3_L_MODE_RESERVED_BITS)
+                       return 1;
        } else {
                if (is_pae(vcpu)) {
-                       if (cr3 & CR3_PAE_RESERVED_BITS) {
-                               kvm_inject_gp(vcpu, 0);
-                               return;
-                       }
-                       if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
-                               kvm_inject_gp(vcpu, 0);
-                               return;
-                       }
+                       if (cr3 & CR3_PAE_RESERVED_BITS)
+                               return 1;
+                       if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3))
+                               return 1;
                }
                /*
                 * We don't check reserved bits in nonpae mode, because
@@ -546,24 +601,28 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
         * to debug) behavior on the guest side.
         */
        if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
-               kvm_inject_gp(vcpu, 0);
-       else {
-               vcpu->arch.cr3 = cr3;
-               vcpu->arch.mmu.new_cr3(vcpu);
-       }
+               return 1;
+       vcpu->arch.cr3 = cr3;
+       vcpu->arch.mmu.new_cr3(vcpu);
+       return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr3);
 
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
-       if (cr8 & CR8_RESERVED_BITS) {
-               kvm_inject_gp(vcpu, 0);
-               return;
-       }
+       if (cr8 & CR8_RESERVED_BITS)
+               return 1;
        if (irqchip_in_kernel(vcpu->kvm))
                kvm_lapic_set_tpr(vcpu, cr8);
        else
                vcpu->arch.cr8 = cr8;
+       return 0;
+}
+
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+       if (__kvm_set_cr8(vcpu, cr8))
+               kvm_inject_gp(vcpu, 0);
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr8);
 
@@ -576,7 +635,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 
-int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 {
        switch (dr) {
        case 0 ... 3:
@@ -585,29 +644,21 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
                        vcpu->arch.eff_db[dr] = val;
                break;
        case 4:
-               if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-                       kvm_queue_exception(vcpu, UD_VECTOR);
-                       return 1;
-               }
+               if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+                       return 1; /* #UD */
                /* fall through */
        case 6:
-               if (val & 0xffffffff00000000ULL) {
-                       kvm_inject_gp(vcpu, 0);
-                       return 1;
-               }
+               if (val & 0xffffffff00000000ULL)
+                       return -1; /* #GP */
                vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
                break;
        case 5:
-               if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-                       kvm_queue_exception(vcpu, UD_VECTOR);
-                       return 1;
-               }
+               if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+                       return 1; /* #UD */
                /* fall through */
        default: /* 7 */
-               if (val & 0xffffffff00000000ULL) {
-                       kvm_inject_gp(vcpu, 0);
-                       return 1;
-               }
+               if (val & 0xffffffff00000000ULL)
+                       return -1; /* #GP */
                vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
                if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
                        kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
@@ -618,28 +669,37 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 
        return 0;
 }
+
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+       int res;
+
+       res = __kvm_set_dr(vcpu, dr, val);
+       if (res > 0)
+               kvm_queue_exception(vcpu, UD_VECTOR);
+       else if (res < 0)
+               kvm_inject_gp(vcpu, 0);
+
+       return res;
+}
 EXPORT_SYMBOL_GPL(kvm_set_dr);
 
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 {
        switch (dr) {
        case 0 ... 3:
                *val = vcpu->arch.db[dr];
                break;
        case 4:
-               if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-                       kvm_queue_exception(vcpu, UD_VECTOR);
+               if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
                        return 1;
-               }
                /* fall through */
        case 6:
                *val = vcpu->arch.dr6;
                break;
        case 5:
-               if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-                       kvm_queue_exception(vcpu, UD_VECTOR);
+               if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
                        return 1;
-               }
                /* fall through */
        default: /* 7 */
                *val = vcpu->arch.dr7;
@@ -648,12 +708,16 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_get_dr);
 
-static inline u32 bit(int bitno)
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 {
-       return 1 << (bitno & 31);
+       if (_kvm_get_dr(vcpu, dr, val)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+       return 0;
 }
+EXPORT_SYMBOL_GPL(kvm_get_dr);
 
 /*
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -671,21 +735,25 @@ static u32 msrs_to_save[] = {
        HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
        HV_X64_MSR_APIC_ASSIST_PAGE,
        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
-       MSR_K6_STAR,
+       MSR_STAR,
 #ifdef CONFIG_X86_64
        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-       MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+       MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
 };
 
 static unsigned num_msrs_to_save;
 
 static u32 emulated_msrs[] = {
        MSR_IA32_MISC_ENABLE,
+       MSR_IA32_MCG_STATUS,
+       MSR_IA32_MCG_CTL,
 };
 
 static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
+       u64 old_efer = vcpu->arch.efer;
+
        if (efer & efer_reserved_bits)
                return 1;
 
@@ -714,11 +782,13 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
        kvm_x86_ops->set_efer(vcpu, efer);
 
-       vcpu->arch.efer = efer;
-
        vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
        kvm_mmu_reset_context(vcpu);
 
+       /* Update reserved bits */
+       if ((efer ^ old_efer) & EFER_NX)
+               kvm_mmu_reset_context(vcpu);
+
        return 0;
 }
 
@@ -825,38 +895,154 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
                 hv_clock->tsc_to_system_mul);
 }
 
+static inline u64 get_kernel_ns(void)
+{
+       struct timespec ts;
+
+       WARN_ON(preemptible());
+       ktime_get_ts(&ts);
+       monotonic_to_bootbased(&ts);
+       return timespec_to_ns(&ts);
+}
+
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 
-static void kvm_write_guest_time(struct kvm_vcpu *v)
+static inline int kvm_tsc_changes_freq(void)
+{
+       int cpu = get_cpu();
+       int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+                 cpufreq_quick_get(cpu) != 0;
+       put_cpu();
+       return ret;
+}
+
+static inline u64 nsec_to_cycles(u64 nsec)
+{
+       u64 ret;
+
+       WARN_ON(preemptible());
+       if (kvm_tsc_changes_freq())
+               printk_once(KERN_WARNING
+                "kvm: unreliable cycle conversion on adjustable rate TSC\n");
+       ret = nsec * __get_cpu_var(cpu_tsc_khz);
+       do_div(ret, USEC_PER_SEC);
+       return ret;
+}
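
A worked example of the scaling above (hypothetical host with cpu_tsc_khz == 2400000, i.e. a 2.4 GHz TSC):

        nsec_to_cycles(5 * NSEC_PER_SEC)
                = 5e9 ns * 2400000 kHz / USEC_PER_SEC
                = 5e9 * 2.4e6 / 1e6
                = 1.2e10 cycles         /* exactly 5 s at 2.4 GHz */

This is the threshold kvm_write_tsc() below compares sdiff against.
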
+
+void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
+{
+       struct kvm *kvm = vcpu->kvm;
+       u64 offset, ns, elapsed;
+       unsigned long flags;
+       s64 sdiff;
+
+       spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+       offset = data - native_read_tsc();
+       ns = get_kernel_ns();
+       elapsed = ns - kvm->arch.last_tsc_nsec;
+       sdiff = data - kvm->arch.last_tsc_write;
+       if (sdiff < 0)
+               sdiff = -sdiff;
+
+       /*
+        * Special case: a write to the TSC within 5 seconds of a write
+        * on another CPU is interpreted as an attempt to synchronize.
+        * The 5 seconds is to accommodate host load / swapping as
+        * well as any reset of the TSC during the boot process.
+        *
+        * In that case, for a reliable TSC, we can match TSC offsets,
+        * or make a best guess using the elapsed value.
+        */
+       if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) &&
+           elapsed < 5ULL * NSEC_PER_SEC) {
+               if (!check_tsc_unstable()) {
+                       offset = kvm->arch.last_tsc_offset;
+                       pr_debug("kvm: matched tsc offset for %llu\n", data);
+               } else {
+                       u64 delta = nsec_to_cycles(elapsed);
+                       offset += delta;
+                       pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
+               }
+               ns = kvm->arch.last_tsc_nsec;
+       }
+       kvm->arch.last_tsc_nsec = ns;
+       kvm->arch.last_tsc_write = data;
+       kvm->arch.last_tsc_offset = offset;
+       kvm_x86_ops->write_tsc_offset(vcpu, offset);
+       spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+       /* Reset of TSC must disable overshoot protection below */
+       vcpu->arch.hv_clock.tsc_timestamp = 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_tsc);
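
A worked example of the heuristic (hypothetical: 2.4 GHz host, unstable TSC, two vCPUs both writing TSC = 0 during boot, 100 ms apart): for the second write, sdiff == 0 and elapsed == 1e8 ns, so both tests pass; check_tsc_unstable() is true, so the second vCPU's offset is advanced by nsec_to_cycles(1e8) == 2.4e8 cycles, keeping the two guest TSCs in step instead of 100 ms apart.
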
+
+static int kvm_write_guest_time(struct kvm_vcpu *v)
 {
-       struct timespec ts;
        unsigned long flags;
        struct kvm_vcpu_arch *vcpu = &v->arch;
        void *shared_kaddr;
        unsigned long this_tsc_khz;
+       s64 kernel_ns, max_kernel_ns;
+       u64 tsc_timestamp;
 
        if ((!vcpu->time_page))
-               return;
-
-       this_tsc_khz = get_cpu_var(cpu_tsc_khz);
-       if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
-               kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
-               vcpu->hv_clock_tsc_khz = this_tsc_khz;
-       }
-       put_cpu_var(cpu_tsc_khz);
+               return 0;
 
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
-       kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
-       ktime_get_ts(&ts);
-       monotonic_to_bootbased(&ts);
+       kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
+       kernel_ns = get_kernel_ns();
+       this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
        local_irq_restore(flags);
 
-       /* With all the info we got, fill in the values */
+       if (unlikely(this_tsc_khz == 0)) {
+               kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
+               return 1;
+       }
+
+       /*
+        * Time as measured by the TSC may go backwards when resetting the base
+        * tsc_timestamp.  The reason for this is that the TSC resolution is
+        * higher than the resolution of the other clock scales.  Thus, many
+        * possible measurements of the TSC correspond to one measurement of any
+        * other clock, and so a spread of values is possible.  This is not a
+        * problem for the computation of the nanosecond clock; with TSC rates
+        * around 1 GHz, there can only be a few cycles which correspond to one
+        * nanosecond value, and any path through this code will inevitably
+        * take longer than that.  However, with the kernel_ns value itself,
+        * the precision may be much lower, down to HZ granularity.  If the
+        * first sampling of TSC against kernel_ns ends in the low part of the
+        * range, and the second in the high end of the range, we can get:
+        *
+        * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
+        *
+        * As the sampling errors potentially range in the thousands of cycles,
+        * it is possible such a time value has already been observed by the
+        * guest.  To protect against this, we must compute the system time as
+        * observed by the guest and ensure the new system time is greater.
+        */
+       max_kernel_ns = 0;
+       if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
+               max_kernel_ns = vcpu->last_guest_tsc -
+                               vcpu->hv_clock.tsc_timestamp;
+               max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
+                                   vcpu->hv_clock.tsc_to_system_mul,
+                                   vcpu->hv_clock.tsc_shift);
+               max_kernel_ns += vcpu->last_kernel_ns;
+       }
 
-       vcpu->hv_clock.system_time = ts.tv_nsec +
-                                    (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
+       if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
+               kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
+               vcpu->hw_tsc_khz = this_tsc_khz;
+       }
+
+       if (max_kernel_ns > kernel_ns)
+               kernel_ns = max_kernel_ns;
 
+       /* With all the info we got, fill in the values */
+       vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
+       vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
+       vcpu->last_kernel_ns = kernel_ns;
        vcpu->hv_clock.flags = 0;
 
        /*
@@ -874,6 +1060,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
        kunmap_atomic(shared_kaddr, KM_USER0);
 
        mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
+       return 0;
 }
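
For context on the consumer side of this ABI, the guest reads the structure back with the inverse computation. A minimal sketch (version/seqlock and flags handling omitted; assumes the guest has mapped the same pvclock_vcpu_time_info page filled in above):

        static u64 guest_pvclock_read(struct pvclock_vcpu_time_info *hv)
        {
                /* guest TSC cycles elapsed since the host's snapshot */
                u64 delta = native_read_tsc() - hv->tsc_timestamp;

                /* scale cycles to ns with the host-provided factors */
                return hv->system_time +
                       pvclock_scale_delta(delta, hv->tsc_to_system_mul,
                                           hv->tsc_shift);
        }

This is also why max_kernel_ns above is computed with pvclock_scale_delta(): it reconstructs the largest time value the guest could already have observed.
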
 
 static int kvm_request_guest_time_update(struct kvm_vcpu *v)
@@ -882,7 +1069,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v)
 
        if (!vcpu->time_page)
                return 0;
-       set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
+       kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
        return 1;
 }
 
@@ -1524,16 +1711,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
 {
        int i, idx;
 
-       vcpu_load(vcpu);
-
        idx = srcu_read_lock(&vcpu->kvm->srcu);
        for (i = 0; i < msrs->nmsrs; ++i)
                if (do_msr(vcpu, entries[i].index, &entries[i].data))
                        break;
        srcu_read_unlock(&vcpu->kvm->srcu, idx);
 
-       vcpu_put(vcpu);
-
        return i;
 }
 
@@ -1618,6 +1801,7 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_PCI_SEGMENT:
        case KVM_CAP_DEBUGREGS:
        case KVM_CAP_X86_ROBUST_SINGLESTEP:
+       case KVM_CAP_XSAVE:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
@@ -1641,6 +1825,9 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_MCE:
                r = KVM_MAX_MCE_BANKS;
                break;
+       case KVM_CAP_XCRS:
+               r = cpu_has_xsave;
+               break;
        default:
                r = 0;
                break;
@@ -1717,22 +1904,47 @@ out:
        return r;
 }
 
+static void wbinvd_ipi(void *garbage)
+{
+       wbinvd();
+}
+
+static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+       return vcpu->kvm->arch.iommu_domain &&
+               !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
+}
+
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+       /* Handle the case where WBINVD may be executed by the guest */
+       if (need_emulate_wbinvd(vcpu)) {
+               if (kvm_x86_ops->has_wbinvd_exit())
+                       cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+               else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
+                       smp_call_function_single(vcpu->cpu,
+                                       wbinvd_ipi, NULL, 1);
+       }
+
        kvm_x86_ops->vcpu_load(vcpu, cpu);
-       if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
-               unsigned long khz = cpufreq_quick_get(cpu);
-               if (!khz)
-                       khz = tsc_khz;
-               per_cpu(cpu_tsc_khz, cpu) = khz;
+       if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
+               /* Make sure TSC doesn't go backwards */
+               s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+                               native_read_tsc() - vcpu->arch.last_host_tsc;
+               if (tsc_delta < 0)
+                       mark_tsc_unstable("KVM discovered backwards TSC");
+               if (check_tsc_unstable())
+                       kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
+               kvm_migrate_timers(vcpu);
+               vcpu->cpu = cpu;
        }
-       kvm_request_guest_time_update(vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
-       kvm_put_guest_fpu(vcpu);
        kvm_x86_ops->vcpu_put(vcpu);
+       kvm_put_guest_fpu(vcpu);
+       vcpu->arch.last_host_tsc = native_read_tsc();
 }
 
 static int is_efer_nx(void)
@@ -1781,7 +1993,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
        if (copy_from_user(cpuid_entries, entries,
                           cpuid->nent * sizeof(struct kvm_cpuid_entry)))
                goto out_free;
-       vcpu_load(vcpu);
        for (i = 0; i < cpuid->nent; i++) {
                vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
                vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1799,7 +2010,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
        r = 0;
        kvm_apic_set_version(vcpu);
        kvm_x86_ops->cpuid_update(vcpu);
-       vcpu_put(vcpu);
+       update_cpuid(vcpu);
 
 out_free:
        vfree(cpuid_entries);
@@ -1820,11 +2031,10 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
        if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
                           cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
                goto out;
-       vcpu_load(vcpu);
        vcpu->arch.cpuid_nent = cpuid->nent;
        kvm_apic_set_version(vcpu);
        kvm_x86_ops->cpuid_update(vcpu);
-       vcpu_put(vcpu);
+       update_cpuid(vcpu);
        return 0;
 
 out:
@@ -1837,7 +2047,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 {
        int r;
 
-       vcpu_load(vcpu);
        r = -E2BIG;
        if (cpuid->nent < vcpu->arch.cpuid_nent)
                goto out;
@@ -1849,7 +2058,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 
 out:
        cpuid->nent = vcpu->arch.cpuid_nent;
-       vcpu_put(vcpu);
        return r;
 }
 
@@ -1901,13 +2109,13 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
        /* cpuid 1.ecx */
        const u32 kvm_supported_word4_x86_features =
-               F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
+               F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
                0 /* DS-CPL, VMX, SMX, EST */ |
                0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
                0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
                0 /* Reserved, DCA */ | F(XMM4_1) |
                F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
-               0 /* Reserved, XSAVE, OSXSAVE */;
+               0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX);
        /* cpuid 0x80000001.ecx */
        const u32 kvm_supported_word6_x86_features =
                F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
@@ -1922,7 +2130,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
        switch (function) {
        case 0:
-               entry->eax = min(entry->eax, (u32)0xb);
+               entry->eax = min(entry->eax, (u32)0xd);
                break;
        case 1:
                entry->edx &= kvm_supported_word0_x86_features;
@@ -1980,6 +2188,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                }
                break;
        }
+       case 0xd: {
+               int i;
+
+               entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+               for (i = 1; *nent < maxnent; ++i) {
+                       if (entry[i - 1].eax == 0 && i != 2)
+                               break;
+                       do_cpuid_1_ent(&entry[i], function, i);
+                       entry[i].flags |=
+                              KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+                       ++*nent;
+               }
+               break;
+       }
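
For reference, the architectural layout of CPUID leaf 0xD that this loop enumerates (background, not defined in this file):

        /*
         * Sub-leaf 0: supported xstate feature mask in EDX:EAX, required
         * XSAVE area sizes in EBX/ECX.  Sub-leaf i > 0: size (EAX) and
         * offset (EBX) of state component i.  Sub-leaf 1 may legitimately
         * read as all zeroes, hence the "i != 2" exception in the
         * termination check above.
         */
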
        case KVM_CPUID_SIGNATURE: {
                char signature[12] = "KVMKVMKVM\0\0";
                u32 *sigptr = (u32 *)signature;
@@ -2081,9 +2303,7 @@ out:
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
                                    struct kvm_lapic_state *s)
 {
-       vcpu_load(vcpu);
        memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
-       vcpu_put(vcpu);
 
        return 0;
 }
@@ -2091,11 +2311,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
                                    struct kvm_lapic_state *s)
 {
-       vcpu_load(vcpu);
        memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
        kvm_apic_post_state_restore(vcpu);
        update_cr8_intercept(vcpu);
-       vcpu_put(vcpu);
 
        return 0;
 }
@@ -2107,20 +2325,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
                return -EINVAL;
        if (irqchip_in_kernel(vcpu->kvm))
                return -ENXIO;
-       vcpu_load(vcpu);
 
        kvm_queue_interrupt(vcpu, irq->irq, false);
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
 {
-       vcpu_load(vcpu);
        kvm_inject_nmi(vcpu);
-       vcpu_put(vcpu);
 
        return 0;
 }
@@ -2140,7 +2353,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
        int r;
        unsigned bank_num = mcg_cap & 0xff, bank;
 
-       vcpu_load(vcpu);
        r = -EINVAL;
        if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
                goto out;
@@ -2155,7 +2367,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
        for (bank = 0; bank < bank_num; bank++)
                vcpu->arch.mce_banks[bank*4] = ~(u64)0;
 out:
-       vcpu_put(vcpu);
        return r;
 }
 
@@ -2188,7 +2399,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
                        printk(KERN_DEBUG "kvm: set_mce: "
                               "injects mce exception while "
                               "previous one is in progress!\n");
-                       set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+                       kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
                        return 0;
                }
                if (banks[1] & MCI_STATUS_VAL)
@@ -2213,8 +2424,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
                                               struct kvm_vcpu_events *events)
 {
-       vcpu_load(vcpu);
-
        events->exception.injected =
                vcpu->arch.exception.pending &&
                !kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2239,8 +2448,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
        events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
                         | KVM_VCPUEVENT_VALID_SIPI_VECTOR
                         | KVM_VCPUEVENT_VALID_SHADOW);
-
-       vcpu_put(vcpu);
 }
 
 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
@@ -2251,8 +2458,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                              | KVM_VCPUEVENT_VALID_SHADOW))
                return -EINVAL;
 
-       vcpu_load(vcpu);
-
        vcpu->arch.exception.pending = events->exception.injected;
        vcpu->arch.exception.nr = events->exception.nr;
        vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -2275,22 +2480,16 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
        if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
                vcpu->arch.sipi_vector = events->sipi_vector;
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
                                             struct kvm_debugregs *dbgregs)
 {
-       vcpu_load(vcpu);
-
        memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
        dbgregs->dr6 = vcpu->arch.dr6;
        dbgregs->dr7 = vcpu->arch.dr7;
        dbgregs->flags = 0;
-
-       vcpu_put(vcpu);
 }
 
 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -2299,40 +2498,113 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
        if (dbgregs->flags)
                return -EINVAL;
 
-       vcpu_load(vcpu);
-
        memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
        vcpu->arch.dr6 = dbgregs->dr6;
        vcpu->arch.dr7 = dbgregs->dr7;
 
-       vcpu_put(vcpu);
+       return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+                                        struct kvm_xsave *guest_xsave)
+{
+       if (cpu_has_xsave)
+               memcpy(guest_xsave->region,
+                       &vcpu->arch.guest_fpu.state->xsave,
+                       xstate_size);
+       else {
+               memcpy(guest_xsave->region,
+                       &vcpu->arch.guest_fpu.state->fxsave,
+                       sizeof(struct i387_fxsave_struct));
+               *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
+                       XSTATE_FPSSE;
+       }
+}
 
+static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
+                                       struct kvm_xsave *guest_xsave)
+{
+       u64 xstate_bv =
+               *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
+
+       if (cpu_has_xsave)
+               memcpy(&vcpu->arch.guest_fpu.state->xsave,
+                       guest_xsave->region, xstate_size);
+       else {
+               if (xstate_bv & ~XSTATE_FPSSE)
+                       return -EINVAL;
+               memcpy(&vcpu->arch.guest_fpu.state->fxsave,
+                       guest_xsave->region, sizeof(struct i387_fxsave_struct));
+       }
        return 0;
 }
 
+static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+                                       struct kvm_xcrs *guest_xcrs)
+{
+       if (!cpu_has_xsave) {
+               guest_xcrs->nr_xcrs = 0;
+               return;
+       }
+
+       guest_xcrs->nr_xcrs = 1;
+       guest_xcrs->flags = 0;
+       guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
+       guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+}
+
+static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
+                                      struct kvm_xcrs *guest_xcrs)
+{
+       int i, r = 0;
+
+       if (!cpu_has_xsave)
+               return -EINVAL;
+
+       if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
+               return -EINVAL;
+
+       for (i = 0; i < guest_xcrs->nr_xcrs; i++)
+               /* Only support XCR0 currently */
+               if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
+                       r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
+                               guest_xcrs->xcrs[0].value);
+                       break;
+               }
+       if (r)
+               r = -EINVAL;
+       return r;
+}
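
Together these back the new KVM_{GET,SET}_XSAVE and KVM_{GET,SET}_XCRS vcpu ioctls wired up below. A hypothetical userspace round-trip (vcpu_fd from KVM_CREATE_VCPU; availability probed beforehand with KVM_CHECK_EXTENSION on KVM_CAP_XSAVE/KVM_CAP_XCRS, added above; error handling abbreviated):

        struct kvm_xsave xsave;
        struct kvm_xcrs  xcrs;

        ioctl(vcpu_fd, KVM_GET_XSAVE, &xsave);  /* FPU/SSE/AVX image */
        ioctl(vcpu_fd, KVM_GET_XCRS,  &xcrs);   /* XCR0 */
        /* ... transfer to the destination ... */
        ioctl(vcpu_fd, KVM_SET_XCRS,  &xcrs);
        ioctl(vcpu_fd, KVM_SET_XSAVE, &xsave);
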
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg)
 {
        struct kvm_vcpu *vcpu = filp->private_data;
        void __user *argp = (void __user *)arg;
        int r;
-       struct kvm_lapic_state *lapic = NULL;
+       union {
+               struct kvm_lapic_state *lapic;
+               struct kvm_xsave *xsave;
+               struct kvm_xcrs *xcrs;
+               void *buffer;
+       } u;
 
+       u.buffer = NULL;
        switch (ioctl) {
        case KVM_GET_LAPIC: {
                r = -EINVAL;
                if (!vcpu->arch.apic)
                        goto out;
-               lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+               u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
 
                r = -ENOMEM;
-               if (!lapic)
+               if (!u.lapic)
                        goto out;
-               r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
+               r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
                if (r)
                        goto out;
                r = -EFAULT;
-               if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
+               if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
                        goto out;
                r = 0;
                break;
@@ -2341,14 +2613,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                r = -EINVAL;
                if (!vcpu->arch.apic)
                        goto out;
-               lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+               u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
                r = -ENOMEM;
-               if (!lapic)
+               if (!u.lapic)
                        goto out;
                r = -EFAULT;
-               if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
+               if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
                        goto out;
-               r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
+               r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
                if (r)
                        goto out;
                r = 0;
@@ -2464,9 +2736,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                r = -EFAULT;
                if (copy_from_user(&mce, argp, sizeof mce))
                        goto out;
-               vcpu_load(vcpu);
                r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
-               vcpu_put(vcpu);
                break;
        }
        case KVM_GET_VCPU_EVENTS: {
@@ -2513,11 +2783,67 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
                break;
        }
+       case KVM_GET_XSAVE: {
+               u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+               r = -ENOMEM;
+               if (!u.xsave)
+                       break;
+
+               kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+
+               r = -EFAULT;
+               if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
+                       break;
+               r = 0;
+               break;
+       }
+       case KVM_SET_XSAVE: {
+               u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+               r = -ENOMEM;
+               if (!u.xsave)
+                       break;
+
+               r = -EFAULT;
+               if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
+                       break;
+
+               r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
+               break;
+       }
+       case KVM_GET_XCRS: {
+               u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+               r = -ENOMEM;
+               if (!u.xcrs)
+                       break;
+
+               kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+
+               r = -EFAULT;
+               if (copy_to_user(argp, u.xcrs,
+                                sizeof(struct kvm_xcrs)))
+                       break;
+               r = 0;
+               break;
+       }
+       case KVM_SET_XCRS: {
+               u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+               r = -ENOMEM;
+               if (!u.xcrs)
+                       break;
+
+               r = -EFAULT;
+               if (copy_from_user(u.xcrs, argp,
+                                  sizeof(struct kvm_xcrs)))
+                       break;
+
+               r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
+               break;
+       }
        default:
                r = -EINVAL;
        }
 out:
-       kfree(lapic);
+       kfree(u.buffer);
        return r;
 }
 
@@ -2557,116 +2883,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
 
 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
 {
-       return kvm->arch.n_alloc_mmu_pages;
-}
-
-gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
-{
-       int i;
-       struct kvm_mem_alias *alias;
-       struct kvm_mem_aliases *aliases;
-
-       aliases = kvm_aliases(kvm);
-
-       for (i = 0; i < aliases->naliases; ++i) {
-               alias = &aliases->aliases[i];
-               if (alias->flags & KVM_ALIAS_INVALID)
-                       continue;
-               if (gfn >= alias->base_gfn
-                   && gfn < alias->base_gfn + alias->npages)
-                       return alias->target_gfn + gfn - alias->base_gfn;
-       }
-       return gfn;
-}
-
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-       int i;
-       struct kvm_mem_alias *alias;
-       struct kvm_mem_aliases *aliases;
-
-       aliases = kvm_aliases(kvm);
-
-       for (i = 0; i < aliases->naliases; ++i) {
-               alias = &aliases->aliases[i];
-               if (gfn >= alias->base_gfn
-                   && gfn < alias->base_gfn + alias->npages)
-                       return alias->target_gfn + gfn - alias->base_gfn;
-       }
-       return gfn;
-}
-
-/*
- * Set a new alias region.  Aliases map a portion of physical memory into
- * another portion.  This is useful for memory windows, for example the PC
- * VGA region.
- */
-static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
-                                        struct kvm_memory_alias *alias)
-{
-       int r, n;
-       struct kvm_mem_alias *p;
-       struct kvm_mem_aliases *aliases, *old_aliases;
-
-       r = -EINVAL;
-       /* General sanity checks */
-       if (alias->memory_size & (PAGE_SIZE - 1))
-               goto out;
-       if (alias->guest_phys_addr & (PAGE_SIZE - 1))
-               goto out;
-       if (alias->slot >= KVM_ALIAS_SLOTS)
-               goto out;
-       if (alias->guest_phys_addr + alias->memory_size
-           < alias->guest_phys_addr)
-               goto out;
-       if (alias->target_phys_addr + alias->memory_size
-           < alias->target_phys_addr)
-               goto out;
-
-       r = -ENOMEM;
-       aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
-       if (!aliases)
-               goto out;
-
-       mutex_lock(&kvm->slots_lock);
-
-       /* invalidate any gfn reference in case of deletion/shrinking */
-       memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
-       aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
-       old_aliases = kvm->arch.aliases;
-       rcu_assign_pointer(kvm->arch.aliases, aliases);
-       synchronize_srcu_expedited(&kvm->srcu);
-       kvm_mmu_zap_all(kvm);
-       kfree(old_aliases);
-
-       r = -ENOMEM;
-       aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
-       if (!aliases)
-               goto out_unlock;
-
-       memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
-
-       p = &aliases->aliases[alias->slot];
-       p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
-       p->npages = alias->memory_size >> PAGE_SHIFT;
-       p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
-       p->flags &= ~(KVM_ALIAS_INVALID);
-
-       for (n = KVM_ALIAS_SLOTS; n > 0; --n)
-               if (aliases->aliases[n - 1].npages)
-                       break;
-       aliases->naliases = n;
-
-       old_aliases = kvm->arch.aliases;
-       rcu_assign_pointer(kvm->arch.aliases, aliases);
-       synchronize_srcu_expedited(&kvm->srcu);
-       kfree(old_aliases);
-       r = 0;
-
-out_unlock:
-       mutex_unlock(&kvm->slots_lock);
-out:
-       return r;
+       return kvm->arch.n_max_mmu_pages;
 }
 
 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
@@ -2797,7 +3014,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
        struct kvm_memory_slot *memslot;
        unsigned long n;
        unsigned long is_dirty = 0;
-       unsigned long *dirty_bitmap = NULL;
 
        mutex_lock(&kvm->slots_lock);
 
@@ -2812,27 +3028,30 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 
        n = kvm_dirty_bitmap_bytes(memslot);
 
-       r = -ENOMEM;
-       dirty_bitmap = vmalloc(n);
-       if (!dirty_bitmap)
-               goto out;
-       memset(dirty_bitmap, 0, n);
-
        for (i = 0; !is_dirty && i < n/sizeof(long); i++)
                is_dirty = memslot->dirty_bitmap[i];
 
        /* If nothing is dirty, don't bother messing with page tables. */
        if (is_dirty) {
                struct kvm_memslots *slots, *old_slots;
+               unsigned long *dirty_bitmap;
 
                spin_lock(&kvm->mmu_lock);
                kvm_mmu_slot_remove_write_access(kvm, log->slot);
                spin_unlock(&kvm->mmu_lock);
 
-               slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
-               if (!slots)
-                       goto out_free;
+               r = -ENOMEM;
+               dirty_bitmap = vmalloc(n);
+               if (!dirty_bitmap)
+                       goto out;
+               memset(dirty_bitmap, 0, n);
 
+               r = -ENOMEM;
+               slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+               if (!slots) {
+                       vfree(dirty_bitmap);
+                       goto out;
+               }
                memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
                slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
 
@@ -2841,13 +3060,20 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
                synchronize_srcu_expedited(&kvm->srcu);
                dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
                kfree(old_slots);
+
+               r = -EFAULT;
+               if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) {
+                       vfree(dirty_bitmap);
+                       goto out;
+               }
+               vfree(dirty_bitmap);
+       } else {
+               r = -EFAULT;
+               if (clear_user(log->dirty_bitmap, n))
+                       goto out;
        }
 
        r = 0;
-       if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
-               r = -EFAULT;
-out_free:
-       vfree(dirty_bitmap);
 out:
        mutex_unlock(&kvm->slots_lock);
        return r;
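
Userspace is unaffected by the bitmap-switching rework above; the call is unchanged. A minimal sketch (hypothetical vm_fd and a slot 0 of mem_size bytes; one bit per 4 KiB page, rounded up to longs, matching kvm_dirty_bitmap_bytes()):

        struct kvm_dirty_log log = { .slot = 0 };
        size_t n = ((mem_size / 4096 + 63) / 64) * 8;   /* bitmap bytes */

        log.dirty_bitmap = calloc(1, n);
        if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0)
                perror("KVM_GET_DIRTY_LOG");
        /* bit i set: page i of the slot was written since the last call */
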
@@ -2867,7 +3093,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
        union {
                struct kvm_pit_state ps;
                struct kvm_pit_state2 ps2;
-               struct kvm_memory_alias alias;
                struct kvm_pit_config pit_config;
        } u;
 
@@ -2877,30 +3102,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (r < 0)
                        goto out;
                break;
-       case KVM_SET_IDENTITY_MAP_ADDR: {
-               u64 ident_addr;
-
-               r = -EFAULT;
-               if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
-                       goto out;
-               r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
-               if (r < 0)
-                       goto out;
-               break;
-       }
-       case KVM_SET_MEMORY_REGION: {
-               struct kvm_memory_region kvm_mem;
-               struct kvm_userspace_memory_region kvm_userspace_mem;
+       case KVM_SET_IDENTITY_MAP_ADDR: {
+               u64 ident_addr;
 
                r = -EFAULT;
-               if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
+               if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
                        goto out;
-               kvm_userspace_mem.slot = kvm_mem.slot;
-               kvm_userspace_mem.flags = kvm_mem.flags;
-               kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
-               kvm_userspace_mem.memory_size = kvm_mem.memory_size;
-               r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
-               if (r)
+               r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
+               if (r < 0)
                        goto out;
                break;
        }
@@ -2912,14 +3121,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
        case KVM_GET_NR_MMU_PAGES:
                r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
                break;
-       case KVM_SET_MEMORY_ALIAS:
-               r = -EFAULT;
-               if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
-                       goto out;
-               r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
-               if (r)
-                       goto out;
-               break;
        case KVM_CREATE_IRQCHIP: {
                struct kvm_pic *vpic;
 
@@ -3123,7 +3324,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
                break;
        }
        case KVM_SET_CLOCK: {
-               struct timespec now;
                struct kvm_clock_data user_ns;
                u64 now_ns;
                s64 delta;
@@ -3137,19 +3337,16 @@ long kvm_arch_vm_ioctl(struct file *filp,
                        goto out;
 
                r = 0;
-               ktime_get_ts(&now);
-               now_ns = timespec_to_ns(&now);
+               now_ns = get_kernel_ns();
                delta = user_ns.clock - now_ns;
                kvm->arch.kvmclock_offset = delta;
                break;
        }
        case KVM_GET_CLOCK: {
-               struct timespec now;
                struct kvm_clock_data user_ns;
                u64 now_ns;
 
-               ktime_get_ts(&now);
-               now_ns = timespec_to_ns(&now);
+               now_ns = get_kernel_ns();
                user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
                user_ns.flags = 0;
 
@@ -3259,7 +3456,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
                }
                ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
                if (ret < 0) {
-                       r = X86EMUL_UNHANDLEABLE;
+                       r = X86EMUL_IO_NEEDED;
                        goto out;
                }
 
@@ -3315,7 +3512,7 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
                }
                ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
                if (ret < 0) {
-                       r = X86EMUL_UNHANDLEABLE;
+                       r = X86EMUL_IO_NEEDED;
                        goto out;
                }
 
@@ -3330,10 +3527,10 @@ out:
 static int emulator_read_emulated(unsigned long addr,
                                  void *val,
                                  unsigned int bytes,
+                                 unsigned int *error_code,
                                  struct kvm_vcpu *vcpu)
 {
        gpa_t                 gpa;
-       u32 error_code;
 
        if (vcpu->mmio_read_completed) {
                memcpy(val, vcpu->mmio_data, bytes);
@@ -3343,12 +3540,10 @@ static int emulator_read_emulated(unsigned long addr,
                return X86EMUL_CONTINUE;
        }
 
-       gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
+       gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code);
 
-       if (gpa == UNMAPPED_GVA) {
-               kvm_inject_page_fault(vcpu, addr, error_code);
+       if (gpa == UNMAPPED_GVA)
                return X86EMUL_PROPAGATE_FAULT;
-       }
 
        /* For APIC access vmexit */
        if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3370,11 +3565,12 @@ mmio:
        trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
 
        vcpu->mmio_needed = 1;
-       vcpu->mmio_phys_addr = gpa;
-       vcpu->mmio_size = bytes;
-       vcpu->mmio_is_write = 0;
+       vcpu->run->exit_reason = KVM_EXIT_MMIO;
+       vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+       vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+       vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
 
-       return X86EMUL_UNHANDLEABLE;
+       return X86EMUL_IO_NEEDED;
 }
 
 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -3392,17 +3588,15 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 static int emulator_write_emulated_onepage(unsigned long addr,
                                           const void *val,
                                           unsigned int bytes,
+                                          unsigned int *error_code,
                                           struct kvm_vcpu *vcpu)
 {
        gpa_t                 gpa;
-       u32 error_code;
 
-       gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
+       gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code);
 
-       if (gpa == UNMAPPED_GVA) {
-               kvm_inject_page_fault(vcpu, addr, error_code);
+       if (gpa == UNMAPPED_GVA)
                return X86EMUL_PROPAGATE_FAULT;
-       }
 
        /* For APIC access vmexit */
        if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3420,10 +3614,11 @@ mmio:
                return X86EMUL_CONTINUE;
 
        vcpu->mmio_needed = 1;
-       vcpu->mmio_phys_addr = gpa;
-       vcpu->mmio_size = bytes;
-       vcpu->mmio_is_write = 1;
-       memcpy(vcpu->mmio_data, val, bytes);
+       vcpu->run->exit_reason = KVM_EXIT_MMIO;
+       vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+       vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+       vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
+       memcpy(vcpu->run->mmio.data, val, bytes);
 
        return X86EMUL_CONTINUE;
 }
@@ -3431,6 +3626,7 @@ mmio:
 int emulator_write_emulated(unsigned long addr,
                            const void *val,
                            unsigned int bytes,
+                           unsigned int *error_code,
                            struct kvm_vcpu *vcpu)
 {
        /* Crossing a page boundary? */
@@ -3438,16 +3634,17 @@ int emulator_write_emulated(unsigned long addr,
                int rc, now;
 
                now = -addr & ~PAGE_MASK;
-               rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
+               rc = emulator_write_emulated_onepage(addr, val, now, error_code,
+                                                    vcpu);
                if (rc != X86EMUL_CONTINUE)
                        return rc;
                addr += now;
                val += now;
                bytes -= now;
        }
-       return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
+       return emulator_write_emulated_onepage(addr, val, bytes, error_code,
+                                              vcpu);
 }
-EXPORT_SYMBOL_GPL(emulator_write_emulated);
 
 #define CMPXCHG_TYPE(t, ptr, old, new) \
        (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
@@ -3463,6 +3660,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
                                     const void *old,
                                     const void *new,
                                     unsigned int bytes,
+                                    unsigned int *error_code,
                                     struct kvm_vcpu *vcpu)
 {
        gpa_t gpa;
@@ -3484,6 +3682,10 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
                goto emul_write;
 
        page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+       if (is_error_page(page)) {
+               kvm_release_page_clean(page);
+               goto emul_write;
+       }
 
        kaddr = kmap_atomic(page, KM_USER0);
        kaddr += offset_in_page(gpa);
@@ -3516,7 +3718,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 emul_write:
        printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
 
-       return emulator_write_emulated(addr, new, bytes, vcpu);
+       return emulator_write_emulated(addr, new, bytes, error_code, vcpu);
 }
 
 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3541,7 +3743,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
        if (vcpu->arch.pio.count)
                goto data_avail;
 
-       trace_kvm_pio(1, port, size, 1);
+       trace_kvm_pio(0, port, size, 1);
 
        vcpu->arch.pio.port = port;
        vcpu->arch.pio.in = 1;
@@ -3569,7 +3771,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
                              const void *val, unsigned int count,
                              struct kvm_vcpu *vcpu)
 {
-       trace_kvm_pio(0, port, size, 1);
+       trace_kvm_pio(1, port, size, 1);
 
        vcpu->arch.pio.port = port;
        vcpu->arch.pio.in = 0;
@@ -3604,42 +3806,38 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
        return X86EMUL_CONTINUE;
 }
 
-int emulate_clts(struct kvm_vcpu *vcpu)
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
 {
-       kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
-       kvm_x86_ops->fpu_activate(vcpu);
+       if (!need_emulate_wbinvd(vcpu))
+               return X86EMUL_CONTINUE;
+
+       if (kvm_x86_ops->has_wbinvd_exit()) {
+               smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
+                               wbinvd_ipi, NULL, 1);
+               cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
+       }
+       wbinvd();
        return X86EMUL_CONTINUE;
 }
+EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
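
The dirty mask consumed above has to be populated whenever the vCPU runs
on a physical CPU that may end up caching guest data. A hedged sketch of
that producer side (its exact placement, assumed here to be the vcpu-load
path, is not shown in this hunk):

	/* Assumed producer: on being scheduled onto a physical CPU,
	 * remember that this CPU may now hold guest-dirty cache lines. */
	if (need_emulate_wbinvd(vcpu))
		cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);

kvm_emulate_wbinvd() then flushes every recorded CPU by IPI and clears
the mask, so each physical CPU is flushed at most once per guest WBINVD.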
 
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
+int emulate_clts(struct kvm_vcpu *vcpu)
 {
-       return kvm_get_dr(ctxt->vcpu, dr, dest);
+       kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+       kvm_x86_ops->fpu_activate(vcpu);
+       return X86EMUL_CONTINUE;
 }
 
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
+int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu)
 {
-       unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
-
-       return kvm_set_dr(ctxt->vcpu, dr, value & mask);
+       return _kvm_get_dr(vcpu, dr, dest);
 }
 
-void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
+int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)
 {
-       u8 opcodes[4];
-       unsigned long rip = kvm_rip_read(vcpu);
-       unsigned long rip_linear;
-
-       if (!printk_ratelimit())
-               return;
-
-       rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
 
-       kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
-
-       printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
-              context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
+       return __kvm_set_dr(vcpu, dr, value);
 }
-EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
 
 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
 {
@@ -3674,27 +3872,32 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
        return value;
 }
 
-static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
+static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
 {
+       int res = 0;
+
        switch (cr) {
        case 0:
-               kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+               res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
                break;
        case 2:
                vcpu->arch.cr2 = val;
                break;
        case 3:
-               kvm_set_cr3(vcpu, val);
+               res = kvm_set_cr3(vcpu, val);
                break;
        case 4:
-               kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+               res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
                break;
        case 8:
-               kvm_set_cr8(vcpu, val & 0xfUL);
+               res = __kvm_set_cr8(vcpu, val & 0xfUL);
                break;
        default:
                vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+               res = -1;
        }
+
+       return res;
 }
 
 static int emulator_get_cpl(struct kvm_vcpu *vcpu)
@@ -3707,6 +3910,17 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
        kvm_x86_ops->get_gdt(vcpu, dt);
 }
 
+static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
+{
+       kvm_x86_ops->get_idt(vcpu, dt);
+}
+
+static unsigned long emulator_get_cached_segment_base(int seg,
+                                                     struct kvm_vcpu *vcpu)
+{
+       return get_segment_base(vcpu, seg);
+}
+
 static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
                                           struct kvm_vcpu *vcpu)
 {
@@ -3779,11 +3993,6 @@ static void emulator_set_segment_selector(u16 sel, int seg,
        kvm_set_segment(vcpu, &kvm_seg, seg);
 }
 
-static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
-       kvm_x86_ops->set_rflags(vcpu, rflags);
-}
-
 static struct x86_emulate_ops emulate_ops = {
        .read_std            = kvm_read_guest_virt_system,
        .write_std           = kvm_write_guest_virt_system,
@@ -3797,11 +4006,16 @@ static struct x86_emulate_ops emulate_ops = {
        .set_cached_descriptor = emulator_set_cached_descriptor,
        .get_segment_selector = emulator_get_segment_selector,
        .set_segment_selector = emulator_set_segment_selector,
+       .get_cached_segment_base = emulator_get_cached_segment_base,
        .get_gdt             = emulator_get_gdt,
+       .get_idt             = emulator_get_idt,
        .get_cr              = emulator_get_cr,
        .set_cr              = emulator_set_cr,
        .cpl                 = emulator_get_cpl,
-       .set_rflags          = emulator_set_rflags,
+       .get_dr              = emulator_get_dr,
+       .set_dr              = emulator_set_dr,
+       .set_msr             = kvm_set_msr,
+       .get_msr             = kvm_get_msr,
 };
 
 static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3812,14 +4026,97 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
        vcpu->arch.regs_dirty = ~0;
 }
 
+static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
+{
+       u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
+       /*
+        * An sti; sti sequence only disables interrupts for the first
+        * instruction. So, if the last instruction, be it emulated or
+        * not, left the system with the INT_STI flag enabled, it
+        * means that the last instruction was an sti. We should not
+        * leave the flag on in this case. The same goes for mov ss.
+        */
+       if (!(int_shadow & mask))
+               kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
+}
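
As a worked example of the check above: for a lone sti, the pre-existing
shadow is 0, so the mask computed by the emulator is written back and
interrupts stay blocked for exactly one more instruction. For an
"sti; sti" pair, emulating the second sti finds the STI bit already set
in int_shadow, so the shadow is deliberately not re-armed; re-writing it
would let every additional sti extend the one-instruction window.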
+
+static void inject_emulated_exception(struct kvm_vcpu *vcpu)
+{
+       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+       if (ctxt->exception == PF_VECTOR)
+               kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code);
+       else if (ctxt->error_code_valid)
+               kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
+       else
+               kvm_queue_exception(vcpu, ctxt->exception);
+}
+
+static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
+{
+       struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+       int cs_db, cs_l;
+
+       cache_all_regs(vcpu);
+
+       kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+
+       vcpu->arch.emulate_ctxt.vcpu = vcpu;
+       vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+       vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
+       vcpu->arch.emulate_ctxt.mode =
+               (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+               (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+               ? X86EMUL_MODE_VM86 : cs_l
+               ? X86EMUL_MODE_PROT64 : cs_db
+               ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+       memset(c, 0, sizeof(struct decode_cache));
+       memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+}
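
The chained conditional in init_emulate_ctxt() unrolls to the following
equivalent helper, shown only as a readability sketch (the helper name is
illustrative; the mode constants are the ones used above):

	static int emul_mode(int protmode, int vm86, int cs_l, int cs_db)
	{
		if (!protmode)
			return X86EMUL_MODE_REAL;
		if (vm86)
			return X86EMUL_MODE_VM86;
		if (cs_l)
			return X86EMUL_MODE_PROT64;
		if (cs_db)
			return X86EMUL_MODE_PROT32;
		return X86EMUL_MODE_PROT16;
	}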
+
+static int handle_emulation_failure(struct kvm_vcpu *vcpu)
+{
+       ++vcpu->stat.insn_emulation_fail;
+       trace_kvm_emulate_insn_failed(vcpu);
+       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+       vcpu->run->internal.ndata = 0;
+       kvm_queue_exception(vcpu, UD_VECTOR);
+       return EMULATE_FAIL;
+}
+
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
+{
+       gpa_t gpa;
+
+       if (tdp_enabled)
+               return false;
+
+       /*
+        * If emulation was due to access to a shadowed page table
+        * and it failed, try to unshadow the page and re-enter the
+        * guest to let the CPU execute the instruction.
+        */
+       if (kvm_mmu_unprotect_page_virt(vcpu, gva))
+               return true;
+
+       gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+
+       if (gpa == UNMAPPED_GVA)
+               return true; /* let cpu generate fault */
+
+       if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
+               return true;
+
+       return false;
+}
+
 int emulate_instruction(struct kvm_vcpu *vcpu,
                        unsigned long cr2,
                        u16 error_code,
                        int emulation_type)
 {
-       int r, shadow_mask;
-       struct decode_cache *c;
-       struct kvm_run *run = vcpu->run;
+       int r;
+       struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
 
        kvm_clear_exception_queue(vcpu);
        vcpu->arch.mmio_fault_cr2 = cr2;
@@ -3831,28 +4128,17 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
         */
        cache_all_regs(vcpu);
 
-       vcpu->mmio_is_write = 0;
-
        if (!(emulation_type & EMULTYPE_NO_DECODE)) {
-               int cs_db, cs_l;
-               kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-
-               vcpu->arch.emulate_ctxt.vcpu = vcpu;
-               vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
-               vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
-               vcpu->arch.emulate_ctxt.mode =
-                       (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
-                       (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
-                       ? X86EMUL_MODE_VM86 : cs_l
-                       ? X86EMUL_MODE_PROT64 : cs_db
-                       ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-
-               r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+               init_emulate_ctxt(vcpu);
+               vcpu->arch.emulate_ctxt.interruptibility = 0;
+               vcpu->arch.emulate_ctxt.exception = -1;
+               vcpu->arch.emulate_ctxt.perm_ok = false;
+
+               r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
                trace_kvm_emulate_insn_start(vcpu);
 
                /* Only allow emulation of specific instructions on #UD
                 * (namely VMMCALL, sysenter, sysexit, syscall)*/
-               c = &vcpu->arch.emulate_ctxt.decode;
                if (emulation_type & EMULTYPE_TRAP_UD) {
                        if (!c->twobyte)
                                return EMULATE_FAIL;
@@ -3880,11 +4166,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 
                ++vcpu->stat.insn_emulation;
                if (r)  {
-                       ++vcpu->stat.insn_emulation_fail;
-                       trace_kvm_emulate_insn_failed(vcpu);
-                       if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+                       if (reexecute_instruction(vcpu, cr2))
                                return EMULATE_DONE;
-                       return EMULATE_FAIL;
+                       if (emulation_type & EMULTYPE_SKIP)
+                               return EMULATE_FAIL;
+                       return handle_emulation_failure(vcpu);
                }
        }
 
@@ -3893,52 +4179,42 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
                return EMULATE_DONE;
        }
 
-restart:
-       r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
-       shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
-
-       if (r == 0)
-               kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
-
-       if (vcpu->arch.pio.count) {
-               if (!vcpu->arch.pio.in)
-                       vcpu->arch.pio.count = 0;
-               return EMULATE_DO_MMIO;
-       }
+       /* This is needed for the vmware backdoor interface to work since
+          it changes register values during the IO operation */
+       memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
 
-       if (r || vcpu->mmio_is_write) {
-               run->exit_reason = KVM_EXIT_MMIO;
-               run->mmio.phys_addr = vcpu->mmio_phys_addr;
-               memcpy(run->mmio.data, vcpu->mmio_data, 8);
-               run->mmio.len = vcpu->mmio_size;
-               run->mmio.is_write = vcpu->mmio_is_write;
-       }
+restart:
+       r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
 
-       if (r) {
-               if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
-                       goto done;
-               if (!vcpu->mmio_needed) {
-                       ++vcpu->stat.insn_emulation_fail;
-                       trace_kvm_emulate_insn_failed(vcpu);
-                       kvm_report_emulation_failure(vcpu, "mmio");
-                       return EMULATE_FAIL;
-               }
-               return EMULATE_DO_MMIO;
-       }
+       if (r == EMULATION_FAILED) {
+               if (reexecute_instruction(vcpu, cr2))
+                       return EMULATE_DONE;
 
-       if (vcpu->mmio_is_write) {
-               vcpu->mmio_needed = 0;
-               return EMULATE_DO_MMIO;
+               return handle_emulation_failure(vcpu);
        }
 
-done:
-       if (vcpu->arch.exception.pending)
-               vcpu->arch.emulate_ctxt.restart = false;
-
-       if (vcpu->arch.emulate_ctxt.restart)
+       if (vcpu->arch.emulate_ctxt.exception >= 0) {
+               inject_emulated_exception(vcpu);
+               r = EMULATE_DONE;
+       } else if (vcpu->arch.pio.count) {
+               if (!vcpu->arch.pio.in)
+                       vcpu->arch.pio.count = 0;
+               r = EMULATE_DO_MMIO;
+       } else if (vcpu->mmio_needed) {
+               if (vcpu->mmio_is_write)
+                       vcpu->mmio_needed = 0;
+               r = EMULATE_DO_MMIO;
+       } else if (r == EMULATION_RESTART)
                goto restart;
+       else
+               r = EMULATE_DONE;
 
-       return EMULATE_DONE;
+       toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
+       kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+       memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+       kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+
+       return r;
 }
 EXPORT_SYMBOL_GPL(emulate_instruction);
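
A hedged sketch of the caller-side contract after this rework (the
handler name is illustrative, not part of the patch): EMULATE_DONE means
keep running the vcpu loop, EMULATE_DO_MMIO means drop to userspace to
satisfy the pending PIO/MMIO, and EMULATE_FAIL means vcpu->run has
already been filled in with KVM_EXIT_INTERNAL_ERROR.

	static int handle_exit_sketch(struct kvm_vcpu *vcpu, unsigned long cr2)
	{
		switch (emulate_instruction(vcpu, cr2, 0, 0)) {
		case EMULATE_DONE:
			return 1;	/* resume the in-kernel vcpu loop */
		case EMULATE_DO_MMIO:
		case EMULATE_FAIL:
		default:
			return 0;	/* exit to userspace */
		}
	}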
 
@@ -3952,9 +4228,23 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
 }
 EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
 
-static void bounce_off(void *info)
+static void tsc_bad(void *info)
+{
+       __get_cpu_var(cpu_tsc_khz) = 0;
+}
+
+static void tsc_khz_changed(void *data)
 {
-       /* nothing */
+       struct cpufreq_freqs *freq = data;
+       unsigned long khz = 0;
+
+       if (data)
+               khz = freq->new;
+       else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+               khz = cpufreq_quick_get(raw_smp_processor_id());
+       if (!khz)
+               khz = tsc_khz;
+       __get_cpu_var(cpu_tsc_khz) = khz;
 }
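
Per the ordering rules spelled out in the comment below, tsc_khz_changed()
must only ever run on the CPU whose per-cpu variable it writes, and always
via a synchronous IPI:

	smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);

The final "1" makes the caller wait for the IPI handler to finish, which
is what gives updates of cpu_tsc_khz a total order even against
concurrent cpufreq and hotplug callbacks.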
 
 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -3965,11 +4255,51 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
        struct kvm_vcpu *vcpu;
        int i, send_ipi = 0;
 
+       /*
+        * We allow guests to temporarily run on slowing clocks,
+        * provided we notify them after, or to run on accelerating
+        * clocks, provided we notify them before.  Thus time never
+        * goes backwards.
+        *
+        * However, we have a problem.  We can't atomically update
+        * the frequency of a given CPU from this function; it is
+        * merely a notifier, which can be called from any CPU.
+        * Changing the TSC frequency at arbitrary points in time
+        * requires a recomputation of local variables related to
+        * the TSC for each VCPU.  We must flag these local variables
+        * to be updated and be sure the update takes place with the
+        * new frequency before any guests proceed.
+        *
+        * Unfortunately, the combination of hotplug CPU and frequency
+        * change creates an intractable locking scenario; the order
+        * of when these callouts happen is undefined with respect to
+        * CPU hotplug, and they can race with each other.  As such,
+        * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
+        * undefined; you can actually have a CPU frequency change take
+        * place in between the computation of X and the setting of the
+        * variable.  To protect against this problem, all updates of
+        * the per_cpu tsc_khz variable are done in an interrupt
+        * protected IPI, and all callers wishing to update the value
+        * must wait for a synchronous IPI to complete (which is trivial
+        * if the caller is on the CPU already).  This establishes the
+        * necessary total order on variable updates.
+        *
+        * Note that because a guest time update may take place
+        * anytime after the setting of the VCPU's request bit, the
+        * correct TSC value must be set before the request.  However,
+        * to ensure the update actually makes it to any guest which
+        * starts running in hardware virtualization between the set
+        * and the acquisition of the spinlock, we must also ping the
+        * CPU after setting the request bit.
+        */
+
        if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
                return 0;
        if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
                return 0;
-       per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
+
+       smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
 
        spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -3979,7 +4309,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
                        if (!kvm_request_guest_time_update(vcpu))
                                continue;
                        if (vcpu->cpu != smp_processor_id())
-                               send_ipi++;
+                               send_ipi = 1;
                }
        }
        spin_unlock(&kvm_lock);
@@ -3997,32 +4327,48 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
                 * guest context is entered kvmclock will be updated,
                 * so the guest will not see stale values.
                 */
-               smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
+               smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
        }
        return 0;
 }
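
Concretely: for a slowdown (say freq->old = 2000000, freq->new = 1000000)
the PRECHANGE invocation returns early above and the update is applied at
POSTCHANGE, after the clock has already slowed; for a speedup the
POSTCHANGE invocation returns early and the update is applied at
PRECHANGE, before the clock accelerates. Either way the frequency the
guest computes with is at least the actual hardware frequency, so guest
time may briefly lag real time but never runs ahead, and the eventual
update never has to move it backwards.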
 
 static struct notifier_block kvmclock_cpufreq_notifier_block = {
-        .notifier_call  = kvmclock_cpufreq_notifier
+       .notifier_call  = kvmclock_cpufreq_notifier
+};
+
+static int kvmclock_cpu_notifier(struct notifier_block *nfb,
+                                       unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (unsigned long)hcpu;
+
+       switch (action) {
+       case CPU_ONLINE:
+       case CPU_DOWN_FAILED:
+               smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
+               break;
+       case CPU_DOWN_PREPARE:
+               smp_call_function_single(cpu, tsc_bad, NULL, 1);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block kvmclock_cpu_notifier_block = {
+       .notifier_call  = kvmclock_cpu_notifier,
+       .priority = -INT_MAX
 };
 
 static void kvm_timer_init(void)
 {
        int cpu;
 
+       register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
                cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
                                          CPUFREQ_TRANSITION_NOTIFIER);
-               for_each_online_cpu(cpu) {
-                       unsigned long khz = cpufreq_get(cpu);
-                       if (!khz)
-                               khz = tsc_khz;
-                       per_cpu(cpu_tsc_khz, cpu) = khz;
-               }
-       } else {
-               for_each_possible_cpu(cpu)
-                       per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
        }
+       for_each_online_cpu(cpu)
+               smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
 }
 
 static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
@@ -4108,6 +4454,9 @@ int kvm_arch_init(void *opaque)
 
        perf_register_guest_info_callbacks(&kvm_guest_cbs);
 
+       if (cpu_has_xsave)
+               host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+
        return 0;
 
 out:
@@ -4121,6 +4470,7 @@ void kvm_arch_exit(void)
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
                                            CPUFREQ_TRANSITION_NOTIFIER);
+       unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
        kvm_x86_ops = NULL;
        kvm_mmu_module_exit();
 }
@@ -4270,7 +4620,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
 
        kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
-       return emulator_write_emulated(rip, instruction, 3, vcpu);
+       return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);
 }
 
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
@@ -4506,59 +4856,81 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
        }
 }
 
+static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+       if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
+                       !vcpu->guest_xcr0_loaded) {
+               /* kvm_set_xcr() also depends on this */
+               xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
+               vcpu->guest_xcr0_loaded = 1;
+       }
+}
+
+static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->guest_xcr0_loaded) {
+               if (vcpu->arch.xcr0 != host_xcr0)
+                       xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
+               vcpu->guest_xcr0_loaded = 0;
+       }
+}
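
XCR0 selects which extended processor states (x87, SSE, and later
features such as AVX) the XSAVE instruction family operates on. The pair
above swaps the guest value in lazily before guest entry and restores
host_xcr0 (captured once in kvm_arch_init() via
xgetbv(XCR_XFEATURE_ENABLED_MASK)) on the way out, skipping the costly
xsetbv entirely when the guest and host masks happen to match.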
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
        int r;
        bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
                vcpu->run->request_interrupt_window;
 
-       if (vcpu->requests)
-               if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
-                       kvm_mmu_unload(vcpu);
-
-       r = kvm_mmu_reload(vcpu);
-       if (unlikely(r))
-               goto out;
-
        if (vcpu->requests) {
-               if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
+               if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
+                       kvm_mmu_unload(vcpu);
+               if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
                        __kvm_migrate_timers(vcpu);
-               if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
-                       kvm_write_guest_time(vcpu);
-               if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
+               if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) {
+                       r = kvm_write_guest_time(vcpu);
+                       if (unlikely(r))
+                               goto out;
+               }
+               if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
                        kvm_mmu_sync_roots(vcpu);
-               if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+               if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
                        kvm_x86_ops->tlb_flush(vcpu);
-               if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
-                                      &vcpu->requests)) {
+               if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
                        vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
                        r = 0;
                        goto out;
                }
-               if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
+               if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
                        vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
                        r = 0;
                        goto out;
                }
-               if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
+               if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
                        vcpu->fpu_active = 0;
                        kvm_x86_ops->fpu_deactivate(vcpu);
                }
        }
 
+       r = kvm_mmu_reload(vcpu);
+       if (unlikely(r))
+               goto out;
+
        preempt_disable();
 
        kvm_x86_ops->prepare_guest_switch(vcpu);
        if (vcpu->fpu_active)
                kvm_load_guest_fpu(vcpu);
+       kvm_load_guest_xcr0(vcpu);
 
-       local_irq_disable();
+       atomic_set(&vcpu->guest_mode, 1);
+       smp_wmb();
 
-       clear_bit(KVM_REQ_KICK, &vcpu->requests);
-       smp_mb__after_clear_bit();
+       local_irq_disable();
 
-       if (vcpu->requests || need_resched() || signal_pending(current)) {
-               set_bit(KVM_REQ_KICK, &vcpu->requests);
+       if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
+           || need_resched() || signal_pending(current)) {
+               atomic_set(&vcpu->guest_mode, 0);
+               smp_wmb();
                local_irq_enable();
                preempt_enable();
                r = 1;
@@ -4603,7 +4975,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        if (hw_breakpoint_active())
                hw_breakpoint_restore();
 
-       set_bit(KVM_REQ_KICK, &vcpu->requests);
+       kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+
+       atomic_set(&vcpu->guest_mode, 0);
+       smp_wmb();
        local_irq_enable();
 
        ++vcpu->stat.exits;
@@ -4665,7 +5040,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
                        kvm_vcpu_block(vcpu);
                        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
-                       if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
+                       if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
                        {
                                switch(vcpu->arch.mp_state) {
                                case KVM_MP_STATE_HALTED:
@@ -4717,8 +5092,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        int r;
        sigset_t sigsaved;
 
-       vcpu_load(vcpu);
-
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
@@ -4733,8 +5106,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        if (!irqchip_in_kernel(vcpu->kvm))
                kvm_set_cr8(vcpu, kvm_run->cr8);
 
-       if (vcpu->arch.pio.count || vcpu->mmio_needed ||
-           vcpu->arch.emulate_ctxt.restart) {
+       if (vcpu->arch.pio.count || vcpu->mmio_needed) {
                if (vcpu->mmio_needed) {
                        memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
                        vcpu->mmio_read_completed = 1;
@@ -4743,7 +5115,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
                r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
                srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-               if (r == EMULATE_DO_MMIO) {
+               if (r != EMULATE_DONE) {
                        r = 0;
                        goto out;
                }
@@ -4759,14 +5131,11 @@ out:
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-       vcpu_put(vcpu);
        return r;
 }
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
-       vcpu_load(vcpu);
-
        regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
        regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
        regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -4789,15 +5158,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        regs->rip = kvm_rip_read(vcpu);
        regs->rflags = kvm_get_rflags(vcpu);
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
-       vcpu_load(vcpu);
-
        kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
        kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
        kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
@@ -4822,8 +5187,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 
        vcpu->arch.exception.pending = false;
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
@@ -4842,8 +5205,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 {
        struct desc_ptr dt;
 
-       vcpu_load(vcpu);
-
        kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
        kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
        kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -4875,54 +5236,40 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                set_bit(vcpu->arch.interrupt.nr,
                        (unsigned long *)sregs->interrupt_bitmap);
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
 {
-       vcpu_load(vcpu);
        mp_state->mp_state = vcpu->arch.mp_state;
-       vcpu_put(vcpu);
        return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
 {
-       vcpu_load(vcpu);
        vcpu->arch.mp_state = mp_state->mp_state;
-       vcpu_put(vcpu);
        return 0;
 }
 
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
                    bool has_error_code, u32 error_code)
 {
-       int cs_db, cs_l, ret;
-       cache_all_regs(vcpu);
-
-       kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+       struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+       int ret;
 
-       vcpu->arch.emulate_ctxt.vcpu = vcpu;
-       vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
-       vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
-       vcpu->arch.emulate_ctxt.mode =
-               (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
-               (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
-               ? X86EMUL_MODE_VM86 : cs_l
-               ? X86EMUL_MODE_PROT64 : cs_db
-               ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+       init_emulate_ctxt(vcpu);
 
-       ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
+       ret = emulator_task_switch(&vcpu->arch.emulate_ctxt,
                                   tss_selector, reason, has_error_code,
                                   error_code);
 
        if (ret)
                return EMULATE_FAIL;
 
+       memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+       kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
        kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
        return EMULATE_DONE;
 }
@@ -4935,8 +5282,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        int pending_vec, max_bits;
        struct desc_ptr dt;
 
-       vcpu_load(vcpu);
-
        dt.size = sregs->idt.limit;
        dt.address = sregs->idt.base;
        kvm_x86_ops->set_idt(vcpu, &dt);
@@ -4996,8 +5341,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
            !is_protmode(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
@@ -5007,12 +5350,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
        unsigned long rflags;
        int i, r;
 
-       vcpu_load(vcpu);
-
        if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
                r = -EBUSY;
                if (vcpu->arch.exception.pending)
-                       goto unlock_out;
+                       goto out;
                if (dbg->control & KVM_GUESTDBG_INJECT_DB)
                        kvm_queue_exception(vcpu, DB_VECTOR);
                else
@@ -5054,33 +5395,11 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 
        r = 0;
 
-unlock_out:
-       vcpu_put(vcpu);
+out:
 
        return r;
 }
 
-/*
- * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
- * we have asm/x86/processor.h
- */
-struct fxsave {
-       u16     cwd;
-       u16     swd;
-       u16     twd;
-       u16     fop;
-       u64     rip;
-       u64     rdp;
-       u32     mxcsr;
-       u32     mxcsr_mask;
-       u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
-#ifdef CONFIG_X86_64
-       u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
-#else
-       u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
-#endif
-};
-
 /*
  * Translate a guest virtual address to a guest physical address.
  */
@@ -5091,7 +5410,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
        gpa_t gpa;
        int idx;
 
-       vcpu_load(vcpu);
        idx = srcu_read_lock(&vcpu->kvm->srcu);
        gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
        srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -5099,16 +5417,14 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
        tr->valid = gpa != UNMAPPED_GVA;
        tr->writeable = 1;
        tr->usermode = 0;
-       vcpu_put(vcpu);
 
        return 0;
 }
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-       struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
-
-       vcpu_load(vcpu);
+       struct i387_fxsave_struct *fxsave =
+                       &vcpu->arch.guest_fpu.state->fxsave;
 
        memcpy(fpu->fpr, fxsave->st_space, 128);
        fpu->fcw = fxsave->cwd;
@@ -5119,16 +5435,13 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
        fpu->last_dp = fxsave->rdp;
        memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-       struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
-
-       vcpu_load(vcpu);
+       struct i387_fxsave_struct *fxsave =
+                       &vcpu->arch.guest_fpu.state->fxsave;
 
        memcpy(fxsave->st_space, fpu->fpr, 128);
        fxsave->cwd = fpu->fcw;
@@ -5139,61 +5452,63 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
        fxsave->rdp = fpu->last_dp;
        memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
 
-       vcpu_put(vcpu);
-
        return 0;
 }
 
-void fx_init(struct kvm_vcpu *vcpu)
+int fx_init(struct kvm_vcpu *vcpu)
 {
-       unsigned after_mxcsr_mask;
+       int err;
+
+       err = fpu_alloc(&vcpu->arch.guest_fpu);
+       if (err)
+               return err;
+
+       fpu_finit(&vcpu->arch.guest_fpu);
 
        /*
-        * Touch the fpu the first time in non atomic context as if
-        * this is the first fpu instruction the exception handler
-        * will fire before the instruction returns and it'll have to
-        * allocate ram with GFP_KERNEL.
+        * Ensure guest xcr0 is valid for loading
         */
-       if (!used_math())
-               kvm_fx_save(&vcpu->arch.host_fx_image);
-
-       /* Initialize guest FPU by resetting ours and saving into guest's */
-       preempt_disable();
-       kvm_fx_save(&vcpu->arch.host_fx_image);
-       kvm_fx_finit();
-       kvm_fx_save(&vcpu->arch.guest_fx_image);
-       kvm_fx_restore(&vcpu->arch.host_fx_image);
-       preempt_enable();
+       vcpu->arch.xcr0 = XSTATE_FP;
 
        vcpu->arch.cr0 |= X86_CR0_ET;
-       after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
-       vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
-       memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
-              0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
+
+       return 0;
 }
 EXPORT_SYMBOL_GPL(fx_init);
 
+static void fx_free(struct kvm_vcpu *vcpu)
+{
+       fpu_free(&vcpu->arch.guest_fpu);
+}
+
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
        if (vcpu->guest_fpu_loaded)
                return;
 
+       /*
+        * Restore all possible states in the guest,
+        * and assume host would use all available bits.
+        * Guest xcr0 would be loaded later.
+        */
+       kvm_put_guest_xcr0(vcpu);
        vcpu->guest_fpu_loaded = 1;
-       kvm_fx_save(&vcpu->arch.host_fx_image);
-       kvm_fx_restore(&vcpu->arch.guest_fx_image);
+       unlazy_fpu(current);
+       fpu_restore_checking(&vcpu->arch.guest_fpu);
        trace_kvm_fpu(1);
 }
 
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
+       kvm_put_guest_xcr0(vcpu);
+
        if (!vcpu->guest_fpu_loaded)
                return;
 
        vcpu->guest_fpu_loaded = 0;
-       kvm_fx_save(&vcpu->arch.guest_fx_image);
-       kvm_fx_restore(&vcpu->arch.host_fx_image);
+       fpu_save_init(&vcpu->arch.guest_fpu);
        ++vcpu->stat.fpu_reload;
-       set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
+       kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
        trace_kvm_fpu(0);
 }
 
@@ -5204,12 +5519,18 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
                vcpu->arch.time_page = NULL;
        }
 
+       free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+       fx_free(vcpu);
        kvm_x86_ops->vcpu_free(vcpu);
 }
 
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
                                                unsigned int id)
 {
+       if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
+               printk_once(KERN_WARNING
+               "kvm: SMP vm created on host with unstable TSC; "
+               "guest TSC will not be reliable\n");
        return kvm_x86_ops->vcpu_create(kvm, id);
 }
 
@@ -5217,9 +5538,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
        int r;
 
-       /* We do fxsave: this must be aligned. */
-       BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
-
        vcpu->arch.mtrr_state.have_fixed = 1;
        vcpu_load(vcpu);
        r = kvm_arch_vcpu_reset(vcpu);
@@ -5241,6 +5559,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        kvm_mmu_unload(vcpu);
        vcpu_put(vcpu);
 
+       fx_free(vcpu);
        kvm_x86_ops->vcpu_free(vcpu);
 }
 
@@ -5259,17 +5578,15 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 
 int kvm_arch_hardware_enable(void *garbage)
 {
-       /*
-        * Since this may be called from a hotplug notifcation,
-        * we can't get the CPU frequency directly.
-        */
-       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
-               int cpu = raw_smp_processor_id();
-               per_cpu(cpu_tsc_khz, cpu) = 0;
-       }
+       struct kvm *kvm;
+       struct kvm_vcpu *vcpu;
+       int i;
 
        kvm_shared_msr_cpu_online();
-
+       list_for_each_entry(kvm, &vm_list, vm_list)
+               kvm_for_each_vcpu(i, vcpu, kvm)
+                       if (vcpu->cpu == smp_processor_id())
+                               kvm_request_guest_time_update(vcpu);
        return kvm_x86_ops->hardware_enable(garbage);
 }
 
@@ -5303,6 +5620,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        BUG_ON(vcpu->kvm == NULL);
        kvm = vcpu->kvm;
 
+       vcpu->arch.emulate_ctxt.ops = &emulate_ops;
        vcpu->arch.mmu.root_hpa = INVALID_PAGE;
        if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -5334,7 +5652,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        }
        vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 
+       if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
+               goto fail_free_mce_banks;
+
        return 0;
+fail_free_mce_banks:
+       kfree(vcpu->arch.mce_banks);
 fail_free_lapic:
        kvm_free_lapic(vcpu);
 fail_mmu_destroy:
@@ -5364,19 +5687,13 @@ struct  kvm *kvm_arch_create_vm(void)
        if (!kvm)
                return ERR_PTR(-ENOMEM);
 
-       kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
-       if (!kvm->arch.aliases) {
-               kfree(kvm);
-               return ERR_PTR(-ENOMEM);
-       }
-
        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
        INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
        /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
        set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
 
-       rdtscll(kvm->arch.vm_init_tsc);
+       spin_lock_init(&kvm->arch.tsc_write_lock);
 
        return kvm;
 }
@@ -5412,12 +5729,12 @@ static void kvm_free_vcpus(struct kvm *kvm)
 void kvm_arch_sync_events(struct kvm *kvm)
 {
        kvm_free_all_assigned_devices(kvm);
+       kvm_free_pit(kvm);
 }
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
        kvm_iommu_unmap_guest(kvm);
-       kvm_free_pit(kvm);
        kfree(kvm->arch.vpic);
        kfree(kvm->arch.vioapic);
        kvm_free_vcpus(kvm);
@@ -5427,7 +5744,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        if (kvm->arch.ept_identity_pagetable)
                put_page(kvm->arch.ept_identity_pagetable);
        cleanup_srcu_struct(&kvm->srcu);
-       kfree(kvm->arch.aliases);
        kfree(kvm);
 }
 
@@ -5438,6 +5754,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                int user_alloc)
 {
        int npages = memslot->npages;
+       int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
+
+       /* Prevent internal slot pages from being moved by fork()/COW. */
+       if (memslot->id >= KVM_MEMORY_SLOTS)
+               map_flags = MAP_SHARED | MAP_ANONYMOUS;
 
        /* To keep backward compatibility with older userspace,
         * x86 needs to handle !user_alloc case.
@@ -5450,7 +5771,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                        userspace_addr = do_mmap(NULL, 0,
                                                 npages * PAGE_SIZE,
                                                 PROT_READ | PROT_WRITE,
-                                                MAP_PRIVATE | MAP_ANONYMOUS,
+                                                map_flags,
                                                 0);
                        up_write(&current->mm->mmap_sem);
 
@@ -5523,7 +5844,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 
        me = get_cpu();
        if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
-               if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+               if (atomic_xchg(&vcpu->guest_mode, 0))
                        smp_send_reschedule(cpu);
        put_cpu();
 }
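
The atomic_xchg() above pairs with the guest_mode handling in
vcpu_enter_guest(): the entry path publishes guest_mode = 1 (plus a write
barrier) before its final check of vcpu->requests, while the kicker
clears guest_mode and sends the reschedule IPI only if it observed 1. The
two sides of the handshake, reduced to the essentials already shown in
this patch:

	/* vcpu side (vcpu_enter_guest) */
	atomic_set(&vcpu->guest_mode, 1);
	smp_wmb();
	if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
	    || need_resched() || signal_pending(current))
		atomic_set(&vcpu->guest_mode, 0);	/* bail out, no guest entry */

	/* kicker side (kvm_vcpu_kick) */
	if (atomic_xchg(&vcpu->guest_mode, 0))
		smp_send_reschedule(cpu);	/* vcpu may be in, or entering, guest mode */

If the exchange returns 0, the vCPU is still on the kernel side of the
check and will notice the request on its own, so the IPI can be elided.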