Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wirel...
[pandora-kernel.git] / arch/x86/kvm/x86.c
index 4f76417..c243b81 100644 (file)
@@ -46,6 +46,8 @@
 #include <linux/uaccess.h>
 #include <linux/hash.h>
 #include <linux/pci.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/pvclock_gtod.h>
 #include <trace/events/kvm.h>
 
 #define CREATE_TRACE_POINTS
@@ -118,7 +120,7 @@ struct kvm_shared_msrs {
 };
 
 static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
-static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
+static struct kvm_shared_msrs __percpu *shared_msrs;
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "pf_fixed", VCPU_STAT(pf_fixed) },
@@ -158,7 +160,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 
 u64 __read_mostly host_xcr0;
 
-int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
+
+static int kvm_vcpu_reset(struct kvm_vcpu *vcpu);
 
 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 {
@@ -187,10 +191,10 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
 
 static void shared_msr_update(unsigned slot, u32 msr)
 {
-       struct kvm_shared_msrs *smsr;
        u64 value;
+       unsigned int cpu = smp_processor_id();
+       struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
 
-       smsr = &__get_cpu_var(shared_msrs);
        /* only read, and nobody should modify it at this time,
         * so don't need lock */
        if (slot >= shared_msrs_global.nr) {
@@ -222,7 +226,8 @@ static void kvm_shared_msr_cpu_online(void)
 
 void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
 {
-       struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
+       unsigned int cpu = smp_processor_id();
+       struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
 
        if (((value ^ smsr->values[slot].curr) & mask) == 0)
                return;
@@ -238,7 +243,8 @@ EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
 
 static void drop_user_return_notifiers(void *ignore)
 {
-       struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
+       unsigned int cpu = smp_processor_id();
+       struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
 
        if (smsr->registered)
                kvm_on_user_return(&smsr->urn);
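
The shared_msrs hunks above replace the build-time DEFINE_PER_CPU variable with a pointer that kvm_arch_init() later fills in via alloc_percpu(), so every access goes through smp_processor_id()/per_cpu_ptr() instead of __get_cpu_var(). A minimal sketch of that dynamic per-CPU pattern, with made-up demo_* names standing in for the KVM structures:

        #include <linux/percpu.h>
        #include <linux/smp.h>
        #include <linux/types.h>
        #include <linux/errno.h>

        struct demo_state {                     /* stand-in for kvm_shared_msrs */
                bool registered;
        };

        static struct demo_state __percpu *demo_state;

        static int demo_init(void)
        {
                /* one zeroed instance per possible CPU */
                demo_state = alloc_percpu(struct demo_state);
                if (!demo_state)
                        return -ENOMEM;
                return 0;
        }

        static void demo_mark_this_cpu(void)
        {
                /* callers run with preemption disabled, as the shared_msr
                 * paths above do */
                struct demo_state *s = per_cpu_ptr(demo_state, smp_processor_id());

                s->registered = true;
        }

        static void demo_exit(void)
        {
                free_percpu(demo_state);
        }
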
@@ -633,7 +639,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
        }
 
        if (is_long_mode(vcpu)) {
-               if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) {
+               if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
                        if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
                                return 1;
                } else
@@ -827,6 +833,7 @@ static u32 msrs_to_save[] = {
 static unsigned num_msrs_to_save;
 
 static const u32 emulated_msrs[] = {
+       MSR_IA32_TSC_ADJUST,
        MSR_IA32_TSCDEADLINE,
        MSR_IA32_MISC_ENABLE,
        MSR_IA32_MCG_STATUS,
@@ -886,9 +893,9 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
  */
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
-       return kvm_x86_ops->set_msr(vcpu, msr_index, data);
+       return kvm_x86_ops->set_msr(vcpu, msr);
 }
 
 /*
@@ -896,9 +903,63 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
  */
 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 {
-       return kvm_set_msr(vcpu, index, *data);
+       struct msr_data msr;
+
+       msr.data = *data;
+       msr.index = index;
+       msr.host_initiated = true;
+       return kvm_set_msr(vcpu, &msr);
 }
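
do_set_msr() is one of several places this patch converts from a bare (index, data) pair to a struct msr_data, whose host_initiated flag is what later lets the MSR_IA32_TSC_ADJUST and kvm_write_tsc() code tell guest WRMSRs apart from userspace-driven writes. A hedged sketch of the same construction pattern, assuming the usual <linux/kvm_host.h> context; demo_restore_tsc() is an illustrative name, not a function from this patch:

        /* Sketch only: mirrors how do_set_msr() and kvm_arch_vcpu_postcreate()
         * build the structure. host_initiated = true marks writes that come
         * from the VMM (ioctl/restore paths) rather than from the guest.
         */
        static int demo_restore_tsc(struct kvm_vcpu *vcpu, u64 saved_tsc)
        {
                struct msr_data msr = {
                        .host_initiated = true,
                        .index          = MSR_IA32_TSC,
                        .data           = saved_tsc,
                };

                return kvm_set_msr(vcpu, &msr);
        }
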
 
+#ifdef CONFIG_X86_64
+struct pvclock_gtod_data {
+       seqcount_t      seq;
+
+       struct { /* extract of a clocksource struct */
+               int vclock_mode;
+               cycle_t cycle_last;
+               cycle_t mask;
+               u32     mult;
+               u32     shift;
+       } clock;
+
+       /* open coded 'struct timespec' */
+       u64             monotonic_time_snsec;
+       time_t          monotonic_time_sec;
+};
+
+static struct pvclock_gtod_data pvclock_gtod_data;
+
+static void update_pvclock_gtod(struct timekeeper *tk)
+{
+       struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
+
+       write_seqcount_begin(&vdata->seq);
+
+       /* copy pvclock gtod data */
+       vdata->clock.vclock_mode        = tk->clock->archdata.vclock_mode;
+       vdata->clock.cycle_last         = tk->clock->cycle_last;
+       vdata->clock.mask               = tk->clock->mask;
+       vdata->clock.mult               = tk->mult;
+       vdata->clock.shift              = tk->shift;
+
+       vdata->monotonic_time_sec       = tk->xtime_sec
+                                       + tk->wall_to_monotonic.tv_sec;
+       vdata->monotonic_time_snsec     = tk->xtime_nsec
+                                       + (tk->wall_to_monotonic.tv_nsec
+                                               << tk->shift);
+       while (vdata->monotonic_time_snsec >=
+                                       (((u64)NSEC_PER_SEC) << tk->shift)) {
+               vdata->monotonic_time_snsec -=
+                                       ((u64)NSEC_PER_SEC) << tk->shift;
+               vdata->monotonic_time_sec++;
+       }
+
+       write_seqcount_end(&vdata->seq);
+}
+#endif
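
update_pvclock_gtod() stores monotonic_time_snsec in the timekeeper's fixed-point unit of "shifted nanoseconds" (ns << tk->shift), matching xtime_nsec, so one second is NSEC_PER_SEC << shift and the while loop simply carries whole seconds into monotonic_time_sec. A small standalone illustration of that carry, with made-up values:

        #include <stdint.h>
        #include <stdio.h>

        #define NSEC_PER_SEC 1000000000ULL

        int main(void)
        {
                unsigned int shift = 24;                        /* example timekeeper shift */
                uint64_t snsec = (uint64_t)1500000000 << shift; /* 1.5 s in shifted ns */
                uint64_t sec = 41;

                /* same normalization loop as update_pvclock_gtod() */
                while (snsec >= (NSEC_PER_SEC << shift)) {
                        snsec -= NSEC_PER_SEC << shift;
                        sec++;
                }

                /* prints: sec=42 ns=500000000 */
                printf("sec=%llu ns=%llu\n",
                       (unsigned long long)sec,
                       (unsigned long long)(snsec >> shift));
                return 0;
        }
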
+
+
 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 {
        int version;
@@ -995,6 +1056,10 @@ static inline u64 get_kernel_ns(void)
        return timespec_to_ns(&ts);
 }
 
+#ifdef CONFIG_X86_64
+static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
+#endif
+
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 unsigned long max_tsc_khz;
 
@@ -1046,12 +1111,47 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
        return tsc;
 }
 
-void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
+void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+       bool vcpus_matched;
+       bool do_request = false;
+       struct kvm_arch *ka = &vcpu->kvm->arch;
+       struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+       vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
+                        atomic_read(&vcpu->kvm->online_vcpus));
+
+       if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC)
+               if (!ka->use_master_clock)
+                       do_request = true;
+
+       if (!vcpus_matched && ka->use_master_clock)
+               do_request = true;
+
+       if (do_request)
+               kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+
+       trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
+                           atomic_read(&vcpu->kvm->online_vcpus),
+                           ka->use_master_clock, gtod->clock.vclock_mode);
+#endif
+}
+
+static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
+{
+       u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
+       vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
+}
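
update_ia32_tsc_adjust_msr() keeps the emulated IA32_TSC_ADJUST MSR consistent with its architectural meaning: the guest-visible TSC is the host TSC plus tsc_offset, and any change to that offset caused by the guest must be reflected in TSC_ADJUST, while host-initiated writes must not be. For example, if the current offset is -1000 and a guest TSC write leads kvm_write_tsc() to pick a new offset of -400, the MSR grows by (-400) - (-1000) = 600; a later host_initiated restore of the TSC during migration recomputes the offset but, because of the host_initiated check further down in kvm_write_tsc(), leaves the MSR untouched.
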
+
+void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
        struct kvm *kvm = vcpu->kvm;
        u64 offset, ns, elapsed;
        unsigned long flags;
        s64 usdiff;
+       bool matched;
+       u64 data = msr->data;
 
        raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
        offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
@@ -1094,6 +1194,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
                        offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
                        pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
                }
+               matched = true;
        } else {
                /*
                 * We split periods of matched TSC writes into generations.
@@ -1108,6 +1209,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
                kvm->arch.cur_tsc_nsec = ns;
                kvm->arch.cur_tsc_write = data;
                kvm->arch.cur_tsc_offset = offset;
+               matched = false;
                pr_debug("kvm: new tsc generation %u, clock %llu\n",
                         kvm->arch.cur_tsc_generation, data);
        }
@@ -1129,26 +1231,195 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
        vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
        vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
 
+       if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
+               update_ia32_tsc_adjust_msr(vcpu, offset);
        kvm_x86_ops->write_tsc_offset(vcpu, offset);
        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+       spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
+       if (matched)
+               kvm->arch.nr_vcpus_matched_tsc++;
+       else
+               kvm->arch.nr_vcpus_matched_tsc = 0;
+
+       kvm_track_tsc_matching(vcpu);
+       spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
 }
 
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
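
The matched-generation bookkeeping above is what feeds kvm_track_tsc_matching(): the vcpu that opens a new TSC generation resets nr_vcpus_matched_tsc to 0, and each subsequent vcpu whose write lands inside the elapsed-time window increments it. With four online vcpus written in sync, the counter reaches 3 after the last write, 3 + 1 == online_vcpus, and (when the host clocksource is TSC) a KVM_REQ_MASTERCLOCK_UPDATE is raised so pvclock_update_vm_gtod_copy() can enable the master clock.
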
 
+#ifdef CONFIG_X86_64
+
+static cycle_t read_tsc(void)
+{
+       cycle_t ret;
+       u64 last;
+
+       /*
+        * Empirically, a fence (of type that depends on the CPU)
+        * before rdtsc is enough to ensure that rdtsc is ordered
+        * with respect to loads.  The various CPU manuals are unclear
+        * as to whether rdtsc can be reordered with later loads,
+        * but no one has ever seen it happen.
+        */
+       rdtsc_barrier();
+       ret = (cycle_t)vget_cycles();
+
+       last = pvclock_gtod_data.clock.cycle_last;
+
+       if (likely(ret >= last))
+               return ret;
+
+       /*
+        * GCC likes to generate cmov here, but this branch is extremely
+        * predictable (it's just a funciton of time and the likely is
+        * very likely) and there's a data dependence, so force GCC
+        * to generate a branch instead.  I don't barrier() because
+        * we don't actually need a barrier, and if this function
+        * ever gets inlined it will generate worse code.
+        */
+       asm volatile ("");
+       return last;
+}
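
read_tsc() pairs an ordering barrier with a clamp against the timekeeper's last published cycle count, so a reader inside one seqcount generation can never see the counter move backwards. A rough userspace analogue of the same pattern, sketched with compiler intrinsics instead of rdtsc_barrier()/vget_cycles(); fenced_rdtsc() and its 'last' parameter are illustrative:

        #include <stdint.h>
        #include <x86intrin.h>          /* __rdtsc(), _mm_lfence() */

        /* Clamp a fenced RDTSC read to a previously published lower bound;
         * 'last' stands in for pvclock_gtod_data.clock.cycle_last.
         */
        static inline uint64_t fenced_rdtsc(uint64_t last)
        {
                uint64_t ret;

                _mm_lfence();           /* order RDTSC after earlier loads */
                ret = __rdtsc();

                if (ret >= last)
                        return ret;

                /* force a branch rather than cmov, as the comment above explains */
                __asm__ volatile ("");
                return last;
        }
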
+
+static inline u64 vgettsc(cycle_t *cycle_now)
+{
+       long v;
+       struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+       *cycle_now = read_tsc();
+
+       v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
+       return v * gtod->clock.mult;
+}
+
+static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
+{
+       unsigned long seq;
+       u64 ns;
+       int mode;
+       struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+       ts->tv_nsec = 0;
+       do {
+               seq = read_seqcount_begin(&gtod->seq);
+               mode = gtod->clock.vclock_mode;
+               ts->tv_sec = gtod->monotonic_time_sec;
+               ns = gtod->monotonic_time_snsec;
+               ns += vgettsc(cycle_now);
+               ns >>= gtod->clock.shift;
+       } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+       timespec_add_ns(ts, ns);
+
+       return mode;
+}
+
+/* returns true if host is using tsc clocksource */
+static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
+{
+       struct timespec ts;
+
+       /* checked again under seqlock below */
+       if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
+               return false;
+
+       if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
+               return false;
+
+       monotonic_to_bootbased(&ts);
+       *kernel_ns = timespec_to_ns(&ts);
+
+       return true;
+}
+#endif
+
+/*
+ *
+ * Assuming a stable TSC across physical CPUs, and a stable TSC
+ * across virtual CPUs, the following condition is possible.
+ * Each numbered line represents an event visible to both
+ * CPUs at the next numbered event.
+ *
+ * "timespecX" represents host monotonic time. "tscX" represents
+ * RDTSC value.
+ *
+ *             VCPU0 on CPU0           |       VCPU1 on CPU1
+ *
+ * 1.  read timespec0,tsc0
+ * 2.                                  | timespec1 = timespec0 + N
+ *                                     | tsc1 = tsc0 + M
+ * 3. transition to guest              | transition to guest
+ * 4. ret0 = timespec0 + (rdtsc - tsc0) |
+ * 5.                                  | ret1 = timespec1 + (rdtsc - tsc1)
+ *                                     | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
+ *
+ * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
+ *
+ *     - ret0 < ret1
+ *     - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
+ *             ...
+ *     - 0 < N - M => M < N
+ *
+ * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
+ * always the case (the difference between two distinct xtime instances
+ * might be smaller than the difference between corresponding TSC reads,
+ * when updating guest vcpus pvclock areas).
+ *
+ * To avoid that problem, do not allow visibility of distinct
+ * system_timestamp/tsc_timestamp values simultaneously: use a master
+ * copy of host monotonic time values. Update that master copy
+ * in lockstep.
+ *
+ * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
+ *
+ */
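
A concrete instance of the problem described above, assuming a 1 GHz TSC so cycles and nanoseconds coincide: VCPU0 samples timespec0 = 1000 and tsc0 = 1000; by the time VCPU1 samples, the xtime copy has advanced by only N = 100 while the TSC has advanced by M = 120, so timespec1 = 1100 and tsc1 = 1120. A guest reading both pvclock areas at rdtsc = 2000 gets ret0 = 1000 + (2000 - 1000) = 2000 from VCPU0 but ret1 = 1100 + (2000 - 1120) = 1980 from VCPU1, a 20 ns step backwards. Using one master <kernel_ns, tsc> pair for every vcpu removes exactly this skew.
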
+
+static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+       struct kvm_arch *ka = &kvm->arch;
+       int vclock_mode;
+       bool host_tsc_clocksource, vcpus_matched;
+
+       vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
+                       atomic_read(&kvm->online_vcpus));
+
+       /*
+        * If the host uses TSC clock, then passthrough TSC as stable
+        * to the guest.
+        */
+       host_tsc_clocksource = kvm_get_time_and_clockread(
+                                       &ka->master_kernel_ns,
+                                       &ka->master_cycle_now);
+
+       ka->use_master_clock = host_tsc_clocksource && vcpus_matched;
+
+       if (ka->use_master_clock)
+               atomic_set(&kvm_guest_has_master_clock, 1);
+
+       vclock_mode = pvclock_gtod_data.clock.vclock_mode;
+       trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
+                                       vcpus_matched);
+#endif
+}
+
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
-       unsigned long flags;
+       unsigned long flags, this_tsc_khz;
        struct kvm_vcpu_arch *vcpu = &v->arch;
+       struct kvm_arch *ka = &v->kvm->arch;
        void *shared_kaddr;
-       unsigned long this_tsc_khz;
        s64 kernel_ns, max_kernel_ns;
-       u64 tsc_timestamp;
+       u64 tsc_timestamp, host_tsc;
+       struct pvclock_vcpu_time_info *guest_hv_clock;
        u8 pvclock_flags;
+       bool use_master_clock;
+
+       kernel_ns = 0;
+       host_tsc = 0;
 
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
-       tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
-       kernel_ns = get_kernel_ns();
        this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
        if (unlikely(this_tsc_khz == 0)) {
                local_irq_restore(flags);
@@ -1156,6 +1427,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
                return 1;
        }
 
+       /*
+        * If the host uses TSC clock, then passthrough TSC as stable
+        * to the guest.
+        */
+       spin_lock(&ka->pvclock_gtod_sync_lock);
+       use_master_clock = ka->use_master_clock;
+       if (use_master_clock) {
+               host_tsc = ka->master_cycle_now;
+               kernel_ns = ka->master_kernel_ns;
+       }
+       spin_unlock(&ka->pvclock_gtod_sync_lock);
+       if (!use_master_clock) {
+               host_tsc = native_read_tsc();
+               kernel_ns = get_kernel_ns();
+       }
+
+       tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
+
        /*
         * We may have to catch up the TSC to match elapsed wall clock
         * time for two reasons, even if kvmclock is used.
@@ -1217,23 +1506,20 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
                vcpu->hw_tsc_khz = this_tsc_khz;
        }
 
-       if (max_kernel_ns > kernel_ns)
-               kernel_ns = max_kernel_ns;
-
+       /* with a master <monotonic time, tsc value> tuple,
+        * pvclock clock reads always increase at the (scaled) rate
+        * of guest TSC - no need to deal with sampling errors.
+        */
+       if (!use_master_clock) {
+               if (max_kernel_ns > kernel_ns)
+                       kernel_ns = max_kernel_ns;
+       }
        /* With all the info we got, fill in the values */
        vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
        vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
        vcpu->last_kernel_ns = kernel_ns;
        vcpu->last_guest_tsc = tsc_timestamp;
 
-       pvclock_flags = 0;
-       if (vcpu->pvclock_set_guest_stopped_request) {
-               pvclock_flags |= PVCLOCK_GUEST_STOPPED;
-               vcpu->pvclock_set_guest_stopped_request = false;
-       }
-
-       vcpu->hv_clock.flags = pvclock_flags;
-
        /*
         * The interface expects us to write an even number signaling that the
         * update is finished. Since the guest won't see the intermediate
@@ -1243,6 +1529,22 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
        shared_kaddr = kmap_atomic(vcpu->time_page);
 
+       guest_hv_clock = shared_kaddr + vcpu->time_offset;
+
+       /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+       pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
+
+       if (vcpu->pvclock_set_guest_stopped_request) {
+               pvclock_flags |= PVCLOCK_GUEST_STOPPED;
+               vcpu->pvclock_set_guest_stopped_request = false;
+       }
+
+       /* If the host uses TSC clocksource, then it is stable */
+       if (use_master_clock)
+               pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
+
+       vcpu->hv_clock.flags = pvclock_flags;
+
        memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
               sizeof(vcpu->hv_clock));
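
The flags assembled above are consumed by the guest's pvclock reader: PVCLOCK_GUEST_STOPPED is preserved so a still-pending guest-stopped notification is not lost across an update, and PVCLOCK_TSC_STABLE_BIT tells the guest it may skip its cross-vcpu "last value" clamp. A hedged sketch of the guest-side read loop this layout serves; the struct mirror and helper names are for illustration and follow, but are not copied from, the pvclock ABI headers:

        #include <stdint.h>
        #include <x86intrin.h>                  /* __rdtsc() */

        /* Minimal mirror of struct pvclock_vcpu_time_info
         * (arch/x86/include/asm/pvclock-abi.h), for illustration only.
         */
        struct demo_pvclock_vcpu_time_info {
                uint32_t version;
                uint32_t pad0;
                uint64_t tsc_timestamp;
                uint64_t system_time;
                uint32_t tsc_to_system_mul;
                int8_t   tsc_shift;
                uint8_t  flags;
                uint8_t  pad[2];
        };

        /* pvclock scaling: shift the TSC delta, then apply the 32.32
         * fixed-point multiplier.
         */
        static inline uint64_t demo_scale_delta(uint64_t delta, uint32_t mul, int8_t shift)
        {
                if (shift < 0)
                        delta >>= -shift;
                else
                        delta <<= shift;
                return (uint64_t)(((unsigned __int128)delta * mul) >> 32);
        }

        static uint64_t demo_pvclock_read_ns(volatile struct demo_pvclock_vcpu_time_info *src)
        {
                uint32_t version;
                uint64_t ns;

                do {
                        version = src->version;         /* odd => update in progress */
                        __sync_synchronize();
                        ns = src->system_time +
                             demo_scale_delta(__rdtsc() - src->tsc_timestamp,
                                              src->tsc_to_system_mul,
                                              src->tsc_shift);
                        __sync_synchronize();
                } while ((version & 1) || version != src->version);

                /* With PVCLOCK_TSC_STABLE_BIT set, 'ns' can be returned as is;
                 * without it the guest must also clamp against a global
                 * last-seen value to stay monotonic across vcpus.
                 */
                return ns;
        }
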
 
@@ -1572,9 +1874,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
                &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
 }
 
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
        bool pr = false;
+       u32 msr = msr_info->index;
+       u64 data = msr_info->data;
 
        switch (msr) {
        case MSR_EFER:
@@ -1625,6 +1929,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        case MSR_IA32_TSCDEADLINE:
                kvm_set_lapic_tscdeadline_msr(vcpu, data);
                break;
+       case MSR_IA32_TSC_ADJUST:
+               if (guest_cpuid_has_tsc_adjust(vcpu)) {
+                       if (!msr_info->host_initiated) {
+                               u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
+                               kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true);
+                       }
+                       vcpu->arch.ia32_tsc_adjust_msr = data;
+               }
+               break;
        case MSR_IA32_MISC_ENABLE:
                vcpu->arch.ia32_misc_enable_msr = data;
                break;
@@ -1984,6 +2297,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case MSR_IA32_TSCDEADLINE:
                data = kvm_get_lapic_tscdeadline_msr(vcpu);
                break;
+       case MSR_IA32_TSC_ADJUST:
+               data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
+               break;
        case MSR_IA32_MISC_ENABLE:
                data = vcpu->arch.ia32_misc_enable_msr;
                break;
@@ -2342,7 +2658,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                        kvm_x86_ops->write_tsc_offset(vcpu, offset);
                        vcpu->arch.tsc_catchup = 1;
                }
-               kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+               /*
+                * On a host with synchronized TSC, there is no need to update
+                * kvmclock on vcpu->cpu migration
+                */
+               if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
+                       kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
                if (vcpu->cpu != cpu)
                        kvm_migrate_timers(vcpu);
                vcpu->cpu = cpu;
@@ -2691,15 +3012,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                if (!vcpu->arch.apic)
                        goto out;
                u.lapic = memdup_user(argp, sizeof(*u.lapic));
-               if (IS_ERR(u.lapic)) {
-                       r = PTR_ERR(u.lapic);
-                       goto out;
-               }
+               if (IS_ERR(u.lapic))
+                       return PTR_ERR(u.lapic);
 
                r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
-               if (r)
-                       goto out;
-               r = 0;
                break;
        }
        case KVM_INTERRUPT: {
@@ -2709,16 +3025,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                if (copy_from_user(&irq, argp, sizeof irq))
                        goto out;
                r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
-               if (r)
-                       goto out;
-               r = 0;
                break;
        }
        case KVM_NMI: {
                r = kvm_vcpu_ioctl_nmi(vcpu);
-               if (r)
-                       goto out;
-               r = 0;
                break;
        }
        case KVM_SET_CPUID: {
@@ -2729,8 +3039,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
                        goto out;
                r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
-               if (r)
-                       goto out;
                break;
        }
        case KVM_SET_CPUID2: {
@@ -2742,8 +3050,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                        goto out;
                r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
                                              cpuid_arg->entries);
-               if (r)
-                       goto out;
                break;
        }
        case KVM_GET_CPUID2: {
@@ -2875,10 +3181,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        }
        case KVM_SET_XSAVE: {
                u.xsave = memdup_user(argp, sizeof(*u.xsave));
-               if (IS_ERR(u.xsave)) {
-                       r = PTR_ERR(u.xsave);
-                       goto out;
-               }
+               if (IS_ERR(u.xsave))
+                       return PTR_ERR(u.xsave);
 
                r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
                break;
@@ -2900,10 +3204,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        }
        case KVM_SET_XCRS: {
                u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
-               if (IS_ERR(u.xcrs)) {
-                       r = PTR_ERR(u.xcrs);
-                       goto out;
-               }
+               if (IS_ERR(u.xcrs))
+                       return PTR_ERR(u.xcrs);
 
                r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
                break;
@@ -2951,7 +3253,7 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
        int ret;
 
        if (addr > (unsigned int)(-3 * PAGE_SIZE))
-               return -1;
+               return -EINVAL;
        ret = kvm_x86_ops->set_tss_addr(kvm, addr);
        return ret;
 }
@@ -3212,8 +3514,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
        switch (ioctl) {
        case KVM_SET_TSS_ADDR:
                r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
-               if (r < 0)
-                       goto out;
                break;
        case KVM_SET_IDENTITY_MAP_ADDR: {
                u64 ident_addr;
@@ -3222,14 +3522,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
                        goto out;
                r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
-               if (r < 0)
-                       goto out;
                break;
        }
        case KVM_SET_NR_MMU_PAGES:
                r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
-               if (r)
-                       goto out;
                break;
        case KVM_GET_NR_MMU_PAGES:
                r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
@@ -3320,8 +3616,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = 0;
        get_irqchip_out:
                kfree(chip);
-               if (r)
-                       goto out;
                break;
        }
        case KVM_SET_IRQCHIP: {
@@ -3343,8 +3637,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = 0;
        set_irqchip_out:
                kfree(chip);
-               if (r)
-                       goto out;
                break;
        }
        case KVM_GET_PIT: {
@@ -3371,9 +3663,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (!kvm->arch.vpit)
                        goto out;
                r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
-               if (r)
-                       goto out;
-               r = 0;
                break;
        }
        case KVM_GET_PIT2: {
@@ -3397,9 +3686,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (!kvm->arch.vpit)
                        goto out;
                r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
-               if (r)
-                       goto out;
-               r = 0;
                break;
        }
        case KVM_REINJECT_CONTROL: {
@@ -3408,9 +3694,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (copy_from_user(&control, argp, sizeof(control)))
                        goto out;
                r = kvm_vm_ioctl_reinject(kvm, &control);
-               if (r)
-                       goto out;
-               r = 0;
                break;
        }
        case KVM_XEN_HVM_CONFIG: {
@@ -4273,7 +4556,12 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
                            u32 msr_index, u64 data)
 {
-       return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
+       struct msr_data msr;
+
+       msr.data = data;
+       msr.index = msr_index;
+       msr.host_initiated = false;
+       return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
 }
 
 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -4495,7 +4783,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
         * instruction -> ...
         */
        pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
-       if (!is_error_pfn(pfn)) {
+       if (!is_error_noslot_pfn(pfn)) {
                kvm_release_pfn_clean(pfn);
                return true;
        }
@@ -4881,6 +5169,50 @@ static void kvm_set_mmio_spte_mask(void)
        kvm_mmu_set_mmio_spte_mask(mask);
 }
 
+#ifdef CONFIG_X86_64
+static void pvclock_gtod_update_fn(struct work_struct *work)
+{
+       struct kvm *kvm;
+
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       raw_spin_lock(&kvm_lock);
+       list_for_each_entry(kvm, &vm_list, vm_list)
+               kvm_for_each_vcpu(i, vcpu, kvm)
+                       set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
+       atomic_set(&kvm_guest_has_master_clock, 0);
+       raw_spin_unlock(&kvm_lock);
+}
+
+static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
+
+/*
+ * Notification about pvclock gtod data update.
+ */
+static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
+                              void *priv)
+{
+       struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+       struct timekeeper *tk = priv;
+
+       update_pvclock_gtod(tk);
+
+       /* disable master clock if host does not trust, or does not
+        * use, TSC clocksource
+        */
+       if (gtod->clock.vclock_mode != VCLOCK_TSC &&
+           atomic_read(&kvm_guest_has_master_clock) != 0)
+               queue_work(system_long_wq, &pvclock_gtod_work);
+
+       return 0;
+}
+
+static struct notifier_block pvclock_gtod_notifier = {
+       .notifier_call = pvclock_gtod_notify,
+};
+#endif
+
 int kvm_arch_init(void *opaque)
 {
        int r;
@@ -4903,9 +5235,16 @@ int kvm_arch_init(void *opaque)
                goto out;
        }
 
+       r = -ENOMEM;
+       shared_msrs = alloc_percpu(struct kvm_shared_msrs);
+       if (!shared_msrs) {
+               printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
+               goto out;
+       }
+
        r = kvm_mmu_module_init();
        if (r)
-               goto out;
+               goto out_free_percpu;
 
        kvm_set_mmio_spte_mask();
        kvm_init_msr_list();
@@ -4922,8 +5261,14 @@ int kvm_arch_init(void *opaque)
                host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
 
        kvm_lapic_init();
+#ifdef CONFIG_X86_64
+       pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
+#endif
+
        return 0;
 
+out_free_percpu:
+       free_percpu(shared_msrs);
 out:
        return r;
 }
@@ -4936,8 +5281,12 @@ void kvm_arch_exit(void)
                cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
                                            CPUFREQ_TRANSITION_NOTIFIER);
        unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
+#ifdef CONFIG_X86_64
+       pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+#endif
        kvm_x86_ops = NULL;
        kvm_mmu_module_exit();
+       free_percpu(shared_msrs);
 }
 
 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
@@ -5059,7 +5408,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
 
-int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
 {
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
        char instruction[3];
@@ -5235,6 +5584,29 @@ static void process_nmi(struct kvm_vcpu *vcpu)
        kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
+static void kvm_gen_update_masterclock(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+       int i;
+       struct kvm_vcpu *vcpu;
+       struct kvm_arch *ka = &kvm->arch;
+
+       spin_lock(&ka->pvclock_gtod_sync_lock);
+       kvm_make_mclock_inprogress_request(kvm);
+       /* no guest entries from this point */
+       pvclock_update_vm_gtod_copy(kvm);
+
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+
+       /* guest entries allowed */
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
+
+       spin_unlock(&ka->pvclock_gtod_sync_lock);
+#endif
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
        int r;
@@ -5247,6 +5619,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        kvm_mmu_unload(vcpu);
                if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
                        __kvm_migrate_timers(vcpu);
+               if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
+                       kvm_gen_update_masterclock(vcpu->kvm);
                if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
                        r = kvm_guest_time_update(vcpu);
                        if (unlikely(r))
@@ -5362,7 +5736,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        if (hw_breakpoint_active())
                hw_breakpoint_restore();
 
-       vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+       vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
+                                                          native_read_tsc());
 
        vcpu->mode = OUTSIDE_GUEST_MODE;
        smp_wmb();
@@ -5419,7 +5794,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                pr_debug("vcpu %d received sipi with vector # %x\n",
                         vcpu->vcpu_id, vcpu->arch.sipi_vector);
                kvm_lapic_reset(vcpu);
-               r = kvm_arch_vcpu_reset(vcpu);
+               r = kvm_vcpu_reset(vcpu);
                if (r)
                        return r;
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -6047,7 +6422,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        r = vcpu_load(vcpu);
        if (r)
                return r;
-       r = kvm_arch_vcpu_reset(vcpu);
+       r = kvm_vcpu_reset(vcpu);
        if (r == 0)
                r = kvm_mmu_setup(vcpu);
        vcpu_put(vcpu);
@@ -6055,6 +6430,23 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        return r;
 }
 
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+       int r;
+       struct msr_data msr;
+
+       r = vcpu_load(vcpu);
+       if (r)
+               return r;
+       msr.data = 0x0;
+       msr.index = MSR_IA32_TSC;
+       msr.host_initiated = true;
+       kvm_write_tsc(vcpu, &msr);
+       vcpu_put(vcpu);
+
+       return r;
+}
+
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
        int r;
@@ -6069,7 +6461,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        kvm_x86_ops->vcpu_free(vcpu);
 }
 
-int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
+static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
        atomic_set(&vcpu->arch.nmi_queued, 0);
        vcpu->arch.nmi_pending = 0;
@@ -6092,6 +6484,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 
        kvm_pmu_reset(vcpu);
 
+       memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+       vcpu->arch.regs_avail = ~0;
+       vcpu->arch.regs_dirty = ~0;
+
        return kvm_x86_ops->vcpu_reset(vcpu);
 }
 
@@ -6168,6 +6564,8 @@ int kvm_arch_hardware_enable(void *garbage)
                        kvm_for_each_vcpu(i, vcpu, kvm) {
                                vcpu->arch.tsc_offset_adjustment += delta_cyc;
                                vcpu->arch.last_host_tsc = local_tsc;
+                               set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
+                                       &vcpu->requests);
                        }
 
                        /*
@@ -6258,10 +6656,17 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
                goto fail_free_mce_banks;
 
+       r = fx_init(vcpu);
+       if (r)
+               goto fail_free_wbinvd_dirty_mask;
+
+       vcpu->arch.ia32_tsc_adjust_msr = 0x0;
        kvm_async_pf_hash_reset(vcpu);
        kvm_pmu_init(vcpu);
 
        return 0;
+fail_free_wbinvd_dirty_mask:
+       free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
 fail_free_mce_banks:
        kfree(vcpu->arch.mce_banks);
 fail_free_lapic:
@@ -6305,6 +6710,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        raw_spin_lock_init(&kvm->arch.tsc_write_lock);
        mutex_init(&kvm->arch.apic_map_lock);
+       spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+
+       pvclock_update_vm_gtod_copy(kvm);
 
        return 0;
 }