Merge branch 'kvm-updates-2.6.26' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 25 Jun 2008 01:09:06 +0000 (18:09 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 25 Jun 2008 01:09:06 +0000 (18:09 -0700)
* 'kvm-updates-2.6.26' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm:
  KVM: Remove now unused structs from kvm_para.h
  x86: KVM guest: Use the paravirt clocksource structs and functions
  KVM: Make kvm host use the paravirt clocksource structs
  x86: Make xen use the paravirt clocksource structs and functions
  x86: Add structs and functions for paravirt clocksource
  KVM: VMX: Fix host msr corruption with preemption enabled
  KVM: ioapic: fix lost interrupt when changing a device's irq
  KVM: MMU: Fix oops on guest userspace access to guest pagetable
  KVM: MMU: large page update_pte issue with non-PAE 32-bit guests (resend)
  KVM: MMU: Fix rmap_write_protect() hugepage iteration bug
  KVM: close timer injection race window in __vcpu_run
  KVM: Fix race between timer migration and vcpu migration

18 files changed:
arch/x86/Kconfig
arch/x86/kernel/Makefile
arch/x86/kernel/kvmclock.c
arch/x86/kernel/pvclock.c [new file with mode: 0644]
arch/x86/kvm/i8254.c
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/xen/Kconfig
arch/x86/xen/time.c
include/asm-x86/kvm_host.h
include/asm-x86/kvm_para.h
include/asm-x86/pvclock-abi.h [new file with mode: 0644]
include/asm-x86/pvclock.h [new file with mode: 0644]
include/linux/kvm_host.h
include/xen/interface/xen.h
virt/kvm/ioapic.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 52e18e6..e0edaaa 100644
@@ -383,6 +383,7 @@ config VMI
 config KVM_CLOCK
        bool "KVM paravirtualized clock"
        select PARAVIRT
+       select PARAVIRT_CLOCK
        depends on !(X86_VISWS || X86_VOYAGER)
        help
          Turning on this option will allow you to run a paravirtualized clock
@@ -410,6 +411,10 @@ config PARAVIRT
          over full virtualization.  However, when run without a hypervisor
          the kernel is theoretically slower and slightly larger.
 
+config PARAVIRT_CLOCK
+       bool
+       default n
+
 endif
 
 config MEMTEST_BOOTPARAM
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5e618c3..77807d4 100644
@@ -82,6 +82,7 @@ obj-$(CONFIG_VMI)             += vmi_32.o vmiclock_32.o
 obj-$(CONFIG_KVM_GUEST)                += kvm.o
 obj-$(CONFIG_KVM_CLOCK)                += kvmclock.o
 obj-$(CONFIG_PARAVIRT)         += paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_PARAVIRT_CLOCK)   += pvclock.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM)  += pcspeaker.o
 
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 08a3098..87edf1c 100644
@@ -18,6 +18,7 @@
 
 #include <linux/clocksource.h>
 #include <linux/kvm_para.h>
+#include <asm/pvclock.h>
 #include <asm/arch_hooks.h>
 #include <asm/msr.h>
 #include <asm/apic.h>
@@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg)
 early_param("no-kvmclock", parse_no_kvmclock);
 
 /* The hypervisor will put information about time periodically here */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock);
-#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
+static struct pvclock_wall_clock wall_clock;
 
-static inline u64 kvm_get_delta(u64 last_tsc)
-{
-       int cpu = smp_processor_id();
-       u64 delta = native_read_tsc() - last_tsc;
-       return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
-}
-
-static struct kvm_wall_clock wall_clock;
-static cycle_t kvm_clock_read(void);
 /*
  * The wallclock is the time of day when we booted. Since then, some time may
  * have elapsed since the hypervisor wrote the data. So we try to account for
@@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void);
  */
 static unsigned long kvm_get_wallclock(void)
 {
-       u32 wc_sec, wc_nsec;
-       u64 delta;
+       struct pvclock_vcpu_time_info *vcpu_time;
        struct timespec ts;
-       int version, nsec;
        int low, high;
 
        low = (int)__pa(&wall_clock);
        high = ((u64)__pa(&wall_clock) >> 32);
+       native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
 
-       delta = kvm_clock_read();
+       vcpu_time = &get_cpu_var(hv_clock);
+       pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
+       put_cpu_var(hv_clock);
 
-       native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
-       do {
-               version = wall_clock.wc_version;
-               rmb();
-               wc_sec = wall_clock.wc_sec;
-               wc_nsec = wall_clock.wc_nsec;
-               rmb();
-       } while ((wall_clock.wc_version != version) || (version & 1));
-
-       delta = kvm_clock_read() - delta;
-       delta += wc_nsec;
-       nsec = do_div(delta, NSEC_PER_SEC);
-       set_normalized_timespec(&ts, wc_sec + delta, nsec);
-       /*
-        * Of all mechanisms of time adjustment I've tested, this one
-        * was the champion!
-        */
-       return ts.tv_sec + 1;
+       return ts.tv_sec;
 }
 
 static int kvm_set_wallclock(unsigned long now)
 {
-       return 0;
+       return -1;
 }
 
-/*
- * This is our read_clock function. The host puts an tsc timestamp each time
- * it updates a new time. Without the tsc adjustment, we can have a situation
- * in which a vcpu starts to run earlier (smaller system_time), but probes
- * time later (compared to another vcpu), leading to backwards time
- */
 static cycle_t kvm_clock_read(void)
 {
-       u64 last_tsc, now;
-       int cpu;
+       struct pvclock_vcpu_time_info *src;
+       cycle_t ret;
 
-       preempt_disable();
-       cpu = smp_processor_id();
-
-       last_tsc = get_clock(cpu, tsc_timestamp);
-       now = get_clock(cpu, system_time);
-
-       now += kvm_get_delta(last_tsc);
-       preempt_enable();
-
-       return now;
+       src = &get_cpu_var(hv_clock);
+       ret = pvclock_clocksource_read(src);
+       put_cpu_var(hv_clock);
+       return ret;
 }
+
 static struct clocksource kvm_clock = {
        .name = "kvm-clock",
        .read = kvm_clock_read,
@@ -123,13 +88,14 @@ static struct clocksource kvm_clock = {
        .flags = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
-static int kvm_register_clock(void)
+static int kvm_register_clock(char *txt)
 {
        int cpu = smp_processor_id();
        int low, high;
        low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
        high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
-
+       printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
+              cpu, high, low, txt);
        return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
 }
 
@@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void)
         * Now that the first cpu already had this clocksource initialized,
         * we shouldn't fail.
         */
-       WARN_ON(kvm_register_clock());
+       WARN_ON(kvm_register_clock("secondary cpu clock"));
        /* ok, done with our trickery, call native */
        setup_secondary_APIC_clock();
 }
 #endif
 
+#ifdef CONFIG_SMP
+void __init kvm_smp_prepare_boot_cpu(void)
+{
+       WARN_ON(kvm_register_clock("primary cpu clock"));
+       native_smp_prepare_boot_cpu();
+}
+#endif
+
 /*
  * After the clock is registered, the host will keep writing to the
  * registered memory location. If the guest happens to shutdown, this memory
@@ -174,13 +148,16 @@ void __init kvmclock_init(void)
                return;
 
        if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
-               if (kvm_register_clock())
+               if (kvm_register_clock("boot clock"))
                        return;
                pv_time_ops.get_wallclock = kvm_get_wallclock;
                pv_time_ops.set_wallclock = kvm_set_wallclock;
                pv_time_ops.sched_clock = kvm_clock_read;
 #ifdef CONFIG_X86_LOCAL_APIC
                pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
+#endif
+#ifdef CONFIG_SMP
+               smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 #endif
                machine_ops.shutdown  = kvm_shutdown;
 #ifdef CONFIG_KEXEC
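
Side note on kvm_register_clock() above: the guest passes the physical address of its per-cpu pvclock area through MSR_KVM_SYSTEM_TIME, with bit 0 of the low word doubling as the enable flag (hence the "| 1"). A minimal standalone sketch of that encoding, assuming the legacy kvmclock MSR number 0x12; the sample address is made up for illustration:

#include <stdint.h>
#include <stdio.h>

#define MSR_KVM_SYSTEM_TIME 0x12	/* legacy kvmclock MSR */

static void encode_system_time_msr(uint64_t pa, uint32_t *low, uint32_t *high)
{
	*low  = (uint32_t)pa | 1;	/* bit 0 enables the clock area */
	*high = (uint32_t)(pa >> 32);
}

int main(void)
{
	uint32_t low, high;

	/* hypothetical physical address of a per-cpu hv_clock struct;
	 * page alignment keeps bit 0 free for the enable flag */
	encode_system_time_msr(0x1234567000ULL, &low, &high);
	printf("wrmsr(%#x, low=%#x, high=%#x)\n",
	       MSR_KVM_SYSTEM_TIME, low, high);
	return 0;
}
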
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
new file mode 100644
index 0000000..05fbe9a
--- /dev/null
+++ b/arch/x86/kernel/pvclock.c
@@ -0,0 +1,141 @@
+/*  paravirtual clock -- common code used by kvm/xen
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <asm/pvclock.h>
+
+/*
+ * These are periodically updated
+ *    xen: magic shared_info page
+ *    kvm: gpa registered via msr
+ * and then copied here.
+ */
+struct pvclock_shadow_time {
+       u64 tsc_timestamp;     /* TSC at last update of time vals.  */
+       u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
+       u32 tsc_to_nsec_mul;
+       int tsc_shift;
+       u32 version;
+};
+
+/*
+ * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+       u64 product;
+#ifdef __i386__
+       u32 tmp1, tmp2;
+#endif
+
+       if (shift < 0)
+               delta >>= -shift;
+       else
+               delta <<= shift;
+
+#ifdef __i386__
+       __asm__ (
+               "mul  %5       ; "
+               "mov  %4,%%eax ; "
+               "mov  %%edx,%4 ; "
+               "mul  %5       ; "
+               "xor  %5,%5    ; "
+               "add  %4,%%eax ; "
+               "adc  %5,%%edx ; "
+               : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+               : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif __x86_64__
+       __asm__ (
+               "mul %%rdx ; shrd $32,%%rdx,%%rax"
+               : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+       return product;
+}
+
+static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
+{
+       u64 delta = native_read_tsc() - shadow->tsc_timestamp;
+       return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+}
+
+/*
+ * Reads a consistent set of time-base values from hypervisor,
+ * into a shadow data area.
+ */
+static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
+                                       struct pvclock_vcpu_time_info *src)
+{
+       do {
+               dst->version = src->version;
+               rmb();          /* fetch version before data */
+               dst->tsc_timestamp     = src->tsc_timestamp;
+               dst->system_timestamp  = src->system_time;
+               dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
+               dst->tsc_shift         = src->tsc_shift;
+               rmb();          /* test version after fetching data */
+       } while ((src->version & 1) || (dst->version != src->version));
+
+       return dst->version;
+}
+
+cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+{
+       struct pvclock_shadow_time shadow;
+       unsigned version;
+       cycle_t ret, offset;
+
+       do {
+               version = pvclock_get_time_values(&shadow, src);
+               barrier();
+               offset = pvclock_get_nsec_offset(&shadow);
+               ret = shadow.system_timestamp + offset;
+               barrier();
+       } while (version != src->version);
+
+       return ret;
+}
+
+void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
+                           struct pvclock_vcpu_time_info *vcpu_time,
+                           struct timespec *ts)
+{
+       u32 version;
+       u64 delta;
+       struct timespec now;
+
+       /* get wallclock at system boot */
+       do {
+               version = wall_clock->version;
+               rmb();          /* fetch version before time */
+               now.tv_sec  = wall_clock->sec;
+               now.tv_nsec = wall_clock->nsec;
+               rmb();          /* fetch time before checking version */
+       } while ((wall_clock->version & 1) || (version != wall_clock->version));
+
+       delta = pvclock_clocksource_read(vcpu_time);    /* time since system boot */
+       delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+
+       now.tv_nsec = do_div(delta, NSEC_PER_SEC);
+       now.tv_sec = delta;
+
+       set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+}
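
The inline assembly in scale_delta() implements a 32.32 fixed-point multiply: (delta << shift) * mul_frac >> 32, done in pieces on i386 because that ISA has no single 64x64 multiply. A portable sketch of the same arithmetic, not part of the patch and assuming a compiler that provides unsigned __int128:

#include <stdint.h>
#include <stdio.h>

static uint64_t scale_delta_portable(uint64_t delta, uint32_t mul_frac,
				     int shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	/* 64x32 -> 96-bit product; keep the top 64 bits of the
	 * 32.32 fixed-point result */
	return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}

int main(void)
{
	/* mul_frac = 2^32/3 turns a 3 GHz cycle count into nanoseconds
	 * (one cycle = 1/3 ns), with no shift needed */
	uint64_t ns = scale_delta_portable(3000000000ULL, 0x55555555u, 0);
	printf("%llu\n", (unsigned long long)ns);	/* ~999999999 */
	return 0;
}

On x86-64 the single mul/shrd pair in the patch computes exactly this.
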
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index f2f5d26..3829aa7 100644
@@ -200,9 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps)
 
        atomic_inc(&pt->pending);
        smp_mb__after_atomic_inc();
-       if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
-               vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-               wake_up_interruptible(&vcpu0->wq);
+       if (vcpu0) {
+               set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
+               if (waitqueue_active(&vcpu0->wq)) {
+                       vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+                       wake_up_interruptible(&vcpu0->wq);
+               }
        }
 
        pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c297c50..ebc03f5 100644
@@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
        wait_queue_head_t *q = &apic->vcpu->wq;
 
        atomic_inc(&apic->timer.pending);
+       set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
        if (waitqueue_active(q)) {
                apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
                wake_up_interruptible(q);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ee3f530..7e7c396 100644
@@ -640,6 +640,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
                        rmap_remove(kvm, spte);
                        --kvm->stat.lpages;
                        set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+                       spte = NULL;
                        write_protected = 1;
                }
                spte = rmap_next(kvm, rmapp, spte);
@@ -1082,10 +1083,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                struct kvm_mmu_page *shadow;
 
                spte |= PT_WRITABLE_MASK;
-               if (user_fault) {
-                       mmu_unshadow(vcpu->kvm, gfn);
-                       goto unshadowed;
-               }
 
                shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
                if (shadow ||
@@ -1102,8 +1099,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                }
        }
 
-unshadowed:
-
        if (pte_access & ACC_WRITE_MASK)
                mark_page_dirty(vcpu->kvm, gfn);
 
@@ -1580,11 +1575,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
                                  u64 *spte,
                                  const void *new)
 {
-       if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
-           && !vcpu->arch.update_pte.largepage) {
-               ++vcpu->kvm->stat.mmu_pde_zapped;
-               return;
-       }
+       if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+               if (!vcpu->arch.update_pte.largepage ||
+                   sp->role.glevels == PT32_ROOT_LEVEL) {
+                       ++vcpu->kvm->stat.mmu_pde_zapped;
+                       return;
+               }
+       }
 
        ++vcpu->kvm->stat.mmu_pte_updated;
        if (sp->role.glevels == PT32_ROOT_LEVEL)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 02efbe7..540e951 100644
@@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
        load_transition_efer(vmx);
 }
 
-static void vmx_load_host_state(struct vcpu_vmx *vmx)
+static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 {
        unsigned long flags;
 
@@ -596,6 +596,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
        reload_host_efer(vmx);
 }
 
+static void vmx_load_host_state(struct vcpu_vmx *vmx)
+{
+       preempt_disable();
+       __vmx_load_host_state(vmx);
+       preempt_enable();
+}
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
@@ -654,7 +661,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
-       vmx_load_host_state(to_vmx(vcpu));
+       __vmx_load_host_state(to_vmx(vcpu));
 }
 
 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -884,11 +891,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
        switch (msr_index) {
 #ifdef CONFIG_X86_64
        case MSR_EFER:
+               vmx_load_host_state(vmx);
                ret = kvm_set_msr_common(vcpu, msr_index, data);
-               if (vmx->host_state.loaded) {
-                       reload_host_efer(vmx);
-                       load_transition_efer(vmx);
-               }
                break;
        case MSR_FS_BASE:
                vmcs_writel(GUEST_FS_BASE, data);
@@ -910,11 +914,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
                guest_write_tsc(data);
                break;
        default:
+               vmx_load_host_state(vmx);
                msr = find_msr_entry(vmx, msr_index);
                if (msr) {
                        msr->data = data;
-                       if (vmx->host_state.loaded)
-                               load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
                        break;
                }
                ret = kvm_set_msr_common(vcpu, msr_index, data);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 00acf13..63a77ca 100644
@@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 {
        static int version;
-       struct kvm_wall_clock wc;
-       struct timespec wc_ts;
+       struct pvclock_wall_clock wc;
+       struct timespec now, sys, boot;
 
        if (!wall_clock)
                return;
@@ -502,10 +502,19 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 
        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 
-       wc_ts = current_kernel_time();
-       wc.wc_sec = wc_ts.tv_sec;
-       wc.wc_nsec = wc_ts.tv_nsec;
-       wc.wc_version = version;
+       /*
+        * The guest calculates current wall clock time by adding
+        * system time (updated by kvm_write_guest_time below) to the
+        * wall clock specified here.  Guest system time equals host
+        * system time for us, thus we must fill in host boot time here.
+        */
+       now = current_kernel_time();
+       ktime_get_ts(&sys);
+       boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
+
+       wc.sec = boot.tv_sec;
+       wc.nsec = boot.tv_nsec;
+       wc.version = version;
 
        kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
 
@@ -513,6 +522,45 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 }
 
+static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
+{
+       uint32_t quotient, remainder;
+
+       /* Don't try to replace with do_div(); this one calculates
+        * "(dividend << 32) / divisor" */
+       __asm__ ( "divl %4"
+                 : "=a" (quotient), "=d" (remainder)
+                 : "0" (0), "1" (dividend), "r" (divisor) );
+       return quotient;
+}
+
+static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
+{
+       uint64_t nsecs = 1000000000LL;
+       int32_t  shift = 0;
+       uint64_t tps64;
+       uint32_t tps32;
+
+       tps64 = tsc_khz * 1000LL;
+       while (tps64 > nsecs*2) {
+               tps64 >>= 1;
+               shift--;
+       }
+
+       tps32 = (uint32_t)tps64;
+       while (tps32 <= (uint32_t)nsecs) {
+               tps32 <<= 1;
+               shift++;
+       }
+
+       hv_clock->tsc_shift = shift;
+       hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
+
+       pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
+                __FUNCTION__, tsc_khz, hv_clock->tsc_shift,
+                hv_clock->tsc_to_system_mul);
+}
+
 static void kvm_write_guest_time(struct kvm_vcpu *v)
 {
        struct timespec ts;
@@ -523,6 +571,11 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
        if ((!vcpu->time_page))
                return;
 
+       if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
+               kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
+               vcpu->hv_clock_tsc_khz = tsc_khz;
+       }
+
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
        kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
@@ -537,14 +590,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
        /*
         * The interface expects us to write an even number signaling that the
         * update is finished. Since the guest won't see the intermediate
-        * state, we just write "2" at the end
+        * state, we just increase by 2 at the end.
         */
-       vcpu->hv_clock.version = 2;
+       vcpu->hv_clock.version += 2;
 
        shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
 
        memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
-               sizeof(vcpu->hv_clock));
+              sizeof(vcpu->hv_clock));
 
        kunmap_atomic(shared_kaddr, KM_USER0);
 
@@ -599,10 +652,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                /* ...but clean it before doing the actual write */
                vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
 
-               vcpu->arch.hv_clock.tsc_to_system_mul =
-                                       clocksource_khz2mult(tsc_khz, 22);
-               vcpu->arch.hv_clock.tsc_shift = 22;
-
                down_read(&current->mm->mmap_sem);
                vcpu->arch.time_page =
                                gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
@@ -2759,6 +2808,8 @@ again:
        if (vcpu->requests) {
                if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
                        __kvm_migrate_timers(vcpu);
+               if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+                       kvm_x86_ops->tlb_flush(vcpu);
                if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
                                       &vcpu->requests)) {
                        kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -2772,6 +2823,7 @@ again:
                }
        }
 
+       clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
        kvm_inject_pending_timer_irqs(vcpu);
 
        preempt_disable();
@@ -2781,21 +2833,13 @@ again:
 
        local_irq_disable();
 
-       if (need_resched()) {
+       if (vcpu->requests || need_resched()) {
                local_irq_enable();
                preempt_enable();
                r = 1;
                goto out;
        }
 
-       if (vcpu->requests)
-               if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
-                       local_irq_enable();
-                       preempt_enable();
-                       r = 1;
-                       goto out;
-               }
-
        if (signal_pending(current)) {
                local_irq_enable();
                preempt_enable();
@@ -2825,9 +2869,6 @@ again:
 
        kvm_guest_enter();
 
-       if (vcpu->requests)
-               if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
-                       kvm_x86_ops->tlb_flush(vcpu);
 
        KVMTRACE_0D(VMENTRY, vcpu, entryexit);
        kvm_x86_ops->run(vcpu, kvm_run);
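
To see what kvm_set_time_scale() above produces, here is a standalone sketch that reruns its shift/multiplier search for a hypothetical 2 GHz TSC (the computation mirrors the hunk; the sample frequency and printf are illustration only):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t tsc_khz = 2000000;		/* hypothetical 2 GHz TSC */
	uint64_t nsecs = 1000000000ULL;
	int32_t shift = 0;
	uint64_t tps64 = tsc_khz * 1000ULL;	/* ticks per second */
	uint32_t tps32, mul;

	while (tps64 > nsecs * 2) {		/* halve until within 2x of ns/sec */
		tps64 >>= 1;
		shift--;
	}
	tps32 = (uint32_t)tps64;
	while (tps32 <= (uint32_t)nsecs) {	/* double until above ns/sec */
		tps32 <<= 1;
		shift++;
	}
	mul = (uint32_t)((nsecs << 32) / tps32);	/* what div_frac() computes */

	/* 2 GHz: shift == 0, mul == 0x80000000, i.e. 0.5 ns per cycle */
	printf("tsc_shift=%d tsc_to_system_mul=%#x\n", shift, mul);
	return 0;
}

The guest then recovers nanoseconds via scale_delta(delta, mul, shift), so replacing the old hardcoded tsc_shift of 22 with a per-host computed pair keeps precision across TSC frequencies.
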
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 525b108..6c388e5 100644
@@ -5,6 +5,7 @@
 config XEN
        bool "Xen guest support"
        select PARAVIRT
+       select PARAVIRT_CLOCK
        depends on X86_32
        depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
        help
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 52b2e38..41e2175 100644
@@ -14,6 +14,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/math64.h>
 
+#include <asm/pvclock.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
 
 
 static cycle_t xen_clocksource_read(void);
 
-/* These are perodically updated in shared_info, and then copied here. */
-struct shadow_time_info {
-       u64 tsc_timestamp;     /* TSC at last update of time vals.  */
-       u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
-       u32 tsc_to_nsec_mul;
-       int tsc_shift;
-       u32 version;
-};
-
-static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
-
 /* runstate info updated by Xen */
 static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
 
@@ -211,7 +201,7 @@ unsigned long long xen_sched_clock(void)
 unsigned long xen_cpu_khz(void)
 {
        u64 xen_khz = 1000000ULL << 32;
-       const struct vcpu_time_info *info =
+       const struct pvclock_vcpu_time_info *info =
                &HYPERVISOR_shared_info->vcpu_info[0].time;
 
        do_div(xen_khz, info->tsc_to_system_mul);
@@ -223,121 +213,26 @@ unsigned long xen_cpu_khz(void)
        return xen_khz;
 }
 
-/*
- * Reads a consistent set of time-base values from Xen, into a shadow data
- * area.
- */
-static unsigned get_time_values_from_xen(void)
-{
-       struct vcpu_time_info   *src;
-       struct shadow_time_info *dst;
-
-       /* src is shared memory with the hypervisor, so we need to
-          make sure we get a consistent snapshot, even in the face of
-          being preempted. */
-       src = &__get_cpu_var(xen_vcpu)->time;
-       dst = &__get_cpu_var(shadow_time);
-
-       do {
-               dst->version = src->version;
-               rmb();          /* fetch version before data */
-               dst->tsc_timestamp     = src->tsc_timestamp;
-               dst->system_timestamp  = src->system_time;
-               dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
-               dst->tsc_shift         = src->tsc_shift;
-               rmb();          /* test version after fetching data */
-       } while ((src->version & 1) | (dst->version ^ src->version));
-
-       return dst->version;
-}
-
-/*
- * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
- * yielding a 64-bit result.
- */
-static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
-{
-       u64 product;
-#ifdef __i386__
-       u32 tmp1, tmp2;
-#endif
-
-       if (shift < 0)
-               delta >>= -shift;
-       else
-               delta <<= shift;
-
-#ifdef __i386__
-       __asm__ (
-               "mul  %5       ; "
-               "mov  %4,%%eax ; "
-               "mov  %%edx,%4 ; "
-               "mul  %5       ; "
-               "xor  %5,%5    ; "
-               "add  %4,%%eax ; "
-               "adc  %5,%%edx ; "
-               : "=A" (product), "=r" (tmp1), "=r" (tmp2)
-               : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
-#elif __x86_64__
-       __asm__ (
-               "mul %%rdx ; shrd $32,%%rdx,%%rax"
-               : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
-#else
-#error implement me!
-#endif
-
-       return product;
-}
-
-static u64 get_nsec_offset(struct shadow_time_info *shadow)
-{
-       u64 now, delta;
-       now = native_read_tsc();
-       delta = now - shadow->tsc_timestamp;
-       return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
-}
-
 static cycle_t xen_clocksource_read(void)
 {
-       struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
+       struct pvclock_vcpu_time_info *src;
        cycle_t ret;
-       unsigned version;
-
-       do {
-               version = get_time_values_from_xen();
-               barrier();
-               ret = shadow->system_timestamp + get_nsec_offset(shadow);
-               barrier();
-       } while (version != __get_cpu_var(xen_vcpu)->time.version);
-
-       put_cpu_var(shadow_time);
 
+       src = &get_cpu_var(xen_vcpu)->time;
+       ret = pvclock_clocksource_read(src);
+       put_cpu_var(xen_vcpu);
        return ret;
 }
 
 static void xen_read_wallclock(struct timespec *ts)
 {
-       const struct shared_info *s = HYPERVISOR_shared_info;
-       u32 version;
-       u64 delta;
-       struct timespec now;
-
-       /* get wallclock at system boot */
-       do {
-               version = s->wc_version;
-               rmb();          /* fetch version before time */
-               now.tv_sec  = s->wc_sec;
-               now.tv_nsec = s->wc_nsec;
-               rmb();          /* fetch time before checking version */
-       } while ((s->wc_version & 1) | (version ^ s->wc_version));
+       struct shared_info *s = HYPERVISOR_shared_info;
+       struct pvclock_wall_clock *wall_clock = &(s->wc);
+       struct pvclock_vcpu_time_info *vcpu_time;
 
-       delta = xen_clocksource_read(); /* time since system boot */
-       delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
-
-       now.tv_nsec = do_div(delta, NSEC_PER_SEC);
-       now.tv_sec = delta;
-
-       set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+       vcpu_time = &get_cpu_var(xen_vcpu)->time;
+       pvclock_read_wallclock(wall_clock, vcpu_time, ts);
+       put_cpu_var(xen_vcpu);
 }
 
 unsigned long xen_get_wallclock(void)
@@ -345,7 +240,6 @@ unsigned long xen_get_wallclock(void)
        struct timespec ts;
 
        xen_read_wallclock(&ts);
-
        return ts.tv_sec;
 }
 
@@ -569,8 +463,6 @@ __init void xen_time_init(void)
 {
        int cpu = smp_processor_id();
 
-       get_time_values_from_xen();
-
        clocksource_register(&xen_clocksource);
 
        if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 1d8cd01..844f2a8 100644
@@ -18,6 +18,7 @@
 #include <linux/kvm_para.h>
 #include <linux/kvm_types.h>
 
+#include <asm/pvclock-abi.h>
 #include <asm/desc.h>
 
 #define KVM_MAX_VCPUS 16
@@ -282,7 +283,8 @@ struct kvm_vcpu_arch {
        struct x86_emulate_ctxt emulate_ctxt;
 
        gpa_t time;
-       struct kvm_vcpu_time_info hv_clock;
+       struct pvclock_vcpu_time_info hv_clock;
+       unsigned int hv_clock_tsc_khz;
        unsigned int time_offset;
        struct page *time_page;
 };
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
index 5098459..bfd9900 100644
@@ -48,24 +48,6 @@ struct kvm_mmu_op_release_pt {
 #ifdef __KERNEL__
 #include <asm/processor.h>
 
-/* xen binary-compatible interface. See xen headers for details */
-struct kvm_vcpu_time_info {
-       uint32_t version;
-       uint32_t pad0;
-       uint64_t tsc_timestamp;
-       uint64_t system_time;
-       uint32_t tsc_to_system_mul;
-       int8_t   tsc_shift;
-       int8_t   pad[3];
-} __attribute__((__packed__)); /* 32 bytes */
-
-struct kvm_wall_clock {
-       uint32_t wc_version;
-       uint32_t wc_sec;
-       uint32_t wc_nsec;
-} __attribute__((__packed__));
-
-
 extern void kvmclock_init(void);
 
 
diff --git a/include/asm-x86/pvclock-abi.h b/include/asm-x86/pvclock-abi.h
new file mode 100644
index 0000000..6857f84
--- /dev/null
+++ b/include/asm-x86/pvclock-abi.h
@@ -0,0 +1,42 @@
+#ifndef _ASM_X86_PVCLOCK_ABI_H_
+#define _ASM_X86_PVCLOCK_ABI_H_
+#ifndef __ASSEMBLY__
+
+/*
+ * These structs MUST NOT be changed.
+ * They are the ABI between hypervisor and guest OS.
+ * Both Xen and KVM are using this.
+ *
+ * pvclock_vcpu_time_info holds the system time and the tsc timestamp
+ * of the last update. So the guest can use the tsc delta to get a
+ * more precise system time.  There is one per virtual cpu.
+ *
+ * pvclock_wall_clock references the point in time when the system
+ * time was zero (usually boot time), thus the guest calculates the
+ * current wall clock by adding the system time.
+ *
+ * Protocol for the "version" fields is: hypervisor raises it (making
+ * it uneven) before it starts updating the fields and raises it again
+ * (making it even) when it is done.  Thus the guest can make sure the
+ * time values it got are consistent by checking the version before
+ * and after reading them.
+ */
+
+struct pvclock_vcpu_time_info {
+       u32   version;
+       u32   pad0;
+       u64   tsc_timestamp;
+       u64   system_time;
+       u32   tsc_to_system_mul;
+       s8    tsc_shift;
+       u8    pad[3];
+} __attribute__((__packed__)); /* 32 bytes */
+
+struct pvclock_wall_clock {
+       u32   version;
+       u32   sec;
+       u32   nsec;
+} __attribute__((__packed__));
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_PVCLOCK_ABI_H_ */
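
For contrast with the reader loops in pvclock.c, a hypothetical writer-side sketch of the version protocol described above: the updater makes version odd before touching the fields and even again afterwards, with barriers ordering the stores (struct and helper names here are illustrative, not kernel API):

#include <stdint.h>

struct time_info {
	volatile uint32_t version;
	uint64_t tsc_timestamp;
	uint64_t system_time;
};

static void publish_time(struct time_info *ti, uint64_t tsc, uint64_t ns)
{
	ti->version++;			/* odd: update in progress */
	__sync_synchronize();		/* order version store before data */
	ti->tsc_timestamp = tsc;
	ti->system_time = ns;
	__sync_synchronize();		/* order data before final version */
	ti->version++;			/* even: consistent snapshot */
}

A reader that sees the same even version before and after copying the fields knows it got a consistent snapshot; an odd or changed version means retry.
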
diff --git a/include/asm-x86/pvclock.h b/include/asm-x86/pvclock.h
new file mode 100644
index 0000000..85b1bba
--- /dev/null
+++ b/include/asm-x86/pvclock.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_X86_PVCLOCK_H_
+#define _ASM_X86_PVCLOCK_H_
+
+#include <linux/clocksource.h>
+#include <asm/pvclock-abi.h>
+
+/* some helper functions for xen and kvm pv clock sources */
+cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
+                           struct pvclock_vcpu_time_info *vcpu,
+                           struct timespec *ts);
+
+#endif /* _ASM_X86_PVCLOCK_H_ */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 092b1b2..de9d1df 100644
@@ -33,6 +33,7 @@
 #define KVM_REQ_REPORT_TPR_ACCESS  2
 #define KVM_REQ_MMU_RELOAD         3
 #define KVM_REQ_TRIPLE_FAULT       4
+#define KVM_REQ_PENDING_TIMER      5
 
 struct kvm_vcpu;
 extern struct kmem_cache *kvm_vcpu_cache;
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 9b018da..819a033 100644
@@ -10,6 +10,7 @@
 #define __XEN_PUBLIC_XEN_H__
 
 #include <asm/xen/interface.h>
+#include <asm/pvclock-abi.h>
 
 /*
  * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).
@@ -336,7 +337,7 @@ struct vcpu_info {
        uint8_t evtchn_upcall_mask;
        unsigned long evtchn_pending_sel;
        struct arch_vcpu_info arch;
-       struct vcpu_time_info time;
+       struct pvclock_vcpu_time_info time;
 }; /* 64 bytes (x86) */
 
 /*
@@ -384,9 +385,7 @@ struct shared_info {
         * Wallclock time: updated only by control software. Guests should base
         * their gettimeofday() syscall on this wallclock-base value.
         */
-       uint32_t wc_version;      /* Version counter: see vcpu_time_info_t. */
-       uint32_t wc_sec;          /* Secs  00:00:00 UTC, Jan 1, 1970.  */
-       uint32_t wc_nsec;         /* Nsecs 00:00:00 UTC, Jan 1, 1970.  */
+       struct pvclock_wall_clock wc;
 
        struct arch_shared_info arch;
 
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 98778cb..1dcf9f3 100644
@@ -269,28 +269,9 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
        }
 }
 
-static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
+static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi)
 {
-       int i;
-
-       for (i = 0; i < IOAPIC_NUM_PINS; i++)
-               if (ioapic->redirtbl[i].fields.vector == vector)
-                       return i;
-       return -1;
-}
-
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
-{
-       struct kvm_ioapic *ioapic = kvm->arch.vioapic;
        union ioapic_redir_entry *ent;
-       int gsi;
-
-       gsi = get_eoi_gsi(ioapic, vector);
-       if (gsi == -1) {
-               printk(KERN_WARNING "Can't find redir item for %d EOI\n",
-                      vector);
-               return;
-       }
 
        ent = &ioapic->redirtbl[gsi];
        ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
@@ -300,6 +281,16 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
                ioapic_deliver(ioapic, gsi);
 }
 
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
+{
+       struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+       int i;
+
+       for (i = 0; i < IOAPIC_NUM_PINS; i++)
+               if (ioapic->redirtbl[i].fields.vector == vector)
+                       __kvm_ioapic_update_eoi(ioapic, i);
+}
+
 static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr)
 {
        struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
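
Why kvm_ioapic_update_eoi() now scans every pin: if a device's irq is re-routed while a level-triggered interrupt is in flight, two redirection-table entries can briefly carry the same vector, and the old first-match lookup would EOI only one of them, leaving the other pin's remote_irr set and the interrupt lost. A toy model of the two behaviors (pin numbers and vector are made up):

#include <stdio.h>

#define IOAPIC_NUM_PINS 24

int main(void)
{
	int vector_of[IOAPIC_NUM_PINS] = {0};
	int remote_irr[IOAPIC_NUM_PINS] = {0};
	int i, vector = 0x51;

	/* hypothetical: device moved from pin 9 to pin 11, both entries
	 * still carry the same vector with remote_irr pending */
	vector_of[9] = vector;  remote_irr[9] = 1;
	vector_of[11] = vector; remote_irr[11] = 1;

	/* old behavior: get_eoi_gsi() returned only the first match */
	for (i = 0; i < IOAPIC_NUM_PINS; i++)
		if (vector_of[i] == vector) {
			remote_irr[i] = 0;
			break;			/* pin 11 never cleared */
		}
	printf("old: pin 11 remote_irr=%d (interrupt lost)\n", remote_irr[11]);

	/* new behavior: EOI every pin whose vector matches */
	remote_irr[9] = remote_irr[11] = 1;
	for (i = 0; i < IOAPIC_NUM_PINS; i++)
		if (vector_of[i] == vector)
			remote_irr[i] = 0;
	printf("new: pin 11 remote_irr=%d (cleared)\n", remote_irr[11]);
	return 0;
}
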