Merge branch 'kvm-updates/3.2' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 30 Oct 2011 22:36:45 +0000 (15:36 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 30 Oct 2011 22:36:45 +0000 (15:36 -0700)
* 'kvm-updates/3.2' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (75 commits)
  KVM: SVM: Keep intercepting task switching with NPT enabled
  KVM: s390: implement sigp external call
  KVM: s390: fix register setting
  KVM: s390: fix return value of kvm_arch_init_vm
  KVM: s390: check cpu_id prior to using it
  KVM: emulate lapic tsc deadline timer for guest
  x86: TSC deadline definitions
  KVM: Fix simultaneous NMIs
  KVM: x86 emulator: convert push %sreg/pop %sreg to direct decode
  KVM: x86 emulator: switch lds/les/lss/lfs/lgs to direct decode
  KVM: x86 emulator: streamline decode of segment registers
  KVM: x86 emulator: simplify OpMem64 decode
  KVM: x86 emulator: switch src decode to decode_operand()
  KVM: x86 emulator: qualify OpReg inhibit_byte_regs hack
  KVM: x86 emulator: switch OpImmUByte decode to decode_imm()
  KVM: x86 emulator: free up some flag bits near src, dst
  KVM: x86 emulator: switch src2 to generic decode_operand()
  KVM: x86 emulator: expand decode flags to 64 bits
  KVM: x86 emulator: split dst decode to a generic decode_operand()
  KVM: x86 emulator: move memop, memopp into emulation context
  ...

60 files changed:
Documentation/kernel-parameters.txt
Documentation/virtual/kvm/api.txt
arch/powerpc/include/asm/kvm.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_book3s_asm.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kvm/44x.c
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/book3s_32_sr.S
arch/powerpc/kvm/book3s_64_mmu.c
arch/powerpc/kvm/book3s_64_slb.S
arch/powerpc/kvm/book3s_emulate.c
arch/powerpc/kvm/book3s_exports.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_rm_mmu.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_interrupts.S
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/book3s_pr_papr.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_rmhandlers.S
arch/powerpc/kvm/book3s_segment.S
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/e500.c
arch/powerpc/kvm/powerpc.c
arch/s390/include/asm/kvm_host.h
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/sigp.c
arch/x86/include/asm/apicdef.h
arch/x86/include/asm/cpufeature.h
arch/x86/include/asm/kvm_emulate.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/msr-index.h
arch/x86/include/asm/vmx.h
arch/x86/kvm/emulate.c
arch/x86/kvm/i8254.c
arch/x86/kvm/i8259.c
arch/x86/kvm/irq.h
arch/x86/kvm/kvm_cache_regs.h
arch/x86/kvm/kvm_timer.h
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu_audit.c
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/svm.c
arch/x86/kvm/trace.h
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
include/linux/kvm.h
include/linux/kvm_host.h
virt/kvm/assigned-dev.c
virt/kvm/coalesced_mmio.c
virt/kvm/coalesced_mmio.h
virt/kvm/eventfd.c
virt/kvm/ioapic.c
virt/kvm/kvm_main.c

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 93413ce..27e0488 100644
@@ -1201,6 +1201,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        [KVM,Intel] Disable FlexPriority feature (TPR shadow).
                        Default is 1 (enabled)
 
+       kvm-intel.nested=
+                       [KVM,Intel] Enable VMX nesting (nVMX).
+                       Default is 0 (disabled)
+
        kvm-intel.unrestricted_guest=
                        [KVM,Intel] Disable unrestricted guest feature
                        (virtualized real and unpaged mode) on capable
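
For illustration, the new option can be set either at boot or at module load
time; both forms name the same parameter:

	kvm-intel.nested=1              (kernel command line, built-in driver)
	modprobe kvm_intel nested=1     (module parameter form)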
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index b0e4b9c..7945b0b 100644
@@ -175,10 +175,30 @@ Parameters: vcpu id (apic id on x86)
 Returns: vcpu fd on success, -1 on error
 
 This API adds a vcpu to a virtual machine.  The vcpu id is a small integer
-in the range [0, max_vcpus).  You can use KVM_CAP_NR_VCPUS of the
-KVM_CHECK_EXTENSION ioctl() to determine the value for max_vcpus at run-time.
+in the range [0, max_vcpus).
+
+The recommended max_vcpus value can be retrieved at run-time by checking
+KVM_CAP_NR_VCPUS with the KVM_CHECK_EXTENSION ioctl().
+The maximum possible value for max_vcpus can be retrieved at run-time by
+checking KVM_CAP_MAX_VCPUS with the KVM_CHECK_EXTENSION ioctl().
+
 If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4
 cpus max.
+If KVM_CAP_MAX_VCPUS does not exist, you should assume that max_vcpus is the
+same as the value returned from KVM_CAP_NR_VCPUS.
+
+On powerpc using book3s_hv mode, the vcpus are mapped onto virtual
+threads in one or more virtual CPU cores.  (This is because the
+hardware requires all the hardware threads in a CPU core to be in the
+same partition.)  The KVM_CAP_PPC_SMT capability indicates the number
+of vcpus per virtual core (vcore).  The vcore id is obtained by
+dividing the vcpu id by the number of vcpus per vcore.  The vcpus in a
+given vcore will always be in the same physical core as each other
+(though that might be a different physical core from time to time).
+Userspace can control the threading (SMT) mode of the guest by its
+allocation of vcpu ids.  For example, if userspace wants
+single-threaded guest vcpus, it should make all vcpu ids be a multiple
+of the number of vcpus per vcore.
 
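A minimal sketch of the capability checks and vcpu id allocation described
above, assuming a kvm_fd opened from /dev/kvm and a vm_fd obtained via
KVM_CREATE_VM (error handling elided):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Prefer KVM_CAP_MAX_VCPUS, fall back to KVM_CAP_NR_VCPUS, then to the
 * documented default of 4. */
static int query_max_vcpus(int kvm_fd)
{
        int n = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);

        if (n <= 0)
                n = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS);
        return n > 0 ? n : 4;
}

/* Single-threaded book3s_hv guest: spacing vcpu ids by the number of
 * vcpus per vcore puts each vcpu in its own vcore, since
 * vcore id = vcpu id / (vcpus per vcore). */
static void create_single_threaded_vcpus(int kvm_fd, int vm_fd)
{
        int smt = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_SMT);
        int stride = smt > 0 ? smt : 1;
        int max = query_max_vcpus(kvm_fd);
        int id;

        for (id = 0; id < max; id += stride)
                ioctl(vm_fd, KVM_CREATE_VCPU, id);
}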
@@ -1633,3 +1653,50 @@ developer registration required to access it).
                char padding[256];
        };
 };
+
+6. Capabilities that can be enabled
+
+There are certain capabilities that change the behavior of the virtual CPU
+when enabled. To enable them, please see section 4.37 (KVM_ENABLE_CAP). Below
+is a list of these capabilities and their effect on the vCPU when enabled.
+
+The following information is provided along with the description:
+
+  Architectures: which instruction set architectures provide this capability.
+      x86 includes both i386 and x86_64.
+
+  Parameters: what parameters are accepted by the capability.
+
+  Returns: the return value.  General error numbers (EBADF, ENOMEM, EINVAL)
+      are not detailed, but errors with specific meanings are.
+
+6.1 KVM_CAP_PPC_OSI
+
+Architectures: ppc
+Parameters: none
+Returns: 0 on success; -1 on error
+
+This capability enables interception of OSI hypercalls, which would otherwise
+be treated as normal system calls and injected into the guest. OSI hypercalls
+were invented by Mac-on-Linux as a standardized communication mechanism
+between the guest and the host.
+
+When this capability is enabled, KVM_EXIT_OSI can occur.
+
+6.2 KVM_CAP_PPC_PAPR
+
+Architectures: ppc
+Parameters: none
+Returns: 0 on success; -1 on error
+
+This capability enables interception of PAPR hypercalls. PAPR hypercalls are
+done using the hypercall instruction "sc 1".
+
+It also sets the guest privilege level to "supervisor" mode. Usually the guest
+runs in "hypervisor" privilege mode with a few missing features.
+
+In addition to the above, it changes the semantics of SDR1. In this mode, the
+HTAB address part of SDR1 contains an HVA instead of a GPA, as PAPR keeps the
+HTAB invisible to the guest.
+
+When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur.
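
To make the enabling step concrete, a minimal sketch using the KVM_ENABLE_CAP
ioctl from section 4.37, assuming a vcpu_fd obtained from KVM_CREATE_VCPU
(error handling elided):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Enable a per-vcpu capability; neither KVM_CAP_PPC_OSI nor
 * KVM_CAP_PPC_PAPR takes any arguments. */
static int enable_vcpu_cap(int vcpu_fd, __u32 capability)
{
        struct kvm_enable_cap cap;

        memset(&cap, 0, sizeof(cap));
        cap.cap = capability;
        return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
}

Once enable_vcpu_cap(vcpu_fd, KVM_CAP_PPC_PAPR) succeeds, the run loop must be
prepared to see KVM_EXIT_PAPR_HCALL (and, with KVM_CAP_PPC_OSI, KVM_EXIT_OSI)
as an exit reason.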
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index a4f6c85..08fe69e 100644
@@ -148,6 +148,12 @@ struct kvm_regs {
 #define KVM_SREGS_E_UPDATE_DEC         (1 << 2)
 #define KVM_SREGS_E_UPDATE_DBSR                (1 << 3)
 
+/*
+ * Book3S special bits indicating which contents of the struct are valid,
+ * maintaining backwards compatibility with older structs. If you add a
+ * new field, please make sure to add a flag for it as well.
+ */
+#define KVM_SREGS_S_HIOR               (1 << 0)
+
 /*
  * In KVM_SET_SREGS, reserved/pad fields must be left untouched from a
  * previous KVM_GET_REGS.
@@ -173,6 +179,8 @@ struct kvm_sregs {
                                __u64 ibat[8]; 
                                __u64 dbat[8]; 
                        } ppc32;
+                       __u64 flags; /* KVM_SREGS_S_ */
+                       __u64 hior;
                } s;
                struct {
                        union {
@@ -276,6 +284,11 @@ struct kvm_guest_debug_arch {
 #define KVM_INTERRUPT_UNSET    -2U
 #define KVM_INTERRUPT_SET_LEVEL        -3U
 
+#define KVM_CPU_440            1
+#define KVM_CPU_E500V2         2
+#define KVM_CPU_3S_32          3
+#define KVM_CPU_3S_64          4
+
 /* for KVM_CAP_SPAPR_TCE */
 struct kvm_create_spapr_tce {
        __u64 liobn;
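
An illustrative sketch of how userspace is expected to use the new flag,
assuming a vcpu_fd and the layout above (the book3s block is reached as u.s
within struct kvm_sregs; error handling elided):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Set an explicit HIOR; the flag marks the new field as valid, so older
 * userspace that leaves it zeroed keeps the old PVR-derived behaviour. */
static int set_hior(int vcpu_fd, __u64 hior)
{
        struct kvm_sregs sregs;

        if (ioctl(vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
                return -1;
        sregs.u.s.flags |= KVM_SREGS_S_HIOR;
        sregs.u.s.hior = hior;
        return ioctl(vcpu_fd, KVM_SET_SREGS, &sregs);
}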
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 98da010..a384ffd 100644
@@ -90,6 +90,8 @@ struct kvmppc_vcpu_book3s {
 #endif
        int context_id[SID_CONTEXTS];
 
+       bool hior_sregs;                /* HIOR is set by SREGS, not PVR */
+
        struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
        struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
        struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
@@ -139,15 +141,14 @@ extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
 extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
-extern void kvmppc_handler_lowmem_trampoline(void);
-extern void kvmppc_handler_trampoline_enter(void);
-extern void kvmppc_rmcall(ulong srr0, ulong srr1);
+extern void kvmppc_entry_trampoline(void);
 extern void kvmppc_hv_entry_trampoline(void);
 extern void kvmppc_load_up_fpu(void);
 extern void kvmppc_load_up_altivec(void);
 extern void kvmppc_load_up_vsx(void);
 extern u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst);
 extern ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst);
+extern int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd);
 
 static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
 {
@@ -382,6 +383,39 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 }
 #endif
 
+static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
+                                            unsigned long pte_index)
+{
+       unsigned long rb, va_low;
+
+       rb = (v & ~0x7fUL) << 16;               /* AVA field */
+       va_low = pte_index >> 3;
+       if (v & HPTE_V_SECONDARY)
+               va_low = ~va_low;
+       /* xor vsid from AVA */
+       if (!(v & HPTE_V_1TB_SEG))
+               va_low ^= v >> 12;
+       else
+               va_low ^= v >> 24;
+       va_low &= 0x7ff;
+       if (v & HPTE_V_LARGE) {
+               rb |= 1;                        /* L field */
+               if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+                   (r & 0xff000)) {
+                       /* non-16MB large page, must be 64k */
+                       /* (masks depend on page size) */
+                       rb |= 0x1000;           /* page encoding in LP field */
+                       rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
+                       rb |= (va_low & 0xfe);  /* AVAL field (P7 doesn't seem to care) */
+               }
+       } else {
+               /* 4kB page */
+               rb |= (va_low & 0x7ff) << 12;   /* remaining 11b of VA */
+       }
+       rb |= (v >> 54) & 0x300;                /* B field */
+       return rb;
+}
+
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
 #define OSI_SC_MAGIC_R3                        0x113724FA
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index ef7b368..1f2f5b6 100644
@@ -75,6 +75,8 @@ struct kvmppc_host_state {
        ulong scratch0;
        ulong scratch1;
        u8 in_guest;
+       u8 restore_hid5;
+       u8 napping;
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
        struct kvm_vcpu *kvm_vcpu;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index cc22b28..bf8af5d 100644
@@ -198,21 +198,29 @@ struct kvm_arch {
  */
 struct kvmppc_vcore {
        int n_runnable;
-       int n_blocked;
+       int n_busy;
        int num_threads;
        int entry_exit_count;
        int n_woken;
        int nap_count;
+       int napping_threads;
        u16 pcpu;
-       u8 vcore_running;
+       u8 vcore_state;
        u8 in_guest;
        struct list_head runnable_threads;
        spinlock_t lock;
+       wait_queue_head_t wq;
 };
 
 #define VCORE_ENTRY_COUNT(vc)  ((vc)->entry_exit_count & 0xff)
 #define VCORE_EXIT_COUNT(vc)   ((vc)->entry_exit_count >> 8)
 
+/* Values for vcore_state */
+#define VCORE_INACTIVE 0
+#define VCORE_RUNNING  1
+#define VCORE_EXITING  2
+#define VCORE_SLEEPING 3
+
 struct kvmppc_pte {
        ulong eaddr;
        u64 vpage;
@@ -258,14 +266,6 @@ struct kvm_vcpu_arch {
        ulong host_stack;
        u32 host_pid;
 #ifdef CONFIG_PPC_BOOK3S
-       ulong host_msr;
-       ulong host_r2;
-       void *host_retip;
-       ulong trampoline_lowmem;
-       ulong trampoline_enter;
-       ulong highmem_handler;
-       ulong rmcall;
-       ulong host_paca_phys;
        struct kvmppc_slb slb[64];
        int slb_max;            /* 1 + index of last valid entry in slb[] */
        int slb_nr;             /* total number of entries in SLB */
@@ -389,6 +389,9 @@ struct kvm_vcpu_arch {
        u8 dcr_is_write;
        u8 osi_needed;
        u8 osi_enabled;
+       u8 papr_enabled;
+       u8 sane;
+       u8 cpu_type;
        u8 hcall_needed;
 
        u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
@@ -408,11 +411,13 @@ struct kvm_vcpu_arch {
        struct dtl *dtl;
        struct dtl *dtl_end;
 
+       wait_queue_head_t *wqp;
        struct kvmppc_vcore *vcore;
        int ret;
        int trap;
        int state;
        int ptid;
+       bool timer_running;
        wait_queue_head_t cpu_run;
 
        struct kvm_vcpu_arch_shared *shared;
@@ -428,8 +433,9 @@ struct kvm_vcpu_arch {
 #endif
 };
 
-#define KVMPPC_VCPU_BUSY_IN_HOST       0
-#define KVMPPC_VCPU_BLOCKED            1
+/* Values for vcpu->arch.state */
+#define KVMPPC_VCPU_STOPPED            0
+#define KVMPPC_VCPU_BUSY_IN_HOST       1
 #define KVMPPC_VCPU_RUNNABLE           2
 
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index d121f49..46efd1a 100644
@@ -66,6 +66,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run,
 extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
 extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
+extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
 
 /* Core-specific hooks */
 
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 5f078bc..69f7ffe 100644
@@ -44,6 +44,7 @@
 #include <asm/compat.h>
 #include <asm/mmu.h>
 #include <asm/hvcall.h>
+#include <asm/xics.h>
 #endif
 #ifdef CONFIG_PPC_ISERIES
 #include <asm/iseries/alpaca.h>
@@ -449,8 +450,6 @@ int main(void)
 #ifdef CONFIG_PPC_BOOK3S
        DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
        DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
-       DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip));
-       DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
        DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
        DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
        DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
@@ -458,14 +457,12 @@ int main(void)
        DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
        DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
        DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
-       DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
-       DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
-       DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
-       DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
        DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
        DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
        DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
        DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions));
+       DEFINE(VCPU_CEDED, offsetof(struct kvm_vcpu, arch.ceded));
+       DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded));
        DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa));
        DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
        DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
@@ -481,6 +478,7 @@ int main(void)
        DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
        DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
        DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
+       DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads));
        DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
                           offsetof(struct kvmppc_vcpu_book3s, vcpu));
        DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
@@ -537,6 +535,8 @@ int main(void)
        HSTATE_FIELD(HSTATE_SCRATCH0, scratch0);
        HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
        HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
+       HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5);
+       HSTATE_FIELD(HSTATE_NAPPING, napping);
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
        HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
@@ -549,6 +549,7 @@ int main(void)
        HSTATE_FIELD(HSTATE_DSCR, host_dscr);
        HSTATE_FIELD(HSTATE_DABR, dabr);
        HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
+       DEFINE(IPI_PRIORITY, IPI_PRIORITY);
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 
 #else /* CONFIG_PPC_BOOK3S */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 41b02c7..29ddd8b 100644
@@ -427,16 +427,6 @@ slb_miss_user_pseries:
        b       .                               /* prevent spec. execution */
 #endif /* __DISABLED__ */
 
-/* KVM's trampoline code needs to be close to the interrupt handlers */
-
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-#ifdef CONFIG_KVM_BOOK3S_PR
-#include "../kvm/book3s_rmhandlers.S"
-#else
-#include "../kvm/book3s_hv_rmhandlers.S"
-#endif
-#endif
-
        .align  7
        .globl  __end_interrupts
 __end_interrupts:
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index da3a122..ca1f88b 100644
@@ -78,6 +78,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
        for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++)
                vcpu_44x->shadow_refs[i].gtlb_index = -1;
 
+       vcpu->arch.cpu_type = KVM_CPU_440;
+
        return 0;
 }
 
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 08428e2..3688aee 100644
@@ -43,18 +43,22 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
        fpu.o \
        book3s_paired_singles.o \
        book3s_pr.o \
+       book3s_pr_papr.o \
        book3s_emulate.o \
        book3s_interrupts.o \
        book3s_mmu_hpte.o \
        book3s_64_mmu_host.o \
        book3s_64_mmu.o \
        book3s_32_mmu.o
+kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
+       book3s_rmhandlers.o
 
 kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
        book3s_hv.o \
        book3s_hv_interrupts.o \
        book3s_64_mmu_hv.o
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
+       book3s_hv_rmhandlers.o \
        book3s_hv_rm_mmu.o \
        book3s_64_vio_hv.o \
        book3s_hv_builtin.o
diff --git a/arch/powerpc/kvm/book3s_32_sr.S b/arch/powerpc/kvm/book3s_32_sr.S
index 3608471..7e06a6f 100644
@@ -31,7 +31,7 @@
         * R1 = host R1
         * R2 = host R2
         * R3 = shadow vcpu
-        * all other volatile GPRS = free
+        * all other volatile GPRS = free except R4, R6
         * SVCPU[CR]  = guest CR
         * SVCPU[XER] = guest XER
         * SVCPU[CTR] = guest CTR
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index c6d3e19..b871721 100644
@@ -128,7 +128,13 @@ static hva_t kvmppc_mmu_book3s_64_get_pteg(
        dprintk("MMU: page=0x%x sdr1=0x%llx pteg=0x%llx vsid=0x%llx\n",
                page, vcpu_book3s->sdr1, pteg, slbe->vsid);
 
-       r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT);
+       /* When running a PAPR guest, SDR1 contains an HVA instead
+          of a GPA */
+       if (vcpu_book3s->vcpu.arch.papr_enabled)
+               r = pteg;
+       else
+               r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT);
+
        if (kvm_is_error_hva(r))
                return r;
        return r | (pteg & ~PAGE_MASK);
diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S
index 04e7d3b..f2e6e48 100644
@@ -53,7 +53,7 @@ slb_exit_skip_ ## num:
         * R1 = host R1
         * R2 = host R2
         * R3 = shadow vcpu
-        * all other volatile GPRS = free
+        * all other volatile GPRS = free except R4, R6
         * SVCPU[CR]  = guest CR
         * SVCPU[XER] = guest XER
         * SVCPU[CTR] = guest CTR
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 4668465..0c9dc62 100644
  * function pointers, so let's just disable the define. */
 #undef mfsrin
 
+enum priv_level {
+       PRIV_PROBLEM = 0,
+       PRIV_SUPER = 1,
+       PRIV_HYPER = 2,
+};
+
+static bool spr_allowed(struct kvm_vcpu *vcpu, enum priv_level level)
+{
+       /* PAPR VMs only access supervisor SPRs */
+       if (vcpu->arch.papr_enabled && (level > PRIV_SUPER))
+               return false;
+
+       /* Limit user space to its own small SPR set */
+       if ((vcpu->arch.shared->msr & MSR_PR) && level > PRIV_PROBLEM)
+               return false;
+
+       return true;
+}
+
 int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                            unsigned int inst, int *advance)
 {
@@ -296,6 +315,8 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 
        switch (sprn) {
        case SPRN_SDR1:
+               if (!spr_allowed(vcpu, PRIV_HYPER))
+                       goto unprivileged;
                to_book3s(vcpu)->sdr1 = spr_val;
                break;
        case SPRN_DSISR:
@@ -390,6 +411,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
        case SPRN_PMC4_GEKKO:
        case SPRN_WPAR_GEKKO:
                break;
+unprivileged:
        default:
                printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn);
 #ifndef DEBUG_SPR
@@ -421,6 +443,8 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
                break;
        }
        case SPRN_SDR1:
+               if (!spr_allowed(vcpu, PRIV_HYPER))
+                       goto unprivileged;
                kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1);
                break;
        case SPRN_DSISR:
@@ -449,6 +473,10 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
        case SPRN_HID5:
                kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[5]);
                break;
+       case SPRN_CFAR:
+       case SPRN_PURR:
+               kvmppc_set_gpr(vcpu, rt, 0);
+               break;
        case SPRN_GQR0:
        case SPRN_GQR1:
        case SPRN_GQR2:
@@ -476,6 +504,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
                kvmppc_set_gpr(vcpu, rt, 0);
                break;
        default:
+unprivileged:
                printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn);
 #ifndef DEBUG_SPR
                emulated = EMULATE_FAIL;
diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c
index 88c8f26..f7f63a0 100644
@@ -23,9 +23,7 @@
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline);
 #else
-EXPORT_SYMBOL_GPL(kvmppc_handler_trampoline_enter);
-EXPORT_SYMBOL_GPL(kvmppc_handler_lowmem_trampoline);
-EXPORT_SYMBOL_GPL(kvmppc_rmcall);
+EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline);
 EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu);
 #ifdef CONFIG_ALTIVEC
 EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index cc0d7f1..4644c79 100644
@@ -62,6 +62,8 @@
 /* #define EXIT_DEBUG_SIMPLE */
 /* #define EXIT_DEBUG_INT */
 
+static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
+
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
        local_paca->kvm_hstate.kvm_vcpu = vcpu;
@@ -72,40 +74,10 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
 }
 
-static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu);
-static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu);
-
-void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
-{
-       u64 now;
-       unsigned long dec_nsec;
-
-       now = get_tb();
-       if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu))
-               kvmppc_core_queue_dec(vcpu);
-       if (vcpu->arch.pending_exceptions)
-               return;
-       if (vcpu->arch.dec_expires != ~(u64)0) {
-               dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC /
-                       tb_ticks_per_sec;
-               hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
-                             HRTIMER_MODE_REL);
-       }
-
-       kvmppc_vcpu_blocked(vcpu);
-
-       kvm_vcpu_block(vcpu);
-       vcpu->stat.halt_wakeup++;
-
-       if (vcpu->arch.dec_expires != ~(u64)0)
-               hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
-
-       kvmppc_vcpu_unblocked(vcpu);
-}
-
 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
 {
        vcpu->arch.shregs.msr = msr;
+       kvmppc_end_cede(vcpu);
 }
 
 void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
@@ -257,15 +229,6 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 
        switch (req) {
        case H_CEDE:
-               vcpu->arch.shregs.msr |= MSR_EE;
-               vcpu->arch.ceded = 1;
-               smp_mb();
-               if (!vcpu->arch.prodded)
-                       kvmppc_vcpu_block(vcpu);
-               else
-                       vcpu->arch.prodded = 0;
-               smp_mb();
-               vcpu->arch.ceded = 0;
                break;
        case H_PROD:
                target = kvmppc_get_gpr(vcpu, 4);
@@ -388,20 +351,6 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                break;
        }
 
-
-       if (!(r & RESUME_HOST)) {
-               /* To avoid clobbering exit_reason, only check for signals if
-                * we aren't already exiting to userspace for some other
-                * reason. */
-               if (signal_pending(tsk)) {
-                       vcpu->stat.signal_exits++;
-                       run->exit_reason = KVM_EXIT_INTR;
-                       r = -EINTR;
-               } else {
-                       kvmppc_core_deliver_interrupts(vcpu);
-               }
-       }
-
        return r;
 }
 
@@ -479,13 +428,9 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
        kvmppc_mmu_book3s_hv_init(vcpu);
 
        /*
-        * Some vcpus may start out in stopped state.  If we initialize
-        * them to busy-in-host state they will stop other vcpus in the
-        * vcore from running.  Instead we initialize them to blocked
-        * state, effectively considering them to be stopped until we
-        * see the first run ioctl for them.
+        * We consider the vcpu stopped until we see the first run ioctl for it.
         */
-       vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
+       vcpu->arch.state = KVMPPC_VCPU_STOPPED;
 
        init_waitqueue_head(&vcpu->arch.cpu_run);
 
@@ -496,6 +441,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
                if (vcore) {
                        INIT_LIST_HEAD(&vcore->runnable_threads);
                        spin_lock_init(&vcore->lock);
+                       init_waitqueue_head(&vcore->wq);
                }
                kvm->arch.vcores[core] = vcore;
        }
@@ -506,10 +452,12 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 
        spin_lock(&vcore->lock);
        ++vcore->num_threads;
-       ++vcore->n_blocked;
        spin_unlock(&vcore->lock);
        vcpu->arch.vcore = vcore;
 
+       vcpu->arch.cpu_type = KVM_CPU_3S_64;
+       kvmppc_sanity_check(vcpu);
+
        return vcpu;
 
 free_vcpu:
@@ -524,30 +472,31 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
        kfree(vcpu);
 }
 
-static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu)
+static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
 {
-       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       unsigned long dec_nsec, now;
 
-       spin_lock(&vc->lock);
-       vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
-       ++vc->n_blocked;
-       if (vc->n_runnable > 0 &&
-           vc->n_runnable + vc->n_blocked == vc->num_threads) {
-               vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
-                                       arch.run_list);
-               wake_up(&vcpu->arch.cpu_run);
+       now = get_tb();
+       if (now > vcpu->arch.dec_expires) {
+               /* decrementer has already gone negative */
+               kvmppc_core_queue_dec(vcpu);
+               kvmppc_core_deliver_interrupts(vcpu);
+               return;
        }
-       spin_unlock(&vc->lock);
+       dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
+                  / tb_ticks_per_sec;
+       hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
+                     HRTIMER_MODE_REL);
+       vcpu->arch.timer_running = 1;
 }
 
-static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu)
+static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
 {
-       struct kvmppc_vcore *vc = vcpu->arch.vcore;
-
-       spin_lock(&vc->lock);
-       vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
-       --vc->n_blocked;
-       spin_unlock(&vc->lock);
+       vcpu->arch.ceded = 0;
+       if (vcpu->arch.timer_running) {
+               hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+               vcpu->arch.timer_running = 0;
+       }
 }
 
 extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
@@ -562,6 +511,7 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
                return;
        vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
        --vc->n_runnable;
+       ++vc->n_busy;
        /* decrement the physical thread id of each following vcpu */
        v = vcpu;
        list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
@@ -575,15 +525,20 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
        struct paca_struct *tpaca;
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
+       if (vcpu->arch.timer_running) {
+               hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+               vcpu->arch.timer_running = 0;
+       }
        cpu = vc->pcpu + vcpu->arch.ptid;
        tpaca = &paca[cpu];
        tpaca->kvm_hstate.kvm_vcpu = vcpu;
        tpaca->kvm_hstate.kvm_vcore = vc;
+       tpaca->kvm_hstate.napping = 0;
+       vcpu->cpu = vc->pcpu;
        smp_wmb();
 #ifdef CONFIG_PPC_ICP_NATIVE
        if (vcpu->arch.ptid) {
                tpaca->cpu_start = 0x80;
-               tpaca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST;
                wmb();
                xics_wake_cpu(cpu);
                ++vc->n_woken;
@@ -631,9 +586,10 @@ static int on_primary_thread(void)
  */
 static int kvmppc_run_core(struct kvmppc_vcore *vc)
 {
-       struct kvm_vcpu *vcpu, *vnext;
+       struct kvm_vcpu *vcpu, *vcpu0, *vnext;
        long ret;
        u64 now;
+       int ptid;
 
        /* don't start if any threads have a signal pending */
        list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
@@ -652,29 +608,50 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
                goto out;
        }
 
+       /*
+        * Assign physical thread IDs, first to non-ceded vcpus
+        * and then to ceded ones.
+        */
+       ptid = 0;
+       vcpu0 = NULL;
+       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
+               if (!vcpu->arch.ceded) {
+                       if (!ptid)
+                               vcpu0 = vcpu;
+                       vcpu->arch.ptid = ptid++;
+               }
+       }
+       if (!vcpu0)
+               return 0;               /* nothing to run */
+       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+               if (vcpu->arch.ceded)
+                       vcpu->arch.ptid = ptid++;
+
        vc->n_woken = 0;
        vc->nap_count = 0;
        vc->entry_exit_count = 0;
-       vc->vcore_running = 1;
+       vc->vcore_state = VCORE_RUNNING;
        vc->in_guest = 0;
        vc->pcpu = smp_processor_id();
+       vc->napping_threads = 0;
        list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
                kvmppc_start_thread(vcpu);
-       vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
-                               arch.run_list);
 
+       preempt_disable();
        spin_unlock(&vc->lock);
 
-       preempt_disable();
        kvm_guest_enter();
-       __kvmppc_vcore_entry(NULL, vcpu);
+       __kvmppc_vcore_entry(NULL, vcpu0);
 
-       /* wait for secondary threads to finish writing their state to memory */
        spin_lock(&vc->lock);
+       /* disable sending of IPIs on virtual external irqs */
+       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+               vcpu->cpu = -1;
+       /* wait for secondary threads to finish writing their state to memory */
        if (vc->nap_count < vc->n_woken)
                kvmppc_wait_for_nap(vc);
        /* prevent other vcpu threads from doing kvmppc_start_thread() now */
-       vc->vcore_running = 2;
+       vc->vcore_state = VCORE_EXITING;
        spin_unlock(&vc->lock);
 
        /* make sure updates to secondary vcpu structs are visible now */
@@ -690,22 +667,26 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
                if (now < vcpu->arch.dec_expires &&
                    kvmppc_core_pending_dec(vcpu))
                        kvmppc_core_dequeue_dec(vcpu);
-               if (!vcpu->arch.trap) {
-                       if (signal_pending(vcpu->arch.run_task)) {
-                               vcpu->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
-                               vcpu->arch.ret = -EINTR;
-                       }
-                       continue;               /* didn't get to run */
-               }
-               ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
-                                        vcpu->arch.run_task);
+
+               ret = RESUME_GUEST;
+               if (vcpu->arch.trap)
+                       ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
+                                                vcpu->arch.run_task);
+
                vcpu->arch.ret = ret;
                vcpu->arch.trap = 0;
+
+               if (vcpu->arch.ceded) {
+                       if (ret != RESUME_GUEST)
+                               kvmppc_end_cede(vcpu);
+                       else
+                               kvmppc_set_timer(vcpu);
+               }
        }
 
        spin_lock(&vc->lock);
  out:
-       vc->vcore_running = 0;
+       vc->vcore_state = VCORE_INACTIVE;
        list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
                                 arch.run_list) {
                if (vcpu->arch.ret != RESUME_GUEST) {
@@ -717,82 +698,130 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
        return 1;
 }
 
-static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+/*
+ * Wait for some other vcpu thread to execute us, and
+ * wake us up when we need to handle something in the host.
+ */
+static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
 {
-       int ptid;
-       int wait_state;
-       struct kvmppc_vcore *vc;
        DEFINE_WAIT(wait);
 
-       /* No need to go into the guest when all we do is going out */
-       if (signal_pending(current)) {
-               kvm_run->exit_reason = KVM_EXIT_INTR;
-               return -EINTR;
+       prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
+       if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
+               schedule();
+       finish_wait(&vcpu->arch.cpu_run, &wait);
+}
+
+/*
+ * All the vcpus in this vcore are idle, so wait for a decrementer
+ * or external interrupt to one of the vcpus.  vc->lock is held.
+ */
+static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
+{
+       DEFINE_WAIT(wait);
+       struct kvm_vcpu *v;
+       int all_idle = 1;
+
+       prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+       vc->vcore_state = VCORE_SLEEPING;
+       spin_unlock(&vc->lock);
+       list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
+               if (!v->arch.ceded || v->arch.pending_exceptions) {
+                       all_idle = 0;
+                       break;
+               }
        }
+       if (all_idle)
+               schedule();
+       finish_wait(&vc->wq, &wait);
+       spin_lock(&vc->lock);
+       vc->vcore_state = VCORE_INACTIVE;
+}
 
-       /* On PPC970, check that we have an RMA region */
-       if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
-               return -EPERM;
+static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+       int n_ceded;
+       int prev_state;
+       struct kvmppc_vcore *vc;
+       struct kvm_vcpu *v, *vn;
 
        kvm_run->exit_reason = 0;
        vcpu->arch.ret = RESUME_GUEST;
        vcpu->arch.trap = 0;
 
-       flush_fp_to_thread(current);
-       flush_altivec_to_thread(current);
-       flush_vsx_to_thread(current);
-
        /*
         * Synchronize with other threads in this virtual core
         */
        vc = vcpu->arch.vcore;
        spin_lock(&vc->lock);
-       /* This happens the first time this is called for a vcpu */
-       if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED)
-               --vc->n_blocked;
-       vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
-       ptid = vc->n_runnable;
+       vcpu->arch.ceded = 0;
        vcpu->arch.run_task = current;
        vcpu->arch.kvm_run = kvm_run;
-       vcpu->arch.ptid = ptid;
+       prev_state = vcpu->arch.state;
+       vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
        list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
        ++vc->n_runnable;
 
-       wait_state = TASK_INTERRUPTIBLE;
-       while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
-               if (signal_pending(current)) {
-                       if (!vc->vcore_running) {
-                               kvm_run->exit_reason = KVM_EXIT_INTR;
-                               vcpu->arch.ret = -EINTR;
-                               break;
-                       }
-                       /* have to wait for vcore to stop executing guest */
-                       wait_state = TASK_UNINTERRUPTIBLE;
-                       smp_send_reschedule(vc->pcpu);
+       /*
+        * This happens the first time this is called for a vcpu.
+        * If the vcore is already running, we may be able to start
+        * this thread straight away and have it join in.
+        */
+       if (prev_state == KVMPPC_VCPU_STOPPED) {
+               if (vc->vcore_state == VCORE_RUNNING &&
+                   VCORE_EXIT_COUNT(vc) == 0) {
+                       vcpu->arch.ptid = vc->n_runnable - 1;
+                       kvmppc_start_thread(vcpu);
                }
 
-               if (!vc->vcore_running &&
-                   vc->n_runnable + vc->n_blocked == vc->num_threads) {
-                       /* we can run now */
-                       if (kvmppc_run_core(vc))
-                               continue;
-               }
+       } else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST)
+               --vc->n_busy;
 
-               if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0)
-                       kvmppc_start_thread(vcpu);
+       while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
+              !signal_pending(current)) {
+               if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) {
+                       spin_unlock(&vc->lock);
+                       kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
+                       spin_lock(&vc->lock);
+                       continue;
+               }
+               n_ceded = 0;
+               list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
+                       n_ceded += v->arch.ceded;
+               if (n_ceded == vc->n_runnable)
+                       kvmppc_vcore_blocked(vc);
+               else
+                       kvmppc_run_core(vc);
+
+               list_for_each_entry_safe(v, vn, &vc->runnable_threads,
+                                        arch.run_list) {
+                       kvmppc_core_deliver_interrupts(v);
+                       if (signal_pending(v->arch.run_task)) {
+                               kvmppc_remove_runnable(vc, v);
+                               v->stat.signal_exits++;
+                               v->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
+                               v->arch.ret = -EINTR;
+                               wake_up(&v->arch.cpu_run);
+                       }
+               }
+       }
 
-               /* wait for other threads to come in, or wait for vcore */
-               prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
-               spin_unlock(&vc->lock);
-               schedule();
-               finish_wait(&vcpu->arch.cpu_run, &wait);
-               spin_lock(&vc->lock);
+       if (signal_pending(current)) {
+               if (vc->vcore_state == VCORE_RUNNING ||
+                   vc->vcore_state == VCORE_EXITING) {
+                       spin_unlock(&vc->lock);
+                       kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
+                       spin_lock(&vc->lock);
+               }
+               if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+                       kvmppc_remove_runnable(vc, vcpu);
+                       vcpu->stat.signal_exits++;
+                       kvm_run->exit_reason = KVM_EXIT_INTR;
+                       vcpu->arch.ret = -EINTR;
+               }
        }
 
-       if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
-               kvmppc_remove_runnable(vc, vcpu);
        spin_unlock(&vc->lock);
-
        return vcpu->arch.ret;
 }
 
@@ -800,6 +829,26 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
        int r;
 
+       if (!vcpu->arch.sane) {
+               run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               return -EINVAL;
+       }
+
+       /* No need to go into the guest when all we'll do is come back out */
+       if (signal_pending(current)) {
+               run->exit_reason = KVM_EXIT_INTR;
+               return -EINTR;
+       }
+
+       /* On PPC970, check that we have an RMA region */
+       if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
+               return -EPERM;
+
+       flush_fp_to_thread(current);
+       flush_altivec_to_thread(current);
+       flush_vsx_to_thread(current);
+       vcpu->arch.wqp = &vcpu->arch.vcore->wq;
+
        do {
                r = kvmppc_run_vcpu(run, vcpu);
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index fcfe6b0..bacb0cf 100644
@@ -110,39 +110,6 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
        return H_SUCCESS;
 }
 
-static unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
-                                     unsigned long pte_index)
-{
-       unsigned long rb, va_low;
-
-       rb = (v & ~0x7fUL) << 16;               /* AVA field */
-       va_low = pte_index >> 3;
-       if (v & HPTE_V_SECONDARY)
-               va_low = ~va_low;
-       /* xor vsid from AVA */
-       if (!(v & HPTE_V_1TB_SEG))
-               va_low ^= v >> 12;
-       else
-               va_low ^= v >> 24;
-       va_low &= 0x7ff;
-       if (v & HPTE_V_LARGE) {
-               rb |= 1;                        /* L field */
-               if (cpu_has_feature(CPU_FTR_ARCH_206) &&
-                   (r & 0xff000)) {
-                       /* non-16MB large page, must be 64k */
-                       /* (masks depend on page size) */
-                       rb |= 0x1000;           /* page encoding in LP field */
-                       rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
-                       rb |= (va_low & 0xfe);  /* AVAL field (P7 doesn't seem to care) */
-               }
-       } else {
-               /* 4kB page */
-               rb |= (va_low & 0x7ff) << 12;   /* remaining 11b of VA */
-       }
-       rb |= (v >> 54) & 0x300;                /* B field */
-       return rb;
-}
-
 #define LOCK_TOKEN     (*(u32 *)(&get_paca()->lock_token))
 
 static inline int try_lock_tlbie(unsigned int *lock)
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index de29501..f422231 100644
 #include <asm/ppc_asm.h>
 #include <asm/kvm_asm.h>
 #include <asm/reg.h>
+#include <asm/mmu.h>
 #include <asm/page.h>
+#include <asm/ptrace.h>
+#include <asm/hvcall.h>
 #include <asm/asm-offsets.h>
 #include <asm/exception-64s.h>
 
@@ -49,7 +52,7 @@ kvmppc_skip_Hinterrupt:
        b       .
 
 /*
- * Call kvmppc_handler_trampoline_enter in real mode.
+ * Call kvmppc_hv_entry in real mode.
  * Must be called with interrupts hard-disabled.
  *
  * Input Registers:
@@ -89,6 +92,12 @@ _GLOBAL(kvmppc_hv_entry_trampoline)
 kvm_start_guest:
        ld      r1,PACAEMERGSP(r13)
        subi    r1,r1,STACK_FRAME_OVERHEAD
+       ld      r2,PACATOC(r13)
+
+       /* were we napping due to cede? */
+       lbz     r0,HSTATE_NAPPING(r13)
+       cmpwi   r0,0
+       bne     kvm_end_cede
 
        /* get vcpu pointer */
        ld      r4, HSTATE_KVM_VCPU(r13)
@@ -276,15 +285,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
        cmpwi   r0,0
        beq     20b
 
-       /* Set LPCR.  Set the MER bit if there is a pending external irq. */
+       /* Set LPCR and RMOR. */
 10:    ld      r8,KVM_LPCR(r9)
-       ld      r0,VCPU_PENDING_EXC(r4)
-       li      r7,(1 << BOOK3S_IRQPRIO_EXTERNAL)
-       oris    r7,r7,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
-       and.    r0,r0,r7
-       beq     11f
-       ori     r8,r8,LPCR_MER
-11:    mtspr   SPRN_LPCR,r8
+       mtspr   SPRN_LPCR,r8
        ld      r8,KVM_RMOR(r9)
        mtspr   SPRN_RMOR,r8
        isync
@@ -448,19 +451,50 @@ toc_tlbie_lock:
        mtctr   r6
        mtxer   r7
 
-       /* Move SRR0 and SRR1 into the respective regs */
+kvmppc_cede_reentry:           /* r4 = vcpu, r13 = paca */
        ld      r6, VCPU_SRR0(r4)
        ld      r7, VCPU_SRR1(r4)
-       mtspr   SPRN_SRR0, r6
-       mtspr   SPRN_SRR1, r7
-
        ld      r10, VCPU_PC(r4)
+       ld      r11, VCPU_MSR(r4)       /* r11 = vcpu->arch.msr & ~MSR_HV */
 
-       ld      r11, VCPU_MSR(r4)       /* r10 = vcpu->arch.msr & ~MSR_HV */
        rldicl  r11, r11, 63 - MSR_HV_LG, 1
        rotldi  r11, r11, 1 + MSR_HV_LG
        ori     r11, r11, MSR_ME
 
+       /* Check if we can deliver an external or decrementer interrupt now */
+       ld      r0,VCPU_PENDING_EXC(r4)
+       li      r8,(1 << BOOK3S_IRQPRIO_EXTERNAL)
+       oris    r8,r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
+       and     r0,r0,r8
+       cmpdi   cr1,r0,0
+       andi.   r0,r11,MSR_EE
+       beq     cr1,11f
+BEGIN_FTR_SECTION
+       mfspr   r8,SPRN_LPCR
+       ori     r8,r8,LPCR_MER
+       mtspr   SPRN_LPCR,r8
+       isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+       beq     5f
+       li      r0,BOOK3S_INTERRUPT_EXTERNAL
+12:    mr      r6,r10
+       mr      r10,r0
+       mr      r7,r11
+       li      r11,(MSR_ME << 1) | 1   /* synthesize MSR_SF | MSR_ME */
+       rotldi  r11,r11,63
+       b       5f
+11:    beq     5f
+       mfspr   r0,SPRN_DEC
+       cmpwi   r0,0
+       li      r0,BOOK3S_INTERRUPT_DECREMENTER
+       blt     12b
+
+       /* Move SRR0 and SRR1 into the respective regs */
+5:     mtspr   SPRN_SRR0, r6
+       mtspr   SPRN_SRR1, r7
+       li      r0,0
+       stb     r0,VCPU_CEDED(r4)       /* cancel cede */
+
 fast_guest_return:
        mtspr   SPRN_HSRR0,r10
        mtspr   SPRN_HSRR1,r11
@@ -574,21 +608,20 @@ kvmppc_interrupt:
        /* See if this is something we can handle in real mode */
        cmpwi   r12,BOOK3S_INTERRUPT_SYSCALL
        beq     hcall_try_real_mode
-hcall_real_cont:
 
        /* Check for mediated interrupts (could be done earlier really ...) */
 BEGIN_FTR_SECTION
        cmpwi   r12,BOOK3S_INTERRUPT_EXTERNAL
        bne+    1f
-       ld      r5,VCPU_KVM(r9)
-       ld      r5,KVM_LPCR(r5)
        andi.   r0,r11,MSR_EE
        beq     1f
+       mfspr   r5,SPRN_LPCR
        andi.   r0,r5,LPCR_MER
        bne     bounce_ext_interrupt
 1:
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
+hcall_real_cont:               /* r9 = vcpu, r12 = trap, r13 = paca */
        /* Save DEC */
        mfspr   r5,SPRN_DEC
        mftb    r6
@@ -682,7 +715,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201)
        slbia
        ptesync
 
-hdec_soon:
+hdec_soon:                     /* r9 = vcpu, r12 = trap, r13 = paca */
 BEGIN_FTR_SECTION
        b       32f
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
@@ -700,6 +733,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
        addi    r0,r3,0x100
        stwcx.  r0,0,r6
        bne     41b
+       lwsync
 
        /*
         * At this point we have an interrupt that we have to pass
@@ -713,18 +747,39 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
         * interrupt, since the other threads will already be on their
         * way here in that case.
         */
+       cmpwi   r3,0x100        /* Are we the first here? */
+       bge     43f
+       cmpwi   r3,1            /* Are any other threads in the guest? */
+       ble     43f
        cmpwi   r12,BOOK3S_INTERRUPT_HV_DECREMENTER
        beq     40f
-       cmpwi   r3,0x100        /* Are we the first here? */
-       bge     40f
-       cmpwi   r3,1
-       ble     40f
        li      r0,0
        mtspr   SPRN_HDEC,r0
 40:
+       /*
+        * Send an IPI to any napping threads, since an HDEC interrupt
+        * doesn't wake CPUs up from nap.
+        */
+       lwz     r3,VCORE_NAPPING_THREADS(r5)
+       lwz     r4,VCPU_PTID(r9)
+       li      r0,1
+       sld     r0,r0,r4
+       andc.   r3,r3,r0                /* no sense IPI'ing ourselves */
+       beq     43f
+       mulli   r4,r4,PACA_SIZE         /* get paca for thread 0 */
+       subf    r6,r4,r13
+42:    andi.   r0,r3,1
+       beq     44f
+       ld      r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */
+       li      r0,IPI_PRIORITY
+       li      r7,XICS_QIRR
+       stbcix  r0,r7,r8                /* trigger the IPI */
+44:    srdi.   r3,r3,1
+       addi    r6,r6,PACA_SIZE
+       bne     42b
 
        /* Secondary threads wait for primary to do partition switch */
-       ld      r4,VCPU_KVM(r9)         /* pointer to struct kvm */
+43:    ld      r4,VCPU_KVM(r9)         /* pointer to struct kvm */
        ld      r5,HSTATE_KVM_VCORE(r13)
        lwz     r3,VCPU_PTID(r9)
        cmpwi   r3,0
@@ -1077,7 +1132,6 @@ hcall_try_real_mode:
 hcall_real_fallback:
        li      r12,BOOK3S_INTERRUPT_SYSCALL
        ld      r9, HSTATE_KVM_VCPU(r13)
-       ld      r11, VCPU_MSR(r9)
 
        b       hcall_real_cont
 
@@ -1139,7 +1193,7 @@ hcall_real_table:
        .long   0               /* 0xd4 */
        .long   0               /* 0xd8 */
        .long   0               /* 0xdc */
-       .long   0               /* 0xe0 */
+       .long   .kvmppc_h_cede - hcall_real_table
        .long   0               /* 0xe4 */
        .long   0               /* 0xe8 */
        .long   0               /* 0xec */
@@ -1168,7 +1222,8 @@ bounce_ext_interrupt:
        mtspr   SPRN_SRR0,r10
        mtspr   SPRN_SRR1,r11
        li      r10,BOOK3S_INTERRUPT_EXTERNAL
-       LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME);
+       li      r11,(MSR_ME << 1) | 1   /* synthesize MSR_SF | MSR_ME */
+       rotldi  r11,r11,63
        b       fast_guest_return
 
 _GLOBAL(kvmppc_h_set_dabr)
@@ -1177,6 +1232,178 @@ _GLOBAL(kvmppc_h_set_dabr)
        li      r3,0
        blr
 
+_GLOBAL(kvmppc_h_cede)
+       ori     r11,r11,MSR_EE
+       std     r11,VCPU_MSR(r3)
+       li      r0,1
+       stb     r0,VCPU_CEDED(r3)
+       sync                    /* order setting ceded vs. testing prodded */
+       lbz     r5,VCPU_PRODDED(r3)
+       cmpwi   r5,0
+       bne     1f
+       li      r0,0            /* set trap to 0 to say hcall is handled */
+       stw     r0,VCPU_TRAP(r3)
+       li      r0,H_SUCCESS
+       std     r0,VCPU_GPR(r3)(r3)
+BEGIN_FTR_SECTION
+       b       2f              /* just send it up to host on 970 */
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
+
+       /*
+        * Set our bit in the bitmask of napping threads unless all the
+        * other threads are already napping, in which case we send this
+        * up to the host.
+        */
+       ld      r5,HSTATE_KVM_VCORE(r13)
+       lwz     r6,VCPU_PTID(r3)
+       lwz     r8,VCORE_ENTRY_EXIT(r5)
+       clrldi  r8,r8,56
+       li      r0,1
+       sld     r0,r0,r6
+       addi    r6,r5,VCORE_NAPPING_THREADS
+31:    lwarx   r4,0,r6
+       or      r4,r4,r0
+       popcntw r7,r4
+       cmpw    r7,r8
+       bge     2f
+       stwcx.  r4,0,r6
+       bne     31b
+       li      r0,1
+       stb     r0,HSTATE_NAPPING(r13)
+       /* order napping_threads update vs testing entry_exit_count */
+       lwsync
+       mr      r4,r3
+       lwz     r7,VCORE_ENTRY_EXIT(r5)
+       cmpwi   r7,0x100
+       bge     33f             /* another thread already exiting */
+
+/*
+ * Although not specifically required by the architecture, POWER7
+ * preserves the following registers in nap mode, even if an SMT mode
+ * switch occurs: SLB entries, PURR, SPURR, AMOR, UAMOR, AMR, SPRG0-3,
+ * DAR, DSISR, DABR, DABRX, DSCR, PMCx, MMCRx, SIAR, SDAR.
+ */
+       /* Save non-volatile GPRs */
+       std     r14, VCPU_GPR(r14)(r3)
+       std     r15, VCPU_GPR(r15)(r3)
+       std     r16, VCPU_GPR(r16)(r3)
+       std     r17, VCPU_GPR(r17)(r3)
+       std     r18, VCPU_GPR(r18)(r3)
+       std     r19, VCPU_GPR(r19)(r3)
+       std     r20, VCPU_GPR(r20)(r3)
+       std     r21, VCPU_GPR(r21)(r3)
+       std     r22, VCPU_GPR(r22)(r3)
+       std     r23, VCPU_GPR(r23)(r3)
+       std     r24, VCPU_GPR(r24)(r3)
+       std     r25, VCPU_GPR(r25)(r3)
+       std     r26, VCPU_GPR(r26)(r3)
+       std     r27, VCPU_GPR(r27)(r3)
+       std     r28, VCPU_GPR(r28)(r3)
+       std     r29, VCPU_GPR(r29)(r3)
+       std     r30, VCPU_GPR(r30)(r3)
+       std     r31, VCPU_GPR(r31)(r3)
+
+       /* save FP state */
+       bl      .kvmppc_save_fp
+
+       /*
+        * Take a nap until a decrementer or external interrupt occurs,
+        * with PECE1 (wake on decr) and PECE0 (wake on external) set in LPCR
+        */
+       li      r0,0x80
+       stb     r0,PACAPROCSTART(r13)
+       mfspr   r5,SPRN_LPCR
+       ori     r5,r5,LPCR_PECE0 | LPCR_PECE1
+       mtspr   SPRN_LPCR,r5
+       isync
+       li      r0, 0
+       std     r0, HSTATE_SCRATCH0(r13)
+       ptesync
+       ld      r0, HSTATE_SCRATCH0(r13)
+1:     cmpd    r0, r0
+       bne     1b
+       nap
+       b       .
+
+kvm_end_cede:
+       /* Woken by external or decrementer interrupt */
+       ld      r1, HSTATE_HOST_R1(r13)
+       ld      r2, PACATOC(r13)
+
+       /* If we're a secondary thread and we got here by an IPI, ack it */
+       ld      r4,HSTATE_KVM_VCPU(r13)
+       lwz     r3,VCPU_PTID(r4)
+       cmpwi   r3,0
+       beq     27f
+       mfspr   r3,SPRN_SRR1
+       rlwinm  r3,r3,44-31,0x7         /* extract wake reason field */
+       cmpwi   r3,4                    /* was it an external interrupt? */
+       bne     27f
+       ld      r5, HSTATE_XICS_PHYS(r13)
+       li      r0,0xff
+       li      r6,XICS_QIRR
+       li      r7,XICS_XIRR
+       lwzcix  r8,r5,r7                /* ack the interrupt */
+       sync
+       stbcix  r0,r5,r6                /* clear it */
+       stwcix  r8,r5,r7                /* EOI it */
+27:
+       /* load up FP state */
+       bl      kvmppc_load_fp
+
+       /* Load NV GPRS */
+       ld      r14, VCPU_GPR(r14)(r4)
+       ld      r15, VCPU_GPR(r15)(r4)
+       ld      r16, VCPU_GPR(r16)(r4)
+       ld      r17, VCPU_GPR(r17)(r4)
+       ld      r18, VCPU_GPR(r18)(r4)
+       ld      r19, VCPU_GPR(r19)(r4)
+       ld      r20, VCPU_GPR(r20)(r4)
+       ld      r21, VCPU_GPR(r21)(r4)
+       ld      r22, VCPU_GPR(r22)(r4)
+       ld      r23, VCPU_GPR(r23)(r4)
+       ld      r24, VCPU_GPR(r24)(r4)
+       ld      r25, VCPU_GPR(r25)(r4)
+       ld      r26, VCPU_GPR(r26)(r4)
+       ld      r27, VCPU_GPR(r27)(r4)
+       ld      r28, VCPU_GPR(r28)(r4)
+       ld      r29, VCPU_GPR(r29)(r4)
+       ld      r30, VCPU_GPR(r30)(r4)
+       ld      r31, VCPU_GPR(r31)(r4)
+
+       /* clear our bit in vcore->napping_threads */
+33:    ld      r5,HSTATE_KVM_VCORE(r13)
+       lwz     r3,VCPU_PTID(r4)
+       li      r0,1
+       sld     r0,r0,r3
+       addi    r6,r5,VCORE_NAPPING_THREADS
+32:    lwarx   r7,0,r6
+       andc    r7,r7,r0
+       stwcx.  r7,0,r6
+       bne     32b
+       li      r0,0
+       stb     r0,HSTATE_NAPPING(r13)
+
+       /* see if any other thread is already exiting */
+       lwz     r0,VCORE_ENTRY_EXIT(r5)
+       cmpwi   r0,0x100
+       blt     kvmppc_cede_reentry     /* if not go back to guest */
+
+       /* some threads are exiting, so go to the guest exit path */
+       b       hcall_real_fallback
+
+       /* the case where the vcpu was already prodded before ceding */
+1:     li      r0,0
+       stb     r0,VCPU_PRODDED(r3)
+       sync                    /* order testing prodded vs. clearing ceded */
+       stb     r0,VCPU_CEDED(r3)
+       li      r3,H_SUCCESS
+       blr
+
+       /* we've ceded but we want to give control to the host */
+2:     li      r3,H_TOO_HARD
+       blr
+
 secondary_too_late:
        ld      r5,HSTATE_KVM_VCORE(r13)
        HMT_LOW
@@ -1194,14 +1421,20 @@ secondary_too_late:
        slbmte  r6,r5
 1:     addi    r11,r11,16
        .endr
-       b       50f
 
 secondary_nap:
-       /* Clear any pending IPI */
-50:    ld      r5, HSTATE_XICS_PHYS(r13)
+       /* Clear any pending IPI - assume we're a secondary thread */
+       ld      r5, HSTATE_XICS_PHYS(r13)
+       li      r7, XICS_XIRR
+       lwzcix  r3, r5, r7              /* ack any pending interrupt */
+       rlwinm. r0, r3, 0, 0xffffff     /* any pending? */
+       beq     37f
+       sync
        li      r0, 0xff
        li      r6, XICS_QIRR
-       stbcix  r0, r5, r6
+       stbcix  r0, r5, r6              /* clear the IPI */
+       stwcix  r3, r5, r7              /* EOI it */
+37:    sync
 
        /* increment the nap count and then go to nap mode */
        ld      r4, HSTATE_KVM_VCORE(r13)
@@ -1211,13 +1444,12 @@ secondary_nap:
        addi    r3, r3, 1
        stwcx.  r3, 0, r4
        bne     51b
-       isync
 
+       li      r3, LPCR_PECE0
        mfspr   r4, SPRN_LPCR
-       li      r0, LPCR_PECE
-       andc    r4, r4, r0
-       ori     r4, r4, LPCR_PECE0      /* exit nap on interrupt */
+       rlwimi  r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
        mtspr   SPRN_LPCR, r4
+       isync
        li      r0, 0
        std     r0, HSTATE_SCRATCH0(r13)
        ptesync
index c54b0e3..0a8515a 100644 (file)
 #define ULONG_SIZE             8
 #define FUNC(name)             GLUE(.,name)
 
-#define GET_SHADOW_VCPU_R13
-
-#define DISABLE_INTERRUPTS     \
-       mfmsr   r0;             \
-       rldicl  r0,r0,48,1;     \
-       rotldi  r0,r0,16;       \
-       mtmsrd  r0,1;           \
-
 #elif defined(CONFIG_PPC_BOOK3S_32)
 
 #define ULONG_SIZE              4
 #define FUNC(name)             name
 
-#define GET_SHADOW_VCPU_R13    \
-       lwz     r13, (THREAD + THREAD_KVM_SVCPU)(r2)
-
-#define DISABLE_INTERRUPTS     \
-       mfmsr   r0;             \
-       rlwinm  r0,r0,0,17,15;  \
-       mtmsr   r0;             \
-
 #endif /* CONFIG_PPC_BOOK3S_XX */
 
 
@@ -108,44 +92,17 @@ kvm_start_entry:
 
 kvm_start_lightweight:
 
-       GET_SHADOW_VCPU_R13
-       PPC_LL  r3, VCPU_HIGHMEM_HANDLER(r4)
-       PPC_STL r3, HSTATE_VMHANDLER(r13)
-
-       PPC_LL  r10, VCPU_SHADOW_MSR(r4)        /* r10 = vcpu->arch.shadow_msr */
-
-       DISABLE_INTERRUPTS
-
 #ifdef CONFIG_PPC_BOOK3S_64
-       /* Some guests may need to have dcbz set to 32 byte length.
-        *
-        * Usually we ensure that by patching the guest's instructions
-        * to trap on dcbz and emulate it in the hypervisor.
-        *
-        * If we can, we should tell the CPU to use 32 byte dcbz though,
-        * because that's a lot faster.
-        */
-
        PPC_LL  r3, VCPU_HFLAGS(r4)
-       rldicl. r3, r3, 0, 63           /* CR = ((r3 & 1) == 0) */
-       beq     no_dcbz32_on
-
-       mfspr   r3,SPRN_HID5
-       ori     r3, r3, 0x80            /* XXX HID5_dcbz32 = 0x80 */
-       mtspr   SPRN_HID5,r3
-
-no_dcbz32_on:
-
+       rldicl  r3, r3, 0, 63           /* r3 &= 1 */
+       stb     r3, HSTATE_RESTORE_HID5(r13)
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
-       PPC_LL  r6, VCPU_RMCALL(r4)
-       mtctr   r6
-
-       PPC_LL  r3, VCPU_TRAMPOLINE_ENTER(r4)
-       LOAD_REG_IMMEDIATE(r4, MSR_KERNEL & ~(MSR_IR | MSR_DR))
+       PPC_LL  r4, VCPU_SHADOW_MSR(r4) /* get shadow_msr */
 
        /* Jump to segment patching handler and into our guest */
-       bctr
+       bl      FUNC(kvmppc_entry_trampoline)
+       nop
 
 /*
  * This is the handler in module memory. It gets jumped at from the
@@ -170,21 +127,6 @@ kvmppc_handler_highmem:
        /* R7 = vcpu */
        PPC_LL  r7, GPR4(r1)
 
-#ifdef CONFIG_PPC_BOOK3S_64
-
-       PPC_LL  r5, VCPU_HFLAGS(r7)
-       rldicl. r5, r5, 0, 63           /* CR = ((r5 & 1) == 0) */
-       beq     no_dcbz32_off
-
-       li      r4, 0
-       mfspr   r5,SPRN_HID5
-       rldimi  r5,r4,6,56
-       mtspr   SPRN_HID5,r5
-
-no_dcbz32_off:
-
-#endif /* CONFIG_PPC_BOOK3S_64 */
-
        PPC_STL r14, VCPU_GPR(r14)(r7)
        PPC_STL r15, VCPU_GPR(r15)(r7)
        PPC_STL r16, VCPU_GPR(r16)(r7)
@@ -204,67 +146,6 @@ no_dcbz32_off:
        PPC_STL r30, VCPU_GPR(r30)(r7)
        PPC_STL r31, VCPU_GPR(r31)(r7)
 
-       /* Restore host msr -> SRR1 */
-       PPC_LL  r6, VCPU_HOST_MSR(r7)
-
-       /*
-        * For some interrupts, we need to call the real Linux
-        * handler, so it can do work for us. This has to happen
-        * as if the interrupt arrived from the kernel though,
-        * so let's fake it here where most state is restored.
-        *
-        * Call Linux for hardware interrupts/decrementer
-        * r3 = address of interrupt handler (exit reason)
-        */
-
-       cmpwi   r12, BOOK3S_INTERRUPT_EXTERNAL
-       beq     call_linux_handler
-       cmpwi   r12, BOOK3S_INTERRUPT_DECREMENTER
-       beq     call_linux_handler
-       cmpwi   r12, BOOK3S_INTERRUPT_PERFMON
-       beq     call_linux_handler
-
-       /* Back to EE=1 */
-       mtmsr   r6
-       sync
-       b       kvm_return_point
-
-call_linux_handler:
-
-       /*
-        * If we land here we need to jump back to the handler we
-        * came from.
-        *
-        * We have a page that we can access from real mode, so let's
-        * jump back to that and use it as a trampoline to get back into the
-        * interrupt handler!
-        *
-        * R3 still contains the exit code,
-        * R5 VCPU_HOST_RETIP and
-        * R6 VCPU_HOST_MSR
-        */
-
-       /* Restore host IP -> SRR0 */
-       PPC_LL  r5, VCPU_HOST_RETIP(r7)
-
-       /* XXX Better move to a safe function?
-        *     What if we get an HTAB flush in between mtsrr0 and mtsrr1? */
-
-       mtlr    r12
-
-       PPC_LL  r4, VCPU_TRAMPOLINE_LOWMEM(r7)
-       mtsrr0  r4
-       LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR))
-       mtsrr1  r3
-
-       RFI
-
-.global kvm_return_point
-kvm_return_point:
-
-       /* Jump back to lightweight entry if we're supposed to */
-       /* go back into the guest */
-
        /* Pass the exit number as 3rd argument to kvmppc_handle_exit */
        mr      r5, r12
 
index 0c0d3f2..d417511 100644 (file)
@@ -150,16 +150,22 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
 #ifdef CONFIG_PPC_BOOK3S_64
        if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
                kvmppc_mmu_book3s_64_init(vcpu);
-               to_book3s(vcpu)->hior = 0xfff00000;
+               if (!to_book3s(vcpu)->hior_sregs)
+                       to_book3s(vcpu)->hior = 0xfff00000;
                to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
+               vcpu->arch.cpu_type = KVM_CPU_3S_64;
        } else
 #endif
        {
                kvmppc_mmu_book3s_32_init(vcpu);
-               to_book3s(vcpu)->hior = 0;
+               if (!to_book3s(vcpu)->hior_sregs)
+                       to_book3s(vcpu)->hior = 0;
                to_book3s(vcpu)->msr_mask = 0xffffffffULL;
+               vcpu->arch.cpu_type = KVM_CPU_3S_32;
        }
 
+       kvmppc_sanity_check(vcpu);
+
        /* If we are at hypervisor level on a 970, we can tell the CPU to
         * treat DCBZ as a 32-byte store */
        vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32;
@@ -646,7 +652,27 @@ program_interrupt:
                break;
        }
        case BOOK3S_INTERRUPT_SYSCALL:
-               if (vcpu->arch.osi_enabled &&
+               if (vcpu->arch.papr_enabled &&
+                   (kvmppc_get_last_inst(vcpu) == 0x44000022) &&
+                   !(vcpu->arch.shared->msr & MSR_PR)) {
+                       /* SC 1 papr hypercalls */
+                       ulong cmd = kvmppc_get_gpr(vcpu, 3);
+                       int i;
+
+                       if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) {
+                               r = RESUME_GUEST;
+                               break;
+                       }
+
+                       run->papr_hcall.nr = cmd;
+                       for (i = 0; i < 9; ++i) {
+                               ulong gpr = kvmppc_get_gpr(vcpu, 4 + i);
+                               run->papr_hcall.args[i] = gpr;
+                       }
+                       run->exit_reason = KVM_EXIT_PAPR_HCALL;
+                       vcpu->arch.hcall_needed = 1;
+                       r = RESUME_HOST;
+               } else if (vcpu->arch.osi_enabled &&
                    (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
                    (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
                        /* MOL hypercalls */
@@ -770,6 +796,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                }
        }
 
+       if (sregs->u.s.flags & KVM_SREGS_S_HIOR)
+               sregs->u.s.hior = to_book3s(vcpu)->hior;
+
        return 0;
 }
 
@@ -806,6 +835,11 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        /* Flush the MMU after messing with the segments */
        kvmppc_mmu_pte_flush(vcpu, 0, 0);
 
+       if (sregs->u.s.flags & KVM_SREGS_S_HIOR) {
+               to_book3s(vcpu)->hior_sregs = true;
+               to_book3s(vcpu)->hior = sregs->u.s.hior;
+       }
+
        return 0;
 }
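
The two sregs hunks above make HIOR guest-visible state: once userspace sets
KVM_SREGS_S_HIOR, kvmppc_set_pvr() stops forcing the reset default. A minimal
userspace sketch (vcpu_fd and error handling are assumed, not from the patch):

	struct kvm_sregs sregs;

	ioctl(vcpu_fd, KVM_GET_SREGS, &sregs);
	sregs.u.s.flags |= KVM_SREGS_S_HIOR;	/* we now own HIOR */
	sregs.u.s.hior = 0;			/* vector guest interrupts to 0 */
	ioctl(vcpu_fd, KVM_SET_SREGS, &sregs);
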
 
@@ -841,8 +875,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
        if (!p)
                goto uninit_vcpu;
 
-       vcpu->arch.host_retip = kvm_return_point;
-       vcpu->arch.host_msr = mfmsr();
 #ifdef CONFIG_PPC_BOOK3S_64
        /* default to book3s_64 (970fx) */
        vcpu->arch.pvr = 0x3C0301;
@@ -853,16 +885,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
        kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
        vcpu->arch.slb_nr = 64;
 
-       /* remember where some real-mode handlers are */
-       vcpu->arch.trampoline_lowmem = __pa(kvmppc_handler_lowmem_trampoline);
-       vcpu->arch.trampoline_enter = __pa(kvmppc_handler_trampoline_enter);
-       vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
-#ifdef CONFIG_PPC_BOOK3S_64
-       vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
-#else
-       vcpu->arch.rmcall = (ulong)kvmppc_rmcall;
-#endif
-
        vcpu->arch.shadow_msr = MSR_USER64;
 
        err = kvmppc_mmu_init(vcpu);
@@ -908,6 +930,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 #endif
        ulong ext_msr;
 
+       /* Check if we can run the vcpu at all */
+       if (!vcpu->arch.sane) {
+               kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               return -EINVAL;
+       }
+
        /* No need to go into the guest when all we do is going out */
        if (signal_pending(current)) {
                kvm_run->exit_reason = KVM_EXIT_INTR;
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
new file mode 100644 (file)
index 0000000..b958932
--- /dev/null
@@ -0,0 +1,158 @@
+/*
+ * Copyright (C) 2011. Freescale Inc. All rights reserved.
+ *
+ * Authors:
+ *    Alexander Graf <agraf@suse.de>
+ *    Paul Mackerras <paulus@samba.org>
+ *
+ * Description:
+ *
+ * Hypercall handling for running PAPR guests in PR KVM on Book 3S
+ * processors.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/uaccess.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+
+static unsigned long get_pteg_addr(struct kvm_vcpu *vcpu, long pte_index)
+{
+       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+       unsigned long pteg_addr;
+
+       pte_index <<= 4;
+       pte_index &= ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1) << 7 | 0x70;
+       pteg_addr = vcpu_book3s->sdr1 & 0xfffffffffffc0000ULL;
+       pteg_addr |= pte_index;
+
+       return pteg_addr;
+}
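
To see the masking at work, here is a worked example with made-up values (not
part of the patch): sdr1 = 0x0000000001000005 gives HTABORG = 16 MB and
HTABSIZE = 5, so the group index is confined to 2^(11+5) PTEGs:

	unsigned long sdr1 = 0x0000000001000005UL;
	unsigned long pte_index = 0x123;	/* group 0x24, slot 3 */

	pte_index <<= 4;			/* 0x1230: byte offset */
	pte_index &= ((1UL << ((sdr1 & 0x1f) + 11)) - 1) << 7 | 0x70;
	/* pteg_addr = 0x1000000 | 0x1230 = 0x1001230 */
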
+
+static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu)
+{
+       long flags = kvmppc_get_gpr(vcpu, 4);
+       long pte_index = kvmppc_get_gpr(vcpu, 5);
+       unsigned long pteg[2 * 8];
+       unsigned long pteg_addr, i, *hpte;
+
+       pte_index &= ~7UL;
+       pteg_addr = get_pteg_addr(vcpu, pte_index);
+
+       copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg));
+       hpte = pteg;
+
+       if (likely((flags & H_EXACT) == 0)) {
+               pte_index &= ~7UL;
+               for (i = 0; ; ++i) {
+                       if (i == 8)
+                               return H_PTEG_FULL;
+                       if ((*hpte & HPTE_V_VALID) == 0)
+                               break;
+                       hpte += 2;
+               }
+       } else {
+               i = kvmppc_get_gpr(vcpu, 5) & 7UL;
+               hpte += i * 2;
+       }
+
+       hpte[0] = kvmppc_get_gpr(vcpu, 6);
+       hpte[1] = kvmppc_get_gpr(vcpu, 7);
+       copy_to_user((void __user *)pteg_addr, pteg, sizeof(pteg));
+       kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
+       kvmppc_set_gpr(vcpu, 4, pte_index | i);
+
+       return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu)
+{
+       unsigned long flags = kvmppc_get_gpr(vcpu, 4);
+       unsigned long pte_index = kvmppc_get_gpr(vcpu, 5);
+       unsigned long avpn = kvmppc_get_gpr(vcpu, 6);
+       unsigned long v = 0, pteg, rb;
+       unsigned long pte[2];
+
+       pteg = get_pteg_addr(vcpu, pte_index);
+       copy_from_user(pte, (void __user *)pteg, sizeof(pte));
+
+       if ((pte[0] & HPTE_V_VALID) == 0 ||
+           ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn) ||
+           ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) {
+               kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND);
+               return EMULATE_DONE;
+       }
+
+       copy_to_user((void __user *)pteg, &v, sizeof(v));
+
+       rb = compute_tlbie_rb(pte[0], pte[1], pte_index);
+       vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false);
+
+       kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
+       kvmppc_set_gpr(vcpu, 4, pte[0]);
+       kvmppc_set_gpr(vcpu, 5, pte[1]);
+
+       return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu)
+{
+       unsigned long flags = kvmppc_get_gpr(vcpu, 4);
+       unsigned long pte_index = kvmppc_get_gpr(vcpu, 5);
+       unsigned long avpn = kvmppc_get_gpr(vcpu, 6);
+       unsigned long rb, pteg, r, v;
+       unsigned long pte[2];
+
+       pteg = get_pteg_addr(vcpu, pte_index);
+       copy_from_user(pte, (void __user *)pteg, sizeof(pte));
+
+       if ((pte[0] & HPTE_V_VALID) == 0 ||
+           ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn)) {
+               kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND);
+               return EMULATE_DONE;
+       }
+
+       v = pte[0];
+       r = pte[1];
+       r &= ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_HI |
+              HPTE_R_KEY_LO);
+       r |= (flags << 55) & HPTE_R_PP0;
+       r |= (flags << 48) & HPTE_R_KEY_HI;
+       r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+
+       pte[1] = r;
+
+       rb = compute_tlbie_rb(v, r, pte_index);
+       vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false);
+       copy_to_user((void __user *)pteg, pte, sizeof(pte));
+
+       kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
+
+       return EMULATE_DONE;
+}
+
+int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
+{
+       switch (cmd) {
+       case H_ENTER:
+               return kvmppc_h_pr_enter(vcpu);
+       case H_REMOVE:
+               return kvmppc_h_pr_remove(vcpu);
+       case H_PROTECT:
+               return kvmppc_h_pr_protect(vcpu);
+       case H_BULK_REMOVE:
+               /* We just flush all PTEs, so user space can
+                * handle the HPT modifications */
+               kvmppc_mmu_pte_flush(vcpu, 0, 0);
+               break;
+       case H_CEDE:
+               kvm_vcpu_block(vcpu);
+               vcpu->stat.halt_wakeup++;
+               return EMULATE_DONE;
+       }
+
+       return EMULATE_FAIL;
+}
index c1f877c..3418758 100644 (file)
@@ -20,6 +20,7 @@
 #include <asm/ppc_asm.h>
 #include <asm/kvm_asm.h>
 #include <asm/reg.h>
+#include <asm/mmu.h>
 #include <asm/page.h>
 #include <asm/asm-offsets.h>
 
 
 #if defined(CONFIG_PPC_BOOK3S_64)
 
-#define LOAD_SHADOW_VCPU(reg)  GET_PACA(reg)                                   
-#define MSR_NOIRQ              MSR_KERNEL & ~(MSR_IR | MSR_DR)
 #define FUNC(name)             GLUE(.,name)
+#define MTMSR_EERI(reg)                mtmsrd  (reg),1
 
+       .globl  kvmppc_skip_interrupt
 kvmppc_skip_interrupt:
        /*
         * Here all GPRs are unchanged from when the interrupt happened
@@ -51,6 +52,7 @@ kvmppc_skip_interrupt:
        rfid
        b       .
 
+       .globl  kvmppc_skip_Hinterrupt
 kvmppc_skip_Hinterrupt:
        /*
         * Here all GPRs are unchanged from when the interrupt happened
@@ -65,8 +67,8 @@ kvmppc_skip_Hinterrupt:
 
 #elif defined(CONFIG_PPC_BOOK3S_32)
 
-#define MSR_NOIRQ              MSR_KERNEL
 #define FUNC(name)             name
+#define MTMSR_EERI(reg)                mtmsr   (reg)
 
 .macro INTERRUPT_TRAMPOLINE intno
 
@@ -167,40 +169,24 @@ kvmppc_handler_skip_ins:
 #endif
 
 /*
- * This trampoline brings us back to a real mode handler
- *
- * Input Registers:
- *
- * R5 = SRR0
- * R6 = SRR1
- * LR = real-mode IP
+ * Call kvmppc_handler_trampoline_enter in real mode
  *
+ * On entry, r4 contains the guest shadow MSR
  */
-.global kvmppc_handler_lowmem_trampoline
-kvmppc_handler_lowmem_trampoline:
-
-       mtsrr0  r5
+_GLOBAL(kvmppc_entry_trampoline)
+       mfmsr   r5
+       LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter)
+       toreal(r7)
+
+       li      r9, MSR_RI
+       ori     r9, r9, MSR_EE
+       andc    r9, r5, r9      /* Clear EE and RI in MSR value */
+       li      r6, MSR_IR | MSR_DR
+       ori     r6, r6, MSR_EE
+       andc    r6, r5, r6      /* Clear EE, DR and IR in MSR value */
+       MTMSR_EERI(r9)          /* Clear EE and RI in MSR */
+       mtsrr0  r7              /* before we set srr0/1 */
        mtsrr1  r6
-       blr
-kvmppc_handler_lowmem_trampoline_end:
-
-/*
- * Call a function in real mode
- *
- * Input Registers:
- *
- * R3 = function
- * R4 = MSR
- * R5 = scratch register
- *
- */
-_GLOBAL(kvmppc_rmcall)
-       LOAD_REG_IMMEDIATE(r5, MSR_NOIRQ)
-       mtmsr   r5              /* Disable relocation and interrupts, so mtsrr
-                                  doesn't get interrupted */
-       sync
-       mtsrr0  r3
-       mtsrr1  r4
        RFI
 
 #if defined(CONFIG_PPC_BOOK3S_32)
index aed32e5..0676ae2 100644 (file)
@@ -23,6 +23,7 @@
 
 #define GET_SHADOW_VCPU(reg)    \
        mr      reg, r13
+#define MTMSR_EERI(reg)                mtmsrd  (reg),1
 
 #elif defined(CONFIG_PPC_BOOK3S_32)
 
@@ -30,6 +31,7 @@
        tophys(reg, r2);                        \
        lwz     reg, (THREAD + THREAD_KVM_SVCPU)(reg);  \
        tophys(reg, reg)
+#define MTMSR_EERI(reg)                mtmsr   (reg)
 
 #endif
 
@@ -57,10 +59,12 @@ kvmppc_handler_trampoline_enter:
        /* Required state:
         *
         * MSR = ~IR|DR
-        * R13 = PACA
         * R1 = host R1
         * R2 = host R2
-        * R10 = guest MSR
+        * R4 = guest shadow MSR
+        * R5 = normal host MSR
+        * R6 = current host MSR (EE, IR, DR off)
+        * LR = highmem guest exit code
         * all other volatile GPRS = free
         * SVCPU[CR] = guest CR
         * SVCPU[XER] = guest XER
@@ -71,15 +75,15 @@ kvmppc_handler_trampoline_enter:
        /* r3 = shadow vcpu */
        GET_SHADOW_VCPU(r3)
 
+       /* Save guest exit handler address and MSR */
+       mflr    r0
+       PPC_STL r0, HSTATE_VMHANDLER(r3)
+       PPC_STL r5, HSTATE_HOST_MSR(r3)
+
        /* Save R1/R2 in the PACA (64-bit) or shadow_vcpu (32-bit) */
        PPC_STL r1, HSTATE_HOST_R1(r3)
        PPC_STL r2, HSTATE_HOST_R2(r3)
 
-       /* Move SRR0 and SRR1 into the respective regs */
-       PPC_LL  r9, SVCPU_PC(r3)
-       mtsrr0  r9
-       mtsrr1  r10
-
        /* Activate guest mode, so faults get handled by KVM */
        li      r11, KVM_GUEST_MODE_GUEST
        stb     r11, HSTATE_IN_GUEST(r3)
@@ -87,17 +91,46 @@ kvmppc_handler_trampoline_enter:
        /* Switch to guest segment. This is subarch specific. */
        LOAD_GUEST_SEGMENTS
 
+#ifdef CONFIG_PPC_BOOK3S_64
+       /* Some guests may need to have dcbz set to 32 byte length.
+        *
+        * Usually we ensure that by patching the guest's instructions
+        * to trap on dcbz and emulate it in the hypervisor.
+        *
+        * If we can, we should tell the CPU to use 32 byte dcbz though,
+        * because that's a lot faster.
+        */
+       lbz     r0, HSTATE_RESTORE_HID5(r3)
+       cmpwi   r0, 0
+       beq     no_dcbz32_on
+
+       mfspr   r0,SPRN_HID5
+       ori     r0, r0, 0x80            /* XXX HID5_dcbz32 = 0x80 */
+       mtspr   SPRN_HID5,r0
+no_dcbz32_on:
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
        /* Enter guest */
 
-       PPC_LL  r4, SVCPU_CTR(r3)
-       PPC_LL  r5, SVCPU_LR(r3)
-       lwz     r6, SVCPU_CR(r3)
-       lwz     r7, SVCPU_XER(r3)
+       PPC_LL  r8, SVCPU_CTR(r3)
+       PPC_LL  r9, SVCPU_LR(r3)
+       lwz     r10, SVCPU_CR(r3)
+       lwz     r11, SVCPU_XER(r3)
+
+       mtctr   r8
+       mtlr    r9
+       mtcr    r10
+       mtxer   r11
 
-       mtctr   r4
-       mtlr    r5
-       mtcr    r6
-       mtxer   r7
+       /* Move SRR0 and SRR1 into the respective regs */
+       PPC_LL  r9, SVCPU_PC(r3)
+       /* First clear RI in our current MSR value */
+       li      r0, MSR_RI
+       andc    r6, r6, r0
+       MTMSR_EERI(r6)
+       mtsrr0  r9
+       mtsrr1  r4
 
        PPC_LL  r0, SVCPU_R0(r3)
        PPC_LL  r1, SVCPU_R1(r3)
@@ -213,11 +246,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
        beq     ld_last_inst
        cmpwi   r12, BOOK3S_INTERRUPT_PROGRAM
        beq     ld_last_inst
+       cmpwi   r12, BOOK3S_INTERRUPT_SYSCALL
+       beq     ld_last_prev_inst
        cmpwi   r12, BOOK3S_INTERRUPT_ALIGNMENT
        beq-    ld_last_inst
 
        b       no_ld_last_inst
 
+ld_last_prev_inst:
+       addi    r3, r3, -4
+
 ld_last_inst:
        /* Save off the guest instruction we're at */
 
@@ -254,6 +292,43 @@ no_ld_last_inst:
        /* Switch back to host MMU */
        LOAD_HOST_SEGMENTS
 
+#ifdef CONFIG_PPC_BOOK3S_64
+
+       lbz     r5, HSTATE_RESTORE_HID5(r13)
+       cmpwi   r5, 0
+       beq     no_dcbz32_off
+
+       li      r4, 0
+       mfspr   r5,SPRN_HID5
+       rldimi  r5,r4,6,56
+       mtspr   SPRN_HID5,r5
+
+no_dcbz32_off:
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+       /*
+        * For some interrupts, we need to call the real Linux
+        * handler, so it can do work for us. This has to happen
+        * as if the interrupt arrived from the kernel though,
+        * so let's fake it here where most state is restored.
+        *
+        * Having set up SRR0/1 with the address where we want
+        * to continue with relocation on (potentially in module
+        * space), we either just go straight there with rfi[d],
+        * or we jump to an interrupt handler with bctr if there
+        * is an interrupt to be handled first.  In the latter
+        * case, the rfi[d] at the end of the interrupt handler
+        * will get us back to where we want to continue.
+        */
+
+       cmpwi   r12, BOOK3S_INTERRUPT_EXTERNAL
+       beq     1f
+       cmpwi   r12, BOOK3S_INTERRUPT_DECREMENTER
+       beq     1f
+       cmpwi   r12, BOOK3S_INTERRUPT_PERFMON
+1:     mtctr   r12
+
        /* Register usage at this point:
         *
         * R1       = host R1
@@ -264,13 +339,15 @@ no_ld_last_inst:
         *
         */
 
-       /* RFI into the highmem handler */
-       mfmsr   r7
-       ori     r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME     /* Enable paging */
-       mtsrr1  r7
-       /* Load highmem handler address */
+       PPC_LL  r6, HSTATE_HOST_MSR(r13)
        PPC_LL  r8, HSTATE_VMHANDLER(r13)
+
+       /* Restore host msr -> SRR1 */
+       mtsrr1  r6
+       /* Load highmem handler address */
        mtsrr0  r8
 
+       /* RFI into the highmem handler, or jump to interrupt handler */
+       beqctr
        RFI
 kvmppc_handler_trampoline_exit_end:
index ee45fa0..bb6c988 100644 (file)
@@ -316,6 +316,11 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
        int ret;
 
+       if (!vcpu->arch.sane) {
+               kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               return -EINVAL;
+       }
+
        local_irq_disable();
        kvm_guest_enter();
        ret = __kvmppc_vcpu_run(kvm_run, vcpu);
@@ -618,6 +623,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
        int i;
+       int r;
 
        vcpu->arch.pc = 0;
        vcpu->arch.shared->msr = 0;
@@ -634,7 +640,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
        kvmppc_init_timing_stats(vcpu);
 
-       return kvmppc_core_vcpu_setup(vcpu);
+       r = kvmppc_core_vcpu_setup(vcpu);
+       kvmppc_sanity_check(vcpu);
+       return r;
 }
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
index 797a744..26d2090 100644 (file)
@@ -73,6 +73,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
        /* Since booke kvm only supports one core, update all vcpus' PIR to 0 */
        vcpu->vcpu_id = 0;
 
+       vcpu->arch.cpu_type = KVM_CPU_E500V2;
+
        return 0;
 }
 
index a107c9b..0d843c6 100644 (file)
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
-#ifndef CONFIG_KVM_BOOK3S_64_HV
        return !(v->arch.shared->msr & MSR_WE) ||
               !!(v->arch.pending_exceptions);
-#else
-       return !(v->arch.ceded) || !!(v->arch.pending_exceptions);
-#endif
 }
 
 int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
@@ -95,6 +91,31 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
        return r;
 }
 
+int kvmppc_sanity_check(struct kvm_vcpu *vcpu)
+{
+       int r = false;
+
+       /* We have to know what CPU to virtualize */
+       if (!vcpu->arch.pvr)
+               goto out;
+
+       /* PAPR only works with book3s_64 */
+       if ((vcpu->arch.cpu_type != KVM_CPU_3S_64) && vcpu->arch.papr_enabled)
+               goto out;
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+       /* HV KVM can only do PAPR mode for now */
+       if (!vcpu->arch.papr_enabled)
+               goto out;
+#endif
+
+       r = true;
+
+out:
+       vcpu->arch.sane = r;
+       return r ? 0 : -EINVAL;
+}
+
 int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
        enum emulation_result er;
@@ -188,6 +209,8 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_PPC_BOOKE_SREGS:
 #else
        case KVM_CAP_PPC_SEGSTATE:
+       case KVM_CAP_PPC_HIOR:
+       case KVM_CAP_PPC_PAPR:
 #endif
        case KVM_CAP_PPC_UNSET_IRQ:
        case KVM_CAP_PPC_IRQ_LEVEL:
@@ -258,6 +281,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 {
        struct kvm_vcpu *vcpu;
        vcpu = kvmppc_core_vcpu_create(kvm, id);
-       if (!IS_ERR(vcpu))
+       if (!IS_ERR(vcpu)) {
+               vcpu->arch.wqp = &vcpu->wq;
                kvmppc_create_vcpu_debugfs(vcpu, id);
+       }
        return vcpu;
@@ -289,8 +313,8 @@ static void kvmppc_decrementer_func(unsigned long data)
 
        kvmppc_core_queue_dec(vcpu);
 
-       if (waitqueue_active(&vcpu->wq)) {
-               wake_up_interruptible(&vcpu->wq);
+       if (waitqueue_active(vcpu->arch.wqp)) {
+               wake_up_interruptible(vcpu->arch.wqp);
                vcpu->stat.halt_wakeup++;
        }
 }
@@ -543,13 +567,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
 int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 {
-       if (irq->irq == KVM_INTERRUPT_UNSET)
+       if (irq->irq == KVM_INTERRUPT_UNSET) {
                kvmppc_core_dequeue_external(vcpu, irq);
-       else
-               kvmppc_core_queue_external(vcpu, irq);
+               return 0;
+       }
+
+       kvmppc_core_queue_external(vcpu, irq);
 
-       if (waitqueue_active(&vcpu->wq)) {
-               wake_up_interruptible(&vcpu->wq);
+       if (waitqueue_active(vcpu->arch.wqp)) {
+               wake_up_interruptible(vcpu->arch.wqp);
                vcpu->stat.halt_wakeup++;
        } else if (vcpu->cpu != -1) {
                smp_send_reschedule(vcpu->cpu);
@@ -571,11 +597,18 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
                r = 0;
                vcpu->arch.osi_enabled = true;
                break;
+       case KVM_CAP_PPC_PAPR:
+               r = 0;
+               vcpu->arch.papr_enabled = true;
+               break;
        default:
                r = -EINVAL;
                break;
        }
 
+       if (!r)
+               r = kvmppc_sanity_check(vcpu);
+
        return r;
 }
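
With the capability wired through kvmppc_sanity_check(), a userspace VMM opts
a vcpu into PAPR mode roughly as follows (sketch only; vcpu_fd assumed):

	struct kvm_enable_cap cap = { .cap = KVM_CAP_PPC_PAPR };

	if (ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap) < 0)
		perror("KVM_ENABLE_CAP");	/* e.g. the sanity check refused it */
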
 
index 00ff00d..1ca5de0 100644 (file)
@@ -119,6 +119,7 @@ struct kvm_vcpu_stat {
        u32 instruction_lctlg;
        u32 exit_program_interruption;
        u32 exit_instr_and_program;
+       u32 deliver_external_call;
        u32 deliver_emergency_signal;
        u32 deliver_service_signal;
        u32 deliver_virtio_interrupt;
@@ -138,6 +139,7 @@ struct kvm_vcpu_stat {
        u32 instruction_stfl;
        u32 instruction_tprot;
        u32 instruction_sigp_sense;
+       u32 instruction_sigp_external_call;
        u32 instruction_sigp_emergency;
        u32 instruction_sigp_stop;
        u32 instruction_sigp_arch;
@@ -174,6 +176,10 @@ struct kvm_s390_prefix_info {
        __u32 address;
 };
 
+struct kvm_s390_extcall_info {
+       __u16 code;
+};
+
 struct kvm_s390_emerg_info {
        __u16 code;
 };
@@ -186,6 +192,7 @@ struct kvm_s390_interrupt_info {
                struct kvm_s390_ext_info ext;
                struct kvm_s390_pgm_info pgm;
                struct kvm_s390_emerg_info emerg;
+               struct kvm_s390_extcall_info extcall;
                struct kvm_s390_prefix_info prefix;
        };
 };
index c9aeb4b..87c1670 100644 (file)
@@ -38,6 +38,11 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
                                      struct kvm_s390_interrupt_info *inti)
 {
        switch (inti->type) {
+       case KVM_S390_INT_EXTERNAL_CALL:
+               if (psw_extint_disabled(vcpu))
+                       return 0;
+               if (vcpu->arch.sie_block->gcr[0] & 0x2000ul)
+                       return 1;
        case KVM_S390_INT_EMERGENCY:
                if (psw_extint_disabled(vcpu))
                        return 0;
@@ -98,6 +103,7 @@ static void __set_intercept_indicator(struct kvm_vcpu *vcpu,
                                      struct kvm_s390_interrupt_info *inti)
 {
        switch (inti->type) {
+       case KVM_S390_INT_EXTERNAL_CALL:
        case KVM_S390_INT_EMERGENCY:
        case KVM_S390_INT_SERVICE:
        case KVM_S390_INT_VIRTIO:
@@ -143,6 +149,28 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                        exception = 1;
                break;
 
+       case KVM_S390_INT_EXTERNAL_CALL:
+               VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call");
+               vcpu->stat.deliver_external_call++;
+               rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202);
+               if (rc == -EFAULT)
+                       exception = 1;
+
+               rc = put_guest_u16(vcpu, __LC_CPU_ADDRESS, inti->extcall.code);
+               if (rc == -EFAULT)
+                       exception = 1;
+
+               rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+                        &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               if (rc == -EFAULT)
+                       exception = 1;
+
+               rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                       __LC_EXT_NEW_PSW, sizeof(psw_t));
+               if (rc == -EFAULT)
+                       exception = 1;
+               break;
+
        case KVM_S390_INT_SERVICE:
                VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x",
                           inti->ext.ext_params);
@@ -522,6 +550,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
                break;
        case KVM_S390_PROGRAM_INT:
        case KVM_S390_SIGP_STOP:
+       case KVM_S390_INT_EXTERNAL_CALL:
        case KVM_S390_INT_EMERGENCY:
        default:
                kfree(inti);
@@ -581,6 +610,7 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
                break;
        case KVM_S390_SIGP_STOP:
        case KVM_S390_RESTART:
+       case KVM_S390_INT_EXTERNAL_CALL:
        case KVM_S390_INT_EMERGENCY:
                VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type);
                inti->type = s390int->type;
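
Userspace can feed this vcpu-local path directly; a hedged sketch (in this
version only the type field is consumed for an external call):

	struct kvm_s390_interrupt irq = {
		.type = KVM_S390_INT_EXTERNAL_CALL,
	};

	ioctl(vcpu_fd, KVM_S390_INTERRUPT, &irq);	/* sketch, no error handling */
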
index dc2b580..9610ba4 100644 (file)
@@ -46,6 +46,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "instruction_lctlg", VCPU_STAT(instruction_lctlg) },
        { "instruction_lctl", VCPU_STAT(instruction_lctl) },
        { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) },
+       { "deliver_external_call", VCPU_STAT(deliver_external_call) },
        { "deliver_service_signal", VCPU_STAT(deliver_service_signal) },
        { "deliver_virtio_interrupt", VCPU_STAT(deliver_virtio_interrupt) },
        { "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) },
@@ -64,6 +65,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "instruction_stfl", VCPU_STAT(instruction_stfl) },
        { "instruction_tprot", VCPU_STAT(instruction_tprot) },
        { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
+       { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
        { "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) },
        { "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) },
        { "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) },
@@ -175,6 +177,8 @@ int kvm_arch_init_vm(struct kvm *kvm)
        if (rc)
                goto out_err;
 
+       rc = -ENOMEM;
+
        kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL);
        if (!kvm->arch.sca)
                goto out_err;
@@ -312,11 +316,17 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
                                      unsigned int id)
 {
-       struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
-       int rc = -ENOMEM;
+       struct kvm_vcpu *vcpu;
+       int rc = -EINVAL;
+
+       if (id >= KVM_MAX_VCPUS)
+               goto out;
 
+       rc = -ENOMEM;
+
+       vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
        if (!vcpu)
-               goto out_nomem;
+               goto out;
 
        vcpu->arch.sie_block = (struct kvm_s390_sie_block *)
                                        get_zeroed_page(GFP_KERNEL);
@@ -352,7 +362,7 @@ out_free_sie_block:
        free_page((unsigned long)(vcpu->arch.sie_block));
 out_free_cpu:
        kfree(vcpu);
-out_nomem:
+out:
        return ERR_PTR(rc);
 }
 
@@ -386,6 +396,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 {
        memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs));
        memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs));
+       restore_access_regs(vcpu->arch.guest_acrs);
        return 0;
 }
 
@@ -401,6 +412,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
        memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
        vcpu->arch.guest_fpregs.fpc = fpu->fpc;
+       restore_fp_regs(&vcpu->arch.guest_fpregs);
        return 0;
 }
 
index d6a50c1..f815118 100644 (file)
@@ -87,6 +87,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
                return -ENOMEM;
 
        inti->type = KVM_S390_INT_EMERGENCY;
+       inti->emerg.code = vcpu->vcpu_id;
 
        spin_lock(&fi->lock);
        li = fi->local_int[cpu_addr];
@@ -103,9 +104,47 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
                wake_up_interruptible(&li->wq);
        spin_unlock_bh(&li->lock);
        rc = 0; /* order accepted */
+       VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
+unlock:
+       spin_unlock(&fi->lock);
+       return rc;
+}
+
+static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
+{
+       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
+       struct kvm_s390_local_interrupt *li;
+       struct kvm_s390_interrupt_info *inti;
+       int rc;
+
+       if (cpu_addr >= KVM_MAX_VCPUS)
+               return 3; /* not operational */
+
+       inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+       if (!inti)
+               return -ENOMEM;
+
+       inti->type = KVM_S390_INT_EXTERNAL_CALL;
+       inti->extcall.code = vcpu->vcpu_id;
+
+       spin_lock(&fi->lock);
+       li = fi->local_int[cpu_addr];
+       if (li == NULL) {
+               rc = 3; /* not operational */
+               kfree(inti);
+               goto unlock;
+       }
+       spin_lock_bh(&li->lock);
+       list_add_tail(&inti->list, &li->list);
+       atomic_set(&li->active, 1);
+       atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
+       if (waitqueue_active(&li->wq))
+               wake_up_interruptible(&li->wq);
+       spin_unlock_bh(&li->lock);
+       rc = 0; /* order accepted */
+       VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr);
 unlock:
        spin_unlock(&fi->lock);
-       VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
        return rc;
 }
 
@@ -267,6 +306,10 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
                rc = __sigp_sense(vcpu, cpu_addr,
                                  &vcpu->arch.guest_gprs[r1]);
                break;
+       case SIGP_EXTERNAL_CALL:
+               vcpu->stat.instruction_sigp_external_call++;
+               rc = __sigp_external_call(vcpu, cpu_addr);
+               break;
        case SIGP_EMERGENCY:
                vcpu->stat.instruction_sigp_emergency++;
                rc = __sigp_emergency(vcpu, cpu_addr);
index 34595d5..3925d80 100644 (file)
 #define                APIC_TIMER_BASE_CLKIN           0x0
 #define                APIC_TIMER_BASE_TMBASE          0x1
 #define                APIC_TIMER_BASE_DIV             0x2
+#define                APIC_LVT_TIMER_ONESHOT          (0 << 17)
 #define                APIC_LVT_TIMER_PERIODIC         (1 << 17)
+#define                APIC_LVT_TIMER_TSCDEADLINE      (2 << 17)
 #define                APIC_LVT_MASKED                 (1 << 16)
 #define                APIC_LVT_LEVEL_TRIGGER          (1 << 15)
 #define                APIC_LVT_REMOTE_IRR             (1 << 14)
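
The LVT timer mode is now a two-bit field rather than a lone PERIODIC bit. A
decoder can recover it with one mask (helper name is illustrative, not from
the patch):

	static inline u32 apic_lvtt_mode(u32 lvt_timer)
	{
		/* 0 = one-shot, 1<<17 = periodic, 2<<17 = TSC deadline */
		return lvt_timer & (3u << 17);
	}
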
index aa6a488..2f84a43 100644 (file)
 #define X86_FEATURE_X2APIC     (4*32+21) /* x2APIC */
 #define X86_FEATURE_MOVBE      (4*32+22) /* MOVBE instruction */
 #define X86_FEATURE_POPCNT      (4*32+23) /* POPCNT instruction */
+#define X86_FEATURE_TSC_DEADLINE_TIMER (4*32+24) /* TSC deadline timer */
 #define X86_FEATURE_AES                (4*32+25) /* AES instructions */
 #define X86_FEATURE_XSAVE      (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
 #define X86_FEATURE_OSXSAVE    (4*32+27) /* "" XSAVE enabled in the OS */
index 6040d11..a026507 100644 (file)
@@ -262,7 +262,7 @@ struct x86_emulate_ctxt {
        struct operand dst;
        bool has_seg_override;
        u8 seg_override;
-       unsigned int d;
+       u64 d;
        int (*execute)(struct x86_emulate_ctxt *ctxt);
        int (*check_perm)(struct x86_emulate_ctxt *ctxt);
        /* modrm */
@@ -275,6 +275,8 @@ struct x86_emulate_ctxt {
        unsigned long _eip;
        /* Fields above regs are cleared together. */
        unsigned long regs[NR_VCPU_REGS];
+       struct operand memop;
+       struct operand *memopp;
        struct fetch_cache fetch;
        struct read_cache io_read;
        struct read_cache mem_read;
index dd51c83..b4973f4 100644 (file)
@@ -26,7 +26,8 @@
 #include <asm/mtrr.h>
 #include <asm/msr-index.h>
 
-#define KVM_MAX_VCPUS 64
+#define KVM_MAX_VCPUS 254
+#define KVM_SOFT_MAX_VCPUS 64
 #define KVM_MEMORY_SLOTS 32
 /* memory slots that are not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
@@ -264,6 +265,7 @@ struct kvm_mmu {
        void (*new_cr3)(struct kvm_vcpu *vcpu);
        void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
        unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
+       u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
        int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
                          bool prefault);
        void (*inject_page_fault)(struct kvm_vcpu *vcpu,
@@ -411,8 +413,9 @@ struct kvm_vcpu_arch {
        u32  tsc_catchup_mult;
        s8   tsc_catchup_shift;
 
-       bool nmi_pending;
-       bool nmi_injected;
+       atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
+       unsigned nmi_pending; /* NMI queued after currently running handler */
+       bool nmi_injected;    /* Trying to inject an NMI this entry */
 
        struct mtrr_state_type mtrr_state;
        u32 pat;
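
The nmi_queued/nmi_pending split lets interrupt context merely bump an atomic
counter; the vcpu loop later folds it into nmi_pending, clamped to what the
hardware can keep latched. A simplified sketch of that consuming side (see
"KVM: Fix simultaneous NMIs" in the series; details may differ):

	unsigned limit = vcpu->arch.nmi_injected ? 1 : 2;

	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
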
@@ -628,14 +631,13 @@ struct kvm_x86_ops {
        void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
        u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
+       u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu);
 
        void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
 
        int (*check_intercept)(struct kvm_vcpu *vcpu,
                               struct x86_instruction_info *info,
                               enum x86_intercept_stage stage);
-
-       const struct trace_print_flags *exit_reasons_str;
 };
 
 struct kvm_arch_async_pf {
@@ -672,6 +674,8 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 extern bool tdp_enabled;
 
+u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
+
 /* control of guest tsc rate supported? */
 extern bool kvm_has_tsc_control;
 /* minimum supported tsc_khz for guests */
index d52609a..a6962d9 100644 (file)
 #define MSR_IA32_APICBASE_ENABLE       (1<<11)
 #define MSR_IA32_APICBASE_BASE         (0xfffff<<12)
 
+#define MSR_IA32_TSCDEADLINE           0x000006e0
+
 #define MSR_IA32_UCODE_WRITE           0x00000079
 #define MSR_IA32_UCODE_REV             0x0000008b
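
MSR_IA32_TSCDEADLINE takes an absolute TSC value; the local APIC timer fires
once the TSC passes it. A guest arms it along these lines (sketch;
delta_cycles is an assumed variable):

	u64 deadline = native_read_tsc() + delta_cycles;

	wrmsrl(MSR_IA32_TSCDEADLINE, deadline);	/* one-shot, absolute */
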
 
index 2caf290..31f180c 100644 (file)
@@ -350,6 +350,18 @@ enum vmcs_field {
 #define DEBUG_REG_ACCESS_REG(eq)        (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. */
 
 
+/*
+ * Exit Qualifications for APIC-Access
+ */
+#define APIC_ACCESS_OFFSET              0xfff   /* 11:0, offset within the APIC page */
+#define APIC_ACCESS_TYPE                0xf000  /* 15:12, access type */
+#define TYPE_LINEAR_APIC_INST_READ      (0 << 12)
+#define TYPE_LINEAR_APIC_INST_WRITE     (1 << 12)
+#define TYPE_LINEAR_APIC_INST_FETCH     (2 << 12)
+#define TYPE_LINEAR_APIC_EVENT          (3 << 12)
+#define TYPE_PHYSICAL_APIC_EVENT        (10 << 12)
+#define TYPE_PHYSICAL_APIC_INST         (15 << 12)
+
 /* segment AR */
 #define SEGMENT_AR_L_MASK (1 << 13)
 
index 8b4cc5f..f1e3be1 100644 (file)
 #include "x86.h"
 #include "tss.h"
 
+/*
+ * Operand types
+ */
+#define OpNone             0ull
+#define OpImplicit         1ull  /* No generic decode */
+#define OpReg              2ull  /* Register */
+#define OpMem              3ull  /* Memory */
+#define OpAcc              4ull  /* Accumulator: AL/AX/EAX/RAX */
+#define OpDI               5ull  /* ES:DI/EDI/RDI */
+#define OpMem64            6ull  /* Memory, 64-bit */
+#define OpImmUByte         7ull  /* Zero-extended 8-bit immediate */
+#define OpDX               8ull  /* DX register */
+#define OpCL               9ull  /* CL register (for shifts) */
+#define OpImmByte         10ull  /* 8-bit sign extended immediate */
+#define OpOne             11ull  /* Implied 1 */
+#define OpImm             12ull  /* Sign extended immediate */
+#define OpMem16           13ull  /* Memory operand (16-bit). */
+#define OpMem32           14ull  /* Memory operand (32-bit). */
+#define OpImmU            15ull  /* Immediate operand, zero extended */
+#define OpSI              16ull  /* SI/ESI/RSI */
+#define OpImmFAddr        17ull  /* Immediate far address */
+#define OpMemFAddr        18ull  /* Far address in memory */
+#define OpImmU16          19ull  /* Immediate operand, 16 bits, zero extended */
+#define OpES              20ull  /* ES */
+#define OpCS              21ull  /* CS */
+#define OpSS              22ull  /* SS */
+#define OpDS              23ull  /* DS */
+#define OpFS              24ull  /* FS */
+#define OpGS              25ull  /* GS */
+
+#define OpBits             5  /* Width of operand field */
+#define OpMask             ((1ull << OpBits) - 1)
+
 /*
  * Opcode effective-address decode tables.
  * Note that we only emulate instructions that have at least one memory
 /* Operand sizes: 8-bit operands or specified/overridden size. */
 #define ByteOp      (1<<0)     /* 8-bit operands. */
 /* Destination operand type. */
-#define ImplicitOps (1<<1)     /* Implicit in opcode. No generic decode. */
-#define DstReg      (2<<1)     /* Register operand. */
-#define DstMem      (3<<1)     /* Memory operand. */
-#define DstAcc      (4<<1)     /* Destination Accumulator */
-#define DstDI       (5<<1)     /* Destination is in ES:(E)DI */
-#define DstMem64    (6<<1)     /* 64bit memory operand */
-#define DstImmUByte (7<<1)     /* 8-bit unsigned immediate operand */
-#define DstDX       (8<<1)     /* Destination is in DX register */
-#define DstMask     (0xf<<1)
+#define DstShift    1
+#define ImplicitOps (OpImplicit << DstShift)
+#define DstReg      (OpReg << DstShift)
+#define DstMem      (OpMem << DstShift)
+#define DstAcc      (OpAcc << DstShift)
+#define DstDI       (OpDI << DstShift)
+#define DstMem64    (OpMem64 << DstShift)
+#define DstImmUByte (OpImmUByte << DstShift)
+#define DstDX       (OpDX << DstShift)
+#define DstMask     (OpMask << DstShift)
 /* Source operand type. */
-#define SrcNone     (0<<5)     /* No source operand. */
-#define SrcReg      (1<<5)     /* Register operand. */
-#define SrcMem      (2<<5)     /* Memory operand. */
-#define SrcMem16    (3<<5)     /* Memory operand (16-bit). */
-#define SrcMem32    (4<<5)     /* Memory operand (32-bit). */
-#define SrcImm      (5<<5)     /* Immediate operand. */
-#define SrcImmByte  (6<<5)     /* 8-bit sign-extended immediate operand. */
-#define SrcOne      (7<<5)     /* Implied '1' */
-#define SrcImmUByte (8<<5)      /* 8-bit unsigned immediate operand. */
-#define SrcImmU     (9<<5)      /* Immediate operand, unsigned */
-#define SrcSI       (0xa<<5)   /* Source is in the DS:RSI */
-#define SrcImmFAddr (0xb<<5)   /* Source is immediate far address */
-#define SrcMemFAddr (0xc<<5)   /* Source is far address in memory */
-#define SrcAcc      (0xd<<5)   /* Source Accumulator */
-#define SrcImmU16   (0xe<<5)    /* Immediate operand, unsigned, 16 bits */
-#define SrcDX       (0xf<<5)   /* Source is in DX register */
-#define SrcMask     (0xf<<5)
-/* Generic ModRM decode. */
-#define ModRM       (1<<9)
-/* Destination is only written; never read. */
-#define Mov         (1<<10)
+#define SrcShift    6
+#define SrcNone     (OpNone << SrcShift)
+#define SrcReg      (OpReg << SrcShift)
+#define SrcMem      (OpMem << SrcShift)
+#define SrcMem16    (OpMem16 << SrcShift)
+#define SrcMem32    (OpMem32 << SrcShift)
+#define SrcImm      (OpImm << SrcShift)
+#define SrcImmByte  (OpImmByte << SrcShift)
+#define SrcOne      (OpOne << SrcShift)
+#define SrcImmUByte (OpImmUByte << SrcShift)
+#define SrcImmU     (OpImmU << SrcShift)
+#define SrcSI       (OpSI << SrcShift)
+#define SrcImmFAddr (OpImmFAddr << SrcShift)
+#define SrcMemFAddr (OpMemFAddr << SrcShift)
+#define SrcAcc      (OpAcc << SrcShift)
+#define SrcImmU16   (OpImmU16 << SrcShift)
+#define SrcDX       (OpDX << SrcShift)
+#define SrcMask     (OpMask << SrcShift)
 #define BitOp       (1<<11)
 #define MemAbs      (1<<12)      /* Memory operand is absolute displacement */
 #define String      (1<<13)     /* String instruction (rep capable) */
 #define Prefix      (3<<15)     /* Instruction varies with 66/f2/f3 prefix */
 #define RMExt       (4<<15)     /* Opcode extension in ModRM r/m if mod == 3 */
 #define Sse         (1<<18)     /* SSE Vector instruction */
+/* Generic ModRM decode. */
+#define ModRM       (1<<19)
+/* Destination is only written; never read. */
+#define Mov         (1<<20)
 /* Misc flags */
 #define Prot        (1<<21) /* instruction generates #UD if not in prot-mode */
 #define VendorSpecific (1<<22) /* Vendor specific instruction */
 #define Priv        (1<<27) /* instruction generates #GP if current CPL != 0 */
 #define No64       (1<<28)
 /* Source 2 operand type */
-#define Src2None    (0<<29)
-#define Src2CL      (1<<29)
-#define Src2ImmByte (2<<29)
-#define Src2One     (3<<29)
-#define Src2Imm     (4<<29)
-#define Src2Mask    (7<<29)
+#define Src2Shift   (29)
+#define Src2None    (OpNone << Src2Shift)
+#define Src2CL      (OpCL << Src2Shift)
+#define Src2ImmByte (OpImmByte << Src2Shift)
+#define Src2One     (OpOne << Src2Shift)
+#define Src2Imm     (OpImm << Src2Shift)
+#define Src2ES      (OpES << Src2Shift)
+#define Src2CS      (OpCS << Src2Shift)
+#define Src2SS      (OpSS << Src2Shift)
+#define Src2DS      (OpDS << Src2Shift)
+#define Src2FS      (OpFS << Src2Shift)
+#define Src2GS      (OpGS << Src2Shift)
+#define Src2Mask    (OpMask << Src2Shift)
 
 #define X2(x...) x, x
 #define X3(x...) X2(x), x
 #define X16(x...) X8(x), X8(x)
 
 struct opcode {
-       u32 flags;
-       u8 intercept;
+       u64 flags : 56;
+       u64 intercept : 8;
        union {
                int (*execute)(struct x86_emulate_ctxt *ctxt);
                struct opcode *group;
@@ -205,105 +247,100 @@ struct gprefix {
 #define ON64(x)
 #endif
 
-#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \
+#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype)  \
        do {                                                            \
                __asm__ __volatile__ (                                  \
                        _PRE_EFLAGS("0", "4", "2")                      \
                        _op _suffix " %"_x"3,%1; "                      \
                        _POST_EFLAGS("0", "4", "2")                     \
-                       : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\
+                       : "=m" ((ctxt)->eflags),                        \
+                         "+q" (*(_dsttype*)&(ctxt)->dst.val),          \
                          "=&r" (_tmp)                                  \
-                       : _y ((_src).val), "i" (EFLAGS_MASK));          \
+                       : _y ((ctxt)->src.val), "i" (EFLAGS_MASK));     \
        } while (0)
 
 
 /* Raw emulation: instruction has two explicit operands. */
-#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
+#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy)         \
        do {                                                            \
                unsigned long _tmp;                                     \
                                                                        \
-               switch ((_dst).bytes) {                                 \
+               switch ((ctxt)->dst.bytes) {                            \
                case 2:                                                 \
-                       ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\
+                       ____emulate_2op(ctxt,_op,_wx,_wy,"w",u16);      \
                        break;                                          \
                case 4:                                                 \
-                       ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\
+                       ____emulate_2op(ctxt,_op,_lx,_ly,"l",u32);      \
                        break;                                          \
                case 8:                                                 \
-                       ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \
+                       ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \
                        break;                                          \
                }                                                       \
        } while (0)
 
-#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
+#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy)                     \
        do {                                                                 \
                unsigned long _tmp;                                          \
-               switch ((_dst).bytes) {                                      \
+               switch ((ctxt)->dst.bytes) {                                 \
                case 1:                                                      \
-                       ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \
+                       ____emulate_2op(ctxt,_op,_bx,_by,"b",u8);            \
                        break;                                               \
                default:                                                     \
-                       __emulate_2op_nobyte(_op, _src, _dst, _eflags,       \
+                       __emulate_2op_nobyte(ctxt, _op,                      \
                                             _wx, _wy, _lx, _ly, _qx, _qy);  \
                        break;                                               \
                }                                                            \
        } while (0)
 
 /* Source operand is byte-sized and may be restricted to just %cl. */
-#define emulate_2op_SrcB(_op, _src, _dst, _eflags)                      \
-       __emulate_2op(_op, _src, _dst, _eflags,                         \
-                     "b", "c", "b", "c", "b", "c", "b", "c")
+#define emulate_2op_SrcB(ctxt, _op)                                    \
+       __emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c")
 
 /* Source operand is byte, word, long or quad sized. */
-#define emulate_2op_SrcV(_op, _src, _dst, _eflags)                      \
-       __emulate_2op(_op, _src, _dst, _eflags,                         \
-                     "b", "q", "w", "r", _LO32, "r", "", "r")
+#define emulate_2op_SrcV(ctxt, _op)                                    \
+       __emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r")
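
Call sites shrink accordingly, since the context now carries src, dst and
eflags itself; for example (illustrative):

	emulate_2op_SrcV(ctxt, "add");
	/* was: emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags) */
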
 
 /* Source operand is word, long or quad sized. */
-#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags)               \
-       __emulate_2op_nobyte(_op, _src, _dst, _eflags,                  \
-                            "w", "r", _LO32, "r", "", "r")
+#define emulate_2op_SrcV_nobyte(ctxt, _op)                             \
+       __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r")
 
 /* Instruction has three operands and one operand is stored in the ECX register */
-#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type)        \
+#define __emulate_2op_cl(ctxt, _op, _suffix, _type)            \
        do {                                                            \
                unsigned long _tmp;                                     \
-               _type _clv  = (_cl).val;                                \
-               _type _srcv = (_src).val;                               \
-               _type _dstv = (_dst).val;                               \
+               _type _clv  = (ctxt)->src2.val;                         \
+               _type _srcv = (ctxt)->src.val;                          \
+               _type _dstv = (ctxt)->dst.val;                          \
                                                                        \
                __asm__ __volatile__ (                                  \
                        _PRE_EFLAGS("0", "5", "2")                      \
                        _op _suffix " %4,%1 \n"                         \
                        _POST_EFLAGS("0", "5", "2")                     \
-                       : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp)    \
+                       : "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \
                        : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK)   \
                        );                                              \
                                                                        \
-               (_cl).val  = (unsigned long) _clv;                      \
-               (_src).val = (unsigned long) _srcv;                     \
-               (_dst).val = (unsigned long) _dstv;                     \
+               (ctxt)->src2.val  = (unsigned long) _clv;               \
+               (ctxt)->src.val = (unsigned long) _srcv;                \
+               (ctxt)->dst.val = (unsigned long) _dstv;                \
        } while (0)
 
-#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags)                  \
+#define emulate_2op_cl(ctxt, _op)                                      \
        do {                                                            \
-               switch ((_dst).bytes) {                                 \
+               switch ((ctxt)->dst.bytes) {                            \
                case 2:                                                 \
-                       __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
-                                        "w", unsigned short);          \
+                       __emulate_2op_cl(ctxt, _op, "w", u16);          \
                        break;                                          \
                case 4:                                                 \
-                       __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
-                                        "l", unsigned int);            \
+                       __emulate_2op_cl(ctxt, _op, "l", u32);          \
                        break;                                          \
                case 8:                                                 \
-                       ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
-                                             "q", unsigned long));     \
+                       ON64(__emulate_2op_cl(ctxt, _op, "q", ulong));  \
                        break;                                          \
                }                                                       \
        } while (0)
 
-#define __emulate_1op(_op, _dst, _eflags, _suffix)                     \
+#define __emulate_1op(ctxt, _op, _suffix)                              \
        do {                                                            \
                unsigned long _tmp;                                     \
                                                                        \
@@ -311,39 +348,27 @@ struct gprefix {
                        _PRE_EFLAGS("0", "3", "2")                      \
                        _op _suffix " %1; "                             \
                        _POST_EFLAGS("0", "3", "2")                     \
-                       : "=m" (_eflags), "+m" ((_dst).val),            \
+                       : "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \
                          "=&r" (_tmp)                                  \
                        : "i" (EFLAGS_MASK));                           \
        } while (0)
 
 /* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(_op, _dst, _eflags)                                    \
+#define emulate_1op(ctxt, _op)                                         \
        do {                                                            \
-               switch ((_dst).bytes) {                                 \
-               case 1: __emulate_1op(_op, _dst, _eflags, "b"); break;  \
-               case 2: __emulate_1op(_op, _dst, _eflags, "w"); break;  \
-               case 4: __emulate_1op(_op, _dst, _eflags, "l"); break;  \
-               case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \
+               switch ((ctxt)->dst.bytes) {                            \
+               case 1: __emulate_1op(ctxt, _op, "b"); break;           \
+               case 2: __emulate_1op(ctxt, _op, "w"); break;           \
+               case 4: __emulate_1op(ctxt, _op, "l"); break;           \
+               case 8: ON64(__emulate_1op(ctxt, _op, "q")); break;     \
                }                                                       \
        } while (0)
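
The collapsed emulate_1op() above is the template for all of these helpers: the macro dispatches on ctxt->dst.bytes and picks the matching assembler suffix ("b"/"w"/"l"/"q"). A minimal user-space sketch of the same dispatch-on-width idiom, with purely illustrative names and a plain add standing in for the inline asm:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the emulator's destination operand. */
struct operand {
        unsigned bytes;
        unsigned long val;
};

/* Dispatch on operand width, truncating as the matching suffix would. */
#define INC_OP(dst)                                                     \
        do {                                                            \
                switch ((dst)->bytes) {                                 \
                case 1: (dst)->val = (uint8_t)((dst)->val + 1); break;  \
                case 2: (dst)->val = (uint16_t)((dst)->val + 1); break; \
                case 4: (dst)->val = (uint32_t)((dst)->val + 1); break; \
                case 8: (dst)->val = (uint64_t)((dst)->val + 1); break; \
                }                                                       \
        } while (0)

int main(void)
{
        struct operand d = { .bytes = 2, .val = 0xffff };
        INC_OP(&d);                     /* 16-bit "inc" wraps to 0 */
        printf("%lx\n", d.val);
        return 0;
}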
 
-#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix)         \
-       do {                                                            \
-               unsigned long _tmp;                                     \
-                                                                       \
-               __asm__ __volatile__ (                                  \
-                       _PRE_EFLAGS("0", "4", "1")                      \
-                       _op _suffix " %5; "                             \
-                       _POST_EFLAGS("0", "4", "1")                     \
-                       : "=m" (_eflags), "=&r" (_tmp),                 \
-                         "+a" (_rax), "+d" (_rdx)                      \
-                       : "i" (EFLAGS_MASK), "m" ((_src).val),          \
-                         "a" (_rax), "d" (_rdx));                      \
-       } while (0)
-
-#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \
+#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex)                 \
        do {                                                            \
                unsigned long _tmp;                                     \
+               ulong *rax = &(ctxt)->regs[VCPU_REGS_RAX];              \
+               ulong *rdx = &(ctxt)->regs[VCPU_REGS_RDX];              \
                                                                        \
                __asm__ __volatile__ (                                  \
                        _PRE_EFLAGS("0", "5", "1")                      \
@@ -356,53 +381,27 @@ struct gprefix {
                        "jmp 2b \n\t"                                   \
                        ".popsection \n\t"                              \
                        _ASM_EXTABLE(1b, 3b)                            \
-                       : "=m" (_eflags), "=&r" (_tmp),                 \
-                         "+a" (_rax), "+d" (_rdx), "+qm"(_ex)          \
-                       : "i" (EFLAGS_MASK), "m" ((_src).val),          \
-                         "a" (_rax), "d" (_rdx));                      \
+                       : "=m" ((ctxt)->eflags), "=&r" (_tmp),          \
+                         "+a" (*rax), "+d" (*rdx), "+qm"(_ex)          \
+                       : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val),     \
+                         "a" (*rax), "d" (*rdx));                      \
        } while (0)
 
 /* Instruction has only one source operand; the destination is implicit (e.g. mul, div, imul, idiv). */
-#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags)            \
+#define emulate_1op_rax_rdx(ctxt, _op, _ex)    \
        do {                                                            \
-               switch((_src).bytes) {                                  \
+               switch((ctxt)->src.bytes) {                             \
                case 1:                                                 \
-                       __emulate_1op_rax_rdx(_op, _src, _rax, _rdx,    \
-                                             _eflags, "b");            \
+                       __emulate_1op_rax_rdx(ctxt, _op, "b", _ex);     \
                        break;                                          \
                case 2:                                                 \
-                       __emulate_1op_rax_rdx(_op, _src, _rax, _rdx,    \
-                                             _eflags, "w");            \
+                       __emulate_1op_rax_rdx(ctxt, _op, "w", _ex);     \
                        break;                                          \
                case 4:                                                 \
-                       __emulate_1op_rax_rdx(_op, _src, _rax, _rdx,    \
-                                             _eflags, "l");            \
-                       break;                                          \
-               case 8:                                                 \
-                       ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
-                                                  _eflags, "q"));      \
-                       break;                                          \
-               }                                                       \
-       } while (0)
-
-#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex)    \
-       do {                                                            \
-               switch((_src).bytes) {                                  \
-               case 1:                                                 \
-                       __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
-                                                _eflags, "b", _ex);    \
-                       break;                                          \
-               case 2:                                                 \
-                       __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
-                                                _eflags, "w", _ex);    \
-                       break;                                          \
-               case 4:                                                 \
-                       __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
-                                                _eflags, "l", _ex);    \
+                       __emulate_1op_rax_rdx(ctxt, _op, "l", _ex);     \
                        break;                                          \
                case 8: ON64(                                           \
-                       __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
-                                                _eflags, "q", _ex));   \
+                       __emulate_1op_rax_rdx(ctxt, _op, "q", _ex));    \
                        break;                                          \
                }                                                       \
        } while (0)
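
With the two rax/rdx variants merged, every caller passes an _ex fault flag; mul/imul leave it at zero, while div/idiv test it and inject #DE. A hedged sketch of that report-via-flag pattern (names hypothetical; the real macro catches the hardware fault through _ASM_EXTABLE rather than testing the divisor):

#include <stdint.h>
#include <stdio.h>

/*
 * Report-divide-error-via-flag sketch: the helper never traps; the caller
 * tests ex and injects the exception itself (cf. em_div_ex -> emulate_de).
 */
static uint64_t div_checked(uint64_t rax, uint64_t divisor, uint8_t *ex)
{
        if (divisor == 0) {
                *ex = 1;                /* the caller turns this into #DE */
                return rax;
        }
        return rax / divisor;
}

int main(void)
{
        uint8_t ex = 0;
        uint64_t q = div_checked(100, 0, &ex);
        printf("q=%llu ex=%u\n", (unsigned long long)q, ex);
        return ex ? 0 : 1;
}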
@@ -651,41 +650,50 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
        return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
 }
 
-static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt,
-                             unsigned long eip, u8 *dest)
+/*
+ * Fetch the next byte of the instruction being emulated, as pointed to by
+ * ctxt->_eip, then increment ctxt->_eip.
+ *
+ * Also prefetch the remaining bytes of the instruction without crossing a
+ * page boundary if they are not already in the fetch_cache.
+ */
+static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest)
 {
        struct fetch_cache *fc = &ctxt->fetch;
        int rc;
        int size, cur_size;
 
-       if (eip == fc->end) {
+       if (ctxt->_eip == fc->end) {
                unsigned long linear;
-               struct segmented_address addr = { .seg=VCPU_SREG_CS, .ea=eip};
+               struct segmented_address addr = { .seg = VCPU_SREG_CS,
+                                                 .ea  = ctxt->_eip };
                cur_size = fc->end - fc->start;
-               size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
+               size = min(15UL - cur_size,
+                          PAGE_SIZE - offset_in_page(ctxt->_eip));
                rc = __linearize(ctxt, addr, size, false, true, &linear);
-               if (rc != X86EMUL_CONTINUE)
+               if (unlikely(rc != X86EMUL_CONTINUE))
                        return rc;
                rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
                                      size, &ctxt->exception);
-               if (rc != X86EMUL_CONTINUE)
+               if (unlikely(rc != X86EMUL_CONTINUE))
                        return rc;
                fc->end += size;
        }
-       *dest = fc->data[eip - fc->start];
+       *dest = fc->data[ctxt->_eip - fc->start];
+       ctxt->_eip++;
        return X86EMUL_CONTINUE;
 }
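
do_insn_fetch_byte() now owns the _eip increment and refills fetch_cache in bounded chunks. A toy user-space model of the same refill-then-serve cache (guest_mem and the 15-byte limit stand in for the real guest read and the x86 instruction-length cap):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Toy model of the fetch_cache: refill on demand, then serve bytes. */
struct fetch_cache {
        uint8_t data[15];
        unsigned long start, end;       /* guest addresses cached so far */
};

static const uint8_t guest_mem[] = { 0x0f, 0xb6, 0xc3, 0x90 };

static int fetch_byte(struct fetch_cache *fc, unsigned long *ip, uint8_t *dest)
{
        if (*ip == fc->end) {           /* cache miss: pull in more bytes */
                unsigned cur = fc->end - fc->start;
                unsigned long size = sizeof(guest_mem) - fc->end;
                if (size == 0)
                        return -1;      /* would be X86EMUL_UNHANDLEABLE */
                if (size > 15 - cur)
                        size = 15 - cur;
                memcpy(fc->data + cur, guest_mem + fc->end, size);
                fc->end += size;
        }
        *dest = fc->data[*ip - fc->start];
        (*ip)++;
        return 0;
}

int main(void)
{
        struct fetch_cache fc = { .start = 0, .end = 0 };
        unsigned long ip = 0;
        uint8_t b;
        while (fetch_byte(&fc, &ip, &b) == 0)
                printf("%02x ", b);
        printf("\n");
        return 0;
}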
 
 static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
-                        unsigned long eip, void *dest, unsigned size)
+                        void *dest, unsigned size)
 {
        int rc;
 
        /* x86 instructions are limited to 15 bytes. */
-       if (eip + size - ctxt->eip > 15)
+       if (unlikely(ctxt->_eip + size - ctxt->eip > 15))
                return X86EMUL_UNHANDLEABLE;
        while (size--) {
-               rc = do_insn_fetch_byte(ctxt, eip++, dest++);
+               rc = do_insn_fetch_byte(ctxt, dest++);
                if (rc != X86EMUL_CONTINUE)
                        return rc;
        }
@@ -693,20 +701,18 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
 }
 
 /* Fetch next part of the instruction being emulated. */
-#define insn_fetch(_type, _size, _eip)                                 \
+#define insn_fetch(_type, _ctxt)                                       \
 ({     unsigned long _x;                                               \
-       rc = do_insn_fetch(ctxt, (_eip), &_x, (_size));                 \
+       rc = do_insn_fetch(_ctxt, &_x, sizeof(_type));                  \
        if (rc != X86EMUL_CONTINUE)                                     \
                goto done;                                              \
-       (_eip) += (_size);                                              \
        (_type)_x;                                                      \
 })
 
-#define insn_fetch_arr(_arr, _size, _eip)                              \
-({     rc = do_insn_fetch(ctxt, (_eip), _arr, (_size));                \
+#define insn_fetch_arr(_arr, _size, _ctxt)                             \
+({     rc = do_insn_fetch(_ctxt, _arr, (_size));                       \
        if (rc != X86EMUL_CONTINUE)                                     \
                goto done;                                              \
-       (_eip) += (_size);                                              \
 })
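
insn_fetch() now derives the byte count from sizeof(_type), so a caller can no longer pass a size that disagrees with the type, and the _eip bookkeeping lives in one place. The macro relies on GNU statement expressions, as the kernel does throughout; a self-contained sketch of the idiom (FETCH is illustrative, not the kernel macro):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* GNU statement-expression sketch: the fetch size comes from the type. */
#define FETCH(_type, _cur)                                      \
({      _type _x;                                               \
        memcpy(&_x, *(_cur), sizeof(_type));                    \
        *(_cur) += sizeof(_type);                               \
        _x;                                                     \
})

int main(void)
{
        uint8_t buf[] = { 0x66, 0x34, 0x12 };
        const uint8_t *cur = buf;
        uint8_t  b = FETCH(uint8_t, &cur);      /* one byte  */
        uint16_t w = FETCH(uint16_t, &cur);     /* two bytes */
        printf("%02x %04x\n", b, w);            /* 66 1234   */
        return 0;
}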
 
 /*
@@ -894,7 +900,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
                ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
        }
 
-       ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+       ctxt->modrm = insn_fetch(u8, ctxt);
        ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
        ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
        ctxt->modrm_rm |= (ctxt->modrm & 0x07);
@@ -928,13 +934,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
                switch (ctxt->modrm_mod) {
                case 0:
                        if (ctxt->modrm_rm == 6)
-                               modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
+                               modrm_ea += insn_fetch(u16, ctxt);
                        break;
                case 1:
-                       modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
+                       modrm_ea += insn_fetch(s8, ctxt);
                        break;
                case 2:
-                       modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
+                       modrm_ea += insn_fetch(u16, ctxt);
                        break;
                }
                switch (ctxt->modrm_rm) {
@@ -971,13 +977,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
        } else {
                /* 32/64-bit ModR/M decode. */
                if ((ctxt->modrm_rm & 7) == 4) {
-                       sib = insn_fetch(u8, 1, ctxt->_eip);
+                       sib = insn_fetch(u8, ctxt);
                        index_reg |= (sib >> 3) & 7;
                        base_reg |= sib & 7;
                        scale = sib >> 6;
 
                        if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
-                               modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
+                               modrm_ea += insn_fetch(s32, ctxt);
                        else
                                modrm_ea += ctxt->regs[base_reg];
                        if (index_reg != 4)
@@ -990,13 +996,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
                switch (ctxt->modrm_mod) {
                case 0:
                        if (ctxt->modrm_rm == 5)
-                               modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
+                               modrm_ea += insn_fetch(s32, ctxt);
                        break;
                case 1:
-                       modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
+                       modrm_ea += insn_fetch(s8, ctxt);
                        break;
                case 2:
-                       modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
+                       modrm_ea += insn_fetch(s32, ctxt);
                        break;
                }
        }
@@ -1013,13 +1019,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
        op->type = OP_MEM;
        switch (ctxt->ad_bytes) {
        case 2:
-               op->addr.mem.ea = insn_fetch(u16, 2, ctxt->_eip);
+               op->addr.mem.ea = insn_fetch(u16, ctxt);
                break;
        case 4:
-               op->addr.mem.ea = insn_fetch(u32, 4, ctxt->_eip);
+               op->addr.mem.ea = insn_fetch(u32, ctxt);
                break;
        case 8:
-               op->addr.mem.ea = insn_fetch(u64, 8, ctxt->_eip);
+               op->addr.mem.ea = insn_fetch(u64, ctxt);
                break;
        }
 done:
@@ -1452,15 +1458,18 @@ static int em_popf(struct x86_emulate_ctxt *ctxt)
        return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
 }
 
-static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
+static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
 {
+       int seg = ctxt->src2.val;
+
        ctxt->src.val = get_segment_selector(ctxt, seg);
 
        return em_push(ctxt);
 }
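
The new Src2ES...Src2GS decode flags make the operand decoder deposit the segment number in ctxt->src2.val, so a single em_push_sreg()/em_pop_sreg() pair replaces seven per-segment opcode cases. A small sketch of the idea, with hypothetical names:

#include <stdio.h>

/* Sketch: the decoder stores the segment operand; one handler serves all. */
enum seg { SEG_ES, SEG_CS, SEG_SS, SEG_DS, SEG_FS, SEG_GS };

struct ctxt { enum seg src2; };

static int em_push_sreg_sketch(struct ctxt *c)
{
        static const char * const names[] =
                { "es", "cs", "ss", "ds", "fs", "gs" };
        printf("push %s\n", names[c->src2]);
        return 0;
}

int main(void)
{
        struct ctxt c = { .src2 = SEG_SS };  /* as if decoded from Src2SS */
        return em_push_sreg_sketch(&c);
}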
 
-static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg)
+static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
 {
+       int seg = ctxt->src2.val;
        unsigned long selector;
        int rc;
 
@@ -1674,64 +1683,74 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt)
 {
        switch (ctxt->modrm_reg) {
        case 0: /* rol */
-               emulate_2op_SrcB("rol", ctxt->src, ctxt->dst, ctxt->eflags);
+               emulate_2op_SrcB(ctxt, "rol");
                break;
        case 1: /* ror */
-               emulate_2op_SrcB("ror", ctxt->src, ctxt->dst, ctxt->eflags);
+               emulate_2op_SrcB(ctxt, "ror");
                break;
        case 2: /* rcl */
-               emulate_2op_SrcB("rcl", ctxt->src, ctxt->dst, ctxt->eflags);
+               emulate_2op_SrcB(ctxt, "rcl");
                break;
        case 3: /* rcr */
-               emulate_2op_SrcB("rcr", ctxt->src, ctxt->dst, ctxt->eflags);
+               emulate_2op_SrcB(ctxt, "rcr");
                break;
        case 4: /* sal/shl */
        case 6: /* sal/shl */
-               emulate_2op_SrcB("sal", ctxt->src, ctxt->dst, ctxt->eflags);
+               emulate_2op_SrcB(ctxt, "sal");
                break;
        case 5: /* shr */
-               emulate_2op_SrcB("shr", ctxt->src, ctxt->dst, ctxt->eflags);
+               emulate_2op_SrcB(ctxt, "shr");
                break;
        case 7: /* sar */
-               emulate_2op_SrcB("sar", ctxt->src, ctxt->dst, ctxt->eflags);
+               emulate_2op_SrcB(ctxt, "sar");
                break;
        }
        return X86EMUL_CONTINUE;
 }
 
-static int em_grp3(struct x86_emulate_ctxt *ctxt)
+static int em_not(struct x86_emulate_ctxt *ctxt)
+{
+       ctxt->dst.val = ~ctxt->dst.val;
+       return X86EMUL_CONTINUE;
+}
+
+static int em_neg(struct x86_emulate_ctxt *ctxt)
+{
+       emulate_1op(ctxt, "neg");
+       return X86EMUL_CONTINUE;
+}
+
+static int em_mul_ex(struct x86_emulate_ctxt *ctxt)
+{
+       u8 ex = 0;
+
+       emulate_1op_rax_rdx(ctxt, "mul", ex);
+       return X86EMUL_CONTINUE;
+}
+
+static int em_imul_ex(struct x86_emulate_ctxt *ctxt)
+{
+       u8 ex = 0;
+
+       emulate_1op_rax_rdx(ctxt, "imul", ex);
+       return X86EMUL_CONTINUE;
+}
+
+static int em_div_ex(struct x86_emulate_ctxt *ctxt)
 {
-       unsigned long *rax = &ctxt->regs[VCPU_REGS_RAX];
-       unsigned long *rdx = &ctxt->regs[VCPU_REGS_RDX];
        u8 de = 0;
 
-       switch (ctxt->modrm_reg) {
-       case 0 ... 1:   /* test */
-               emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
-               break;
-       case 2: /* not */
-               ctxt->dst.val = ~ctxt->dst.val;
-               break;
-       case 3: /* neg */
-               emulate_1op("neg", ctxt->dst, ctxt->eflags);
-               break;
-       case 4: /* mul */
-               emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags);
-               break;
-       case 5: /* imul */
-               emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, ctxt->eflags);
-               break;
-       case 6: /* div */
-               emulate_1op_rax_rdx_ex("div", ctxt->src, *rax, *rdx,
-                                      ctxt->eflags, de);
-               break;
-       case 7: /* idiv */
-               emulate_1op_rax_rdx_ex("idiv", ctxt->src, *rax, *rdx,
-                                      ctxt->eflags, de);
-               break;
-       default:
-               return X86EMUL_UNHANDLEABLE;
-       }
+       emulate_1op_rax_rdx(ctxt, "div", de);
+       if (de)
+               return emulate_de(ctxt);
+       return X86EMUL_CONTINUE;
+}
+
+static int em_idiv_ex(struct x86_emulate_ctxt *ctxt)
+{
+       u8 de = 0;
+
+       emulate_1op_rax_rdx(ctxt, "idiv", de);
        if (de)
                return emulate_de(ctxt);
        return X86EMUL_CONTINUE;
@@ -1743,10 +1762,10 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
 
        switch (ctxt->modrm_reg) {
        case 0: /* inc */
-               emulate_1op("inc", ctxt->dst, ctxt->eflags);
+               emulate_1op(ctxt, "inc");
                break;
        case 1: /* dec */
-               emulate_1op("dec", ctxt->dst, ctxt->eflags);
+               emulate_1op(ctxt, "dec");
                break;
        case 2: /* call near abs */ {
                long int old_eip;
@@ -1812,8 +1831,9 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
        return rc;
 }
 
-static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg)
+static int em_lseg(struct x86_emulate_ctxt *ctxt)
 {
+       int seg = ctxt->src2.val;
        unsigned short sel;
        int rc;
 
@@ -2452,7 +2472,7 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
        ctxt->src.type = OP_IMM;
        ctxt->src.val = 0;
        ctxt->src.bytes = 1;
-       emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags);
+       emulate_2op_SrcV(ctxt, "or");
        ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
        if (cf)
                ctxt->eflags |= X86_EFLAGS_CF;
@@ -2502,49 +2522,49 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
 
 static int em_add(struct x86_emulate_ctxt *ctxt)
 {
-       emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags);
+       emulate_2op_SrcV(ctxt, "add");
        return X86EMUL_CONTINUE;
 }
 
 static int em_or(struct x86_emulate_ctxt *ctxt)
 {
-       emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags);
+       emulate_2op_SrcV(ctxt, "or");
        return X86EMUL_CONTINUE;
 }
 
 static int em_adc(struct x86_emulate_ctxt *ctxt)
 {
-       emulate_2op_SrcV("adc", ctxt->src, ctxt->dst, ctxt->eflags);
+       emulate_2op_SrcV(ctxt, "adc");
        return X86EMUL_CONTINUE;
 }
 
 static int em_sbb(struct x86_emulate_ctxt *ctxt)
 {
-       emulate_2op_SrcV("sbb", ctxt->src, ctxt->dst, ctxt->eflags);
+       emulate_2op_SrcV(ctxt, "sbb");
        return X86EMUL_CONTINUE;
 }
 
 static int em_and(struct x86_emulate_ctxt *ctxt)
 {
-       emulate_2op_SrcV("and", ctxt->src, ctxt->dst, ctxt->eflags);
+       emulate_2op_SrcV(ctxt, "and");
        return X86EMUL_CONTINUE;
 }
 
 static int em_sub(struct x86_emulate_ctxt *ctxt)
 {
-       emulate_2op_SrcV("sub", ctxt->src, ctxt->dst, ctxt->eflags);
+       emulate_2op_SrcV(ctxt, "sub");
        return X86EMUL_CONTINUE;
 }
 
 static int em_xor(struct x86_emulate_ctxt *ctxt)
 {
-       emulate_2op_SrcV("xor", ctxt->src, ctxt->dst, ctxt->eflags);
+       emulate_2op_SrcV(ctxt, "xor");
        return X86EMUL_CONTINUE;
 }
 
 static int em_cmp(struct x86_emulate_ctxt *ctxt)
 {
-       emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags);
+       emulate_2op_SrcV(ctxt, "cmp");
        /* Disable writeback. */
        ctxt->dst.type = OP_NONE;
        return X86EMUL_CONTINUE;
@@ -2552,7 +2572,9 @@ static int em_cmp(struct x86_emulate_ctxt *ctxt)
 
 static int em_test(struct x86_emulate_ctxt *ctxt)
 {
-       emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
+       emulate_2op_SrcV(ctxt, "test");
+       /* Disable writeback. */
+       ctxt->dst.type = OP_NONE;
        return X86EMUL_CONTINUE;
 }
 
@@ -2570,7 +2592,7 @@ static int em_xchg(struct x86_emulate_ctxt *ctxt)
 
 static int em_imul(struct x86_emulate_ctxt *ctxt)
 {
-       emulate_2op_SrcV_nobyte("imul", ctxt->src, ctxt->dst, ctxt->eflags);
+       emulate_2op_SrcV_nobyte(ctxt, "imul");
        return X86EMUL_CONTINUE;
 }
 
@@ -3025,9 +3047,14 @@ static struct opcode group1A[] = {
 };
 
 static struct opcode group3[] = {
-       D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
-       D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
-       X4(D(SrcMem | ModRM)),
+       I(DstMem | SrcImm | ModRM, em_test),
+       I(DstMem | SrcImm | ModRM, em_test),
+       I(DstMem | SrcNone | ModRM | Lock, em_not),
+       I(DstMem | SrcNone | ModRM | Lock, em_neg),
+       I(SrcMem | ModRM, em_mul_ex),
+       I(SrcMem | ModRM, em_imul_ex),
+       I(SrcMem | ModRM, em_div_ex),
+       I(SrcMem | ModRM, em_idiv_ex),
 };
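
Group 3 thus moves from bare D() decode entries, dispatched by a switch in em_grp3(), to I() entries whose execute callback is looked up directly. A sketch of such a function-pointer opcode table (types and names are illustrative):

#include <stdio.h>

struct sketch_ctxt;                     /* opaque emulation state */
typedef int (*em_fn)(struct sketch_ctxt *);

struct sketch_opcode {
        unsigned flags;
        em_fn execute;                  /* NULL: decode-only, like D() */
};

static int em_not_sketch(struct sketch_ctxt *c) { (void)c; puts("not"); return 0; }
static int em_neg_sketch(struct sketch_ctxt *c) { (void)c; puts("neg"); return 0; }

/* ModRM /2 and /3 of the 0xf6-0xf7 group, sketched. */
static const struct sketch_opcode group3_sketch[8] = {
        [2] = { 0, em_not_sketch },
        [3] = { 0, em_neg_sketch },
};

int main(void)
{
        unsigned modrm_reg = 3;         /* would come from the ModRM byte */
        if (group3_sketch[modrm_reg].execute)
                return group3_sketch[modrm_reg].execute(NULL);
        return 0;
}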
 
 static struct opcode group4[] = {
@@ -3090,16 +3117,20 @@ static struct gprefix pfx_0f_6f_0f_7f = {
 static struct opcode opcode_table[256] = {
        /* 0x00 - 0x07 */
        I6ALU(Lock, em_add),
-       D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+       I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
+       I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
        /* 0x08 - 0x0F */
        I6ALU(Lock, em_or),
-       D(ImplicitOps | Stack | No64), N,
+       I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
+       N,
        /* 0x10 - 0x17 */
        I6ALU(Lock, em_adc),
-       D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+       I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg),
+       I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg),
        /* 0x18 - 0x1F */
        I6ALU(Lock, em_sbb),
-       D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+       I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
+       I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
        /* 0x20 - 0x27 */
        I6ALU(Lock, em_and), N, N,
        /* 0x28 - 0x2F */
@@ -3167,7 +3198,8 @@ static struct opcode opcode_table[256] = {
        D2bv(DstMem | SrcImmByte | ModRM),
        I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
        I(ImplicitOps | Stack, em_ret),
-       D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
+       I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
+       I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
        G(ByteOp, group11), G(0, group11),
        /* 0xC8 - 0xCF */
        N, N, N, I(ImplicitOps | Stack, em_ret_far),
@@ -3242,20 +3274,22 @@ static struct opcode twobyte_table[256] = {
        /* 0x90 - 0x9F */
        X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
        /* 0xA0 - 0xA7 */
-       D(ImplicitOps | Stack), D(ImplicitOps | Stack),
+       I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
        DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp),
        D(DstMem | SrcReg | Src2ImmByte | ModRM),
        D(DstMem | SrcReg | Src2CL | ModRM), N, N,
        /* 0xA8 - 0xAF */
-       D(ImplicitOps | Stack), D(ImplicitOps | Stack),
+       I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
        DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock),
        D(DstMem | SrcReg | Src2ImmByte | ModRM),
        D(DstMem | SrcReg | Src2CL | ModRM),
        D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
        /* 0xB0 - 0xB7 */
        D2bv(DstMem | SrcReg | ModRM | Lock),
-       D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock),
-       D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM),
+       I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
+       D(DstMem | SrcReg | ModRM | BitOp | Lock),
+       I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
+       I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
        D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
        /* 0xB8 - 0xBF */
        N, N,
@@ -3309,13 +3343,13 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
        /* NB. Immediates are sign-extended as necessary. */
        switch (op->bytes) {
        case 1:
-               op->val = insn_fetch(s8, 1, ctxt->_eip);
+               op->val = insn_fetch(s8, ctxt);
                break;
        case 2:
-               op->val = insn_fetch(s16, 2, ctxt->_eip);
+               op->val = insn_fetch(s16, ctxt);
                break;
        case 4:
-               op->val = insn_fetch(s32, 4, ctxt->_eip);
+               op->val = insn_fetch(s32, ctxt);
                break;
        }
        if (!sign_extension) {
@@ -3335,6 +3369,125 @@ done:
        return rc;
 }
 
+static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
+                         unsigned d)
+{
+       int rc = X86EMUL_CONTINUE;
+
+       switch (d) {
+       case OpReg:
+               decode_register_operand(ctxt, op,
+                        op == &ctxt->dst &&
+                        ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
+               break;
+       case OpImmUByte:
+               rc = decode_imm(ctxt, op, 1, false);
+               break;
+       case OpMem:
+               ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+       mem_common:
+               *op = ctxt->memop;
+               ctxt->memopp = op;
+               if ((ctxt->d & BitOp) && op == &ctxt->dst)
+                       fetch_bit_operand(ctxt);
+               op->orig_val = op->val;
+               break;
+       case OpMem64:
+               ctxt->memop.bytes = 8;
+               goto mem_common;
+       case OpAcc:
+               op->type = OP_REG;
+               op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+               op->addr.reg = &ctxt->regs[VCPU_REGS_RAX];
+               fetch_register_operand(op);
+               op->orig_val = op->val;
+               break;
+       case OpDI:
+               op->type = OP_MEM;
+               op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+               op->addr.mem.ea =
+                       register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
+               op->addr.mem.seg = VCPU_SREG_ES;
+               op->val = 0;
+               break;
+       case OpDX:
+               op->type = OP_REG;
+               op->bytes = 2;
+               op->addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+               fetch_register_operand(op);
+               break;
+       case OpCL:
+               op->bytes = 1;
+               op->val = ctxt->regs[VCPU_REGS_RCX] & 0xff;
+               break;
+       case OpImmByte:
+               rc = decode_imm(ctxt, op, 1, true);
+               break;
+       case OpOne:
+               op->bytes = 1;
+               op->val = 1;
+               break;
+       case OpImm:
+               rc = decode_imm(ctxt, op, imm_size(ctxt), true);
+               break;
+       case OpMem16:
+               ctxt->memop.bytes = 2;
+               goto mem_common;
+       case OpMem32:
+               ctxt->memop.bytes = 4;
+               goto mem_common;
+       case OpImmU16:
+               rc = decode_imm(ctxt, op, 2, false);
+               break;
+       case OpImmU:
+               rc = decode_imm(ctxt, op, imm_size(ctxt), false);
+               break;
+       case OpSI:
+               op->type = OP_MEM;
+               op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+               op->addr.mem.ea =
+                       register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
+               op->addr.mem.seg = seg_override(ctxt);
+               op->val = 0;
+               break;
+       case OpImmFAddr:
+               op->type = OP_IMM;
+               op->addr.mem.ea = ctxt->_eip;
+               op->bytes = ctxt->op_bytes + 2;
+               insn_fetch_arr(op->valptr, op->bytes, ctxt);
+               break;
+       case OpMemFAddr:
+               ctxt->memop.bytes = ctxt->op_bytes + 2;
+               goto mem_common;
+       case OpES:
+               op->val = VCPU_SREG_ES;
+               break;
+       case OpCS:
+               op->val = VCPU_SREG_CS;
+               break;
+       case OpSS:
+               op->val = VCPU_SREG_SS;
+               break;
+       case OpDS:
+               op->val = VCPU_SREG_DS;
+               break;
+       case OpFS:
+               op->val = VCPU_SREG_FS;
+               break;
+       case OpGS:
+               op->val = VCPU_SREG_GS;
+               break;
+       case OpImplicit:
+               /* Special instructions do their own operand decoding. */
+       default:
+               op->type = OP_NONE; /* Disable writeback. */
+               break;
+       }
+
+done:
+       return rc;
+}
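
decode_operand() is driven by small operand fields packed into the now 64-bit ctxt->d at SrcShift, Src2Shift and DstShift (see the three call sites below). A sketch of that bit-packing scheme; the field width and positions here are assumptions for illustration, not the values in emulate.c:

#include <stdint.h>
#include <stdio.h>

/* Illustrative field layout; the real one lives in emulate.c. */
enum { OpNone, OpReg, OpMem, OpImm, OpCL, OpES };

#define OP_BITS    5
#define OP_MASK    ((1ull << OP_BITS) - 1)
#define DST_SHIFT  0
#define SRC_SHIFT  OP_BITS
#define SRC2_SHIFT (2 * OP_BITS)

int main(void)
{
        uint64_t d = ((uint64_t)OpReg << DST_SHIFT) |
                     ((uint64_t)OpMem << SRC_SHIFT) |
                     ((uint64_t)OpCL  << SRC2_SHIFT);

        printf("dst=%llu src=%llu src2=%llu\n",
               (unsigned long long)((d >> DST_SHIFT) & OP_MASK),
               (unsigned long long)((d >> SRC_SHIFT) & OP_MASK),
               (unsigned long long)((d >> SRC2_SHIFT) & OP_MASK));
        return 0;
}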
+
 int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 {
        int rc = X86EMUL_CONTINUE;
@@ -3342,8 +3495,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
        int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
        bool op_prefix = false;
        struct opcode opcode;
-       struct operand memop = { .type = OP_NONE }, *memopp = NULL;
 
+       ctxt->memop.type = OP_NONE;
+       ctxt->memopp = NULL;
        ctxt->_eip = ctxt->eip;
        ctxt->fetch.start = ctxt->_eip;
        ctxt->fetch.end = ctxt->fetch.start + insn_len;
@@ -3366,7 +3520,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
                break;
 #endif
        default:
-               return -1;
+               return EMULATION_FAILED;
        }
 
        ctxt->op_bytes = def_op_bytes;
@@ -3374,7 +3528,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 
        /* Legacy prefixes. */
        for (;;) {
-               switch (ctxt->b = insn_fetch(u8, 1, ctxt->_eip)) {
+               switch (ctxt->b = insn_fetch(u8, ctxt)) {
                case 0x66:      /* operand-size override */
                        op_prefix = true;
                        /* switch between 2/4 bytes */
@@ -3430,7 +3584,7 @@ done_prefixes:
        /* Two-byte opcode? */
        if (ctxt->b == 0x0f) {
                ctxt->twobyte = 1;
-               ctxt->b = insn_fetch(u8, 1, ctxt->_eip);
+               ctxt->b = insn_fetch(u8, ctxt);
                opcode = twobyte_table[ctxt->b];
        }
        ctxt->d = opcode.flags;
@@ -3438,13 +3592,13 @@ done_prefixes:
        while (ctxt->d & GroupMask) {
                switch (ctxt->d & GroupMask) {
                case Group:
-                       ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+                       ctxt->modrm = insn_fetch(u8, ctxt);
                        --ctxt->_eip;
                        goffset = (ctxt->modrm >> 3) & 7;
                        opcode = opcode.u.group[goffset];
                        break;
                case GroupDual:
-                       ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+                       ctxt->modrm = insn_fetch(u8, ctxt);
                        --ctxt->_eip;
                        goffset = (ctxt->modrm >> 3) & 7;
                        if ((ctxt->modrm >> 6) == 3)
@@ -3458,7 +3612,7 @@ done_prefixes:
                        break;
                case Prefix:
                        if (ctxt->rep_prefix && op_prefix)
-                               return X86EMUL_UNHANDLEABLE;
+                               return EMULATION_FAILED;
                        simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix;
                        switch (simd_prefix) {
                        case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
@@ -3468,10 +3622,10 @@ done_prefixes:
                        }
                        break;
                default:
-                       return X86EMUL_UNHANDLEABLE;
+                       return EMULATION_FAILED;
                }
 
-               ctxt->d &= ~GroupMask;
+               ctxt->d &= ~(u64)GroupMask;
                ctxt->d |= opcode.flags;
        }
 
@@ -3481,10 +3635,10 @@ done_prefixes:
 
        /* Unrecognised? */
        if (ctxt->d == 0 || (ctxt->d & Undefined))
-               return -1;
+               return EMULATION_FAILED;
 
        if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
-               return -1;
+               return EMULATION_FAILED;
 
        if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
                ctxt->op_bytes = 8;
@@ -3501,96 +3655,27 @@ done_prefixes:
 
        /* ModRM and SIB bytes. */
        if (ctxt->d & ModRM) {
-               rc = decode_modrm(ctxt, &memop);
+               rc = decode_modrm(ctxt, &ctxt->memop);
                if (!ctxt->has_seg_override)
                        set_seg_override(ctxt, ctxt->modrm_seg);
        } else if (ctxt->d & MemAbs)
-               rc = decode_abs(ctxt, &memop);
+               rc = decode_abs(ctxt, &ctxt->memop);
        if (rc != X86EMUL_CONTINUE)
                goto done;
 
        if (!ctxt->has_seg_override)
                set_seg_override(ctxt, VCPU_SREG_DS);
 
-       memop.addr.mem.seg = seg_override(ctxt);
+       ctxt->memop.addr.mem.seg = seg_override(ctxt);
 
-       if (memop.type == OP_MEM && ctxt->ad_bytes != 8)
-               memop.addr.mem.ea = (u32)memop.addr.mem.ea;
+       if (ctxt->memop.type == OP_MEM && ctxt->ad_bytes != 8)
+               ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea;
 
        /*
         * Decode and fetch the source operand: register, memory
         * or immediate.
         */
-       switch (ctxt->d & SrcMask) {
-       case SrcNone:
-               break;
-       case SrcReg:
-               decode_register_operand(ctxt, &ctxt->src, 0);
-               break;
-       case SrcMem16:
-               memop.bytes = 2;
-               goto srcmem_common;
-       case SrcMem32:
-               memop.bytes = 4;
-               goto srcmem_common;
-       case SrcMem:
-               memop.bytes = (ctxt->d & ByteOp) ? 1 :
-                                                          ctxt->op_bytes;
-       srcmem_common:
-               ctxt->src = memop;
-               memopp = &ctxt->src;
-               break;
-       case SrcImmU16:
-               rc = decode_imm(ctxt, &ctxt->src, 2, false);
-               break;
-       case SrcImm:
-               rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), true);
-               break;
-       case SrcImmU:
-               rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), false);
-               break;
-       case SrcImmByte:
-               rc = decode_imm(ctxt, &ctxt->src, 1, true);
-               break;
-       case SrcImmUByte:
-               rc = decode_imm(ctxt, &ctxt->src, 1, false);
-               break;
-       case SrcAcc:
-               ctxt->src.type = OP_REG;
-               ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
-               ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
-               fetch_register_operand(&ctxt->src);
-               break;
-       case SrcOne:
-               ctxt->src.bytes = 1;
-               ctxt->src.val = 1;
-               break;
-       case SrcSI:
-               ctxt->src.type = OP_MEM;
-               ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
-               ctxt->src.addr.mem.ea =
-                       register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
-               ctxt->src.addr.mem.seg = seg_override(ctxt);
-               ctxt->src.val = 0;
-               break;
-       case SrcImmFAddr:
-               ctxt->src.type = OP_IMM;
-               ctxt->src.addr.mem.ea = ctxt->_eip;
-               ctxt->src.bytes = ctxt->op_bytes + 2;
-               insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt->_eip);
-               break;
-       case SrcMemFAddr:
-               memop.bytes = ctxt->op_bytes + 2;
-               goto srcmem_common;
-               break;
-       case SrcDX:
-               ctxt->src.type = OP_REG;
-               ctxt->src.bytes = 2;
-               ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
-               fetch_register_operand(&ctxt->src);
-               break;
-       }
-
+       rc = decode_operand(ctxt, &ctxt->src, (ctxt->d >> SrcShift) & OpMask);
        if (rc != X86EMUL_CONTINUE)
                goto done;
 
@@ -3598,85 +3683,18 @@ done_prefixes:
         * Decode and fetch the second source operand: register, memory
         * or immediate.
         */
-       switch (ctxt->d & Src2Mask) {
-       case Src2None:
-               break;
-       case Src2CL:
-               ctxt->src2.bytes = 1;
-               ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0xff;
-               break;
-       case Src2ImmByte:
-               rc = decode_imm(ctxt, &ctxt->src2, 1, true);
-               break;
-       case Src2One:
-               ctxt->src2.bytes = 1;
-               ctxt->src2.val = 1;
-               break;
-       case Src2Imm:
-               rc = decode_imm(ctxt, &ctxt->src2, imm_size(ctxt), true);
-               break;
-       }
-
+       rc = decode_operand(ctxt, &ctxt->src2, (ctxt->d >> Src2Shift) & OpMask);
        if (rc != X86EMUL_CONTINUE)
                goto done;
 
        /* Decode and fetch the destination operand: register or memory. */
-       switch (ctxt->d & DstMask) {
-       case DstReg:
-               decode_register_operand(ctxt, &ctxt->dst,
-                        ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
-               break;
-       case DstImmUByte:
-               ctxt->dst.type = OP_IMM;
-               ctxt->dst.addr.mem.ea = ctxt->_eip;
-               ctxt->dst.bytes = 1;
-               ctxt->dst.val = insn_fetch(u8, 1, ctxt->_eip);
-               break;
-       case DstMem:
-       case DstMem64:
-               ctxt->dst = memop;
-               memopp = &ctxt->dst;
-               if ((ctxt->d & DstMask) == DstMem64)
-                       ctxt->dst.bytes = 8;
-               else
-                       ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
-               if (ctxt->d & BitOp)
-                       fetch_bit_operand(ctxt);
-               ctxt->dst.orig_val = ctxt->dst.val;
-               break;
-       case DstAcc:
-               ctxt->dst.type = OP_REG;
-               ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
-               ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
-               fetch_register_operand(&ctxt->dst);
-               ctxt->dst.orig_val = ctxt->dst.val;
-               break;
-       case DstDI:
-               ctxt->dst.type = OP_MEM;
-               ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
-               ctxt->dst.addr.mem.ea =
-                       register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
-               ctxt->dst.addr.mem.seg = VCPU_SREG_ES;
-               ctxt->dst.val = 0;
-               break;
-       case DstDX:
-               ctxt->dst.type = OP_REG;
-               ctxt->dst.bytes = 2;
-               ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
-               fetch_register_operand(&ctxt->dst);
-               break;
-       case ImplicitOps:
-               /* Special instructions do their own operand decoding. */
-       default:
-               ctxt->dst.type = OP_NONE; /* Disable writeback. */
-               break;
-       }
+       rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask);
 
 done:
-       if (memopp && memopp->type == OP_MEM && ctxt->rip_relative)
-               memopp->addr.mem.ea += ctxt->_eip;
+       if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative)
+               ctxt->memopp->addr.mem.ea += ctxt->_eip;
 
-       return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
+       return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
 }
 
 static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
@@ -3825,32 +3843,11 @@ special_insn:
                goto twobyte_insn;
 
        switch (ctxt->b) {
-       case 0x06:              /* push es */
-               rc = emulate_push_sreg(ctxt, VCPU_SREG_ES);
-               break;
-       case 0x07:              /* pop es */
-               rc = emulate_pop_sreg(ctxt, VCPU_SREG_ES);
-               break;
-       case 0x0e:              /* push cs */
-               rc = emulate_push_sreg(ctxt, VCPU_SREG_CS);
-               break;
-       case 0x16:              /* push ss */
-               rc = emulate_push_sreg(ctxt, VCPU_SREG_SS);
-               break;
-       case 0x17:              /* pop ss */
-               rc = emulate_pop_sreg(ctxt, VCPU_SREG_SS);
-               break;
-       case 0x1e:              /* push ds */
-               rc = emulate_push_sreg(ctxt, VCPU_SREG_DS);
-               break;
-       case 0x1f:              /* pop ds */
-               rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS);
-               break;
        case 0x40 ... 0x47: /* inc r16/r32 */
-               emulate_1op("inc", ctxt->dst, ctxt->eflags);
+               emulate_1op(ctxt, "inc");
                break;
        case 0x48 ... 0x4f: /* dec r16/r32 */
-               emulate_1op("dec", ctxt->dst, ctxt->eflags);
+               emulate_1op(ctxt, "dec");
                break;
        case 0x63:              /* movsxd */
                if (ctxt->mode != X86EMUL_MODE_PROT64)
@@ -3891,12 +3888,6 @@ special_insn:
        case 0xc0 ... 0xc1:
                rc = em_grp2(ctxt);
                break;
-       case 0xc4:              /* les */
-               rc = emulate_load_segment(ctxt, VCPU_SREG_ES);
-               break;
-       case 0xc5:              /* lds */
-               rc = emulate_load_segment(ctxt, VCPU_SREG_DS);
-               break;
        case 0xcc:              /* int3 */
                rc = emulate_int(ctxt, 3);
                break;
@@ -3953,9 +3944,6 @@ special_insn:
                /* complement carry flag from eflags reg */
                ctxt->eflags ^= EFLG_CF;
                break;
-       case 0xf6 ... 0xf7:     /* Grp3 */
-               rc = em_grp3(ctxt);
-               break;
        case 0xf8: /* clc */
                ctxt->eflags &= ~EFLG_CF;
                break;
@@ -4103,36 +4091,24 @@ twobyte_insn:
        case 0x90 ... 0x9f:     /* setcc r/m8 */
                ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
                break;
-       case 0xa0:        /* push fs */
-               rc = emulate_push_sreg(ctxt, VCPU_SREG_FS);
-               break;
-       case 0xa1:       /* pop fs */
-               rc = emulate_pop_sreg(ctxt, VCPU_SREG_FS);
-               break;
        case 0xa3:
              bt:               /* bt */
                ctxt->dst.type = OP_NONE;
                /* only subword offset */
                ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
-               emulate_2op_SrcV_nobyte("bt", ctxt->src, ctxt->dst, ctxt->eflags);
+               emulate_2op_SrcV_nobyte(ctxt, "bt");
                break;
        case 0xa4: /* shld imm8, r, r/m */
        case 0xa5: /* shld cl, r, r/m */
-               emulate_2op_cl("shld", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags);
-               break;
-       case 0xa8:      /* push gs */
-               rc = emulate_push_sreg(ctxt, VCPU_SREG_GS);
-               break;
-       case 0xa9:      /* pop gs */
-               rc = emulate_pop_sreg(ctxt, VCPU_SREG_GS);
+               emulate_2op_cl(ctxt, "shld");
                break;
        case 0xab:
              bts:              /* bts */
-               emulate_2op_SrcV_nobyte("bts", ctxt->src, ctxt->dst, ctxt->eflags);
+               emulate_2op_SrcV_nobyte(ctxt, "bts");
                break;
        case 0xac: /* shrd imm8, r, r/m */
        case 0xad: /* shrd cl, r, r/m */
-               emulate_2op_cl("shrd", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags);
+               emulate_2op_cl(ctxt, "shrd");
                break;
        case 0xae:              /* clflush */
                break;
@@ -4143,7 +4119,7 @@ twobyte_insn:
                 */
                ctxt->src.orig_val = ctxt->src.val;
                ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
-               emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags);
+               emulate_2op_SrcV(ctxt, "cmp");
                if (ctxt->eflags & EFLG_ZF) {
                        /* Success: write back to memory. */
                        ctxt->dst.val = ctxt->src.orig_val;
@@ -4153,18 +4129,9 @@ twobyte_insn:
                        ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
                }
                break;
-       case 0xb2:              /* lss */
-               rc = emulate_load_segment(ctxt, VCPU_SREG_SS);
-               break;
        case 0xb3:
              btr:              /* btr */
-               emulate_2op_SrcV_nobyte("btr", ctxt->src, ctxt->dst, ctxt->eflags);
-               break;
-       case 0xb4:              /* lfs */
-               rc = emulate_load_segment(ctxt, VCPU_SREG_FS);
-               break;
-       case 0xb5:              /* lgs */
-               rc = emulate_load_segment(ctxt, VCPU_SREG_GS);
+               emulate_2op_SrcV_nobyte(ctxt, "btr");
                break;
        case 0xb6 ... 0xb7:     /* movzx */
                ctxt->dst.bytes = ctxt->op_bytes;
@@ -4185,7 +4152,7 @@ twobyte_insn:
                break;
        case 0xbb:
              btc:              /* btc */
-               emulate_2op_SrcV_nobyte("btc", ctxt->src, ctxt->dst, ctxt->eflags);
+               emulate_2op_SrcV_nobyte(ctxt, "btc");
                break;
        case 0xbc: {            /* bsf */
                u8 zf;
@@ -4217,7 +4184,7 @@ twobyte_insn:
                                                        (s16) ctxt->src.val;
                break;
        case 0xc0 ... 0xc1:     /* xadd */
-               emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags);
+               emulate_2op_SrcV(ctxt, "add");
                /* Write back the register source. */
                ctxt->src.val = ctxt->dst.orig_val;
                write_register_operand(&ctxt->src);
index efad723..76e3f1c 100644 (file)
@@ -713,14 +713,16 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
        kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
 
        kvm_iodevice_init(&pit->dev, &pit_dev_ops);
-       ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev);
+       ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
+                                     KVM_PIT_MEM_LENGTH, &pit->dev);
        if (ret < 0)
                goto fail;
 
        if (flags & KVM_PIT_SPEAKER_DUMMY) {
                kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
                ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
-                                               &pit->speaker_dev);
+                                             KVM_SPEAKER_BASE_ADDRESS, 4,
+                                             &pit->speaker_dev);
                if (ret < 0)
                        goto fail_unregister;
        }
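
kvm_io_bus_register_dev() now takes a base address and length, moving range matching into the bus instead of each device's read/write handler. A toy model of address-range routing (constants and names are illustrative, not the KVM API):

#include <stdio.h>

/* Toy I/O bus with address-range routing. */
struct io_dev { unsigned long base, len; const char *name; };

static struct io_dev bus[8];
static int bus_count;

static int bus_register(unsigned long base, unsigned long len, const char *name)
{
        if (bus_count == 8)
                return -1;
        bus[bus_count++] = (struct io_dev){ base, len, name };
        return 0;
}

static const char *bus_route(unsigned long addr)
{
        for (int i = 0; i < bus_count; i++)
                if (addr >= bus[i].base && addr < bus[i].base + bus[i].len)
                        return bus[i].name;
        return NULL;
}

int main(void)
{
        bus_register(0x40, 4, "pit");      /* a la KVM_PIT_BASE_ADDRESS */
        bus_register(0x61, 4, "speaker");
        printf("%s\n", bus_route(0x42));   /* pit */
        return 0;
}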
index 19fe855..cac4746 100644 (file)
@@ -34,6 +34,9 @@
 #include <linux/kvm_host.h>
 #include "trace.h"
 
+#define pr_pic_unimpl(fmt, ...)        \
+       pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__)
+
 static void pic_irq_request(struct kvm *kvm, int level);
 
 static void pic_lock(struct kvm_pic *s)
@@ -306,10 +309,10 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
                        }
                        s->init_state = 1;
                        if (val & 0x02)
-                               printk(KERN_ERR "single mode not supported");
+                               pr_pic_unimpl("single mode not supported");
                        if (val & 0x08)
-                               printk(KERN_ERR
-                                      "level sensitive irq not supported");
+                               pr_pic_unimpl(
+                                       "level sensitive irq not supported");
                } else if (val & 0x08) {
                        if (val & 0x04)
                                s->poll = 1;
@@ -459,22 +462,15 @@ static int picdev_in_range(gpa_t addr)
        }
 }
 
-static inline struct kvm_pic *to_pic(struct kvm_io_device *dev)
-{
-       return container_of(dev, struct kvm_pic, dev);
-}
-
-static int picdev_write(struct kvm_io_device *this,
+static int picdev_write(struct kvm_pic *s,
                         gpa_t addr, int len, const void *val)
 {
-       struct kvm_pic *s = to_pic(this);
        unsigned char data = *(unsigned char *)val;
        if (!picdev_in_range(addr))
                return -EOPNOTSUPP;
 
        if (len != 1) {
-               if (printk_ratelimit())
-                       printk(KERN_ERR "PIC: non byte write\n");
+               pr_pic_unimpl("non byte write\n");
                return 0;
        }
        pic_lock(s);
@@ -494,17 +490,15 @@ static int picdev_write(struct kvm_io_device *this,
        return 0;
 }
 
-static int picdev_read(struct kvm_io_device *this,
+static int picdev_read(struct kvm_pic *s,
                       gpa_t addr, int len, void *val)
 {
-       struct kvm_pic *s = to_pic(this);
        unsigned char data = 0;
        if (!picdev_in_range(addr))
                return -EOPNOTSUPP;
 
        if (len != 1) {
-               if (printk_ratelimit())
-                       printk(KERN_ERR "PIC: non byte read\n");
+               pr_pic_unimpl("non byte read\n");
                return 0;
        }
        pic_lock(s);
@@ -525,6 +519,48 @@ static int picdev_read(struct kvm_io_device *this,
        return 0;
 }
 
+static int picdev_master_write(struct kvm_io_device *dev,
+                              gpa_t addr, int len, const void *val)
+{
+       return picdev_write(container_of(dev, struct kvm_pic, dev_master),
+                           addr, len, val);
+}
+
+static int picdev_master_read(struct kvm_io_device *dev,
+                             gpa_t addr, int len, void *val)
+{
+       return picdev_read(container_of(dev, struct kvm_pic, dev_master),
+                           addr, len, val);
+}
+
+static int picdev_slave_write(struct kvm_io_device *dev,
+                             gpa_t addr, int len, const void *val)
+{
+       return picdev_write(container_of(dev, struct kvm_pic, dev_slave),
+                           addr, len, val);
+}
+
+static int picdev_slave_read(struct kvm_io_device *dev,
+                            gpa_t addr, int len, void *val)
+{
+       return picdev_read(container_of(dev, struct kvm_pic, dev_slave),
+                           addr, len, val);
+}
+
+static int picdev_eclr_write(struct kvm_io_device *dev,
+                            gpa_t addr, int len, const void *val)
+{
+       return picdev_write(container_of(dev, struct kvm_pic, dev_eclr),
+                           addr, len, val);
+}
+
+static int picdev_eclr_read(struct kvm_io_device *dev,
+                           gpa_t addr, int len, void *val)
+{
+       return picdev_read(container_of(dev, struct kvm_pic, dev_eclr),
+                           addr, len, val);
+}
+
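Each kvm_io_device recovers its owning kvm_pic via container_of() on the specific member it was registered through, which is why three thin trampoline pairs replace the single to_pic() helper. A self-contained sketch of the pattern:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct dev { int id; };

struct pic {
        const char *name;
        struct dev dev_master;
        struct dev dev_slave;
};

/* Each trampoline knows which member it was registered through. */
static struct pic *from_master(struct dev *d)
{
        return container_of(d, struct pic, dev_master);
}

static struct pic *from_slave(struct dev *d)
{
        return container_of(d, struct pic, dev_slave);
}

int main(void)
{
        struct pic p = { .name = "pic0" };
        printf("%s %s\n", from_master(&p.dev_master)->name,
               from_slave(&p.dev_slave)->name);
        return 0;
}
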
 /*
  * callback invoked when the PIC0 IRQ status changes
  */
@@ -537,9 +573,19 @@ static void pic_irq_request(struct kvm *kvm, int level)
        s->output = level;
 }
 
-static const struct kvm_io_device_ops picdev_ops = {
-       .read     = picdev_read,
-       .write    = picdev_write,
+static const struct kvm_io_device_ops picdev_master_ops = {
+       .read     = picdev_master_read,
+       .write    = picdev_master_write,
+};
+
+static const struct kvm_io_device_ops picdev_slave_ops = {
+       .read     = picdev_slave_read,
+       .write    = picdev_slave_write,
+};
+
+static const struct kvm_io_device_ops picdev_eclr_ops = {
+       .read     = picdev_eclr_read,
+       .write    = picdev_eclr_write,
 };
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm)
@@ -560,16 +606,39 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
        /*
         * Initialize PIO device
         */
-       kvm_iodevice_init(&s->dev, &picdev_ops);
+       kvm_iodevice_init(&s->dev_master, &picdev_master_ops);
+       kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops);
+       kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops);
        mutex_lock(&kvm->slots_lock);
-       ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev);
+       ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2,
+                                     &s->dev_master);
+       if (ret < 0)
+               goto fail_unlock;
+
+       ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave);
+       if (ret < 0)
+               goto fail_unreg_2;
+
+       ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr);
+       if (ret < 0)
+               goto fail_unreg_1;
+
        mutex_unlock(&kvm->slots_lock);
-       if (ret < 0) {
-               kfree(s);
-               return NULL;
-       }
 
        return s;
+
+fail_unreg_1:
+       kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave);
+
+fail_unreg_2:
+       kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master);
+
+fail_unlock:
+       mutex_unlock(&kvm->slots_lock);
+
+       kfree(s);
+
+       return NULL;
 }
 
 void kvm_destroy_pic(struct kvm *kvm)
@@ -577,7 +646,9 @@ void kvm_destroy_pic(struct kvm *kvm)
        struct kvm_pic *vpic = kvm->arch.vpic;
 
        if (vpic) {
-               kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev);
+               kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_master);
+               kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_slave);
+               kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_eclr);
                kvm->arch.vpic = NULL;
                kfree(vpic);
        }
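Aside: the three registrations above cover the classic i8259 port pairs (0x20/0x21 master, 0xa0/0xa1 slave, 0x4d0/0x4d1 ECLR). A minimal sketch of the address filter this implies (the in-tree picdev_in_range() may differ in form):

static int pic_port_in_range(gpa_t addr)
{
	switch (addr) {
	case 0x20:	/* master PIC command */
	case 0x21:	/* master PIC data */
	case 0xa0:	/* slave PIC command */
	case 0xa1:	/* slave PIC data */
	case 0x4d0:	/* edge/level control register, master */
	case 0x4d1:	/* edge/level control register, slave */
		return 1;
	default:
		return 0;
	}
}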
index 53e2d08..2086f2b 100644 (file)
@@ -66,7 +66,9 @@ struct kvm_pic {
        struct kvm *kvm;
        struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
        int output;             /* intr from master PIC */
-       struct kvm_io_device dev;
+       struct kvm_io_device dev_master;
+       struct kvm_io_device dev_slave;
+       struct kvm_io_device dev_eclr;
        void (*ack_notifier)(void *opaque, int irq);
        unsigned long irq_states[16];
 };
index 3377d53..544076c 100644 (file)
@@ -45,13 +45,6 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
        return vcpu->arch.walk_mmu->pdptrs[index];
 }
 
-static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index)
-{
-       load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu));
-
-       return mmu->pdptrs[index];
-}
-
 static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
 {
        ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
index 64bc6ea..497dbaa 100644 (file)
@@ -2,6 +2,8 @@
 struct kvm_timer {
        struct hrtimer timer;
        s64 period;                             /* unit: ns */
+       u32 timer_mode_mask;
+       u64 tscdeadline;
        atomic_t pending;                       /* accumulated triggered timers */
        bool reinject;
        struct kvm_timer_ops *t_ops;
index 57dcbd4..54abb40 100644 (file)
@@ -68,6 +68,9 @@
 #define VEC_POS(v) ((v) & (32 - 1))
 #define REG_POS(v) (((v) >> 5) << 4)
 
+static unsigned int min_timer_period_us = 500;
+module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+
 static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
 {
        return *((u32 *) (apic->regs + reg_off));
@@ -135,9 +138,23 @@ static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
        return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
 }
 
+static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
+{
+       return ((apic_get_reg(apic, APIC_LVTT) &
+               apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
+}
+
 static inline int apic_lvtt_period(struct kvm_lapic *apic)
 {
-       return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
+       return ((apic_get_reg(apic, APIC_LVTT) &
+               apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
+}
+
+static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
+{
+       return ((apic_get_reg(apic, APIC_LVTT) &
+               apic->lapic_timer.timer_mode_mask) ==
+                       APIC_LVT_TIMER_TSCDEADLINE);
 }
 
 static inline int apic_lvt_nmi_mode(u32 lvt_val)
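Aside: a worked decoding of the three mode helpers above, assuming the apicdef.h encodings (the timer mode field occupies LVTT bits 17:18, matching the 3 << 17 mask installed at runtime):

/*
 * Assumed encodings (defined in apicdef.h, not shown in this hunk):
 *   APIC_LVT_TIMER_ONESHOT     = 0 << 17
 *   APIC_LVT_TIMER_PERIODIC    = 1 << 17
 *   APIC_LVT_TIMER_TSCDEADLINE = 2 << 17
 *
 * With timer_mode_mask = 3 << 17 (TSC-deadline capable CPU) any of the
 * three helpers can match; with the legacy mask of 1 << 17 the value
 * 2 << 17 masks down to 0, so apic_lvtt_tscdeadline() never matches.
 */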
@@ -166,7 +183,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
 }
 
 static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
-       LVT_MASK | APIC_LVT_TIMER_PERIODIC,     /* LVTT */
+       LVT_MASK,       /* partial LVTT mask; timer mode mask added at runtime */
        LVT_MASK | APIC_MODE_MASK,      /* LVTTHMR */
        LVT_MASK | APIC_MODE_MASK,      /* LVTPC */
        LINT_MASK, LINT_MASK,   /* LVT0-1 */
@@ -316,8 +333,8 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
                        result = 1;
                break;
        default:
-               printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
-                      apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
+               apic_debug("Bad DFR vcpu %d: %08x\n",
+                          apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
                break;
        }
 
@@ -354,8 +371,8 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
                result = (target != source);
                break;
        default:
-               printk(KERN_WARNING "Bad dest shorthand value %x\n",
-                      short_hand);
+               apic_debug("kvm: apic: Bad dest shorthand value %x\n",
+                          short_hand);
                break;
        }
 
@@ -401,11 +418,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                break;
 
        case APIC_DM_REMRD:
-               printk(KERN_DEBUG "Ignoring delivery mode 3\n");
+               apic_debug("Ignoring delivery mode 3\n");
                break;
 
        case APIC_DM_SMI:
-               printk(KERN_DEBUG "Ignoring guest SMI\n");
+               apic_debug("Ignoring guest SMI\n");
                break;
 
        case APIC_DM_NMI:
@@ -565,11 +582,13 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
                        val = kvm_apic_id(apic) << 24;
                break;
        case APIC_ARBPRI:
-               printk(KERN_WARNING "Access APIC ARBPRI register "
-                      "which is for P6\n");
+               apic_debug("Access APIC ARBPRI register which is for P6\n");
                break;
 
        case APIC_TMCCT:        /* Timer CCR */
+               if (apic_lvtt_tscdeadline(apic))
+                       return 0;
+
                val = apic_get_tmcct(apic);
                break;
 
@@ -664,29 +683,40 @@ static void update_divide_count(struct kvm_lapic *apic)
 
 static void start_apic_timer(struct kvm_lapic *apic)
 {
-       ktime_t now = apic->lapic_timer.timer.base->get_time();
-
-       apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) *
-                   APIC_BUS_CYCLE_NS * apic->divide_count;
+       ktime_t now;
        atomic_set(&apic->lapic_timer.pending, 0);
 
-       if (!apic->lapic_timer.period)
-               return;
-       /*
-        * Do not allow the guest to program periodic timers with small
-        * interval, since the hrtimers are not throttled by the host
-        * scheduler.
-        */
-       if (apic_lvtt_period(apic)) {
-               if (apic->lapic_timer.period < NSEC_PER_MSEC/2)
-                       apic->lapic_timer.period = NSEC_PER_MSEC/2;
-       }
+       if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
+               /* lapic timer in one-shot or periodic mode */
+               now = apic->lapic_timer.timer.base->get_time();
+               apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
+                           * APIC_BUS_CYCLE_NS * apic->divide_count;
+
+               if (!apic->lapic_timer.period)
+                       return;
+               /*
+                * Do not allow the guest to program periodic timers with small
+                * interval, since the hrtimers are not throttled by the host
+                * scheduler.
+                */
+               if (apic_lvtt_period(apic)) {
+                       s64 min_period = min_timer_period_us * 1000LL;
+
+                       if (apic->lapic_timer.period < min_period) {
+                               pr_info_ratelimited(
+                                   "kvm: vcpu %i: requested %lld ns "
+                                   "lapic timer period limited to %lld ns\n",
+                                   apic->vcpu->vcpu_id,
+                                   apic->lapic_timer.period, min_period);
+                               apic->lapic_timer.period = min_period;
+                       }
+               }
 
-       hrtimer_start(&apic->lapic_timer.timer,
-                     ktime_add_ns(now, apic->lapic_timer.period),
-                     HRTIMER_MODE_ABS);
+               hrtimer_start(&apic->lapic_timer.timer,
+                             ktime_add_ns(now, apic->lapic_timer.period),
+                             HRTIMER_MODE_ABS);
 
-       apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
+               apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
                           PRIx64 ", "
                           "timer initial count 0x%x, period %lldns, "
                           "expire @ 0x%016" PRIx64 ".\n", __func__,
@@ -695,6 +725,30 @@ static void start_apic_timer(struct kvm_lapic *apic)
                           apic->lapic_timer.period,
                           ktime_to_ns(ktime_add_ns(now,
                                        apic->lapic_timer.period)));
+       } else if (apic_lvtt_tscdeadline(apic)) {
+               /* lapic timer in tsc deadline mode */
+               u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+               u64 ns = 0;
+               struct kvm_vcpu *vcpu = apic->vcpu;
+               unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu);
+               unsigned long flags;
+
+               if (unlikely(!tscdeadline || !this_tsc_khz))
+                       return;
+
+               local_irq_save(flags);
+
+               now = apic->lapic_timer.timer.base->get_time();
+               guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+               if (likely(tscdeadline > guest_tsc)) {
+                       ns = (tscdeadline - guest_tsc) * 1000000ULL;
+                       do_div(ns, this_tsc_khz);
+               }
+               hrtimer_start(&apic->lapic_timer.timer,
+                       ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+
+               local_irq_restore(flags);
+       }
 }
 
 static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
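Aside: a worked instance of the two timer paths above (illustrative numbers). In periodic mode the default floor is min_timer_period_us * 1000 = 500 * 1000 = 500,000 ns, the same value as the old hard-coded NSEC_PER_MSEC/2 clamp but now tunable as a module parameter. In TSC-deadline mode, with this_tsc_khz = 2,000,000 (a 2 GHz guest TSC) and tscdeadline - guest_tsc = 3,000,000 cycles, ns = 3,000,000 * 1,000,000 / 2,000,000 = 1,500,000, so the hrtimer is armed 1.5 ms out.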
@@ -782,7 +836,6 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 
        case APIC_LVT0:
                apic_manage_nmi_watchdog(apic, val);
-       case APIC_LVTT:
        case APIC_LVTTHMR:
        case APIC_LVTPC:
        case APIC_LVT1:
@@ -796,7 +849,22 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 
                break;
 
+       case APIC_LVTT:
+               if ((apic_get_reg(apic, APIC_LVTT) &
+                   apic->lapic_timer.timer_mode_mask) !=
+                  (val & apic->lapic_timer.timer_mode_mask))
+                       hrtimer_cancel(&apic->lapic_timer.timer);
+
+               if (!apic_sw_enabled(apic))
+                       val |= APIC_LVT_MASKED;
+               val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
+               apic_set_reg(apic, APIC_LVTT, val);
+               break;
+
        case APIC_TMICT:
+               if (apic_lvtt_tscdeadline(apic))
+                       break;
+
                hrtimer_cancel(&apic->lapic_timer.timer);
                apic_set_reg(apic, APIC_TMICT, val);
                start_apic_timer(apic);
@@ -804,14 +872,14 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 
        case APIC_TDCR:
                if (val & 4)
-                       printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
+                       apic_debug("KVM_WRITE:TDCR %x\n", val);
                apic_set_reg(apic, APIC_TDCR, val);
                update_divide_count(apic);
                break;
 
        case APIC_ESR:
                if (apic_x2apic_mode(apic) && val != 0) {
-                       printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val);
+                       apic_debug("KVM_WRITE:ESR not zero %x\n", val);
                        ret = 1;
                }
                break;
@@ -864,6 +932,15 @@ static int apic_mmio_write(struct kvm_io_device *this,
        return 0;
 }
 
+void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       if (apic)
+               apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
+
 void kvm_free_lapic(struct kvm_vcpu *vcpu)
 {
        if (!vcpu->arch.apic)
@@ -883,6 +960,32 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
  *----------------------------------------------------------------------
  */
 
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       if (!apic)
+               return 0;
+
+       if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
+               return 0;
+
+       return apic->lapic_timer.tscdeadline;
+}
+
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       if (!apic)
+               return;
+
+       if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
+               return;
+
+       hrtimer_cancel(&apic->lapic_timer.timer);
+       apic->lapic_timer.tscdeadline = data;
+       start_apic_timer(apic);
+}
+
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
index 52c9e6b..138e8cc 100644 (file)
@@ -26,6 +26,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
 void kvm_lapic_reset(struct kvm_vcpu *vcpu);
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
+void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
 u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
 void kvm_apic_set_version(struct kvm_vcpu *vcpu);
@@ -41,6 +42,9 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
 bool kvm_apic_present(struct kvm_vcpu *vcpu);
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
+
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
 void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
index 8e8da79..f1b36cf 100644 (file)
@@ -2770,7 +2770,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 
                ASSERT(!VALID_PAGE(root));
                if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
-                       pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
+                       pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
                        if (!is_present_gpte(pdptr)) {
                                vcpu->arch.mmu.pae_root[i] = 0;
                                continue;
@@ -3318,6 +3318,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
        context->direct_map = true;
        context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
        context->get_cr3 = get_cr3;
+       context->get_pdptr = kvm_pdptr_read;
        context->inject_page_fault = kvm_inject_page_fault;
        context->nx = is_nx(vcpu);
 
@@ -3376,6 +3377,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 
        vcpu->arch.walk_mmu->set_cr3           = kvm_x86_ops->set_cr3;
        vcpu->arch.walk_mmu->get_cr3           = get_cr3;
+       vcpu->arch.walk_mmu->get_pdptr         = kvm_pdptr_read;
        vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
 
        return r;
@@ -3386,6 +3388,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
        struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
 
        g_context->get_cr3           = get_cr3;
+       g_context->get_pdptr         = kvm_pdptr_read;
        g_context->inject_page_fault = kvm_inject_page_fault;
 
        /*
index 2460a26..746ec25 100644 (file)
@@ -121,16 +121,16 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 
 static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 {
+       static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
        unsigned long *rmapp;
        struct kvm_mmu_page *rev_sp;
        gfn_t gfn;
 
-
        rev_sp = page_header(__pa(sptep));
        gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
 
        if (!gfn_to_memslot(kvm, gfn)) {
-               if (!printk_ratelimit())
+               if (!__ratelimit(&ratelimit_state))
                        return;
                audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
                audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
@@ -141,7 +141,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 
        rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
        if (!*rmapp) {
-               if (!printk_ratelimit())
+               if (!__ratelimit(&ratelimit_state))
                        return;
                audit_printk(kvm, "no rmap for writable spte %llx\n",
                             *sptep);
index 507e2b8..9299410 100644 (file)
@@ -147,7 +147,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
        gfn_t table_gfn;
        unsigned index, pt_access, uninitialized_var(pte_access);
        gpa_t pte_gpa;
-       bool eperm;
+       bool eperm, last_gpte;
        int offset;
        const int write_fault = access & PFERR_WRITE_MASK;
        const int user_fault  = access & PFERR_USER_MASK;
@@ -163,7 +163,7 @@ retry_walk:
 
 #if PTTYPE == 64
        if (walker->level == PT32E_ROOT_LEVEL) {
-               pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
+               pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
                trace_kvm_mmu_paging_element(pte, walker->level);
                if (!is_present_gpte(pte))
                        goto error;
@@ -221,6 +221,17 @@ retry_walk:
                        eperm = true;
 #endif
 
+               last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
+               if (last_gpte) {
+                       pte_access = pt_access &
+                                    FNAME(gpte_access)(vcpu, pte, true);
+                       /* check if the kernel is fetching from user page */
+                       if (unlikely(pte_access & PT_USER_MASK) &&
+                           kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+                               if (fetch_fault && !user_fault)
+                                       eperm = true;
+               }
+
                if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
                        int ret;
                        trace_kvm_mmu_set_accessed_bit(table_gfn, index,
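Aside: the hoisted block implements the SMEP rule at the point where the final access rights are known: with CR4.SMEP set, a supervisor-mode instruction fetch (fetch_fault && !user_fault) from a user-accessible page (pte_access & PT_USER_MASK) must be rejected, so pte_access is now computed from the last gpte before the check instead of after the walk completes.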
@@ -238,18 +249,12 @@ retry_walk:
 
                walker->ptes[walker->level - 1] = pte;
 
-               if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) {
+               if (last_gpte) {
                        int lvl = walker->level;
                        gpa_t real_gpa;
                        gfn_t gfn;
                        u32 ac;
 
-                       /* check if the kernel is fetching from user page */
-                       if (unlikely(pte_access & PT_USER_MASK) &&
-                           kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
-                               if (fetch_fault && !user_fault)
-                                       eperm = true;
-
                        gfn = gpte_to_gfn_lvl(pte, lvl);
                        gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
 
@@ -295,7 +300,6 @@ retry_walk:
                walker->ptes[walker->level - 1] = pte;
        }
 
-       pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true);
        walker->pt_access = pt_access;
        walker->pte_access = pte_access;
        pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
index 475d1c9..e32243e 100644 (file)
@@ -1084,7 +1084,6 @@ static void init_vmcb(struct vcpu_svm *svm)
        if (npt_enabled) {
                /* Setup VMCB for Nested Paging */
                control->nested_ctl = 1;
-               clr_intercept(svm, INTERCEPT_TASK_SWITCH);
                clr_intercept(svm, INTERCEPT_INVLPG);
                clr_exception_intercept(svm, PF_VECTOR);
                clr_cr_intercept(svm, INTERCEPT_CR3_READ);
@@ -1844,6 +1843,20 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
        return svm->nested.nested_cr3;
 }
 
+static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       u64 cr3 = svm->nested.nested_cr3;
+       u64 pdpte;
+       int ret;
+
+       ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte,
+                                 offset_in_page(cr3) + index * 8, 8);
+       if (ret)
+               return 0;
+       return pdpte;
+}
+
 static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
                                   unsigned long root)
 {
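Aside: this follows the PAE layout: the four 8-byte PDPTEs live at the table address held in nested CR3, hence the read of 8 bytes at offset_in_page(cr3) + index * 8 within gfn gpa_to_gfn(cr3); a failed guest-memory read is reported as a zero (not-present) PDPTE rather than an error.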
@@ -1875,6 +1888,7 @@ static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 
        vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
        vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
+       vcpu->arch.mmu.get_pdptr         = nested_svm_get_tdp_pdptr;
        vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
        vcpu->arch.mmu.shadow_root_level = get_npt_level();
        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
@@ -2182,7 +2196,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
                                       vmcb->control.exit_info_1,
                                       vmcb->control.exit_info_2,
                                       vmcb->control.exit_int_info,
-                                      vmcb->control.exit_int_info_err);
+                                      vmcb->control.exit_int_info_err,
+                                      KVM_ISA_SVM);
 
        nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
        if (!nested_vmcb)
@@ -2894,15 +2909,20 @@ static int cr8_write_interception(struct vcpu_svm *svm)
        return 0;
 }
 
+u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu)
+{
+       struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
+       return vmcb->control.tsc_offset +
+               svm_scale_tsc(vcpu, native_read_tsc());
+}
+
 static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
        switch (ecx) {
        case MSR_IA32_TSC: {
-               struct vmcb *vmcb = get_host_vmcb(svm);
-
-               *data = vmcb->control.tsc_offset +
+               *data = svm->vmcb->control.tsc_offset +
                        svm_scale_tsc(vcpu, native_read_tsc());
 
                break;
@@ -3314,8 +3334,6 @@ static int handle_exit(struct kvm_vcpu *vcpu)
        struct kvm_run *kvm_run = vcpu->run;
        u32 exit_code = svm->vmcb->control.exit_code;
 
-       trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
-
        if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
                vcpu->arch.cr0 = svm->vmcb->save.cr0;
        if (npt_enabled)
@@ -3335,7 +3353,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
                                        svm->vmcb->control.exit_info_1,
                                        svm->vmcb->control.exit_info_2,
                                        svm->vmcb->control.exit_int_info,
-                                       svm->vmcb->control.exit_int_info_err);
+                                       svm->vmcb->control.exit_int_info_err,
+                                       KVM_ISA_SVM);
 
                vmexit = nested_svm_exit_special(svm);
 
@@ -3768,6 +3787,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
        vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
        vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
 
+       trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM);
+
        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
                kvm_before_handle_nmi(&svm->vcpu);
 
@@ -3897,60 +3918,6 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
        }
 }
 
-static const struct trace_print_flags svm_exit_reasons_str[] = {
-       { SVM_EXIT_READ_CR0,                    "read_cr0" },
-       { SVM_EXIT_READ_CR3,                    "read_cr3" },
-       { SVM_EXIT_READ_CR4,                    "read_cr4" },
-       { SVM_EXIT_READ_CR8,                    "read_cr8" },
-       { SVM_EXIT_WRITE_CR0,                   "write_cr0" },
-       { SVM_EXIT_WRITE_CR3,                   "write_cr3" },
-       { SVM_EXIT_WRITE_CR4,                   "write_cr4" },
-       { SVM_EXIT_WRITE_CR8,                   "write_cr8" },
-       { SVM_EXIT_READ_DR0,                    "read_dr0" },
-       { SVM_EXIT_READ_DR1,                    "read_dr1" },
-       { SVM_EXIT_READ_DR2,                    "read_dr2" },
-       { SVM_EXIT_READ_DR3,                    "read_dr3" },
-       { SVM_EXIT_WRITE_DR0,                   "write_dr0" },
-       { SVM_EXIT_WRITE_DR1,                   "write_dr1" },
-       { SVM_EXIT_WRITE_DR2,                   "write_dr2" },
-       { SVM_EXIT_WRITE_DR3,                   "write_dr3" },
-       { SVM_EXIT_WRITE_DR5,                   "write_dr5" },
-       { SVM_EXIT_WRITE_DR7,                   "write_dr7" },
-       { SVM_EXIT_EXCP_BASE + DB_VECTOR,       "DB excp" },
-       { SVM_EXIT_EXCP_BASE + BP_VECTOR,       "BP excp" },
-       { SVM_EXIT_EXCP_BASE + UD_VECTOR,       "UD excp" },
-       { SVM_EXIT_EXCP_BASE + PF_VECTOR,       "PF excp" },
-       { SVM_EXIT_EXCP_BASE + NM_VECTOR,       "NM excp" },
-       { SVM_EXIT_EXCP_BASE + MC_VECTOR,       "MC excp" },
-       { SVM_EXIT_INTR,                        "interrupt" },
-       { SVM_EXIT_NMI,                         "nmi" },
-       { SVM_EXIT_SMI,                         "smi" },
-       { SVM_EXIT_INIT,                        "init" },
-       { SVM_EXIT_VINTR,                       "vintr" },
-       { SVM_EXIT_CPUID,                       "cpuid" },
-       { SVM_EXIT_INVD,                        "invd" },
-       { SVM_EXIT_HLT,                         "hlt" },
-       { SVM_EXIT_INVLPG,                      "invlpg" },
-       { SVM_EXIT_INVLPGA,                     "invlpga" },
-       { SVM_EXIT_IOIO,                        "io" },
-       { SVM_EXIT_MSR,                         "msr" },
-       { SVM_EXIT_TASK_SWITCH,                 "task_switch" },
-       { SVM_EXIT_SHUTDOWN,                    "shutdown" },
-       { SVM_EXIT_VMRUN,                       "vmrun" },
-       { SVM_EXIT_VMMCALL,                     "hypercall" },
-       { SVM_EXIT_VMLOAD,                      "vmload" },
-       { SVM_EXIT_VMSAVE,                      "vmsave" },
-       { SVM_EXIT_STGI,                        "stgi" },
-       { SVM_EXIT_CLGI,                        "clgi" },
-       { SVM_EXIT_SKINIT,                      "skinit" },
-       { SVM_EXIT_WBINVD,                      "wbinvd" },
-       { SVM_EXIT_MONITOR,                     "monitor" },
-       { SVM_EXIT_MWAIT,                       "mwait" },
-       { SVM_EXIT_XSETBV,                      "xsetbv" },
-       { SVM_EXIT_NPF,                         "npf" },
-       { -1, NULL }
-};
-
 static int svm_get_lpage_level(void)
 {
        return PT_PDPE_LEVEL;
@@ -4223,7 +4190,6 @@ static struct kvm_x86_ops svm_x86_ops = {
        .get_mt_mask = svm_get_mt_mask,
 
        .get_exit_info = svm_get_exit_info,
-       .exit_reasons_str = svm_exit_reasons_str,
 
        .get_lpage_level = svm_get_lpage_level,
 
@@ -4239,6 +4205,7 @@ static struct kvm_x86_ops svm_x86_ops = {
        .write_tsc_offset = svm_write_tsc_offset,
        .adjust_tsc_offset = svm_adjust_tsc_offset,
        .compute_tsc_offset = svm_compute_tsc_offset,
+       .read_l1_tsc = svm_read_l1_tsc,
 
        .set_tdp_cr3 = set_tdp_cr3,
 
index 3ff898c..911d264 100644 (file)
@@ -2,6 +2,8 @@
 #define _TRACE_KVM_H
 
 #include <linux/tracepoint.h>
+#include <asm/vmx.h>
+#include <asm/svm.h>
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
@@ -181,6 +183,95 @@ TRACE_EVENT(kvm_apic,
 #define KVM_ISA_VMX   1
 #define KVM_ISA_SVM   2
 
+#define VMX_EXIT_REASONS \
+       { EXIT_REASON_EXCEPTION_NMI,            "EXCEPTION_NMI" }, \
+       { EXIT_REASON_EXTERNAL_INTERRUPT,       "EXTERNAL_INTERRUPT" }, \
+       { EXIT_REASON_TRIPLE_FAULT,             "TRIPLE_FAULT" }, \
+       { EXIT_REASON_PENDING_INTERRUPT,        "PENDING_INTERRUPT" }, \
+       { EXIT_REASON_NMI_WINDOW,               "NMI_WINDOW" }, \
+       { EXIT_REASON_TASK_SWITCH,              "TASK_SWITCH" }, \
+       { EXIT_REASON_CPUID,                    "CPUID" }, \
+       { EXIT_REASON_HLT,                      "HLT" }, \
+       { EXIT_REASON_INVLPG,                   "INVLPG" }, \
+       { EXIT_REASON_RDPMC,                    "RDPMC" }, \
+       { EXIT_REASON_RDTSC,                    "RDTSC" }, \
+       { EXIT_REASON_VMCALL,                   "VMCALL" }, \
+       { EXIT_REASON_VMCLEAR,                  "VMCLEAR" }, \
+       { EXIT_REASON_VMLAUNCH,                 "VMLAUNCH" }, \
+       { EXIT_REASON_VMPTRLD,                  "VMPTRLD" }, \
+       { EXIT_REASON_VMPTRST,                  "VMPTRST" }, \
+       { EXIT_REASON_VMREAD,                   "VMREAD" }, \
+       { EXIT_REASON_VMRESUME,                 "VMRESUME" }, \
+       { EXIT_REASON_VMWRITE,                  "VMWRITE" }, \
+       { EXIT_REASON_VMOFF,                    "VMOFF" }, \
+       { EXIT_REASON_VMON,                     "VMON" }, \
+       { EXIT_REASON_CR_ACCESS,                "CR_ACCESS" }, \
+       { EXIT_REASON_DR_ACCESS,                "DR_ACCESS" }, \
+       { EXIT_REASON_IO_INSTRUCTION,           "IO_INSTRUCTION" }, \
+       { EXIT_REASON_MSR_READ,                 "MSR_READ" }, \
+       { EXIT_REASON_MSR_WRITE,                "MSR_WRITE" }, \
+       { EXIT_REASON_MWAIT_INSTRUCTION,        "MWAIT_INSTRUCTION" }, \
+       { EXIT_REASON_MONITOR_INSTRUCTION,      "MONITOR_INSTRUCTION" }, \
+       { EXIT_REASON_PAUSE_INSTRUCTION,        "PAUSE_INSTRUCTION" }, \
+       { EXIT_REASON_MCE_DURING_VMENTRY,       "MCE_DURING_VMENTRY" }, \
+       { EXIT_REASON_TPR_BELOW_THRESHOLD,      "TPR_BELOW_THRESHOLD" },        \
+       { EXIT_REASON_APIC_ACCESS,              "APIC_ACCESS" }, \
+       { EXIT_REASON_EPT_VIOLATION,            "EPT_VIOLATION" }, \
+       { EXIT_REASON_EPT_MISCONFIG,            "EPT_MISCONFIG" }, \
+       { EXIT_REASON_WBINVD,                   "WBINVD" }
+
+#define SVM_EXIT_REASONS \
+       { SVM_EXIT_READ_CR0,                    "read_cr0" }, \
+       { SVM_EXIT_READ_CR3,                    "read_cr3" }, \
+       { SVM_EXIT_READ_CR4,                    "read_cr4" }, \
+       { SVM_EXIT_READ_CR8,                    "read_cr8" }, \
+       { SVM_EXIT_WRITE_CR0,                   "write_cr0" }, \
+       { SVM_EXIT_WRITE_CR3,                   "write_cr3" }, \
+       { SVM_EXIT_WRITE_CR4,                   "write_cr4" }, \
+       { SVM_EXIT_WRITE_CR8,                   "write_cr8" }, \
+       { SVM_EXIT_READ_DR0,                    "read_dr0" }, \
+       { SVM_EXIT_READ_DR1,                    "read_dr1" }, \
+       { SVM_EXIT_READ_DR2,                    "read_dr2" }, \
+       { SVM_EXIT_READ_DR3,                    "read_dr3" }, \
+       { SVM_EXIT_WRITE_DR0,                   "write_dr0" }, \
+       { SVM_EXIT_WRITE_DR1,                   "write_dr1" }, \
+       { SVM_EXIT_WRITE_DR2,                   "write_dr2" }, \
+       { SVM_EXIT_WRITE_DR3,                   "write_dr3" }, \
+       { SVM_EXIT_WRITE_DR5,                   "write_dr5" }, \
+       { SVM_EXIT_WRITE_DR7,                   "write_dr7" }, \
+       { SVM_EXIT_EXCP_BASE + DB_VECTOR,       "DB excp" }, \
+       { SVM_EXIT_EXCP_BASE + BP_VECTOR,       "BP excp" }, \
+       { SVM_EXIT_EXCP_BASE + UD_VECTOR,       "UD excp" }, \
+       { SVM_EXIT_EXCP_BASE + PF_VECTOR,       "PF excp" }, \
+       { SVM_EXIT_EXCP_BASE + NM_VECTOR,       "NM excp" }, \
+       { SVM_EXIT_EXCP_BASE + MC_VECTOR,       "MC excp" }, \
+       { SVM_EXIT_INTR,                        "interrupt" }, \
+       { SVM_EXIT_NMI,                         "nmi" }, \
+       { SVM_EXIT_SMI,                         "smi" }, \
+       { SVM_EXIT_INIT,                        "init" }, \
+       { SVM_EXIT_VINTR,                       "vintr" }, \
+       { SVM_EXIT_CPUID,                       "cpuid" }, \
+       { SVM_EXIT_INVD,                        "invd" }, \
+       { SVM_EXIT_HLT,                         "hlt" }, \
+       { SVM_EXIT_INVLPG,                      "invlpg" }, \
+       { SVM_EXIT_INVLPGA,                     "invlpga" }, \
+       { SVM_EXIT_IOIO,                        "io" }, \
+       { SVM_EXIT_MSR,                         "msr" }, \
+       { SVM_EXIT_TASK_SWITCH,                 "task_switch" }, \
+       { SVM_EXIT_SHUTDOWN,                    "shutdown" }, \
+       { SVM_EXIT_VMRUN,                       "vmrun" }, \
+       { SVM_EXIT_VMMCALL,                     "hypercall" }, \
+       { SVM_EXIT_VMLOAD,                      "vmload" }, \
+       { SVM_EXIT_VMSAVE,                      "vmsave" }, \
+       { SVM_EXIT_STGI,                        "stgi" }, \
+       { SVM_EXIT_CLGI,                        "clgi" }, \
+       { SVM_EXIT_SKINIT,                      "skinit" }, \
+       { SVM_EXIT_WBINVD,                      "wbinvd" }, \
+       { SVM_EXIT_MONITOR,                     "monitor" }, \
+       { SVM_EXIT_MWAIT,                       "mwait" }, \
+       { SVM_EXIT_XSETBV,                      "xsetbv" }, \
+       { SVM_EXIT_NPF,                         "npf" }
+
 /*
  * Tracepoint for kvm guest exit:
  */
@@ -205,8 +296,9 @@ TRACE_EVENT(kvm_exit,
        ),
 
        TP_printk("reason %s rip 0x%lx info %llx %llx",
-                ftrace_print_symbols_seq(p, __entry->exit_reason,
-                                         kvm_x86_ops->exit_reasons_str),
+                (__entry->isa == KVM_ISA_VMX) ?
+                __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) :
+                __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS),
                 __entry->guest_rip, __entry->info1, __entry->info2)
 );
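Aside: __print_symbolic() resolves the numeric exit reason against the selected table when the trace is read out, so the same kvm_exit event renders, for example, as "reason HLT" under VMX and "reason hlt" under SVM (the numeric codes come from the respective vmx.h and svm.h headers).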
 
@@ -486,9 +578,9 @@ TRACE_EVENT(kvm_nested_intercepts,
 TRACE_EVENT(kvm_nested_vmexit,
            TP_PROTO(__u64 rip, __u32 exit_code,
                     __u64 exit_info1, __u64 exit_info2,
-                    __u32 exit_int_info, __u32 exit_int_info_err),
+                    __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
            TP_ARGS(rip, exit_code, exit_info1, exit_info2,
-                   exit_int_info, exit_int_info_err),
+                   exit_int_info, exit_int_info_err, isa),
 
        TP_STRUCT__entry(
                __field(        __u64,          rip                     )
@@ -497,6 +589,7 @@ TRACE_EVENT(kvm_nested_vmexit,
                __field(        __u64,          exit_info2              )
                __field(        __u32,          exit_int_info           )
                __field(        __u32,          exit_int_info_err       )
+               __field(        __u32,          isa                     )
        ),
 
        TP_fast_assign(
@@ -506,12 +599,14 @@ TRACE_EVENT(kvm_nested_vmexit,
                __entry->exit_info2             = exit_info2;
                __entry->exit_int_info          = exit_int_info;
                __entry->exit_int_info_err      = exit_int_info_err;
+               __entry->isa                    = isa;
        ),
        TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
                  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
                  __entry->rip,
-                 ftrace_print_symbols_seq(p, __entry->exit_code,
-                                          kvm_x86_ops->exit_reasons_str),
+                (__entry->isa == KVM_ISA_VMX) ?
+                __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
+                __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
                  __entry->exit_info1, __entry->exit_info2,
                  __entry->exit_int_info, __entry->exit_int_info_err)
 );
@@ -522,9 +617,9 @@ TRACE_EVENT(kvm_nested_vmexit,
 TRACE_EVENT(kvm_nested_vmexit_inject,
            TP_PROTO(__u32 exit_code,
                     __u64 exit_info1, __u64 exit_info2,
-                    __u32 exit_int_info, __u32 exit_int_info_err),
+                    __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
            TP_ARGS(exit_code, exit_info1, exit_info2,
-                   exit_int_info, exit_int_info_err),
+                   exit_int_info, exit_int_info_err, isa),
 
        TP_STRUCT__entry(
                __field(        __u32,          exit_code               )
@@ -532,6 +627,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
                __field(        __u64,          exit_info2              )
                __field(        __u32,          exit_int_info           )
                __field(        __u32,          exit_int_info_err       )
+               __field(        __u32,          isa                     )
        ),
 
        TP_fast_assign(
@@ -540,12 +636,14 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
                __entry->exit_info2             = exit_info2;
                __entry->exit_int_info          = exit_int_info;
                __entry->exit_int_info_err      = exit_int_info_err;
+               __entry->isa                    = isa;
        ),
 
        TP_printk("reason: %s ext_inf1: 0x%016llx "
                  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
-                 ftrace_print_symbols_seq(p, __entry->exit_code,
-                                          kvm_x86_ops->exit_reasons_str),
+                (__entry->isa == KVM_ISA_VMX) ?
+                __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
+                __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
                __entry->exit_info1, __entry->exit_info2,
                __entry->exit_int_info, __entry->exit_int_info_err)
 );
index e65a158..a0d6bd9 100644 (file)
@@ -71,6 +71,9 @@ module_param(vmm_exclusive, bool, S_IRUGO);
 static int __read_mostly yield_on_hlt = 1;
 module_param(yield_on_hlt, bool, S_IRUGO);
 
+static int __read_mostly fasteoi = 1;
+module_param(fasteoi, bool, S_IRUGO);
+
 /*
  * If nested=1, nested virtualization is supported, i.e., guests may use
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -1747,6 +1750,21 @@ static u64 guest_read_tsc(void)
        return host_tsc + tsc_offset;
 }
 
+/*
+ * Like guest_read_tsc, but always returns L1's notion of the timestamp
+ * counter, even if a nested guest (L2) is currently running.
+ */
+u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
+{
+       u64 host_tsc, tsc_offset;
+
+       rdtscll(host_tsc);
+       tsc_offset = is_guest_mode(vcpu) ?
+               to_vmx(vcpu)->nested.vmcs01_tsc_offset :
+               vmcs_read64(TSC_OFFSET);
+       return host_tsc + tsc_offset;
+}
+
 /*
  * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
  * ioctl. In this case the call-back should update internal vmx state to make
@@ -1762,15 +1780,23 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
  */
 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
-       vmcs_write64(TSC_OFFSET, offset);
-       if (is_guest_mode(vcpu))
+       if (is_guest_mode(vcpu)) {
                /*
-                * We're here if L1 chose not to trap the TSC MSR. Since
-                * prepare_vmcs12() does not copy tsc_offset, we need to also
-                * set the vmcs12 field here.
+                * We're here if L1 chose not to trap WRMSR to TSC. According
+                * to the spec, this should set L1's TSC; the offset that L1
+                * set for L2 remains unchanged, and still needs to be added
+                * to the newly set TSC to get L2's TSC.
                 */
-               get_vmcs12(vcpu)->tsc_offset = offset -
-                       to_vmx(vcpu)->nested.vmcs01_tsc_offset;
+               struct vmcs12 *vmcs12;
+               to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset;
+               /* recalculate vmcs02.TSC_OFFSET: */
+               vmcs12 = get_vmcs12(vcpu);
+               vmcs_write64(TSC_OFFSET, offset +
+                       (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
+                        vmcs12->tsc_offset : 0));
+       } else {
+               vmcs_write64(TSC_OFFSET, offset);
+       }
 }
 
 static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
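Aside: the resulting invariant is vmcs02.TSC_OFFSET = vmcs01_tsc_offset + vmcs12->tsc_offset whenever L1 enables TSC offsetting (and just vmcs01_tsc_offset otherwise), so a WRMSR from L1 moves L1's view of the TSC while L2 keeps its L1-programmed delta on top of it.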
@@ -2736,8 +2762,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
 
        guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
        if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
-               printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
-                      __func__);
+               pr_debug_ratelimited("%s: tss fixup for long mode.\n",
+                                    __func__);
                vmcs_write32(GUEST_TR_AR_BYTES,
                             (guest_tr_ar & ~AR_TYPE_MASK)
                             | AR_TYPE_BUSY_64_TSS);
@@ -4115,8 +4141,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
                error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
        if (is_page_fault(intr_info)) {
                /* EPT won't cause page fault directly */
-               if (enable_ept)
-                       BUG();
+               BUG_ON(enable_ept);
                cr2 = vmcs_readl(EXIT_QUALIFICATION);
                trace_kvm_page_fault(cr2, error_code);
 
@@ -4518,6 +4543,24 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
 
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
+       if (likely(fasteoi)) {
+               unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+               int access_type, offset;
+
+               access_type = exit_qualification & APIC_ACCESS_TYPE;
+               offset = exit_qualification & APIC_ACCESS_OFFSET;
+               /*
+                * A sane guest uses MOV to write EOI, and the written
+                * value is ignored, so short-circuit here to avoid
+                * heavy instruction emulation.
+                */
+               if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
+                   (offset == APIC_EOI)) {
+                       kvm_lapic_set_eoi(vcpu);
+                       skip_emulated_instruction(vcpu);
+                       return 1;
+               }
+       }
        return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 }
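Aside: APIC_EOI is register offset 0xb0 within the 4 KiB APIC MMIO page, so with fasteoi=1 (the default above) a linear-mode write whose exit qualification decodes to that offset is completed directly through kvm_lapic_set_eoi() and never enters the instruction emulator.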
 
@@ -5591,8 +5634,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                return 0;
 
        if (unlikely(vmx->fail)) {
-               printk(KERN_INFO "%s failed vm entry %x\n",
-                      __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+               pr_info_ratelimited("%s failed vm entry %x\n", __func__,
+                                   vmcs_read32(VM_INSTRUCTION_ERROR));
                return 1;
        }
 
@@ -5696,8 +5739,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
        u32 exit_reason = vmx->exit_reason;
        u32 vectoring_info = vmx->idt_vectoring_info;
 
-       trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
-
        /* If guest state is invalid, start emulating */
        if (vmx->emulation_required && emulate_invalid_guest_state)
                return handle_invalid_guest_state(vcpu);
@@ -6101,6 +6142,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        vmx->loaded_vmcs->launched = 1;
 
        vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+       trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
 
        vmx_complete_atomic_exit(vmx);
        vmx_recover_nmi_blocking(vmx);
@@ -6241,49 +6283,6 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
        return ret;
 }
 
-#define _ER(x) { EXIT_REASON_##x, #x }
-
-static const struct trace_print_flags vmx_exit_reasons_str[] = {
-       _ER(EXCEPTION_NMI),
-       _ER(EXTERNAL_INTERRUPT),
-       _ER(TRIPLE_FAULT),
-       _ER(PENDING_INTERRUPT),
-       _ER(NMI_WINDOW),
-       _ER(TASK_SWITCH),
-       _ER(CPUID),
-       _ER(HLT),
-       _ER(INVLPG),
-       _ER(RDPMC),
-       _ER(RDTSC),
-       _ER(VMCALL),
-       _ER(VMCLEAR),
-       _ER(VMLAUNCH),
-       _ER(VMPTRLD),
-       _ER(VMPTRST),
-       _ER(VMREAD),
-       _ER(VMRESUME),
-       _ER(VMWRITE),
-       _ER(VMOFF),
-       _ER(VMON),
-       _ER(CR_ACCESS),
-       _ER(DR_ACCESS),
-       _ER(IO_INSTRUCTION),
-       _ER(MSR_READ),
-       _ER(MSR_WRITE),
-       _ER(MWAIT_INSTRUCTION),
-       _ER(MONITOR_INSTRUCTION),
-       _ER(PAUSE_INSTRUCTION),
-       _ER(MCE_DURING_VMENTRY),
-       _ER(TPR_BELOW_THRESHOLD),
-       _ER(APIC_ACCESS),
-       _ER(EPT_VIOLATION),
-       _ER(EPT_MISCONFIG),
-       _ER(WBINVD),
-       { -1, NULL }
-};
-
-#undef _ER
-
 static int vmx_get_lpage_level(void)
 {
        if (enable_ept && !cpu_has_vmx_ept_1g_page())
@@ -6514,8 +6513,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
        set_cr4_guest_host_mask(vmx);
 
-       vmcs_write64(TSC_OFFSET,
-               vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
+       if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+               vmcs_write64(TSC_OFFSET,
+                       vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
+       else
+               vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
 
        if (enable_vpid) {
                /*
@@ -6610,9 +6612,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        if (vmcs12->vm_entry_msr_load_count > 0 ||
            vmcs12->vm_exit_msr_load_count > 0 ||
            vmcs12->vm_exit_msr_store_count > 0) {
-               if (printk_ratelimit())
-                       printk(KERN_WARNING
-                         "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__);
+               pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n",
+                                   __func__);
                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
                return 1;
        }
@@ -6922,7 +6923,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
 
        load_vmcs12_host_state(vcpu, vmcs12);
 
-       /* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */
+       /* Update TSC_OFFSET if TSC was changed while L2 ran */
        vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
 
        /* This is needed for same reason as it was needed in prepare_vmcs02 */
@@ -7039,7 +7040,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .get_mt_mask = vmx_get_mt_mask,
 
        .get_exit_info = vmx_get_exit_info,
-       .exit_reasons_str = vmx_exit_reasons_str,
 
        .get_lpage_level = vmx_get_lpage_level,
 
@@ -7055,6 +7055,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .write_tsc_offset = vmx_write_tsc_offset,
        .adjust_tsc_offset = vmx_adjust_tsc_offset,
        .compute_tsc_offset = vmx_compute_tsc_offset,
+       .read_l1_tsc = vmx_read_l1_tsc,
 
        .set_tdp_cr3 = vmx_set_cr3,
 
index 84a28ea..cf26909 100644 (file)
@@ -83,6 +83,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
                                    struct kvm_cpuid_entry2 __user *entries);
+static void process_nmi(struct kvm_vcpu *vcpu);
 
 struct kvm_x86_ops *kvm_x86_ops;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -359,8 +360,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 {
-       kvm_make_request(KVM_REQ_EVENT, vcpu);
-       vcpu->arch.nmi_pending = 1;
+       atomic_inc(&vcpu->arch.nmi_queued);
+       kvm_make_request(KVM_REQ_NMI, vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
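Aside: a minimal sketch of the consumer this producer pairs with; the real process_nmi() body is outside this hunk, and the pending limit here is an assumption:

static void process_nmi_sketch(struct kvm_vcpu *vcpu)
{
	unsigned limit = 2;	/* assumed: one NMI can queue behind one in service */

	/* drain the queue filled by kvm_inject_nmi() and collapse extras */
	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);

	kvm_make_request(KVM_REQ_EVENT, vcpu);
}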
 
@@ -599,6 +600,8 @@ static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
 static void update_cpuid(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpuid_entry2 *best;
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       u32 timer_mode_mask;
 
        best = kvm_find_cpuid_entry(vcpu, 1, 0);
        if (!best)
@@ -610,6 +613,16 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
                if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
                        best->ecx |= bit(X86_FEATURE_OSXSAVE);
        }
+
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+               best->function == 0x1) {
+               best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER);
+               timer_mode_mask = 3 << 17;
+       } else
+               timer_mode_mask = 1 << 17;
+
+       if (apic)
+               apic->lapic_timer.timer_mode_mask = timer_mode_mask;
 }
 
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -825,6 +838,7 @@ static u32 msrs_to_save[] = {
 static unsigned num_msrs_to_save;
 
 static u32 emulated_msrs[] = {
+       MSR_IA32_TSCDEADLINE,
        MSR_IA32_MISC_ENABLE,
        MSR_IA32_MCG_STATUS,
        MSR_IA32_MCG_CTL,
@@ -1000,7 +1014,7 @@ static inline int kvm_tsc_changes_freq(void)
        return ret;
 }
 
-static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
+u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
 {
        if (vcpu->arch.virtual_tsc_khz)
                return vcpu->arch.virtual_tsc_khz;
@@ -1098,7 +1112,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
-       kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
+       tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
        kernel_ns = get_kernel_ns();
        this_tsc_khz = vcpu_tsc_khz(v);
        if (unlikely(this_tsc_khz == 0)) {
@@ -1564,6 +1578,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                break;
        case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
                return kvm_x2apic_msr_write(vcpu, msr, data);
+       case MSR_IA32_TSCDEADLINE:
+               kvm_set_lapic_tscdeadline_msr(vcpu, data);
+               break;
        case MSR_IA32_MISC_ENABLE:
                vcpu->arch.ia32_misc_enable_msr = data;
                break;
@@ -1825,6 +1842,9 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
                return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
        case HV_X64_MSR_TPR:
                return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
+       case HV_X64_MSR_APIC_ASSIST_PAGE:
+               data = vcpu->arch.hv_vapic;
+               break;
        default:
                pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
                return 1;
@@ -1839,7 +1859,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 
        switch (msr) {
        case MSR_IA32_PLATFORM_ID:
-       case MSR_IA32_UCODE_REV:
        case MSR_IA32_EBL_CR_POWERON:
        case MSR_IA32_DEBUGCTLMSR:
        case MSR_IA32_LASTBRANCHFROMIP:
@@ -1860,6 +1879,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case MSR_FAM10H_MMIO_CONF_BASE:
                data = 0;
                break;
+       case MSR_IA32_UCODE_REV:
+               data = 0x100000000ULL;
+               break;
        case MSR_MTRRcap:
                data = 0x500 | KVM_NR_VAR_MTRR;
                break;
@@ -1888,6 +1910,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
                return kvm_x2apic_msr_read(vcpu, msr, pdata);
                break;
+       case MSR_IA32_TSCDEADLINE:
+               data = kvm_get_lapic_tscdeadline_msr(vcpu);
+               break;
        case MSR_IA32_MISC_ENABLE:
                data = vcpu->arch.ia32_misc_enable_msr;
                break;
@@ -2086,6 +2111,9 @@ int kvm_dev_ioctl_check_extension(long ext)
                r = !kvm_x86_ops->cpu_has_accelerated_tpr();
                break;
        case KVM_CAP_NR_VCPUS:
+               r = KVM_SOFT_MAX_VCPUS;
+               break;
+       case KVM_CAP_MAX_VCPUS:
                r = KVM_MAX_VCPUS;
                break;
        case KVM_CAP_NR_MEMSLOTS:
@@ -2210,7 +2238,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                s64 tsc_delta;
                u64 tsc;
 
-               kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc);
+               tsc = kvm_x86_ops->read_l1_tsc(vcpu);
                tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
                             tsc - vcpu->arch.last_guest_tsc;
 
@@ -2234,7 +2262,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
        kvm_x86_ops->vcpu_put(vcpu);
        kvm_put_guest_fpu(vcpu);
-       kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+       vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
 }
 
 static int is_efer_nx(void)
@@ -2819,6 +2847,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
                                               struct kvm_vcpu_events *events)
 {
+       process_nmi(vcpu);
        events->exception.injected =
                vcpu->arch.exception.pending &&
                !kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2836,7 +2865,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
                        KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
 
        events->nmi.injected = vcpu->arch.nmi_injected;
-       events->nmi.pending = vcpu->arch.nmi_pending;
+       events->nmi.pending = vcpu->arch.nmi_pending != 0;
        events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
        events->nmi.pad = 0;
 
@@ -2856,6 +2885,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                              | KVM_VCPUEVENT_VALID_SHADOW))
                return -EINVAL;
 
+       process_nmi(vcpu);
        vcpu->arch.exception.pending = events->exception.injected;
        vcpu->arch.exception.nr = events->exception.nr;
        vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -3556,7 +3586,11 @@ long kvm_arch_vm_ioctl(struct file *filp,
                        if (r) {
                                mutex_lock(&kvm->slots_lock);
                                kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
-                                                         &vpic->dev);
+                                                         &vpic->dev_master);
+                               kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+                                                         &vpic->dev_slave);
+                               kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+                                                         &vpic->dev_eclr);
                                mutex_unlock(&kvm->slots_lock);
                                kfree(vpic);
                                goto create_irqchip_unlock;
@@ -4045,84 +4079,105 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
        return 0;
 }
 
-static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
-                                 unsigned long addr,
-                                 void *val,
-                                 unsigned int bytes,
-                                 struct x86_exception *exception)
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+                       const void *val, int bytes)
 {
-       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-       gpa_t gpa;
-       int handled, ret;
+       int ret;
 
+       ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
+       if (ret < 0)
+               return 0;
+       kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
+       return 1;
+}
+
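+/*
+ * Ops table that lets MMIO reads and writes share one emulation path:
+ * ->read_write_prepare may satisfy the access early (e.g. a read whose
+ * MMIO exit already completed), ->read_write_emulate tries ordinary
+ * guest memory, ->read_write_mmio dispatches to an in-kernel device,
+ * and ->read_write_exit_mmio punts the access to userspace via
+ * KVM_EXIT_MMIO.
+ */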
+struct read_write_emulator_ops {
+       int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
+                                 int bytes);
+       int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
+                                 void *val, int bytes);
+       int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
+                              int bytes, void *val);
+       int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
+                                   void *val, int bytes);
+       bool write;
+};
+
+static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
+{
        if (vcpu->mmio_read_completed) {
                memcpy(val, vcpu->mmio_data, bytes);
                trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
                               vcpu->mmio_phys_addr, *(u64 *)val);
                vcpu->mmio_read_completed = 0;
-               return X86EMUL_CONTINUE;
+               return 1;
        }
 
-       ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false);
-
-       if (ret < 0)
-               return X86EMUL_PROPAGATE_FAULT;
-
-       if (ret)
-               goto mmio;
-
-       if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
-           == X86EMUL_CONTINUE)
-               return X86EMUL_CONTINUE;
+       return 0;
+}
 
-mmio:
-       /*
-        * Is this MMIO handled locally?
-        */
-       handled = vcpu_mmio_read(vcpu, gpa, bytes, val);
+static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
+                       void *val, int bytes)
+{
+       return !kvm_read_guest(vcpu->kvm, gpa, val, bytes);
+}
 
-       if (handled == bytes)
-               return X86EMUL_CONTINUE;
+static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
+                        void *val, int bytes)
+{
+       return emulator_write_phys(vcpu, gpa, val, bytes);
+}
 
-       gpa += handled;
-       bytes -= handled;
-       val += handled;
+static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
+{
+       trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+       return vcpu_mmio_write(vcpu, gpa, bytes, val);
+}
 
+static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+                         void *val, int bytes)
+{
        trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
-
-       vcpu->mmio_needed = 1;
-       vcpu->run->exit_reason = KVM_EXIT_MMIO;
-       vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
-       vcpu->mmio_size = bytes;
-       vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
-       vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
-       vcpu->mmio_index = 0;
-
        return X86EMUL_IO_NEEDED;
 }
 
-int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
-                       const void *val, int bytes)
+static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+                          void *val, int bytes)
 {
-       int ret;
-
-       ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
-       if (ret < 0)
-               return 0;
-       kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
-       return 1;
+       memcpy(vcpu->mmio_data, val, bytes);
+       memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
+       return X86EMUL_CONTINUE;
 }
 
-static int emulator_write_emulated_onepage(unsigned long addr,
-                                          const void *val,
-                                          unsigned int bytes,
-                                          struct x86_exception *exception,
-                                          struct kvm_vcpu *vcpu)
+static struct read_write_emulator_ops read_emultor = {
+       .read_write_prepare = read_prepare,
+       .read_write_emulate = read_emulate,
+       .read_write_mmio = vcpu_mmio_read,
+       .read_write_exit_mmio = read_exit_mmio,
+};
+
+static struct read_write_emulator_ops write_emultor = {
+       .read_write_emulate = write_emulate,
+       .read_write_mmio = write_mmio,
+       .read_write_exit_mmio = write_exit_mmio,
+       .write = true,
+};
+
+static int emulator_read_write_onepage(unsigned long addr, void *val,
+                                      unsigned int bytes,
+                                      struct x86_exception *exception,
+                                      struct kvm_vcpu *vcpu,
+                                      struct read_write_emulator_ops *ops)
 {
        gpa_t gpa;
        int handled, ret;
+       bool write = ops->write;
 
-       ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true);
+       if (ops->read_write_prepare &&
+                 ops->read_write_prepare(vcpu, val, bytes))
+               return X86EMUL_CONTINUE;
+
+       ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
 
        if (ret < 0)
                return X86EMUL_PROPAGATE_FAULT;
@@ -4131,15 +4186,14 @@ static int emulator_write_emulated_onepage(unsigned long addr,
        if (ret)
                goto mmio;
 
-       if (emulator_write_phys(vcpu, gpa, val, bytes))
+       if (ops->read_write_emulate(vcpu, gpa, val, bytes))
                return X86EMUL_CONTINUE;
 
 mmio:
-       trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
        /*
         * Is this MMIO handled locally?
         */
-       handled = vcpu_mmio_write(vcpu, gpa, bytes, val);
+       handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
        if (handled == bytes)
                return X86EMUL_CONTINUE;
 
@@ -4148,23 +4202,20 @@ mmio:
        val += handled;
 
        vcpu->mmio_needed = 1;
-       memcpy(vcpu->mmio_data, val, bytes);
        vcpu->run->exit_reason = KVM_EXIT_MMIO;
        vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
        vcpu->mmio_size = bytes;
        vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
-       vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
-       memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
+       vcpu->run->mmio.is_write = vcpu->mmio_is_write = write;
        vcpu->mmio_index = 0;
 
-       return X86EMUL_CONTINUE;
+       return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
 }
 
-int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
-                           unsigned long addr,
-                           const void *val,
-                           unsigned int bytes,
-                           struct x86_exception *exception)
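+/*
+ * Splits an access that crosses a page boundary at the page edge and
+ * feeds each piece to emulator_read_write_onepage() through the given
+ * ops table.
+ */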
+int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+                       void *val, unsigned int bytes,
+                       struct x86_exception *exception,
+                       struct read_write_emulator_ops *ops)
 {
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 
@@ -4173,16 +4224,38 @@ int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
                int rc, now;
 
                now = -addr & ~PAGE_MASK;
-               rc = emulator_write_emulated_onepage(addr, val, now, exception,
-                                                    vcpu);
+               rc = emulator_read_write_onepage(addr, val, now, exception,
+                                                vcpu, ops);
+
                if (rc != X86EMUL_CONTINUE)
                        return rc;
                addr += now;
                val += now;
                bytes -= now;
        }
-       return emulator_write_emulated_onepage(addr, val, bytes, exception,
-                                              vcpu);
+
+       return emulator_read_write_onepage(addr, val, bytes, exception,
+                                          vcpu, ops);
+}
+
+static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
+                                 unsigned long addr,
+                                 void *val,
+                                 unsigned int bytes,
+                                 struct x86_exception *exception)
+{
+       return emulator_read_write(ctxt, addr, val, bytes,
+                                  exception, &read_emultor);
+}
+
+int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
+                           unsigned long addr,
+                           const void *val,
+                           unsigned int bytes,
+                           struct x86_exception *exception)
+{
+       return emulator_read_write(ctxt, addr, (void *)val, bytes,
+                                  exception, &write_emultor);
 }
 
 #define CMPXCHG_TYPE(t, ptr, old, new) \
@@ -4712,7 +4785,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
        kvm_set_rflags(vcpu, ctxt->eflags);
 
        if (irq == NMI_VECTOR)
-               vcpu->arch.nmi_pending = false;
+               vcpu->arch.nmi_pending = 0;
        else
                vcpu->arch.interrupt.pending = false;
 
@@ -4788,7 +4861,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 
                trace_kvm_emulate_insn_start(vcpu);
                ++vcpu->stat.insn_emulation;
-               if (r)  {
+               if (r != EMULATION_OK)  {
                        if (emulation_type & EMULTYPE_TRAP_UD)
                                return EMULATE_FAIL;
                        if (reexecute_instruction(vcpu, cr2))
@@ -5521,7 +5594,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
        /* try to inject new event if pending */
        if (vcpu->arch.nmi_pending) {
                if (kvm_x86_ops->nmi_allowed(vcpu)) {
-                       vcpu->arch.nmi_pending = false;
+                       --vcpu->arch.nmi_pending;
                        vcpu->arch.nmi_injected = true;
                        kvm_x86_ops->set_nmi(vcpu);
                }
@@ -5553,10 +5626,26 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
        }
 }
 
+static void process_nmi(struct kvm_vcpu *vcpu)
+{
+       unsigned limit = 2;
+
+       /*
+        * x86 is limited to one NMI running, and one NMI pending after it.
+        * If an NMI is already in progress, limit further NMIs to just one.
+        * Otherwise, allow two (and we'll inject the first one immediately).
+        */
+       if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
+               limit = 1;
+
+       vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
+       vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
        int r;
-       bool nmi_pending;
        bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
                vcpu->run->request_interrupt_window;
 
@@ -5596,6 +5685,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                }
                if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
                        record_steal_time(vcpu);
+               if (kvm_check_request(KVM_REQ_NMI, vcpu))
+                       process_nmi(vcpu);
 
        }
 
@@ -5603,19 +5694,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        if (unlikely(r))
                goto out;
 
-       /*
-        * An NMI can be injected between local nmi_pending read and
-        * vcpu->arch.nmi_pending read inside inject_pending_event().
-        * But in that case, KVM_REQ_EVENT will be set, which makes
-        * the race described above benign.
-        */
-       nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending);
-
        if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
                inject_pending_event(vcpu);
 
                /* enable NMI/IRQ window open exits if needed */
-               if (nmi_pending)
+               if (vcpu->arch.nmi_pending)
                        kvm_x86_ops->enable_nmi_window(vcpu);
                else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
                        kvm_x86_ops->enable_irq_window(vcpu);
@@ -5678,7 +5761,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        if (hw_breakpoint_active())
                hw_breakpoint_restore();
 
-       kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+       vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
 
        vcpu->mode = OUTSIDE_GUEST_MODE;
        smp_wmb();
@@ -6323,7 +6406,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 {
-       vcpu->arch.nmi_pending = false;
+       atomic_set(&vcpu->arch.nmi_queued, 0);
+       vcpu->arch.nmi_pending = 0;
        vcpu->arch.nmi_injected = false;
 
        vcpu->arch.switch_db_regs = 0;
@@ -6598,7 +6682,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
                !vcpu->arch.apf.halted)
                || !list_empty_careful(&vcpu->async_pf.done)
                || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
-               || vcpu->arch.nmi_pending ||
+               || atomic_read(&vcpu->arch.nmi_queued) ||
                (kvm_arch_interrupt_allowed(vcpu) &&
                 kvm_cpu_has_interrupt(vcpu));
 }
index aace6b8..f47fcd3 100644 (file)
@@ -371,6 +371,7 @@ struct kvm_s390_psw {
 #define KVM_S390_INT_VIRTIO            0xffff2603u
 #define KVM_S390_INT_SERVICE           0xffff2401u
 #define KVM_S390_INT_EMERGENCY         0xffff1201u
+#define KVM_S390_INT_EXTERNAL_CALL     0xffff1202u
 
 struct kvm_s390_interrupt {
        __u32 type;
@@ -463,7 +464,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_VAPIC 6
 #define KVM_CAP_EXT_CPUID 7
 #define KVM_CAP_CLOCKSOURCE 8
-#define KVM_CAP_NR_VCPUS 9       /* returns max vcpus per vm */
+#define KVM_CAP_NR_VCPUS 9       /* returns recommended max vcpus per vm */
 #define KVM_CAP_NR_MEMSLOTS 10   /* returns max memory slots per vm */
 #define KVM_CAP_PIT 11
 #define KVM_CAP_NOP_IO_DELAY 12
@@ -553,6 +554,9 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_SPAPR_TCE 63
 #define KVM_CAP_PPC_SMT 64
 #define KVM_CAP_PPC_RMA        65
+#define KVM_CAP_MAX_VCPUS 66       /* returns max vcpus per vm */
+#define KVM_CAP_PPC_HIOR 67
+#define KVM_CAP_PPC_PAPR 68
 #define KVM_CAP_S390_GMAP 71
 
 #ifdef KVM_CAP_IRQ_ROUTING
index eabb21a..d526231 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/msi.h>
 #include <linux/slab.h>
 #include <linux/rcupdate.h>
+#include <linux/ratelimit.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -48,6 +49,7 @@
 #define KVM_REQ_EVENT             11
 #define KVM_REQ_APF_HALT          12
 #define KVM_REQ_STEAL_UPDATE      13
+#define KVM_REQ_NMI               14
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID    0
 
@@ -55,16 +57,16 @@ struct kvm;
 struct kvm_vcpu;
 extern struct kmem_cache *kvm_vcpu_cache;
 
-/*
- * It would be nice to use something smarter than a linear search, TBD...
- * Thankfully we dont expect many devices to register (famous last words :),
- * so until then it will suffice.  At least its abstracted so we can change
- * in one place.
- */
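+/*
+ * Each device now registers the (addr, len) window it serves; the bus
+ * keeps these ranges sorted by address so lookups can binary-search
+ * instead of scanning every device.
+ */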
+struct kvm_io_range {
+       gpa_t addr;
+       int len;
+       struct kvm_io_device *dev;
+};
+
 struct kvm_io_bus {
        int                   dev_count;
-#define NR_IOBUS_DEVS 200
-       struct kvm_io_device *devs[NR_IOBUS_DEVS];
+#define NR_IOBUS_DEVS 300
+       struct kvm_io_range range[NR_IOBUS_DEVS];
 };
 
 enum kvm_bus {
@@ -77,8 +79,8 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                     int len, const void *val);
 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
                    void *val);
-int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-                           struct kvm_io_device *dev);
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+                           int len, struct kvm_io_device *dev);
 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                              struct kvm_io_device *dev);
 
@@ -256,8 +258,9 @@ struct kvm {
        struct kvm_arch arch;
        atomic_t users_count;
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
-       struct kvm_coalesced_mmio_dev *coalesced_mmio_dev;
        struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
+       spinlock_t ring_lock;
+       struct list_head coalesced_zones;
 #endif
 
        struct mutex irq_lock;
@@ -281,11 +284,8 @@ struct kvm {
 
 /* The guest did something we don't support. */
 #define pr_unimpl(vcpu, fmt, ...)                                      \
- do {                                                                  \
-       if (printk_ratelimit())                                         \
-               printk(KERN_ERR "kvm: %i: cpu%i " fmt,                  \
-                      current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
- } while (0)
+       pr_err_ratelimited("kvm: %i: cpu%i " fmt,                       \
+                          current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__)
 
 #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
 #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
index eaf3a50..3ad0925 100644 (file)
@@ -58,8 +58,6 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
 static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
 {
        struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-       u32 vector;
-       int index;
 
        if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
                spin_lock(&assigned_dev->intx_lock);
@@ -68,31 +66,35 @@ static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
                spin_unlock(&assigned_dev->intx_lock);
        }
 
-       if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
-               index = find_index_from_host_irq(assigned_dev, irq);
-               if (index >= 0) {
-                       vector = assigned_dev->
-                                       guest_msix_entries[index].vector;
-                       kvm_set_irq(assigned_dev->kvm,
-                                   assigned_dev->irq_source_id, vector, 1);
-               }
-       } else
+       kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+                   assigned_dev->guest_irq, 1);
+
+       return IRQ_HANDLED;
+}
+
+#ifdef __KVM_HAVE_MSIX
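+/*
+ * MSI-X gets its own threaded handler: the host IRQ is mapped back to
+ * the guest MSI-X vector before injection.
+ */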
+static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
+{
+       struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+       int index = find_index_from_host_irq(assigned_dev, irq);
+       u32 vector;
+
+       if (index >= 0) {
+               vector = assigned_dev->guest_msix_entries[index].vector;
                kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-                           assigned_dev->guest_irq, 1);
+                           vector, 1);
+       }
 
        return IRQ_HANDLED;
 }
+#endif
 
 /* Ack the irq line for an assigned device */
 static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
-       struct kvm_assigned_dev_kernel *dev;
-
-       if (kian->gsi == -1)
-               return;
-
-       dev = container_of(kian, struct kvm_assigned_dev_kernel,
-                          ack_notifier);
+       struct kvm_assigned_dev_kernel *dev =
+               container_of(kian, struct kvm_assigned_dev_kernel,
+                            ack_notifier);
 
        kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
 
@@ -110,8 +112,9 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 static void deassign_guest_irq(struct kvm *kvm,
                               struct kvm_assigned_dev_kernel *assigned_dev)
 {
-       kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
-       assigned_dev->ack_notifier.gsi = -1;
+       if (assigned_dev->ack_notifier.gsi != -1)
+               kvm_unregister_irq_ack_notifier(kvm,
+                                               &assigned_dev->ack_notifier);
 
        kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
                    assigned_dev->guest_irq, 0);
@@ -143,7 +146,7 @@ static void deassign_host_irq(struct kvm *kvm,
 
                for (i = 0; i < assigned_dev->entries_nr; i++)
                        free_irq(assigned_dev->host_msix_entries[i].vector,
-                                (void *)assigned_dev);
+                                assigned_dev);
 
                assigned_dev->entries_nr = 0;
                kfree(assigned_dev->host_msix_entries);
@@ -153,7 +156,7 @@ static void deassign_host_irq(struct kvm *kvm,
                /* Deal with MSI and INTx */
                disable_irq(assigned_dev->host_irq);
 
-               free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+               free_irq(assigned_dev->host_irq, assigned_dev);
 
                if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
                        pci_disable_msi(assigned_dev->dev);
@@ -239,7 +242,7 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
         * are going to be long delays in accepting, acking, etc.
         */
        if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
-                                IRQF_ONESHOT, dev->irq_name, (void *)dev))
+                                IRQF_ONESHOT, dev->irq_name, dev))
                return -EIO;
        return 0;
 }
@@ -258,7 +261,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
 
        dev->host_irq = dev->dev->irq;
        if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
-                                0, dev->irq_name, (void *)dev)) {
+                                0, dev->irq_name, dev)) {
                pci_disable_msi(dev->dev);
                return -EIO;
        }
@@ -284,8 +287,8 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
 
        for (i = 0; i < dev->entries_nr; i++) {
                r = request_threaded_irq(dev->host_msix_entries[i].vector,
-                                        NULL, kvm_assigned_dev_thread,
-                                        0, dev->irq_name, (void *)dev);
+                                        NULL, kvm_assigned_dev_thread_msix,
+                                        0, dev->irq_name, dev);
                if (r)
                        goto err;
        }
@@ -293,7 +296,7 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
        return 0;
 err:
        for (i -= 1; i >= 0; i--)
-               free_irq(dev->host_msix_entries[i].vector, (void *)dev);
+               free_irq(dev->host_msix_entries[i].vector, dev);
        pci_disable_msix(dev->dev);
        return r;
 }
@@ -406,7 +409,8 @@ static int assign_guest_irq(struct kvm *kvm,
 
        if (!r) {
                dev->irq_requested_type |= guest_irq_type;
-               kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
+               if (dev->ack_notifier.gsi != -1)
+                       kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
        } else
                kvm_free_irq_source_id(kvm, dev->irq_source_id);
 
index fc84875..a6ec206 100644 (file)
@@ -24,10 +24,19 @@ static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev)
 static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
                                   gpa_t addr, int len)
 {
-       struct kvm_coalesced_mmio_zone *zone;
+       /* Is it in a batchable area?
+        * (addr, len) must be fully contained in
+        * (zone->addr, zone->size).
+        */
+
+       return (dev->zone.addr <= addr &&
+               addr + len <= dev->zone.addr + dev->zone.size);
+}
+
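+/*
+ * Ring-space check, split out of coalesced_mmio_in_range(); called with
+ * kvm->ring_lock held so the free-slot computation stays valid.
+ */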
+static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev)
+{
        struct kvm_coalesced_mmio_ring *ring;
        unsigned avail;
-       int i;
 
-       /* Are we able to batch it ? */
+       /* Are we able to batch it? */
 
@@ -37,25 +46,12 @@ static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
         */
        ring = dev->kvm->coalesced_mmio_ring;
        avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX;
-       if (avail < KVM_MAX_VCPUS) {
+       if (avail == 0) {
                /* full */
                return 0;
        }
 
-       /* is it in a batchable area ? */
-
-       for (i = 0; i < dev->nb_zones; i++) {
-               zone = &dev->zone[i];
-
-               /* (addr,len) is fully included in
-                * (zone->addr, zone->size)
-                */
-
-               if (zone->addr <= addr &&
-                   addr + len <= zone->addr + zone->size)
-                       return 1;
-       }
-       return 0;
+       return 1;
 }
 
 static int coalesced_mmio_write(struct kvm_io_device *this,
@@ -63,10 +59,16 @@ static int coalesced_mmio_write(struct kvm_io_device *this,
 {
        struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
        struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
+
        if (!coalesced_mmio_in_range(dev, addr, len))
                return -EOPNOTSUPP;
 
-       spin_lock(&dev->lock);
+       spin_lock(&dev->kvm->ring_lock);
+
+       if (!coalesced_mmio_has_room(dev)) {
+               spin_unlock(&dev->kvm->ring_lock);
+               return -EOPNOTSUPP;
+       }
 
        /* copy data in first free entry of the ring */
 
@@ -75,7 +77,7 @@ static int coalesced_mmio_write(struct kvm_io_device *this,
        memcpy(ring->coalesced_mmio[ring->last].data, val, len);
        smp_wmb();
        ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
-       spin_unlock(&dev->lock);
+       spin_unlock(&dev->kvm->ring_lock);
        return 0;
 }
 
@@ -83,6 +85,8 @@ static void coalesced_mmio_destructor(struct kvm_io_device *this)
 {
        struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
 
+       list_del(&dev->list);
+
        kfree(dev);
 }
 
@@ -93,7 +97,6 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = {
 
 int kvm_coalesced_mmio_init(struct kvm *kvm)
 {
-       struct kvm_coalesced_mmio_dev *dev;
        struct page *page;
        int ret;
 
@@ -101,31 +104,18 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
        if (!page)
                goto out_err;
-       kvm->coalesced_mmio_ring = page_address(page);
-
-       ret = -ENOMEM;
-       dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
-       if (!dev)
-               goto out_free_page;
-       spin_lock_init(&dev->lock);
-       kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
-       dev->kvm = kvm;
-       kvm->coalesced_mmio_dev = dev;
 
-       mutex_lock(&kvm->slots_lock);
-       ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev);
-       mutex_unlock(&kvm->slots_lock);
-       if (ret < 0)
-               goto out_free_dev;
+       ret = 0;
+       kvm->coalesced_mmio_ring = page_address(page);
 
-       return ret;
+       /*
+        * We're using this spinlock to sync access to the coalesced ring.
+        * The list doesn't need its own lock since device registration and
+        * unregistration should only happen when kvm->slots_lock is held.
+        */
+       spin_lock_init(&kvm->ring_lock);
+       INIT_LIST_HEAD(&kvm->coalesced_zones);
 
-out_free_dev:
-       kvm->coalesced_mmio_dev = NULL;
-       kfree(dev);
-out_free_page:
-       kvm->coalesced_mmio_ring = NULL;
-       __free_page(page);
 out_err:
        return ret;
 }
@@ -139,51 +129,50 @@ void kvm_coalesced_mmio_free(struct kvm *kvm)
 int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
                                         struct kvm_coalesced_mmio_zone *zone)
 {
-       struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
+       int ret;
+       struct kvm_coalesced_mmio_dev *dev;
 
-       if (dev == NULL)
-               return -ENXIO;
+       dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
+       if (!dev)
+               return -ENOMEM;
+
+       kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
+       dev->kvm = kvm;
+       dev->zone = *zone;
 
        mutex_lock(&kvm->slots_lock);
-       if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) {
-               mutex_unlock(&kvm->slots_lock);
-               return -ENOBUFS;
-       }
+       ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, zone->addr,
+                                     zone->size, &dev->dev);
+       if (ret < 0)
+               goto out_free_dev;
+       list_add_tail(&dev->list, &kvm->coalesced_zones);
+       mutex_unlock(&kvm->slots_lock);
 
-       dev->zone[dev->nb_zones] = *zone;
-       dev->nb_zones++;
+       return ret;
 
+out_free_dev:
        mutex_unlock(&kvm->slots_lock);
+
+       kfree(dev);
+
-       return 0;
+       return ret;
 }
 
 int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
                                           struct kvm_coalesced_mmio_zone *zone)
 {
-       int i;
-       struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
-       struct kvm_coalesced_mmio_zone *z;
-
-       if (dev == NULL)
-               return -ENXIO;
+       struct kvm_coalesced_mmio_dev *dev, *tmp;
 
        mutex_lock(&kvm->slots_lock);
 
-       i = dev->nb_zones;
-       while (i) {
-               z = &dev->zone[i - 1];
-
-               /* unregister all zones
-                * included in (zone->addr, zone->size)
-                */
-
-               if (zone->addr <= z->addr &&
-                   z->addr + z->size <= zone->addr + zone->size) {
-                       dev->nb_zones--;
-                       *z = dev->zone[dev->nb_zones];
+       list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list)
+               if (coalesced_mmio_in_range(dev, zone->addr, zone->size)) {
+                       kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dev->dev);
+                       kvm_iodevice_destructor(&dev->dev);
                }
-               i--;
-       }
 
        mutex_unlock(&kvm->slots_lock);
 
index 8a5959e..b280c20 100644 (file)
 
 #ifdef CONFIG_KVM_MMIO
 
-#define KVM_COALESCED_MMIO_ZONE_MAX 100
+#include <linux/list.h>
 
 struct kvm_coalesced_mmio_dev {
+       struct list_head list;
        struct kvm_io_device dev;
        struct kvm *kvm;
-       spinlock_t lock;
-       int nb_zones;
-       struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX];
+       struct kvm_coalesced_mmio_zone zone;
 };
 
 int kvm_coalesced_mmio_init(struct kvm *kvm);
index 73358d2..f59c1e8 100644 (file)
@@ -586,7 +586,8 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 
        kvm_iodevice_init(&p->dev, &ioeventfd_ops);
 
-       ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev);
+       ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
+                                     &p->dev);
        if (ret < 0)
                goto unlock_fail;
 
index 8df1ca1..3eed61e 100644 (file)
@@ -394,7 +394,8 @@ int kvm_ioapic_init(struct kvm *kvm)
        kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
        ioapic->kvm = kvm;
        mutex_lock(&kvm->slots_lock);
-       ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+       ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address,
+                                     IOAPIC_MEM_LENGTH, &ioapic->dev);
        mutex_unlock(&kvm->slots_lock);
        if (ret < 0) {
                kvm->arch.vioapic = NULL;
index aefdda3..d9cfb78 100644 (file)
@@ -47,6 +47,8 @@
 #include <linux/srcu.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/bsearch.h>
 
 #include <asm/processor.h>
 #include <asm/io.h>
@@ -2391,24 +2393,92 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
        int i;
 
        for (i = 0; i < bus->dev_count; i++) {
-               struct kvm_io_device *pos = bus->devs[i];
+               struct kvm_io_device *pos = bus->range[i].dev;
 
                kvm_iodevice_destructor(pos);
        }
        kfree(bus);
 }
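+/*
+ * Comparator shared by sort() and bsearch(): ranges order by start
+ * address, and a key compares equal to any registered range that fully
+ * contains it, so a lookup on (addr, len) lands on a covering device.
+ */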
 
+int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
+{
+       const struct kvm_io_range *r1 = p1;
+       const struct kvm_io_range *r2 = p2;
+
+       if (r1->addr < r2->addr)
+               return -1;
+       if (r1->addr + r1->len > r2->addr + r2->len)
+               return 1;
+       return 0;
+}
+
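+/*
+ * Inserts a device on the bus and re-sorts the range array so lookups
+ * can keep using bsearch(); fails with -ENOSPC once the bus is full.
+ */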
+int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
+                         gpa_t addr, int len)
+{
+       if (bus->dev_count == NR_IOBUS_DEVS)
+               return -ENOSPC;
+
+       bus->range[bus->dev_count++] = (struct kvm_io_range) {
+               .addr = addr,
+               .len = len,
+               .dev = dev,
+       };
+
+       sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range),
+               kvm_io_bus_sort_cmp, NULL);
+
+       return 0;
+}
+
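+/*
+ * bsearch() may land on any of several equal-comparing ranges; walk
+ * back to the first one so callers can scan forward over all matches.
+ * Returns -ENOENT when no registered range covers (addr, len).
+ */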
+int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
+                            gpa_t addr, int len)
+{
+       struct kvm_io_range *range, key;
+       int off;
+
+       key = (struct kvm_io_range) {
+               .addr = addr,
+               .len = len,
+       };
+
+       range = bsearch(&key, bus->range, bus->dev_count,
+                       sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
+       if (range == NULL)
+               return -ENOENT;
+
+       off = range - bus->range;
+
+       while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
+               off--;
+
+       return off;
+}
+
 /* kvm_io_bus_write - called under kvm->slots_lock */
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                     int len, const void *val)
 {
-       int i;
+       int idx;
        struct kvm_io_bus *bus;
+       struct kvm_io_range range;
+
+       range = (struct kvm_io_range) {
+               .addr = addr,
+               .len = len,
+       };
 
        bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-       for (i = 0; i < bus->dev_count; i++)
-               if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
+       idx = kvm_io_bus_get_first_dev(bus, addr, len);
+       if (idx < 0)
+               return -EOPNOTSUPP;
+
+       while (idx < bus->dev_count &&
+               kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
+               if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
                        return 0;
+               idx++;
+       }
+
        return -EOPNOTSUPP;
 }
 
@@ -2416,19 +2486,33 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                    int len, void *val)
 {
-       int i;
+       int idx;
        struct kvm_io_bus *bus;
+       struct kvm_io_range range;
+
+       range = (struct kvm_io_range) {
+               .addr = addr,
+               .len = len,
+       };
 
        bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-       for (i = 0; i < bus->dev_count; i++)
-               if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
+       idx = kvm_io_bus_get_first_dev(bus, addr, len);
+       if (idx < 0)
+               return -EOPNOTSUPP;
+
+       while (idx < bus->dev_count &&
+               kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
+               if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
                        return 0;
+               idx++;
+       }
+
        return -EOPNOTSUPP;
 }
 
 /* Caller must hold slots_lock. */
-int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-                           struct kvm_io_device *dev)
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+                           int len, struct kvm_io_device *dev)
 {
        struct kvm_io_bus *new_bus, *bus;
 
@@ -2440,7 +2524,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
        if (!new_bus)
                return -ENOMEM;
        memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
-       new_bus->devs[new_bus->dev_count++] = dev;
+       kvm_io_bus_insert_dev(new_bus, dev, addr, len);
        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
        synchronize_srcu_expedited(&kvm->srcu);
        kfree(bus);
@@ -2464,9 +2548,13 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 
        r = -ENOENT;
        for (i = 0; i < new_bus->dev_count; i++)
-               if (new_bus->devs[i] == dev) {
+               if (new_bus->range[i].dev == dev) {
                        r = 0;
-                       new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
+                       new_bus->dev_count--;
+                       new_bus->range[i] = new_bus->range[new_bus->dev_count];
+                       sort(new_bus->range, new_bus->dev_count,
+                            sizeof(struct kvm_io_range),
+                            kvm_io_bus_sort_cmp, NULL);
                        break;
                }