Merge tag 'kvm-3.15-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
authorLinus Torvalds <torvalds@linux-foundation.org>
Wed, 2 Apr 2014 21:50:10 +0000 (14:50 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 2 Apr 2014 21:50:10 +0000 (14:50 -0700)
Pull kvm updates from Paolo Bonzini:
 "PPC and ARM do not have much going on this time.  Most of the cool
  stuff, instead, is in s390 and (after a few releases) x86.

  ARM has some caching fixes and PPC has transactional memory support in
  guests.  MIPS has some fixes, with more probably coming in 3.16 as
  QEMU will soon get support for MIPS KVM.

  For x86 there are optimizations for debug registers, which trigger on
  some Windows games, and other important fixes for Windows guests.  We
  now expose to the guest Broadwell instruction set extensions and also
  Intel MPX.  There's also a fix/workaround for OS X guests, nested
  virtualization features (preemption timer), and a couple kvmclock
  refinements.

  For s390, the main news is asynchronous page faults, together with
  improvements to IRQs (floating irqs and adapter irqs) that speed up
  virtio devices"

* tag 'kvm-3.15-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (96 commits)
  KVM: PPC: Book3S HV: Save/restore host PMU registers that are new in POWER8
  KVM: PPC: Book3S HV: Fix decrementer timeouts with non-zero TB offset
  KVM: PPC: Book3S HV: Don't use kvm_memslots() in real mode
  KVM: PPC: Book3S HV: Return ENODEV error rather than EIO
  KVM: PPC: Book3S: Trim top 4 bits of physical address in RTAS code
  KVM: PPC: Book3S HV: Add get/set_one_reg for new TM state
  KVM: PPC: Book3S HV: Add transactional memory support
  KVM: Specify byte order for KVM_EXIT_MMIO
  KVM: vmx: fix MPX detection
  KVM: PPC: Book3S HV: Fix KVM hang with CONFIG_KVM_XICS=n
  KVM: PPC: Book3S: Introduce hypervisor call H_GET_TCE
  KVM: PPC: Book3S HV: Fix incorrect userspace exit on ioeventfd write
  KVM: s390: clear local interrupts at cpu initial reset
  KVM: s390: Fix possible memory leak in SIGP functions
  KVM: s390: fix calculation of idle_mask array size
  KVM: s390: randomize sca address
  KVM: ioapic: reinject pending interrupts on KVM_SET_IRQCHIP
  KVM: Bump KVM_MAX_IRQ_ROUTES for s390
  KVM: s390: irq routing for adapter interrupts.
  KVM: s390: adapter interrupt sources
  ...

74 files changed:
Documentation/virtual/kvm/api.txt
Documentation/virtual/kvm/devices/s390_flic.txt [new file with mode: 0644]
arch/arm/include/asm/kvm_arm.h
arch/arm/include/asm/kvm_asm.h
arch/arm/include/asm/kvm_host.h
arch/arm/include/asm/kvm_mmu.h
arch/arm/kernel/asm-offsets.c
arch/arm/kvm/coproc.c
arch/arm/kvm/coproc.h
arch/arm/kvm/coproc_a15.c
arch/arm/kvm/coproc_a7.c
arch/arm/kvm/guest.c
arch/arm/kvm/interrupts_head.S
arch/arm/kvm/mmu.c
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_asm.h
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/kvm/sys_regs.c
arch/arm64/kvm/sys_regs.h
arch/ia64/kvm/kvm-ia64.c
arch/mips/include/asm/kvm_host.h
arch/mips/kvm/kvm_mips_emul.c
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_book3s_64.h
arch/powerpc/include/asm/kvm_book3s_asm.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/reg.h
arch/powerpc/include/asm/tm.h
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_64_vio_hv.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_interrupts.S
arch/powerpc/kvm/book3s_hv_rm_mmu.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_rtas.c
arch/s390/include/asm/irq.h
arch/s390/include/asm/kvm_host.h
arch/s390/include/asm/pgtable.h
arch/s390/include/asm/processor.h
arch/s390/include/uapi/asm/kvm.h
arch/s390/kernel/irq.c
arch/s390/kvm/Kconfig
arch/s390/kvm/Makefile
arch/s390/kvm/diag.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/irq.h [new file with mode: 0644]
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/priv.c
arch/s390/kvm/sigp.c
arch/s390/kvm/trace.h
arch/s390/mm/fault.c
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/vmx.h
arch/x86/include/asm/xsave.h
arch/x86/include/uapi/asm/msr-index.h
arch/x86/kernel/kvm.c
arch/x86/kernel/kvmclock.c
arch/x86/kvm/cpuid.c
arch/x86/kvm/emulate.c
arch/x86/kvm/mmu.c
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
drivers/s390/kvm/virtio_ccw.c
include/linux/kvm_host.h
include/uapi/linux/kvm.h
virt/kvm/Kconfig
virt/kvm/async_pf.c
virt/kvm/eventfd.c
virt/kvm/ioapic.c
virt/kvm/kvm_main.c

index 6cd63a9..c24211d 100644 (file)
@@ -586,8 +586,8 @@ struct kvm_fpu {
 
 4.24 KVM_CREATE_IRQCHIP
 
-Capability: KVM_CAP_IRQCHIP
-Architectures: x86, ia64, ARM, arm64
+Capability: KVM_CAP_IRQCHIP, KVM_CAP_S390_IRQCHIP (s390)
+Architectures: x86, ia64, ARM, arm64, s390
 Type: vm ioctl
 Parameters: none
 Returns: 0 on success, -1 on error
@@ -596,7 +596,10 @@ Creates an interrupt controller model in the kernel.  On x86, creates a virtual
 ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a
 local APIC.  IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23
 only go to the IOAPIC.  On ia64, a IOSAPIC is created. On ARM/arm64, a GIC is
-created.
+created. On s390, a dummy irq routing table is created.
+
+Note that on s390 the KVM_CAP_S390_IRQCHIP vm capability needs to be enabled
+before KVM_CREATE_IRQCHIP can be used.
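
An illustrative userspace sketch of the s390 sequence above (not part of the
patch itself), assuming <linux/kvm.h> and <sys/ioctl.h> are included and
vm_fd is an already-created VM file descriptor:

	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_S390_IRQCHIP,
	};

	/* The capability must be enabled on the VM first ... */
	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
		perror("KVM_ENABLE_CAP");
	/* ... only then may the irqchip be created. */
	if (ioctl(vm_fd, KVM_CREATE_IRQCHIP) < 0)
		perror("KVM_CREATE_IRQCHIP");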
 
 
 4.25 KVM_IRQ_LINE
@@ -612,6 +615,20 @@ On some architectures it is required that an interrupt controller model has
 been previously created with KVM_CREATE_IRQCHIP.  Note that edge-triggered
 interrupts require the level to be set to 1 and then back to 0.
 
+On real hardware, interrupt pins can be active-low or active-high.  This
+does not matter for the level field of struct kvm_irq_level: 1 always
+means active (asserted), 0 means inactive (deasserted).
+
+x86 allows the operating system to program the interrupt polarity
+(active-low/active-high) for level-triggered interrupts, and KVM used
+to consider the polarity.  However, due to bitrot in the handling of
+active-low interrupts, the above convention is now valid on x86 too.
+This is signaled by KVM_CAP_IOAPIC_POLARITY_IGNORED.  Userspace
+should not present interrupts to the guest as active-low unless this
+capability is present (or unless it is not using the in-kernel irqchip,
+of course).
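
An illustrative pulse of an edge-triggered GSI (not part of the patch above),
assuming vm_fd is a VM file descriptor with an in-kernel irqchip:

	struct kvm_irq_level irq = {
		.irq   = 10,	/* GSI to trigger */
		.level = 1,	/* 1 = assert (active) ... */
	};

	ioctl(vm_fd, KVM_IRQ_LINE, &irq);
	irq.level = 0;		/* ... then 0 = deassert to complete the edge */
	ioctl(vm_fd, KVM_IRQ_LINE, &irq);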
+
+
 ARM/arm64 can signal an interrupt either at the CPU level, or at the
 in-kernel irqchip (GIC), and for in-kernel irqchip can tell the GIC to
 use PPIs designated for specific cpus.  The irq field is interpreted
@@ -628,7 +645,7 @@ The irq_type field has the following values:
 
 (The irq_id field thus corresponds nicely to the IRQ ID in the ARM GIC specs)
 
-In both cases, level is used to raise/lower the line.
+In both cases, level is used to assert/deassert the line.
 
 struct kvm_irq_level {
        union {
@@ -918,9 +935,9 @@ documentation when it pops into existence).
 
 4.37 KVM_ENABLE_CAP
 
-Capability: KVM_CAP_ENABLE_CAP
+Capability: KVM_CAP_ENABLE_CAP, KVM_CAP_ENABLE_CAP_VM
 Architectures: ppc, s390
-Type: vcpu ioctl
+Type: vcpu ioctl, vm ioctl (with KVM_CAP_ENABLE_CAP_VM)
 Parameters: struct kvm_enable_cap (in)
 Returns: 0 on success; -1 on error
 
@@ -951,6 +968,8 @@ function properly, this is the place to put them.
        __u8  pad[64];
 };
 
+The vcpu ioctl should be used for vcpu-specific capabilities, the vm ioctl
+for vm-wide capabilities.
 
 4.38 KVM_GET_MP_STATE
 
@@ -1320,7 +1339,7 @@ KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed.
 4.52 KVM_SET_GSI_ROUTING
 
 Capability: KVM_CAP_IRQ_ROUTING
-Architectures: x86 ia64
+Architectures: x86 ia64 s390
 Type: vm ioctl
 Parameters: struct kvm_irq_routing (in)
 Returns: 0 on success, -1 on error
@@ -1343,6 +1362,7 @@ struct kvm_irq_routing_entry {
        union {
                struct kvm_irq_routing_irqchip irqchip;
                struct kvm_irq_routing_msi msi;
+               struct kvm_irq_routing_s390_adapter adapter;
                __u32 pad[8];
        } u;
 };
@@ -1350,6 +1370,7 @@ struct kvm_irq_routing_entry {
 /* gsi routing entry types */
 #define KVM_IRQ_ROUTING_IRQCHIP 1
 #define KVM_IRQ_ROUTING_MSI 2
+#define KVM_IRQ_ROUTING_S390_ADAPTER 3
 
 No flags are specified so far, the corresponding field must be set to zero.
 
@@ -1365,6 +1386,14 @@ struct kvm_irq_routing_msi {
        __u32 pad;
 };
 
+struct kvm_irq_routing_s390_adapter {
+       __u64 ind_addr;
+       __u64 summary_addr;
+       __u64 ind_offset;
+       __u32 summary_offset;
+       __u32 adapter_id;
+};
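
An illustrative routing update with a single adapter entry (not part of the
patch above); ind_gpa and summary_gpa stand for guest addresses the caller
has already set up, and vm_fd for the VM file descriptor:

	struct {
		struct kvm_irq_routing info;
		struct kvm_irq_routing_entry entry;
	} routing = {
		.info.nr = 1,			/* one routing entry follows */
		.entry = {
			.gsi  = 0,
			.type = KVM_IRQ_ROUTING_S390_ADAPTER,
			.u.adapter = {
				.ind_addr     = ind_gpa,
				.summary_addr = summary_gpa,
				.adapter_id   = 1,
			},
		},
	};

	ioctl(vm_fd, KVM_SET_GSI_ROUTING, &routing);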
+
 
 4.53 KVM_ASSIGN_SET_MSIX_NR
 
@@ -2566,6 +2595,10 @@ executed a memory-mapped I/O instruction which could not be satisfied
 by kvm.  The 'data' member contains the written data if 'is_write' is
 true, and should be filled by application code otherwise.
 
+The 'data' member contains, in its first 'len' bytes, the value as it would
+appear if the VCPU performed a load or store of the appropriate width directly
+to the byte array.
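
An illustrative exit-handler fragment (not part of the patch above); run is
the mmap'ed struct kvm_run, and device_read()/device_write() are hypothetical
helpers of the device model:

	case KVM_EXIT_MMIO:
		/* 'data' already holds the bytes in guest operand order. */
		if (run->mmio.is_write)
			device_write(run->mmio.phys_addr,
				     run->mmio.data, run->mmio.len);
		else
			device_read(run->mmio.phys_addr,
				    run->mmio.data, run->mmio.len);
		break;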
+
 NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR,
       KVM_EXIT_PAPR and KVM_EXIT_EPR the corresponding
 operations are complete (and guest state is consistent) only after userspace
diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
new file mode 100644 (file)
index 0000000..4ceef53
--- /dev/null
@@ -0,0 +1,91 @@
+FLIC (floating interrupt controller)
+====================================
+
+FLIC handles floating (non per-cpu) interrupts, i.e. I/O, service and some
+machine check interruptions. All interrupts are stored in a per-vm list of
+pending interrupts. FLIC performs operations on this list.
+
+Only one FLIC instance may be instantiated.
+
+FLIC provides support to
+- add interrupts (KVM_DEV_FLIC_ENQUEUE)
+- inspect currently pending interrupts (KVM_DEV_FLIC_GET_ALL_IRQS)
+- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS)
+- enable/disable for the guest transparent async page faults
+- register and modify adapter interrupt sources (KVM_DEV_FLIC_ADAPTER_*)
+
+Groups:
+  KVM_DEV_FLIC_ENQUEUE
+    Passes a buffer and length into the kernel which are then injected into
+    the list of pending interrupts.
+    attr->addr contains the pointer to the buffer and attr->attr contains
+    the length of the buffer.
+    The format of the data structure kvm_s390_irq as it is copied from userspace
+    is defined in usr/include/linux/kvm.h.
+
+  KVM_DEV_FLIC_GET_ALL_IRQS
+    Copies all floating interrupts into a buffer provided by userspace.
+    When the buffer is too small it returns -ENOMEM, which is the indication
+    for userspace to try again with a bigger buffer.
+    All interrupts remain pending, i.e. are not deleted from the list of
+    currently pending interrupts.
+    attr->addr contains the userspace address of the buffer into which all
+    interrupt data will be copied.
+    attr->attr contains the size of the buffer in bytes.
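
    An illustrative retrieval loop (not part of the patch above), assuming
    flic_fd is the file descriptor returned by KVM_CREATE_DEVICE for the FLIC:

	size_t len = 8192;
	void *buf = malloc(len);
	struct kvm_device_attr attr = {
		.group = KVM_DEV_FLIC_GET_ALL_IRQS,
		.addr  = (__u64)(unsigned long)buf,
		.attr  = len,
	};

	while (ioctl(flic_fd, KVM_GET_DEVICE_ATTR, &attr) < 0 &&
	       errno == ENOMEM) {
		len *= 2;			/* buffer too small, retry bigger */
		buf = realloc(buf, len);
		attr.addr = (__u64)(unsigned long)buf;
		attr.attr = len;
	}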
+
+  KVM_DEV_FLIC_CLEAR_IRQS
+    Simply deletes all elements from the list of currently pending floating
+    interrupts.  No interrupts are injected into the guest.
+
+  KVM_DEV_FLIC_APF_ENABLE
+    Enables async page faults for the guest, so that in case of a major page
+    fault the host is allowed to handle it asynchronously while the guest
+    continues to run.
+
+  KVM_DEV_FLIC_APF_DISABLE_WAIT
+    Disables async page faults for the guest and waits until already pending
+    async page faults are done. This is necessary to trigger a completion interrupt
+    for every init interrupt before migrating the interrupt list.
+
+  KVM_DEV_FLIC_ADAPTER_REGISTER
+    Register an I/O adapter interrupt source. Takes a kvm_s390_io_adapter
+    describing the adapter to register:
+
+struct kvm_s390_io_adapter {
+       __u32 id;
+       __u8 isc;
+       __u8 maskable;
+       __u8 swap;
+       __u8 pad;
+};
+
+   id contains the unique id for the adapter, isc the I/O interruption subclass
+   to use, maskable whether this adapter may be masked (interrupts turned off)
+   and swap whether the indicators need to be byte swapped.
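
    An illustrative registration call (not part of the patch above), again
    assuming flic_fd is the FLIC device file descriptor:

	struct kvm_s390_io_adapter adapter = {
		.id       = 1,
		.isc      = 3,		/* I/O interruption subclass */
		.maskable = 1,
	};
	struct kvm_device_attr attr = {
		.group = KVM_DEV_FLIC_ADAPTER_REGISTER,
		.addr  = (__u64)(unsigned long)&adapter,
	};

	ioctl(flic_fd, KVM_SET_DEVICE_ATTR, &attr);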
+
+
+  KVM_DEV_FLIC_ADAPTER_MODIFY
+    Modifies attributes of an existing I/O adapter interrupt source. Takes
+    a kvm_s390_io_adapter_req specifying the adapter and the operation:
+
+struct kvm_s390_io_adapter_req {
+       __u32 id;
+       __u8 type;
+       __u8 mask;
+       __u16 pad0;
+       __u64 addr;
+};
+
+    id specifies the adapter and type the operation. The supported operations
+    are:
+
+    KVM_S390_IO_ADAPTER_MASK
+      mask or unmask the adapter, as specified in mask
+
+    KVM_S390_IO_ADAPTER_MAP
+      perform a gmap translation for the guest address provided in addr,
+      pin a userspace page for the translated address and add it to the
+      list of mappings
+
+    KVM_S390_IO_ADAPTER_UNMAP
+      release a userspace page for the translated address specified in addr
+      from the list of mappings
index 1d3153c..816db0b 100644 (file)
@@ -55,6 +55,7 @@
  * The bits we set in HCR:
  * TAC:                Trap ACTLR
  * TSC:                Trap SMC
+ * TVM:                Trap VM ops (until MMU and caches are on)
  * TSW:                Trap cache operations by set/way
  * TWI:                Trap WFI
  * TWE:                Trap WFE
@@ -68,8 +69,7 @@
  */
 #define HCR_GUEST_MASK (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \
                        HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \
-                       HCR_TWE | HCR_SWIO | HCR_TIDCP)
-#define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF)
+                       HCR_TVM | HCR_TWE | HCR_SWIO | HCR_TIDCP)
 
 /* System Control Register (SCTLR) bits */
 #define SCTLR_TE       (1 << 30)
index 661da11..53b3c4a 100644 (file)
@@ -48,7 +48,9 @@
 #define c13_TID_URO    26      /* Thread ID, User R/O */
 #define c13_TID_PRIV   27      /* Thread ID, Privileged */
 #define c14_CNTKCTL    28      /* Timer Control Register (PL1) */
-#define NR_CP15_REGS   29      /* Number of regs (incl. invalid) */
+#define c10_AMAIR0     29      /* Auxiliary Memory Attribute Indirection Reg0 */
+#define c10_AMAIR1     30      /* Auxiliary Memory Attribute Indirection Reg1 */
+#define NR_CP15_REGS   31      /* Number of regs (incl. invalid) */
 
 #define ARM_EXCEPTION_RESET      0
 #define ARM_EXCEPTION_UNDEFINED   1
index 098f7dd..09af149 100644 (file)
@@ -101,6 +101,12 @@ struct kvm_vcpu_arch {
        /* The CPU type we expose to the VM */
        u32 midr;
 
+       /* HYP trapping configuration */
+       u32 hcr;
+
+       /* Interrupt related fields */
+       u32 irq_lines;          /* IRQ and FIQ levels */
+
        /* Exception Information */
        struct kvm_vcpu_fault_info fault;
 
@@ -128,9 +134,6 @@ struct kvm_vcpu_arch {
        /* IO related fields */
        struct kvm_decode mmio_decode;
 
-       /* Interrupt related fields */
-       u32 irq_lines;          /* IRQ and FIQ levels */
-
        /* Cache some mmu pages needed inside spinlock regions */
        struct kvm_mmu_memory_cache mmu_page_cache;
 
index 2d122ad..5c7aa3c 100644 (file)
@@ -114,11 +114,34 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
        pmd_val(*pmd) |= L_PMD_S2_RDWR;
 }
 
+/* Open coded p*d_addr_end that can deal with 64bit addresses */
+#define kvm_pgd_addr_end(addr, end)                                    \
+({     u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;            \
+       (__boundary - 1 < (end) - 1)? __boundary: (end);                \
+})
+
+#define kvm_pud_addr_end(addr,end)             (end)
+
+#define kvm_pmd_addr_end(addr, end)                                    \
+({     u64 __boundary = ((addr) + PMD_SIZE) & PMD_MASK;                \
+       (__boundary - 1 < (end) - 1)? __boundary: (end);                \
+})
+
 struct kvm;
 
-static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
-                                             unsigned long size)
+#define kvm_flush_dcache_to_poc(a,l)   __cpuc_flush_dcache_area((a), (l))
+
+static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 {
+       return (vcpu->arch.cp15[c1_SCTLR] & 0b101) == 0b101;
+}
+
+static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
+                                            unsigned long size)
+{
+       if (!vcpu_has_cache_enabled(vcpu))
+               kvm_flush_dcache_to_poc((void *)hva, size);
+       
        /*
         * If we are going to insert an instruction page and the icache is
         * either VIPT or PIPT, there is a potential problem where the host
@@ -139,9 +162,10 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
        }
 }
 
-#define kvm_flush_dcache_to_poc(a,l)   __cpuc_flush_dcache_area((a), (l))
 #define kvm_virt_to_phys(x)            virt_to_idmap((unsigned long)(x))
 
+void stage2_flush_vm(struct kvm *kvm);
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __ARM_KVM_MMU_H__ */
index ded0417..85598b5 100644 (file)
@@ -174,6 +174,7 @@ int main(void)
   DEFINE(VCPU_FIQ_REGS,                offsetof(struct kvm_vcpu, arch.regs.fiq_regs));
   DEFINE(VCPU_PC,              offsetof(struct kvm_vcpu, arch.regs.usr_regs.ARM_pc));
   DEFINE(VCPU_CPSR,            offsetof(struct kvm_vcpu, arch.regs.usr_regs.ARM_cpsr));
+  DEFINE(VCPU_HCR,             offsetof(struct kvm_vcpu, arch.hcr));
   DEFINE(VCPU_IRQ_LINES,       offsetof(struct kvm_vcpu, arch.irq_lines));
   DEFINE(VCPU_HSR,             offsetof(struct kvm_vcpu, arch.fault.hsr));
   DEFINE(VCPU_HxFAR,           offsetof(struct kvm_vcpu, arch.fault.hxfar));
index 78c0885..c58a351 100644 (file)
@@ -23,6 +23,7 @@
 #include <asm/kvm_host.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_coproc.h>
+#include <asm/kvm_mmu.h>
 #include <asm/cacheflush.h>
 #include <asm/cputype.h>
 #include <trace/events/kvm.h>
@@ -204,6 +205,44 @@ done:
        return true;
 }
 
+/*
+ * Generic accessor for VM registers. Only called as long as HCR_TVM
+ * is set.
+ */
+static bool access_vm_reg(struct kvm_vcpu *vcpu,
+                         const struct coproc_params *p,
+                         const struct coproc_reg *r)
+{
+       BUG_ON(!p->is_write);
+
+       vcpu->arch.cp15[r->reg] = *vcpu_reg(vcpu, p->Rt1);
+       if (p->is_64bit)
+               vcpu->arch.cp15[r->reg + 1] = *vcpu_reg(vcpu, p->Rt2);
+
+       return true;
+}
+
+/*
+ * SCTLR accessor. Only called as long as HCR_TVM is set.  If the
+ * guest enables the MMU, we stop trapping the VM sys_regs and leave
+ * it in complete control of the caches.
+ *
+ * Used by the cpu-specific code.
+ */
+bool access_sctlr(struct kvm_vcpu *vcpu,
+                 const struct coproc_params *p,
+                 const struct coproc_reg *r)
+{
+       access_vm_reg(vcpu, p, r);
+
+       if (vcpu_has_cache_enabled(vcpu)) {     /* MMU+Caches enabled? */
+               vcpu->arch.hcr &= ~HCR_TVM;
+               stage2_flush_vm(vcpu->kvm);
+       }
+
+       return true;
+}
+
 /*
  * We could trap ID_DFR0 and tell the guest we don't support performance
  * monitoring.  Unfortunately the patch to make the kernel check ID_DFR0 was
@@ -261,33 +300,36 @@ static const struct coproc_reg cp15_regs[] = {
        { CRn( 1), CRm( 0), Op1( 0), Op2( 2), is32,
                        NULL, reset_val, c1_CPACR, 0x00000000 },
 
-       /* TTBR0/TTBR1: swapped by interrupt.S. */
-       { CRm64( 2), Op1( 0), is64, NULL, reset_unknown64, c2_TTBR0 },
-       { CRm64( 2), Op1( 1), is64, NULL, reset_unknown64, c2_TTBR1 },
-
-       /* TTBCR: swapped by interrupt.S. */
+       /* TTBR0/TTBR1/TTBCR: swapped by interrupt.S. */
+       { CRm64( 2), Op1( 0), is64, access_vm_reg, reset_unknown64, c2_TTBR0 },
+       { CRn(2), CRm( 0), Op1( 0), Op2( 0), is32,
+                       access_vm_reg, reset_unknown, c2_TTBR0 },
+       { CRn(2), CRm( 0), Op1( 0), Op2( 1), is32,
+                       access_vm_reg, reset_unknown, c2_TTBR1 },
        { CRn( 2), CRm( 0), Op1( 0), Op2( 2), is32,
-                       NULL, reset_val, c2_TTBCR, 0x00000000 },
+                       access_vm_reg, reset_val, c2_TTBCR, 0x00000000 },
+       { CRm64( 2), Op1( 1), is64, access_vm_reg, reset_unknown64, c2_TTBR1 },
+
 
        /* DACR: swapped by interrupt.S. */
        { CRn( 3), CRm( 0), Op1( 0), Op2( 0), is32,
-                       NULL, reset_unknown, c3_DACR },
+                       access_vm_reg, reset_unknown, c3_DACR },
 
        /* DFSR/IFSR/ADFSR/AIFSR: swapped by interrupt.S. */
        { CRn( 5), CRm( 0), Op1( 0), Op2( 0), is32,
-                       NULL, reset_unknown, c5_DFSR },
+                       access_vm_reg, reset_unknown, c5_DFSR },
        { CRn( 5), CRm( 0), Op1( 0), Op2( 1), is32,
-                       NULL, reset_unknown, c5_IFSR },
+                       access_vm_reg, reset_unknown, c5_IFSR },
        { CRn( 5), CRm( 1), Op1( 0), Op2( 0), is32,
-                       NULL, reset_unknown, c5_ADFSR },
+                       access_vm_reg, reset_unknown, c5_ADFSR },
        { CRn( 5), CRm( 1), Op1( 0), Op2( 1), is32,
-                       NULL, reset_unknown, c5_AIFSR },
+                       access_vm_reg, reset_unknown, c5_AIFSR },
 
        /* DFAR/IFAR: swapped by interrupt.S. */
        { CRn( 6), CRm( 0), Op1( 0), Op2( 0), is32,
-                       NULL, reset_unknown, c6_DFAR },
+                       access_vm_reg, reset_unknown, c6_DFAR },
        { CRn( 6), CRm( 0), Op1( 0), Op2( 2), is32,
-                       NULL, reset_unknown, c6_IFAR },
+                       access_vm_reg, reset_unknown, c6_IFAR },
 
        /* PAR swapped by interrupt.S */
        { CRm64( 7), Op1( 0), is64, NULL, reset_unknown64, c7_PAR },
@@ -324,9 +366,15 @@ static const struct coproc_reg cp15_regs[] = {
 
        /* PRRR/NMRR (aka MAIR0/MAIR1): swapped by interrupt.S. */
        { CRn(10), CRm( 2), Op1( 0), Op2( 0), is32,
-                       NULL, reset_unknown, c10_PRRR},
+                       access_vm_reg, reset_unknown, c10_PRRR},
        { CRn(10), CRm( 2), Op1( 0), Op2( 1), is32,
-                       NULL, reset_unknown, c10_NMRR},
+                       access_vm_reg, reset_unknown, c10_NMRR},
+
+       /* AMAIR0/AMAIR1: swapped by interrupt.S. */
+       { CRn(10), CRm( 3), Op1( 0), Op2( 0), is32,
+                       access_vm_reg, reset_unknown, c10_AMAIR0},
+       { CRn(10), CRm( 3), Op1( 0), Op2( 1), is32,
+                       access_vm_reg, reset_unknown, c10_AMAIR1},
 
        /* VBAR: swapped by interrupt.S. */
        { CRn(12), CRm( 0), Op1( 0), Op2( 0), is32,
@@ -334,7 +382,7 @@ static const struct coproc_reg cp15_regs[] = {
 
        /* CONTEXTIDR/TPIDRURW/TPIDRURO/TPIDRPRW: swapped by interrupt.S. */
        { CRn(13), CRm( 0), Op1( 0), Op2( 1), is32,
-                       NULL, reset_val, c13_CID, 0x00000000 },
+                       access_vm_reg, reset_val, c13_CID, 0x00000000 },
        { CRn(13), CRm( 0), Op1( 0), Op2( 2), is32,
                        NULL, reset_unknown, c13_TID_URW },
        { CRn(13), CRm( 0), Op1( 0), Op2( 3), is32,
@@ -443,7 +491,7 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
        struct coproc_params params;
 
-       params.CRm = (kvm_vcpu_get_hsr(vcpu) >> 1) & 0xf;
+       params.CRn = (kvm_vcpu_get_hsr(vcpu) >> 1) & 0xf;
        params.Rt1 = (kvm_vcpu_get_hsr(vcpu) >> 5) & 0xf;
        params.is_write = ((kvm_vcpu_get_hsr(vcpu) & 1) == 0);
        params.is_64bit = true;
@@ -451,7 +499,7 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run)
        params.Op1 = (kvm_vcpu_get_hsr(vcpu) >> 16) & 0xf;
        params.Op2 = 0;
        params.Rt2 = (kvm_vcpu_get_hsr(vcpu) >> 10) & 0xf;
-       params.CRn = 0;
+       params.CRm = 0;
 
        return emulate_cp15(vcpu, &params);
 }
index 0461d5c..1a44bbe 100644 (file)
@@ -58,8 +58,8 @@ static inline void print_cp_instr(const struct coproc_params *p)
 {
        /* Look, we even formatted it for you to paste into the table! */
        if (p->is_64bit) {
-               kvm_pr_unimpl(" { CRm(%2lu), Op1(%2lu), is64, func_%s },\n",
-                             p->CRm, p->Op1, p->is_write ? "write" : "read");
+               kvm_pr_unimpl(" { CRm64(%2lu), Op1(%2lu), is64, func_%s },\n",
+                             p->CRn, p->Op1, p->is_write ? "write" : "read");
        } else {
                kvm_pr_unimpl(" { CRn(%2lu), CRm(%2lu), Op1(%2lu), Op2(%2lu), is32,"
                              " func_%s },\n",
@@ -135,13 +135,13 @@ static inline int cmp_reg(const struct coproc_reg *i1,
                return -1;
        if (i1->CRn != i2->CRn)
                return i1->CRn - i2->CRn;
-       if (i1->is_64 != i2->is_64)
-               return i2->is_64 - i1->is_64;
        if (i1->CRm != i2->CRm)
                return i1->CRm - i2->CRm;
        if (i1->Op1 != i2->Op1)
                return i1->Op1 - i2->Op1;
-       return i1->Op2 - i2->Op2;
+       if (i1->Op2 != i2->Op2)
+               return i1->Op2 - i2->Op2;
+       return i2->is_64 - i1->is_64;
 }
 
 
@@ -153,4 +153,8 @@ static inline int cmp_reg(const struct coproc_reg *i1,
 #define is64           .is_64 = true
 #define is32           .is_64 = false
 
+bool access_sctlr(struct kvm_vcpu *vcpu,
+                 const struct coproc_params *p,
+                 const struct coproc_reg *r);
+
 #endif /* __ARM_KVM_COPROC_LOCAL_H__ */
index bb0cac1..e6f4ae4 100644 (file)
@@ -34,7 +34,7 @@
 static const struct coproc_reg a15_regs[] = {
        /* SCTLR: swapped by interrupt.S. */
        { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
-                       NULL, reset_val, c1_SCTLR, 0x00C50078 },
+                       access_sctlr, reset_val, c1_SCTLR, 0x00C50078 },
 };
 
 static struct kvm_coproc_target_table a15_target_table = {
index 1df7673..17fc7cd 100644 (file)
@@ -37,7 +37,7 @@
 static const struct coproc_reg a7_regs[] = {
        /* SCTLR: swapped by interrupt.S. */
        { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
-                       NULL, reset_val, c1_SCTLR, 0x00C50878 },
+                       access_sctlr, reset_val, c1_SCTLR, 0x00C50878 },
 };
 
 static struct kvm_coproc_target_table a7_target_table = {
index 2786eae..b23a59c 100644 (file)
@@ -38,6 +38,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
+       vcpu->arch.hcr = HCR_GUEST_MASK;
        return 0;
 }
 
index 6f18695..76af930 100644 (file)
@@ -303,13 +303,17 @@ vcpu      .req    r0              @ vcpu pointer always in r0
 
        mrc     p15, 0, r2, c14, c1, 0  @ CNTKCTL
        mrrc    p15, 0, r4, r5, c7      @ PAR
+       mrc     p15, 0, r6, c10, c3, 0  @ AMAIR0
+       mrc     p15, 0, r7, c10, c3, 1  @ AMAIR1
 
        .if \store_to_vcpu == 0
-       push    {r2,r4-r5}
+       push    {r2,r4-r7}
        .else
        str     r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)]
        add     r12, vcpu, #CP15_OFFSET(c7_PAR)
        strd    r4, r5, [r12]
+       str     r6, [vcpu, #CP15_OFFSET(c10_AMAIR0)]
+       str     r7, [vcpu, #CP15_OFFSET(c10_AMAIR1)]
        .endif
 .endm
 
@@ -322,15 +326,19 @@ vcpu      .req    r0              @ vcpu pointer always in r0
  */
 .macro write_cp15_state read_from_vcpu
        .if \read_from_vcpu == 0
-       pop     {r2,r4-r5}
+       pop     {r2,r4-r7}
        .else
        ldr     r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)]
        add     r12, vcpu, #CP15_OFFSET(c7_PAR)
        ldrd    r4, r5, [r12]
+       ldr     r6, [vcpu, #CP15_OFFSET(c10_AMAIR0)]
+       ldr     r7, [vcpu, #CP15_OFFSET(c10_AMAIR1)]
        .endif
 
        mcr     p15, 0, r2, c14, c1, 0  @ CNTKCTL
        mcrr    p15, 0, r4, r5, c7      @ PAR
+       mcr     p15, 0, r6, c10, c3, 0  @ AMAIR0
+       mcr     p15, 0, r7, c10, c3, 1  @ AMAIR1
 
        .if \read_from_vcpu == 0
        pop     {r2-r12}
@@ -597,17 +605,14 @@ vcpu      .req    r0              @ vcpu pointer always in r0
 
 /* Enable/Disable: stage-2 trans., trap interrupts, trap wfi, trap smc */
 .macro configure_hyp_role operation
-       mrc     p15, 4, r2, c1, c1, 0   @ HCR
-       bic     r2, r2, #HCR_VIRT_EXCP_MASK
-       ldr     r3, =HCR_GUEST_MASK
        .if \operation == vmentry
-       orr     r2, r2, r3
+       ldr     r2, [vcpu, #VCPU_HCR]
        ldr     r3, [vcpu, #VCPU_IRQ_LINES]
        orr     r2, r2, r3
        .else
-       bic     r2, r2, r3
+       mov     r2, #0
        .endif
-       mcr     p15, 4, r2, c1, c1, 0
+       mcr     p15, 4, r2, c1, c1, 0   @ HCR
 .endm
 
 .macro load_vcpu
index 7789857..80bb1e6 100644 (file)
@@ -144,8 +144,9 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
        while (addr < end) {
                pgd = pgdp + pgd_index(addr);
                pud = pud_offset(pgd, addr);
+               pte = NULL;
                if (pud_none(*pud)) {
-                       addr = pud_addr_end(addr, end);
+                       addr = kvm_pud_addr_end(addr, end);
                        continue;
                }
 
@@ -155,13 +156,13 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
                         * move on.
                         */
                        clear_pud_entry(kvm, pud, addr);
-                       addr = pud_addr_end(addr, end);
+                       addr = kvm_pud_addr_end(addr, end);
                        continue;
                }
 
                pmd = pmd_offset(pud, addr);
                if (pmd_none(*pmd)) {
-                       addr = pmd_addr_end(addr, end);
+                       addr = kvm_pmd_addr_end(addr, end);
                        continue;
                }
 
@@ -174,12 +175,12 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
                /*
                 * If the pmd entry is to be cleared, walk back up the ladder
                 */
-               if (kvm_pmd_huge(*pmd) || page_empty(pte)) {
+               if (kvm_pmd_huge(*pmd) || (pte && page_empty(pte))) {
                        clear_pmd_entry(kvm, pmd, addr);
-                       next = pmd_addr_end(addr, end);
+                       next = kvm_pmd_addr_end(addr, end);
                        if (page_empty(pmd) && !page_empty(pud)) {
                                clear_pud_entry(kvm, pud, addr);
-                               next = pud_addr_end(addr, end);
+                               next = kvm_pud_addr_end(addr, end);
                        }
                }
 
@@ -187,6 +188,99 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
        }
 }
 
+static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
+                             phys_addr_t addr, phys_addr_t end)
+{
+       pte_t *pte;
+
+       pte = pte_offset_kernel(pmd, addr);
+       do {
+               if (!pte_none(*pte)) {
+                       hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT);
+                       kvm_flush_dcache_to_poc((void*)hva, PAGE_SIZE);
+               }
+       } while (pte++, addr += PAGE_SIZE, addr != end);
+}
+
+static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
+                             phys_addr_t addr, phys_addr_t end)
+{
+       pmd_t *pmd;
+       phys_addr_t next;
+
+       pmd = pmd_offset(pud, addr);
+       do {
+               next = kvm_pmd_addr_end(addr, end);
+               if (!pmd_none(*pmd)) {
+                       if (kvm_pmd_huge(*pmd)) {
+                               hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT);
+                               kvm_flush_dcache_to_poc((void*)hva, PMD_SIZE);
+                       } else {
+                               stage2_flush_ptes(kvm, pmd, addr, next);
+                       }
+               }
+       } while (pmd++, addr = next, addr != end);
+}
+
+static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
+                             phys_addr_t addr, phys_addr_t end)
+{
+       pud_t *pud;
+       phys_addr_t next;
+
+       pud = pud_offset(pgd, addr);
+       do {
+               next = kvm_pud_addr_end(addr, end);
+               if (!pud_none(*pud)) {
+                       if (pud_huge(*pud)) {
+                               hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT);
+                               kvm_flush_dcache_to_poc((void*)hva, PUD_SIZE);
+                       } else {
+                               stage2_flush_pmds(kvm, pud, addr, next);
+                       }
+               }
+       } while (pud++, addr = next, addr != end);
+}
+
+static void stage2_flush_memslot(struct kvm *kvm,
+                                struct kvm_memory_slot *memslot)
+{
+       phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
+       phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
+       phys_addr_t next;
+       pgd_t *pgd;
+
+       pgd = kvm->arch.pgd + pgd_index(addr);
+       do {
+               next = kvm_pgd_addr_end(addr, end);
+               stage2_flush_puds(kvm, pgd, addr, next);
+       } while (pgd++, addr = next, addr != end);
+}
+
+/**
+ * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
+ * @kvm: The struct kvm pointer
+ *
+ * Go through the stage 2 page tables and invalidate any cache lines
+ * backing memory already mapped to the VM.
+ */
+void stage2_flush_vm(struct kvm *kvm)
+{
+       struct kvm_memslots *slots;
+       struct kvm_memory_slot *memslot;
+       int idx;
+
+       idx = srcu_read_lock(&kvm->srcu);
+       spin_lock(&kvm->mmu_lock);
+
+       slots = kvm_memslots(kvm);
+       kvm_for_each_memslot(memslot, slots)
+               stage2_flush_memslot(kvm, memslot);
+
+       spin_unlock(&kvm->mmu_lock);
+       srcu_read_unlock(&kvm->srcu, idx);
+}
+
 /**
  * free_boot_hyp_pgd - free HYP boot page tables
  *
@@ -715,7 +809,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                        kvm_set_s2pmd_writable(&new_pmd);
                        kvm_set_pfn_dirty(pfn);
                }
-               coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE);
+               coherent_cache_guest_page(vcpu, hva & PMD_MASK, PMD_SIZE);
                ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
        } else {
                pte_t new_pte = pfn_pte(pfn, PAGE_S2);
@@ -723,7 +817,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                        kvm_set_s2pte_writable(&new_pte);
                        kvm_set_pfn_dirty(pfn);
                }
-               coherent_icache_guest_page(kvm, hva, PAGE_SIZE);
+               coherent_cache_guest_page(vcpu, hva, PAGE_SIZE);
                ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
        }
 
index 21ef48d..3d69030 100644 (file)
@@ -62,6 +62,7 @@
  * RW:         64bit by default, can be overriden for 32bit VMs
  * TAC:                Trap ACTLR
  * TSC:                Trap SMC
+ * TVM:                Trap VM ops (until M+C set in SCTLR_EL1)
  * TSW:                Trap cache operations by set/way
  * TWE:                Trap WFE
  * TWI:                Trap WFI
@@ -74,7 +75,7 @@
  * SWIO:       Turn set/way invalidates into set/way clean+invalidate
  */
 #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
-                        HCR_BSU_IS | HCR_FB | HCR_TAC | \
+                        HCR_TVM | HCR_BSU_IS | HCR_FB | HCR_TAC | \
                         HCR_AMO | HCR_IMO | HCR_FMO | \
                         HCR_SWIO | HCR_TIDCP | HCR_RW)
 #define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF)
index b25763b..9fcd54b 100644 (file)
@@ -79,7 +79,8 @@
 #define c13_TID_URW    (TPIDR_EL0 * 2) /* Thread ID, User R/W */
 #define c13_TID_URO    (TPIDRRO_EL0 * 2)/* Thread ID, User R/O */
 #define c13_TID_PRIV   (TPIDR_EL1 * 2) /* Thread ID, Privileged */
-#define c10_AMAIR      (AMAIR_EL1 * 2) /* Aux Memory Attr Indirection Reg */
+#define c10_AMAIR0     (AMAIR_EL1 * 2) /* Aux Memory Attr Indirection Reg */
+#define c10_AMAIR1     (c10_AMAIR0 + 1)/* Aux Memory Attr Indirection Reg */
 #define c14_CNTKCTL    (CNTKCTL_EL1 * 2) /* Timer Control Register (PL1) */
 #define NR_CP15_REGS   (NR_SYS_REGS * 2)
 
index 7f1f940..7d29847 100644 (file)
@@ -106,7 +106,6 @@ static inline bool kvm_is_write_fault(unsigned long esr)
        return true;
 }
 
-static inline void kvm_clean_dcache_area(void *addr, size_t size) {}
 static inline void kvm_clean_pgd(pgd_t *pgd) {}
 static inline void kvm_clean_pmd_entry(pmd_t *pmd) {}
 static inline void kvm_clean_pte(pte_t *pte) {}
@@ -122,11 +121,25 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
        pmd_val(*pmd) |= PMD_S2_RDWR;
 }
 
+#define kvm_pgd_addr_end(addr, end)    pgd_addr_end(addr, end)
+#define kvm_pud_addr_end(addr, end)    pud_addr_end(addr, end)
+#define kvm_pmd_addr_end(addr, end)    pmd_addr_end(addr, end)
+
 struct kvm;
 
-static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
-                                             unsigned long size)
+#define kvm_flush_dcache_to_poc(a,l)   __flush_dcache_area((a), (l))
+
+static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 {
+       return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
+}
+
+static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
+                                            unsigned long size)
+{
+       if (!vcpu_has_cache_enabled(vcpu))
+               kvm_flush_dcache_to_poc((void *)hva, size);
+
        if (!icache_is_aliasing()) {            /* PIPT */
                flush_icache_range(hva, hva + size);
        } else if (!icache_is_aivivt()) {       /* non ASID-tagged VIVT */
@@ -135,8 +148,9 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
        }
 }
 
-#define kvm_flush_dcache_to_poc(a,l)   __flush_dcache_area((a), (l))
 #define kvm_virt_to_phys(x)            __virt_to_phys((unsigned long)(x))
 
+void stage2_flush_vm(struct kvm *kvm);
+
 #endif /* __ASSEMBLY__ */
 #endif /* __ARM64_KVM_MMU_H__ */
index 02e9d09..0324458 100644 (file)
@@ -27,6 +27,7 @@
 #include <asm/kvm_host.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_coproc.h>
+#include <asm/kvm_mmu.h>
 #include <asm/cacheflush.h>
 #include <asm/cputype.h>
 #include <trace/events/kvm.h>
@@ -120,6 +121,48 @@ done:
        return true;
 }
 
+/*
+ * Generic accessor for VM registers. Only called as long as HCR_TVM
+ * is set.
+ */
+static bool access_vm_reg(struct kvm_vcpu *vcpu,
+                         const struct sys_reg_params *p,
+                         const struct sys_reg_desc *r)
+{
+       unsigned long val;
+
+       BUG_ON(!p->is_write);
+
+       val = *vcpu_reg(vcpu, p->Rt);
+       if (!p->is_aarch32) {
+               vcpu_sys_reg(vcpu, r->reg) = val;
+       } else {
+               vcpu_cp15(vcpu, r->reg) = val & 0xffffffffUL;
+               if (!p->is_32bit)
+                       vcpu_cp15(vcpu, r->reg + 1) = val >> 32;
+       }
+       return true;
+}
+
+/*
+ * SCTLR_EL1 accessor. Only called as long as HCR_TVM is set.  If the
+ * guest enables the MMU, we stop trapping the VM sys_regs and leave
+ * it in complete control of the caches.
+ */
+static bool access_sctlr(struct kvm_vcpu *vcpu,
+                        const struct sys_reg_params *p,
+                        const struct sys_reg_desc *r)
+{
+       access_vm_reg(vcpu, p, r);
+
+       if (vcpu_has_cache_enabled(vcpu)) {     /* MMU+Caches enabled? */
+               vcpu->arch.hcr_el2 &= ~HCR_TVM;
+               stage2_flush_vm(vcpu->kvm);
+       }
+
+       return true;
+}
+
 /*
  * We could trap ID_DFR0 and tell the guest we don't support performance
  * monitoring.  Unfortunately the patch to make the kernel check ID_DFR0 was
@@ -185,32 +228,32 @@ static const struct sys_reg_desc sys_reg_descs[] = {
          NULL, reset_mpidr, MPIDR_EL1 },
        /* SCTLR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b000),
-         NULL, reset_val, SCTLR_EL1, 0x00C50078 },
+         access_sctlr, reset_val, SCTLR_EL1, 0x00C50078 },
        /* CPACR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b010),
          NULL, reset_val, CPACR_EL1, 0 },
        /* TTBR0_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b000),
-         NULL, reset_unknown, TTBR0_EL1 },
+         access_vm_reg, reset_unknown, TTBR0_EL1 },
        /* TTBR1_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b001),
-         NULL, reset_unknown, TTBR1_EL1 },
+         access_vm_reg, reset_unknown, TTBR1_EL1 },
        /* TCR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b010),
-         NULL, reset_val, TCR_EL1, 0 },
+         access_vm_reg, reset_val, TCR_EL1, 0 },
 
        /* AFSR0_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0001), Op2(0b000),
-         NULL, reset_unknown, AFSR0_EL1 },
+         access_vm_reg, reset_unknown, AFSR0_EL1 },
        /* AFSR1_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0001), Op2(0b001),
-         NULL, reset_unknown, AFSR1_EL1 },
+         access_vm_reg, reset_unknown, AFSR1_EL1 },
        /* ESR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0010), Op2(0b000),
-         NULL, reset_unknown, ESR_EL1 },
+         access_vm_reg, reset_unknown, ESR_EL1 },
        /* FAR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0110), CRm(0b0000), Op2(0b000),
-         NULL, reset_unknown, FAR_EL1 },
+         access_vm_reg, reset_unknown, FAR_EL1 },
        /* PAR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0111), CRm(0b0100), Op2(0b000),
          NULL, reset_unknown, PAR_EL1 },
@@ -224,17 +267,17 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 
        /* MAIR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0010), Op2(0b000),
-         NULL, reset_unknown, MAIR_EL1 },
+         access_vm_reg, reset_unknown, MAIR_EL1 },
        /* AMAIR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0011), Op2(0b000),
-         NULL, reset_amair_el1, AMAIR_EL1 },
+         access_vm_reg, reset_amair_el1, AMAIR_EL1 },
 
        /* VBAR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b0000), Op2(0b000),
          NULL, reset_val, VBAR_EL1, 0 },
        /* CONTEXTIDR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b001),
-         NULL, reset_val, CONTEXTIDR_EL1, 0 },
+         access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 },
        /* TPIDR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b100),
          NULL, reset_unknown, TPIDR_EL1 },
@@ -305,14 +348,32 @@ static const struct sys_reg_desc sys_reg_descs[] = {
          NULL, reset_val, FPEXC32_EL2, 0x70 },
 };
 
-/* Trapped cp15 registers */
+/*
+ * Trapped cp15 registers. TTBR0/TTBR1 get a double encoding,
+ * depending on the way they are accessed (as a 32bit or a 64bit
+ * register).
+ */
 static const struct sys_reg_desc cp15_regs[] = {
+       { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR0 },
+       { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_sctlr, NULL, c1_SCTLR },
+       { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 },
+       { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 },
+       { Op1( 0), CRn( 2), CRm( 0), Op2( 2), access_vm_reg, NULL, c2_TTBCR },
+       { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, c3_DACR },
+       { Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, c5_DFSR },
+       { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, c5_IFSR },
+       { Op1( 0), CRn( 5), CRm( 1), Op2( 0), access_vm_reg, NULL, c5_ADFSR },
+       { Op1( 0), CRn( 5), CRm( 1), Op2( 1), access_vm_reg, NULL, c5_AIFSR },
+       { Op1( 0), CRn( 6), CRm( 0), Op2( 0), access_vm_reg, NULL, c6_DFAR },
+       { Op1( 0), CRn( 6), CRm( 0), Op2( 2), access_vm_reg, NULL, c6_IFAR },
+
        /*
         * DC{C,I,CI}SW operations:
         */
        { Op1( 0), CRn( 7), CRm( 6), Op2( 2), access_dcsw },
        { Op1( 0), CRn( 7), CRm(10), Op2( 2), access_dcsw },
        { Op1( 0), CRn( 7), CRm(14), Op2( 2), access_dcsw },
+
        { Op1( 0), CRn( 9), CRm(12), Op2( 0), pm_fake },
        { Op1( 0), CRn( 9), CRm(12), Op2( 1), pm_fake },
        { Op1( 0), CRn( 9), CRm(12), Op2( 2), pm_fake },
@@ -326,6 +387,14 @@ static const struct sys_reg_desc cp15_regs[] = {
        { Op1( 0), CRn( 9), CRm(14), Op2( 0), pm_fake },
        { Op1( 0), CRn( 9), CRm(14), Op2( 1), pm_fake },
        { Op1( 0), CRn( 9), CRm(14), Op2( 2), pm_fake },
+
+       { Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, c10_PRRR },
+       { Op1( 0), CRn(10), CRm( 2), Op2( 1), access_vm_reg, NULL, c10_NMRR },
+       { Op1( 0), CRn(10), CRm( 3), Op2( 0), access_vm_reg, NULL, c10_AMAIR0 },
+       { Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, c10_AMAIR1 },
+       { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID },
+
+       { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 },
 };
 
 /* Target specific emulation tables */
@@ -437,6 +506,8 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run)
        u32 hsr = kvm_vcpu_get_hsr(vcpu);
        int Rt2 = (hsr >> 10) & 0xf;
 
+       params.is_aarch32 = true;
+       params.is_32bit = false;
        params.CRm = (hsr >> 1) & 0xf;
        params.Rt = (hsr >> 5) & 0xf;
        params.is_write = ((hsr & 1) == 0);
@@ -480,6 +551,8 @@ int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run)
        struct sys_reg_params params;
        u32 hsr = kvm_vcpu_get_hsr(vcpu);
 
+       params.is_aarch32 = true;
+       params.is_32bit = true;
        params.CRm = (hsr >> 1) & 0xf;
        params.Rt  = (hsr >> 5) & 0xf;
        params.is_write = ((hsr & 1) == 0);
@@ -549,6 +622,8 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run)
        struct sys_reg_params params;
        unsigned long esr = kvm_vcpu_get_hsr(vcpu);
 
+       params.is_aarch32 = false;
+       params.is_32bit = false;
        params.Op0 = (esr >> 20) & 3;
        params.Op1 = (esr >> 14) & 0x7;
        params.CRn = (esr >> 10) & 0xf;
index d50d372..d411e25 100644 (file)
@@ -30,6 +30,8 @@ struct sys_reg_params {
        u8      Op2;
        u8      Rt;
        bool    is_write;
+       bool    is_aarch32;
+       bool    is_32bit;       /* Only valid if is_aarch32 is true */
 };
 
 struct sys_reg_desc {
index 53f44be..6a4309b 100644 (file)
@@ -199,6 +199,7 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_IRQCHIP:
        case KVM_CAP_MP_STATE:
        case KVM_CAP_IRQ_INJECT_STATUS:
+       case KVM_CAP_IOAPIC_POLARITY_IGNORED:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
index a995fce..060aaa6 100644 (file)
 
 
 /* Special address that contains the comm page, used for reducing # of traps */
-#define KVM_GUEST_COMMPAGE_ADDR     0x0
+#define KVM_GUEST_COMMPAGE_ADDR                0x0
 
 #define KVM_GUEST_KERNEL_MODE(vcpu)    ((kvm_read_c0_guest_status(vcpu->arch.cop0) & (ST0_EXL | ST0_ERL)) || \
                                        ((kvm_read_c0_guest_status(vcpu->arch.cop0) & KSU_USER) == 0))
 
-#define KVM_GUEST_KUSEG             0x00000000UL
-#define KVM_GUEST_KSEG0             0x40000000UL
-#define KVM_GUEST_KSEG23            0x60000000UL
-#define KVM_GUEST_KSEGX(a)          ((_ACAST32_(a)) & 0x60000000)
-#define KVM_GUEST_CPHYSADDR(a)      ((_ACAST32_(a)) & 0x1fffffff)
+#define KVM_GUEST_KUSEG                        0x00000000UL
+#define KVM_GUEST_KSEG0                        0x40000000UL
+#define KVM_GUEST_KSEG23               0x60000000UL
+#define KVM_GUEST_KSEGX(a)             ((_ACAST32_(a)) & 0x60000000)
+#define KVM_GUEST_CPHYSADDR(a)         ((_ACAST32_(a)) & 0x1fffffff)
 
 #define KVM_GUEST_CKSEG0ADDR(a)                (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG0)
 #define KVM_GUEST_CKSEG1ADDR(a)                (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG1)
 #define KVM_GUEST_KSEG1ADDR(a)         (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG1)
 #define KVM_GUEST_KSEG23ADDR(a)                (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG23)
 
-#define KVM_INVALID_PAGE            0xdeadbeef
-#define KVM_INVALID_INST            0xdeadbeef
-#define KVM_INVALID_ADDR            0xdeadbeef
+#define KVM_INVALID_PAGE               0xdeadbeef
+#define KVM_INVALID_INST               0xdeadbeef
+#define KVM_INVALID_ADDR               0xdeadbeef
 
-#define KVM_MALTA_GUEST_RTC_ADDR    0xb8000070UL
+#define KVM_MALTA_GUEST_RTC_ADDR       0xb8000070UL
 
-#define GUEST_TICKS_PER_JIFFY (40000000/HZ)
-#define MS_TO_NS(x) (x * 1E6L)
+#define GUEST_TICKS_PER_JIFFY          (40000000/HZ)
+#define MS_TO_NS(x)                    (x * 1E6L)
 
-#define CAUSEB_DC       27
-#define CAUSEF_DC       (_ULCAST_(1)   << 27)
+#define CAUSEB_DC                      27
+#define CAUSEF_DC                      (_ULCAST_(1) << 27)
 
 struct kvm;
 struct kvm_run;
@@ -126,8 +126,8 @@ struct kvm_arch {
        int commpage_tlb;
 };
 
-#define N_MIPS_COPROC_REGS      32
-#define N_MIPS_COPROC_SEL      8
+#define N_MIPS_COPROC_REGS     32
+#define N_MIPS_COPROC_SEL      8
 
 struct mips_coproc {
        unsigned long reg[N_MIPS_COPROC_REGS][N_MIPS_COPROC_SEL];
@@ -139,124 +139,124 @@ struct mips_coproc {
 /*
  * Coprocessor 0 register names
  */
-#define        MIPS_CP0_TLB_INDEX          0
-#define        MIPS_CP0_TLB_RANDOM         1
-#define        MIPS_CP0_TLB_LOW            2
-#define        MIPS_CP0_TLB_LO0            2
-#define        MIPS_CP0_TLB_LO1            3
-#define        MIPS_CP0_TLB_CONTEXT    4
-#define        MIPS_CP0_TLB_PG_MASK    5
-#define        MIPS_CP0_TLB_WIRED          6
-#define        MIPS_CP0_HWRENA             7
-#define        MIPS_CP0_BAD_VADDR          8
-#define        MIPS_CP0_COUNT          9
-#define        MIPS_CP0_TLB_HI         10
-#define        MIPS_CP0_COMPARE            11
-#define        MIPS_CP0_STATUS         12
-#define        MIPS_CP0_CAUSE          13
-#define        MIPS_CP0_EXC_PC         14
-#define        MIPS_CP0_PRID               15
-#define        MIPS_CP0_CONFIG         16
-#define        MIPS_CP0_LLADDR         17
-#define        MIPS_CP0_WATCH_LO           18
-#define        MIPS_CP0_WATCH_HI           19
-#define        MIPS_CP0_TLB_XCONTEXT   20
-#define        MIPS_CP0_ECC                26
-#define        MIPS_CP0_CACHE_ERR          27
-#define        MIPS_CP0_TAG_LO         28
-#define        MIPS_CP0_TAG_HI         29
-#define        MIPS_CP0_ERROR_PC           30
-#define        MIPS_CP0_DEBUG          23
-#define        MIPS_CP0_DEPC               24
-#define        MIPS_CP0_PERFCNT            25
-#define        MIPS_CP0_ERRCTL         26
-#define        MIPS_CP0_DATA_LO            28
-#define        MIPS_CP0_DATA_HI            29
-#define        MIPS_CP0_DESAVE         31
-
-#define MIPS_CP0_CONFIG_SEL        0
-#define MIPS_CP0_CONFIG1_SEL    1
-#define MIPS_CP0_CONFIG2_SEL    2
-#define MIPS_CP0_CONFIG3_SEL    3
+#define MIPS_CP0_TLB_INDEX     0
+#define MIPS_CP0_TLB_RANDOM    1
+#define MIPS_CP0_TLB_LOW       2
+#define MIPS_CP0_TLB_LO0       2
+#define MIPS_CP0_TLB_LO1       3
+#define MIPS_CP0_TLB_CONTEXT   4
+#define MIPS_CP0_TLB_PG_MASK   5
+#define MIPS_CP0_TLB_WIRED     6
+#define MIPS_CP0_HWRENA                7
+#define MIPS_CP0_BAD_VADDR     8
+#define MIPS_CP0_COUNT         9
+#define MIPS_CP0_TLB_HI                10
+#define MIPS_CP0_COMPARE       11
+#define MIPS_CP0_STATUS                12
+#define MIPS_CP0_CAUSE         13
+#define MIPS_CP0_EXC_PC                14
+#define MIPS_CP0_PRID          15
+#define MIPS_CP0_CONFIG                16
+#define MIPS_CP0_LLADDR                17
+#define MIPS_CP0_WATCH_LO      18
+#define MIPS_CP0_WATCH_HI      19
+#define MIPS_CP0_TLB_XCONTEXT  20
+#define MIPS_CP0_ECC           26
+#define MIPS_CP0_CACHE_ERR     27
+#define MIPS_CP0_TAG_LO                28
+#define MIPS_CP0_TAG_HI                29
+#define MIPS_CP0_ERROR_PC      30
+#define MIPS_CP0_DEBUG         23
+#define MIPS_CP0_DEPC          24
+#define MIPS_CP0_PERFCNT       25
+#define MIPS_CP0_ERRCTL                26
+#define MIPS_CP0_DATA_LO       28
+#define MIPS_CP0_DATA_HI       29
+#define MIPS_CP0_DESAVE                31
+
+#define MIPS_CP0_CONFIG_SEL    0
+#define MIPS_CP0_CONFIG1_SEL   1
+#define MIPS_CP0_CONFIG2_SEL   2
+#define MIPS_CP0_CONFIG3_SEL   3
 
 /* Config0 register bits */
-#define CP0C0_M    31
-#define CP0C0_K23  28
-#define CP0C0_KU   25
-#define CP0C0_MDU  20
-#define CP0C0_MM   17
-#define CP0C0_BM   16
-#define CP0C0_BE   15
-#define CP0C0_AT   13
-#define CP0C0_AR   10
-#define CP0C0_MT   7
-#define CP0C0_VI   3
-#define CP0C0_K0   0
+#define CP0C0_M                        31
+#define CP0C0_K23              28
+#define CP0C0_KU               25
+#define CP0C0_MDU              20
+#define CP0C0_MM               17
+#define CP0C0_BM               16
+#define CP0C0_BE               15
+#define CP0C0_AT               13
+#define CP0C0_AR               10
+#define CP0C0_MT               7
+#define CP0C0_VI               3
+#define CP0C0_K0               0
 
 /* Config1 register bits */
-#define CP0C1_M    31
-#define CP0C1_MMU  25
-#define CP0C1_IS   22
-#define CP0C1_IL   19
-#define CP0C1_IA   16
-#define CP0C1_DS   13
-#define CP0C1_DL   10
-#define CP0C1_DA   7
-#define CP0C1_C2   6
-#define CP0C1_MD   5
-#define CP0C1_PC   4
-#define CP0C1_WR   3
-#define CP0C1_CA   2
-#define CP0C1_EP   1
-#define CP0C1_FP   0
+#define CP0C1_M                        31
+#define CP0C1_MMU              25
+#define CP0C1_IS               22
+#define CP0C1_IL               19
+#define CP0C1_IA               16
+#define CP0C1_DS               13
+#define CP0C1_DL               10
+#define CP0C1_DA               7
+#define CP0C1_C2               6
+#define CP0C1_MD               5
+#define CP0C1_PC               4
+#define CP0C1_WR               3
+#define CP0C1_CA               2
+#define CP0C1_EP               1
+#define CP0C1_FP               0
 
 /* Config2 Register bits */
-#define CP0C2_M    31
-#define CP0C2_TU   28
-#define CP0C2_TS   24
-#define CP0C2_TL   20
-#define CP0C2_TA   16
-#define CP0C2_SU   12
-#define CP0C2_SS   8
-#define CP0C2_SL   4
-#define CP0C2_SA   0
+#define CP0C2_M                        31
+#define CP0C2_TU               28
+#define CP0C2_TS               24
+#define CP0C2_TL               20
+#define CP0C2_TA               16
+#define CP0C2_SU               12
+#define CP0C2_SS               8
+#define CP0C2_SL               4
+#define CP0C2_SA               0
 
 /* Config3 Register bits */
-#define CP0C3_M    31
-#define CP0C3_ISA_ON_EXC 16
-#define CP0C3_ULRI  13
-#define CP0C3_DSPP 10
-#define CP0C3_LPA  7
-#define CP0C3_VEIC 6
-#define CP0C3_VInt 5
-#define CP0C3_SP   4
-#define CP0C3_MT   2
-#define CP0C3_SM   1
-#define CP0C3_TL   0
+#define CP0C3_M                        31
+#define CP0C3_ISA_ON_EXC       16
+#define CP0C3_ULRI             13
+#define CP0C3_DSPP             10
+#define CP0C3_LPA              7
+#define CP0C3_VEIC             6
+#define CP0C3_VInt             5
+#define CP0C3_SP               4
+#define CP0C3_MT               2
+#define CP0C3_SM               1
+#define CP0C3_TL               0
 
 /* Have config1, Cacheable, noncoherent, write-back, write allocate*/
-#define MIPS_CONFIG0                                              \
+#define MIPS_CONFIG0                                           \
   ((1 << CP0C0_M) | (0x3 << CP0C0_K0))
 
 /* Have config2, no coprocessor2 attached, no MDMX support attached,
    no performance counters, watch registers present,
    no code compression, EJTAG present, no FPU, no watch registers */
-#define MIPS_CONFIG1                                              \
-((1 << CP0C1_M) |                                                 \
- (0 << CP0C1_C2) | (0 << CP0C1_MD) | (0 << CP0C1_PC) |            \
- (0 << CP0C1_WR) | (0 << CP0C1_CA) | (1 << CP0C1_EP) |            \
+#define MIPS_CONFIG1                                           \
+((1 << CP0C1_M) |                                              \
+ (0 << CP0C1_C2) | (0 << CP0C1_MD) | (0 << CP0C1_PC) |         \
+ (0 << CP0C1_WR) | (0 << CP0C1_CA) | (1 << CP0C1_EP) |         \
  (0 << CP0C1_FP))
 
 /* Have config3, no tertiary/secondary caches implemented */
-#define MIPS_CONFIG2                                              \
+#define MIPS_CONFIG2                                           \
 ((1 << CP0C2_M))
 
 /* No config4, no DSP ASE, no large physaddr (PABITS),
    no external interrupt controller, no vectored interrupts,
    no 1kb pages, no SmartMIPS ASE, no trace logic */
-#define MIPS_CONFIG3                                              \
-((0 << CP0C3_M) | (0 << CP0C3_DSPP) | (0 << CP0C3_LPA) |          \
- (0 << CP0C3_VEIC) | (0 << CP0C3_VInt) | (0 << CP0C3_SP) |        \
+#define MIPS_CONFIG3                                           \
+((0 << CP0C3_M) | (0 << CP0C3_DSPP) | (0 << CP0C3_LPA) |       \
+ (0 << CP0C3_VEIC) | (0 << CP0C3_VInt) | (0 << CP0C3_SP) |     \
  (0 << CP0C3_SM) | (0 << CP0C3_TL))
 
 /* MMU types, the first four entries have the same layout as the
@@ -274,36 +274,36 @@ enum mips_mmu_types {
 /*
  * Trap codes
  */
-#define T_INT           0      /* Interrupt pending */
-#define T_TLB_MOD       1      /* TLB modified fault */
-#define T_TLB_LD_MISS       2  /* TLB miss on load or ifetch */
-#define T_TLB_ST_MISS       3  /* TLB miss on a store */
-#define T_ADDR_ERR_LD       4  /* Address error on a load or ifetch */
-#define T_ADDR_ERR_ST       5  /* Address error on a store */
-#define T_BUS_ERR_IFETCH    6  /* Bus error on an ifetch */
-#define T_BUS_ERR_LD_ST     7  /* Bus error on a load or store */
-#define T_SYSCALL       8      /* System call */
-#define T_BREAK         9      /* Breakpoint */
-#define T_RES_INST      10     /* Reserved instruction exception */
-#define T_COP_UNUSABLE      11 /* Coprocessor unusable */
-#define T_OVFLOW        12     /* Arithmetic overflow */
+#define T_INT                  0       /* Interrupt pending */
+#define T_TLB_MOD              1       /* TLB modified fault */
+#define T_TLB_LD_MISS          2       /* TLB miss on load or ifetch */
+#define T_TLB_ST_MISS          3       /* TLB miss on a store */
+#define T_ADDR_ERR_LD          4       /* Address error on a load or ifetch */
+#define T_ADDR_ERR_ST          5       /* Address error on a store */
+#define T_BUS_ERR_IFETCH       6       /* Bus error on an ifetch */
+#define T_BUS_ERR_LD_ST                7       /* Bus error on a load or store */
+#define T_SYSCALL              8       /* System call */
+#define T_BREAK                        9       /* Breakpoint */
+#define T_RES_INST             10      /* Reserved instruction exception */
+#define T_COP_UNUSABLE         11      /* Coprocessor unusable */
+#define T_OVFLOW               12      /* Arithmetic overflow */
 
 /*
  * Trap definitions added for r4000 port.
  */
-#define T_TRAP          13     /* Trap instruction */
-#define T_VCEI          14     /* Virtual coherency exception */
-#define T_FPE           15     /* Floating point exception */
-#define T_WATCH         23     /* Watch address reference */
-#define T_VCED          31     /* Virtual coherency data */
+#define T_TRAP                 13      /* Trap instruction */
+#define T_VCEI                 14      /* Virtual coherency exception */
+#define T_FPE                  15      /* Floating point exception */
+#define T_WATCH                        23      /* Watch address reference */
+#define T_VCED                 31      /* Virtual coherency data */
 
 /* Resume Flags */
-#define RESUME_FLAG_DR          (1<<0) /* Reload guest nonvolatile state? */
-#define RESUME_FLAG_HOST        (1<<1) /* Resume host? */
+#define RESUME_FLAG_DR         (1<<0)  /* Reload guest nonvolatile state? */
+#define RESUME_FLAG_HOST       (1<<1)  /* Resume host? */
 
-#define RESUME_GUEST            0
-#define RESUME_GUEST_DR         RESUME_FLAG_DR
-#define RESUME_HOST             RESUME_FLAG_HOST
+#define RESUME_GUEST           0
+#define RESUME_GUEST_DR                RESUME_FLAG_DR
+#define RESUME_HOST            RESUME_FLAG_HOST
 
 enum emulation_result {
        EMULATE_DONE,           /* no further processing */
@@ -313,24 +313,27 @@ enum emulation_result {
        EMULATE_PRIV_FAIL,
 };
 
-#define MIPS3_PG_G  0x00000001 /* Global; ignore ASID if in lo0 & lo1 */
-#define MIPS3_PG_V  0x00000002 /* Valid */
-#define MIPS3_PG_NV 0x00000000
-#define MIPS3_PG_D  0x00000004 /* Dirty */
+#define MIPS3_PG_G     0x00000001 /* Global; ignore ASID if in lo0 & lo1 */
+#define MIPS3_PG_V     0x00000002 /* Valid */
+#define MIPS3_PG_NV    0x00000000
+#define MIPS3_PG_D     0x00000004 /* Dirty */
 
 #define mips3_paddr_to_tlbpfn(x) \
-    (((unsigned long)(x) >> MIPS3_PG_SHIFT) & MIPS3_PG_FRAME)
+       (((unsigned long)(x) >> MIPS3_PG_SHIFT) & MIPS3_PG_FRAME)
 #define mips3_tlbpfn_to_paddr(x) \
-    ((unsigned long)((x) & MIPS3_PG_FRAME) << MIPS3_PG_SHIFT)
+       ((unsigned long)((x) & MIPS3_PG_FRAME) << MIPS3_PG_SHIFT)
 
-#define MIPS3_PG_SHIFT      6
-#define MIPS3_PG_FRAME      0x3fffffc0
+#define MIPS3_PG_SHIFT         6
+#define MIPS3_PG_FRAME         0x3fffffc0
 
-#define VPN2_MASK           0xffffe000
-#define TLB_IS_GLOBAL(x)    (((x).tlb_lo0 & MIPS3_PG_G) && ((x).tlb_lo1 & MIPS3_PG_G))
-#define TLB_VPN2(x)         ((x).tlb_hi & VPN2_MASK)
-#define TLB_ASID(x)         ((x).tlb_hi & ASID_MASK)
-#define TLB_IS_VALID(x, va) (((va) & (1 << PAGE_SHIFT)) ? ((x).tlb_lo1 & MIPS3_PG_V) : ((x).tlb_lo0 & MIPS3_PG_V))
+#define VPN2_MASK              0xffffe000
+#define TLB_IS_GLOBAL(x)       (((x).tlb_lo0 & MIPS3_PG_G) &&  \
+                                ((x).tlb_lo1 & MIPS3_PG_G))
+#define TLB_VPN2(x)            ((x).tlb_hi & VPN2_MASK)
+#define TLB_ASID(x)            ((x).tlb_hi & ASID_MASK)
+#define TLB_IS_VALID(x, va)    (((va) & (1 << PAGE_SHIFT))     \
+                                ? ((x).tlb_lo1 & MIPS3_PG_V)   \
+                                : ((x).tlb_lo0 & MIPS3_PG_V))
 
 struct kvm_mips_tlb {
        long tlb_mask;
@@ -339,7 +342,7 @@ struct kvm_mips_tlb {
        long tlb_lo1;
 };
 
-#define KVM_MIPS_GUEST_TLB_SIZE     64
+#define KVM_MIPS_GUEST_TLB_SIZE        64
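
Illustrative sketch (not part of the patch): the TLB_* helpers above are how emulation code probes the guest's software TLB, an array of KVM_MIPS_GUEST_TLB_SIZE struct kvm_mips_tlb entries. A simplified lookup, ignoring page-mask handling:

    /* Return the index of the entry matching (va, asid), or -1 if none. */
    static int guest_tlb_probe(const struct kvm_mips_tlb *tlb, int entries,
                               unsigned long va, unsigned long asid)
    {
            int i;

            for (i = 0; i < entries; i++) {
                    if (TLB_VPN2(tlb[i]) != (va & VPN2_MASK))
                            continue;
                    if (!TLB_IS_GLOBAL(tlb[i]) && TLB_ASID(tlb[i]) != asid)
                            continue;
                    /* Caller still has to check TLB_IS_VALID(tlb[i], va). */
                    return i;
            }
            return -1;
    }
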
 struct kvm_vcpu_arch {
        void *host_ebase, *guest_ebase;
        unsigned long host_stack;
@@ -400,65 +403,67 @@ struct kvm_vcpu_arch {
 };
 
 
-#define kvm_read_c0_guest_index(cop0)               (cop0->reg[MIPS_CP0_TLB_INDEX][0])
-#define kvm_write_c0_guest_index(cop0, val)         (cop0->reg[MIPS_CP0_TLB_INDEX][0] = val)
-#define kvm_read_c0_guest_entrylo0(cop0)            (cop0->reg[MIPS_CP0_TLB_LO0][0])
-#define kvm_read_c0_guest_entrylo1(cop0)            (cop0->reg[MIPS_CP0_TLB_LO1][0])
-#define kvm_read_c0_guest_context(cop0)             (cop0->reg[MIPS_CP0_TLB_CONTEXT][0])
-#define kvm_write_c0_guest_context(cop0, val)       (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val))
-#define kvm_read_c0_guest_userlocal(cop0)           (cop0->reg[MIPS_CP0_TLB_CONTEXT][2])
-#define kvm_read_c0_guest_pagemask(cop0)            (cop0->reg[MIPS_CP0_TLB_PG_MASK][0])
-#define kvm_write_c0_guest_pagemask(cop0, val)      (cop0->reg[MIPS_CP0_TLB_PG_MASK][0] = (val))
-#define kvm_read_c0_guest_wired(cop0)               (cop0->reg[MIPS_CP0_TLB_WIRED][0])
-#define kvm_write_c0_guest_wired(cop0, val)         (cop0->reg[MIPS_CP0_TLB_WIRED][0] = (val))
-#define kvm_read_c0_guest_badvaddr(cop0)            (cop0->reg[MIPS_CP0_BAD_VADDR][0])
-#define kvm_write_c0_guest_badvaddr(cop0, val)      (cop0->reg[MIPS_CP0_BAD_VADDR][0] = (val))
-#define kvm_read_c0_guest_count(cop0)               (cop0->reg[MIPS_CP0_COUNT][0])
-#define kvm_write_c0_guest_count(cop0, val)         (cop0->reg[MIPS_CP0_COUNT][0] = (val))
-#define kvm_read_c0_guest_entryhi(cop0)             (cop0->reg[MIPS_CP0_TLB_HI][0])
-#define kvm_write_c0_guest_entryhi(cop0, val)       (cop0->reg[MIPS_CP0_TLB_HI][0] = (val))
-#define kvm_read_c0_guest_compare(cop0)             (cop0->reg[MIPS_CP0_COMPARE][0])
-#define kvm_write_c0_guest_compare(cop0, val)       (cop0->reg[MIPS_CP0_COMPARE][0] = (val))
-#define kvm_read_c0_guest_status(cop0)              (cop0->reg[MIPS_CP0_STATUS][0])
-#define kvm_write_c0_guest_status(cop0, val)        (cop0->reg[MIPS_CP0_STATUS][0] = (val))
-#define kvm_read_c0_guest_intctl(cop0)              (cop0->reg[MIPS_CP0_STATUS][1])
-#define kvm_write_c0_guest_intctl(cop0, val)        (cop0->reg[MIPS_CP0_STATUS][1] = (val))
-#define kvm_read_c0_guest_cause(cop0)               (cop0->reg[MIPS_CP0_CAUSE][0])
-#define kvm_write_c0_guest_cause(cop0, val)         (cop0->reg[MIPS_CP0_CAUSE][0] = (val))
-#define kvm_read_c0_guest_epc(cop0)                 (cop0->reg[MIPS_CP0_EXC_PC][0])
-#define kvm_write_c0_guest_epc(cop0, val)           (cop0->reg[MIPS_CP0_EXC_PC][0] = (val))
-#define kvm_read_c0_guest_prid(cop0)                (cop0->reg[MIPS_CP0_PRID][0])
-#define kvm_write_c0_guest_prid(cop0, val)          (cop0->reg[MIPS_CP0_PRID][0] = (val))
-#define kvm_read_c0_guest_ebase(cop0)               (cop0->reg[MIPS_CP0_PRID][1])
-#define kvm_write_c0_guest_ebase(cop0, val)         (cop0->reg[MIPS_CP0_PRID][1] = (val))
-#define kvm_read_c0_guest_config(cop0)              (cop0->reg[MIPS_CP0_CONFIG][0])
-#define kvm_read_c0_guest_config1(cop0)             (cop0->reg[MIPS_CP0_CONFIG][1])
-#define kvm_read_c0_guest_config2(cop0)             (cop0->reg[MIPS_CP0_CONFIG][2])
-#define kvm_read_c0_guest_config3(cop0)             (cop0->reg[MIPS_CP0_CONFIG][3])
-#define kvm_read_c0_guest_config7(cop0)             (cop0->reg[MIPS_CP0_CONFIG][7])
-#define kvm_write_c0_guest_config(cop0, val)        (cop0->reg[MIPS_CP0_CONFIG][0] = (val))
-#define kvm_write_c0_guest_config1(cop0, val)       (cop0->reg[MIPS_CP0_CONFIG][1] = (val))
-#define kvm_write_c0_guest_config2(cop0, val)       (cop0->reg[MIPS_CP0_CONFIG][2] = (val))
-#define kvm_write_c0_guest_config3(cop0, val)       (cop0->reg[MIPS_CP0_CONFIG][3] = (val))
-#define kvm_write_c0_guest_config7(cop0, val)       (cop0->reg[MIPS_CP0_CONFIG][7] = (val))
-#define kvm_read_c0_guest_errorepc(cop0)            (cop0->reg[MIPS_CP0_ERROR_PC][0])
-#define kvm_write_c0_guest_errorepc(cop0, val)      (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val))
-
-#define kvm_set_c0_guest_status(cop0, val)          (cop0->reg[MIPS_CP0_STATUS][0] |= (val))
-#define kvm_clear_c0_guest_status(cop0, val)        (cop0->reg[MIPS_CP0_STATUS][0] &= ~(val))
-#define kvm_set_c0_guest_cause(cop0, val)           (cop0->reg[MIPS_CP0_CAUSE][0] |= (val))
-#define kvm_clear_c0_guest_cause(cop0, val)         (cop0->reg[MIPS_CP0_CAUSE][0] &= ~(val))
-#define kvm_change_c0_guest_cause(cop0, change, val)  \
-{                                                     \
-    kvm_clear_c0_guest_cause(cop0, change);           \
-    kvm_set_c0_guest_cause(cop0, ((val) & (change))); \
+#define kvm_read_c0_guest_index(cop0)          (cop0->reg[MIPS_CP0_TLB_INDEX][0])
+#define kvm_write_c0_guest_index(cop0, val)    (cop0->reg[MIPS_CP0_TLB_INDEX][0] = val)
+#define kvm_read_c0_guest_entrylo0(cop0)       (cop0->reg[MIPS_CP0_TLB_LO0][0])
+#define kvm_read_c0_guest_entrylo1(cop0)       (cop0->reg[MIPS_CP0_TLB_LO1][0])
+#define kvm_read_c0_guest_context(cop0)                (cop0->reg[MIPS_CP0_TLB_CONTEXT][0])
+#define kvm_write_c0_guest_context(cop0, val)  (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val))
+#define kvm_read_c0_guest_userlocal(cop0)      (cop0->reg[MIPS_CP0_TLB_CONTEXT][2])
+#define kvm_read_c0_guest_pagemask(cop0)       (cop0->reg[MIPS_CP0_TLB_PG_MASK][0])
+#define kvm_write_c0_guest_pagemask(cop0, val) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0] = (val))
+#define kvm_read_c0_guest_wired(cop0)          (cop0->reg[MIPS_CP0_TLB_WIRED][0])
+#define kvm_write_c0_guest_wired(cop0, val)    (cop0->reg[MIPS_CP0_TLB_WIRED][0] = (val))
+#define kvm_read_c0_guest_hwrena(cop0)         (cop0->reg[MIPS_CP0_HWRENA][0])
+#define kvm_write_c0_guest_hwrena(cop0, val)   (cop0->reg[MIPS_CP0_HWRENA][0] = (val))
+#define kvm_read_c0_guest_badvaddr(cop0)       (cop0->reg[MIPS_CP0_BAD_VADDR][0])
+#define kvm_write_c0_guest_badvaddr(cop0, val) (cop0->reg[MIPS_CP0_BAD_VADDR][0] = (val))
+#define kvm_read_c0_guest_count(cop0)          (cop0->reg[MIPS_CP0_COUNT][0])
+#define kvm_write_c0_guest_count(cop0, val)    (cop0->reg[MIPS_CP0_COUNT][0] = (val))
+#define kvm_read_c0_guest_entryhi(cop0)                (cop0->reg[MIPS_CP0_TLB_HI][0])
+#define kvm_write_c0_guest_entryhi(cop0, val)  (cop0->reg[MIPS_CP0_TLB_HI][0] = (val))
+#define kvm_read_c0_guest_compare(cop0)                (cop0->reg[MIPS_CP0_COMPARE][0])
+#define kvm_write_c0_guest_compare(cop0, val)  (cop0->reg[MIPS_CP0_COMPARE][0] = (val))
+#define kvm_read_c0_guest_status(cop0)         (cop0->reg[MIPS_CP0_STATUS][0])
+#define kvm_write_c0_guest_status(cop0, val)   (cop0->reg[MIPS_CP0_STATUS][0] = (val))
+#define kvm_read_c0_guest_intctl(cop0)         (cop0->reg[MIPS_CP0_STATUS][1])
+#define kvm_write_c0_guest_intctl(cop0, val)   (cop0->reg[MIPS_CP0_STATUS][1] = (val))
+#define kvm_read_c0_guest_cause(cop0)          (cop0->reg[MIPS_CP0_CAUSE][0])
+#define kvm_write_c0_guest_cause(cop0, val)    (cop0->reg[MIPS_CP0_CAUSE][0] = (val))
+#define kvm_read_c0_guest_epc(cop0)            (cop0->reg[MIPS_CP0_EXC_PC][0])
+#define kvm_write_c0_guest_epc(cop0, val)      (cop0->reg[MIPS_CP0_EXC_PC][0] = (val))
+#define kvm_read_c0_guest_prid(cop0)           (cop0->reg[MIPS_CP0_PRID][0])
+#define kvm_write_c0_guest_prid(cop0, val)     (cop0->reg[MIPS_CP0_PRID][0] = (val))
+#define kvm_read_c0_guest_ebase(cop0)          (cop0->reg[MIPS_CP0_PRID][1])
+#define kvm_write_c0_guest_ebase(cop0, val)    (cop0->reg[MIPS_CP0_PRID][1] = (val))
+#define kvm_read_c0_guest_config(cop0)         (cop0->reg[MIPS_CP0_CONFIG][0])
+#define kvm_read_c0_guest_config1(cop0)                (cop0->reg[MIPS_CP0_CONFIG][1])
+#define kvm_read_c0_guest_config2(cop0)                (cop0->reg[MIPS_CP0_CONFIG][2])
+#define kvm_read_c0_guest_config3(cop0)                (cop0->reg[MIPS_CP0_CONFIG][3])
+#define kvm_read_c0_guest_config7(cop0)                (cop0->reg[MIPS_CP0_CONFIG][7])
+#define kvm_write_c0_guest_config(cop0, val)   (cop0->reg[MIPS_CP0_CONFIG][0] = (val))
+#define kvm_write_c0_guest_config1(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][1] = (val))
+#define kvm_write_c0_guest_config2(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][2] = (val))
+#define kvm_write_c0_guest_config3(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][3] = (val))
+#define kvm_write_c0_guest_config7(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][7] = (val))
+#define kvm_read_c0_guest_errorepc(cop0)       (cop0->reg[MIPS_CP0_ERROR_PC][0])
+#define kvm_write_c0_guest_errorepc(cop0, val) (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val))
+
+#define kvm_set_c0_guest_status(cop0, val)     (cop0->reg[MIPS_CP0_STATUS][0] |= (val))
+#define kvm_clear_c0_guest_status(cop0, val)   (cop0->reg[MIPS_CP0_STATUS][0] &= ~(val))
+#define kvm_set_c0_guest_cause(cop0, val)      (cop0->reg[MIPS_CP0_CAUSE][0] |= (val))
+#define kvm_clear_c0_guest_cause(cop0, val)    (cop0->reg[MIPS_CP0_CAUSE][0] &= ~(val))
+#define kvm_change_c0_guest_cause(cop0, change, val)                   \
+{                                                                      \
+       kvm_clear_c0_guest_cause(cop0, change);                         \
+       kvm_set_c0_guest_cause(cop0, ((val) & (change)));               \
 }
-#define kvm_set_c0_guest_ebase(cop0, val)           (cop0->reg[MIPS_CP0_PRID][1] |= (val))
-#define kvm_clear_c0_guest_ebase(cop0, val)         (cop0->reg[MIPS_CP0_PRID][1] &= ~(val))
-#define kvm_change_c0_guest_ebase(cop0, change, val)  \
-{                                                     \
-    kvm_clear_c0_guest_ebase(cop0, change);           \
-    kvm_set_c0_guest_ebase(cop0, ((val) & (change))); \
+#define kvm_set_c0_guest_ebase(cop0, val)      (cop0->reg[MIPS_CP0_PRID][1] |= (val))
+#define kvm_clear_c0_guest_ebase(cop0, val)    (cop0->reg[MIPS_CP0_PRID][1] &= ~(val))
+#define kvm_change_c0_guest_ebase(cop0, change, val)                   \
+{                                                                      \
+       kvm_clear_c0_guest_ebase(cop0, change);                         \
+       kvm_set_c0_guest_ebase(cop0, ((val) & (change)));               \
 }
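
Illustrative usage (not part of the patch): emulation code reads and updates the shadow CP0 state only through the accessor macros above. A minimal sketch of raising a guest interrupt line and checking whether the guest can take it; the Cause.IP mask passed in is hypothetical:

    static void guest_raise_irq(struct mips_coproc *cop0, u32 cause_ip_mask)
    {
            kvm_set_c0_guest_cause(cop0, cause_ip_mask);
    }

    static bool guest_irqs_enabled(struct mips_coproc *cop0)
    {
            u32 status = kvm_read_c0_guest_status(cop0);

            /* Interrupts are deliverable only with IE set and EXL/ERL clear. */
            return (status & ST0_IE) && !(status & (ST0_EXL | ST0_ERL));
    }
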
 
 
index 4b6274b..e3fec99 100644 (file)
@@ -436,13 +436,6 @@ kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, uint32_t cause,
        sel = inst & 0x7;
        co_bit = (inst >> 25) & 1;
 
-       /* Verify that the register is valid */
-       if (rd > MIPS_CP0_DESAVE) {
-               printk("Invalid rd: %d\n", rd);
-               er = EMULATE_FAIL;
-               goto done;
-       }
-
        if (co_bit) {
                op = (inst) & 0xff;
 
@@ -1542,8 +1535,15 @@ kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
        }
 
        if ((inst & OPCODE) == SPEC3 && (inst & FUNC) == RDHWR) {
+               int usermode = !KVM_GUEST_KERNEL_MODE(vcpu);
                int rd = (inst & RD) >> 11;
                int rt = (inst & RT) >> 16;
+               /* If usermode, check RDHWR rd is allowed by guest HWREna */
+               if (usermode && !(kvm_read_c0_guest_hwrena(cop0) & BIT(rd))) {
+                       kvm_debug("RDHWR %#x disallowed by HWREna @ %p\n",
+                                 rd, opc);
+                       goto emulate_ri;
+               }
                switch (rd) {
                case 0: /* CPU number */
                        arch->gprs[rt] = 0;
@@ -1567,31 +1567,27 @@ kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
                        }
                        break;
                case 29:
-#if 1
                        arch->gprs[rt] = kvm_read_c0_guest_userlocal(cop0);
-#else
-                       /* UserLocal not implemented */
-                       er = kvm_mips_emulate_ri_exc(cause, opc, run, vcpu);
-#endif
                        break;
 
                default:
-                       printk("RDHWR not supported\n");
-                       er = EMULATE_FAIL;
-                       break;
+                       kvm_debug("RDHWR %#x not supported @ %p\n", rd, opc);
+                       goto emulate_ri;
                }
        } else {
-               printk("Emulate RI not supported @ %p: %#x\n", opc, inst);
-               er = EMULATE_FAIL;
+               kvm_debug("Emulate RI not supported @ %p: %#x\n", opc, inst);
+               goto emulate_ri;
        }
 
+       return EMULATE_DONE;
+
+emulate_ri:
        /*
-        * Rollback PC only if emulation was unsuccessful
+        * Rollback PC (if in branch delay slot then the PC already points to
+        * branch target), and pass the RI exception to the guest OS.
         */
-       if (er == EMULATE_FAIL) {
-               vcpu->arch.pc = curr_pc;
-       }
-       return er;
+       vcpu->arch.pc = curr_pc;
+       return kvm_mips_emulate_ri_exc(cause, opc, run, vcpu);
 }
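
The rework above means RDHWR from guest user mode is now gated on the guest's own HWREna register, and anything the guest has not enabled (or that KVM does not emulate) is reflected back as a Reserved Instruction exception instead of failing the emulation in the host. The decision, condensed into a hedged C sketch:

    /* True if the RDHWR should be turned into a guest RI exception. */
    static bool rdhwr_traps_to_guest(struct kvm_vcpu *vcpu, int rd)
    {
            struct mips_coproc *cop0 = vcpu->arch.cop0;
            bool usermode = !KVM_GUEST_KERNEL_MODE(vcpu);

            /* HWREna only restricts user mode; guest kernel mode is unrestricted. */
            if (usermode && !(kvm_read_c0_guest_hwrena(cop0) & BIT(rd)))
                    return true;

            /* Only a small fixed set (CPUNum, SYNCI step, CC, CCRes, UserLocal)
             * is emulated; everything else also becomes a guest RI. */
            return !(rd <= 3 || rd == 29);
    }
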
 
 enum emulation_result
index 83851aa..bb1e38a 100644 (file)
@@ -304,6 +304,11 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
        return vcpu->arch.fault_dar;
 }
 
+static inline bool is_kvmppc_resume_guest(int r)
+{
+       return (r == RESUME_GUEST || r == RESUME_GUEST_NV);
+}
+
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
 #define OSI_SC_MAGIC_R3                        0x113724FA
index bf0fa8b..51388be 100644 (file)
@@ -289,6 +289,18 @@ static inline void note_hpte_modification(struct kvm *kvm,
        if (atomic_read(&kvm->arch.hpte_mod_interest))
                rev->guest_rpte |= HPTE_GR_MODIFIED;
 }
+
+/*
+ * Like kvm_memslots(), but for use in real mode when we can't do
+ * any RCU stuff (since the secondary threads are offline from the
+ * kernel's point of view), and we can't print anything.
+ * Thus we use rcu_dereference_raw() rather than rcu_dereference_check().
+ */
+static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
+{
+       return rcu_dereference_raw_notrace(kvm->memslots);
+}
+
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
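
Illustrative caller (not part of the patch): kvm_memslots_raw() exists because the HV real-mode handlers run with the MMU off, on secondary threads the kernel considers offline, so neither RCU lockdep checks nor printk are available there. Later hunks in this pull switch those handlers over; the pattern is simply:

    /* Real-mode gfn-to-memslot lookup: no RCU checking, no tracing. */
    static struct kvm_memory_slot *realmode_gfn_to_memslot(struct kvm *kvm,
                                                           gfn_t gfn)
    {
            return __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
    }
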
index f3a91dc..821725c 100644 (file)
@@ -94,7 +94,7 @@ struct kvmppc_host_state {
        unsigned long xics_phys;
        u32 saved_xirr;
        u64 dabr;
-       u64 host_mmcr[3];
+       u64 host_mmcr[7];       /* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */
        u32 host_pmc[8];
        u64 host_purr;
        u64 host_spurr;
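
Note (not part of the patch): host_mmcr[] grows from 3 to 7 entries so the host PMU save/restore can also preserve SIAR and SDAR, plus MMCR2 and SIER on POWER8. The assembly later in this pull addresses the array as HSTATE_MMCR plus a byte offset; the intended slot layout, written out as a C sketch, is:

    /* Index of each saved SPR inside kvmppc_host_state.host_mmcr[7]. */
    enum {
            HOST_MMCR0 = 0,         /* HSTATE_MMCR +  0 */
            HOST_MMCR1,             /* HSTATE_MMCR +  8 */
            HOST_MMCRA,             /* HSTATE_MMCR + 16 */
            HOST_SIAR,              /* HSTATE_MMCR + 24 */
            HOST_SDAR,              /* HSTATE_MMCR + 32 */
            HOST_MMCR2,             /* HSTATE_MMCR + 40, POWER8 only */
            HOST_SIER,              /* HSTATE_MMCR + 48, POWER8 only */
    };
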
index fcd53f0..4096f16 100644 (file)
@@ -129,6 +129,8 @@ extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
                                struct kvm_create_spapr_tce *args);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
                             unsigned long ioba, unsigned long tce);
+extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+                            unsigned long ioba);
 extern struct kvm_rma_info *kvm_alloc_rma(void);
 extern void kvm_release_rma(struct kvm_rma_info *ri);
 extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
index 1a36b8e..0dcc48a 100644 (file)
 #define SPRN_ACOP      0x1F    /* Available Coprocessor Register */
 #define SPRN_TFIAR     0x81    /* Transaction Failure Inst Addr   */
 #define SPRN_TEXASR    0x82    /* Transaction EXception & Summary */
+#define   TEXASR_FS    __MASK(63-36)   /* Transaction Failure Summary */
 #define SPRN_TEXASRU   0x83    /* ''      ''      ''    Upper 32  */
 #define SPRN_TFHAR     0x80    /* Transaction Failure Handler Addr */
 #define SPRN_CTRLF     0x088
index 0c9f8b7..c22d704 100644 (file)
@@ -7,6 +7,8 @@
 
 #include <uapi/asm/tm.h>
 
+#ifndef __ASSEMBLY__
+
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 extern void do_load_up_transact_fpu(struct thread_struct *thread);
 extern void do_load_up_transact_altivec(struct thread_struct *thread);
@@ -21,3 +23,5 @@ extern void tm_recheckpoint(struct thread_struct *thread,
 extern void tm_abort(uint8_t cause);
 extern void tm_save_sprs(struct thread_struct *thread);
 extern void tm_restore_sprs(struct thread_struct *thread);
+
+#endif /* __ASSEMBLY__ */
index 303ece7..fb25ebc 100644 (file)
@@ -262,7 +262,14 @@ int kvmppc_mmu_hv_init(void)
 
 static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
 {
-       kvmppc_set_msr(vcpu, vcpu->arch.intr_msr);
+       unsigned long msr = vcpu->arch.intr_msr;
+
+       /* If transactional, change to suspend mode on IRQ delivery */
+       if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
+               msr |= MSR_TS_S;
+       else
+               msr |= vcpu->arch.shregs.msr & MSR_TS_MASK;
+       kvmppc_set_msr(vcpu, msr);
 }
 
 /*
index 2c25f54..89e96b3 100644 (file)
@@ -75,3 +75,31 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
        return H_TOO_HARD;
 }
 EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
+
+long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+                     unsigned long ioba)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct kvmppc_spapr_tce_table *stt;
+
+       list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+               if (stt->liobn == liobn) {
+                       unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
+                       struct page *page;
+                       u64 *tbl;
+
+                       if (ioba >= stt->window_size)
+                               return H_PARAMETER;
+
+                       page = stt->pages[idx / TCES_PER_PAGE];
+                       tbl = (u64 *)page_address(page);
+
+                       vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE];
+                       return H_SUCCESS;
+               }
+       }
+
+       /* Didn't find the liobn, punt it to userspace */
+       return H_TOO_HARD;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_get_tce);
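
kvmppc_h_get_tce() is the read-side counterpart of kvmppc_h_put_tce(): the I/O bus address selects a 64-bit TCE, which is returned to the guest in GPR4. A worked example of the index arithmetic, using hypothetical values and assuming SPAPR_TCE_SHIFT == 12 with 512 TCEs per backing page:

    static u64 example_tce_lookup(struct kvmppc_spapr_tce_table *stt)
    {
            unsigned long ioba = 0x00301000;                        /* hypothetical */
            unsigned long idx  = ioba >> SPAPR_TCE_SHIFT;           /* 0x301 == 769 */
            struct page *page  = stt->pages[idx / TCES_PER_PAGE];   /* page 1       */

            return ((u64 *)page_address(page))[idx % TCES_PER_PAGE]; /* slot 257 */
    }
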
index 17fc949..8227dba 100644 (file)
@@ -86,7 +86,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 
        /* CPU points to the first thread of the core */
        if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) {
-#ifdef CONFIG_KVM_XICS
+#ifdef CONFIG_PPC_ICP_NATIVE
                int real_cpu = cpu + vcpu->arch.ptid;
                if (paca[real_cpu].kvm_hstate.xics_phys)
                        xics_wake_cpu(real_cpu);
@@ -879,17 +879,6 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_IAMR:
                *val = get_reg_val(id, vcpu->arch.iamr);
                break;
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-       case KVM_REG_PPC_TFHAR:
-               *val = get_reg_val(id, vcpu->arch.tfhar);
-               break;
-       case KVM_REG_PPC_TFIAR:
-               *val = get_reg_val(id, vcpu->arch.tfiar);
-               break;
-       case KVM_REG_PPC_TEXASR:
-               *val = get_reg_val(id, vcpu->arch.texasr);
-               break;
-#endif
        case KVM_REG_PPC_FSCR:
                *val = get_reg_val(id, vcpu->arch.fscr);
                break;
@@ -970,6 +959,69 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_PPR:
                *val = get_reg_val(id, vcpu->arch.ppr);
                break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       case KVM_REG_PPC_TFHAR:
+               *val = get_reg_val(id, vcpu->arch.tfhar);
+               break;
+       case KVM_REG_PPC_TFIAR:
+               *val = get_reg_val(id, vcpu->arch.tfiar);
+               break;
+       case KVM_REG_PPC_TEXASR:
+               *val = get_reg_val(id, vcpu->arch.texasr);
+               break;
+       case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
+               i = id - KVM_REG_PPC_TM_GPR0;
+               *val = get_reg_val(id, vcpu->arch.gpr_tm[i]);
+               break;
+       case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
+       {
+               int j;
+               i = id - KVM_REG_PPC_TM_VSR0;
+               if (i < 32)
+                       for (j = 0; j < TS_FPRWIDTH; j++)
+                               val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j];
+               else {
+                       if (cpu_has_feature(CPU_FTR_ALTIVEC))
+                               val->vval = vcpu->arch.vr_tm.vr[i-32];
+                       else
+                               r = -ENXIO;
+               }
+               break;
+       }
+       case KVM_REG_PPC_TM_CR:
+               *val = get_reg_val(id, vcpu->arch.cr_tm);
+               break;
+       case KVM_REG_PPC_TM_LR:
+               *val = get_reg_val(id, vcpu->arch.lr_tm);
+               break;
+       case KVM_REG_PPC_TM_CTR:
+               *val = get_reg_val(id, vcpu->arch.ctr_tm);
+               break;
+       case KVM_REG_PPC_TM_FPSCR:
+               *val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
+               break;
+       case KVM_REG_PPC_TM_AMR:
+               *val = get_reg_val(id, vcpu->arch.amr_tm);
+               break;
+       case KVM_REG_PPC_TM_PPR:
+               *val = get_reg_val(id, vcpu->arch.ppr_tm);
+               break;
+       case KVM_REG_PPC_TM_VRSAVE:
+               *val = get_reg_val(id, vcpu->arch.vrsave_tm);
+               break;
+       case KVM_REG_PPC_TM_VSCR:
+               if (cpu_has_feature(CPU_FTR_ALTIVEC))
+                       *val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]);
+               else
+                       r = -ENXIO;
+               break;
+       case KVM_REG_PPC_TM_DSCR:
+               *val = get_reg_val(id, vcpu->arch.dscr_tm);
+               break;
+       case KVM_REG_PPC_TM_TAR:
+               *val = get_reg_val(id, vcpu->arch.tar_tm);
+               break;
+#endif
        case KVM_REG_PPC_ARCH_COMPAT:
                *val = get_reg_val(id, vcpu->arch.vcore->arch_compat);
                break;
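
With the TM registers exposed through one_reg IDs, userspace saves and restores the checkpointed state with the ordinary KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls while the vcpu is stopped. A hedged userspace sketch, assuming the KVM_REG_PPC_TM_* IDs added by this series are visible via linux/kvm.h:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Read the checkpointed GPR3 of a (stopped) vcpu; returns 0 on success. */
    static int get_tm_gpr3(int vcpu_fd, uint64_t *out)
    {
            struct kvm_one_reg reg = {
                    .id   = KVM_REG_PPC_TM_GPR0 + 3,   /* checkpointed r3 */
                    .addr = (uintptr_t)out,
            };

            return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
    }
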
@@ -1039,17 +1091,6 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_IAMR:
                vcpu->arch.iamr = set_reg_val(id, *val);
                break;
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-       case KVM_REG_PPC_TFHAR:
-               vcpu->arch.tfhar = set_reg_val(id, *val);
-               break;
-       case KVM_REG_PPC_TFIAR:
-               vcpu->arch.tfiar = set_reg_val(id, *val);
-               break;
-       case KVM_REG_PPC_TEXASR:
-               vcpu->arch.texasr = set_reg_val(id, *val);
-               break;
-#endif
        case KVM_REG_PPC_FSCR:
                vcpu->arch.fscr = set_reg_val(id, *val);
                break;
@@ -1144,6 +1185,68 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_PPR:
                vcpu->arch.ppr = set_reg_val(id, *val);
                break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       case KVM_REG_PPC_TFHAR:
+               vcpu->arch.tfhar = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TFIAR:
+               vcpu->arch.tfiar = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TEXASR:
+               vcpu->arch.texasr = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
+               i = id - KVM_REG_PPC_TM_GPR0;
+               vcpu->arch.gpr_tm[i] = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
+       {
+               int j;
+               i = id - KVM_REG_PPC_TM_VSR0;
+               if (i < 32)
+                       for (j = 0; j < TS_FPRWIDTH; j++)
+                               vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
+               else
+                       if (cpu_has_feature(CPU_FTR_ALTIVEC))
+                               vcpu->arch.vr_tm.vr[i-32] = val->vval;
+                       else
+                               r = -ENXIO;
+               break;
+       }
+       case KVM_REG_PPC_TM_CR:
+               vcpu->arch.cr_tm = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_LR:
+               vcpu->arch.lr_tm = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_CTR:
+               vcpu->arch.ctr_tm = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_FPSCR:
+               vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_AMR:
+               vcpu->arch.amr_tm = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_PPR:
+               vcpu->arch.ppr_tm = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_VRSAVE:
+               vcpu->arch.vrsave_tm = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_VSCR:
+               if (cpu_has_feature(CPU_FTR_ALTIVEC))
+                       vcpu->arch.vr.vscr.u[3] = set_reg_val(id, *val);
+               else
+                       r = - ENXIO;
+               break;
+       case KVM_REG_PPC_TM_DSCR:
+               vcpu->arch.dscr_tm = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_TAR:
+               vcpu->arch.tar_tm = set_reg_val(id, *val);
+               break;
+#endif
        case KVM_REG_PPC_ARCH_COMPAT:
                r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
                break;
@@ -1360,9 +1463,7 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
        smp_wmb();
 #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
        if (cpu != smp_processor_id()) {
-#ifdef CONFIG_KVM_XICS
                xics_wake_cpu(cpu);
-#endif
                if (vcpu->arch.ptid)
                        ++vc->n_woken;
        }
@@ -1530,7 +1631,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
                vcpu->arch.trap = 0;
 
                if (vcpu->arch.ceded) {
-                       if (ret != RESUME_GUEST)
+                       if (!is_kvmppc_resume_guest(ret))
                                kvmppc_end_cede(vcpu);
                        else
                                kvmppc_set_timer(vcpu);
@@ -1541,7 +1642,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
        vc->vcore_state = VCORE_INACTIVE;
        list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
                                 arch.run_list) {
-               if (vcpu->arch.ret != RESUME_GUEST) {
+               if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
                        kvmppc_remove_runnable(vc, vcpu);
                        wake_up(&vcpu->arch.cpu_run);
                }
@@ -1731,7 +1832,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
                                vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
                        srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
                }
-       } while (r == RESUME_GUEST);
+       } while (is_kvmppc_resume_guest(r));
 
  out:
        vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
@@ -2366,7 +2467,7 @@ static int kvmppc_book3s_init_hv(void)
         */
        r = kvmppc_core_check_processor_compat_hv();
        if (r < 0)
-               return r;
+               return -ENODEV;
 
        kvm_ops_hv.owner = THIS_MODULE;
        kvmppc_hv_ops = &kvm_ops_hv;
index e873796..e18e3cf 100644 (file)
@@ -71,6 +71,14 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        mtmsrd  r10,1
 
        /* Save host PMU registers */
+BEGIN_FTR_SECTION
+       /* Work around P8 PMAE bug */
+       li      r3, -1
+       clrrdi  r3, r3, 10
+       mfspr   r8, SPRN_MMCR2
+       mtspr   SPRN_MMCR2, r3          /* freeze all counters using MMCR2 */
+       isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        li      r3, 1
        sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
        mfspr   r7, SPRN_MMCR0          /* save MMCR0 */
@@ -87,9 +95,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
        cmpwi   r5, 0
        beq     31f                     /* skip if not */
        mfspr   r5, SPRN_MMCR1
+       mfspr   r9, SPRN_SIAR
+       mfspr   r10, SPRN_SDAR
        std     r7, HSTATE_MMCR(r13)
        std     r5, HSTATE_MMCR + 8(r13)
        std     r6, HSTATE_MMCR + 16(r13)
+       std     r9, HSTATE_MMCR + 24(r13)
+       std     r10, HSTATE_MMCR + 32(r13)
+BEGIN_FTR_SECTION
+       mfspr   r9, SPRN_SIER
+       std     r8, HSTATE_MMCR + 40(r13)
+       std     r9, HSTATE_MMCR + 48(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        mfspr   r3, SPRN_PMC1
        mfspr   r5, SPRN_PMC2
        mfspr   r6, SPRN_PMC3
@@ -110,6 +127,11 @@ BEGIN_FTR_SECTION
        stw     r10, HSTATE_PMC + 24(r13)
        stw     r11, HSTATE_PMC + 28(r13)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+BEGIN_FTR_SECTION
+       mfspr   r9, SPRN_SIER
+       std     r8, HSTATE_MMCR + 40(r13)
+       std     r9, HSTATE_MMCR + 48(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 31:
 
        /*
index 37fb3ca..1d6c56a 100644 (file)
@@ -111,7 +111,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
        rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
        ptel = rev->guest_rpte |= rcbits;
        gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
-       memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
+       memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
        if (!memslot)
                return;
 
@@ -192,7 +192,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
        /* Find the memslot (if any) for this address */
        gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
        gfn = gpa >> PAGE_SHIFT;
-       memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
+       memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
        pa = 0;
        is_io = ~0ul;
        rmap = NULL;
@@ -670,7 +670,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 
                        psize = hpte_page_size(v, r);
                        gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
-                       memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
+                       memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
                        if (memslot) {
                                hva = __gfn_to_hva_memslot(memslot, gfn);
                                pte = lookup_linux_pte_and_update(pgdir, hva,
index 53d647f..ffbb871 100644 (file)
@@ -28,6 +28,9 @@
 #include <asm/exception-64s.h>
 #include <asm/kvm_book3s_asm.h>
 #include <asm/mmu-hash64.h>
+#include <asm/tm.h>
+
+#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
 
 #ifdef __LITTLE_ENDIAN__
 #error Need to fix lppaca and SLB shadow accesses in little endian mode
@@ -106,8 +109,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
        ld      r3, HSTATE_MMCR(r13)
        ld      r4, HSTATE_MMCR + 8(r13)
        ld      r5, HSTATE_MMCR + 16(r13)
+       ld      r6, HSTATE_MMCR + 24(r13)
+       ld      r7, HSTATE_MMCR + 32(r13)
        mtspr   SPRN_MMCR1, r4
        mtspr   SPRN_MMCRA, r5
+       mtspr   SPRN_SIAR, r6
+       mtspr   SPRN_SDAR, r7
+BEGIN_FTR_SECTION
+       ld      r8, HSTATE_MMCR + 40(r13)
+       ld      r9, HSTATE_MMCR + 48(r13)
+       mtspr   SPRN_MMCR2, r8
+       mtspr   SPRN_SIER, r9
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        mtspr   SPRN_MMCR0, r3
        isync
 23:
@@ -597,6 +610,116 @@ BEGIN_FTR_SECTION
  END_FTR_SECTION_NESTED(CPU_FTR_ARCH_206, CPU_FTR_ARCH_206, 89)
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+       b       skip_tm
+END_FTR_SECTION_IFCLR(CPU_FTR_TM)
+
+       /* Turn on TM/FP/VSX/VMX so we can restore them. */
+       mfmsr   r5
+       li      r6, MSR_TM >> 32
+       sldi    r6, r6, 32
+       or      r5, r5, r6
+       ori     r5, r5, MSR_FP
+       oris    r5, r5, (MSR_VEC | MSR_VSX)@h
+       mtmsrd  r5
+
+       /*
+        * The user may change these outside of a transaction, so they must
+        * always be context switched.
+        */
+       ld      r5, VCPU_TFHAR(r4)
+       ld      r6, VCPU_TFIAR(r4)
+       ld      r7, VCPU_TEXASR(r4)
+       mtspr   SPRN_TFHAR, r5
+       mtspr   SPRN_TFIAR, r6
+       mtspr   SPRN_TEXASR, r7
+
+       ld      r5, VCPU_MSR(r4)
+       rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+       beq     skip_tm /* TM not active in guest */
+
+       /* Make sure the failure summary is set, otherwise we'll program check
+        * when we trechkpt.  It's possible that this might have been not set
+        * on a kvmppc_set_one_reg() call but we shouldn't let this crash the
+        * host.
+        */
+       oris    r7, r7, (TEXASR_FS)@h
+       mtspr   SPRN_TEXASR, r7
+
+       /*
+        * We need to load up the checkpointed state for the guest.
+        * We need to do this early as it will blow away any GPRs, VSRs and
+        * some SPRs.
+        */
+
+       mr      r31, r4
+       addi    r3, r31, VCPU_FPRS_TM
+       bl      .load_fp_state
+       addi    r3, r31, VCPU_VRS_TM
+       bl      .load_vr_state
+       mr      r4, r31
+       lwz     r7, VCPU_VRSAVE_TM(r4)
+       mtspr   SPRN_VRSAVE, r7
+
+       ld      r5, VCPU_LR_TM(r4)
+       lwz     r6, VCPU_CR_TM(r4)
+       ld      r7, VCPU_CTR_TM(r4)
+       ld      r8, VCPU_AMR_TM(r4)
+       ld      r9, VCPU_TAR_TM(r4)
+       mtlr    r5
+       mtcr    r6
+       mtctr   r7
+       mtspr   SPRN_AMR, r8
+       mtspr   SPRN_TAR, r9
+
+       /*
+        * Load up PPR and DSCR values but don't put them in the actual SPRs
+        * till the last moment to avoid running with userspace PPR and DSCR for
+        * too long.
+        */
+       ld      r29, VCPU_DSCR_TM(r4)
+       ld      r30, VCPU_PPR_TM(r4)
+
+       std     r2, PACATMSCRATCH(r13) /* Save TOC */
+
+       /* Clear the MSR RI since r1, r13 are all going to be foobar. */
+       li      r5, 0
+       mtmsrd  r5, 1
+
+       /* Load GPRs r0-r28 */
+       reg = 0
+       .rept   29
+       ld      reg, VCPU_GPRS_TM(reg)(r31)
+       reg = reg + 1
+       .endr
+
+       mtspr   SPRN_DSCR, r29
+       mtspr   SPRN_PPR, r30
+
+       /* Load final GPRs */
+       ld      29, VCPU_GPRS_TM(29)(r31)
+       ld      30, VCPU_GPRS_TM(30)(r31)
+       ld      31, VCPU_GPRS_TM(31)(r31)
+
+       /* TM checkpointed state is now setup.  All GPRs are now volatile. */
+       TRECHKPT
+
+       /* Now let's get back the state we need. */
+       HMT_MEDIUM
+       GET_PACA(r13)
+       ld      r29, HSTATE_DSCR(r13)
+       mtspr   SPRN_DSCR, r29
+       ld      r4, HSTATE_KVM_VCPU(r13)
+       ld      r1, HSTATE_HOST_R1(r13)
+       ld      r2, PACATMSCRATCH(r13)
+
+       /* Set the MSR RI since we have our registers back. */
+       li      r5, MSR_RI
+       mtmsrd  r5, 1
+skip_tm:
+#endif
+
        /* Load guest PMU registers */
        /* R4 is live here (vcpu pointer) */
        li      r3, 1
@@ -704,14 +827,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        ld      r6, VCPU_VTB(r4)
        mtspr   SPRN_IC, r5
        mtspr   SPRN_VTB, r6
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-       ld      r5, VCPU_TFHAR(r4)
-       ld      r6, VCPU_TFIAR(r4)
-       ld      r7, VCPU_TEXASR(r4)
-       mtspr   SPRN_TFHAR, r5
-       mtspr   SPRN_TFIAR, r6
-       mtspr   SPRN_TEXASR, r7
-#endif
        ld      r8, VCPU_EBBHR(r4)
        mtspr   SPRN_EBBHR, r8
        ld      r5, VCPU_EBBRR(r4)
@@ -736,6 +851,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
         * Set the decrementer to the guest decrementer.
         */
        ld      r8,VCPU_DEC_EXPIRES(r4)
+       /* r8 is a host timebase value here, convert to guest TB */
+       ld      r5,HSTATE_KVM_VCORE(r13)
+       ld      r6,VCORE_TB_OFFSET(r5)
+       add     r8,r8,r6
        mftb    r7
        subf    r3,r7,r8
        mtspr   SPRN_DEC,r3
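
The added lines keep VCPU_DEC_EXPIRES in host timebase units and convert to guest timebase at this point, where mftb already reflects the vcore's timebase offset; the matching exit-side hunk below converts back. Restated as a hedged C sketch (not part of the patch):

    /* Guest entry: the vcore TB offset has already been applied to the timebase. */
    static void program_guest_dec(struct kvm_vcpu *vcpu)
    {
            u64 tb_offset = vcpu->arch.vcore->tb_offset;
            u64 guest_exp = vcpu->arch.dec_expires + tb_offset; /* host -> guest TB */

            mtspr(SPRN_DEC, guest_exp - mftb());    /* mftb() is guest TB here */
    }

    /* Guest exit: convert the expiry back to host TB before storing it. */
    static void save_guest_dec(struct kvm_vcpu *vcpu, u64 guest_exp)
    {
            vcpu->arch.dec_expires = guest_exp - vcpu->arch.vcore->tb_offset;
    }
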
@@ -817,7 +936,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 12:    mtspr   SPRN_SRR0, r10
        mr      r10,r0
        mtspr   SPRN_SRR1, r11
-       ld      r11, VCPU_INTR_MSR(r4)
+       mr      r9, r4
+       bl      kvmppc_msr_interrupt
 5:
 
 /*
@@ -1098,17 +1218,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201)
        mftb    r6
        extsw   r5,r5
        add     r5,r5,r6
+       /* r5 is a guest timebase value here, convert to host TB */
+       ld      r3,HSTATE_KVM_VCORE(r13)
+       ld      r4,VCORE_TB_OFFSET(r3)
+       subf    r5,r4,r5
        std     r5,VCPU_DEC_EXPIRES(r9)
 
 BEGIN_FTR_SECTION
        b       8f
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
-       /* Turn on TM so we can access TFHAR/TFIAR/TEXASR */
-       mfmsr   r8
-       li      r0, 1
-       rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
-       mtmsrd  r8
-
        /* Save POWER8-specific registers */
        mfspr   r5, SPRN_IAMR
        mfspr   r6, SPRN_PSPB
@@ -1122,14 +1240,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        std     r5, VCPU_IC(r9)
        std     r6, VCPU_VTB(r9)
        std     r7, VCPU_TAR(r9)
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-       mfspr   r5, SPRN_TFHAR
-       mfspr   r6, SPRN_TFIAR
-       mfspr   r7, SPRN_TEXASR
-       std     r5, VCPU_TFHAR(r9)
-       std     r6, VCPU_TFIAR(r9)
-       std     r7, VCPU_TEXASR(r9)
-#endif
        mfspr   r8, SPRN_EBBHR
        std     r8, VCPU_EBBHR(r9)
        mfspr   r5, SPRN_EBBRR
@@ -1387,7 +1497,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        ld      r8,VCORE_TB_OFFSET(r5)
        cmpdi   r8,0
        beq     17f
-       mftb    r6                      /* current host timebase */
+       mftb    r6                      /* current guest timebase */
        subf    r8,r8,r6
        mtspr   SPRN_TBU40,r8           /* update upper 40 bits */
        mftb    r7                      /* check if lower 24 bits overflowed */
@@ -1557,7 +1667,7 @@ kvmppc_hdsi:
        mtspr   SPRN_SRR0, r10
        mtspr   SPRN_SRR1, r11
        li      r10, BOOK3S_INTERRUPT_DATA_STORAGE
-       ld      r11, VCPU_INTR_MSR(r9)
+       bl      kvmppc_msr_interrupt
 fast_interrupt_c_return:
 6:     ld      r7, VCPU_CTR(r9)
        lwz     r8, VCPU_XER(r9)
@@ -1626,7 +1736,7 @@ kvmppc_hisi:
 1:     mtspr   SPRN_SRR0, r10
        mtspr   SPRN_SRR1, r11
        li      r10, BOOK3S_INTERRUPT_INST_STORAGE
-       ld      r11, VCPU_INTR_MSR(r9)
+       bl      kvmppc_msr_interrupt
        b       fast_interrupt_c_return
 
 3:     ld      r6, VCPU_KVM(r9)        /* not relocated, use VRMA */
@@ -1669,7 +1779,7 @@ sc_1_fast_return:
        mtspr   SPRN_SRR0,r10
        mtspr   SPRN_SRR1,r11
        li      r10, BOOK3S_INTERRUPT_SYSCALL
-       ld      r11, VCPU_INTR_MSR(r9)
+       bl      kvmppc_msr_interrupt
        mr      r4,r9
        b       fast_guest_return
 
@@ -1691,7 +1801,7 @@ hcall_real_table:
        .long   0               /* 0x10 - H_CLEAR_MOD */
        .long   0               /* 0x14 - H_CLEAR_REF */
        .long   .kvmppc_h_protect - hcall_real_table
-       .long   0               /* 0x1c - H_GET_TCE */
+       .long   .kvmppc_h_get_tce - hcall_real_table
        .long   .kvmppc_h_put_tce - hcall_real_table
        .long   0               /* 0x24 - H_SET_SPRG0 */
        .long   .kvmppc_h_set_dabr - hcall_real_table
@@ -1997,7 +2107,7 @@ machine_check_realmode:
        beq     mc_cont
        /* If not, deliver a machine check.  SRR0/1 are already set */
        li      r10, BOOK3S_INTERRUPT_MACHINE_CHECK
-       ld      r11, VCPU_INTR_MSR(r9)
+       bl      kvmppc_msr_interrupt
        b       fast_interrupt_c_return
 
 /*
@@ -2138,8 +2248,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
        mfspr   r6,SPRN_VRSAVE
        stw     r6,VCPU_VRSAVE(r31)
        mtlr    r30
-       mtmsrd  r5
-       isync
        blr
 
 /*
@@ -2186,3 +2294,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
  */
 kvmppc_bad_host_intr:
        b       .
+
+/*
+ * This mimics the MSR transition on IRQ delivery.  The new guest MSR is taken
+ * from VCPU_INTR_MSR and is modified based on the required TM state changes.
+ *   r11 has the guest MSR value (in/out)
+ *   r9 has a vcpu pointer (in)
+ *   r0 is used as a scratch register
+ */
+kvmppc_msr_interrupt:
+       rldicl  r0, r11, 64 - MSR_TS_S_LG, 62
+       cmpwi   r0, 2 /* Check if we are in transactional state..  */
+       ld      r11, VCPU_INTR_MSR(r9)
+       bne     1f
+       /* ... if transactional, change to suspended */
+       li      r0, 1
+1:     rldimi  r11, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+       blr
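
kvmppc_msr_interrupt is the assembly twin of the C change to kvmppc_mmu_book3s_64_hv_reset_msr earlier in this pull: the delivered MSR comes from vcpu->arch.intr_msr, and if the guest was in transactional state the interrupt leaves it suspended, otherwise the current TS bits are carried over. As a C sketch (not part of the patch):

    static unsigned long msr_for_guest_interrupt(struct kvm_vcpu *vcpu)
    {
            unsigned long msr = vcpu->arch.intr_msr;

            if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
                    msr |= MSR_TS_S;                             /* force suspended */
            else
                    msr |= vcpu->arch.shregs.msr & MSR_TS_MASK;  /* keep TS as-is   */
            return msr;
    }
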
index cf95cde..7a05315 100644 (file)
@@ -213,8 +213,11 @@ int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu)
        gpa_t args_phys;
        int rc;
 
-       /* r4 contains the guest physical address of the RTAS args */
-       args_phys = kvmppc_get_gpr(vcpu, 4);
+       /*
+        * r4 contains the guest physical address of the RTAS args
+        * Mask off the top 4 bits since this is a guest real address
+        */
+       args_phys = kvmppc_get_gpr(vcpu, 4) & KVM_PAM;
 
        rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args));
        if (rc)
index 5f8bcc5..35f0faa 100644 (file)
@@ -53,6 +53,7 @@ enum interruption_class {
        IRQIO_PCI,
        IRQIO_MSI,
        IRQIO_VIR,
+       IRQIO_VAI,
        NMI_NMI,
        CPU_RST,
        NR_ARCH_IRQS
index 9bf95bb..154b600 100644 (file)
 #include <linux/hrtimer.h>
 #include <linux/interrupt.h>
 #include <linux/kvm_host.h>
+#include <linux/kvm.h>
 #include <asm/debug.h>
 #include <asm/cpu.h>
+#include <asm/isc.h>
 
 #define KVM_MAX_VCPUS 64
 #define KVM_USER_MEM_SLOTS 32
 
+/*
+ * These seem to be used for allocating ->chip in the routing table,
+ * which we don't use. 4096 is an out-of-thin-air value. If we need
+ * to look at ->chip later on, we'll need to revisit this.
+ */
+#define KVM_NR_IRQCHIPS 1
+#define KVM_IRQCHIP_NUM_PINS 4096
+
 struct sca_entry {
        atomic_t scn;
        __u32   reserved;
@@ -108,7 +118,9 @@ struct kvm_s390_sie_block {
        __u32   fac;                    /* 0x01a0 */
        __u8    reserved1a4[20];        /* 0x01a4 */
        __u64   cbrlo;                  /* 0x01b8 */
-       __u8    reserved1c0[40];        /* 0x01c0 */
+       __u8    reserved1c0[30];        /* 0x01c0 */
+       __u64   pp;                     /* 0x01de */
+       __u8    reserved1e6[2];         /* 0x01e6 */
        __u64   itdba;                  /* 0x01e8 */
        __u8    reserved1f0[16];        /* 0x01f0 */
 } __attribute__((packed));
@@ -171,18 +183,6 @@ struct kvm_vcpu_stat {
        u32 diagnose_9c;
 };
 
-struct kvm_s390_io_info {
-       __u16        subchannel_id;            /* 0x0b8 */
-       __u16        subchannel_nr;            /* 0x0ba */
-       __u32        io_int_parm;              /* 0x0bc */
-       __u32        io_int_word;              /* 0x0c0 */
-};
-
-struct kvm_s390_ext_info {
-       __u32 ext_params;
-       __u64 ext_params2;
-};
-
 #define PGM_OPERATION            0x01
 #define PGM_PRIVILEGED_OP       0x02
 #define PGM_EXECUTE              0x03
@@ -191,27 +191,6 @@ struct kvm_s390_ext_info {
 #define PGM_SPECIFICATION        0x06
 #define PGM_DATA                 0x07
 
-struct kvm_s390_pgm_info {
-       __u16 code;
-};
-
-struct kvm_s390_prefix_info {
-       __u32 address;
-};
-
-struct kvm_s390_extcall_info {
-       __u16 code;
-};
-
-struct kvm_s390_emerg_info {
-       __u16 code;
-};
-
-struct kvm_s390_mchk_info {
-       __u64 cr14;
-       __u64 mcic;
-};
-
 struct kvm_s390_interrupt_info {
        struct list_head list;
        u64     type;
@@ -246,9 +225,8 @@ struct kvm_s390_float_interrupt {
        struct list_head list;
        atomic_t active;
        int next_rr_cpu;
-       unsigned long idle_mask[(KVM_MAX_VCPUS + sizeof(long) - 1)
-                               / sizeof(long)];
-       struct kvm_s390_local_interrupt *local_int[KVM_MAX_VCPUS];
+       unsigned long idle_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+       unsigned int irq_count;
 };
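
The idle_mask change: the old expression divided the vcpu count by sizeof(long), i.e. by bytes per long rather than bits per long, so the bitmap came out eight times larger than needed on 64-bit s390. BITS_TO_LONGS() is the standard way to size a bitmap; for example:

    #include <linux/bitops.h>

    /* One bit per vcpu; with KVM_MAX_VCPUS == 64 this is a single long. */
    unsigned long idle_mask[BITS_TO_LONGS(64)];     /* 1 long (64 bits) */
    /* Old formula: (64 + sizeof(long) - 1) / sizeof(long) == 8 longs (512 bits). */
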
 
 
@@ -265,6 +243,10 @@ struct kvm_vcpu_arch {
                u64             stidp_data;
        };
        struct gmap *gmap;
+#define KVM_S390_PFAULT_TOKEN_INVALID  (-1UL)
+       unsigned long pfault_token;
+       unsigned long pfault_select;
+       unsigned long pfault_compare;
 };
 
 struct kvm_vm_stat {
@@ -274,12 +256,36 @@ struct kvm_vm_stat {
 struct kvm_arch_memory_slot {
 };
 
+struct s390_map_info {
+       struct list_head list;
+       __u64 guest_addr;
+       __u64 addr;
+       struct page *page;
+};
+
+struct s390_io_adapter {
+       unsigned int id;
+       int isc;
+       bool maskable;
+       bool masked;
+       bool swap;
+       struct rw_semaphore maps_lock;
+       struct list_head maps;
+       atomic_t nr_maps;
+};
+
+#define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8)
+#define MAX_S390_ADAPTER_MAPS 256
+
 struct kvm_arch{
        struct sca_block *sca;
        debug_info_t *dbf;
        struct kvm_s390_float_interrupt float_int;
+       struct kvm_device *flic;
        struct gmap *gmap;
        int css_support;
+       int use_irqchip;
+       struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
 };
 
 #define KVM_HVA_ERR_BAD                (-1UL)
@@ -290,6 +296,24 @@ static inline bool kvm_is_error_hva(unsigned long addr)
        return IS_ERR_VALUE(addr);
 }
 
+#define ASYNC_PF_PER_VCPU      64
+struct kvm_vcpu;
+struct kvm_async_pf;
+struct kvm_arch_async_pf {
+       unsigned long pfault_token;
+};
+
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
+
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
+                              struct kvm_async_pf *work);
+
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+                                    struct kvm_async_pf *work);
+
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+                                struct kvm_async_pf *work);
+
 extern int sie64a(struct kvm_s390_sie_block *, u64 *);
 extern char sie_exit;
 #endif
index 1ab75ea..50a75d9 100644 (file)
@@ -782,6 +782,7 @@ static inline void pgste_set_pte(pte_t *ptep, pte_t entry)
  * @table: pointer to the page directory
  * @asce: address space control element for gmap page table
  * @crst_list: list of all crst tables used in the guest address space
+ * @pfault_enabled: defines if pfaults are applicable for the guest
  */
 struct gmap {
        struct list_head list;
@@ -790,6 +791,7 @@ struct gmap {
        unsigned long asce;
        void *private;
        struct list_head crst_list;
+       bool pfault_enabled;
 };
 
 /**
index 0a876bc..dc5fc4f 100644 (file)
@@ -79,6 +79,7 @@ struct thread_struct {
         unsigned long ksp;              /* kernel stack pointer             */
        mm_segment_t mm_segment;
        unsigned long gmap_addr;        /* address of last gmap fault. */
+       unsigned int gmap_pfault;       /* signal of a pending guest pfault */
        struct per_regs per_user;       /* User specified PER registers */
        struct per_event per_event;     /* Cause of the last PER trap */
        unsigned long per_flags;        /* Flags to control debug behavior */
index d25da59..c003c6a 100644 (file)
 
 #define __KVM_S390
 
+/* Device control API: s390-specific devices */
+#define KVM_DEV_FLIC_GET_ALL_IRQS      1
+#define KVM_DEV_FLIC_ENQUEUE           2
+#define KVM_DEV_FLIC_CLEAR_IRQS                3
+#define KVM_DEV_FLIC_APF_ENABLE                4
+#define KVM_DEV_FLIC_APF_DISABLE_WAIT  5
+#define KVM_DEV_FLIC_ADAPTER_REGISTER  6
+#define KVM_DEV_FLIC_ADAPTER_MODIFY    7
+/*
+ * We can have up to 4*64k pending subchannels + 8 adapter interrupts,
+ * as well as up  to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts.
+ * There are also sclp and machine checks. This gives us
+ * sizeof(kvm_s390_irq)*(4*65536+8+64*64+1+1) = 72 * 266250 = 19170000
+ * Lets round up to 8192 pages.
+ */
+#define KVM_S390_MAX_FLOAT_IRQS        266250
+#define KVM_S390_FLIC_MAX_BUFFER       0x2000000
+
+struct kvm_s390_io_adapter {
+       __u32 id;
+       __u8 isc;
+       __u8 maskable;
+       __u8 swap;
+       __u8 pad;
+};
+
+#define KVM_S390_IO_ADAPTER_MASK 1
+#define KVM_S390_IO_ADAPTER_MAP 2
+#define KVM_S390_IO_ADAPTER_UNMAP 3
+
+struct kvm_s390_io_adapter_req {
+       __u32 id;
+       __u8 type;
+       __u8 mask;
+       __u16 pad0;
+       __u64 addr;
+};
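
These uapi additions back the new s390 FLIC ("floating interrupt controller") device documented in the s390_flic.txt file added by this merge. Userspace creates it with KVM_CREATE_DEVICE and then drives it via device attributes whose group is one of the KVM_DEV_FLIC_* values above. A hedged sketch; KVM_DEV_TYPE_FLIC and the exact attribute semantics are assumed from that documentation:

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Create the FLIC for a VM and enable async page-fault handling on it. */
    static int setup_flic(int vm_fd)
    {
            struct kvm_create_device cd   = { .type  = KVM_DEV_TYPE_FLIC };
            struct kvm_device_attr   attr = { .group = KVM_DEV_FLIC_APF_ENABLE };

            if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
                    return -1;
            return ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);
    }
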
+
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
        /* general purpose regs for s390 */
@@ -57,4 +95,9 @@ struct kvm_sync_regs {
 #define KVM_REG_S390_EPOCHDIFF (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x2)
 #define KVM_REG_S390_CPU_TIMER  (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x3)
 #define KVM_REG_S390_CLOCK_COMP (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x4)
+#define KVM_REG_S390_PFTOKEN   (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x5)
+#define KVM_REG_S390_PFCOMPARE (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x6)
+#define KVM_REG_S390_PFSELECT  (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x7)
+#define KVM_REG_S390_PP                (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x8)
+#define KVM_REG_S390_GBEA      (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x9)
 #endif
index a770be9..d42b14c 100644 (file)
@@ -85,6 +85,7 @@ static const struct irq_class irqclass_sub_desc[NR_ARCH_IRQS] = {
        [IRQIO_PCI]  = {.name = "PCI", .desc = "[I/O] PCI Interrupt" },
        [IRQIO_MSI]  = {.name = "MSI", .desc = "[I/O] MSI Interrupt" },
        [IRQIO_VIR]  = {.name = "VIR", .desc = "[I/O] Virtual I/O Devices"},
+       [IRQIO_VAI]  = {.name = "VAI", .desc = "[I/O] Virtual I/O Devices AI"},
        [NMI_NMI]    = {.name = "NMI", .desc = "[NMI] Machine Check"},
        [CPU_RST]    = {.name = "RST", .desc = "[CPU] CPU Restart"},
 };
index 70b46ea..10d529a 100644 (file)
@@ -23,6 +23,10 @@ config KVM
        select ANON_INODES
        select HAVE_KVM_CPU_RELAX_INTERCEPT
        select HAVE_KVM_EVENTFD
+       select KVM_ASYNC_PF
+       select KVM_ASYNC_PF_SYNC
+       select HAVE_KVM_IRQCHIP
+       select HAVE_KVM_IRQ_ROUTING
        ---help---
          Support hosting paravirtualized guest machines using the SIE
          virtualization capability on the mainframe. This should work
index 40b4c64..d3adb37 100644 (file)
@@ -7,7 +7,7 @@
 # as published by the Free Software Foundation.
 
 KVM := ../../../virt/kvm
-common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o
+common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o $(KVM)/irqchip.o
 
 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
index 6f9cfa5..03a05ff 100644 (file)
@@ -18,6 +18,7 @@
 #include "kvm-s390.h"
 #include "trace.h"
 #include "trace-s390.h"
+#include "gaccess.h"
 
 static int diag_release_pages(struct kvm_vcpu *vcpu)
 {
@@ -47,6 +48,87 @@ static int diag_release_pages(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
+{
+       struct prs_parm {
+               u16 code;
+               u16 subcode;
+               u16 parm_len;
+               u16 parm_version;
+               u64 token_addr;
+               u64 select_mask;
+               u64 compare_mask;
+               u64 zarch;
+       };
+       struct prs_parm parm;
+       int rc;
+       u16 rx = (vcpu->arch.sie_block->ipa & 0xf0) >> 4;
+       u16 ry = (vcpu->arch.sie_block->ipa & 0x0f);
+       unsigned long hva_token = KVM_HVA_ERR_BAD;
+
+       if (vcpu->run->s.regs.gprs[rx] & 7)
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+       if (copy_from_guest(vcpu, &parm, vcpu->run->s.regs.gprs[rx], sizeof(parm)))
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       if (parm.parm_version != 2 || parm.parm_len < 5 || parm.code != 0x258)
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+       switch (parm.subcode) {
+       case 0: /* TOKEN */
+               if (vcpu->arch.pfault_token != KVM_S390_PFAULT_TOKEN_INVALID) {
+                       /*
+                        * If the pagefault handshake is already activated,
+                        * the token must not be changed.  We have to return
+                        * decimal 8 instead, as mandated in SC24-6084.
+                        */
+                       vcpu->run->s.regs.gprs[ry] = 8;
+                       return 0;
+               }
+
+               if ((parm.compare_mask & parm.select_mask) != parm.compare_mask ||
+                   parm.token_addr & 7 || parm.zarch != 0x8000000000000000ULL)
+                       return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+               hva_token = gfn_to_hva(vcpu->kvm, gpa_to_gfn(parm.token_addr));
+               if (kvm_is_error_hva(hva_token))
+                       return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+               vcpu->arch.pfault_token = parm.token_addr;
+               vcpu->arch.pfault_select = parm.select_mask;
+               vcpu->arch.pfault_compare = parm.compare_mask;
+               vcpu->run->s.regs.gprs[ry] = 0;
+               rc = 0;
+               break;
+       case 1: /*
+                * CANCEL
+                * The specification allows already-pending tokens to survive
+                * the cancel, so to reduce code complexity we assume all
+                * outstanding tokens are already pending.
+                */
+               if (parm.token_addr || parm.select_mask ||
+                   parm.compare_mask || parm.zarch)
+                       return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+               vcpu->run->s.regs.gprs[ry] = 0;
+               /*
+                * If pfault handling was not established or is already
+                * canceled, SC24-6084 requires that decimal 4 be returned.
+                */
+               if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
+                       vcpu->run->s.regs.gprs[ry] = 4;
+               else
+                       vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
+
+               rc = 0;
+               break;
+       default:
+               rc = -EOPNOTSUPP;
+               break;
+       }
+
+       return rc;
+}
+
 static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
 {
        VCPU_EVENT(vcpu, 5, "%s", "diag time slice end");
@@ -153,6 +235,8 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
                return __diag_time_slice_end(vcpu);
        case 0x9c:
                return __diag_time_slice_end_directed(vcpu);
+       case 0x258:
+               return __diag_page_ref_service(vcpu);
        case 0x308:
                return __diag_ipl_functions(vcpu);
        case 0x500:
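
For reference, the parameter block parsed by __diag_page_ref_service() above has the following guest-side shape. This is a hedged sketch only: the struct and helper names are invented for illustration, and the real guest implementation additionally installs an exception-table entry around the diagnose, which is omitted here.

#include <linux/types.h>

/* Constraints enforced by the handler above: code 0x258, version 2, a
 * length of at least 5 doublewords, an 8-byte-aligned block and token
 * address, compare_mask a subset of select_mask, and the zarch bit set.
 */
struct pfault_refbk {
	__u16 code;		/* 0x258 */
	__u16 subcode;		/* 0 = TOKEN (establish), 1 = CANCEL */
	__u16 parm_len;		/* block length in doublewords, >= 5 */
	__u16 parm_version;	/* must be 2 */
	__u64 token_addr;	/* guest address of the token, 8-byte aligned */
	__u64 select_mask;	/* PSW mask bits that select pfault delivery */
	__u64 compare_mask;	/* must be a subset of select_mask */
	__u64 zarch;		/* 0x8000000000000000ULL */
} __attribute__((aligned(8)));

/* DIAG rx,ry,0x258: rx carries the block address, ry returns the code
 * (for subcode TOKEN: 0 = established, 8 = handshake already active).
 */
static long pfault_token_establish(struct pfault_refbk *refbk)
{
	long rc = -1;

	asm volatile("diag %1,%0,0x258"
		     : "+d" (rc)
		     : "a" (refbk), "m" (*refbk)
		     : "cc");
	return rc;
}
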
index 5f79d2d..200a8f9 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * handling kvm guest interrupts
  *
- * Copyright IBM Corp. 2008
+ * Copyright IBM Corp. 2008,2014
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License (version 2 only)
@@ -13,6 +13,7 @@
 #include <linux/interrupt.h>
 #include <linux/kvm_host.h>
 #include <linux/hrtimer.h>
+#include <linux/mmu_context.h>
 #include <linux/signal.h>
 #include <linux/slab.h>
 #include <asm/asm-offsets.h>
@@ -31,7 +32,7 @@ static int is_ioint(u64 type)
        return ((type & 0xfffe0000u) != 0xfffe0000u);
 }
 
-static int psw_extint_disabled(struct kvm_vcpu *vcpu)
+int psw_extint_disabled(struct kvm_vcpu *vcpu)
 {
        return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT);
 }
@@ -78,11 +79,8 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
                        return 1;
                return 0;
        case KVM_S390_INT_SERVICE:
-               if (psw_extint_disabled(vcpu))
-                       return 0;
-               if (vcpu->arch.sie_block->gcr[0] & 0x200ul)
-                       return 1;
-               return 0;
+       case KVM_S390_INT_PFAULT_INIT:
+       case KVM_S390_INT_PFAULT_DONE:
        case KVM_S390_INT_VIRTIO:
                if (psw_extint_disabled(vcpu))
                        return 0;
@@ -117,14 +115,12 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
 
 static void __set_cpu_idle(struct kvm_vcpu *vcpu)
 {
-       BUG_ON(vcpu->vcpu_id > KVM_MAX_VCPUS - 1);
        atomic_set_mask(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
        set_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
 }
 
 static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
 {
-       BUG_ON(vcpu->vcpu_id > KVM_MAX_VCPUS - 1);
        atomic_clear_mask(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
        clear_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
 }
@@ -150,6 +146,8 @@ static void __set_intercept_indicator(struct kvm_vcpu *vcpu,
        case KVM_S390_INT_EXTERNAL_CALL:
        case KVM_S390_INT_EMERGENCY:
        case KVM_S390_INT_SERVICE:
+       case KVM_S390_INT_PFAULT_INIT:
+       case KVM_S390_INT_PFAULT_DONE:
        case KVM_S390_INT_VIRTIO:
                if (psw_extint_disabled(vcpu))
                        __set_cpuflag(vcpu, CPUSTAT_EXT_INT);
@@ -223,6 +221,30 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                rc |= put_guest(vcpu, inti->ext.ext_params,
                                (u32 __user *)__LC_EXT_PARAMS);
                break;
+       case KVM_S390_INT_PFAULT_INIT:
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 0,
+                                                inti->ext.ext_params2);
+               rc  = put_guest(vcpu, 0x2603, (u16 __user *) __LC_EXT_INT_CODE);
+               rc |= put_guest(vcpu, 0x0600, (u16 __user *) __LC_EXT_CPU_ADDR);
+               rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+                                   &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                                     __LC_EXT_NEW_PSW, sizeof(psw_t));
+               rc |= put_guest(vcpu, inti->ext.ext_params2,
+                               (u64 __user *) __LC_EXT_PARAMS2);
+               break;
+       case KVM_S390_INT_PFAULT_DONE:
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 0,
+                                                inti->ext.ext_params2);
+               rc  = put_guest(vcpu, 0x2603, (u16 __user *) __LC_EXT_INT_CODE);
+               rc |= put_guest(vcpu, 0x0680, (u16 __user *) __LC_EXT_CPU_ADDR);
+               rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+                                   &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                                     __LC_EXT_NEW_PSW, sizeof(psw_t));
+               rc |= put_guest(vcpu, inti->ext.ext_params2,
+                               (u64 __user *) __LC_EXT_PARAMS2);
+               break;
        case KVM_S390_INT_VIRTIO:
                VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx",
                           inti->ext.ext_params, inti->ext.ext_params2);
@@ -357,7 +379,7 @@ static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu)
        return 1;
 }
 
-static int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
+int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
 {
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
        struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int;
@@ -482,11 +504,26 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
        struct kvm_vcpu *vcpu;
 
        vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
+       vcpu->preempted = true;
        tasklet_schedule(&vcpu->arch.tasklet);
 
        return HRTIMER_NORESTART;
 }
 
+void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu)
+{
+       struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+       struct kvm_s390_interrupt_info  *n, *inti = NULL;
+
+       spin_lock_bh(&li->lock);
+       list_for_each_entry_safe(inti, n, &li->list, list) {
+               list_del(&inti->list);
+               kfree(inti);
+       }
+       atomic_set(&li->active, 0);
+       spin_unlock_bh(&li->lock);
+}
+
 void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
 {
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -528,6 +565,7 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
                        list_for_each_entry_safe(inti, n, &fi->list, list) {
                                if (__interrupt_is_deliverable(vcpu, inti)) {
                                        list_del(&inti->list);
+                                       fi->irq_count--;
                                        deliver = 1;
                                        break;
                                }
@@ -583,6 +621,7 @@ void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu)
                                if ((inti->type == KVM_S390_MCHK) &&
                                    __interrupt_is_deliverable(vcpu, inti)) {
                                        list_del(&inti->list);
+                                       fi->irq_count--;
                                        deliver = 1;
                                        break;
                                }
@@ -650,8 +689,10 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
                inti = iter;
                break;
        }
-       if (inti)
+       if (inti) {
                list_del_init(&inti->list);
+               fi->irq_count--;
+       }
        if (list_empty(&fi->list))
                atomic_set(&fi->active, 0);
        spin_unlock(&fi->lock);
@@ -659,53 +700,101 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
        return inti;
 }
 
-int kvm_s390_inject_vm(struct kvm *kvm,
-                      struct kvm_s390_interrupt *s390int)
+static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
 {
        struct kvm_s390_local_interrupt *li;
        struct kvm_s390_float_interrupt *fi;
-       struct kvm_s390_interrupt_info *inti, *iter;
+       struct kvm_s390_interrupt_info *iter;
+       struct kvm_vcpu *dst_vcpu = NULL;
        int sigcpu;
+       int rc = 0;
+
+       mutex_lock(&kvm->lock);
+       fi = &kvm->arch.float_int;
+       spin_lock(&fi->lock);
+       if (fi->irq_count >= KVM_S390_MAX_FLOAT_IRQS) {
+               rc = -EINVAL;
+               goto unlock_fi;
+       }
+       fi->irq_count++;
+       if (!is_ioint(inti->type)) {
+               list_add_tail(&inti->list, &fi->list);
+       } else {
+               u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
+
+               /* Keep I/O interrupts sorted in isc order. */
+               list_for_each_entry(iter, &fi->list, list) {
+                       if (!is_ioint(iter->type))
+                               continue;
+                       if (int_word_to_isc_bits(iter->io.io_int_word)
+                           <= isc_bits)
+                               continue;
+                       break;
+               }
+               list_add_tail(&inti->list, &iter->list);
+       }
+       atomic_set(&fi->active, 1);
+       sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
+       if (sigcpu == KVM_MAX_VCPUS) {
+               do {
+                       sigcpu = fi->next_rr_cpu++;
+                       if (sigcpu == KVM_MAX_VCPUS)
+                               sigcpu = fi->next_rr_cpu = 0;
+               } while (kvm_get_vcpu(kvm, sigcpu) == NULL);
+       }
+       dst_vcpu = kvm_get_vcpu(kvm, sigcpu);
+       li = &dst_vcpu->arch.local_int;
+       spin_lock_bh(&li->lock);
+       atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
+       if (waitqueue_active(li->wq))
+               wake_up_interruptible(li->wq);
+       kvm_get_vcpu(kvm, sigcpu)->preempted = true;
+       spin_unlock_bh(&li->lock);
+unlock_fi:
+       spin_unlock(&fi->lock);
+       mutex_unlock(&kvm->lock);
+       return rc;
+}
+
+int kvm_s390_inject_vm(struct kvm *kvm,
+                      struct kvm_s390_interrupt *s390int)
+{
+       struct kvm_s390_interrupt_info *inti;
 
        inti = kzalloc(sizeof(*inti), GFP_KERNEL);
        if (!inti)
                return -ENOMEM;
 
-       switch (s390int->type) {
+       inti->type = s390int->type;
+       switch (inti->type) {
        case KVM_S390_INT_VIRTIO:
                VM_EVENT(kvm, 5, "inject: virtio parm:%x,parm64:%llx",
                         s390int->parm, s390int->parm64);
-               inti->type = s390int->type;
                inti->ext.ext_params = s390int->parm;
                inti->ext.ext_params2 = s390int->parm64;
                break;
        case KVM_S390_INT_SERVICE:
                VM_EVENT(kvm, 5, "inject: sclp parm:%x", s390int->parm);
-               inti->type = s390int->type;
                inti->ext.ext_params = s390int->parm;
                break;
-       case KVM_S390_PROGRAM_INT:
-       case KVM_S390_SIGP_STOP:
-       case KVM_S390_INT_EXTERNAL_CALL:
-       case KVM_S390_INT_EMERGENCY:
-               kfree(inti);
-               return -EINVAL;
+       case KVM_S390_INT_PFAULT_DONE:
+               inti->type = s390int->type;
+               inti->ext.ext_params2 = s390int->parm64;
+               break;
        case KVM_S390_MCHK:
                VM_EVENT(kvm, 5, "inject: machine check parm64:%llx",
                         s390int->parm64);
-               inti->type = s390int->type;
                inti->mchk.cr14 = s390int->parm; /* upper bits are not used */
                inti->mchk.mcic = s390int->parm64;
                break;
        case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
-               if (s390int->type & IOINT_AI_MASK)
+               if (inti->type & IOINT_AI_MASK)
                        VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
                else
                        VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
                                 s390int->type & IOINT_CSSID_MASK,
                                 s390int->type & IOINT_SSID_MASK,
                                 s390int->type & IOINT_SCHID_MASK);
-               inti->type = s390int->type;
                inti->io.subchannel_id = s390int->parm >> 16;
                inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
                inti->io.io_int_parm = s390int->parm64 >> 32;
@@ -718,43 +807,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
        trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
                                 2);
 
-       mutex_lock(&kvm->lock);
-       fi = &kvm->arch.float_int;
-       spin_lock(&fi->lock);
-       if (!is_ioint(inti->type))
-               list_add_tail(&inti->list, &fi->list);
-       else {
-               u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
-
-               /* Keep I/O interrupts sorted in isc order. */
-               list_for_each_entry(iter, &fi->list, list) {
-                       if (!is_ioint(iter->type))
-                               continue;
-                       if (int_word_to_isc_bits(iter->io.io_int_word)
-                           <= isc_bits)
-                               continue;
-                       break;
-               }
-               list_add_tail(&inti->list, &iter->list);
-       }
-       atomic_set(&fi->active, 1);
-       sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
-       if (sigcpu == KVM_MAX_VCPUS) {
-               do {
-                       sigcpu = fi->next_rr_cpu++;
-                       if (sigcpu == KVM_MAX_VCPUS)
-                               sigcpu = fi->next_rr_cpu = 0;
-               } while (fi->local_int[sigcpu] == NULL);
-       }
-       li = fi->local_int[sigcpu];
-       spin_lock_bh(&li->lock);
-       atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
-       if (waitqueue_active(li->wq))
-               wake_up_interruptible(li->wq);
-       spin_unlock_bh(&li->lock);
-       spin_unlock(&fi->lock);
-       mutex_unlock(&kvm->lock);
-       return 0;
+       return __inject_vm(kvm, inti);
 }
 
 int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
@@ -814,6 +867,10 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
                inti->type = s390int->type;
                inti->mchk.mcic = s390int->parm64;
                break;
+       case KVM_S390_INT_PFAULT_INIT:
+               inti->type = s390int->type;
+               inti->ext.ext_params2 = s390int->parm64;
+               break;
        case KVM_S390_INT_VIRTIO:
        case KVM_S390_INT_SERVICE:
        case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
@@ -837,7 +894,528 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
        atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
        if (waitqueue_active(&vcpu->wq))
                wake_up_interruptible(&vcpu->wq);
+       vcpu->preempted = true;
        spin_unlock_bh(&li->lock);
        mutex_unlock(&vcpu->kvm->lock);
        return 0;
 }
+
+static void clear_floating_interrupts(struct kvm *kvm)
+{
+       struct kvm_s390_float_interrupt *fi;
+       struct kvm_s390_interrupt_info  *n, *inti = NULL;
+
+       mutex_lock(&kvm->lock);
+       fi = &kvm->arch.float_int;
+       spin_lock(&fi->lock);
+       list_for_each_entry_safe(inti, n, &fi->list, list) {
+               list_del(&inti->list);
+               kfree(inti);
+       }
+       fi->irq_count = 0;
+       atomic_set(&fi->active, 0);
+       spin_unlock(&fi->lock);
+       mutex_unlock(&kvm->lock);
+}
+
+static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti,
+                                  u8 *addr)
+{
+       struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
+       struct kvm_s390_irq irq = {0};
+
+       irq.type = inti->type;
+       switch (inti->type) {
+       case KVM_S390_INT_PFAULT_INIT:
+       case KVM_S390_INT_PFAULT_DONE:
+       case KVM_S390_INT_VIRTIO:
+       case KVM_S390_INT_SERVICE:
+               irq.u.ext = inti->ext;
+               break;
+       case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+               irq.u.io = inti->io;
+               break;
+       case KVM_S390_MCHK:
+               irq.u.mchk = inti->mchk;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if (copy_to_user(uptr, &irq, sizeof(irq)))
+               return -EFAULT;
+
+       return 0;
+}
+
+static int get_all_floating_irqs(struct kvm *kvm, __u8 *buf, __u64 len)
+{
+       struct kvm_s390_interrupt_info *inti;
+       struct kvm_s390_float_interrupt *fi;
+       int ret = 0;
+       int n = 0;
+
+       mutex_lock(&kvm->lock);
+       fi = &kvm->arch.float_int;
+       spin_lock(&fi->lock);
+
+       list_for_each_entry(inti, &fi->list, list) {
+               if (len < sizeof(struct kvm_s390_irq)) {
+                       /* signal userspace to try again */
+                       ret = -ENOMEM;
+                       break;
+               }
+               ret = copy_irq_to_user(inti, buf);
+               if (ret)
+                       break;
+               buf += sizeof(struct kvm_s390_irq);
+               len -= sizeof(struct kvm_s390_irq);
+               n++;
+       }
+
+       spin_unlock(&fi->lock);
+       mutex_unlock(&kvm->lock);
+
+       return ret < 0 ? ret : n;
+}
+
+static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+       int r;
+
+       switch (attr->group) {
+       case KVM_DEV_FLIC_GET_ALL_IRQS:
+               r = get_all_floating_irqs(dev->kvm, (u8 *) attr->addr,
+                                         attr->attr);
+               break;
+       default:
+               r = -EINVAL;
+       }
+
+       return r;
+}
+
+static inline int copy_irq_from_user(struct kvm_s390_interrupt_info *inti,
+                                    u64 addr)
+{
+       struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
+       void *target = NULL;
+       void __user *source;
+       u64 size;
+
+       if (get_user(inti->type, (u64 __user *)addr))
+               return -EFAULT;
+
+       switch (inti->type) {
+       case KVM_S390_INT_PFAULT_INIT:
+       case KVM_S390_INT_PFAULT_DONE:
+       case KVM_S390_INT_VIRTIO:
+       case KVM_S390_INT_SERVICE:
+               target = (void *) &inti->ext;
+               source = &uptr->u.ext;
+               size = sizeof(inti->ext);
+               break;
+       case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+               target = (void *) &inti->io;
+               source = &uptr->u.io;
+               size = sizeof(inti->io);
+               break;
+       case KVM_S390_MCHK:
+               target = (void *) &inti->mchk;
+               source = &uptr->u.mchk;
+               size = sizeof(inti->mchk);
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if (copy_from_user(target, source, size))
+               return -EFAULT;
+
+       return 0;
+}
+
+static int enqueue_floating_irq(struct kvm_device *dev,
+                               struct kvm_device_attr *attr)
+{
+       struct kvm_s390_interrupt_info *inti = NULL;
+       int r = 0;
+       int len = attr->attr;
+
+       if (len % sizeof(struct kvm_s390_irq) != 0)
+               return -EINVAL;
+       else if (len > KVM_S390_FLIC_MAX_BUFFER)
+               return -EINVAL;
+
+       while (len >= sizeof(struct kvm_s390_irq)) {
+               inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+               if (!inti)
+                       return -ENOMEM;
+
+               r = copy_irq_from_user(inti, attr->addr);
+               if (r) {
+                       kfree(inti);
+                       return r;
+               }
+               r = __inject_vm(dev->kvm, inti);
+               if (r) {
+                       kfree(inti);
+                       return r;
+               }
+               len -= sizeof(struct kvm_s390_irq);
+               attr->addr += sizeof(struct kvm_s390_irq);
+       }
+
+       return r;
+}
+
+static struct s390_io_adapter *get_io_adapter(struct kvm *kvm, unsigned int id)
+{
+       if (id >= MAX_S390_IO_ADAPTERS)
+               return NULL;
+       return kvm->arch.adapters[id];
+}
+
+static int register_io_adapter(struct kvm_device *dev,
+                              struct kvm_device_attr *attr)
+{
+       struct s390_io_adapter *adapter;
+       struct kvm_s390_io_adapter adapter_info;
+
+       if (copy_from_user(&adapter_info,
+                          (void __user *)attr->addr, sizeof(adapter_info)))
+               return -EFAULT;
+
+       if ((adapter_info.id >= MAX_S390_IO_ADAPTERS) ||
+           (dev->kvm->arch.adapters[adapter_info.id] != NULL))
+               return -EINVAL;
+
+       adapter = kzalloc(sizeof(*adapter), GFP_KERNEL);
+       if (!adapter)
+               return -ENOMEM;
+
+       INIT_LIST_HEAD(&adapter->maps);
+       init_rwsem(&adapter->maps_lock);
+       atomic_set(&adapter->nr_maps, 0);
+       adapter->id = adapter_info.id;
+       adapter->isc = adapter_info.isc;
+       adapter->maskable = adapter_info.maskable;
+       adapter->masked = false;
+       adapter->swap = adapter_info.swap;
+       dev->kvm->arch.adapters[adapter->id] = adapter;
+
+       return 0;
+}
+
+int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked)
+{
+       int ret;
+       struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
+
+       if (!adapter || !adapter->maskable)
+               return -EINVAL;
+       ret = adapter->masked;
+       adapter->masked = masked;
+       return ret;
+}
+
+static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr)
+{
+       struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
+       struct s390_map_info *map;
+       int ret;
+
+       if (!adapter || !addr)
+               return -EINVAL;
+
+       map = kzalloc(sizeof(*map), GFP_KERNEL);
+       if (!map) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       INIT_LIST_HEAD(&map->list);
+       map->guest_addr = addr;
+       map->addr = gmap_translate(addr, kvm->arch.gmap);
+       if (map->addr == -EFAULT) {
+               ret = -EFAULT;
+               goto out;
+       }
+       ret = get_user_pages_fast(map->addr, 1, 1, &map->page);
+       if (ret < 0)
+               goto out;
+       BUG_ON(ret != 1);
+       down_write(&adapter->maps_lock);
+       if (atomic_inc_return(&adapter->nr_maps) < MAX_S390_ADAPTER_MAPS) {
+               list_add_tail(&map->list, &adapter->maps);
+               ret = 0;
+       } else {
+               put_page(map->page);
+               ret = -EINVAL;
+       }
+       up_write(&adapter->maps_lock);
+out:
+       if (ret)
+               kfree(map);
+       return ret;
+}
+
+static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr)
+{
+       struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
+       struct s390_map_info *map, *tmp;
+       int found = 0;
+
+       if (!adapter || !addr)
+               return -EINVAL;
+
+       down_write(&adapter->maps_lock);
+       list_for_each_entry_safe(map, tmp, &adapter->maps, list) {
+               if (map->guest_addr == addr) {
+                       found = 1;
+                       atomic_dec(&adapter->nr_maps);
+                       list_del(&map->list);
+                       put_page(map->page);
+                       kfree(map);
+                       break;
+               }
+       }
+       up_write(&adapter->maps_lock);
+
+       return found ? 0 : -EINVAL;
+}
+
+void kvm_s390_destroy_adapters(struct kvm *kvm)
+{
+       int i;
+       struct s390_map_info *map, *tmp;
+
+       for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) {
+               if (!kvm->arch.adapters[i])
+                       continue;
+               list_for_each_entry_safe(map, tmp,
+                                        &kvm->arch.adapters[i]->maps, list) {
+                       list_del(&map->list);
+                       put_page(map->page);
+                       kfree(map);
+               }
+               kfree(kvm->arch.adapters[i]);
+       }
+}
+
+static int modify_io_adapter(struct kvm_device *dev,
+                            struct kvm_device_attr *attr)
+{
+       struct kvm_s390_io_adapter_req req;
+       struct s390_io_adapter *adapter;
+       int ret;
+
+       if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req)))
+               return -EFAULT;
+
+       adapter = get_io_adapter(dev->kvm, req.id);
+       if (!adapter)
+               return -EINVAL;
+       switch (req.type) {
+       case KVM_S390_IO_ADAPTER_MASK:
+               ret = kvm_s390_mask_adapter(dev->kvm, req.id, req.mask);
+               if (ret > 0)
+                       ret = 0;
+               break;
+       case KVM_S390_IO_ADAPTER_MAP:
+               ret = kvm_s390_adapter_map(dev->kvm, req.id, req.addr);
+               break;
+       case KVM_S390_IO_ADAPTER_UNMAP:
+               ret = kvm_s390_adapter_unmap(dev->kvm, req.id, req.addr);
+               break;
+       default:
+               ret = -EINVAL;
+       }
+
+       return ret;
+}
+
+static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+       int r = 0;
+       unsigned int i;
+       struct kvm_vcpu *vcpu;
+
+       switch (attr->group) {
+       case KVM_DEV_FLIC_ENQUEUE:
+               r = enqueue_floating_irq(dev, attr);
+               break;
+       case KVM_DEV_FLIC_CLEAR_IRQS:
+               r = 0;
+               clear_floating_interrupts(dev->kvm);
+               break;
+       case KVM_DEV_FLIC_APF_ENABLE:
+               dev->kvm->arch.gmap->pfault_enabled = 1;
+               break;
+       case KVM_DEV_FLIC_APF_DISABLE_WAIT:
+               dev->kvm->arch.gmap->pfault_enabled = 0;
+               /*
+                * Make sure no async faults are in transition when
+                * clearing the queues, so we don't need to worry
+                * about late-arriving workers.
+                */
+               synchronize_srcu(&dev->kvm->srcu);
+               kvm_for_each_vcpu(i, vcpu, dev->kvm)
+                       kvm_clear_async_pf_completion_queue(vcpu);
+               break;
+       case KVM_DEV_FLIC_ADAPTER_REGISTER:
+               r = register_io_adapter(dev, attr);
+               break;
+       case KVM_DEV_FLIC_ADAPTER_MODIFY:
+               r = modify_io_adapter(dev, attr);
+               break;
+       default:
+               r = -EINVAL;
+       }
+
+       return r;
+}
+
+static int flic_create(struct kvm_device *dev, u32 type)
+{
+       if (!dev)
+               return -EINVAL;
+       if (dev->kvm->arch.flic)
+               return -EINVAL;
+       dev->kvm->arch.flic = dev;
+       return 0;
+}
+
+static void flic_destroy(struct kvm_device *dev)
+{
+       dev->kvm->arch.flic = NULL;
+       kfree(dev);
+}
+
+/* s390 floating irq controller (flic) */
+struct kvm_device_ops kvm_flic_ops = {
+       .name = "kvm-flic",
+       .get_attr = flic_get_attr,
+       .set_attr = flic_set_attr,
+       .create = flic_create,
+       .destroy = flic_destroy,
+};
+
+static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap)
+{
+       unsigned long bit;
+
+       bit = bit_nr + (addr % PAGE_SIZE) * 8;
+
+       return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit;
+}
+
+static struct s390_map_info *get_map_info(struct s390_io_adapter *adapter,
+                                         u64 addr)
+{
+       struct s390_map_info *map;
+
+       if (!adapter)
+               return NULL;
+
+       list_for_each_entry(map, &adapter->maps, list) {
+               if (map->guest_addr == addr)
+                       return map;
+       }
+       return NULL;
+}
+
+static int adapter_indicators_set(struct kvm *kvm,
+                                 struct s390_io_adapter *adapter,
+                                 struct kvm_s390_adapter_int *adapter_int)
+{
+       unsigned long bit;
+       int summary_set, idx;
+       struct s390_map_info *info;
+       void *map;
+
+       info = get_map_info(adapter, adapter_int->ind_addr);
+       if (!info)
+               return -1;
+       map = page_address(info->page);
+       bit = get_ind_bit(info->addr, adapter_int->ind_offset, adapter->swap);
+       set_bit(bit, map);
+       idx = srcu_read_lock(&kvm->srcu);
+       mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT);
+       set_page_dirty_lock(info->page);
+       info = get_map_info(adapter, adapter_int->summary_addr);
+       if (!info) {
+               srcu_read_unlock(&kvm->srcu, idx);
+               return -1;
+       }
+       map = page_address(info->page);
+       bit = get_ind_bit(info->addr, adapter_int->summary_offset,
+                         adapter->swap);
+       summary_set = test_and_set_bit(bit, map);
+       mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT);
+       set_page_dirty_lock(info->page);
+       srcu_read_unlock(&kvm->srcu, idx);
+       return summary_set ? 0 : 1;
+}
+
+/*
+ * < 0 - not injected due to error
+ * = 0 - coalesced, summary indicator already active
+ * > 0 - injected interrupt
+ */
+static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
+                          struct kvm *kvm, int irq_source_id, int level,
+                          bool line_status)
+{
+       int ret;
+       struct s390_io_adapter *adapter;
+
+       /* We're only interested in the 0->1 transition. */
+       if (!level)
+               return 0;
+       adapter = get_io_adapter(kvm, e->adapter.adapter_id);
+       if (!adapter)
+               return -1;
+       down_read(&adapter->maps_lock);
+       ret = adapter_indicators_set(kvm, adapter, &e->adapter);
+       up_read(&adapter->maps_lock);
+       if ((ret > 0) && !adapter->masked) {
+               struct kvm_s390_interrupt s390int = {
+                       .type = KVM_S390_INT_IO(1, 0, 0, 0),
+                       .parm = 0,
+                       .parm64 = (adapter->isc << 27) | 0x80000000,
+               };
+               ret = kvm_s390_inject_vm(kvm, &s390int);
+               if (ret == 0)
+                       ret = 1;
+       }
+       return ret;
+}
+
+int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
+                         struct kvm_kernel_irq_routing_entry *e,
+                         const struct kvm_irq_routing_entry *ue)
+{
+       int ret;
+
+       switch (ue->type) {
+       case KVM_IRQ_ROUTING_S390_ADAPTER:
+               e->set = set_adapter_int;
+               e->adapter.summary_addr = ue->u.adapter.summary_addr;
+               e->adapter.ind_addr = ue->u.adapter.ind_addr;
+               e->adapter.summary_offset = ue->u.adapter.summary_offset;
+               e->adapter.ind_offset = ue->u.adapter.ind_offset;
+               e->adapter.adapter_id = ue->u.adapter.adapter_id;
+               ret = 0;
+               break;
+       default:
+               ret = -EINVAL;
+       }
+
+       return ret;
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+               int irq_source_id, int level, bool line_status)
+{
+       return -EINVAL;
+}
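
The flic paths above (flic_create(), flic_set_attr() with KVM_DEV_FLIC_ENQUEUE, __inject_vm()) are driven from userspace through the generic device-control API. A minimal sketch, assuming vm_fd is an open VM file descriptor; the helper name is invented and error cleanup is omitted.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Create the floating interrupt controller and enqueue one floating
 * service (SCLP) interrupt.  enqueue_floating_irq() above walks the
 * supplied buffer and feeds each entry to __inject_vm().
 */
static int flic_enqueue_service_irq(int vm_fd, uint32_t sclp_parm)
{
	struct kvm_create_device cd = { .type = KVM_DEV_TYPE_FLIC };
	struct kvm_device_attr attr;
	struct kvm_s390_irq irq;

	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
		return -1;
	/* cd.fd now refers to the new flic device */

	memset(&irq, 0, sizeof(irq));
	irq.type = KVM_S390_INT_SERVICE;
	irq.u.ext.ext_params = sclp_parm;

	memset(&attr, 0, sizeof(attr));
	attr.group = KVM_DEV_FLIC_ENQUEUE;
	attr.addr  = (uintptr_t)&irq;
	attr.attr  = sizeof(irq);	/* buffer length, a multiple of sizeof(struct kvm_s390_irq) */

	return ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);
}
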
diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h
new file mode 100644 (file)
index 0000000..d98e415
--- /dev/null
@@ -0,0 +1,22 @@
+/*
+ * s390 irqchip routines
+ *
+ * Copyright IBM Corp. 2014
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
+ */
+#ifndef __KVM_IRQ_H
+#define __KVM_IRQ_H
+
+#include <linux/kvm_host.h>
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+       return 1;
+}
+
+#endif
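
Adapter interrupt sources registered through KVM_DEV_FLIC_ADAPTER_REGISTER are hooked up to guest indicator bits via the new KVM_IRQ_ROUTING_S390_ADAPTER routing type handled by kvm_set_routing_entry() above. A hedged userspace sketch of programming one such route: vm_fd is an assumed open VM file descriptor, the offsets are placeholders, and the helper name is invented.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Route one GSI to an s390 adapter interrupt source.  Raising the GSI
 * (for example via KVM_IRQFD) then reaches set_adapter_int(), which sets
 * the indicator and summary bits and injects an I/O interrupt if needed.
 */
static int route_adapter_gsi(int vm_fd, uint32_t gsi, uint32_t adapter_id,
			     uint64_t ind_addr, uint64_t summary_addr)
{
	struct kvm_irq_routing *routing;
	struct kvm_irq_routing_entry *e;
	int ret;

	routing = calloc(1, sizeof(*routing) + sizeof(*e));
	if (!routing)
		return -1;

	routing->nr = 1;
	e = &routing->entries[0];
	e->gsi  = gsi;
	e->type = KVM_IRQ_ROUTING_S390_ADAPTER;
	e->u.adapter.ind_addr       = ind_addr;		/* indicator area */
	e->u.adapter.summary_addr   = summary_addr;	/* summary indicator */
	e->u.adapter.ind_offset     = 0;		/* placeholder offsets */
	e->u.adapter.summary_offset = 0;
	e->u.adapter.adapter_id     = adapter_id;

	ret = ioctl(vm_fd, KVM_SET_GSI_ROUTING, routing);
	free(routing);
	return ret;
}
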
index 10b5db3..b3ecb8f 100644 (file)
@@ -153,11 +153,14 @@ int kvm_dev_ioctl_check_extension(long ext)
 #ifdef CONFIG_KVM_S390_UCONTROL
        case KVM_CAP_S390_UCONTROL:
 #endif
+       case KVM_CAP_ASYNC_PF:
        case KVM_CAP_SYNC_REGS:
        case KVM_CAP_ONE_REG:
        case KVM_CAP_ENABLE_CAP:
        case KVM_CAP_S390_CSS_SUPPORT:
        case KVM_CAP_IOEVENTFD:
+       case KVM_CAP_DEVICE_CTRL:
+       case KVM_CAP_ENABLE_CAP_VM:
                r = 1;
                break;
        case KVM_CAP_NR_VCPUS:
@@ -186,6 +189,25 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
        return 0;
 }
 
+static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
+{
+       int r;
+
+       if (cap->flags)
+               return -EINVAL;
+
+       switch (cap->cap) {
+       case KVM_CAP_S390_IRQCHIP:
+               kvm->arch.use_irqchip = 1;
+               r = 0;
+               break;
+       default:
+               r = -EINVAL;
+               break;
+       }
+       return r;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg)
 {
@@ -203,6 +225,26 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = kvm_s390_inject_vm(kvm, &s390int);
                break;
        }
+       case KVM_ENABLE_CAP: {
+               struct kvm_enable_cap cap;
+               r = -EFAULT;
+               if (copy_from_user(&cap, argp, sizeof(cap)))
+                       break;
+               r = kvm_vm_ioctl_enable_cap(kvm, &cap);
+               break;
+       }
+       case KVM_CREATE_IRQCHIP: {
+               struct kvm_irq_routing_entry routing;
+
+               r = -EINVAL;
+               if (kvm->arch.use_irqchip) {
+                       /* Set up dummy routing. */
+                       memset(&routing, 0, sizeof(routing));
+                       kvm_set_irq_routing(kvm, &routing, 0, 0);
+                       r = 0;
+               }
+               break;
+       }
        default:
                r = -ENOTTY;
        }
@@ -214,6 +256,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
        int rc;
        char debug_name[16];
+       static unsigned long sca_offset;
 
        rc = -EINVAL;
 #ifdef CONFIG_KVM_S390_UCONTROL
@@ -235,6 +278,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL);
        if (!kvm->arch.sca)
                goto out_err;
+       spin_lock(&kvm_lock);
+       sca_offset = (sca_offset + 16) & 0x7f0;
+       kvm->arch.sca = (struct sca_block *) ((char *) kvm->arch.sca + sca_offset);
+       spin_unlock(&kvm_lock);
 
        sprintf(debug_name, "kvm-%u", current->pid);
 
@@ -255,9 +302,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
                if (!kvm->arch.gmap)
                        goto out_nogmap;
                kvm->arch.gmap->private = kvm;
+               kvm->arch.gmap->pfault_enabled = 0;
        }
 
        kvm->arch.css_support = 0;
+       kvm->arch.use_irqchip = 0;
 
        return 0;
 out_nogmap:
@@ -272,6 +321,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
        VCPU_EVENT(vcpu, 3, "%s", "free cpu");
        trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id);
+       kvm_clear_async_pf_completion_queue(vcpu);
        if (!kvm_is_ucontrol(vcpu->kvm)) {
                clear_bit(63 - vcpu->vcpu_id,
                          (unsigned long *) &vcpu->kvm->arch.sca->mcn);
@@ -320,11 +370,14 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        debug_unregister(kvm->arch.dbf);
        if (!kvm_is_ucontrol(kvm))
                gmap_free(kvm->arch.gmap);
+       kvm_s390_destroy_adapters(kvm);
 }
 
 /* Section: vcpu related */
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
+       vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
+       kvm_clear_async_pf_completion_queue(vcpu);
        if (kvm_is_ucontrol(vcpu->kvm)) {
                vcpu->arch.gmap = gmap_alloc(current->mm);
                if (!vcpu->arch.gmap)
@@ -385,7 +438,11 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
        vcpu->arch.guest_fpregs.fpc = 0;
        asm volatile("lfpc %0" : : "Q" (vcpu->arch.guest_fpregs.fpc));
        vcpu->arch.sie_block->gbea = 1;
+       vcpu->arch.sie_block->pp = 0;
+       vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
+       kvm_clear_async_pf_completion_queue(vcpu);
        atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+       kvm_s390_clear_local_irqs(vcpu);
 }
 
 int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
@@ -466,11 +523,8 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
        spin_lock_init(&vcpu->arch.local_int.lock);
        INIT_LIST_HEAD(&vcpu->arch.local_int.list);
        vcpu->arch.local_int.float_int = &kvm->arch.float_int;
-       spin_lock(&kvm->arch.float_int.lock);
-       kvm->arch.float_int.local_int[id] = &vcpu->arch.local_int;
        vcpu->arch.local_int.wq = &vcpu->wq;
        vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags;
-       spin_unlock(&kvm->arch.float_int.lock);
 
        rc = kvm_vcpu_init(vcpu, kvm, id);
        if (rc)
@@ -490,9 +544,7 @@ out:
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
-       /* kvm common code refers to this, but never calls it */
-       BUG();
-       return 0;
+       return kvm_cpu_has_interrupt(vcpu);
 }
 
 void s390_vcpu_block(struct kvm_vcpu *vcpu)
@@ -568,6 +620,26 @@ static int kvm_arch_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu,
                r = put_user(vcpu->arch.sie_block->ckc,
                             (u64 __user *)reg->addr);
                break;
+       case KVM_REG_S390_PFTOKEN:
+               r = put_user(vcpu->arch.pfault_token,
+                            (u64 __user *)reg->addr);
+               break;
+       case KVM_REG_S390_PFCOMPARE:
+               r = put_user(vcpu->arch.pfault_compare,
+                            (u64 __user *)reg->addr);
+               break;
+       case KVM_REG_S390_PFSELECT:
+               r = put_user(vcpu->arch.pfault_select,
+                            (u64 __user *)reg->addr);
+               break;
+       case KVM_REG_S390_PP:
+               r = put_user(vcpu->arch.sie_block->pp,
+                            (u64 __user *)reg->addr);
+               break;
+       case KVM_REG_S390_GBEA:
+               r = put_user(vcpu->arch.sie_block->gbea,
+                            (u64 __user *)reg->addr);
+               break;
        default:
                break;
        }
@@ -597,6 +669,26 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu,
                r = get_user(vcpu->arch.sie_block->ckc,
                             (u64 __user *)reg->addr);
                break;
+       case KVM_REG_S390_PFTOKEN:
+               r = get_user(vcpu->arch.pfault_token,
+                            (u64 __user *)reg->addr);
+               break;
+       case KVM_REG_S390_PFCOMPARE:
+               r = get_user(vcpu->arch.pfault_compare,
+                            (u64 __user *)reg->addr);
+               break;
+       case KVM_REG_S390_PFSELECT:
+               r = get_user(vcpu->arch.pfault_select,
+                            (u64 __user *)reg->addr);
+               break;
+       case KVM_REG_S390_PP:
+               r = get_user(vcpu->arch.sie_block->pp,
+                            (u64 __user *)reg->addr);
+               break;
+       case KVM_REG_S390_GBEA:
+               r = get_user(vcpu->arch.sie_block->gbea,
+                            (u64 __user *)reg->addr);
+               break;
        default:
                break;
        }
@@ -715,10 +807,100 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static long kvm_arch_fault_in_sync(struct kvm_vcpu *vcpu)
+{
+       long rc;
+       hva_t fault = gmap_fault(current->thread.gmap_addr, vcpu->arch.gmap);
+       struct mm_struct *mm = current->mm;
+       down_read(&mm->mmap_sem);
+       rc = get_user_pages(current, mm, fault, 1, 1, 0, NULL, NULL);
+       up_read(&mm->mmap_sem);
+       return rc;
+}
+
+static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token,
+                                     unsigned long token)
+{
+       struct kvm_s390_interrupt inti;
+       inti.parm64 = token;
+
+       if (start_token) {
+               inti.type = KVM_S390_INT_PFAULT_INIT;
+               WARN_ON_ONCE(kvm_s390_inject_vcpu(vcpu, &inti));
+       } else {
+               inti.type = KVM_S390_INT_PFAULT_DONE;
+               WARN_ON_ONCE(kvm_s390_inject_vm(vcpu->kvm, &inti));
+       }
+}
+
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+                                    struct kvm_async_pf *work)
+{
+       trace_kvm_s390_pfault_init(vcpu, work->arch.pfault_token);
+       __kvm_inject_pfault_token(vcpu, true, work->arch.pfault_token);
+}
+
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+                                struct kvm_async_pf *work)
+{
+       trace_kvm_s390_pfault_done(vcpu, work->arch.pfault_token);
+       __kvm_inject_pfault_token(vcpu, false, work->arch.pfault_token);
+}
+
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
+                              struct kvm_async_pf *work)
+{
+       /* s390 will always inject the page directly */
+}
+
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
+{
+       /*
+        * s390 will always inject the page directly,
+        * but we still want check_async_completion to clean up.
+        */
+       return true;
+}
+
+static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
+{
+       hva_t hva;
+       struct kvm_arch_async_pf arch;
+       int rc;
+
+       if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
+               return 0;
+       if ((vcpu->arch.sie_block->gpsw.mask & vcpu->arch.pfault_select) !=
+           vcpu->arch.pfault_compare)
+               return 0;
+       if (psw_extint_disabled(vcpu))
+               return 0;
+       if (kvm_cpu_has_interrupt(vcpu))
+               return 0;
+       if (!(vcpu->arch.sie_block->gcr[0] & 0x200ul))
+               return 0;
+       if (!vcpu->arch.gmap->pfault_enabled)
+               return 0;
+
+       hva = gmap_fault(current->thread.gmap_addr, vcpu->arch.gmap);
+       if (copy_from_guest(vcpu, &arch.pfault_token, vcpu->arch.pfault_token, 8))
+               return 0;
+
+       rc = kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch);
+       return rc;
+}
+
 static int vcpu_pre_run(struct kvm_vcpu *vcpu)
 {
        int rc, cpuflags;
 
+       /*
+        * On s390, notifications for arriving pages are delivered directly
+        * to the guest, but the housekeeping for completed pfaults is
+        * handled outside the worker.
+        */
+       kvm_check_async_pf_completion(vcpu);
+
        memcpy(&vcpu->arch.sie_block->gg14, &vcpu->run->s.regs.gprs[14], 16);
 
        if (need_resched())
@@ -744,7 +926,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
 
 static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
 {
-       int rc;
+       int rc = -1;
 
        VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
                   vcpu->arch.sie_block->icptcode);
@@ -758,7 +940,16 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
                                                current->thread.gmap_addr;
                vcpu->run->s390_ucontrol.pgm_code = 0x10;
                rc = -EREMOTE;
-       } else {
+
+       } else if (current->thread.gmap_pfault) {
+               trace_kvm_s390_major_guest_pfault(vcpu);
+               current->thread.gmap_pfault = 0;
+               if (kvm_arch_setup_async_pf(vcpu) ||
+                   (kvm_arch_fault_in_sync(vcpu) >= 0))
+                       rc = 0;
+       }
+
+       if (rc == -1) {
                VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
                trace_kvm_s390_sie_fault(vcpu);
                rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
@@ -768,7 +959,8 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
 
        if (rc == 0) {
                if (kvm_is_ucontrol(vcpu->kvm))
-                       rc = -EOPNOTSUPP;
+                       /* Don't exit for host interrupts. */
+                       rc = vcpu->arch.sie_block->icptcode ? -EOPNOTSUPP : 0;
                else
                        rc = kvm_handle_sie_intercept(vcpu);
        }
@@ -831,8 +1023,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
        atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
 
-       BUG_ON(vcpu->kvm->arch.float_int.local_int[vcpu->vcpu_id] == NULL);
-
        switch (kvm_run->exit_reason) {
        case KVM_EXIT_S390_SIEIC:
        case KVM_EXIT_UNKNOWN:
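
Putting the new VM-level capability plumbing together: userspace first enables KVM_CAP_S390_IRQCHIP via KVM_ENABLE_CAP (now accepted on the VM fd thanks to KVM_CAP_ENABLE_CAP_VM) and only then calls KVM_CREATE_IRQCHIP to install the dummy routing table. A minimal sketch, assuming vm_fd is an open VM file descriptor and using an invented helper name:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Opt in to the in-kernel irqchip, then install the dummy routing table
 * so GSI routes (e.g. adapter routes) can be added afterwards.
 */
static int enable_s390_irqchip(int vm_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_S390_IRQCHIP;		/* cap.flags must stay 0 */

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
		return -1;

	/* Rejected with EINVAL unless the capability above was enabled. */
	return ioctl(vm_fd, KVM_CREATE_IRQCHIP);
}
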
index 564514f..3c1e227 100644 (file)
@@ -129,6 +129,7 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
 void kvm_s390_tasklet(unsigned long parm);
 void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu);
 void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu);
+void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu);
 int __must_check kvm_s390_inject_vm(struct kvm *kvm,
                                    struct kvm_s390_interrupt *s390int);
 int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
@@ -136,6 +137,7 @@ int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
 int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
 struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
                                                    u64 cr6, u64 schid);
+int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked);
 
 /* implemented in priv.c */
 int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
@@ -161,4 +163,9 @@ bool kvm_enabled_cmma(void);
 /* implemented in diag.c */
 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
 
+/* implemented in interrupt.c */
+int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
+int psw_extint_disabled(struct kvm_vcpu *vcpu);
+void kvm_s390_destroy_adapters(struct kvm *kvm);
+
 #endif
index aacb6b1..476e9e2 100644 (file)
@@ -396,15 +396,10 @@ static int handle_stidp(struct kvm_vcpu *vcpu)
 
 static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem)
 {
-       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
        int cpus = 0;
        int n;
 
-       spin_lock(&fi->lock);
-       for (n = 0; n < KVM_MAX_VCPUS; n++)
-               if (fi->local_int[n])
-                       cpus++;
-       spin_unlock(&fi->lock);
+       cpus = atomic_read(&vcpu->kvm->online_vcpus);
 
        /* deal with other level 3 hypervisors */
        if (stsi(mem, 3, 2, 2))
index 87c2b3a..26caeb5 100644 (file)
 static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
                        u64 *reg)
 {
-       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
+       struct kvm_s390_local_interrupt *li;
+       struct kvm_vcpu *dst_vcpu = NULL;
+       int cpuflags;
        int rc;
 
        if (cpu_addr >= KVM_MAX_VCPUS)
                return SIGP_CC_NOT_OPERATIONAL;
 
-       spin_lock(&fi->lock);
-       if (fi->local_int[cpu_addr] == NULL)
-               rc = SIGP_CC_NOT_OPERATIONAL;
-       else if (!(atomic_read(fi->local_int[cpu_addr]->cpuflags)
-                  & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED)))
+       dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+       if (!dst_vcpu)
+               return SIGP_CC_NOT_OPERATIONAL;
+       li = &dst_vcpu->arch.local_int;
+
+       cpuflags = atomic_read(li->cpuflags);
+       if (!(cpuflags & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED)))
                rc = SIGP_CC_ORDER_CODE_ACCEPTED;
        else {
                *reg &= 0xffffffff00000000UL;
-               if (atomic_read(fi->local_int[cpu_addr]->cpuflags)
-                   & CPUSTAT_ECALL_PEND)
+               if (cpuflags & CPUSTAT_ECALL_PEND)
                        *reg |= SIGP_STATUS_EXT_CALL_PENDING;
-               if (atomic_read(fi->local_int[cpu_addr]->cpuflags)
-                   & CPUSTAT_STOPPED)
+               if (cpuflags & CPUSTAT_STOPPED)
                        *reg |= SIGP_STATUS_STOPPED;
                rc = SIGP_CC_STATUS_STORED;
        }
-       spin_unlock(&fi->lock);
 
        VCPU_EVENT(vcpu, 4, "sensed status of cpu %x rc %x", cpu_addr, rc);
        return rc;
@@ -53,12 +54,13 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
 
 static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
 {
-       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
        struct kvm_s390_local_interrupt *li;
        struct kvm_s390_interrupt_info *inti;
-       int rc;
+       struct kvm_vcpu *dst_vcpu = NULL;
 
-       if (cpu_addr >= KVM_MAX_VCPUS)
+       if (cpu_addr < KVM_MAX_VCPUS)
+               dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+       if (!dst_vcpu)
                return SIGP_CC_NOT_OPERATIONAL;
 
        inti = kzalloc(sizeof(*inti), GFP_KERNEL);
@@ -68,13 +70,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
        inti->type = KVM_S390_INT_EMERGENCY;
        inti->emerg.code = vcpu->vcpu_id;
 
-       spin_lock(&fi->lock);
-       li = fi->local_int[cpu_addr];
-       if (li == NULL) {
-               rc = SIGP_CC_NOT_OPERATIONAL;
-               kfree(inti);
-               goto unlock;
-       }
+       li = &dst_vcpu->arch.local_int;
        spin_lock_bh(&li->lock);
        list_add_tail(&inti->list, &li->list);
        atomic_set(&li->active, 1);
@@ -82,11 +78,9 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
        if (waitqueue_active(li->wq))
                wake_up_interruptible(li->wq);
        spin_unlock_bh(&li->lock);
-       rc = SIGP_CC_ORDER_CODE_ACCEPTED;
        VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
-unlock:
-       spin_unlock(&fi->lock);
-       return rc;
+
+       return SIGP_CC_ORDER_CODE_ACCEPTED;
 }
 
 static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr,
@@ -122,12 +116,13 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr,
 
 static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
 {
-       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
        struct kvm_s390_local_interrupt *li;
        struct kvm_s390_interrupt_info *inti;
-       int rc;
+       struct kvm_vcpu *dst_vcpu = NULL;
 
-       if (cpu_addr >= KVM_MAX_VCPUS)
+       if (cpu_addr < KVM_MAX_VCPUS)
+               dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+       if (!dst_vcpu)
                return SIGP_CC_NOT_OPERATIONAL;
 
        inti = kzalloc(sizeof(*inti), GFP_KERNEL);
@@ -137,13 +132,7 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
        inti->type = KVM_S390_INT_EXTERNAL_CALL;
        inti->extcall.code = vcpu->vcpu_id;
 
-       spin_lock(&fi->lock);
-       li = fi->local_int[cpu_addr];
-       if (li == NULL) {
-               rc = SIGP_CC_NOT_OPERATIONAL;
-               kfree(inti);
-               goto unlock;
-       }
+       li = &dst_vcpu->arch.local_int;
        spin_lock_bh(&li->lock);
        list_add_tail(&inti->list, &li->list);
        atomic_set(&li->active, 1);
@@ -151,11 +140,9 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
        if (waitqueue_active(li->wq))
                wake_up_interruptible(li->wq);
        spin_unlock_bh(&li->lock);
-       rc = SIGP_CC_ORDER_CODE_ACCEPTED;
        VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr);
-unlock:
-       spin_unlock(&fi->lock);
-       return rc;
+
+       return SIGP_CC_ORDER_CODE_ACCEPTED;
 }
 
 static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action)
@@ -189,31 +176,26 @@ out:
 
 static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action)
 {
-       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
        struct kvm_s390_local_interrupt *li;
+       struct kvm_vcpu *dst_vcpu = NULL;
        int rc;
 
        if (cpu_addr >= KVM_MAX_VCPUS)
                return SIGP_CC_NOT_OPERATIONAL;
 
-       spin_lock(&fi->lock);
-       li = fi->local_int[cpu_addr];
-       if (li == NULL) {
-               rc = SIGP_CC_NOT_OPERATIONAL;
-               goto unlock;
-       }
+       dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+       if (!dst_vcpu)
+               return SIGP_CC_NOT_OPERATIONAL;
+       li = &dst_vcpu->arch.local_int;
 
        rc = __inject_sigp_stop(li, action);
 
-unlock:
-       spin_unlock(&fi->lock);
        VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", cpu_addr);
 
        if ((action & ACTION_STORE_ON_STOP) != 0 && rc == -ESHUTDOWN) {
                /* If the CPU has already been stopped, we still have
                 * to save the status when doing stop-and-store. This
                 * has to be done after unlocking all spinlocks. */
-               struct kvm_vcpu *dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
                rc = kvm_s390_store_status_unloaded(dst_vcpu,
                                                KVM_S390_STORE_STATUS_NOADDR);
        }
@@ -224,6 +206,8 @@ unlock:
 static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
 {
        int rc;
+       unsigned int i;
+       struct kvm_vcpu *v;
 
        switch (parameter & 0xff) {
        case 0:
@@ -231,6 +215,11 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
                break;
        case 1:
        case 2:
+               kvm_for_each_vcpu(i, v, vcpu->kvm) {
+                       v->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
+                       kvm_clear_async_pf_completion_queue(v);
+               }
+
                rc = SIGP_CC_ORDER_CODE_ACCEPTED;
                break;
        default:
@@ -242,12 +231,18 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
 static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
                             u64 *reg)
 {
-       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
-       struct kvm_s390_local_interrupt *li = NULL;
+       struct kvm_s390_local_interrupt *li;
+       struct kvm_vcpu *dst_vcpu = NULL;
        struct kvm_s390_interrupt_info *inti;
        int rc;
        u8 tmp;
 
+       if (cpu_addr < KVM_MAX_VCPUS)
+               dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+       if (!dst_vcpu)
+               return SIGP_CC_NOT_OPERATIONAL;
+       li = &dst_vcpu->arch.local_int;
+
        /* make sure that the new value is valid memory */
        address = address & 0x7fffe000u;
        if (copy_from_guest_absolute(vcpu, &tmp, address, 1) ||
@@ -261,18 +256,6 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
        if (!inti)
                return SIGP_CC_BUSY;
 
-       spin_lock(&fi->lock);
-       if (cpu_addr < KVM_MAX_VCPUS)
-               li = fi->local_int[cpu_addr];
-
-       if (li == NULL) {
-               *reg &= 0xffffffff00000000UL;
-               *reg |= SIGP_STATUS_INCORRECT_STATE;
-               rc = SIGP_CC_STATUS_STORED;
-               kfree(inti);
-               goto out_fi;
-       }
-
        spin_lock_bh(&li->lock);
        /* cpu must be in stopped state */
        if (!(atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) {
@@ -295,8 +278,6 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
        VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address);
 out_li:
        spin_unlock_bh(&li->lock);
-out_fi:
-       spin_unlock(&fi->lock);
        return rc;
 }
 
@@ -334,28 +315,26 @@ static int __sigp_store_status_at_addr(struct kvm_vcpu *vcpu, u16 cpu_id,
 static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr,
                                u64 *reg)
 {
+       struct kvm_s390_local_interrupt *li;
+       struct kvm_vcpu *dst_vcpu = NULL;
        int rc;
-       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
 
        if (cpu_addr >= KVM_MAX_VCPUS)
                return SIGP_CC_NOT_OPERATIONAL;
 
-       spin_lock(&fi->lock);
-       if (fi->local_int[cpu_addr] == NULL)
-               rc = SIGP_CC_NOT_OPERATIONAL;
-       else {
-               if (atomic_read(fi->local_int[cpu_addr]->cpuflags)
-                   & CPUSTAT_RUNNING) {
-                       /* running */
-                       rc = SIGP_CC_ORDER_CODE_ACCEPTED;
-               } else {
-                       /* not running */
-                       *reg &= 0xffffffff00000000UL;
-                       *reg |= SIGP_STATUS_NOT_RUNNING;
-                       rc = SIGP_CC_STATUS_STORED;
-               }
+       dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+       if (!dst_vcpu)
+               return SIGP_CC_NOT_OPERATIONAL;
+       li = &dst_vcpu->arch.local_int;
+       if (atomic_read(li->cpuflags) & CPUSTAT_RUNNING) {
+               /* running */
+               rc = SIGP_CC_ORDER_CODE_ACCEPTED;
+       } else {
+               /* not running */
+               *reg &= 0xffffffff00000000UL;
+               *reg |= SIGP_STATUS_NOT_RUNNING;
+               rc = SIGP_CC_STATUS_STORED;
        }
-       spin_unlock(&fi->lock);
 
        VCPU_EVENT(vcpu, 4, "sensed running status of cpu %x rc %x", cpu_addr,
                   rc);
@@ -366,26 +345,22 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr,
 /* Test whether the destination CPU is available and not busy */
 static int sigp_check_callable(struct kvm_vcpu *vcpu, u16 cpu_addr)
 {
-       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
        struct kvm_s390_local_interrupt *li;
        int rc = SIGP_CC_ORDER_CODE_ACCEPTED;
+       struct kvm_vcpu *dst_vcpu = NULL;
 
        if (cpu_addr >= KVM_MAX_VCPUS)
                return SIGP_CC_NOT_OPERATIONAL;
 
-       spin_lock(&fi->lock);
-       li = fi->local_int[cpu_addr];
-       if (li == NULL) {
-               rc = SIGP_CC_NOT_OPERATIONAL;
-               goto out;
-       }
-
+       dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+       if (!dst_vcpu)
+               return SIGP_CC_NOT_OPERATIONAL;
+       li = &dst_vcpu->arch.local_int;
        spin_lock_bh(&li->lock);
        if (li->action_bits & ACTION_STOP_ON_STOP)
                rc = SIGP_CC_BUSY;
        spin_unlock_bh(&li->lock);
-out:
-       spin_unlock(&fi->lock);
+
        return rc;
 }
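
The SIGP helpers above all drop the fi->local_int[cpu_addr] lookup under the floating-interrupt lock and instead resolve the destination vcpu with kvm_get_vcpu(), taking its arch.local_int directly, so fi->lock is no longer needed for the lookup at all. A minimal sketch of that lookup pattern, with the kernel structures reduced to stand-in types (illustration only, not the kernel code):

    #include <stdio.h>

    #define MAX_VCPUS          64
    #define CC_NOT_OPERATIONAL 3

    struct local_int { unsigned int cpuflags; };
    struct vcpu { struct local_int local_int; };
    struct vm { struct vcpu *vcpus[MAX_VCPUS]; };

    /* stand-in for kvm_get_vcpu(): NULL when the address is unused */
    static struct vcpu *get_vcpu(struct vm *vm, unsigned int addr)
    {
        return addr < MAX_VCPUS ? vm->vcpus[addr] : NULL;
    }

    static int lookup_local_int(struct vm *vm, unsigned int addr,
                                struct local_int **li)
    {
        struct vcpu *dst = get_vcpu(vm, addr);

        if (!dst)
            return CC_NOT_OPERATIONAL;   /* what SIGP reports back */
        *li = &dst->local_int;
        return 0;
    }

    int main(void)
    {
        struct vcpu v = { .local_int = { .cpuflags = 1 } };
        struct vm vm = { .vcpus = { [2] = &v } };
        struct local_int *li;

        printf("cpu 2: %d\n", lookup_local_int(&vm, 2, &li));
        printf("cpu 5: %d\n", lookup_local_int(&vm, 5, &li));
        return 0;
    }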
 
index 3db76b2..e8e7213 100644 (file)
        TP_printk("%02d[%016lx-%016lx]: " p_str, __entry->id,           \
                  __entry->pswmask, __entry->pswaddr, p_args)
 
+TRACE_EVENT(kvm_s390_major_guest_pfault,
+           TP_PROTO(VCPU_PROTO_COMMON),
+           TP_ARGS(VCPU_ARGS_COMMON),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   ),
+           VCPU_TP_PRINTK("%s", "major fault, maybe applicable for pfault")
+       );
+
+TRACE_EVENT(kvm_s390_pfault_init,
+           TP_PROTO(VCPU_PROTO_COMMON, long pfault_token),
+           TP_ARGS(VCPU_ARGS_COMMON, pfault_token),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(long, pfault_token)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->pfault_token = pfault_token;
+                   ),
+           VCPU_TP_PRINTK("init pfault token %ld", __entry->pfault_token)
+       );
+
+TRACE_EVENT(kvm_s390_pfault_done,
+           TP_PROTO(VCPU_PROTO_COMMON, long pfault_token),
+           TP_ARGS(VCPU_ARGS_COMMON, pfault_token),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(long, pfault_token)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->pfault_token = pfault_token;
+                   ),
+           VCPU_TP_PRINTK("done pfault token %ld", __entry->pfault_token)
+       );
+
 /*
  * Tracepoints for SIE entry and exit.
  */
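
The three TRACE_EVENT() definitions above generate trace_kvm_s390_major_guest_pfault(), trace_kvm_s390_pfault_init() and trace_kvm_s390_pfault_done() hooks that the pfault code is expected to call with the vcpu and, for init/done, the pfault token. A rough model of what such a generated hook behaves like at the call site (plain C stand-in, not the real ftrace machinery):

    #include <stdio.h>
    #include <stdbool.h>

    /* toggled by the tracing core when the event is enabled */
    static bool kvm_s390_pfault_init_enabled;

    /* stand-in for the hook TRACE_EVENT() generates */
    static void trace_kvm_s390_pfault_init(int vcpu_id, long token)
    {
        if (kvm_s390_pfault_init_enabled)
            printf("%02d: init pfault token %ld\n", vcpu_id, token);
    }

    int main(void)
    {
        trace_kvm_s390_pfault_init(0, 0x1234);  /* disabled: no output */
        kvm_s390_pfault_init_enabled = true;
        trace_kvm_s390_pfault_init(0, 0x1234);  /* enabled: one line */
        return 0;
    }
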
index d95265b..88cef50 100644 (file)
@@ -50,6 +50,7 @@
 #define VM_FAULT_BADMAP                0x020000
 #define VM_FAULT_BADACCESS     0x040000
 #define VM_FAULT_SIGNAL                0x080000
+#define VM_FAULT_PFAULT                0x100000
 
 static unsigned long store_indication __read_mostly;
 
@@ -227,6 +228,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int fault)
                        return;
                }
        case VM_FAULT_BADCONTEXT:
+       case VM_FAULT_PFAULT:
                do_no_context(regs);
                break;
        case VM_FAULT_SIGNAL:
@@ -264,6 +266,9 @@ static noinline void do_fault_error(struct pt_regs *regs, int fault)
  */
 static inline int do_exception(struct pt_regs *regs, int access)
 {
+#ifdef CONFIG_PGSTE
+       struct gmap *gmap;
+#endif
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
@@ -304,9 +309,10 @@ static inline int do_exception(struct pt_regs *regs, int access)
        down_read(&mm->mmap_sem);
 
 #ifdef CONFIG_PGSTE
-       if ((current->flags & PF_VCPU) && S390_lowcore.gmap) {
-               address = __gmap_fault(address,
-                                    (struct gmap *) S390_lowcore.gmap);
+       gmap = (struct gmap *)
+               ((current->flags & PF_VCPU) ? S390_lowcore.gmap : 0);
+       if (gmap) {
+               address = __gmap_fault(address, gmap);
                if (address == -EFAULT) {
                        fault = VM_FAULT_BADMAP;
                        goto out_up;
@@ -315,6 +321,8 @@ static inline int do_exception(struct pt_regs *regs, int access)
                        fault = VM_FAULT_OOM;
                        goto out_up;
                }
+               if (gmap->pfault_enabled)
+                       flags |= FAULT_FLAG_RETRY_NOWAIT;
        }
 #endif
 
@@ -371,9 +379,19 @@ retry:
                                      regs, address);
                }
                if (fault & VM_FAULT_RETRY) {
+#ifdef CONFIG_PGSTE
+                       if (gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) {
+                               /* FAULT_FLAG_RETRY_NOWAIT has been set,
+                                * mmap_sem has not been released */
+                               current->thread.gmap_pfault = 1;
+                               fault = VM_FAULT_PFAULT;
+                               goto out_up;
+                       }
+#endif
                        /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
                         * of starvation. */
-                       flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       flags &= ~(FAULT_FLAG_ALLOW_RETRY |
+                                  FAULT_FLAG_RETRY_NOWAIT);
                        flags |= FAULT_FLAG_TRIED;
                        down_read(&mm->mmap_sem);
                        goto retry;
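
With pfault enabled on the guest address space the handler adds FAULT_FLAG_RETRY_NOWAIT, so a fault that would have to sleep comes back as VM_FAULT_RETRY with mmap_sem still held; the handler then records gmap_pfault and returns the new VM_FAULT_PFAULT code so that KVM can turn the fault into an asynchronous page fault instead of blocking the vcpu. A simplified model of that retry decision (VM_FAULT_PFAULT is taken from the hunk above, the other flag values are placeholders):

    #include <stdio.h>
    #include <stdbool.h>

    #define FAULT_FLAG_ALLOW_RETRY  0x01     /* placeholder value */
    #define FAULT_FLAG_RETRY_NOWAIT 0x02     /* placeholder value */
    #define VM_FAULT_RETRY          0x04     /* placeholder value */
    #define VM_FAULT_PFAULT         0x100000

    /* sketch of the VM_FAULT_RETRY handling when pfault is enabled */
    static int handle_retry(bool pfault_enabled, int fault, int *flags)
    {
        if (!(fault & VM_FAULT_RETRY))
            return fault;
        if (pfault_enabled && (*flags & FAULT_FLAG_RETRY_NOWAIT))
            return VM_FAULT_PFAULT;          /* hand off to async pfault */
        /* otherwise retry synchronously, but only once */
        *flags &= ~(FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);
        return VM_FAULT_RETRY;
    }

    int main(void)
    {
        int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;

        printf("%#x\n", handle_retry(true, VM_FAULT_RETRY, &flags));
        printf("%#x\n", handle_retry(false, VM_FAULT_RETRY, &flags));
        return 0;
    }
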
index fdf83af..fcaf9c9 100644 (file)
@@ -337,6 +337,11 @@ struct kvm_pmu {
        u64 reprogram_pmi;
 };
 
+enum {
+       KVM_DEBUGREG_BP_ENABLED = 1,
+       KVM_DEBUGREG_WONT_EXIT = 2,
+};
+
 struct kvm_vcpu_arch {
        /*
         * rip and regs accesses must go through
@@ -444,7 +449,6 @@ struct kvm_vcpu_arch {
        } st;
 
        u64 last_guest_tsc;
-       u64 last_kernel_ns;
        u64 last_host_tsc;
        u64 tsc_offset_adjustment;
        u64 this_tsc_nsec;
@@ -464,7 +468,7 @@ struct kvm_vcpu_arch {
        struct mtrr_state_type mtrr_state;
        u32 pat;
 
-       int switch_db_regs;
+       unsigned switch_db_regs;
        unsigned long db[KVM_NR_DB_REGS];
        unsigned long dr6;
        unsigned long dr7;
@@ -599,6 +603,8 @@ struct kvm_arch {
        bool use_master_clock;
        u64 master_kernel_ns;
        cycle_t master_cycle_now;
+       struct delayed_work kvmclock_update_work;
+       struct delayed_work kvmclock_sync_work;
 
        struct kvm_xen_hvm_config xen_hvm_config;
 
@@ -702,6 +708,7 @@ struct kvm_x86_ops {
        void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
        u64 (*get_dr6)(struct kvm_vcpu *vcpu);
        void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
+       void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
        void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
        void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
        unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
@@ -728,8 +735,8 @@ struct kvm_x86_ops {
        int (*nmi_allowed)(struct kvm_vcpu *vcpu);
        bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
        void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
-       int (*enable_nmi_window)(struct kvm_vcpu *vcpu);
-       int (*enable_irq_window)(struct kvm_vcpu *vcpu);
+       void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
+       void (*enable_irq_window)(struct kvm_vcpu *vcpu);
        void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
        int (*vm_has_apicv)(struct kvm *kvm);
        void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
@@ -765,6 +772,9 @@ struct kvm_x86_ops {
                               struct x86_instruction_info *info,
                               enum x86_intercept_stage stage);
        void (*handle_external_intr)(struct kvm_vcpu *vcpu);
+       bool (*mpx_supported)(void);
+
+       int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
 };
 
 struct kvm_arch_async_pf {
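
switch_db_regs turns from a plain int into a flags word so that KVM_DEBUGREG_BP_ENABLED (guest DR7 has breakpoints armed) and KVM_DEBUGREG_WONT_EXIT (MOV DR exits are currently disabled, so the hardware copies may be stale) can be tracked independently; reading the later svm.c/vmx.c hunks, any non-zero value still means the debug registers get reloaded around guest entry. A small illustration of the flag handling (the semantics in the comments are an interpretation of those hunks, not quoted from them):

    #include <stdio.h>

    enum {
        KVM_DEBUGREG_BP_ENABLED = 1,
        KVM_DEBUGREG_WONT_EXIT  = 2,
    };

    int main(void)
    {
        unsigned int switch_db_regs = 0;

        switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;  /* DR7 arms a breakpoint */
        switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;   /* MOV DR exits turned off */
        switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;  /* after syncing dirty regs */

        printf("reload debug registers: %s\n",
               switch_db_regs ? "yes" : "no");
        return 0;
    }
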
index 2067264..7004d21 100644 (file)
@@ -85,6 +85,7 @@
 #define VM_EXIT_SAVE_IA32_EFER                  0x00100000
 #define VM_EXIT_LOAD_IA32_EFER                  0x00200000
 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER       0x00400000
+#define VM_EXIT_CLEAR_BNDCFGS                   0x00800000
 
 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR      0x00036dff
 
@@ -95,6 +96,7 @@
 #define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL     0x00002000
 #define VM_ENTRY_LOAD_IA32_PAT                 0x00004000
 #define VM_ENTRY_LOAD_IA32_EFER                 0x00008000
+#define VM_ENTRY_LOAD_BNDCFGS                   0x00010000
 
 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR     0x000011ff
 
@@ -174,6 +176,8 @@ enum vmcs_field {
        GUEST_PDPTR2_HIGH               = 0x0000280f,
        GUEST_PDPTR3                    = 0x00002810,
        GUEST_PDPTR3_HIGH               = 0x00002811,
+       GUEST_BNDCFGS                   = 0x00002812,
+       GUEST_BNDCFGS_HIGH              = 0x00002813,
        HOST_IA32_PAT                   = 0x00002c00,
        HOST_IA32_PAT_HIGH              = 0x00002c01,
        HOST_IA32_EFER                  = 0x00002c02,
index 6c1d741..d949ef2 100644 (file)
@@ -16,6 +16,8 @@
 #define XSTATE_Hi16_ZMM                0x80
 
 #define XSTATE_FPSSE   (XSTATE_FP | XSTATE_SSE)
+/* Bit 63 of XCR0 is reserved for future expansion */
+#define XSTATE_EXTEND_MASK     (~(XSTATE_FPSSE | (1ULL << 63)))
 
 #define FXSAVE_SIZE    512
 
index 4924f4b..c827ace 100644 (file)
 #define MSR_SMI_COUNT                  0x00000034
 #define MSR_IA32_FEATURE_CONTROL        0x0000003a
 #define MSR_IA32_TSC_ADJUST             0x0000003b
+#define MSR_IA32_BNDCFGS               0x00000d90
 
 #define FEATURE_CONTROL_LOCKED                         (1<<0)
 #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX       (1<<1)
index 713f1b3..0331cb3 100644 (file)
@@ -417,7 +417,6 @@ void kvm_disable_steal_time(void)
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
-       WARN_ON(kvm_register_clock("primary cpu clock"));
        kvm_guest_cpu_init();
        native_smp_prepare_boot_cpu();
        kvm_spinlock_init();
index e604109..d9156ce 100644 (file)
@@ -242,7 +242,7 @@ void __init kvmclock_init(void)
        hv_clock = __va(mem);
        memset(hv_clock, 0, size);
 
-       if (kvm_register_clock("boot clock")) {
+       if (kvm_register_clock("primary cpu clock")) {
                hv_clock = NULL;
                memblock_free(mem, size);
                return;
index e5503d8..bea6067 100644 (file)
@@ -28,7 +28,7 @@ static u32 xstate_required_size(u64 xstate_bv)
        int feature_bit = 0;
        u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
 
-       xstate_bv &= ~XSTATE_FPSSE;
+       xstate_bv &= XSTATE_EXTEND_MASK;
        while (xstate_bv) {
                if (xstate_bv & 0x1) {
                        u32 eax, ebx, ecx, edx;
@@ -43,6 +43,16 @@ static u32 xstate_required_size(u64 xstate_bv)
        return ret;
 }
 
+u64 kvm_supported_xcr0(void)
+{
+       u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0;
+
+       if (!kvm_x86_ops->mpx_supported())
+               xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR);
+
+       return xcr0;
+}
+
 void kvm_update_cpuid(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpuid_entry2 *best;
@@ -73,9 +83,9 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu)
        } else {
                vcpu->arch.guest_supported_xcr0 =
                        (best->eax | ((u64)best->edx << 32)) &
-                       host_xcr0 & KVM_SUPPORTED_XCR0;
-               vcpu->arch.guest_xstate_size =
-                       xstate_required_size(vcpu->arch.guest_supported_xcr0);
+                       kvm_supported_xcr0();
+               vcpu->arch.guest_xstate_size = best->ebx =
+                       xstate_required_size(vcpu->arch.xcr0);
        }
 
        kvm_pmu_cpuid_update(vcpu);
@@ -210,13 +220,6 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        entry->flags = 0;
 }
 
-static bool supported_xcr0_bit(unsigned bit)
-{
-       u64 mask = ((u64)1 << bit);
-
-       return mask & KVM_SUPPORTED_XCR0 & host_xcr0;
-}
-
 #define F(x) bit(X86_FEATURE_##x)
 
 static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
@@ -256,6 +259,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 #endif
        unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
        unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
+       unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0;
 
        /* cpuid 1.edx */
        const u32 kvm_supported_word0_x86_features =
@@ -303,7 +307,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        /* cpuid 7.0.ebx */
        const u32 kvm_supported_word9_x86_features =
                F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
-               F(BMI2) | F(ERMS) | f_invpcid | F(RTM);
+               F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
+               F(ADX);
 
        /* all calls to cpuid_count() should be made on the same cpu */
        get_cpu();
@@ -436,16 +441,18 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        }
        case 0xd: {
                int idx, i;
+               u64 supported = kvm_supported_xcr0();
 
-               entry->eax &= host_xcr0 & KVM_SUPPORTED_XCR0;
-               entry->edx &= (host_xcr0 & KVM_SUPPORTED_XCR0) >> 32;
+               entry->eax &= supported;
+               entry->edx &= supported >> 32;
                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                for (idx = 1, i = 1; idx < 64; ++idx) {
+                       u64 mask = ((u64)1 << idx);
                        if (*nent >= maxnent)
                                goto out;
 
                        do_cpuid_1_ent(&entry[i], function, idx);
-                       if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
+                       if (entry[i].eax == 0 || !(supported & mask))
                                continue;
                        entry[i].flags |=
                               KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
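
kvm_supported_xcr0() becomes the single place where host XCR0, KVM's supported mask and the backend's mpx_supported() answer are combined, and both the guest_supported_xcr0 computation and the CPUID 0xD sub-leaf filtering now go through it instead of open-coding host_xcr0 & KVM_SUPPORTED_XCR0. A hedged sketch of the filtering (the XSTATE_* bit positions and the supported mask here are placeholders, not the kernel's definitions):

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    #define XSTATE_FP      (1ULL << 0)
    #define XSTATE_SSE     (1ULL << 1)
    #define XSTATE_YMM     (1ULL << 2)
    #define XSTATE_BNDREGS (1ULL << 3)
    #define XSTATE_BNDCSR  (1ULL << 4)

    #define SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM | \
                            XSTATE_BNDREGS | XSTATE_BNDCSR)

    static uint64_t supported_xcr0(uint64_t host_xcr0, bool mpx)
    {
        uint64_t xcr0 = SUPPORTED_XCR0 & host_xcr0;

        if (!mpx)
            xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR);
        return xcr0;
    }

    int main(void)
    {
        uint64_t host = SUPPORTED_XCR0;   /* host exposes everything */

        /* without MPX the BND bits disappear from the guest view */
        printf("%#llx\n", (unsigned long long)supported_xcr0(host, false));
        printf("%#llx\n", (unsigned long long)supported_xcr0(host, true));
        return 0;
    }
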
index 07ffca0..205b17e 100644 (file)
@@ -3668,6 +3668,10 @@ static const struct gprefix pfx_vmovntpx = {
        I(0, em_mov), N, N, N,
 };
 
+static const struct gprefix pfx_0f_28_0f_29 = {
+       I(Aligned, em_mov), I(Aligned, em_mov), N, N,
+};
+
 static const struct escape escape_d9 = { {
        N, N, N, N, N, N, N, I(DstMem, em_fnstcw),
 }, {
@@ -3870,7 +3874,9 @@ static const struct opcode twobyte_table[256] = {
        IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
        IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
        N, N, N, N,
-       N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx),
+       GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29),
+       GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29),
+       N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx),
        N, N, N, N,
        /* 0x30 - 0x3F */
        II(ImplicitOps | Priv, em_wrmsr, wrmsr),
index 9b53135..f5704d9 100644 (file)
@@ -3329,7 +3329,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
        arch.direct_map = vcpu->arch.mmu.direct_map;
        arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
 
-       return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
+       return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch);
 }
 
 static bool can_do_async_pf(struct kvm_vcpu *vcpu)
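
kvm_setup_async_pf() is now handed the host virtual address for the faulting gfn rather than the gfn itself, presumably so the common async-pf worker can operate on host addresses (which also suits the s390 pfault work elsewhere in this merge). For reference, gfn_to_hva() amounts to a memslot lookup plus a page offset; a simplified version with a mock memslot (the real helper returns a dedicated error cookie, not 0, for unmapped gfns):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12

    struct memslot {
        uint64_t base_gfn;         /* first guest frame in the slot */
        uint64_t npages;
        uint64_t userspace_addr;   /* hva backing base_gfn */
    };

    static uint64_t gfn_to_hva(const struct memslot *slot, uint64_t gfn)
    {
        if (gfn < slot->base_gfn || gfn >= slot->base_gfn + slot->npages)
            return 0;              /* out of slot */
        return slot->userspace_addr +
               ((gfn - slot->base_gfn) << PAGE_SHIFT);
    }

    int main(void)
    {
        struct memslot slot = { 0x100, 0x200, 0x7f0000000000ULL };

        printf("%#llx\n", (unsigned long long)gfn_to_hva(&slot, 0x180));
        return 0;
    }
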
index cba218a..b1e6c1b 100644 (file)
@@ -913,7 +913,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
  *   and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
  *   used by guest then tlbs are not flushed, so guest is allowed to access the
  *   freed pages.
- *   And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
+ *   We set tlbs_dirty to let the notifier know this change and delay the flush
+ *   until such a case actually happens.
  */
 static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
@@ -942,7 +943,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                        return -EINVAL;
 
                if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
-                       vcpu->kvm->tlbs_dirty++;
+                       vcpu->kvm->tlbs_dirty = true;
                        continue;
                }
 
@@ -957,7 +958,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
                if (gfn != sp->gfns[i]) {
                        drop_spte(vcpu->kvm, &sp->spt[i]);
-                       vcpu->kvm->tlbs_dirty++;
+                       vcpu->kvm->tlbs_dirty = true;
                        continue;
                }
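
The sync_page path no longer counts dropped sptes; it simply marks kvm->tlbs_dirty and relies on the mmu-notifier invalidate path to flush before the stale translations can be reused, as the rewritten comment above says. A minimal model of that deferred-flush handshake (single-threaded sketch; the real code has to care about ordering between the two sides):

    #include <stdio.h>
    #include <stdbool.h>

    static bool tlbs_dirty;
    static int remote_flushes;

    static void sync_page_drop_spte(void)
    {
        tlbs_dirty = true;          /* defer the flush */
    }

    static void invalidate_range_start(void)
    {
        if (tlbs_dirty) {           /* flush only if something was dropped */
            remote_flushes++;
            tlbs_dirty = false;
        }
    }

    int main(void)
    {
        sync_page_drop_spte();
        sync_page_drop_spte();
        invalidate_range_start();
        printf("remote flushes: %d\n", remote_flushes);   /* 1, not 2 */
        return 0;
    }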
 
index 2de1bc0..7f4f9c2 100644 (file)
@@ -34,6 +34,7 @@
 #include <asm/perf_event.h>
 #include <asm/tlbflush.h>
 #include <asm/desc.h>
+#include <asm/debugreg.h>
 #include <asm/kvm_para.h>
 
 #include <asm/virtext.h>
@@ -303,20 +304,35 @@ static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
        return vmcb->control.intercept_cr & (1U << bit);
 }
 
-static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
+static inline void set_dr_intercepts(struct vcpu_svm *svm)
 {
        struct vmcb *vmcb = get_host_vmcb(svm);
 
-       vmcb->control.intercept_dr |= (1U << bit);
+       vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
+               | (1 << INTERCEPT_DR1_READ)
+               | (1 << INTERCEPT_DR2_READ)
+               | (1 << INTERCEPT_DR3_READ)
+               | (1 << INTERCEPT_DR4_READ)
+               | (1 << INTERCEPT_DR5_READ)
+               | (1 << INTERCEPT_DR6_READ)
+               | (1 << INTERCEPT_DR7_READ)
+               | (1 << INTERCEPT_DR0_WRITE)
+               | (1 << INTERCEPT_DR1_WRITE)
+               | (1 << INTERCEPT_DR2_WRITE)
+               | (1 << INTERCEPT_DR3_WRITE)
+               | (1 << INTERCEPT_DR4_WRITE)
+               | (1 << INTERCEPT_DR5_WRITE)
+               | (1 << INTERCEPT_DR6_WRITE)
+               | (1 << INTERCEPT_DR7_WRITE);
 
        recalc_intercepts(svm);
 }
 
-static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
+static inline void clr_dr_intercepts(struct vcpu_svm *svm)
 {
        struct vmcb *vmcb = get_host_vmcb(svm);
 
-       vmcb->control.intercept_dr &= ~(1U << bit);
+       vmcb->control.intercept_dr = 0;
 
        recalc_intercepts(svm);
 }
@@ -1080,23 +1096,7 @@ static void init_vmcb(struct vcpu_svm *svm)
        set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
        set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 
-       set_dr_intercept(svm, INTERCEPT_DR0_READ);
-       set_dr_intercept(svm, INTERCEPT_DR1_READ);
-       set_dr_intercept(svm, INTERCEPT_DR2_READ);
-       set_dr_intercept(svm, INTERCEPT_DR3_READ);
-       set_dr_intercept(svm, INTERCEPT_DR4_READ);
-       set_dr_intercept(svm, INTERCEPT_DR5_READ);
-       set_dr_intercept(svm, INTERCEPT_DR6_READ);
-       set_dr_intercept(svm, INTERCEPT_DR7_READ);
-
-       set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
-       set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
-       set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
-       set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
-       set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
-       set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
-       set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
-       set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
+       set_dr_intercepts(svm);
 
        set_exception_intercept(svm, PF_VECTOR);
        set_exception_intercept(svm, UD_VECTOR);
@@ -1684,6 +1684,21 @@ static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
        mark_dirty(svm->vmcb, VMCB_DR);
 }
 
+static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       get_debugreg(vcpu->arch.db[0], 0);
+       get_debugreg(vcpu->arch.db[1], 1);
+       get_debugreg(vcpu->arch.db[2], 2);
+       get_debugreg(vcpu->arch.db[3], 3);
+       vcpu->arch.dr6 = svm_get_dr6(vcpu);
+       vcpu->arch.dr7 = svm->vmcb->save.dr7;
+
+       vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+       set_dr_intercepts(svm);
+}
+
 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -2842,6 +2857,7 @@ static int iret_interception(struct vcpu_svm *svm)
        clr_intercept(svm, INTERCEPT_IRET);
        svm->vcpu.arch.hflags |= HF_IRET_MASK;
        svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
        return 1;
 }
 
@@ -2974,6 +2990,17 @@ static int dr_interception(struct vcpu_svm *svm)
        unsigned long val;
        int err;
 
+       if (svm->vcpu.guest_debug == 0) {
+               /*
+                * No more DR vmexits; force a reload of the debug registers
+                * and reenter on this instruction.  The next vmexit will
+                * retrieve the full state of the debug registers.
+                */
+               clr_dr_intercepts(svm);
+               svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+               return 1;
+       }
+
        if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
                return emulate_on_interception(svm);
 
@@ -3649,7 +3676,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
        return ret;
 }
 
-static int enable_irq_window(struct kvm_vcpu *vcpu)
+static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -3663,16 +3690,15 @@ static int enable_irq_window(struct kvm_vcpu *vcpu)
                svm_set_vintr(svm);
                svm_inject_irq(svm, 0x0);
        }
-       return 0;
 }
 
-static int enable_nmi_window(struct kvm_vcpu *vcpu)
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
        if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
            == HF_NMI_MASK)
-               return 0; /* IRET will cause a vm exit */
+               return; /* IRET will cause a vm exit */
 
        /*
         * Something prevents NMI from been injected. Single step over possible
@@ -3681,7 +3707,6 @@ static int enable_nmi_window(struct kvm_vcpu *vcpu)
        svm->nmi_singlestep = true;
        svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
        update_db_bp_intercept(vcpu);
-       return 0;
 }
 
 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -4064,6 +4089,11 @@ static bool svm_invpcid_supported(void)
        return false;
 }
 
+static bool svm_mpx_supported(void)
+{
+       return false;
+}
+
 static bool svm_has_wbinvd_exit(void)
 {
        return true;
@@ -4302,6 +4332,7 @@ static struct kvm_x86_ops svm_x86_ops = {
        .get_dr6 = svm_get_dr6,
        .set_dr6 = svm_set_dr6,
        .set_dr7 = svm_set_dr7,
+       .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
        .cache_reg = svm_cache_reg,
        .get_rflags = svm_get_rflags,
        .set_rflags = svm_set_rflags,
@@ -4345,6 +4376,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 
        .rdtscp_supported = svm_rdtscp_supported,
        .invpcid_supported = svm_invpcid_supported,
+       .mpx_supported = svm_mpx_supported,
 
        .set_supported_cpuid = svm_set_supported_cpuid,
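
The SVM changes batch the sixteen DR read/write intercepts into set_dr_intercepts()/clr_dr_intercepts() and add the same debug-register optimization as the VMX code further down: when the guest touches a debug register while no host-side guest_debug is active, the intercepts are dropped and KVM_DEBUGREG_WONT_EXIT is set, and the next vmexit runs svm_sync_dirty_debug_regs() to read the hardware values back and re-arm the intercepts. A state-machine sketch of that round trip (mocked, not the SVM code):

    #include <stdio.h>
    #include <stdbool.h>

    #define KVM_DEBUGREG_WONT_EXIT 2

    static bool dr_intercepts_on = true;
    static unsigned int switch_db_regs;

    static void dr_interception(void)          /* guest executed MOV DR */
    {
        dr_intercepts_on = false;              /* no more DR vmexits */
        switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
    }

    static void sync_dirty_debug_regs(void)    /* on the next vmexit */
    {
        /* DR0-DR3, DR6, DR7 would be read back from hardware here */
        switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
        dr_intercepts_on = true;               /* future accesses exit again */
    }

    int main(void)
    {
        dr_interception();
        printf("intercepts %d, flags %u\n", dr_intercepts_on, switch_db_regs);
        sync_dirty_debug_regs();
        printf("intercepts %d, flags %u\n", dr_intercepts_on, switch_db_regs);
        return 0;
    }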
 
index 3927528..1320e0f 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
 #include <linux/tboot.h>
+#include <linux/hrtimer.h>
 #include "kvm_cache_regs.h"
 #include "x86.h"
 
@@ -42,6 +43,7 @@
 #include <asm/i387.h>
 #include <asm/xcr.h>
 #include <asm/perf_event.h>
+#include <asm/debugreg.h>
 #include <asm/kexec.h>
 
 #include "trace.h"
@@ -110,6 +112,8 @@ module_param(nested, bool, S_IRUGO);
 
 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
 
+#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
+
 /*
  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
  * ple_gap:    upper bound on the amount of time between two successive
@@ -202,6 +206,7 @@ struct __packed vmcs12 {
        u64 guest_pdptr1;
        u64 guest_pdptr2;
        u64 guest_pdptr3;
+       u64 guest_bndcfgs;
        u64 host_ia32_pat;
        u64 host_ia32_efer;
        u64 host_ia32_perf_global_ctrl;
@@ -374,6 +379,9 @@ struct nested_vmx {
         */
        struct page *apic_access_page;
        u64 msr_ia32_feature_control;
+
+       struct hrtimer preemption_timer;
+       bool preemption_timer_expired;
 };
 
 #define POSTED_INTR_ON  0
@@ -441,6 +449,7 @@ struct vcpu_vmx {
 #endif
                int           gs_ldt_reload_needed;
                int           fs_reload_needed;
+               u64           msr_host_bndcfgs;
        } host_state;
        struct {
                int vm86_active;
@@ -533,6 +542,7 @@ static const unsigned long shadow_read_write_fields[] = {
        GUEST_CS_LIMIT,
        GUEST_CS_BASE,
        GUEST_ES_BASE,
+       GUEST_BNDCFGS,
        CR0_GUEST_HOST_MASK,
        CR0_READ_SHADOW,
        CR4_READ_SHADOW,
@@ -588,6 +598,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
        FIELD64(GUEST_PDPTR1, guest_pdptr1),
        FIELD64(GUEST_PDPTR2, guest_pdptr2),
        FIELD64(GUEST_PDPTR3, guest_pdptr3),
+       FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
        FIELD64(HOST_IA32_PAT, host_ia32_pat),
        FIELD64(HOST_IA32_EFER, host_ia32_efer),
        FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
@@ -718,6 +729,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
+static bool vmx_mpx_supported(void);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
@@ -728,6 +740,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);
 static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
+static bool vmx_mpx_supported(void);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -1047,6 +1060,12 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
        return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
+{
+       return vmcs12->pin_based_vm_exec_control &
+               PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
 static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
 {
        return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
@@ -1710,6 +1729,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
        if (is_long_mode(&vmx->vcpu))
                wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 #endif
+       if (boot_cpu_has(X86_FEATURE_MPX))
+               rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
        for (i = 0; i < vmx->save_nmsrs; ++i)
                kvm_set_shared_msr(vmx->guest_msrs[i].index,
                                   vmx->guest_msrs[i].data,
@@ -1747,6 +1768,8 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 #ifdef CONFIG_X86_64
        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 #endif
+       if (vmx->host_state.msr_host_bndcfgs)
+               wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
        /*
         * If the FPU is not active (through the host task or
         * the guest vcpu), then restore the cr0.TS bit.
@@ -2248,9 +2271,9 @@ static __init void nested_vmx_setup_ctls_msrs(void)
         */
        nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
        nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
-               PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS |
+               PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
+       nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
                PIN_BASED_VMX_PREEMPTION_TIMER;
-       nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 
        /*
         * Exit controls
@@ -2265,15 +2288,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #ifdef CONFIG_X86_64
                VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
-               VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+               VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+       nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+               VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
                VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
-       if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) ||
-           !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
-               nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
-               nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
-       }
-       nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
-               VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER);
+       if (vmx_mpx_supported())
+               nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
 
        /* entry controls */
        rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2287,6 +2307,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
                VM_ENTRY_LOAD_IA32_PAT;
        nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
                                       VM_ENTRY_LOAD_IA32_EFER);
+       if (vmx_mpx_supported())
+               nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
 
        /* cpu-based controls */
        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2342,9 +2364,9 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 
        /* miscellaneous data */
        rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
-       nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
-               VMX_MISC_SAVE_EFER_LMA;
-       nested_vmx_misc_low |= VMX_MISC_ACTIVITY_HLT;
+       nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
+       nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
+               VMX_MISC_ACTIVITY_HLT;
        nested_vmx_misc_high = 0;
 }
 
@@ -2479,6 +2501,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
        case MSR_IA32_SYSENTER_ESP:
                data = vmcs_readl(GUEST_SYSENTER_ESP);
                break;
+       case MSR_IA32_BNDCFGS:
+               if (!vmx_mpx_supported())
+                       return 1;
+               data = vmcs_read64(GUEST_BNDCFGS);
+               break;
        case MSR_IA32_FEATURE_CONTROL:
                if (!nested_vmx_allowed(vcpu))
                        return 1;
@@ -2547,6 +2574,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_SYSENTER_ESP:
                vmcs_writel(GUEST_SYSENTER_ESP, data);
                break;
+       case MSR_IA32_BNDCFGS:
+               if (!vmx_mpx_supported())
+                       return 1;
+               vmcs_write64(GUEST_BNDCFGS, data);
+               break;
        case MSR_IA32_TSC:
                kvm_write_tsc(vcpu, msr_info);
                break;
@@ -2832,12 +2864,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                      vmx_capability.ept, vmx_capability.vpid);
        }
 
-       min = 0;
+       min = VM_EXIT_SAVE_DEBUG_CONTROLS;
 #ifdef CONFIG_X86_64
        min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #endif
        opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
-               VM_EXIT_ACK_INTR_ON_EXIT;
+               VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
                                &_vmexit_control) < 0)
                return -EIO;
@@ -2853,8 +2885,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
                _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
 
-       min = 0;
-       opt = VM_ENTRY_LOAD_IA32_PAT;
+       min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
+       opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
                                &_vmentry_control) < 0)
                return -EIO;
@@ -4223,6 +4255,10 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 {
        u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
+
+       if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
+               exec_control &= ~CPU_BASED_MOV_DR_EXITING;
+
        if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
                exec_control &= ~CPU_BASED_TPR_SHADOW;
 #ifdef CONFIG_X86_64
@@ -4496,39 +4532,28 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
                PIN_BASED_NMI_EXITING;
 }
 
-static int enable_irq_window(struct kvm_vcpu *vcpu)
+static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
        u32 cpu_based_vm_exec_control;
 
-       if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
-               /*
-                * We get here if vmx_interrupt_allowed() said we can't
-                * inject to L1 now because L2 must run. The caller will have
-                * to make L2 exit right after entry, so we can inject to L1
-                * more promptly.
-                */
-               return -EBUSY;
-
        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-       return 0;
 }
 
-static int enable_nmi_window(struct kvm_vcpu *vcpu)
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
 {
        u32 cpu_based_vm_exec_control;
 
-       if (!cpu_has_virtual_nmis())
-               return enable_irq_window(vcpu);
-
-       if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI)
-               return enable_irq_window(vcpu);
+       if (!cpu_has_virtual_nmis() ||
+           vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+               enable_irq_window(vcpu);
+               return;
+       }
 
        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-       return 0;
 }
 
 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
@@ -4620,22 +4645,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(vcpu)) {
-               if (to_vmx(vcpu)->nested.nested_run_pending)
-                       return 0;
-               if (nested_exit_on_nmi(vcpu)) {
-                       nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
-                                         NMI_VECTOR | INTR_TYPE_NMI_INTR |
-                                         INTR_INFO_VALID_MASK, 0);
-                       /*
-                        * The NMI-triggered VM exit counts as injection:
-                        * clear this one and block further NMIs.
-                        */
-                       vcpu->arch.nmi_pending = 0;
-                       vmx_set_nmi_mask(vcpu, true);
-                       return 0;
-               }
-       }
+       if (to_vmx(vcpu)->nested.nested_run_pending)
+               return 0;
 
        if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
                return 0;
@@ -4647,19 +4658,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(vcpu)) {
-               if (to_vmx(vcpu)->nested.nested_run_pending)
-                       return 0;
-               if (nested_exit_on_intr(vcpu)) {
-                       nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
-                                         0, 0);
-                       /*
-                        * fall through to normal code, but now in L1, not L2
-                        */
-               }
-       }
-
-       return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+       return (!to_vmx(vcpu)->nested.nested_run_pending &&
+               vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
                !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                        (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
 }
@@ -5102,6 +5102,22 @@ static int handle_dr(struct kvm_vcpu *vcpu)
                }
        }
 
+       if (vcpu->guest_debug == 0) {
+               u32 cpu_based_vm_exec_control;
+
+               cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+               cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
+               vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+
+               /*
+                * No more DR vmexits; force a reload of the debug registers
+                * and reenter on this instruction.  The next vmexit will
+                * retrieve the full state of the debug registers.
+                */
+               vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+               return 1;
+       }
+
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
        dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
        reg = DEBUG_REG_ACCESS_REG(exit_qualification);
@@ -5128,6 +5144,24 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
 {
 }
 
+static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+       u32 cpu_based_vm_exec_control;
+
+       get_debugreg(vcpu->arch.db[0], 0);
+       get_debugreg(vcpu->arch.db[1], 1);
+       get_debugreg(vcpu->arch.db[2], 2);
+       get_debugreg(vcpu->arch.db[3], 3);
+       get_debugreg(vcpu->arch.dr6, 6);
+       vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
+
+       vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+
+       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+       cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
+       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
 {
        vmcs_writel(GUEST_DR7, val);
@@ -5727,6 +5761,18 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
         */
 }
 
+static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
+{
+       struct vcpu_vmx *vmx =
+               container_of(timer, struct vcpu_vmx, nested.preemption_timer);
+
+       vmx->nested.preemption_timer_expired = true;
+       kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
+       kvm_vcpu_kick(&vmx->vcpu);
+
+       return HRTIMER_NORESTART;
+}
+
 /*
  * Emulate the VMXON instruction.
  * Currently, we just remember that VMX is active, and do not save or even
@@ -5791,6 +5837,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
        INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
        vmx->nested.vmcs02_num = 0;
 
+       hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
+                    HRTIMER_MODE_REL);
+       vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+
        vmx->nested.vmxon = true;
 
        skip_emulated_instruction(vcpu);
@@ -6767,9 +6817,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                 * table is L0's fault.
                 */
                return 0;
-       case EXIT_REASON_PREEMPTION_TIMER:
-               return vmcs12->pin_based_vm_exec_control &
-                       PIN_BASED_VMX_PREEMPTION_TIMER;
        case EXIT_REASON_WBINVD:
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
        case EXIT_REASON_XSETBV:
@@ -6785,27 +6832,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
        *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
 }
 
-static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
-{
-       u64 delta_tsc_l1;
-       u32 preempt_val_l1, preempt_val_l2, preempt_scale;
-
-       if (!(get_vmcs12(vcpu)->pin_based_vm_exec_control &
-                       PIN_BASED_VMX_PREEMPTION_TIMER))
-               return;
-       preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
-                       MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
-       preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
-       delta_tsc_l1 = vmx_read_l1_tsc(vcpu, native_read_tsc())
-               - vcpu->arch.last_guest_tsc;
-       preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
-       if (preempt_val_l2 <= preempt_val_l1)
-               preempt_val_l2 = 0;
-       else
-               preempt_val_l2 -= preempt_val_l1;
-       vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
-}
-
 /*
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
@@ -7052,6 +7078,12 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
                local_irq_enable();
 }
 
+static bool vmx_mpx_supported(void)
+{
+       return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
+               (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
+}
+
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
        u32 exit_intr_info;
@@ -7218,8 +7250,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        atomic_switch_perf_msrs(vmx);
        debugctlmsr = get_debugctlmsr();
 
-       if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending)
-               nested_adjust_preemption_timer(vcpu);
        vmx->__launched = vmx->loaded_vmcs->launched;
        asm(
                /* Store host registers */
@@ -7616,6 +7646,28 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
                kvm_inject_page_fault(vcpu, fault);
 }
 
+static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
+{
+       u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (vcpu->arch.virtual_tsc_khz == 0)
+               return;
+
+       /* Make sure short timeouts reliably trigger an immediate vmexit.
+        * hrtimer_start does not guarantee this. */
+       if (preemption_timeout <= 1) {
+               vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
+               return;
+       }
+
+       preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+       preemption_timeout *= 1000000;
+       do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
+       hrtimer_start(&vmx->nested.preemption_timer,
+                     ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
+}
+
 /*
  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -7629,7 +7681,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 exec_control;
-       u32 exit_control;
 
        vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
        vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -7687,13 +7738,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
        vmcs_write64(VMCS_LINK_POINTER, -1ull);
 
-       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
-               (vmcs_config.pin_based_exec_ctrl |
-                vmcs12->pin_based_vm_exec_control));
+       exec_control = vmcs12->pin_based_vm_exec_control;
+       exec_control |= vmcs_config.pin_based_exec_ctrl;
+       exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
 
-       if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
-               vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
-                            vmcs12->vmx_preemption_timer_value);
+       vmx->nested.preemption_timer_expired = false;
+       if (nested_cpu_has_preemption_timer(vmcs12))
+               vmx_start_preemption_timer(vcpu);
 
        /*
         * Whether page-faults are trapped is determined by a combination of
@@ -7721,7 +7773,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                enable_ept ? vmcs12->page_fault_error_code_match : 0);
 
        if (cpu_has_secondary_exec_ctrls()) {
-               u32 exec_control = vmx_secondary_exec_control(vmx);
+               exec_control = vmx_secondary_exec_control(vmx);
                if (!vmx->rdtscp_enabled)
                        exec_control &= ~SECONDARY_EXEC_RDTSCP;
                /* Take the following fields only from vmcs12 */
@@ -7808,10 +7860,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
         * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
         * bits are further modified by vmx_set_efer() below.
         */
-       exit_control = vmcs_config.vmexit_ctrl;
-       if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
-               exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
-       vm_exit_controls_init(vmx, exit_control);
+       vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
 
        /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
         * emulated by vmx_set_efer(), below.
@@ -7830,6 +7879,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
        set_cr4_guest_host_mask(vmx);
 
+       if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
+               vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+
        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
                vmcs_write64(TSC_OFFSET,
                        vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
@@ -8155,6 +8207,58 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
        }
 }
 
+static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
+           vmx->nested.preemption_timer_expired) {
+               if (vmx->nested.nested_run_pending)
+                       return -EBUSY;
+               nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
+               return 0;
+       }
+
+       if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
+               if (vmx->nested.nested_run_pending ||
+                   vcpu->arch.interrupt.pending)
+                       return -EBUSY;
+               nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+                                 NMI_VECTOR | INTR_TYPE_NMI_INTR |
+                                 INTR_INFO_VALID_MASK, 0);
+               /*
+                * The NMI-triggered VM exit counts as injection:
+                * clear this one and block further NMIs.
+                */
+               vcpu->arch.nmi_pending = 0;
+               vmx_set_nmi_mask(vcpu, true);
+               return 0;
+       }
+
+       if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
+           nested_exit_on_intr(vcpu)) {
+               if (vmx->nested.nested_run_pending)
+                       return -EBUSY;
+               nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+       }
+
+       return 0;
+}
+
+static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
+{
+       ktime_t remaining =
+               hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
+       u64 value;
+
+       if (ktime_to_ns(remaining) <= 0)
+               return 0;
+
+       value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
+       do_div(value, 1000000);
+       return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+}
+
 /*
  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
@@ -8225,10 +8329,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        else
                vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
 
-       if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
-           (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
-               vmcs12->vmx_preemption_timer_value =
-                       vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
+       if (nested_cpu_has_preemption_timer(vmcs12)) {
+               if (vmcs12->vm_exit_controls &
+                   VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
+                       vmcs12->vmx_preemption_timer_value =
+                               vmx_get_preemption_timer_value(vcpu);
+               hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
+       }
 
        /*
         * In some cases (usually, nested EPT), L2 is allowed to change its
@@ -8260,6 +8367,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
        vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
        vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+       if (vmx_mpx_supported())
+               vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
 
        /* update exit information fields: */
 
@@ -8369,6 +8478,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
        vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
        vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
 
+       /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
+       if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
+               vmcs_write64(GUEST_BNDCFGS, 0);
+
        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
                vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
                vcpu->arch.pat = vmcs12->host_ia32_pat;
@@ -8495,6 +8608,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
                nested_vmx_succeed(vcpu);
        if (enable_shadow_vmcs)
                vmx->nested.sync_shadow_vmcs = true;
+
+       /* in case we halted in L2 */
+       vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 }
 
 /*
@@ -8573,6 +8689,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .get_dr6 = vmx_get_dr6,
        .set_dr6 = vmx_set_dr6,
        .set_dr7 = vmx_set_dr7,
+       .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
        .cache_reg = vmx_cache_reg,
        .get_rflags = vmx_get_rflags,
        .set_rflags = vmx_set_rflags,
@@ -8634,6 +8751,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
        .check_intercept = vmx_check_intercept,
        .handle_external_intr = vmx_handle_external_intr,
+       .mpx_supported = vmx_mpx_supported,
+
+       .check_nested_events = vmx_check_nested_events,
 };
 
 static int __init vmx_init(void)
@@ -8721,6 +8841,8 @@ static int __init vmx_init(void)
        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+       vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
+
        memcpy(vmx_msr_bitmap_legacy_x2apic,
                        vmx_msr_bitmap_legacy, PAGE_SIZE);
        memcpy(vmx_msr_bitmap_longmode_x2apic,
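
For nested guests the VMX preemption timer is no longer programmed into the hardware VMCS; prepare_vmcs02() masks PIN_BASED_VMX_PREEMPTION_TIMER and starts an hrtimer instead, scaled at the advertised emulated rate of one timer tick per 2^5 TSC cycles, and prepare_vmcs12() converts the remaining hrtimer time back into a timer value when L1 asked for VM_EXIT_SAVE_VMX_PREEMPTION_TIMER. The two conversions from the hunks above, as standalone arithmetic:

    #include <stdio.h>
    #include <stdint.h>

    #define EMULATED_PREEMPTION_TIMER_RATE 5   /* 2^5 TSC cycles per tick */

    static uint64_t timer_value_to_ns(uint64_t value, uint64_t tsc_khz)
    {
        uint64_t ns = value << EMULATED_PREEMPTION_TIMER_RATE;

        ns *= 1000000;                         /* kHz -> ns scaling */
        return ns / tsc_khz;
    }

    static uint64_t ns_to_timer_value(uint64_t ns, uint64_t tsc_khz)
    {
        uint64_t value = ns * tsc_khz / 1000000;

        return value >> EMULATED_PREEMPTION_TIMER_RATE;
    }

    int main(void)
    {
        uint64_t tsc_khz = 2400000;            /* 2.4 GHz guest TSC */
        uint64_t ns = timer_value_to_ns(1000, tsc_khz);

        printf("1000 ticks -> %llu ns -> %llu ticks\n",
               (unsigned long long)ns,
               (unsigned long long)ns_to_timer_value(ns, tsc_khz));
        return 0;
    }
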
index 2b85784..d1c55f8 100644 (file)
@@ -595,13 +595,13 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
 
 int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 {
-       u64 xcr0;
+       u64 xcr0 = xcr;
+       u64 old_xcr0 = vcpu->arch.xcr0;
        u64 valid_bits;
 
        /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
        if (index != XCR_XFEATURE_ENABLED_MASK)
                return 1;
-       xcr0 = xcr;
        if (!(xcr0 & XSTATE_FP))
                return 1;
        if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
@@ -616,8 +616,14 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
        if (xcr0 & ~valid_bits)
                return 1;
 
+       if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR)))
+               return 1;
+
        kvm_put_guest_xcr0(vcpu);
        vcpu->arch.xcr0 = xcr0;
+
+       if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK)
+               kvm_update_cpuid(vcpu);
        return 0;
 }
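
__kvm_set_xcr() picks up two rules visible in the hunk above: the MPX components XSTATE_BNDREGS and XSTATE_BNDCSR must be enabled or disabled together, and any change to the extended-state bits re-runs kvm_update_cpuid() so the reported xsave area size stays consistent. The pairing check, spelled out (bit positions 3 and 4 are the architectural MPX bits):

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    #define XSTATE_BNDREGS (1ULL << 3)
    #define XSTATE_BNDCSR  (1ULL << 4)

    /* reject XCR0 values that set only one of the two MPX components */
    static bool mpx_bits_consistent(uint64_t xcr0)
    {
        return !(xcr0 & XSTATE_BNDREGS) == !(xcr0 & XSTATE_BNDCSR);
    }

    int main(void)
    {
        printf("%d %d %d\n",
               mpx_bits_consistent(0),                               /* 1 */
               mpx_bits_consistent(XSTATE_BNDREGS),                  /* 0 */
               mpx_bits_consistent(XSTATE_BNDREGS | XSTATE_BNDCSR)); /* 1 */
        return 0;
    }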
 
@@ -753,7 +759,9 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu)
        else
                dr7 = vcpu->arch.dr7;
        kvm_x86_ops->set_dr7(vcpu, dr7);
-       vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
+       vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
+       if (dr7 & DR7_BP_EN_MASK)
+               vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
 }
 
 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
@@ -879,7 +887,7 @@ static u32 msrs_to_save[] = {
        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
        MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
-       MSR_IA32_FEATURE_CONTROL
+       MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS
 };
 
 static unsigned num_msrs_to_save;
@@ -1581,7 +1589,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        /* With all the info we got, fill in the values */
        vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
        vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
-       vcpu->last_kernel_ns = kernel_ns;
        vcpu->last_guest_tsc = tsc_timestamp;
 
        /*
@@ -1623,14 +1630,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
  * the others.
  *
  * So in those cases, request a kvmclock update for all vcpus.
- * The worst case for a remote vcpu to update its kvmclock
- * is then bounded by maximum nohz sleep latency.
+ * We need to rate-limit these requests though, as they can
+ * considerably slow guests that have a large number of vcpus.
+ * The time for a remote vcpu to update its kvmclock is bound
+ * by the delay we use to rate-limit the updates.
  */
 
-static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
+
+static void kvmclock_update_fn(struct work_struct *work)
 {
        int i;
-       struct kvm *kvm = v->kvm;
+       struct delayed_work *dwork = to_delayed_work(work);
+       struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+                                          kvmclock_update_work);
+       struct kvm *kvm = container_of(ka, struct kvm, arch);
        struct kvm_vcpu *vcpu;
 
        kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -1639,6 +1653,29 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
        }
 }
 
+static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+{
+       struct kvm *kvm = v->kvm;
+
+       set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests);
+       schedule_delayed_work(&kvm->arch.kvmclock_update_work,
+                                       KVMCLOCK_UPDATE_DELAY);
+}
+
+#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
+
+static void kvmclock_sync_fn(struct work_struct *work)
+{
+       struct delayed_work *dwork = to_delayed_work(work);
+       struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+                                          kvmclock_sync_work);
+       struct kvm *kvm = container_of(ka, struct kvm, arch);
+
+       schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
+       schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+                                       KVMCLOCK_SYNC_PERIOD);
+}
+
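
The scheme above leans on a property of delayed work items: schedule_delayed_work() queues nothing while the item is still pending, so any number of triggers inside the delay window collapse into a single execution, and the sync work simply re-arms both itself and the update work. A minimal module-style sketch of that rate-limiting idiom (the names and the demo trigger loop below are illustrative, not code from this patch):

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

#define UPDATE_DELAY msecs_to_jiffies(100)

static void update_fn(struct work_struct *work)
{
        pr_info("one coalesced update per 100ms window\n");
}

static DECLARE_DELAYED_WORK(update_work, update_fn);

static void trigger_update(void)
{
        /* No-op while update_work is already pending: this is the rate limit. */
        schedule_delayed_work(&update_work, UPDATE_DELAY);
}

static int __init demo_init(void)
{
        int i;

        for (i = 0; i < 1000; i++)
                trigger_update();       /* results in a single update_fn() run */
        return 0;
}

static void __exit demo_exit(void)
{
        cancel_delayed_work_sync(&update_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
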
 static bool msr_mtrr_valid(unsigned msr)
 {
        switch (msr) {
@@ -2323,9 +2360,12 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case HV_X64_MSR_VP_INDEX: {
                int r;
                struct kvm_vcpu *v;
-               kvm_for_each_vcpu(r, v, vcpu->kvm)
-                       if (v == vcpu)
+               kvm_for_each_vcpu(r, v, vcpu->kvm) {
+                       if (v == vcpu) {
                                data = r;
+                               break;
+                       }
+               }
                break;
        }
        case HV_X64_MSR_EOI:
@@ -2617,6 +2657,7 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_KVMCLOCK_CTRL:
        case KVM_CAP_READONLY_MEM:
        case KVM_CAP_HYPERV_TIME:
+       case KVM_CAP_IOAPIC_POLARITY_IGNORED:
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
        case KVM_CAP_ASSIGN_DEV_IRQ:
        case KVM_CAP_PCI_2_3:
@@ -3043,9 +3084,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
                 * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
                 * with old userspace.
                 */
-               if (xstate_bv & ~KVM_SUPPORTED_XCR0)
-                       return -EINVAL;
-               if (xstate_bv & ~host_xcr0)
+               if (xstate_bv & ~kvm_supported_xcr0())
                        return -EINVAL;
                memcpy(&vcpu->arch.guest_fpu.state->xsave,
                        guest_xsave->region, vcpu->arch.guest_xstate_size);
@@ -3898,6 +3937,23 @@ static void kvm_init_msr_list(void)
        for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
                if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
                        continue;
+
+               /*
+                * Even MSRs that are valid in the host may not be exposed
+                * to the guests in some cases.  We could work around this
+                * in VMX with the generic MSR save/load machinery, but it
+                * is not really worthwhile since it will only
+                * happen with nested virtualization.
+                */
+               switch (msrs_to_save[i]) {
+               case MSR_IA32_BNDCFGS:
+                       if (!kvm_x86_ops->mpx_supported())
+                               continue;
+                       break;
+               default:
+                       break;
+               }
+
                if (j < i)
                        msrs_to_save[j] = msrs_to_save[i];
                j++;
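
The loop above uses a common two-index, in-place compaction idiom: i visits every candidate, j tracks the next free slot, and elements are only copied once something earlier has been skipped. A standalone sketch of the same idiom with made-up data:

#include <assert.h>
#include <stddef.h>

/* Keep only even values, compacting in place and returning the new length --
 * the same i/j walk that filters msrs_to_save above. */
static size_t keep_even(int *v, size_t n)
{
        size_t i, j;

        for (i = j = 0; i < n; i++) {
                if (v[i] % 2)           /* analogous to "MSR not exposable" */
                        continue;
                if (j < i)
                        v[j] = v[i];
                j++;
        }
        return j;
}

int main(void)
{
        int v[] = { 2, 3, 4, 5, 6 };
        size_t n = keep_even(v, 5);

        assert(n == 3 && v[0] == 2 && v[1] == 4 && v[2] == 6);
        return 0;
}
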
@@ -4394,6 +4450,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
        if (!exchanged)
                return X86EMUL_CMPXCHG_FAILED;
 
+       mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
        kvm_mmu_pte_write(vcpu, gpa, new, bytes);
 
        return X86EMUL_CONTINUE;
@@ -5537,9 +5594,10 @@ int kvm_arch_init(void *opaque)
                goto out_free_percpu;
 
        kvm_set_mmio_spte_mask();
-       kvm_init_msr_list();
 
        kvm_x86_ops = ops;
+       kvm_init_msr_list();
+
        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
                        PT_DIRTY_MASK, PT64_NX_MASK, 0);
 
@@ -5782,8 +5840,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
        kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
 }
 
-static void inject_pending_event(struct kvm_vcpu *vcpu)
+static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 {
+       int r;
+
        /* try to reinject previous events if any */
        if (vcpu->arch.exception.pending) {
                trace_kvm_inj_exception(vcpu->arch.exception.nr,
@@ -5793,17 +5853,23 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
                                          vcpu->arch.exception.has_error_code,
                                          vcpu->arch.exception.error_code,
                                          vcpu->arch.exception.reinject);
-               return;
+               return 0;
        }
 
        if (vcpu->arch.nmi_injected) {
                kvm_x86_ops->set_nmi(vcpu);
-               return;
+               return 0;
        }
 
        if (vcpu->arch.interrupt.pending) {
                kvm_x86_ops->set_irq(vcpu);
-               return;
+               return 0;
+       }
+
+       if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
+               r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+               if (r != 0)
+                       return r;
        }
 
        /* try to inject new event if pending */
@@ -5820,6 +5886,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
                        kvm_x86_ops->set_irq(vcpu);
                }
        }
+       return 0;
 }
 
 static void process_nmi(struct kvm_vcpu *vcpu)
@@ -5924,15 +5991,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        goto out;
                }
 
-               inject_pending_event(vcpu);
-
+               if (inject_pending_event(vcpu, req_int_win) != 0)
+                       req_immediate_exit = true;
                /* enable NMI/IRQ window open exits if needed */
-               if (vcpu->arch.nmi_pending)
-                       req_immediate_exit =
-                               kvm_x86_ops->enable_nmi_window(vcpu) != 0;
+               else if (vcpu->arch.nmi_pending)
+                       kvm_x86_ops->enable_nmi_window(vcpu);
                else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
-                       req_immediate_exit =
-                               kvm_x86_ops->enable_irq_window(vcpu) != 0;
+                       kvm_x86_ops->enable_irq_window(vcpu);
 
                if (kvm_lapic_enabled(vcpu)) {
                        /*
@@ -5992,11 +6057,27 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                set_debugreg(vcpu->arch.eff_db[1], 1);
                set_debugreg(vcpu->arch.eff_db[2], 2);
                set_debugreg(vcpu->arch.eff_db[3], 3);
+               set_debugreg(vcpu->arch.dr6, 6);
        }
 
        trace_kvm_entry(vcpu->vcpu_id);
        kvm_x86_ops->run(vcpu);
 
+       /*
+        * Do this here before restoring debug registers on the host.  And
+        * since we do this before handling the vmexit, a DR access vmexit
+        * can (a) read the correct value of the debug registers, (b) set
+        * KVM_DEBUGREG_WONT_EXIT again.
+        */
+       if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
+               int i;
+
+               WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
+               kvm_x86_ops->sync_dirty_debug_regs(vcpu);
+               for (i = 0; i < KVM_NR_DB_REGS; i++)
+                       vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+       }
+
        /*
         * If the guest has used debug registers, at least dr7
         * will be disabled while returning to the host.
@@ -6711,6 +6792,7 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 {
        int r;
        struct msr_data msr;
+       struct kvm *kvm = vcpu->kvm;
 
        r = vcpu_load(vcpu);
        if (r)
@@ -6721,6 +6803,9 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
        kvm_write_tsc(vcpu, &msr);
        vcpu_put(vcpu);
 
+       schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+                                       KVMCLOCK_SYNC_PERIOD);
+
        return r;
 }
 
@@ -7013,6 +7098,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        pvclock_update_vm_gtod_copy(kvm);
 
+       INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
+       INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
+
        return 0;
 }
 
@@ -7050,6 +7138,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
 
 void kvm_arch_sync_events(struct kvm *kvm)
 {
+       cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
+       cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
        kvm_free_all_assigned_devices(kvm);
        kvm_free_pit(kvm);
 }
@@ -7248,6 +7338,9 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
+       if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
+               kvm_x86_ops->check_nested_events(vcpu, false);
+
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                !vcpu->arch.apf.halted)
                || !list_empty_careful(&vcpu->async_pf.done)
index 8da5823..8c97bac 100644 (file)
@@ -122,9 +122,12 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
        gva_t addr, void *val, unsigned int bytes,
        struct x86_exception *exception);
 
-#define KVM_SUPPORTED_XCR0     (XSTATE_FP | XSTATE_SSE | XSTATE_YMM)
+#define KVM_SUPPORTED_XCR0     (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
+                               | XSTATE_BNDREGS | XSTATE_BNDCSR)
 extern u64 host_xcr0;
 
+extern u64 kvm_supported_xcr0(void);
+
 extern unsigned int min_timer_period_us;
 
 extern struct static_key kvm_no_apic_vcpu;
index 0fc5848..1e1fc67 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * ccw based virtio transport
  *
- * Copyright IBM Corp. 2012
+ * Copyright IBM Corp. 2012, 2014
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License (version 2 only)
@@ -32,6 +32,8 @@
 #include <asm/cio.h>
 #include <asm/ccwdev.h>
 #include <asm/virtio-ccw.h>
+#include <asm/isc.h>
+#include <asm/airq.h>
 
 /*
  * virtio related functions
@@ -58,6 +60,9 @@ struct virtio_ccw_device {
        unsigned long indicators;
        unsigned long indicators2;
        struct vq_config_block *config_block;
+       bool is_thinint;
+       bool going_away;
+       void *airq_info;
 };
 
 struct vq_info_block {
@@ -72,15 +77,38 @@ struct virtio_feature_desc {
        __u8 index;
 } __packed;
 
+struct virtio_thinint_area {
+       unsigned long summary_indicator;
+       unsigned long indicator;
+       u64 bit_nr;
+       u8 isc;
+} __packed;
+
 struct virtio_ccw_vq_info {
        struct virtqueue *vq;
        int num;
        void *queue;
        struct vq_info_block *info_block;
+       int bit_nr;
        struct list_head node;
        long cookie;
 };
 
+#define VIRTIO_AIRQ_ISC IO_SCH_ISC /* inherit from subchannel */
+
+#define VIRTIO_IV_BITS (L1_CACHE_BYTES * 8)
+#define MAX_AIRQ_AREAS 20
+
+static int virtio_ccw_use_airq = 1;
+
+struct airq_info {
+       rwlock_t lock;
+       u8 summary_indicator;
+       struct airq_struct airq;
+       struct airq_iv *aiv;
+};
+static struct airq_info *airq_areas[MAX_AIRQ_AREAS];
+
 #define CCW_CMD_SET_VQ 0x13
 #define CCW_CMD_VDEV_RESET 0x33
 #define CCW_CMD_SET_IND 0x43
@@ -91,6 +119,7 @@ struct virtio_ccw_vq_info {
 #define CCW_CMD_WRITE_CONF 0x21
 #define CCW_CMD_WRITE_STATUS 0x31
 #define CCW_CMD_READ_VQ_CONF 0x32
+#define CCW_CMD_SET_IND_ADAPTER 0x73
 
 #define VIRTIO_CCW_DOING_SET_VQ 0x00010000
 #define VIRTIO_CCW_DOING_RESET 0x00040000
@@ -102,6 +131,7 @@ struct virtio_ccw_vq_info {
 #define VIRTIO_CCW_DOING_SET_IND 0x01000000
 #define VIRTIO_CCW_DOING_READ_VQ_CONF 0x02000000
 #define VIRTIO_CCW_DOING_SET_CONF_IND 0x04000000
+#define VIRTIO_CCW_DOING_SET_IND_ADAPTER 0x08000000
 #define VIRTIO_CCW_INTPARM_MASK 0xffff0000
 
 static struct virtio_ccw_device *to_vc_device(struct virtio_device *vdev)
@@ -109,6 +139,125 @@ static struct virtio_ccw_device *to_vc_device(struct virtio_device *vdev)
        return container_of(vdev, struct virtio_ccw_device, vdev);
 }
 
+static void drop_airq_indicator(struct virtqueue *vq, struct airq_info *info)
+{
+       unsigned long i, flags;
+
+       write_lock_irqsave(&info->lock, flags);
+       for (i = 0; i < airq_iv_end(info->aiv); i++) {
+               if (vq == (void *)airq_iv_get_ptr(info->aiv, i)) {
+                       airq_iv_free_bit(info->aiv, i);
+                       airq_iv_set_ptr(info->aiv, i, 0);
+                       break;
+               }
+       }
+       write_unlock_irqrestore(&info->lock, flags);
+}
+
+static void virtio_airq_handler(struct airq_struct *airq)
+{
+       struct airq_info *info = container_of(airq, struct airq_info, airq);
+       unsigned long ai;
+
+       inc_irq_stat(IRQIO_VAI);
+       read_lock(&info->lock);
+       /* Walk through indicators field, summary indicator active. */
+       for (ai = 0;;) {
+               ai = airq_iv_scan(info->aiv, ai, airq_iv_end(info->aiv));
+               if (ai == -1UL)
+                       break;
+               vring_interrupt(0, (void *)airq_iv_get_ptr(info->aiv, ai));
+       }
+       info->summary_indicator = 0;
+       smp_wmb();
+       /* Walk through indicators field, summary indicator not active. */
+       for (ai = 0;;) {
+               ai = airq_iv_scan(info->aiv, ai, airq_iv_end(info->aiv));
+               if (ai == -1UL)
+                       break;
+               vring_interrupt(0, (void *)airq_iv_get_ptr(info->aiv, ai));
+       }
+       read_unlock(&info->lock);
+}
+
+static struct airq_info *new_airq_info(void)
+{
+       struct airq_info *info;
+       int rc;
+
+       info = kzalloc(sizeof(*info), GFP_KERNEL);
+       if (!info)
+               return NULL;
+       rwlock_init(&info->lock);
+       info->aiv = airq_iv_create(VIRTIO_IV_BITS, AIRQ_IV_ALLOC | AIRQ_IV_PTR);
+       if (!info->aiv) {
+               kfree(info);
+               return NULL;
+       }
+       info->airq.handler = virtio_airq_handler;
+       info->airq.lsi_ptr = &info->summary_indicator;
+       info->airq.lsi_mask = 0xff;
+       info->airq.isc = VIRTIO_AIRQ_ISC;
+       rc = register_adapter_interrupt(&info->airq);
+       if (rc) {
+               airq_iv_release(info->aiv);
+               kfree(info);
+               return NULL;
+       }
+       return info;
+}
+
+static void destroy_airq_info(struct airq_info *info)
+{
+       if (!info)
+               return;
+
+       unregister_adapter_interrupt(&info->airq);
+       airq_iv_release(info->aiv);
+       kfree(info);
+}
+
+static unsigned long get_airq_indicator(struct virtqueue *vqs[], int nvqs,
+                                       u64 *first, void **airq_info)
+{
+       int i, j;
+       struct airq_info *info;
+       unsigned long indicator_addr = 0;
+       unsigned long bit, flags;
+
+       for (i = 0; i < MAX_AIRQ_AREAS && !indicator_addr; i++) {
+               if (!airq_areas[i])
+                       airq_areas[i] = new_airq_info();
+               info = airq_areas[i];
+               if (!info)
+                       return 0;
+               write_lock_irqsave(&info->lock, flags);
+               bit = airq_iv_alloc(info->aiv, nvqs);
+               if (bit == -1UL) {
+                       /* Not enough vacancies. */
+                       write_unlock_irqrestore(&info->lock, flags);
+                       continue;
+               }
+               *first = bit;
+               *airq_info = info;
+               indicator_addr = (unsigned long)info->aiv->vector;
+               for (j = 0; j < nvqs; j++) {
+                       airq_iv_set_ptr(info->aiv, bit + j,
+                                       (unsigned long)vqs[j]);
+               }
+               write_unlock_irqrestore(&info->lock, flags);
+       }
+       return indicator_addr;
+}
+
+static void virtio_ccw_drop_indicators(struct virtio_ccw_device *vcdev)
+{
+       struct virtio_ccw_vq_info *info;
+
+       list_for_each_entry(info, &vcdev->virtqueues, node)
+               drop_airq_indicator(info->vq, vcdev->airq_info);
+}
+
 static int doing_io(struct virtio_ccw_device *vcdev, __u32 flag)
 {
        unsigned long flags;
@@ -145,6 +294,51 @@ static int ccw_io_helper(struct virtio_ccw_device *vcdev,
        return ret ? ret : vcdev->err;
 }
 
+static void virtio_ccw_drop_indicator(struct virtio_ccw_device *vcdev,
+                                     struct ccw1 *ccw)
+{
+       int ret;
+       unsigned long *indicatorp = NULL;
+       struct virtio_thinint_area *thinint_area = NULL;
+       struct airq_info *airq_info = vcdev->airq_info;
+
+       if (vcdev->is_thinint) {
+               thinint_area = kzalloc(sizeof(*thinint_area),
+                                      GFP_DMA | GFP_KERNEL);
+               if (!thinint_area)
+                       return;
+               thinint_area->summary_indicator =
+                       (unsigned long) &airq_info->summary_indicator;
+               thinint_area->isc = VIRTIO_AIRQ_ISC;
+               ccw->cmd_code = CCW_CMD_SET_IND_ADAPTER;
+               ccw->count = sizeof(*thinint_area);
+               ccw->cda = (__u32)(unsigned long) thinint_area;
+       } else {
+               indicatorp = kmalloc(sizeof(&vcdev->indicators),
+                                    GFP_DMA | GFP_KERNEL);
+               if (!indicatorp)
+                       return;
+               *indicatorp = 0;
+               ccw->cmd_code = CCW_CMD_SET_IND;
+               ccw->count = sizeof(vcdev->indicators);
+               ccw->cda = (__u32)(unsigned long) indicatorp;
+       }
+       /* Deregister indicators from host. */
+       vcdev->indicators = 0;
+       ccw->flags = 0;
+       ret = ccw_io_helper(vcdev, ccw,
+                           vcdev->is_thinint ?
+                           VIRTIO_CCW_DOING_SET_IND_ADAPTER :
+                           VIRTIO_CCW_DOING_SET_IND);
+       if (ret && (ret != -ENODEV))
+               dev_info(&vcdev->cdev->dev,
+                        "Failed to deregister indicators (%d)\n", ret);
+       else if (vcdev->is_thinint)
+               virtio_ccw_drop_indicators(vcdev);
+       kfree(indicatorp);
+       kfree(thinint_area);
+}
+
 static inline long do_kvm_notify(struct subchannel_id schid,
                                 unsigned long queue_index,
                                 long cookie)
@@ -232,11 +426,13 @@ static void virtio_ccw_del_vqs(struct virtio_device *vdev)
 {
        struct virtqueue *vq, *n;
        struct ccw1 *ccw;
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
 
        ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
        if (!ccw)
                return;
 
+       virtio_ccw_drop_indicator(vcdev, ccw);
 
        list_for_each_entry_safe(vq, n, &vdev->vqs, list)
                virtio_ccw_del_vq(vq, ccw);
@@ -326,6 +522,54 @@ out_err:
        return ERR_PTR(err);
 }
 
+static int virtio_ccw_register_adapter_ind(struct virtio_ccw_device *vcdev,
+                                          struct virtqueue *vqs[], int nvqs,
+                                          struct ccw1 *ccw)
+{
+       int ret;
+       struct virtio_thinint_area *thinint_area = NULL;
+       struct airq_info *info;
+
+       thinint_area = kzalloc(sizeof(*thinint_area), GFP_DMA | GFP_KERNEL);
+       if (!thinint_area) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       /* Try to get an indicator. */
+       thinint_area->indicator = get_airq_indicator(vqs, nvqs,
+                                                    &thinint_area->bit_nr,
+                                                    &vcdev->airq_info);
+       if (!thinint_area->indicator) {
+               ret = -ENOSPC;
+               goto out;
+       }
+       info = vcdev->airq_info;
+       thinint_area->summary_indicator =
+               (unsigned long) &info->summary_indicator;
+       thinint_area->isc = VIRTIO_AIRQ_ISC;
+       ccw->cmd_code = CCW_CMD_SET_IND_ADAPTER;
+       ccw->flags = CCW_FLAG_SLI;
+       ccw->count = sizeof(*thinint_area);
+       ccw->cda = (__u32)(unsigned long)thinint_area;
+       ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND_ADAPTER);
+       if (ret) {
+               if (ret == -EOPNOTSUPP) {
+                       /*
+                        * The host does not support adapter interrupts
+                        * for virtio-ccw; stop trying.
+                        */
+                       virtio_ccw_use_airq = 0;
+                       pr_info("Adapter interrupts unsupported on host\n");
+               } else
+                       dev_warn(&vcdev->cdev->dev,
+                                "Failed to enable adapter interrupts (%d)\n", ret);
+               virtio_ccw_drop_indicators(vcdev);
+       }
+out:
+       kfree(thinint_area);
+       return ret;
+}
+
 static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
                               struct virtqueue *vqs[],
                               vq_callback_t *callbacks[],
@@ -355,15 +599,23 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
        if (!indicatorp)
                goto out;
        *indicatorp = (unsigned long) &vcdev->indicators;
-       /* Register queue indicators with host. */
-       vcdev->indicators = 0;
-       ccw->cmd_code = CCW_CMD_SET_IND;
-       ccw->flags = 0;
-       ccw->count = sizeof(vcdev->indicators);
-       ccw->cda = (__u32)(unsigned long) indicatorp;
-       ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND);
-       if (ret)
-               goto out;
+       if (vcdev->is_thinint) {
+               ret = virtio_ccw_register_adapter_ind(vcdev, vqs, nvqs, ccw);
+               if (ret)
+                       /* no error, just fall back to legacy interrupts */
+                       vcdev->is_thinint = 0;
+       }
+       if (!vcdev->is_thinint) {
+               /* Register queue indicators with host. */
+               vcdev->indicators = 0;
+               ccw->cmd_code = CCW_CMD_SET_IND;
+               ccw->flags = 0;
+               ccw->count = sizeof(vcdev->indicators);
+               ccw->cda = (__u32)(unsigned long) indicatorp;
+               ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND);
+               if (ret)
+                       goto out;
+       }
        /* Register indicators2 with host for config changes */
        *indicatorp = (unsigned long) &vcdev->indicators2;
        vcdev->indicators2 = 0;
@@ -636,6 +888,8 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
        struct virtqueue *vq;
        struct virtio_driver *drv;
 
+       if (!vcdev)
+               return;
        /* Check if it's a notification from the host. */
        if ((intparm == 0) &&
            (scsw_stctl(&irb->scsw) ==
@@ -663,6 +917,7 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
                case VIRTIO_CCW_DOING_SET_CONF_IND:
                case VIRTIO_CCW_DOING_RESET:
                case VIRTIO_CCW_DOING_READ_VQ_CONF:
+               case VIRTIO_CCW_DOING_SET_IND_ADAPTER:
                        vcdev->curr_io &= ~activity;
                        wake_up(&vcdev->wait_q);
                        break;
@@ -734,23 +989,46 @@ static int virtio_ccw_probe(struct ccw_device *cdev)
        return 0;
 }
 
+static struct virtio_ccw_device *virtio_grab_drvdata(struct ccw_device *cdev)
+{
+       unsigned long flags;
+       struct virtio_ccw_device *vcdev;
+
+       spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
+       vcdev = dev_get_drvdata(&cdev->dev);
+       if (!vcdev || vcdev->going_away) {
+               spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
+               return NULL;
+       }
+       vcdev->going_away = true;
+       spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
+       return vcdev;
+}
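
virtio_grab_drvdata() above is a claim-once helper: it takes the per-device lock, hands the driver data to exactly one caller, and marks going_away so that remove and offline cannot both tear the device down while the interrupt handler still sees a consistent pointer. A small userspace analogue of that idiom (struct and function names below are invented for illustration):

#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct dev_state {
        pthread_mutex_t lock;
        void *drvdata;
        bool going_away;
};

/* Hand out drvdata at most once; every later caller gets NULL. */
static void *grab_drvdata(struct dev_state *dev)
{
        void *data = NULL;

        pthread_mutex_lock(&dev->lock);
        if (dev->drvdata && !dev->going_away) {
                dev->going_away = true;
                data = dev->drvdata;
        }
        pthread_mutex_unlock(&dev->lock);
        return data;
}

int main(void)
{
        int payload = 42;
        struct dev_state dev = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .drvdata = &payload,
                .going_away = false,
        };

        assert(grab_drvdata(&dev) == &payload); /* first caller wins */
        assert(grab_drvdata(&dev) == NULL);     /* e.g. offline after remove */
        return 0;
}
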
+
 static void virtio_ccw_remove(struct ccw_device *cdev)
 {
-       struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev);
+       unsigned long flags;
+       struct virtio_ccw_device *vcdev = virtio_grab_drvdata(cdev);
 
-       if (cdev->online) {
+       if (vcdev && cdev->online)
                unregister_virtio_device(&vcdev->vdev);
-               dev_set_drvdata(&cdev->dev, NULL);
-       }
+       spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
+       dev_set_drvdata(&cdev->dev, NULL);
+       spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
        cdev->handler = NULL;
 }
 
 static int virtio_ccw_offline(struct ccw_device *cdev)
 {
-       struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev);
+       unsigned long flags;
+       struct virtio_ccw_device *vcdev = virtio_grab_drvdata(cdev);
 
-       unregister_virtio_device(&vcdev->vdev);
-       dev_set_drvdata(&cdev->dev, NULL);
+       if (vcdev) {
+               unregister_virtio_device(&vcdev->vdev);
+               spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
+               dev_set_drvdata(&cdev->dev, NULL);
+               spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
+       }
        return 0;
 }
 
@@ -759,6 +1037,7 @@ static int virtio_ccw_online(struct ccw_device *cdev)
 {
        int ret;
        struct virtio_ccw_device *vcdev;
+       unsigned long flags;
 
        vcdev = kzalloc(sizeof(*vcdev), GFP_KERNEL);
        if (!vcdev) {
@@ -778,6 +1057,8 @@ static int virtio_ccw_online(struct ccw_device *cdev)
                goto out_free;
        }
 
+       vcdev->is_thinint = virtio_ccw_use_airq; /* at least try */
+
        vcdev->vdev.dev.parent = &cdev->dev;
        vcdev->vdev.dev.release = virtio_ccw_release_dev;
        vcdev->vdev.config = &virtio_ccw_config_ops;
@@ -786,7 +1067,9 @@ static int virtio_ccw_online(struct ccw_device *cdev)
        INIT_LIST_HEAD(&vcdev->virtqueues);
        spin_lock_init(&vcdev->lock);
 
+       spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
        dev_set_drvdata(&cdev->dev, vcdev);
+       spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
        vcdev->vdev.id.vendor = cdev->id.cu_type;
        vcdev->vdev.id.device = cdev->id.cu_model;
        ret = register_virtio_device(&vcdev->vdev);
@@ -797,7 +1080,9 @@ static int virtio_ccw_online(struct ccw_device *cdev)
        }
        return 0;
 out_put:
+       spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
        dev_set_drvdata(&cdev->dev, NULL);
+       spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
        put_device(&vcdev->vdev.dev);
        return ret;
 out_free:
@@ -935,6 +1220,10 @@ module_init(virtio_ccw_init);
 
 static void __exit virtio_ccw_exit(void)
 {
+       int i;
+
        ccw_driver_unregister(&virtio_ccw_driver);
+       for (i = 0; i < MAX_AIRQ_AREAS; i++)
+               destroy_airq_info(airq_areas[i]);
 }
 module_exit(virtio_ccw_exit);
index b8e9a43..7d21cf9 100644 (file)
@@ -192,7 +192,7 @@ struct kvm_async_pf {
 
 void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
 void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
-int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
                       struct kvm_arch_async_pf *arch);
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
@@ -297,6 +297,14 @@ static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memsl
        return ALIGN(memslot->npages, BITS_PER_LONG) / 8;
 }
 
+struct kvm_s390_adapter_int {
+       u64 ind_addr;
+       u64 summary_addr;
+       u64 ind_offset;
+       u32 summary_offset;
+       u32 adapter_id;
+};
+
 struct kvm_kernel_irq_routing_entry {
        u32 gsi;
        u32 type;
@@ -309,6 +317,7 @@ struct kvm_kernel_irq_routing_entry {
                        unsigned pin;
                } irqchip;
                struct msi_msg msi;
+               struct kvm_s390_adapter_int adapter;
        };
        struct hlist_node link;
 };
@@ -401,7 +410,9 @@ struct kvm {
        unsigned long mmu_notifier_seq;
        long mmu_notifier_count;
 #endif
-       long tlbs_dirty;
+       /* Protected by mmu_lock */
+       bool tlbs_dirty;
+
        struct list_head devices;
 };
 
@@ -911,7 +922,11 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
 
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
 
+#ifdef CONFIG_S390
+#define KVM_MAX_IRQ_ROUTES 4096 /* FIXME: we can have more than that... */
+#else
 #define KVM_MAX_IRQ_ROUTES 1024
+#endif
 
 int kvm_setup_default_irq_routing(struct kvm *kvm);
 int kvm_set_irq_routing(struct kvm *kvm,
@@ -1064,6 +1079,7 @@ extern struct kvm_device_ops kvm_mpic_ops;
 extern struct kvm_device_ops kvm_xics_ops;
 extern struct kvm_device_ops kvm_vfio_ops;
 extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
+extern struct kvm_device_ops kvm_flic_ops;
 
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
 
index 932d7f2..a8f4ee5 100644 (file)
@@ -413,6 +413,8 @@ struct kvm_s390_psw {
 #define KVM_S390_PROGRAM_INT           0xfffe0001u
 #define KVM_S390_SIGP_SET_PREFIX       0xfffe0002u
 #define KVM_S390_RESTART               0xfffe0003u
+#define KVM_S390_INT_PFAULT_INIT       0xfffe0004u
+#define KVM_S390_INT_PFAULT_DONE       0xfffe0005u
 #define KVM_S390_MCHK                  0xfffe1000u
 #define KVM_S390_INT_VIRTIO            0xffff2603u
 #define KVM_S390_INT_SERVICE           0xffff2401u
@@ -434,6 +436,69 @@ struct kvm_s390_interrupt {
        __u64 parm64;
 };
 
+struct kvm_s390_io_info {
+       __u16 subchannel_id;
+       __u16 subchannel_nr;
+       __u32 io_int_parm;
+       __u32 io_int_word;
+};
+
+struct kvm_s390_ext_info {
+       __u32 ext_params;
+       __u32 pad;
+       __u64 ext_params2;
+};
+
+struct kvm_s390_pgm_info {
+       __u64 trans_exc_code;
+       __u64 mon_code;
+       __u64 per_address;
+       __u32 data_exc_code;
+       __u16 code;
+       __u16 mon_class_nr;
+       __u8 per_code;
+       __u8 per_atmid;
+       __u8 exc_access_id;
+       __u8 per_access_id;
+       __u8 op_access_id;
+       __u8 pad[3];
+};
+
+struct kvm_s390_prefix_info {
+       __u32 address;
+};
+
+struct kvm_s390_extcall_info {
+       __u16 code;
+};
+
+struct kvm_s390_emerg_info {
+       __u16 code;
+};
+
+struct kvm_s390_mchk_info {
+       __u64 cr14;
+       __u64 mcic;
+       __u64 failing_storage_address;
+       __u32 ext_damage_code;
+       __u32 pad;
+       __u8 fixed_logout[16];
+};
+
+struct kvm_s390_irq {
+       __u64 type;
+       union {
+               struct kvm_s390_io_info io;
+               struct kvm_s390_ext_info ext;
+               struct kvm_s390_pgm_info pgm;
+               struct kvm_s390_emerg_info emerg;
+               struct kvm_s390_extcall_info extcall;
+               struct kvm_s390_prefix_info prefix;
+               struct kvm_s390_mchk_info mchk;
+               char reserved[64];
+       } u;
+};
+
 /* for KVM_SET_GUEST_DEBUG */
 
 #define KVM_GUESTDBG_ENABLE            0x00000001
@@ -675,6 +740,9 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_SPAPR_MULTITCE 94
 #define KVM_CAP_EXT_EMUL_CPUID 95
 #define KVM_CAP_HYPERV_TIME 96
+#define KVM_CAP_IOAPIC_POLARITY_IGNORED 97
+#define KVM_CAP_ENABLE_CAP_VM 98
+#define KVM_CAP_S390_IRQCHIP 99
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -690,9 +758,18 @@ struct kvm_irq_routing_msi {
        __u32 pad;
 };
 
+struct kvm_irq_routing_s390_adapter {
+       __u64 ind_addr;
+       __u64 summary_addr;
+       __u64 ind_offset;
+       __u32 summary_offset;
+       __u32 adapter_id;
+};
+
 /* gsi routing entry types */
 #define KVM_IRQ_ROUTING_IRQCHIP 1
 #define KVM_IRQ_ROUTING_MSI 2
+#define KVM_IRQ_ROUTING_S390_ADAPTER 3
 
 struct kvm_irq_routing_entry {
        __u32 gsi;
@@ -702,6 +779,7 @@ struct kvm_irq_routing_entry {
        union {
                struct kvm_irq_routing_irqchip irqchip;
                struct kvm_irq_routing_msi msi;
+               struct kvm_irq_routing_s390_adapter adapter;
                __u32 pad[8];
        } u;
 };
@@ -855,6 +933,7 @@ struct kvm_device_attr {
 #define   KVM_DEV_VFIO_GROUP_ADD                       1
 #define   KVM_DEV_VFIO_GROUP_DEL                       2
 #define KVM_DEV_TYPE_ARM_VGIC_V2       5
+#define KVM_DEV_TYPE_FLIC              6
 
 /*
  * ioctls for VM fds
@@ -1009,6 +1088,10 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_DEBUGREGS */
 #define KVM_GET_DEBUGREGS         _IOR(KVMIO,  0xa1, struct kvm_debugregs)
 #define KVM_SET_DEBUGREGS         _IOW(KVMIO,  0xa2, struct kvm_debugregs)
+/*
+ * vcpu version available with KVM_ENABLE_CAP
+ * vm version available with KVM_CAP_ENABLE_CAP_VM
+ */
 #define KVM_ENABLE_CAP            _IOW(KVMIO,  0xa3, struct kvm_enable_cap)
 /* Available with KVM_CAP_XSAVE */
 #define KVM_GET_XSAVE            _IOR(KVMIO,  0xa4, struct kvm_xsave)
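
As the new comment notes, KVM_ENABLE_CAP can now also be issued on a VM file descriptor when KVM_CAP_ENABLE_CAP_VM is present. A hedged userspace sketch of what that might look like, with error handling trimmed; the choice of KVM_CAP_S390_IRQCHIP (added in this merge) as the capability being enabled is an assumption about the s390 code rather than something shown in this hunk:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int enable_vm_cap(void)
{
        int kvm_fd = open("/dev/kvm", O_RDWR);
        int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
        struct kvm_enable_cap cap;

        memset(&cap, 0, sizeof(cap));
        cap.cap = KVM_CAP_S390_IRQCHIP;

        /* vm-level KVM_ENABLE_CAP: 0 on success, -1 with errno set otherwise */
        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
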
index fbe1a48..13f2d19 100644 (file)
@@ -22,6 +22,10 @@ config KVM_MMIO
 config KVM_ASYNC_PF
        bool
 
+# Toggle to switch between direct notification and batch job
+config KVM_ASYNC_PF_SYNC
+       bool
+
 config HAVE_KVM_MSI
        bool
 
index 8631d9c..10df100 100644 (file)
 #include "async_pf.h"
 #include <trace/events/kvm.h>
 
+static inline void kvm_async_page_present_sync(struct kvm_vcpu *vcpu,
+                                              struct kvm_async_pf *work)
+{
+#ifdef CONFIG_KVM_ASYNC_PF_SYNC
+       kvm_arch_async_page_present(vcpu, work);
+#endif
+}
+static inline void kvm_async_page_present_async(struct kvm_vcpu *vcpu,
+                                               struct kvm_async_pf *work)
+{
+#ifndef CONFIG_KVM_ASYNC_PF_SYNC
+       kvm_arch_async_page_present(vcpu, work);
+#endif
+}
+
 static struct kmem_cache *async_pf_cache;
 
 int kvm_async_pf_init(void)
@@ -69,6 +84,7 @@ static void async_pf_execute(struct work_struct *work)
        down_read(&mm->mmap_sem);
        get_user_pages(current, mm, addr, 1, 1, 0, NULL, NULL);
        up_read(&mm->mmap_sem);
+       kvm_async_page_present_sync(vcpu, apf);
        unuse_mm(mm);
 
        spin_lock(&vcpu->async_pf.lock);
@@ -97,11 +113,16 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
                        list_entry(vcpu->async_pf.queue.next,
                                   typeof(*work), queue);
                list_del(&work->queue);
+
+#ifdef CONFIG_KVM_ASYNC_PF_SYNC
+               flush_work(&work->work);
+#else
                if (cancel_work_sync(&work->work)) {
                        mmdrop(work->mm);
                        kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */
                        kmem_cache_free(async_pf_cache, work);
                }
+#endif
        }
 
        spin_lock(&vcpu->async_pf.lock);
@@ -130,7 +151,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
                spin_unlock(&vcpu->async_pf.lock);
 
                kvm_arch_async_page_ready(vcpu, work);
-               kvm_arch_async_page_present(vcpu, work);
+               kvm_async_page_present_async(vcpu, work);
 
                list_del(&work->queue);
                vcpu->async_pf.queued--;
@@ -138,7 +159,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
        }
 }
 
-int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
                       struct kvm_arch_async_pf *arch)
 {
        struct kvm_async_pf *work;
@@ -159,7 +180,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
        work->wakeup_all = false;
        work->vcpu = vcpu;
        work->gva = gva;
-       work->addr = gfn_to_hva(vcpu->kvm, gfn);
+       work->addr = hva;
        work->arch = *arch;
        work->mm = current->mm;
        atomic_inc(&work->mm->mm_count);
index abe4d60..29c2a04 100644 (file)
@@ -391,19 +391,19 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
                                           lockdep_is_held(&kvm->irqfds.lock));
        irqfd_update(kvm, irqfd, irq_rt);
 
-       events = f.file->f_op->poll(f.file, &irqfd->pt);
-
        list_add_tail(&irqfd->list, &kvm->irqfds.items);
 
+       spin_unlock_irq(&kvm->irqfds.lock);
+
        /*
         * Check if there was an event already pending on the eventfd
         * before we registered, and trigger it as if we didn't miss it.
         */
+       events = f.file->f_op->poll(f.file, &irqfd->pt);
+
        if (events & POLLIN)
                schedule_work(&irqfd->inject);
 
-       spin_unlock_irq(&kvm->irqfds.lock);
-
        /*
         * do not drop the file until the irqfd is fully initialized, otherwise
         * we might race against the POLLHUP
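
The reordering above publishes the irqfd on the list and drops the spinlock before calling ->poll, then injects any event that was already pending. The eventfd behaviour this relies on is that a count written before anyone polls is not lost; a tiny userspace demonstration:

#include <assert.h>
#include <poll.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
        int efd = eventfd(0, 0);
        uint64_t val = 1;
        struct pollfd pfd = { .fd = efd, .events = POLLIN };

        /* Signal the eventfd *before* anyone polls it... */
        assert(write(efd, &val, sizeof(val)) == sizeof(val));

        /* ...the event is not lost: poll() still reports it as readable. */
        assert(poll(&pfd, 1, 0) == 1);
        assert(pfd.revents & POLLIN);

        close(efd);
        return 0;
}
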
index ce9ed99..d4b6015 100644 (file)
@@ -50,7 +50,7 @@
 #else
 #define ioapic_debug(fmt, arg...)
 #endif
-static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq,
+static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
                bool line_status);
 
 static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
@@ -163,23 +163,67 @@ static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
        return false;
 }
 
-static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx,
-               bool line_status)
+static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
+               int irq_level, bool line_status)
 {
-       union kvm_ioapic_redirect_entry *pent;
-       int injected = -1;
+       union kvm_ioapic_redirect_entry entry;
+       u32 mask = 1 << irq;
+       u32 old_irr;
+       int edge, ret;
 
-       pent = &ioapic->redirtbl[idx];
+       entry = ioapic->redirtbl[irq];
+       edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
 
-       if (!pent->fields.mask) {
-               injected = ioapic_deliver(ioapic, idx, line_status);
-               if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
-                       pent->fields.remote_irr = 1;
+       if (!irq_level) {
+               ioapic->irr &= ~mask;
+               ret = 1;
+               goto out;
+       }
+
+       /*
+        * Return 0 for coalesced interrupts; for edge-triggered interrupts,
+        * this only happens if a previous edge has not been delivered due
+        * to masking.  For level interrupts, the remote_irr field tells
+        * us if the interrupt is waiting for an EOI.
+        *
+        * RTC is special: it is edge-triggered, but userspace likes to know
+        * if it has already been acked via EOI because coalesced RTC
+        * interrupts lead to time drift in Windows guests.  So we track
+        * EOI manually for the RTC interrupt.
+        */
+       if (irq == RTC_GSI && line_status &&
+               rtc_irq_check_coalesced(ioapic)) {
+               ret = 0;
+               goto out;
        }
 
-       return injected;
+       old_irr = ioapic->irr;
+       ioapic->irr |= mask;
+       if ((edge && old_irr == ioapic->irr) ||
+           (!edge && entry.fields.remote_irr)) {
+               ret = 0;
+               goto out;
+       }
+
+       ret = ioapic_service(ioapic, irq, line_status);
+
+out:
+       trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+       return ret;
+}
+
+static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
+{
+       u32 idx;
+
+       rtc_irq_eoi_tracking_reset(ioapic);
+       for_each_set_bit(idx, &irr, IOAPIC_NUM_PINS)
+               ioapic_set_irq(ioapic, idx, 1, true);
+
+       kvm_rtc_eoi_tracking_restore_all(ioapic);
 }
 
+
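
Stripped of the locking and the RTC special case, the coalescing decision made by ioapic_set_irq above reduces to a small pure function: an asserted edge interrupt is coalesced when its IRR bit was already set, and an asserted level interrupt is coalesced while remote_irr shows the previous one has not been EOI'd. A standalone sketch with a few checks:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* true means "report coalesced" (the kernel code returns 0 in that case) */
static bool ioapic_irq_coalesced(bool edge, uint32_t irr, unsigned irq,
                                 bool remote_irr)
{
        uint32_t old_irr = irr;
        uint32_t new_irr = irr | (1u << irq);

        if (edge)
                return old_irr == new_irr;      /* edge swallowed by pending IRR bit */
        return remote_irr;                      /* level still awaiting EOI */
}

int main(void)
{
        /* Edge: first assertion delivered, second one coalesced. */
        assert(!ioapic_irq_coalesced(true, 0x0, 3, false));
        assert(ioapic_irq_coalesced(true, 1u << 3, 3, false));

        /* Level: coalesced only while remote_irr is still set. */
        assert(ioapic_irq_coalesced(false, 0x0, 3, true));
        assert(!ioapic_irq_coalesced(false, 0x0, 3, false));
        return 0;
}
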
 static void update_handled_vectors(struct kvm_ioapic *ioapic)
 {
        DECLARE_BITMAP(handled_vectors, 256);
@@ -282,12 +326,15 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
        }
 }
 
-static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status)
+static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
 {
        union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
        struct kvm_lapic_irq irqe;
        int ret;
 
+       if (entry->fields.mask)
+               return -1;
+
        ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
                     "vector=%x trig_mode=%x\n",
                     entry->fields.dest_id, entry->fields.dest_mode,
@@ -302,6 +349,9 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status)
        irqe.level = 1;
        irqe.shorthand = 0;
 
+       if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
+               ioapic->irr &= ~(1 << irq);
+
        if (irq == RTC_GSI && line_status) {
                BUG_ON(ioapic->rtc_status.pending_eoi != 0);
                ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
@@ -310,45 +360,24 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status)
        } else
                ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL);
 
+       if (ret && irqe.trig_mode == IOAPIC_LEVEL_TRIG)
+               entry->fields.remote_irr = 1;
+
        return ret;
 }
 
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
                       int level, bool line_status)
 {
-       u32 old_irr;
-       u32 mask = 1 << irq;
-       union kvm_ioapic_redirect_entry entry;
        int ret, irq_level;
 
        BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
 
        spin_lock(&ioapic->lock);
-       old_irr = ioapic->irr;
        irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
                                         irq_source_id, level);
-       entry = ioapic->redirtbl[irq];
-       irq_level ^= entry.fields.polarity;
-       if (!irq_level) {
-               ioapic->irr &= ~mask;
-               ret = 1;
-       } else {
-               int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+       ret = ioapic_set_irq(ioapic, irq, irq_level, line_status);
 
-               if (irq == RTC_GSI && line_status &&
-                       rtc_irq_check_coalesced(ioapic)) {
-                       ret = 0; /* coalesced */
-                       goto out;
-               }
-               ioapic->irr |= mask;
-               if ((edge && old_irr != ioapic->irr) ||
-                   (!edge && !entry.fields.remote_irr))
-                       ret = ioapic_service(ioapic, irq, line_status);
-               else
-                       ret = 0; /* report coalesced interrupt */
-       }
-out:
-       trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
        spin_unlock(&ioapic->lock);
 
        return ret;
@@ -394,7 +423,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
 
                ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
                ent->fields.remote_irr = 0;
-               if (!ent->fields.mask && (ioapic->irr & (1 << i)))
+               if (ioapic->irr & (1 << i))
                        ioapic_service(ioapic, i, false);
        }
 }
@@ -595,9 +624,10 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
 
        spin_lock(&ioapic->lock);
        memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
+       ioapic->irr = 0;
        update_handled_vectors(ioapic);
        kvm_vcpu_request_scan_ioapic(kvm);
-       kvm_rtc_eoi_tracking_restore_all(ioapic);
+       kvm_ioapic_inject_all(ioapic, state->irr);
        spin_unlock(&ioapic->lock);
        return 0;
 }
index b5ec7fb..56baae8 100644 (file)
@@ -186,12 +186,9 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
-       long dirty_count = kvm->tlbs_dirty;
-
-       smp_mb();
        if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
                ++kvm->stat.remote_tlb_flush;
-       cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
+       kvm->tlbs_dirty = false;
 }
 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
 
@@ -1804,7 +1801,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
                                continue;
                        if (vcpu == me)
                                continue;
-                       if (waitqueue_active(&vcpu->wq))
+                       if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
                                continue;
                        if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
                                continue;
@@ -2283,6 +2280,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
        case KVM_DEV_TYPE_ARM_VGIC_V2:
                ops = &kvm_arm_vgic_v2_ops;
                break;
+#endif
+#ifdef CONFIG_S390
+       case KVM_DEV_TYPE_FLIC:
+               ops = &kvm_flic_ops;
+               break;
 #endif
        default:
                return -ENODEV;