Merge tag 'kvm-3.15-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
authorLinus Torvalds <torvalds@linux-foundation.org>
Wed, 2 Apr 2014 21:50:10 +0000 (14:50 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 2 Apr 2014 21:50:10 +0000 (14:50 -0700)
Pull kvm updates from Paolo Bonzini:
 "PPC and ARM do not have much going on this time.  Most of the cool
  stuff, instead, is in s390 and (after a few releases) x86.

  ARM has some caching fixes and PPC has transactional memory support in
  guests.  MIPS has some fixes, with more probably coming in 3.16 as
  QEMU will soon get support for MIPS KVM.

  For x86 there are optimizations for debug registers, which trigger on
  some Windows games, and other important fixes for Windows guests.  We
  now expose to the guest Broadwell instruction set extensions and also
  Intel MPX.  There's also a fix/workaround for OS X guests, nested
  virtualization features (preemption timer), and a couple kvmclock
  refinements.

  For s390, the main news is asynchronous page faults, together with
  improvements to IRQs (floating irqs and adapter irqs) that speed up
  virtio devices"

* tag 'kvm-3.15-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (96 commits)
  KVM: PPC: Book3S HV: Save/restore host PMU registers that are new in POWER8
  KVM: PPC: Book3S HV: Fix decrementer timeouts with non-zero TB offset
  KVM: PPC: Book3S HV: Don't use kvm_memslots() in real mode
  KVM: PPC: Book3S HV: Return ENODEV error rather than EIO
  KVM: PPC: Book3S: Trim top 4 bits of physical address in RTAS code
  KVM: PPC: Book3S HV: Add get/set_one_reg for new TM state
  KVM: PPC: Book3S HV: Add transactional memory support
  KVM: Specify byte order for KVM_EXIT_MMIO
  KVM: vmx: fix MPX detection
  KVM: PPC: Book3S HV: Fix KVM hang with CONFIG_KVM_XICS=n
  KVM: PPC: Book3S: Introduce hypervisor call H_GET_TCE
  KVM: PPC: Book3S HV: Fix incorrect userspace exit on ioeventfd write
  KVM: s390: clear local interrupts at cpu initial reset
  KVM: s390: Fix possible memory leak in SIGP functions
  KVM: s390: fix calculation of idle_mask array size
  KVM: s390: randomize sca address
  KVM: ioapic: reinject pending interrupts on KVM_SET_IRQCHIP
  KVM: Bump KVM_MAX_IRQ_ROUTES for s390
  KVM: s390: irq routing for adapter interrupts.
  KVM: s390: adapter interrupt sources
  ...

74 files changed:
Documentation/virtual/kvm/api.txt
Documentation/virtual/kvm/devices/s390_flic.txt [new file with mode: 0644]
arch/arm/include/asm/kvm_arm.h
arch/arm/include/asm/kvm_asm.h
arch/arm/include/asm/kvm_host.h
arch/arm/include/asm/kvm_mmu.h
arch/arm/kernel/asm-offsets.c
arch/arm/kvm/coproc.c
arch/arm/kvm/coproc.h
arch/arm/kvm/coproc_a15.c
arch/arm/kvm/coproc_a7.c
arch/arm/kvm/guest.c
arch/arm/kvm/interrupts_head.S
arch/arm/kvm/mmu.c
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_asm.h
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/kvm/sys_regs.c
arch/arm64/kvm/sys_regs.h
arch/ia64/kvm/kvm-ia64.c
arch/mips/include/asm/kvm_host.h
arch/mips/kvm/kvm_mips_emul.c
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_book3s_64.h
arch/powerpc/include/asm/kvm_book3s_asm.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/reg.h
arch/powerpc/include/asm/tm.h
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_64_vio_hv.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_interrupts.S
arch/powerpc/kvm/book3s_hv_rm_mmu.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_rtas.c
arch/s390/include/asm/irq.h
arch/s390/include/asm/kvm_host.h
arch/s390/include/asm/pgtable.h
arch/s390/include/asm/processor.h
arch/s390/include/uapi/asm/kvm.h
arch/s390/kernel/irq.c
arch/s390/kvm/Kconfig
arch/s390/kvm/Makefile
arch/s390/kvm/diag.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/irq.h [new file with mode: 0644]
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/priv.c
arch/s390/kvm/sigp.c
arch/s390/kvm/trace.h
arch/s390/mm/fault.c
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/vmx.h
arch/x86/include/asm/xsave.h
arch/x86/include/uapi/asm/msr-index.h
arch/x86/kernel/kvm.c
arch/x86/kernel/kvmclock.c
arch/x86/kvm/cpuid.c
arch/x86/kvm/emulate.c
arch/x86/kvm/mmu.c
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
drivers/s390/kvm/virtio_ccw.c
include/linux/kvm_host.h
include/uapi/linux/kvm.h
virt/kvm/Kconfig
virt/kvm/async_pf.c
virt/kvm/eventfd.c
virt/kvm/ioapic.c
virt/kvm/kvm_main.c

index 6cd63a9..c24211d 100644 (file)
@@ -586,8 +586,8 @@ struct kvm_fpu {
 
 4.24 KVM_CREATE_IRQCHIP
 
-Capability: KVM_CAP_IRQCHIP
-Architectures: x86, ia64, ARM, arm64
+Capability: KVM_CAP_IRQCHIP, KVM_CAP_S390_IRQCHIP (s390)
+Architectures: x86, ia64, ARM, arm64, s390
 Type: vm ioctl
 Parameters: none
 Returns: 0 on success, -1 on error
@@ -596,7 +596,10 @@ Creates an interrupt controller model in the kernel.  On x86, creates a virtual
 ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a
 local APIC.  IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23
 only go to the IOAPIC.  On ia64, a IOSAPIC is created. On ARM/arm64, a GIC is
-created.
+created. On s390, a dummy irq routing table is created.
+
+Note that on s390 the KVM_CAP_S390_IRQCHIP vm capability needs to be enabled
+before KVM_CREATE_IRQCHIP can be used.
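
An illustrative userspace sketch of the s390 sequence above (not part of the
patch itself), assuming <linux/kvm.h> and <sys/ioctl.h> are included and
vm_fd is an already-created VM file descriptor:

	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_S390_IRQCHIP,
	};

	/* The capability must be enabled on the VM first ... */
	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
		perror("KVM_ENABLE_CAP");
	/* ... only then may the irqchip be created. */
	if (ioctl(vm_fd, KVM_CREATE_IRQCHIP) < 0)
		perror("KVM_CREATE_IRQCHIP");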
 
 
 4.25 KVM_IRQ_LINE
@@ -612,6 +615,20 @@ On some architectures it is required that an interrupt controller model has
 been previously created with KVM_CREATE_IRQCHIP.  Note that edge-triggered
 interrupts require the level to be set to 1 and then back to 0.
 
+On real hardware, interrupt pins can be active-low or active-high.  This
+does not matter for the level field of struct kvm_irq_level: 1 always
+means active (asserted), 0 means inactive (deasserted).
+
+x86 allows the operating system to program the interrupt polarity
+(active-low/active-high) for level-triggered interrupts, and KVM used
+to consider the polarity.  However, due to bitrot in the handling of
+active-low interrupts, the above convention is now valid on x86 too.
+This is signaled by KVM_CAP_IOAPIC_POLARITY_IGNORED.  Userspace
+should not present interrupts to the guest as active-low unless this
+capability is present (or unless it is not using the in-kernel irqchip,
+of course).
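
An illustrative pulse of an edge-triggered GSI (not part of the patch above),
assuming vm_fd is a VM file descriptor with an in-kernel irqchip:

	struct kvm_irq_level irq = {
		.irq   = 10,	/* GSI to trigger */
		.level = 1,	/* 1 = assert (active) ... */
	};

	ioctl(vm_fd, KVM_IRQ_LINE, &irq);
	irq.level = 0;		/* ... then 0 = deassert to complete the edge */
	ioctl(vm_fd, KVM_IRQ_LINE, &irq);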
+
+
 ARM/arm64 can signal an interrupt either at the CPU level, or at the
 in-kernel irqchip (GIC), and for in-kernel irqchip can tell the GIC to
 use PPIs designated for specific cpus.  The irq field is interpreted
@@ -628,7 +645,7 @@ The irq_type field has the following values:
 
 (The irq_id field thus corresponds nicely to the IRQ ID in the ARM GIC specs)
 
-In both cases, level is used to raise/lower the line.
+In both cases, level is used to assert/deassert the line.
 
 struct kvm_irq_level {
        union {
@@ -918,9 +935,9 @@ documentation when it pops into existence).
 
 4.37 KVM_ENABLE_CAP
 
-Capability: KVM_CAP_ENABLE_CAP
+Capability: KVM_CAP_ENABLE_CAP, KVM_CAP_ENABLE_CAP_VM
 Architectures: ppc, s390
-Type: vcpu ioctl
+Type: vcpu ioctl, vm ioctl (with KVM_CAP_ENABLE_CAP_VM)
 Parameters: struct kvm_enable_cap (in)
 Returns: 0 on success; -1 on error
 
@@ -951,6 +968,8 @@ function properly, this is the place to put them.
        __u8  pad[64];
 };
 
+The vcpu ioctl should be used for vcpu-specific capabilities, the vm ioctl
+for vm-wide capabilities.
 
 4.38 KVM_GET_MP_STATE
 
@@ -1320,7 +1339,7 @@ KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed.
 4.52 KVM_SET_GSI_ROUTING
 
 Capability: KVM_CAP_IRQ_ROUTING
-Architectures: x86 ia64
+Architectures: x86 ia64 s390
 Type: vm ioctl
 Parameters: struct kvm_irq_routing (in)
 Returns: 0 on success, -1 on error
@@ -1343,6 +1362,7 @@ struct kvm_irq_routing_entry {
        union {
                struct kvm_irq_routing_irqchip irqchip;
                struct kvm_irq_routing_msi msi;
+               struct kvm_irq_routing_s390_adapter adapter;
                __u32 pad[8];
        } u;
 };
@@ -1350,6 +1370,7 @@ struct kvm_irq_routing_entry {
 /* gsi routing entry types */
 #define KVM_IRQ_ROUTING_IRQCHIP 1
 #define KVM_IRQ_ROUTING_MSI 2
+#define KVM_IRQ_ROUTING_S390_ADAPTER 3
 
 No flags are specified so far, the corresponding field must be set to zero.
 
@@ -1365,6 +1386,14 @@ struct kvm_irq_routing_msi {
        __u32 pad;
 };
 
+struct kvm_irq_routing_s390_adapter {
+       __u64 ind_addr;
+       __u64 summary_addr;
+       __u64 ind_offset;
+       __u32 summary_offset;
+       __u32 adapter_id;
+};
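
An illustrative routing update with a single adapter entry (not part of the
patch above); ind_gpa and summary_gpa stand for guest addresses the caller
has already set up, and vm_fd for the VM file descriptor:

	struct {
		struct kvm_irq_routing info;
		struct kvm_irq_routing_entry entry;
	} routing = {
		.info.nr = 1,			/* one routing entry follows */
		.entry = {
			.gsi  = 0,
			.type = KVM_IRQ_ROUTING_S390_ADAPTER,
			.u.adapter = {
				.ind_addr     = ind_gpa,
				.summary_addr = summary_gpa,
				.adapter_id   = 1,
			},
		},
	};

	ioctl(vm_fd, KVM_SET_GSI_ROUTING, &routing);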
+
 
 4.53 KVM_ASSIGN_SET_MSIX_NR
 
@@ -2566,6 +2595,10 @@ executed a memory-mapped I/O instruction which could not be satisfied
 by kvm.  The 'data' member contains the written data if 'is_write' is
 true, and should be filled by application code otherwise.
 
+The 'data' member contains, in its first 'len' bytes, the value as it would
+appear if the VCPU performed a load or store of the appropriate width directly
+to the byte array.
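
An illustrative exit-handler fragment (not part of the patch above); run is
the mmap'ed struct kvm_run, and device_read()/device_write() are hypothetical
helpers of the device model:

	case KVM_EXIT_MMIO:
		/* 'data' already holds the bytes in guest operand order. */
		if (run->mmio.is_write)
			device_write(run->mmio.phys_addr,
				     run->mmio.data, run->mmio.len);
		else
			device_read(run->mmio.phys_addr,
				    run->mmio.data, run->mmio.len);
		break;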
+
 NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR,
       KVM_EXIT_PAPR and KVM_EXIT_EPR the corresponding
 operations are complete (and guest state is consistent) only after userspace
diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
new file mode 100644 (file)
index 0000000..4ceef53
--- /dev/null
@@ -0,0 +1,91 @@
+FLIC (floating interrupt controller)
+====================================
+
+FLIC handles floating (non per-cpu) interrupts, i.e. I/O, service and some
+machine check interruptions. All interrupts are stored in a per-vm list of
+pending interrupts. FLIC performs operations on this list.
+
+Only one FLIC instance may be instantiated.
+
+FLIC provides support to
+- add interrupts (KVM_DEV_FLIC_ENQUEUE)
+- inspect currently pending interrupts (KVM_DEV_FLIC_GET_ALL_IRQS)
+- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS)
+- enable/disable for the guest transparent async page faults
+- register and modify adapter interrupt sources (KVM_DEV_FLIC_ADAPTER_*)
+
+Groups:
+  KVM_DEV_FLIC_ENQUEUE
+    Passes a buffer and length into the kernel which are then injected into
+    the list of pending interrupts.
+    attr->addr contains the pointer to the buffer and attr->attr contains
+    the length of the buffer.
+    The format of the data structure kvm_s390_irq as it is copied from userspace
+    is defined in usr/include/linux/kvm.h.
+
+  KVM_DEV_FLIC_GET_ALL_IRQS
+    Copies all floating interrupts into a buffer provided by userspace.
+    When the buffer is too small it returns -ENOMEM, which is the indication
+    for userspace to try again with a bigger buffer.
+    All interrupts remain pending, i.e. are not deleted from the list of
+    currently pending interrupts.
+    attr->addr contains the userspace address of the buffer into which all
+    interrupt data will be copied.
+    attr->attr contains the size of the buffer in bytes.
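
    An illustrative retrieval loop (not part of the patch above), assuming
    flic_fd is the file descriptor returned by KVM_CREATE_DEVICE for the FLIC:

	size_t len = 8192;
	void *buf = malloc(len);
	struct kvm_device_attr attr = {
		.group = KVM_DEV_FLIC_GET_ALL_IRQS,
		.addr  = (__u64)(unsigned long)buf,
		.attr  = len,
	};

	while (ioctl(flic_fd, KVM_GET_DEVICE_ATTR, &attr) < 0 &&
	       errno == ENOMEM) {
		len *= 2;			/* buffer too small, retry bigger */
		buf = realloc(buf, len);
		attr.addr = (__u64)(unsigned long)buf;
		attr.attr = len;
	}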
+
+  KVM_DEV_FLIC_CLEAR_IRQS
+    Simply deletes all elements from the list of currently pending floating
+    interrupts.  No interrupts are injected into the guest.
+
+  KVM_DEV_FLIC_APF_ENABLE
+    Enables async page faults for the guest, so that in case of a major page
+    fault the host is allowed to handle it asynchronously while the guest
+    continues to run.
+
+  KVM_DEV_FLIC_APF_DISABLE_WAIT
+    Disables async page faults for the guest and waits until already pending
+    async page faults are done. This is necessary to trigger a completion interrupt
+    for every init interrupt before migrating the interrupt list.
+
+  KVM_DEV_FLIC_ADAPTER_REGISTER
+    Register an I/O adapter interrupt source. Takes a kvm_s390_io_adapter
+    describing the adapter to register:
+
+struct kvm_s390_io_adapter {
+       __u32 id;
+       __u8 isc;
+       __u8 maskable;
+       __u8 swap;
+       __u8 pad;
+};
+
+   id contains the unique id for the adapter, isc the I/O interruption subclass
+   to use, maskable whether this adapter may be masked (interrupts turned off)
+   and swap whether the indicators need to be byte swapped.
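
    An illustrative registration call (not part of the patch above), again
    assuming flic_fd is the FLIC device file descriptor:

	struct kvm_s390_io_adapter adapter = {
		.id       = 1,
		.isc      = 3,		/* I/O interruption subclass */
		.maskable = 1,
	};
	struct kvm_device_attr attr = {
		.group = KVM_DEV_FLIC_ADAPTER_REGISTER,
		.addr  = (__u64)(unsigned long)&adapter,
	};

	ioctl(flic_fd, KVM_SET_DEVICE_ATTR, &attr);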
+
+
+  KVM_DEV_FLIC_ADAPTER_MODIFY
+    Modifies attributes of an existing I/O adapter interrupt source. Takes
+    a kvm_s390_io_adapter_req specifying the adapter and the operation:
+
+struct kvm_s390_io_adapter_req {
+       __u32 id;
+       __u8 type;
+       __u8 mask;
+       __u16 pad0;
+       __u64 addr;
+};
+
+    id specifies the adapter and type the operation. The supported operations
+    are:
+
+    KVM_S390_IO_ADAPTER_MASK
+      mask or unmask the adapter, as specified in mask
+
+    KVM_S390_IO_ADAPTER_MAP
+      perform a gmap translation for the guest address provided in addr,
+      pin a userspace page for the translated address and add it to the
+      list of mappings
+
+    KVM_S390_IO_ADAPTER_UNMAP
+      release a userspace page for the translated address specified in addr
+      from the list of mappings
index 1d3153c..816db0b 100644 (file)
@@ -55,6 +55,7 @@
  * The bits we set in HCR:
  * TAC:                Trap ACTLR
  * TSC:                Trap SMC
+ * TVM:                Trap VM ops (until MMU and caches are on)
  * TSW:                Trap cache operations by set/way
  * TWI:                Trap WFI
  * TWE:                Trap WFE
@@ -68,8 +69,7 @@
  */
 #define HCR_GUEST_MASK (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \
                        HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \
-                       HCR_TWE | HCR_SWIO | HCR_TIDCP)
-#define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF)
+                       HCR_TVM | HCR_TWE | HCR_SWIO | HCR_TIDCP)
 
 /* System Control Register (SCTLR) bits */
 #define SCTLR_TE       (1 << 30)
index 661da11..53b3c4a 100644 (file)
@@ -48,7 +48,9 @@
 #define c13_TID_URO    26      /* Thread ID, User R/O */
 #define c13_TID_PRIV   27      /* Thread ID, Privileged */
 #define c14_CNTKCTL    28      /* Timer Control Register (PL1) */
-#define NR_CP15_REGS   29      /* Number of regs (incl. invalid) */
+#define c10_AMAIR0     29      /* Auxiliary Memory Attribute Indirection Reg0 */
+#define c10_AMAIR1     30      /* Auxiliary Memory Attribute Indirection Reg1 */
+#define NR_CP15_REGS   31      /* Number of regs (incl. invalid) */
 
 #define ARM_EXCEPTION_RESET      0
 #define ARM_EXCEPTION_UNDEFINED   1
index 098f7dd..09af149 100644 (file)
@@ -101,6 +101,12 @@ struct kvm_vcpu_arch {
        /* The CPU type we expose to the VM */
        u32 midr;
 
+       /* HYP trapping configuration */
+       u32 hcr;
+
+       /* Interrupt related fields */
+       u32 irq_lines;          /* IRQ and FIQ levels */
+
        /* Exception Information */
        struct kvm_vcpu_fault_info fault;
 
@@ -128,9 +134,6 @@ struct kvm_vcpu_arch {
        /* IO related fields */
        struct kvm_decode mmio_decode;
 
-       /* Interrupt related fields */
-       u32 irq_lines;          /* IRQ and FIQ levels */
-
        /* Cache some mmu pages needed inside spinlock regions */
        struct kvm_mmu_memory_cache mmu_page_cache;
 
index 2d122ad..5c7aa3c 100644 (file)
@@ -114,11 +114,34 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
        pmd_val(*pmd) |= L_PMD_S2_RDWR;
 }
 
+/* Open coded p*d_addr_end that can deal with 64bit addresses */
+#define kvm_pgd_addr_end(addr, end)                                    \
+({     u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;            \
+       (__boundary - 1 < (end) - 1)? __boundary: (end);                \
+})
+
+#define kvm_pud_addr_end(addr,end)             (end)
+
+#define kvm_pmd_addr_end(addr, end)                                    \
+({     u64 __boundary = ((addr) + PMD_SIZE) & PMD_MASK;                \
+       (__boundary - 1 < (end) - 1)? __boundary: (end);                \
+})
+
 struct kvm;
 
-static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
-                                             unsigned long size)
+#define kvm_flush_dcache_to_poc(a,l)   __cpuc_flush_dcache_area((a), (l))
+
+static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 {
+       return (vcpu->arch.cp15[c1_SCTLR] & 0b101) == 0b101;
+}
+
+static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
+                                            unsigned long size)
+{
+       if (!vcpu_has_cache_enabled(vcpu))
+               kvm_flush_dcache_to_poc((void *)hva, size);
+       
        /*
         * If we are going to insert an instruction page and the icache is
         * either VIPT or PIPT, there is a potential problem where the host
@@ -139,9 +162,10 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
        }
 }
 
-#define kvm_flush_dcache_to_poc(a,l)   __cpuc_flush_dcache_area((a), (l))
 #define kvm_virt_to_phys(x)            virt_to_idmap((unsigned long)(x))
 
+void stage2_flush_vm(struct kvm *kvm);
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __ARM_KVM_MMU_H__ */
index ded0417..85598b5 100644 (file)
@@ -174,6 +174,7 @@ int main(void)
   DEFINE(VCPU_FIQ_REGS,                offsetof(struct kvm_vcpu, arch.regs.fiq_regs));
   DEFINE(VCPU_PC,              offsetof(struct kvm_vcpu, arch.regs.usr_regs.ARM_pc));
   DEFINE(VCPU_CPSR,            offsetof(struct kvm_vcpu, arch.regs.usr_regs.ARM_cpsr));
+  DEFINE(VCPU_HCR,             offsetof(struct kvm_vcpu, arch.hcr));
   DEFINE(VCPU_IRQ_LINES,       offsetof(struct kvm_vcpu, arch.irq_lines));
   DEFINE(VCPU_HSR,             offsetof(struct kvm_vcpu, arch.fault.hsr));
   DEFINE(VCPU_HxFAR,           offsetof(struct kvm_vcpu, arch.fault.hxfar));
index 78c0885..c58a351 100644 (file)
@@ -23,6 +23,7 @@
 #include <asm/kvm_host.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_coproc.h>
+#include <asm/kvm_mmu.h>
 #include <asm/cacheflush.h>
 #include <asm/cputype.h>
 #include <trace/events/kvm.h>
@@ -204,6 +205,44 @@ done:
        return true;
 }
 
+/*
+ * Generic accessor for VM registers. Only called as long as HCR_TVM
+ * is set.
+ */
+static bool access_vm_reg(struct kvm_vcpu *vcpu,
+                         const struct coproc_params *p,
+                         const struct coproc_reg *r)
+{
+       BUG_ON(!p->is_write);
+
+       vcpu->arch.cp15[r->reg] = *vcpu_reg(vcpu, p->Rt1);
+       if (p->is_64bit)
+               vcpu->arch.cp15[r->reg + 1] = *vcpu_reg(vcpu, p->Rt2);
+
+       return true;
+}
+
+/*
+ * SCTLR accessor. Only called as long as HCR_TVM is set.  If the
+ * guest enables the MMU, we stop trapping the VM sys_regs and leave
+ * it in complete control of the caches.
+ *
+ * Used by the cpu-specific code.
+ */
+bool access_sctlr(struct kvm_vcpu *vcpu,
+                 const struct coproc_params *p,
+                 const struct coproc_reg *r)
+{
+       access_vm_reg(vcpu, p, r);
+
+       if (vcpu_has_cache_enabled(vcpu)) {     /* MMU+Caches enabled? */
+               vcpu->arch.hcr &= ~HCR_TVM;
+               stage2_flush_vm(vcpu->kvm);
+       }
+
+       return true;
+}
+
 /*
  * We could trap ID_DFR0 and tell the guest we don't support performance
  * monitoring.  Unfortunately the patch to make the kernel check ID_DFR0 was
@@ -261,33 +300,36 @@ static const struct coproc_reg cp15_regs[] = {
        { CRn( 1), CRm( 0), Op1( 0), Op2( 2), is32,
                        NULL, reset_val, c1_CPACR, 0x00000000 },
 
-       /* TTBR0/TTBR1: swapped by interrupt.S. */
-       { CRm64( 2), Op1( 0), is64, NULL, reset_unknown64, c2_TTBR0 },
-       { CRm64( 2), Op1( 1), is64, NULL, reset_unknown64, c2_TTBR1 },
-
-       /* TTBCR: swapped by interrupt.S. */
+       /* TTBR0/TTBR1/TTBCR: swapped by interrupt.S. */
+       { CRm64( 2), Op1( 0), is64, access_vm_reg, reset_unknown64, c2_TTBR0 },
+       { CRn(2), CRm( 0), Op1( 0), Op2( 0), is32,
+                       access_vm_reg, reset_unknown, c2_TTBR0 },
+       { CRn(2), CRm( 0), Op1( 0), Op2( 1), is32,
+                       access_vm_reg, reset_unknown, c2_TTBR1 },
        { CRn( 2), CRm( 0), Op1( 0), Op2( 2), is32,
-                       NULL, reset_val, c2_TTBCR, 0x00000000 },
+                       access_vm_reg, reset_val, c2_TTBCR, 0x00000000 },
+       { CRm64( 2), Op1( 1), is64, access_vm_reg, reset_unknown64, c2_TTBR1 },
+
 
        /* DACR: swapped by interrupt.S. */
        { CRn( 3), CRm( 0), Op1( 0), Op2( 0), is32,
-                       NULL, reset_unknown, c3_DACR },
+                       access_vm_reg, reset_unknown, c3_DACR },
 
        /* DFSR/IFSR/ADFSR/AIFSR: swapped by interrupt.S. */
        { CRn( 5), CRm( 0), Op1( 0), Op2( 0), is32,
-                       NULL, reset_unknown, c5_DFSR },
+                       access_vm_reg, reset_unknown, c5_DFSR },
        { CRn( 5), CRm( 0), Op1( 0), Op2( 1), is32,
-                       NULL, reset_unknown, c5_IFSR },
+                       access_vm_reg, reset_unknown, c5_IFSR },
        { CRn( 5), CRm( 1), Op1( 0), Op2( 0), is32,
-                       NULL, reset_unknown, c5_ADFSR },
+                       access_vm_reg, reset_unknown, c5_ADFSR },
        { CRn( 5), CRm( 1), Op1( 0), Op2( 1), is32,
-                       NULL, reset_unknown, c5_AIFSR },
+                       access_vm_reg, reset_unknown, c5_AIFSR },
 
        /* DFAR/IFAR: swapped by interrupt.S. */
        { CRn( 6), CRm( 0), Op1( 0), Op2( 0), is32,
-                       NULL, reset_unknown, c6_DFAR },
+                       access_vm_reg, reset_unknown, c6_DFAR },
        { CRn( 6), CRm( 0), Op1( 0), Op2( 2), is32,
-                       NULL, reset_unknown, c6_IFAR },
+                       access_vm_reg, reset_unknown, c6_IFAR },
 
        /* PAR swapped by interrupt.S */
        { CRm64( 7), Op1( 0), is64, NULL, reset_unknown64, c7_PAR },
@@ -324,9 +366,15 @@ static const struct coproc_reg cp15_regs[] = {
 
        /* PRRR/NMRR (aka MAIR0/MAIR1): swapped by interrupt.S. */
        { CRn(10), CRm( 2), Op1( 0), Op2( 0), is32,
-                       NULL, reset_unknown, c10_PRRR},
+                       access_vm_reg, reset_unknown, c10_PRRR},
        { CRn(10), CRm( 2), Op1( 0), Op2( 1), is32,
-                       NULL, reset_unknown, c10_NMRR},
+                       access_vm_reg, reset_unknown, c10_NMRR},
+
+       /* AMAIR0/AMAIR1: swapped by interrupt.S. */
+       { CRn(10), CRm( 3), Op1( 0), Op2( 0), is32,
+                       access_vm_reg, reset_unknown, c10_AMAIR0},
+       { CRn(10), CRm( 3), Op1( 0), Op2( 1), is32,
+                       access_vm_reg, reset_unknown, c10_AMAIR1},
 
        /* VBAR: swapped by interrupt.S. */
        { CRn(12), CRm( 0), Op1( 0), Op2( 0), is32,
@@ -334,7 +382,7 @@ static const struct coproc_reg cp15_regs[] = {
 
        /* CONTEXTIDR/TPIDRURW/TPIDRURO/TPIDRPRW: swapped by interrupt.S. */
        { CRn(13), CRm( 0), Op1( 0), Op2( 1), is32,
-                       NULL, reset_val, c13_CID, 0x00000000 },
+                       access_vm_reg, reset_val, c13_CID, 0x00000000 },
        { CRn(13), CRm( 0), Op1( 0), Op2( 2), is32,
                        NULL, reset_unknown, c13_TID_URW },
        { CRn(13), CRm( 0), Op1( 0), Op2( 3), is32,
@@ -443,7 +491,7 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
        struct coproc_params params;
 
-       params.CRm = (kvm_vcpu_get_hsr(vcpu) >> 1) & 0xf;
+       params.CRn = (kvm_vcpu_get_hsr(vcpu) >> 1) & 0xf;
        params.Rt1 = (kvm_vcpu_get_hsr(vcpu) >> 5) & 0xf;
        params.is_write = ((kvm_vcpu_get_hsr(vcpu) & 1) == 0);
        params.is_64bit = true;
@@ -451,7 +499,7 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run)
        params.Op1 = (kvm_vcpu_get_hsr(vcpu) >> 16) & 0xf;
        params.Op2 = 0;
        params.Rt2 = (kvm_vcpu_get_hsr(vcpu) >> 10) & 0xf;
-       params.CRn = 0;
+       params.CRm = 0;
 
        return emulate_cp15(vcpu, &params);
 }
index 0461d5c..1a44bbe 100644 (file)
@@ -58,8 +58,8 @@ static inline void print_cp_instr(const struct coproc_params *p)
 {
        /* Look, we even formatted it for you to paste into the table! */
        if (p->is_64bit) {
-               kvm_pr_unimpl(" { CRm(%2lu), Op1(%2lu), is64, func_%s },\n",
-                             p->CRm, p->Op1, p->is_write ? "write" : "read");
+               kvm_pr_unimpl(" { CRm64(%2lu), Op1(%2lu), is64, func_%s },\n",
+                             p->CRn, p->Op1, p->is_write ? "write" : "read");
        } else {
                kvm_pr_unimpl(" { CRn(%2lu), CRm(%2lu), Op1(%2lu), Op2(%2lu), is32,"
                              " func_%s },\n",
@@ -135,13 +135,13 @@ static inline int cmp_reg(const struct coproc_reg *i1,
                return -1;
        if (i1->CRn != i2->CRn)
                return i1->CRn - i2->CRn;
-       if (i1->is_64 != i2->is_64)
-               return i2->is_64 - i1->is_64;
        if (i1->CRm != i2->CRm)
                return i1->CRm - i2->CRm;
        if (i1->Op1 != i2->Op1)
                return i1->Op1 - i2->Op1;
-       return i1->Op2 - i2->Op2;
+       if (i1->Op2 != i2->Op2)
+               return i1->Op2 - i2->Op2;
+       return i2->is_64 - i1->is_64;
 }
 
 
@@ -153,4 +153,8 @@ static inline int cmp_reg(const struct coproc_reg *i1,
 #define is64           .is_64 = true
 #define is32           .is_64 = false
 
+bool access_sctlr(struct kvm_vcpu *vcpu,
+                 const struct coproc_params *p,
+                 const struct coproc_reg *r);
+
 #endif /* __ARM_KVM_COPROC_LOCAL_H__ */
index bb0cac1..e6f4ae4 100644 (file)
@@ -34,7 +34,7 @@
 static const struct coproc_reg a15_regs[] = {
        /* SCTLR: swapped by interrupt.S. */
        { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
-                       NULL, reset_val, c1_SCTLR, 0x00C50078 },
+                       access_sctlr, reset_val, c1_SCTLR, 0x00C50078 },
 };
 
 static struct kvm_coproc_target_table a15_target_table = {
index 1df7673..17fc7cd 100644 (file)
@@ -37,7 +37,7 @@
 static const struct coproc_reg a7_regs[] = {
        /* SCTLR: swapped by interrupt.S. */
        { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
-                       NULL, reset_val, c1_SCTLR, 0x00C50878 },
+                       access_sctlr, reset_val, c1_SCTLR, 0x00C50878 },
 };
 
 static struct kvm_coproc_target_table a7_target_table = {
index 2786eae..b23a59c 100644 (file)
@@ -38,6 +38,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
+       vcpu->arch.hcr = HCR_GUEST_MASK;
        return 0;
 }
 
index 6f18695..76af930 100644 (file)
@@ -303,13 +303,17 @@ vcpu      .req    r0              @ vcpu pointer always in r0
 
        mrc     p15, 0, r2, c14, c1, 0  @ CNTKCTL
        mrrc    p15, 0, r4, r5, c7      @ PAR
+       mrc     p15, 0, r6, c10, c3, 0  @ AMAIR0
+       mrc     p15, 0, r7, c10, c3, 1  @ AMAIR1
 
        .if \store_to_vcpu == 0
-       push    {r2,r4-r5}
+       push    {r2,r4-r7}
        .else
        str     r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)]
        add     r12, vcpu, #CP15_OFFSET(c7_PAR)
        strd    r4, r5, [r12]
+       str     r6, [vcpu, #CP15_OFFSET(c10_AMAIR0)]
+       str     r7, [vcpu, #CP15_OFFSET(c10_AMAIR1)]
        .endif
 .endm
 
@@ -322,15 +326,19 @@ vcpu      .req    r0              @ vcpu pointer always in r0
  */
 .macro write_cp15_state read_from_vcpu
        .if \read_from_vcpu == 0
-       pop     {r2,r4-r5}
+       pop     {r2,r4-r7}
        .else
        ldr     r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)]
        add     r12, vcpu, #CP15_OFFSET(c7_PAR)
        ldrd    r4, r5, [r12]
+       ldr     r6, [vcpu, #CP15_OFFSET(c10_AMAIR0)]
+       ldr     r7, [vcpu, #CP15_OFFSET(c10_AMAIR1)]
        .endif
 
        mcr     p15, 0, r2, c14, c1, 0  @ CNTKCTL
        mcrr    p15, 0, r4, r5, c7      @ PAR
+       mcr     p15, 0, r6, c10, c3, 0  @ AMAIR0
+       mcr     p15, 0, r7, c10, c3, 1  @ AMAIR1
 
        .if \read_from_vcpu == 0
        pop     {r2-r12}
@@ -597,17 +605,14 @@ vcpu      .req    r0              @ vcpu pointer always in r0
 
 /* Enable/Disable: stage-2 trans., trap interrupts, trap wfi, trap smc */
 .macro configure_hyp_role operation
-       mrc     p15, 4, r2, c1, c1, 0   @ HCR
-       bic     r2, r2, #HCR_VIRT_EXCP_MASK
-       ldr     r3, =HCR_GUEST_MASK
        .if \operation == vmentry
-       orr     r2, r2, r3
+       ldr     r2, [vcpu, #VCPU_HCR]
        ldr     r3, [vcpu, #VCPU_IRQ_LINES]
        orr     r2, r2, r3
        .else
-       bic     r2, r2, r3
+       mov     r2, #0
        .endif
-       mcr     p15, 4, r2, c1, c1, 0
+       mcr     p15, 4, r2, c1, c1, 0   @ HCR
 .endm
 
 .macro load_vcpu
index 7789857..80bb1e6 100644 (file)
@@ -144,8 +144,9 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
        while (addr < end) {
                pgd = pgdp + pgd_index(addr);
                pud = pud_offset(pgd, addr);
+               pte = NULL;
                if (pud_none(*pud)) {
-                       addr = pud_addr_end(addr, end);
+                       addr = kvm_pud_addr_end(addr, end);
                        continue;
                }
 
@@ -155,13 +156,13 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
                         * move on.
                         */
                        clear_pud_entry(kvm, pud, addr);
-                       addr = pud_addr_end(addr, end);
+                       addr = kvm_pud_addr_end(addr, end);
                        continue;
                }
 
                pmd = pmd_offset(pud, addr);
                if (pmd_none(*pmd)) {
-                       addr = pmd_addr_end(addr, end);
+                       addr = kvm_pmd_addr_end(addr, end);
                        continue;
                }
 
@@ -174,12 +175,12 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
                /*
                 * If the pmd entry is to be cleared, walk back up the ladder
                 */
-               if (kvm_pmd_huge(*pmd) || page_empty(pte)) {
+               if (kvm_pmd_huge(*pmd) || (pte && page_empty(pte))) {
                        clear_pmd_entry(kvm, pmd, addr);
-                       next = pmd_addr_end(addr, end);
+                       next = kvm_pmd_addr_end(addr, end);
                        if (page_empty(pmd) && !page_empty(pud)) {
                                clear_pud_entry(kvm, pud, addr);
-                               next = pud_addr_end(addr, end);
+                               next = kvm_pud_addr_end(addr, end);
                        }
                }
 
@@ -187,6 +188,99 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
        }
 }
 
+static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
+                             phys_addr_t addr, phys_addr_t end)
+{
+       pte_t *pte;
+
+       pte = pte_offset_kernel(pmd, addr);
+       do {
+               if (!pte_none(*pte)) {
+                       hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT);
+                       kvm_flush_dcache_to_poc((void*)hva, PAGE_SIZE);
+               }
+       } while (pte++, addr += PAGE_SIZE, addr != end);
+}
+
+static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
+                             phys_addr_t addr, phys_addr_t end)
+{
+       pmd_t *pmd;
+       phys_addr_t next;
+
+       pmd = pmd_offset(pud, addr);
+       do {
+               next = kvm_pmd_addr_end(addr, end);
+               if (!pmd_none(*pmd)) {
+                       if (kvm_pmd_huge(*pmd)) {
+                               hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT);
+                               kvm_flush_dcache_to_poc((void*)hva, PMD_SIZE);
+                       } else {
+                               stage2_flush_ptes(kvm, pmd, addr, next);
+                       }
+               }
+       } while (pmd++, addr = next, addr != end);
+}
+
+static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
+                             phys_addr_t addr, phys_addr_t end)
+{
+       pud_t *pud;
+       phys_addr_t next;
+
+       pud = pud_offset(pgd, addr);
+       do {
+               next = kvm_pud_addr_end(addr, end);
+               if (!pud_none(*pud)) {
+                       if (pud_huge(*pud)) {
+                               hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT);
+                               kvm_flush_dcache_to_poc((void*)hva, PUD_SIZE);
+                       } else {
+                               stage2_flush_pmds(kvm, pud, addr, next);
+                       }
+               }
+       } while (pud++, addr = next, addr != end);
+}
+
+static void stage2_flush_memslot(struct kvm *kvm,
+                                struct kvm_memory_slot *memslot)
+{
+       phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
+       phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
+       phys_addr_t next;
+       pgd_t *pgd;
+
+       pgd = kvm->arch.pgd + pgd_index(addr);
+       do {
+               next = kvm_pgd_addr_end(addr, end);
+               stage2_flush_puds(kvm, pgd, addr, next);
+       } while (pgd++, addr = next, addr != end);
+}
+
+/**
+ * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
+ * @kvm: The struct kvm pointer
+ *
+ * Go through the stage 2 page tables and invalidate any cache lines
+ * backing memory already mapped to the VM.
+ */
+void stage2_flush_vm(struct kvm *kvm)
+{
+       struct kvm_memslots *slots;
+       struct kvm_memory_slot *memslot;
+       int idx;
+
+       idx = srcu_read_lock(&kvm->srcu);
+       spin_lock(&kvm->mmu_lock);
+
+       slots = kvm_memslots(kvm);
+       kvm_for_each_memslot(memslot, slots)
+               stage2_flush_memslot(kvm, memslot);
+
+       spin_unlock(&kvm->mmu_lock);
+       srcu_read_unlock(&kvm->srcu, idx);
+}
+
 /**
  * free_boot_hyp_pgd - free HYP boot page tables
  *
@@ -715,7 +809,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                        kvm_set_s2pmd_writable(&new_pmd);
                        kvm_set_pfn_dirty(pfn);
                }
-               coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE);
+               coherent_cache_guest_page(vcpu, hva & PMD_MASK, PMD_SIZE);
                ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
        } else {
                pte_t new_pte = pfn_pte(pfn, PAGE_S2);
@@ -723,7 +817,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                        kvm_set_s2pte_writable(&new_pte);
                        kvm_set_pfn_dirty(pfn);
                }
-               coherent_icache_guest_page(kvm, hva, PAGE_SIZE);
+               coherent_cache_guest_page(vcpu, hva, PAGE_SIZE);
                ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
        }
 
index 21ef48d..3d69030 100644 (file)
@@ -62,6 +62,7 @@
  * RW:         64bit by default, can be overriden for 32bit VMs
  * TAC:                Trap ACTLR
  * TSC:                Trap SMC
+ * TVM:                Trap VM ops (until M+C set in SCTLR_EL1)
  * TSW:                Trap cache operations by set/way
  * TWE:                Trap WFE
  * TWI:                Trap WFI
@@ -74,7 +75,7 @@
  * SWIO:       Turn set/way invalidates into set/way clean+invalidate
  */
 #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
-                        HCR_BSU_IS | HCR_FB | HCR_TAC | \
+                        HCR_TVM | HCR_BSU_IS | HCR_FB | HCR_TAC | \
                         HCR_AMO | HCR_IMO | HCR_FMO | \
                         HCR_SWIO | HCR_TIDCP | HCR_RW)
 #define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF)
index b25763b..9fcd54b 100644 (file)
@@ -79,7 +79,8 @@
 #define c13_TID_URW    (TPIDR_EL0 * 2) /* Thread ID, User R/W */
 #define c13_TID_URO    (TPIDRRO_EL0 * 2)/* Thread ID, User R/O */
 #define c13_TID_PRIV   (TPIDR_EL1 * 2) /* Thread ID, Privileged */
-#define c10_AMAIR      (AMAIR_EL1 * 2) /* Aux Memory Attr Indirection Reg */
+#define c10_AMAIR0     (AMAIR_EL1 * 2) /* Aux Memory Attr Indirection Reg */
+#define c10_AMAIR1     (c10_AMAIR0 + 1)/* Aux Memory Attr Indirection Reg */
 #define c14_CNTKCTL    (CNTKCTL_EL1 * 2) /* Timer Control Register (PL1) */
 #define NR_CP15_REGS   (NR_SYS_REGS * 2)
 
index 7f1f940..7d29847 100644 (file)
@@ -106,7 +106,6 @@ static inline bool kvm_is_write_fault(unsigned long esr)
        return true;
 }
 
-static inline void kvm_clean_dcache_area(void *addr, size_t size) {}
 static inline void kvm_clean_pgd(pgd_t *pgd) {}
 static inline void kvm_clean_pmd_entry(pmd_t *pmd) {}
 static inline void kvm_clean_pte(pte_t *pte) {}
@@ -122,11 +121,25 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
        pmd_val(*pmd) |= PMD_S2_RDWR;
 }
 
+#define kvm_pgd_addr_end(addr, end)    pgd_addr_end(addr, end)
+#define kvm_pud_addr_end(addr, end)    pud_addr_end(addr, end)
+#define kvm_pmd_addr_end(addr, end)    pmd_addr_end(addr, end)
+
 struct kvm;
 
-static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
-                                             unsigned long size)
+#define kvm_flush_dcache_to_poc(a,l)   __flush_dcache_area((a), (l))
+
+static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 {
+       return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
+}
+
+static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
+                                            unsigned long size)
+{
+       if (!vcpu_has_cache_enabled(vcpu))
+               kvm_flush_dcache_to_poc((void *)hva, size);
+
        if (!icache_is_aliasing()) {            /* PIPT */
                flush_icache_range(hva, hva + size);
        } else if (!icache_is_aivivt()) {       /* non ASID-tagged VIVT */
@@ -135,8 +148,9 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
        }
 }
 
-#define kvm_flush_dcache_to_poc(a,l)   __flush_dcache_area((a), (l))
 #define kvm_virt_to_phys(x)            __virt_to_phys((unsigned long)(x))
 
+void stage2_flush_vm(struct kvm *kvm);
+
 #endif /* __ASSEMBLY__ */
 #endif /* __ARM64_KVM_MMU_H__ */
index 02e9d09..0324458 100644 (file)
@@ -27,6 +27,7 @@
 #include <asm/kvm_host.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_coproc.h>
+#include <asm/kvm_mmu.h>
 #include <asm/cacheflush.h>
 #include <asm/cputype.h>
 #include <trace/events/kvm.h>
@@ -120,6 +121,48 @@ done:
        return true;
 }
 
+/*
+ * Generic accessor for VM registers. Only called as long as HCR_TVM
+ * is set.
+ */
+static bool access_vm_reg(struct kvm_vcpu *vcpu,
+                         const struct sys_reg_params *p,
+                         const struct sys_reg_desc *r)
+{
+       unsigned long val;
+
+       BUG_ON(!p->is_write);
+
+       val = *vcpu_reg(vcpu, p->Rt);
+       if (!p->is_aarch32) {
+               vcpu_sys_reg(vcpu, r->reg) = val;
+       } else {
+               vcpu_cp15(vcpu, r->reg) = val & 0xffffffffUL;
+               if (!p->is_32bit)
+                       vcpu_cp15(vcpu, r->reg + 1) = val >> 32;
+       }
+       return true;
+}
+
+/*
+ * SCTLR_EL1 accessor. Only called as long as HCR_TVM is set.  If the
+ * guest enables the MMU, we stop trapping the VM sys_regs and leave
+ * it in complete control of the caches.
+ */
+static bool access_sctlr(struct kvm_vcpu *vcpu,
+                        const struct sys_reg_params *p,
+                        const struct sys_reg_desc *r)
+{
+       access_vm_reg(vcpu, p, r);
+
+       if (vcpu_has_cache_enabled(vcpu)) {     /* MMU+Caches enabled? */
+               vcpu->arch.hcr_el2 &= ~HCR_TVM;
+               stage2_flush_vm(vcpu->kvm);
+       }
+
+       return true;
+}
+
 /*
  * We could trap ID_DFR0 and tell the guest we don't support performance
  * monitoring.  Unfortunately the patch to make the kernel check ID_DFR0 was
@@ -185,32 +228,32 @@ static const struct sys_reg_desc sys_reg_descs[] = {
          NULL, reset_mpidr, MPIDR_EL1 },
        /* SCTLR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b000),
-         NULL, reset_val, SCTLR_EL1, 0x00C50078 },
+         access_sctlr, reset_val, SCTLR_EL1, 0x00C50078 },
        /* CPACR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b010),
          NULL, reset_val, CPACR_EL1, 0 },
        /* TTBR0_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b000),
-         NULL, reset_unknown, TTBR0_EL1 },
+         access_vm_reg, reset_unknown, TTBR0_EL1 },
        /* TTBR1_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b001),
-         NULL, reset_unknown, TTBR1_EL1 },
+         access_vm_reg, reset_unknown, TTBR1_EL1 },
        /* TCR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b010),
-         NULL, reset_val, TCR_EL1, 0 },
+         access_vm_reg, reset_val, TCR_EL1, 0 },
 
        /* AFSR0_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0001), Op2(0b000),
-         NULL, reset_unknown, AFSR0_EL1 },
+         access_vm_reg, reset_unknown, AFSR0_EL1 },
        /* AFSR1_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0001), Op2(0b001),
-         NULL, reset_unknown, AFSR1_EL1 },
+         access_vm_reg, reset_unknown, AFSR1_EL1 },
        /* ESR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0010), Op2(0b000),
-         NULL, reset_unknown, ESR_EL1 },
+         access_vm_reg, reset_unknown, ESR_EL1 },
        /* FAR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0110), CRm(0b0000), Op2(0b000),
-         NULL, reset_unknown, FAR_EL1 },
+         access_vm_reg, reset_unknown, FAR_EL1 },
        /* PAR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b0111), CRm(0b0100), Op2(0b000),
          NULL, reset_unknown, PAR_EL1 },
@@ -224,17 +267,17 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 
        /* MAIR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0010), Op2(0b000),
-         NULL, reset_unknown, MAIR_EL1 },
+         access_vm_reg, reset_unknown, MAIR_EL1 },
        /* AMAIR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0011), Op2(0b000),
-         NULL, reset_amair_el1, AMAIR_EL1 },
+         access_vm_reg, reset_amair_el1, AMAIR_EL1 },
 
        /* VBAR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b0000), Op2(0b000),
          NULL, reset_val, VBAR_EL1, 0 },
        /* CONTEXTIDR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b001),
-         NULL, reset_val, CONTEXTIDR_EL1, 0 },
+         access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 },
        /* TPIDR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b100),
          NULL, reset_unknown, TPIDR_EL1 },
@@ -305,14 +348,32 @@ static const struct sys_reg_desc sys_reg_descs[] = {
          NULL, reset_val, FPEXC32_EL2, 0x70 },
 };
 
-/* Trapped cp15 registers */
+/*
+ * Trapped cp15 registers. TTBR0/TTBR1 get a double encoding,
+ * depending on the way they are accessed (as a 32bit or a 64bit
+ * register).
+ */
 static const struct sys_reg_desc cp15_regs[] = {
+       { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR0 },
+       { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_sctlr, NULL, c1_SCTLR },
+       { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 },
+       { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 },
+       { Op1( 0), CRn( 2), CRm( 0), Op2( 2), access_vm_reg, NULL, c2_TTBCR },
+       { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, c3_DACR },
+       { Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, c5_DFSR },
+       { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, c5_IFSR },
+       { Op1( 0), CRn( 5), CRm( 1), Op2( 0), access_vm_reg, NULL, c5_ADFSR },
+       { Op1( 0), CRn( 5), CRm( 1), Op2( 1), access_vm_reg, NULL, c5_AIFSR },
+       { Op1( 0), CRn( 6), CRm( 0), Op2( 0), access_vm_reg, NULL, c6_DFAR },
+       { Op1( 0), CRn( 6), CRm( 0), Op2( 2), access_vm_reg, NULL, c6_IFAR },
+
        /*
         * DC{C,I,CI}SW operations:
         */
        { Op1( 0), CRn( 7), CRm( 6), Op2( 2), access_dcsw },
        { Op1( 0), CRn( 7), CRm(10), Op2( 2), access_dcsw },
        { Op1( 0), CRn( 7), CRm(14), Op2( 2), access_dcsw },
+
        { Op1( 0), CRn( 9), CRm(12), Op2( 0), pm_fake },
        { Op1( 0), CRn( 9), CRm(12), Op2( 1), pm_fake },
        { Op1( 0), CRn( 9), CRm(12), Op2( 2), pm_fake },
@@ -326,6 +387,14 @@ static const struct sys_reg_desc cp15_regs[] = {
        { Op1( 0), CRn( 9), CRm(14), Op2( 0), pm_fake },
        { Op1( 0), CRn( 9), CRm(14), Op2( 1), pm_fake },
        { Op1( 0), CRn( 9), CRm(14), Op2( 2), pm_fake },
+
+       { Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, c10_PRRR },
+       { Op1( 0), CRn(10), CRm( 2), Op2( 1), access_vm_reg, NULL, c10_NMRR },
+       { Op1( 0), CRn(10), CRm( 3), Op2( 0), access_vm_reg, NULL, c10_AMAIR0 },
+       { Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, c10_AMAIR1 },
+       { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID },
+
+       { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 },
 };
 
 /* Target specific emulation tables */
@@ -437,6 +506,8 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run)
        u32 hsr = kvm_vcpu_get_hsr(vcpu);
        int Rt2 = (hsr >> 10) & 0xf;
 
+       params.is_aarch32 = true;
+       params.is_32bit = false;
        params.CRm = (hsr >> 1) & 0xf;
        params.Rt = (hsr >> 5) & 0xf;
        params.is_write = ((hsr & 1) == 0);
@@ -480,6 +551,8 @@ int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run)
        struct sys_reg_params params;
        u32 hsr = kvm_vcpu_get_hsr(vcpu);
 
+       params.is_aarch32 = true;
+       params.is_32bit = true;
        params.CRm = (hsr >> 1) & 0xf;
        params.Rt  = (hsr >> 5) & 0xf;
        params.is_write = ((hsr & 1) == 0);
@@ -549,6 +622,8 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run)
        struct sys_reg_params params;
        unsigned long esr = kvm_vcpu_get_hsr(vcpu);
 
+       params.is_aarch32 = false;
+       params.is_32bit = false;
        params.Op0 = (esr >> 20) & 3;
        params.Op1 = (esr >> 14) & 0x7;
        params.CRn = (esr >> 10) & 0xf;
index d50d372..d411e25 100644 (file)
@@ -30,6 +30,8 @@ struct sys_reg_params {
        u8      Op2;
        u8      Rt;
        bool    is_write;
+       bool    is_aarch32;
+       bool    is_32bit;       /* Only valid if is_aarch32 is true */
 };
 
 struct sys_reg_desc {
index 53f44be..6a4309b 100644 (file)
@@ -199,6 +199,7 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_IRQCHIP:
        case KVM_CAP_MP_STATE:
        case KVM_CAP_IRQ_INJECT_STATUS:
+       case KVM_CAP_IOAPIC_POLARITY_IGNORED:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
index a995fce..060aaa6 100644 (file)
 
 
 /* Special address that contains the comm page, used for reducing # of traps */
-#define KVM_GUEST_COMMPAGE_ADDR     0x0
+#define KVM_GUEST_COMMPAGE_ADDR                0x0
 
 #define KVM_GUEST_KERNEL_MODE(vcpu)    ((kvm_read_c0_guest_status(vcpu->arch.cop0) & (ST0_EXL | ST0_ERL)) || \
                                        ((kvm_read_c0_guest_status(vcpu->arch.cop0) & KSU_USER) == 0))
 
-#define KVM_GUEST_KUSEG             0x00000000UL
-#define KVM_GUEST_KSEG0             0x40000000UL
-#define KVM_GUEST_KSEG23            0x60000000UL
-#define KVM_GUEST_KSEGX(a)          ((_ACAST32_(a)) & 0x60000000)
-#define KVM_GUEST_CPHYSADDR(a)      ((_ACAST32_(a)) & 0x1fffffff)
+#define KVM_GUEST_KUSEG                        0x00000000UL
+#define KVM_GUEST_KSEG0                        0x40000000UL
+#define KVM_GUEST_KSEG23               0x60000000UL
+#define KVM_GUEST_KSEGX(a)             ((_ACAST32_(a)) & 0x60000000)
+#define KVM_GUEST_CPHYSADDR(a)         ((_ACAST32_(a)) & 0x1fffffff)
 
 #define KVM_GUEST_CKSEG0ADDR(a)                (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG0)
 #define KVM_GUEST_CKSEG1ADDR(a)                (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG1)
 #define KVM_GUEST_KSEG1ADDR(a)         (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG1)
 #define KVM_GUEST_KSEG23ADDR(a)                (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG23)
 
-#define KVM_INVALID_PAGE            0xdeadbeef
-#define KVM_INVALID_INST            0xdeadbeef
-#define KVM_INVALID_ADDR            0xdeadbeef
+#define KVM_INVALID_PAGE               0xdeadbeef
+#define KVM_INVALID_INST               0xdeadbeef
+#define KVM_INVALID_ADDR               0xdeadbeef
 
-#define KVM_MALTA_GUEST_RTC_ADDR    0xb8000070UL
+#define KVM_MALTA_GUEST_RTC_ADDR       0xb8000070UL
 
-#define GUEST_TICKS_PER_JIFFY (40000000/HZ)
-#define MS_TO_NS(x) (x * 1E6L)
+#define GUEST_TICKS_PER_JIFFY          (40000000/HZ)
+#define MS_TO_NS(x)                    (x * 1E6L)
 
-#define CAUSEB_DC       27
-#define CAUSEF_DC       (_ULCAST_(1)   << 27)
+#define CAUSEB_DC                      27
+#define CAUSEF_DC                      (_ULCAST_(1) << 27)
 
 struct kvm;
 struct kvm_run;
@@ -126,8 +126,8 @@ struct kvm_arch {
        int commpage_tlb;
 };
 
-#define N_MIPS_COPROC_REGS      32
-#define N_MIPS_COPROC_SEL      8
+#define N_MIPS_COPROC_REGS     32
+#define N_MIPS_COPROC_SEL      8
 
 struct mips_coproc {
        unsigned long reg[N_MIPS_COPROC_REGS][N_MIPS_COPROC_SEL];
@@ -139,124 +139,124 @@ struct mips_coproc {
 /*
  * Coprocessor 0 register names
  */
-#define        MIPS_CP0_TLB_INDEX          0
-#define        MIPS_CP0_TLB_RANDOM         1
-#define        MIPS_CP0_TLB_LOW            2
-#define        MIPS_CP0_TLB_LO0            2
-#define        MIPS_CP0_TLB_LO1            3
-#define        MIPS_CP0_TLB_CONTEXT    4
-#define        MIPS_CP0_TLB_PG_MASK    5
-#define        MIPS_CP0_TLB_WIRED          6
-#define        MIPS_CP0_HWRENA             7
-#define        MIPS_CP0_BAD_VADDR          8
-#define        MIPS_CP0_COUNT          9
-#define        MIPS_CP0_TLB_HI         10
-#define        MIPS_CP0_COMPARE            11
-#define        MIPS_CP0_STATUS         12
-#define        MIPS_CP0_CAUSE          13
-#define        MIPS_CP0_EXC_PC         14
-#define        MIPS_CP0_PRID               15
-#define        MIPS_CP0_CONFIG         16
-#define        MIPS_CP0_LLADDR         17
-#define        MIPS_CP0_WATCH_LO           18
-#define        MIPS_CP0_WATCH_HI           19
-#define        MIPS_CP0_TLB_XCONTEXT   20
-#define        MIPS_CP0_ECC                26
-#define        MIPS_CP0_CACHE_ERR          27
-#define        MIPS_CP0_TAG_LO         28
-#define        MIPS_CP0_TAG_HI         29
-#define        MIPS_CP0_ERROR_PC           30
-#define        MIPS_CP0_DEBUG          23
-#define        MIPS_CP0_DEPC               24
-#define        MIPS_CP0_PERFCNT            25
-#define        MIPS_CP0_ERRCTL         26
-#define        MIPS_CP0_DATA_LO            28
-#define        MIPS_CP0_DATA_HI            29
-#define        MIPS_CP0_DESAVE         31
-
-#define MIPS_CP0_CONFIG_SEL        0
-#define MIPS_CP0_CONFIG1_SEL    1
-#define MIPS_CP0_CONFIG2_SEL    2
-#define MIPS_CP0_CONFIG3_SEL    3
+#define MIPS_CP0_TLB_INDEX     0
+#define MIPS_CP0_TLB_RANDOM    1
+#define MIPS_CP0_TLB_LOW       2
+#define MIPS_CP0_TLB_LO0       2
+#define MIPS_CP0_TLB_LO1       3
+#define MIPS_CP0_TLB_CONTEXT   4
+#define MIPS_CP0_TLB_PG_MASK   5
+#define MIPS_CP0_TLB_WIRED     6
+#define MIPS_CP0_HWRENA                7
+#define MIPS_CP0_BAD_VADDR     8
+#define MIPS_CP0_COUNT         9
+#define MIPS_CP0_TLB_HI                10
+#define MIPS_CP0_COMPARE       11
+#define MIPS_CP0_STATUS                12
+#define MIPS_CP0_CAUSE         13
+#define MIPS_CP0_EXC_PC                14
+#define MIPS_CP0_PRID          15
+#define MIPS_CP0_CONFIG                16
+#define MIPS_CP0_LLADDR                17
+#define MIPS_CP0_WATCH_LO      18
+#define MIPS_CP0_WATCH_HI      19
+#define MIPS_CP0_TLB_XCONTEXT  20
+#define MIPS_CP0_ECC           26
+#define MIPS_CP0_CACHE_ERR     27
+#define MIPS_CP0_TAG_LO                28
+#define MIPS_CP0_TAG_HI                29
+#define MIPS_CP0_ERROR_PC      30
+#define MIPS_CP0_DEBUG         23
+#define MIPS_CP0_DEPC          24
+#define MIPS_CP0_PERFCNT       25
+#define MIPS_CP0_ERRCTL                26
+#define MIPS_CP0_DATA_LO       28
+#define MIPS_CP0_DATA_HI       29
+#define MIPS_CP0_DESAVE                31
+
+#define MIPS_CP0_CONFIG_SEL    0
+#define MIPS_CP0_CONFIG1_SEL   1
+#define MIPS_CP0_CONFIG2_SEL   2
+#define MIPS_CP0_CONFIG3_SEL   3
 
 /* Config0 register bits */
-#define CP0C0_M    31
-#define CP0C0_K23  28
-#define CP0C0_KU   25
-#define CP0C0_MDU  20
-#define CP0C0_MM   17
-#define CP0C0_BM   16
-#define CP0C0_BE   15
-#define CP0C0_AT   13
-#define CP0C0_AR   10
-#define CP0C0_MT   7
-#define CP0C0_VI   3
-#define CP0C0_K0   0
+#define CP0C0_M                        31
+#define CP0C0_K23              28
+#define CP0C0_KU               25
+#define CP0C0_MDU              20
+#define CP0C0_MM               17
+#define CP0C0_BM               16
+#define CP0C0_BE               15
+#define CP0C0_AT               13
+#define CP0C0_AR               10
+#define CP0C0_MT               7
+#define CP0C0_VI               3
+#define CP0C0_K0               0
 
 /* Config1 register bits */
-#define CP0C1_M    31
-#define CP0C1_MMU  25
-#define CP0C1_IS   22
-#define CP0C1_IL   19
-#define CP0C1_IA   16
-#define CP0C1_DS   13
-#define CP0C1_DL   10
-#define CP0C1_DA   7
-#define CP0C1_C2   6
-#define CP0C1_MD   5
-#define CP0C1_PC   4
-#define CP0C1_WR   3
-#define CP0C1_CA   2
-#define CP0C1_EP   1
-#define CP0C1_FP   0
+#define CP0C1_M                        31
+#define CP0C1_MMU              25
+#define CP0C1_IS               22
+#define CP0C1_IL               19
+#define CP0C1_IA               16
+#define CP0C1_DS               13
+#define CP0C1_DL               10
+#define CP0C1_DA               7
+#define CP0C1_C2               6
+#define CP0C1_MD               5
+#define CP0C1_PC               4
+#define CP0C1_WR               3
+#define CP0C1_CA               2
+#define CP0C1_EP               1
+#define CP0C1_FP               0
 
 /* Config2 Register bits */
-#define CP0C2_M    31
-#define CP0C2_TU   28
-#define CP0C2_TS   24
-#define CP0C2_TL   20
-#define CP0C2_TA   16
-#define CP0C2_SU   12
-#define CP0C2_SS   8
-#define CP0C2_SL   4
-#define CP0C2_SA   0
+#define CP0C2_M                        31
+#define CP0C2_TU               28
+#define CP0C2_TS               24
+#define CP0C2_TL               20
+#define CP0C2_TA               16
+#define CP0C2_SU               12
+#define CP0C2_SS               8
+#define CP0C2_SL               4
+#define CP0C2_SA               0
 
 /* Config3 Register bits */
-#define CP0C3_M    31
-#define CP0C3_ISA_ON_EXC 16
-#define CP0C3_ULRI  13
-#define CP0C3_DSPP 10
-#define CP0C3_LPA  7
-#define CP0C3_VEIC 6
-#define CP0C3_VInt 5
-#define CP0C3_SP   4
-#define CP0C3_MT   2
-#define CP0C3_SM   1
-#define CP0C3_TL   0
+#define CP0C3_M                        31
+#define CP0C3_ISA_ON_EXC       16
+#define CP0C3_ULRI             13
+#define CP0C3_DSPP             10
+#define CP0C3_LPA              7
+#define CP0C3_VEIC             6
+#define CP0C3_VInt             5
+#define CP0C3_SP               4
+#define CP0C3_MT               2
+#define CP0C3_SM               1
+#define CP0C3_TL               0
 
 /* Have config1, Cacheable, noncoherent, write-back, write allocate*/
-#define MIPS_CONFIG0                                              \
+#define MIPS_CONFIG0                                           \
   ((1 << CP0C0_M) | (0x3 << CP0C0_K0))
 
 /* Have config2, no coprocessor2 attached, no MDMX support attached,
    no performance counters, watch registers present,
    no code compression, EJTAG present, no FPU, no watch registers */
-#define MIPS_CONFIG1                                              \
-((1 << CP0C1_M) |                                                 \
- (0 << CP0C1_C2) | (0 << CP0C1_MD) | (0 << CP0C1_PC) |            \
- (0 << CP0C1_WR) | (0 << CP0C1_CA) | (1 << CP0C1_EP) |            \
+#define MIPS_CONFIG1                                           \
+((1 << CP0C1_M) |                                              \
+ (0 << CP0C1_C2) | (0 << CP0C1_MD) | (0 << CP0C1_PC) |         \
+ (0 << CP0C1_WR) | (0 << CP0C1_CA) | (1 << CP0C1_EP) |         \
  (0 << CP0C1_FP))
 
 /* Have config3, no tertiary/secondary caches implemented */
-#define MIPS_CONFIG2                                              \
+#define MIPS_CONFIG2                                           \
 ((1 << CP0C2_M))
 
 /* No config4, no DSP ASE, no large physaddr (PABITS),
    no external interrupt controller, no vectored interrupts,
    no 1kb pages, no SmartMIPS ASE, no trace logic */
-#define MIPS_CONFIG3                                              \
-((0 << CP0C3_M) | (0 << CP0C3_DSPP) | (0 << CP0C3_LPA) |          \
- (0 << CP0C3_VEIC) | (0 << CP0C3_VInt) | (0 << CP0C3_SP) |        \
+#define MIPS_CONFIG3                                           \
+((0 << CP0C3_M) | (0 << CP0C3_DSPP) | (0 << CP0C3_LPA) |       \
+ (0 << CP0C3_VEIC) | (0 << CP0C3_VInt) | (0 << CP0C3_SP) |     \
  (0 << CP0C3_SM) | (0 << CP0C3_TL))
 
 /* MMU types, the first four entries have the same layout as the
@@ -274,36 +274,36 @@ enum mips_mmu_types {
 /*
  * Trap codes
  */
-#define T_INT           0      /* Interrupt pending */
-#define T_TLB_MOD       1      /* TLB modified fault */
-#define T_TLB_LD_MISS       2  /* TLB miss on load or ifetch */
-#define T_TLB_ST_MISS       3  /* TLB miss on a store */
-#define T_ADDR_ERR_LD       4  /* Address error on a load or ifetch */
-#define T_ADDR_ERR_ST       5  /* Address error on a store */
-#define T_BUS_ERR_IFETCH    6  /* Bus error on an ifetch */
-#define T_BUS_ERR_LD_ST     7  /* Bus error on a load or store */
-#define T_SYSCALL       8      /* System call */
-#define T_BREAK         9      /* Breakpoint */
-#define T_RES_INST      10     /* Reserved instruction exception */
-#define T_COP_UNUSABLE      11 /* Coprocessor unusable */
-#define T_OVFLOW        12     /* Arithmetic overflow */
+#define T_INT                  0       /* Interrupt pending */
+#define T_TLB_MOD              1       /* TLB modified fault */
+#define T_TLB_LD_MISS          2       /* TLB miss on load or ifetch */
+#define T_TLB_ST_MISS          3       /* TLB miss on a store */
+#define T_ADDR_ERR_LD          4       /* Address error on a load or ifetch */
+#define T_ADDR_ERR_ST          5       /* Address error on a store */
+#define T_BUS_ERR_IFETCH       6       /* Bus error on an ifetch */
+#define T_BUS_ERR_LD_ST                7       /* Bus error on a load or store */
+#define T_SYSCALL              8       /* System call */
+#define T_BREAK                        9       /* Breakpoint */
+#define T_RES_INST             10      /* Reserved instruction exception */
+#define T_COP_UNUSABLE         11      /* Coprocessor unusable */
+#define T_OVFLOW               12      /* Arithmetic overflow */
 
 /*
  * Trap definitions added for r4000 port.
  */
-#define T_TRAP          13     /* Trap instruction */
-#define T_VCEI          14     /* Virtual coherency exception */
-#define T_FPE           15     /* Floating point exception */
-#define T_WATCH         23     /* Watch address reference */
-#define T_VCED          31     /* Virtual coherency data */
+#define T_TRAP                 13      /* Trap instruction */
+#define T_VCEI                 14      /* Virtual coherency exception */
+#define T_FPE                  15      /* Floating point exception */
+#define T_WATCH                        23      /* Watch address reference */
+#define T_VCED                 31      /* Virtual coherency data */
 
 /* Resume Flags */
-#define RESUME_FLAG_DR          (1<<0) /* Reload guest nonvolatile state? */
-#define RESUME_FLAG_HOST        (1<<1) /* Resume host? */
+#define RESUME_FLAG_DR         (1<<0)  /* Reload guest nonvolatile state? */
+#define RESUME_FLAG_HOST       (1<<1)  /* Resume host? */
 
-#define RESUME_GUEST            0
-#define RESUME_GUEST_DR         RESUME_FLAG_DR
-#define RESUME_HOST             RESUME_FLAG_HOST
+#define RESUME_GUEST           0
+#define RESUME_GUEST_DR                RESUME_FLAG_DR
+#define RESUME_HOST            RESUME_FLAG_HOST
 
 enum emulation_result {
        EMULATE_DONE,           /* no further processing */
@@ -313,24 +313,27 @@ enum emulation_result {
        EMULATE_PRIV_FAIL,
 };
 
-#define MIPS3_PG_G  0x00000001 /* Global; ignore ASID if in lo0 & lo1 */
-#define MIPS3_PG_V  0x00000002 /* Valid */
-#define MIPS3_PG_NV 0x00000000
-#define MIPS3_PG_D  0x00000004 /* Dirty */
+#define MIPS3_PG_G     0x00000001 /* Global; ignore ASID if in lo0 & lo1 */
+#define MIPS3_PG_V     0x00000002 /* Valid */
+#define MIPS3_PG_NV    0x00000000
+#define MIPS3_PG_D     0x00000004 /* Dirty */
 
 #define mips3_paddr_to_tlbpfn(x) \
-    (((unsigned long)(x) >> MIPS3_PG_SHIFT) & MIPS3_PG_FRAME)
+       (((unsigned long)(x) >> MIPS3_PG_SHIFT) & MIPS3_PG_FRAME)
 #define mips3_tlbpfn_to_paddr(x) \
-    ((unsigned long)((x) & MIPS3_PG_FRAME) << MIPS3_PG_SHIFT)
+       ((unsigned long)((x) & MIPS3_PG_FRAME) << MIPS3_PG_SHIFT)
 
-#define MIPS3_PG_SHIFT      6
-#define MIPS3_PG_FRAME      0x3fffffc0
+#define MIPS3_PG_SHIFT         6
+#define MIPS3_PG_FRAME         0x3fffffc0
 
-#define VPN2_MASK           0xffffe000
-#define TLB_IS_GLOBAL(x)    (((x).tlb_lo0 & MIPS3_PG_G) && ((x).tlb_lo1 & MIPS3_PG_G))
-#define TLB_VPN2(x)         ((x).tlb_hi & VPN2_MASK)
-#define TLB_ASID(x)         ((x).tlb_hi & ASID_MASK)
-#define TLB_IS_VALID(x, va) (((va) & (1 << PAGE_SHIFT)) ? ((x).tlb_lo1 & MIPS3_PG_V) : ((x).tlb_lo0 & MIPS3_PG_V))
+#define VPN2_MASK              0xffffe000
+#define TLB_IS_GLOBAL(x)       (((x).tlb_lo0 & MIPS3_PG_G) &&  \
+                                ((x).tlb_lo1 & MIPS3_PG_G))
+#define TLB_VPN2(x)            ((x).tlb_hi & VPN2_MASK)
+#define TLB_ASID(x)            ((x).tlb_hi & ASID_MASK)
+#define TLB_IS_VALID(x, va)    (((va) & (1 << PAGE_SHIFT))     \
+                                ? ((x).tlb_lo1 & MIPS3_PG_V)   \
+                                : ((x).tlb_lo0 & MIPS3_PG_V))
 
 struct kvm_mips_tlb {
        long tlb_mask;
@@ -339,7 +342,7 @@ struct kvm_mips_tlb {
        long tlb_lo1;
 };
 
-#define KVM_MIPS_GUEST_TLB_SIZE     64
+#define KVM_MIPS_GUEST_TLB_SIZE        64
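
Illustrative sketch (not part of the patch): the TLB_* helpers above are how emulation code probes the guest's software TLB, an array of KVM_MIPS_GUEST_TLB_SIZE struct kvm_mips_tlb entries. A simplified lookup, ignoring page-mask handling:

    /* Return the index of the entry matching (va, asid), or -1 if none. */
    static int guest_tlb_probe(const struct kvm_mips_tlb *tlb, int entries,
                               unsigned long va, unsigned long asid)
    {
            int i;

            for (i = 0; i < entries; i++) {
                    if (TLB_VPN2(tlb[i]) != (va & VPN2_MASK))
                            continue;
                    if (!TLB_IS_GLOBAL(tlb[i]) && TLB_ASID(tlb[i]) != asid)
                            continue;
                    /* Caller still has to check TLB_IS_VALID(tlb[i], va). */
                    return i;
            }
            return -1;
    }
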
 struct kvm_vcpu_arch {
        void *host_ebase, *guest_ebase;
        unsigned long host_stack;
@@ -400,65 +403,67 @@ struct kvm_vcpu_arch {
 };
 
 
-#define kvm_read_c0_guest_index(cop0)               (cop0->reg[MIPS_CP0_TLB_INDEX][0])
-#define kvm_write_c0_guest_index(cop0, val)         (cop0->reg[MIPS_CP0_TLB_INDEX][0] = val)
-#define kvm_read_c0_guest_entrylo0(cop0)            (cop0->reg[MIPS_CP0_TLB_LO0][0])
-#define kvm_read_c0_guest_entrylo1(cop0)            (cop0->reg[MIPS_CP0_TLB_LO1][0])
-#define kvm_read_c0_guest_context(cop0)             (cop0->reg[MIPS_CP0_TLB_CONTEXT][0])
-#define kvm_write_c0_guest_context(cop0, val)       (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val))
-#define kvm_read_c0_guest_userlocal(cop0)           (cop0->reg[MIPS_CP0_TLB_CONTEXT][2])
-#define kvm_read_c0_guest_pagemask(cop0)            (cop0->reg[MIPS_CP0_TLB_PG_MASK][0])
-#define kvm_write_c0_guest_pagemask(cop0, val)      (cop0->reg[MIPS_CP0_TLB_PG_MASK][0] = (val))
-#define kvm_read_c0_guest_wired(cop0)               (cop0->reg[MIPS_CP0_TLB_WIRED][0])
-#define kvm_write_c0_guest_wired(cop0, val)         (cop0->reg[MIPS_CP0_TLB_WIRED][0] = (val))
-#define kvm_read_c0_guest_badvaddr(cop0)            (cop0->reg[MIPS_CP0_BAD_VADDR][0])
-#define kvm_write_c0_guest_badvaddr(cop0, val)      (cop0->reg[MIPS_CP0_BAD_VADDR][0] = (val))
-#define kvm_read_c0_guest_count(cop0)               (cop0->reg[MIPS_CP0_COUNT][0])
-#define kvm_write_c0_guest_count(cop0, val)         (cop0->reg[MIPS_CP0_COUNT][0] = (val))
-#define kvm_read_c0_guest_entryhi(cop0)             (cop0->reg[MIPS_CP0_TLB_HI][0])
-#define kvm_write_c0_guest_entryhi(cop0, val)       (cop0->reg[MIPS_CP0_TLB_HI][0] = (val))
-#define kvm_read_c0_guest_compare(cop0)             (cop0->reg[MIPS_CP0_COMPARE][0])
-#define kvm_write_c0_guest_compare(cop0, val)       (cop0->reg[MIPS_CP0_COMPARE][0] = (val))
-#define kvm_read_c0_guest_status(cop0)              (cop0->reg[MIPS_CP0_STATUS][0])
-#define kvm_write_c0_guest_status(cop0, val)        (cop0->reg[MIPS_CP0_STATUS][0] = (val))
-#define kvm_read_c0_guest_intctl(cop0)              (cop0->reg[MIPS_CP0_STATUS][1])
-#define kvm_write_c0_guest_intctl(cop0, val)        (cop0->reg[MIPS_CP0_STATUS][1] = (val))
-#define kvm_read_c0_guest_cause(cop0)               (cop0->reg[MIPS_CP0_CAUSE][0])
-#define kvm_write_c0_guest_cause(cop0, val)         (cop0->reg[MIPS_CP0_CAUSE][0] = (val))
-#define kvm_read_c0_guest_epc(cop0)                 (cop0->reg[MIPS_CP0_EXC_PC][0])
-#define kvm_write_c0_guest_epc(cop0, val)           (cop0->reg[MIPS_CP0_EXC_PC][0] = (val))
-#define kvm_read_c0_guest_prid(cop0)                (cop0->reg[MIPS_CP0_PRID][0])
-#define kvm_write_c0_guest_prid(cop0, val)          (cop0->reg[MIPS_CP0_PRID][0] = (val))
-#define kvm_read_c0_guest_ebase(cop0)               (cop0->reg[MIPS_CP0_PRID][1])
-#define kvm_write_c0_guest_ebase(cop0, val)         (cop0->reg[MIPS_CP0_PRID][1] = (val))
-#define kvm_read_c0_guest_config(cop0)              (cop0->reg[MIPS_CP0_CONFIG][0])
-#define kvm_read_c0_guest_config1(cop0)             (cop0->reg[MIPS_CP0_CONFIG][1])
-#define kvm_read_c0_guest_config2(cop0)             (cop0->reg[MIPS_CP0_CONFIG][2])
-#define kvm_read_c0_guest_config3(cop0)             (cop0->reg[MIPS_CP0_CONFIG][3])
-#define kvm_read_c0_guest_config7(cop0)             (cop0->reg[MIPS_CP0_CONFIG][7])
-#define kvm_write_c0_guest_config(cop0, val)        (cop0->reg[MIPS_CP0_CONFIG][0] = (val))
-#define kvm_write_c0_guest_config1(cop0, val)       (cop0->reg[MIPS_CP0_CONFIG][1] = (val))
-#define kvm_write_c0_guest_config2(cop0, val)       (cop0->reg[MIPS_CP0_CONFIG][2] = (val))
-#define kvm_write_c0_guest_config3(cop0, val)       (cop0->reg[MIPS_CP0_CONFIG][3] = (val))
-#define kvm_write_c0_guest_config7(cop0, val)       (cop0->reg[MIPS_CP0_CONFIG][7] = (val))
-#define kvm_read_c0_guest_errorepc(cop0)            (cop0->reg[MIPS_CP0_ERROR_PC][0])
-#define kvm_write_c0_guest_errorepc(cop0, val)      (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val))
-
-#define kvm_set_c0_guest_status(cop0, val)          (cop0->reg[MIPS_CP0_STATUS][0] |= (val))
-#define kvm_clear_c0_guest_status(cop0, val)        (cop0->reg[MIPS_CP0_STATUS][0] &= ~(val))
-#define kvm_set_c0_guest_cause(cop0, val)           (cop0->reg[MIPS_CP0_CAUSE][0] |= (val))
-#define kvm_clear_c0_guest_cause(cop0, val)         (cop0->reg[MIPS_CP0_CAUSE][0] &= ~(val))
-#define kvm_change_c0_guest_cause(cop0, change, val)  \
-{                                                     \
-    kvm_clear_c0_guest_cause(cop0, change);           \
-    kvm_set_c0_guest_cause(cop0, ((val) & (change))); \
+#define kvm_read_c0_guest_index(cop0)          (cop0->reg[MIPS_CP0_TLB_INDEX][0])
+#define kvm_write_c0_guest_index(cop0, val)    (cop0->reg[MIPS_CP0_TLB_INDEX][0] = val)
+#define kvm_read_c0_guest_entrylo0(cop0)       (cop0->reg[MIPS_CP0_TLB_LO0][0])
+#define kvm_read_c0_guest_entrylo1(cop0)       (cop0->reg[MIPS_CP0_TLB_LO1][0])
+#define kvm_read_c0_guest_context(cop0)                (cop0->reg[MIPS_CP0_TLB_CONTEXT][0])
+#define kvm_write_c0_guest_context(cop0, val)  (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val))
+#define kvm_read_c0_guest_userlocal(cop0)      (cop0->reg[MIPS_CP0_TLB_CONTEXT][2])
+#define kvm_read_c0_guest_pagemask(cop0)       (cop0->reg[MIPS_CP0_TLB_PG_MASK][0])
+#define kvm_write_c0_guest_pagemask(cop0, val) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0] = (val))
+#define kvm_read_c0_guest_wired(cop0)          (cop0->reg[MIPS_CP0_TLB_WIRED][0])
+#define kvm_write_c0_guest_wired(cop0, val)    (cop0->reg[MIPS_CP0_TLB_WIRED][0] = (val))
+#define kvm_read_c0_guest_hwrena(cop0)         (cop0->reg[MIPS_CP0_HWRENA][0])
+#define kvm_write_c0_guest_hwrena(cop0, val)   (cop0->reg[MIPS_CP0_HWRENA][0] = (val))
+#define kvm_read_c0_guest_badvaddr(cop0)       (cop0->reg[MIPS_CP0_BAD_VADDR][0])
+#define kvm_write_c0_guest_badvaddr(cop0, val) (cop0->reg[MIPS_CP0_BAD_VADDR][0] = (val))
+#define kvm_read_c0_guest_count(cop0)          (cop0->reg[MIPS_CP0_COUNT][0])
+#define kvm_write_c0_guest_count(cop0, val)    (cop0->reg[MIPS_CP0_COUNT][0] = (val))
+#define kvm_read_c0_guest_entryhi(cop0)                (cop0->reg[MIPS_CP0_TLB_HI][0])
+#define kvm_write_c0_guest_entryhi(cop0, val)  (cop0->reg[MIPS_CP0_TLB_HI][0] = (val))
+#define kvm_read_c0_guest_compare(cop0)                (cop0->reg[MIPS_CP0_COMPARE][0])
+#define kvm_write_c0_guest_compare(cop0, val)  (cop0->reg[MIPS_CP0_COMPARE][0] = (val))
+#define kvm_read_c0_guest_status(cop0)         (cop0->reg[MIPS_CP0_STATUS][0])
+#define kvm_write_c0_guest_status(cop0, val)   (cop0->reg[MIPS_CP0_STATUS][0] = (val))
+#define kvm_read_c0_guest_intctl(cop0)         (cop0->reg[MIPS_CP0_STATUS][1])
+#define kvm_write_c0_guest_intctl(cop0, val)   (cop0->reg[MIPS_CP0_STATUS][1] = (val))
+#define kvm_read_c0_guest_cause(cop0)          (cop0->reg[MIPS_CP0_CAUSE][0])
+#define kvm_write_c0_guest_cause(cop0, val)    (cop0->reg[MIPS_CP0_CAUSE][0] = (val))
+#define kvm_read_c0_guest_epc(cop0)            (cop0->reg[MIPS_CP0_EXC_PC][0])
+#define kvm_write_c0_guest_epc(cop0, val)      (cop0->reg[MIPS_CP0_EXC_PC][0] = (val))
+#define kvm_read_c0_guest_prid(cop0)           (cop0->reg[MIPS_CP0_PRID][0])
+#define kvm_write_c0_guest_prid(cop0, val)     (cop0->reg[MIPS_CP0_PRID][0] = (val))
+#define kvm_read_c0_guest_ebase(cop0)          (cop0->reg[MIPS_CP0_PRID][1])
+#define kvm_write_c0_guest_ebase(cop0, val)    (cop0->reg[MIPS_CP0_PRID][1] = (val))
+#define kvm_read_c0_guest_config(cop0)         (cop0->reg[MIPS_CP0_CONFIG][0])
+#define kvm_read_c0_guest_config1(cop0)                (cop0->reg[MIPS_CP0_CONFIG][1])
+#define kvm_read_c0_guest_config2(cop0)                (cop0->reg[MIPS_CP0_CONFIG][2])
+#define kvm_read_c0_guest_config3(cop0)                (cop0->reg[MIPS_CP0_CONFIG][3])
+#define kvm_read_c0_guest_config7(cop0)                (cop0->reg[MIPS_CP0_CONFIG][7])
+#define kvm_write_c0_guest_config(cop0, val)   (cop0->reg[MIPS_CP0_CONFIG][0] = (val))
+#define kvm_write_c0_guest_config1(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][1] = (val))
+#define kvm_write_c0_guest_config2(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][2] = (val))
+#define kvm_write_c0_guest_config3(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][3] = (val))
+#define kvm_write_c0_guest_config7(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][7] = (val))
+#define kvm_read_c0_guest_errorepc(cop0)       (cop0->reg[MIPS_CP0_ERROR_PC][0])
+#define kvm_write_c0_guest_errorepc(cop0, val) (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val))
+
+#define kvm_set_c0_guest_status(cop0, val)     (cop0->reg[MIPS_CP0_STATUS][0] |= (val))
+#define kvm_clear_c0_guest_status(cop0, val)   (cop0->reg[MIPS_CP0_STATUS][0] &= ~(val))
+#define kvm_set_c0_guest_cause(cop0, val)      (cop0->reg[MIPS_CP0_CAUSE][0] |= (val))
+#define kvm_clear_c0_guest_cause(cop0, val)    (cop0->reg[MIPS_CP0_CAUSE][0] &= ~(val))
+#define kvm_change_c0_guest_cause(cop0, change, val)                   \
+{                                                                      \
+       kvm_clear_c0_guest_cause(cop0, change);                         \
+       kvm_set_c0_guest_cause(cop0, ((val) & (change)));               \
 }
-#define kvm_set_c0_guest_ebase(cop0, val)           (cop0->reg[MIPS_CP0_PRID][1] |= (val))
-#define kvm_clear_c0_guest_ebase(cop0, val)         (cop0->reg[MIPS_CP0_PRID][1] &= ~(val))
-#define kvm_change_c0_guest_ebase(cop0, change, val)  \
-{                                                     \
-    kvm_clear_c0_guest_ebase(cop0, change);           \
-    kvm_set_c0_guest_ebase(cop0, ((val) & (change))); \
+#define kvm_set_c0_guest_ebase(cop0, val)      (cop0->reg[MIPS_CP0_PRID][1] |= (val))
+#define kvm_clear_c0_guest_ebase(cop0, val)    (cop0->reg[MIPS_CP0_PRID][1] &= ~(val))
+#define kvm_change_c0_guest_ebase(cop0, change, val)                   \
+{                                                                      \
+       kvm_clear_c0_guest_ebase(cop0, change);                         \
+       kvm_set_c0_guest_ebase(cop0, ((val) & (change)));               \
 }
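
Illustrative usage (not part of the patch): emulation code reads and updates the shadow CP0 state only through the accessor macros above. A minimal sketch of raising a guest interrupt line and checking whether the guest can take it; the Cause.IP mask passed in is hypothetical:

    static void guest_raise_irq(struct mips_coproc *cop0, u32 cause_ip_mask)
    {
            kvm_set_c0_guest_cause(cop0, cause_ip_mask);
    }

    static bool guest_irqs_enabled(struct mips_coproc *cop0)
    {
            u32 status = kvm_read_c0_guest_status(cop0);

            /* Interrupts are deliverable only with IE set and EXL/ERL clear. */
            return (status & ST0_IE) && !(status & (ST0_EXL | ST0_ERL));
    }
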
 
 
index 4b6274b..e3fec99 100644 (file)
@@ -436,13 +436,6 @@ kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, uint32_t cause,
        sel = inst & 0x7;
        co_bit = (inst >> 25) & 1;
 
-       /* Verify that the register is valid */
-       if (rd > MIPS_CP0_DESAVE) {
-               printk("Invalid rd: %d\n", rd);
-               er = EMULATE_FAIL;
-               goto done;
-       }
-
        if (co_bit) {
                op = (inst) & 0xff;
 
@@ -1542,8 +1535,15 @@ kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
        }
 
        if ((inst & OPCODE) == SPEC3 && (inst & FUNC) == RDHWR) {
+               int usermode = !KVM_GUEST_KERNEL_MODE(vcpu);
                int rd = (inst & RD) >> 11;
                int rt = (inst & RT) >> 16;
+               /* If usermode, check RDHWR rd is allowed by guest HWREna */
+               if (usermode && !(kvm_read_c0_guest_hwrena(cop0) & BIT(rd))) {
+                       kvm_debug("RDHWR %#x disallowed by HWREna @ %p\n",
+                                 rd, opc);
+                       goto emulate_ri;
+               }
                switch (rd) {
                case 0: /* CPU number */
                        arch->gprs[rt] = 0;
@@ -1567,31 +1567,27 @@ kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
                        }
                        break;
                case 29:
-#if 1
                        arch->gprs[rt] = kvm_read_c0_guest_userlocal(cop0);
-#else
-                       /* UserLocal not implemented */
-                       er = kvm_mips_emulate_ri_exc(cause, opc, run, vcpu);
-#endif
                        break;
 
                default:
-                       printk("RDHWR not supported\n");
-                       er = EMULATE_FAIL;
-                       break;
+                       kvm_debug("RDHWR %#x not supported @ %p\n", rd, opc);
+                       goto emulate_ri;
                }
        } else {
-               printk("Emulate RI not supported @ %p: %#x\n", opc, inst);
-               er = EMULATE_FAIL;
+               kvm_debug("Emulate RI not supported @ %p: %#x\n", opc, inst);
+               goto emulate_ri;
        }
 
+       return EMULATE_DONE;
+
+emulate_ri:
        /*
-        * Rollback PC only if emulation was unsuccessful
+        * Rollback PC (if in branch delay slot then the PC already points to
+        * branch target), and pass the RI exception to the guest OS.
         */
-       if (er == EMULATE_FAIL) {
-               vcpu->arch.pc = curr_pc;
-       }
-       return er;
+       vcpu->arch.pc = curr_pc;
+       return kvm_mips_emulate_ri_exc(cause, opc, run, vcpu);
 }
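
The rework above means RDHWR from guest user mode is now gated on the guest's own HWREna register, and anything the guest has not enabled (or that KVM does not emulate) is reflected back as a Reserved Instruction exception instead of failing the emulation in the host. The decision, condensed into a hedged C sketch:

    /* True if the RDHWR should be turned into a guest RI exception. */
    static bool rdhwr_traps_to_guest(struct kvm_vcpu *vcpu, int rd)
    {
            struct mips_coproc *cop0 = vcpu->arch.cop0;
            bool usermode = !KVM_GUEST_KERNEL_MODE(vcpu);

            /* HWREna only restricts user mode; guest kernel mode is unrestricted. */
            if (usermode && !(kvm_read_c0_guest_hwrena(cop0) & BIT(rd)))
                    return true;

            /* Only a small fixed set (CPUNum, SYNCI step, CC, CCRes, UserLocal)
             * is emulated; everything else also becomes a guest RI. */
            return !(rd <= 3 || rd == 29);
    }
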
 
 enum emulation_result
index 83851aa..bb1e38a 100644 (file)
@@ -304,6 +304,11 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
        return vcpu->arch.fault_dar;
 }
 
+static inline bool is_kvmppc_resume_guest(int r)
+{
+       return (r == RESUME_GUEST || r == RESUME_GUEST_NV);
+}
+
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
 #define OSI_SC_MAGIC_R3                        0x113724FA
index bf0fa8b..51388be 100644 (file)
@@ -289,6 +289,18 @@ static inline void note_hpte_modification(struct kvm *kvm,
        if (atomic_read(&kvm->arch.hpte_mod_interest))
                rev->guest_rpte |= HPTE_GR_MODIFIED;
 }
+
+/*
+ * Like kvm_memslots(), but for use in real mode when we can't do
+ * any RCU stuff (since the secondary threads are offline from the
+ * kernel's point of view), and we can't print anything.
+ * Thus we use rcu_dereference_raw() rather than rcu_dereference_check().
+ */
+static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
+{
+       return rcu_dereference_raw_notrace(kvm->memslots);
+}
+
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
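
Illustrative caller (not part of the patch): kvm_memslots_raw() exists because the HV real-mode handlers run with the MMU off, on secondary threads the kernel considers offline, so neither RCU lockdep checks nor printk are available there. Later hunks in this pull switch those handlers over; the pattern is simply:

    /* Real-mode gfn-to-memslot lookup: no RCU checking, no tracing. */
    static struct kvm_memory_slot *realmode_gfn_to_memslot(struct kvm *kvm,
                                                           gfn_t gfn)
    {
            return __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
    }
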
index f3a91dc..821725c 100644 (file)
@@ -94,7 +94,7 @@ struct kvmppc_host_state {
        unsigned long xics_phys;
        u32 saved_xirr;
        u64 dabr;
-       u64 host_mmcr[3];
+       u64 host_mmcr[7];       /* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */
        u32 host_pmc[8];
        u64 host_purr;
        u64 host_spurr;
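
Note (not part of the patch): host_mmcr[] grows from 3 to 7 entries so the host PMU save/restore can also preserve SIAR and SDAR, plus MMCR2 and SIER on POWER8. The assembly later in this pull addresses the array as HSTATE_MMCR plus a byte offset; the intended slot layout, written out as a C sketch, is:

    /* Index of each saved SPR inside kvmppc_host_state.host_mmcr[7]. */
    enum {
            HOST_MMCR0 = 0,         /* HSTATE_MMCR +  0 */
            HOST_MMCR1,             /* HSTATE_MMCR +  8 */
            HOST_MMCRA,             /* HSTATE_MMCR + 16 */
            HOST_SIAR,              /* HSTATE_MMCR + 24 */
            HOST_SDAR,              /* HSTATE_MMCR + 32 */
            HOST_MMCR2,             /* HSTATE_MMCR + 40, POWER8 only */
            HOST_SIER,              /* HSTATE_MMCR + 48, POWER8 only */
    };
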
index fcd53f0..4096f16 100644 (file)
@@ -129,6 +129,8 @@ extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
                                struct kvm_create_spapr_tce *args);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
                             unsigned long ioba, unsigned long tce);
+extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+                            unsigned long ioba);
 extern struct kvm_rma_info *kvm_alloc_rma(void);
 extern void kvm_release_rma(struct kvm_rma_info *ri);
 extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
index 1a36b8e..0dcc48a 100644 (file)
 #define SPRN_ACOP      0x1F    /* Available Coprocessor Register */
 #define SPRN_TFIAR     0x81    /* Transaction Failure Inst Addr   */
 #define SPRN_TEXASR    0x82    /* Transaction EXception & Summary */
+#define   TEXASR_FS    __MASK(63-36)   /* Transaction Failure Summary */
 #define SPRN_TEXASRU   0x83    /* ''      ''      ''    Upper 32  */
 #define SPRN_TFHAR     0x80    /* Transaction Failure Handler Addr */
 #define SPRN_CTRLF     0x088
index 0c9f8b7..c22d704 100644 (file)
@@ -7,6 +7,8 @@
 
 #include <uapi/asm/tm.h>
 
+#ifndef __ASSEMBLY__
+
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 extern void do_load_up_transact_fpu(struct thread_struct *thread);
 extern void do_load_up_transact_altivec(struct thread_struct *thread);
@@ -21,3 +23,5 @@ extern void tm_recheckpoint(struct thread_struct *thread,
 extern void tm_abort(uint8_t cause);
 extern void tm_save_sprs(struct thread_struct *thread);
 extern void tm_restore_sprs(struct thread_struct *thread);
+
+#endif /* __ASSEMBLY__ */
index 303ece7..fb25ebc 100644 (file)
@@ -262,7 +262,14 @@ int kvmppc_mmu_hv_init(void)
 
 static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
 {
-       kvmppc_set_msr(vcpu, vcpu->arch.intr_msr);
+       unsigned long msr = vcpu->arch.intr_msr;
+
+       /* If transactional, change to suspend mode on IRQ delivery */
+       if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
+               msr |= MSR_TS_S;
+       else
+               msr |= vcpu->arch.shregs.msr & MSR_TS_MASK;
+       kvmppc_set_msr(vcpu, msr);
 }
 
 /*
index 2c25f54..89e96b3 100644 (file)
@@ -75,3 +75,31 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
        return H_TOO_HARD;
 }
 EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
+
+long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+                     unsigned long ioba)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct kvmppc_spapr_tce_table *stt;
+
+       list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+               if (stt->liobn == liobn) {
+                       unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
+                       struct page *page;
+                       u64 *tbl;
+
+                       if (ioba >= stt->window_size)
+                               return H_PARAMETER;
+
+                       page = stt->pages[idx / TCES_PER_PAGE];
+                       tbl = (u64 *)page_address(page);
+
+                       vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE];
+                       return H_SUCCESS;
+               }
+       }
+
+       /* Didn't find the liobn, punt it to userspace */
+       return H_TOO_HARD;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_get_tce);
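
kvmppc_h_get_tce() is the read-side counterpart of kvmppc_h_put_tce(): the I/O bus address selects a 64-bit TCE, which is returned to the guest in GPR4. A worked example of the index arithmetic, using hypothetical values and assuming SPAPR_TCE_SHIFT == 12 with 512 TCEs per backing page:

    static u64 example_tce_lookup(struct kvmppc_spapr_tce_table *stt)
    {
            unsigned long ioba = 0x00301000;                        /* hypothetical */
            unsigned long idx  = ioba >> SPAPR_TCE_SHIFT;           /* 0x301 == 769 */
            struct page *page  = stt->pages[idx / TCES_PER_PAGE];   /* page 1       */

            return ((u64 *)page_address(page))[idx % TCES_PER_PAGE]; /* slot 257 */
    }
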
index 17fc949..8227dba 100644 (file)
@@ -86,7 +86,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 
        /* CPU points to the first thread of the core */
        if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) {
-#ifdef CONFIG_KVM_XICS
+#ifdef CONFIG_PPC_ICP_NATIVE
                int real_cpu = cpu + vcpu->arch.ptid;
                if (paca[real_cpu].kvm_hstate.xics_phys)
                        xics_wake_cpu(real_cpu);
@@ -879,17 +879,6 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_IAMR:
                *val = get_reg_val(id, vcpu->arch.iamr);
                break;
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-       case KVM_REG_PPC_TFHAR:
-               *val = get_reg_val(id, vcpu->arch.tfhar);
-               break;
-       case KVM_REG_PPC_TFIAR:
-               *val = get_reg_val(id, vcpu->arch.tfiar);
-               break;
-       case KVM_REG_PPC_TEXASR:
-               *val = get_reg_val(id, vcpu->arch.texasr);
-               break;
-#endif
        case KVM_REG_PPC_FSCR:
                *val = get_reg_val(id, vcpu->arch.fscr);
                break;
@@ -970,6 +959,69 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_PPR:
                *val = get_reg_val(id, vcpu->arch.ppr);
                break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       case KVM_REG_PPC_TFHAR:
+               *val = get_reg_val(id, vcpu->arch.tfhar);
+               break;
+       case KVM_REG_PPC_TFIAR:
+               *val = get_reg_val(id, vcpu->arch.tfiar);
+               break;
+       case KVM_REG_PPC_TEXASR:
+               *val = get_reg_val(id, vcpu->arch.texasr);
+               break;
+       case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
+               i = id - KVM_REG_PPC_TM_GPR0;
+               *val = get_reg_val(id, vcpu->arch.gpr_tm[i]);
+               break;
+       case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
+       {
+               int j;
+               i = id - KVM_REG_PPC_TM_VSR0;
+               if (i < 32)
+                       for (j = 0; j < TS_FPRWIDTH; j++)
+                               val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j];
+               else {
+                       if (cpu_has_feature(CPU_FTR_ALTIVEC))
+                               val->vval = vcpu->arch.vr_tm.vr[i-32];
+                       else
+                               r = -ENXIO;
+               }
+               break;
+       }
+       case KVM_REG_PPC_TM_CR:
+               *val = get_reg_val(id, vcpu->arch.cr_tm);
+               break;
+       case KVM_REG_PPC_TM_LR:
+               *val = get_reg_val(id, vcpu->arch.lr_tm);
+               break;
+       case KVM_REG_PPC_TM_CTR:
+               *val = get_reg_val(id, vcpu->arch.ctr_tm);
+               break;
+       case KVM_REG_PPC_TM_FPSCR:
+               *val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
+               break;
+       case KVM_REG_PPC_TM_AMR:
+               *val = get_reg_val(id, vcpu->arch.amr_tm);
+               break;
+       case KVM_REG_PPC_TM_PPR:
+               *val = get_reg_val(id, vcpu->arch.ppr_tm);
+               break;
+       case KVM_REG_PPC_TM_VRSAVE:
+               *val = get_reg_val(id, vcpu->arch.vrsave_tm);
+               break;
+       case KVM_REG_PPC_TM_VSCR:
+               if (cpu_has_feature(CPU_FTR_ALTIVEC))
+                       *val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]);
+               else
+                       r = -ENXIO;
+               break;
+       case KVM_REG_PPC_TM_DSCR:
+               *val = get_reg_val(id, vcpu->arch.dscr_tm);
+               break;
+       case KVM_REG_PPC_TM_TAR:
+               *val = get_reg_val(id, vcpu->arch.tar_tm);
+               break;
+#endif
        case KVM_REG_PPC_ARCH_COMPAT:
                *val = get_reg_val(id, vcpu->arch.vcore->arch_compat);
                break;
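
With the TM registers exposed through one_reg IDs, userspace saves and restores the checkpointed state with the ordinary KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls while the vcpu is stopped. A hedged userspace sketch, assuming the KVM_REG_PPC_TM_* IDs added by this series are visible via linux/kvm.h:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Read the checkpointed GPR3 of a (stopped) vcpu; returns 0 on success. */
    static int get_tm_gpr3(int vcpu_fd, uint64_t *out)
    {
            struct kvm_one_reg reg = {
                    .id   = KVM_REG_PPC_TM_GPR0 + 3,   /* checkpointed r3 */
                    .addr = (uintptr_t)out,
            };

            return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
    }
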
@@ -1039,17 +1091,6 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_IAMR:
                vcpu->arch.iamr = set_reg_val(id, *val);
                break;
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-       case KVM_REG_PPC_TFHAR:
-               vcpu->arch.tfhar = set_reg_val(id, *val);
-               break;
-       case KVM_REG_PPC_TFIAR:
-               vcpu->arch.tfiar = set_reg_val(id, *val);
-               break;
-       case KVM_REG_PPC_TEXASR:
-               vcpu->arch.texasr = set_reg_val(id, *val);
-               break;
-#endif
        case KVM_REG_PPC_FSCR:
                vcpu->arch.fscr = set_reg_val(id, *val);
                break;
@@ -1144,6 +1185,68 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_PPR:
                vcpu->arch.ppr = set_reg_val(id, *val);
                break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       case KVM_REG_PPC_TFHAR:
+               vcpu->arch.tfhar = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TFIAR:
+               vcpu->arch.tfiar = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TEXASR:
+               vcpu->arch.texasr = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
+               i = id - KVM_REG_PPC_TM_GPR0;
+               vcpu->arch.gpr_tm[i] = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
+       {
+               int j;
+               i = id - KVM_REG_PPC_TM_VSR0;
+               if (i < 32)
+                       for (j = 0; j < TS_FPRWIDTH; j++)
+                               vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
+               else
+                       if (cpu_has_feature(CPU_FTR_ALTIVEC))
+                               vcpu->arch.vr_tm.vr[i-32] = val->vval;
+                       else
+                               r = -ENXIO;
+               break;
+       }
+       case KVM_REG_PPC_TM_CR:
+               vcpu->arch.cr_tm = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_LR:
+               vcpu->arch.lr_tm = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_CTR:
+               vcpu->arch.ctr_tm = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_FPSCR:
+               vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_AMR:
+               vcpu->arch.amr_tm = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_PPR:
+               vcpu->arch.ppr_tm = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_VRSAVE:
+               vcpu->arch.vrsave_tm = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_VSCR:
+               if (cpu_has_feature(CPU_FTR_ALTIVEC))
+                       vcpu->arch.vr.vscr.u[3] = set_reg_val(id, *val);
+               else
+                       r = - ENXIO;
+               break;
+       case KVM_REG_PPC_TM_DSCR:
+               vcpu->arch.dscr_tm = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_TM_TAR:
+               vcpu->arch.tar_tm = set_reg_val(id, *val);
+               break;
+#endif
        case KVM_REG_PPC_ARCH_COMPAT:
                r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
                break;
@@ -1360,9 +1463,7 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
        smp_wmb();
 #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
        if (cpu != smp_processor_id()) {
-#ifdef CONFIG_KVM_XICS
                xics_wake_cpu(cpu);
-#endif
                if (vcpu->arch.ptid)
                        ++vc->n_woken;
        }
@@ -1530,7 +1631,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
                vcpu->arch.trap = 0;
 
                if (vcpu->arch.ceded) {
-                       if (ret != RESUME_GUEST)
+                       if (!is_kvmppc_resume_guest(ret))
                                kvmppc_end_cede(vcpu);
                        else
                                kvmppc_set_timer(vcpu);
@@ -1541,7 +1642,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
        vc->vcore_state = VCORE_INACTIVE;
        list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
                                 arch.run_list) {
-               if (vcpu->arch.ret != RESUME_GUEST) {
+               if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
                        kvmppc_remove_runnable(vc, vcpu);
                        wake_up(&vcpu->arch.cpu_run);
                }
@@ -1731,7 +1832,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
                                vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
                        srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
                }
-       } while (r == RESUME_GUEST);
+       } while (is_kvmppc_resume_guest(r));
 
  out:
        vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
@@ -2366,7 +2467,7 @@ static int kvmppc_book3s_init_hv(void)
         */
        r = kvmppc_core_check_processor_compat_hv();
        if (r < 0)
-               return r;
+               return -ENODEV;
 
        kvm_ops_hv.owner = THIS_MODULE;
        kvmppc_hv_ops = &kvm_ops_hv;
index e873796..e18e3cf 100644 (file)
@@ -71,6 +71,14 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        mtmsrd  r10,1
 
        /* Save host PMU registers */
+BEGIN_FTR_SECTION
+       /* Work around P8 PMAE bug */
+       li      r3, -1
+       clrrdi  r3, r3, 10
+       mfspr   r8, SPRN_MMCR2
+       mtspr   SPRN_MMCR2, r3          /* freeze all counters using MMCR2 */
+       isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        li      r3, 1
        sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
        mfspr   r7, SPRN_MMCR0          /* save MMCR0 */
@@ -87,9 +95,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
        cmpwi   r5, 0
        beq     31f                     /* skip if not */
        mfspr   r5, SPRN_MMCR1
+       mfspr   r9, SPRN_SIAR
+       mfspr   r10, SPRN_SDAR
        std     r7, HSTATE_MMCR(r13)
        std     r5, HSTATE_MMCR + 8(r13)
        std     r6, HSTATE_MMCR + 16(r13)
+       std     r9, HSTATE_MMCR + 24(r13)
+       std     r10, HSTATE_MMCR + 32(r13)
+BEGIN_FTR_SECTION
+       mfspr   r9, SPRN_SIER
+       std     r8, HSTATE_MMCR + 40(r13)
+       std     r9, HSTATE_MMCR + 48(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        mfspr   r3, SPRN_PMC1
        mfspr   r5, SPRN_PMC2
        mfspr   r6, SPRN_PMC3
@@ -110,6 +127,11 @@ BEGIN_FTR_SECTION
        stw     r10, HSTATE_PMC + 24(r13)
        stw     r11, HSTATE_PMC + 28(r13)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+BEGIN_FTR_SECTION
+       mfspr   r9, SPRN_SIER
+       std     r8, HSTATE_MMCR + 40(r13)
+       std     r9, HSTATE_MMCR + 48(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 31:
 
        /*
index 37fb3ca..1d6c56a 100644 (file)
@@ -111,7 +111,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
        rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
        ptel = rev->guest_rpte |= rcbits;
        gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
-       memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
+       memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
        if (!memslot)
                return;
 
@@ -192,7 +192,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
        /* Find the memslot (if any) for this address */
        gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
        gfn = gpa >> PAGE_SHIFT;
-       memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
+       memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
        pa = 0;
        is_io = ~0ul;
        rmap = NULL;
@@ -670,7 +670,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 
                        psize = hpte_page_size(v, r);
                        gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
-                       memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
+                       memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
                        if (memslot) {
                                hva = __gfn_to_hva_memslot(memslot, gfn);
                                pte = lookup_linux_pte_and_update(pgdir, hva,
index 53d647f..ffbb871 100644 (file)
@@ -28,6 +28,9 @@
 #include <asm/exception-64s.h>
 #include <asm/kvm_book3s_asm.h>
 #include <asm/mmu-hash64.h>
+#include <asm/tm.h>
+
+#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
 
 #ifdef __LITTLE_ENDIAN__
 #error Need to fix lppaca and SLB shadow accesses in little endian mode
@@ -106,8 +109,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
        ld      r3, HSTATE_MMCR(r13)
        ld      r4, HSTATE_MMCR + 8(r13)
        ld      r5, HSTATE_MMCR + 16(r13)
+       ld      r6, HSTATE_MMCR + 24(r13)
+       ld      r7, HSTATE_MMCR + 32(r13)
        mtspr   SPRN_MMCR1, r4
        mtspr   SPRN_MMCRA, r5
+       mtspr   SPRN_SIAR, r6
+       mtspr   SPRN_SDAR, r7
+BEGIN_FTR_SECTION
+       ld      r8, HSTATE_MMCR + 40(r13)
+       ld      r9, HSTATE_MMCR + 48(r13)
+       mtspr   SPRN_MMCR2, r8
+       mtspr   SPRN_SIER, r9
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        mtspr   SPRN_MMCR0, r3
        isync
 23:
@@ -597,6 +610,116 @@ BEGIN_FTR_SECTION
  END_FTR_SECTION_NESTED(CPU_FTR_ARCH_206, CPU_FTR_ARCH_206, 89)
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+       b       skip_tm
+END_FTR_SECTION_IFCLR(CPU_FTR_TM)
+
+       /* Turn on TM/FP/VSX/VMX so we can restore them. */
+       mfmsr   r5
+       li      r6, MSR_TM >> 32
+       sldi    r6, r6, 32
+       or      r5, r5, r6
+       ori     r5, r5, MSR_FP
+       oris    r5, r5, (MSR_VEC | MSR_VSX)@h
+       mtmsrd  r5
+
+       /*
+        * The user may change these outside of a transaction, so they must
+        * always be context switched.
+        */
+       ld      r5, VCPU_TFHAR(r4)
+       ld      r6, VCPU_TFIAR(r4)
+       ld      r7, VCPU_TEXASR(r4)
+       mtspr   SPRN_TFHAR, r5
+       mtspr   SPRN_TFIAR, r6
+       mtspr   SPRN_TEXASR, r7
+
+       ld      r5, VCPU_MSR(r4)
+       rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+       beq     skip_tm /* TM not active in guest */
+
+       /* Make sure the failure summary is set, otherwise we'll program check
+        * when we trechkpt.  It's possible that this might have been not set
+        * on a kvmppc_set_one_reg() call but we shouldn't let this crash the
+        * host.
+        */
+       oris    r7, r7, (TEXASR_FS)@h
+       mtspr   SPRN_TEXASR, r7
+
+       /*
+        * We need to load up the checkpointed state for the guest.
+        * We need to do this early as it will blow away any GPRs, VSRs and
+        * some SPRs.
+        */
+
+       mr      r31, r4
+       addi    r3, r31, VCPU_FPRS_TM
+       bl      .load_fp_state
+       addi    r3, r31, VCPU_VRS_TM
+       bl      .load_vr_state
+       mr      r4, r31
+       lwz     r7, VCPU_VRSAVE_TM(r4)
+       mtspr   SPRN_VRSAVE, r7
+
+       ld      r5, VCPU_LR_TM(r4)
+       lwz     r6, VCPU_CR_TM(r4)
+       ld      r7, VCPU_CTR_TM(r4)
+       ld      r8, VCPU_AMR_TM(r4)
+       ld      r9, VCPU_TAR_TM(r4)
+       mtlr    r5
+       mtcr    r6
+       mtctr   r7
+       mtspr   SPRN_AMR, r8
+       mtspr   SPRN_TAR, r9
+
+       /*
+        * Load up PPR and DSCR values but don't put them in the actual SPRs
+        * till the last moment to avoid running with userspace PPR and DSCR for
+        * too long.
+        */
+       ld      r29, VCPU_DSCR_TM(r4)
+       ld      r30, VCPU_PPR_TM(r4)
+
+       std     r2, PACATMSCRATCH(r13) /* Save TOC */
+
+       /* Clear the MSR RI since r1, r13 are all going to be foobar. */
+       li      r5, 0
+       mtmsrd  r5, 1
+
+       /* Load GPRs r0-r28 */
+       reg = 0
+       .rept   29
+       ld      reg, VCPU_GPRS_TM(reg)(r31)
+       reg = reg + 1
+       .endr
+
+       mtspr   SPRN_DSCR, r29
+       mtspr   SPRN_PPR, r30
+
+       /* Load final GPRs */
+       ld      29, VCPU_GPRS_TM(29)(r31)
+       ld      30, VCPU_GPRS_TM(30)(r31)
+       ld      31, VCPU_GPRS_TM(31)(r31)
+
+       /* TM checkpointed state is now setup.  All GPRs are now volatile. */
+       TRECHKPT
+
+       /* Now let's get back the state we need. */
+       HMT_MEDIUM
+       GET_PACA(r13)
+       ld      r29, HSTATE_DSCR(r13)
+       mtspr   SPRN_DSCR, r29
+       ld      r4, HSTATE_KVM_VCPU(r13)
+       ld      r1, HSTATE_HOST_R1(r13)
+       ld      r2, PACATMSCRATCH(r13)
+
+       /* Set the MSR RI since we have our registers back. */
+       li      r5, MSR_RI
+       mtmsrd  r5, 1
+skip_tm:
+#endif
+
        /* Load guest PMU registers */
        /* R4 is live here (vcpu pointer) */
        li      r3, 1
@@ -704,14 +827,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        ld      r6, VCPU_VTB(r4)
        mtspr   SPRN_IC, r5
        mtspr   SPRN_VTB, r6
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-       ld      r5, VCPU_TFHAR(r4)
-       ld      r6, VCPU_TFIAR(r4)
-       ld      r7, VCPU_TEXASR(r4)
-       mtspr   SPRN_TFHAR, r5
-       mtspr   SPRN_TFIAR, r6
-       mtspr   SPRN_TEXASR, r7
-#endif
        ld      r8, VCPU_EBBHR(r4)
        mtspr   SPRN_EBBHR, r8
        ld      r5, VCPU_EBBRR(r4)
@@ -736,6 +851,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
         * Set the decrementer to the guest decrementer.
         */
        ld      r8,VCPU_DEC_EXPIRES(r4)
+       /* r8 is a host timebase value here, convert to guest TB */
+       ld      r5,HSTATE_KVM_VCORE(r13)
+       ld      r6,VCORE_TB_OFFSET(r5)
+       add     r8,r8,r6
        mftb    r7
        subf    r3,r7,r8
        mtspr   SPRN_DEC,r3
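
The added lines keep VCPU_DEC_EXPIRES in host timebase units and convert to guest timebase at this point, where mftb already reflects the vcore's timebase offset; the matching exit-side hunk below converts back. Restated as a hedged C sketch (not part of the patch):

    /* Guest entry: the vcore TB offset has already been applied to the timebase. */
    static void program_guest_dec(struct kvm_vcpu *vcpu)
    {
            u64 tb_offset = vcpu->arch.vcore->tb_offset;
            u64 guest_exp = vcpu->arch.dec_expires + tb_offset; /* host -> guest TB */

            mtspr(SPRN_DEC, guest_exp - mftb());    /* mftb() is guest TB here */
    }

    /* Guest exit: convert the expiry back to host TB before storing it. */
    static void save_guest_dec(struct kvm_vcpu *vcpu, u64 guest_exp)
    {
            vcpu->arch.dec_expires = guest_exp - vcpu->arch.vcore->tb_offset;
    }
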
@@ -817,7 +936,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 12:    mtspr   SPRN_SRR0, r10
        mr      r10,r0
        mtspr   SPRN_SRR1, r11
-       ld      r11, VCPU_INTR_MSR(r4)
+       mr      r9, r4
+       bl      kvmppc_msr_interrupt
 5:
 
 /*
@@ -1098,17 +1218,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201)
        mftb    r6
        extsw   r5,r5
        add     r5,r5,r6
+       /* r5 is a guest timebase value here, convert to host TB */
+       ld      r3,HSTATE_KVM_VCORE(r13)
+       ld      r4,VCORE_TB_OFFSET(r3)
+       subf    r5,r4,r5
        std     r5,VCPU_DEC_EXPIRES(r9)
 
 BEGIN_FTR_SECTION
        b       8f
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
-       /* Turn on TM so we can access TFHAR/TFIAR/TEXASR */
-       mfmsr   r8
-       li      r0, 1
-       rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
-       mtmsrd  r8
-
        /* Save POWER8-specific registers */
        mfspr   r5, SPRN_IAMR
        mfspr   r6, SPRN_PSPB
@@ -1122,14 +1240,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        std     r5, VCPU_IC(r9)
        std     r6, VCPU_VTB(r9)
        std     r7, VCPU_TAR(r9)
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-       mfspr   r5, SPRN_TFHAR
-       mfspr   r6, SPRN_TFIAR
-       mfspr   r7, SPRN_TEXASR
-       std     r5, VCPU_TFHAR(r9)
-       std     r6, VCPU_TFIAR(r9)
-       std     r7, VCPU_TEXASR(r9)
-#endif
        mfspr   r8, SPRN_EBBHR
        std     r8, VCPU_EBBHR(r9)
        mfspr   r5, SPRN_EBBRR
@@ -1387,7 +1497,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        ld      r8,VCORE_TB_OFFSET(r5)
        cmpdi   r8,0
        beq     17f
-       mftb    r6                      /* current host timebase */
+       mftb    r6                      /* current guest timebase */
        subf    r8,r8,r6
        mtspr   SPRN_TBU40,r8           /* update upper 40 bits */
        mftb    r7                      /* check if lower 24 bits overflowed */
@@ -1557,7 +1667,7 @@ kvmppc_hdsi:
        mtspr   SPRN_SRR0, r10
        mtspr   SPRN_SRR1, r11
        li      r10, BOOK3S_INTERRUPT_DATA_STORAGE
-       ld      r11, VCPU_INTR_MSR(r9)
+       bl      kvmppc_msr_interrupt
 fast_interrupt_c_return:
 6:     ld      r7, VCPU_CTR(r9)
        lwz     r8, VCPU_XER(r9)
@@ -1626,7 +1736,7 @@ kvmppc_hisi:
 1:     mtspr   SPRN_SRR0, r10
        mtspr   SPRN_SRR1, r11
        li      r10, BOOK3S_INTERRUPT_INST_STORAGE
-       ld      r11, VCPU_INTR_MSR(r9)
+       bl      kvmppc_msr_interrupt
        b       fast_interrupt_c_return
 
 3:     ld      r6, VCPU_KVM(r9)        /* not relocated, use VRMA */
@@ -1669,7 +1779,7 @@ sc_1_fast_return:
        mtspr   SPRN_SRR0,r10
        mtspr   SPRN_SRR1,r11
        li      r10, BOOK3S_INTERRUPT_SYSCALL
-       ld      r11, VCPU_INTR_MSR(r9)
+       bl      kvmppc_msr_interrupt
        mr      r4,r9
        b       fast_guest_return
 
@@ -1691,7 +1801,7 @@ hcall_real_table:
        .long   0               /* 0x10 - H_CLEAR_MOD */
        .long   0               /* 0x14 - H_CLEAR_REF */
        .long   .kvmppc_h_protect - hcall_real_table
-       .long   0               /* 0x1c - H_GET_TCE */
+       .long   .kvmppc_h_get_tce - hcall_real_table
        .long   .kvmppc_h_put_tce - hcall_real_table
        .long   0               /* 0x24 - H_SET_SPRG0 */
        .long   .kvmppc_h_set_dabr - hcall_real_table
@@ -1997,7 +2107,7 @@ machine_check_realmode:
        beq     mc_cont
        /* If not, deliver a machine check.  SRR0/1 are already set */
        li      r10, BOOK3S_INTERRUPT_MACHINE_CHECK
-       ld      r11, VCPU_INTR_MSR(r9)
+       bl      kvmppc_msr_interrupt
        b       fast_interrupt_c_return
 
 /*
@@ -2138,8 +2248,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
        mfspr   r6,SPRN_VRSAVE
        stw     r6,VCPU_VRSAVE(r31)
        mtlr    r30
-       mtmsrd  r5
-       isync
        blr
 
 /*
@@ -2186,3 +2294,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
  */
 kvmppc_bad_host_intr:
        b       .
+
+/*
+ * This mimics the MSR transition on IRQ delivery.  The new guest MSR is taken
+ * from VCPU_INTR_MSR and is modified based on the required TM state changes.
+ *   r11 has the guest MSR value (in/out)
+ *   r9 has a vcpu pointer (in)
+ *   r0 is used as a scratch register
+ */
+kvmppc_msr_interrupt:
+       rldicl  r0, r11, 64 - MSR_TS_S_LG, 62
+       cmpwi   r0, 2 /* Check if we are in transactional state..  */
+       ld      r11, VCPU_INTR_MSR(r9)
+       bne     1f
+       /* ... if transactional, change to suspended */
+       li      r0, 1
+1:     rldimi  r11, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+       blr
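
kvmppc_msr_interrupt is the assembly twin of the C change to kvmppc_mmu_book3s_64_hv_reset_msr earlier in this pull: the delivered MSR comes from vcpu->arch.intr_msr, and if the guest was in transactional state the interrupt leaves it suspended, otherwise the current TS bits are carried over. As a C sketch (not part of the patch):

    static unsigned long msr_for_guest_interrupt(struct kvm_vcpu *vcpu)
    {
            unsigned long msr = vcpu->arch.intr_msr;

            if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
                    msr |= MSR_TS_S;                             /* force suspended */
            else
                    msr |= vcpu->arch.shregs.msr & MSR_TS_MASK;  /* keep TS as-is   */
            return msr;
    }
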
index cf95cde..7a05315 100644 (file)
@@ -213,8 +213,11 @@ int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu)
        gpa_t args_phys;
        int rc;
 
-       /* r4 contains the guest physical address of the RTAS args */
-       args_phys = kvmppc_get_gpr(vcpu, 4);
+       /*
+        * r4 contains the guest physical address of the RTAS args
+        * Mask off the top 4 bits since this is a guest real address
+        */
+       args_phys = kvmppc_get_gpr(vcpu, 4) & KVM_PAM;
 
        rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args));
        if (rc)
index 5f8bcc5..35f0faa 100644 (file)
@@ -53,6 +53,7 @@ enum interruption_class {
        IRQIO_PCI,
        IRQIO_MSI,
        IRQIO_VIR,
+       IRQIO_VAI,
        NMI_NMI,
        CPU_RST,
        NR_ARCH_IRQS
index 9bf95bb..154b600 100644 (file)
 #include <linux/hrtimer.h>
 #include <linux/interrupt.h>
 #include <linux/kvm_host.h>
+#include <linux/kvm.h>
 #include <asm/debug.h>
 #include <asm/cpu.h>
+#include <asm/isc.h>
 
 #define KVM_MAX_VCPUS 64
 #define KVM_USER_MEM_SLOTS 32
 
+/*
+ * These seem to be used for allocating ->chip in the routing table,
+ * which we don't use. 4096 is an out-of-thin-air value. If we need
+ * to look at ->chip later on, we'll need to revisit this.
+ */
+#define KVM_NR_IRQCHIPS 1
+#define KVM_IRQCHIP_NUM_PINS 4096
+
 struct sca_entry {
        atomic_t scn;
        __u32   reserved;
@@ -108,7 +118,9 @@ struct kvm_s390_sie_block {
        __u32   fac;                    /* 0x01a0 */
        __u8    reserved1a4[20];        /* 0x01a4 */
        __u64   cbrlo;                  /* 0x01b8 */
-       __u8    reserved1c0[40];        /* 0x01c0 */
+       __u8    reserved1c0[30];        /* 0x01c0 */
+       __u64   pp;                     /* 0x01de */
+       __u8    reserved1e6[2];         /* 0x01e6 */
        __u64   itdba;                  /* 0x01e8 */
        __u8    reserved1f0[16];        /* 0x01f0 */
 } __attribute__((packed));
@@ -171,18 +183,6 @@ struct kvm_vcpu_stat {
        u32 diagnose_9c;
 };
 
-struct kvm_s390_io_info {
-       __u16        subchannel_id;            /* 0x0b8 */
-       __u16        subchannel_nr;            /* 0x0ba */
-       __u32        io_int_parm;              /* 0x0bc */
-       __u32        io_int_word;              /* 0x0c0 */
-};
-
-struct kvm_s390_ext_info {
-       __u32 ext_params;
-       __u64 ext_params2;
-};
-
 #define PGM_OPERATION            0x01
 #define PGM_PRIVILEGED_OP       0x02
 #define PGM_EXECUTE              0x03
@@ -191,27 +191,6 @@ struct kvm_s390_ext_info {
 #define PGM_SPECIFICATION        0x06
 #define PGM_DATA                 0x07
 
-struct kvm_s390_pgm_info {
-       __u16 code;
-};
-
-struct kvm_s390_prefix_info {
-       __u32 address;
-};
-
-struct kvm_s390_extcall_info {
-       __u16 code;
-};
-
-struct kvm_s390_emerg_info {
-       __u16 code;
-};
-
-struct kvm_s390_mchk_info {
-       __u64 cr14;
-       __u64 mcic;
-};
-
 struct kvm_s390_interrupt_info {
        struct list_head list;
        u64     type;
@@ -246,9 +225,8 @@ struct kvm_s390_float_interrupt {
        struct list_head list;
        atomic_t active;
        int next_rr_cpu;
-       unsigned long idle_mask[(KVM_MAX_VCPUS + sizeof(long) - 1)
-                               / sizeof(long)];
-       struct kvm_s390_local_interrupt *local_int[KVM_MAX_VCPUS];
+       unsigned long idle_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+       unsigned int irq_count;
 };
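
The idle_mask change: the old expression divided the vcpu count by sizeof(long), i.e. by bytes per long rather than bits per long, so the bitmap came out eight times larger than needed on 64-bit s390. BITS_TO_LONGS() is the standard way to size a bitmap; for example:

    #include <linux/bitops.h>

    /* One bit per vcpu; with KVM_MAX_VCPUS == 64 this is a single long. */
    unsigned long idle_mask[BITS_TO_LONGS(64)];     /* 1 long (64 bits) */
    /* Old formula: (64 + sizeof(long) - 1) / sizeof(long) == 8 longs (512 bits). */
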
 
 
@@ -265,6 +243,10 @@ struct kvm_vcpu_arch {
                u64             stidp_data;
        };
        struct gmap *gmap;
+#define KVM_S390_PFAULT_TOKEN_INVALID  (-1UL)
+       unsigned long pfault_token;
+       unsigned long pfault_select;
+       unsigned long pfault_compare;
 };
 
 struct kvm_vm_stat {
@@ -274,12 +256,36 @@ struct kvm_vm_stat {
 struct kvm_arch_memory_slot {
 };
 
+struct s390_map_info {
+       struct list_head list;
+       __u64 guest_addr;
+       __u64 addr;
+       struct page *page;
+};
+
+struct s390_io_adapter {
+       unsigned int id;
+       int isc;
+       bool maskable;
+       bool masked;
+       bool swap;
+       struct rw_semaphore maps_lock;
+       struct list_head maps;
+       atomic_t nr_maps;
+};
+
+#define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8)
+#define MAX_S390_ADAPTER_MAPS 256
+
 struct kvm_arch{
        struct sca_block *sca;
        debug_info_t *dbf;
        struct kvm_s390_float_interrupt float_int;
+       struct kvm_device *flic;
        struct gmap *gmap;
        int css_support;
+       int use_irqchip;
+       struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
 };
 
 #define KVM_HVA_ERR_BAD                (-1UL)
@@ -290,6 +296,24 @@ static inline bool kvm_is_error_hva(unsigned long addr)
        return IS_ERR_VALUE(addr);
 }
 
+#define ASYNC_PF_PER_VCPU      64
+struct kvm_vcpu;
+struct kvm_async_pf;
+struct kvm_arch_async_pf {
+       unsigned long pfault_token;
+};
+
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
+
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
+                              struct kvm_async_pf *work);
+
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+                                    struct kvm_async_pf *work);
+
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+                                struct kvm_async_pf *work);
+
 extern int sie64a(struct kvm_s390_sie_block *, u64 *);
 extern char sie_exit;
 #endif
index 1ab75ea..50a75d9 100644 (file)
@@ -782,6 +782,7 @@ static inline void pgste_set_pte(pte_t *ptep, pte_t entry)
  * @table: pointer to the page directory
  * @asce: address space control element for gmap page table
  * @crst_list: list of all crst tables used in the guest address space
+ * @pfault_enabled: defines if pfaults are applicable for the guest
  */
 struct gmap {
        struct list_head list;
@@ -790,6 +791,7 @@ struct gmap {
        unsigned long asce;
        void *private;
        struct list_head crst_list;
+       bool pfault_enabled;
 };
 
 /**
index 0a876bc..dc5fc4f 100644 (file)
@@ -79,6 +79,7 @@ struct thread_struct {
         unsigned long ksp;              /* kernel stack pointer             */
        mm_segment_t mm_segment;
        unsigned long gmap_addr;        /* address of last gmap fault. */
+       unsigned int gmap_pfault;       /* signal of a pending guest pfault */
        struct per_regs per_user;       /* User specified PER registers */
        struct per_event per_event;     /* Cause of the last PER trap */
        unsigned long per_flags;        /* Flags to control debug behavior */
index d25da59..c003c6a 100644 (file)
 
 #define __KVM_S390
 
+/* Device control API: s390-specific devices */
+#define KVM_DEV_FLIC_GET_ALL_IRQS      1
+#define KVM_DEV_FLIC_ENQUEUE           2
+#define KVM_DEV_FLIC_CLEAR_IRQS                3
+#define KVM_DEV_FLIC_APF_ENABLE                4
+#define KVM_DEV_FLIC_APF_DISABLE_WAIT  5
+#define KVM_DEV_FLIC_ADAPTER_REGISTER  6
+#define KVM_DEV_FLIC_ADAPTER_MODIFY    7
+/*
+ * We can have up to 4*64k pending subchannels + 8 adapter interrupts,
+ * as well as up  to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts.
+ * There are also sclp and machine checks. This gives us
+ * sizeof(kvm_s390_irq)*(4*65536+8+64*64+1+1) = 72 * 266250 = 19170000
+ * Lets round up to 8192 pages.
+ */
+#define KVM_S390_MAX_FLOAT_IRQS        266250
+#define KVM_S390_FLIC_MAX_BUFFER       0x2000000
+
+struct kvm_s390_io_adapter {
+       __u32 id;
+       __u8 isc;
+       __u8 maskable;
+       __u8 swap;
+       __u8 pad;
+};
+
+#define KVM_S390_IO_ADAPTER_MASK 1
+#define KVM_S390_IO_ADAPTER_MAP 2
+#define KVM_S390_IO_ADAPTER_UNMAP 3
+
+struct kvm_s390_io_adapter_req {
+       __u32 id;
+       __u8 type;
+       __u8 mask;
+       __u16 pad0;
+       __u64 addr;
+};
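
These uapi additions back the new s390 FLIC ("floating interrupt controller") device documented in the s390_flic.txt file added by this merge. Userspace creates it with KVM_CREATE_DEVICE and then drives it via device attributes whose group is one of the KVM_DEV_FLIC_* values above. A hedged sketch; KVM_DEV_TYPE_FLIC and the exact attribute semantics are assumed from that documentation:

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Create the FLIC for a VM and enable async page-fault handling on it. */
    static int setup_flic(int vm_fd)
    {
            struct kvm_create_device cd   = { .type  = KVM_DEV_TYPE_FLIC };
            struct kvm_device_attr   attr = { .group = KVM_DEV_FLIC_APF_ENABLE };

            if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
                    return -1;
            return ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);
    }
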
+
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
        /* general purpose regs for s390 */
@@ -57,4 +95,9 @@ struct kvm_sync_regs {
 #define KVM_REG_S390_EPOCHDIFF (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x2)
 #define KVM_REG_S390_CPU_TIMER  (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x3)
 #define KVM_REG_S390_CLOCK_COMP (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x4)
+#define KVM_REG_S390_PFTOKEN   (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x5)
+#define KVM_REG_S390_PFCOMPARE (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x6)
+#define KVM_REG_S390_PFSELECT  (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x7)
+#define KVM_REG_S390_PP                (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x8)
+#define KVM_REG_S390_GBEA      (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x9)
 #endif
index a770be9..d42b14c 100644 (file)
@@ -85,6 +85,7 @@ static const struct irq_class irqclass_sub_desc[NR_ARCH_IRQS] = {
        [IRQIO_PCI]  = {.name = "PCI", .desc = "[I/O] PCI Interrupt" },
        [IRQIO_MSI]  = {.name = "MSI", .desc = "[I/O] MSI Interrupt" },
        [IRQIO_VIR]  = {.name = "VIR", .desc = "[I/O] Virtual I/O Devices"},
+       [IRQIO_VAI]  = {.name = "VAI", .desc = "[I/O] Virtual I/O Devices AI"},
        [NMI_NMI]    = {.name = "NMI", .desc = "[NMI] Machine Check"},
        [CPU_RST]    = {.name = "RST", .desc = "[CPU] CPU Restart"},
 };
index 70b46ea..10d529a 100644 (file)
@@ -23,6 +23,10 @@ config KVM
        select ANON_INODES
        select HAVE_KVM_CPU_RELAX_INTERCEPT
        select HAVE_KVM_EVENTFD
+       select KVM_ASYNC_PF
+       select KVM_ASYNC_PF_SYNC
+       select HAVE_KVM_IRQCHIP
+       select HAVE_KVM_IRQ_ROUTING
        ---help---
          Support hosting paravirtualized guest machines using the SIE
          virtualization capability on the mainframe. This should work
index 40b4c64..d3adb37 100644 (file)
@@ -7,7 +7,7 @@
 # as published by the Free Software Foundation.
 
 KVM := ../../../virt/kvm
-common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o
+common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o $(KVM)/irqchip.o
 
 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
index 6f9cfa5..03a05ff 100644 (file)
@@ -18,6 +18,7 @@
 #include "kvm-s390.h"
 #include "trace.h"
 #include "trace-s390.h"
+#include "gaccess.h"
 
 static int diag_release_pages(struct kvm_vcpu *vcpu)
 {
@@ -47,6 +48,87 @@ static int diag_release_pages(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
+{
+       struct prs_parm {
+               u16 code;
+               u16 subcode;
+               u16 parm_len;
+               u16 parm_version;
+               u64 token_addr;
+               u64 select_mask;
+               u64 compare_mask;
+               u64 zarch;
+       };
+       struct prs_parm parm;
+       int rc;
+       u16 rx = (vcpu->arch.sie_block->ipa & 0xf0) >> 4;
+       u16 ry = (vcpu->arch.sie_block->ipa & 0x0f);
+       unsigned long hva_token = KVM_HVA_ERR_BAD;
+
+       if (vcpu->run->s.regs.gprs[rx] & 7)
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+       if (copy_from_guest(vcpu, &parm, vcpu->run->s.regs.gprs[rx], sizeof(parm)))
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       if (parm.parm_version != 2 || parm.parm_len < 5 || parm.code != 0x258)
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+       switch (parm.subcode) {
+       case 0: /* TOKEN */
+               if (vcpu->arch.pfault_token != KVM_S390_PFAULT_TOKEN_INVALID) {
+                       /*
+                        * If the pagefault handshake is already activated,
+                        * the token must not be changed.  We have to return
+                        * decimal 8 instead, as mandated in SC24-6084.
+                        */
+                       vcpu->run->s.regs.gprs[ry] = 8;
+                       return 0;
+               }
+
+               if ((parm.compare_mask & parm.select_mask) != parm.compare_mask ||
+                   parm.token_addr & 7 || parm.zarch != 0x8000000000000000ULL)
+                       return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+               hva_token = gfn_to_hva(vcpu->kvm, gpa_to_gfn(parm.token_addr));
+               if (kvm_is_error_hva(hva_token))
+                       return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+               vcpu->arch.pfault_token = parm.token_addr;
+               vcpu->arch.pfault_select = parm.select_mask;
+               vcpu->arch.pfault_compare = parm.compare_mask;
+               vcpu->run->s.regs.gprs[ry] = 0;
+               rc = 0;
+               break;
+       case 1: /*
+                * CANCEL
+                * The specification allows already-pending tokens to survive
+                * the cancel, so to reduce code complexity we assume all
+                * outstanding tokens are already pending.
+                */
+               if (parm.token_addr || parm.select_mask ||
+                   parm.compare_mask || parm.zarch)
+                       return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+               vcpu->run->s.regs.gprs[ry] = 0;
+               /*
+                * If pfault handling was not established or is already
+                * canceled, SC24-6084 requires that decimal 4 be returned.
+                */
+               if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
+                       vcpu->run->s.regs.gprs[ry] = 4;
+               else
+                       vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
+
+               rc = 0;
+               break;
+       default:
+               rc = -EOPNOTSUPP;
+               break;
+       }
+
+       return rc;
+}
+
 static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
 {
        VCPU_EVENT(vcpu, 5, "%s", "diag time slice end");
@@ -153,6 +235,8 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
                return __diag_time_slice_end(vcpu);
        case 0x9c:
                return __diag_time_slice_end_directed(vcpu);
+       case 0x258:
+               return __diag_page_ref_service(vcpu);
        case 0x308:
                return __diag_ipl_functions(vcpu);
        case 0x500:
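
For reference, the parameter block parsed by __diag_page_ref_service() above has the following guest-side shape. This is a hedged sketch only: the struct and helper names are invented for illustration, and the real guest implementation additionally installs an exception-table entry around the diagnose, which is omitted here.

#include <linux/types.h>

/* Constraints enforced by the handler above: code 0x258, version 2, a
 * length of at least 5 doublewords, an 8-byte-aligned block and token
 * address, compare_mask a subset of select_mask, and the zarch bit set.
 */
struct pfault_refbk {
	__u16 code;		/* 0x258 */
	__u16 subcode;		/* 0 = TOKEN (establish), 1 = CANCEL */
	__u16 parm_len;		/* block length in doublewords, >= 5 */
	__u16 parm_version;	/* must be 2 */
	__u64 token_addr;	/* guest address of the token, 8-byte aligned */
	__u64 select_mask;	/* PSW mask bits that select pfault delivery */
	__u64 compare_mask;	/* must be a subset of select_mask */
	__u64 zarch;		/* 0x8000000000000000ULL */
} __attribute__((aligned(8)));

/* DIAG rx,ry,0x258: rx carries the block address, ry returns the code
 * (for subcode TOKEN: 0 = established, 8 = handshake already active).
 */
static long pfault_token_establish(struct pfault_refbk *refbk)
{
	long rc = -1;

	asm volatile("diag %1,%0,0x258"
		     : "+d" (rc)
		     : "a" (refbk), "m" (*refbk)
		     : "cc");
	return rc;
}
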
index 5f79d2d..200a8f9 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * handling kvm guest interrupts
  *
- * Copyright IBM Corp. 2008
+ * Copyright IBM Corp. 2008,2014
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License (version 2 only)
@@ -13,6 +13,7 @@
 #include <linux/interrupt.h>
 #include <linux/kvm_host.h>
 #include <linux/hrtimer.h>
+#include <linux/mmu_context.h>
 #include <linux/signal.h>
 #include <linux/slab.h>
 #include <asm/asm-offsets.h>
@@ -31,7 +32,7 @@ static int is_ioint(u64 type)
        return ((type & 0xfffe0000u) != 0xfffe0000u);
 }
 
-static int psw_extint_disabled(struct kvm_vcpu *vcpu)
+int psw_extint_disabled(struct kvm_vcpu *vcpu)
 {
        return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT);
 }
@@ -78,11 +79,8 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
                        return 1;
                return 0;
        case KVM_S390_INT_SERVICE:
-               if (psw_extint_disabled(vcpu))
-                       return 0;
-               if (vcpu->arch.sie_block->gcr[0] & 0x200ul)
-                       return 1;
-               return 0;
+       case KVM_S390_INT_PFAULT_INIT:
+       case KVM_S390_INT_PFAULT_DONE:
        case KVM_S390_INT_VIRTIO:
                if (psw_extint_disabled(vcpu))
                        return 0;
@@ -117,14 +115,12 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
 
 static void __set_cpu_idle(struct kvm_vcpu *vcpu)
 {
-       BUG_ON(vcpu->vcpu_id > KVM_MAX_VCPUS - 1);
        atomic_set_mask(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
        set_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
 }
 
 static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
 {
-       BUG_ON(vcpu->vcpu_id > KVM_MAX_VCPUS - 1);
        atomic_clear_mask(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
        clear_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
 }
@@ -150,6 +146,8 @@ static void __set_intercept_indicator(struct kvm_vcpu *vcpu,
        case KVM_S390_INT_EXTERNAL_CALL:
        case KVM_S390_INT_EMERGENCY:
        case KVM_S390_INT_SERVICE:
+       case KVM_S390_INT_PFAULT_INIT:
+       case KVM_S390_INT_PFAULT_DONE:
        case KVM_S390_INT_VIRTIO:
                if (psw_extint_disabled(vcpu))
                        __set_cpuflag(vcpu, CPUSTAT_EXT_INT);
@@ -223,6 +221,30 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                rc |= put_guest(vcpu, inti->ext.ext_params,
                                (u32 __user *)__LC_EXT_PARAMS);
                break;
+       case KVM_S390_INT_PFAULT_INIT:
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 0,
+                                                inti->ext.ext_params2);
+               rc  = put_guest(vcpu, 0x2603, (u16 __user *) __LC_EXT_INT_CODE);
+               rc |= put_guest(vcpu, 0x0600, (u16 __user *) __LC_EXT_CPU_ADDR);
+               rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+                                   &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                                     __LC_EXT_NEW_PSW, sizeof(psw_t));
+               rc |= put_guest(vcpu, inti->ext.ext_params2,
+                               (u64 __user *) __LC_EXT_PARAMS2);
+               break;
+       case KVM_S390_INT_PFAULT_DONE:
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 0,
+                                                inti->ext.ext_params2);
+               rc  = put_guest(vcpu, 0x2603, (u16 __user *) __LC_EXT_INT_CODE);
+               rc |= put_guest(vcpu, 0x0680, (u16 __user *) __LC_EXT_CPU_ADDR);
+               rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+                                   &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                                     __LC_EXT_NEW_PSW, sizeof(psw_t));
+               rc |= put_guest(vcpu, inti->ext.ext_params2,
+                               (u64 __user *) __LC_EXT_PARAMS2);
+               break;
        case KVM_S390_INT_VIRTIO:
                VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx",
                           inti->ext.ext_params, inti->ext.ext_params2);
@@ -357,7 +379,7 @@ static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu)
        return 1;
 }
 
-static int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
+int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
 {
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
        struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int;
@@ -482,11 +504,26 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
        struct kvm_vcpu *vcpu;
 
        vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
+       vcpu->preempted = true;
        tasklet_schedule(&vcpu->arch.tasklet);
 
        return HRTIMER_NORESTART;
 }
 
+void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu)
+{
+       struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+       struct kvm_s390_interrupt_info  *n, *inti = NULL;
+
+       spin_lock_bh(&li->lock);
+       list_for_each_entry_safe(inti, n, &li->list, list) {
+               list_del(&inti->list);
+               kfree(inti);
+       }
+       atomic_set(&li->active, 0);
+       spin_unlock_bh(&li->lock);
+}
+
 void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
 {
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -528,6 +565,7 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
                        list_for_each_entry_safe(inti, n, &fi->list, list) {
                                if (__interrupt_is_deliverable(vcpu, inti)) {
                                        list_del(&inti->list);
+                                       fi->irq_count--;
                                        deliver = 1;
                                        break;
                                }
@@ -583,6 +621,7 @@ void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu)
                                if ((inti->type == KVM_S390_MCHK) &&
                                    __interrupt_is_deliverable(vcpu, inti)) {
                                        list_del(&inti->list);
+                                       fi->irq_count--;
                                        deliver = 1;
                                        break;
                                }
@@ -650,8 +689,10 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
                inti = iter;
                break;
        }
-       if (inti)
+       if (inti) {
                list_del_init(&inti->list);
+               fi->irq_count--;
+       }
        if (list_empty(&fi->list))
                atomic_set(&fi->active, 0);
        spin_unlock(&fi->lock);
@@ -659,53 +700,101 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
        return inti;
 }
 
-int kvm_s390_inject_vm(struct kvm *kvm,
-                      struct kvm_s390_interrupt *s390int)
+static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
 {
        struct kvm_s390_local_interrupt *li;
        struct kvm_s390_float_interrupt *fi;
-       struct kvm_s390_interrupt_info *inti, *iter;
+       struct kvm_s390_interrupt_info *iter;
+       struct kvm_vcpu *dst_vcpu = NULL;
        int sigcpu;
+       int rc = 0;
+
+       mutex_lock(&kvm->lock);
+       fi = &kvm->arch.float_int;
+       spin_lock(&fi->lock);
+       if (fi->irq_count >= KVM_S390_MAX_FLOAT_IRQS) {
+               rc = -EINVAL;
+               goto unlock_fi;
+       }
+       fi->irq_count++;
+       if (!is_ioint(inti->type)) {
+               list_add_tail(&inti->list, &fi->list);
+       } else {
+               u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
+
+               /* Keep I/O interrupts sorted in isc order. */
+               list_for_each_entry(iter, &fi->list, list) {
+                       if (!is_ioint(iter->type))
+                               continue;
+                       if (int_word_to_isc_bits(iter->io.io_int_word)
+                           <= isc_bits)
+                               continue;
+                       break;
+               }
+               list_add_tail(&inti->list, &iter->list);
+       }
+       atomic_set(&fi->active, 1);
+       sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
+       if (sigcpu == KVM_MAX_VCPUS) {
+               do {
+                       sigcpu = fi->next_rr_cpu++;
+                       if (sigcpu == KVM_MAX_VCPUS)
+                               sigcpu = fi->next_rr_cpu = 0;
+               } while (kvm_get_vcpu(kvm, sigcpu) == NULL);
+       }
+       dst_vcpu = kvm_get_vcpu(kvm, sigcpu);
+       li = &dst_vcpu->arch.local_int;
+       spin_lock_bh(&li->lock);
+       atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
+       if (waitqueue_active(li->wq))
+               wake_up_interruptible(li->wq);
+       kvm_get_vcpu(kvm, sigcpu)->preempted = true;
+       spin_unlock_bh(&li->lock);
+unlock_fi:
+       spin_unlock(&fi->lock);
+       mutex_unlock(&kvm->lock);
+       return rc;
+}
+
+int kvm_s390_inject_vm(struct kvm *kvm,
+                      struct kvm_s390_interrupt *s390int)
+{
+       struct kvm_s390_interrupt_info *inti;
 
        inti = kzalloc(sizeof(*inti), GFP_KERNEL);
        if (!inti)
                return -ENOMEM;
 
-       switch (s390int->type) {
+       inti->type = s390int->type;
+       switch (inti->type) {
        case KVM_S390_INT_VIRTIO:
                VM_EVENT(kvm, 5, "inject: virtio parm:%x,parm64:%llx",
                         s390int->parm, s390int->parm64);
-               inti->type = s390int->type;
                inti->ext.ext_params = s390int->parm;
                inti->ext.ext_params2 = s390int->parm64;
                break;
        case KVM_S390_INT_SERVICE:
                VM_EVENT(kvm, 5, "inject: sclp parm:%x", s390int->parm);
-               inti->type = s390int->type;
                inti->ext.ext_params = s390int->parm;
                break;
-       case KVM_S390_PROGRAM_INT:
-       case KVM_S390_SIGP_STOP:
-       case KVM_S390_INT_EXTERNAL_CALL:
-       case KVM_S390_INT_EMERGENCY:
-               kfree(inti);
-               return -EINVAL;
+       case KVM_S390_INT_PFAULT_DONE:
+               inti->type = s390int->type;
+               inti->ext.ext_params2 = s390int->parm64;
+               break;
        case KVM_S390_MCHK:
                VM_EVENT(kvm, 5, "inject: machine check parm64:%llx",
                         s390int->parm64);
-               inti->type = s390int->type;
                inti->mchk.cr14 = s390int->parm; /* upper bits are not used */
                inti->mchk.mcic = s390int->parm64;
                break;
        case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
-               if (s390int->type & IOINT_AI_MASK)
+               if (inti->type & IOINT_AI_MASK)
                        VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
                else
                        VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
                                 s390int->type & IOINT_CSSID_MASK,
                                 s390int->type & IOINT_SSID_MASK,
                                 s390int->type & IOINT_SCHID_MASK);
-               inti->type = s390int->type;
                inti->io.subchannel_id = s390int->parm >> 16;
                inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
                inti->io.io_int_parm = s390int->parm64 >> 32;
@@ -718,43 +807,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
        trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
                                 2);
 
-       mutex_lock(&kvm->lock);
-       fi = &kvm->arch.float_int;
-       spin_lock(&fi->lock);
-       if (!is_ioint(inti->type))
-               list_add_tail(&inti->list, &fi->list);
-       else {
-               u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
-
-               /* Keep I/O interrupts sorted in isc order. */
-               list_for_each_entry(iter, &fi->list, list) {
-                       if (!is_ioint(iter->type))
-                               continue;
-                       if (int_word_to_isc_bits(iter->io.io_int_word)
-                           <= isc_bits)
-                               continue;
-                       break;
-               }
-               list_add_tail(&inti->list, &iter->list);
-       }
-       atomic_set(&fi->active, 1);
-       sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
-       if (sigcpu == KVM_MAX_VCPUS) {
-               do {
-                       sigcpu = fi->next_rr_cpu++;
-                       if (sigcpu == KVM_MAX_VCPUS)
-                               sigcpu = fi->next_rr_cpu = 0;
-               } while (fi->local_int[sigcpu] == NULL);
-       }
-       li = fi->local_int[sigcpu];
-       spin_lock_bh(&li->lock);
-       atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
-       if (waitqueue_active(li->wq))
-               wake_up_interruptible(li->wq);
-       spin_unlock_bh(&li->lock);
-       spin_unlock(&fi->lock);
-       mutex_unlock(&kvm->lock);
-       return 0;
+       return __inject_vm(kvm, inti);
 }
 
 int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
@@ -814,6 +867,10 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
                inti->type = s390int->type;
                inti->mchk.mcic = s390int->parm64;
                break;
+       case KVM_S390_INT_PFAULT_INIT:
+               inti->type = s390int->type;
+               inti->ext.ext_params2 = s390int->parm64;
+               break;
        case KVM_S390_INT_VIRTIO:
        case KVM_S390_INT_SERVICE:
        case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
@@ -837,7 +894,528 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
        atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
        if (waitqueue_active(&vcpu->wq))
                wake_up_interruptible(&vcpu->wq);
+       vcpu->preempted = true;
        spin_unlock_bh(&li->lock);
        mutex_unlock(&vcpu->kvm->lock);
        return 0;
 }
+
+static void clear_floating_interrupts(struct kvm *kvm)
+{
+       struct kvm_s390_float_interrupt *fi;
+       struct kvm_s390_interrupt_info  *n, *inti = NULL;
+
+       mutex_lock(&kvm->lock);
+       fi = &kvm->arch.float_int;
+       spin_lock(&fi->lock);
+       list_for_each_entry_safe(inti, n, &fi->list, list) {
+               list_del(&inti->list);
+               kfree(inti);
+       }
+       fi->irq_count = 0;
+       atomic_set(&fi->active, 0);
+       spin_unlock(&fi->lock);
+       mutex_unlock(&kvm->lock);
+}
+
+static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti,
+                                  u8 *addr)
+{
+       struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
+       struct kvm_s390_irq irq = {0};
+
+       irq.type = inti->type;
+       switch (inti->type) {
+       case KVM_S390_INT_PFAULT_INIT:
+       case KVM_S390_INT_PFAULT_DONE:
+       case KVM_S390_INT_VIRTIO:
+       case KVM_S390_INT_SERVICE:
+               irq.u.ext = inti->ext;
+               break;
+       case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+               irq.u.io = inti->io;
+               break;
+       case KVM_S390_MCHK:
+               irq.u.mchk = inti->mchk;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if (copy_to_user(uptr, &irq, sizeof(irq)))
+               return -EFAULT;
+
+       return 0;
+}
+
+static int get_all_floating_irqs(struct kvm *kvm, __u8 *buf, __u64 len)
+{
+       struct kvm_s390_interrupt_info *inti;
+       struct kvm_s390_float_interrupt *fi;
+       int ret = 0;
+       int n = 0;
+
+       mutex_lock(&kvm->lock);
+       fi = &kvm->arch.float_int;
+       spin_lock(&fi->lock);
+
+       list_for_each_entry(inti, &fi->list, list) {
+               if (len < sizeof(struct kvm_s390_irq)) {
+                       /* signal userspace to try again */
+                       ret = -ENOMEM;
+                       break;
+               }
+               ret = copy_irq_to_user(inti, buf);
+               if (ret)
+                       break;
+               buf += sizeof(struct kvm_s390_irq);
+               len -= sizeof(struct kvm_s390_irq);
+               n++;
+       }
+
+       spin_unlock(&fi->lock);
+       mutex_unlock(&kvm->lock);
+
+       return ret < 0 ? ret : n;
+}
+
+static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+       int r;
+
+       switch (attr->group) {
+       case KVM_DEV_FLIC_GET_ALL_IRQS:
+               r = get_all_floating_irqs(dev->kvm, (u8 *) attr->addr,
+                                         attr->attr);
+               break;
+       default:
+               r = -EINVAL;
+       }
+
+       return r;
+}
+
+static inline int copy_irq_from_user(struct kvm_s390_interrupt_info *inti,
+                                    u64 addr)
+{
+       struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
+       void *target = NULL;
+       void __user *source;
+       u64 size;
+
+       if (get_user(inti->type, (u64 __user *)addr))
+               return -EFAULT;
+
+       switch (inti->type) {
+       case KVM_S390_INT_PFAULT_INIT:
+       case KVM_S390_INT_PFAULT_DONE:
+       case KVM_S390_INT_VIRTIO:
+       case KVM_S390_INT_SERVICE:
+               target = (void *) &inti->ext;
+               source = &uptr->u.ext;
+               size = sizeof(inti->ext);
+               break;
+       case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+               target = (void *) &inti->io;
+               source = &uptr->u.io;
+               size = sizeof(inti->io);
+               break;
+       case KVM_S390_MCHK:
+               target = (void *) &inti->mchk;
+               source = &uptr->u.mchk;
+               size = sizeof(inti->mchk);
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if (copy_from_user(target, source, size))
+               return -EFAULT;
+
+       return 0;
+}
+
+static int enqueue_floating_irq(struct kvm_device *dev,
+                               struct kvm_device_attr *attr)
+{
+       struct kvm_s390_interrupt_info *inti = NULL;
+       int r = 0;
+       int len = attr->attr;
+
+       if (len % sizeof(struct kvm_s390_irq) != 0)
+               return -EINVAL;
+       else if (len > KVM_S390_FLIC_MAX_BUFFER)
+               return -EINVAL;
+
+       while (len >= sizeof(struct kvm_s390_irq)) {
+               inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+               if (!inti)
+                       return -ENOMEM;
+
+               r = copy_irq_from_user(inti, attr->addr);
+               if (r) {
+                       kfree(inti);
+                       return r;
+               }
+               r = __inject_vm(dev->kvm, inti);
+               if (r) {
+                       kfree(inti);
+                       return r;
+               }
+               len -= sizeof(struct kvm_s390_irq);
+               attr->addr += sizeof(struct kvm_s390_irq);
+       }
+
+       return r;
+}
+
+static struct s390_io_adapter *get_io_adapter(struct kvm *kvm, unsigned int id)
+{
+       if (id >= MAX_S390_IO_ADAPTERS)
+               return NULL;
+       return kvm->arch.adapters[id];
+}
+
+static int register_io_adapter(struct kvm_device *dev,
+                              struct kvm_device_attr *attr)
+{
+       struct s390_io_adapter *adapter;
+       struct kvm_s390_io_adapter adapter_info;
+
+       if (copy_from_user(&adapter_info,
+                          (void __user *)attr->addr, sizeof(adapter_info)))
+               return -EFAULT;
+
+       if ((adapter_info.id >= MAX_S390_IO_ADAPTERS) ||
+           (dev->kvm->arch.adapters[adapter_info.id] != NULL))
+               return -EINVAL;
+
+       adapter = kzalloc(sizeof(*adapter), GFP_KERNEL);
+       if (!adapter)
+               return -ENOMEM;
+
+       INIT_LIST_HEAD(&adapter->maps);
+       init_rwsem(&adapter->maps_lock);
+       atomic_set(&adapter->nr_maps, 0);
+       adapter->id = adapter_info.id;
+       adapter->isc = adapter_info.isc;
+       adapter->maskable = adapter_info.maskable;
+       adapter->masked = false;
+       adapter->swap = adapter_info.swap;
+       dev->kvm->arch.adapters[adapter->id] = adapter;
+
+       return 0;
+}
+
+int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked)
+{
+       int ret;
+       struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
+
+       if (!adapter || !adapter->maskable)
+               return -EINVAL;
+       ret = adapter->masked;
+       adapter->masked = masked;
+       return ret;
+}
+
+static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr)
+{
+       struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
+       struct s390_map_info *map;
+       int ret;
+
+       if (!adapter || !addr)
+               return -EINVAL;
+
+       map = kzalloc(sizeof(*map), GFP_KERNEL);
+       if (!map) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       INIT_LIST_HEAD(&map->list);
+       map->guest_addr = addr;
+       map->addr = gmap_translate(addr, kvm->arch.gmap);
+       if (map->addr == -EFAULT) {
+               ret = -EFAULT;
+               goto out;
+       }
+       ret = get_user_pages_fast(map->addr, 1, 1, &map->page);
+       if (ret < 0)
+               goto out;
+       BUG_ON(ret != 1);
+       down_write(&adapter->maps_lock);
+       if (atomic_inc_return(&adapter->nr_maps) < MAX_S390_ADAPTER_MAPS) {
+               list_add_tail(&map->list, &adapter->maps);
+               ret = 0;
+       } else {
+               put_page(map->page);
+               ret = -EINVAL;
+       }
+       up_write(&adapter->maps_lock);
+out:
+       if (ret)
+               kfree(map);
+       return ret;
+}
+
+static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr)
+{
+       struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
+       struct s390_map_info *map, *tmp;
+       int found = 0;
+
+       if (!adapter || !addr)
+               return -EINVAL;
+
+       down_write(&adapter->maps_lock);
+       list_for_each_entry_safe(map, tmp, &adapter->maps, list) {
+               if (map->guest_addr == addr) {
+                       found = 1;
+                       atomic_dec(&adapter->nr_maps);
+                       list_del(&map->list);
+                       put_page(map->page);
+                       kfree(map);
+                       break;
+               }
+       }
+       up_write(&adapter->maps_lock);
+
+       return found ? 0 : -EINVAL;
+}
+
+void kvm_s390_destroy_adapters(struct kvm *kvm)
+{
+       int i;
+       struct s390_map_info *map, *tmp;
+
+       for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) {
+               if (!kvm->arch.adapters[i])
+                       continue;
+               list_for_each_entry_safe(map, tmp,
+                                        &kvm->arch.adapters[i]->maps, list) {
+                       list_del(&map->list);
+                       put_page(map->page);
+                       kfree(map);
+               }
+               kfree(kvm->arch.adapters[i]);
+       }
+}
+
+static int modify_io_adapter(struct kvm_device *dev,
+                            struct kvm_device_attr *attr)
+{
+       struct kvm_s390_io_adapter_req req;
+       struct s390_io_adapter *adapter;
+       int ret;
+
+       if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req)))
+               return -EFAULT;
+
+       adapter = get_io_adapter(dev->kvm, req.id);
+       if (!adapter)
+               return -EINVAL;
+       switch (req.type) {
+       case KVM_S390_IO_ADAPTER_MASK:
+               ret = kvm_s390_mask_adapter(dev->kvm, req.id, req.mask);
+               if (ret > 0)
+                       ret = 0;
+               break;
+       case KVM_S390_IO_ADAPTER_MAP:
+               ret = kvm_s390_adapter_map(dev->kvm, req.id, req.addr);
+               break;
+       case KVM_S390_IO_ADAPTER_UNMAP:
+               ret = kvm_s390_adapter_unmap(dev->kvm, req.id, req.addr);
+               break;
+       default:
+               ret = -EINVAL;
+       }
+
+       return ret;
+}
+
+static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+       int r = 0;
+       unsigned int i;
+       struct kvm_vcpu *vcpu;
+
+       switch (attr->group) {
+       case KVM_DEV_FLIC_ENQUEUE:
+               r = enqueue_floating_irq(dev, attr);
+               break;
+       case KVM_DEV_FLIC_CLEAR_IRQS:
+               r = 0;
+               clear_floating_interrupts(dev->kvm);
+               break;
+       case KVM_DEV_FLIC_APF_ENABLE:
+               dev->kvm->arch.gmap->pfault_enabled = 1;
+               break;
+       case KVM_DEV_FLIC_APF_DISABLE_WAIT:
+               dev->kvm->arch.gmap->pfault_enabled = 0;
+               /*
+                * Make sure no async faults are in transition when
+                * clearing the queues, so we don't need to worry
+                * about late-arriving workers.
+                */
+               synchronize_srcu(&dev->kvm->srcu);
+               kvm_for_each_vcpu(i, vcpu, dev->kvm)
+                       kvm_clear_async_pf_completion_queue(vcpu);
+               break;
+       case KVM_DEV_FLIC_ADAPTER_REGISTER:
+               r = register_io_adapter(dev, attr);
+               break;
+       case KVM_DEV_FLIC_ADAPTER_MODIFY:
+               r = modify_io_adapter(dev, attr);
+               break;
+       default:
+               r = -EINVAL;
+       }
+
+       return r;
+}
+
+static int flic_create(struct kvm_device *dev, u32 type)
+{
+       if (!dev)
+               return -EINVAL;
+       if (dev->kvm->arch.flic)
+               return -EINVAL;
+       dev->kvm->arch.flic = dev;
+       return 0;
+}
+
+static void flic_destroy(struct kvm_device *dev)
+{
+       dev->kvm->arch.flic = NULL;
+       kfree(dev);
+}
+
+/* s390 floating irq controller (flic) */
+struct kvm_device_ops kvm_flic_ops = {
+       .name = "kvm-flic",
+       .get_attr = flic_get_attr,
+       .set_attr = flic_set_attr,
+       .create = flic_create,
+       .destroy = flic_destroy,
+};
+
+static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap)
+{
+       unsigned long bit;
+
+       bit = bit_nr + (addr % PAGE_SIZE) * 8;
+
+       return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit;
+}
+
+static struct s390_map_info *get_map_info(struct s390_io_adapter *adapter,
+                                         u64 addr)
+{
+       struct s390_map_info *map;
+
+       if (!adapter)
+               return NULL;
+
+       list_for_each_entry(map, &adapter->maps, list) {
+               if (map->guest_addr == addr)
+                       return map;
+       }
+       return NULL;
+}
+
+static int adapter_indicators_set(struct kvm *kvm,
+                                 struct s390_io_adapter *adapter,
+                                 struct kvm_s390_adapter_int *adapter_int)
+{
+       unsigned long bit;
+       int summary_set, idx;
+       struct s390_map_info *info;
+       void *map;
+
+       info = get_map_info(adapter, adapter_int->ind_addr);
+       if (!info)
+               return -1;
+       map = page_address(info->page);
+       bit = get_ind_bit(info->addr, adapter_int->ind_offset, adapter->swap);
+       set_bit(bit, map);
+       idx = srcu_read_lock(&kvm->srcu);
+       mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT);
+       set_page_dirty_lock(info->page);
+       info = get_map_info(adapter, adapter_int->summary_addr);
+       if (!info) {
+               srcu_read_unlock(&kvm->srcu, idx);
+               return -1;
+       }
+       map = page_address(info->page);
+       bit = get_ind_bit(info->addr, adapter_int->summary_offset,
+                         adapter->swap);
+       summary_set = test_and_set_bit(bit, map);
+       mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT);
+       set_page_dirty_lock(info->page);
+       srcu_read_unlock(&kvm->srcu, idx);
+       return summary_set ? 0 : 1;
+}
+
+/*
+ * < 0 - not injected due to error
+ * = 0 - coalesced, summary indicator already active
+ * > 0 - injected interrupt
+ */
+static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
+                          struct kvm *kvm, int irq_source_id, int level,
+                          bool line_status)
+{
+       int ret;
+       struct s390_io_adapter *adapter;
+
+       /* We're only interested in the 0->1 transition. */
+       if (!level)
+               return 0;
+       adapter = get_io_adapter(kvm, e->adapter.adapter_id);
+       if (!adapter)
+               return -1;
+       down_read(&adapter->maps_lock);
+       ret = adapter_indicators_set(kvm, adapter, &e->adapter);
+       up_read(&adapter->maps_lock);
+       if ((ret > 0) && !adapter->masked) {
+               struct kvm_s390_interrupt s390int = {
+                       .type = KVM_S390_INT_IO(1, 0, 0, 0),
+                       .parm = 0,
+                       .parm64 = (adapter->isc << 27) | 0x80000000,
+               };
+               ret = kvm_s390_inject_vm(kvm, &s390int);
+               if (ret == 0)
+                       ret = 1;
+       }
+       return ret;
+}
+
+int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
+                         struct kvm_kernel_irq_routing_entry *e,
+                         const struct kvm_irq_routing_entry *ue)
+{
+       int ret;
+
+       switch (ue->type) {
+       case KVM_IRQ_ROUTING_S390_ADAPTER:
+               e->set = set_adapter_int;
+               e->adapter.summary_addr = ue->u.adapter.summary_addr;
+               e->adapter.ind_addr = ue->u.adapter.ind_addr;
+               e->adapter.summary_offset = ue->u.adapter.summary_offset;
+               e->adapter.ind_offset = ue->u.adapter.ind_offset;
+               e->adapter.adapter_id = ue->u.adapter.adapter_id;
+               ret = 0;
+               break;
+       default:
+               ret = -EINVAL;
+       }
+
+       return ret;
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+               int irq_source_id, int level, bool line_status)
+{
+       return -EINVAL;
+}
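
The flic paths above (flic_create(), flic_set_attr() with KVM_DEV_FLIC_ENQUEUE, __inject_vm()) are driven from userspace through the generic device-control API. A minimal sketch, assuming vm_fd is an open VM file descriptor; the helper name is invented and error cleanup is omitted.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Create the floating interrupt controller and enqueue one floating
 * service (SCLP) interrupt.  enqueue_floating_irq() above walks the
 * supplied buffer and feeds each entry to __inject_vm().
 */
static int flic_enqueue_service_irq(int vm_fd, uint32_t sclp_parm)
{
	struct kvm_create_device cd = { .type = KVM_DEV_TYPE_FLIC };
	struct kvm_device_attr attr;
	struct kvm_s390_irq irq;

	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
		return -1;
	/* cd.fd now refers to the new flic device */

	memset(&irq, 0, sizeof(irq));
	irq.type = KVM_S390_INT_SERVICE;
	irq.u.ext.ext_params = sclp_parm;

	memset(&attr, 0, sizeof(attr));
	attr.group = KVM_DEV_FLIC_ENQUEUE;
	attr.addr  = (uintptr_t)&irq;
	attr.attr  = sizeof(irq);	/* buffer length, a multiple of sizeof(struct kvm_s390_irq) */

	return ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);
}
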
diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h
new file mode 100644 (file)
index 0000000..d98e415
--- /dev/null
@@ -0,0 +1,22 @@
+/*
+ * s390 irqchip routines
+ *
+ * Copyright IBM Corp. 2014
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
+ */
+#ifndef __KVM_IRQ_H
+#define __KVM_IRQ_H
+
+#include <linux/kvm_host.h>
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+       return 1;
+}
+
+#endif
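
Adapter interrupt sources registered through KVM_DEV_FLIC_ADAPTER_REGISTER are hooked up to guest indicator bits via the new KVM_IRQ_ROUTING_S390_ADAPTER routing type handled by kvm_set_routing_entry() above. A hedged userspace sketch of programming one such route: vm_fd is an assumed open VM file descriptor, the offsets are placeholders, and the helper name is invented.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Route one GSI to an s390 adapter interrupt source.  Raising the GSI
 * (for example via KVM_IRQFD) then reaches set_adapter_int(), which sets
 * the indicator and summary bits and injects an I/O interrupt if needed.
 */
static int route_adapter_gsi(int vm_fd, uint32_t gsi, uint32_t adapter_id,
			     uint64_t ind_addr, uint64_t summary_addr)
{
	struct kvm_irq_routing *routing;
	struct kvm_irq_routing_entry *e;
	int ret;

	routing = calloc(1, sizeof(*routing) + sizeof(*e));
	if (!routing)
		return -1;

	routing->nr = 1;
	e = &routing->entries[0];
	e->gsi  = gsi;
	e->type = KVM_IRQ_ROUTING_S390_ADAPTER;
	e->u.adapter.ind_addr       = ind_addr;		/* indicator area */
	e->u.adapter.summary_addr   = summary_addr;	/* summary indicator */
	e->u.adapter.ind_offset     = 0;		/* placeholder offsets */
	e->u.adapter.summary_offset = 0;
	e->u.adapter.adapter_id     = adapter_id;

	ret = ioctl(vm_fd, KVM_SET_GSI_ROUTING, routing);
	free(routing);
	return ret;
}
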
index 10b5db3..b3ecb8f 100644 (file)
@@ -153,11 +153,14 @@ int kvm_dev_ioctl_check_extension(long ext)
 #ifdef CONFIG_KVM_S390_UCONTROL
        case KVM_CAP_S390_UCONTROL:
 #endif
+       case KVM_CAP_ASYNC_PF:
        case KVM_CAP_SYNC_REGS:
        case KVM_CAP_ONE_REG:
        case KVM_CAP_ENABLE_CAP:
        case KVM_CAP_S390_CSS_SUPPORT:
        case KVM_CAP_IOEVENTFD:
+       case KVM_CAP_DEVICE_CTRL:
+       case KVM_CAP_ENABLE_CAP_VM:
                r = 1;
                break;
        case KVM_CAP_NR_VCPUS:
@@ -186,6 +189,25 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
        return 0;
 }
 
+static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
+{
+       int r;
+
+       if (cap->flags)
+               return -EINVAL;
+
+       switch (cap->cap) {
+       case KVM_CAP_S390_IRQCHIP:
+               kvm->arch.use_irqchip = 1;
+               r = 0;
+               break;
+       default:
+               r = -EINVAL;
+               break;
+       }
+       return r;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg)
 {
@@ -203,6 +225,26 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = kvm_s390_inject_vm(kvm, &s390int);
                break;
        }
+       case KVM_ENABLE_CAP: {
+               struct kvm_enable_cap cap;
+               r = -EFAULT;
+               if (copy_from_user(&cap, argp, sizeof(cap)))
+                       break;
+               r = kvm_vm_ioctl_enable_cap(kvm, &cap);
+               break;
+       }
+       case KVM_CREATE_IRQCHIP: {
+               struct kvm_irq_routing_entry routing;
+
+               r = -EINVAL;
+               if (kvm->arch.use_irqchip) {
+                       /* Set up dummy routing. */
+                       memset(&routing, 0, sizeof(routing));
+                       kvm_set_irq_routing(kvm, &routing, 0, 0);
+                       r = 0;
+               }
+               break;
+       }
        default:
                r = -ENOTTY;
        }
@@ -214,6 +256,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
        int rc;
        char debug_name[16];
+       static unsigned long sca_offset;
 
        rc = -EINVAL;
 #ifdef CONFIG_KVM_S390_UCONTROL
@@ -235,6 +278,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL);
        if (!kvm->arch.sca)
                goto out_err;
+       spin_lock(&kvm_lock);
+       sca_offset = (sca_offset + 16) & 0x7f0;
+       kvm->arch.sca = (struct sca_block *) ((char *) kvm->arch.sca + sca_offset);
+       spin_unlock(&kvm_lock);
 
        sprintf(debug_name, "kvm-%u", current->pid);
 
@@ -255,9 +302,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
                if (!kvm->arch.gmap)
                        goto out_nogmap;
                kvm->arch.gmap->private = kvm;
+               kvm->arch.gmap->pfault_enabled = 0;
        }
 
        kvm->arch.css_support = 0;
+       kvm->arch.use_irqchip = 0;
 
        return 0;
 out_nogmap:
@@ -272,6 +321,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
        VCPU_EVENT(vcpu, 3, "%s", "free cpu");
        trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id);
+       kvm_clear_async_pf_completion_queue(vcpu);
        if (!kvm_is_ucontrol(vcpu->kvm)) {
                clear_bit(63 - vcpu->vcpu_id,
                          (unsigned long *) &vcpu->kvm->arch.sca->mcn);
@@ -320,11 +370,14 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        debug_unregister(kvm->arch.dbf);
        if (!kvm_is_ucontrol(kvm))
                gmap_free(kvm->arch.gmap);
+       kvm_s390_destroy_adapters(kvm);
 }
 
 /* Section: vcpu related */
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
+       vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
+       kvm_clear_async_pf_completion_queue(vcpu);
        if (kvm_is_ucontrol(vcpu->kvm)) {
                vcpu->arch.gmap = gmap_alloc(current->mm);
                if (!vcpu->arch.gmap)
@@ -385,7 +438,11 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
        vcpu->arch.guest_fpregs.fpc = 0;
        asm volatile("lfpc %0" : : "Q" (vcpu->arch.guest_fpregs.fpc));
        vcpu->arch.sie_block->gbea = 1;
+       vcpu->arch.sie_block->pp = 0;
+       vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
+       kvm_clear_async_pf_completion_queue(vcpu);
        atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+       kvm_s390_clear_local_irqs(vcpu);
 }
 
 int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
@@ -466,11 +523,8 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
        spin_lock_init(&vcpu->arch.local_int.lock);
        INIT_LIST_HEAD(&vcpu->arch.local_int.list);
        vcpu->arch.local_int.float_int = &kvm->arch.float_int;
-       spin_lock(&kvm->arch.float_int.lock);
-       kvm->arch.float_int.local_int[id] = &vcpu->arch.local_int;
        vcpu->arch.local_int.wq = &vcpu->wq;
        vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags;
-       spin_unlock(&kvm->arch.float_int.lock);
 
        rc = kvm_vcpu_init(vcpu, kvm, id);
        if (rc)
@@ -490,9 +544,7 @@ out:
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
-       /* kvm common code refers to this, but never calls it */
-       BUG();
-       return 0;
+       return kvm_cpu_has_interrupt(vcpu);
 }
 
 void s390_vcpu_block(struct kvm_vcpu *vcpu)
@@ -568,6 +620,26 @@ static int kvm_arch_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu,
                r = put_user(vcpu->arch.sie_block->ckc,
                             (u64 __user *)reg->addr);
                break;
+       case KVM_REG_S390_PFTOKEN:
+               r = put_user(vcpu->arch.pfault_token,
+                            (u64 __user *)reg->addr);
+               break;
+       case KVM_REG_S390_PFCOMPARE:
+               r = put_user(vcpu->arch.pfault_compare,
+                            (u64 __user *)reg->addr);
+               break;
+       case KVM_REG_S390_PFSELECT:
+               r = put_user(vcpu->arch.pfault_select,
+                            (u64 __user *)reg->addr);
+               break;
+       case KVM_REG_S390_PP:
+               r = put_user(vcpu->arch.sie_block->pp,
+                            (u64 __user *)reg->addr);
+               break;
+       case KVM_REG_S390_GBEA:
+               r = put_user(vcpu->arch.sie_block->gbea,
+                            (u64 __user *)reg->addr);
+               break;
        default:
                break;
        }
@@ -597,6 +669,26 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu,
                r = get_user(vcpu->arch.sie_block->ckc,
                             (u64 __user *)reg->addr);
                break;
+       case KVM_REG_S390_PFTOKEN:
+               r = get_user(vcpu->arch.pfault_token,
+                            (u64 __user *)reg->addr);
+               break;
+       case KVM_REG_S390_PFCOMPARE:
+               r = get_user(vcpu->arch.pfault_compare,
+                            (u64 __user *)reg->addr);
+               break;
+       case KVM_REG_S390_PFSELECT:
+               r = get_user(vcpu->arch.pfault_select,
+                            (u64 __user *)reg->addr);
+               break;
+       case KVM_REG_S390_PP:
+               r = get_user(vcpu->arch.sie_block->pp,
+                            (u64 __user *)reg->addr);
+               break;
+       case KVM_REG_S390_GBEA:
+               r = get_user(vcpu->arch.sie_block->gbea,
+                            (u64 __user *)reg->addr);
+               break;
        default:
                break;
        }
@@ -715,10 +807,100 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static long kvm_arch_fault_in_sync(struct kvm_vcpu *vcpu)
+{
+       long rc;
+       hva_t fault = gmap_fault(current->thread.gmap_addr, vcpu->arch.gmap);
+       struct mm_struct *mm = current->mm;
+       down_read(&mm->mmap_sem);
+       rc = get_user_pages(current, mm, fault, 1, 1, 0, NULL, NULL);
+       up_read(&mm->mmap_sem);
+       return rc;
+}
+
+static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token,
+                                     unsigned long token)
+{
+       struct kvm_s390_interrupt inti;
+       inti.parm64 = token;
+
+       if (start_token) {
+               inti.type = KVM_S390_INT_PFAULT_INIT;
+               WARN_ON_ONCE(kvm_s390_inject_vcpu(vcpu, &inti));
+       } else {
+               inti.type = KVM_S390_INT_PFAULT_DONE;
+               WARN_ON_ONCE(kvm_s390_inject_vm(vcpu->kvm, &inti));
+       }
+}
+
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+                                    struct kvm_async_pf *work)
+{
+       trace_kvm_s390_pfault_init(vcpu, work->arch.pfault_token);
+       __kvm_inject_pfault_token(vcpu, true, work->arch.pfault_token);
+}
+
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+                                struct kvm_async_pf *work)
+{
+       trace_kvm_s390_pfault_done(vcpu, work->arch.pfault_token);
+       __kvm_inject_pfault_token(vcpu, false, work->arch.pfault_token);
+}
+
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
+                              struct kvm_async_pf *work)
+{
+       /* s390 will always inject the page directly */
+}
+
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
+{
+       /*
+        * s390 will always inject the page directly,
+        * but we still want check_async_completion to clean up.
+        */
+       return true;
+}
+
+static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
+{
+       hva_t hva;
+       struct kvm_arch_async_pf arch;
+       int rc;
+
+       if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
+               return 0;
+       if ((vcpu->arch.sie_block->gpsw.mask & vcpu->arch.pfault_select) !=
+           vcpu->arch.pfault_compare)
+               return 0;
+       if (psw_extint_disabled(vcpu))
+               return 0;
+       if (kvm_cpu_has_interrupt(vcpu))
+               return 0;
+       if (!(vcpu->arch.sie_block->gcr[0] & 0x200ul))
+               return 0;
+       if (!vcpu->arch.gmap->pfault_enabled)
+               return 0;
+
+       hva = gmap_fault(current->thread.gmap_addr, vcpu->arch.gmap);
+       if (copy_from_guest(vcpu, &arch.pfault_token, vcpu->arch.pfault_token, 8))
+               return 0;
+
+       rc = kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch);
+       return rc;
+}
+
 static int vcpu_pre_run(struct kvm_vcpu *vcpu)
 {
        int rc, cpuflags;
 
+       /*
+        * On s390, notifications for arriving pages are delivered directly
+        * to the guest, but the housekeeping for completed pfaults is
+        * handled outside the worker.
+        */
+       kvm_check_async_pf_completion(vcpu);
+
        memcpy(&vcpu->arch.sie_block->gg14, &vcpu->run->s.regs.gprs[14], 16);
 
        if (need_resched())
@@ -744,7 +926,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
 
 static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
 {
-       int rc;
+       int rc = -1;
 
        VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
                   vcpu->arch.sie_block->icptcode);
@@ -758,7 +940,16 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
                                                current->thread.gmap_addr;
                vcpu->run->s390_ucontrol.pgm_code = 0x10;
                rc = -EREMOTE;
-       } else {
+
+       } else if (current->thread.gmap_pfault) {
+               trace_kvm_s390_major_guest_pfault(vcpu);
+               current->thread.gmap_pfault = 0;
+               if (kvm_arch_setup_async_pf(vcpu) ||
+                   (kvm_arch_fault_in_sync(vcpu) >= 0))
+                       rc = 0;
+       }
+
+       if (rc == -1) {
                VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
                trace_kvm_s390_sie_fault(vcpu);
                rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
@@ -768,7 +959,8 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
 
        if (rc == 0) {
                if (kvm_is_ucontrol(vcpu->kvm))
-                       rc = -EOPNOTSUPP;
+                       /* Don't exit for host interrupts. */
+                       rc = vcpu->arch.sie_block->icptcode ? -EOPNOTSUPP : 0;
                else
                        rc = kvm_handle_sie_intercept(vcpu);
        }
@@ -831,8 +1023,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
        atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
 
-       BUG_ON(vcpu->kvm->arch.float_int.local_int[vcpu->vcpu_id] == NULL);
-
        switch (kvm_run->exit_reason) {
        case KVM_EXIT_S390_SIEIC:
        case KVM_EXIT_UNKNOWN:
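
Putting the new VM-level capability plumbing together: userspace first enables KVM_CAP_S390_IRQCHIP via KVM_ENABLE_CAP (now accepted on the VM fd thanks to KVM_CAP_ENABLE_CAP_VM) and only then calls KVM_CREATE_IRQCHIP to install the dummy routing table. A minimal sketch, assuming vm_fd is an open VM file descriptor and using an invented helper name:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Opt in to the in-kernel irqchip, then install the dummy routing table
 * so GSI routes (e.g. adapter routes) can be added afterwards.
 */
static int enable_s390_irqchip(int vm_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_S390_IRQCHIP;		/* cap.flags must stay 0 */

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
		return -1;

	/* Rejected with EINVAL unless the capability above was enabled. */
	return ioctl(vm_fd, KVM_CREATE_IRQCHIP);
}
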
index 564514f..3c1e227 100644 (file)
@@ -129,6 +129,7 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
 void kvm_s390_tasklet(unsigned long parm);
 void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu);
 void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu);
+void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu);
 int __must_check kvm_s390_inject_vm(struct kvm *kvm,
                                    struct kvm_s390_interrupt *s390int);
 int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
@@ -136,6 +137,7 @@ int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
 int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
 struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
                                                    u64 cr6, u64 schid);
+int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked);
 
 /* implemented in priv.c */
 int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
@@ -161,4 +163,9 @@ bool kvm_enabled_cmma(void);
 /* implemented in diag.c */
 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
 
+/* implemented in interrupt.c */
+int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
+int psw_extint_disabled(struct kvm_vcpu *vcpu);
+void kvm_s390_destroy_adapters(struct kvm *kvm);
+
 #endif
index aacb6b1..476e9e2 100644 (file)
@@ -396,15 +396,10 @@ static int handle_stidp(struct kvm_vcpu *vcpu)
 
 static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem)
 {
-       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
        int cpus = 0;
        int n;
 
-       spin_lock(&fi->lock);
-       for (n = 0; n < KVM_MAX_VCPUS; n++)
-               if (fi->local_int[n])
-                       cpus++;
-       spin_unlock(&fi->lock);
+       cpus = atomic_read(&vcpu->kvm->online_vcpus);
 
        /* deal with other level 3 hypervisors */
        if (stsi(mem, 3, 2, 2))
index 87c2b3a..26caeb5 100644 (file)
 static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
                        u64 *reg)
 {
-       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
+       struct kvm_s390_local_interrupt *li;
+       struct kvm_vcpu *dst_vcpu = NULL;
+       int cpuflags;
        int rc;
 
        if (cpu_addr >= KVM_MAX_VCPUS)
                return SIGP_CC_NOT_OPERATIONAL;
 
-       spin_lock(&fi->lock);
-       if (fi->local_int[cpu_addr] == NULL)
-               rc = SIGP_CC_NOT_OPERATIONAL;
-       else if (!(atomic_read(fi->local_int[cpu_addr]->cpuflags)
-                  & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED)))
+       dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+       if (!dst_vcpu)
+               return SIGP_CC_NOT_OPERATIONAL;
+       li = &dst_vcpu->arch.local_int;
+
+       cpuflags = atomic_read(li->cpuflags);
+       if (!(cpuflags & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED)))
                rc = SIGP_CC_ORDER_CODE_ACCEPTED;
        else {
                *reg &= 0xffffffff00000000UL;
-               if (atomic_read(fi->local_int[cpu_addr]->cpuflags)
-                   & CPUSTAT_ECALL_PEND)
+               if (cpuflags & CPUSTAT_ECALL_PEND)
                        *reg |= SIGP_STATUS_EXT_CALL_PENDING;
-               if (atomic_read(fi->local_int[cpu_addr]->cpuflags)
-                   & CPUSTAT_STOPPED)
+               if (cpuflags & CPUSTAT_STOPPED)
                        *reg |= SIGP_STATUS_STOPPED;
                rc = SIGP_CC_STATUS_STORED;
        }
-       spin_unlock(&fi->lock);
 
        VCPU_EVENT(vcpu, 4, "sensed status of cpu %x rc %x", cpu_addr, rc);
        return rc;
@@ -53,12 +54,13 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
 
 static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
 {
-       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
        struct kvm_s390_local_interrupt *li;
        struct kvm_s390_interrupt_info *inti;
-       int rc;
+       struct kvm_vcpu *dst_vcpu = NULL;
 
-       if (cpu_addr >= KVM_MAX_VCPUS)
+       if (cpu_addr < KVM_MAX_VCPUS)
+               dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+       if (!dst_vcpu)
                return SIGP_CC_NOT_OPERATIONAL;
 
        inti = kzalloc(sizeof(*inti), GFP_KERNEL);
@@ -68,13 +70,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
        inti->type = KVM_S390_INT_EMERGENCY;
        inti->emerg.code = vcpu->vcpu_id;
 
-       spin_lock(&fi->lock);
-       li = fi->local_int[cpu_addr];
-       if (li == NULL) {
-               rc = SIGP_CC_NOT_OPERATIONAL;
-               kfree(inti);
-               goto unlock;
-       }
+       li = &dst_vcpu->arch.local_int;
        spin_lock_bh(&li->lock);
        list_add_tail(&inti->list, &li->list);
        atomic_set(&li->active, 1);
@@ -82,11 +78,9 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
        if (waitqueue_active(li->wq))
                wake_up_interruptible(li->wq);
        spin_unlock_bh(&li->lock);
-       rc = SIGP_CC_ORDER_CODE_ACCEPTED;
        VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
-unlock:
-       spin_unlock(&fi->lock);
-       return rc;
+
+       return SIGP_CC_ORDER_CODE_ACCEPTED;
 }
 
 static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr,
@@ -122,12 +116,13 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr,
 
 static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
 {
-       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
        struct kvm_s390_local_interrupt *li;
        struct kvm_s390_interrupt_info *inti;
-       int rc;
+       struct kvm_vcpu *dst_vcpu = NULL;
 
-       if (cpu_addr >= KVM_MAX_VCPUS)
+       if (cpu_addr < KVM_MAX_VCPUS)
+               dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+       if (!dst_vcpu)
                return SIGP_CC_NOT_OPERATIONAL;
 
        inti = kzalloc(sizeof(*inti), GFP_KERNEL);
@@ -137,13 +132,7 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
        inti->type = KVM_S390_INT_EXTERNAL_CALL;
        inti->extcall.code = vcpu->vcpu_id;
 
-       spin_lock(&fi->lock);
-       li = fi->local_int[cpu_addr];
-       if (li == NULL) {
-               rc = SIGP_CC_NOT_OPERATIONAL;
-               kfree(inti);
-               goto unlock;
-       }
+       li = &dst_vcpu->arch.local_int;
        spin_lock_bh(&li->lock);
        list_add_tail(&inti->list, &li->list);
        atomic_set(&li->active, 1);
@@ -151,11 +140,9 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
        if (waitqueue_active(li->wq))
                wake_up_interruptible(li->wq);
        spin_unlock_bh(&li->lock);
-       rc = SIGP_CC_ORDER_CODE_ACCEPTED;
        VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr);
-unlock:
-       spin_unlock(&fi->lock);
-       return rc;
+
+       return SIGP_CC_ORDER_CODE_ACCEPTED;
 }
 
 static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action)
@@ -189,31 +176,26 @@ out:
 
 static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action)
 {
-       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
        struct kvm_s390_local_interrupt *li;
+       struct kvm_vcpu *dst_vcpu = NULL;
        int rc;
 
        if (cpu_addr >= KVM_MAX_VCPUS)
                return SIGP_CC_NOT_OPERATIONAL;
 
-       spin_lock(&fi->lock);
-       li = fi->local_int[cpu_addr];
-       if (li == NULL) {
-               rc = SIGP_CC_NOT_OPERATIONAL;
-               goto unlock;
-       }
+       dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+       if (!dst_vcpu)
+               return SIGP_CC_NOT_OPERATIONAL;
+       li = &dst_vcpu->arch.local_int;
 
        rc = __inject_sigp_stop(li, action);
 
-unlock:
-       spin_unlock(&fi->lock);
        VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", cpu_addr);
 
        if ((action & ACTION_STORE_ON_STOP) != 0 && rc == -ESHUTDOWN) {
                /* If the CPU has already been stopped, we still have
                 * to save the status when doing stop-and-store. This
                 * has to be done after unlocking all spinlocks. */
-               struct kvm_vcpu *dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
                rc = kvm_s390_store_status_unloaded(dst_vcpu,
                                                KVM_S390_STORE_STATUS_NOADDR);
        }
@@ -224,6 +206,8 @@ unlock:
 static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
 {
        int rc;
+       unsigned int i;
+       struct kvm_vcpu *v;
 
        switch (parameter & 0xff) {
        case 0:
@@ -231,6 +215,11 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
                break;
        case 1:
        case 2:
+               kvm_for_each_vcpu(i, v, vcpu->kvm) {
+                       v->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
+                       kvm_clear_async_pf_completion_queue(v);
+               }
+
                rc = SIGP_CC_ORDER_CODE_ACCEPTED;
                break;
        default:
@@ -242,12 +231,18 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
 static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
                             u64 *reg)
 {
-       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
-       struct kvm_s390_local_interrupt *li = NULL;
+       struct kvm_s390_local_interrupt *li;
+       struct kvm_vcpu *dst_vcpu = NULL;
        struct kvm_s390_interrupt_info *inti;
        int rc;
        u8 tmp;
 
+       if (cpu_addr < KVM_MAX_VCPUS)
+               dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+       if (!dst_vcpu)
+               return SIGP_CC_NOT_OPERATIONAL;
+       li = &dst_vcpu->arch.local_int;
+
        /* make sure that the new value is valid memory */
        address = address & 0x7fffe000u;
        if (copy_from_guest_absolute(vcpu, &tmp, address, 1) ||
@@ -261,18 +256,6 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
        if (!inti)
                return SIGP_CC_BUSY;
 
-       spin_lock(&fi->lock);
-       if (cpu_addr < KVM_MAX_VCPUS)
-               li = fi->local_int[cpu_addr];
-
-       if (li == NULL) {
-               *reg &= 0xffffffff00000000UL;
-               *reg |= SIGP_STATUS_INCORRECT_STATE;
-               rc = SIGP_CC_STATUS_STORED;
-               kfree(inti);
-               goto out_fi;
-       }
-
        spin_lock_bh(&li->lock);
        /* cpu must be in stopped state */
        if (!(atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) {
@@ -295,8 +278,6 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
        VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address);
 out_li:
        spin_unlock_bh(&li->lock);
-out_fi:
-       spin_unlock(&fi->lock);
        return rc;
 }
 
@@ -334,28 +315,26 @@ static int __sigp_store_status_at_addr(struct kvm_vcpu *vcpu, u16 cpu_id,
 static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr,
                                u64 *reg)
 {
+       struct kvm_s390_local_interrupt *li;
+       struct kvm_vcpu *dst_vcpu = NULL;
        int rc;
-       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
 
        if (cpu_addr >= KVM_MAX_VCPUS)
                return SIGP_CC_NOT_OPERATIONAL;
 
-       spin_lock(&fi->lock);
-       if (fi->local_int[cpu_addr] == NULL)
-               rc = SIGP_CC_NOT_OPERATIONAL;
-       else {
-               if (atomic_read(fi->local_int[cpu_addr]->cpuflags)
-                   & CPUSTAT_RUNNING) {
-                       /* running */
-                       rc = SIGP_CC_ORDER_CODE_ACCEPTED;
-               } else {
-                       /* not running */
-                       *reg &= 0xffffffff00000000UL;
-                       *reg |= SIGP_STATUS_NOT_RUNNING;
-                       rc = SIGP_CC_STATUS_STORED;
-               }
+       dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+       if (!dst_vcpu)
+               return SIGP_CC_NOT_OPERATIONAL;
+       li = &dst_vcpu->arch.local_int;
+       if (atomic_read(li->cpuflags) & CPUSTAT_RUNNING) {
+               /* running */
+               rc = SIGP_CC_ORDER_CODE_ACCEPTED;
+       } else {
+               /* not running */
+               *reg &= 0xffffffff00000000UL;
+               *reg |= SIGP_STATUS_NOT_RUNNING;
+               rc = SIGP_CC_STATUS_STORED;
        }
-       spin_unlock(&fi->lock);
 
        VCPU_EVENT(vcpu, 4, "sensed running status of cpu %x rc %x", cpu_addr,
                   rc);
@@ -366,26 +345,22 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr,
 /* Test whether the destination CPU is available and not busy */
 static int sigp_check_callable(struct kvm_vcpu *vcpu, u16 cpu_addr)
 {
-       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
        struct kvm_s390_local_interrupt *li;
        int rc = SIGP_CC_ORDER_CODE_ACCEPTED;
+       struct kvm_vcpu *dst_vcpu = NULL;
 
        if (cpu_addr >= KVM_MAX_VCPUS)
                return SIGP_CC_NOT_OPERATIONAL;
 
-       spin_lock(&fi->lock);
-       li = fi->local_int[cpu_addr];
-       if (li == NULL) {
-               rc = SIGP_CC_NOT_OPERATIONAL;
-               goto out;
-       }
-
+       dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+       if (!dst_vcpu)
+               return SIGP_CC_NOT_OPERATIONAL;
+       li = &dst_vcpu->arch.local_int;
        spin_lock_bh(&li->lock);
        if (li->action_bits & ACTION_STOP_ON_STOP)
                rc = SIGP_CC_BUSY;
        spin_unlock_bh(&li->lock);
-out:
-       spin_unlock(&fi->lock);
+
        return rc;
 }
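
The SIGP helpers above all drop the fi->local_int[cpu_addr] lookup under the floating-interrupt lock and instead resolve the destination vcpu with kvm_get_vcpu(), taking its arch.local_int directly, so fi->lock is no longer needed for the lookup at all. A minimal sketch of that lookup pattern, with the kernel structures reduced to stand-in types (illustration only, not the kernel code):

    #include <stdio.h>

    #define MAX_VCPUS          64
    #define CC_NOT_OPERATIONAL 3

    struct local_int { unsigned int cpuflags; };
    struct vcpu { struct local_int local_int; };
    struct vm { struct vcpu *vcpus[MAX_VCPUS]; };

    /* stand-in for kvm_get_vcpu(): NULL when the address is unused */
    static struct vcpu *get_vcpu(struct vm *vm, unsigned int addr)
    {
        return addr < MAX_VCPUS ? vm->vcpus[addr] : NULL;
    }

    static int lookup_local_int(struct vm *vm, unsigned int addr,
                                struct local_int **li)
    {
        struct vcpu *dst = get_vcpu(vm, addr);

        if (!dst)
            return CC_NOT_OPERATIONAL;   /* what SIGP reports back */
        *li = &dst->local_int;
        return 0;
    }

    int main(void)
    {
        struct vcpu v = { .local_int = { .cpuflags = 1 } };
        struct vm vm = { .vcpus = { [2] = &v } };
        struct local_int *li;

        printf("cpu 2: %d\n", lookup_local_int(&vm, 2, &li));
        printf("cpu 5: %d\n", lookup_local_int(&vm, 5, &li));
        return 0;
    }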
 
index 3db76b2..e8e7213 100644 (file)
        TP_printk("%02d[%016lx-%016lx]: " p_str, __entry->id,           \
                  __entry->pswmask, __entry->pswaddr, p_args)
 
+TRACE_EVENT(kvm_s390_major_guest_pfault,
+           TP_PROTO(VCPU_PROTO_COMMON),
+           TP_ARGS(VCPU_ARGS_COMMON),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   ),
+           VCPU_TP_PRINTK("%s", "major fault, maybe applicable for pfault")
+       );
+
+TRACE_EVENT(kvm_s390_pfault_init,
+           TP_PROTO(VCPU_PROTO_COMMON, long pfault_token),
+           TP_ARGS(VCPU_ARGS_COMMON, pfault_token),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(long, pfault_token)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->pfault_token = pfault_token;
+                   ),
+           VCPU_TP_PRINTK("init pfault token %ld", __entry->pfault_token)
+       );
+
+TRACE_EVENT(kvm_s390_pfault_done,
+           TP_PROTO(VCPU_PROTO_COMMON, long pfault_token),
+           TP_ARGS(VCPU_ARGS_COMMON, pfault_token),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(long, pfault_token)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->pfault_token = pfault_token;
+                   ),
+           VCPU_TP_PRINTK("done pfault token %ld", __entry->pfault_token)
+       );
+
 /*
  * Tracepoints for SIE entry and exit.
  */
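
The three TRACE_EVENT() definitions above generate trace_kvm_s390_major_guest_pfault(), trace_kvm_s390_pfault_init() and trace_kvm_s390_pfault_done() hooks that the pfault code is expected to call with the vcpu and, for init/done, the pfault token. A rough model of what such a generated hook behaves like at the call site (plain C stand-in, not the real ftrace machinery):

    #include <stdio.h>
    #include <stdbool.h>

    /* toggled by the tracing core when the event is enabled */
    static bool kvm_s390_pfault_init_enabled;

    /* stand-in for the hook TRACE_EVENT() generates */
    static void trace_kvm_s390_pfault_init(int vcpu_id, long token)
    {
        if (kvm_s390_pfault_init_enabled)
            printf("%02d: init pfault token %ld\n", vcpu_id, token);
    }

    int main(void)
    {
        trace_kvm_s390_pfault_init(0, 0x1234);  /* disabled: no output */
        kvm_s390_pfault_init_enabled = true;
        trace_kvm_s390_pfault_init(0, 0x1234);  /* enabled: one line */
        return 0;
    }
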
index d95265b..88cef50 100644 (file)
@@ -50,6 +50,7 @@
 #define VM_FAULT_BADMAP                0x020000
 #define VM_FAULT_BADACCESS     0x040000
 #define VM_FAULT_SIGNAL                0x080000
+#define VM_FAULT_PFAULT                0x100000
 
 static unsigned long store_indication __read_mostly;
 
@@ -227,6 +228,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int fault)
                        return;
                }
        case VM_FAULT_BADCONTEXT:
+       case VM_FAULT_PFAULT:
                do_no_context(regs);
                break;
        case VM_FAULT_SIGNAL:
@@ -264,6 +266,9 @@ static noinline void do_fault_error(struct pt_regs *regs, int fault)
  */
 static inline int do_exception(struct pt_regs *regs, int access)
 {
+#ifdef CONFIG_PGSTE
+       struct gmap *gmap;
+#endif
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
@@ -304,9 +309,10 @@ static inline int do_exception(struct pt_regs *regs, int access)
        down_read(&mm->mmap_sem);
 
 #ifdef CONFIG_PGSTE
-       if ((current->flags & PF_VCPU) && S390_lowcore.gmap) {
-               address = __gmap_fault(address,
-                                    (struct gmap *) S390_lowcore.gmap);
+       gmap = (struct gmap *)
+               ((current->flags & PF_VCPU) ? S390_lowcore.gmap : 0);
+       if (gmap) {
+               address = __gmap_fault(address, gmap);
                if (address == -EFAULT) {
                        fault = VM_FAULT_BADMAP;
                        goto out_up;
@@ -315,6 +321,8 @@ static inline int do_exception(struct pt_regs *regs, int access)
                        fault = VM_FAULT_OOM;
                        goto out_up;
                }
+               if (gmap->pfault_enabled)
+                       flags |= FAULT_FLAG_RETRY_NOWAIT;
        }
 #endif
 
@@ -371,9 +379,19 @@ retry:
                                      regs, address);
                }
                if (fault & VM_FAULT_RETRY) {
+#ifdef CONFIG_PGSTE
+                       if (gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) {
+                               /* FAULT_FLAG_RETRY_NOWAIT has been set,
+                                * mmap_sem has not been released */
+                               current->thread.gmap_pfault = 1;
+                               fault = VM_FAULT_PFAULT;
+                               goto out_up;
+                       }
+#endif
                        /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
                         * of starvation. */
-                       flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       flags &= ~(FAULT_FLAG_ALLOW_RETRY |
+                                  FAULT_FLAG_RETRY_NOWAIT);
                        flags |= FAULT_FLAG_TRIED;
                        down_read(&mm->mmap_sem);
                        goto retry;
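
With pfault enabled on the guest address space the handler adds FAULT_FLAG_RETRY_NOWAIT, so a fault that would have to sleep comes back as VM_FAULT_RETRY with mmap_sem still held; the handler then records gmap_pfault and returns the new VM_FAULT_PFAULT code so that KVM can turn the fault into an asynchronous page fault instead of blocking the vcpu. A simplified model of that retry decision (VM_FAULT_PFAULT is taken from the hunk above, the other flag values are placeholders):

    #include <stdio.h>
    #include <stdbool.h>

    #define FAULT_FLAG_ALLOW_RETRY  0x01     /* placeholder value */
    #define FAULT_FLAG_RETRY_NOWAIT 0x02     /* placeholder value */
    #define VM_FAULT_RETRY          0x04     /* placeholder value */
    #define VM_FAULT_PFAULT         0x100000

    /* sketch of the VM_FAULT_RETRY handling when pfault is enabled */
    static int handle_retry(bool pfault_enabled, int fault, int *flags)
    {
        if (!(fault & VM_FAULT_RETRY))
            return fault;
        if (pfault_enabled && (*flags & FAULT_FLAG_RETRY_NOWAIT))
            return VM_FAULT_PFAULT;          /* hand off to async pfault */
        /* otherwise retry synchronously, but only once */
        *flags &= ~(FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);
        return VM_FAULT_RETRY;
    }

    int main(void)
    {
        int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;

        printf("%#x\n", handle_retry(true, VM_FAULT_RETRY, &flags));
        printf("%#x\n", handle_retry(false, VM_FAULT_RETRY, &flags));
        return 0;
    }
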
index fdf83af..fcaf9c9 100644 (file)
@@ -337,6 +337,11 @@ struct kvm_pmu {
        u64 reprogram_pmi;
 };
 
+enum {
+       KVM_DEBUGREG_BP_ENABLED = 1,
+       KVM_DEBUGREG_WONT_EXIT = 2,
+};
+
 struct kvm_vcpu_arch {
        /*
         * rip and regs accesses must go through
@@ -444,7 +449,6 @@ struct kvm_vcpu_arch {
        } st;
 
        u64 last_guest_tsc;
-       u64 last_kernel_ns;
        u64 last_host_tsc;
        u64 tsc_offset_adjustment;
        u64 this_tsc_nsec;
@@ -464,7 +468,7 @@ struct kvm_vcpu_arch {
        struct mtrr_state_type mtrr_state;
        u32 pat;
 
-       int switch_db_regs;
+       unsigned switch_db_regs;
        unsigned long db[KVM_NR_DB_REGS];
        unsigned long dr6;
        unsigned long dr7;
@@ -599,6 +603,8 @@ struct kvm_arch {
        bool use_master_clock;
        u64 master_kernel_ns;
        cycle_t master_cycle_now;
+       struct delayed_work kvmclock_update_work;
+       struct delayed_work kvmclock_sync_work;
 
        struct kvm_xen_hvm_config xen_hvm_config;
 
@@ -702,6 +708,7 @@ struct kvm_x86_ops {
        void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
        u64 (*get_dr6)(struct kvm_vcpu *vcpu);
        void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
+       void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
        void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
        void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
        unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
@@ -728,8 +735,8 @@ struct kvm_x86_ops {
        int (*nmi_allowed)(struct kvm_vcpu *vcpu);
        bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
        void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
-       int (*enable_nmi_window)(struct kvm_vcpu *vcpu);
-       int (*enable_irq_window)(struct kvm_vcpu *vcpu);
+       void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
+       void (*enable_irq_window)(struct kvm_vcpu *vcpu);
        void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
        int (*vm_has_apicv)(struct kvm *kvm);
        void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
@@ -765,6 +772,9 @@ struct kvm_x86_ops {
                               struct x86_instruction_info *info,
                               enum x86_intercept_stage stage);
        void (*handle_external_intr)(struct kvm_vcpu *vcpu);
+       bool (*mpx_supported)(void);
+
+       int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
 };
 
 struct kvm_arch_async_pf {
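
switch_db_regs turns from a plain int into a flags word so that KVM_DEBUGREG_BP_ENABLED (guest DR7 has breakpoints armed) and KVM_DEBUGREG_WONT_EXIT (MOV DR exits are currently disabled, so the hardware copies may be stale) can be tracked independently; reading the later svm.c/vmx.c hunks, any non-zero value still means the debug registers get reloaded around guest entry. A small illustration of the flag handling (the semantics in the comments are an interpretation of those hunks, not quoted from them):

    #include <stdio.h>

    enum {
        KVM_DEBUGREG_BP_ENABLED = 1,
        KVM_DEBUGREG_WONT_EXIT  = 2,
    };

    int main(void)
    {
        unsigned int switch_db_regs = 0;

        switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;  /* DR7 arms a breakpoint */
        switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;   /* MOV DR exits turned off */
        switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;  /* after syncing dirty regs */

        printf("reload debug registers: %s\n",
               switch_db_regs ? "yes" : "no");
        return 0;
    }
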
index 2067264..7004d21 100644 (file)
@@ -85,6 +85,7 @@
 #define VM_EXIT_SAVE_IA32_EFER                  0x00100000
 #define VM_EXIT_LOAD_IA32_EFER                  0x00200000
 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER       0x00400000
+#define VM_EXIT_CLEAR_BNDCFGS                   0x00800000
 
 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR      0x00036dff
 
@@ -95,6 +96,7 @@
 #define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL     0x00002000
 #define VM_ENTRY_LOAD_IA32_PAT                 0x00004000
 #define VM_ENTRY_LOAD_IA32_EFER                 0x00008000
+#define VM_ENTRY_LOAD_BNDCFGS                   0x00010000
 
 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR     0x000011ff
 
@@ -174,6 +176,8 @@ enum vmcs_field {
        GUEST_PDPTR2_HIGH               = 0x0000280f,
        GUEST_PDPTR3                    = 0x00002810,
        GUEST_PDPTR3_HIGH               = 0x00002811,
+       GUEST_BNDCFGS                   = 0x00002812,
+       GUEST_BNDCFGS_HIGH              = 0x00002813,
        HOST_IA32_PAT                   = 0x00002c00,
        HOST_IA32_PAT_HIGH              = 0x00002c01,
        HOST_IA32_EFER                  = 0x00002c02,
index 6c1d741..d949ef2 100644 (file)
@@ -16,6 +16,8 @@
 #define XSTATE_Hi16_ZMM                0x80
 
 #define XSTATE_FPSSE   (XSTATE_FP | XSTATE_SSE)
+/* Bit 63 of XCR0 is reserved for future expansion */
+#define XSTATE_EXTEND_MASK     (~(XSTATE_FPSSE | (1ULL << 63)))
 
 #define FXSAVE_SIZE    512
 
index 4924f4b..c827ace 100644 (file)
 #define MSR_SMI_COUNT                  0x00000034
 #define MSR_IA32_FEATURE_CONTROL        0x0000003a
 #define MSR_IA32_TSC_ADJUST             0x0000003b
+#define MSR_IA32_BNDCFGS               0x00000d90
 
 #define FEATURE_CONTROL_LOCKED                         (1<<0)
 #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX       (1<<1)
index 713f1b3..0331cb3 100644 (file)
@@ -417,7 +417,6 @@ void kvm_disable_steal_time(void)
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
-       WARN_ON(kvm_register_clock("primary cpu clock"));
        kvm_guest_cpu_init();
        native_smp_prepare_boot_cpu();
        kvm_spinlock_init();
index e604109..d9156ce 100644 (file)
@@ -242,7 +242,7 @@ void __init kvmclock_init(void)
        hv_clock = __va(mem);
        memset(hv_clock, 0, size);
 
-       if (kvm_register_clock("boot clock")) {
+       if (kvm_register_clock("primary cpu clock")) {
                hv_clock = NULL;
                memblock_free(mem, size);
                return;
index e5503d8..bea6067 100644 (file)
@@ -28,7 +28,7 @@ static u32 xstate_required_size(u64 xstate_bv)
        int feature_bit = 0;
        u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
 
-       xstate_bv &= ~XSTATE_FPSSE;
+       xstate_bv &= XSTATE_EXTEND_MASK;
        while (xstate_bv) {
                if (xstate_bv & 0x1) {
                        u32 eax, ebx, ecx, edx;
@@ -43,6 +43,16 @@ static u32 xstate_required_size(u64 xstate_bv)
        return ret;
 }
 
+u64 kvm_supported_xcr0(void)
+{
+       u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0;
+
+       if (!kvm_x86_ops->mpx_supported())
+               xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR);
+
+       return xcr0;
+}
+
 void kvm_update_cpuid(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpuid_entry2 *best;
@@ -73,9 +83,9 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu)
        } else {
                vcpu->arch.guest_supported_xcr0 =
                        (best->eax | ((u64)best->edx << 32)) &
-                       host_xcr0 & KVM_SUPPORTED_XCR0;
-               vcpu->arch.guest_xstate_size =
-                       xstate_required_size(vcpu->arch.guest_supported_xcr0);
+                       kvm_supported_xcr0();
+               vcpu->arch.guest_xstate_size = best->ebx =
+                       xstate_required_size(vcpu->arch.xcr0);
        }
 
        kvm_pmu_cpuid_update(vcpu);
@@ -210,13 +220,6 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        entry->flags = 0;
 }
 
-static bool supported_xcr0_bit(unsigned bit)
-{
-       u64 mask = ((u64)1 << bit);
-
-       return mask & KVM_SUPPORTED_XCR0 & host_xcr0;
-}
-
 #define F(x) bit(X86_FEATURE_##x)
 
 static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
@@ -256,6 +259,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 #endif
        unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
        unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
+       unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0;
 
        /* cpuid 1.edx */
        const u32 kvm_supported_word0_x86_features =
@@ -303,7 +307,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        /* cpuid 7.0.ebx */
        const u32 kvm_supported_word9_x86_features =
                F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
-               F(BMI2) | F(ERMS) | f_invpcid | F(RTM);
+               F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
+               F(ADX);
 
        /* all calls to cpuid_count() should be made on the same cpu */
        get_cpu();
@@ -436,16 +441,18 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        }
        case 0xd: {
                int idx, i;
+               u64 supported = kvm_supported_xcr0();
 
-               entry->eax &= host_xcr0 & KVM_SUPPORTED_XCR0;
-               entry->edx &= (host_xcr0 & KVM_SUPPORTED_XCR0) >> 32;
+               entry->eax &= supported;
+               entry->edx &= supported >> 32;
                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                for (idx = 1, i = 1; idx < 64; ++idx) {
+                       u64 mask = ((u64)1 << idx);
                        if (*nent >= maxnent)
                                goto out;
 
                        do_cpuid_1_ent(&entry[i], function, idx);
-                       if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
+                       if (entry[i].eax == 0 || !(supported & mask))
                                continue;
                        entry[i].flags |=
                               KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
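
kvm_supported_xcr0() becomes the single place where host XCR0, KVM's supported mask and the backend's mpx_supported() answer are combined, and both the guest_supported_xcr0 computation and the CPUID 0xD sub-leaf filtering now go through it instead of open-coding host_xcr0 & KVM_SUPPORTED_XCR0. A hedged sketch of the filtering (the XSTATE_* bit positions and the supported mask here are placeholders, not the kernel's definitions):

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    #define XSTATE_FP      (1ULL << 0)
    #define XSTATE_SSE     (1ULL << 1)
    #define XSTATE_YMM     (1ULL << 2)
    #define XSTATE_BNDREGS (1ULL << 3)
    #define XSTATE_BNDCSR  (1ULL << 4)

    #define SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM | \
                            XSTATE_BNDREGS | XSTATE_BNDCSR)

    static uint64_t supported_xcr0(uint64_t host_xcr0, bool mpx)
    {
        uint64_t xcr0 = SUPPORTED_XCR0 & host_xcr0;

        if (!mpx)
            xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR);
        return xcr0;
    }

    int main(void)
    {
        uint64_t host = SUPPORTED_XCR0;   /* host exposes everything */

        /* without MPX the BND bits disappear from the guest view */
        printf("%#llx\n", (unsigned long long)supported_xcr0(host, false));
        printf("%#llx\n", (unsigned long long)supported_xcr0(host, true));
        return 0;
    }
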
index 07ffca0..205b17e 100644 (file)
@@ -3668,6 +3668,10 @@ static const struct gprefix pfx_vmovntpx = {
        I(0, em_mov), N, N, N,
 };
 
+static const struct gprefix pfx_0f_28_0f_29 = {
+       I(Aligned, em_mov), I(Aligned, em_mov), N, N,
+};
+
 static const struct escape escape_d9 = { {
        N, N, N, N, N, N, N, I(DstMem, em_fnstcw),
 }, {
@@ -3870,7 +3874,9 @@ static const struct opcode twobyte_table[256] = {
        IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
        IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
        N, N, N, N,
-       N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx),
+       GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29),
+       GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29),
+       N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx),
        N, N, N, N,
        /* 0x30 - 0x3F */
        II(ImplicitOps | Priv, em_wrmsr, wrmsr),
index 9b53135..f5704d9 100644 (file)
@@ -3329,7 +3329,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
        arch.direct_map = vcpu->arch.mmu.direct_map;
        arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
 
-       return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
+       return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch);
 }
 
 static bool can_do_async_pf(struct kvm_vcpu *vcpu)
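
kvm_setup_async_pf() is now handed the host virtual address for the faulting gfn rather than the gfn itself, presumably so the common async-pf worker can operate on host addresses (which also suits the s390 pfault work elsewhere in this merge). For reference, gfn_to_hva() amounts to a memslot lookup plus a page offset; a simplified version with a mock memslot (the real helper returns a dedicated error cookie, not 0, for unmapped gfns):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12

    struct memslot {
        uint64_t base_gfn;         /* first guest frame in the slot */
        uint64_t npages;
        uint64_t userspace_addr;   /* hva backing base_gfn */
    };

    static uint64_t gfn_to_hva(const struct memslot *slot, uint64_t gfn)
    {
        if (gfn < slot->base_gfn || gfn >= slot->base_gfn + slot->npages)
            return 0;              /* out of slot */
        return slot->userspace_addr +
               ((gfn - slot->base_gfn) << PAGE_SHIFT);
    }

    int main(void)
    {
        struct memslot slot = { 0x100, 0x200, 0x7f0000000000ULL };

        printf("%#llx\n", (unsigned long long)gfn_to_hva(&slot, 0x180));
        return 0;
    }
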
index cba218a..b1e6c1b 100644 (file)
@@ -913,7 +913,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
  *   and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
  *   used by guest then tlbs are not flushed, so guest is allowed to access the
  *   freed pages.
- *   And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
+ *   We set tlbs_dirty to let the notifier know this change and delay the flush
+ *   until such a case actually happens.
  */
 static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
@@ -942,7 +943,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                        return -EINVAL;
 
                if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
-                       vcpu->kvm->tlbs_dirty++;
+                       vcpu->kvm->tlbs_dirty = true;
                        continue;
                }
 
@@ -957,7 +958,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
                if (gfn != sp->gfns[i]) {
                        drop_spte(vcpu->kvm, &sp->spt[i]);
-                       vcpu->kvm->tlbs_dirty++;
+                       vcpu->kvm->tlbs_dirty = true;
                        continue;
                }
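
The sync_page path no longer counts dropped sptes; it simply marks kvm->tlbs_dirty and relies on the mmu-notifier invalidate path to flush before the stale translations can be reused, as the rewritten comment above says. A minimal model of that deferred-flush handshake (single-threaded sketch; the real code has to care about ordering between the two sides):

    #include <stdio.h>
    #include <stdbool.h>

    static bool tlbs_dirty;
    static int remote_flushes;

    static void sync_page_drop_spte(void)
    {
        tlbs_dirty = true;          /* defer the flush */
    }

    static void invalidate_range_start(void)
    {
        if (tlbs_dirty) {           /* flush only if something was dropped */
            remote_flushes++;
            tlbs_dirty = false;
        }
    }

    int main(void)
    {
        sync_page_drop_spte();
        sync_page_drop_spte();
        invalidate_range_start();
        printf("remote flushes: %d\n", remote_flushes);   /* 1, not 2 */
        return 0;
    }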
 
index 2de1bc0..7f4f9c2 100644 (file)
@@ -34,6 +34,7 @@
 #include <asm/perf_event.h>
 #include <asm/tlbflush.h>
 #include <asm/desc.h>
+#include <asm/debugreg.h>
 #include <asm/kvm_para.h>
 
 #include <asm/virtext.h>
@@ -303,20 +304,35 @@ static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
        return vmcb->control.intercept_cr & (1U << bit);
 }
 
-static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
+static inline void set_dr_intercepts(struct vcpu_svm *svm)
 {
        struct vmcb *vmcb = get_host_vmcb(svm);
 
-       vmcb->control.intercept_dr |= (1U << bit);
+       vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
+               | (1 << INTERCEPT_DR1_READ)
+               | (1 << INTERCEPT_DR2_READ)
+               | (1 << INTERCEPT_DR3_READ)
+               | (1 << INTERCEPT_DR4_READ)
+               | (1 << INTERCEPT_DR5_READ)
+               | (1 << INTERCEPT_DR6_READ)
+               | (1 << INTERCEPT_DR7_READ)
+               | (1 << INTERCEPT_DR0_WRITE)
+               | (1 << INTERCEPT_DR1_WRITE)
+               | (1 << INTERCEPT_DR2_WRITE)
+               | (1 << INTERCEPT_DR3_WRITE)
+               | (1 << INTERCEPT_DR4_WRITE)
+               | (1 << INTERCEPT_DR5_WRITE)
+               | (1 << INTERCEPT_DR6_WRITE)
+               | (1 << INTERCEPT_DR7_WRITE);
 
        recalc_intercepts(svm);
 }
 
-static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
+static inline void clr_dr_intercepts(struct vcpu_svm *svm)
 {
        struct vmcb *vmcb = get_host_vmcb(svm);
 
-       vmcb->control.intercept_dr &= ~(1U << bit);
+       vmcb->control.intercept_dr = 0;
 
        recalc_intercepts(svm);
 }
@@ -1080,23 +1096,7 @@ static void init_vmcb(struct vcpu_svm *svm)
        set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
        set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 
-       set_dr_intercept(svm, INTERCEPT_DR0_READ);
-       set_dr_intercept(svm, INTERCEPT_DR1_READ);
-       set_dr_intercept(svm, INTERCEPT_DR2_READ);
-       set_dr_intercept(svm, INTERCEPT_DR3_READ);
-       set_dr_intercept(svm, INTERCEPT_DR4_READ);
-       set_dr_intercept(svm, INTERCEPT_DR5_READ);
-       set_dr_intercept(svm, INTERCEPT_DR6_READ);
-       set_dr_intercept(svm, INTERCEPT_DR7_READ);
-
-       set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
-       set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
-       set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
-       set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
-       set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
-       set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
-       set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
-       set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
+       set_dr_intercepts(svm);
 
        set_exception_intercept(svm, PF_VECTOR);
        set_exception_intercept(svm, UD_VECTOR);
@@ -1684,6 +1684,21 @@ static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
        mark_dirty(svm->vmcb, VMCB_DR);
 }
 
+static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       get_debugreg(vcpu->arch.db[0], 0);
+       get_debugreg(vcpu->arch.db[1], 1);
+       get_debugreg(vcpu->arch.db[2], 2);
+       get_debugreg(vcpu->arch.db[3], 3);
+       vcpu->arch.dr6 = svm_get_dr6(vcpu);
+       vcpu->arch.dr7 = svm->vmcb->save.dr7;
+
+       vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+       set_dr_intercepts(svm);
+}
+
 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -2842,6 +2857,7 @@ static int iret_interception(struct vcpu_svm *svm)
        clr_intercept(svm, INTERCEPT_IRET);
        svm->vcpu.arch.hflags |= HF_IRET_MASK;
        svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
        return 1;
 }
 
@@ -2974,6 +2990,17 @@ static int dr_interception(struct vcpu_svm *svm)
        unsigned long val;
        int err;
 
+       if (svm->vcpu.guest_debug == 0) {
+               /*
+                * No more DR vmexits; force a reload of the debug registers
+                * and reenter on this instruction.  The next vmexit will
+                * retrieve the full state of the debug registers.
+                */
+               clr_dr_intercepts(svm);
+               svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+               return 1;
+       }
+
        if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
                return emulate_on_interception(svm);
 
@@ -3649,7 +3676,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
        return ret;
 }
 
-static int enable_irq_window(struct kvm_vcpu *vcpu)
+static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -3663,16 +3690,15 @@ static int enable_irq_window(struct kvm_vcpu *vcpu)
                svm_set_vintr(svm);
                svm_inject_irq(svm, 0x0);
        }
-       return 0;
 }
 
-static int enable_nmi_window(struct kvm_vcpu *vcpu)
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
        if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
            == HF_NMI_MASK)
-               return 0; /* IRET will cause a vm exit */
+               return; /* IRET will cause a vm exit */
 
        /*
         * Something prevents NMI from been injected. Single step over possible
@@ -3681,7 +3707,6 @@ static int enable_nmi_window(struct kvm_vcpu *vcpu)
        svm->nmi_singlestep = true;
        svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
        update_db_bp_intercept(vcpu);
-       return 0;
 }
 
 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -4064,6 +4089,11 @@ static bool svm_invpcid_supported(void)
        return false;
 }
 
+static bool svm_mpx_supported(void)
+{
+       return false;
+}
+
 static bool svm_has_wbinvd_exit(void)
 {
        return true;
@@ -4302,6 +4332,7 @@ static struct kvm_x86_ops svm_x86_ops = {
        .get_dr6 = svm_get_dr6,
        .set_dr6 = svm_set_dr6,
        .set_dr7 = svm_set_dr7,
+       .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
        .cache_reg = svm_cache_reg,
        .get_rflags = svm_get_rflags,
        .set_rflags = svm_set_rflags,
@@ -4345,6 +4376,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 
        .rdtscp_supported = svm_rdtscp_supported,
        .invpcid_supported = svm_invpcid_supported,
+       .mpx_supported = svm_mpx_supported,
 
        .set_supported_cpuid = svm_set_supported_cpuid,
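
The SVM changes batch the sixteen DR read/write intercepts into set_dr_intercepts()/clr_dr_intercepts() and add the same debug-register optimization as the VMX code further down: when the guest touches a debug register while no host-side guest_debug is active, the intercepts are dropped and KVM_DEBUGREG_WONT_EXIT is set, and the next vmexit runs svm_sync_dirty_debug_regs() to read the hardware values back and re-arm the intercepts. A state-machine sketch of that round trip (mocked, not the SVM code):

    #include <stdio.h>
    #include <stdbool.h>

    #define KVM_DEBUGREG_WONT_EXIT 2

    static bool dr_intercepts_on = true;
    static unsigned int switch_db_regs;

    static void dr_interception(void)          /* guest executed MOV DR */
    {
        dr_intercepts_on = false;              /* no more DR vmexits */
        switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
    }

    static void sync_dirty_debug_regs(void)    /* on the next vmexit */
    {
        /* DR0-DR3, DR6, DR7 would be read back from hardware here */
        switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
        dr_intercepts_on = true;               /* future accesses exit again */
    }

    int main(void)
    {
        dr_interception();
        printf("intercepts %d, flags %u\n", dr_intercepts_on, switch_db_regs);
        sync_dirty_debug_regs();
        printf("intercepts %d, flags %u\n", dr_intercepts_on, switch_db_regs);
        return 0;
    }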
 
index 3927528..1320e0f 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
 #include <linux/tboot.h>
+#include <linux/hrtimer.h>
 #include "kvm_cache_regs.h"
 #include "x86.h"
 
@@ -42,6 +43,7 @@
 #include <asm/i387.h>
 #include <asm/xcr.h>
 #include <asm/perf_event.h>
+#include <asm/debugreg.h>
 #include <asm/kexec.h>
 
 #include "trace.h"
@@ -110,6 +112,8 @@ module_param(nested, bool, S_IRUGO);
 
 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
 
+#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
+
 /*
  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
  * ple_gap:    upper bound on the amount of time between two successive
@@ -202,6 +206,7 @@ struct __packed vmcs12 {
        u64 guest_pdptr1;
        u64 guest_pdptr2;
        u64 guest_pdptr3;
+       u64 guest_bndcfgs;
        u64 host_ia32_pat;
        u64 host_ia32_efer;
        u64 host_ia32_perf_global_ctrl;
@@ -374,6 +379,9 @@ struct nested_vmx {
         */
        struct page *apic_access_page;
        u64 msr_ia32_feature_control;
+
+       struct hrtimer preemption_timer;
+       bool preemption_timer_expired;
 };
 
 #define POSTED_INTR_ON  0
@@ -441,6 +449,7 @@ struct vcpu_vmx {
 #endif
                int           gs_ldt_reload_needed;
                int           fs_reload_needed;
+               u64           msr_host_bndcfgs;
        } host_state;
        struct {
                int vm86_active;
@@ -533,6 +542,7 @@ static const unsigned long shadow_read_write_fields[] = {
        GUEST_CS_LIMIT,
        GUEST_CS_BASE,
        GUEST_ES_BASE,
+       GUEST_BNDCFGS,
        CR0_GUEST_HOST_MASK,
        CR0_READ_SHADOW,
        CR4_READ_SHADOW,
@@ -588,6 +598,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
        FIELD64(GUEST_PDPTR1, guest_pdptr1),
        FIELD64(GUEST_PDPTR2, guest_pdptr2),
        FIELD64(GUEST_PDPTR3, guest_pdptr3),
+       FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
        FIELD64(HOST_IA32_PAT, host_ia32_pat),
        FIELD64(HOST_IA32_EFER, host_ia32_efer),
        FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
@@ -718,6 +729,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
+static bool vmx_mpx_supported(void);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
@@ -728,6 +740,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);
 static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
+static bool vmx_mpx_supported(void);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -1047,6 +1060,12 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
        return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
+{
+       return vmcs12->pin_based_vm_exec_control &
+               PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
 static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
 {
        return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
@@ -1710,6 +1729,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
        if (is_long_mode(&vmx->vcpu))
                wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 #endif
+       if (boot_cpu_has(X86_FEATURE_MPX))
+               rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
        for (i = 0; i < vmx->save_nmsrs; ++i)
                kvm_set_shared_msr(vmx->guest_msrs[i].index,
                                   vmx->guest_msrs[i].data,
@@ -1747,6 +1768,8 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 #ifdef CONFIG_X86_64
        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 #endif
+       if (vmx->host_state.msr_host_bndcfgs)
+               wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
        /*
         * If the FPU is not active (through the host task or
         * the guest vcpu), then restore the cr0.TS bit.
@@ -2248,9 +2271,9 @@ static __init void nested_vmx_setup_ctls_msrs(void)
         */
        nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
        nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
-               PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS |
+               PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
+       nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
                PIN_BASED_VMX_PREEMPTION_TIMER;
-       nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 
        /*
         * Exit controls
@@ -2265,15 +2288,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #ifdef CONFIG_X86_64
                VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
-               VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+               VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+       nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+               VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
                VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
-       if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) ||
-           !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
-               nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
-               nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
-       }
-       nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
-               VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER);
+       if (vmx_mpx_supported())
+               nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
 
        /* entry controls */
        rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2287,6 +2307,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
                VM_ENTRY_LOAD_IA32_PAT;
        nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
                                       VM_ENTRY_LOAD_IA32_EFER);
+       if (vmx_mpx_supported())
+               nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
 
        /* cpu-based controls */
        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2342,9 +2364,9 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 
        /* miscellaneous data */
        rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
-       nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
-               VMX_MISC_SAVE_EFER_LMA;
-       nested_vmx_misc_low |= VMX_MISC_ACTIVITY_HLT;
+       nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
+       nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
+               VMX_MISC_ACTIVITY_HLT;
        nested_vmx_misc_high = 0;
 }
 
@@ -2479,6 +2501,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
        case MSR_IA32_SYSENTER_ESP:
                data = vmcs_readl(GUEST_SYSENTER_ESP);
                break;
+       case MSR_IA32_BNDCFGS:
+               if (!vmx_mpx_supported())
+                       return 1;
+               data = vmcs_read64(GUEST_BNDCFGS);
+               break;
        case MSR_IA32_FEATURE_CONTROL:
                if (!nested_vmx_allowed(vcpu))
                        return 1;
@@ -2547,6 +2574,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_SYSENTER_ESP:
                vmcs_writel(GUEST_SYSENTER_ESP, data);
                break;
+       case MSR_IA32_BNDCFGS:
+               if (!vmx_mpx_supported())
+                       return 1;
+               vmcs_write64(GUEST_BNDCFGS, data);
+               break;
        case MSR_IA32_TSC:
                kvm_write_tsc(vcpu, msr_info);
                break;
@@ -2832,12 +2864,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                      vmx_capability.ept, vmx_capability.vpid);
        }
 
-       min = 0;
+       min = VM_EXIT_SAVE_DEBUG_CONTROLS;
 #ifdef CONFIG_X86_64
        min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #endif
        opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
-               VM_EXIT_ACK_INTR_ON_EXIT;
+               VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
                                &_vmexit_control) < 0)
                return -EIO;
@@ -2853,8 +2885,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
                _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
 
-       min = 0;
-       opt = VM_ENTRY_LOAD_IA32_PAT;
+       min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
+       opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
                                &_vmentry_control) < 0)
                return -EIO;
@@ -4223,6 +4255,10 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 {
        u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
+
+       if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
+               exec_control &= ~CPU_BASED_MOV_DR_EXITING;
+
        if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
                exec_control &= ~CPU_BASED_TPR_SHADOW;
 #ifdef CONFIG_X86_64
@@ -4496,39 +4532,28 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
                PIN_BASED_NMI_EXITING;
 }
 
-static int enable_irq_window(struct kvm_vcpu *vcpu)
+static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
        u32 cpu_based_vm_exec_control;
 
-       if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
-               /*
-                * We get here if vmx_interrupt_allowed() said we can't
-                * inject to L1 now because L2 must run. The caller will have
-                * to make L2 exit right after entry, so we can inject to L1
-                * more promptly.
-                */
-               return -EBUSY;
-
        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-       return 0;
 }
 
-static int enable_nmi_window(struct kvm_vcpu *vcpu)
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
 {
        u32 cpu_based_vm_exec_control;
 
-       if (!cpu_has_virtual_nmis())
-               return enable_irq_window(vcpu);
-
-       if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI)
-               return enable_irq_window(vcpu);
+       if (!cpu_has_virtual_nmis() ||
+           vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+               enable_irq_window(vcpu);
+               return;
+       }
 
        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-       return 0;
 }
 
 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
@@ -4620,22 +4645,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(vcpu)) {
-               if (to_vmx(vcpu)->nested.nested_run_pending)
-                       return 0;
-               if (nested_exit_on_nmi(vcpu)) {
-                       nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
-                                         NMI_VECTOR | INTR_TYPE_NMI_INTR |
-                                         INTR_INFO_VALID_MASK, 0);
-                       /*
-                        * The NMI-triggered VM exit counts as injection:
-                        * clear this one and block further NMIs.
-                        */
-                       vcpu->arch.nmi_pending = 0;
-                       vmx_set_nmi_mask(vcpu, true);
-                       return 0;
-               }
-       }
+       if (to_vmx(vcpu)->nested.nested_run_pending)
+               return 0;
 
        if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
                return 0;
@@ -4647,19 +4658,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(vcpu)) {
-               if (to_vmx(vcpu)->nested.nested_run_pending)
-                       return 0;
-               if (nested_exit_on_intr(vcpu)) {
-                       nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
-                                         0, 0);
-                       /*
-                        * fall through to normal code, but now in L1, not L2
-                        */
-               }
-       }
-
-       return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+       return (!to_vmx(vcpu)->nested.nested_run_pending &&
+               vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
                !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                        (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
 }
@@ -5102,6 +5102,22 @@ static int handle_dr(struct kvm_vcpu *vcpu)
                }
        }
 
+       if (vcpu->guest_debug == 0) {
+               u32 cpu_based_vm_exec_control;
+
+               cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+               cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
+               vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+
+               /*
+                * No more DR vmexits; force a reload of the debug registers
+                * and reenter on this instruction.  The next vmexit will
+                * retrieve the full state of the debug registers.
+                */
+               vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+               return 1;
+       }
+
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
        dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
        reg = DEBUG_REG_ACCESS_REG(exit_qualification);
@@ -5128,6 +5144,24 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
 {
 }
 
+static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+       u32 cpu_based_vm_exec_control;
+
+       get_debugreg(vcpu->arch.db[0], 0);
+       get_debugreg(vcpu->arch.db[1], 1);
+       get_debugreg(vcpu->arch.db[2], 2);
+       get_debugreg(vcpu->arch.db[3], 3);
+       get_debugreg(vcpu->arch.dr6, 6);
+       vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
+
+       vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+
+       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+       cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
+       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
 {
        vmcs_writel(GUEST_DR7, val);
@@ -5727,6 +5761,18 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
         */
 }
 
+static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
+{
+       struct vcpu_vmx *vmx =
+               container_of(timer, struct vcpu_vmx, nested.preemption_timer);
+
+       vmx->nested.preemption_timer_expired = true;
+       kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
+       kvm_vcpu_kick(&vmx->vcpu);
+
+       return HRTIMER_NORESTART;
+}
+
 /*
  * Emulate the VMXON instruction.
  * Currently, we just remember that VMX is active, and do not save or even
@@ -5791,6 +5837,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
        INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
        vmx->nested.vmcs02_num = 0;
 
+       hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
+                    HRTIMER_MODE_REL);
+       vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+
        vmx->nested.vmxon = true;
 
        skip_emulated_instruction(vcpu);
@@ -6767,9 +6817,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                 * table is L0's fault.
                 */
                return 0;
-       case EXIT_REASON_PREEMPTION_TIMER:
-               return vmcs12->pin_based_vm_exec_control &
-                       PIN_BASED_VMX_PREEMPTION_TIMER;
        case EXIT_REASON_WBINVD:
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
        case EXIT_REASON_XSETBV:
@@ -6785,27 +6832,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
        *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
 }
 
-static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
-{
-       u64 delta_tsc_l1;
-       u32 preempt_val_l1, preempt_val_l2, preempt_scale;
-
-       if (!(get_vmcs12(vcpu)->pin_based_vm_exec_control &
-                       PIN_BASED_VMX_PREEMPTION_TIMER))
-               return;
-       preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
-                       MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
-       preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
-       delta_tsc_l1 = vmx_read_l1_tsc(vcpu, native_read_tsc())
-               - vcpu->arch.last_guest_tsc;
-       preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
-       if (preempt_val_l2 <= preempt_val_l1)
-               preempt_val_l2 = 0;
-       else
-               preempt_val_l2 -= preempt_val_l1;
-       vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
-}
-
 /*
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
@@ -7052,6 +7078,12 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
                local_irq_enable();
 }
 
+static bool vmx_mpx_supported(void)
+{
+       return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
+               (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
+}
+
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
        u32 exit_intr_info;
@@ -7218,8 +7250,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        atomic_switch_perf_msrs(vmx);
        debugctlmsr = get_debugctlmsr();
 
-       if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending)
-               nested_adjust_preemption_timer(vcpu);
        vmx->__launched = vmx->loaded_vmcs->launched;
        asm(
                /* Store host registers */
@@ -7616,6 +7646,28 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
                kvm_inject_page_fault(vcpu, fault);
 }
 
+static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
+{
+       u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (vcpu->arch.virtual_tsc_khz == 0)
+               return;
+
+       /* Make sure short timeouts reliably trigger an immediate vmexit.
+        * hrtimer_start does not guarantee this. */
+       if (preemption_timeout <= 1) {
+               vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
+               return;
+       }
+
+       preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+       preemption_timeout *= 1000000;
+       do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
+       hrtimer_start(&vmx->nested.preemption_timer,
+                     ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
+}
+
 /*
  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -7629,7 +7681,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 exec_control;
-       u32 exit_control;
 
        vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
        vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -7687,13 +7738,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
        vmcs_write64(VMCS_LINK_POINTER, -1ull);
 
-       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
-               (vmcs_config.pin_based_exec_ctrl |
-                vmcs12->pin_based_vm_exec_control));
+       exec_control = vmcs12->pin_based_vm_exec_control;
+       exec_control |= vmcs_config.pin_based_exec_ctrl;
+       exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
 
-       if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
-               vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
-                            vmcs12->vmx_preemption_timer_value);
+       vmx->nested.preemption_timer_expired = false;
+       if (nested_cpu_has_preemption_timer(vmcs12))
+               vmx_start_preemption_timer(vcpu);
 
        /*
         * Whether page-faults are trapped is determined by a combination of
@@ -7721,7 +7773,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                enable_ept ? vmcs12->page_fault_error_code_match : 0);
 
        if (cpu_has_secondary_exec_ctrls()) {
-               u32 exec_control = vmx_secondary_exec_control(vmx);
+               exec_control = vmx_secondary_exec_control(vmx);
                if (!vmx->rdtscp_enabled)
                        exec_control &= ~SECONDARY_EXEC_RDTSCP;
                /* Take the following fields only from vmcs12 */
@@ -7808,10 +7860,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
         * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
         * bits are further modified by vmx_set_efer() below.
         */
-       exit_control = vmcs_config.vmexit_ctrl;
-       if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
-               exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
-       vm_exit_controls_init(vmx, exit_control);
+       vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
 
        /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
         * emulated by vmx_set_efer(), below.
@@ -7830,6 +7879,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
        set_cr4_guest_host_mask(vmx);
 
+       if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
+               vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+
        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
                vmcs_write64(TSC_OFFSET,
                        vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
@@ -8155,6 +8207,58 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
        }
 }
 
+static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
+           vmx->nested.preemption_timer_expired) {
+               if (vmx->nested.nested_run_pending)
+                       return -EBUSY;
+               nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
+               return 0;
+       }
+
+       if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
+               if (vmx->nested.nested_run_pending ||
+                   vcpu->arch.interrupt.pending)
+                       return -EBUSY;
+               nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+                                 NMI_VECTOR | INTR_TYPE_NMI_INTR |
+                                 INTR_INFO_VALID_MASK, 0);
+               /*
+                * The NMI-triggered VM exit counts as injection:
+                * clear this one and block further NMIs.
+                */
+               vcpu->arch.nmi_pending = 0;
+               vmx_set_nmi_mask(vcpu, true);
+               return 0;
+       }
+
+       if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
+           nested_exit_on_intr(vcpu)) {
+               if (vmx->nested.nested_run_pending)
+                       return -EBUSY;
+               nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+       }
+
+       return 0;
+}
+
+static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
+{
+       ktime_t remaining =
+               hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
+       u64 value;
+
+       if (ktime_to_ns(remaining) <= 0)
+               return 0;
+
+       value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
+       do_div(value, 1000000);
+       return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+}
+
 /*
  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
@@ -8225,10 +8329,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        else
                vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
 
-       if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
-           (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
-               vmcs12->vmx_preemption_timer_value =
-                       vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
+       if (nested_cpu_has_preemption_timer(vmcs12)) {
+               if (vmcs12->vm_exit_controls &
+                   VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
+                       vmcs12->vmx_preemption_timer_value =
+                               vmx_get_preemption_timer_value(vcpu);
+               hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
+       }
 
        /*
         * In some cases (usually, nested EPT), L2 is allowed to change its
@@ -8260,6 +8367,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
        vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
        vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+       if (vmx_mpx_supported())
+               vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
 
        /* update exit information fields: */
 
@@ -8369,6 +8478,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
        vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
        vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
 
+       /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
+       if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
+               vmcs_write64(GUEST_BNDCFGS, 0);
+
        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
                vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
                vcpu->arch.pat = vmcs12->host_ia32_pat;
@@ -8495,6 +8608,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
                nested_vmx_succeed(vcpu);
        if (enable_shadow_vmcs)
                vmx->nested.sync_shadow_vmcs = true;
+
+       /* in case we halted in L2 */
+       vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 }
 
 /*
@@ -8573,6 +8689,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .get_dr6 = vmx_get_dr6,
        .set_dr6 = vmx_set_dr6,
        .set_dr7 = vmx_set_dr7,
+       .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
        .cache_reg = vmx_cache_reg,
        .get_rflags = vmx_get_rflags,
        .set_rflags = vmx_set_rflags,
@@ -8634,6 +8751,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
        .check_intercept = vmx_check_intercept,
        .handle_external_intr = vmx_handle_external_intr,
+       .mpx_supported = vmx_mpx_supported,
+
+       .check_nested_events = vmx_check_nested_events,
 };
 
 static int __init vmx_init(void)
@@ -8721,6 +8841,8 @@ static int __init vmx_init(void)
        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+       vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
+
        memcpy(vmx_msr_bitmap_legacy_x2apic,
                        vmx_msr_bitmap_legacy, PAGE_SIZE);
        memcpy(vmx_msr_bitmap_longmode_x2apic,
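
For nested guests the VMX preemption timer is no longer programmed into the hardware VMCS; prepare_vmcs02() masks PIN_BASED_VMX_PREEMPTION_TIMER and starts an hrtimer instead, scaled at the advertised emulated rate of one timer tick per 2^5 TSC cycles, and prepare_vmcs12() converts the remaining hrtimer time back into a timer value when L1 asked for VM_EXIT_SAVE_VMX_PREEMPTION_TIMER. The two conversions from the hunks above, as standalone arithmetic:

    #include <stdio.h>
    #include <stdint.h>

    #define EMULATED_PREEMPTION_TIMER_RATE 5   /* 2^5 TSC cycles per tick */

    static uint64_t timer_value_to_ns(uint64_t value, uint64_t tsc_khz)
    {
        uint64_t ns = value << EMULATED_PREEMPTION_TIMER_RATE;

        ns *= 1000000;                         /* kHz -> ns scaling */
        return ns / tsc_khz;
    }

    static uint64_t ns_to_timer_value(uint64_t ns, uint64_t tsc_khz)
    {
        uint64_t value = ns * tsc_khz / 1000000;

        return value >> EMULATED_PREEMPTION_TIMER_RATE;
    }

    int main(void)
    {
        uint64_t tsc_khz = 2400000;            /* 2.4 GHz guest TSC */
        uint64_t ns = timer_value_to_ns(1000, tsc_khz);

        printf("1000 ticks -> %llu ns -> %llu ticks\n",
               (unsigned long long)ns,
               (unsigned long long)ns_to_timer_value(ns, tsc_khz));
        return 0;
    }
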
index 2b85784..d1c55f8 100644 (file)
@@ -595,13 +595,13 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
 
 int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 {
-       u64 xcr0;
+       u64 xcr0 = xcr;
+       u64 old_xcr0 = vcpu->arch.xcr0;
        u64 valid_bits;
 
        /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
        if (index != XCR_XFEATURE_ENABLED_MASK)
                return 1;
-       xcr0 = xcr;
        if (!(xcr0 & XSTATE_FP))
                return 1;
        if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
@@ -616,8 +616,14 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
        if (xcr0 & ~valid_bits)
                return 1;
 
+       if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR)))
+               return 1;
+
        kvm_put_guest_xcr0(vcpu);
        vcpu->arch.xcr0 = xcr0;
+
+       if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK)
+               kvm_update_cpuid(vcpu);
        return 0;
 }
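
__kvm_set_xcr() picks up two rules visible in the hunk above: the MPX components XSTATE_BNDREGS and XSTATE_BNDCSR must be enabled or disabled together, and any change to the extended-state bits re-runs kvm_update_cpuid() so the reported xsave area size stays consistent. The pairing check, spelled out (bit positions 3 and 4 are the architectural MPX bits):

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    #define XSTATE_BNDREGS (1ULL << 3)
    #define XSTATE_BNDCSR  (1ULL << 4)

    /* reject XCR0 values that set only one of the two MPX components */
    static bool mpx_bits_consistent(uint64_t xcr0)
    {
        return !(xcr0 & XSTATE_BNDREGS) == !(xcr0 & XSTATE_BNDCSR);
    }

    int main(void)
    {
        printf("%d %d %d\n",
               mpx_bits_consistent(0),                               /* 1 */
               mpx_bits_consistent(XSTATE_BNDREGS),                  /* 0 */
               mpx_bits_consistent(XSTATE_BNDREGS | XSTATE_BNDCSR)); /* 1 */
        return 0;
    }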
 
@@ -753,7 +759,9 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu)
        else
                dr7 = vcpu->arch.dr7;
        kvm_x86_ops->set_dr7(vcpu, dr7);
-       vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
+       vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
+       if (dr7 & DR7_BP_EN_MASK)
+               vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
 }
 
 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
@@ -879,7 +887,7 @@ static u32 msrs_to_save[] = {
        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
        MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
-       MSR_IA32_FEATURE_CONTROL
+       MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS
 };
 
 static unsigned num_msrs_to_save;
@@ -1581,7 +1589,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        /* With all the info we got, fill in the values */
        vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
        vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
-       vcpu->last_kernel_ns = kernel_ns;
        vcpu->last_guest_tsc = tsc_timestamp;
 
        /*
@@ -1623,14 +1630,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
  * the others.
  *
  * So in those cases, request a kvmclock update for all vcpus.
- * The worst case for a remote vcpu to update its kvmclock
- * is then bounded by maximum nohz sleep latency.
+ * We need to rate-limit these requests though, as they can
+ * considerably slow guests that have a large number of vcpus.
+ * The time for a remote vcpu to update its kvmclock is bound
+ * by the delay we use to rate-limit the updates.
  */
 
-static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
+
+static void kvmclock_update_fn(struct work_struct *work)
 {
        int i;
-       struct kvm *kvm = v->kvm;
+       struct delayed_work *dwork = to_delayed_work(work);
+       struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+                                          kvmclock_update_work);
+       struct kvm *kvm = container_of(ka, struct kvm, arch);
        struct kvm_vcpu *vcpu;
 
        kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -1639,6 +1653,29 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
        }
 }
 
+static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+{
+       struct kvm *kvm = v->kvm;
+
+       set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests);
+       schedule_delayed_work(&kvm->arch.kvmclock_update_work,
+                                       KVMCLOCK_UPDATE_DELAY);
+}
+
+#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
+
+static void kvmclock_sync_fn(struct work_struct *work)
+{
+       struct delayed_work *dwork = to_delayed_work(work);
+       struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+                                          kvmclock_sync_work);
+       struct kvm *kvm = container_of(ka, struct kvm, arch);
+
+       schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
+       schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+                                       KVMCLOCK_SYNC_PERIOD);
+}
+
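
The scheme above leans on a property of delayed work items: schedule_delayed_work() queues nothing while the item is still pending, so any number of triggers inside the delay window collapse into a single execution, and the sync work simply re-arms both itself and the update work. A minimal module-style sketch of that rate-limiting idiom (the names and the demo trigger loop below are illustrative, not code from this patch):

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

#define UPDATE_DELAY msecs_to_jiffies(100)

static void update_fn(struct work_struct *work)
{
        pr_info("one coalesced update per 100ms window\n");
}

static DECLARE_DELAYED_WORK(update_work, update_fn);

static void trigger_update(void)
{
        /* No-op while update_work is already pending: this is the rate limit. */
        schedule_delayed_work(&update_work, UPDATE_DELAY);
}

static int __init demo_init(void)
{
        int i;

        for (i = 0; i < 1000; i++)
                trigger_update();       /* results in a single update_fn() run */
        return 0;
}

static void __exit demo_exit(void)
{
        cancel_delayed_work_sync(&update_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
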
 static bool msr_mtrr_valid(unsigned msr)
 {
        switch (msr) {
@@ -2323,9 +2360,12 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case HV_X64_MSR_VP_INDEX: {
                int r;
                struct kvm_vcpu *v;
-               kvm_for_each_vcpu(r, v, vcpu->kvm)
-                       if (v == vcpu)
+               kvm_for_each_vcpu(r, v, vcpu->kvm) {
+                       if (v == vcpu) {
                                data = r;
+                               break;
+                       }
+               }
                break;
        }
        case HV_X64_MSR_EOI:
@@ -2617,6 +2657,7 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_KVMCLOCK_CTRL:
        case KVM_CAP_READONLY_MEM:
        case KVM_CAP_HYPERV_TIME:
+       case KVM_CAP_IOAPIC_POLARITY_IGNORED:
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
        case KVM_CAP_ASSIGN_DEV_IRQ:
        case KVM_CAP_PCI_2_3:
@@ -3043,9 +3084,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
                 * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
                 * with old userspace.
                 */
-               if (xstate_bv & ~KVM_SUPPORTED_XCR0)
-                       return -EINVAL;
-               if (xstate_bv & ~host_xcr0)
+               if (xstate_bv & ~kvm_supported_xcr0())
                        return -EINVAL;
                memcpy(&vcpu->arch.guest_fpu.state->xsave,
                        guest_xsave->region, vcpu->arch.guest_xstate_size);
@@ -3898,6 +3937,23 @@ static void kvm_init_msr_list(void)
        for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
                if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
                        continue;
+
+               /*
+                * Even MSRs that are valid in the host may not be exposed
+                * to the guests in some cases.  We could work around this
+                * in VMX with the generic MSR save/load machinery, but it
+                * is not really worthwhile since it will only
+                * happen with nested virtualization.
+                */
+               switch (msrs_to_save[i]) {
+               case MSR_IA32_BNDCFGS:
+                       if (!kvm_x86_ops->mpx_supported())
+                               continue;
+                       break;
+               default:
+                       break;
+               }
+
                if (j < i)
                        msrs_to_save[j] = msrs_to_save[i];
                j++;
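
The loop above uses a common two-index, in-place compaction idiom: i visits every candidate, j tracks the next free slot, and elements are only copied once something earlier has been skipped. A standalone sketch of the same idiom with made-up data:

#include <assert.h>
#include <stddef.h>

/* Keep only even values, compacting in place and returning the new length --
 * the same i/j walk that filters msrs_to_save above. */
static size_t keep_even(int *v, size_t n)
{
        size_t i, j;

        for (i = j = 0; i < n; i++) {
                if (v[i] % 2)           /* analogous to "MSR not exposable" */
                        continue;
                if (j < i)
                        v[j] = v[i];
                j++;
        }
        return j;
}

int main(void)
{
        int v[] = { 2, 3, 4, 5, 6 };
        size_t n = keep_even(v, 5);

        assert(n == 3 && v[0] == 2 && v[1] == 4 && v[2] == 6);
        return 0;
}
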
@@ -4394,6 +4450,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
        if (!exchanged)
                return X86EMUL_CMPXCHG_FAILED;
 
+       mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
        kvm_mmu_pte_write(vcpu, gpa, new, bytes);
 
        return X86EMUL_CONTINUE;
@@ -5537,9 +5594,10 @@ int kvm_arch_init(void *opaque)
                goto out_free_percpu;
 
        kvm_set_mmio_spte_mask();
-       kvm_init_msr_list();
 
        kvm_x86_ops = ops;
+       kvm_init_msr_list();
+
        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
                        PT_DIRTY_MASK, PT64_NX_MASK, 0);
 
@@ -5782,8 +5840,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
        kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
 }
 
-static void inject_pending_event(struct kvm_vcpu *vcpu)
+static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 {
+       int r;
+
        /* try to reinject previous events if any */
        if (vcpu->arch.exception.pending) {
                trace_kvm_inj_exception(vcpu->arch.exception.nr,
@@ -5793,17 +5853,23 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
                                          vcpu->arch.exception.has_error_code,
                                          vcpu->arch.exception.error_code,
                                          vcpu->arch.exception.reinject);
-               return;
+               return 0;
        }
 
        if (vcpu->arch.nmi_injected) {
                kvm_x86_ops->set_nmi(vcpu);
-               return;
+               return 0;
        }
 
        if (vcpu->arch.interrupt.pending) {
                kvm_x86_ops->set_irq(vcpu);
-               return;
+               return 0;
+       }
+
+       if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
+               r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+               if (r != 0)
+                       return r;
        }
 
        /* try to inject new event if pending */
@@ -5820,6 +5886,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
                        kvm_x86_ops->set_irq(vcpu);
                }
        }
+       return 0;
 }
 
 static void process_nmi(struct kvm_vcpu *vcpu)
@@ -5924,15 +5991,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        goto out;
                }
 
-               inject_pending_event(vcpu);
-
+               if (inject_pending_event(vcpu, req_int_win) != 0)
+                       req_immediate_exit = true;
                /* enable NMI/IRQ window open exits if needed */
-               if (vcpu->arch.nmi_pending)
-                       req_immediate_exit =
-                               kvm_x86_ops->enable_nmi_window(vcpu) != 0;
+               else if (vcpu->arch.nmi_pending)
+                       kvm_x86_ops->enable_nmi_window(vcpu);
                else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
-                       req_immediate_exit =
-                               kvm_x86_ops->enable_irq_window(vcpu) != 0;
+                       kvm_x86_ops->enable_irq_window(vcpu);
 
                if (kvm_lapic_enabled(vcpu)) {
                        /*
@@ -5992,11 +6057,27 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                set_debugreg(vcpu->arch.eff_db[1], 1);
                set_debugreg(vcpu->arch.eff_db[2], 2);
                set_debugreg(vcpu->arch.eff_db[3], 3);
+               set_debugreg(vcpu->arch.dr6, 6);
        }
 
        trace_kvm_entry(vcpu->vcpu_id);
        kvm_x86_ops->run(vcpu);
 
+       /*
+        * Do this here before restoring debug registers on the host.  And
+        * since we do this before handling the vmexit, a DR access vmexit
+        * can (a) read the correct value of the debug registers, (b) set
+        * KVM_DEBUGREG_WONT_EXIT again.
+        */
+       if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
+               int i;
+
+               WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
+               kvm_x86_ops->sync_dirty_debug_regs(vcpu);
+               for (i = 0; i < KVM_NR_DB_REGS; i++)
+                       vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+       }
+
        /*
         * If the guest has used debug registers, at least dr7
         * will be disabled while returning to the host.
@@ -6711,6 +6792,7 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 {
        int r;
        struct msr_data msr;
+       struct kvm *kvm = vcpu->kvm;
 
        r = vcpu_load(vcpu);
        if (r)
@@ -6721,6 +6803,9 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
        kvm_write_tsc(vcpu, &msr);
        vcpu_put(vcpu);
 
+       schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+                                       KVMCLOCK_SYNC_PERIOD);
+
        return r;
 }
 
@@ -7013,6 +7098,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        pvclock_update_vm_gtod_copy(kvm);
 
+       INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
+       INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
+
        return 0;
 }
 
@@ -7050,6 +7138,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
 
 void kvm_arch_sync_events(struct kvm *kvm)
 {
+       cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
+       cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
        kvm_free_all_assigned_devices(kvm);
        kvm_free_pit(kvm);
 }
@@ -7248,6 +7338,9 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
+       if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
+               kvm_x86_ops->check_nested_events(vcpu, false);
+
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                !vcpu->arch.apf.halted)
                || !list_empty_careful(&vcpu->async_pf.done)
index 8da5823..8c97bac 100644 (file)
@@ -122,9 +122,12 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
        gva_t addr, void *val, unsigned int bytes,
        struct x86_exception *exception);
 
-#define KVM_SUPPORTED_XCR0     (XSTATE_FP | XSTATE_SSE | XSTATE_YMM)
+#define KVM_SUPPORTED_XCR0     (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
+                               | XSTATE_BNDREGS | XSTATE_BNDCSR)
 extern u64 host_xcr0;
 
+extern u64 kvm_supported_xcr0(void);
+
 extern unsigned int min_timer_period_us;
 
 extern struct static_key kvm_no_apic_vcpu;
index 0fc5848..1e1fc67 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * ccw based virtio transport
  *
- * Copyright IBM Corp. 2012
+ * Copyright IBM Corp. 2012, 2014
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License (version 2 only)
@@ -32,6 +32,8 @@
 #include <asm/cio.h>
 #include <asm/ccwdev.h>
 #include <asm/virtio-ccw.h>
+#include <asm/isc.h>
+#include <asm/airq.h>
 
 /*
  * virtio related functions
@@ -58,6 +60,9 @@ struct virtio_ccw_device {
        unsigned long indicators;
        unsigned long indicators2;
        struct vq_config_block *config_block;
+       bool is_thinint;
+       bool going_away;
+       void *airq_info;
 };
 
 struct vq_info_block {
@@ -72,15 +77,38 @@ struct virtio_feature_desc {
        __u8 index;
 } __packed;
 
+struct virtio_thinint_area {
+       unsigned long summary_indicator;
+       unsigned long indicator;
+       u64 bit_nr;
+       u8 isc;
+} __packed;
+
 struct virtio_ccw_vq_info {
        struct virtqueue *vq;
        int num;
        void *queue;
        struct vq_info_block *info_block;
+       int bit_nr;
        struct list_head node;
        long cookie;
 };
 
+#define VIRTIO_AIRQ_ISC IO_SCH_ISC /* inherit from subchannel */
+
+#define VIRTIO_IV_BITS (L1_CACHE_BYTES * 8)
+#define MAX_AIRQ_AREAS 20
+
+static int virtio_ccw_use_airq = 1;
+
+struct airq_info {
+       rwlock_t lock;
+       u8 summary_indicator;
+       struct airq_struct airq;
+       struct airq_iv *aiv;
+};
+static struct airq_info *airq_areas[MAX_AIRQ_AREAS];
+
 #define CCW_CMD_SET_VQ 0x13
 #define CCW_CMD_VDEV_RESET 0x33
 #define CCW_CMD_SET_IND 0x43
@@ -91,6 +119,7 @@ struct virtio_ccw_vq_info {
 #define CCW_CMD_WRITE_CONF 0x21
 #define CCW_CMD_WRITE_STATUS 0x31
 #define CCW_CMD_READ_VQ_CONF 0x32
+#define CCW_CMD_SET_IND_ADAPTER 0x73
 
 #define VIRTIO_CCW_DOING_SET_VQ 0x00010000
 #define VIRTIO_CCW_DOING_RESET 0x00040000
@@ -102,6 +131,7 @@ struct virtio_ccw_vq_info {
 #define VIRTIO_CCW_DOING_SET_IND 0x01000000
 #define VIRTIO_CCW_DOING_READ_VQ_CONF 0x02000000
 #define VIRTIO_CCW_DOING_SET_CONF_IND 0x04000000
+#define VIRTIO_CCW_DOING_SET_IND_ADAPTER 0x08000000
 #define VIRTIO_CCW_INTPARM_MASK 0xffff0000
 
 static struct virtio_ccw_device *to_vc_device(struct virtio_device *vdev)
@@ -109,6 +139,125 @@ static struct virtio_ccw_device *to_vc_device(struct virtio_device *vdev)
        return container_of(vdev, struct virtio_ccw_device, vdev);
 }
 
+static void drop_airq_indicator(struct virtqueue *vq, struct airq_info *info)
+{
+       unsigned long i, flags;
+
+       write_lock_irqsave(&info->lock, flags);
+       for (i = 0; i < airq_iv_end(info->aiv); i++) {
+               if (vq == (void *)airq_iv_get_ptr(info->aiv, i)) {
+                       airq_iv_free_bit(info->aiv, i);
+                       airq_iv_set_ptr(info->aiv, i, 0);
+                       break;
+               }
+       }
+       write_unlock_irqrestore(&info->lock, flags);
+}
+
+static void virtio_airq_handler(struct airq_struct *airq)
+{
+       struct airq_info *info = container_of(airq, struct airq_info, airq);
+       unsigned long ai;
+
+       inc_irq_stat(IRQIO_VAI);
+       read_lock(&info->lock);
+       /* Walk through indicators field, summary indicator active. */
+       for (ai = 0;;) {
+               ai = airq_iv_scan(info->aiv, ai, airq_iv_end(info->aiv));
+               if (ai == -1UL)
+                       break;
+               vring_interrupt(0, (void *)airq_iv_get_ptr(info->aiv, ai));
+       }
+       info->summary_indicator = 0;
+       smp_wmb();
+       /* Walk through indicators field, summary indicator not active. */
+       for (ai = 0;;) {
+               ai = airq_iv_scan(info->aiv, ai, airq_iv_end(info->aiv));
+               if (ai == -1UL)
+                       break;
+               vring_interrupt(0, (void *)airq_iv_get_ptr(info->aiv, ai));
+       }
+       read_unlock(&info->lock);
+}
+
+static struct airq_info *new_airq_info(void)
+{
+       struct airq_info *info;
+       int rc;
+
+       info = kzalloc(sizeof(*info), GFP_KERNEL);
+       if (!info)
+               return NULL;
+       rwlock_init(&info->lock);
+       info->aiv = airq_iv_create(VIRTIO_IV_BITS, AIRQ_IV_ALLOC | AIRQ_IV_PTR);
+       if (!info->aiv) {
+               kfree(info);
+               return NULL;
+       }
+       info->airq.handler = virtio_airq_handler;
+       info->airq.lsi_ptr = &info->summary_indicator;
+       info->airq.lsi_mask = 0xff;
+       info->airq.isc = VIRTIO_AIRQ_ISC;
+       rc = register_adapter_interrupt(&info->airq);
+       if (rc) {
+               airq_iv_release(info->aiv);
+               kfree(info);
+               return NULL;
+       }
+       return info;
+}
+
+static void destroy_airq_info(struct airq_info *info)
+{
+       if (!info)
+               return;
+
+       unregister_adapter_interrupt(&info->airq);
+       airq_iv_release(info->aiv);
+       kfree(info);
+}
+
+static unsigned long get_airq_indicator(struct virtqueue *vqs[], int nvqs,
+                                       u64 *first, void **airq_info)
+{
+       int i, j;
+       struct airq_info *info;
+       unsigned long indicator_addr = 0;
+       unsigned long bit, flags;
+
+       for (i = 0; i < MAX_AIRQ_AREAS && !indicator_addr; i++) {
+               if (!airq_areas[i])
+                       airq_areas[i] = new_airq_info();
+               info = airq_areas[i];
+               if (!info)
+                       return 0;
+               write_lock_irqsave(&info->lock, flags);
+               bit = airq_iv_alloc(info->aiv, nvqs);
+               if (bit == -1UL) {
+                       /* Not enough vacancies. */
+                       write_unlock_irqrestore(&info->lock, flags);
+                       continue;
+               }
+               *first = bit;
+               *airq_info = info;
+               indicator_addr = (unsigned long)info->aiv->vector;
+               for (j = 0; j < nvqs; j++) {
+                       airq_iv_set_ptr(info->aiv, bit + j,
+                                       (unsigned long)vqs[j]);
+               }
+               write_unlock_irqrestore(&info->lock, flags);
+       }
+       return indicator_addr;
+}
+
+static void virtio_ccw_drop_indicators(struct virtio_ccw_device *vcdev)
+{
+       struct virtio_ccw_vq_info *info;
+
+       list_for_each_entry(info, &vcdev->virtqueues, node)
+               drop_airq_indicator(info->vq, vcdev->airq_info);
+}
+
 static int doing_io(struct virtio_ccw_device *vcdev, __u32 flag)
 {
        unsigned long flags;
@@ -145,6 +294,51 @@ static int ccw_io_helper(struct virtio_ccw_device *vcdev,
        return ret ? ret : vcdev->err;
 }
 
+static void virtio_ccw_drop_indicator(struct virtio_ccw_device *vcdev,
+                                     struct ccw1 *ccw)
+{
+       int ret;
+       unsigned long *indicatorp = NULL;
+       struct virtio_thinint_area *thinint_area = NULL;
+       struct airq_info *airq_info = vcdev->airq_info;
+
+       if (vcdev->is_thinint) {
+               thinint_area = kzalloc(sizeof(*thinint_area),
+                                      GFP_DMA | GFP_KERNEL);
+               if (!thinint_area)
+                       return;
+               thinint_area->summary_indicator =
+                       (unsigned long) &airq_info->summary_indicator;
+               thinint_area->isc = VIRTIO_AIRQ_ISC;
+               ccw->cmd_code = CCW_CMD_SET_IND_ADAPTER;
+               ccw->count = sizeof(*thinint_area);
+               ccw->cda = (__u32)(unsigned long) thinint_area;
+       } else {
+               indicatorp = kmalloc(sizeof(&vcdev->indicators),
+                                    GFP_DMA | GFP_KERNEL);
+               if (!indicatorp)
+                       return;
+               *indicatorp = 0;
+               ccw->cmd_code = CCW_CMD_SET_IND;
+               ccw->count = sizeof(vcdev->indicators);
+               ccw->cda = (__u32)(unsigned long) indicatorp;
+       }
+       /* Deregister indicators from host. */
+       vcdev->indicators = 0;
+       ccw->flags = 0;
+       ret = ccw_io_helper(vcdev, ccw,
+                           vcdev->is_thinint ?
+                           VIRTIO_CCW_DOING_SET_IND_ADAPTER :
+                           VIRTIO_CCW_DOING_SET_IND);
+       if (ret && (ret != -ENODEV))
+               dev_info(&vcdev->cdev->dev,
+                        "Failed to deregister indicators (%d)\n", ret);
+       else if (vcdev->is_thinint)
+               virtio_ccw_drop_indicators(vcdev);
+       kfree(indicatorp);
+       kfree(thinint_area);
+}
+
 static inline long do_kvm_notify(struct subchannel_id schid,
                                 unsigned long queue_index,
                                 long cookie)
@@ -232,11 +426,13 @@ static void virtio_ccw_del_vqs(struct virtio_device *vdev)
 {
        struct virtqueue *vq, *n;
        struct ccw1 *ccw;
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
 
        ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
        if (!ccw)
                return;
 
+       virtio_ccw_drop_indicator(vcdev, ccw);
 
        list_for_each_entry_safe(vq, n, &vdev->vqs, list)
                virtio_ccw_del_vq(vq, ccw);
@@ -326,6 +522,54 @@ out_err:
        return ERR_PTR(err);
 }
 
+static int virtio_ccw_register_adapter_ind(struct virtio_ccw_device *vcdev,
+                                          struct virtqueue *vqs[], int nvqs,
+                                          struct ccw1 *ccw)
+{
+       int ret;
+       struct virtio_thinint_area *thinint_area = NULL;
+       struct airq_info *info;
+
+       thinint_area = kzalloc(sizeof(*thinint_area), GFP_DMA | GFP_KERNEL);
+       if (!thinint_area) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       /* Try to get an indicator. */
+       thinint_area->indicator = get_airq_indicator(vqs, nvqs,
+                                                    &thinint_area->bit_nr,
+                                                    &vcdev->airq_info);
+       if (!thinint_area->indicator) {
+               ret = -ENOSPC;
+               goto out;
+       }
+       info = vcdev->airq_info;
+       thinint_area->summary_indicator =
+               (unsigned long) &info->summary_indicator;
+       thinint_area->isc = VIRTIO_AIRQ_ISC;
+       ccw->cmd_code = CCW_CMD_SET_IND_ADAPTER;
+       ccw->flags = CCW_FLAG_SLI;
+       ccw->count = sizeof(*thinint_area);
+       ccw->cda = (__u32)(unsigned long)thinint_area;
+       ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND_ADAPTER);
+       if (ret) {
+               if (ret == -EOPNOTSUPP) {
+                       /*
+                        * The host does not support adapter interrupts
+                        * for virtio-ccw; stop trying.
+                        */
+                       virtio_ccw_use_airq = 0;
+                       pr_info("Adapter interrupts unsupported on host\n");
+               } else
+                       dev_warn(&vcdev->cdev->dev,
+                                "Failed to enable adapter interrupts (%d)\n", ret);
+               virtio_ccw_drop_indicators(vcdev);
+       }
+out:
+       kfree(thinint_area);
+       return ret;
+}
+
 static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
                               struct virtqueue *vqs[],
                               vq_callback_t *callbacks[],
@@ -355,15 +599,23 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
        if (!indicatorp)
                goto out;
        *indicatorp = (unsigned long) &vcdev->indicators;
-       /* Register queue indicators with host. */
-       vcdev->indicators = 0;
-       ccw->cmd_code = CCW_CMD_SET_IND;
-       ccw->flags = 0;
-       ccw->count = sizeof(vcdev->indicators);
-       ccw->cda = (__u32)(unsigned long) indicatorp;
-       ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND);
-       if (ret)
-               goto out;
+       if (vcdev->is_thinint) {
+               ret = virtio_ccw_register_adapter_ind(vcdev, vqs, nvqs, ccw);
+               if (ret)
+                       /* no error, just fall back to legacy interrupts */
+                       vcdev->is_thinint = 0;
+       }
+       if (!vcdev->is_thinint) {
+               /* Register queue indicators with host. */
+               vcdev->indicators = 0;
+               ccw->cmd_code = CCW_CMD_SET_IND;
+               ccw->flags = 0;
+               ccw->count = sizeof(vcdev->indicators);
+               ccw->cda = (__u32)(unsigned long) indicatorp;
+               ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND);
+               if (ret)
+                       goto out;
+       }
        /* Register indicators2 with host for config changes */
        *indicatorp = (unsigned long) &vcdev->indicators2;
        vcdev->indicators2 = 0;
@@ -636,6 +888,8 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
        struct virtqueue *vq;
        struct virtio_driver *drv;
 
+       if (!vcdev)
+               return;
        /* Check if it's a notification from the host. */
        if ((intparm == 0) &&
            (scsw_stctl(&irb->scsw) ==
@@ -663,6 +917,7 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
                case VIRTIO_CCW_DOING_SET_CONF_IND:
                case VIRTIO_CCW_DOING_RESET:
                case VIRTIO_CCW_DOING_READ_VQ_CONF:
+               case VIRTIO_CCW_DOING_SET_IND_ADAPTER:
                        vcdev->curr_io &= ~activity;
                        wake_up(&vcdev->wait_q);
                        break;
@@ -734,23 +989,46 @@ static int virtio_ccw_probe(struct ccw_device *cdev)
        return 0;
 }
 
+static struct virtio_ccw_device *virtio_grab_drvdata(struct ccw_device *cdev)
+{
+       unsigned long flags;
+       struct virtio_ccw_device *vcdev;
+
+       spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
+       vcdev = dev_get_drvdata(&cdev->dev);
+       if (!vcdev || vcdev->going_away) {
+               spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
+               return NULL;
+       }
+       vcdev->going_away = true;
+       spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
+       return vcdev;
+}
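
virtio_grab_drvdata() above is a claim-once helper: it takes the per-device lock, hands the driver data to exactly one caller, and marks going_away so that remove and offline cannot both tear the device down while the interrupt handler still sees a consistent pointer. A small userspace analogue of that idiom (struct and function names below are invented for illustration):

#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct dev_state {
        pthread_mutex_t lock;
        void *drvdata;
        bool going_away;
};

/* Hand out drvdata at most once; every later caller gets NULL. */
static void *grab_drvdata(struct dev_state *dev)
{
        void *data = NULL;

        pthread_mutex_lock(&dev->lock);
        if (dev->drvdata && !dev->going_away) {
                dev->going_away = true;
                data = dev->drvdata;
        }
        pthread_mutex_unlock(&dev->lock);
        return data;
}

int main(void)
{
        int payload = 42;
        struct dev_state dev = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .drvdata = &payload,
                .going_away = false,
        };

        assert(grab_drvdata(&dev) == &payload); /* first caller wins */
        assert(grab_drvdata(&dev) == NULL);     /* e.g. offline after remove */
        return 0;
}
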
+
 static void virtio_ccw_remove(struct ccw_device *cdev)
 {
-       struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev);
+       unsigned long flags;
+       struct virtio_ccw_device *vcdev = virtio_grab_drvdata(cdev);
 
-       if (cdev->online) {
+       if (vcdev && cdev->online)
                unregister_virtio_device(&vcdev->vdev);
-               dev_set_drvdata(&cdev->dev, NULL);
-       }
+       spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
+       dev_set_drvdata(&cdev->dev, NULL);
+       spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
        cdev->handler = NULL;
 }
 
 static int virtio_ccw_offline(struct ccw_device *cdev)
 {
-       struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev);
+       unsigned long flags;
+       struct virtio_ccw_device *vcdev = virtio_grab_drvdata(cdev);
 
-       unregister_virtio_device(&vcdev->vdev);
-       dev_set_drvdata(&cdev->dev, NULL);
+       if (vcdev) {
+               unregister_virtio_device(&vcdev->vdev);
+               spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
+               dev_set_drvdata(&cdev->dev, NULL);
+               spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
+       }
        return 0;
 }
 
@@ -759,6 +1037,7 @@ static int virtio_ccw_online(struct ccw_device *cdev)
 {
        int ret;
        struct virtio_ccw_device *vcdev;
+       unsigned long flags;
 
        vcdev = kzalloc(sizeof(*vcdev), GFP_KERNEL);
        if (!vcdev) {
@@ -778,6 +1057,8 @@ static int virtio_ccw_online(struct ccw_device *cdev)
                goto out_free;
        }
 
+       vcdev->is_thinint = virtio_ccw_use_airq; /* at least try */
+
        vcdev->vdev.dev.parent = &cdev->dev;
        vcdev->vdev.dev.release = virtio_ccw_release_dev;
        vcdev->vdev.config = &virtio_ccw_config_ops;
@@ -786,7 +1067,9 @@ static int virtio_ccw_online(struct ccw_device *cdev)
        INIT_LIST_HEAD(&vcdev->virtqueues);
        spin_lock_init(&vcdev->lock);
 
+       spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
        dev_set_drvdata(&cdev->dev, vcdev);
+       spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
        vcdev->vdev.id.vendor = cdev->id.cu_type;
        vcdev->vdev.id.device = cdev->id.cu_model;
        ret = register_virtio_device(&vcdev->vdev);
@@ -797,7 +1080,9 @@ static int virtio_ccw_online(struct ccw_device *cdev)
        }
        return 0;
 out_put:
+       spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
        dev_set_drvdata(&cdev->dev, NULL);
+       spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
        put_device(&vcdev->vdev.dev);
        return ret;
 out_free:
@@ -935,6 +1220,10 @@ module_init(virtio_ccw_init);
 
 static void __exit virtio_ccw_exit(void)
 {
+       int i;
+
        ccw_driver_unregister(&virtio_ccw_driver);
+       for (i = 0; i < MAX_AIRQ_AREAS; i++)
+               destroy_airq_info(airq_areas[i]);
 }
 module_exit(virtio_ccw_exit);
index b8e9a43..7d21cf9 100644 (file)
@@ -192,7 +192,7 @@ struct kvm_async_pf {
 
 void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
 void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
-int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
                       struct kvm_arch_async_pf *arch);
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
@@ -297,6 +297,14 @@ static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memsl
        return ALIGN(memslot->npages, BITS_PER_LONG) / 8;
 }
 
+struct kvm_s390_adapter_int {
+       u64 ind_addr;
+       u64 summary_addr;
+       u64 ind_offset;
+       u32 summary_offset;
+       u32 adapter_id;
+};
+
 struct kvm_kernel_irq_routing_entry {
        u32 gsi;
        u32 type;
@@ -309,6 +317,7 @@ struct kvm_kernel_irq_routing_entry {
                        unsigned pin;
                } irqchip;
                struct msi_msg msi;
+               struct kvm_s390_adapter_int adapter;
        };
        struct hlist_node link;
 };
@@ -401,7 +410,9 @@ struct kvm {
        unsigned long mmu_notifier_seq;
        long mmu_notifier_count;
 #endif
-       long tlbs_dirty;
+       /* Protected by mmu_lock */
+       bool tlbs_dirty;
+
        struct list_head devices;
 };
 
@@ -911,7 +922,11 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
 
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
 
+#ifdef CONFIG_S390
+#define KVM_MAX_IRQ_ROUTES 4096 /* FIXME: we can have more than that... */
+#else
 #define KVM_MAX_IRQ_ROUTES 1024
+#endif
 
 int kvm_setup_default_irq_routing(struct kvm *kvm);
 int kvm_set_irq_routing(struct kvm *kvm,
@@ -1064,6 +1079,7 @@ extern struct kvm_device_ops kvm_mpic_ops;
 extern struct kvm_device_ops kvm_xics_ops;
 extern struct kvm_device_ops kvm_vfio_ops;
 extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
+extern struct kvm_device_ops kvm_flic_ops;
 
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
 
index 932d7f2..a8f4ee5 100644 (file)
@@ -413,6 +413,8 @@ struct kvm_s390_psw {
 #define KVM_S390_PROGRAM_INT           0xfffe0001u
 #define KVM_S390_SIGP_SET_PREFIX       0xfffe0002u
 #define KVM_S390_RESTART               0xfffe0003u
+#define KVM_S390_INT_PFAULT_INIT       0xfffe0004u
+#define KVM_S390_INT_PFAULT_DONE       0xfffe0005u
 #define KVM_S390_MCHK                  0xfffe1000u
 #define KVM_S390_INT_VIRTIO            0xffff2603u
 #define KVM_S390_INT_SERVICE           0xffff2401u
@@ -434,6 +436,69 @@ struct kvm_s390_interrupt {
        __u64 parm64;
 };
 
+struct kvm_s390_io_info {
+       __u16 subchannel_id;
+       __u16 subchannel_nr;
+       __u32 io_int_parm;
+       __u32 io_int_word;
+};
+
+struct kvm_s390_ext_info {
+       __u32 ext_params;
+       __u32 pad;
+       __u64 ext_params2;
+};
+
+struct kvm_s390_pgm_info {
+       __u64 trans_exc_code;
+       __u64 mon_code;
+       __u64 per_address;
+       __u32 data_exc_code;
+       __u16 code;
+       __u16 mon_class_nr;
+       __u8 per_code;
+       __u8 per_atmid;
+       __u8 exc_access_id;
+       __u8 per_access_id;
+       __u8 op_access_id;
+       __u8 pad[3];
+};
+
+struct kvm_s390_prefix_info {
+       __u32 address;
+};
+
+struct kvm_s390_extcall_info {
+       __u16 code;
+};
+
+struct kvm_s390_emerg_info {
+       __u16 code;
+};
+
+struct kvm_s390_mchk_info {
+       __u64 cr14;
+       __u64 mcic;
+       __u64 failing_storage_address;
+       __u32 ext_damage_code;
+       __u32 pad;
+       __u8 fixed_logout[16];
+};
+
+struct kvm_s390_irq {
+       __u64 type;
+       union {
+               struct kvm_s390_io_info io;
+               struct kvm_s390_ext_info ext;
+               struct kvm_s390_pgm_info pgm;
+               struct kvm_s390_emerg_info emerg;
+               struct kvm_s390_extcall_info extcall;
+               struct kvm_s390_prefix_info prefix;
+               struct kvm_s390_mchk_info mchk;
+               char reserved[64];
+       } u;
+};
+
 /* for KVM_SET_GUEST_DEBUG */
 
 #define KVM_GUESTDBG_ENABLE            0x00000001
@@ -675,6 +740,9 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_SPAPR_MULTITCE 94
 #define KVM_CAP_EXT_EMUL_CPUID 95
 #define KVM_CAP_HYPERV_TIME 96
+#define KVM_CAP_IOAPIC_POLARITY_IGNORED 97
+#define KVM_CAP_ENABLE_CAP_VM 98
+#define KVM_CAP_S390_IRQCHIP 99
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -690,9 +758,18 @@ struct kvm_irq_routing_msi {
        __u32 pad;
 };
 
+struct kvm_irq_routing_s390_adapter {
+       __u64 ind_addr;
+       __u64 summary_addr;
+       __u64 ind_offset;
+       __u32 summary_offset;
+       __u32 adapter_id;
+};
+
 /* gsi routing entry types */
 #define KVM_IRQ_ROUTING_IRQCHIP 1
 #define KVM_IRQ_ROUTING_MSI 2
+#define KVM_IRQ_ROUTING_S390_ADAPTER 3
 
 struct kvm_irq_routing_entry {
        __u32 gsi;
@@ -702,6 +779,7 @@ struct kvm_irq_routing_entry {
        union {
                struct kvm_irq_routing_irqchip irqchip;
                struct kvm_irq_routing_msi msi;
+               struct kvm_irq_routing_s390_adapter adapter;
                __u32 pad[8];
        } u;
 };
@@ -855,6 +933,7 @@ struct kvm_device_attr {
 #define   KVM_DEV_VFIO_GROUP_ADD                       1
 #define   KVM_DEV_VFIO_GROUP_DEL                       2
 #define KVM_DEV_TYPE_ARM_VGIC_V2       5
+#define KVM_DEV_TYPE_FLIC              6
 
 /*
  * ioctls for VM fds
@@ -1009,6 +1088,10 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_DEBUGREGS */
 #define KVM_GET_DEBUGREGS         _IOR(KVMIO,  0xa1, struct kvm_debugregs)
 #define KVM_SET_DEBUGREGS         _IOW(KVMIO,  0xa2, struct kvm_debugregs)
+/*
+ * vcpu version available with KVM_ENABLE_CAP
+ * vm version available with KVM_CAP_ENABLE_CAP_VM
+ */
 #define KVM_ENABLE_CAP            _IOW(KVMIO,  0xa3, struct kvm_enable_cap)
 /* Available with KVM_CAP_XSAVE */
 #define KVM_GET_XSAVE            _IOR(KVMIO,  0xa4, struct kvm_xsave)
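
As the new comment notes, KVM_ENABLE_CAP can now also be issued on a VM file descriptor when KVM_CAP_ENABLE_CAP_VM is present. A hedged userspace sketch of what that might look like, with error handling trimmed; the choice of KVM_CAP_S390_IRQCHIP (added in this merge) as the capability being enabled is an assumption about the s390 code rather than something shown in this hunk:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int enable_vm_cap(void)
{
        int kvm_fd = open("/dev/kvm", O_RDWR);
        int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
        struct kvm_enable_cap cap;

        memset(&cap, 0, sizeof(cap));
        cap.cap = KVM_CAP_S390_IRQCHIP;

        /* vm-level KVM_ENABLE_CAP: 0 on success, -1 with errno set otherwise */
        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
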
index fbe1a48..13f2d19 100644 (file)
@@ -22,6 +22,10 @@ config KVM_MMIO
 config KVM_ASYNC_PF
        bool
 
+# Toggle to switch between direct notification and batch job
+config KVM_ASYNC_PF_SYNC
+       bool
+
 config HAVE_KVM_MSI
        bool
 
index 8631d9c..10df100 100644 (file)
 #include "async_pf.h"
 #include <trace/events/kvm.h>
 
+static inline void kvm_async_page_present_sync(struct kvm_vcpu *vcpu,
+                                              struct kvm_async_pf *work)
+{
+#ifdef CONFIG_KVM_ASYNC_PF_SYNC
+       kvm_arch_async_page_present(vcpu, work);
+#endif
+}
+static inline void kvm_async_page_present_async(struct kvm_vcpu *vcpu,
+                                               struct kvm_async_pf *work)
+{
+#ifndef CONFIG_KVM_ASYNC_PF_SYNC
+       kvm_arch_async_page_present(vcpu, work);
+#endif
+}
+
 static struct kmem_cache *async_pf_cache;
 
 int kvm_async_pf_init(void)
@@ -69,6 +84,7 @@ static void async_pf_execute(struct work_struct *work)
        down_read(&mm->mmap_sem);
        get_user_pages(current, mm, addr, 1, 1, 0, NULL, NULL);
        up_read(&mm->mmap_sem);
+       kvm_async_page_present_sync(vcpu, apf);
        unuse_mm(mm);
 
        spin_lock(&vcpu->async_pf.lock);
@@ -97,11 +113,16 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
                        list_entry(vcpu->async_pf.queue.next,
                                   typeof(*work), queue);
                list_del(&work->queue);
+
+#ifdef CONFIG_KVM_ASYNC_PF_SYNC
+               flush_work(&work->work);
+#else
                if (cancel_work_sync(&work->work)) {
                        mmdrop(work->mm);
                        kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */
                        kmem_cache_free(async_pf_cache, work);
                }
+#endif
        }
 
        spin_lock(&vcpu->async_pf.lock);
@@ -130,7 +151,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
                spin_unlock(&vcpu->async_pf.lock);
 
                kvm_arch_async_page_ready(vcpu, work);
-               kvm_arch_async_page_present(vcpu, work);
+               kvm_async_page_present_async(vcpu, work);
 
                list_del(&work->queue);
                vcpu->async_pf.queued--;
@@ -138,7 +159,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
        }
 }
 
-int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
                       struct kvm_arch_async_pf *arch)
 {
        struct kvm_async_pf *work;
@@ -159,7 +180,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
        work->wakeup_all = false;
        work->vcpu = vcpu;
        work->gva = gva;
-       work->addr = gfn_to_hva(vcpu->kvm, gfn);
+       work->addr = hva;
        work->arch = *arch;
        work->mm = current->mm;
        atomic_inc(&work->mm->mm_count);
index abe4d60..29c2a04 100644 (file)
@@ -391,19 +391,19 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
                                           lockdep_is_held(&kvm->irqfds.lock));
        irqfd_update(kvm, irqfd, irq_rt);
 
-       events = f.file->f_op->poll(f.file, &irqfd->pt);
-
        list_add_tail(&irqfd->list, &kvm->irqfds.items);
 
+       spin_unlock_irq(&kvm->irqfds.lock);
+
        /*
         * Check if there was an event already pending on the eventfd
         * before we registered, and trigger it as if we didn't miss it.
         */
+       events = f.file->f_op->poll(f.file, &irqfd->pt);
+
        if (events & POLLIN)
                schedule_work(&irqfd->inject);
 
-       spin_unlock_irq(&kvm->irqfds.lock);
-
        /*
         * do not drop the file until the irqfd is fully initialized, otherwise
         * we might race against the POLLHUP
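
The reordering above publishes the irqfd on the list and drops the spinlock before calling ->poll, then injects any event that was already pending. The eventfd behaviour this relies on is that a count written before anyone polls is not lost; a tiny userspace demonstration:

#include <assert.h>
#include <poll.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
        int efd = eventfd(0, 0);
        uint64_t val = 1;
        struct pollfd pfd = { .fd = efd, .events = POLLIN };

        /* Signal the eventfd *before* anyone polls it... */
        assert(write(efd, &val, sizeof(val)) == sizeof(val));

        /* ...the event is not lost: poll() still reports it as readable. */
        assert(poll(&pfd, 1, 0) == 1);
        assert(pfd.revents & POLLIN);

        close(efd);
        return 0;
}
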
index ce9ed99..d4b6015 100644 (file)
@@ -50,7 +50,7 @@
 #else
 #define ioapic_debug(fmt, arg...)
 #endif
-static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq,
+static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
                bool line_status);
 
 static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
@@ -163,23 +163,67 @@ static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
        return false;
 }
 
-static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx,
-               bool line_status)
+static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
+               int irq_level, bool line_status)
 {
-       union kvm_ioapic_redirect_entry *pent;
-       int injected = -1;
+       union kvm_ioapic_redirect_entry entry;
+       u32 mask = 1 << irq;
+       u32 old_irr;
+       int edge, ret;
 
-       pent = &ioapic->redirtbl[idx];
+       entry = ioapic->redirtbl[irq];
+       edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
 
-       if (!pent->fields.mask) {
-               injected = ioapic_deliver(ioapic, idx, line_status);
-               if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
-                       pent->fields.remote_irr = 1;
+       if (!irq_level) {
+               ioapic->irr &= ~mask;
+               ret = 1;
+               goto out;
+       }
+
+       /*
+        * Return 0 for coalesced interrupts; for edge-triggered interrupts,
+        * this only happens if a previous edge has not been delivered due
+        * to masking.  For level interrupts, the remote_irr field tells
+        * us if the interrupt is waiting for an EOI.
+        *
+        * RTC is special: it is edge-triggered, but userspace likes to know
+        * if it has already been acked via EOI because coalesced RTC
+        * interrupts lead to time drift in Windows guests.  So we track
+        * EOI manually for the RTC interrupt.
+        */
+       if (irq == RTC_GSI && line_status &&
+               rtc_irq_check_coalesced(ioapic)) {
+               ret = 0;
+               goto out;
        }
 
-       return injected;
+       old_irr = ioapic->irr;
+       ioapic->irr |= mask;
+       if ((edge && old_irr == ioapic->irr) ||
+           (!edge && entry.fields.remote_irr)) {
+               ret = 0;
+               goto out;
+       }
+
+       ret = ioapic_service(ioapic, irq, line_status);
+
+out:
+       trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+       return ret;
+}
+
+static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
+{
+       u32 idx;
+
+       rtc_irq_eoi_tracking_reset(ioapic);
+       for_each_set_bit(idx, &irr, IOAPIC_NUM_PINS)
+               ioapic_set_irq(ioapic, idx, 1, true);
+
+       kvm_rtc_eoi_tracking_restore_all(ioapic);
 }
 
+
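
Stripped of the locking and the RTC special case, the coalescing decision made by ioapic_set_irq above reduces to a small pure function: an asserted edge interrupt is coalesced when its IRR bit was already set, and an asserted level interrupt is coalesced while remote_irr shows the previous one has not been EOI'd. A standalone sketch with a few checks:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* true means "report coalesced" (the kernel code returns 0 in that case) */
static bool ioapic_irq_coalesced(bool edge, uint32_t irr, unsigned irq,
                                 bool remote_irr)
{
        uint32_t old_irr = irr;
        uint32_t new_irr = irr | (1u << irq);

        if (edge)
                return old_irr == new_irr;      /* edge swallowed by pending IRR bit */
        return remote_irr;                      /* level still awaiting EOI */
}

int main(void)
{
        /* Edge: first assertion delivered, second one coalesced. */
        assert(!ioapic_irq_coalesced(true, 0x0, 3, false));
        assert(ioapic_irq_coalesced(true, 1u << 3, 3, false));

        /* Level: coalesced only while remote_irr is still set. */
        assert(ioapic_irq_coalesced(false, 0x0, 3, true));
        assert(!ioapic_irq_coalesced(false, 0x0, 3, false));
        return 0;
}
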
 static void update_handled_vectors(struct kvm_ioapic *ioapic)
 {
        DECLARE_BITMAP(handled_vectors, 256);
@@ -282,12 +326,15 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
        }
 }
 
-static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status)
+static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
 {
        union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
        struct kvm_lapic_irq irqe;
        int ret;
 
+       if (entry->fields.mask)
+               return -1;
+
        ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
                     "vector=%x trig_mode=%x\n",
                     entry->fields.dest_id, entry->fields.dest_mode,
@@ -302,6 +349,9 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status)
        irqe.level = 1;
        irqe.shorthand = 0;
 
+       if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
+               ioapic->irr &= ~(1 << irq);
+
        if (irq == RTC_GSI && line_status) {
                BUG_ON(ioapic->rtc_status.pending_eoi != 0);
                ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
@@ -310,45 +360,24 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status)
        } else
                ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL);
 
+       if (ret && irqe.trig_mode == IOAPIC_LEVEL_TRIG)
+               entry->fields.remote_irr = 1;
+
        return ret;
 }
 
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
                       int level, bool line_status)
 {
-       u32 old_irr;
-       u32 mask = 1 << irq;
-       union kvm_ioapic_redirect_entry entry;
        int ret, irq_level;
 
        BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
 
        spin_lock(&ioapic->lock);
-       old_irr = ioapic->irr;
        irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
                                         irq_source_id, level);
-       entry = ioapic->redirtbl[irq];
-       irq_level ^= entry.fields.polarity;
-       if (!irq_level) {
-               ioapic->irr &= ~mask;
-               ret = 1;
-       } else {
-               int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+       ret = ioapic_set_irq(ioapic, irq, irq_level, line_status);
 
-               if (irq == RTC_GSI && line_status &&
-                       rtc_irq_check_coalesced(ioapic)) {
-                       ret = 0; /* coalesced */
-                       goto out;
-               }
-               ioapic->irr |= mask;
-               if ((edge && old_irr != ioapic->irr) ||
-                   (!edge && !entry.fields.remote_irr))
-                       ret = ioapic_service(ioapic, irq, line_status);
-               else
-                       ret = 0; /* report coalesced interrupt */
-       }
-out:
-       trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
        spin_unlock(&ioapic->lock);
 
        return ret;
@@ -394,7 +423,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
 
                ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
                ent->fields.remote_irr = 0;
-               if (!ent->fields.mask && (ioapic->irr & (1 << i)))
+               if (ioapic->irr & (1 << i))
                        ioapic_service(ioapic, i, false);
        }
 }
@@ -595,9 +624,10 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
 
        spin_lock(&ioapic->lock);
        memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
+       ioapic->irr = 0;
        update_handled_vectors(ioapic);
        kvm_vcpu_request_scan_ioapic(kvm);
-       kvm_rtc_eoi_tracking_restore_all(ioapic);
+       kvm_ioapic_inject_all(ioapic, state->irr);
        spin_unlock(&ioapic->lock);
        return 0;
 }
index b5ec7fb..56baae8 100644 (file)
@@ -186,12 +186,9 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
-       long dirty_count = kvm->tlbs_dirty;
-
-       smp_mb();
        if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
                ++kvm->stat.remote_tlb_flush;
-       cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
+       kvm->tlbs_dirty = false;
 }
 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
 
@@ -1804,7 +1801,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
                                continue;
                        if (vcpu == me)
                                continue;
-                       if (waitqueue_active(&vcpu->wq))
+                       if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
                                continue;
                        if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
                                continue;
@@ -2283,6 +2280,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
        case KVM_DEV_TYPE_ARM_VGIC_V2:
                ops = &kvm_arm_vgic_v2_ops;
                break;
+#endif
+#ifdef CONFIG_S390
+       case KVM_DEV_TYPE_FLIC:
+               ops = &kvm_flic_ops;
+               break;
 #endif
        default:
                return -ENODEV;