Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 10 Sep 2015 23:42:49 +0000 (16:42 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 10 Sep 2015 23:42:49 +0000 (16:42 -0700)
Pull more kvm updates from Paolo Bonzini:
 "ARM:
   - Full debug support for arm64
   - Active state switching for timer interrupts
   - Lazy FP/SIMD save/restore for arm64
   - Generic ARMv8 target

  PPC:
   - Book3S: A few bug fixes
   - Book3S: Allow micro-threading on POWER8

  x86:
   - Compiler warnings

  Generic:
   - Adaptive polling for guest halt"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (49 commits)
  kvm: irqchip: fix memory leak
  kvm: move new trace event outside #ifdef CONFIG_KVM_ASYNC_PF
  KVM: trace kvm_halt_poll_ns grow/shrink
  KVM: dynamic halt-polling
  KVM: make halt_poll_ns per-vCPU
  Silence compiler warning in arch/x86/kvm/emulate.c
  kvm: compile process_smi_save_seg_64() only for x86_64
  KVM: x86: avoid uninitialized variable warning
  KVM: PPC: Book3S: Fix typo in top comment about locking
  KVM: PPC: Book3S: Fix size of the PSPB register
  KVM: PPC: Book3S HV: Exit on H_DOORBELL if HOST_IPI is set
  KVM: PPC: Book3S HV: Fix race in starting secondary threads
  KVM: PPC: Book3S: correct width in XER handling
  KVM: PPC: Book3S HV: Fix preempted vcore stolen time calculation
  KVM: PPC: Book3S HV: Fix preempted vcore list locking
  KVM: PPC: Book3S HV: Implement H_CLEAR_REF and H_CLEAR_MOD
  KVM: PPC: Book3S HV: Fix bug in dirty page tracking
  KVM: PPC: Book3S HV: Fix race in reading change bit when removing HPTE
  KVM: PPC: Book3S HV: Implement dynamic micro-threading on POWER8
  KVM: PPC: Book3S HV: Make use of unused threads when running guests
  ...

62 files changed:
Documentation/virtual/kvm/api.txt
arch/arm/include/asm/kvm_host.h
arch/arm/kvm/arm.c
arch/arm/kvm/guest.c
arch/arm/kvm/interrupts.S
arch/arm/kvm/reset.c
arch/arm64/include/asm/hw_breakpoint.h
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_asm.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/uapi/asm/kvm.h
arch/arm64/kernel/asm-offsets.c
arch/arm64/kernel/hw_breakpoint.c
arch/arm64/kvm/Makefile
arch/arm64/kvm/debug.c [new file with mode: 0644]
arch/arm64/kvm/guest.c
arch/arm64/kvm/handle_exit.c
arch/arm64/kvm/hyp.S
arch/arm64/kvm/reset.c
arch/arm64/kvm/sys_regs.c
arch/arm64/kvm/sys_regs.h
arch/arm64/kvm/sys_regs_generic_v8.c
arch/arm64/kvm/trace.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_book3s_asm.h
arch/powerpc/include/asm/kvm_booke.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/ppc-opcode.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_32_mmu_host.c
arch/powerpc/kvm/book3s_64_mmu_host.c
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_emulate.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_builtin.c
arch/powerpc/kvm/book3s_hv_rm_mmu.c
arch/powerpc/kvm/book3s_hv_rm_xics.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_paired_singles.c
arch/powerpc/kvm/book3s_segment.S
arch/powerpc/kvm/book3s_xics.c
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/e500_mmu.c
arch/powerpc/kvm/powerpc.c
arch/x86/kvm/emulate.c
arch/x86/kvm/mmu.c
arch/x86/kvm/x86.c
include/kvm/arm_arch_timer.h
include/kvm/arm_vgic.h
include/linux/irqchip/arm-gic-v3.h
include/linux/irqchip/arm-gic.h
include/linux/kvm_host.h
include/trace/events/kvm.h
include/uapi/linux/kvm.h
virt/kvm/arm/arch_timer.c
virt/kvm/arm/vgic-v2.c
virt/kvm/arm/vgic-v3.c
virt/kvm/arm/vgic.c
virt/kvm/irqchip.c
virt/kvm/kvm_main.c

index a4ebcb7..d9eccee 100644 (file)
@@ -2671,7 +2671,7 @@ handled.
 4.87 KVM_SET_GUEST_DEBUG
 
 Capability: KVM_CAP_SET_GUEST_DEBUG
-Architectures: x86, s390, ppc
+Architectures: x86, s390, ppc, arm64
 Type: vcpu ioctl
 Parameters: struct kvm_guest_debug (in)
 Returns: 0 on success; -1 on error
@@ -2693,8 +2693,8 @@ when running. Common control bits are:
 The top 16 bits of the control field are architecture specific control
 flags which can include the following:
 
-  - KVM_GUESTDBG_USE_SW_BP:     using software breakpoints [x86]
-  - KVM_GUESTDBG_USE_HW_BP:     using hardware breakpoints [x86, s390]
+  - KVM_GUESTDBG_USE_SW_BP:     using software breakpoints [x86, arm64]
+  - KVM_GUESTDBG_USE_HW_BP:     using hardware breakpoints [x86, s390, arm64]
   - KVM_GUESTDBG_INJECT_DB:     inject DB type exception [x86]
   - KVM_GUESTDBG_INJECT_BP:     inject BP type exception [x86]
   - KVM_GUESTDBG_EXIT_PENDING:  trigger an immediate guest exit [s390]
@@ -2709,6 +2709,11 @@ updated to the correct (supplied) values.
 The second part of the structure is architecture specific and
 typically contains a set of debug registers.
 
+For arm64 the number of debug registers is implementation defined and
+can be determined by querying the KVM_CAP_GUEST_DEBUG_HW_BPS and
+KVM_CAP_GUEST_DEBUG_HW_WPS capabilities which return a positive number
+indicating the number of supported registers.
+
 When debug events exit the main run loop with the reason
 KVM_EXIT_DEBUG, the kvm_debug_exit_arch part of the kvm_run
 structure contains architecture specific debug information.
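
As a minimal sketch of the userspace side of the interface documented above, the fragment below queries the new KVM_CAP_GUEST_DEBUG_HW_BPS / KVM_CAP_GUEST_DEBUG_HW_WPS capabilities and then enables hardware-assisted debug with KVM_SET_GUEST_DEBUG. The vm_fd/vcpu_fd descriptors, the enable_hw_debug() helper and the register encodings passed in are illustrative assumptions, not part of this series.

/*
 * Hedged sketch, not from this merge: assumes an already created VM and
 * vcpu on arm64 (KVM_GUESTDBG_USE_HW is the arm64-specific flag added
 * by this series).
 */
#include <linux/kvm.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>

static int enable_hw_debug(int vm_fd, int vcpu_fd, __u64 bvr0, __u64 bcr0)
{
	struct kvm_guest_debug dbg;
	int nbps, nwps;

	/* Both capabilities return a positive count of implemented registers. */
	nbps = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_GUEST_DEBUG_HW_BPS);
	nwps = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_GUEST_DEBUG_HW_WPS);
	if (nbps <= 0 || nwps <= 0)
		return -1;		/* no hardware-assisted guest debug */

	printf("guest debug: %d breakpoints, %d watchpoints\n", nbps, nwps);

	memset(&dbg, 0, sizeof(dbg));
	dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW;
	dbg.arch.dbg_bvr[0] = bvr0;	/* breakpoint address */
	dbg.arch.dbg_bcr[0] = bcr0;	/* caller-chosen DBGBCR0_EL1 encoding */

	return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
}

The capability values bound how many of the dbg_b*/dbg_w* slots userspace can usefully program; slots beyond that count have no backing hardware registers.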
@@ -3111,11 +3116,13 @@ data_offset describes where the data is located (KVM_EXIT_IO_OUT) or
 where kvm expects application code to place the data for the next
 KVM_RUN invocation (KVM_EXIT_IO_IN).  Data format is a packed array.
 
+               /* KVM_EXIT_DEBUG */
                struct {
                        struct kvm_debug_exit_arch arch;
                } debug;
 
-Unused.
+If the exit_reason is KVM_EXIT_DEBUG, then a vcpu is processing a debug event
+for which architecture specific information is returned.
 
                /* KVM_EXIT_MMIO */
                struct {
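
For the KVM_EXIT_DEBUG exit described above, a run loop might consume the new kvm_debug_exit_arch fields as in the sketch below; the mmap()ed kvm_run pointer and the surrounding loop are assumed to exist already, and run_once() is a hypothetical helper.

/* Hedged sketch: react to a debug exit after KVM_RUN returns. */
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

static int run_once(int vcpu_fd, struct kvm_run *run)
{
	if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
		return -1;

	if (run->exit_reason == KVM_EXIT_DEBUG) {
		/*
		 * hsr carries the exception syndrome; far is only valid
		 * for watchpoint hits (see struct kvm_debug_exit_arch).
		 */
		printf("debug exit: hsr=%#x far=%#llx\n",
		       run->debug.arch.hsr,
		       (unsigned long long)run->debug.arch.far);
		return 0;
	}

	return 0;	/* other exit reasons handled elsewhere */
}

Whether the exit came from a breakpoint, a watchpoint or a single step can be recovered from the exception class encoded in hsr.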
index e896d2c..dcba0fa 100644 (file)
@@ -231,4 +231,9 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 
+static inline void kvm_arm_init_debug(void) {}
+static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {}
+
 #endif /* __ARM_KVM_HOST_H__ */
index bc738d2..ce404a5 100644 (file)
@@ -125,6 +125,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        if (ret)
                goto out_free_stage2_pgd;
 
+       kvm_vgic_early_init(kvm);
        kvm_timer_init(kvm);
 
        /* Mark the initial VMID generation invalid */
@@ -249,6 +250,7 @@ out:
 
 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 {
+       kvm_vgic_vcpu_early_init(vcpu);
 }
 
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
@@ -278,6 +280,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        /* Set up the timer */
        kvm_timer_vcpu_init(vcpu);
 
+       kvm_arm_reset_debug_ptr(vcpu);
+
        return 0;
 }
 
@@ -301,13 +305,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
        kvm_arm_set_running_vcpu(NULL);
 }
 
-int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
-                                       struct kvm_guest_debug *dbg)
-{
-       return -EINVAL;
-}
-
-
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
 {
@@ -528,10 +525,20 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                if (vcpu->arch.pause)
                        vcpu_pause(vcpu);
 
-               kvm_vgic_flush_hwstate(vcpu);
+               /*
+                * Disarming the background timer must be done in a
+                * preemptible context, as this call may sleep.
+                */
                kvm_timer_flush_hwstate(vcpu);
 
+               /*
+                * Preparing the interrupts to be injected also
+                * involves poking the GIC, which must be done in a
+                * non-preemptible context.
+                */
                preempt_disable();
+               kvm_vgic_flush_hwstate(vcpu);
+
                local_irq_disable();
 
                /*
@@ -544,12 +551,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
                if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) {
                        local_irq_enable();
+                       kvm_vgic_sync_hwstate(vcpu);
                        preempt_enable();
                        kvm_timer_sync_hwstate(vcpu);
-                       kvm_vgic_sync_hwstate(vcpu);
                        continue;
                }
 
+               kvm_arm_setup_debug(vcpu);
+
                /**************************************************************
                 * Enter the guest
                 */
@@ -564,6 +573,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                 * Back from guest
                 *************************************************************/
 
+               kvm_arm_clear_debug(vcpu);
+
                /*
                 * We may have taken a host interrupt in HYP mode (ie
                 * while executing the guest). This interrupt is still
@@ -586,11 +597,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                 */
                kvm_guest_exit();
                trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
-               preempt_enable();
 
+               kvm_vgic_sync_hwstate(vcpu);
+
+               preempt_enable();
 
                kvm_timer_sync_hwstate(vcpu);
-               kvm_vgic_sync_hwstate(vcpu);
 
                ret = handle_exit(vcpu, run, ret);
        }
@@ -921,6 +933,8 @@ static void cpu_init_hyp_mode(void *dummy)
        vector_ptr = (unsigned long)__kvm_hyp_vector;
 
        __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+
+       kvm_arm_init_debug();
 }
 
 static int hyp_init_cpu_notify(struct notifier_block *self,
index d503fbb..96e935b 100644 (file)
@@ -290,3 +290,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 {
        return -EINVAL;
 }
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+                                       struct kvm_guest_debug *dbg)
+{
+       return -EINVAL;
+}
index 568494d..900ef6d 100644 (file)
@@ -361,10 +361,6 @@ hyp_hvc:
        @ Check syndrome register
        mrc     p15, 4, r1, c5, c2, 0   @ HSR
        lsr     r0, r1, #HSR_EC_SHIFT
-#ifdef CONFIG_VFPv3
-       cmp     r0, #HSR_EC_CP_0_13
-       beq     switch_to_guest_vfp
-#endif
        cmp     r0, #HSR_EC_HVC
        bne     guest_trap              @ Not HVC instr.
 
@@ -378,7 +374,10 @@ hyp_hvc:
        cmp     r2, #0
        bne     guest_trap              @ Guest called HVC
 
-host_switch_to_hyp:
+       /*
+        * Getting here means the host called HVC, so we shift parameters
+        * and branch to the Hyp function.
+        */
        pop     {r0, r1, r2}
 
        /* Check for __hyp_get_vectors */
@@ -409,6 +408,10 @@ guest_trap:
 
        @ Check if we need the fault information
        lsr     r1, r1, #HSR_EC_SHIFT
+#ifdef CONFIG_VFPv3
+       cmp     r1, #HSR_EC_CP_0_13
+       beq     switch_to_guest_vfp
+#endif
        cmp     r1, #HSR_EC_IABT
        mrceq   p15, 4, r2, c6, c0, 2   @ HIFAR
        beq     2f
@@ -477,7 +480,6 @@ guest_trap:
  */
 #ifdef CONFIG_VFPv3
 switch_to_guest_vfp:
-       load_vcpu                       @ Load VCPU pointer to r0
        push    {r3-r7}
 
        @ NEON/VFP used.  Turn on VFP access.
index f558c07..eeb8585 100644 (file)
@@ -77,7 +77,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
        kvm_reset_coprocs(vcpu);
 
        /* Reset arch_timer context */
-       kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
-
-       return 0;
+       return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
 }
index 52b484b..4c47cb2 100644 (file)
@@ -16,6 +16,8 @@
 #ifndef __ASM_HW_BREAKPOINT_H
 #define __ASM_HW_BREAKPOINT_H
 
+#include <asm/cputype.h>
+
 #ifdef __KERNEL__
 
 struct arch_hw_breakpoint_ctrl {
@@ -132,5 +134,17 @@ static inline void ptrace_hw_copy_thread(struct task_struct *task)
 
 extern struct pmu perf_ops_bp;
 
+/* Determine number of BRP registers available. */
+static inline int get_num_brps(void)
+{
+       return ((read_cpuid(ID_AA64DFR0_EL1) >> 12) & 0xf) + 1;
+}
+
+/* Determine number of WRP registers available. */
+static inline int get_num_wrps(void)
+{
+       return ((read_cpuid(ID_AA64DFR0_EL1) >> 20) & 0xf) + 1;
+}
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_BREAKPOINT_H */
index ac6fafb..7605e09 100644 (file)
 #define HSTR_EL2_TTEE  (1 << 16)
 #define HSTR_EL2_T(x)  (1 << x)
 
+/* Hyp Coprocessor Trap Register Shifts */
+#define CPTR_EL2_TFP_SHIFT 10
+
 /* Hyp Coprocessor Trap Register */
 #define CPTR_EL2_TCPAC (1 << 31)
 #define CPTR_EL2_TTA   (1 << 20)
-#define CPTR_EL2_TFP   (1 << 10)
+#define CPTR_EL2_TFP   (1 << CPTR_EL2_TFP_SHIFT)
 
 /* Hyp Debug Configuration Register bits */
 #define MDCR_EL2_TDRA          (1 << 11)
index 3c5fe68..67fa0de 100644 (file)
 #define        CNTKCTL_EL1     20      /* Timer Control Register (EL1) */
 #define        PAR_EL1         21      /* Physical Address Register */
 #define MDSCR_EL1      22      /* Monitor Debug System Control Register */
-#define DBGBCR0_EL1    23      /* Debug Breakpoint Control Registers (0-15) */
-#define DBGBCR15_EL1   38
-#define DBGBVR0_EL1    39      /* Debug Breakpoint Value Registers (0-15) */
-#define DBGBVR15_EL1   54
-#define DBGWCR0_EL1    55      /* Debug Watchpoint Control Registers (0-15) */
-#define DBGWCR15_EL1   70
-#define DBGWVR0_EL1    71      /* Debug Watchpoint Value Registers (0-15) */
-#define DBGWVR15_EL1   86
-#define MDCCINT_EL1    87      /* Monitor Debug Comms Channel Interrupt Enable Reg */
+#define MDCCINT_EL1    23      /* Monitor Debug Comms Channel Interrupt Enable Reg */
 
 /* 32bit specific registers. Keep them at the end of the range */
-#define        DACR32_EL2      88      /* Domain Access Control Register */
-#define        IFSR32_EL2      89      /* Instruction Fault Status Register */
-#define        FPEXC32_EL2     90      /* Floating-Point Exception Control Register */
-#define        DBGVCR32_EL2    91      /* Debug Vector Catch Register */
-#define        TEECR32_EL1     92      /* ThumbEE Configuration Register */
-#define        TEEHBR32_EL1    93      /* ThumbEE Handler Base Register */
-#define        NR_SYS_REGS     94
+#define        DACR32_EL2      24      /* Domain Access Control Register */
+#define        IFSR32_EL2      25      /* Instruction Fault Status Register */
+#define        FPEXC32_EL2     26      /* Floating-Point Exception Control Register */
+#define        DBGVCR32_EL2    27      /* Debug Vector Catch Register */
+#define        TEECR32_EL1     28      /* ThumbEE Configuration Register */
+#define        TEEHBR32_EL1    29      /* ThumbEE Handler Base Register */
+#define        NR_SYS_REGS     30
 
 /* 32bit mapping */
 #define c0_MPIDR       (MPIDR_EL1 * 2) /* MultiProcessor ID Register */
@@ -132,6 +124,8 @@ extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
 extern u64 __vgic_v3_get_ich_vtr_el2(void);
 
+extern u32 __kvm_get_mdcr_el2(void);
+
 #endif
 
 #endif /* __ARM_KVM_ASM_H__ */
index 2709db2..415938d 100644 (file)
@@ -103,15 +103,34 @@ struct kvm_vcpu_arch {
 
        /* HYP configuration */
        u64 hcr_el2;
+       u32 mdcr_el2;
 
        /* Exception Information */
        struct kvm_vcpu_fault_info fault;
 
-       /* Debug state */
+       /* Guest debug state */
        u64 debug_flags;
 
+       /*
+        * We maintain more than a single set of debug registers to support
+        * debugging the guest from the host and to maintain separate host and
+        * guest state during world switches. vcpu_debug_state are the debug
+        * registers of the vcpu as the guest sees them.  host_debug_state are
+        * the host registers which are saved and restored during
+        * world switches. external_debug_state contains the debug
+        * values we want to debug the guest. This is set via the
+        * KVM_SET_GUEST_DEBUG ioctl.
+        *
+        * debug_ptr points to the set of debug registers that should be loaded
+        * onto the hardware when running the guest.
+        */
+       struct kvm_guest_debug_arch *debug_ptr;
+       struct kvm_guest_debug_arch vcpu_debug_state;
+       struct kvm_guest_debug_arch external_debug_state;
+
        /* Pointer to host CPU context */
        kvm_cpu_context_t *host_cpu_context;
+       struct kvm_guest_debug_arch host_debug_state;
 
        /* VGIC state */
        struct vgic_cpu vgic_cpu;
@@ -122,6 +141,17 @@ struct kvm_vcpu_arch {
         * here.
         */
 
+       /*
+        * Guest registers we preserve during guest debugging.
+        *
+        * These shadow registers are updated by the kvm_handle_sys_reg
+        * trap handler if the guest accesses or updates them while we
+        * are using guest debug.
+        */
+       struct {
+               u32     mdscr_el1;
+       } guest_debug_preserved;
+
        /* Don't run the guest */
        bool pause;
 
@@ -216,15 +246,15 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
                     hyp_stack_ptr, vector_ptr);
 }
 
-struct vgic_sr_vectors {
-       void    *save_vgic;
-       void    *restore_vgic;
-};
-
 static inline void kvm_arch_hardware_disable(void) {}
 static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 
+void kvm_arm_init_debug(void);
+void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
+void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
+void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
+
 #endif /* __ARM64_KVM_HOST_H__ */
index d268320..0cd7b59 100644 (file)
@@ -53,14 +53,20 @@ struct kvm_regs {
        struct user_fpsimd_state fp_regs;
 };
 
-/* Supported Processor Types */
+/*
+ * Supported CPU Targets - Adding a new target type is not recommended,
+ * unless there are some special registers not supported by the
+ * genericv8 sysreg table.
+ */
 #define KVM_ARM_TARGET_AEM_V8          0
 #define KVM_ARM_TARGET_FOUNDATION_V8   1
 #define KVM_ARM_TARGET_CORTEX_A57      2
 #define KVM_ARM_TARGET_XGENE_POTENZA   3
 #define KVM_ARM_TARGET_CORTEX_A53      4
+/* Generic ARM v8 target */
+#define KVM_ARM_TARGET_GENERIC_V8      5
 
-#define KVM_ARM_NUM_TARGETS            5
+#define KVM_ARM_NUM_TARGETS            6
 
 /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */
 #define KVM_ARM_DEVICE_TYPE_SHIFT      0
@@ -100,12 +106,39 @@ struct kvm_sregs {
 struct kvm_fpu {
 };
 
+/*
+ * See v8 ARM ARM D7.3: Debug Registers
+ *
+ * The architectural limit is 16 debug registers of each type although
+ * in practice there are usually fewer (see ID_AA64DFR0_EL1).
+ *
+ * Although the control registers are architecturally defined as 32
+ * bits wide we use a 64 bit structure here to keep parity with
+ * KVM_GET/SET_ONE_REG behaviour which treats all system registers as
+ * 64 bit values. It also allows for the possibility of the
+ * architecture expanding the control registers without having to
+ * change the userspace ABI.
+ */
+#define KVM_ARM_MAX_DBG_REGS 16
 struct kvm_guest_debug_arch {
+       __u64 dbg_bcr[KVM_ARM_MAX_DBG_REGS];
+       __u64 dbg_bvr[KVM_ARM_MAX_DBG_REGS];
+       __u64 dbg_wcr[KVM_ARM_MAX_DBG_REGS];
+       __u64 dbg_wvr[KVM_ARM_MAX_DBG_REGS];
 };
 
 struct kvm_debug_exit_arch {
+       __u32 hsr;
+       __u64 far;      /* used for watchpoints */
 };
 
+/*
+ * Architecture specific defines for kvm_guest_debug->control
+ */
+
+#define KVM_GUESTDBG_USE_SW_BP         (1 << 16)
+#define KVM_GUESTDBG_USE_HW            (1 << 17)
+
 struct kvm_sync_regs {
 };
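
The kvm_guest_debug_arch layout above leaves the actual register encodings to userspace. As a non-authoritative sketch, assuming the architectural DBGBCR_EL1 fields (E in bit 0, PMC in bits [2:1], BAS in bits [8:5]), a single A64 instruction breakpoint could be encoded roughly as follows before being handed to KVM_SET_GUEST_DEBUG:

/* Hedged sketch: derive DBGBVR0/DBGBCR0 values for one AArch64 breakpoint. */
#include <linux/types.h>

#define DBGBCR_E	(1ULL << 0)	/* breakpoint enable */
#define DBGBCR_PMC_ANY	(3ULL << 1)	/* match at EL1 and EL0 */
#define DBGBCR_BAS_A64	(0xfULL << 5)	/* all four instruction bytes */

static void encode_breakpoint(__u64 addr, __u64 *bvr, __u64 *bcr)
{
	*bvr = addr & ~(__u64)0x3;	/* breakpoint address, word aligned */
	*bcr = DBGBCR_E | DBGBCR_PMC_ANY | DBGBCR_BAS_A64;
}

Keeping every value __u64, as the comment above explains, costs nothing and leaves room for the architecture to grow the control registers without an ABI change.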
 
index c99701a..8d89cf8 100644 (file)
@@ -116,17 +116,22 @@ int main(void)
   DEFINE(VCPU_FAR_EL2,         offsetof(struct kvm_vcpu, arch.fault.far_el2));
   DEFINE(VCPU_HPFAR_EL2,       offsetof(struct kvm_vcpu, arch.fault.hpfar_el2));
   DEFINE(VCPU_DEBUG_FLAGS,     offsetof(struct kvm_vcpu, arch.debug_flags));
+  DEFINE(VCPU_DEBUG_PTR,       offsetof(struct kvm_vcpu, arch.debug_ptr));
+  DEFINE(DEBUG_BCR,            offsetof(struct kvm_guest_debug_arch, dbg_bcr));
+  DEFINE(DEBUG_BVR,            offsetof(struct kvm_guest_debug_arch, dbg_bvr));
+  DEFINE(DEBUG_WCR,            offsetof(struct kvm_guest_debug_arch, dbg_wcr));
+  DEFINE(DEBUG_WVR,            offsetof(struct kvm_guest_debug_arch, dbg_wvr));
   DEFINE(VCPU_HCR_EL2,         offsetof(struct kvm_vcpu, arch.hcr_el2));
+  DEFINE(VCPU_MDCR_EL2,        offsetof(struct kvm_vcpu, arch.mdcr_el2));
   DEFINE(VCPU_IRQ_LINES,       offsetof(struct kvm_vcpu, arch.irq_lines));
   DEFINE(VCPU_HOST_CONTEXT,    offsetof(struct kvm_vcpu, arch.host_cpu_context));
+  DEFINE(VCPU_HOST_DEBUG_STATE, offsetof(struct kvm_vcpu, arch.host_debug_state));
   DEFINE(VCPU_TIMER_CNTV_CTL,  offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_ctl));
   DEFINE(VCPU_TIMER_CNTV_CVAL, offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_cval));
   DEFINE(KVM_TIMER_CNTVOFF,    offsetof(struct kvm, arch.timer.cntvoff));
   DEFINE(KVM_TIMER_ENABLED,    offsetof(struct kvm, arch.timer.enabled));
   DEFINE(VCPU_KVM,             offsetof(struct kvm_vcpu, kvm));
   DEFINE(VCPU_VGIC_CPU,                offsetof(struct kvm_vcpu, arch.vgic_cpu));
-  DEFINE(VGIC_SAVE_FN,         offsetof(struct vgic_sr_vectors, save_vgic));
-  DEFINE(VGIC_RESTORE_FN,      offsetof(struct vgic_sr_vectors, restore_vgic));
   DEFINE(VGIC_V2_CPU_HCR,      offsetof(struct vgic_cpu, vgic_v2.vgic_hcr));
   DEFINE(VGIC_V2_CPU_VMCR,     offsetof(struct vgic_cpu, vgic_v2.vgic_vmcr));
   DEFINE(VGIC_V2_CPU_MISR,     offsetof(struct vgic_cpu, vgic_v2.vgic_misr));
index 003bc3d..c97040e 100644 (file)
@@ -48,18 +48,6 @@ static DEFINE_PER_CPU(int, stepping_kernel_bp);
 static int core_num_brps;
 static int core_num_wrps;
 
-/* Determine number of BRP registers available. */
-static int get_num_brps(void)
-{
-       return ((read_cpuid(ID_AA64DFR0_EL1) >> 12) & 0xf) + 1;
-}
-
-/* Determine number of WRP registers available. */
-static int get_num_wrps(void)
-{
-       return ((read_cpuid(ID_AA64DFR0_EL1) >> 20) & 0xf) + 1;
-}
-
 int hw_breakpoint_slots(int type)
 {
        /*
index f90f4aa..1949fe5 100644 (file)
@@ -17,7 +17,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
 
 kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
 kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
-kvm-$(CONFIG_KVM_ARM_HOST) += guest.o reset.o sys_regs.o sys_regs_generic_v8.o
+kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
 
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
new file mode 100644 (file)
index 0000000..47e5f0f
--- /dev/null
@@ -0,0 +1,217 @@
+/*
+ * Debug and Guest Debug support
+ *
+ * Copyright (C) 2015 - Linaro Ltd
+ * Author: Alex Bennée <alex.bennee@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/hw_breakpoint.h>
+
+#include <asm/debug-monitors.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_emulate.h>
+
+#include "trace.h"
+
+/* These are the bits of MDSCR_EL1 we may manipulate */
+#define MDSCR_EL1_DEBUG_MASK   (DBG_MDSCR_SS | \
+                               DBG_MDSCR_KDE | \
+                               DBG_MDSCR_MDE)
+
+static DEFINE_PER_CPU(u32, mdcr_el2);
+
+/**
+ * save/restore_guest_debug_regs
+ *
+ * For some debug operations we need to tweak some guest registers. As
+ * a result we need to save the state of those registers before we
+ * make those modifications.
+ *
+ * Guest access to MDSCR_EL1 is trapped by the hypervisor and handled
+ * after we have restored the preserved value to the main context.
+ */
+static void save_guest_debug_regs(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.guest_debug_preserved.mdscr_el1 = vcpu_sys_reg(vcpu, MDSCR_EL1);
+
+       trace_kvm_arm_set_dreg32("Saved MDSCR_EL1",
+                               vcpu->arch.guest_debug_preserved.mdscr_el1);
+}
+
+static void restore_guest_debug_regs(struct kvm_vcpu *vcpu)
+{
+       vcpu_sys_reg(vcpu, MDSCR_EL1) = vcpu->arch.guest_debug_preserved.mdscr_el1;
+
+       trace_kvm_arm_set_dreg32("Restored MDSCR_EL1",
+                               vcpu_sys_reg(vcpu, MDSCR_EL1));
+}
+
+/**
+ * kvm_arm_init_debug - grab what we need for debug
+ *
+ * Currently the sole task of this function is to retrieve the initial
+ * value of mdcr_el2 so we can preserve MDCR_EL2.HPMN which has
+ * presumably been set-up by some knowledgeable bootcode.
+ *
+ * It is called once per-cpu during CPU hyp initialisation.
+ */
+
+void kvm_arm_init_debug(void)
+{
+       __this_cpu_write(mdcr_el2, kvm_call_hyp(__kvm_get_mdcr_el2));
+}
+
+/**
+ * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state
+ */
+
+void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.debug_ptr = &vcpu->arch.vcpu_debug_state;
+}
+
+/**
+ * kvm_arm_setup_debug - set up debug related stuff
+ *
+ * @vcpu:      the vcpu pointer
+ *
+ * This is called before each entry into the hypervisor to setup any
+ * debug related registers. Currently this just ensures we will trap
+ * access to:
+ *  - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR)
+ *  - Debug ROM Address (MDCR_EL2_TDRA)
+ *  - OS related registers (MDCR_EL2_TDOSA)
+ *
+ * Additionally, KVM only traps guest accesses to the debug registers if
+ * the guest is not actively using them (see the KVM_ARM64_DEBUG_DIRTY
+ * flag on vcpu->arch.debug_flags).  Since the guest must not interfere
+ * with the hardware state when debugging the guest, we must ensure that
+ * trapping is enabled whenever we are debugging the guest using the
+ * debug registers.
+ */
+
+void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
+{
+       bool trap_debug = !(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY);
+
+       trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug);
+
+       vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK;
+       vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
+                               MDCR_EL2_TPMCR |
+                               MDCR_EL2_TDRA |
+                               MDCR_EL2_TDOSA);
+
+       /* Is Guest debugging in effect? */
+       if (vcpu->guest_debug) {
+               /* Route all software debug exceptions to EL2 */
+               vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE;
+
+               /* Save guest debug state */
+               save_guest_debug_regs(vcpu);
+
+               /*
+                * Single Step (ARM ARM D2.12.3 The software step state
+                * machine)
+                *
+                * If we are doing Single Step we need to manipulate
+                * the guest's MDSCR_EL1.SS and PSTATE.SS. Once the
+                * step has occurred the hypervisor will trap the
+                * debug exception and we return to userspace.
+                *
+                * If the guest attempts to single step its userspace
+                * we would have to deal with a trapped exception
+                * while in the guest kernel. Because this would be
+                * hard to unwind we suppress the guest's ability to
+                * do so by masking MDSCR_EL1.SS.
+                *
+                * This confuses guest debuggers which use
+                * single-step behind the scenes but everything
+                * returns to normal once the host is no longer
+                * debugging the system.
+                */
+               if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+                       *vcpu_cpsr(vcpu) |=  DBG_SPSR_SS;
+                       vcpu_sys_reg(vcpu, MDSCR_EL1) |= DBG_MDSCR_SS;
+               } else {
+                       vcpu_sys_reg(vcpu, MDSCR_EL1) &= ~DBG_MDSCR_SS;
+               }
+
+               trace_kvm_arm_set_dreg32("SPSR_EL2", *vcpu_cpsr(vcpu));
+
+               /*
+                * HW Breakpoints and watchpoints
+                *
+                * We simply switch the debug_ptr to point to our new
+                * external_debug_state which has been populated by the
+                * debug ioctl. The existing KVM_ARM64_DEBUG_DIRTY
+                * mechanism ensures the registers are updated on the
+                * world switch.
+                */
+               if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
+                       /* Enable breakpoints/watchpoints */
+                       vcpu_sys_reg(vcpu, MDSCR_EL1) |= DBG_MDSCR_MDE;
+
+                       vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state;
+                       vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
+                       trap_debug = true;
+
+                       trace_kvm_arm_set_regset("BKPTS", get_num_brps(),
+                                               &vcpu->arch.debug_ptr->dbg_bcr[0],
+                                               &vcpu->arch.debug_ptr->dbg_bvr[0]);
+
+                       trace_kvm_arm_set_regset("WAPTS", get_num_wrps(),
+                                               &vcpu->arch.debug_ptr->dbg_wcr[0],
+                                               &vcpu->arch.debug_ptr->dbg_wvr[0]);
+               }
+       }
+
+       BUG_ON(!vcpu->guest_debug &&
+               vcpu->arch.debug_ptr != &vcpu->arch.vcpu_debug_state);
+
+       /* Trap debug register access */
+       if (trap_debug)
+               vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
+
+       trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2);
+       trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_sys_reg(vcpu, MDSCR_EL1));
+}
+
+void kvm_arm_clear_debug(struct kvm_vcpu *vcpu)
+{
+       trace_kvm_arm_clear_debug(vcpu->guest_debug);
+
+       if (vcpu->guest_debug) {
+               restore_guest_debug_regs(vcpu);
+
+               /*
+                * If we were using HW debug we need to restore the
+                * debug_ptr to the guest debug state.
+                */
+               if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
+                       kvm_arm_reset_debug_ptr(vcpu);
+
+                       trace_kvm_arm_set_regset("BKPTS", get_num_brps(),
+                                               &vcpu->arch.debug_ptr->dbg_bcr[0],
+                                               &vcpu->arch.debug_ptr->dbg_bvr[0]);
+
+                       trace_kvm_arm_set_regset("WAPTS", get_num_wrps(),
+                                               &vcpu->arch.debug_ptr->dbg_wcr[0],
+                                               &vcpu->arch.debug_ptr->dbg_wvr[0]);
+               }
+       }
+}
index 9535bd5..d250160 100644 (file)
@@ -32,6 +32,8 @@
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_coproc.h>
 
+#include "trace.h"
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
        { NULL }
 };
@@ -293,7 +295,8 @@ int __attribute_const__ kvm_target_cpu(void)
                break;
        };
 
-       return -EINVAL;
+       /* Return a default generic target */
+       return KVM_ARM_TARGET_GENERIC_V8;
 }
 
 int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init)
@@ -331,3 +334,41 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 {
        return -EINVAL;
 }
+
+#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE |    \
+                           KVM_GUESTDBG_USE_SW_BP | \
+                           KVM_GUESTDBG_USE_HW | \
+                           KVM_GUESTDBG_SINGLESTEP)
+
+/**
+ * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging
+ * @vcpu:      the vcpu pointer
+ * @dbg:       the ioctl data buffer
+ *
+ * This sets up and enables the VM for guest debugging. Userspace
+ * passes in a control flag to enable different debug types and
+ * potentially other architecture specific information in the rest of
+ * the structure.
+ */
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+                                       struct kvm_guest_debug *dbg)
+{
+       trace_kvm_set_guest_debug(vcpu, dbg->control);
+
+       if (dbg->control & ~KVM_GUESTDBG_VALID_MASK)
+               return -EINVAL;
+
+       if (dbg->control & KVM_GUESTDBG_ENABLE) {
+               vcpu->guest_debug = dbg->control;
+
+               /* Hardware assisted Break and Watch points */
+               if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
+                       vcpu->arch.external_debug_state = dbg->arch;
+               }
+
+       } else {
+               /* If not enabled clear all flags */
+               vcpu->guest_debug = 0;
+       }
+       return 0;
+}
index 524fa25..68a0759 100644 (file)
@@ -82,6 +82,45 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
        return 1;
 }
 
+/**
+ * kvm_handle_guest_debug - handle a debug exception instruction
+ *
+ * @vcpu:      the vcpu pointer
+ * @run:       access to the kvm_run structure for results
+ *
+ * We route all debug exceptions through the same handler. If both the
+ * guest and host are using the same debug facilities it will be up to
+ * userspace to re-inject the correct exception for guest delivery.
+ *
+ * @return: 0 (while setting run->exit_reason), -1 for error
+ */
+static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+       u32 hsr = kvm_vcpu_get_hsr(vcpu);
+       int ret = 0;
+
+       run->exit_reason = KVM_EXIT_DEBUG;
+       run->debug.arch.hsr = hsr;
+
+       switch (hsr >> ESR_ELx_EC_SHIFT) {
+       case ESR_ELx_EC_WATCHPT_LOW:
+               run->debug.arch.far = vcpu->arch.fault.far_el2;
+               /* fall through */
+       case ESR_ELx_EC_SOFTSTP_LOW:
+       case ESR_ELx_EC_BREAKPT_LOW:
+       case ESR_ELx_EC_BKPT32:
+       case ESR_ELx_EC_BRK64:
+               break;
+       default:
+               kvm_err("%s: un-handled case hsr: %#08x\n",
+                       __func__, (unsigned int) hsr);
+               ret = -1;
+               break;
+       }
+
+       return ret;
+}
+
 static exit_handle_fn arm_exit_handlers[] = {
        [ESR_ELx_EC_WFx]        = kvm_handle_wfx,
        [ESR_ELx_EC_CP15_32]    = kvm_handle_cp15_32,
@@ -96,6 +135,11 @@ static exit_handle_fn arm_exit_handlers[] = {
        [ESR_ELx_EC_SYS64]      = kvm_handle_sys_reg,
        [ESR_ELx_EC_IABT_LOW]   = kvm_handle_guest_abort,
        [ESR_ELx_EC_DABT_LOW]   = kvm_handle_guest_abort,
+       [ESR_ELx_EC_SOFTSTP_LOW]= kvm_handle_guest_debug,
+       [ESR_ELx_EC_WATCHPT_LOW]= kvm_handle_guest_debug,
+       [ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug,
+       [ESR_ELx_EC_BKPT32]     = kvm_handle_guest_debug,
+       [ESR_ELx_EC_BRK64]      = kvm_handle_guest_debug,
 };
 
 static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu)
index 10915aa..37c89ea 100644 (file)
        stp     x24, x25, [x3, #160]
 .endm
 
-.macro save_debug
-       // x2: base address for cpu context
-       // x3: tmp register
-
-       mrs     x26, id_aa64dfr0_el1
-       ubfx    x24, x26, #12, #4       // Extract BRPs
-       ubfx    x25, x26, #20, #4       // Extract WRPs
-       mov     w26, #15
-       sub     w24, w26, w24           // How many BPs to skip
-       sub     w25, w26, w25           // How many WPs to skip
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGBCR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
-1:
-       mrs     x20, dbgbcr15_el1
-       mrs     x19, dbgbcr14_el1
-       mrs     x18, dbgbcr13_el1
-       mrs     x17, dbgbcr12_el1
-       mrs     x16, dbgbcr11_el1
-       mrs     x15, dbgbcr10_el1
-       mrs     x14, dbgbcr9_el1
-       mrs     x13, dbgbcr8_el1
-       mrs     x12, dbgbcr7_el1
-       mrs     x11, dbgbcr6_el1
-       mrs     x10, dbgbcr5_el1
-       mrs     x9, dbgbcr4_el1
-       mrs     x8, dbgbcr3_el1
-       mrs     x7, dbgbcr2_el1
-       mrs     x6, dbgbcr1_el1
-       mrs     x5, dbgbcr0_el1
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
-
-1:
-       str     x20, [x3, #(15 * 8)]
-       str     x19, [x3, #(14 * 8)]
-       str     x18, [x3, #(13 * 8)]
-       str     x17, [x3, #(12 * 8)]
-       str     x16, [x3, #(11 * 8)]
-       str     x15, [x3, #(10 * 8)]
-       str     x14, [x3, #(9 * 8)]
-       str     x13, [x3, #(8 * 8)]
-       str     x12, [x3, #(7 * 8)]
-       str     x11, [x3, #(6 * 8)]
-       str     x10, [x3, #(5 * 8)]
-       str     x9, [x3, #(4 * 8)]
-       str     x8, [x3, #(3 * 8)]
-       str     x7, [x3, #(2 * 8)]
-       str     x6, [x3, #(1 * 8)]
-       str     x5, [x3, #(0 * 8)]
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGBVR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
+.macro save_debug type
+       // x4: pointer to register set
+       // x5: number of registers to skip
+       // x6..x22 trashed
+
+       adr     x22, 1f
+       add     x22, x22, x5, lsl #2
+       br      x22
 1:
-       mrs     x20, dbgbvr15_el1
-       mrs     x19, dbgbvr14_el1
-       mrs     x18, dbgbvr13_el1
-       mrs     x17, dbgbvr12_el1
-       mrs     x16, dbgbvr11_el1
-       mrs     x15, dbgbvr10_el1
-       mrs     x14, dbgbvr9_el1
-       mrs     x13, dbgbvr8_el1
-       mrs     x12, dbgbvr7_el1
-       mrs     x11, dbgbvr6_el1
-       mrs     x10, dbgbvr5_el1
-       mrs     x9, dbgbvr4_el1
-       mrs     x8, dbgbvr3_el1
-       mrs     x7, dbgbvr2_el1
-       mrs     x6, dbgbvr1_el1
-       mrs     x5, dbgbvr0_el1
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
-
-1:
-       str     x20, [x3, #(15 * 8)]
-       str     x19, [x3, #(14 * 8)]
-       str     x18, [x3, #(13 * 8)]
-       str     x17, [x3, #(12 * 8)]
-       str     x16, [x3, #(11 * 8)]
-       str     x15, [x3, #(10 * 8)]
-       str     x14, [x3, #(9 * 8)]
-       str     x13, [x3, #(8 * 8)]
-       str     x12, [x3, #(7 * 8)]
-       str     x11, [x3, #(6 * 8)]
-       str     x10, [x3, #(5 * 8)]
-       str     x9, [x3, #(4 * 8)]
-       str     x8, [x3, #(3 * 8)]
-       str     x7, [x3, #(2 * 8)]
-       str     x6, [x3, #(1 * 8)]
-       str     x5, [x3, #(0 * 8)]
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGWCR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-1:
-       mrs     x20, dbgwcr15_el1
-       mrs     x19, dbgwcr14_el1
-       mrs     x18, dbgwcr13_el1
-       mrs     x17, dbgwcr12_el1
-       mrs     x16, dbgwcr11_el1
-       mrs     x15, dbgwcr10_el1
-       mrs     x14, dbgwcr9_el1
-       mrs     x13, dbgwcr8_el1
-       mrs     x12, dbgwcr7_el1
-       mrs     x11, dbgwcr6_el1
-       mrs     x10, dbgwcr5_el1
-       mrs     x9, dbgwcr4_el1
-       mrs     x8, dbgwcr3_el1
-       mrs     x7, dbgwcr2_el1
-       mrs     x6, dbgwcr1_el1
-       mrs     x5, dbgwcr0_el1
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-
-1:
-       str     x20, [x3, #(15 * 8)]
-       str     x19, [x3, #(14 * 8)]
-       str     x18, [x3, #(13 * 8)]
-       str     x17, [x3, #(12 * 8)]
-       str     x16, [x3, #(11 * 8)]
-       str     x15, [x3, #(10 * 8)]
-       str     x14, [x3, #(9 * 8)]
-       str     x13, [x3, #(8 * 8)]
-       str     x12, [x3, #(7 * 8)]
-       str     x11, [x3, #(6 * 8)]
-       str     x10, [x3, #(5 * 8)]
-       str     x9, [x3, #(4 * 8)]
-       str     x8, [x3, #(3 * 8)]
-       str     x7, [x3, #(2 * 8)]
-       str     x6, [x3, #(1 * 8)]
-       str     x5, [x3, #(0 * 8)]
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGWVR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-1:
-       mrs     x20, dbgwvr15_el1
-       mrs     x19, dbgwvr14_el1
-       mrs     x18, dbgwvr13_el1
-       mrs     x17, dbgwvr12_el1
-       mrs     x16, dbgwvr11_el1
-       mrs     x15, dbgwvr10_el1
-       mrs     x14, dbgwvr9_el1
-       mrs     x13, dbgwvr8_el1
-       mrs     x12, dbgwvr7_el1
-       mrs     x11, dbgwvr6_el1
-       mrs     x10, dbgwvr5_el1
-       mrs     x9, dbgwvr4_el1
-       mrs     x8, dbgwvr3_el1
-       mrs     x7, dbgwvr2_el1
-       mrs     x6, dbgwvr1_el1
-       mrs     x5, dbgwvr0_el1
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-
+       mrs     x21, \type\()15_el1
+       mrs     x20, \type\()14_el1
+       mrs     x19, \type\()13_el1
+       mrs     x18, \type\()12_el1
+       mrs     x17, \type\()11_el1
+       mrs     x16, \type\()10_el1
+       mrs     x15, \type\()9_el1
+       mrs     x14, \type\()8_el1
+       mrs     x13, \type\()7_el1
+       mrs     x12, \type\()6_el1
+       mrs     x11, \type\()5_el1
+       mrs     x10, \type\()4_el1
+       mrs     x9, \type\()3_el1
+       mrs     x8, \type\()2_el1
+       mrs     x7, \type\()1_el1
+       mrs     x6, \type\()0_el1
+
+       adr     x22, 1f
+       add     x22, x22, x5, lsl #2
+       br      x22
 1:
-       str     x20, [x3, #(15 * 8)]
-       str     x19, [x3, #(14 * 8)]
-       str     x18, [x3, #(13 * 8)]
-       str     x17, [x3, #(12 * 8)]
-       str     x16, [x3, #(11 * 8)]
-       str     x15, [x3, #(10 * 8)]
-       str     x14, [x3, #(9 * 8)]
-       str     x13, [x3, #(8 * 8)]
-       str     x12, [x3, #(7 * 8)]
-       str     x11, [x3, #(6 * 8)]
-       str     x10, [x3, #(5 * 8)]
-       str     x9, [x3, #(4 * 8)]
-       str     x8, [x3, #(3 * 8)]
-       str     x7, [x3, #(2 * 8)]
-       str     x6, [x3, #(1 * 8)]
-       str     x5, [x3, #(0 * 8)]
-
-       mrs     x21, mdccint_el1
-       str     x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
+       str     x21, [x4, #(15 * 8)]
+       str     x20, [x4, #(14 * 8)]
+       str     x19, [x4, #(13 * 8)]
+       str     x18, [x4, #(12 * 8)]
+       str     x17, [x4, #(11 * 8)]
+       str     x16, [x4, #(10 * 8)]
+       str     x15, [x4, #(9 * 8)]
+       str     x14, [x4, #(8 * 8)]
+       str     x13, [x4, #(7 * 8)]
+       str     x12, [x4, #(6 * 8)]
+       str     x11, [x4, #(5 * 8)]
+       str     x10, [x4, #(4 * 8)]
+       str     x9, [x4, #(3 * 8)]
+       str     x8, [x4, #(2 * 8)]
+       str     x7, [x4, #(1 * 8)]
+       str     x6, [x4, #(0 * 8)]
 .endm
 
 .macro restore_sysregs
        msr     mdscr_el1,      x25
 .endm
 
-.macro restore_debug
-       // x2: base address for cpu context
-       // x3: tmp register
-
-       mrs     x26, id_aa64dfr0_el1
-       ubfx    x24, x26, #12, #4       // Extract BRPs
-       ubfx    x25, x26, #20, #4       // Extract WRPs
-       mov     w26, #15
-       sub     w24, w26, w24           // How many BPs to skip
-       sub     w25, w26, w25           // How many WPs to skip
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGBCR0_EL1)
+.macro restore_debug type
+       // x4: pointer to register set
+       // x5: number of registers to skip
+       // x6..x22 trashed
 
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
-1:
-       ldr     x20, [x3, #(15 * 8)]
-       ldr     x19, [x3, #(14 * 8)]
-       ldr     x18, [x3, #(13 * 8)]
-       ldr     x17, [x3, #(12 * 8)]
-       ldr     x16, [x3, #(11 * 8)]
-       ldr     x15, [x3, #(10 * 8)]
-       ldr     x14, [x3, #(9 * 8)]
-       ldr     x13, [x3, #(8 * 8)]
-       ldr     x12, [x3, #(7 * 8)]
-       ldr     x11, [x3, #(6 * 8)]
-       ldr     x10, [x3, #(5 * 8)]
-       ldr     x9, [x3, #(4 * 8)]
-       ldr     x8, [x3, #(3 * 8)]
-       ldr     x7, [x3, #(2 * 8)]
-       ldr     x6, [x3, #(1 * 8)]
-       ldr     x5, [x3, #(0 * 8)]
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
+       adr     x22, 1f
+       add     x22, x22, x5, lsl #2
+       br      x22
 1:
-       msr     dbgbcr15_el1, x20
-       msr     dbgbcr14_el1, x19
-       msr     dbgbcr13_el1, x18
-       msr     dbgbcr12_el1, x17
-       msr     dbgbcr11_el1, x16
-       msr     dbgbcr10_el1, x15
-       msr     dbgbcr9_el1, x14
-       msr     dbgbcr8_el1, x13
-       msr     dbgbcr7_el1, x12
-       msr     dbgbcr6_el1, x11
-       msr     dbgbcr5_el1, x10
-       msr     dbgbcr4_el1, x9
-       msr     dbgbcr3_el1, x8
-       msr     dbgbcr2_el1, x7
-       msr     dbgbcr1_el1, x6
-       msr     dbgbcr0_el1, x5
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGBVR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
+       ldr     x21, [x4, #(15 * 8)]
+       ldr     x20, [x4, #(14 * 8)]
+       ldr     x19, [x4, #(13 * 8)]
+       ldr     x18, [x4, #(12 * 8)]
+       ldr     x17, [x4, #(11 * 8)]
+       ldr     x16, [x4, #(10 * 8)]
+       ldr     x15, [x4, #(9 * 8)]
+       ldr     x14, [x4, #(8 * 8)]
+       ldr     x13, [x4, #(7 * 8)]
+       ldr     x12, [x4, #(6 * 8)]
+       ldr     x11, [x4, #(5 * 8)]
+       ldr     x10, [x4, #(4 * 8)]
+       ldr     x9, [x4, #(3 * 8)]
+       ldr     x8, [x4, #(2 * 8)]
+       ldr     x7, [x4, #(1 * 8)]
+       ldr     x6, [x4, #(0 * 8)]
+
+       adr     x22, 1f
+       add     x22, x22, x5, lsl #2
+       br      x22
 1:
-       ldr     x20, [x3, #(15 * 8)]
-       ldr     x19, [x3, #(14 * 8)]
-       ldr     x18, [x3, #(13 * 8)]
-       ldr     x17, [x3, #(12 * 8)]
-       ldr     x16, [x3, #(11 * 8)]
-       ldr     x15, [x3, #(10 * 8)]
-       ldr     x14, [x3, #(9 * 8)]
-       ldr     x13, [x3, #(8 * 8)]
-       ldr     x12, [x3, #(7 * 8)]
-       ldr     x11, [x3, #(6 * 8)]
-       ldr     x10, [x3, #(5 * 8)]
-       ldr     x9, [x3, #(4 * 8)]
-       ldr     x8, [x3, #(3 * 8)]
-       ldr     x7, [x3, #(2 * 8)]
-       ldr     x6, [x3, #(1 * 8)]
-       ldr     x5, [x3, #(0 * 8)]
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
-1:
-       msr     dbgbvr15_el1, x20
-       msr     dbgbvr14_el1, x19
-       msr     dbgbvr13_el1, x18
-       msr     dbgbvr12_el1, x17
-       msr     dbgbvr11_el1, x16
-       msr     dbgbvr10_el1, x15
-       msr     dbgbvr9_el1, x14
-       msr     dbgbvr8_el1, x13
-       msr     dbgbvr7_el1, x12
-       msr     dbgbvr6_el1, x11
-       msr     dbgbvr5_el1, x10
-       msr     dbgbvr4_el1, x9
-       msr     dbgbvr3_el1, x8
-       msr     dbgbvr2_el1, x7
-       msr     dbgbvr1_el1, x6
-       msr     dbgbvr0_el1, x5
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGWCR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-1:
-       ldr     x20, [x3, #(15 * 8)]
-       ldr     x19, [x3, #(14 * 8)]
-       ldr     x18, [x3, #(13 * 8)]
-       ldr     x17, [x3, #(12 * 8)]
-       ldr     x16, [x3, #(11 * 8)]
-       ldr     x15, [x3, #(10 * 8)]
-       ldr     x14, [x3, #(9 * 8)]
-       ldr     x13, [x3, #(8 * 8)]
-       ldr     x12, [x3, #(7 * 8)]
-       ldr     x11, [x3, #(6 * 8)]
-       ldr     x10, [x3, #(5 * 8)]
-       ldr     x9, [x3, #(4 * 8)]
-       ldr     x8, [x3, #(3 * 8)]
-       ldr     x7, [x3, #(2 * 8)]
-       ldr     x6, [x3, #(1 * 8)]
-       ldr     x5, [x3, #(0 * 8)]
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-1:
-       msr     dbgwcr15_el1, x20
-       msr     dbgwcr14_el1, x19
-       msr     dbgwcr13_el1, x18
-       msr     dbgwcr12_el1, x17
-       msr     dbgwcr11_el1, x16
-       msr     dbgwcr10_el1, x15
-       msr     dbgwcr9_el1, x14
-       msr     dbgwcr8_el1, x13
-       msr     dbgwcr7_el1, x12
-       msr     dbgwcr6_el1, x11
-       msr     dbgwcr5_el1, x10
-       msr     dbgwcr4_el1, x9
-       msr     dbgwcr3_el1, x8
-       msr     dbgwcr2_el1, x7
-       msr     dbgwcr1_el1, x6
-       msr     dbgwcr0_el1, x5
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGWVR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-1:
-       ldr     x20, [x3, #(15 * 8)]
-       ldr     x19, [x3, #(14 * 8)]
-       ldr     x18, [x3, #(13 * 8)]
-       ldr     x17, [x3, #(12 * 8)]
-       ldr     x16, [x3, #(11 * 8)]
-       ldr     x15, [x3, #(10 * 8)]
-       ldr     x14, [x3, #(9 * 8)]
-       ldr     x13, [x3, #(8 * 8)]
-       ldr     x12, [x3, #(7 * 8)]
-       ldr     x11, [x3, #(6 * 8)]
-       ldr     x10, [x3, #(5 * 8)]
-       ldr     x9, [x3, #(4 * 8)]
-       ldr     x8, [x3, #(3 * 8)]
-       ldr     x7, [x3, #(2 * 8)]
-       ldr     x6, [x3, #(1 * 8)]
-       ldr     x5, [x3, #(0 * 8)]
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-1:
-       msr     dbgwvr15_el1, x20
-       msr     dbgwvr14_el1, x19
-       msr     dbgwvr13_el1, x18
-       msr     dbgwvr12_el1, x17
-       msr     dbgwvr11_el1, x16
-       msr     dbgwvr10_el1, x15
-       msr     dbgwvr9_el1, x14
-       msr     dbgwvr8_el1, x13
-       msr     dbgwvr7_el1, x12
-       msr     dbgwvr6_el1, x11
-       msr     dbgwvr5_el1, x10
-       msr     dbgwvr4_el1, x9
-       msr     dbgwvr3_el1, x8
-       msr     dbgwvr2_el1, x7
-       msr     dbgwvr1_el1, x6
-       msr     dbgwvr0_el1, x5
-
-       ldr     x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
-       msr     mdccint_el1, x21
+       msr     \type\()15_el1, x21
+       msr     \type\()14_el1, x20
+       msr     \type\()13_el1, x19
+       msr     \type\()12_el1, x18
+       msr     \type\()11_el1, x17
+       msr     \type\()10_el1, x16
+       msr     \type\()9_el1, x15
+       msr     \type\()8_el1, x14
+       msr     \type\()7_el1, x13
+       msr     \type\()6_el1, x12
+       msr     \type\()5_el1, x11
+       msr     \type\()4_el1, x10
+       msr     \type\()3_el1, x9
+       msr     \type\()2_el1, x8
+       msr     \type\()1_el1, x7
+       msr     \type\()0_el1, x6
 .endm
 
 .macro skip_32bit_state tmp, target
        tbz     \tmp, #KVM_ARM64_DEBUG_DIRTY_SHIFT, \target
 .endm
 
+/*
+ * Branch to target if CPTR_EL2.TFP bit is set (VFP/SIMD trapping enabled)
+ */
+.macro skip_fpsimd_state tmp, target
+       mrs     \tmp, cptr_el2
+       tbnz    \tmp, #CPTR_EL2_TFP_SHIFT, \target
+.endm
+
 .macro compute_debug_state target
        // Compute debug state: If any of KDE, MDE or KVM_ARM64_DEBUG_DIRTY
        // is set, we do a full save/restore cycle and disable trapping.
        add     x3, x2, #CPU_SYSREG_OFFSET(DACR32_EL2)
        mrs     x4, dacr32_el2
        mrs     x5, ifsr32_el2
-       mrs     x6, fpexc32_el2
        stp     x4, x5, [x3]
-       str     x6, [x3, #16]
 
+       skip_fpsimd_state x8, 3f
+       mrs     x6, fpexc32_el2
+       str     x6, [x3, #16]
+3:
        skip_debug_state x8, 2f
        mrs     x7, dbgvcr32_el2
        str     x7, [x3, #24]
 
        add     x3, x2, #CPU_SYSREG_OFFSET(DACR32_EL2)
        ldp     x4, x5, [x3]
-       ldr     x6, [x3, #16]
        msr     dacr32_el2, x4
        msr     ifsr32_el2, x5
-       msr     fpexc32_el2, x6
 
        skip_debug_state x8, 2f
        ldr     x7, [x3, #24]
 
 .macro activate_traps
        ldr     x2, [x0, #VCPU_HCR_EL2]
+
+       /*
+        * We are about to set CPTR_EL2.TFP to trap all floating point
+        * register accesses to EL2, however, the ARM ARM clearly states that
+        * traps are only taken to EL2 if the operation would not otherwise
+        * trap to EL1.  Therefore, always make sure that for 32-bit guests,
+        * we set FPEXC.EN to prevent traps to EL1, when setting the TFP bit.
+        */
+       tbnz    x2, #HCR_RW_SHIFT, 99f // open code skip_32bit_state
+       mov     x3, #(1 << 30)
+       msr     fpexc32_el2, x3
+       isb
+99:
        msr     hcr_el2, x2
        mov     x2, #CPTR_EL2_TTA
+       orr     x2, x2, #CPTR_EL2_TFP
        msr     cptr_el2, x2
 
        mov     x2, #(1 << 15)  // Trap CP15 Cr=15
        msr     hstr_el2, x2
 
-       mrs     x2, mdcr_el2
-       and     x2, x2, #MDCR_EL2_HPMN_MASK
-       orr     x2, x2, #(MDCR_EL2_TPM | MDCR_EL2_TPMCR)
-       orr     x2, x2, #(MDCR_EL2_TDRA | MDCR_EL2_TDOSA)
-
-       // Check for KVM_ARM64_DEBUG_DIRTY, and set debug to trap
-       // if not dirty.
-       ldr     x3, [x0, #VCPU_DEBUG_FLAGS]
-       tbnz    x3, #KVM_ARM64_DEBUG_DIRTY_SHIFT, 1f
-       orr     x2, x2,  #MDCR_EL2_TDA
-1:
+       // Monitor Debug Config - see kvm_arm_setup_debug()
+       ldr     x2, [x0, #VCPU_MDCR_EL2]
        msr     mdcr_el2, x2
 .endm
 
 .macro deactivate_traps
        mov     x2, #HCR_RW
        msr     hcr_el2, x2
-       msr     cptr_el2, xzr
        msr     hstr_el2, xzr
 
        mrs     x2, mdcr_el2
@@ -900,21 +622,101 @@ __restore_sysregs:
        restore_sysregs
        ret
 
+/* Save debug state */
 __save_debug:
-       save_debug
+       // x2: ptr to CPU context
+       // x3: ptr to debug reg struct
+       // x4/x5/x6-22/x24-26: trashed
+
+       mrs     x26, id_aa64dfr0_el1
+       ubfx    x24, x26, #12, #4       // Extract BRPs
+       ubfx    x25, x26, #20, #4       // Extract WRPs
+       mov     w26, #15
+       sub     w24, w26, w24           // How many BPs to skip
+       sub     w25, w26, w25           // How many WPs to skip
+
+       mov     x5, x24
+       add     x4, x3, #DEBUG_BCR
+       save_debug dbgbcr
+       add     x4, x3, #DEBUG_BVR
+       save_debug dbgbvr
+
+       mov     x5, x25
+       add     x4, x3, #DEBUG_WCR
+       save_debug dbgwcr
+       add     x4, x3, #DEBUG_WVR
+       save_debug dbgwvr
+
+       mrs     x21, mdccint_el1
+       str     x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
        ret
 
+/* Restore debug state */
 __restore_debug:
-       restore_debug
+       // x2: ptr to CPU context
+       // x3: ptr to debug reg struct
+       // x4/x5/x6-22/x24-26: trashed
+
+       mrs     x26, id_aa64dfr0_el1
+       ubfx    x24, x26, #12, #4       // Extract BRPs
+       ubfx    x25, x26, #20, #4       // Extract WRPs
+       mov     w26, #15
+       sub     w24, w26, w24           // How many BPs to skip
+       sub     w25, w26, w25           // How many WPs to skip
+
+       mov     x5, x24
+       add     x4, x3, #DEBUG_BCR
+       restore_debug dbgbcr
+       add     x4, x3, #DEBUG_BVR
+       restore_debug dbgbvr
+
+       mov     x5, x25
+       add     x4, x3, #DEBUG_WCR
+       restore_debug dbgwcr
+       add     x4, x3, #DEBUG_WVR
+       restore_debug dbgwvr
+
+       ldr     x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
+       msr     mdccint_el1, x21
+
        ret
 
 __save_fpsimd:
+       skip_fpsimd_state x3, 1f
        save_fpsimd
-       ret
+1:     ret
 
 __restore_fpsimd:
+       skip_fpsimd_state x3, 1f
        restore_fpsimd
-       ret
+1:     ret
+
+switch_to_guest_fpsimd:
+       push    x4, lr
+
+       mrs     x2, cptr_el2
+       bic     x2, x2, #CPTR_EL2_TFP
+       msr     cptr_el2, x2
+       isb
+
+       mrs     x0, tpidr_el2
+
+       ldr     x2, [x0, #VCPU_HOST_CONTEXT]
+       kern_hyp_va x2
+       bl __save_fpsimd
+
+       add     x2, x0, #VCPU_CONTEXT
+       bl __restore_fpsimd
+
+       skip_32bit_state x3, 1f
+       ldr     x4, [x2, #CPU_SYSREG_OFFSET(FPEXC32_EL2)]
+       msr     fpexc32_el2, x4
+1:
+       pop     x4, lr
+       pop     x2, x3
+       pop     x0, x1
+
+       eret
 
 /*
  * u64 __kvm_vcpu_run(struct kvm_vcpu *vcpu);
@@ -936,10 +738,10 @@ ENTRY(__kvm_vcpu_run)
        kern_hyp_va x2
 
        save_host_regs
-       bl __save_fpsimd
        bl __save_sysregs
 
        compute_debug_state 1f
+       add     x3, x0, #VCPU_HOST_DEBUG_STATE
        bl      __save_debug
 1:
        activate_traps
@@ -952,9 +754,10 @@ ENTRY(__kvm_vcpu_run)
        add     x2, x0, #VCPU_CONTEXT
 
        bl __restore_sysregs
-       bl __restore_fpsimd
 
        skip_debug_state x3, 1f
+       ldr     x3, [x0, #VCPU_DEBUG_PTR]
+       kern_hyp_va x3
        bl      __restore_debug
 1:
        restore_guest_32bit_state
@@ -975,6 +778,8 @@ __kvm_vcpu_return:
        bl __save_sysregs
 
        skip_debug_state x3, 1f
+       ldr     x3, [x0, #VCPU_DEBUG_PTR]
+       kern_hyp_va x3
        bl      __save_debug
 1:
        save_guest_32bit_state
@@ -991,12 +796,15 @@ __kvm_vcpu_return:
 
        bl __restore_sysregs
        bl __restore_fpsimd
+       /* Clear FPSIMD and Trace trapping */
+       msr     cptr_el2, xzr
 
        skip_debug_state x3, 1f
        // Clear the dirty flag for the next run, as all the state has
        // already been saved. Note that we nuke the whole 64bit word.
        // If we ever add more flags, we'll have to be more careful...
        str     xzr, [x0, #VCPU_DEBUG_FLAGS]
+       add     x3, x0, #VCPU_HOST_DEBUG_STATE
        bl      __restore_debug
 1:
        restore_host_regs
@@ -1199,6 +1007,11 @@ el1_trap:
         * x1: ESR
         * x2: ESR_EC
         */
+
+       /* Guest accessed VFP/SIMD registers, save host, restore Guest */
+       cmp     x2, #ESR_ELx_EC_FP_ASIMD
+       b.eq    switch_to_guest_fpsimd
+
        cmp     x2, #ESR_ELx_EC_DABT_LOW
        mov     x0, #ESR_ELx_EC_IABT_LOW
        ccmp    x2, x0, #4, ne
@@ -1293,4 +1106,10 @@ ENTRY(__kvm_hyp_vector)
        ventry  el1_error_invalid               // Error 32-bit EL1
 ENDPROC(__kvm_hyp_vector)
 
+
+ENTRY(__kvm_get_mdcr_el2)
+       mrs     x0, mdcr_el2
+       ret
+ENDPROC(__kvm_get_mdcr_el2)
+
        .popsection
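
Putting the __kvm_vcpu_run hunks above together: on entry the host's debug registers are saved into VCPU_HOST_DEBUG_STATE and, if the dirty flag is set, the registers at VCPU_DEBUG_PTR (guest state, or host-supplied values when userspace debugging is active) are loaded; on exit the reverse happens and the dirty flag is cleared. A hedged C-level outline of that ordering (toy types and helpers, not kernel interfaces):

    struct dbg { unsigned long bcr[16], bvr[16], wcr[16], wvr[16]; };
    struct vcpu {
            unsigned long debug_dirty;      /* KVM_ARM64_DEBUG_DIRTY analogue   */
            struct dbg host_debug_state;    /* VCPU_HOST_DEBUG_STATE            */
            struct dbg *debug_ptr;          /* VCPU_DEBUG_PTR                   */
            struct dbg hw;                  /* stands in for the real registers */
    };

    void enter_guest_debug(struct vcpu *v)
    {
            if (!v->debug_dirty)
                    return;                 /* skip_debug_state */
            v->host_debug_state = v->hw;    /* __save_debug, host copy   */
            v->hw = *v->debug_ptr;          /* __restore_debug, guest in */
    }

    void exit_guest_debug(struct vcpu *v)
    {
            if (!v->debug_dirty)
                    return;
            *v->debug_ptr = v->hw;          /* __save_debug, guest copy   */
            v->hw = v->host_debug_state;    /* __restore_debug, host back */
            v->debug_dirty = 0;             /* str xzr, [x0, #VCPU_DEBUG_FLAGS] */
    }
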
index 0b43265..91cf535 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/errno.h>
 #include <linux/kvm_host.h>
 #include <linux/kvm.h>
+#include <linux/hw_breakpoint.h>
 
 #include <kvm/arm_arch_timer.h>
 
@@ -56,6 +57,12 @@ static bool cpu_has_32bit_el1(void)
        return !!(pfr0 & 0x20);
 }
 
+/**
+ * kvm_arch_dev_ioctl_check_extension
+ *
+ * We currently assume that the number of HW registers is uniform
+ * across all CPUs (see cpuinfo_sanity_check).
+ */
 int kvm_arch_dev_ioctl_check_extension(long ext)
 {
        int r;
@@ -64,6 +71,15 @@ int kvm_arch_dev_ioctl_check_extension(long ext)
        case KVM_CAP_ARM_EL1_32BIT:
                r = cpu_has_32bit_el1();
                break;
+       case KVM_CAP_GUEST_DEBUG_HW_BPS:
+               r = get_num_brps();
+               break;
+       case KVM_CAP_GUEST_DEBUG_HW_WPS:
+               r = get_num_wrps();
+               break;
+       case KVM_CAP_SET_GUEST_DEBUG:
+               r = 1;
+               break;
        default:
                r = 0;
        }
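
The new capabilities are intended to be queried from userspace before using KVM_SET_GUEST_DEBUG. A minimal probe (error handling omitted, and assuming kernel headers that already carry the new KVM_CAP_* values) might look like:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);
            int bps = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_GUEST_DEBUG_HW_BPS);
            int wps = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_GUEST_DEBUG_HW_WPS);

            printf("hw breakpoints: %d, hw watchpoints: %d\n", bps, wps);
            return 0;
    }
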
@@ -105,7 +121,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
        kvm_reset_sys_regs(vcpu);
 
        /* Reset timer */
-       kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
-
-       return 0;
+       return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
 }
index c370b40..b41607d 100644 (file)
@@ -38,6 +38,8 @@
 
 #include "sys_regs.h"
 
+#include "trace.h"
+
 /*
 * All of this file is extremely similar to the ARM coproc.c, but the
  * types are different. My gut feeling is that it should be pretty
@@ -208,9 +210,217 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu,
                *vcpu_reg(vcpu, p->Rt) = vcpu_sys_reg(vcpu, r->reg);
        }
 
+       trace_trap_reg(__func__, r->reg, p->is_write, *vcpu_reg(vcpu, p->Rt));
+
+       return true;
+}
+
+/*
+ * reg_to_dbg/dbg_to_reg
+ *
+ * A 32 bit write to a debug register leaves the top bits alone
+ * A 32 bit read from a debug register only returns the bottom bits
+ *
+ * All writes will set the KVM_ARM64_DEBUG_DIRTY flag to ensure the
+ * hyp.S code switches between host and guest values in future.
+ */
+static inline void reg_to_dbg(struct kvm_vcpu *vcpu,
+                             const struct sys_reg_params *p,
+                             u64 *dbg_reg)
+{
+       u64 val = *vcpu_reg(vcpu, p->Rt);
+
+       if (p->is_32bit) {
+               val &= 0xffffffffUL;
+               val |= ((*dbg_reg >> 32) << 32);
+       }
+
+       *dbg_reg = val;
+       vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
+}
+
+static inline void dbg_to_reg(struct kvm_vcpu *vcpu,
+                             const struct sys_reg_params *p,
+                             u64 *dbg_reg)
+{
+       u64 val = *dbg_reg;
+
+       if (p->is_32bit)
+               val &= 0xffffffffUL;
+
+       *vcpu_reg(vcpu, p->Rt) = val;
+}
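
A standalone illustration of the 32-bit access rules the comment above describes (the constants are arbitrary):

    #include <stdint.h>
    #include <assert.h>

    int main(void)
    {
            uint64_t dbg_reg = 0xdeadbeef00000000ULL;

            /* 32-bit write: bottom half replaced, top half preserved */
            uint64_t val = 0x12345678ULL & 0xffffffffULL;
            dbg_reg = (dbg_reg & 0xffffffff00000000ULL) | val;
            assert(dbg_reg == 0xdeadbeef12345678ULL);

            /* 32-bit read: only the bottom half is returned */
            assert((dbg_reg & 0xffffffffULL) == 0x12345678ULL);
            return 0;
    }
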
+
+static inline bool trap_bvr(struct kvm_vcpu *vcpu,
+                           const struct sys_reg_params *p,
+                           const struct sys_reg_desc *rd)
+{
+       u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+
+       if (p->is_write)
+               reg_to_dbg(vcpu, p, dbg_reg);
+       else
+               dbg_to_reg(vcpu, p, dbg_reg);
+
+       trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+
+       return true;
+}
+
+static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+               const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+
+       if (copy_from_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+       const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+
+       if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static inline void reset_bvr(struct kvm_vcpu *vcpu,
+                            const struct sys_reg_desc *rd)
+{
+       vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg] = rd->val;
+}
+
+static inline bool trap_bcr(struct kvm_vcpu *vcpu,
+                           const struct sys_reg_params *p,
+                           const struct sys_reg_desc *rd)
+{
+       u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg];
+
+       if (p->is_write)
+               reg_to_dbg(vcpu, p, dbg_reg);
+       else
+               dbg_to_reg(vcpu, p, dbg_reg);
+
+       trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+
+       return true;
+}
+
+static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+               const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg];
+
+       if (copy_from_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+
+       return 0;
+}
+
+static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+       const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg];
+
+       if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static inline void reset_bcr(struct kvm_vcpu *vcpu,
+                            const struct sys_reg_desc *rd)
+{
+       vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg] = rd->val;
+}
+
+static inline bool trap_wvr(struct kvm_vcpu *vcpu,
+                           const struct sys_reg_params *p,
+                           const struct sys_reg_desc *rd)
+{
+       u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg];
+
+       if (p->is_write)
+               reg_to_dbg(vcpu, p, dbg_reg);
+       else
+               dbg_to_reg(vcpu, p, dbg_reg);
+
+       trace_trap_reg(__func__, rd->reg, p->is_write,
+               vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]);
+
        return true;
 }
 
+static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+               const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg];
+
+       if (copy_from_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+       const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg];
+
+       if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static inline void reset_wvr(struct kvm_vcpu *vcpu,
+                            const struct sys_reg_desc *rd)
+{
+       vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg] = rd->val;
+}
+
+static inline bool trap_wcr(struct kvm_vcpu *vcpu,
+                           const struct sys_reg_params *p,
+                           const struct sys_reg_desc *rd)
+{
+       u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg];
+
+       if (p->is_write)
+               reg_to_dbg(vcpu, p, dbg_reg);
+       else
+               dbg_to_reg(vcpu, p, dbg_reg);
+
+       trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+
+       return true;
+}
+
+static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+               const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg];
+
+       if (copy_from_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+       const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg];
+
+       if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static inline void reset_wcr(struct kvm_vcpu *vcpu,
+                            const struct sys_reg_desc *rd)
+{
+       vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg] = rd->val;
+}
+
 static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 {
        u64 amair;
@@ -240,16 +450,16 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 #define DBG_BCR_BVR_WCR_WVR_EL1(n)                                     \
        /* DBGBVRn_EL1 */                                               \
        { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b100),     \
-         trap_debug_regs, reset_val, (DBGBVR0_EL1 + (n)), 0 },         \
+         trap_bvr, reset_bvr, n, 0, get_bvr, set_bvr },                \
        /* DBGBCRn_EL1 */                                               \
        { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b101),     \
-         trap_debug_regs, reset_val, (DBGBCR0_EL1 + (n)), 0 },         \
+         trap_bcr, reset_bcr, n, 0, get_bcr, set_bcr },                \
        /* DBGWVRn_EL1 */                                               \
        { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b110),     \
-         trap_debug_regs, reset_val, (DBGWVR0_EL1 + (n)), 0 },         \
+         trap_wvr, reset_wvr, n, 0,  get_wvr, set_wvr },               \
        /* DBGWCRn_EL1 */                                               \
        { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b111),     \
-         trap_debug_regs, reset_val, (DBGWCR0_EL1 + (n)), 0 }
+         trap_wcr, reset_wcr, n, 0,  get_wcr, set_wcr }
 
 /*
  * Architected system registers.
@@ -516,28 +726,57 @@ static bool trap_debug32(struct kvm_vcpu *vcpu,
        return true;
 }
 
-#define DBG_BCR_BVR_WCR_WVR(n)                                 \
-       /* DBGBVRn */                                           \
-       { Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_debug32,    \
-         NULL, (cp14_DBGBVR0 + (n) * 2) },                     \
-       /* DBGBCRn */                                           \
-       { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_debug32,    \
-         NULL, (cp14_DBGBCR0 + (n) * 2) },                     \
-       /* DBGWVRn */                                           \
-       { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_debug32,    \
-         NULL, (cp14_DBGWVR0 + (n) * 2) },                     \
-       /* DBGWCRn */                                           \
-       { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_debug32,    \
-         NULL, (cp14_DBGWCR0 + (n) * 2) }
-
-#define DBGBXVR(n)                                             \
-       { Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_debug32,    \
-         NULL, cp14_DBGBXVR0 + n * 2 }
+/* AArch32 debug register mappings
+ *
+ * AArch32 DBGBVRn is mapped to DBGBVRn_EL1[31:0]
+ * AArch32 DBGBXVRn is mapped to DBGBVRn_EL1[63:32]
+ *
+ * All control registers and watchpoint value registers are mapped to
+ * the lower 32 bits of their AArch64 equivalents. We share the trap
+ * handlers with the above AArch64 code which checks what mode the
+ * system is in.
+ */
+
+static inline bool trap_xvr(struct kvm_vcpu *vcpu,
+                           const struct sys_reg_params *p,
+                           const struct sys_reg_desc *rd)
+{
+       u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+
+       if (p->is_write) {
+               u64 val = *dbg_reg;
+
+               val &= 0xffffffffUL;
+               val |= *vcpu_reg(vcpu, p->Rt) << 32;
+               *dbg_reg = val;
+
+               vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
+       } else {
+               *vcpu_reg(vcpu, p->Rt) = *dbg_reg >> 32;
+       }
+
+       trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+
+       return true;
+}
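
A tiny worked example of the mapping trap_xvr implements, showing that an AArch32 DBGBXVRn write only replaces bits [63:32] of DBGBVRn_EL1 (values are arbitrary):

    #include <stdint.h>
    #include <assert.h>

    int main(void)
    {
            uint64_t dbgbvr0_el1 = 0x0000000080001000ULL;

            /* AArch32 write of 0xcafe to DBGBXVR0 */
            dbgbvr0_el1 = (dbgbvr0_el1 & 0xffffffffULL) | (0xcafeULL << 32);
            assert(dbgbvr0_el1 == 0x0000cafe80001000ULL);
            return 0;
    }
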
+
+#define DBG_BCR_BVR_WCR_WVR(n)                                         \
+       /* DBGBVRn */                                                   \
+       { Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_bvr, NULL, n },     \
+       /* DBGBCRn */                                                   \
+       { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_bcr, NULL, n },     \
+       /* DBGWVRn */                                                   \
+       { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_wvr, NULL, n },     \
+       /* DBGWCRn */                                                   \
+       { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_wcr, NULL, n }
+
+#define DBGBXVR(n)                                                     \
+       { Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_xvr, NULL, n }
 
 /*
  * Trapped cp14 registers. We generally ignore most of the external
  * debug, on the principle that they don't really make sense to a
- * guest. Revisit this one day, whould this principle change.
+ * guest. Revisit this one day, would this principle change.
  */
 static const struct sys_reg_desc cp14_regs[] = {
        /* DBGIDR */
@@ -999,6 +1238,8 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run)
        struct sys_reg_params params;
        unsigned long esr = kvm_vcpu_get_hsr(vcpu);
 
+       trace_kvm_handle_sys_reg(esr);
+
        params.is_aarch32 = false;
        params.is_32bit = false;
        params.Op0 = (esr >> 20) & 3;
@@ -1303,6 +1544,9 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
        if (!r)
                return get_invariant_sys_reg(reg->id, uaddr);
 
+       if (r->get_user)
+               return (r->get_user)(vcpu, r, reg, uaddr);
+
        return reg_to_user(uaddr, &vcpu_sys_reg(vcpu, r->reg), reg->id);
 }
 
@@ -1321,6 +1565,9 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
        if (!r)
                return set_invariant_sys_reg(reg->id, uaddr);
 
+       if (r->set_user)
+               return (r->set_user)(vcpu, r, reg, uaddr);
+
        return reg_from_user(&vcpu_sys_reg(vcpu, r->reg), uaddr, reg->id);
 }
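
With the per-register set_user hook wired up above, a userspace write to one of the breakpoint registers takes the usual KVM_SET_ONE_REG path and ends up in set_bvr(). A hedged sketch (the ARM64_SYS_REG() encoding macro is assumed to come from the arm64 uapi headers, and vcpu_fd to be an existing vCPU descriptor):

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int set_dbgbvr0(int vcpu_fd, uint64_t bp_addr)
    {
            struct kvm_one_reg reg = {
                    /* Op0=2, Op1=0, CRn=0, CRm=0, Op2=4: DBGBVR0_EL1, matching
                     * the DBG_BCR_BVR_WCR_WVR_EL1(0) table entry above */
                    .id   = ARM64_SYS_REG(2, 0, 0, 0, 4),
                    .addr = (uint64_t)&bp_addr,
            };

            return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
    }
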
 
index d411e25..eaa324e 100644 (file)
@@ -55,6 +55,12 @@ struct sys_reg_desc {
 
        /* Value (usually reset value) */
        u64 val;
+
+       /* Custom get/set_user functions, fallback to generic if NULL */
+       int (*get_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+                       const struct kvm_one_reg *reg, void __user *uaddr);
+       int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+                       const struct kvm_one_reg *reg, void __user *uaddr);
 };
 
 static inline void print_sys_reg_instr(const struct sys_reg_params *p)
index 475fd29..1e45768 100644 (file)
@@ -94,6 +94,8 @@ static int __init sys_reg_genericv8_init(void)
                                          &genericv8_target_table);
        kvm_register_target_sys_reg_table(KVM_ARM_TARGET_XGENE_POTENZA,
                                          &genericv8_target_table);
+       kvm_register_target_sys_reg_table(KVM_ARM_TARGET_GENERIC_V8,
+                                         &genericv8_target_table);
 
        return 0;
 }
index 157416e..7fb0008 100644 (file)
@@ -44,6 +44,129 @@ TRACE_EVENT(kvm_hvc_arm64,
                  __entry->vcpu_pc, __entry->r0, __entry->imm)
 );
 
+TRACE_EVENT(kvm_arm_setup_debug,
+       TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
+       TP_ARGS(vcpu, guest_debug),
+
+       TP_STRUCT__entry(
+               __field(struct kvm_vcpu *, vcpu)
+               __field(__u32, guest_debug)
+       ),
+
+       TP_fast_assign(
+               __entry->vcpu = vcpu;
+               __entry->guest_debug = guest_debug;
+       ),
+
+       TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
+);
+
+TRACE_EVENT(kvm_arm_clear_debug,
+       TP_PROTO(__u32 guest_debug),
+       TP_ARGS(guest_debug),
+
+       TP_STRUCT__entry(
+               __field(__u32, guest_debug)
+       ),
+
+       TP_fast_assign(
+               __entry->guest_debug = guest_debug;
+       ),
+
+       TP_printk("flags: 0x%08x", __entry->guest_debug)
+);
+
+TRACE_EVENT(kvm_arm_set_dreg32,
+       TP_PROTO(const char *name, __u32 value),
+       TP_ARGS(name, value),
+
+       TP_STRUCT__entry(
+               __field(const char *, name)
+               __field(__u32, value)
+       ),
+
+       TP_fast_assign(
+               __entry->name = name;
+               __entry->value = value;
+       ),
+
+       TP_printk("%s: 0x%08x", __entry->name, __entry->value)
+);
+
+TRACE_EVENT(kvm_arm_set_regset,
+       TP_PROTO(const char *type, int len, __u64 *control, __u64 *value),
+       TP_ARGS(type, len, control, value),
+       TP_STRUCT__entry(
+               __field(const char *, name)
+               __field(int, len)
+               __array(u64, ctrls, 16)
+               __array(u64, values, 16)
+       ),
+       TP_fast_assign(
+               __entry->name = type;
+               __entry->len = len;
+               memcpy(__entry->ctrls, control, len << 3);
+               memcpy(__entry->values, value, len << 3);
+       ),
+       TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name,
+               __print_array(__entry->ctrls, __entry->len, sizeof(__u64)),
+               __print_array(__entry->values, __entry->len, sizeof(__u64)))
+);
+
+TRACE_EVENT(trap_reg,
+       TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value),
+       TP_ARGS(fn, reg, is_write, write_value),
+
+       TP_STRUCT__entry(
+               __field(const char *, fn)
+               __field(int, reg)
+               __field(bool, is_write)
+               __field(u64, write_value)
+       ),
+
+       TP_fast_assign(
+               __entry->fn = fn;
+               __entry->reg = reg;
+               __entry->is_write = is_write;
+               __entry->write_value = write_value;
+       ),
+
+       TP_printk("%s %s reg %d (0x%08llx)", __entry->fn,  __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value)
+);
+
+TRACE_EVENT(kvm_handle_sys_reg,
+       TP_PROTO(unsigned long hsr),
+       TP_ARGS(hsr),
+
+       TP_STRUCT__entry(
+               __field(unsigned long,  hsr)
+       ),
+
+       TP_fast_assign(
+               __entry->hsr = hsr;
+       ),
+
+       TP_printk("HSR 0x%08lx", __entry->hsr)
+);
+
+TRACE_EVENT(kvm_set_guest_debug,
+       TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
+       TP_ARGS(vcpu, guest_debug),
+
+       TP_STRUCT__entry(
+               __field(struct kvm_vcpu *, vcpu)
+               __field(__u32, guest_debug)
+       ),
+
+       TP_fast_assign(
+               __entry->vcpu = vcpu;
+               __entry->guest_debug = guest_debug;
+       ),
+
+       TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
+);
+
+
 #endif /* _TRACE_ARM64_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
index b91e74a..9fac01c 100644 (file)
@@ -158,6 +158,7 @@ extern pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing,
                        bool *writable);
 extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
                        unsigned long *rmap, long pte_index, int realmode);
+extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize);
 extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
                        unsigned long pte_index);
 void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep,
@@ -225,12 +226,12 @@ static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
        return vcpu->arch.cr;
 }
 
-static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
 {
        vcpu->arch.xer = val;
 }
 
-static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+static inline ulong kvmppc_get_xer(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.xer;
 }
index 5bdfb5d..72b6225 100644 (file)
 #define XICS_MFRR              0xc
 #define XICS_IPI               2       /* interrupt source # for IPIs */
 
+/* Maximum number of threads per physical core */
+#define MAX_SMT_THREADS                8
+
+/* Maximum number of subcores per physical core */
+#define MAX_SUBCORES           4
+
 #ifdef __ASSEMBLY__
 
 #ifdef CONFIG_KVM_BOOK3S_HANDLER
@@ -65,6 +71,19 @@ kvmppc_resume_\intno:
 
 #else  /*__ASSEMBLY__ */
 
+struct kvmppc_vcore;
+
+/* Struct used for coordinating micro-threading (split-core) mode changes */
+struct kvm_split_mode {
+       unsigned long   rpr;
+       unsigned long   pmmar;
+       unsigned long   ldbar;
+       u8              subcore_size;
+       u8              do_nap;
+       u8              napped[MAX_SMT_THREADS];
+       struct kvmppc_vcore *master_vcs[MAX_SUBCORES];
+};
+
 /*
  * This struct goes in the PACA on 64-bit processors.  It is used
  * to store host state that needs to be saved when we enter a guest
@@ -100,6 +119,7 @@ struct kvmppc_host_state {
        u64 host_spurr;
        u64 host_dscr;
        u64 dec_expires;
+       struct kvm_split_mode *kvm_split_mode;
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
        u64 cfar;
@@ -112,7 +132,7 @@ struct kvmppc_book3s_shadow_vcpu {
        bool in_use;
        ulong gpr[14];
        u32 cr;
-       u32 xer;
+       ulong xer;
        ulong ctr;
        ulong lr;
        ulong pc;
index 3286f0d..bc6e29e 100644 (file)
@@ -54,12 +54,12 @@ static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
        return vcpu->arch.cr;
 }
 
-static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
 {
        vcpu->arch.xer = val;
 }
 
-static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+static inline ulong kvmppc_get_xer(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.xer;
 }
index d91f65b..98eebbf 100644 (file)
@@ -205,8 +205,10 @@ struct revmap_entry {
  */
 #define KVMPPC_RMAP_LOCK_BIT   63
 #define KVMPPC_RMAP_RC_SHIFT   32
+#define KVMPPC_RMAP_CHG_SHIFT  48
 #define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT)
 #define KVMPPC_RMAP_CHANGED    (HPTE_R_C << KVMPPC_RMAP_RC_SHIFT)
+#define KVMPPC_RMAP_CHG_ORDER  (0x3ful << KVMPPC_RMAP_CHG_SHIFT)
 #define KVMPPC_RMAP_PRESENT    0x100000000ul
 #define KVMPPC_RMAP_INDEX      0xfffffffful
 
@@ -278,7 +280,9 @@ struct kvmppc_vcore {
        u16 last_cpu;
        u8 vcore_state;
        u8 in_guest;
+       struct kvmppc_vcore *master_vcore;
        struct list_head runnable_threads;
+       struct list_head preempt_list;
        spinlock_t lock;
        wait_queue_head_t wq;
        spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
@@ -300,12 +304,21 @@ struct kvmppc_vcore {
 #define VCORE_EXIT_MAP(vc)     ((vc)->entry_exit_map >> 8)
 #define VCORE_IS_EXITING(vc)   (VCORE_EXIT_MAP(vc) != 0)
 
-/* Values for vcore_state */
+/* This bit is used when a vcore exit is triggered from outside the vcore */
+#define VCORE_EXIT_REQ         0x10000
+
+/*
+ * Values for vcore_state.
+ * Note that these are arranged such that lower values
+ * (< VCORE_SLEEPING) don't require stolen time accounting
+ * on load/unload, and higher values do.
+ */
 #define VCORE_INACTIVE 0
-#define VCORE_SLEEPING 1
-#define VCORE_PREEMPT  2
-#define VCORE_RUNNING  3
-#define VCORE_EXITING  4
+#define VCORE_PREEMPT  1
+#define VCORE_PIGGYBACK        2
+#define VCORE_SLEEPING 3
+#define VCORE_RUNNING  4
+#define VCORE_EXITING  5
 
 /*
  * Struct used to manage memory for a virtual processor area
@@ -473,7 +486,7 @@ struct kvm_vcpu_arch {
        ulong ciabr;
        ulong cfar;
        ulong ppr;
-       ulong pspb;
+       u32 pspb;
        ulong fscr;
        ulong shadow_fscr;
        ulong ebbhr;
@@ -619,6 +632,7 @@ struct kvm_vcpu_arch {
        int trap;
        int state;
        int ptid;
+       int thread_cpu;
        bool timer_running;
        wait_queue_head_t cpu_run;
 
index 8452335..790f5d1 100644 (file)
 
 /* POWER8 Micro Partition Prefetch (MPP) parameters */
 /* Address mask is common for LOGMPP instruction and MPPR SPR */
-#define PPC_MPPE_ADDRESS_MASK 0xffffffffc000
+#define PPC_MPPE_ADDRESS_MASK 0xffffffffc000ULL
 
 /* Bits 60 and 61 of MPP SPR should be set to one of the following */
 /* Aborting the fetch is indeed setting 00 in the table size bits */
index 810f433..221d584 100644 (file)
@@ -511,6 +511,8 @@ int main(void)
        DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
        DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty));
        DEFINE(VCPU_HEIR, offsetof(struct kvm_vcpu, arch.emul_inst));
+       DEFINE(VCPU_CPU, offsetof(struct kvm_vcpu, cpu));
+       DEFINE(VCPU_THREAD_CPU, offsetof(struct kvm_vcpu, arch.thread_cpu));
 #endif
 #ifdef CONFIG_PPC_BOOK3S
        DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
@@ -673,7 +675,14 @@ int main(void)
        HSTATE_FIELD(HSTATE_DSCR, host_dscr);
        HSTATE_FIELD(HSTATE_DABR, dabr);
        HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
+       HSTATE_FIELD(HSTATE_SPLIT_MODE, kvm_split_mode);
        DEFINE(IPI_PRIORITY, IPI_PRIORITY);
+       DEFINE(KVM_SPLIT_RPR, offsetof(struct kvm_split_mode, rpr));
+       DEFINE(KVM_SPLIT_PMMAR, offsetof(struct kvm_split_mode, pmmar));
+       DEFINE(KVM_SPLIT_LDBAR, offsetof(struct kvm_split_mode, ldbar));
+       DEFINE(KVM_SPLIT_SIZE, offsetof(struct kvm_split_mode, subcore_size));
+       DEFINE(KVM_SPLIT_DO_NAP, offsetof(struct kvm_split_mode, do_nap));
+       DEFINE(KVM_SPLIT_NAPPED, offsetof(struct kvm_split_mode, napped));
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #ifdef CONFIG_PPC_BOOK3S_64
index 3caec2c..c2024ac 100644 (file)
@@ -74,14 +74,14 @@ config KVM_BOOK3S_64
          If unsure, say N.
 
 config KVM_BOOK3S_64_HV
-       tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host"
+       tristate "KVM for POWER7 and later using hypervisor mode in host"
        depends on KVM_BOOK3S_64 && PPC_POWERNV
        select KVM_BOOK3S_HV_POSSIBLE
        select MMU_NOTIFIER
        select CMA
        ---help---
          Support running unmodified book3s_64 guest kernels in
-         virtual machines on POWER7 and PPC970 processors that have
+         virtual machines on POWER7 and newer processors that have
          hypervisor mode available to the host.
 
          If you say Y here, KVM will use the hardware virtualization
@@ -89,8 +89,8 @@ config KVM_BOOK3S_64_HV
          guest operating systems will run at full hardware speed
          using supervisor and user modes.  However, this also means
          that KVM is not usable under PowerVM (pHyp), is only usable
-         on POWER7 (or later) processors and PPC970-family processors,
-         and cannot emulate a different processor from the host processor.
+         on POWER7 or later processors, and cannot emulate a
+         different processor from the host processor.
 
          If unsure, say N.
 
index 6d6398f..d75bf32 100644 (file)
@@ -240,7 +240,8 @@ void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong flags)
        kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
 }
 
-int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
+static int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu,
+                                        unsigned int priority)
 {
        int deliver = 1;
        int vec = 0;
index 2035d16..d5c9bfe 100644 (file)
@@ -26,6 +26,7 @@
 #include <asm/machdep.h>
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
+#include "book3s.h"
 
 /* #define DEBUG_MMU */
 /* #define DEBUG_SR */
index b982d92..79ad35a 100644 (file)
@@ -28,6 +28,7 @@
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
 #include "trace_pr.h"
+#include "book3s.h"
 
 #define PTE_SIZE 12
 
index dab68b7..1f9c0a1 100644 (file)
@@ -761,6 +761,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
                        /* Harvest R and C */
                        rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
                        *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+                       if (rcbits & HPTE_R_C)
+                               kvmppc_update_rmap_change(rmapp, psize);
                        if (rcbits & ~rev[i].guest_rpte) {
                                rev[i].guest_rpte = ptel | rcbits;
                                note_hpte_modification(kvm, &rev[i]);
@@ -927,8 +929,12 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
  retry:
        lock_rmap(rmapp);
        if (*rmapp & KVMPPC_RMAP_CHANGED) {
-               *rmapp &= ~KVMPPC_RMAP_CHANGED;
+               long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER)
+                       >> KVMPPC_RMAP_CHG_SHIFT;
+               *rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER);
                npages_dirty = 1;
+               if (change_order > PAGE_SHIFT)
+                       npages_dirty = 1ul << (change_order - PAGE_SHIFT);
        }
        if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
                unlock_rmap(rmapp);
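
The new CHG_ORDER field lets the dirty-tracking code above report every base page covered by a large page that was modified. A worked example of the arithmetic, assuming a 4K base page size (PAGE_SHIFT = 12) and a dirtied 16MB huge page:

    #include <assert.h>

    int main(void)
    {
            unsigned long change_order = 24;        /* log2(16MB), from CHG_ORDER */
            unsigned long npages_dirty = 1;

            if (change_order > 12)                  /* PAGE_SHIFT */
                    npages_dirty = 1ul << (change_order - 12);
            assert(npages_dirty == 4096);           /* 16MB / 4KB base pages */
            return 0;
    }
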
index 5a2bc4b..2afdb9c 100644 (file)
@@ -23,6 +23,7 @@
 #include <asm/reg.h>
 #include <asm/switch_to.h>
 #include <asm/time.h>
+#include "book3s.h"
 
 #define OP_19_XOP_RFID         18
 #define OP_19_XOP_RFI          50
index a9f753f..9754e68 100644 (file)
@@ -81,6 +81,12 @@ static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);
 #define MPP_BUFFER_ORDER       3
 #endif
 
+static int dynamic_mt_modes = 6;
+module_param(dynamic_mt_modes, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dynamic_mt_modes, "Set of allowed dynamic micro-threading modes: 0 (= none), 2, 4, or 6 (= 2 or 4)");
+static int target_smt_mode;
+module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
 
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
@@ -114,7 +120,7 @@ static bool kvmppc_ipi_thread(int cpu)
 
 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 {
-       int cpu = vcpu->cpu;
+       int cpu;
        wait_queue_head_t *wqp;
 
        wqp = kvm_arch_vcpu_wq(vcpu);
@@ -123,10 +129,11 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
                ++vcpu->stat.halt_wakeup;
        }
 
-       if (kvmppc_ipi_thread(cpu + vcpu->arch.ptid))
+       if (kvmppc_ipi_thread(vcpu->arch.thread_cpu))
                return;
 
        /* CPU points to the first thread of the core */
+       cpu = vcpu->cpu;
        if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
                smp_send_reschedule(cpu);
 }
@@ -164,6 +171,27 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
  * they should never fail.)
  */
 
+static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&vc->stoltb_lock, flags);
+       vc->preempt_tb = mftb();
+       spin_unlock_irqrestore(&vc->stoltb_lock, flags);
+}
+
+static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&vc->stoltb_lock, flags);
+       if (vc->preempt_tb != TB_NIL) {
+               vc->stolen_tb += mftb() - vc->preempt_tb;
+               vc->preempt_tb = TB_NIL;
+       }
+       spin_unlock_irqrestore(&vc->stoltb_lock, flags);
+}
+
 static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
 {
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
@@ -175,14 +203,9 @@ static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
         * vcpu, and once it is set to this vcpu, only this task
         * ever sets it to NULL.
         */
-       if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) {
-               spin_lock_irqsave(&vc->stoltb_lock, flags);
-               if (vc->preempt_tb != TB_NIL) {
-                       vc->stolen_tb += mftb() - vc->preempt_tb;
-                       vc->preempt_tb = TB_NIL;
-               }
-               spin_unlock_irqrestore(&vc->stoltb_lock, flags);
-       }
+       if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
+               kvmppc_core_end_stolen(vc);
+
        spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
        if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
            vcpu->arch.busy_preempt != TB_NIL) {
@@ -197,11 +220,9 @@ static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
        unsigned long flags;
 
-       if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) {
-               spin_lock_irqsave(&vc->stoltb_lock, flags);
-               vc->preempt_tb = mftb();
-               spin_unlock_irqrestore(&vc->stoltb_lock, flags);
-       }
+       if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
+               kvmppc_core_start_stolen(vc);
+
        spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
        if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
                vcpu->arch.busy_preempt = mftb();
@@ -214,12 +235,12 @@ static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
        kvmppc_end_cede(vcpu);
 }
 
-void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
+static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
 {
        vcpu->arch.pvr = pvr;
 }
 
-int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
+static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
 {
        unsigned long pcr = 0;
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
@@ -259,7 +280,7 @@ int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
        return 0;
 }
 
-void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
+static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
 {
        int r;
 
@@ -292,7 +313,7 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
               vcpu->arch.last_inst);
 }
 
-struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
+static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
 {
        int r;
        struct kvm_vcpu *v, *ret = NULL;
@@ -641,7 +662,8 @@ static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
 
        spin_lock(&vcore->lock);
        if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
-           vcore->vcore_state != VCORE_INACTIVE)
+           vcore->vcore_state != VCORE_INACTIVE &&
+           vcore->runner)
                target = vcore->runner;
        spin_unlock(&vcore->lock);
 
@@ -1431,6 +1453,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
        vcore->lpcr = kvm->arch.lpcr;
        vcore->first_vcpuid = core * threads_per_subcore;
        vcore->kvm = kvm;
+       INIT_LIST_HEAD(&vcore->preempt_list);
 
        vcore->mpp_buffer_is_valid = false;
 
@@ -1655,6 +1678,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
        spin_unlock(&vcore->lock);
        vcpu->arch.vcore = vcore;
        vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
+       vcpu->arch.thread_cpu = -1;
 
        vcpu->arch.cpu_type = KVM_CPU_3S_64;
        kvmppc_sanity_check(vcpu);
@@ -1749,6 +1773,7 @@ static int kvmppc_grab_hwthread(int cpu)
 
        /* Ensure the thread won't go into the kernel if it wakes */
        tpaca->kvm_hstate.kvm_vcpu = NULL;
+       tpaca->kvm_hstate.kvm_vcore = NULL;
        tpaca->kvm_hstate.napping = 0;
        smp_wmb();
        tpaca->kvm_hstate.hwthread_req = 1;
@@ -1780,26 +1805,32 @@ static void kvmppc_release_hwthread(int cpu)
        tpaca = &paca[cpu];
        tpaca->kvm_hstate.hwthread_req = 0;
        tpaca->kvm_hstate.kvm_vcpu = NULL;
+       tpaca->kvm_hstate.kvm_vcore = NULL;
+       tpaca->kvm_hstate.kvm_split_mode = NULL;
 }
 
-static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
+static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
 {
        int cpu;
        struct paca_struct *tpaca;
-       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       struct kvmppc_vcore *mvc = vc->master_vcore;
 
-       if (vcpu->arch.timer_running) {
-               hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
-               vcpu->arch.timer_running = 0;
+       cpu = vc->pcpu;
+       if (vcpu) {
+               if (vcpu->arch.timer_running) {
+                       hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+                       vcpu->arch.timer_running = 0;
+               }
+               cpu += vcpu->arch.ptid;
+               vcpu->cpu = mvc->pcpu;
+               vcpu->arch.thread_cpu = cpu;
        }
-       cpu = vc->pcpu + vcpu->arch.ptid;
        tpaca = &paca[cpu];
-       tpaca->kvm_hstate.kvm_vcore = vc;
-       tpaca->kvm_hstate.ptid = vcpu->arch.ptid;
-       vcpu->cpu = vc->pcpu;
-       /* Order stores to hstate.kvm_vcore etc. before store to kvm_vcpu */
-       smp_wmb();
        tpaca->kvm_hstate.kvm_vcpu = vcpu;
+       tpaca->kvm_hstate.ptid = cpu - mvc->pcpu;
+       /* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */
+       smp_wmb();
+       tpaca->kvm_hstate.kvm_vcore = mvc;
        if (cpu != smp_processor_id())
                kvmppc_ipi_thread(cpu);
 }
@@ -1812,12 +1843,12 @@ static void kvmppc_wait_for_nap(void)
        for (loops = 0; loops < 1000000; ++loops) {
                /*
                 * Check if all threads are finished.
-                * We set the vcpu pointer when starting a thread
+                * We set the vcore pointer when starting a thread
                 * and the thread clears it when finished, so we look
-                * for any threads that still have a non-NULL vcpu ptr.
+                * for any threads that still have a non-NULL vcore ptr.
                 */
                for (i = 1; i < threads_per_subcore; ++i)
-                       if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+                       if (paca[cpu + i].kvm_hstate.kvm_vcore)
                                break;
                if (i == threads_per_subcore) {
                        HMT_medium();
@@ -1827,7 +1858,7 @@ static void kvmppc_wait_for_nap(void)
        }
        HMT_medium();
        for (i = 1; i < threads_per_subcore; ++i)
-               if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+               if (paca[cpu + i].kvm_hstate.kvm_vcore)
                        pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
 }
 
@@ -1890,6 +1921,278 @@ static void kvmppc_start_restoring_l2_cache(const struct kvmppc_vcore *vc)
        mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_WHOLE_TABLE);
 }
 
+/*
+ * A list of virtual cores for each physical CPU.
+ * These are vcores that could run but their runner VCPU tasks are
+ * (or may be) preempted.
+ */
+struct preempted_vcore_list {
+       struct list_head        list;
+       spinlock_t              lock;
+};
+
+static DEFINE_PER_CPU(struct preempted_vcore_list, preempted_vcores);
+
+static void init_vcore_lists(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct preempted_vcore_list *lp = &per_cpu(preempted_vcores, cpu);
+               spin_lock_init(&lp->lock);
+               INIT_LIST_HEAD(&lp->list);
+       }
+}
+
+static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
+{
+       struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
+
+       vc->vcore_state = VCORE_PREEMPT;
+       vc->pcpu = smp_processor_id();
+       if (vc->num_threads < threads_per_subcore) {
+               spin_lock(&lp->lock);
+               list_add_tail(&vc->preempt_list, &lp->list);
+               spin_unlock(&lp->lock);
+       }
+
+       /* Start accumulating stolen time */
+       kvmppc_core_start_stolen(vc);
+}
+
+static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc)
+{
+       struct preempted_vcore_list *lp;
+
+       kvmppc_core_end_stolen(vc);
+       if (!list_empty(&vc->preempt_list)) {
+               lp = &per_cpu(preempted_vcores, vc->pcpu);
+               spin_lock(&lp->lock);
+               list_del_init(&vc->preempt_list);
+               spin_unlock(&lp->lock);
+       }
+       vc->vcore_state = VCORE_INACTIVE;
+}
+
+/*
+ * This stores information about the virtual cores currently
+ * assigned to a physical core.
+ */
+struct core_info {
+       int             n_subcores;
+       int             max_subcore_threads;
+       int             total_threads;
+       int             subcore_threads[MAX_SUBCORES];
+       struct kvm      *subcore_vm[MAX_SUBCORES];
+       struct list_head vcs[MAX_SUBCORES];
+};
+
+/*
+ * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
+ * respectively in 2-way micro-threading (split-core) mode.
+ */
+static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
+
+static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
+{
+       int sub;
+
+       memset(cip, 0, sizeof(*cip));
+       cip->n_subcores = 1;
+       cip->max_subcore_threads = vc->num_threads;
+       cip->total_threads = vc->num_threads;
+       cip->subcore_threads[0] = vc->num_threads;
+       cip->subcore_vm[0] = vc->kvm;
+       for (sub = 0; sub < MAX_SUBCORES; ++sub)
+               INIT_LIST_HEAD(&cip->vcs[sub]);
+       list_add_tail(&vc->preempt_list, &cip->vcs[0]);
+}
+
+static bool subcore_config_ok(int n_subcores, int n_threads)
+{
+       /* Can only dynamically split if unsplit to begin with */
+       if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
+               return false;
+       if (n_subcores > MAX_SUBCORES)
+               return false;
+       if (n_subcores > 1) {
+               if (!(dynamic_mt_modes & 2))
+                       n_subcores = 4;
+               if (n_subcores > 2 && !(dynamic_mt_modes & 4))
+                       return false;
+       }
+
+       return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS;
+}
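
A standalone sketch of the sizing rule subcore_config_ok enforces, assuming the defaults introduced here (MAX_SMT_THREADS = 8, MAX_SUBCORES = 4, dynamic_mt_modes = 6) and a core that starts out unsplit; illustrative only:

    #include <assert.h>

    static unsigned int roundup_pow_of_two(unsigned int x)
    {
            unsigned int r = 1;

            while (r < x)
                    r <<= 1;
            return r;
    }

    static int config_ok(int n_subcores, int n_threads)
    {
            if (n_subcores > 4)                     /* MAX_SUBCORES */
                    return 0;
            return n_subcores * roundup_pow_of_two(n_threads) <= 8;
    }

    int main(void)
    {
            assert(config_ok(2, 3));        /* 2 subcores x up to 4 threads */
            assert(config_ok(4, 2));        /* 4 subcores x 2 threads       */
            assert(!config_ok(2, 5));       /* 5 rounds up to 8: too big    */
            return 0;
    }
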
+
+static void init_master_vcore(struct kvmppc_vcore *vc)
+{
+       vc->master_vcore = vc;
+       vc->entry_exit_map = 0;
+       vc->in_guest = 0;
+       vc->napping_threads = 0;
+       vc->conferring_threads = 0;
+}
+
+/*
+ * See if the existing subcores can be split into 3 (or fewer) subcores
+ * of at most two threads each, so we can fit in another vcore.  This
+ * assumes there are at most two subcores and at most 6 threads in total.
+ */
+static bool can_split_piggybacked_subcores(struct core_info *cip)
+{
+       int sub, new_sub;
+       int large_sub = -1;
+       int thr;
+       int n_subcores = cip->n_subcores;
+       struct kvmppc_vcore *vc, *vcnext;
+       struct kvmppc_vcore *master_vc = NULL;
+
+       for (sub = 0; sub < cip->n_subcores; ++sub) {
+               if (cip->subcore_threads[sub] <= 2)
+                       continue;
+               if (large_sub >= 0)
+                       return false;
+               large_sub = sub;
+               vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore,
+                                     preempt_list);
+               if (vc->num_threads > 2)
+                       return false;
+               n_subcores += (cip->subcore_threads[sub] - 1) >> 1;
+       }
+       if (n_subcores > 3 || large_sub < 0)
+               return false;
+
+       /*
+        * Seems feasible, so go through and move vcores to new subcores.
+        * Note that when we have two or more vcores in one subcore,
+        * all those vcores must have only one thread each.
+        */
+       new_sub = cip->n_subcores;
+       thr = 0;
+       sub = large_sub;
+       list_for_each_entry_safe(vc, vcnext, &cip->vcs[sub], preempt_list) {
+               if (thr >= 2) {
+                       list_del(&vc->preempt_list);
+                       list_add_tail(&vc->preempt_list, &cip->vcs[new_sub]);
+                       /* vc->num_threads must be 1 */
+                       if (++cip->subcore_threads[new_sub] == 1) {
+                               cip->subcore_vm[new_sub] = vc->kvm;
+                               init_master_vcore(vc);
+                               master_vc = vc;
+                               ++cip->n_subcores;
+                       } else {
+                               vc->master_vcore = master_vc;
+                               ++new_sub;
+                       }
+               }
+               thr += vc->num_threads;
+       }
+       cip->subcore_threads[large_sub] = 2;
+       cip->max_subcore_threads = 2;
+
+       return true;
+}
+
+static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
+{
+       int n_threads = vc->num_threads;
+       int sub;
+
+       if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+               return false;
+
+       if (n_threads < cip->max_subcore_threads)
+               n_threads = cip->max_subcore_threads;
+       if (subcore_config_ok(cip->n_subcores + 1, n_threads)) {
+               cip->max_subcore_threads = n_threads;
+       } else if (cip->n_subcores <= 2 && cip->total_threads <= 6 &&
+                  vc->num_threads <= 2) {
+               /*
+                * We may be able to fit another subcore in by
+                * splitting an existing subcore with 3 or 4
+                * threads into two 2-thread subcores, or one
+                * with 5 or 6 threads into three subcores.
+                * We can only do this if those subcores have
+                * piggybacked virtual cores.
+                */
+               if (!can_split_piggybacked_subcores(cip))
+                       return false;
+       } else {
+               return false;
+       }
+
+       sub = cip->n_subcores;
+       ++cip->n_subcores;
+       cip->total_threads += vc->num_threads;
+       cip->subcore_threads[sub] = vc->num_threads;
+       cip->subcore_vm[sub] = vc->kvm;
+       init_master_vcore(vc);
+       list_del(&vc->preempt_list);
+       list_add_tail(&vc->preempt_list, &cip->vcs[sub]);
+
+       return true;
+}
+
+static bool can_piggyback_subcore(struct kvmppc_vcore *pvc,
+                                 struct core_info *cip, int sub)
+{
+       struct kvmppc_vcore *vc;
+       int n_thr;
+
+       vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore,
+                             preempt_list);
+
+       /* require same VM and same per-core reg values */
+       if (pvc->kvm != vc->kvm ||
+           pvc->tb_offset != vc->tb_offset ||
+           pvc->pcr != vc->pcr ||
+           pvc->lpcr != vc->lpcr)
+               return false;
+
+       /* P8 guest with > 1 thread per core would see wrong TIR value */
+       if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+           (vc->num_threads > 1 || pvc->num_threads > 1))
+               return false;
+
+       n_thr = cip->subcore_threads[sub] + pvc->num_threads;
+       if (n_thr > cip->max_subcore_threads) {
+               if (!subcore_config_ok(cip->n_subcores, n_thr))
+                       return false;
+               cip->max_subcore_threads = n_thr;
+       }
+
+       cip->total_threads += pvc->num_threads;
+       cip->subcore_threads[sub] = n_thr;
+       pvc->master_vcore = vc;
+       list_del(&pvc->preempt_list);
+       list_add_tail(&pvc->preempt_list, &cip->vcs[sub]);
+
+       return true;
+}
+
+/*
+ * Work out whether it is possible to piggyback the execution of
+ * vcore *pvc onto the execution of the other vcores described in *cip.
+ */
+static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
+                         int target_threads)
+{
+       int sub;
+
+       if (cip->total_threads + pvc->num_threads > target_threads)
+               return false;
+       for (sub = 0; sub < cip->n_subcores; ++sub)
+               if (cip->subcore_threads[sub] &&
+                   can_piggyback_subcore(pvc, cip, sub))
+                       return true;
+
+       if (can_dynamic_split(pvc, cip))
+               return true;
+
+       return false;
+}
+
 static void prepare_threads(struct kvmppc_vcore *vc)
 {
        struct kvm_vcpu *vcpu, *vnext;
@@ -1909,12 +2212,45 @@ static void prepare_threads(struct kvmppc_vcore *vc)
        }
 }
 
-static void post_guest_process(struct kvmppc_vcore *vc)
+static void collect_piggybacks(struct core_info *cip, int target_threads)
+{
+       struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
+       struct kvmppc_vcore *pvc, *vcnext;
+
+       spin_lock(&lp->lock);
+       list_for_each_entry_safe(pvc, vcnext, &lp->list, preempt_list) {
+               if (!spin_trylock(&pvc->lock))
+                       continue;
+               prepare_threads(pvc);
+               if (!pvc->n_runnable) {
+                       list_del_init(&pvc->preempt_list);
+                       if (pvc->runner == NULL) {
+                               pvc->vcore_state = VCORE_INACTIVE;
+                               kvmppc_core_end_stolen(pvc);
+                       }
+                       spin_unlock(&pvc->lock);
+                       continue;
+               }
+               if (!can_piggyback(pvc, cip, target_threads)) {
+                       spin_unlock(&pvc->lock);
+                       continue;
+               }
+               kvmppc_core_end_stolen(pvc);
+               pvc->vcore_state = VCORE_PIGGYBACK;
+               if (cip->total_threads >= target_threads)
+                       break;
+       }
+       spin_unlock(&lp->lock);
+}
+
+static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 {
+       int still_running = 0;
        u64 now;
        long ret;
        struct kvm_vcpu *vcpu, *vnext;
 
+       spin_lock(&vc->lock);
        now = get_tb();
        list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
                                 arch.run_list) {
@@ -1933,17 +2269,36 @@ static void post_guest_process(struct kvmppc_vcore *vc)
                vcpu->arch.ret = ret;
                vcpu->arch.trap = 0;
 
-               if (vcpu->arch.ceded) {
-                       if (!is_kvmppc_resume_guest(ret))
-                               kvmppc_end_cede(vcpu);
-                       else
+               if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
+                       if (vcpu->arch.pending_exceptions)
+                               kvmppc_core_prepare_to_enter(vcpu);
+                       if (vcpu->arch.ceded)
                                kvmppc_set_timer(vcpu);
-               }
-               if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
+                       else
+                               ++still_running;
+               } else {
                        kvmppc_remove_runnable(vc, vcpu);
                        wake_up(&vcpu->arch.cpu_run);
                }
        }
+       list_del_init(&vc->preempt_list);
+       if (!is_master) {
+               if (still_running > 0) {
+                       kvmppc_vcore_preempt(vc);
+               } else if (vc->runner) {
+                       vc->vcore_state = VCORE_PREEMPT;
+                       kvmppc_core_start_stolen(vc);
+               } else {
+                       vc->vcore_state = VCORE_INACTIVE;
+               }
+               if (vc->n_runnable > 0 && vc->runner == NULL) {
+                       /* make sure there's a candidate runner awake */
+                       vcpu = list_first_entry(&vc->runnable_threads,
+                                               struct kvm_vcpu, arch.run_list);
+                       wake_up(&vcpu->arch.cpu_run);
+               }
+       }
+       spin_unlock(&vc->lock);
 }
 
 /*
@@ -1955,6 +2310,15 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        struct kvm_vcpu *vcpu, *vnext;
        int i;
        int srcu_idx;
+       struct core_info core_info;
+       struct kvmppc_vcore *pvc, *vcnext;
+       struct kvm_split_mode split_info, *sip;
+       int split, subcore_size, active;
+       int sub;
+       bool thr0_done;
+       unsigned long cmd_bit, stat_bit;
+       int pcpu, thr;
+       int target_threads;
 
        /*
         * Remove from the list any threads that have a signal pending
@@ -1969,11 +2333,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        /*
         * Initialize *vc.
         */
-       vc->entry_exit_map = 0;
+       init_master_vcore(vc);
        vc->preempt_tb = TB_NIL;
-       vc->in_guest = 0;
-       vc->napping_threads = 0;
-       vc->conferring_threads = 0;
 
        /*
         * Make sure we are running on primary threads, and that secondary
@@ -1991,24 +2352,120 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                goto out;
        }
 
+       /*
+        * See if we could run any other vcores on the physical core
+        * along with this one.
+        */
+       init_core_info(&core_info, vc);
+       pcpu = smp_processor_id();
+       target_threads = threads_per_subcore;
+       if (target_smt_mode && target_smt_mode < target_threads)
+               target_threads = target_smt_mode;
+       if (vc->num_threads < target_threads)
+               collect_piggybacks(&core_info, target_threads);
+
+       /* Decide on micro-threading (split-core) mode */
+       subcore_size = threads_per_subcore;
+       cmd_bit = stat_bit = 0;
+       split = core_info.n_subcores;
+       sip = NULL;
+       if (split > 1) {
+               /* threads_per_subcore must be MAX_SMT_THREADS (8) here */
+               if (split == 2 && (dynamic_mt_modes & 2)) {
+                       cmd_bit = HID0_POWER8_1TO2LPAR;
+                       stat_bit = HID0_POWER8_2LPARMODE;
+               } else {
+                       split = 4;
+                       cmd_bit = HID0_POWER8_1TO4LPAR;
+                       stat_bit = HID0_POWER8_4LPARMODE;
+               }
+               subcore_size = MAX_SMT_THREADS / split;
+               sip = &split_info;
+               memset(&split_info, 0, sizeof(split_info));
+               split_info.rpr = mfspr(SPRN_RPR);
+               split_info.pmmar = mfspr(SPRN_PMMAR);
+               split_info.ldbar = mfspr(SPRN_LDBAR);
+               split_info.subcore_size = subcore_size;
+               for (sub = 0; sub < core_info.n_subcores; ++sub)
+                       split_info.master_vcs[sub] =
+                               list_first_entry(&core_info.vcs[sub],
+                                       struct kvmppc_vcore, preempt_list);
+               /* order writes to split_info before kvm_split_mode pointer */
+               smp_wmb();
+       }
+       pcpu = smp_processor_id();
+       for (thr = 0; thr < threads_per_subcore; ++thr)
+               paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
+
+       /* Initiate micro-threading (split-core) if required */
+       if (cmd_bit) {
+               unsigned long hid0 = mfspr(SPRN_HID0);
+
+               hid0 |= cmd_bit | HID0_POWER8_DYNLPARDIS;
+               mb();
+               mtspr(SPRN_HID0, hid0);
+               isync();
+               for (;;) {
+                       hid0 = mfspr(SPRN_HID0);
+                       if (hid0 & stat_bit)
+                               break;
+                       cpu_relax();
+               }
+       }
 
-       vc->pcpu = smp_processor_id();
-       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
-               kvmppc_start_thread(vcpu);
-               kvmppc_create_dtl_entry(vcpu, vc);
-               trace_kvm_guest_enter(vcpu);
+       /* Start all the threads */
+       active = 0;
+       for (sub = 0; sub < core_info.n_subcores; ++sub) {
+               thr = subcore_thread_map[sub];
+               thr0_done = false;
+               active |= 1 << thr;
+               list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) {
+                       pvc->pcpu = pcpu + thr;
+                       list_for_each_entry(vcpu, &pvc->runnable_threads,
+                                           arch.run_list) {
+                               kvmppc_start_thread(vcpu, pvc);
+                               kvmppc_create_dtl_entry(vcpu, pvc);
+                               trace_kvm_guest_enter(vcpu);
+                               if (!vcpu->arch.ptid)
+                                       thr0_done = true;
+                               active |= 1 << (thr + vcpu->arch.ptid);
+                       }
+                       /*
+                        * We need to start the first thread of each subcore
+                        * even if it doesn't have a vcpu.
+                        */
+                       if (pvc->master_vcore == pvc && !thr0_done)
+                               kvmppc_start_thread(NULL, pvc);
+                       thr += pvc->num_threads;
+               }
        }
 
-       /* Set this explicitly in case thread 0 doesn't have a vcpu */
-       get_paca()->kvm_hstate.kvm_vcore = vc;
-       get_paca()->kvm_hstate.ptid = 0;
+       /*
+        * Ensure that split_info.do_nap is set after setting
+        * the vcore pointer in the PACA of the secondaries.
+        */
+       smp_mb();
+       if (cmd_bit)
+               split_info.do_nap = 1;  /* ask secondaries to nap when done */
+
+       /*
+        * When doing micro-threading, poke the inactive threads as well.
+        * This gets them to the nap instruction after kvm_do_nap,
+        * which reduces the time taken to unsplit later.
+        */
+       if (split > 1)
+               for (thr = 1; thr < threads_per_subcore; ++thr)
+                       if (!(active & (1 << thr)))
+                               kvmppc_ipi_thread(pcpu + thr);
 
        vc->vcore_state = VCORE_RUNNING;
        preempt_disable();
 
        trace_kvmppc_run_core(vc, 0);
 
-       spin_unlock(&vc->lock);
+       for (sub = 0; sub < core_info.n_subcores; ++sub)
+               list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list)
+                       spin_unlock(&pvc->lock);
 
        kvm_guest_enter();
 
@@ -2019,32 +2476,58 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 
        __kvmppc_vcore_entry();
 
-       spin_lock(&vc->lock);
-
        if (vc->mpp_buffer)
                kvmppc_start_saving_l2_cache(vc);
 
-       /* disable sending of IPIs on virtual external irqs */
-       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
-               vcpu->cpu = -1;
-       /* wait for secondary threads to finish writing their state to memory */
-       kvmppc_wait_for_nap();
-       for (i = 0; i < threads_per_subcore; ++i)
-               kvmppc_release_hwthread(vc->pcpu + i);
+       srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
+
+       spin_lock(&vc->lock);
        /* prevent other vcpu threads from doing kvmppc_start_thread() now */
        vc->vcore_state = VCORE_EXITING;
-       spin_unlock(&vc->lock);
 
-       srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
+       /* wait for secondary threads to finish writing their state to memory */
+       kvmppc_wait_for_nap();
+
+       /* Return to whole-core mode if we split the core earlier */
+       if (split > 1) {
+               unsigned long hid0 = mfspr(SPRN_HID0);
+               unsigned long loops = 0;
+
+               hid0 &= ~HID0_POWER8_DYNLPARDIS;
+               stat_bit = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE;
+               mb();
+               mtspr(SPRN_HID0, hid0);
+               isync();
+               for (;;) {
+                       hid0 = mfspr(SPRN_HID0);
+                       if (!(hid0 & stat_bit))
+                               break;
+                       cpu_relax();
+                       ++loops;
+               }
+               split_info.do_nap = 0;
+       }
+
+       /* Let secondaries go back to the offline loop */
+       for (i = 0; i < threads_per_subcore; ++i) {
+               kvmppc_release_hwthread(pcpu + i);
+               if (sip && sip->napped[i])
+                       kvmppc_ipi_thread(pcpu + i);
+       }
+
+       spin_unlock(&vc->lock);
 
        /* make sure updates to secondary vcpu structs are visible now */
        smp_mb();
        kvm_guest_exit();
 
-       preempt_enable();
+       for (sub = 0; sub < core_info.n_subcores; ++sub)
+               list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub],
+                                        preempt_list)
+                       post_guest_process(pvc, pvc == vc);
 
        spin_lock(&vc->lock);
-       post_guest_process(vc);
+       preempt_enable();
 
  out:
        vc->vcore_state = VCORE_INACTIVE;
@@ -2055,13 +2538,17 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
  * Wait for some other vcpu thread to execute us, and
  * wake us up when we need to handle something in the host.
  */
-static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
+static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
+                                struct kvm_vcpu *vcpu, int wait_state)
 {
        DEFINE_WAIT(wait);
 
        prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
-       if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
+       if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+               spin_unlock(&vc->lock);
                schedule();
+               spin_lock(&vc->lock);
+       }
        finish_wait(&vcpu->arch.cpu_run, &wait);
 }
 
@@ -2137,9 +2624,21 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
         * this thread straight away and have it join in.
         */
        if (!signal_pending(current)) {
-               if (vc->vcore_state == VCORE_RUNNING && !VCORE_IS_EXITING(vc)) {
+               if (vc->vcore_state == VCORE_PIGGYBACK) {
+                       struct kvmppc_vcore *mvc = vc->master_vcore;
+                       if (spin_trylock(&mvc->lock)) {
+                               if (mvc->vcore_state == VCORE_RUNNING &&
+                                   !VCORE_IS_EXITING(mvc)) {
+                                       kvmppc_create_dtl_entry(vcpu, vc);
+                                       kvmppc_start_thread(vcpu, vc);
+                                       trace_kvm_guest_enter(vcpu);
+                               }
+                               spin_unlock(&mvc->lock);
+                       }
+               } else if (vc->vcore_state == VCORE_RUNNING &&
+                          !VCORE_IS_EXITING(vc)) {
                        kvmppc_create_dtl_entry(vcpu, vc);
-                       kvmppc_start_thread(vcpu);
+                       kvmppc_start_thread(vcpu, vc);
                        trace_kvm_guest_enter(vcpu);
                } else if (vc->vcore_state == VCORE_SLEEPING) {
                        wake_up(&vc->wq);
@@ -2149,10 +2648,11 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
        while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
               !signal_pending(current)) {
+               if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
+                       kvmppc_vcore_end_preempt(vc);
+
                if (vc->vcore_state != VCORE_INACTIVE) {
-                       spin_unlock(&vc->lock);
-                       kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
-                       spin_lock(&vc->lock);
+                       kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
                        continue;
                }
                list_for_each_entry_safe(v, vn, &vc->runnable_threads,
@@ -2179,10 +2679,11 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                if (n_ceded == vc->n_runnable) {
                        kvmppc_vcore_blocked(vc);
                } else if (need_resched()) {
-                       vc->vcore_state = VCORE_PREEMPT;
+                       kvmppc_vcore_preempt(vc);
                        /* Let something else run */
                        cond_resched_lock(&vc->lock);
-                       vc->vcore_state = VCORE_INACTIVE;
+                       if (vc->vcore_state == VCORE_PREEMPT)
+                               kvmppc_vcore_end_preempt(vc);
                } else {
                        kvmppc_run_core(vc);
                }
@@ -2191,11 +2692,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
        while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
               (vc->vcore_state == VCORE_RUNNING ||
-               vc->vcore_state == VCORE_EXITING)) {
-               spin_unlock(&vc->lock);
-               kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
-               spin_lock(&vc->lock);
-       }
+               vc->vcore_state == VCORE_EXITING))
+               kvmppc_wait_for_exec(vc, vcpu, TASK_UNINTERRUPTIBLE);
 
        if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
                kvmppc_remove_runnable(vc, vcpu);
@@ -2755,6 +3253,8 @@ static int kvmppc_book3s_init_hv(void)
 
        init_default_hcalls();
 
+       init_vcore_lists();
+
        r = kvmppc_mmu_hv_init();
        return r;
 }
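
A minimal stand-alone sketch of the split-mode decision made in kvmppc_run_core() above (the choice between 2-way and 4-way micro-threading and the resulting subcore size). choose_split() is an illustrative helper, not a kernel function; MAX_SMT_THREADS is taken as 8 per the comment in the hunk, and dynamic_mt_modes is passed in as a plain bitmask.

#include <stdio.h>

#define TOY_MAX_SMT_THREADS	8	/* POWER8: 8 threads per core, per the diff comment */

static int choose_split(int n_subcores, int dynamic_mt_modes, int *subcore_size)
{
	int split = n_subcores;

	if (split > 1) {
		/* 2-way split only if exactly two subcores and mode 2 is allowed */
		if (!(split == 2 && (dynamic_mt_modes & 2)))
			split = 4;
		*subcore_size = TOY_MAX_SMT_THREADS / split;
	} else {
		*subcore_size = TOY_MAX_SMT_THREADS;
	}
	return split;
}

int main(void)
{
	int size, split;

	split = choose_split(3, 6, &size);
	printf("3 subcores -> %d-way split, %d threads/subcore\n", split, size);
	split = choose_split(2, 6, &size);
	printf("2 subcores -> %d-way split, %d threads/subcore\n", split, size);
	return 0;
}
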
index ed2589d..fd7006b 100644 (file)
@@ -110,14 +110,15 @@ void __init kvm_cma_reserve(void)
 long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
                            unsigned int yield_count)
 {
-       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
+       int ptid = local_paca->kvm_hstate.ptid;
        int threads_running;
        int threads_ceded;
        int threads_conferring;
        u64 stop = get_tb() + 10 * tb_ticks_per_usec;
        int rv = H_SUCCESS; /* => don't yield */
 
-       set_bit(vcpu->arch.ptid, &vc->conferring_threads);
+       set_bit(ptid, &vc->conferring_threads);
        while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) {
                threads_running = VCORE_ENTRY_MAP(vc);
                threads_ceded = vc->napping_threads;
@@ -127,7 +128,7 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
                        break;
                }
        }
-       clear_bit(vcpu->arch.ptid, &vc->conferring_threads);
+       clear_bit(ptid, &vc->conferring_threads);
        return rv;
 }
 
@@ -238,7 +239,8 @@ void kvmhv_commence_exit(int trap)
 {
        struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
        int ptid = local_paca->kvm_hstate.ptid;
-       int me, ee;
+       struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode;
+       int me, ee, i;
 
        /* Set our bit in the threads-exiting-guest map in the 0xff00
           bits of vcore->entry_exit_map */
@@ -258,4 +260,26 @@ void kvmhv_commence_exit(int trap)
         */
        if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER)
                kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid));
+
+       /*
+        * If we are doing dynamic micro-threading, interrupt the other
+        * subcores to pull them out of their guests too.
+        */
+       if (!sip)
+               return;
+
+       for (i = 0; i < MAX_SUBCORES; ++i) {
+               vc = sip->master_vcs[i];
+               if (!vc)
+                       break;
+               do {
+                       ee = vc->entry_exit_map;
+                       /* Already asked to exit? */
+                       if ((ee >> 8) != 0)
+                               break;
+               } while (cmpxchg(&vc->entry_exit_map, ee,
+                                ee | VCORE_EXIT_REQ) != ee);
+               if ((ee >> 8) == 0)
+                       kvmhv_interrupt_vcore(vc, ee);
+       }
 }
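
The cmpxchg loop at the end of kvmhv_commence_exit() above sets an exit-request flag in entry_exit_map only if no thread has already asked the vcore to exit. Below is a stand-alone sketch of the same pattern using C11 atomics; the layout (entry map in the low byte, exit map in the 0xff00 byte) follows the comment in the hunk, while the TOY_VCORE_EXIT_REQ value and the helper name are assumptions for illustration only.

#include <stdatomic.h>
#include <stdio.h>

#define TOY_VCORE_EXIT_REQ	0x10000

static _Atomic unsigned int entry_exit_map;

/* Returns 1 if this caller won the race and should interrupt the subcore. */
static int toy_request_exit(void)
{
	unsigned int ee = atomic_load(&entry_exit_map);

	do {
		if (ee >> 8)	/* someone already asked this vcore to exit */
			return 0;
	} while (!atomic_compare_exchange_weak(&entry_exit_map, &ee,
					       ee | TOY_VCORE_EXIT_REQ));
	return 1;
}

int main(void)
{
	atomic_store(&entry_exit_map, 0x03);	/* two threads have entered */
	printf("first request wins:       %d\n", toy_request_exit());
	printf("second request is a no-op: %d\n", toy_request_exit());
	return 0;
}
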
index b027a89..c1df9bb 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/kvm_host.h>
 #include <linux/hugetlb.h>
 #include <linux/module.h>
+#include <linux/log2.h>
 
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
@@ -97,25 +98,52 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
 }
 EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
 
+/* Update the changed page order field of an rmap entry */
+void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize)
+{
+       unsigned long order;
+
+       if (!psize)
+               return;
+       order = ilog2(psize);
+       order <<= KVMPPC_RMAP_CHG_SHIFT;
+       if (order > (*rmap & KVMPPC_RMAP_CHG_ORDER))
+               *rmap = (*rmap & ~KVMPPC_RMAP_CHG_ORDER) | order;
+}
+EXPORT_SYMBOL_GPL(kvmppc_update_rmap_change);
+
+/* Returns a pointer to the revmap entry for the page mapped by a HPTE */
+static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v,
+                                     unsigned long hpte_gr)
+{
+       struct kvm_memory_slot *memslot;
+       unsigned long *rmap;
+       unsigned long gfn;
+
+       gfn = hpte_rpn(hpte_gr, hpte_page_size(hpte_v, hpte_gr));
+       memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+       if (!memslot)
+               return NULL;
+
+       rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
+       return rmap;
+}
+
 /* Remove this HPTE from the chain for a real page */
 static void remove_revmap_chain(struct kvm *kvm, long pte_index,
                                struct revmap_entry *rev,
                                unsigned long hpte_v, unsigned long hpte_r)
 {
        struct revmap_entry *next, *prev;
-       unsigned long gfn, ptel, head;
-       struct kvm_memory_slot *memslot;
+       unsigned long ptel, head;
        unsigned long *rmap;
        unsigned long rcbits;
 
        rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
        ptel = rev->guest_rpte |= rcbits;
-       gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
-       memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
-       if (!memslot)
+       rmap = revmap_for_hpte(kvm, hpte_v, ptel);
+       if (!rmap)
                return;
-
-       rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
        lock_rmap(rmap);
 
        head = *rmap & KVMPPC_RMAP_INDEX;
@@ -131,6 +159,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
                        *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
        }
        *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+       if (rcbits & HPTE_R_C)
+               kvmppc_update_rmap_change(rmap, hpte_page_size(hpte_v, hpte_r));
        unlock_rmap(rmap);
 }
 
@@ -421,14 +451,20 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
        rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
        v = pte & ~HPTE_V_HVLOCK;
        if (v & HPTE_V_VALID) {
-               u64 pte1;
-
-               pte1 = be64_to_cpu(hpte[1]);
                hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);
-               rb = compute_tlbie_rb(v, pte1, pte_index);
+               rb = compute_tlbie_rb(v, be64_to_cpu(hpte[1]), pte_index);
                do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
-               /* Read PTE low word after tlbie to get final R/C values */
-               remove_revmap_chain(kvm, pte_index, rev, v, pte1);
+               /*
+                * The reference (R) and change (C) bits in a HPT
+                * entry can be set by hardware at any time up until
+                * the HPTE is invalidated and the TLB invalidation
+                * sequence has completed.  This means that when
+                * removing a HPTE, we need to re-read the HPTE after
+                * the invalidation sequence has completed in order to
+                * obtain reliable values of R and C.
+                */
+               remove_revmap_chain(kvm, pte_index, rev, v,
+                                   be64_to_cpu(hpte[1]));
        }
        r = rev->guest_rpte & ~HPTE_GR_RESERVED;
        note_hpte_modification(kvm, rev);
@@ -655,6 +691,105 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
        return H_SUCCESS;
 }
 
+long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
+                       unsigned long pte_index)
+{
+       struct kvm *kvm = vcpu->kvm;
+       __be64 *hpte;
+       unsigned long v, r, gr;
+       struct revmap_entry *rev;
+       unsigned long *rmap;
+       long ret = H_NOT_FOUND;
+
+       if (pte_index >= kvm->arch.hpt_npte)
+               return H_PARAMETER;
+
+       rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+       hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+       while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+               cpu_relax();
+       v = be64_to_cpu(hpte[0]);
+       r = be64_to_cpu(hpte[1]);
+       if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+               goto out;
+
+       gr = rev->guest_rpte;
+       if (rev->guest_rpte & HPTE_R_R) {
+               rev->guest_rpte &= ~HPTE_R_R;
+               note_hpte_modification(kvm, rev);
+       }
+       if (v & HPTE_V_VALID) {
+               gr |= r & (HPTE_R_R | HPTE_R_C);
+               if (r & HPTE_R_R) {
+                       kvmppc_clear_ref_hpte(kvm, hpte, pte_index);
+                       rmap = revmap_for_hpte(kvm, v, gr);
+                       if (rmap) {
+                               lock_rmap(rmap);
+                               *rmap |= KVMPPC_RMAP_REFERENCED;
+                               unlock_rmap(rmap);
+                       }
+               }
+       }
+       vcpu->arch.gpr[4] = gr;
+       ret = H_SUCCESS;
+ out:
+       unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
+       return ret;
+}
+
+long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
+                       unsigned long pte_index)
+{
+       struct kvm *kvm = vcpu->kvm;
+       __be64 *hpte;
+       unsigned long v, r, gr;
+       struct revmap_entry *rev;
+       unsigned long *rmap;
+       long ret = H_NOT_FOUND;
+
+       if (pte_index >= kvm->arch.hpt_npte)
+               return H_PARAMETER;
+
+       rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+       hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+       while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+               cpu_relax();
+       v = be64_to_cpu(hpte[0]);
+       r = be64_to_cpu(hpte[1]);
+       if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+               goto out;
+
+       gr = rev->guest_rpte;
+       if (gr & HPTE_R_C) {
+               rev->guest_rpte &= ~HPTE_R_C;
+               note_hpte_modification(kvm, rev);
+       }
+       if (v & HPTE_V_VALID) {
+               /* need to make it temporarily absent so C is stable */
+               hpte[0] |= cpu_to_be64(HPTE_V_ABSENT);
+               kvmppc_invalidate_hpte(kvm, hpte, pte_index);
+               r = be64_to_cpu(hpte[1]);
+               gr |= r & (HPTE_R_R | HPTE_R_C);
+               if (r & HPTE_R_C) {
+                       unsigned long psize = hpte_page_size(v, r);
+                       hpte[1] = cpu_to_be64(r & ~HPTE_R_C);
+                       eieio();
+                       rmap = revmap_for_hpte(kvm, v, gr);
+                       if (rmap) {
+                               lock_rmap(rmap);
+                               *rmap |= KVMPPC_RMAP_CHANGED;
+                               kvmppc_update_rmap_change(rmap, psize);
+                               unlock_rmap(rmap);
+                       }
+               }
+       }
+       vcpu->arch.gpr[4] = gr;
+       ret = H_SUCCESS;
+ out:
+       unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
+       return ret;
+}
+
 void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
                        unsigned long pte_index)
 {
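
kvmppc_update_rmap_change() in the hunk above records, per rmap word, the largest page order whose change (C) bit has been transferred. A toy stand-alone version of that max-order bookkeeping follows; the field position (TOY_CHG_SHIFT/TOY_CHG_ORDER) and the helper names are assumptions for illustration, the real KVMPPC_RMAP_CHG_* values are defined in the powerpc headers.

#include <stdio.h>

#define TOY_CHG_SHIFT	24
#define TOY_CHG_ORDER	(0x3fUL << TOY_CHG_SHIFT)

static unsigned long ilog2_ul(unsigned long x)
{
	unsigned long r = 0;

	while (x >>= 1)
		r++;
	return r;
}

static void toy_update_rmap_change(unsigned long *rmap, unsigned long psize)
{
	unsigned long order;

	if (!psize)
		return;
	order = ilog2_ul(psize) << TOY_CHG_SHIFT;
	if (order > (*rmap & TOY_CHG_ORDER))
		*rmap = (*rmap & ~TOY_CHG_ORDER) | order;
}

int main(void)
{
	unsigned long rmap = 0;

	toy_update_rmap_change(&rmap, 4096);		/* order 12 */
	toy_update_rmap_change(&rmap, 1UL << 24);	/* order 24: larger, recorded */
	toy_update_rmap_change(&rmap, 65536);		/* order 16: smaller, ignored */
	printf("largest changed order: %lu\n",
	       (rmap & TOY_CHG_ORDER) >> TOY_CHG_SHIFT);
	return 0;
}
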
index 00e45b6..24f5807 100644 (file)
@@ -67,14 +67,12 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
        }
 
        /* Check if the core is loaded, if not, too hard */
-       cpu = vcpu->cpu;
+       cpu = vcpu->arch.thread_cpu;
        if (cpu < 0 || cpu >= nr_cpu_ids) {
                this_icp->rm_action |= XICS_RM_KICK_VCPU;
                this_icp->rm_kick_target = vcpu;
                return;
        }
-       /* In SMT cpu will always point to thread 0, we adjust it */
-       cpu += vcpu->arch.ptid;
 
        smp_mb();
        kvmhv_rm_send_ipi(cpu);
index faa86e9..2273dca 100644 (file)
@@ -128,6 +128,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        subf    r4, r4, r3
        mtspr   SPRN_DEC, r4
 
+       /* hwthread_req may have got set by cede or no vcpu, so clear it */
+       li      r0, 0
+       stb     r0, HSTATE_HWTHREAD_REQ(r13)
+
        /*
         * For external and machine check interrupts, we need
         * to call the Linux handler to process the interrupt.
@@ -215,7 +219,6 @@ kvm_novcpu_wakeup:
        ld      r5, HSTATE_KVM_VCORE(r13)
        li      r0, 0
        stb     r0, HSTATE_NAPPING(r13)
-       stb     r0, HSTATE_HWTHREAD_REQ(r13)
 
        /* check the wake reason */
        bl      kvmppc_check_wake_reason
@@ -315,10 +318,10 @@ kvm_start_guest:
        cmpdi   r3, 0
        bge     kvm_no_guest
 
-       /* get vcpu pointer, NULL if we have no vcpu to run */
-       ld      r4,HSTATE_KVM_VCPU(r13)
-       cmpdi   r4,0
-       /* if we have no vcpu to run, go back to sleep */
+       /* get vcore pointer, NULL if we have nothing to run */
+       ld      r5,HSTATE_KVM_VCORE(r13)
+       cmpdi   r5,0
+       /* if we have no vcore to run, go back to sleep */
        beq     kvm_no_guest
 
 kvm_secondary_got_guest:
@@ -327,21 +330,42 @@ kvm_secondary_got_guest:
        ld      r6, PACA_DSCR_DEFAULT(r13)
        std     r6, HSTATE_DSCR(r13)
 
-       /* Order load of vcore, ptid etc. after load of vcpu */
+       /* On thread 0 of a subcore, set HDEC to max */
+       lbz     r4, HSTATE_PTID(r13)
+       cmpwi   r4, 0
+       bne     63f
+       lis     r6, 0x7fff
+       ori     r6, r6, 0xffff
+       mtspr   SPRN_HDEC, r6
+       /* and set per-LPAR registers, if doing dynamic micro-threading */
+       ld      r6, HSTATE_SPLIT_MODE(r13)
+       cmpdi   r6, 0
+       beq     63f
+       ld      r0, KVM_SPLIT_RPR(r6)
+       mtspr   SPRN_RPR, r0
+       ld      r0, KVM_SPLIT_PMMAR(r6)
+       mtspr   SPRN_PMMAR, r0
+       ld      r0, KVM_SPLIT_LDBAR(r6)
+       mtspr   SPRN_LDBAR, r0
+       isync
+63:
+       /* Order load of vcpu after load of vcore */
        lwsync
+       ld      r4, HSTATE_KVM_VCPU(r13)
        bl      kvmppc_hv_entry
 
        /* Back from the guest, go back to nap */
-       /* Clear our vcpu pointer so we don't come back in early */
+       /* Clear our vcpu and vcore pointers so we don't come back in early */
        li      r0, 0
+       std     r0, HSTATE_KVM_VCPU(r13)
        /*
-        * Once we clear HSTATE_KVM_VCPU(r13), the code in
+        * Once we clear HSTATE_KVM_VCORE(r13), the code in
         * kvmppc_run_core() is going to assume that all our vcpu
         * state is visible in memory.  This lwsync makes sure
         * that that is true.
         */
        lwsync
-       std     r0, HSTATE_KVM_VCPU(r13)
+       std     r0, HSTATE_KVM_VCORE(r13)
 
 /*
  * At this point we have finished executing in the guest.
@@ -374,16 +398,71 @@ kvm_no_guest:
        b       power7_wakeup_loss
 
 53:    HMT_LOW
-       ld      r4, HSTATE_KVM_VCPU(r13)
-       cmpdi   r4, 0
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       cmpdi   r5, 0
+       bne     60f
+       ld      r3, HSTATE_SPLIT_MODE(r13)
+       cmpdi   r3, 0
+       beq     kvm_no_guest
+       lbz     r0, KVM_SPLIT_DO_NAP(r3)
+       cmpwi   r0, 0
        beq     kvm_no_guest
        HMT_MEDIUM
+       b       kvm_unsplit_nap
+60:    HMT_MEDIUM
        b       kvm_secondary_got_guest
 
 54:    li      r0, KVM_HWTHREAD_IN_KVM
        stb     r0, HSTATE_HWTHREAD_STATE(r13)
        b       kvm_no_guest
 
+/*
+ * Here the primary thread is trying to return the core to
+ * whole-core mode, so we need to nap.
+ */
+kvm_unsplit_nap:
+       /*
+        * Ensure that secondary doesn't nap when it has
+        * its vcore pointer set.
+        */
+       sync            /* matches smp_mb() before setting split_info.do_nap */
+       ld      r0, HSTATE_KVM_VCORE(r13)
+       cmpdi   r0, 0
+       bne     kvm_no_guest
+       /* clear any pending message */
+BEGIN_FTR_SECTION
+       lis     r6, (PPC_DBELL_SERVER << (63-36))@h
+       PPC_MSGCLR(6)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       /* Set kvm_split_mode.napped[tid] = 1 */
+       ld      r3, HSTATE_SPLIT_MODE(r13)
+       li      r0, 1
+       lhz     r4, PACAPACAINDEX(r13)
+       clrldi  r4, r4, 61      /* micro-threading => P8 => 8 threads/core */
+       addi    r4, r4, KVM_SPLIT_NAPPED
+       stbx    r0, r3, r4
+       /* Check the do_nap flag again after setting napped[] */
+       sync
+       lbz     r0, KVM_SPLIT_DO_NAP(r3)
+       cmpwi   r0, 0
+       beq     57f
+       li      r3, (LPCR_PECEDH | LPCR_PECE0) >> 4
+       mfspr   r4, SPRN_LPCR
+       rlwimi  r4, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1)
+       mtspr   SPRN_LPCR, r4
+       isync
+       std     r0, HSTATE_SCRATCH0(r13)
+       ptesync
+       ld      r0, HSTATE_SCRATCH0(r13)
+1:     cmpd    r0, r0
+       bne     1b
+       nap
+       b       .
+
+57:    li      r0, 0
+       stbx    r0, r3, r4
+       b       kvm_no_guest
+
 /******************************************************************************
  *                                                                            *
  *                               Entry code                                   *
@@ -854,7 +933,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        cmpwi   r0, 0
        bne     21f
        HMT_LOW
-20:    lbz     r0, VCORE_IN_GUEST(r5)
+20:    lwz     r3, VCORE_ENTRY_EXIT(r5)
+       cmpwi   r3, 0x100
+       bge     no_switch_exit
+       lbz     r0, VCORE_IN_GUEST(r5)
        cmpwi   r0, 0
        beq     20b
        HMT_MEDIUM
@@ -870,7 +952,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        blt     hdec_soon
 
        ld      r6, VCPU_CTR(r4)
-       lwz     r7, VCPU_XER(r4)
+       ld      r7, VCPU_XER(r4)
 
        mtctr   r6
        mtxer   r7
@@ -985,9 +1067,13 @@ secondary_too_late:
 #endif
 11:    b       kvmhv_switch_to_host
 
+no_switch_exit:
+       HMT_MEDIUM
+       li      r12, 0
+       b       12f
 hdec_soon:
        li      r12, BOOK3S_INTERRUPT_HV_DECREMENTER
-       stw     r12, VCPU_TRAP(r4)
+12:    stw     r12, VCPU_TRAP(r4)
        mr      r9, r4
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
        addi    r3, r4, VCPU_TB_RMEXIT
@@ -1103,7 +1189,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
        mfctr   r3
        mfxer   r4
        std     r3, VCPU_CTR(r9)
-       stw     r4, VCPU_XER(r9)
+       std     r4, VCPU_XER(r9)
 
        /* If this is a page table miss then see if it's theirs or ours */
        cmpwi   r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
@@ -1127,6 +1213,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
        cmpwi   r12, BOOK3S_INTERRUPT_H_DOORBELL
        bne     3f
        lbz     r0, HSTATE_HOST_IPI(r13)
+       cmpwi   r0, 0
        beq     4f
        b       guest_exit_cont
 3:
@@ -1176,6 +1263,11 @@ mc_cont:
        ld      r9, HSTATE_KVM_VCPU(r13)
        lwz     r12, VCPU_TRAP(r9)
 
+       /* Stop others sending VCPU interrupts to this physical CPU */
+       li      r0, -1
+       stw     r0, VCPU_CPU(r9)
+       stw     r0, VCPU_THREAD_CPU(r9)
+
        /* Save guest CTRL register, set runlatch to 1 */
        mfspr   r6,SPRN_CTRLF
        stw     r6,VCPU_CTRL(r9)
@@ -1540,12 +1632,17 @@ kvmhv_switch_to_host:
 
        /* Primary thread waits for all the secondaries to exit guest */
 15:    lwz     r3,VCORE_ENTRY_EXIT(r5)
-       srwi    r0,r3,8
+       rlwinm  r0,r3,32-8,0xff
        clrldi  r3,r3,56
        cmpw    r3,r0
        bne     15b
        isync
 
+       /* Did we actually switch to the guest at all? */
+       lbz     r6, VCORE_IN_GUEST(r5)
+       cmpwi   r6, 0
+       beq     19f
+
        /* Primary thread switches back to host partition */
        ld      r6,KVM_HOST_SDR1(r4)
        lwz     r7,KVM_HOST_LPID(r4)
@@ -1589,7 +1686,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 18:
        /* Signal secondary CPUs to continue */
        stb     r0,VCORE_IN_GUEST(r5)
-       lis     r8,0x7fff               /* MAX_INT@h */
+19:    lis     r8,0x7fff               /* MAX_INT@h */
        mtspr   SPRN_HDEC,r8
 
 16:    ld      r8,KVM_HOST_LPCR(r4)
@@ -1675,7 +1772,7 @@ kvmppc_hdsi:
        bl      kvmppc_msr_interrupt
 fast_interrupt_c_return:
 6:     ld      r7, VCPU_CTR(r9)
-       lwz     r8, VCPU_XER(r9)
+       ld      r8, VCPU_XER(r9)
        mtctr   r7
        mtxer   r8
        mr      r4, r9
@@ -1816,8 +1913,8 @@ hcall_real_table:
        .long   DOTSYM(kvmppc_h_remove) - hcall_real_table
        .long   DOTSYM(kvmppc_h_enter) - hcall_real_table
        .long   DOTSYM(kvmppc_h_read) - hcall_real_table
-       .long   0               /* 0x10 - H_CLEAR_MOD */
-       .long   0               /* 0x14 - H_CLEAR_REF */
+       .long   DOTSYM(kvmppc_h_clear_mod) - hcall_real_table
+       .long   DOTSYM(kvmppc_h_clear_ref) - hcall_real_table
        .long   DOTSYM(kvmppc_h_protect) - hcall_real_table
        .long   DOTSYM(kvmppc_h_get_tce) - hcall_real_table
        .long   DOTSYM(kvmppc_h_put_tce) - hcall_real_table
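
The change from "srwi r0,r3,8" to "rlwinm r0,r3,32-8,0xff" in kvmhv_switch_to_host above keeps the same right shift but additionally masks the result down to one byte, so bits above the per-thread exit map no longer disturb the comparison with the entry map. The same arithmetic in stand-alone C; the helper names and the sample value are illustrative.

#include <stdio.h>

static unsigned int srwi8(unsigned int x)   { return x >> 8; }
static unsigned int rlwinm8(unsigned int x) { return (x >> 8) & 0xff; }

int main(void)
{
	/* low byte: entry map, 0xff00 byte: exit map, plus a higher bit set */
	unsigned int ee = 0x10303;

	printf("srwi  : 0x%x\n", srwi8(ee));	/* 0x103: never equals 0x03 */
	printf("rlwinm: 0x%x\n", rlwinm8(ee));	/* 0x03: matches the entry map */
	return 0;
}
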
index bd6ab16..a759d9a 100644 (file)
@@ -352,7 +352,7 @@ static inline u32 inst_get_field(u32 inst, int msb, int lsb)
        return kvmppc_get_field(inst, msb + 32, lsb + 32);
 }
 
-bool kvmppc_inst_is_paired_single(struct kvm_vcpu *vcpu, u32 inst)
+static bool kvmppc_inst_is_paired_single(struct kvm_vcpu *vcpu, u32 inst)
 {
        if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE))
                return false;
index acee37c..ca8f174 100644 (file)
@@ -123,7 +123,7 @@ no_dcbz32_on:
        PPC_LL  r8, SVCPU_CTR(r3)
        PPC_LL  r9, SVCPU_LR(r3)
        lwz     r10, SVCPU_CR(r3)
-       lwz     r11, SVCPU_XER(r3)
+       PPC_LL  r11, SVCPU_XER(r3)
 
        mtctr   r8
        mtlr    r9
@@ -237,7 +237,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
        mfctr   r8
        mflr    r9
 
-       stw     r5, SVCPU_XER(r13)
+       PPC_STL r5, SVCPU_XER(r13)
        PPC_STL r6, SVCPU_FAULT_DAR(r13)
        stw     r7, SVCPU_FAULT_DSISR(r13)
        PPC_STL r8, SVCPU_CTR(r13)
index c6ca7db..905e94a 100644 (file)
@@ -41,7 +41,7 @@
  * =======
  *
  * Each ICS has a spin lock protecting the information about the IRQ
- * sources and avoiding simultaneous deliveries if the same interrupt.
+ * sources and avoiding simultaneous deliveries of the same interrupt.
  *
  * ICP operations are done via a single compare & swap transaction
  * (most ICP state fits in the union kvmppc_icp_state)
index cc58426..ae458f0 100644 (file)
@@ -933,6 +933,7 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
 #endif
                break;
        case BOOKE_INTERRUPT_CRITICAL:
+               kvmppc_fill_pt_regs(&regs);
                unknown_exception(&regs);
                break;
        case BOOKE_INTERRUPT_DEBUG:
index 50860e9..29911a0 100644 (file)
@@ -377,7 +377,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea)
                        | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
                vcpu->arch.shared->mas1 =
                          (vcpu->arch.shared->mas6 & MAS6_SPID0)
-                       | (vcpu->arch.shared->mas6 & (MAS6_SAS ? MAS1_TS : 0))
+                       | ((vcpu->arch.shared->mas6 & MAS6_SAS) ? MAS1_TS : 0)
                        | (vcpu->arch.shared->mas4 & MAS4_TSIZED(~0));
                vcpu->arch.shared->mas2 &= MAS2_EPN;
                vcpu->arch.shared->mas2 |= vcpu->arch.shared->mas4 &
index e5dde32..2e51289 100644 (file)
@@ -660,7 +660,7 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
        return kvmppc_core_pending_dec(vcpu);
 }
 
-enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer)
+static enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer)
 {
        struct kvm_vcpu *vcpu;
 
index e7a4fde..b372a75 100644 (file)
@@ -650,6 +650,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
        u16 sel;
 
        la = seg_base(ctxt, addr.seg) + addr.ea;
+       *linear = la;
        *max_size = 0;
        switch (mode) {
        case X86EMUL_MODE_PROT64:
@@ -693,7 +694,6 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
        }
        if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
                return emulate_gp(ctxt, 0);
-       *linear = la;
        return X86EMUL_CONTINUE;
 bad:
        if (addr.seg == VCPU_SREG_SS)
index fb16a8e..69088a1 100644 (file)
@@ -3309,13 +3309,14 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
 
        walk_shadow_page_lockless_begin(vcpu);
 
-       for (shadow_walk_init(&iterator, vcpu, addr), root = iterator.level;
+       for (shadow_walk_init(&iterator, vcpu, addr),
+                leaf = root = iterator.level;
             shadow_walk_okay(&iterator);
             __shadow_walk_next(&iterator, spte)) {
-               leaf = iterator.level;
                spte = mmu_spte_get_lockless(iterator.sptep);
 
                sptes[leaf - 1] = spte;
+               leaf--;
 
                if (!is_shadow_present_pte(spte))
                        break;
@@ -3329,7 +3330,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
        if (reserved) {
                pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
                       __func__, addr);
-               while (root >= leaf) {
+               while (root > leaf) {
                        pr_err("------ spte 0x%llx level %d.\n",
                               sptes[root - 1], root);
                        root--;
index 1e7e76e..a60bdbc 100644 (file)
@@ -5943,6 +5943,7 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
        put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
 }
 
+#ifdef CONFIG_X86_64
 static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
 {
        struct kvm_segment seg;
@@ -5958,6 +5959,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
        put_smstate(u32, buf, offset + 4, seg.limit);
        put_smstate(u64, buf, offset + 8, seg.base);
 }
+#endif
 
 static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
 {
index e596675..e1e4d7c 100644 (file)
@@ -52,13 +52,16 @@ struct arch_timer_cpu {
 
        /* Timer IRQ */
        const struct kvm_irq_level      *irq;
+
+       /* VGIC mapping */
+       struct irq_phys_map             *map;
 };
 
 int kvm_timer_hyp_init(void);
 void kvm_timer_enable(struct kvm *kvm);
 void kvm_timer_init(struct kvm *kvm);
-void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
-                         const struct kvm_irq_level *irq);
+int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
+                        const struct kvm_irq_level *irq);
 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu);
 void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu);
 void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu);
index 133ea00..d901f1a 100644 (file)
@@ -95,11 +95,15 @@ enum vgic_type {
 #define LR_STATE_ACTIVE                (1 << 1)
 #define LR_STATE_MASK          (3 << 0)
 #define LR_EOI_INT             (1 << 2)
+#define LR_HW                  (1 << 3)
 
 struct vgic_lr {
-       u16     irq;
-       u8      source;
-       u8      state;
+       unsigned irq:10;
+       union {
+               unsigned hwirq:10;
+               unsigned source:3;
+       };
+       unsigned state:4;
 };
 
 struct vgic_vmcr {
@@ -155,6 +159,19 @@ struct vgic_io_device {
        struct kvm_io_device dev;
 };
 
+struct irq_phys_map {
+       u32                     virt_irq;
+       u32                     phys_irq;
+       u32                     irq;
+       bool                    active;
+};
+
+struct irq_phys_map_entry {
+       struct list_head        entry;
+       struct rcu_head         rcu;
+       struct irq_phys_map     map;
+};
+
 struct vgic_dist {
        spinlock_t              lock;
        bool                    in_kernel;
@@ -252,6 +269,10 @@ struct vgic_dist {
        struct vgic_vm_ops      vm_ops;
        struct vgic_io_device   dist_iodev;
        struct vgic_io_device   *redist_iodevs;
+
+       /* Virtual irq to hwirq mapping */
+       spinlock_t              irq_phys_map_lock;
+       struct list_head        irq_phys_map_list;
 };
 
 struct vgic_v2_cpu_if {
@@ -303,6 +324,9 @@ struct vgic_cpu {
                struct vgic_v2_cpu_if   vgic_v2;
                struct vgic_v3_cpu_if   vgic_v3;
        };
+
+       /* Protected by the distributor's irq_phys_map_lock */
+       struct list_head        irq_phys_map_list;
 };
 
 #define LR_EMPTY       0xff
@@ -317,16 +341,25 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
 int kvm_vgic_hyp_init(void);
 int kvm_vgic_map_resources(struct kvm *kvm);
 int kvm_vgic_get_max_vcpus(void);
+void kvm_vgic_early_init(struct kvm *kvm);
 int kvm_vgic_create(struct kvm *kvm, u32 type);
 void kvm_vgic_destroy(struct kvm *kvm);
+void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu);
 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
 void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
 void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
 int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
                        bool level);
+int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
+                              struct irq_phys_map *map, bool level);
 void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
 int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
 int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
+struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
+                                          int virt_irq, int irq);
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map);
+bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map);
+void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active);
 
 #define irqchip_in_kernel(k)   (!!((k)->arch.vgic.in_kernel))
 #define vgic_initialized(k)    (!!((k)->arch.vgic.nr_cpus))
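
struct vgic_lr above is reworked into bit-fields so that a 10-bit hardware interrupt number can overlay the 3-bit SGI source. Here is a stand-alone copy of the layout, just to show the aliasing and that the descriptor still packs into a single word; toy_vgic_lr is illustrative, the kernel type keeps the name vgic_lr.

#include <stdio.h>

struct toy_vgic_lr {
	unsigned irq:10;
	union {
		unsigned hwirq:10;
		unsigned source:3;
	};
	unsigned state:4;
};

int main(void)
{
	struct toy_vgic_lr lr = { .irq = 27, .state = 1 };

	lr.hwirq = 27;	/* writing hwirq aliases the source bits */
	printf("sizeof = %zu, irq=%u hwirq=%u source=%u state=%u\n",
	       sizeof(lr), (unsigned)lr.irq, (unsigned)lr.hwirq,
	       (unsigned)lr.source, (unsigned)lr.state);
	return 0;
}
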
index 71e4faf..9eeeb95 100644 (file)
 
 #define ICH_LR_EOI                     (1UL << 41)
 #define ICH_LR_GROUP                   (1UL << 60)
+#define ICH_LR_HW                      (1UL << 61)
 #define ICH_LR_STATE                   (3UL << 62)
 #define ICH_LR_PENDING_BIT             (1UL << 62)
 #define ICH_LR_ACTIVE_BIT              (1UL << 63)
+#define ICH_LR_PHYS_ID_SHIFT           32
+#define ICH_LR_PHYS_ID_MASK            (0x3ffUL << ICH_LR_PHYS_ID_SHIFT)
 
 #define ICH_MISR_EOI                   (1 << 0)
 #define ICH_MISR_U                     (1 << 1)
index af3d29f..b8901df 100644 (file)
 
 #define GICH_LR_VIRTUALID              (0x3ff << 0)
 #define GICH_LR_PHYSID_CPUID_SHIFT     (10)
-#define GICH_LR_PHYSID_CPUID           (7 << GICH_LR_PHYSID_CPUID_SHIFT)
+#define GICH_LR_PHYSID_CPUID           (0x3ff << GICH_LR_PHYSID_CPUID_SHIFT)
 #define GICH_LR_STATE                  (3 << 28)
 #define GICH_LR_PENDING_BIT            (1 << 28)
 #define GICH_LR_ACTIVE_BIT             (1 << 29)
 #define GICH_LR_EOI                    (1 << 19)
+#define GICH_LR_HW                     (1 << 31)
 
 #define GICH_VMCR_CTRL_SHIFT           0
 #define GICH_VMCR_CTRL_MASK            (0x21f << GICH_VMCR_CTRL_SHIFT)
index 81089cf..1bef9e2 100644 (file)
@@ -242,6 +242,7 @@ struct kvm_vcpu {
        int sigset_active;
        sigset_t sigset;
        struct kvm_vcpu_stat stat;
+       unsigned int halt_poll_ns;
 
 #ifdef CONFIG_HAS_IOMEM
        int mmio_needed;
index a44062d..d6f8322 100644 (file)
@@ -358,6 +358,36 @@ TRACE_EVENT(
 
 #endif
 
+TRACE_EVENT(kvm_halt_poll_ns,
+       TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old),
+       TP_ARGS(grow, vcpu_id, new, old),
+
+       TP_STRUCT__entry(
+               __field(bool, grow)
+               __field(unsigned int, vcpu_id)
+               __field(int, new)
+               __field(int, old)
+       ),
+
+       TP_fast_assign(
+               __entry->grow           = grow;
+               __entry->vcpu_id        = vcpu_id;
+               __entry->new            = new;
+               __entry->old            = old;
+       ),
+
+       TP_printk("vcpu %u: halt_poll_ns %d (%s %d)",
+                       __entry->vcpu_id,
+                       __entry->new,
+                       __entry->grow ? "grow" : "shrink",
+                       __entry->old)
+);
+
+#define trace_kvm_halt_poll_ns_grow(vcpu_id, new, old) \
+       trace_kvm_halt_poll_ns(true, vcpu_id, new, old)
+#define trace_kvm_halt_poll_ns_shrink(vcpu_id, new, old) \
+       trace_kvm_halt_poll_ns(false, vcpu_id, new, old)
+
 #endif /* _TRACE_KVM_MAIN_H */
 
 /* This part must be outside protection */
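
The kvm_halt_poll_ns trace event above reports a per-vCPU poll window being grown or shrunk. Below is a toy sketch of an adaptive policy that would emit it, matching the TP_printk format; the grow/shrink factor, the starting value and the ceiling are assumptions rather than the values used by the generic KVM code, and trace_halt_poll() stands in for the real tracepoint.

#include <stdio.h>

#define TOY_HALT_POLL_NS_MAX	500000
#define TOY_HALT_POLL_NS_BASE	10000
#define TOY_FACTOR		2

static void trace_halt_poll(int grow, unsigned int vcpu_id, int new, int old)
{
	printf("vcpu %u: halt_poll_ns %d (%s %d)\n",
	       vcpu_id, new, grow ? "grow" : "shrink", old);
}

static int grow_halt_poll_ns(unsigned int vcpu_id, int val)
{
	int old = val;

	val = val ? val * TOY_FACTOR : TOY_HALT_POLL_NS_BASE;
	if (val > TOY_HALT_POLL_NS_MAX)
		val = TOY_HALT_POLL_NS_MAX;
	trace_halt_poll(1, vcpu_id, val, old);
	return val;
}

static int shrink_halt_poll_ns(unsigned int vcpu_id, int val)
{
	int old = val;

	val /= TOY_FACTOR;
	trace_halt_poll(0, vcpu_id, val, old);
	return val;
}

int main(void)
{
	int ns = 0;

	ns = grow_halt_poll_ns(0, ns);		/* poll window opens */
	ns = grow_halt_poll_ns(0, ns);		/* polling paid off: grow */
	ns = shrink_halt_poll_ns(0, ns);	/* wakeup came too late: shrink */
	printf("final halt_poll_ns = %d\n", ns);
	return 0;
}
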
index 0d831f9..a9256f0 100644 (file)
@@ -237,6 +237,7 @@ struct kvm_run {
                        __u32 count;
                        __u64 data_offset; /* relative to kvm_run start */
                } io;
+               /* KVM_EXIT_DEBUG */
                struct {
                        struct kvm_debug_exit_arch arch;
                } debug;
@@ -285,6 +286,7 @@ struct kvm_run {
                        __u32 data;
                        __u8  is_write;
                } dcr;
+               /* KVM_EXIT_INTERNAL_ERROR */
                struct {
                        __u32 suberror;
                        /* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
@@ -295,6 +297,7 @@ struct kvm_run {
                struct {
                        __u64 gprs[32];
                } osi;
+               /* KVM_EXIT_PAPR_HCALL */
                struct {
                        __u64 nr;
                        __u64 ret;
@@ -819,6 +822,8 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_DISABLE_QUIRKS 116
 #define KVM_CAP_X86_SMM 117
 #define KVM_CAP_MULTI_ADDRESS_SPACE 118
+#define KVM_CAP_GUEST_DEBUG_HW_BPS 119
+#define KVM_CAP_GUEST_DEBUG_HW_WPS 120
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
index 98c95f2..76e38d2 100644 (file)
@@ -64,10 +64,10 @@ static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu)
        int ret;
        struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 
-       timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK;
-       ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
-                                 timer->irq->irq,
-                                 timer->irq->level);
+       kvm_vgic_set_phys_irq_active(timer->map, true);
+       ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
+                                        timer->map,
+                                        timer->irq->level);
        WARN_ON(ret);
 }
 
@@ -117,7 +117,8 @@ bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
        cycle_t cval, now;
 
        if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
-               !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE))
+           !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE) ||
+           kvm_vgic_get_phys_irq_active(timer->map))
                return false;
 
        cval = timer->cntv_cval;
@@ -184,10 +185,11 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
        timer_arm(timer, ns);
 }
 
-void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
-                         const struct kvm_irq_level *irq)
+int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
+                        const struct kvm_irq_level *irq)
 {
        struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+       struct irq_phys_map *map;
 
        /*
         * The vcpu timer irq number cannot be determined in
@@ -196,6 +198,17 @@ void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
         * vcpu timer irq number when the vcpu is reset.
         */
        timer->irq = irq;
+
+       /*
+        * Tell the VGIC that the virtual interrupt is tied to a
+        * physical interrupt. We do that once per VCPU.
+        */
+       map = kvm_vgic_map_phys_irq(vcpu, irq->irq, host_vtimer_irq);
+       if (WARN_ON(IS_ERR(map)))
+               return PTR_ERR(map);
+
+       timer->map = map;
+       return 0;
 }
 
 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
@@ -335,6 +348,8 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
        struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 
        timer_disarm(timer);
+       if (timer->map)
+               kvm_vgic_unmap_phys_irq(vcpu, timer->map);
 }
 
 void kvm_timer_enable(struct kvm *kvm)
index f9b9c7c..8d7b04d 100644 (file)
@@ -48,6 +48,10 @@ static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr)
                lr_desc.state |= LR_STATE_ACTIVE;
        if (val & GICH_LR_EOI)
                lr_desc.state |= LR_EOI_INT;
+       if (val & GICH_LR_HW) {
+               lr_desc.state |= LR_HW;
+               lr_desc.hwirq = (val & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT;
+       }
 
        return lr_desc;
 }
@@ -55,7 +59,9 @@ static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr)
 static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
                           struct vgic_lr lr_desc)
 {
-       u32 lr_val = (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT) | lr_desc.irq;
+       u32 lr_val;
+
+       lr_val = lr_desc.irq;
 
        if (lr_desc.state & LR_STATE_PENDING)
                lr_val |= GICH_LR_PENDING_BIT;
@@ -64,6 +70,14 @@ static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
        if (lr_desc.state & LR_EOI_INT)
                lr_val |= GICH_LR_EOI;
 
+       if (lr_desc.state & LR_HW) {
+               lr_val |= GICH_LR_HW;
+               lr_val |= (u32)lr_desc.hwirq << GICH_LR_PHYSID_CPUID_SHIFT;
+       }
+
+       if (lr_desc.irq < VGIC_NR_SGIS)
+               lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT);
+
        vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
 }
 
index dff0602..afbf925 100644 (file)
@@ -67,6 +67,10 @@ static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
                lr_desc.state |= LR_STATE_ACTIVE;
        if (val & ICH_LR_EOI)
                lr_desc.state |= LR_EOI_INT;
+       if (val & ICH_LR_HW) {
+               lr_desc.state |= LR_HW;
+               lr_desc.hwirq = (val >> ICH_LR_PHYS_ID_SHIFT) & GENMASK(9, 0);
+       }
 
        return lr_desc;
 }
@@ -84,10 +88,17 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
         * Eventually we want to make this configurable, so we may revisit
         * this in the future.
         */
-       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+       switch (vcpu->kvm->arch.vgic.vgic_model) {
+       case KVM_DEV_TYPE_ARM_VGIC_V3:
                lr_val |= ICH_LR_GROUP;
-       else
-               lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT;
+               break;
+       case  KVM_DEV_TYPE_ARM_VGIC_V2:
+               if (lr_desc.irq < VGIC_NR_SGIS)
+                       lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT;
+               break;
+       default:
+               BUG();
+       }
 
        if (lr_desc.state & LR_STATE_PENDING)
                lr_val |= ICH_LR_PENDING_BIT;
@@ -95,6 +106,10 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
                lr_val |= ICH_LR_ACTIVE_BIT;
        if (lr_desc.state & LR_EOI_INT)
                lr_val |= ICH_LR_EOI;
+       if (lr_desc.state & LR_HW) {
+               lr_val |= ICH_LR_HW;
+               lr_val |= ((u64)lr_desc.hwirq) << ICH_LR_PHYS_ID_SHIFT;
+       }
 
        vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val;
 }
index bc40137..9eb489a 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
+#include <linux/rculist.h>
 #include <linux/uaccess.h>
 
 #include <asm/kvm_emulate.h>
  *   cause the interrupt to become inactive in such a situation.
  *   Conversely, writes to GICD_ICPENDRn do not cause the interrupt to become
  *   inactive as long as the external input line is held high.
+ *
+ *
+ * Initialization rules: there are multiple stages to the vgic
+ * initialization, both for the distributor and the CPU interfaces.
+ *
+ * Distributor:
+ *
+ * - kvm_vgic_early_init(): initialization of static data that doesn't
+ *   depend on any sizing information or emulation type. No allocation
+ *   is allowed there.
+ *
+ * - vgic_init(): allocation and initialization of the generic data
+ *   structures that depend on sizing information (number of CPUs,
+ *   number of interrupts). Also initializes the vcpu specific data
+ *   structures. Can be executed lazily for GICv2.
+ *   [to be renamed to kvm_vgic_init??]
+ *
+ * CPU Interface:
+ *
+ * - kvm_vgic_cpu_early_init(): initialization of static data that
+ *   doesn't depend on any sizing information or emulation type. No
+ *   allocation is allowed there.
  */
 
 #include "vgic.h"
@@ -82,6 +105,8 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
 static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu);
 static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
 static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc);
+static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
+                                               int virt_irq);
 
 static const struct vgic_ops *vgic_ops;
 static const struct vgic_params *vgic;
@@ -375,7 +400,7 @@ void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq)
 
 static bool vgic_can_sample_irq(struct kvm_vcpu *vcpu, int irq)
 {
-       return vgic_irq_is_edge(vcpu, irq) || !vgic_irq_is_queued(vcpu, irq);
+       return !vgic_irq_is_queued(vcpu, irq);
 }
 
 /**
@@ -1115,6 +1140,39 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
        if (!vgic_irq_is_edge(vcpu, irq))
                vlr.state |= LR_EOI_INT;
 
+       if (vlr.irq >= VGIC_NR_SGIS) {
+               struct irq_phys_map *map;
+               map = vgic_irq_map_search(vcpu, irq);
+
+               /*
+                * If we have a mapping, and the virtual interrupt is
+                * being injected, then we must set the state to
+                * active in the physical world. Otherwise the
+                * physical interrupt will fire and the guest will
+                * exit before processing the virtual interrupt.
+                */
+               if (map) {
+                       int ret;
+
+                       BUG_ON(!map->active);
+                       vlr.hwirq = map->phys_irq;
+                       vlr.state |= LR_HW;
+                       vlr.state &= ~LR_EOI_INT;
+
+                       ret = irq_set_irqchip_state(map->irq,
+                                                   IRQCHIP_STATE_ACTIVE,
+                                                   true);
+                       WARN_ON(ret);
+
+                       /*
+                        * Make sure we're not going to sample this
+                        * again, as a HW-backed interrupt cannot be
+                        * in the PENDING_ACTIVE stage.
+                        */
+                       vgic_irq_set_queued(vcpu, irq);
+               }
+       }
+
        vgic_set_lr(vcpu, lr_nr, vlr);
        vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
 }
@@ -1339,6 +1397,39 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
        return level_pending;
 }
 
+/*
+ * Save the physical active state, and reset it to inactive.
+ *
+ * Return 1 if HW interrupt went from active to inactive, and 0 otherwise.
+ */
+static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
+{
+       struct irq_phys_map *map;
+       int ret;
+
+       if (!(vlr.state & LR_HW))
+               return 0;
+
+       map = vgic_irq_map_search(vcpu, vlr.irq);
+       BUG_ON(!map || !map->active);
+
+       ret = irq_get_irqchip_state(map->irq,
+                                   IRQCHIP_STATE_ACTIVE,
+                                   &map->active);
+
+       WARN_ON(ret);
+
+       if (map->active) {
+               ret = irq_set_irqchip_state(map->irq,
+                                           IRQCHIP_STATE_ACTIVE,
+                                           false);
+               WARN_ON(ret);
+               return 0;
+       }
+
+       return 1;
+}
+
 /* Sync back the VGIC state after a guest run */
 static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 {
@@ -1353,14 +1444,31 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
        elrsr = vgic_get_elrsr(vcpu);
        elrsr_ptr = u64_to_bitmask(&elrsr);
 
-       /* Clear mappings for empty LRs */
-       for_each_set_bit(lr, elrsr_ptr, vgic->nr_lr) {
+       /* Deal with HW interrupts, and clear mappings for empty LRs */
+       for (lr = 0; lr < vgic->nr_lr; lr++) {
                struct vgic_lr vlr;
 
-               if (!test_and_clear_bit(lr, vgic_cpu->lr_used))
+               if (!test_bit(lr, vgic_cpu->lr_used))
                        continue;
 
                vlr = vgic_get_lr(vcpu, lr);
+               if (vgic_sync_hwirq(vcpu, vlr)) {
+                       /*
+                        * So this is a HW interrupt that the guest
+                        * EOI-ed. Clean the LR state and allow the
+                        * interrupt to be sampled again.
+                        */
+                       vlr.state = 0;
+                       vlr.hwirq = 0;
+                       vgic_set_lr(vcpu, lr, vlr);
+                       vgic_irq_clear_queued(vcpu, vlr.irq);
+                       set_bit(lr, elrsr_ptr);
+               }
+
+               if (!test_bit(lr, elrsr_ptr))
+                       continue;
+
+               clear_bit(lr, vgic_cpu->lr_used);
 
                BUG_ON(vlr.irq >= dist->nr_irqs);
                vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY;
@@ -1447,7 +1555,8 @@ static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level)
 }
 
 static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
-                                 unsigned int irq_num, bool level)
+                                  struct irq_phys_map *map,
+                                  unsigned int irq_num, bool level)
 {
        struct vgic_dist *dist = &kvm->arch.vgic;
        struct kvm_vcpu *vcpu;
@@ -1455,6 +1564,9 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
        int enabled;
        bool ret = true, can_inject = true;
 
+       if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020))
+               return -EINVAL;
+
        spin_lock(&dist->lock);
 
        vcpu = kvm_get_vcpu(kvm, cpuid);
@@ -1517,18 +1629,46 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
 out:
        spin_unlock(&dist->lock);
 
-       return ret ? cpuid : -EINVAL;
+       if (ret) {
+               /* kick the specified vcpu */
+               kvm_vcpu_kick(kvm_get_vcpu(kvm, cpuid));
+       }
+
+       return 0;
+}
+
+static int vgic_lazy_init(struct kvm *kvm)
+{
+       int ret = 0;
+
+       if (unlikely(!vgic_initialized(kvm))) {
+               /*
+                * We only provide the automatic initialization of the VGIC
+                * for the legacy case of a GICv2. Any other type must
+                * be explicitly initialized once setup with the respective
+                * KVM device call.
+                */
+               if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
+                       return -EBUSY;
+
+               mutex_lock(&kvm->lock);
+               ret = vgic_init(kvm);
+               mutex_unlock(&kvm->lock);
+       }
+
+       return ret;
 }
 
 /**
  * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
  * @kvm:     The VM structure pointer
  * @cpuid:   The CPU for PPIs
- * @irq_num: The IRQ number that is assigned to the device
+ * @irq_num: The IRQ number that is assigned to the device. This IRQ
+ *           must not be mapped to a HW interrupt.
  * @level:   Edge-triggered:  true:  to trigger the interrupt
  *                           false: to ignore the call
- *          Level-sensitive  true:  activates an interrupt
- *                           false: deactivates an interrupt
+ *          Level-sensitive  true:  raise the input signal
+ *                           false: lower the input signal
  *
  * The GIC is not concerned with devices being active-LOW or active-HIGH for
  * level-sensitive interrupts.  You can think of the level parameter as 1
@@ -1537,39 +1677,44 @@ out:
 int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
                        bool level)
 {
-       int ret = 0;
-       int vcpu_id;
-
-       if (unlikely(!vgic_initialized(kvm))) {
-               /*
-                * We only provide the automatic initialization of the VGIC
-                * for the legacy case of a GICv2. Any other type must
-                * be explicitly initialized once setup with the respective
-                * KVM device call.
-                */
-               if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) {
-                       ret = -EBUSY;
-                       goto out;
-               }
-               mutex_lock(&kvm->lock);
-               ret = vgic_init(kvm);
-               mutex_unlock(&kvm->lock);
+       struct irq_phys_map *map;
+       int ret;
 
-               if (ret)
-                       goto out;
-       }
+       ret = vgic_lazy_init(kvm);
+       if (ret)
+               return ret;
 
-       if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020))
+       map = vgic_irq_map_search(kvm_get_vcpu(kvm, cpuid), irq_num);
+       if (map)
                return -EINVAL;
 
-       vcpu_id = vgic_update_irq_pending(kvm, cpuid, irq_num, level);
-       if (vcpu_id >= 0) {
-               /* kick the specified vcpu */
-               kvm_vcpu_kick(kvm_get_vcpu(kvm, vcpu_id));
-       }
+       return vgic_update_irq_pending(kvm, cpuid, NULL, irq_num, level);
+}
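
The kernel-doc above spells out the level semantics, but a caller's view may be easier to follow. The sketch below is an editor's illustration only (not part of this patch): it shows how an in-kernel device model might assert and release a level-sensitive SPI. The SPI number and the helper name are invented for the example.

#include <linux/kvm_host.h>
#include <kvm/arm_vgic.h>

#define EXAMPLE_SPI	40	/* hypothetical guest-visible SPI */

/* Raise or lower the virtual input line of a level-sensitive SPI. */
static int example_set_spi_level(struct kvm *kvm, bool asserted)
{
	/*
	 * cpuid is only meaningful for private interrupts (PPIs);
	 * 0 is conventionally passed for shared interrupts.
	 */
	return kvm_vgic_inject_irq(kvm, 0, EXAMPLE_SPI, asserted);
}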
 
-out:
-       return ret;
+/**
+ * kvm_vgic_inject_mapped_irq - Inject a physically mapped IRQ to the vgic
+ * @kvm:     The VM structure pointer
+ * @cpuid:   The CPU for PPIs
+ * @map:     Pointer to an irq_phys_map structure describing the mapping
+ * @level:   Edge-triggered:  true:  to trigger the interrupt
+ *                           false: to ignore the call
+ *          Level-sensitive  true:  raise the input signal
+ *                           false: lower the input signal
+ *
+ * The GIC is not concerned with devices being active-LOW or active-HIGH for
+ * level-sensitive interrupts.  You can think of the level parameter as 1
+ * being HIGH and 0 being LOW and all devices being active-HIGH.
+ */
+int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
+                              struct irq_phys_map *map, bool level)
+{
+       int ret;
+
+       ret = vgic_lazy_init(kvm);
+       if (ret)
+               return ret;
+
+       return vgic_update_irq_pending(kvm, cpuid, map, map->virt_irq, level);
 }
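
For the mapped variant, another hedged sketch (reusing the headers from the first illustration): a caller that has already obtained an irq_phys_map from kvm_vgic_map_phys_irq(), introduced below, forwards the line state through this entry point instead, and no irq_num is passed because it is implied by the mapping. The helper name is invented.

/* Forward a mapped interrupt's level, e.g. from a host-side event. */
static int example_forward_mapped_irq(struct kvm_vcpu *vcpu,
				      struct irq_phys_map *map, bool level)
{
	return kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
					  map, level);
}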
 
 static irqreturn_t vgic_maintenance_handler(int irq, void *data)
@@ -1583,6 +1728,188 @@ static irqreturn_t vgic_maintenance_handler(int irq, void *data)
        return IRQ_HANDLED;
 }
 
+static struct list_head *vgic_get_irq_phys_map_list(struct kvm_vcpu *vcpu,
+                                                   int virt_irq)
+{
+       if (virt_irq < VGIC_NR_PRIVATE_IRQS)
+               return &vcpu->arch.vgic_cpu.irq_phys_map_list;
+       else
+               return &vcpu->kvm->arch.vgic.irq_phys_map_list;
+}
+
+/**
+ * kvm_vgic_map_phys_irq - map a virtual IRQ to a physical IRQ
+ * @vcpu: The VCPU pointer
+ * @virt_irq: The virtual irq number
+ * @irq: The Linux IRQ number
+ *
+ * Establish a mapping between a guest visible irq (@virt_irq) and a
+ * Linux irq (@irq). On injection, @virt_irq will be associated with
+ * the physical interrupt represented by @irq. This mapping can be
+ * established multiple times as long as the parameters are the same.
+ *
+ * Returns a valid pointer on success, and an error pointer otherwise
+ */
+struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
+                                          int virt_irq, int irq)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
+       struct irq_phys_map *map;
+       struct irq_phys_map_entry *entry;
+       struct irq_desc *desc;
+       struct irq_data *data;
+       int phys_irq;
+
+       desc = irq_to_desc(irq);
+       if (!desc) {
+               kvm_err("%s: no interrupt descriptor\n", __func__);
+               return ERR_PTR(-EINVAL);
+       }
+
+       data = irq_desc_get_irq_data(desc);
+       while (data->parent_data)
+               data = data->parent_data;
+
+       phys_irq = data->hwirq;
+
+       /* Create a new mapping */
+       entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+       if (!entry)
+               return ERR_PTR(-ENOMEM);
+
+       spin_lock(&dist->irq_phys_map_lock);
+
+       /* Try to match an existing mapping */
+       map = vgic_irq_map_search(vcpu, virt_irq);
+       if (map) {
+               /* Make sure this mapping matches */
+               if (map->phys_irq != phys_irq   ||
+                   map->irq      != irq)
+                       map = ERR_PTR(-EINVAL);
+
+               /* An existing mapping was found; reuse it or return the error */
+               goto out;
+       }
+
+       map           = &entry->map;
+       map->virt_irq = virt_irq;
+       map->phys_irq = phys_irq;
+       map->irq      = irq;
+
+       list_add_tail_rcu(&entry->entry, root);
+
+out:
+       spin_unlock(&dist->irq_phys_map_lock);
+       /*
+        * If we've found a hit in the existing list, free the useless
+        * entry.
+        */
+       if (IS_ERR(map) || map != &entry->map)
+               kfree(entry);
+       return map;
+}
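
To make the mapping lifecycle concrete, here is an illustrative sketch (not taken from the patch) of how a per-vcpu device, such as the arch timer elsewhere in this series, might establish a mapping at init time. The PPI number, parameter names and storage location are assumptions; only the vgic calls and <linux/err.h> helpers are real.

#include <linux/err.h>

#define EXAMPLE_PPI	27	/* hypothetical guest-visible PPI */

static int example_map_device_irq(struct kvm_vcpu *vcpu, int host_linux_irq,
				  struct irq_phys_map **mapp)
{
	struct irq_phys_map *map;

	map = kvm_vgic_map_phys_irq(vcpu, EXAMPLE_PPI, host_linux_irq);
	if (IS_ERR(map))
		return PTR_ERR(map);

	*mapp = map;	/* keep the map for injection and teardown */
	return 0;
}

Because the same mapping may legitimately be established several times, the caller does not have to track whether it is the first to register it, as long as it always passes the same virt_irq/irq pair.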
+
+static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
+                                               int virt_irq)
+{
+       struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
+       struct irq_phys_map_entry *entry;
+       struct irq_phys_map *map;
+
+       rcu_read_lock();
+
+       list_for_each_entry_rcu(entry, root, entry) {
+               map = &entry->map;
+               if (map->virt_irq == virt_irq) {
+                       rcu_read_unlock();
+                       return map;
+               }
+       }
+
+       rcu_read_unlock();
+
+       return NULL;
+}
+
+static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu)
+{
+       struct irq_phys_map_entry *entry;
+
+       entry = container_of(rcu, struct irq_phys_map_entry, rcu);
+       kfree(entry);
+}
+
+/**
+ * kvm_vgic_get_phys_irq_active - Return the active state of a mapped IRQ
+ *
+ * Return the logical active state of a mapped interrupt. This doesn't
+ * necessarily reflect the current HW state.
+ */
+bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map)
+{
+       BUG_ON(!map);
+       return map->active;
+}
+
+/**
+ * kvm_vgic_set_phys_irq_active - Set the active state of a mapped IRQ
+ *
+ * Set the logical active state of a mapped interrupt. This doesn't
+ * immediately affect the HW state.
+ */
+void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active)
+{
+       BUG_ON(!map);
+       map->active = active;
+}
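
A short illustration of how the two accessors above might be used together around a guest exit (again building on the earlier sketch's headers): record the hardware's view of the active state once, then consult the logical copy later without touching the hardware. The helper and its argument are invented for the example.

static bool example_line_still_active(struct irq_phys_map *map,
				      bool hw_active_on_exit)
{
	/* Record what the HW reported when the guest exited... */
	kvm_vgic_set_phys_irq_active(map, hw_active_on_exit);

	/* ...and read the logical state back without a HW access. */
	return kvm_vgic_get_phys_irq_active(map);
}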
+
+/**
+ * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping
+ * @vcpu: The VCPU pointer
+ * @map: The pointer to a mapping obtained through kvm_vgic_map_phys_irq
+ *
+ * Remove an existing mapping between virtual and physical interrupts.
+ */
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       struct irq_phys_map_entry *entry;
+       struct list_head *root;
+
+       if (!map)
+               return -EINVAL;
+
+       root = vgic_get_irq_phys_map_list(vcpu, map->virt_irq);
+
+       spin_lock(&dist->irq_phys_map_lock);
+
+       list_for_each_entry(entry, root, entry) {
+               if (&entry->map == map) {
+                       list_del_rcu(&entry->entry);
+                       call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
+                       break;
+               }
+       }
+
+       spin_unlock(&dist->irq_phys_map_lock);
+
+       return 0;
+}
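
Correspondingly, a mapping created in the earlier sketch would be released on the teardown path. This too is only an illustration, and the helper name is made up.

static void example_unmap_device_irq(struct kvm_vcpu *vcpu,
				     struct irq_phys_map *map)
{
	if (kvm_vgic_unmap_phys_irq(vcpu, map))
		kvm_err("failed to unmap example device IRQ\n");
}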
+
+static void vgic_destroy_irq_phys_map(struct kvm *kvm, struct list_head *root)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct irq_phys_map_entry *entry;
+
+       spin_lock(&dist->irq_phys_map_lock);
+
+       list_for_each_entry(entry, root, entry) {
+               list_del_rcu(&entry->entry);
+               call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
+       }
+
+       spin_unlock(&dist->irq_phys_map_lock);
+}
+
 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
@@ -1591,6 +1918,7 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
        kfree(vgic_cpu->active_shared);
        kfree(vgic_cpu->pend_act_shared);
        kfree(vgic_cpu->vgic_irq_lr_map);
+       vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list);
        vgic_cpu->pending_shared = NULL;
        vgic_cpu->active_shared = NULL;
        vgic_cpu->pend_act_shared = NULL;
@@ -1627,6 +1955,17 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
        return 0;
 }
 
+/**
+ * kvm_vgic_vcpu_early_init - Earliest possible per-vcpu vgic init stage
+ *
+ * No memory allocation should be performed here, only static init.
+ */
+void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+       INIT_LIST_HEAD(&vgic_cpu->irq_phys_map_list);
+}
+
 /**
  * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
  *
@@ -1664,6 +2003,7 @@ void kvm_vgic_destroy(struct kvm *kvm)
        kfree(dist->irq_spi_target);
        kfree(dist->irq_pending_on_cpu);
        kfree(dist->irq_active_on_cpu);
+       vgic_destroy_irq_phys_map(kvm, &dist->irq_phys_map_list);
        dist->irq_sgi_sources = NULL;
        dist->irq_spi_cpu = NULL;
        dist->irq_spi_target = NULL;
@@ -1787,6 +2127,18 @@ static int init_vgic_model(struct kvm *kvm, int type)
        return 0;
 }
 
+/**
+ * kvm_vgic_early_init - Earliest possible vgic initialization stage
+ *
+ * No memory allocation should be performed here, only static init.
+ */
+void kvm_vgic_early_init(struct kvm *kvm)
+{
+       spin_lock_init(&kvm->arch.vgic.lock);
+       spin_lock_init(&kvm->arch.vgic.irq_phys_map_lock);
+       INIT_LIST_HEAD(&kvm->arch.vgic.irq_phys_map_list);
+}
+
 int kvm_vgic_create(struct kvm *kvm, u32 type)
 {
        int i, vcpu_lock_idx = -1, ret;
@@ -1832,7 +2184,6 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
        if (ret)
                goto out_unlock;
 
-       spin_lock_init(&kvm->arch.vgic.lock);
        kvm->arch.vgic.in_kernel = true;
        kvm->arch.vgic.vgic_model = type;
        kvm->arch.vgic.vctrl_base = vgic->vctrl_base;
index 21c1424..d7ea8e2 100644 (file)
@@ -213,11 +213,15 @@ int kvm_set_irq_routing(struct kvm *kvm,
                        goto out;
 
                r = -EINVAL;
-               if (ue->flags)
+               if (ue->flags) {
+                       kfree(e);
                        goto out;
+               }
                r = setup_routing_entry(new, e, ue);
-               if (r)
+               if (r) {
+                       kfree(e);
                        goto out;
+               }
                ++ue;
        }
 
index d8db2f8..4662a88 100644 (file)
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-static unsigned int halt_poll_ns;
+/* halt polling only reduces halt latency by 5-7 us, so 500 us is enough */
+static unsigned int halt_poll_ns = 500000;
 module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
 
+/* Default doubles per-vcpu halt_poll_ns. */
+static unsigned int halt_poll_ns_grow = 2;
+module_param(halt_poll_ns_grow, uint, S_IRUGO);
+
+/* Default resets per-vcpu halt_poll_ns. */
+static unsigned int halt_poll_ns_shrink;
+module_param(halt_poll_ns_shrink, uint, S_IRUGO);
+
 /*
  * Ordering of locks:
  *
@@ -217,6 +226,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
        vcpu->kvm = kvm;
        vcpu->vcpu_id = id;
        vcpu->pid = NULL;
+       vcpu->halt_poll_ns = 0;
        init_waitqueue_head(&vcpu->wq);
        kvm_async_pf_vcpu_init(vcpu);
 
@@ -1906,6 +1916,35 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
 
+static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
+{
+       int old, val;
+
+       old = val = vcpu->halt_poll_ns;
+       /* 10us base */
+       if (val == 0 && halt_poll_ns_grow)
+               val = 10000;
+       else
+               val *= halt_poll_ns_grow;
+
+       vcpu->halt_poll_ns = val;
+       trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
+}
+
+static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
+{
+       int old, val;
+
+       old = val = vcpu->halt_poll_ns;
+       if (halt_poll_ns_shrink == 0)
+               val = 0;
+       else
+               val /= halt_poll_ns_shrink;
+
+       vcpu->halt_poll_ns = val;
+       trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
+}
+
 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
 {
        if (kvm_arch_vcpu_runnable(vcpu)) {
@@ -1928,10 +1967,11 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
        ktime_t start, cur;
        DEFINE_WAIT(wait);
        bool waited = false;
+       u64 block_ns;
 
        start = cur = ktime_get();
-       if (halt_poll_ns) {
-               ktime_t stop = ktime_add_ns(ktime_get(), halt_poll_ns);
+       if (vcpu->halt_poll_ns) {
+               ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
 
                do {
                        /*
@@ -1960,7 +2000,21 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
        cur = ktime_get();
 
 out:
-       trace_kvm_vcpu_wakeup(ktime_to_ns(cur) - ktime_to_ns(start), waited);
+       block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
+
+       if (halt_poll_ns) {
+               if (block_ns <= vcpu->halt_poll_ns)
+                       ;
+               /* we had a long block, shrink polling */
+               else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+                       shrink_halt_poll_ns(vcpu);
+               /* we had a short halt and our poll time is too small */
+               else if (vcpu->halt_poll_ns < halt_poll_ns &&
+                       block_ns < halt_poll_ns)
+                       grow_halt_poll_ns(vcpu);
+       }
+
+       trace_kvm_vcpu_wakeup(block_ns, waited);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_block);
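
To make the interplay of the three module parameters concrete, here is a small stand-alone user-space simulation of the adjustment rules in kvm_vcpu_block() under the default settings (halt_poll_ns = 500000, grow = 2, shrink = 0). It is an editor's illustration only; the block durations fed in are invented, not measurements.

#include <stdio.h>

#define HALT_POLL_NS		500000u	/* module default */
#define HALT_POLL_NS_GROW	2u
#define HALT_POLL_NS_SHRINK	0u

/* Mirror of the grow/shrink decision applied after each block. */
static unsigned int adjust(unsigned int poll_ns, unsigned int block_ns)
{
	if (block_ns <= poll_ns)
		return poll_ns;			/* polling was long enough */
	if (poll_ns && block_ns > HALT_POLL_NS)	/* long block: shrink */
		return HALT_POLL_NS_SHRINK ? poll_ns / HALT_POLL_NS_SHRINK : 0;
	if (poll_ns < HALT_POLL_NS && block_ns < HALT_POLL_NS)
		return poll_ns ? poll_ns * HALT_POLL_NS_GROW : 10000;
	return poll_ns;
}

int main(void)
{
	unsigned int blocks[] = { 30000, 30000, 30000, 2000000, 30000 };
	unsigned int poll = 0;
	unsigned int i;

	for (i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
		poll = adjust(poll, blocks[i]);
		printf("block %7u ns -> halt_poll_ns %u\n", blocks[i], poll);
	}
	return 0;
}

With the invented trace above, a vCPU that keeps waking up after roughly 30 us ramps its poll window from 0 to 10000, 20000 and then 40000 ns, while a single 2 ms sleep drops it straight back to 0 because halt_poll_ns_shrink defaults to 0.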