Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus
authorLinus Torvalds <torvalds@linux-foundation.org>
Sun, 24 Jul 2011 16:54:54 +0000 (09:54 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Sun, 24 Jul 2011 16:54:54 +0000 (09:54 -0700)
* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus:
  modpost: Fix modpost's license checking V3
  module: add /sys/module/<name>/uevent files
  module: change attr callbacks to take struct module_kobject
  modules: make arch's use default loader hooks
  modules: add default loader hook implementations
  param: fix return value handling in param_set_*

134 files changed:
Documentation/kernel-parameters.txt
Documentation/virtual/kvm/api.txt
Documentation/virtual/kvm/mmu.txt
Documentation/virtual/kvm/msr.txt
Documentation/virtual/kvm/nested-vmx.txt [new file with mode: 0644]
Documentation/virtual/kvm/ppc-pv.txt
arch/ia64/include/asm/paravirt.h
arch/ia64/kernel/paravirt.c
arch/powerpc/include/asm/cputable.h
arch/powerpc/include/asm/exception-64s.h
arch/powerpc/include/asm/hvcall.h
arch/powerpc/include/asm/kvm.h
arch/powerpc/include/asm/kvm_asm.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_book3s_64.h
arch/powerpc/include/asm/kvm_book3s_asm.h
arch/powerpc/include/asm/kvm_booke.h
arch/powerpc/include/asm/kvm_e500.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/mmu-hash64.h
arch/powerpc/include/asm/paca.h
arch/powerpc/include/asm/ppc_asm.h
arch/powerpc/include/asm/reg.h
arch/powerpc/include/asm/reg_booke.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/cpu_setup_power7.S
arch/powerpc/kernel/cpu_setup_ppc970.S
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/head_fsl_booke.S
arch/powerpc/kernel/idle_power7.S
arch/powerpc/kernel/paca.c
arch/powerpc/kernel/process.c
arch/powerpc/kernel/setup-common.c
arch/powerpc/kernel/setup_64.c
arch/powerpc/kernel/smp.c
arch/powerpc/kernel/traps.c
arch/powerpc/kvm/44x_tlb.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_64_mmu.c
arch/powerpc/kvm/book3s_64_mmu_hv.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_64_vio_hv.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_exports.c
arch/powerpc/kvm/book3s_hv.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_hv_builtin.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_hv_interrupts.S [new file with mode: 0644]
arch/powerpc/kvm/book3s_hv_rm_mmu.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_hv_rmhandlers.S [new file with mode: 0644]
arch/powerpc/kvm/book3s_interrupts.S
arch/powerpc/kvm/book3s_mmu_hpte.c
arch/powerpc/kvm/book3s_pr.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_rmhandlers.S
arch/powerpc/kvm/book3s_segment.S
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/booke.h
arch/powerpc/kvm/booke_interrupts.S
arch/powerpc/kvm/e500.c
arch/powerpc/kvm/e500_emulate.c
arch/powerpc/kvm/e500_tlb.c
arch/powerpc/kvm/e500_tlb.h
arch/powerpc/kvm/powerpc.c
arch/powerpc/kvm/timing.c
arch/powerpc/kvm/trace.h
arch/powerpc/mm/hash_native_64.c
arch/powerpc/platforms/iseries/exception.S
arch/powerpc/platforms/iseries/exception.h
arch/powerpc/sysdev/xics/icp-native.c
arch/s390/crypto/sha256_s390.c
arch/x86/Kconfig
arch/x86/crypto/ghash-clmulni-intel_glue.c
arch/x86/include/asm/kvm_emulate.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/kvm_para.h
arch/x86/include/asm/msr-index.h
arch/x86/include/asm/paravirt.h
arch/x86/include/asm/paravirt_types.h
arch/x86/include/asm/processor-flags.h
arch/x86/include/asm/vmx.h
arch/x86/include/asm/xen/hypercall.h
arch/x86/include/asm/xen/trace_types.h [new file with mode: 0644]
arch/x86/kernel/kvm.c
arch/x86/kernel/kvmclock.c
arch/x86/kernel/paravirt.c
arch/x86/kvm/Kconfig
arch/x86/kvm/emulate.c
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu.h
arch/x86/kvm/mmu_audit.c
arch/x86/kvm/mmutrace.h
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/svm.c
arch/x86/kvm/trace.h
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
arch/x86/xen/Makefile
arch/x86/xen/enlighten.c
arch/x86/xen/mmu.c
arch/x86/xen/multicalls.c
arch/x86/xen/multicalls.h
arch/x86/xen/trace.c [new file with mode: 0644]
crypto/Kconfig
crypto/algif_hash.c
crypto/arc4.c
crypto/crc32c.c
crypto/gf128mul.c
crypto/sha1_generic.c
crypto/testmgr.h
drivers/char/hw_random/Kconfig
drivers/char/hw_random/Makefile
drivers/char/hw_random/nomadik-rng.c
drivers/char/hw_random/omap-rng.c
drivers/char/hw_random/ppc4xx-rng.c [new file with mode: 0644]
drivers/char/hw_random/timeriomem-rng.c
drivers/crypto/amcc/crypto4xx_core.c
drivers/crypto/caam/caamalg.c
drivers/crypto/caam/compat.h
drivers/crypto/caam/ctrl.c
drivers/crypto/caam/desc_constr.h
drivers/crypto/omap-sham.c
drivers/crypto/talitos.c
drivers/of/of_pci.c
include/linux/kvm.h
include/linux/kvm_host.h
include/trace/events/xen.h [new file with mode: 0644]
kernel/compat.c
kernel/delayacct.c
kernel/sched.c
kernel/sched_features.h
virt/kvm/assigned-dev.c
virt/kvm/iommu.c
virt/kvm/kvm_main.c

index aa47be7..40cc653 100644 (file)
@@ -1159,10 +1159,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        for all guests.
                        Default is 1 (enabled) if in 64bit or 32bit-PAE mode
 
-       kvm-intel.bypass_guest_pf=
-                       [KVM,Intel] Disables bypassing of guest page faults
-                       on Intel chips. Default is 1 (enabled)
-
        kvm-intel.ept=  [KVM,Intel] Disable extended page tables
                        (virtualized MMU) support on capable Intel chips.
                        Default is 1 (enabled)
@@ -1737,6 +1733,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
        no-kvmapf       [X86,KVM] Disable paravirtualized asynchronous page
                        fault handling.
 
+       no-steal-acc    [X86,KVM] Disable paravirtualized steal time accounting.
+                       Steal time is computed, but won't influence scheduler
+                       behaviour
+
        nolapic         [X86-32,APIC] Do not enable or use the local APIC.
 
        nolapic_timer   [X86-32,APIC] Do not use the local APIC timer.
index 42542eb..b0e4b9c 100644 (file)
@@ -180,6 +180,19 @@ KVM_CHECK_EXTENSION ioctl() to determine the value for max_vcpus at run-time.
 If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4
 cpus max.
 
+On powerpc using book3s_hv mode, the vcpus are mapped onto virtual
+threads in one or more virtual CPU cores.  (This is because the
+hardware requires all the hardware threads in a CPU core to be in the
+same partition.)  The KVM_CAP_PPC_SMT capability indicates the number
+of vcpus per virtual core (vcore).  The vcore id is obtained by
+dividing the vcpu id by the number of vcpus per vcore.  The vcpus in a
+given vcore will always be in the same physical core as each other
+(though that might be a different physical core from time to time).
+Userspace can control the threading (SMT) mode of the guest by its
+allocation of vcpu ids.  For example, if userspace wants
+single-threaded guest vcpus, it should make all vcpu ids be a multiple
+of the number of vcpus per vcore.
+
 4.8 KVM_GET_DIRTY_LOG (vm ioctl)
 
 Capability: basic
@@ -1143,15 +1156,10 @@ Assigns an IRQ to a passed-through device.
 
 struct kvm_assigned_irq {
        __u32 assigned_dev_id;
-       __u32 host_irq;
+       __u32 host_irq; /* ignored (legacy field) */
        __u32 guest_irq;
        __u32 flags;
        union {
-               struct {
-                       __u32 addr_lo;
-                       __u32 addr_hi;
-                       __u32 data;
-               } guest_msi;
                __u32 reserved[12];
        };
 };
@@ -1239,8 +1247,10 @@ Type: vm ioctl
 Parameters: struct kvm_assigned_msix_nr (in)
 Returns: 0 on success, -1 on error
 
-Set the number of MSI-X interrupts for an assigned device. This service can
-only be called once in the lifetime of an assigned device.
+Set the number of MSI-X interrupts for an assigned device. The number is
+reset again by terminating the MSI-X assignment of the device via
+KVM_DEASSIGN_DEV_IRQ. Calling this service more than once at any earlier
+point will fail.
 
 struct kvm_assigned_msix_nr {
        __u32 assigned_dev_id;
@@ -1291,6 +1301,135 @@ Returns the tsc frequency of the guest. The unit of the return value is
 KHz. If the host has unstable tsc this ioctl returns -EIO instead as an
 error.
 
+4.56 KVM_GET_LAPIC
+
+Capability: KVM_CAP_IRQCHIP
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_lapic_state (out)
+Returns: 0 on success, -1 on error
+
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+       char regs[KVM_APIC_REG_SIZE];
+};
+
+Reads the Local APIC registers and copies them into the input argument.  The
+data format and layout are the same as documented in the architecture manual.
+
+4.57 KVM_SET_LAPIC
+
+Capability: KVM_CAP_IRQCHIP
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_lapic_state (in)
+Returns: 0 on success, -1 on error
+
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+       char regs[KVM_APIC_REG_SIZE];
+};
+
+Copies the input argument into the Local APIC registers.  The data format
+and layout are the same as documented in the architecture manual.
+
+4.58 KVM_IOEVENTFD
+
+Capability: KVM_CAP_IOEVENTFD
+Architectures: all
+Type: vm ioctl
+Parameters: struct kvm_ioeventfd (in)
+Returns: 0 on success, !0 on error
+
+This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address
+within the guest.  A guest write to the registered address will signal the
+provided event instead of triggering an exit.
+
+struct kvm_ioeventfd {
+       __u64 datamatch;
+       __u64 addr;        /* legal pio/mmio address */
+       __u32 len;         /* 1, 2, 4, or 8 bytes    */
+       __s32 fd;
+       __u32 flags;
+       __u8  pad[36];
+};
+
+The following flags are defined:
+
+#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
+#define KVM_IOEVENTFD_FLAG_PIO       (1 << kvm_ioeventfd_flag_nr_pio)
+#define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
+
+If the datamatch flag is set, the event will be signaled only if the value
+written to the registered address is equal to datamatch in struct kvm_ioeventfd.
+
+4.62 KVM_CREATE_SPAPR_TCE
+
+Capability: KVM_CAP_SPAPR_TCE
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_create_spapr_tce (in)
+Returns: file descriptor for manipulating the created TCE table
+
+This creates a virtual TCE (translation control entry) table, which
+is an IOMMU for PAPR-style virtual I/O.  It is used to translate
+logical addresses used in virtual I/O into guest physical addresses,
+and provides a scatter/gather capability for PAPR virtual I/O.
+
+/* for KVM_CAP_SPAPR_TCE */
+struct kvm_create_spapr_tce {
+       __u64 liobn;
+       __u32 window_size;
+};
+
+The liobn field gives the logical IO bus number for which to create a
+TCE table.  The window_size field specifies the size of the DMA window
+which this TCE table will translate - the table will contain one
+64-bit TCE entry for every 4 KiB of the DMA window.
+
+When the guest issues an H_PUT_TCE hcall on a liobn for which a TCE
+table has been created using this ioctl(), the kernel will handle it
+in real mode, updating the TCE table.  H_PUT_TCE calls for other
+liobns will cause a vm exit and must be handled by userspace.
+
+The return value is a file descriptor which can be passed to mmap(2)
+to map the created TCE table into userspace.  This lets userspace read
+the entries written by kernel-handled H_PUT_TCE calls, and also lets
+userspace update the TCE table directly which is useful in some
+circumstances.
+
+4.63 KVM_ALLOCATE_RMA
+
+Capability: KVM_CAP_PPC_RMA
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_allocate_rma (out)
+Returns: file descriptor for mapping the allocated RMA
+
+This allocates a Real Mode Area (RMA) from the pool allocated at boot
+time by the kernel.  An RMA is a physically-contiguous, aligned region
+of memory used on older POWER processors to provide the memory which
+will be accessed by real-mode (MMU off) accesses in a KVM guest.
+POWER processors support a set of sizes for the RMA that usually
+includes 64MB, 128MB, 256MB and some larger powers of two.
+
+/* for KVM_ALLOCATE_RMA */
+struct kvm_allocate_rma {
+       __u64 rma_size;
+};
+
+The return value is a file descriptor which can be passed to mmap(2)
+to map the allocated RMA into userspace.  The mapped area can then be
+passed to the KVM_SET_USER_MEMORY_REGION ioctl to establish it as the
+RMA for a virtual machine.  The size of the RMA in bytes (which is
+fixed at host kernel boot time) is returned in the rma_size field of
+the argument structure.
+
+The KVM_CAP_PPC_RMA capability is 1 or 2 if the KVM_ALLOCATE_RMA ioctl
+is supported; 2 if the processor requires all virtual machines to have
+an RMA, or 1 if the processor can use an RMA but doesn't require it,
+because it supports the Virtual RMA (VRMA) facility.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
@@ -1473,6 +1612,23 @@ Userspace can now handle the hypercall and when it's done modify the gprs as
 necessary. Upon guest entry all guest GPRs will then be replaced by the values
 in this struct.
 
+               /* KVM_EXIT_PAPR_HCALL */
+               struct {
+                       __u64 nr;
+                       __u64 ret;
+                       __u64 args[9];
+               } papr_hcall;
+
+This is used on 64-bit PowerPC when emulating a pSeries partition,
+e.g. with the 'pseries' machine type in qemu.  It occurs when the
+guest does a hypercall using the 'sc 1' instruction.  The 'nr' field
+contains the hypercall number (from the guest R3), and 'args' contains
+the arguments (from the guest R4 - R12).  Userspace should put the
+return code in 'ret' and any extra returned values in args[].
+The possible hypercalls are defined in the Power Architecture Platform
+Requirements (PAPR) document available from www.power.org (free
+developer registration required to access it).
+
                /* Fix the size of the union. */
                char padding[256];
        };
index f46aa58..5dc972c 100644 (file)
@@ -165,6 +165,10 @@ Shadow pages contain the following information:
     Contains the value of efer.nxe for which the page is valid.
   role.cr0_wp:
     Contains the value of cr0.wp for which the page is valid.
+  role.smep_andnot_wp:
+    Contains the value of cr4.smep && !cr0.wp for which the page is valid
+    (pages for which this is true are different from other pages; see the
+    treatment of cr0.wp=0 below).
   gfn:
     Either the guest page table containing the translations shadowed by this
     page, or the base page frame for linear translations.  See role.direct.
@@ -317,6 +321,20 @@ on fault type:
 
 (user write faults generate a #PF)
 
+In the first case there is an additional complication if CR4.SMEP is
+enabled: since we've turned the page into a kernel page, the kernel may now
+execute it.  We handle this by also setting spte.nx.  If we get a user
+fetch or read fault, we'll change spte.u=1 and spte.nx=gpte.nx back.
+
+To prevent an spte that was converted into a kernel page with cr0.wp=0
+from being written by the kernel after cr0.wp has changed to 1, we make
+the value of cr0.wp part of the page role.  This means that an spte created
+with one value of cr0.wp cannot be used when cr0.wp has a different value -
+it will simply be missed by the shadow page lookup code.  A similar issue
+exists when an spte created with cr0.wp=0 and cr4.smep=0 is used after
+changing cr4.smep to 1.  To avoid this, the value of !cr0.wp && cr4.smep
+is also made a part of the page role.
+
 Large pages
 ===========
 
index d079aed..5031780 100644 (file)
@@ -185,3 +185,37 @@ MSR_KVM_ASYNC_PF_EN: 0x4b564d02
 
        Currently type 2 APF will be always delivered on the same vcpu as
        type 1 was, but guest should not rely on that.
+
+MSR_KVM_STEAL_TIME: 0x4b564d03
+
+       data: 64-byte aligned physical address of a memory area which must be
+       in guest RAM, plus an enable bit in bit 0. This memory is expected to
+       hold a copy of the following structure:
+
+       struct kvm_steal_time {
+               __u64 steal;
+               __u32 version;
+               __u32 flags;
+               __u32 pad[12];
+       }
+
+       whose data will be filled in by the hypervisor periodically. Only one
+       write, or registration, is needed for each VCPU. The interval between
+       updates of this structure is arbitrary and implementation-dependent.
+       The hypervisor may update this structure at any time it sees fit until
+       anything with bit0 == 0 is written to it. Guest is required to make sure
+       this structure is initialized to zero.
+
+       Fields have the following meanings:
+
+               version: a sequence counter. In other words, guest has to check
+               this field before and after grabbing time information and make
+               sure they are both equal and even. An odd version indicates an
+               in-progress update.
+
+               flags: At this point, always zero. May be used to indicate
+               changes in this structure in the future.
+
+               steal: the amount of time in which this vCPU did not run, in
+               nanoseconds. Time during which the vcpu is idle will not be
+               reported as steal time.
diff --git a/Documentation/virtual/kvm/nested-vmx.txt b/Documentation/virtual/kvm/nested-vmx.txt
new file mode 100644 (file)
index 0000000..8ed937d
--- /dev/null
@@ -0,0 +1,251 @@
+Nested VMX
+==========
+
+Overview
+---------
+
+On Intel processors, KVM uses Intel's VMX (Virtual-Machine eXtensions)
+to easily and efficiently run guest operating systems. Normally, these guests
+*cannot* themselves be hypervisors running their own guests, because in VMX,
+guests cannot use VMX instructions.
+
+The "Nested VMX" feature adds this missing capability - of running guest
+hypervisors (which use VMX) with their own nested guests. It does so by
+allowing a guest to use VMX instructions, and correctly and efficiently
+emulating them using the single level of VMX available in the hardware.
+
+We describe in much greater detail the theory behind the nested VMX feature,
+its implementation and its performance characteristics, in the OSDI 2010 paper
+"The Turtles Project: Design and Implementation of Nested Virtualization",
+available at:
+
+       http://www.usenix.org/events/osdi10/tech/full_papers/Ben-Yehuda.pdf
+
+
+Terminology
+-----------
+
+Single-level virtualization has two levels - the host (KVM) and the guests.
+In nested virtualization, we have three levels: The host (KVM), which we call
+L0, the guest hypervisor, which we call L1, and its nested guest, which we
+call L2.
+
+
+Known limitations
+-----------------
+
+The current code supports running Linux guests under KVM guests.
+Only 64-bit guest hypervisors are supported.
+
+Additional patches for running Windows under guest KVM, and Linux under
+guest VMware server, and support for nested EPT, are currently running in
+the lab, and will be sent as follow-on patchsets.
+
+
+Running nested VMX
+------------------
+
+The nested VMX feature is disabled by default. It can be enabled by giving
+the "nested=1" option to the kvm-intel module.
+
+No modifications are required to user space (qemu). However, qemu's default
+emulated CPU type (qemu64) does not list the "VMX" CPU feature, so it must be
+explicitly enabled, by giving qemu one of the following options:
+
+     -cpu host              (emulated CPU has all features of the real CPU)
+
+     -cpu qemu64,+vmx       (add just the vmx feature to a named CPU type)
+
+
+ABIs
+----
+
+Nested VMX aims to present a standard and (eventually) fully-functional VMX
+implementation for a guest hypervisor to use. As such, the official
+specification of the ABI that it provides is Intel's VMX specification,
+namely volume 3B of their "Intel 64 and IA-32 Architectures Software
+Developer's Manual". Not all of VMX's features are currently fully supported,
+but the goal is to eventually support them all, starting with the VMX features
+which are used in practice by popular hypervisors (KVM and others).
+
+As a VMX implementation, nested VMX presents a VMCS structure to L1.
+As mandated by the spec, other than the two fields revision_id and abort,
+this structure is *opaque* to its user, who is not supposed to know or care
+about its internal structure. Rather, the structure is accessed through the
+VMREAD and VMWRITE instructions.
+Still, for debugging purposes, KVM developers might be interested to know the
+internals of this structure; this is struct vmcs12 from arch/x86/kvm/vmx.c.
+
+The name "vmcs12" refers to the VMCS that L1 builds for L2. In the code we
+also have "vmcs01", the VMCS that L0 built for L1, and "vmcs02" is the VMCS
+which L0 builds to actually run L2 - how this is done is explained in the
+aforementioned paper.
+
+For convenience, we repeat the content of struct vmcs12 here. If the internals
+of this structure change, this can break live migration across KVM versions.
+VMCS12_REVISION (from vmx.c) should be changed if struct vmcs12 or its inner
+struct shadow_vmcs is ever changed.
+
+       typedef u64 natural_width;
+       struct __packed vmcs12 {
+               /* According to the Intel spec, a VMCS region must start with
+                * these two user-visible fields */
+               u32 revision_id;
+               u32 abort;
+
+               u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
+               u32 padding[7]; /* room for future expansion */
+
+               u64 io_bitmap_a;
+               u64 io_bitmap_b;
+               u64 msr_bitmap;
+               u64 vm_exit_msr_store_addr;
+               u64 vm_exit_msr_load_addr;
+               u64 vm_entry_msr_load_addr;
+               u64 tsc_offset;
+               u64 virtual_apic_page_addr;
+               u64 apic_access_addr;
+               u64 ept_pointer;
+               u64 guest_physical_address;
+               u64 vmcs_link_pointer;
+               u64 guest_ia32_debugctl;
+               u64 guest_ia32_pat;
+               u64 guest_ia32_efer;
+               u64 guest_pdptr0;
+               u64 guest_pdptr1;
+               u64 guest_pdptr2;
+               u64 guest_pdptr3;
+               u64 host_ia32_pat;
+               u64 host_ia32_efer;
+               u64 padding64[8]; /* room for future expansion */
+               natural_width cr0_guest_host_mask;
+               natural_width cr4_guest_host_mask;
+               natural_width cr0_read_shadow;
+               natural_width cr4_read_shadow;
+               natural_width cr3_target_value0;
+               natural_width cr3_target_value1;
+               natural_width cr3_target_value2;
+               natural_width cr3_target_value3;
+               natural_width exit_qualification;
+               natural_width guest_linear_address;
+               natural_width guest_cr0;
+               natural_width guest_cr3;
+               natural_width guest_cr4;
+               natural_width guest_es_base;
+               natural_width guest_cs_base;
+               natural_width guest_ss_base;
+               natural_width guest_ds_base;
+               natural_width guest_fs_base;
+               natural_width guest_gs_base;
+               natural_width guest_ldtr_base;
+               natural_width guest_tr_base;
+               natural_width guest_gdtr_base;
+               natural_width guest_idtr_base;
+               natural_width guest_dr7;
+               natural_width guest_rsp;
+               natural_width guest_rip;
+               natural_width guest_rflags;
+               natural_width guest_pending_dbg_exceptions;
+               natural_width guest_sysenter_esp;
+               natural_width guest_sysenter_eip;
+               natural_width host_cr0;
+               natural_width host_cr3;
+               natural_width host_cr4;
+               natural_width host_fs_base;
+               natural_width host_gs_base;
+               natural_width host_tr_base;
+               natural_width host_gdtr_base;
+               natural_width host_idtr_base;
+               natural_width host_ia32_sysenter_esp;
+               natural_width host_ia32_sysenter_eip;
+               natural_width host_rsp;
+               natural_width host_rip;
+               natural_width paddingl[8]; /* room for future expansion */
+               u32 pin_based_vm_exec_control;
+               u32 cpu_based_vm_exec_control;
+               u32 exception_bitmap;
+               u32 page_fault_error_code_mask;
+               u32 page_fault_error_code_match;
+               u32 cr3_target_count;
+               u32 vm_exit_controls;
+               u32 vm_exit_msr_store_count;
+               u32 vm_exit_msr_load_count;
+               u32 vm_entry_controls;
+               u32 vm_entry_msr_load_count;
+               u32 vm_entry_intr_info_field;
+               u32 vm_entry_exception_error_code;
+               u32 vm_entry_instruction_len;
+               u32 tpr_threshold;
+               u32 secondary_vm_exec_control;
+               u32 vm_instruction_error;
+               u32 vm_exit_reason;
+               u32 vm_exit_intr_info;
+               u32 vm_exit_intr_error_code;
+               u32 idt_vectoring_info_field;
+               u32 idt_vectoring_error_code;
+               u32 vm_exit_instruction_len;
+               u32 vmx_instruction_info;
+               u32 guest_es_limit;
+               u32 guest_cs_limit;
+               u32 guest_ss_limit;
+               u32 guest_ds_limit;
+               u32 guest_fs_limit;
+               u32 guest_gs_limit;
+               u32 guest_ldtr_limit;
+               u32 guest_tr_limit;
+               u32 guest_gdtr_limit;
+               u32 guest_idtr_limit;
+               u32 guest_es_ar_bytes;
+               u32 guest_cs_ar_bytes;
+               u32 guest_ss_ar_bytes;
+               u32 guest_ds_ar_bytes;
+               u32 guest_fs_ar_bytes;
+               u32 guest_gs_ar_bytes;
+               u32 guest_ldtr_ar_bytes;
+               u32 guest_tr_ar_bytes;
+               u32 guest_interruptibility_info;
+               u32 guest_activity_state;
+               u32 guest_sysenter_cs;
+               u32 host_ia32_sysenter_cs;
+               u32 padding32[8]; /* room for future expansion */
+               u16 virtual_processor_id;
+               u16 guest_es_selector;
+               u16 guest_cs_selector;
+               u16 guest_ss_selector;
+               u16 guest_ds_selector;
+               u16 guest_fs_selector;
+               u16 guest_gs_selector;
+               u16 guest_ldtr_selector;
+               u16 guest_tr_selector;
+               u16 host_es_selector;
+               u16 host_cs_selector;
+               u16 host_ss_selector;
+               u16 host_ds_selector;
+               u16 host_fs_selector;
+               u16 host_gs_selector;
+               u16 host_tr_selector;
+       };
+
+
+Authors
+-------
+
+These patches were written by:
+     Abel Gordon, abelg <at> il.ibm.com
+     Nadav Har'El, nyh <at> il.ibm.com
+     Orit Wasserman, oritw <at> il.ibm.com
+     Ben-Ami Yassor, benami <at> il.ibm.com
+     Muli Ben-Yehuda, muli <at> il.ibm.com
+
+With contributions by:
+     Anthony Liguori, aliguori <at> us.ibm.com
+     Mike Day, mdday <at> us.ibm.com
+     Michael Factor, factor <at> il.ibm.com
+     Zvi Dubitzky, dubi <at> il.ibm.com
+
+And valuable reviews by:
+     Avi Kivity, avi <at> redhat.com
+     Gleb Natapov, gleb <at> redhat.com
+     Marcelo Tosatti, mtosatti <at> redhat.com
+     Kevin Tian, kevin.tian <at> intel.com
+     and others.
index 3ab969c..2b7ce19 100644 (file)
@@ -68,9 +68,11 @@ page that contains parts of supervisor visible register state. The guest can
 map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE.
 
 With this hypercall issued the guest always gets the magic page mapped at the
-desired location in effective and physical address space. For now, we always
-map the page to -4096. This way we can access it using absolute load and store
-functions. The following instruction reads the first field of the magic page:
+desired location. The first parameter indicates the effective address when the
+MMU is enabled. The second parameter indicates the address in real mode, if
+applicable to the target. For now, we always map the page to -4096. This way we
+can access it using absolute load and store functions. The following
+instruction reads the first field of the magic page:
 
        ld      rX, -4096(0)
 
index 2eb0a98..32551d3 100644 (file)
@@ -281,6 +281,10 @@ paravirt_init_missing_ticks_accounting(int cpu)
                pv_time_ops.init_missing_ticks_accounting(cpu);
 }
 
+struct jump_label_key;
+extern struct jump_label_key paravirt_steal_enabled;
+extern struct jump_label_key paravirt_steal_rq_enabled;
+
 static inline int
 paravirt_do_steal_accounting(unsigned long *new_itm)
 {
index a21d7bb..1008682 100644 (file)
@@ -634,6 +634,8 @@ struct pv_irq_ops pv_irq_ops = {
  * pv_time_ops
  * time operations
  */
+struct jump_label_key paravirt_steal_enabled;
+struct jump_label_key paravirt_steal_rq_enabled;
 
 static int
 ia64_native_do_steal_accounting(unsigned long *new_itm)
index c0d842c..e30442c 100644 (file)
@@ -179,8 +179,9 @@ extern const char *powerpc_base_platform;
 #define LONG_ASM_CONST(x)              0
 #endif
 
-
-#define CPU_FTR_HVMODE_206             LONG_ASM_CONST(0x0000000800000000)
+#define CPU_FTR_HVMODE                 LONG_ASM_CONST(0x0000000200000000)
+#define CPU_FTR_ARCH_201               LONG_ASM_CONST(0x0000000400000000)
+#define CPU_FTR_ARCH_206               LONG_ASM_CONST(0x0000000800000000)
 #define CPU_FTR_CFAR                   LONG_ASM_CONST(0x0000001000000000)
 #define CPU_FTR_IABR                   LONG_ASM_CONST(0x0000002000000000)
 #define CPU_FTR_MMCRA                  LONG_ASM_CONST(0x0000004000000000)
@@ -401,9 +402,10 @@ extern const char *powerpc_base_platform;
            CPU_FTR_MMCRA | CPU_FTR_CP_USE_DCBTZ | \
            CPU_FTR_STCX_CHECKS_ADDRESS)
 #define CPU_FTRS_PPC970        (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
-           CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
+           CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_201 | \
            CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP | CPU_FTR_MMCRA | \
-           CPU_FTR_CP_USE_DCBTZ | CPU_FTR_STCX_CHECKS_ADDRESS)
+           CPU_FTR_CP_USE_DCBTZ | CPU_FTR_STCX_CHECKS_ADDRESS | \
+           CPU_FTR_HVMODE)
 #define CPU_FTRS_POWER5        (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
            CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
            CPU_FTR_MMCRA | CPU_FTR_SMT | \
@@ -417,13 +419,13 @@ extern const char *powerpc_base_platform;
            CPU_FTR_DSCR | CPU_FTR_UNALIGNED_LD_STD | \
            CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_CFAR)
 #define CPU_FTRS_POWER7 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
-           CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_HVMODE_206 |\
+           CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\
            CPU_FTR_MMCRA | CPU_FTR_SMT | \
            CPU_FTR_COHERENT_ICACHE | \
            CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
            CPU_FTR_DSCR | CPU_FTR_SAO  | CPU_FTR_ASYM_SMT | \
            CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
-           CPU_FTR_ICSWX | CPU_FTR_CFAR)
+           CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE)
 #define CPU_FTRS_CELL  (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
            CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
            CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
index f5dfe34..8057f4f 100644 (file)
 #define EXC_HV H
 #define EXC_STD
 
-#define EXCEPTION_PROLOG_1(area)                                       \
+#define __EXCEPTION_PROLOG_1(area, extra, vec)                         \
        GET_PACA(r13);                                                  \
        std     r9,area+EX_R9(r13);     /* save r9 - r12 */             \
        std     r10,area+EX_R10(r13);                                   \
-       std     r11,area+EX_R11(r13);                                   \
-       std     r12,area+EX_R12(r13);                                   \
        BEGIN_FTR_SECTION_NESTED(66);                                   \
        mfspr   r10,SPRN_CFAR;                                          \
        std     r10,area+EX_CFAR(r13);                                  \
        END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);         \
-       GET_SCRATCH0(r9);                                               \
-       std     r9,area+EX_R13(r13);                                    \
-       mfcr    r9
+       mfcr    r9;                                                     \
+       extra(vec);                                                     \
+       std     r11,area+EX_R11(r13);                                   \
+       std     r12,area+EX_R12(r13);                                   \
+       GET_SCRATCH0(r10);                                              \
+       std     r10,area+EX_R13(r13)
+#define EXCEPTION_PROLOG_1(area, extra, vec)                           \
+       __EXCEPTION_PROLOG_1(area, extra, vec)
 
 #define __EXCEPTION_PROLOG_PSERIES_1(label, h)                         \
        ld      r12,PACAKBASE(r13);     /* get high part of &label */   \
        mtspr   SPRN_##h##SRR1,r10;                                     \
        h##rfid;                                                        \
        b       .       /* prevent speculative execution */
-#define EXCEPTION_PROLOG_PSERIES_1(label, h) \
+#define EXCEPTION_PROLOG_PSERIES_1(label, h)                           \
        __EXCEPTION_PROLOG_PSERIES_1(label, h)
 
-#define EXCEPTION_PROLOG_PSERIES(area, label, h)                       \
-       EXCEPTION_PROLOG_1(area);                                       \
+#define EXCEPTION_PROLOG_PSERIES(area, label, h, extra, vec)           \
+       EXCEPTION_PROLOG_1(area, extra, vec);                           \
        EXCEPTION_PROLOG_PSERIES_1(label, h);
 
+#define __KVMTEST(n)   /* go to do_kvm_<n> if HSTATE_IN_GUEST(r13) is non-zero */ \
+       lbz     r10,HSTATE_IN_GUEST(r13);       /* overwrites r10 */    \
+       cmpwi   r10,0;                                                  \
+       bne     do_kvm_##n
+
+#define __KVM_HANDLER(area, h, n)      /* 'h' is accepted but unused here */ \
+do_kvm_##n:                                                            \
+       ld      r10,area+EX_R10(r13);   /* reload r10 from save area */ \
+       stw     r9,HSTATE_SCRATCH1(r13); /* stash r9 in scratch1 */     \
+       ld      r9,area+EX_R9(r13);     /* reload r9 from save area */  \
+       std     r12,HSTATE_SCRATCH0(r13); /* stash r12 in scratch0 */   \
+       li      r12,n;                  /* r12 = n for kvmppc_interrupt */ \
+       b       kvmppc_interrupt
+
+#define __KVM_HANDLER_SKIP(area, h, n)                                 \
+do_kvm_##n:                                                            \
+       cmpwi   r10,KVM_GUEST_MODE_SKIP; /* r10: presumably still the byte loaded by __KVMTEST -- confirm */ \
+       ld      r10,area+EX_R10(r13);   /* reload r10 from save area */ \
+       beq     89f;                    /* skip mode: take the 89: path */ \
+       stw     r9,HSTATE_SCRATCH1(r13); /* otherwise same sequence as __KVM_HANDLER */ \
+       ld      r9,area+EX_R9(r13);                                     \
+       std     r12,HSTATE_SCRATCH0(r13);                       \
+       li      r12,n;                                                  \
+       b       kvmppc_interrupt;                                       \
+89:    mtocrf  0x80,r9;                /* write CR back from r9, mask 0x80 */ \
+       ld      r9,area+EX_R9(r13);     /* reload r9 from save area */  \
+       b       kvmppc_skip_##h##interrupt /* EXC_STD: kvmppc_skip_interrupt, EXC_HV: kvmppc_skip_Hinterrupt */
+
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#define KVMTEST(n)                     __KVMTEST(n)
+#define KVM_HANDLER(area, h, n)                __KVM_HANDLER(area, h, n)
+#define KVM_HANDLER_SKIP(area, h, n)   __KVM_HANDLER_SKIP(area, h, n)
+
+#else  /* !CONFIG_KVM_BOOK3S_64_HANDLER: the three macros expand to nothing */
+#define KVMTEST(n)
+#define KVM_HANDLER(area, h, n)
+#define KVM_HANDLER_SKIP(area, h, n)
+#endif /* CONFIG_KVM_BOOK3S_64_HANDLER */
+
+#ifdef CONFIG_KVM_BOOK3S_PR
+#define KVMTEST_PR(n)                  __KVMTEST(n)
+#define KVM_HANDLER_PR(area, h, n)     __KVM_HANDLER(area, h, n)
+#define KVM_HANDLER_PR_SKIP(area, h, n)        __KVM_HANDLER_SKIP(area, h, n)
+
+#else  /* !CONFIG_KVM_BOOK3S_PR: the three _PR macros expand to nothing */
+#define KVMTEST_PR(n)
+#define KVM_HANDLER_PR(area, h, n)
+#define KVM_HANDLER_PR_SKIP(area, h, n)
+#endif /* CONFIG_KVM_BOOK3S_PR */
+
+#define NOTEST(n)      /* expands to nothing; presumably the 'extra' argument when no test is wanted -- confirm at call sites */
+
 /*
  * The common exception prolog is used for all except a few exceptions
  * such as a segment miss on a kernel address.  We have to be prepared
        .globl label##_pSeries;                         \
 label##_pSeries:                                       \
        HMT_MEDIUM;                                     \
-       DO_KVM  vec;                                    \
        SET_SCRATCH0(r13);              /* save r13 */          \
-       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common, EXC_STD)
+       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,    \
+                                EXC_STD, KVMTEST_PR, vec)
 
 #define STD_EXCEPTION_HV(loc, vec, label)              \
        . = loc;                                        \
        .globl label##_hv;                              \
 label##_hv:                                            \
        HMT_MEDIUM;                                     \
-       DO_KVM  vec;                                    \
-       SET_SCRATCH0(r13);      /* save r13 */          \
-       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common, EXC_HV)
+       SET_SCRATCH0(r13);      /* save r13 */                  \
+       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,    \
+                                EXC_HV, KVMTEST, vec)
 
-#define __MASKABLE_EXCEPTION_PSERIES(vec, label, h)                    \
-       HMT_MEDIUM;                                                     \
-       DO_KVM  vec;                                                    \
-       SET_SCRATCH0(r13);    /* save r13 */                            \
-       GET_PACA(r13);                                                  \
-       std     r9,PACA_EXGEN+EX_R9(r13);       /* save r9, r10 */      \
-       std     r10,PACA_EXGEN+EX_R10(r13);                             \
+#define __SOFTEN_TEST(h)                                               \
        lbz     r10,PACASOFTIRQEN(r13);                                 \
-       mfcr    r9;                                                     \
        cmpwi   r10,0;                                                  \
-       beq     masked_##h##interrupt;                                  \
-       GET_SCRATCH0(r10);                                              \
-       std     r10,PACA_EXGEN+EX_R13(r13);                             \
-       std     r11,PACA_EXGEN+EX_R11(r13);                             \
-       std     r12,PACA_EXGEN+EX_R12(r13);                             \
-       ld      r12,PACAKBASE(r13);     /* get high part of &label */   \
-       ld      r10,PACAKMSR(r13);      /* get MSR value for kernel */  \
-       mfspr   r11,SPRN_##h##SRR0;     /* save SRR0 */                 \
-       LOAD_HANDLER(r12,label##_common)                                \
-       mtspr   SPRN_##h##SRR0,r12;                                     \
-       mfspr   r12,SPRN_##h##SRR1;     /* and SRR1 */                  \
-       mtspr   SPRN_##h##SRR1,r10;                                     \
-       h##rfid;                                                        \
-       b       .       /* prevent speculative execution */
-#define _MASKABLE_EXCEPTION_PSERIES(vec, label, h)                     \
-       __MASKABLE_EXCEPTION_PSERIES(vec, label, h)
+       beq     masked_##h##interrupt
+#define _SOFTEN_TEST(h)        __SOFTEN_TEST(h)
+
+#define SOFTEN_TEST_PR(vec)                                            \
+       KVMTEST_PR(vec);                                                \
+       _SOFTEN_TEST(EXC_STD)   /* EXC_STD is empty: beq masked_interrupt */
+
+#define SOFTEN_TEST_HV(vec)                                            \
+       KVMTEST(vec);                                                   \
+       _SOFTEN_TEST(EXC_HV)    /* EXC_HV is H: beq masked_Hinterrupt */
+
+#define SOFTEN_TEST_HV_201(vec)        /* KVMTEST as in _HV, masked_interrupt as in _PR */ \
+       KVMTEST(vec);                                                   \
+       _SOFTEN_TEST(EXC_STD)
+
+#define __MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)             \
+       HMT_MEDIUM;                                                     \
+       SET_SCRATCH0(r13);    /* save r13 */                            \
+       __EXCEPTION_PROLOG_1(PACA_EXGEN, extra, vec);           \
+       EXCEPTION_PROLOG_PSERIES_1(label##_common, h);
+#define _MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)              \
+       __MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)
 
 #define MASKABLE_EXCEPTION_PSERIES(loc, vec, label)                    \
        . = loc;                                                        \
        .globl label##_pSeries;                                         \
 label##_pSeries:                                                       \
-       _MASKABLE_EXCEPTION_PSERIES(vec, label, EXC_STD)
+       _MASKABLE_EXCEPTION_PSERIES(vec, label,                         \
+                                   EXC_STD, SOFTEN_TEST_PR)
 
 #define MASKABLE_EXCEPTION_HV(loc, vec, label)                         \
        . = loc;                                                        \
        .globl label##_hv;                                              \
 label##_hv:                                                            \
-       _MASKABLE_EXCEPTION_PSERIES(vec, label, EXC_HV)
+       _MASKABLE_EXCEPTION_PSERIES(vec, label,                         \
+                                   EXC_HV, SOFTEN_TEST_HV)
 
 #ifdef CONFIG_PPC_ISERIES
 #define DISABLE_INTS                           \
index fd8201d..1c324ff 100644 (file)
 #define H_LONG_BUSY_ORDER_100_SEC      9905  /* Long busy, hint that 100sec \
                                                 is a good time to retry */
 #define H_LONG_BUSY_END_RANGE          9905  /* End of long busy range */
+
+/* Internal value used in book3s_hv kvm support; not returned to guests */
+#define H_TOO_HARD     9999
+
 #define H_HARDWARE     -1      /* Hardware error */
 #define H_FUNCTION     -2      /* Function not supported */
 #define H_PRIVILEGE    -3      /* Caller not privileged */
 #define H_PAGE_SET_ACTIVE      H_PAGE_STATE_CHANGE
 #define H_AVPN                 (1UL<<(63-32))  /* An avpn is provided as a sanity test */
 #define H_ANDCOND              (1UL<<(63-33))
+#define H_LOCAL                        (1UL<<(63-35))
 #define H_ICACHE_INVALIDATE    (1UL<<(63-40))  /* icbi, etc.  (ignored for IO pages) */
 #define H_ICACHE_SYNCHRONIZE   (1UL<<(63-41))  /* dcbst, icbi, etc. (ignored for IO pages) */
 #define H_COALESCE_CAND        (1UL<<(63-42))  /* page is a good candidate for coalescing */
index d2ca5ed..a4f6c85 100644 (file)
 
 #include <linux/types.h>
 
+/* Select powerpc specific features in <linux/kvm.h> */
+#define __KVM_HAVE_SPAPR_TCE
+#define __KVM_HAVE_PPC_SMT
+
 struct kvm_regs {
        __u64 pc;
        __u64 cr;
@@ -272,4 +276,15 @@ struct kvm_guest_debug_arch {
 #define KVM_INTERRUPT_UNSET    -2U
 #define KVM_INTERRUPT_SET_LEVEL        -3U
 
+/* for KVM_CAP_SPAPR_TCE */
+struct kvm_create_spapr_tce {
+       __u64 liobn;
+       __u32 window_size;      /* NOTE(review): no explicit pad follows this 32-bit member; the struct may carry trailing padding where __u64 is 8-byte aligned -- confirm */
+};
+
+/* for KVM_ALLOCATE_RMA */
+struct kvm_allocate_rma {
+       __u64 rma_size;
+};
+
 #endif /* __LINUX_KVM_POWERPC_H */
index 0951b17..7b1f0e0 100644 (file)
 #define BOOK3S_INTERRUPT_PROGRAM       0x700
 #define BOOK3S_INTERRUPT_FP_UNAVAIL    0x800
 #define BOOK3S_INTERRUPT_DECREMENTER   0x900
+#define BOOK3S_INTERRUPT_HV_DECREMENTER        0x980
 #define BOOK3S_INTERRUPT_SYSCALL       0xc00
 #define BOOK3S_INTERRUPT_TRACE         0xd00
+#define BOOK3S_INTERRUPT_H_DATA_STORAGE        0xe00
+#define BOOK3S_INTERRUPT_H_INST_STORAGE        0xe20
+#define BOOK3S_INTERRUPT_H_EMUL_ASSIST 0xe40
 #define BOOK3S_INTERRUPT_PERFMON       0xf00
 #define BOOK3S_INTERRUPT_ALTIVEC       0xf20
 #define BOOK3S_INTERRUPT_VSX           0xf40
index d62e703..98da010 100644 (file)
 #include <linux/kvm_host.h>
 #include <asm/kvm_book3s_asm.h>
 
-struct kvmppc_slb {
-       u64 esid;
-       u64 vsid;
-       u64 orige;
-       u64 origv;
-       bool valid      : 1;
-       bool Ks         : 1;
-       bool Kp         : 1;
-       bool nx         : 1;
-       bool large      : 1;    /* PTEs are 16MB */
-       bool tb         : 1;    /* 1TB segment */
-       bool class      : 1;
-};
-
 struct kvmppc_bat {
        u64 raw;
        u32 bepi;
@@ -67,11 +53,22 @@ struct kvmppc_sid_map {
 #define VSID_POOL_SIZE (SID_CONTEXTS * 16)
 #endif
 
+struct hpte_cache {
+       struct hlist_node list_pte;
+       struct hlist_node list_pte_long;
+       struct hlist_node list_vpte;
+       struct hlist_node list_vpte_long;
+       struct rcu_head rcu_head;
+       u64 host_va;
+       u64 pfn;
+       ulong slot;
+       struct kvmppc_pte pte;
+};
+
 struct kvmppc_vcpu_book3s {
        struct kvm_vcpu vcpu;
        struct kvmppc_book3s_shadow_vcpu *shadow_vcpu;
        struct kvmppc_sid_map sid_map[SID_MAP_NUM];
-       struct kvmppc_slb slb[64];
        struct {
                u64 esid;
                u64 vsid;
@@ -81,7 +78,6 @@ struct kvmppc_vcpu_book3s {
        struct kvmppc_bat dbat[8];
        u64 hid[6];
        u64 gqr[8];
-       int slb_nr;
        u64 sdr1;
        u64 hior;
        u64 msr_mask;
@@ -93,7 +89,13 @@ struct kvmppc_vcpu_book3s {
        u64 vsid_max;
 #endif
        int context_id[SID_CONTEXTS];
-       ulong prog_flags; /* flags to inject when giving a 700 trap */
+
+       struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
+       struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
+       struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
+       struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
+       int hpte_cache_count;
+       spinlock_t mmu_lock;
 };
 
 #define CONTEXT_HOST           0
@@ -110,8 +112,10 @@ extern void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong ea, ulong ea_mask)
 extern void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 vp, u64 vp_mask);
 extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end);
 extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr);
+extern void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr);
 extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
@@ -123,19 +127,22 @@ extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
 extern int kvmppc_mmu_hpte_sysinit(void);
 extern void kvmppc_mmu_hpte_sysexit(void);
+extern int kvmppc_mmu_hv_init(void);
 
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
 extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
                           bool upper, u32 val);
 extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
 extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
-extern ulong kvmppc_trampoline_lowmem;
-extern ulong kvmppc_trampoline_enter;
+extern void kvmppc_handler_lowmem_trampoline(void);
+extern void kvmppc_handler_trampoline_enter(void);
 extern void kvmppc_rmcall(ulong srr0, ulong srr1);
+extern void kvmppc_hv_entry_trampoline(void);
 extern void kvmppc_load_up_fpu(void);
 extern void kvmppc_load_up_altivec(void);
 extern void kvmppc_load_up_vsx(void);
@@ -147,15 +154,32 @@ static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
        return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu);
 }
 
-static inline ulong dsisr(void)
+extern void kvm_return_point(void);
+
+/* Also add subarch specific defines */
+
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+#include <asm/kvm_book3s_32.h>
+#endif
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#include <asm/kvm_book3s_64.h>
+#endif
+
+#ifdef CONFIG_KVM_BOOK3S_PR
+
+static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
 {
-       ulong r;
-       asm ( "mfdsisr %0 " : "=r" (r) );
-       return r;
+       return to_book3s(vcpu)->hior;
 }
 
-extern void kvm_return_point(void);
-static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu);
+static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
+                       unsigned long pending_now, unsigned long old_pending)
+{      /* Mirror the pending state into vcpu->arch.shared->int_pending. */
+       if (pending_now)
+               vcpu->arch.shared->int_pending = 1;
+       else if (old_pending)   /* nothing pending now, but something was */
+               vcpu->arch.shared->int_pending = 0;     /* when both arguments are zero the field is not written */
+}
 
 static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
 {
@@ -244,6 +268,120 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
        return to_svcpu(vcpu)->fault_dar;
 }
 
+static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
+{      /* True when shared->critical equals guest GPR 1 and MSR_PR is clear. */
+       ulong crit_raw = vcpu->arch.shared->critical;
+       ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
+       bool crit;
+
+       /* Truncate crit indicators in 32 bit mode: when MSR_SF is clear
+        * only the low 32 bits of both values take part in the compare */
+       if (!(vcpu->arch.shared->msr & MSR_SF)) {
+               crit_raw &= 0xffffffff;
+               crit_r1 &= 0xffffffff;
+       }
+
+       /* Critical section when crit == r1 */
+       crit = (crit_raw == crit_r1);
+       /* ... and we're in supervisor mode (MSR_PR not set) */
+       crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
+
+       return crit;
+}
+#else /* CONFIG_KVM_BOOK3S_PR */
+
+static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
+static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
+                       unsigned long pending_now, unsigned long old_pending)
+{
+}
+
+static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
+{
+       vcpu->arch.gpr[num] = val;
+}
+
+static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
+{
+       return vcpu->arch.gpr[num];
+}
+
+static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
+{
+       vcpu->arch.cr = val;
+}
+
+static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.cr;
+}
+
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+{
+       vcpu->arch.xer = val;
+}
+
+static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.xer;
+}
+
+static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
+{
+       vcpu->arch.ctr = val;
+}
+
+static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.ctr;
+}
+
+static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
+{
+       vcpu->arch.lr = val;
+}
+
+static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.lr;
+}
+
+static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
+{
+       vcpu->arch.pc = val;
+}
+
+static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.pc;
+}
+
+static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
+{
+       ulong pc = kvmppc_get_pc(vcpu);
+
+       /* Load the instruction manually if it failed to do so in the
+        * exit path.  The load is sizeof(u32) bytes at the value from
+        * kvmppc_get_pc(), with data == false.  kvmppc_ld()'s return
+        * value is not checked, so last_inst is returned as it stands
+        * after the call. */
+       if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED)
+               kvmppc_ld(vcpu, &pc, sizeof(u32), &vcpu->arch.last_inst, false);
+
+       return vcpu->arch.last_inst;
+}
+
+static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.fault_dar;
+}
+
+static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
+{
+       return false;
+}
+#endif
+
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
 #define OSI_SC_MAGIC_R3                        0x113724FA
@@ -251,12 +389,4 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 
 #define INS_DCBZ                       0x7c0007ec
 
-/* Also add subarch specific defines */
-
-#ifdef CONFIG_PPC_BOOK3S_32
-#include <asm/kvm_book3s_32.h>
-#else
-#include <asm/kvm_book3s_64.h>
-#endif
-
 #endif /* __ASM_KVM_BOOK3S_H__ */
index 4cadd61..e43fe42 100644 (file)
 #ifndef __ASM_KVM_BOOK3S_64_H__
 #define __ASM_KVM_BOOK3S_64_H__
 
+#ifdef CONFIG_KVM_BOOK3S_PR
 static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
 {
        return &get_paca()->shadow_vcpu;
 }
+#endif
+
+#define SPAPR_TCE_SHIFT                12
 
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
index d5a8a38..ef7b368 100644 (file)
@@ -60,6 +60,36 @@ kvmppc_resume_\intno:
 
 #else  /*__ASSEMBLY__ */
 
+/*
+ * This struct goes in the PACA on 64-bit processors.  It is used
+ * to store host state that needs to be saved when we enter a guest
+ * and restored when we exit, but isn't specific to any particular
+ * guest or vcpu.  It also has some scratch fields used by the guest
+ * exit code.
+ *
+ * Under CONFIG_PPC_BOOK3S_32 it is instead embedded as the 'hstate'
+ * member of struct kvmppc_book3s_shadow_vcpu.
+ *
+ * The members from kvm_vcpu down to dec_expires are only present when
+ * CONFIG_KVM_BOOK3S_64_HV is set.
+ */
+struct kvmppc_host_state {
+       ulong host_r1;
+       ulong host_r2;
+       ulong host_msr;
+       ulong vmhandler;
+       ulong scratch0; /* presumably HSTATE_SCRATCH0 in the asm -- confirm in asm-offsets.c */
+       ulong scratch1; /* presumably HSTATE_SCRATCH1 in the asm -- confirm in asm-offsets.c */
+       u8 in_guest;    /* presumably HSTATE_IN_GUEST in the asm -- confirm in asm-offsets.c */
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+       struct kvm_vcpu *kvm_vcpu;
+       struct kvmppc_vcore *kvm_vcore;
+       unsigned long xics_phys;
+       u64 dabr;
+       u64 host_mmcr[3];
+       u32 host_pmc[8];
+       u64 host_purr;
+       u64 host_spurr;
+       u64 host_dscr;
+       u64 dec_expires;
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+};
+
 struct kvmppc_book3s_shadow_vcpu {
        ulong gpr[14];
        u32 cr;
@@ -73,17 +103,12 @@ struct kvmppc_book3s_shadow_vcpu {
        ulong shadow_srr1;
        ulong fault_dar;
 
-       ulong host_r1;
-       ulong host_r2;
-       ulong handler;
-       ulong scratch0;
-       ulong scratch1;
-       ulong vmhandler;
-       u8 in_guest;
-
 #ifdef CONFIG_PPC_BOOK3S_32
        u32     sr[16];                 /* Guest SRs */
+
+       struct kvmppc_host_state hstate;
 #endif
+
 #ifdef CONFIG_PPC_BOOK3S_64
        u8 slb_max;                     /* highest used guest slb entry */
        struct  {
index 9c9ba3d..a90e091 100644 (file)
@@ -93,4 +93,8 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
        return vcpu->arch.fault_dear;
 }
 
+static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.shared->msr;
+}
 #endif /* __ASM_KVM_BOOKE_H__ */
index 7a2a565..adbfca9 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved.
+ * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
  *
  * Author: Yu Liu, <yu.liu@freescale.com>
  *
@@ -29,17 +29,25 @@ struct tlbe{
        u32 mas7;
 };
 
+#define E500_TLB_VALID 1
+#define E500_TLB_DIRTY 2
+
+struct tlbe_priv {
+       pfn_t pfn;
+       unsigned int flags; /* E500_TLB_* */
+};
+
+struct vcpu_id_table;
+
 struct kvmppc_vcpu_e500 {
        /* Unmodified copy of the guest's TLB. */
-       struct tlbe *guest_tlb[E500_TLB_NUM];
-       /* TLB that's actually used when the guest is running. */
-       struct tlbe *shadow_tlb[E500_TLB_NUM];
-       /* Pages which are referenced in the shadow TLB. */
-       struct page **shadow_pages[E500_TLB_NUM];
+       struct tlbe *gtlb_arch[E500_TLB_NUM];
 
-       unsigned int guest_tlb_size[E500_TLB_NUM];
-       unsigned int shadow_tlb_size[E500_TLB_NUM];
-       unsigned int guest_tlb_nv[E500_TLB_NUM];
+       /* KVM internal information associated with each guest TLB entry */
+       struct tlbe_priv *gtlb_priv[E500_TLB_NUM];
+
+       unsigned int gtlb_size[E500_TLB_NUM];
+       unsigned int gtlb_nv[E500_TLB_NUM];
 
        u32 host_pid[E500_PID_NUM];
        u32 pid[E500_PID_NUM];
@@ -53,6 +61,10 @@ struct kvmppc_vcpu_e500 {
        u32 mas5;
        u32 mas6;
        u32 mas7;
+
+       /* vcpu id table */
+       struct vcpu_id_table *idt;
+
        u32 l1csr0;
        u32 l1csr1;
        u32 hid0;
index 186f150..cc22b28 100644 (file)
 #include <linux/interrupt.h>
 #include <linux/types.h>
 #include <linux/kvm_types.h>
+#include <linux/threads.h>
+#include <linux/spinlock.h>
 #include <linux/kvm_para.h>
+#include <linux/list.h>
+#include <linux/atomic.h>
 #include <asm/kvm_asm.h>
+#include <asm/processor.h>
 
-#define KVM_MAX_VCPUS 1
+#define KVM_MAX_VCPUS          NR_CPUS
+#define KVM_MAX_VCORES         NR_CPUS
 #define KVM_MEMORY_SLOTS 32
 /* memory slots that are not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
 
+#ifdef CONFIG_KVM_MMIO
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+#endif
 
 /* We don't currently support large pages. */
 #define KVM_HPAGE_GFN_SHIFT(x) 0
@@ -57,6 +65,10 @@ struct kvm;
 struct kvm_run;
 struct kvm_vcpu;
 
+struct lppaca;
+struct slb_shadow;
+struct dtl;
+
 struct kvm_vm_stat {
        u32 remote_tlb_flush;
 };
@@ -133,9 +145,74 @@ struct kvmppc_exit_timing {
        };
 };
 
+struct kvmppc_pginfo {
+       unsigned long pfn;
+       atomic_t refcnt;
+};
+
+struct kvmppc_spapr_tce_table {
+       struct list_head list;
+       struct kvm *kvm;
+       u64 liobn;
+       u32 window_size;
+       struct page *pages[0];  /* zero-length trailing array: its elements are not counted in sizeof(struct kvmppc_spapr_tce_table) */
+};
+
+struct kvmppc_rma_info {
+       void            *base_virt;
+       unsigned long    base_pfn;
+       unsigned long    npages;
+       struct list_head list;
+       atomic_t         use_count;
+};
+
 struct kvm_arch {
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+       unsigned long hpt_virt;
+       unsigned long ram_npages;
+       unsigned long ram_psize;
+       unsigned long ram_porder;
+       struct kvmppc_pginfo *ram_pginfo;
+       unsigned int lpid;
+       unsigned int host_lpid;
+       unsigned long host_lpcr;
+       unsigned long sdr1;
+       unsigned long host_sdr1;
+       int tlbie_lock;
+       int n_rma_pages;
+       unsigned long lpcr;
+       unsigned long rmor;
+       struct kvmppc_rma_info *rma;
+       struct list_head spapr_tce_tables;
+       unsigned short last_vcpu[NR_CPUS];
+       struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
 };
 
+/*
+ * Struct for a virtual core.
+ * Note: entry_exit_count combines an entry count in the bottom 8 bits
+ * and an exit count in the next 8 bits.  This is so that we can
+ * atomically increment the entry count iff the exit count is 0
+ * without taking the lock.
+ *
+ * VCORE_ENTRY_COUNT() masks with 0xff.  VCORE_EXIT_COUNT() only shifts
+ * right by 8 and applies no mask, so it yields every bit above the
+ * entry count rather than exactly the next 8.
+ */
+struct kvmppc_vcore {
+       int n_runnable;
+       int n_blocked;
+       int num_threads;
+       int entry_exit_count;
+       int n_woken;
+       int nap_count;
+       u16 pcpu;
+       u8 vcore_running;
+       u8 in_guest;
+       struct list_head runnable_threads;
+       spinlock_t lock;
+};
+
+#define VCORE_ENTRY_COUNT(vc)  ((vc)->entry_exit_count & 0xff)
+#define VCORE_EXIT_COUNT(vc)   ((vc)->entry_exit_count >> 8)
+
 struct kvmppc_pte {
        ulong eaddr;
        u64 vpage;
@@ -163,16 +240,18 @@ struct kvmppc_mmu {
        bool (*is_dcbz32)(struct kvm_vcpu *vcpu);
 };
 
-struct hpte_cache {
-       struct hlist_node list_pte;
-       struct hlist_node list_pte_long;
-       struct hlist_node list_vpte;
-       struct hlist_node list_vpte_long;
-       struct rcu_head rcu_head;
-       u64 host_va;
-       u64 pfn;
-       ulong slot;
-       struct kvmppc_pte pte;
+struct kvmppc_slb {
+       u64 esid;
+       u64 vsid;
+       u64 orige;
+       u64 origv;
+       bool valid      : 1;
+       bool Ks         : 1;
+       bool Kp         : 1;
+       bool nx         : 1;
+       bool large      : 1;    /* PTEs are 16MB */
+       bool tb         : 1;    /* 1TB segment */
+       bool class      : 1;
 };
 
 struct kvm_vcpu_arch {
@@ -187,6 +266,9 @@ struct kvm_vcpu_arch {
        ulong highmem_handler;
        ulong rmcall;
        ulong host_paca_phys;
+       struct kvmppc_slb slb[64];
+       int slb_max;            /* 1 + index of last valid entry in slb[] */
+       int slb_nr;             /* total number of entries in SLB */
        struct kvmppc_mmu mmu;
 #endif
 
@@ -195,13 +277,19 @@ struct kvm_vcpu_arch {
        u64 fpr[32];
        u64 fpscr;
 
+#ifdef CONFIG_SPE
+       ulong evr[32];
+       ulong spefscr;
+       ulong host_spefscr;
+       u64 acc;
+#endif
 #ifdef CONFIG_ALTIVEC
        vector128 vr[32];
        vector128 vscr;
 #endif
 
 #ifdef CONFIG_VSX
-       u64 vsr[32];
+       u64 vsr[64];
 #endif
 
 #ifdef CONFIG_PPC_BOOK3S
@@ -209,22 +297,27 @@ struct kvm_vcpu_arch {
        u32 qpr[32];
 #endif
 
-#ifdef CONFIG_BOOKE
        ulong pc;
        ulong ctr;
        ulong lr;
 
        ulong xer;
        u32 cr;
-#endif
 
 #ifdef CONFIG_PPC_BOOK3S
-       ulong shadow_msr;
        ulong hflags;
        ulong guest_owned_ext;
+       ulong purr;
+       ulong spurr;
+       ulong dscr;
+       ulong amr;
+       ulong uamor;
+       u32 ctrl;
+       ulong dabr;
 #endif
        u32 vrsave; /* also USPRG0 */
        u32 mmucr;
+       ulong shadow_msr;
        ulong sprg4;
        ulong sprg5;
        ulong sprg6;
@@ -249,6 +342,7 @@ struct kvm_vcpu_arch {
        u32 pvr;
 
        u32 shadow_pid;
+       u32 shadow_pid1;
        u32 pid;
        u32 swap_pid;
 
@@ -258,6 +352,9 @@ struct kvm_vcpu_arch {
        u32 dbcr1;
        u32 dbsr;
 
+       u64 mmcr[3];
+       u32 pmc[8];
+
 #ifdef CONFIG_KVM_EXIT_TIMING
        struct mutex exit_timing_lock;
        struct kvmppc_exit_timing timing_exit;
@@ -272,8 +369,12 @@ struct kvm_vcpu_arch {
        struct dentry *debugfs_exit_timing;
 #endif
 
+#ifdef CONFIG_PPC_BOOK3S
+       ulong fault_dar;
+       u32 fault_dsisr;
+#endif
+
 #ifdef CONFIG_BOOKE
-       u32 last_inst;
        ulong fault_dear;
        ulong fault_esr;
        ulong queued_dear;
@@ -288,25 +389,47 @@ struct kvm_vcpu_arch {
        u8 dcr_is_write;
        u8 osi_needed;
        u8 osi_enabled;
+       u8 hcall_needed;
 
        u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
 
        struct hrtimer dec_timer;
        struct tasklet_struct tasklet;
        u64 dec_jiffies;
+       u64 dec_expires;
        unsigned long pending_exceptions;
+       u16 last_cpu;
+       u8 ceded;
+       u8 prodded;
+       u32 last_inst;
+
+       struct lppaca *vpa;
+       struct slb_shadow *slb_shadow;
+       struct dtl *dtl;
+       struct dtl *dtl_end;
+
+       struct kvmppc_vcore *vcore;
+       int ret;
+       int trap;
+       int state;
+       int ptid;
+       wait_queue_head_t cpu_run;
+
        struct kvm_vcpu_arch_shared *shared;
        unsigned long magic_page_pa; /* phys addr to map the magic page to */
        unsigned long magic_page_ea; /* effect. addr to map the magic page to */
 
-#ifdef CONFIG_PPC_BOOK3S
-       struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
-       struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
-       struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
-       struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
-       int hpte_cache_count;
-       spinlock_t mmu_lock;
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+       struct kvm_vcpu_arch_shared shregs;
+
+       struct list_head run_list;
+       struct task_struct *run_task;
+       struct kvm_run *kvm_run;
 #endif
 };
 
+#define KVMPPC_VCPU_BUSY_IN_HOST       0
+#define KVMPPC_VCPU_BLOCKED            1
+#define KVMPPC_VCPU_RUNNABLE           2
+
 #endif /* __POWERPC_KVM_HOST_H__ */
index 9345238..d121f49 100644 (file)
@@ -33,6 +33,9 @@
 #else
 #include <asm/kvm_booke.h>
 #endif
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#include <asm/paca.h>
+#endif
 
 enum emulation_result {
        EMULATE_DONE,         /* no further processing */
@@ -42,6 +45,7 @@ enum emulation_result {
        EMULATE_AGAIN,        /* something went wrong. go again */
 };
 
+extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 extern char kvmppc_handlers_start[];
 extern unsigned long kvmppc_handler_len;
@@ -109,6 +113,27 @@ extern void kvmppc_booke_exit(void);
 
 extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
 extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
+extern void kvmppc_map_magic(struct kvm_vcpu *vcpu);
+
+extern long kvmppc_alloc_hpt(struct kvm *kvm);
+extern void kvmppc_free_hpt(struct kvm *kvm);
+extern long kvmppc_prepare_vrma(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem);
+extern void kvmppc_map_vrma(struct kvm *kvm,
+                           struct kvm_userspace_memory_region *mem);
+extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
+                               struct kvm_create_spapr_tce *args);
+extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
+                               struct kvm_allocate_rma *rma);
+extern struct kvmppc_rma_info *kvm_alloc_rma(void);
+extern void kvm_release_rma(struct kvmppc_rma_info *ri);
+extern int kvmppc_core_init_vm(struct kvm *kvm);
+extern void kvmppc_core_destroy_vm(struct kvm *kvm);
+extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem);
+extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem);
 
 /*
  * Cuts out inst bits with ordering according to spec.
@@ -151,4 +176,20 @@ int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 
 void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
+{
+       paca[cpu].kvm_hstate.xics_phys = addr;
+}
+
+extern void kvm_rma_init(void);
+
+#else
+static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
+{}
+
+static inline void kvm_rma_init(void)
+{}
+#endif
+
 #endif /* __POWERPC_KVM_PPC_H__ */
index d865bd9..b445e0a 100644 (file)
@@ -90,13 +90,19 @@ extern char initial_stab[];
 
 #define HPTE_R_PP0             ASM_CONST(0x8000000000000000)
 #define HPTE_R_TS              ASM_CONST(0x4000000000000000)
+#define HPTE_R_KEY_HI          ASM_CONST(0x3000000000000000)
 #define HPTE_R_RPN_SHIFT       12
-#define HPTE_R_RPN             ASM_CONST(0x3ffffffffffff000)
-#define HPTE_R_FLAGS           ASM_CONST(0x00000000000003ff)
+#define HPTE_R_RPN             ASM_CONST(0x0ffffffffffff000)
 #define HPTE_R_PP              ASM_CONST(0x0000000000000003)
 #define HPTE_R_N               ASM_CONST(0x0000000000000004)
+#define HPTE_R_G               ASM_CONST(0x0000000000000008)
+#define HPTE_R_M               ASM_CONST(0x0000000000000010)
+#define HPTE_R_I               ASM_CONST(0x0000000000000020)
+#define HPTE_R_W               ASM_CONST(0x0000000000000040)
+#define HPTE_R_WIMG            ASM_CONST(0x0000000000000078)
 #define HPTE_R_C               ASM_CONST(0x0000000000000080)
 #define HPTE_R_R               ASM_CONST(0x0000000000000100)
+#define HPTE_R_KEY_LO          ASM_CONST(0x0000000000000e00)
 
 #define HPTE_V_1TB_SEG         ASM_CONST(0x4000000000000000)
 #define HPTE_V_VRMA_MASK       ASM_CONST(0x4001ffffff000000)
index 7412676..a6da128 100644 (file)
@@ -147,8 +147,11 @@ struct paca_struct {
        struct dtl_entry *dtl_curr;     /* pointer corresponding to dtl_ridx */
 
 #ifdef CONFIG_KVM_BOOK3S_HANDLER
+#ifdef CONFIG_KVM_BOOK3S_PR
        /* We use this to store guest state in */
        struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
+#endif
+       struct kvmppc_host_state kvm_hstate;
 #endif
 };
 
index 1b42238..368f72f 100644 (file)
@@ -150,18 +150,22 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
 #define REST_16VSRSU(n,b,base) REST_8VSRSU(n,b,base); REST_8VSRSU(n+8,b,base)
 #define REST_32VSRSU(n,b,base) REST_16VSRSU(n,b,base); REST_16VSRSU(n+16,b,base)
 
-#define SAVE_EVR(n,s,base)     evmergehi s,s,n; stw s,THREAD_EVR0+4*(n)(base)
-#define SAVE_2EVRS(n,s,base)   SAVE_EVR(n,s,base); SAVE_EVR(n+1,s,base)
-#define SAVE_4EVRS(n,s,base)   SAVE_2EVRS(n,s,base); SAVE_2EVRS(n+2,s,base)
-#define SAVE_8EVRS(n,s,base)   SAVE_4EVRS(n,s,base); SAVE_4EVRS(n+4,s,base)
-#define SAVE_16EVRS(n,s,base)  SAVE_8EVRS(n,s,base); SAVE_8EVRS(n+8,s,base)
-#define SAVE_32EVRS(n,s,base)  SAVE_16EVRS(n,s,base); SAVE_16EVRS(n+16,s,base)
-#define REST_EVR(n,s,base)     lwz s,THREAD_EVR0+4*(n)(base); evmergelo n,s,n
-#define REST_2EVRS(n,s,base)   REST_EVR(n,s,base); REST_EVR(n+1,s,base)
-#define REST_4EVRS(n,s,base)   REST_2EVRS(n,s,base); REST_2EVRS(n+2,s,base)
-#define REST_8EVRS(n,s,base)   REST_4EVRS(n,s,base); REST_4EVRS(n+4,s,base)
-#define REST_16EVRS(n,s,base)  REST_8EVRS(n,s,base); REST_8EVRS(n+8,s,base)
-#define REST_32EVRS(n,s,base)  REST_16EVRS(n,s,base); REST_16EVRS(n+16,s,base)
+/*
+ * b = base register for addressing, o = base offset from register of 1st EVR
+ * n = first EVR, s = scratch
+ */
+#define SAVE_EVR(n,s,b,o)      evmergehi s,s,n; stw s,o+4*(n)(b)
+#define SAVE_2EVRS(n,s,b,o)    SAVE_EVR(n,s,b,o); SAVE_EVR(n+1,s,b,o)
+#define SAVE_4EVRS(n,s,b,o)    SAVE_2EVRS(n,s,b,o); SAVE_2EVRS(n+2,s,b,o)
+#define SAVE_8EVRS(n,s,b,o)    SAVE_4EVRS(n,s,b,o); SAVE_4EVRS(n+4,s,b,o)
+#define SAVE_16EVRS(n,s,b,o)   SAVE_8EVRS(n,s,b,o); SAVE_8EVRS(n+8,s,b,o)
+#define SAVE_32EVRS(n,s,b,o)   SAVE_16EVRS(n,s,b,o); SAVE_16EVRS(n+16,s,b,o)
+#define REST_EVR(n,s,b,o)      lwz s,o+4*(n)(b); evmergelo n,s,n
+#define REST_2EVRS(n,s,b,o)    REST_EVR(n,s,b,o); REST_EVR(n+1,s,b,o)
+#define REST_4EVRS(n,s,b,o)    REST_2EVRS(n,s,b,o); REST_2EVRS(n+2,s,b,o)
+#define REST_8EVRS(n,s,b,o)    REST_4EVRS(n,s,b,o); REST_4EVRS(n+4,s,b,o)
+#define REST_16EVRS(n,s,b,o)   REST_8EVRS(n,s,b,o); REST_8EVRS(n+8,s,b,o)
+#define REST_32EVRS(n,s,b,o)   REST_16EVRS(n,s,b,o); REST_16EVRS(n+16,s,b,o)
 
 /* Macros to adjust thread priority for hardware multithreading */
 #define HMT_VERY_LOW   or      31,31,31        # very low priority
index c5cae0d..ddbe57a 100644 (file)
 #define SPRN_CTR       0x009   /* Count Register */
 #define SPRN_DSCR      0x11
 #define SPRN_CFAR      0x1c    /* Come From Address Register */
+#define SPRN_AMR       0x1d    /* Authority Mask Register */
+#define SPRN_UAMOR     0x9d    /* User Authority Mask Override Register */
+#define SPRN_AMOR      0x15d   /* Authority Mask Override Register */
 #define SPRN_ACOP      0x1F    /* Available Coprocessor Register */
 #define SPRN_CTRLF     0x088
 #define SPRN_CTRLT     0x098
 #define   LPCR_VPM0    (1ul << (63-0))
 #define   LPCR_VPM1    (1ul << (63-1))
 #define   LPCR_ISL     (1ul << (63-2))
+#define   LPCR_VC_SH   (63-2)
 #define   LPCR_DPFD_SH (63-11)
 #define   LPCR_VRMA_L  (1ul << (63-12))
 #define   LPCR_VRMA_LP0        (1ul << (63-15))
 #define   LPCR_VRMA_LP1        (1ul << (63-16))
+#define   LPCR_VRMASD_SH (63-16)
 #define   LPCR_RMLS    0x1C000000      /* impl dependent rmo limit sel */
+#define          LPCR_RMLS_SH  (63-37)
 #define   LPCR_ILE     0x02000000      /* !HV irqs set MSR:LE */
 #define   LPCR_PECE    0x00007000      /* powersave exit cause enable */
 #define     LPCR_PECE0 0x00004000      /* ext. exceptions can cause exit */
 #define     LPCR_PECE1 0x00002000      /* decrementer can cause exit */
 #define     LPCR_PECE2 0x00001000      /* machine check etc can cause exit */
 #define   LPCR_MER     0x00000800      /* Mediated External Exception */
+#define   LPCR_LPES    0x0000000c
 #define   LPCR_LPES0   0x00000008      /* LPAR Env selector 0 */
 #define   LPCR_LPES1   0x00000004      /* LPAR Env selector 1 */
+#define   LPCR_LPES_SH 2
 #define   LPCR_RMI     0x00000002      /* real mode is cache inhibit */
 #define   LPCR_HDICE   0x00000001      /* Hyp Decr enable (HV,PR,EE) */
 #define SPRN_LPID      0x13F   /* Logical Partition Identifier */
+#define   LPID_RSVD    0x3ff           /* Reserved LPID for partn switching */
 #define        SPRN_HMER       0x150   /* Hardware m? error recovery */
 #define        SPRN_HMEER      0x151   /* Hardware m? enable error recovery */
 #define        SPRN_HEIR       0x153   /* Hypervisor Emulated Instruction Register */
 #define SPRN_HASH1     0x3D2           /* Primary Hash Address Register */
 #define SPRN_HASH2     0x3D3           /* Secondary Hash Address Resgister */
 #define SPRN_HID0      0x3F0           /* Hardware Implementation Register 0 */
+#define HID0_HDICE_SH  (63 - 23)       /* 970 HDEC interrupt enable */
 #define HID0_EMCP      (1<<31)         /* Enable Machine Check pin */
 #define HID0_EBA       (1<<29)         /* Enable Bus Address Parity */
 #define HID0_EBD       (1<<28)         /* Enable Bus Data Parity */
 #define SPRN_IABR2     0x3FA           /* 83xx */
 #define SPRN_IBCR      0x135           /* 83xx Insn Breakpoint Control Reg */
 #define SPRN_HID4      0x3F4           /* 970 HID4 */
+#define  HID4_LPES0     (1ul << (63-0)) /* LPAR env. sel. bit 0 */
+#define         HID4_RMLS2_SH   (63 - 2)       /* Real mode limit bottom 2 bits */
+#define         HID4_LPID5_SH   (63 - 6)       /* partition ID bottom 4 bits */
+#define         HID4_RMOR_SH    (63 - 22)      /* real mode offset (16 bits) */
+#define  HID4_LPES1     (1 << (63-57)) /* LPAR env. sel. bit 1 */
+#define  HID4_RMLS0_SH  (63 - 58)      /* Real mode limit top bit */
+#define         HID4_LPID1_SH   0              /* partition ID top 2 bits */
 #define SPRN_HID4_GEKKO        0x3F3           /* Gekko HID4 */
 #define SPRN_HID5      0x3F6           /* 970 HID5 */
 #define SPRN_HID6      0x3F9   /* BE HID 6 */
        mfspr   rX,SPRN_SPRG_PACA;                      \
        FTR_SECTION_ELSE_NESTED(66);                    \
        mfspr   rX,SPRN_SPRG_HPACA;                     \
-       ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
+       ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
 
 #define SET_PACA(rX)                                   \
        BEGIN_FTR_SECTION_NESTED(66);                   \
        mtspr   SPRN_SPRG_PACA,rX;                      \
        FTR_SECTION_ELSE_NESTED(66);                    \
        mtspr   SPRN_SPRG_HPACA,rX;                     \
-       ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
+       ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
 
 #define GET_SCRATCH0(rX)                               \
        BEGIN_FTR_SECTION_NESTED(66);                   \
        mfspr   rX,SPRN_SPRG_SCRATCH0;                  \
        FTR_SECTION_ELSE_NESTED(66);                    \
        mfspr   rX,SPRN_SPRG_HSCRATCH0;                 \
-       ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
+       ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
 
 #define SET_SCRATCH0(rX)                               \
        BEGIN_FTR_SECTION_NESTED(66);                   \
        mtspr   SPRN_SPRG_SCRATCH0,rX;                  \
        FTR_SECTION_ELSE_NESTED(66);                    \
        mtspr   SPRN_SPRG_HSCRATCH0,rX;                 \
-       ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
+       ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
 
 #else /* CONFIG_PPC_BOOK3S_64 */
 #define GET_SCRATCH0(rX)       mfspr   rX,SPRN_SPRG_SCRATCH0
index 0f0ad9f..9ec0b39 100644 (file)
 #define ESR_ILK                0x00100000      /* Instr. Cache Locking */
 #define ESR_PUO                0x00040000      /* Unimplemented Operation exception */
 #define ESR_BO         0x00020000      /* Byte Ordering */
+#define ESR_SPV                0x00000080      /* Signal Processing operation */
 
 /* Bit definitions related to the DBCR0. */
 #if defined(CONFIG_40x)
index 36e1c8a..54b935f 100644 (file)
@@ -128,6 +128,7 @@ int main(void)
        DEFINE(ICACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, ilines_per_page));
        /* paca */
        DEFINE(PACA_SIZE, sizeof(struct paca_struct));
+       DEFINE(PACA_LOCK_TOKEN, offsetof(struct paca_struct, lock_token));
        DEFINE(PACAPACAINDEX, offsetof(struct paca_struct, paca_index));
        DEFINE(PACAPROCSTART, offsetof(struct paca_struct, cpu_start));
        DEFINE(PACAKSAVE, offsetof(struct paca_struct, kstack));
@@ -187,7 +188,9 @@ int main(void)
        DEFINE(LPPACASRR1, offsetof(struct lppaca, saved_srr1));
        DEFINE(LPPACAANYINT, offsetof(struct lppaca, int_dword.any_int));
        DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int));
+       DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, pmcregs_in_use));
        DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx));
+       DEFINE(LPPACA_YIELDCOUNT, offsetof(struct lppaca, yield_count));
        DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx));
 #endif /* CONFIG_PPC_STD_MMU_64 */
        DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp));
@@ -198,11 +201,6 @@ int main(void)
        DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, user_time));
        DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time));
        DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save));
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-       DEFINE(PACA_KVM_SVCPU, offsetof(struct paca_struct, shadow_vcpu));
-       DEFINE(SVCPU_SLB, offsetof(struct kvmppc_book3s_shadow_vcpu, slb));
-       DEFINE(SVCPU_SLB_MAX, offsetof(struct kvmppc_book3s_shadow_vcpu, slb_max));
-#endif
 #endif /* CONFIG_PPC64 */
 
        /* RTAS */
@@ -397,67 +395,160 @@ int main(void)
        DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
        DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
        DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave));
+       DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr));
+       DEFINE(VCPU_FPSCR, offsetof(struct kvm_vcpu, arch.fpscr));
+#ifdef CONFIG_ALTIVEC
+       DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr));
+       DEFINE(VCPU_VSCR, offsetof(struct kvm_vcpu, arch.vscr));
+#endif
+#ifdef CONFIG_VSX
+       DEFINE(VCPU_VSRS, offsetof(struct kvm_vcpu, arch.vsr));
+#endif
+       DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
+       DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
+       DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
+       DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
+       DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+       DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.shregs.msr));
+       DEFINE(VCPU_SRR0, offsetof(struct kvm_vcpu, arch.shregs.srr0));
+       DEFINE(VCPU_SRR1, offsetof(struct kvm_vcpu, arch.shregs.srr1));
+       DEFINE(VCPU_SPRG0, offsetof(struct kvm_vcpu, arch.shregs.sprg0));
+       DEFINE(VCPU_SPRG1, offsetof(struct kvm_vcpu, arch.shregs.sprg1));
+       DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
+       DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
+#endif
        DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
        DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
        DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
        DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
        DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
+       DEFINE(VCPU_SHADOW_PID1, offsetof(struct kvm_vcpu, arch.shadow_pid1));
        DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared));
        DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
+       DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
 
        /* book3s */
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+       DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
+       DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1));
+       DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid));
+       DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
+       DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
+       DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
+       DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
+       DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
+       DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
+       DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
+       DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
+       DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
+#endif
 #ifdef CONFIG_PPC_BOOK3S
+       DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
+       DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
        DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip));
        DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
-       DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
+       DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
+       DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
+       DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
+       DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr));
+       DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
+       DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
+       DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
        DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
        DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
        DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
        DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
        DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
+       DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
+       DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
+       DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions));
+       DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa));
+       DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
+       DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
+       DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
+       DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
+       DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
+       DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
+       DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
+       DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
+       DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
+       DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
+       DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
+       DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
+       DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
+       DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
        DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
                           offsetof(struct kvmppc_vcpu_book3s, vcpu));
-       DEFINE(SVCPU_CR, offsetof(struct kvmppc_book3s_shadow_vcpu, cr));
-       DEFINE(SVCPU_XER, offsetof(struct kvmppc_book3s_shadow_vcpu, xer));
-       DEFINE(SVCPU_CTR, offsetof(struct kvmppc_book3s_shadow_vcpu, ctr));
-       DEFINE(SVCPU_LR, offsetof(struct kvmppc_book3s_shadow_vcpu, lr));
-       DEFINE(SVCPU_PC, offsetof(struct kvmppc_book3s_shadow_vcpu, pc));
-       DEFINE(SVCPU_R0, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[0]));
-       DEFINE(SVCPU_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[1]));
-       DEFINE(SVCPU_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[2]));
-       DEFINE(SVCPU_R3, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[3]));
-       DEFINE(SVCPU_R4, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[4]));
-       DEFINE(SVCPU_R5, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[5]));
-       DEFINE(SVCPU_R6, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[6]));
-       DEFINE(SVCPU_R7, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[7]));
-       DEFINE(SVCPU_R8, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[8]));
-       DEFINE(SVCPU_R9, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[9]));
-       DEFINE(SVCPU_R10, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[10]));
-       DEFINE(SVCPU_R11, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[11]));
-       DEFINE(SVCPU_R12, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[12]));
-       DEFINE(SVCPU_R13, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[13]));
-       DEFINE(SVCPU_HOST_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r1));
-       DEFINE(SVCPU_HOST_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r2));
-       DEFINE(SVCPU_VMHANDLER, offsetof(struct kvmppc_book3s_shadow_vcpu,
-                                        vmhandler));
-       DEFINE(SVCPU_SCRATCH0, offsetof(struct kvmppc_book3s_shadow_vcpu,
-                                       scratch0));
-       DEFINE(SVCPU_SCRATCH1, offsetof(struct kvmppc_book3s_shadow_vcpu,
-                                       scratch1));
-       DEFINE(SVCPU_IN_GUEST, offsetof(struct kvmppc_book3s_shadow_vcpu,
-                                       in_guest));
-       DEFINE(SVCPU_FAULT_DSISR, offsetof(struct kvmppc_book3s_shadow_vcpu,
-                                          fault_dsisr));
-       DEFINE(SVCPU_FAULT_DAR, offsetof(struct kvmppc_book3s_shadow_vcpu,
-                                        fault_dar));
-       DEFINE(SVCPU_LAST_INST, offsetof(struct kvmppc_book3s_shadow_vcpu,
-                                        last_inst));
-       DEFINE(SVCPU_SHADOW_SRR1, offsetof(struct kvmppc_book3s_shadow_vcpu,
-                                          shadow_srr1));
+       DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
+       DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
+       DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
+
+#ifdef CONFIG_PPC_BOOK3S_64
+#ifdef CONFIG_KVM_BOOK3S_PR
+# define SVCPU_FIELD(x, f)     DEFINE(x, offsetof(struct paca_struct, shadow_vcpu.f))
+#else
+# define SVCPU_FIELD(x, f)
+#endif
+# define HSTATE_FIELD(x, f)    DEFINE(x, offsetof(struct paca_struct, kvm_hstate.f))
+#else  /* 32-bit */
+# define SVCPU_FIELD(x, f)     DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, f))
+# define HSTATE_FIELD(x, f)    DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, hstate.f))
+#endif
+
+       SVCPU_FIELD(SVCPU_CR, cr);
+       SVCPU_FIELD(SVCPU_XER, xer);
+       SVCPU_FIELD(SVCPU_CTR, ctr);
+       SVCPU_FIELD(SVCPU_LR, lr);
+       SVCPU_FIELD(SVCPU_PC, pc);
+       SVCPU_FIELD(SVCPU_R0, gpr[0]);
+       SVCPU_FIELD(SVCPU_R1, gpr[1]);
+       SVCPU_FIELD(SVCPU_R2, gpr[2]);
+       SVCPU_FIELD(SVCPU_R3, gpr[3]);
+       SVCPU_FIELD(SVCPU_R4, gpr[4]);
+       SVCPU_FIELD(SVCPU_R5, gpr[5]);
+       SVCPU_FIELD(SVCPU_R6, gpr[6]);
+       SVCPU_FIELD(SVCPU_R7, gpr[7]);
+       SVCPU_FIELD(SVCPU_R8, gpr[8]);
+       SVCPU_FIELD(SVCPU_R9, gpr[9]);
+       SVCPU_FIELD(SVCPU_R10, gpr[10]);
+       SVCPU_FIELD(SVCPU_R11, gpr[11]);
+       SVCPU_FIELD(SVCPU_R12, gpr[12]);
+       SVCPU_FIELD(SVCPU_R13, gpr[13]);
+       SVCPU_FIELD(SVCPU_FAULT_DSISR, fault_dsisr);
+       SVCPU_FIELD(SVCPU_FAULT_DAR, fault_dar);
+       SVCPU_FIELD(SVCPU_LAST_INST, last_inst);
+       SVCPU_FIELD(SVCPU_SHADOW_SRR1, shadow_srr1);
 #ifdef CONFIG_PPC_BOOK3S_32
-       DEFINE(SVCPU_SR, offsetof(struct kvmppc_book3s_shadow_vcpu, sr));
+       SVCPU_FIELD(SVCPU_SR, sr);
 #endif
-#else
+#ifdef CONFIG_PPC64
+       SVCPU_FIELD(SVCPU_SLB, slb);
+       SVCPU_FIELD(SVCPU_SLB_MAX, slb_max);
+#endif
+
+       HSTATE_FIELD(HSTATE_HOST_R1, host_r1);
+       HSTATE_FIELD(HSTATE_HOST_R2, host_r2);
+       HSTATE_FIELD(HSTATE_HOST_MSR, host_msr);
+       HSTATE_FIELD(HSTATE_VMHANDLER, vmhandler);
+       HSTATE_FIELD(HSTATE_SCRATCH0, scratch0);
+       HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
+       HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+       HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
+       HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore);
+       HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
+       HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
+       HSTATE_FIELD(HSTATE_PMC, host_pmc);
+       HSTATE_FIELD(HSTATE_PURR, host_purr);
+       HSTATE_FIELD(HSTATE_SPURR, host_spurr);
+       HSTATE_FIELD(HSTATE_DSCR, host_dscr);
+       HSTATE_FIELD(HSTATE_DABR, dabr);
+       HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
+#else /* CONFIG_PPC_BOOK3S */
        DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
        DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
        DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
@@ -467,7 +558,7 @@ int main(void)
        DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
        DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
 #endif /* CONFIG_PPC_BOOK3S */
-#endif
+#endif /* CONFIG_KVM */
 
 #ifdef CONFIG_KVM_GUEST
        DEFINE(KVM_MAGIC_SCRATCH1, offsetof(struct kvm_vcpu_arch_shared,
@@ -497,6 +588,13 @@ int main(void)
        DEFINE(TLBCAM_MAS7, offsetof(struct tlbcam, MAS7));
 #endif
 
+#if defined(CONFIG_KVM) && defined(CONFIG_SPE)
+       DEFINE(VCPU_EVR, offsetof(struct kvm_vcpu, arch.evr[0]));
+       DEFINE(VCPU_ACC, offsetof(struct kvm_vcpu, arch.acc));
+       DEFINE(VCPU_SPEFSCR, offsetof(struct kvm_vcpu, arch.spefscr));
+       DEFINE(VCPU_HOST_SPEFSCR, offsetof(struct kvm_vcpu, arch.host_spefscr));
+#endif
+
 #ifdef CONFIG_KVM_EXIT_TIMING
        DEFINE(VCPU_TIMING_EXIT_TBU, offsetof(struct kvm_vcpu,
                                                arch.timing_exit.tv32.tbu));
index 4f9a93f..76797c5 100644 (file)
@@ -45,12 +45,12 @@ _GLOBAL(__restore_cpu_power7)
        blr
 
 __init_hvmode_206:
-       /* Disable CPU_FTR_HVMODE_206 and exit if MSR:HV is not set */
+       /* Disable CPU_FTR_HVMODE and exit if MSR:HV is not set */
        mfmsr   r3
        rldicl. r0,r3,4,63
        bnelr
        ld      r5,CPU_SPEC_FEATURES(r4)
-       LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE_206)
+       LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
        xor     r5,r5,r6
        std     r5,CPU_SPEC_FEATURES(r4)
        blr
@@ -61,19 +61,23 @@ __init_LPCR:
         *   LPES = 0b01 (HSRR0/1 used for 0x500)
         *   PECE = 0b111
         *   DPFD = 4
+        *   HDICE = 0
+        *   VC = 0b100 (VPM0=1, VPM1=0, ISL=0)
+        *   VRMASD = 0b10000 (L=1, LP=00)
         *
         * Other bits untouched for now
         */
        mfspr   r3,SPRN_LPCR
-       ori     r3,r3,(LPCR_LPES0|LPCR_LPES1)
-       xori    r3,r3, LPCR_LPES0
+       li      r5,1
+       rldimi  r3,r5, LPCR_LPES_SH, 64-LPCR_LPES_SH-2
        ori     r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2)
-       li      r5,7
-       sldi    r5,r5,LPCR_DPFD_SH
-       andc    r3,r3,r5
        li      r5,4
-       sldi    r5,r5,LPCR_DPFD_SH
-       or      r3,r3,r5
+       rldimi  r3,r5, LPCR_DPFD_SH, 64-LPCR_DPFD_SH-3
+       clrrdi  r3,r3,1         /* clear HDICE */
+       li      r5,4
+       rldimi  r3,r5, LPCR_VC_SH, 0
+       li      r5,0x10
+       rldimi  r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5
        mtspr   SPRN_LPCR,r3
        isync
        blr
index 27f2507..12fac8d 100644 (file)
@@ -76,7 +76,7 @@ _GLOBAL(__setup_cpu_ppc970)
        /* Do nothing if not running in HV mode */
        mfmsr   r0
        rldicl. r0,r0,4,63
-       beqlr
+       beq     no_hv_mode
 
        mfspr   r0,SPRN_HID0
        li      r11,5                   /* clear DOZE and SLEEP */
@@ -90,7 +90,7 @@ _GLOBAL(__setup_cpu_ppc970MP)
        /* Do nothing if not running in HV mode */
        mfmsr   r0
        rldicl. r0,r0,4,63
-       beqlr
+       beq     no_hv_mode
 
        mfspr   r0,SPRN_HID0
        li      r11,0x15                /* clear DOZE and SLEEP */
@@ -109,6 +109,14 @@ load_hids:
        sync
        isync
 
+       /* Try to set LPES = 01 in HID4 */
+       mfspr   r0,SPRN_HID4
+       clrldi  r0,r0,1                 /* clear LPES0 */
+       ori     r0,r0,HID4_LPES1        /* set LPES1 */
+       sync
+       mtspr   SPRN_HID4,r0
+       isync
+
        /* Save away cpu state */
        LOAD_REG_ADDR(r5,cpu_state_storage)
 
@@ -117,11 +125,21 @@ load_hids:
        std     r3,CS_HID0(r5)
        mfspr   r3,SPRN_HID1
        std     r3,CS_HID1(r5)
-       mfspr   r3,SPRN_HID4
-       std     r3,CS_HID4(r5)
+       mfspr   r0,SPRN_HID4
+       std     r0,CS_HID4(r5)
        mfspr   r3,SPRN_HID5
        std     r3,CS_HID5(r5)
 
+       /* LPES1 still 0 => Apple mode; test in r0, no_hv_mode needs r4 intact */
+       andi.   r0,r0,HID4_LPES1
+       bnelr
+
+no_hv_mode:
+       /* Disable CPU_FTR_HVMODE and exit, since we don't have HV mode */
+       ld      r5,CPU_SPEC_FEATURES(r4)
+       LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
+       andc    r5,r5,r6
+       std     r5,CPU_SPEC_FEATURES(r4)
        blr
 
 /* Called with no MMU context (typically MSR:IR/DR off) to
index a85f487..41b02c7 100644 (file)
@@ -40,7 +40,6 @@ __start_interrupts:
        .globl system_reset_pSeries;
 system_reset_pSeries:
        HMT_MEDIUM;
-       DO_KVM  0x100;
        SET_SCRATCH0(r13)
 #ifdef CONFIG_PPC_P7_NAP
 BEGIN_FTR_SECTION
@@ -50,82 +49,73 @@ BEGIN_FTR_SECTION
         * state loss at this time.
         */
        mfspr   r13,SPRN_SRR1
-       rlwinm  r13,r13,47-31,30,31
-       cmpwi   cr0,r13,1
-       bne     1f
-       b       .power7_wakeup_noloss
-1:     cmpwi   cr0,r13,2
-       bne     1f
-       b       .power7_wakeup_loss
+       rlwinm. r13,r13,47-31,30,31
+       beq     9f
+
+       /* waking up from powersave (nap) state */
+       cmpwi   cr1,r13,2
        /* Total loss of HV state is fatal, we could try to use the
         * PIR to locate a PACA, then use an emergency stack etc...
         * but for now, let's just stay stuck here
         */
-1:     cmpwi   cr0,r13,3
-       beq     .
-END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
+       bgt     cr1,.
+       GET_PACA(r13)
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+       lbz     r0,PACAPROCSTART(r13)
+       cmpwi   r0,0x80
+       bne     1f
+       li      r0,0
+       stb     r0,PACAPROCSTART(r13)
+       b       kvm_start_guest
+1:
+#endif
+
+       beq     cr1,2f
+       b       .power7_wakeup_noloss
+2:     b       .power7_wakeup_loss
+9:
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #endif /* CONFIG_PPC_P7_NAP */
-       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD)
+       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
+                                NOTEST, 0x100)
 
        . = 0x200
-_machine_check_pSeries:
-       HMT_MEDIUM
-       DO_KVM  0x200
-       SET_SCRATCH0(r13)
-       EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common, EXC_STD)
+machine_check_pSeries_1:
+       /* This is moved out of line as it can be patched by FW, but
+        * some code path might still want to branch into the original
+        * vector
+        */
+       b       machine_check_pSeries
 
        . = 0x300
        .globl data_access_pSeries
 data_access_pSeries:
        HMT_MEDIUM
-       DO_KVM  0x300
        SET_SCRATCH0(r13)
+#ifndef CONFIG_POWER4_ONLY
 BEGIN_FTR_SECTION
-       GET_PACA(r13)
-       std     r9,PACA_EXSLB+EX_R9(r13)
-       std     r10,PACA_EXSLB+EX_R10(r13)
-       mfspr   r10,SPRN_DAR
-       mfspr   r9,SPRN_DSISR
-       srdi    r10,r10,60
-       rlwimi  r10,r9,16,0x20
-       mfcr    r9
-       cmpwi   r10,0x2c
-       beq     do_stab_bolted_pSeries
-       ld      r10,PACA_EXSLB+EX_R10(r13)
-       std     r11,PACA_EXGEN+EX_R11(r13)
-       ld      r11,PACA_EXSLB+EX_R9(r13)
-       std     r12,PACA_EXGEN+EX_R12(r13)
-       GET_SCRATCH0(r12)
-       std     r10,PACA_EXGEN+EX_R10(r13)
-       std     r11,PACA_EXGEN+EX_R9(r13)
-       std     r12,PACA_EXGEN+EX_R13(r13)
-       EXCEPTION_PROLOG_PSERIES_1(data_access_common, EXC_STD)
-FTR_SECTION_ELSE
-       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD)
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_SLB)
+       b       data_access_check_stab
+data_access_not_stab:
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
+#endif
+       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
+                                KVMTEST_PR, 0x300)
 
        . = 0x380
        .globl data_access_slb_pSeries
 data_access_slb_pSeries:
        HMT_MEDIUM
-       DO_KVM  0x380
        SET_SCRATCH0(r13)
-       GET_PACA(r13)
+       EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380)
        std     r3,PACA_EXSLB+EX_R3(r13)
        mfspr   r3,SPRN_DAR
-       std     r9,PACA_EXSLB+EX_R9(r13)        /* save r9 - r12 */
-       mfcr    r9
 #ifdef __DISABLED__
        /* Keep that around for when we re-implement dynamic VSIDs */
        cmpdi   r3,0
        bge     slb_miss_user_pseries
 #endif /* __DISABLED__ */
-       std     r10,PACA_EXSLB+EX_R10(r13)
-       std     r11,PACA_EXSLB+EX_R11(r13)
-       std     r12,PACA_EXSLB+EX_R12(r13)
-       GET_SCRATCH0(r10)
-       std     r10,PACA_EXSLB+EX_R13(r13)
-       mfspr   r12,SPRN_SRR1           /* and SRR1 */
+       mfspr   r12,SPRN_SRR1
 #ifndef CONFIG_RELOCATABLE
        b       .slb_miss_realmode
 #else
@@ -147,24 +137,16 @@ data_access_slb_pSeries:
        .globl instruction_access_slb_pSeries
 instruction_access_slb_pSeries:
        HMT_MEDIUM
-       DO_KVM  0x480
        SET_SCRATCH0(r13)
-       GET_PACA(r13)
+       EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480)
        std     r3,PACA_EXSLB+EX_R3(r13)
        mfspr   r3,SPRN_SRR0            /* SRR0 is faulting address */
-       std     r9,PACA_EXSLB+EX_R9(r13)        /* save r9 - r12 */
-       mfcr    r9
 #ifdef __DISABLED__
        /* Keep that around for when we re-implement dynamic VSIDs */
        cmpdi   r3,0
        bge     slb_miss_user_pseries
 #endif /* __DISABLED__ */
-       std     r10,PACA_EXSLB+EX_R10(r13)
-       std     r11,PACA_EXSLB+EX_R11(r13)
-       std     r12,PACA_EXSLB+EX_R12(r13)
-       GET_SCRATCH0(r10)
-       std     r10,PACA_EXSLB+EX_R13(r13)
-       mfspr   r12,SPRN_SRR1           /* and SRR1 */
+       mfspr   r12,SPRN_SRR1
 #ifndef CONFIG_RELOCATABLE
        b       .slb_miss_realmode
 #else
@@ -184,26 +166,46 @@ instruction_access_slb_pSeries:
 hardware_interrupt_pSeries:
 hardware_interrupt_hv:
        BEGIN_FTR_SECTION
-               _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt, EXC_STD)
+               _MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt,
+                                           EXC_HV, SOFTEN_TEST_HV)
+               KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
        FTR_SECTION_ELSE
-               _MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt, EXC_HV)
-       ALT_FTR_SECTION_END_IFCLR(CPU_FTR_HVMODE_206)
+               _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
+                                           EXC_STD, SOFTEN_TEST_HV_201)
+               KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
+       ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 
        STD_EXCEPTION_PSERIES(0x600, 0x600, alignment)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x600)
+
        STD_EXCEPTION_PSERIES(0x700, 0x700, program_check)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x700)
+
        STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x800)
 
        MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer)
-       MASKABLE_EXCEPTION_HV(0x980, 0x980, decrementer)
+       MASKABLE_EXCEPTION_HV(0x980, 0x982, decrementer)
 
        STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xa00)
+
        STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xb00)
 
        . = 0xc00
        .globl  system_call_pSeries
 system_call_pSeries:
        HMT_MEDIUM
-       DO_KVM  0xc00
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+       SET_SCRATCH0(r13)
+       GET_PACA(r13)
+       std     r9,PACA_EXGEN+EX_R9(r13)
+       std     r10,PACA_EXGEN+EX_R10(r13)
+       mfcr    r9
+       KVMTEST(0xc00)
+       GET_SCRATCH0(r13)
+#endif
 BEGIN_FTR_SECTION
        cmpdi   r0,0x1ebe
        beq-    1f
@@ -220,6 +222,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
        rfid
        b       .       /* prevent speculative execution */
 
+       KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00)
+
 /* Fast LE/BE switch system call */
 1:     mfspr   r12,SPRN_SRR1
        xori    r12,r12,MSR_LE
@@ -228,6 +232,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
        b       .
 
        STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xd00)
 
        /* At 0xe??? we have a bunch of hypervisor exceptions, we branch
         * out of line to handle them
@@ -262,30 +267,93 @@ vsx_unavailable_pSeries_1:
 
 #ifdef CONFIG_CBE_RAS
        STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
+       KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
 #endif /* CONFIG_CBE_RAS */
+
        STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint)
+       KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
+
 #ifdef CONFIG_CBE_RAS
        STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance)
+       KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
 #endif /* CONFIG_CBE_RAS */
+
        STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x1700)
+
 #ifdef CONFIG_CBE_RAS
        STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal)
+       KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
 #endif /* CONFIG_CBE_RAS */
 
        . = 0x3000
 
 /*** Out of line interrupts support ***/
 
+       /* moved from 0x200 */
+machine_check_pSeries:
+       .globl machine_check_fwnmi
+machine_check_fwnmi:
+       HMT_MEDIUM
+       SET_SCRATCH0(r13)               /* save r13 */
+       EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common,
+                                EXC_STD, KVMTEST, 0x200)
+       KVM_HANDLER_SKIP(PACA_EXMC, EXC_STD, 0x200)
+
+#ifndef CONFIG_POWER4_ONLY
+       /* moved from 0x300 */
+data_access_check_stab:
+       GET_PACA(r13)
+       std     r9,PACA_EXSLB+EX_R9(r13)
+       std     r10,PACA_EXSLB+EX_R10(r13)
+       mfspr   r10,SPRN_DAR
+       mfspr   r9,SPRN_DSISR
+       srdi    r10,r10,60
+       rlwimi  r10,r9,16,0x20
+#ifdef CONFIG_KVM_BOOK3S_PR
+       lbz     r9,HSTATE_IN_GUEST(r13)
+       rlwimi  r10,r9,8,0x300
+#endif
+       mfcr    r9
+       cmpwi   r10,0x2c
+       beq     do_stab_bolted_pSeries
+       mtcrf   0x80,r9
+       ld      r9,PACA_EXSLB+EX_R9(r13)
+       ld      r10,PACA_EXSLB+EX_R10(r13)
+       b       data_access_not_stab
+do_stab_bolted_pSeries:
+       std     r11,PACA_EXSLB+EX_R11(r13)
+       std     r12,PACA_EXSLB+EX_R12(r13)
+       GET_SCRATCH0(r10)
+       std     r10,PACA_EXSLB+EX_R13(r13)
+       EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
+#endif /* CONFIG_POWER4_ONLY */
+
+       KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x300)
+       KVM_HANDLER_PR_SKIP(PACA_EXSLB, EXC_STD, 0x380)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400)
+       KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900)
+       KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982)
+
+       .align  7
        /* moved from 0xe00 */
-       STD_EXCEPTION_HV(., 0xe00, h_data_storage)
-       STD_EXCEPTION_HV(., 0xe20, h_instr_storage)
-       STD_EXCEPTION_HV(., 0xe40, emulation_assist)
-       STD_EXCEPTION_HV(., 0xe60, hmi_exception) /* need to flush cache ? */
+       STD_EXCEPTION_HV(., 0xe02, h_data_storage)
+       KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0xe02)
+       STD_EXCEPTION_HV(., 0xe22, h_instr_storage)
+       KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe22)
+       STD_EXCEPTION_HV(., 0xe42, emulation_assist)
+       KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe42)
+       STD_EXCEPTION_HV(., 0xe62, hmi_exception) /* need to flush cache ? */
+       KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe62)
 
        /* moved from 0xf00 */
        STD_EXCEPTION_PSERIES(., 0xf00, performance_monitor)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf00)
        STD_EXCEPTION_PSERIES(., 0xf20, altivec_unavailable)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20)
        STD_EXCEPTION_PSERIES(., 0xf40, vsx_unavailable)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40)
 
 /*
  * An interrupt came in while soft-disabled; clear EE in SRR1,
@@ -317,14 +385,6 @@ masked_Hinterrupt:
        hrfid
        b       .
 
-       .align  7
-do_stab_bolted_pSeries:
-       std     r11,PACA_EXSLB+EX_R11(r13)
-       std     r12,PACA_EXSLB+EX_R12(r13)
-       GET_SCRATCH0(r10)
-       std     r10,PACA_EXSLB+EX_R13(r13)
-       EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
-
 #ifdef CONFIG_PPC_PSERIES
 /*
  * Vectors for the FWNMI option.  Share common code.
@@ -334,14 +394,8 @@ do_stab_bolted_pSeries:
 system_reset_fwnmi:
        HMT_MEDIUM
        SET_SCRATCH0(r13)               /* save r13 */
-       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD)
-
-       .globl machine_check_fwnmi
-      .align 7
-machine_check_fwnmi:
-       HMT_MEDIUM
-       SET_SCRATCH0(r13)               /* save r13 */
-       EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common, EXC_STD)
+       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
+                                NOTEST, 0x100)
 
 #endif /* CONFIG_PPC_PSERIES */
 
@@ -376,7 +430,11 @@ slb_miss_user_pseries:
 /* KVM's trampoline code needs to be close to the interrupt handlers */
 
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#ifdef CONFIG_KVM_BOOK3S_PR
 #include "../kvm/book3s_rmhandlers.S"
+#else
+#include "../kvm/book3s_hv_rmhandlers.S"
+#endif
 #endif
 
        .align  7
index 5ecf54c..fe37dd0 100644 (file)
@@ -656,7 +656,7 @@ load_up_spe:
        cmpi    0,r4,0
        beq     1f
        addi    r4,r4,THREAD    /* want THREAD of last_task_used_spe */
-       SAVE_32EVRS(0,r10,r4)
+       SAVE_32EVRS(0,r10,r4,THREAD_EVR0)
        evxor   evr10, evr10, evr10     /* clear out evr10 */
        evmwumiaa evr10, evr10, evr10   /* evr10 <- ACC = 0 * 0 + ACC */
        li      r5,THREAD_ACC
@@ -676,7 +676,7 @@ load_up_spe:
        stw     r4,THREAD_USED_SPE(r5)
        evlddx  evr4,r10,r5
        evmra   evr4,evr4
-       REST_32EVRS(0,r10,r5)
+       REST_32EVRS(0,r10,r5,THREAD_EVR0)
 #ifndef CONFIG_SMP
        subi    r4,r5,THREAD
        stw     r4,last_task_used_spe@l(r3)
@@ -787,13 +787,11 @@ _GLOBAL(giveup_spe)
        addi    r3,r3,THREAD            /* want THREAD of task */
        lwz     r5,PT_REGS(r3)
        cmpi    0,r5,0
-       SAVE_32EVRS(0, r4, r3)
+       SAVE_32EVRS(0, r4, r3, THREAD_EVR0)
        evxor   evr6, evr6, evr6        /* clear out evr6 */
        evmwumiaa evr6, evr6, evr6      /* evr6 <- ACC = 0 * 0 + ACC */
        li      r4,THREAD_ACC
        evstddx evr6, r4, r3            /* save off accumulator */
-       mfspr   r6,SPRN_SPEFSCR
-       stw     r6,THREAD_SPEFSCR(r3)   /* save spefscr register value */
        beq     1f
        lwz     r4,_MSR-STACK_FRAME_OVERHEAD(r5)
        lis     r3,MSR_SPE@h
index f8f0bc7..3a70845 100644 (file)
@@ -73,7 +73,6 @@ _GLOBAL(power7_idle)
        b       .
 
 _GLOBAL(power7_wakeup_loss)
-       GET_PACA(r13)
        ld      r1,PACAR1(r13)
        REST_NVGPRS(r1)
        REST_GPR(2, r1)
@@ -87,7 +86,6 @@ _GLOBAL(power7_wakeup_loss)
        rfid
 
 _GLOBAL(power7_wakeup_noloss)
-       GET_PACA(r13)
        ld      r1,PACAR1(r13)
        ld      r4,_MSR(r1)
        ld      r5,_NIP(r1)
index efeb881..0a5a899 100644 (file)
@@ -167,7 +167,7 @@ void setup_paca(struct paca_struct *new_paca)
         * if we do a GET_PACA() before the feature fixups have been
         * applied
         */
-       if (cpu_has_feature(CPU_FTR_HVMODE_206))
+       if (cpu_has_feature(CPU_FTR_HVMODE))
                mtspr(SPRN_SPRG_HPACA, local_paca);
 #endif
        mtspr(SPRN_SPRG_PACA, local_paca);
index 91e52df..ec2d0ed 100644 (file)
@@ -96,6 +96,7 @@ void flush_fp_to_thread(struct task_struct *tsk)
                preempt_enable();
        }
 }
+EXPORT_SYMBOL_GPL(flush_fp_to_thread);
 
 void enable_kernel_fp(void)
 {
@@ -145,6 +146,7 @@ void flush_altivec_to_thread(struct task_struct *tsk)
                preempt_enable();
        }
 }
+EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
 #endif /* CONFIG_ALTIVEC */
 
 #ifdef CONFIG_VSX
@@ -186,6 +188,7 @@ void flush_vsx_to_thread(struct task_struct *tsk)
                preempt_enable();
        }
 }
+EXPORT_SYMBOL_GPL(flush_vsx_to_thread);
 #endif /* CONFIG_VSX */
 
 #ifdef CONFIG_SPE
@@ -213,6 +216,7 @@ void flush_spe_to_thread(struct task_struct *tsk)
 #ifdef CONFIG_SMP
                        BUG_ON(tsk != current);
 #endif
+                       tsk->thread.spefscr = mfspr(SPRN_SPEFSCR);
                        giveup_spe(tsk);
                }
                preempt_enable();
index 79fca26..22051ef 100644 (file)
@@ -375,6 +375,9 @@ void __init check_for_initrd(void)
 
 int threads_per_core, threads_shift;
 cpumask_t threads_core_mask;
+EXPORT_SYMBOL_GPL(threads_per_core);
+EXPORT_SYMBOL_GPL(threads_shift);
+EXPORT_SYMBOL_GPL(threads_core_mask);
 
 static void __init cpu_init_thread_core_maps(int tpc)
 {
index a88bf27..532054f 100644 (file)
@@ -63,6 +63,7 @@
 #include <asm/kexec.h>
 #include <asm/mmu_context.h>
 #include <asm/code-patching.h>
+#include <asm/kvm_ppc.h>
 
 #include "setup.h"
 
@@ -580,6 +581,8 @@ void __init setup_arch(char **cmdline_p)
        /* Initialize the MMU context management stuff */
        mmu_context_init();
 
+       kvm_rma_init();
+
        ppc64_boot_msg(0x15, "Setup Done");
 }
 
index 8ebc670..09a85a9 100644 (file)
@@ -243,6 +243,7 @@ void smp_send_reschedule(int cpu)
        if (likely(smp_ops))
                smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE);
 }
+EXPORT_SYMBOL_GPL(smp_send_reschedule);
 
 void arch_send_call_function_single_ipi(int cpu)
 {
index 1a01414..f19d977 100644 (file)
@@ -1387,10 +1387,7 @@ void SPEFloatingPointException(struct pt_regs *regs)
        int code = 0;
        int err;
 
-       preempt_disable();
-       if (regs->msr & MSR_SPE)
-               giveup_spe(current);
-       preempt_enable();
+       flush_spe_to_thread(current);
 
        spefscr = current->thread.spefscr;
        fpexc_mode = current->thread.fpexc_mode;
index 5f3cff8..33aa715 100644 (file)
@@ -387,8 +387,10 @@ static void kvmppc_44x_invalidate(struct kvm_vcpu *vcpu,
        }
 }
 
-void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
+void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)
 {
+       int usermode = vcpu->arch.shared->msr & MSR_PR;
+
        vcpu->arch.shadow_pid = !usermode;
 }
 
index 105b691..78133de 100644 (file)
@@ -20,7 +20,6 @@ config KVM
        bool
        select PREEMPT_NOTIFIERS
        select ANON_INODES
-       select KVM_MMIO
 
 config KVM_BOOK3S_HANDLER
        bool
@@ -28,16 +27,22 @@ config KVM_BOOK3S_HANDLER
 config KVM_BOOK3S_32_HANDLER
        bool
        select KVM_BOOK3S_HANDLER
+       select KVM_MMIO
 
 config KVM_BOOK3S_64_HANDLER
        bool
        select KVM_BOOK3S_HANDLER
 
+config KVM_BOOK3S_PR
+       bool
+       select KVM_MMIO
+
 config KVM_BOOK3S_32
        tristate "KVM support for PowerPC book3s_32 processors"
        depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT
        select KVM
        select KVM_BOOK3S_32_HANDLER
+       select KVM_BOOK3S_PR
        ---help---
          Support running unmodified book3s_32 guest kernels
          in virtual machines on book3s_32 host processors.
@@ -50,8 +55,8 @@ config KVM_BOOK3S_32
 config KVM_BOOK3S_64
        tristate "KVM support for PowerPC book3s_64 processors"
        depends on EXPERIMENTAL && PPC_BOOK3S_64
-       select KVM
        select KVM_BOOK3S_64_HANDLER
+       select KVM
        ---help---
          Support running unmodified book3s_64 and book3s_32 guest kernels
          in virtual machines on book3s_64 host processors.
@@ -61,10 +66,34 @@ config KVM_BOOK3S_64
 
          If unsure, say N.
 
+config KVM_BOOK3S_64_HV
+       bool "KVM support for POWER7 and PPC970 using hypervisor mode in host"
+       depends on KVM_BOOK3S_64
+       ---help---
+         Support running unmodified book3s_64 guest kernels in
+         virtual machines on POWER7 and PPC970 processors that have
+         hypervisor mode available to the host.
+
+         If you say Y here, KVM will use the hardware virtualization
+         facilities of POWER7 (and later) processors, meaning that
+         guest operating systems will run at full hardware speed
+         using supervisor and user modes.  However, this also means
+         that KVM is not usable under PowerVM (pHyp), is only usable
+         on POWER7 (or later) processors and PPC970-family processors,
+         and cannot emulate a different processor from the host processor.
+
+         If unsure, say N.
+
+config KVM_BOOK3S_64_PR
+       def_bool y
+       depends on KVM_BOOK3S_64 && !KVM_BOOK3S_64_HV
+       select KVM_BOOK3S_PR
+
 config KVM_440
        bool "KVM support for PowerPC 440 processors"
        depends on EXPERIMENTAL && 44x
        select KVM
+       select KVM_MMIO
        ---help---
          Support running unmodified 440 guest kernels in virtual machines on
          440 host processors.
@@ -89,6 +118,7 @@ config KVM_E500
        bool "KVM support for PowerPC E500 processors"
        depends on EXPERIMENTAL && E500
        select KVM
+       select KVM_MMIO
        ---help---
          Support running unmodified E500 guest kernels in virtual machines on
          E500 host processors.
index 4d68638..08428e2 100644 (file)
@@ -38,24 +38,42 @@ kvm-e500-objs := \
        e500_emulate.o
 kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs)
 
-kvm-book3s_64-objs := \
-       $(common-objs-y) \
+kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
+       ../../../virt/kvm/coalesced_mmio.o \
        fpu.o \
        book3s_paired_singles.o \
-       book3s.o \
+       book3s_pr.o \
        book3s_emulate.o \
        book3s_interrupts.o \
        book3s_mmu_hpte.o \
        book3s_64_mmu_host.o \
        book3s_64_mmu.o \
        book3s_32_mmu.o
-kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-objs)
+
+kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
+       book3s_hv.o \
+       book3s_hv_interrupts.o \
+       book3s_64_mmu_hv.o
+kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
+       book3s_hv_rm_mmu.o \
+       book3s_64_vio_hv.o \
+       book3s_hv_builtin.o
+
+kvm-book3s_64-module-objs := \
+       ../../../virt/kvm/kvm_main.o \
+       powerpc.o \
+       emulate.o \
+       book3s.o \
+       $(kvm-book3s_64-objs-y)
+
+kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
 
 kvm-book3s_32-objs := \
        $(common-objs-y) \
        fpu.o \
        book3s_paired_singles.o \
        book3s.o \
+       book3s_pr.o \
        book3s_emulate.o \
        book3s_interrupts.o \
        book3s_mmu_hpte.o \
@@ -70,3 +88,4 @@ obj-$(CONFIG_KVM_E500) += kvm.o
 obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o
 obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o
 
+obj-y += $(kvm-book3s_64-builtin-objs-y)
index 0f95b5c..f68a34d 100644 (file)
@@ -17,7 +17,6 @@
 #include <linux/kvm_host.h>
 #include <linux/err.h>
 #include <linux/slab.h>
-#include "trace.h"
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
 #include <asm/mmu_context.h>
+#include <asm/page.h>
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 
+#include "trace.h"
+
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
 /* #define EXIT_DEBUG */
-/* #define DEBUG_EXT */
-
-static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
-                            ulong msr);
-
-/* Some compatibility defines */
-#ifdef CONFIG_PPC_BOOK3S_32
-#define MSR_USER32 MSR_USER
-#define MSR_USER64 MSR_USER
-#define HW_PAGE_SIZE PAGE_SIZE
-#endif
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "exits",       VCPU_STAT(sum_exits) },
@@ -77,100 +68,11 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
 {
 }
 
-void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-#ifdef CONFIG_PPC_BOOK3S_64
-       memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb));
-       memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu,
-              sizeof(get_paca()->shadow_vcpu));
-       to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max;
-#endif
-
-#ifdef CONFIG_PPC_BOOK3S_32
-       current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu;
-#endif
-}
-
-void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_PPC_BOOK3S_64
-       memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb));
-       memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu,
-              sizeof(get_paca()->shadow_vcpu));
-       to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max;
-#endif
-
-       kvmppc_giveup_ext(vcpu, MSR_FP);
-       kvmppc_giveup_ext(vcpu, MSR_VEC);
-       kvmppc_giveup_ext(vcpu, MSR_VSX);
-}
-
-static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
-{
-       ulong smsr = vcpu->arch.shared->msr;
-
-       /* Guest MSR values */
-       smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE;
-       /* Process MSR values */
-       smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
-       /* External providers the guest reserved */
-       smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext);
-       /* 64-bit Process MSR values */
-#ifdef CONFIG_PPC_BOOK3S_64
-       smsr |= MSR_ISF | MSR_HV;
-#endif
-       vcpu->arch.shadow_msr = smsr;
-}
-
-void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
-{
-       ulong old_msr = vcpu->arch.shared->msr;
-
-#ifdef EXIT_DEBUG
-       printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
-#endif
-
-       msr &= to_book3s(vcpu)->msr_mask;
-       vcpu->arch.shared->msr = msr;
-       kvmppc_recalc_shadow_msr(vcpu);
-
-       if (msr & MSR_POW) {
-               if (!vcpu->arch.pending_exceptions) {
-                       kvm_vcpu_block(vcpu);
-                       vcpu->stat.halt_wakeup++;
-
-                       /* Unset POW bit after we woke up */
-                       msr &= ~MSR_POW;
-                       vcpu->arch.shared->msr = msr;
-               }
-       }
-
-       if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) !=
-                  (old_msr & (MSR_PR|MSR_IR|MSR_DR))) {
-               kvmppc_mmu_flush_segments(vcpu);
-               kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
-
-               /* Preload magic page segment when in kernel mode */
-               if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) {
-                       struct kvm_vcpu_arch *a = &vcpu->arch;
-
-                       if (msr & MSR_DR)
-                               kvmppc_mmu_map_segment(vcpu, a->magic_page_ea);
-                       else
-                               kvmppc_mmu_map_segment(vcpu, a->magic_page_pa);
-               }
-       }
-
-       /* Preload FPU if it's enabled */
-       if (vcpu->arch.shared->msr & MSR_FP)
-               kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
-}
-
 void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
 {
        vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu);
        vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags;
-       kvmppc_set_pc(vcpu, to_book3s(vcpu)->hior + vec);
+       kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec);
        vcpu->arch.mmu.reset_msr(vcpu);
 }
 
@@ -204,11 +106,13 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
 static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
                                          unsigned int vec)
 {
+       unsigned long old_pending = vcpu->arch.pending_exceptions;
+
        clear_bit(kvmppc_book3s_vec2irqprio(vec),
                  &vcpu->arch.pending_exceptions);
 
-       if (!vcpu->arch.pending_exceptions)
-               vcpu->arch.shared->int_pending = 0;
+       kvmppc_update_int_pending(vcpu, vcpu->arch.pending_exceptions,
+                                 old_pending);
 }
 
 void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
@@ -225,8 +129,8 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
 
 void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags)
 {
-       to_book3s(vcpu)->prog_flags = flags;
-       kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_PROGRAM);
+       /* might as well deliver this straight away */
+       kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_PROGRAM, flags);
 }
 
 void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
@@ -266,21 +170,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 {
        int deliver = 1;
        int vec = 0;
-       ulong flags = 0ULL;
-       ulong crit_raw = vcpu->arch.shared->critical;
-       ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
-       bool crit;
-
-       /* Truncate crit indicators in 32 bit mode */
-       if (!(vcpu->arch.shared->msr & MSR_SF)) {
-               crit_raw &= 0xffffffff;
-               crit_r1 &= 0xffffffff;
-       }
-
-       /* Critical section when crit == r1 */
-       crit = (crit_raw == crit_r1);
-       /* ... and we're in supervisor mode */
-       crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
+       bool crit = kvmppc_critical_section(vcpu);
 
        switch (priority) {
        case BOOK3S_IRQPRIO_DECREMENTER:
@@ -315,7 +205,6 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
                break;
        case BOOK3S_IRQPRIO_PROGRAM:
                vec = BOOK3S_INTERRUPT_PROGRAM;
-               flags = to_book3s(vcpu)->prog_flags;
                break;
        case BOOK3S_IRQPRIO_VSX:
                vec = BOOK3S_INTERRUPT_VSX;
@@ -346,7 +235,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 #endif
 
        if (deliver)
-               kvmppc_inject_interrupt(vcpu, vec, flags);
+               kvmppc_inject_interrupt(vcpu, vec, 0);
 
        return deliver;
 }
@@ -392,64 +281,7 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
        }
 
        /* Tell the guest about our interrupt status */
-       if (*pending)
-               vcpu->arch.shared->int_pending = 1;
-       else if (old_pending)
-               vcpu->arch.shared->int_pending = 0;
-}
-
-void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
-{
-       u32 host_pvr;
-
-       vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB;
-       vcpu->arch.pvr = pvr;
-#ifdef CONFIG_PPC_BOOK3S_64
-       if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
-               kvmppc_mmu_book3s_64_init(vcpu);
-               to_book3s(vcpu)->hior = 0xfff00000;
-               to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
-       } else
-#endif
-       {
-               kvmppc_mmu_book3s_32_init(vcpu);
-               to_book3s(vcpu)->hior = 0;
-               to_book3s(vcpu)->msr_mask = 0xffffffffULL;
-       }
-
-       /* If we are in hypervisor level on 970, we can tell the CPU to
-        * treat DCBZ as 32 bytes store */
-       vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32;
-       if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) &&
-           !strcmp(cur_cpu_spec->platform, "ppc970"))
-               vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
-
-       /* Cell performs badly if MSR_FEx are set. So let's hope nobody
-          really needs them in a VM on Cell and force disable them. */
-       if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be"))
-               to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1);
-
-#ifdef CONFIG_PPC_BOOK3S_32
-       /* 32 bit Book3S always has 32 byte dcbz */
-       vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
-#endif
-
-       /* On some CPUs we can execute paired single operations natively */
-       asm ( "mfpvr %0" : "=r"(host_pvr));
-       switch (host_pvr) {
-       case 0x00080200:        /* lonestar 2.0 */
-       case 0x00088202:        /* lonestar 2.2 */
-       case 0x70000100:        /* gekko 1.0 */
-       case 0x00080100:        /* gekko 2.0 */
-       case 0x00083203:        /* gekko 2.3a */
-       case 0x00083213:        /* gekko 2.3b */
-       case 0x00083204:        /* gekko 2.4 */
-       case 0x00083214:        /* gekko 2.4e (8SE) - retail HW2 */
-       case 0x00087200:        /* broadway */
-               vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS;
-               /* Enable HID2.PSE - in case we need it later */
-               mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29));
-       }
+       kvmppc_update_int_pending(vcpu, *pending, old_pending);
 }
 
 pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -471,44 +303,6 @@ pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
        return gfn_to_pfn(vcpu->kvm, gfn);
 }
 
-/* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To
- * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to
- * emulate 32 bytes dcbz length.
- *
- * The Book3s_64 inventors also realized this case and implemented a special bit
- * in the HID5 register, which is a hypervisor ressource. Thus we can't use it.
- *
- * My approach here is to patch the dcbz instruction on executing pages.
- */
-static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
-{
-       struct page *hpage;
-       u64 hpage_offset;
-       u32 *page;
-       int i;
-
-       hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
-       if (is_error_page(hpage)) {
-               kvm_release_page_clean(hpage);
-               return;
-       }
-
-       hpage_offset = pte->raddr & ~PAGE_MASK;
-       hpage_offset &= ~0xFFFULL;
-       hpage_offset /= 4;
-
-       get_page(hpage);
-       page = kmap_atomic(hpage, KM_USER0);
-
-       /* patch dcbz into reserved instruction, so we trap */
-       for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++)
-               if ((page[i] & 0xff0007ff) == INS_DCBZ)
-                       page[i] &= 0xfffffff7;
-
-       kunmap_atomic(page, KM_USER0);
-       put_page(hpage);
-}
-
 static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data,
                         struct kvmppc_pte *pte)
 {
@@ -606,519 +400,6 @@ mmio:
        return EMULATE_DO_MMIO;
 }
 
-static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
-       ulong mp_pa = vcpu->arch.magic_page_pa;
-
-       if (unlikely(mp_pa) &&
-           unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) {
-               return 1;
-       }
-
-       return kvm_is_visible_gfn(vcpu->kvm, gfn);
-}
-
-int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                           ulong eaddr, int vec)
-{
-       bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE);
-       int r = RESUME_GUEST;
-       int relocated;
-       int page_found = 0;
-       struct kvmppc_pte pte;
-       bool is_mmio = false;
-       bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false;
-       bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false;
-       u64 vsid;
-
-       relocated = data ? dr : ir;
-
-       /* Resolve real address if translation turned on */
-       if (relocated) {
-               page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data);
-       } else {
-               pte.may_execute = true;
-               pte.may_read = true;
-               pte.may_write = true;
-               pte.raddr = eaddr & KVM_PAM;
-               pte.eaddr = eaddr;
-               pte.vpage = eaddr >> 12;
-       }
-
-       switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
-       case 0:
-               pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12));
-               break;
-       case MSR_DR:
-       case MSR_IR:
-               vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
-
-               if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR)
-                       pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12));
-               else
-                       pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12));
-               pte.vpage |= vsid;
-
-               if (vsid == -1)
-                       page_found = -EINVAL;
-               break;
-       }
-
-       if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
-          (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
-               /*
-                * If we do the dcbz hack, we have to NX on every execution,
-                * so we can patch the executing code. This renders our guest
-                * NX-less.
-                */
-               pte.may_execute = !data;
-       }
-
-       if (page_found == -ENOENT) {
-               /* Page not found in guest PTE entries */
-               vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-               vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
-               vcpu->arch.shared->msr |=
-                       (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
-               kvmppc_book3s_queue_irqprio(vcpu, vec);
-       } else if (page_found == -EPERM) {
-               /* Storage protection */
-               vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-               vcpu->arch.shared->dsisr =
-                       to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
-               vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
-               vcpu->arch.shared->msr |=
-                       (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
-               kvmppc_book3s_queue_irqprio(vcpu, vec);
-       } else if (page_found == -EINVAL) {
-               /* Page not found in guest SLB */
-               vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-               kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
-       } else if (!is_mmio &&
-                  kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) {
-               /* The guest's PTE is not mapped yet. Map on the host */
-               kvmppc_mmu_map_page(vcpu, &pte);
-               if (data)
-                       vcpu->stat.sp_storage++;
-               else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
-                       (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32)))
-                       kvmppc_patch_dcbz(vcpu, &pte);
-       } else {
-               /* MMIO */
-               vcpu->stat.mmio_exits++;
-               vcpu->arch.paddr_accessed = pte.raddr;
-               r = kvmppc_emulate_mmio(run, vcpu);
-               if ( r == RESUME_HOST_NV )
-                       r = RESUME_HOST;
-       }
-
-       return r;
-}
-
-static inline int get_fpr_index(int i)
-{
-#ifdef CONFIG_VSX
-       i *= 2;
-#endif
-       return i;
-}
-
-/* Give up external provider (FPU, Altivec, VSX) */
-void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
-{
-       struct thread_struct *t = &current->thread;
-       u64 *vcpu_fpr = vcpu->arch.fpr;
-#ifdef CONFIG_VSX
-       u64 *vcpu_vsx = vcpu->arch.vsr;
-#endif
-       u64 *thread_fpr = (u64*)t->fpr;
-       int i;
-
-       if (!(vcpu->arch.guest_owned_ext & msr))
-               return;
-
-#ifdef DEBUG_EXT
-       printk(KERN_INFO "Giving up ext 0x%lx\n", msr);
-#endif
-
-       switch (msr) {
-       case MSR_FP:
-               giveup_fpu(current);
-               for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
-                       vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
-
-               vcpu->arch.fpscr = t->fpscr.val;
-               break;
-       case MSR_VEC:
-#ifdef CONFIG_ALTIVEC
-               giveup_altivec(current);
-               memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
-               vcpu->arch.vscr = t->vscr;
-#endif
-               break;
-       case MSR_VSX:
-#ifdef CONFIG_VSX
-               __giveup_vsx(current);
-               for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
-                       vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
-#endif
-               break;
-       default:
-               BUG();
-       }
-
-       vcpu->arch.guest_owned_ext &= ~msr;
-       current->thread.regs->msr &= ~msr;
-       kvmppc_recalc_shadow_msr(vcpu);
-}
-
-static int kvmppc_read_inst(struct kvm_vcpu *vcpu)
-{
-       ulong srr0 = kvmppc_get_pc(vcpu);
-       u32 last_inst = kvmppc_get_last_inst(vcpu);
-       int ret;
-
-       ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
-       if (ret == -ENOENT) {
-               ulong msr = vcpu->arch.shared->msr;
-
-               msr = kvmppc_set_field(msr, 33, 33, 1);
-               msr = kvmppc_set_field(msr, 34, 36, 0);
-               vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0);
-               kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
-               return EMULATE_AGAIN;
-       }
-
-       return EMULATE_DONE;
-}
-
-static int kvmppc_check_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr)
-{
-
-       /* Need to do paired single emulation? */
-       if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE))
-               return EMULATE_DONE;
-
-       /* Read out the instruction */
-       if (kvmppc_read_inst(vcpu) == EMULATE_DONE)
-               /* Need to emulate */
-               return EMULATE_FAIL;
-
-       return EMULATE_AGAIN;
-}
-
-/* Handle external providers (FPU, Altivec, VSX) */
-static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
-                            ulong msr)
-{
-       struct thread_struct *t = &current->thread;
-       u64 *vcpu_fpr = vcpu->arch.fpr;
-#ifdef CONFIG_VSX
-       u64 *vcpu_vsx = vcpu->arch.vsr;
-#endif
-       u64 *thread_fpr = (u64*)t->fpr;
-       int i;
-
-       /* When we have paired singles, we emulate in software */
-       if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)
-               return RESUME_GUEST;
-
-       if (!(vcpu->arch.shared->msr & msr)) {
-               kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-               return RESUME_GUEST;
-       }
-
-       /* We already own the ext */
-       if (vcpu->arch.guest_owned_ext & msr) {
-               return RESUME_GUEST;
-       }
-
-#ifdef DEBUG_EXT
-       printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
-#endif
-
-       current->thread.regs->msr |= msr;
-
-       switch (msr) {
-       case MSR_FP:
-               for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
-                       thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
-
-               t->fpscr.val = vcpu->arch.fpscr;
-               t->fpexc_mode = 0;
-               kvmppc_load_up_fpu();
-               break;
-       case MSR_VEC:
-#ifdef CONFIG_ALTIVEC
-               memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr));
-               t->vscr = vcpu->arch.vscr;
-               t->vrsave = -1;
-               kvmppc_load_up_altivec();
-#endif
-               break;
-       case MSR_VSX:
-#ifdef CONFIG_VSX
-               for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
-                       thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
-               kvmppc_load_up_vsx();
-#endif
-               break;
-       default:
-               BUG();
-       }
-
-       vcpu->arch.guest_owned_ext |= msr;
-
-       kvmppc_recalc_shadow_msr(vcpu);
-
-       return RESUME_GUEST;
-}
-
-int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                       unsigned int exit_nr)
-{
-       int r = RESUME_HOST;
-
-       vcpu->stat.sum_exits++;
-
-       run->exit_reason = KVM_EXIT_UNKNOWN;
-       run->ready_for_interrupt_injection = 1;
-
-       trace_kvm_book3s_exit(exit_nr, vcpu);
-       kvm_resched(vcpu);
-       switch (exit_nr) {
-       case BOOK3S_INTERRUPT_INST_STORAGE:
-               vcpu->stat.pf_instruc++;
-
-#ifdef CONFIG_PPC_BOOK3S_32
-               /* We set segments as unused segments when invalidating them. So
-                * treat the respective fault as segment fault. */
-               if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT]
-                   == SR_INVALID) {
-                       kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
-                       r = RESUME_GUEST;
-                       break;
-               }
-#endif
-
-               /* only care about PTEG not found errors, but leave NX alone */
-               if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) {
-                       r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr);
-                       vcpu->stat.sp_instruc++;
-               } else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
-                         (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
-                       /*
-                        * XXX If we do the dcbz hack we use the NX bit to flush&patch the page,
-                        *     so we can't use the NX bit inside the guest. Let's cross our fingers,
-                        *     that no guest that needs the dcbz hack does NX.
-                        */
-                       kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
-                       r = RESUME_GUEST;
-               } else {
-                       vcpu->arch.shared->msr |=
-                               to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
-                       kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-                       r = RESUME_GUEST;
-               }
-               break;
-       case BOOK3S_INTERRUPT_DATA_STORAGE:
-       {
-               ulong dar = kvmppc_get_fault_dar(vcpu);
-               vcpu->stat.pf_storage++;
-
-#ifdef CONFIG_PPC_BOOK3S_32
-               /* We set segments as unused segments when invalidating them. So
-                * treat the respective fault as segment fault. */
-               if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) {
-                       kvmppc_mmu_map_segment(vcpu, dar);
-                       r = RESUME_GUEST;
-                       break;
-               }
-#endif
-
-               /* The only case we need to handle is missing shadow PTEs */
-               if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) {
-                       r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
-               } else {
-                       vcpu->arch.shared->dar = dar;
-                       vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
-                       kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-                       r = RESUME_GUEST;
-               }
-               break;
-       }
-       case BOOK3S_INTERRUPT_DATA_SEGMENT:
-               if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) {
-                       vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-                       kvmppc_book3s_queue_irqprio(vcpu,
-                               BOOK3S_INTERRUPT_DATA_SEGMENT);
-               }
-               r = RESUME_GUEST;
-               break;
-       case BOOK3S_INTERRUPT_INST_SEGMENT:
-               if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) {
-                       kvmppc_book3s_queue_irqprio(vcpu,
-                               BOOK3S_INTERRUPT_INST_SEGMENT);
-               }
-               r = RESUME_GUEST;
-               break;
-       /* We're good on these - the host merely wanted to get our attention */
-       case BOOK3S_INTERRUPT_DECREMENTER:
-               vcpu->stat.dec_exits++;
-               r = RESUME_GUEST;
-               break;
-       case BOOK3S_INTERRUPT_EXTERNAL:
-               vcpu->stat.ext_intr_exits++;
-               r = RESUME_GUEST;
-               break;
-       case BOOK3S_INTERRUPT_PERFMON:
-               r = RESUME_GUEST;
-               break;
-       case BOOK3S_INTERRUPT_PROGRAM:
-       {
-               enum emulation_result er;
-               ulong flags;
-
-program_interrupt:
-               flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull;
-
-               if (vcpu->arch.shared->msr & MSR_PR) {
-#ifdef EXIT_DEBUG
-                       printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
-#endif
-                       if ((kvmppc_get_last_inst(vcpu) & 0xff0007ff) !=
-                           (INS_DCBZ & 0xfffffff7)) {
-                               kvmppc_core_queue_program(vcpu, flags);
-                               r = RESUME_GUEST;
-                               break;
-                       }
-               }
-
-               vcpu->stat.emulated_inst_exits++;
-               er = kvmppc_emulate_instruction(run, vcpu);
-               switch (er) {
-               case EMULATE_DONE:
-                       r = RESUME_GUEST_NV;
-                       break;
-               case EMULATE_AGAIN:
-                       r = RESUME_GUEST;
-                       break;
-               case EMULATE_FAIL:
-                       printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
-                              __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
-                       kvmppc_core_queue_program(vcpu, flags);
-                       r = RESUME_GUEST;
-                       break;
-               case EMULATE_DO_MMIO:
-                       run->exit_reason = KVM_EXIT_MMIO;
-                       r = RESUME_HOST_NV;
-                       break;
-               default:
-                       BUG();
-               }
-               break;
-       }
-       case BOOK3S_INTERRUPT_SYSCALL:
-               if (vcpu->arch.osi_enabled &&
-                   (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
-                   (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
-                       /* MOL hypercalls */
-                       u64 *gprs = run->osi.gprs;
-                       int i;
-
-                       run->exit_reason = KVM_EXIT_OSI;
-                       for (i = 0; i < 32; i++)
-                               gprs[i] = kvmppc_get_gpr(vcpu, i);
-                       vcpu->arch.osi_needed = 1;
-                       r = RESUME_HOST_NV;
-               } else if (!(vcpu->arch.shared->msr & MSR_PR) &&
-                   (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
-                       /* KVM PV hypercalls */
-                       kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
-                       r = RESUME_GUEST;
-               } else {
-                       /* Guest syscalls */
-                       vcpu->stat.syscall_exits++;
-                       kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-                       r = RESUME_GUEST;
-               }
-               break;
-       case BOOK3S_INTERRUPT_FP_UNAVAIL:
-       case BOOK3S_INTERRUPT_ALTIVEC:
-       case BOOK3S_INTERRUPT_VSX:
-       {
-               int ext_msr = 0;
-
-               switch (exit_nr) {
-               case BOOK3S_INTERRUPT_FP_UNAVAIL: ext_msr = MSR_FP;  break;
-               case BOOK3S_INTERRUPT_ALTIVEC:    ext_msr = MSR_VEC; break;
-               case BOOK3S_INTERRUPT_VSX:        ext_msr = MSR_VSX; break;
-               }
-
-               switch (kvmppc_check_ext(vcpu, exit_nr)) {
-               case EMULATE_DONE:
-                       /* everything ok - let's enable the ext */
-                       r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr);
-                       break;
-               case EMULATE_FAIL:
-                       /* we need to emulate this instruction */
-                       goto program_interrupt;
-                       break;
-               default:
-                       /* nothing to worry about - go again */
-                       break;
-               }
-               break;
-       }
-       case BOOK3S_INTERRUPT_ALIGNMENT:
-               if (kvmppc_read_inst(vcpu) == EMULATE_DONE) {
-                       vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu,
-                               kvmppc_get_last_inst(vcpu));
-                       vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu,
-                               kvmppc_get_last_inst(vcpu));
-                       kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-               }
-               r = RESUME_GUEST;
-               break;
-       case BOOK3S_INTERRUPT_MACHINE_CHECK:
-       case BOOK3S_INTERRUPT_TRACE:
-               kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-               r = RESUME_GUEST;
-               break;
-       default:
-               /* Ugh - bork here! What did we get? */
-               printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
-                       exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1);
-               r = RESUME_HOST;
-               BUG();
-               break;
-       }
-
-
-       if (!(r & RESUME_HOST)) {
-               /* To avoid clobbering exit_reason, only check for signals if
-                * we aren't already exiting to userspace for some other
-                * reason. */
-               if (signal_pending(current)) {
-#ifdef EXIT_DEBUG
-                       printk(KERN_EMERG "KVM: Going back to host\n");
-#endif
-                       vcpu->stat.signal_exits++;
-                       run->exit_reason = KVM_EXIT_INTR;
-                       r = -EINTR;
-               } else {
-                       /* In case an interrupt came in that was triggered
-                        * from userspace (like DEC), we need to check what
-                        * to inject now! */
-                       kvmppc_core_deliver_interrupts(vcpu);
-               }
-       }
-
-       trace_kvm_book3s_reenter(r, vcpu);
-
-       return r;
-}
-
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
        return 0;
@@ -1179,69 +460,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        return 0;
 }
 
-int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
-                                  struct kvm_sregs *sregs)
-{
-       struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
-       int i;
-
-       sregs->pvr = vcpu->arch.pvr;
-
-       sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
-       if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
-               for (i = 0; i < 64; i++) {
-                       sregs->u.s.ppc64.slb[i].slbe = vcpu3s->slb[i].orige | i;
-                       sregs->u.s.ppc64.slb[i].slbv = vcpu3s->slb[i].origv;
-               }
-       } else {
-               for (i = 0; i < 16; i++)
-                       sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i];
-
-               for (i = 0; i < 8; i++) {
-                       sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw;
-                       sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw;
-               }
-       }
-
-       return 0;
-}
-
-int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
-                                  struct kvm_sregs *sregs)
-{
-       struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
-       int i;
-
-       kvmppc_set_pvr(vcpu, sregs->pvr);
-
-       vcpu3s->sdr1 = sregs->u.s.sdr1;
-       if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
-               for (i = 0; i < 64; i++) {
-                       vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv,
-                                                   sregs->u.s.ppc64.slb[i].slbe);
-               }
-       } else {
-               for (i = 0; i < 16; i++) {
-                       vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]);
-               }
-               for (i = 0; i < 8; i++) {
-                       kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), false,
-                                      (u32)sregs->u.s.ppc32.ibat[i]);
-                       kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), true,
-                                      (u32)(sregs->u.s.ppc32.ibat[i] >> 32));
-                       kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), false,
-                                      (u32)sregs->u.s.ppc32.dbat[i]);
-                       kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), true,
-                                      (u32)(sregs->u.s.ppc32.dbat[i] >> 32));
-               }
-       }
-
-       /* Flush the MMU after messing with the segments */
-       kvmppc_mmu_pte_flush(vcpu, 0, 0);
-
-       return 0;
-}
-
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
        return -ENOTSUPP;
@@ -1296,202 +514,3 @@ out:
        mutex_unlock(&kvm->slots_lock);
        return r;
 }
-
-int kvmppc_core_check_processor_compat(void)
-{
-       return 0;
-}
-
-struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
-{
-       struct kvmppc_vcpu_book3s *vcpu_book3s;
-       struct kvm_vcpu *vcpu;
-       int err = -ENOMEM;
-       unsigned long p;
-
-       vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s));
-       if (!vcpu_book3s)
-               goto out;
-
-       vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *)
-               kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL);
-       if (!vcpu_book3s->shadow_vcpu)
-               goto free_vcpu;
-
-       vcpu = &vcpu_book3s->vcpu;
-       err = kvm_vcpu_init(vcpu, kvm, id);
-       if (err)
-               goto free_shadow_vcpu;
-
-       p = __get_free_page(GFP_KERNEL|__GFP_ZERO);
-       /* the real shared page fills the last 4k of our page */
-       vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096);
-       if (!p)
-               goto uninit_vcpu;
-
-       vcpu->arch.host_retip = kvm_return_point;
-       vcpu->arch.host_msr = mfmsr();
-#ifdef CONFIG_PPC_BOOK3S_64
-       /* default to book3s_64 (970fx) */
-       vcpu->arch.pvr = 0x3C0301;
-#else
-       /* default to book3s_32 (750) */
-       vcpu->arch.pvr = 0x84202;
-#endif
-       kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
-       vcpu_book3s->slb_nr = 64;
-
-       /* remember where some real-mode handlers are */
-       vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem;
-       vcpu->arch.trampoline_enter = kvmppc_trampoline_enter;
-       vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
-#ifdef CONFIG_PPC_BOOK3S_64
-       vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
-#else
-       vcpu->arch.rmcall = (ulong)kvmppc_rmcall;
-#endif
-
-       vcpu->arch.shadow_msr = MSR_USER64;
-
-       err = kvmppc_mmu_init(vcpu);
-       if (err < 0)
-               goto uninit_vcpu;
-
-       return vcpu;
-
-uninit_vcpu:
-       kvm_vcpu_uninit(vcpu);
-free_shadow_vcpu:
-       kfree(vcpu_book3s->shadow_vcpu);
-free_vcpu:
-       vfree(vcpu_book3s);
-out:
-       return ERR_PTR(err);
-}
-
-void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
-{
-       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
-
-       free_page((unsigned long)vcpu->arch.shared & PAGE_MASK);
-       kvm_vcpu_uninit(vcpu);
-       kfree(vcpu_book3s->shadow_vcpu);
-       vfree(vcpu_book3s);
-}
-
-extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
-int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
-{
-       int ret;
-       double fpr[32][TS_FPRWIDTH];
-       unsigned int fpscr;
-       int fpexc_mode;
-#ifdef CONFIG_ALTIVEC
-       vector128 vr[32];
-       vector128 vscr;
-       unsigned long uninitialized_var(vrsave);
-       int used_vr;
-#endif
-#ifdef CONFIG_VSX
-       int used_vsr;
-#endif
-       ulong ext_msr;
-
-       /* No need to go into the guest when all we do is going out */
-       if (signal_pending(current)) {
-               kvm_run->exit_reason = KVM_EXIT_INTR;
-               return -EINTR;
-       }
-
-       /* Save FPU state in stack */
-       if (current->thread.regs->msr & MSR_FP)
-               giveup_fpu(current);
-       memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr));
-       fpscr = current->thread.fpscr.val;
-       fpexc_mode = current->thread.fpexc_mode;
-
-#ifdef CONFIG_ALTIVEC
-       /* Save Altivec state in stack */
-       used_vr = current->thread.used_vr;
-       if (used_vr) {
-               if (current->thread.regs->msr & MSR_VEC)
-                       giveup_altivec(current);
-               memcpy(vr, current->thread.vr, sizeof(current->thread.vr));
-               vscr = current->thread.vscr;
-               vrsave = current->thread.vrsave;
-       }
-#endif
-
-#ifdef CONFIG_VSX
-       /* Save VSX state in stack */
-       used_vsr = current->thread.used_vsr;
-       if (used_vsr && (current->thread.regs->msr & MSR_VSX))
-                       __giveup_vsx(current);
-#endif
-
-       /* Remember the MSR with disabled extensions */
-       ext_msr = current->thread.regs->msr;
-
-       /* XXX we get called with irq disabled - change that! */
-       local_irq_enable();
-
-       /* Preload FPU if it's enabled */
-       if (vcpu->arch.shared->msr & MSR_FP)
-               kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
-
-       ret = __kvmppc_vcpu_entry(kvm_run, vcpu);
-
-       local_irq_disable();
-
-       current->thread.regs->msr = ext_msr;
-
-       /* Make sure we save the guest FPU/Altivec/VSX state */
-       kvmppc_giveup_ext(vcpu, MSR_FP);
-       kvmppc_giveup_ext(vcpu, MSR_VEC);
-       kvmppc_giveup_ext(vcpu, MSR_VSX);
-
-       /* Restore FPU state from stack */
-       memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
-       current->thread.fpscr.val = fpscr;
-       current->thread.fpexc_mode = fpexc_mode;
-
-#ifdef CONFIG_ALTIVEC
-       /* Restore Altivec state from stack */
-       if (used_vr && current->thread.used_vr) {
-               memcpy(current->thread.vr, vr, sizeof(current->thread.vr));
-               current->thread.vscr = vscr;
-               current->thread.vrsave = vrsave;
-       }
-       current->thread.used_vr = used_vr;
-#endif
-
-#ifdef CONFIG_VSX
-       current->thread.used_vsr = used_vsr;
-#endif
-
-       return ret;
-}
-
-static int kvmppc_book3s_init(void)
-{
-       int r;
-
-       r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
-                    THIS_MODULE);
-
-       if (r)
-               return r;
-
-       r = kvmppc_mmu_hpte_sysinit();
-
-       return r;
-}
-
-static void kvmppc_book3s_exit(void)
-{
-       kvmppc_mmu_hpte_sysexit();
-       kvm_exit();
-}
-
-module_init(kvmppc_book3s_init);
-module_exit(kvmppc_book3s_exit);
index d7889ef..c6d3e19 100644 (file)
@@ -41,36 +41,36 @@ static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu)
 }
 
 static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
-                               struct kvmppc_vcpu_book3s *vcpu_book3s,
+                               struct kvm_vcpu *vcpu,
                                gva_t eaddr)
 {
        int i;
        u64 esid = GET_ESID(eaddr);
        u64 esid_1t = GET_ESID_1T(eaddr);
 
-       for (i = 0; i < vcpu_book3s->slb_nr; i++) {
+       for (i = 0; i < vcpu->arch.slb_nr; i++) {
                u64 cmp_esid = esid;
 
-               if (!vcpu_book3s->slb[i].valid)
+               if (!vcpu->arch.slb[i].valid)
                        continue;
 
-               if (vcpu_book3s->slb[i].tb)
+               if (vcpu->arch.slb[i].tb)
                        cmp_esid = esid_1t;
 
-               if (vcpu_book3s->slb[i].esid == cmp_esid)
-                       return &vcpu_book3s->slb[i];
+               if (vcpu->arch.slb[i].esid == cmp_esid)
+                       return &vcpu->arch.slb[i];
        }
 
        dprintk("KVM: No SLB entry found for 0x%lx [%llx | %llx]\n",
                eaddr, esid, esid_1t);
-       for (i = 0; i < vcpu_book3s->slb_nr; i++) {
-           if (vcpu_book3s->slb[i].vsid)
+       for (i = 0; i < vcpu->arch.slb_nr; i++) {
+           if (vcpu->arch.slb[i].vsid)
                dprintk("  %d: %c%c%c %llx %llx\n", i,
-                       vcpu_book3s->slb[i].valid ? 'v' : ' ',
-                       vcpu_book3s->slb[i].large ? 'l' : ' ',
-                       vcpu_book3s->slb[i].tb    ? 't' : ' ',
-                       vcpu_book3s->slb[i].esid,
-                       vcpu_book3s->slb[i].vsid);
+                       vcpu->arch.slb[i].valid ? 'v' : ' ',
+                       vcpu->arch.slb[i].large ? 'l' : ' ',
+                       vcpu->arch.slb[i].tb    ? 't' : ' ',
+                       vcpu->arch.slb[i].esid,
+                       vcpu->arch.slb[i].vsid);
        }
 
        return NULL;
@@ -81,7 +81,7 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
 {
        struct kvmppc_slb *slb;
 
-       slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), eaddr);
+       slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);
        if (!slb)
                return 0;
 
@@ -180,7 +180,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
                return 0;
        }
 
-       slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, eaddr);
+       slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);
        if (!slbe)
                goto no_seg_found;
 
@@ -320,10 +320,10 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
        esid_1t = GET_ESID_1T(rb);
        slb_nr = rb & 0xfff;
 
-       if (slb_nr > vcpu_book3s->slb_nr)
+       if (slb_nr > vcpu->arch.slb_nr)
                return;
 
-       slbe = &vcpu_book3s->slb[slb_nr];
+       slbe = &vcpu->arch.slb[slb_nr];
 
        slbe->large = (rs & SLB_VSID_L) ? 1 : 0;
        slbe->tb    = (rs & SLB_VSID_B_1T) ? 1 : 0;
@@ -344,38 +344,35 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
 
 static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr)
 {
-       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
        struct kvmppc_slb *slbe;
 
-       if (slb_nr > vcpu_book3s->slb_nr)
+       if (slb_nr > vcpu->arch.slb_nr)
                return 0;
 
-       slbe = &vcpu_book3s->slb[slb_nr];
+       slbe = &vcpu->arch.slb[slb_nr];
 
        return slbe->orige;
 }
 
 static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr)
 {
-       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
        struct kvmppc_slb *slbe;
 
-       if (slb_nr > vcpu_book3s->slb_nr)
+       if (slb_nr > vcpu->arch.slb_nr)
                return 0;
 
-       slbe = &vcpu_book3s->slb[slb_nr];
+       slbe = &vcpu->arch.slb[slb_nr];
 
        return slbe->origv;
 }
 
 static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)
 {
-       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
        struct kvmppc_slb *slbe;
 
        dprintk("KVM MMU: slbie(0x%llx)\n", ea);
 
-       slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, ea);
+       slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
 
        if (!slbe)
                return;
@@ -389,13 +386,12 @@ static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)
 
 static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)
 {
-       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
        int i;
 
        dprintk("KVM MMU: slbia()\n");
 
-       for (i = 1; i < vcpu_book3s->slb_nr; i++)
-               vcpu_book3s->slb[i].valid = false;
+       for (i = 1; i < vcpu->arch.slb_nr; i++)
+               vcpu->arch.slb[i].valid = false;
 
        if (vcpu->arch.shared->msr & MSR_IR) {
                kvmppc_mmu_flush_segments(vcpu);
@@ -464,7 +460,7 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
        ulong mp_ea = vcpu->arch.magic_page_ea;
 
        if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
-               slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), ea);
+               slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
                if (slb)
                        gvsid = slb->vsid;
        }
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
new file mode 100644 (file)
index 0000000..bc3a2ea
--- /dev/null
@@ -0,0 +1,180 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/cputable.h>
+
+/* For now use fixed-size 16MB page table */
+#define HPT_ORDER      24
+#define HPT_NPTEG      (1ul << (HPT_ORDER - 7))        /* 128B per pteg */
+#define HPT_HASH_MASK  (HPT_NPTEG - 1)
+
+/* Pages in the VRMA are 16MB pages */
+#define VRMA_PAGE_ORDER        24
+#define VRMA_VSID      0x1ffffffUL     /* 1TB VSID reserved for VRMA */
+
+/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
+#define MAX_LPID_970   63
+#define NR_LPIDS       (LPID_RSVD + 1)
+unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)];
+
+/*
+ * Allocate the guest hashed page table (HPT) and claim an LPID for @kvm.
+ *
+ * On success the HPT address is stored in kvm->arch.hpt_virt, the SDR1
+ * value in kvm->arch.sdr1 and the LPID in kvm->arch.lpid, and 0 is
+ * returned.  Returns -ENOMEM if the pages cannot be allocated or if no
+ * LPID is free (the pages are released again in the latter case).
+ */
+long kvmppc_alloc_hpt(struct kvm *kvm)
+{
+       unsigned long htab;
+       unsigned long id;
+
+       htab = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
+                               HPT_ORDER - PAGE_SHIFT);
+       if (!htab) {
+               pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
+               return -ENOMEM;
+       }
+       kvm->arch.hpt_virt = htab;
+
+       /* Look again if the bit we found was taken before we could set it */
+       for (;;) {
+               id = find_first_zero_bit(lpid_inuse, NR_LPIDS);
+               if (id >= NR_LPIDS) {
+                       pr_err("kvm_alloc_hpt: No LPIDs free\n");
+                       free_pages(htab, HPT_ORDER - PAGE_SHIFT);
+                       return -ENOMEM;
+               }
+               if (!test_and_set_bit(id, lpid_inuse))
+                       break;
+       }
+
+       kvm->arch.lpid = id;
+       kvm->arch.sdr1 = __pa(htab) | (HPT_ORDER - 18);
+
+       pr_info("KVM guest htab at %lx, LPID %lx\n", htab, id);
+       return 0;
+}
+
+/*
+ * Undo kvmppc_alloc_hpt(): mark the VM's LPID as free again in
+ * lpid_inuse and give back the pages backing the hashed page table.
+ */
+void kvmppc_free_hpt(struct kvm *kvm)
+{
+       clear_bit(kvm->arch.lpid, lpid_inuse);
+       free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
+}
+
+/*
+ * Fill the guest hash table with bolted large-page HPTEs for the VRMA,
+ * one per entry of kvm->arch.ram_pginfo, stopping at the first entry
+ * whose pfn is zero.  Does nothing if ram_pginfo has not been set up.
+ *
+ * @mem is not referenced by this function.
+ */
+void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
+{
+       unsigned long i;
+       unsigned long npages = kvm->arch.ram_npages;
+       unsigned long pfn;
+       unsigned long *hpte;
+       unsigned long hash;
+       struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo;
+
+       if (!pginfo)
+               return;
+
+       /* VRMA can't be > 1TB */
+       if (npages > 1ul << (40 - kvm->arch.ram_porder))
+               npages = 1ul << (40 - kvm->arch.ram_porder);
+       /* Can't use more than 1 HPTE per HPTEG */
+       if (npages > HPT_NPTEG)
+               npages = HPT_NPTEG;
+
+       for (i = 0; i < npages; ++i) {
+               pfn = pginfo[i].pfn;
+               if (!pfn)
+                       break;
+               /* can't use hpt_hash since va > 64 bits */
+               hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
+               /*
+                * We assume that the hash table is empty and no
+                * vcpus are using it at this stage.  Since we create
+                * at most one HPTE per HPTEG, we just assume entry 7
+                * is available and use it.
+                */
+               /* each HPTEG is 128 bytes (hash << 7); each HPTE is two longs */
+               hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7));
+               hpte += 7 * 2;
+               /* HPTE low word - RPN, protection, etc. */
+               hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C |
+                       HPTE_R_M | PP_RWXX;
+               /* second word must be written before the word carrying HPTE_V_VALID */
+               wmb();
+               hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
+                       (i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED |
+                       HPTE_V_LARGE | HPTE_V_VALID;
+       }
+}
+
+/*
+ * Initialise the LPID allocation bitmap.
+ *
+ * Returns -EINVAL if the CPU does not have CPU_FTR_HVMODE.  Otherwise
+ * clears lpid_inuse, marks the host's LPID and the reserved LPID as
+ * taken, and returns 0.
+ */
+int kvmppc_mmu_hv_init(void)
+{
+       /* PPC970 defaults; overridden below for POWER7 */
+       unsigned long host_lpid = 0;
+       unsigned long rsvd_lpid = MAX_LPID_970;
+
+       if (!cpu_has_feature(CPU_FTR_HVMODE))
+               return -EINVAL;
+
+       memset(lpid_inuse, 0, sizeof(lpid_inuse));
+
+       if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+               /* POWER7 */
+               host_lpid = mfspr(SPRN_LPID);
+               rsvd_lpid = LPID_RSVD;
+       }
+
+       set_bit(host_lpid, lpid_inuse);
+       /* rsvd_lpid is reserved for use in partition switching */
+       set_bit(rsvd_lpid, lpid_inuse);
+
+       return 0;
+}
+
+/* Intentionally empty: this implementation has no per-vcpu MMU state to tear down here. */
+void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+}
+
+/* mmu->reset_msr hook: set the guest MSR to just MSR_SF | MSR_ME. */
+static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
+{
+       kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
+}
+
+/*
+ * mmu->xlate hook.  No translation is performed here: every lookup
+ * fails with -ENOENT and @gpte is left untouched.
+ */
+static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+                               struct kvmppc_pte *gpte, bool data)
+{
+       return -ENOENT;
+}
+
+/*
+ * Per-vcpu MMU setup: choose the SLB size for the host CPU type,
+ * install the xlate/reset_msr hooks and flag the vcpu as using an SLB.
+ */
+void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
+{
+       /* 32 SLB entries on POWER7 (ARCH_206), 64 otherwise */
+       vcpu->arch.slb_nr = cpu_has_feature(CPU_FTR_ARCH_206) ? 32 : 64;
+
+       vcpu->arch.mmu.xlate = kvmppc_mmu_book3s_64_hv_xlate;
+       vcpu->arch.mmu.reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
+
+       vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
+}
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
new file mode 100644 (file)
index 0000000..ea0f8c5
--- /dev/null
@@ -0,0 +1,73 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/list.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/kvm_host.h>
+#include <asm/udbg.h>
+
+#define TCES_PER_PAGE  (PAGE_SIZE / sizeof(u64))
+
+/*
+ * Handle H_PUT_TCE for TCE tables registered with the kernel.
+ *
+ * Finds the table whose liobn equals @liobn and stores @tce in the slot
+ * selected by @ioba.  Returns H_PARAMETER if @ioba is not below the
+ * table's window_size, H_SUCCESS once the entry has been written, and
+ * H_TOO_HARD if no table matches so the call can go to userspace.
+ */
+long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+                     unsigned long ioba, unsigned long tce)
+{
+       struct kvmppc_spapr_tce_table *table;
+       unsigned long slot;
+       u64 *entries;
+
+       list_for_each_entry(table, &vcpu->kvm->arch.spapr_tce_tables, list) {
+               if (table->liobn != liobn)
+                       continue;
+
+               if (ioba >= table->window_size)
+                       return H_PARAMETER;
+
+               /* locate the backing page, then the entry within it */
+               slot = ioba >> SPAPR_TCE_SHIFT;
+               entries = (u64 *)page_address(table->pages[slot / TCES_PER_PAGE]);
+
+               /* FIXME: Need to validate the TCE itself */
+               entries[slot % TCES_PER_PAGE] = tce;
+               return H_SUCCESS;
+       }
+
+       /* Didn't find the liobn, punt it to userspace */
+       return H_TOO_HARD;
+}
index 1dd5a1d..88c8f26 100644 (file)
 #include <linux/module.h>
 #include <asm/kvm_book3s.h>
 
-EXPORT_SYMBOL_GPL(kvmppc_trampoline_enter);
-EXPORT_SYMBOL_GPL(kvmppc_trampoline_lowmem);
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline);
+#else
+EXPORT_SYMBOL_GPL(kvmppc_handler_trampoline_enter);
+EXPORT_SYMBOL_GPL(kvmppc_handler_lowmem_trampoline);
 EXPORT_SYMBOL_GPL(kvmppc_rmcall);
 EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu);
 #ifdef CONFIG_ALTIVEC
@@ -30,3 +33,5 @@ EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec);
 #ifdef CONFIG_VSX
 EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx);
 #endif
+#endif
+
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
new file mode 100644 (file)
index 0000000..cc0d7f1
--- /dev/null
@@ -0,0 +1,1269 @@
+/*
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *    Paul Mackerras <paulus@au1.ibm.com>
+ *    Alexander Graf <agraf@suse.de>
+ *    Kevin Wolf <mail@kevin-wolf.de>
+ *
+ * Description: KVM functions specific to running on Book 3S
+ * processors in hypervisor mode (specifically POWER7 and later).
+ *
+ * This file is derived from arch/powerpc/kvm/book3s.c,
+ * by Alexander Graf <agraf@suse.de>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/preempt.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/page-flags.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu_context.h>
+#include <asm/lppaca.h>
+#include <asm/processor.h>
+#include <asm/cputhreads.h>
+#include <asm/page.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+
+/*
+ * For now, limit memory to 64GB and require it to be large pages.
+ * This value is chosen because it makes the ram_pginfo array be
+ * 64kB in size, which is about as large as we want to be trying
+ * to allocate with kmalloc.
+ */
+#define MAX_MEM_ORDER          36
+
+#define LARGE_PAGE_ORDER       24      /* 16MB pages */
+
+/* #define EXIT_DEBUG */
+/* #define EXIT_DEBUG_SIMPLE */
+/* #define EXIT_DEBUG_INT */
+
+/*
+ * Record @vcpu and its virtual core in the kvm_hstate area of the
+ * current CPU's paca.  @cpu is not used.
+ */
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+       local_paca->kvm_hstate.kvm_vcpu = vcpu;
+       local_paca->kvm_hstate.kvm_vcore = vcpu->arch.vcore;
+}
+
+/* Intentionally empty: nothing is undone when the vcpu is put. */
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+}
+
+static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu);
+static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu);
+
+/*
+ * Put @vcpu to sleep via kvm_vcpu_block().
+ *
+ * A decrementer interrupt is queued first if the decrementer has already
+ * expired, and the function returns without sleeping if any exception is
+ * pending.  While asleep, an hrtimer is armed for the remaining
+ * decrementer time (unless dec_expires is ~0), and the vcore's blocked
+ * accounting is updated around the sleep.
+ */
+void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
+{
+       u64 now;
+       unsigned long dec_nsec;
+
+       now = get_tb();
+       if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu))
+               kvmppc_core_queue_dec(vcpu);
+       if (vcpu->arch.pending_exceptions)
+               return;
+       if (vcpu->arch.dec_expires != ~(u64)0) {
+               /* convert remaining timebase ticks to nanoseconds */
+               dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC /
+                       tb_ticks_per_sec;
+               hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
+                             HRTIMER_MODE_REL);
+       }
+
+       kvmppc_vcpu_blocked(vcpu);
+
+       kvm_vcpu_block(vcpu);
+       vcpu->stat.halt_wakeup++;
+
+       if (vcpu->arch.dec_expires != ~(u64)0)
+               hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+
+       kvmppc_vcpu_unblocked(vcpu);
+}
+
+/* Store @msr as the guest MSR in the vcpu's shared register block. */
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
+{
+       vcpu->arch.shregs.msr = msr;
+}
+
+/* Record @pvr as the vcpu's processor version register value. */
+void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
+{
+       vcpu->arch.pvr = pvr;
+}
+
+/*
+ * Print the vcpu's register state with pr_err(): pc/msr/trap, the 32
+ * GPRs (two per line), ctr/lr, srr0/1, sprg0-3, cr/xer/dsisr, dar, the
+ * fault dar/dsisr, the first slb_max SLB entries, and lpcr/sdr1/last_inst.
+ */
+void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
+{
+       int r;
+
+       pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
+       pr_err("pc  = %.16lx  msr = %.16llx  trap = %x\n",
+              vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap);
+       /* r0-r15 in the left column, r16-r31 in the right */
+       for (r = 0; r < 16; ++r)
+               pr_err("r%2d = %.16lx  r%d = %.16lx\n",
+                      r, kvmppc_get_gpr(vcpu, r),
+                      r+16, kvmppc_get_gpr(vcpu, r+16));
+       pr_err("ctr = %.16lx  lr  = %.16lx\n",
+              vcpu->arch.ctr, vcpu->arch.lr);
+       pr_err("srr0 = %.16llx srr1 = %.16llx\n",
+              vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
+       pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
+              vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
+       pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
+              vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
+       pr_err("cr = %.8x  xer = %.16lx  dsisr = %.8x\n",
+              vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr);
+       pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
+       pr_err("fault dar = %.16lx dsisr = %.8x\n",
+              vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+       pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
+       for (r = 0; r < vcpu->arch.slb_max; ++r)
+               pr_err("  ESID = %.16llx VSID = %.16llx\n",
+                      vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
+       pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
+              vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1,
+              vcpu->arch.last_inst);
+}
+
+/*
+ * Look up the vcpu of @kvm whose vcpu_id equals @id.
+ *
+ * The search runs under kvm->lock.  Returns the matching vcpu, or NULL
+ * if none has that id.
+ */
+struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
+{
+       struct kvm_vcpu *cur;
+       struct kvm_vcpu *match = NULL;
+       int idx;
+
+       mutex_lock(&kvm->lock);
+       kvm_for_each_vcpu(idx, cur, kvm) {
+               if (cur->vcpu_id != id)
+                       continue;
+               match = cur;
+               break;
+       }
+       mutex_unlock(&kvm->lock);
+
+       return match;
+}
+
+/*
+ * Initialise a newly registered VPA: set shared_proc and yield_count
+ * to 1.  @vcpu is not used.
+ */
+static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
+{
+       vpa->shared_proc = 1;
+       vpa->yield_count = 1;
+}
+
+/*
+ * Handle the H_REGISTER_VPA hypercall: register or unregister one of
+ * the per-vcpu areas of the vcpu whose id is @vcpuid.
+ *
+ * @vcpu:   vcpu that issued the hcall
+ * @flags:  raw hcall flags; a 3-bit subfunction is extracted from them
+ *          (1/2/3 = register VPA/DTL/SLB shadow buffer,
+ *           5/6/7 = unregister VPA/DTL/SLB shadow buffer)
+ * @vcpuid: id of the target vcpu
+ * @vpa:    guest logical address of the area (register requests only)
+ *
+ * Returns H_SUCCESS, H_PARAMETER for an invalid argument, or H_RESOURCE
+ * when the request conflicts with what is currently registered.
+ */
+static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
+                                      unsigned long flags,
+                                      unsigned long vcpuid, unsigned long vpa)
+{
+       struct kvm *kvm = vcpu->kvm;
+       unsigned long pg_index, ra, len;
+       unsigned long pg_offset;
+       void *va;
+       struct kvm_vcpu *tvcpu;
+
+       tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
+       if (!tvcpu)
+               return H_PARAMETER;
+
+       flags >>= 63 - 18;
+       flags &= 7;
+       if (flags == 0 || flags == 4)
+               return H_PARAMETER;
+       if (flags < 4) {
+               /* the area must be 128-byte aligned */
+               if (vpa & 0x7f)
+                       return H_PARAMETER;
+               /* registering new area; convert logical addr to real */
+               pg_index = vpa >> kvm->arch.ram_porder;
+               pg_offset = vpa & (kvm->arch.ram_psize - 1);
+               if (pg_index >= kvm->arch.ram_npages)
+                       return H_PARAMETER;
+               if (kvm->arch.ram_pginfo[pg_index].pfn == 0)
+                       return H_PARAMETER;
+               ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT;
+               ra |= pg_offset;
+               va = __va(ra);
+               /* length field at offset 4: 16 bits for a VPA, 32 bits otherwise */
+               if (flags <= 1)
+                       len = *(unsigned short *)(va + 4);
+               else
+                       len = *(unsigned int *)(va + 4);
+               /* the area must not run past the end of its guest page */
+               if (pg_offset + len > kvm->arch.ram_psize)
+                       return H_PARAMETER;
+               switch (flags) {
+               case 1:         /* register VPA */
+                       if (len < 640)
+                               return H_PARAMETER;
+                       tvcpu->arch.vpa = va;
+                       /* initialise on behalf of the target, not the caller */
+                       init_vpa(tvcpu, va);
+                       break;
+               case 2:         /* register DTL */
+                       if (len < 48)
+                               return H_PARAMETER;
+                       if (!tvcpu->arch.vpa)
+                               return H_RESOURCE;
+                       /* only whole 48-byte entries are usable */
+                       len -= len % 48;
+                       tvcpu->arch.dtl = va;
+                       tvcpu->arch.dtl_end = va + len;
+                       break;
+               case 3:         /* register SLB shadow buffer */
+                       if (len < 8)
+                               return H_PARAMETER;
+                       if (!tvcpu->arch.vpa)
+                               return H_RESOURCE;
+                       tvcpu->arch.slb_shadow = va;
+                       break;
+               }
+       } else {
+               switch (flags) {
+               case 5:         /* unregister VPA */
+                       /* the VPA must outlive the DTL and SLB shadow buffer */
+                       if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
+                               return H_RESOURCE;
+                       tvcpu->arch.vpa = NULL;
+                       break;
+               case 6:         /* unregister DTL */
+                       tvcpu->arch.dtl = NULL;
+                       break;
+               case 7:         /* unregister SLB shadow buffer */
+                       tvcpu->arch.slb_shadow = NULL;
+                       break;
+               }
+       }
+       return H_SUCCESS;
+}
+
+/*
+ * Handle the hypercalls that are implemented in the kernel.
+ *
+ * The hcall number is taken from guest r3 and arguments from r4 onward.
+ * H_CEDE, H_PROD, H_CONFER and H_REGISTER_VPA are handled here: the
+ * result is written back to guest r3, hcall_needed is cleared and
+ * RESUME_GUEST is returned.  Any other hcall returns RESUME_HOST
+ * without touching guest state.
+ */
+int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
+{
+       unsigned long req = kvmppc_get_gpr(vcpu, 3);
+       unsigned long target, ret = H_SUCCESS;
+       struct kvm_vcpu *tvcpu;
+
+       switch (req) {
+       case H_CEDE:
+               vcpu->arch.shregs.msr |= MSR_EE;
+               vcpu->arch.ceded = 1;
+               smp_mb();
+               /* a prod that arrived before the cede cancels the sleep */
+               if (!vcpu->arch.prodded)
+                       kvmppc_vcpu_block(vcpu);
+               else
+                       vcpu->arch.prodded = 0;
+               smp_mb();
+               vcpu->arch.ceded = 0;
+               break;
+       case H_PROD:
+               target = kvmppc_get_gpr(vcpu, 4);
+               tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
+               if (!tvcpu) {
+                       ret = H_PARAMETER;
+                       break;
+               }
+               tvcpu->arch.prodded = 1;
+               smp_mb();
+               /*
+                * Wake the *target* if it has ceded.  The calling vcpu is
+                * executing this hcall and so can never be ceded itself.
+                */
+               if (tvcpu->arch.ceded) {
+                       if (waitqueue_active(&tvcpu->wq)) {
+                               wake_up_interruptible(&tvcpu->wq);
+                               tvcpu->stat.halt_wakeup++;
+                       }
+               }
+               break;
+       case H_CONFER:
+               /* accepted, but nothing is done */
+               break;
+       case H_REGISTER_VPA:
+               ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                       kvmppc_get_gpr(vcpu, 5),
+                                       kvmppc_get_gpr(vcpu, 6));
+               break;
+       default:
+               return RESUME_HOST;
+       }
+       kvmppc_set_gpr(vcpu, 3, ret);
+       vcpu->arch.hcall_needed = 0;
+       return RESUME_GUEST;
+}
+
+/*
+ * Decide what to do after the guest has exited with vcpu->arch.trap.
+ *
+ * @run: kvm_run area; exit_reason (and papr_hcall for hypercalls) is
+ *       filled in here
+ * @tsk: task checked for pending signals before resuming the guest
+ *
+ * Returns RESUME_GUEST to re-enter the guest, RESUME_HOST to return to
+ * userspace, or -EINTR if the guest would have been resumed but @tsk
+ * has a signal pending.  An unrecognised trap dumps the registers and
+ * hits BUG().
+ */
+static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                             struct task_struct *tsk)
+{
+       int r = RESUME_HOST;
+
+       vcpu->stat.sum_exits++;
+
+       run->exit_reason = KVM_EXIT_UNKNOWN;
+       run->ready_for_interrupt_injection = 1;
+       switch (vcpu->arch.trap) {
+       /* We're good on these - the host merely wanted to get our attention */
+       case BOOK3S_INTERRUPT_HV_DECREMENTER:
+               vcpu->stat.dec_exits++;
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_EXTERNAL:
+               vcpu->stat.ext_intr_exits++;
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_PERFMON:
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_PROGRAM:
+       {
+               ulong flags;
+               /*
+                * Normally program interrupts are delivered directly
+                * to the guest by the hardware, but we can get here
+                * as a result of a hypervisor emulation interrupt
+                * (e40) getting turned into a 700 by BML RTAS.
+                */
+               flags = vcpu->arch.shregs.msr & 0x1f0000ull;
+               kvmppc_core_queue_program(vcpu, flags);
+               r = RESUME_GUEST;
+               break;
+       }
+       case BOOK3S_INTERRUPT_SYSCALL:
+       {
+               /* hcall - punt to userspace */
+               int i;
+
+               if (vcpu->arch.shregs.msr & MSR_PR) {
+                       /* sc 1 from userspace - reflect to guest syscall */
+                       kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL);
+                       r = RESUME_GUEST;
+                       break;
+               }
+               /* hcall number is in r3, its nine arguments in r4..r12 */
+               run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
+               for (i = 0; i < 9; ++i)
+                       run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
+               run->exit_reason = KVM_EXIT_PAPR_HCALL;
+               vcpu->arch.hcall_needed = 1;
+               r = RESUME_HOST;
+               break;
+       }
+       /*
+        * We get these next two if the guest does a bad real-mode access,
+        * as we have enabled VRMA (virtualized real mode area) mode in the
+        * LPCR.  We just generate an appropriate DSI/ISI to the guest.
+        */
+       case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+               vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr;
+               vcpu->arch.shregs.dar = vcpu->arch.fault_dar;
+               kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_H_INST_STORAGE:
+               kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
+                                       0x08000000);
+               r = RESUME_GUEST;
+               break;
+       /*
+        * This occurs if the guest executes an illegal instruction.
+        * We just generate a program interrupt to the guest, since
+        * we don't emulate any guest instructions at this stage.
+        */
+       case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
+               kvmppc_core_queue_program(vcpu, 0x80000);
+               r = RESUME_GUEST;
+               break;
+       default:
+               /* unexpected trap: log everything we know, then stop */
+               kvmppc_dump_regs(vcpu);
+               printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
+                       vcpu->arch.trap, kvmppc_get_pc(vcpu),
+                       vcpu->arch.shregs.msr);
+               r = RESUME_HOST;
+               BUG();
+               break;
+       }
+
+
+       if (!(r & RESUME_HOST)) {
+               /* To avoid clobbering exit_reason, only check for signals if
+                * we aren't already exiting to userspace for some other
+                * reason. */
+               if (signal_pending(tsk)) {
+                       vcpu->stat.signal_exits++;
+                       run->exit_reason = KVM_EXIT_INTR;
+                       r = -EINTR;
+               } else {
+                       kvmppc_core_deliver_interrupts(vcpu);
+               }
+       }
+
+       return r;
+}
+
+/*
+ * Fill @sregs for the get-sregs ioctl: the vcpu's PVR and the first
+ * slb_max SLB entries; every other field is zeroed.  Always returns 0.
+ */
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+       int i;
+
+       /* Clear first: zeroing after the stores would wipe out the PVR */
+       memset(sregs, 0, sizeof(struct kvm_sregs));
+
+       sregs->pvr = vcpu->arch.pvr;
+       for (i = 0; i < vcpu->arch.slb_max; i++) {
+               sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
+               sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
+       }
+
+       return 0;
+}
+
+/*
+ * Load the vcpu's PVR and SLB from @sregs.
+ *
+ * Only entries with SLB_ESID_V set in slbe are copied; they are packed
+ * into consecutive vcpu SLB slots and slb_max is set to the number
+ * copied.  Always returns 0.
+ */
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+       int src;
+       int dst = 0;
+
+       kvmppc_set_pvr(vcpu, sregs->pvr);
+
+       for (src = 0; src < vcpu->arch.slb_nr; src++) {
+               if (!(sregs->u.s.ppc64.slb[src].slbe & SLB_ESID_V))
+                       continue;
+
+               vcpu->arch.slb[dst].orige = sregs->u.s.ppc64.slb[src].slbe;
+               vcpu->arch.slb[dst].origv = sregs->u.s.ppc64.slb[src].slbv;
+               ++dst;
+       }
+       vcpu->arch.slb_max = dst;
+
+       return 0;
+}
+
+/* Returns 0 if the CPU has CPU_FTR_HVMODE, -EIO otherwise. */
+int kvmppc_core_check_processor_compat(void)
+{
+       return cpu_has_feature(CPU_FTR_HVMODE) ? 0 : -EIO;
+}
+
+/*
+ * Allocate and initialise a vcpu with id @id for @kvm and attach it to
+ * its virtual core (id / threads_per_core), creating that vcore on
+ * first use.
+ *
+ * Returns the new vcpu, or an ERR_PTR(): -EINVAL if the core index is
+ * not below KVM_MAX_VCORES, -ENOMEM if the vcpu or vcore cannot be
+ * allocated, or the error returned by kvm_vcpu_init().
+ */
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+       struct kvm_vcpu *vcpu;
+       int err = -EINVAL;
+       int core;
+       struct kvmppc_vcore *vcore;
+
+       core = id / threads_per_core;
+       if (core >= KVM_MAX_VCORES)
+               goto out;
+
+       err = -ENOMEM;
+       vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
+       if (!vcpu)
+               goto out;
+
+       err = kvm_vcpu_init(vcpu, kvm, id);
+       if (err)
+               goto free_vcpu;
+
+       vcpu->arch.shared = &vcpu->arch.shregs;
+       vcpu->arch.last_cpu = -1;
+       vcpu->arch.mmcr[0] = MMCR0_FC;
+       vcpu->arch.ctrl = CTRL_RUNLATCH;
+       /* default to host PVR, since we can't spoof it */
+       vcpu->arch.pvr = mfspr(SPRN_PVR);
+       kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
+
+       kvmppc_mmu_book3s_hv_init(vcpu);
+
+       /*
+        * Some vcpus may start out in stopped state.  If we initialize
+        * them to busy-in-host state they will stop other vcpus in the
+        * vcore from running.  Instead we initialize them to blocked
+        * state, effectively considering them to be stopped until we
+        * see the first run ioctl for them.
+        */
+       vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
+
+       init_waitqueue_head(&vcpu->arch.cpu_run);
+
+       /* find this vcpu's vcore, creating it if we are its first thread */
+       mutex_lock(&kvm->lock);
+       vcore = kvm->arch.vcores[core];
+       if (!vcore) {
+               vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
+               if (vcore) {
+                       INIT_LIST_HEAD(&vcore->runnable_threads);
+                       spin_lock_init(&vcore->lock);
+               }
+               kvm->arch.vcores[core] = vcore;
+       }
+       mutex_unlock(&kvm->lock);
+
+       if (!vcore) {
+               /*
+                * err is 0 here after the successful kvm_vcpu_init();
+                * without resetting it we would return ERR_PTR(0), i.e.
+                * NULL, which callers do not recognise as a failure.
+                */
+               err = -ENOMEM;
+               goto uninit_vcpu;
+       }
+
+       spin_lock(&vcore->lock);
+       ++vcore->num_threads;
+       ++vcore->n_blocked;
+       spin_unlock(&vcore->lock);
+       vcpu->arch.vcore = vcore;
+
+       return vcpu;
+
+uninit_vcpu:
+       /* undo kvm_vcpu_init() before the vcpu itself is freed */
+       kvm_vcpu_uninit(vcpu);
+free_vcpu:
+       kfree(vcpu);
+out:
+       return ERR_PTR(err);
+}
+
+/* Tear down the generic vcpu state, then free the vcpu structure itself. */
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+       kvm_vcpu_uninit(vcpu);
+       kfree(vcpu);
+}
+
+/*
+ * Mark @vcpu as blocked in its vcore's accounting (under vc->lock).
+ *
+ * If at least one thread is runnable and every thread of the vcore is
+ * now either runnable or blocked, wake the first entry of the runnable
+ * list through its cpu_run wait queue.
+ */
+static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       struct kvm_vcpu *first;
+
+       spin_lock(&vc->lock);
+
+       vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
+       ++vc->n_blocked;
+
+       if (vc->n_runnable > 0 &&
+           vc->n_runnable + vc->n_blocked == vc->num_threads) {
+               first = list_first_entry(&vc->runnable_threads,
+                                        struct kvm_vcpu, arch.run_list);
+               wake_up(&first->arch.cpu_run);
+       }
+
+       spin_unlock(&vc->lock);
+}
+
+/*
+ * Counterpart of kvmppc_vcpu_blocked(): under vc->lock, put @vcpu back
+ * into the busy-in-host state and drop the vcore's blocked count.
+ */
+static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+       spin_lock(&vc->lock);
+       vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+       --vc->n_blocked;
+       spin_unlock(&vc->lock);
+}
+
+extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
+extern void xics_wake_cpu(int cpu);
+
+/*
+ * Take @vcpu off @vc's list of runnable threads.
+ *
+ * Does nothing unless @vcpu is in the runnable state.  Otherwise the
+ * vcpu becomes busy-in-host, the runnable count drops by one, and every
+ * vcpu after it on the list has its ptid decremented.
+ */
+static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
+                                  struct kvm_vcpu *vcpu)
+{
+       struct kvm_vcpu *later = vcpu;
+
+       if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
+               return;
+
+       vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+       --vc->n_runnable;
+
+       /* decrement the physical thread id of each following vcpu */
+       list_for_each_entry_continue(later, &vc->runnable_threads, arch.run_list)
+               --later->arch.ptid;
+
+       list_del(&vcpu->arch.run_list);
+}
+
+/*
+ * Hand @vcpu to the hardware thread it is assigned to (vc->pcpu + ptid)
+ * by publishing the vcpu and vcore pointers in that thread's paca.
+ * With CONFIG_PPC_ICP_NATIVE, a thread with a non-zero ptid is also
+ * woken through xics_wake_cpu() and counted in vc->n_woken.
+ */
+static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
+{
+       int cpu;
+       struct paca_struct *tpaca;
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+       cpu = vc->pcpu + vcpu->arch.ptid;
+       tpaca = &paca[cpu];
+       tpaca->kvm_hstate.kvm_vcpu = vcpu;
+       tpaca->kvm_hstate.kvm_vcore = vc;
+       /* order the pointer stores before the stores that follow */
+       smp_wmb();
+#ifdef CONFIG_PPC_ICP_NATIVE
+       if (vcpu->arch.ptid) {
+               tpaca->cpu_start = 0x80;
+               tpaca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST;
+               /* the paca updates must be visible before the wakeup is sent */
+               wmb();
+               xics_wake_cpu(cpu);
+               ++vc->n_woken;
+       }
+#endif
+}
+
+/*
+ * Spin at low hardware-thread priority until vc->nap_count has caught up
+ * with vc->n_woken.  Gives up with an error message after 1000000 polls.
+ */
+static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
+{
+       int polls;
+
+       HMT_low();
+       for (polls = 1; vc->nap_count < vc->n_woken; ++polls) {
+               if (polls >= 1000000) {
+                       pr_err("kvmppc_wait_for_nap timeout %d %d\n",
+                              vc->nap_count, vc->n_woken);
+                       break;
+               }
+               cpu_relax();
+       }
+       HMT_medium();
+}
+
+/*
+ * Returns 1 when the current cpu is thread 0 of its core and none of
+ * the core's other threads is online; returns 0 otherwise.
+ */
+static int on_primary_thread(void)
+{
+       int cpu = smp_processor_id();
+       int sibling;
+
+       if (cpu_thread_in_core(cpu) != 0)
+               return 0;
+
+       for (sibling = 1; sibling < threads_per_core; ++sibling) {
+               if (cpu_online(cpu + sibling))
+                       return 0;
+       }
+       return 1;
+}
+
+/*
+ * Run a set of guest threads on a physical core.
+ * Called with vc->lock held.
+ *
+ * The lock is dropped while the guest runs and while exits are handled,
+ * and is held again on return.
+ *
+ * Returns 0 without doing anything if any runnable vcpu's task has a
+ * signal pending; returns 1 otherwise (including the -EBUSY case where
+ * we are not on a suitable primary thread).  Per-vcpu results are left in
+ * vcpu->arch.ret; vcpus whose result is not RESUME_GUEST are removed from
+ * the runnable list and woken.
+ */
+static int kvmppc_run_core(struct kvmppc_vcore *vc)
+{
+       struct kvm_vcpu *vcpu, *vnext;
+       long ret;
+       u64 now;
+
+       /* don't start if any threads have a signal pending */
+       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+               if (signal_pending(vcpu->arch.run_task))
+                       return 0;
+
+       /*
+        * Make sure we are running on thread 0, and that
+        * secondary threads are offline.
+        * XXX we should also block attempts to bring any
+        * secondary threads online.
+        */
+       if (threads_per_core > 1 && !on_primary_thread()) {
+               list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+                       vcpu->arch.ret = -EBUSY;
+               goto out;
+       }
+
+       /* reset the per-run bookkeeping before starting any thread */
+       vc->n_woken = 0;
+       vc->nap_count = 0;
+       vc->entry_exit_count = 0;
+       vc->vcore_running = 1;
+       vc->in_guest = 0;
+       vc->pcpu = smp_processor_id();
+       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+               kvmppc_start_thread(vcpu);
+       /* this thread itself enters the guest with the first runnable vcpu */
+       vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
+                               arch.run_list);
+
+       spin_unlock(&vc->lock);
+
+       preempt_disable();
+       kvm_guest_enter();
+       __kvmppc_vcore_entry(NULL, vcpu);
+
+       /* wait for secondary threads to finish writing their state to memory */
+       spin_lock(&vc->lock);
+       if (vc->nap_count < vc->n_woken)
+               kvmppc_wait_for_nap(vc);
+       /* prevent other vcpu threads from doing kvmppc_start_thread() now */
+       vc->vcore_running = 2;
+       spin_unlock(&vc->lock);
+
+       /* make sure updates to secondary vcpu structs are visible now */
+       smp_mb();
+       kvm_guest_exit();
+
+       preempt_enable();
+       kvm_resched(vcpu);
+
+       /* post-process every vcpu that was on the runnable list */
+       now = get_tb();
+       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
+               /* cancel pending dec exception if dec is positive */
+               if (now < vcpu->arch.dec_expires &&
+                   kvmppc_core_pending_dec(vcpu))
+                       kvmppc_core_dequeue_dec(vcpu);
+               if (!vcpu->arch.trap) {
+                       if (signal_pending(vcpu->arch.run_task)) {
+                               vcpu->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
+                               vcpu->arch.ret = -EINTR;
+                       }
+                       continue;               /* didn't get to run */
+               }
+               ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
+                                        vcpu->arch.run_task);
+               vcpu->arch.ret = ret;
+               vcpu->arch.trap = 0;
+       }
+
+       spin_lock(&vc->lock);
+ out:
+       vc->vcore_running = 0;
+       /* _safe iteration: kvmppc_remove_runnable() unlinks the entry */
+       list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+                                arch.run_list) {
+               if (vcpu->arch.ret != RESUME_GUEST) {
+                       kvmppc_remove_runnable(vc, vcpu);
+                       wake_up(&vcpu->arch.cpu_run);
+               }
+       }
+
+       return 1;
+}
+
+/*
+ * Run one vcpu as part of its virtual core.
+ *
+ * The vcpu is queued on the vcore's runnable list and the calling task
+ * then loops until the vcpu leaves the RUNNABLE state: it either runs the
+ * whole core itself via kvmppc_run_core() once every thread of the vcore
+ * is runnable or blocked, or sleeps on vcpu->arch.cpu_run until another
+ * task does so.
+ *
+ * Returns vcpu->arch.ret, or -EINTR / -EPERM directly for the early exits.
+ */
+static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+       int ptid;
+       int wait_state;
+       struct kvmppc_vcore *vc;
+       DEFINE_WAIT(wait);
+
+       /* No need to go into the guest when all we do is going out */
+       if (signal_pending(current)) {
+               kvm_run->exit_reason = KVM_EXIT_INTR;
+               return -EINTR;
+       }
+
+       /* On PPC970, check that we have an RMA region */
+       if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
+               return -EPERM;
+
+       kvm_run->exit_reason = 0;
+       vcpu->arch.ret = RESUME_GUEST;
+       vcpu->arch.trap = 0;
+
+       flush_fp_to_thread(current);
+       flush_altivec_to_thread(current);
+       flush_vsx_to_thread(current);
+
+       /*
+        * Synchronize with other threads in this virtual core
+        */
+       vc = vcpu->arch.vcore;
+       spin_lock(&vc->lock);
+       /* This happens the first time this is called for a vcpu */
+       if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED)
+               --vc->n_blocked;
+       vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
+       /* next free thread slot: ptids follow runnable-list order */
+       ptid = vc->n_runnable;
+       vcpu->arch.run_task = current;
+       vcpu->arch.kvm_run = kvm_run;
+       vcpu->arch.ptid = ptid;
+       list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
+       ++vc->n_runnable;
+
+       wait_state = TASK_INTERRUPTIBLE;
+       while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+               if (signal_pending(current)) {
+                       if (!vc->vcore_running) {
+                               kvm_run->exit_reason = KVM_EXIT_INTR;
+                               vcpu->arch.ret = -EINTR;
+                               break;
+                       }
+                       /* have to wait for vcore to stop executing guest */
+                       wait_state = TASK_UNINTERRUPTIBLE;
+                       smp_send_reschedule(vc->pcpu);
+               }
+
+               if (!vc->vcore_running &&
+                   vc->n_runnable + vc->n_blocked == vc->num_threads) {
+                       /* we can run now */
+                       if (kvmppc_run_core(vc))
+                               continue;
+               }
+
+               /* core already running and nobody has exited yet: join it */
+               if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0)
+                       kvmppc_start_thread(vcpu);
+
+               /* wait for other threads to come in, or wait for vcore */
+               prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
+               spin_unlock(&vc->lock);
+               schedule();
+               finish_wait(&vcpu->arch.cpu_run, &wait);
+               spin_lock(&vc->lock);
+       }
+
+       /* still queued only when the loop was left by the signal break */
+       if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
+               kvmppc_remove_runnable(vc, vcpu);
+       spin_unlock(&vc->lock);
+
+       return vcpu->arch.ret;
+}
+
+/*
+ * Top-level vcpu run loop: keep re-entering the guest for as long as the
+ * result is RESUME_GUEST.  A PAPR hypercall exit taken with MSR_PR clear
+ * is serviced here, and its result decides whether to loop again.
+ */
+int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+       int r;
+
+       for (;;) {
+               r = kvmppc_run_vcpu(run, vcpu);
+
+               if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
+                   !(vcpu->arch.shregs.msr & MSR_PR)) {
+                       r = kvmppc_pseries_do_hcall(vcpu);
+                       kvmppc_core_deliver_interrupts(vcpu);
+               }
+
+               if (r != RESUME_GUEST)
+                       break;
+       }
+       return r;
+}
+
+/*
+ * Number of whole pages needed to hold the TCE table for a DMA window of
+ * @window_size bytes: one u64 entry per (1 << SPAPR_TCE_SHIFT) bytes of
+ * window, rounded up to a multiple of PAGE_SIZE.
+ */
+static long kvmppc_stt_npages(unsigned long window_size)
+{
+       unsigned long table_bytes;
+
+       table_bytes = (window_size >> SPAPR_TCE_SHIFT) * sizeof(u64);
+       return ALIGN(table_bytes, PAGE_SIZE) / PAGE_SIZE;
+}
+
+/*
+ * Destroy a TCE table: under kvm->lock, unlink it from the VM's list,
+ * free its backing pages and the table itself, then drop the reference
+ * on the VM.
+ */
+static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
+{
+       struct kvm *kvm = stt->kvm;
+       long npages = kvmppc_stt_npages(stt->window_size);
+       long pg;
+
+       mutex_lock(&kvm->lock);
+       list_del(&stt->list);
+       for (pg = 0; pg < npages; pg++)
+               __free_page(stt->pages[pg]);
+       kfree(stt);
+       mutex_unlock(&kvm->lock);
+
+       kvm_put_kvm(kvm);
+}
+
+/*
+ * Fault handler for mmap()ed TCE tables: supply the table page at the
+ * faulting offset with an extra reference, or SIGBUS if the offset lies
+ * beyond the table.
+ */
+static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
+       unsigned long npages = kvmppc_stt_npages(stt->window_size);
+
+       if (vmf->pgoff < npages) {
+               vmf->page = stt->pages[vmf->pgoff];
+               get_page(vmf->page);
+               return 0;
+       }
+       return VM_FAULT_SIGBUS;
+}
+
+/* VMA operations for a mapped TCE table; pages are supplied on fault */
+static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
+       .fault = kvm_spapr_tce_fault,
+};
+
+/* mmap handler for the TCE-table fd: just install the fault-driven vm_ops */
+static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       vma->vm_ops = &kvm_spapr_tce_vm_ops;
+       return 0;
+}
+
+/* release handler for the TCE-table fd: destroy the table stored in it */
+static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
+{
+       struct kvmppc_spapr_tce_table *stt = filp->private_data;
+
+       release_spapr_tce_table(stt);
+       return 0;
+}
+
+/*
+ * File operations behind the anonymous fd that represents a TCE table.
+ * Never modified at run time, hence const.
+ */
+static const struct file_operations kvm_spapr_tce_fops = {
+       .mmap           = kvm_spapr_tce_mmap,
+       .release        = kvm_spapr_tce_release,
+};
+
+/*
+ * Create an in-kernel TCE table for the LIOBN given in @args and return
+ * an anonymous fd through which userspace can mmap() it.
+ *
+ * Returns the new fd (>= 0) on success, -EBUSY if a table already exists
+ * for this LIOBN, -ENOMEM on allocation failure, or the error returned by
+ * anon_inode_getfd().
+ *
+ * The duplicate check, fd creation and list insertion all happen under
+ * kvm->lock.  release_spapr_tce_table() takes the same lock before it
+ * unlinks a table, so the list is never walked while it is being changed
+ * and a failed fd creation leaves nothing behind on the list.
+ */
+long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
+                                  struct kvm_create_spapr_tce *args)
+{
+       struct kvmppc_spapr_tce_table *stt;
+       struct kvmppc_spapr_tce_table *siter;
+       long npages;
+       int ret = -ENOMEM;
+       int i;
+
+       npages = kvmppc_stt_npages(args->window_size);
+
+       stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
+                     GFP_KERNEL);
+       if (!stt)
+               return ret;
+
+       stt->liobn = args->liobn;
+       stt->window_size = args->window_size;
+       stt->kvm = kvm;
+
+       for (i = 0; i < npages; i++) {
+               stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
+               if (!stt->pages[i])
+                       goto fail;
+       }
+
+       mutex_lock(&kvm->lock);
+
+       /* Check this LIOBN hasn't been previously allocated */
+       ret = 0;
+       list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) {
+               if (siter->liobn == args->liobn) {
+                       ret = -EBUSY;
+                       break;
+               }
+       }
+
+       if (!ret)
+               ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+                                      stt, O_RDWR);
+
+       if (ret >= 0) {
+               /*
+                * The fd now owns the table.  Its release handler cannot
+                * get past kvm->lock until we drop it, so the reference
+                * and the list entry are in place before it can run.
+                */
+               kvm_get_kvm(kvm);
+               list_add(&stt->list, &kvm->arch.spapr_tce_tables);
+       }
+
+       mutex_unlock(&kvm->lock);
+
+       if (ret >= 0)
+               return ret;
+
+fail:
+       /* pages[] was zeroed by kzalloc(), so unallocated slots are NULL */
+       for (i = 0; i < npages; i++)
+               if (stt->pages[i])
+                       __free_page(stt->pages[i]);
+
+       kfree(stt);
+       return ret;
+}
+
+/*
+ * Map an RMA size in bytes to the RMLS (real mode limit selector) field
+ * encoding.  Assumes POWER7 or PPC970.  Returns -1 for sizes that have
+ * no encoding; 32MB is accepted only when CPU_FTR_ARCH_206 is present.
+ */
+static inline int lpcr_rmls(unsigned long rma_size)
+{
+       if (rma_size == (32ul << 20))           /* 32 MB: POWER7 only */
+               return cpu_has_feature(CPU_FTR_ARCH_206) ? 8 : -1;
+       if (rma_size == (64ul << 20))           /* 64 MB */
+               return 3;
+       if (rma_size == (128ul << 20))          /* 128 MB */
+               return 7;
+       if (rma_size == (256ul << 20))          /* 256 MB */
+               return 4;
+       if (rma_size == (1ul << 30))            /* 1 GB */
+               return 2;
+       if (rma_size == (16ul << 30))           /* 16 GB */
+               return 1;
+       if (rma_size == (256ul << 30))          /* 256 GB */
+               return 0;
+       return -1;
+}
+
+/*
+ * Fault handler for a mapped RMA: supply the page at the faulting offset
+ * within the RMA with an extra reference, or SIGBUS if the offset is past
+ * the end of the RMA.
+ */
+static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct kvmppc_rma_info *ri = vma->vm_file->private_data;
+
+       if (vmf->pgoff < ri->npages) {
+               vmf->page = pfn_to_page(ri->base_pfn + vmf->pgoff);
+               get_page(vmf->page);
+               return 0;
+       }
+       return VM_FAULT_SIGBUS;
+}
+
+/* VMA operations for a mapped RMA; pages are supplied on fault */
+static const struct vm_operations_struct kvm_rma_vm_ops = {
+       .fault = kvm_rma_fault,
+};
+
+/* mmap handler for the RMA fd: flag the VMA VM_RESERVED and install vm_ops */
+static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       vma->vm_flags |= VM_RESERVED;
+       vma->vm_ops = &kvm_rma_vm_ops;
+       return 0;
+}
+
+/* release handler for the RMA fd: drop the fd's reference on the RMA */
+static int kvm_rma_release(struct inode *inode, struct file *filp)
+{
+       struct kvmppc_rma_info *ri = filp->private_data;
+
+       kvm_release_rma(ri);
+       return 0;
+}
+
+/*
+ * File operations behind the anonymous fd that represents an RMA.
+ * kvmppc_core_prepare_memory_region() compares a VMA's f_op against this
+ * table to recognise RMA mappings.  Never modified at run time, hence const.
+ */
+static const struct file_operations kvm_rma_fops = {
+       .mmap           = kvm_rma_mmap,
+       .release        = kvm_rma_release,
+};
+
+/*
+ * Take a preallocated RMA from the pool and wrap it in an anonymous fd.
+ *
+ * Returns the fd (>= 0) on success and stores the RMA size in bytes in
+ * @ret->rma_size.  Returns -ENOMEM if no RMA is available, or the error
+ * from anon_inode_getfd(); @ret is left untouched on failure.
+ */
+long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
+{
+       struct kvmppc_rma_info *ri;
+       long fd;
+
+       ri = kvm_alloc_rma();
+       if (!ri)
+               return -ENOMEM;
+
+       fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR);
+       if (fd < 0) {
+               /* the RMA goes back to the pool; don't touch it afterwards */
+               kvm_release_rma(ri);
+               return fd;
+       }
+
+       ret->rma_size = ri->npages << PAGE_SHIFT;
+       return fd;
+}
+
+/*
+ * Pin the single user page at @addr for writing and return it, or NULL
+ * if get_user_pages_fast() did not pin exactly one page.  May sleep.
+ */
+static struct page *hva_to_page(unsigned long addr)
+{
+       struct page *pg;
+
+       might_sleep();
+
+       if (likely(get_user_pages_fast(addr, 1, 1, &pg) == 1))
+               return pg;
+
+       return NULL;
+}
+
+/*
+ * Validate a new guest memory region and record the host page frame that
+ * backs each 16MB guest page in kvm->arch.ram_pginfo.
+ *
+ * The region must be 16MB-aligned in size and guest address and must fit
+ * in the tracking array.  A region at guest physical 0 may be backed by a
+ * mapping of one of the preallocated RMAs (mandatory on PPC970); in that
+ * case the RMA is attached to the VM and the LPCR/HID4 image is updated.
+ * Pages outside the RMA are pinned with hva_to_page() and must be 16MB
+ * huge pages.
+ *
+ * Returns 0 on success or -EINVAL on any failure.  Pages already recorded
+ * in ram_pginfo stay pinned on failure; kvmppc_core_destroy_vm() releases
+ * them.
+ */
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem)
+{
+       unsigned long psize, porder;
+       unsigned long i, npages, totalpages;
+       unsigned long pg_ix;
+       struct kvmppc_pginfo *pginfo;
+       unsigned long hva;
+       struct kvmppc_rma_info *ri = NULL;
+       struct page *page;
+
+       /* For now, only allow 16MB pages */
+       porder = LARGE_PAGE_ORDER;
+       psize = 1ul << porder;
+       if ((mem->memory_size & (psize - 1)) ||
+           (mem->guest_phys_addr & (psize - 1))) {
+               pr_err("bad memory_size=%llx @ %llx\n",
+                      mem->memory_size, mem->guest_phys_addr);
+               return -EINVAL;
+       }
+
+       npages = mem->memory_size >> porder;
+       totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder;
+
+       /* More memory than we have space to track? */
+       if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER)))
+               return -EINVAL;
+
+       /* Do we already have an RMA registered? */
+       if (mem->guest_phys_addr == 0 && kvm->arch.rma)
+               return -EINVAL;
+
+       if (totalpages > kvm->arch.ram_npages)
+               kvm->arch.ram_npages = totalpages;
+
+       /* Is this one of our preallocated RMAs? */
+       if (mem->guest_phys_addr == 0) {
+               struct vm_area_struct *vma;
+
+               down_read(&current->mm->mmap_sem);
+               vma = find_vma(current->mm, mem->userspace_addr);
+               /* only a mapping of an RMA fd that starts exactly here counts */
+               if (vma && vma->vm_file &&
+                   vma->vm_file->f_op == &kvm_rma_fops &&
+                   mem->userspace_addr == vma->vm_start)
+                       ri = vma->vm_file->private_data;
+               up_read(&current->mm->mmap_sem);
+               if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) {
+                       pr_err("CPU requires an RMO\n");
+                       return -EINVAL;
+               }
+       }
+
+       if (ri) {
+               unsigned long rma_size;
+               unsigned long lpcr;
+               long rmls;
+
+               rma_size = ri->npages << PAGE_SHIFT;
+               if (rma_size > mem->memory_size)
+                       rma_size = mem->memory_size;
+               rmls = lpcr_rmls(rma_size);
+               if (rmls < 0) {
+                       pr_err("Can't use RMA of 0x%lx bytes\n", rma_size);
+                       return -EINVAL;
+               }
+               /* the VM holds its own reference on the RMA from here on */
+               atomic_inc(&ri->use_count);
+               kvm->arch.rma = ri;
+               kvm->arch.n_rma_pages = rma_size >> porder;
+
+               /* Update LPCR and RMOR */
+               lpcr = kvm->arch.lpcr;
+               if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+                       /* PPC970; insert RMLS value (split field) in HID4 */
+                       lpcr &= ~((1ul << HID4_RMLS0_SH) |
+                                 (3ul << HID4_RMLS2_SH));
+                       lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) |
+                               ((rmls & 3) << HID4_RMLS2_SH);
+                       /* RMOR is also in HID4 */
+                       lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff)
+                               << HID4_RMOR_SH;
+               } else {
+                       /* POWER7 */
+                       lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L);
+                       lpcr |= rmls << LPCR_RMLS_SH;
+                       kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
+               }
+               kvm->arch.lpcr = lpcr;
+               pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n",
+                       ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
+       }
+
+       pg_ix = mem->guest_phys_addr >> porder;
+       pginfo = kvm->arch.ram_pginfo + pg_ix;
+       for (i = 0; i < npages; ++i, ++pg_ix) {
+               if (ri && pg_ix < kvm->arch.n_rma_pages) {
+                       /* backed by the RMA itself: no page to pin */
+                       pginfo[i].pfn = ri->base_pfn +
+                               (pg_ix << (porder - PAGE_SHIFT));
+                       continue;
+               }
+               hva = mem->userspace_addr + (i << porder);
+               page = hva_to_page(hva);
+               if (!page) {
+                       pr_err("oops, no pfn for hva %lx\n", hva);
+                       goto err;
+               }
+               /* Check it's a 16MB page */
+               if (!PageHead(page) ||
+                   compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) {
+                       pr_err("page at %lx isn't 16MB (o=%d)\n",
+                              hva, compound_order(page));
+                       /*
+                        * This page never reaches ram_pginfo, so nobody
+                        * else would drop the reference taken above.
+                        */
+                       put_page(page);
+                       goto err;
+               }
+               pginfo[i].pfn = page_to_pfn(page);
+       }
+
+       return 0;
+
+ err:
+       return -EINVAL;
+}
+
+/*
+ * Called once a memory region has been committed.  For a non-empty region
+ * at guest physical 0 on a VM that has no RMA attached, set up the
+ * virtual RMA mapping via kvmppc_map_vrma().
+ */
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem)
+{
+       if (mem->guest_phys_addr == 0 && mem->memory_size != 0 &&
+           !kvm->arch.rma)
+               kvmppc_map_vrma(kvm, mem);
+}
+
+/*
+ * Per-VM initialisation for HV KVM: allocate the hashed page table and
+ * the guest-page tracking array, reset the RAM/RMA bookkeeping, record
+ * the host SDR1/LPID/LPCR values and compute the initial guest LPCR
+ * (HID4 on PPC970).
+ *
+ * Returns 0 on success, the error from kvmppc_alloc_hpt(), or -ENOMEM if
+ * the tracking array cannot be allocated (the HPT is freed again then).
+ */
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+       long r;
+       /* one tracking entry per 16MB page up to the maximum guest size */
+       unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER);
+       long err = -ENOMEM;
+       unsigned long lpcr;
+
+       /* Allocate hashed page table */
+       r = kvmppc_alloc_hpt(kvm);
+       if (r)
+               return r;
+
+       INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+
+       kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo),
+                                      GFP_KERNEL);
+       if (!kvm->arch.ram_pginfo) {
+               pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n",
+                      npages * sizeof(struct kvmppc_pginfo));
+               goto out_free;
+       }
+
+       kvm->arch.ram_npages = 0;
+       kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER;
+       kvm->arch.ram_porder = LARGE_PAGE_ORDER;
+       kvm->arch.rma = NULL;
+       kvm->arch.n_rma_pages = 0;
+
+       kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
+
+       if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+               /* PPC970; HID4 is effectively the LPCR */
+               unsigned long lpid = kvm->arch.lpid;
+               kvm->arch.host_lpid = 0;
+               kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4);
+               /* the LPID is a split field in HID4: clear it, then insert */
+               lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH));
+               lpcr |= ((lpid >> 4) << HID4_LPID1_SH) |
+                       ((lpid & 0xf) << HID4_LPID5_SH);
+       } else {
+               /* POWER7; init LPCR for virtual RMA mode */
+               kvm->arch.host_lpid = mfspr(SPRN_LPID);
+               kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
+               /* keep only these host bits, then add the guest settings */
+               lpcr &= LPCR_PECE | LPCR_LPES;
+               lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
+                       LPCR_VPM0 | LPCR_VRMA_L;
+       }
+       kvm->arch.lpcr = lpcr;
+
+       return 0;
+
+ out_free:
+       kvmppc_free_hpt(kvm);
+       return err;
+}
+
+/*
+ * Per-VM teardown: unpin the guest pages recorded in ram_pginfo, free the
+ * tracking array, drop the VM's reference on its RMA (if any) and free
+ * the hashed page table.  Warns if any TCE table is still on the VM's list.
+ */
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+       struct kvmppc_pginfo *pginfo;
+       unsigned long i;
+
+       if (kvm->arch.ram_pginfo) {
+               pginfo = kvm->arch.ram_pginfo;
+               kvm->arch.ram_pginfo = NULL;
+               /* entries below n_rma_pages are skipped; only the rest are put */
+               for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i)
+                       if (pginfo[i].pfn)
+                               put_page(pfn_to_page(pginfo[i].pfn));
+               kfree(pginfo);
+       }
+       if (kvm->arch.rma) {
+               kvm_release_rma(kvm->arch.rma);
+               kvm->arch.rma = NULL;
+       }
+
+       kvmppc_free_hpt(kvm);
+       WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
+}
+
+/* These are stubs for now */
+/* Intentionally empty: this implementation does nothing for a pflush. */
+void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
+{
+}
+
+/*
+ * We don't need to emulate any privileged instructions or dcbz,
+ * so every emulation request is reported as a failure.
+ */
+int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                           unsigned int inst, int *advance)
+{
+       return EMULATE_FAIL;
+}
+
+/* Stub: mtspr emulation is not implemented here and always fails. */
+int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+{
+       return EMULATE_FAIL;
+}
+
+/* Stub: mfspr emulation is not implemented here and always fails. */
+int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+{
+       return EMULATE_FAIL;
+}
+
+/*
+ * Module init: register with the generic KVM core, then initialise the
+ * HV MMU support.  If the second step fails, the KVM registration is
+ * undone so that a failed module load leaves nothing registered.
+ *
+ * Returns 0 on success or the error from whichever step failed.
+ */
+static int kvmppc_book3s_hv_init(void)
+{
+       int r;
+
+       r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+       if (r)
+               return r;
+
+       r = kvmppc_mmu_hv_init();
+       if (r)
+               kvm_exit();
+
+       return r;
+}
+
+/* Module exit: unregister from the generic KVM core. */
+static void kvmppc_book3s_hv_exit(void)
+{
+       kvm_exit();
+}
+
+/* Hook the init/exit routines above into module load and unload */
+module_init(kvmppc_book3s_hv_init);
+module_exit(kvmppc_book3s_hv_exit);
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
new file mode 100644 (file)
index 0000000..d431203
--- /dev/null
@@ -0,0 +1,155 @@
+/*
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/preempt.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/bootmem.h>
+#include <linux/init.h>
+
+#include <asm/cputable.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+
+/*
+ * This maintains a list of RMAs (real mode areas) for KVM guests to use.
+ * Each RMA has to be physically contiguous and of a size that the
+ * hardware supports.  PPC970 and POWER7 support 64MB, 128MB and 256MB,
+ * and other larger sizes.  Since we are unlikely to be able to allocate that
+ * much physically contiguous memory after the system is up and running,
+ * we preallocate a set of RMAs in early boot for KVM to use.
+ */
+/* Size in bytes of each preallocated RMA; set by "kvm_rma_size=" */
+static unsigned long kvm_rma_size = 64 << 20;  /* 64MB */
+/* Number of RMAs to preallocate; 0 unless "kvm_rma_count=" is given */
+static unsigned long kvm_rma_count;
+
+/*
+ * Early parameter handler for "kvm_rma_size=": parse the value with
+ * memparse() into kvm_rma_size.  Returns 1 if no value was supplied,
+ * 0 otherwise.
+ */
+static int __init early_parse_rma_size(char *p)
+{
+       if (!p)
+               return 1;
+
+       kvm_rma_size = memparse(p, &p);
+
+       return 0;
+}
+early_param("kvm_rma_size", early_parse_rma_size);
+
+/*
+ * Early parameter handler for "kvm_rma_count=": parse the value with
+ * simple_strtoul() (base auto-detected) into kvm_rma_count.  Returns 1 if
+ * no value was supplied, 0 otherwise.
+ */
+static int __init early_parse_rma_count(char *p)
+{
+       if (!p)
+               return 1;
+
+       kvm_rma_count = simple_strtoul(p, NULL, 0);
+
+       return 0;
+}
+early_param("kvm_rma_count", early_parse_rma_count);
+
+/* Array of kvm_rma_count descriptors, allocated in kvm_rma_init() */
+static struct kvmppc_rma_info *rma_info;
+/* RMAs currently available to kvm_alloc_rma() */
+static LIST_HEAD(free_rmas);
+/* Protects free_rmas in kvm_alloc_rma() and kvm_release_rma() */
+static DEFINE_SPINLOCK(rma_lock);
+
+/* Work out RMLS (real mode limit selector) field value for a given RMA size.
+   Assumes POWER7 or PPC970.
+   Returns the field encoding, or -1 if the size has no encoding.
+   An identical helper exists in book3s_hv.c. */
+static inline int lpcr_rmls(unsigned long rma_size)
+{
+       switch (rma_size) {
+       case 32ul << 20:        /* 32 MB */
+               if (cpu_has_feature(CPU_FTR_ARCH_206))
+                       return 8;       /* only supported on POWER7 */
+               return -1;
+       case 64ul << 20:        /* 64 MB */
+               return 3;
+       case 128ul << 20:       /* 128 MB */
+               return 7;
+       case 256ul << 20:       /* 256 MB */
+               return 4;
+       case 1ul << 30:         /* 1 GB */
+               return 2;
+       case 16ul << 30:        /* 16 GB */
+               return 1;
+       case 256ul << 30:       /* 256 GB */
+               return 0;
+       default:
+               return -1;
+       }
+}
+
+/*
+ * Called at boot time while the bootmem allocator is active,
+ * to allocate contiguous physical memory for the real memory
+ * areas for guests.
+ *
+ * Allocates kvm_rma_count areas of kvm_rma_size bytes each, aligned to
+ * their own size, records each in rma_info[] and puts it on the free_rmas
+ * list.  Does nothing unless the CPU has both CPU_FTR_HVMODE and
+ * CPU_FTR_ARCH_201, a non-zero size and count were configured, and the
+ * size has an RMLS encoding.
+ */
+void kvm_rma_init(void)
+{
+       unsigned long i;
+       unsigned long j, npages;
+       void *rma;
+       struct page *pg;
+
+       /* Only do this on PPC970 in HV mode */
+       if (!cpu_has_feature(CPU_FTR_HVMODE) ||
+           !cpu_has_feature(CPU_FTR_ARCH_201))
+               return;
+
+       if (!kvm_rma_size || !kvm_rma_count)
+               return;
+
+       /* Check that the requested size is one supported in hardware */
+       if (lpcr_rmls(kvm_rma_size) < 0) {
+               pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size);
+               return;
+       }
+
+       npages = kvm_rma_size >> PAGE_SHIFT;
+       rma_info = alloc_bootmem(kvm_rma_count * sizeof(struct kvmppc_rma_info));
+       for (i = 0; i < kvm_rma_count; ++i) {
+               rma = alloc_bootmem_align(kvm_rma_size, kvm_rma_size);
+               pr_info("Allocated KVM RMA at %p (%ld MB)\n", rma,
+                       kvm_rma_size >> 20);
+               rma_info[i].base_virt = rma;
+               rma_info[i].base_pfn = __pa(rma) >> PAGE_SHIFT;
+               rma_info[i].npages = npages;
+               list_add_tail(&rma_info[i].list, &free_rmas);
+               atomic_set(&rma_info[i].use_count, 0);
+
+               /* take one reference on every page of the area */
+               pg = pfn_to_page(rma_info[i].base_pfn);
+               for (j = 0; j < npages; ++j) {
+                       atomic_inc(&pg->_count);
+                       ++pg;
+               }
+       }
+}
+
+/*
+ * Take the first RMA off the free list and give the caller one reference
+ * on it.  Returns NULL if the free list is empty.
+ */
+struct kvmppc_rma_info *kvm_alloc_rma(void)
+{
+       struct kvmppc_rma_info *found = NULL;
+
+       spin_lock(&rma_lock);
+       if (list_empty(&free_rmas))
+               goto unlock;
+
+       found = list_first_entry(&free_rmas, struct kvmppc_rma_info, list);
+       list_del(&found->list);
+       atomic_inc(&found->use_count);
+unlock:
+       spin_unlock(&rma_lock);
+       return found;
+}
+EXPORT_SYMBOL_GPL(kvm_alloc_rma);
+
+/*
+ * Drop one reference on an RMA.  When the last reference goes away the
+ * RMA is appended to the free list again.
+ */
+void kvm_release_rma(struct kvmppc_rma_info *ri)
+{
+       if (!atomic_dec_and_test(&ri->use_count))
+               return;
+
+       spin_lock(&rma_lock);
+       list_add_tail(&ri->list, &free_rmas);
+       spin_unlock(&rma_lock);
+}
+EXPORT_SYMBOL_GPL(kvm_release_rma);
+
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
new file mode 100644 (file)
index 0000000..3f7b674
--- /dev/null
@@ -0,0 +1,166 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * Derived from book3s_interrupts.S, which is:
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+#include <asm/ppc-opcode.h>
+
+/*****************************************************************************
+ *                                                                           *
+ *     Guest entry / exit code that is in kernel module memory (vmalloc)     *
+ *                                                                           *
+ ****************************************************************************/
+
+/* Registers:
+ *  r4: vcpu pointer
+ *
+ * Called from C as __kvmppc_vcore_entry(kvm_run, vcpu).  Saves the host
+ * state needed to get back (LR, non-volatile GPRs, DSCR, DABR, MSR and
+ * the PMU registers when in use), hard-disables interrupts, arms the
+ * hypervisor decrementer from the decrementer and branches to the
+ * partition switch code.
+ */
+_GLOBAL(__kvmppc_vcore_entry)
+
+       /* Write correct stack frame */
+       mflr    r0
+       std     r0,PPC_LR_STKOFF(r1)
+
+       /* Save host state to the stack */
+       stdu    r1, -SWITCH_FRAME_SIZE(r1)
+
+       /* Save non-volatile registers (r14 - r31) */
+       SAVE_NVGPRS(r1)
+
+       /* Save host DSCR */
+BEGIN_FTR_SECTION
+       mfspr   r3, SPRN_DSCR
+       std     r3, HSTATE_DSCR(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+       /* Save host DABR */
+       mfspr   r3, SPRN_DABR
+       std     r3, HSTATE_DABR(r13)
+
+       /* Hard-disable interrupts */
+       /* (save the MSR, then clear MSR[EE]: rotate it to the top bit,
+        *  mask that bit off and rotate back) */
+       mfmsr   r10
+       std     r10, HSTATE_HOST_MSR(r13)
+       rldicl  r10,r10,48,1
+       rotldi  r10,r10,16
+       mtmsrd  r10,1
+
+       /* Save host PMU registers and load guest PMU registers */
+       /* R4 is live here (vcpu pointer) but not r3 or r5 */
+       li      r3, 1
+       sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
+       mfspr   r7, SPRN_MMCR0          /* save MMCR0 */
+       mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable interrupts */
+       isync
+       ld      r3, PACALPPACAPTR(r13)  /* is the host using the PMU? */
+       lbz     r5, LPPACA_PMCINUSE(r3)
+       cmpwi   r5, 0
+       beq     31f                     /* skip if not */
+       mfspr   r5, SPRN_MMCR1
+       mfspr   r6, SPRN_MMCRA
+       std     r7, HSTATE_MMCR(r13)
+       std     r5, HSTATE_MMCR + 8(r13)
+       std     r6, HSTATE_MMCR + 16(r13)
+       mfspr   r3, SPRN_PMC1
+       mfspr   r5, SPRN_PMC2
+       mfspr   r6, SPRN_PMC3
+       mfspr   r7, SPRN_PMC4
+       mfspr   r8, SPRN_PMC5
+       mfspr   r9, SPRN_PMC6
+BEGIN_FTR_SECTION
+       mfspr   r10, SPRN_PMC7
+       mfspr   r11, SPRN_PMC8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+       stw     r3, HSTATE_PMC(r13)
+       stw     r5, HSTATE_PMC + 4(r13)
+       stw     r6, HSTATE_PMC + 8(r13)
+       stw     r7, HSTATE_PMC + 12(r13)
+       stw     r8, HSTATE_PMC + 16(r13)
+       stw     r9, HSTATE_PMC + 20(r13)
+BEGIN_FTR_SECTION
+       stw     r10, HSTATE_PMC + 24(r13)
+       stw     r11, HSTATE_PMC + 28(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+31:
+
+       /*
+        * Put whatever is in the decrementer into the
+        * hypervisor decrementer.
+        * Also record when the decrementer would have expired:
+        * sign-extended DEC plus the current timebase, in HSTATE_DECEXP.
+        */
+       mfspr   r8,SPRN_DEC
+       mftb    r7
+       mtspr   SPRN_HDEC,r8
+       extsw   r8,r8
+       add     r8,r8,r7
+       std     r8,HSTATE_DECEXP(r13)
+
+       /*
+        * On PPC970, if the guest vcpu has an external interrupt pending,
+        * send ourselves an IPI so as to interrupt the guest once it
+        * enables interrupts.  (It must have interrupts disabled,
+        * otherwise we would already have delivered the interrupt.)
+        */
+BEGIN_FTR_SECTION
+       ld      r0, VCPU_PENDING_EXC(r4)
+       li      r7, (1 << BOOK3S_IRQPRIO_EXTERNAL)
+       oris    r7, r7, (1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
+       and.    r0, r0, r7
+       beq     32f
+       /* keep the vcpu pointer in non-volatile r31 across the call;
+        * r3 = this cpu's PACA index, the argument to the call */
+       mr      r31, r4
+       lhz     r3, PACAPACAINDEX(r13)
+       bl      smp_send_reschedule
+       nop
+       mr      r4, r31
+32:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+
+       /* Jump to partition switch code */
+       bl      .kvmppc_hv_entry_trampoline
+       nop
+
+/*
+ * We return here in virtual mode after the guest exits
+ * with something that we can't handle in real mode.
+ * Interrupts are enabled again at this point.
+ *
+ * This undoes the stack frame set up by __kvmppc_vcore_entry and returns
+ * to that routine's caller.
+ */
+
+.global kvmppc_handler_highmem
+kvmppc_handler_highmem:
+
+       /*
+        * Register usage at this point:
+        *
+        * R1       = host R1
+        * R2       = host R2
+        * R12      = exit handler id
+        * R13      = PACA
+        */
+
+       /* Restore non-volatile host registers (r14 - r31) */
+       REST_NVGPRS(r1)
+
+       /* pop the switch frame and return through the saved LR */
+       addi    r1, r1, SWITCH_FRAME_SIZE
+       ld      r0, PPC_LR_STKOFF(r1)
+       mtlr    r0
+       blr
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
new file mode 100644 (file)
index 0000000..fcfe6b0
--- /dev/null
@@ -0,0 +1,370 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/hugetlb.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+
+/* For now use fixed-size 16MB page table */
+#define HPT_ORDER      24
+#define HPT_NPTEG      (1ul << (HPT_ORDER - 7))        /* 128B per pteg */
+#define HPT_HASH_MASK  (HPT_NPTEG - 1)
+
+#define HPTE_V_HVLOCK  0x40UL
+
+/*
+ * Try once to lock an HPTE by atomically setting HPTE_V_HVLOCK in its
+ * first doubleword.
+ *
+ * @hpte: pointer to the first doubleword of the HPTE
+ * @bits: bits that must all be clear for the lock to be taken; must be
+ *        non-zero (callers in this file pass HPTE_V_HVLOCK, optionally
+ *        ORed with HPTE_V_VALID)
+ *
+ * Returns 1 if the lock bit was set, 0 if any of @bits was already set
+ * or the store-conditional failed.  There is no retry loop here; callers
+ * that need the lock spin on this function.
+ */
+static inline long lock_hpte(unsigned long *hpte, unsigned long bits)
+{
+       unsigned long tmp, old;
+
+       asm volatile("  ldarx   %0,0,%2\n"
+                    "  and.    %1,%0,%3\n"
+                    "  bne     2f\n"
+                    "  ori     %0,%0,%4\n"
+                    "  stdcx.  %0,0,%2\n"
+                    "  beq+    2f\n"
+                    /*
+                     * The store-conditional failed: make 'old' non-zero so
+                     * that failure is reported.  This has to be a register
+                     * move; "li" would load the register *number* of %3 as
+                     * an immediate, which is 0 (a false success) whenever
+                     * bits happens to live in r0.
+                     */
+                    "  mr      %1,%3\n"
+                    "2:        isync"
+                    : "=&r" (tmp), "=&r" (old)
+                    : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
+                    : "cc", "memory");
+       return old == 0;
+}
+
+/*
+ * Insert a guest-supplied HPTE (pteh, ptel) into the hashed page table
+ * at kvm->arch.hpt_virt (presumably the H_ENTER hcall handler, going by
+ * the name and the H_* return codes).
+ *
+ * @flags:     if H_EXACT is set, only the slot at pte_index is tried;
+ *             otherwise the first free slot of the 8-entry group
+ *             containing pte_index is used
+ * @pte_index: index of the HPTE slot (or of a slot in the target group)
+ * @pteh:      first doubleword of the new HPTE
+ * @ptel:      second doubleword of the new HPTE; its RPN field is
+ *             looked up in kvm->arch.ram_pginfo[] and replaced with the
+ *             address derived from the pfn found there
+ *
+ * Returns H_SUCCESS with the index of the slot used in
+ * vcpu->arch.gpr[4], H_PARAMETER for an unsupported page size, an
+ * address outside guest RAM, unsupported WIMG bits or an out-of-range
+ * pte_index, or H_PTEG_FULL if no slot could be claimed.
+ */
+long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
+                   long pte_index, unsigned long pteh, unsigned long ptel)
+{
+       unsigned long porder;
+       struct kvm *kvm = vcpu->kvm;
+       unsigned long i, lpn, pa;
+       unsigned long *hpte;
+
+       /* only handle 4k, 64k and 16M pages for now */
+       porder = 12;
+       if (pteh & HPTE_V_LARGE) {
+               if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+                   (ptel & 0xf000) == 0x1000) {
+                       /* 64k page */
+                       porder = 16;
+               } else if ((ptel & 0xff000) == 0) {
+                       /* 16M page */
+                       porder = 24;
+                       /* lowest AVA bit must be 0 for 16M pages */
+                       if (pteh & 0x80)
+                               return H_PARAMETER;
+               } else
+                       return H_PARAMETER;
+       }
+       /* ram_pginfo[] is indexed in units of (1 << ram_porder) bytes */
+       lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder;
+       if (lpn >= kvm->arch.ram_npages || porder > kvm->arch.ram_porder)
+               return H_PARAMETER;
+       pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT;
+       /* a zero pfn is treated as "no backing page" */
+       if (!pa)
+               return H_PARAMETER;
+       /* Check WIMG */
+       if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
+           (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
+               return H_PARAMETER;
+       /* strip HPTE_V_HVLOCK (0x40) and bit 0x20 from the guest's value */
+       pteh &= ~0x60UL;
+       /*
+        * Clear the guest's address bits from ram_psize upwards (assuming
+        * HPTE_R_PP0 is a single bit above the RPN field - TODO confirm),
+        * keeping the offset within the RAM page, then merge in pa.
+        */
+       ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize);
+       ptel |= pa;
+       /* signed pte_index is compared as unsigned, so negatives fail too */
+       if (pte_index >= (HPT_NPTEG << 3))
+               return H_PARAMETER;
+       if (likely((flags & H_EXACT) == 0)) {
+               /* scan the 8 slots of the group for one we can lock */
+               pte_index &= ~7UL;
+               hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+               for (i = 0; ; ++i) {
+                       if (i == 8)
+                               return H_PTEG_FULL;
+                       if ((*hpte & HPTE_V_VALID) == 0 &&
+                           lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+                               break;
+                       hpte += 2;
+               }
+       } else {
+               /* exact slot requested: a single lock attempt, no scan */
+               i = 0;
+               hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+               if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+                       return H_PTEG_FULL;
+       }
+       /*
+        * Write the second doubleword first, then the first doubleword
+        * after a barrier.  pteh has HPTE_V_HVLOCK clear, so the store
+        * to hpte[0] also drops the lock taken above.
+        */
+       hpte[1] = ptel;
+       eieio();
+       hpte[0] = pteh;
+       asm volatile("ptesync" : : : "memory");
+       atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt);
+       vcpu->arch.gpr[4] = pte_index + i;
+       return H_SUCCESS;
+}
+
+/*
+ * Build the RB operand for tlbie/tlbiel from an HPTE.
+ *
+ * @v:         first doubleword of the HPTE
+ * @r:         second doubleword of the HPTE
+ * @pte_index: index of the HPTE in the hash table
+ *
+ * The low-order VA bits are not stored in the HPTE; they are recovered
+ * from the hash group number (pte_index >> 3, inverted for a secondary
+ * hash entry) XORed with the VSID bits taken from the AVA field.
+ */
+static unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
+                                     unsigned long pte_index)
+{
+       unsigned long hash = pte_index >> 3;
+       unsigned long vsid_shift = (v & HPTE_V_1TB_SEG) ? 24 : 12;
+       unsigned long rb = (v & ~0x7fUL) << 16;         /* AVA field */
+
+       if (v & HPTE_V_SECONDARY)
+               hash = ~hash;
+       /* xor vsid from AVA, keep the 11 bits we need */
+       hash = (hash ^ (v >> vsid_shift)) & 0x7ff;
+
+       if (!(v & HPTE_V_LARGE)) {
+               /* 4kB page: remaining 11b of VA */
+               rb |= hash << 12;
+       } else {
+               rb |= 1;                                /* L field */
+               if (cpu_has_feature(CPU_FTR_ARCH_206) && (r & 0xff000)) {
+                       /* non-16MB large page, must be 64k */
+                       /* (masks depend on page size) */
+                       rb |= 0x1000;           /* page encoding in LP field */
+                       rb |= (hash & 0x7f) << 16; /* 7b of VA in AVA/LP field */
+                       rb |= hash & 0xfe;      /* AVAL field (P7 doesn't seem to care) */
+               }
+       }
+
+       return rb | ((v >> 54) & 0x300);                /* B field */
+}
+
+#define LOCK_TOKEN     (*(u32 *)(&get_paca()->lock_token))
+
+/*
+ * Make one attempt to take the 32-bit lock word at @lock.
+ *
+ * If the word is zero, this CPU's LOCK_TOKEN is stored into it with
+ * lwarx/stwcx.; the sequence is retried only when the store-conditional
+ * itself fails, not when the lock is found held.
+ *
+ * Returns 1 if the lock was acquired, 0 if it was already held.
+ * Callers in this file release it by storing 0 to the lock word.
+ */
+static inline int try_lock_tlbie(unsigned int *lock)
+{
+       /* tmp is bound to %0 but the asm below never references %0 */
+       unsigned int tmp, old;
+       unsigned int token = LOCK_TOKEN;
+
+       asm volatile("1:lwarx   %1,0,%2\n"
+                    "  cmpwi   cr0,%1,0\n"
+                    "  bne     2f\n"
+                    "  stwcx.  %3,0,%2\n"
+                    "  bne-    1b\n"
+                    "  isync\n"
+                    "2:"
+                    : "=&r" (tmp), "=&r" (old)
+                    : "r" (lock), "r" (token)
+                    : "cc", "memory");
+       return old == 0;
+}
+
+/*
+ * Invalidate the HPTE at pte_index and flush its translation from the
+ * TLB (presumably the H_REMOVE hcall handler).
+ *
+ * @flags:     H_AVPN: only remove if the HPTE's first doubleword, with
+ *             the low 7 bits masked off, equals avpn;
+ *             H_ANDCOND: only remove if (first doubleword & avpn) is 0;
+ *             H_LOCAL: flush with tlbiel instead of tlbie
+ * @pte_index: index of the HPTE to remove
+ * @avpn:      comparison value for H_AVPN / H_ANDCOND
+ * @va:        not used by this function
+ *
+ * Returns H_SUCCESS with the old HPTE doublewords in vcpu->arch.gpr[4]
+ * and gpr[5], H_PARAMETER if pte_index is out of range, or H_NOT_FOUND
+ * if the entry is not valid or fails the requested match.
+ */
+long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
+                    unsigned long pte_index, unsigned long avpn,
+                    unsigned long va)
+{
+       struct kvm *kvm = vcpu->kvm;
+       unsigned long *hpte;
+       unsigned long v, r, rb;
+
+       if (pte_index >= (HPT_NPTEG << 3))
+               return H_PARAMETER;
+       hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+       /* spin until we hold the per-HPTE lock bit */
+       while (!lock_hpte(hpte, HPTE_V_HVLOCK))
+               cpu_relax();
+       if ((hpte[0] & HPTE_V_VALID) == 0 ||
+           ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) ||
+           ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) {
+               /* no match: just drop the lock */
+               hpte[0] &= ~HPTE_V_HVLOCK;
+               return H_NOT_FOUND;
+       }
+       /* with a single online vcpu the flush is always done with tlbiel */
+       if (atomic_read(&kvm->online_vcpus) == 1)
+               flags |= H_LOCAL;
+       /* hand the old entry (minus our lock bit) back in r4/r5 */
+       vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK;
+       vcpu->arch.gpr[5] = r = hpte[1];
+       rb = compute_tlbie_rb(v, r, pte_index);
+       /* zeroing the first doubleword invalidates the entry and unlocks it */
+       hpte[0] = 0;
+       if (!(flags & H_LOCAL)) {
+               /* tlbie sequences are serialized through kvm->arch.tlbie_lock */
+               while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+                       cpu_relax();
+               asm volatile("ptesync" : : : "memory");
+               asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
+                            : : "r" (rb), "r" (kvm->arch.lpid));
+               asm volatile("ptesync" : : : "memory");
+               kvm->arch.tlbie_lock = 0;
+       } else {
+               asm volatile("ptesync" : : : "memory");
+               asm volatile("tlbiel %0" : : "r" (rb));
+               asm volatile("ptesync" : : : "memory");
+       }
+       return H_SUCCESS;
+}
+
+/*
+ * Remove up to four HPTEs in one call (presumably the H_BULK_REMOVE
+ * hcall handler).
+ *
+ * The requests are read from vcpu->arch.gpr[4] onwards as four pairs of
+ * doublewords.  In the first doubleword of each pair, the top byte
+ * holds a request code in its two high bits and match flags in its two
+ * low bits, and the low 56 bits hold the pte_index; the second
+ * doubleword is the value to match against.  As decoded below:
+ *   request code 3      end of list
+ *   request code 1      remove request (anything else is an error)
+ *   flags 0 / 1 / 2     absolute / andcond / AVPN match
+ *
+ * The first doubleword of each processed pair is overwritten with a
+ * result: top byte 0x80 | flags (plus R and C bits) on success,
+ * 0x90 | flags if not found, 0xa0 | flags on a parameter error.
+ *
+ * Returns H_SUCCESS, or H_PARAMETER if a malformed request was hit;
+ * entries removed before the bad request are still flushed.
+ */
+long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
+{
+       struct kvm *kvm = vcpu->kvm;
+       unsigned long *args = &vcpu->arch.gpr[4];
+       unsigned long *hp, tlbrb[4];
+       long int i, found;
+       long int n_inval = 0;
+       unsigned long flags, req, pte_index;
+       long int local = 0;
+       long int ret = H_SUCCESS;
+
+       /* with a single online vcpu, tlbiel is used for the flush */
+       if (atomic_read(&kvm->online_vcpus) == 1)
+               local = 1;
+       for (i = 0; i < 4; ++i) {
+               /* split the control doubleword into code, flags and index */
+               pte_index = args[i * 2];
+               flags = pte_index >> 56;
+               pte_index &= ((1ul << 56) - 1);
+               req = flags >> 6;
+               flags &= 3;
+               if (req == 3)
+                       break;
+               if (req != 1 || flags == 3 ||
+                   pte_index >= (HPT_NPTEG << 3)) {
+                       /* parameter error */
+                       args[i * 2] = ((0xa0 | flags) << 56) + pte_index;
+                       ret = H_PARAMETER;
+                       break;
+               }
+               hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+               /* spin until we hold the per-HPTE lock bit */
+               while (!lock_hpte(hp, HPTE_V_HVLOCK))
+                       cpu_relax();
+               found = 0;
+               if (hp[0] & HPTE_V_VALID) {
+                       switch (flags & 3) {
+                       case 0:         /* absolute */
+                               found = 1;
+                               break;
+                       case 1:         /* andcond */
+                               if (!(hp[0] & args[i * 2 + 1]))
+                                       found = 1;
+                               break;
+                       case 2:         /* AVPN */
+                               if ((hp[0] & ~0x7fUL) == args[i * 2 + 1])
+                                       found = 1;
+                               break;
+                       }
+               }
+               if (!found) {
+                       /* unlock, report "not found", go on to the next pair */
+                       hp[0] &= ~HPTE_V_HVLOCK;
+                       args[i * 2] = ((0x90 | flags) << 56) + pte_index;
+                       continue;
+               }
+               /* insert R and C bits from PTE */
+               flags |= (hp[1] >> 5) & 0x0c;
+               args[i * 2] = ((0x80 | flags) << 56) + pte_index;
+               /* queue the flush; compute_tlbie_rb() masks off our lock bit */
+               tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index);
+               /* zeroing the first doubleword invalidates and unlocks */
+               hp[0] = 0;
+       }
+       if (n_inval == 0)
+               return ret;
+
+       /* flush everything queued above in one batch */
+       if (!local) {
+               while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+                       cpu_relax();
+               asm volatile("ptesync" : : : "memory");
+               for (i = 0; i < n_inval; ++i)
+                       asm volatile(PPC_TLBIE(%1,%0)
+                                    : : "r" (tlbrb[i]), "r" (kvm->arch.lpid));
+               asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+               kvm->arch.tlbie_lock = 0;
+       } else {
+               asm volatile("ptesync" : : : "memory");
+               for (i = 0; i < n_inval; ++i)
+                       asm volatile("tlbiel %0" : : "r" (tlbrb[i]));
+               asm volatile("ptesync" : : : "memory");
+       }
+       return ret;
+}
+
+/*
+ * Change the protection and key bits of the HPTE at pte_index
+ * (presumably the H_PROTECT hcall handler).
+ *
+ * @flags:     supplies the new PP0, PP, N and KEY bits (extracted by
+ *             the shifts and masks below); H_AVPN requests an AVPN
+ *             match first; H_LOCAL selects tlbiel instead of tlbie
+ * @pte_index: index of the HPTE to modify
+ * @avpn:      value compared with the first doubleword (low 7 bits
+ *             masked off) when H_AVPN is set
+ * @va:        not used by this function
+ *
+ * Returns H_SUCCESS, H_PARAMETER if pte_index is out of range, or
+ * H_NOT_FOUND if the entry is not valid or the AVPN does not match.
+ */
+long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
+                     unsigned long pte_index, unsigned long avpn,
+                     unsigned long va)
+{
+       struct kvm *kvm = vcpu->kvm;
+       unsigned long *hpte;
+       unsigned long v, r, rb;
+
+       if (pte_index >= (HPT_NPTEG << 3))
+               return H_PARAMETER;
+       hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+       /* spin until we hold the per-HPTE lock bit */
+       while (!lock_hpte(hpte, HPTE_V_HVLOCK))
+               cpu_relax();
+       if ((hpte[0] & HPTE_V_VALID) == 0 ||
+           ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) {
+               /* no match: just drop the lock */
+               hpte[0] &= ~HPTE_V_HVLOCK;
+               return H_NOT_FOUND;
+       }
+       /* with a single online vcpu the flush is always done with tlbiel */
+       if (atomic_read(&kvm->online_vcpus) == 1)
+               flags |= H_LOCAL;
+       /* v still carries HPTE_V_HVLOCK, set by lock_hpte() above */
+       v = hpte[0];
+       r = hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
+                       HPTE_R_KEY_HI | HPTE_R_KEY_LO);
+       r |= (flags << 55) & HPTE_R_PP0;
+       r |= (flags << 48) & HPTE_R_KEY_HI;
+       r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+       rb = compute_tlbie_rb(v, r, pte_index);
+       /* mark the entry invalid (but still locked) while we flush it */
+       hpte[0] = v & ~HPTE_V_VALID;
+       if (!(flags & H_LOCAL)) {
+               /* tlbie sequences are serialized through kvm->arch.tlbie_lock */
+               while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+                       cpu_relax();
+               asm volatile("ptesync" : : : "memory");
+               asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
+                            : : "r" (rb), "r" (kvm->arch.lpid));
+               asm volatile("ptesync" : : : "memory");
+               kvm->arch.tlbie_lock = 0;
+       } else {
+               asm volatile("ptesync" : : : "memory");
+               asm volatile("tlbiel %0" : : "r" (rb));
+               asm volatile("ptesync" : : : "memory");
+       }
+       /*
+        * Install the new second doubleword, then restore the first
+        * doubleword with the lock bit cleared; that single store both
+        * revalidates and unlocks the entry.
+        */
+       hpte[1] = r;
+       eieio();
+       hpte[0] = v & ~HPTE_V_HVLOCK;
+       asm volatile("ptesync" : : : "memory");
+       return H_SUCCESS;
+}
+
+/*
+ * Translate a host real address back to the guest address it backs.
+ *
+ * @kvm:      the VM whose ram_pginfo[] table is searched
+ * @realaddr: host real address (callers pass the RPN field of an HPTE)
+ *
+ * ram_pginfo[] has one entry per guest RAM page of (1 << ram_porder)
+ * bytes, each holding the host pfn of the start of that page.  The
+ * entry index is therefore the guest address >> ram_porder (this is how
+ * kvmppc_h_enter() indexes the table), so the inverse mapping has to
+ * shift the index by ram_porder, not PAGE_SHIFT.
+ *
+ * Returns the guest address, or HPTE_R_RPN (all 1s in the RPN field)
+ * if the address does not belong to this guest's RAM.
+ */
+static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr)
+{
+       long int i;
+       unsigned long offset, rpn;
+
+       /* split into offset within the RAM page and pfn of its start */
+       offset = realaddr & (kvm->arch.ram_psize - 1);
+       rpn = (realaddr - offset) >> PAGE_SHIFT;
+       for (i = 0; i < kvm->arch.ram_npages; ++i)
+               if (rpn == kvm->arch.ram_pginfo[i].pfn)
+                       return ((unsigned long)i << kvm->arch.ram_porder) +
+                               offset;
+       return HPTE_R_RPN;      /* all 1s in the RPN field */
+}
+
+/*
+ * Copy one HPTE, or an aligned group of four, into the vcpu's GPRs
+ * starting at r4 (two registers per entry).
+ *
+ * @flags:     H_READ_4 reads the four entries of the 4-aligned group
+ *             containing pte_index instead of a single entry;
+ *             H_R_XLATE replaces the RPN field of each valid entry
+ *             with the result of reverse_xlate()
+ * @pte_index: index of the (first) HPTE to read
+ *
+ * Returns H_SUCCESS, or H_PARAMETER if pte_index is out of range.
+ * The entries are read without taking the HPTE lock.
+ */
+long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
+                  unsigned long pte_index)
+{
+       struct kvm *kvm = vcpu->kvm;
+       unsigned long *slot, *out, second;
+       int count, k;
+
+       if (pte_index >= (HPT_NPTEG << 3))
+               return H_PARAMETER;
+
+       count = 1;
+       if (flags & H_READ_4) {
+               pte_index &= ~3;
+               count = 4;
+       }
+
+       /* each HPTE is 16 bytes, i.e. two consecutive doublewords */
+       slot = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+       out = &vcpu->arch.gpr[4];
+       for (k = 0; k < count; ++k, slot += 2, out += 2) {
+               second = slot[1];
+               if ((flags & H_R_XLATE) && (slot[0] & HPTE_V_VALID)) {
+                       unsigned long xl;
+
+                       xl = reverse_xlate(kvm, second & HPTE_R_RPN);
+                       second = xl | (second & ~HPTE_R_RPN);
+               }
+               out[0] = slot[0];
+               out[1] = second;
+       }
+       return H_SUCCESS;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
new file mode 100644 (file)
index 0000000..6dd3358
--- /dev/null
@@ -0,0 +1,1345 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * Derived from book3s_rmhandlers.S and other files, which are:
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+
+/*****************************************************************************
+ *                                                                           *
+ *        Real Mode handlers that need to be in the linear mapping           *
+ *                                                                           *
+ ****************************************************************************/
+
+       /*
+        * Return from an interrupt that saved its state in SRR0/SRR1,
+        * resuming one instruction after the saved address.
+        * r13 is used as scratch and then reloaded with GET_SCRATCH0.
+        */
+       .globl  kvmppc_skip_interrupt
+kvmppc_skip_interrupt:
+       mfspr   r13,SPRN_SRR0
+       addi    r13,r13,4               /* step over one 4-byte instruction */
+       mtspr   SPRN_SRR0,r13
+       GET_SCRATCH0(r13)
+       rfid
+       b       .
+
+       /*
+        * Same as kvmppc_skip_interrupt, but for an interrupt that saved
+        * its state in HSRR0/HSRR1: advance HSRR0 by one instruction,
+        * reload r13 with GET_SCRATCH0 and return with hrfid.
+        */
+       .globl  kvmppc_skip_Hinterrupt
+kvmppc_skip_Hinterrupt:
+       mfspr   r13,SPRN_HSRR0
+       addi    r13,r13,4               /* step over one 4-byte instruction */
+       mtspr   SPRN_HSRR0,r13
+       GET_SCRATCH0(r13)
+       hrfid
+       b       .
+
+/*
+ * Call kvmppc_hv_entry in real mode.
+ * Must be called with interrupts hard-disabled.
+ *
+ * Input Registers:
+ *
+ * LR = return address to continue at after eventually re-enabling MMU
+ *
+ * Clobbers r0, r5, r6, r10, SRR0 and SRR1.
+ */
+_GLOBAL(kvmppc_hv_entry_trampoline)
+       mfmsr   r10
+       LOAD_REG_ADDR(r5, kvmppc_hv_entry)
+       li      r0,MSR_RI
+       andc    r0,r10,r0               /* r0 = current MSR with RI clear */
+       li      r6,MSR_IR | MSR_DR
+       andc    r6,r10,r6               /* r6 = current MSR with IR and DR clear */
+       mtmsrd  r0,1            /* clear RI in MSR */
+       mtsrr0  r5                      /* "return" to kvmppc_hv_entry ... */
+       mtsrr1  r6                      /* ... with translation switched off */
+       RFI
+
+#define ULONG_SIZE             8
+#define VCPU_GPR(n)            (VCPU_GPRS + (n * ULONG_SIZE))
+
+/******************************************************************************
+ *                                                                            *
+ *                               Entry code                                   *
+ *                                                                            *
+ *****************************************************************************/
+
+#define XICS_XIRR              4
+#define XICS_QIRR              0xc
+
+/*
+ * We come in here when wakened from nap mode on a secondary hw thread.
+ * Relocation is off and most register values are lost.
+ * r13 points to the PACA.
+ */
+       /*
+        * Sets up r1 from the PACA emergency stack pointer, loads the vcpu
+        * pointer into r4, acknowledges and EOIs the wake-up IPI through
+        * the XICS registers at HSTATE_XICS_PHYS, then falls straight
+        * through into kvmppc_hv_entry below.
+        */
+       .globl  kvm_start_guest
+kvm_start_guest:
+       ld      r1,PACAEMERGSP(r13)
+       subi    r1,r1,STACK_FRAME_OVERHEAD
+
+       /* get vcpu pointer */
+       ld      r4, HSTATE_KVM_VCPU(r13)
+
+       /* We got here with an IPI; clear it */
+       ld      r5, HSTATE_XICS_PHYS(r13)
+       li      r0, 0xff
+       li      r6, XICS_QIRR
+       li      r7, XICS_XIRR
+       lwzcix  r8, r5, r7              /* ack the interrupt */
+       sync
+       stbcix  r0, r5, r6              /* clear it */
+       stwcix  r8, r5, r7              /* EOI it */
+
+.global kvmppc_hv_entry
+kvmppc_hv_entry:
+
+       /* Required state:
+        *
+        * R4 = vcpu pointer
+        * MSR = ~IR|DR (i.e. IR and DR both clear)
+        * R13 = PACA
+        * R1 = host R1
+        * all other volatile GPRS = free
+        *
+        * The caller's LR is saved in HSTATE_VMHANDLER.  This code does
+        * not return; it falls through into fast_guest_return, or
+        * branches to hdec_soon / secondary_too_late.
+        */
+       mflr    r0
+       std     r0, HSTATE_VMHANDLER(r13)
+
+       /* Load the guest's non-volatile GPRs (r14 - r31) */
+       ld      r14, VCPU_GPR(r14)(r4)
+       ld      r15, VCPU_GPR(r15)(r4)
+       ld      r16, VCPU_GPR(r16)(r4)
+       ld      r17, VCPU_GPR(r17)(r4)
+       ld      r18, VCPU_GPR(r18)(r4)
+       ld      r19, VCPU_GPR(r19)(r4)
+       ld      r20, VCPU_GPR(r20)(r4)
+       ld      r21, VCPU_GPR(r21)(r4)
+       ld      r22, VCPU_GPR(r22)(r4)
+       ld      r23, VCPU_GPR(r23)(r4)
+       ld      r24, VCPU_GPR(r24)(r4)
+       ld      r25, VCPU_GPR(r25)(r4)
+       ld      r26, VCPU_GPR(r26)(r4)
+       ld      r27, VCPU_GPR(r27)(r4)
+       ld      r28, VCPU_GPR(r28)(r4)
+       ld      r29, VCPU_GPR(r29)(r4)
+       ld      r30, VCPU_GPR(r30)(r4)
+       ld      r31, VCPU_GPR(r31)(r4)
+
+       /* Load guest PMU registers */
+       /* R4 is live here (vcpu pointer) */
+       li      r3, 1
+       sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
+       mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable ints */
+       isync
+       lwz     r3, VCPU_PMC(r4)        /* always load up guest PMU registers */
+       lwz     r5, VCPU_PMC + 4(r4)    /* to prevent information leak */
+       lwz     r6, VCPU_PMC + 8(r4)
+       lwz     r7, VCPU_PMC + 12(r4)
+       lwz     r8, VCPU_PMC + 16(r4)
+       lwz     r9, VCPU_PMC + 20(r4)
+BEGIN_FTR_SECTION
+       lwz     r10, VCPU_PMC + 24(r4)
+       lwz     r11, VCPU_PMC + 28(r4)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+       mtspr   SPRN_PMC1, r3
+       mtspr   SPRN_PMC2, r5
+       mtspr   SPRN_PMC3, r6
+       mtspr   SPRN_PMC4, r7
+       mtspr   SPRN_PMC5, r8
+       mtspr   SPRN_PMC6, r9
+BEGIN_FTR_SECTION
+       mtspr   SPRN_PMC7, r10
+       mtspr   SPRN_PMC8, r11
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+       ld      r3, VCPU_MMCR(r4)
+       ld      r5, VCPU_MMCR + 8(r4)
+       ld      r6, VCPU_MMCR + 16(r4)
+       mtspr   SPRN_MMCR1, r5
+       mtspr   SPRN_MMCRA, r6
+       mtspr   SPRN_MMCR0, r3          /* MMCR0 last: replaces the freeze value set above */
+       isync
+
+       /* Load up FP, VMX and VSX registers */
+       bl      kvmppc_load_fp
+
+BEGIN_FTR_SECTION
+       /* Switch DSCR to guest value */
+       ld      r5, VCPU_DSCR(r4)
+       mtspr   SPRN_DSCR, r5
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+       /*
+        * Set the decrementer to the guest decrementer.
+        * (expiry time minus the current timebase)
+        */
+       ld      r8,VCPU_DEC_EXPIRES(r4)
+       mftb    r7
+       subf    r3,r7,r8
+       mtspr   SPRN_DEC,r3
+       stw     r3,VCPU_DEC(r4)
+
+       ld      r5, VCPU_SPRG0(r4)
+       ld      r6, VCPU_SPRG1(r4)
+       ld      r7, VCPU_SPRG2(r4)
+       ld      r8, VCPU_SPRG3(r4)
+       mtspr   SPRN_SPRG0, r5
+       mtspr   SPRN_SPRG1, r6
+       mtspr   SPRN_SPRG2, r7
+       mtspr   SPRN_SPRG3, r8
+
+       /* Save R1 in the PACA */
+       std     r1, HSTATE_HOST_R1(r13)
+
+       /* Increment yield count if they have a VPA */
+       ld      r3, VCPU_VPA(r4)
+       cmpdi   r3, 0
+       beq     25f
+       lwz     r5, LPPACA_YIELDCOUNT(r3)
+       addi    r5, r5, 1
+       stw     r5, LPPACA_YIELDCOUNT(r3)
+25:
+       /* Load up DAR and DSISR */
+       ld      r5, VCPU_DAR(r4)
+       lwz     r6, VCPU_DSISR(r4)
+       mtspr   SPRN_DAR, r5
+       mtspr   SPRN_DSISR, r6
+
+       /* Set partition DABR */
+       li      r5,3
+       ld      r6,VCPU_DABR(r4)
+       mtspr   SPRN_DABRX,r5
+       mtspr   SPRN_DABR,r6
+
+BEGIN_FTR_SECTION
+       /* Restore AMR and UAMOR, set AMOR to all 1s */
+       ld      r5,VCPU_AMR(r4)
+       ld      r6,VCPU_UAMOR(r4)
+       li      r7,-1
+       mtspr   SPRN_AMR,r5
+       mtspr   SPRN_UAMOR,r6
+       mtspr   SPRN_AMOR,r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+       /* Clear out SLB */
+       li      r6,0
+       slbmte  r6,r6
+       slbia
+       ptesync
+
+       /* CPUs with CPU_FTR_ARCH_201 take the PPC970 path at 30: below */
+BEGIN_FTR_SECTION
+       b       30f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+       /*
+        * POWER7 host -> guest partition switch code.
+        * We don't have to lock against concurrent tlbies,
+        * but we do have to coordinate across hardware threads.
+        */
+       /* Increment entry count iff exit count is zero. */
+       ld      r5,HSTATE_KVM_VCORE(r13)
+       addi    r9,r5,VCORE_ENTRY_EXIT
+21:    lwarx   r3,0,r9
+       cmpwi   r3,0x100                /* any threads starting to exit? */
+       bge     secondary_too_late      /* if so we're too late to the party */
+       addi    r3,r3,1
+       stwcx.  r3,0,r9
+       bne     21b
+
+       /* Primary thread switches to guest partition. */
+       ld      r9,VCPU_KVM(r4)         /* pointer to struct kvm */
+       lwz     r6,VCPU_PTID(r4)
+       cmpwi   r6,0                    /* thread id 0 is the primary */
+       bne     20f
+       ld      r6,KVM_SDR1(r9)
+       lwz     r7,KVM_LPID(r9)
+       li      r0,LPID_RSVD            /* switch to reserved LPID */
+       mtspr   SPRN_LPID,r0
+       ptesync
+       mtspr   SPRN_SDR1,r6            /* switch to partition page table */
+       mtspr   SPRN_LPID,r7
+       isync
+       li      r0,1
+       stb     r0,VCORE_IN_GUEST(r5)   /* signal secondaries to continue */
+       b       10f
+
+       /* Secondary threads wait for primary to have done partition switch */
+20:    lbz     r0,VCORE_IN_GUEST(r5)
+       cmpwi   r0,0
+       beq     20b
+
+       /* Set LPCR.  Set the MER bit if there is a pending external irq. */
+10:    ld      r8,KVM_LPCR(r9)
+       ld      r0,VCPU_PENDING_EXC(r4)
+       li      r7,(1 << BOOK3S_IRQPRIO_EXTERNAL)
+       oris    r7,r7,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
+       and.    r0,r0,r7
+       beq     11f
+       ori     r8,r8,LPCR_MER
+11:    mtspr   SPRN_LPCR,r8
+       ld      r8,KVM_RMOR(r9)
+       mtspr   SPRN_RMOR,r8
+       isync
+
+       /* Check if HDEC expires soon */
+       mfspr   r3,SPRN_HDEC
+       cmpwi   r3,10
+       li      r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+       mr      r9,r4                   /* hdec_soon is entered with r9 = vcpu, r12 = trap */
+       blt     hdec_soon
+
+       /*
+        * Invalidate the TLB if we could possibly have stale TLB
+        * entries for this partition on this core due to the use
+        * of tlbiel.
+        * XXX maybe only need this on primary thread?
+        */
+       ld      r9,VCPU_KVM(r4)         /* pointer to struct kvm */
+       lwz     r5,VCPU_VCPUID(r4)
+       lhz     r6,PACAPACAINDEX(r13)
+       rldimi  r6,r5,0,62              /* XXX map as if threads 1:1 p:v */
+       lhz     r8,VCPU_LAST_CPU(r4)
+       sldi    r7,r6,1                 /* see if this is the same vcpu */
+       add     r7,r7,r9                /* as last ran on this pcpu */
+       lhz     r0,KVM_LAST_VCPU(r7)
+       cmpw    r6,r8                   /* on the same cpu core as last time? */
+       bne     3f
+       cmpw    r0,r5                   /* same vcpu as this core last ran? */
+       beq     1f
+3:     sth     r6,VCPU_LAST_CPU(r4)    /* if not, invalidate partition TLB */
+       sth     r5,KVM_LAST_VCPU(r7)
+       li      r6,128
+       mtctr   r6                      /* 128 tlbiel operations, RB stepped by 0x1000 */
+       li      r7,0x800                /* IS field = 0b10 */
+       ptesync
+2:     tlbiel  r7
+       addi    r7,r7,0x1000
+       bdnz    2b
+       ptesync
+1:
+
+       /* Save purr/spurr */
+       mfspr   r5,SPRN_PURR
+       mfspr   r6,SPRN_SPURR
+       std     r5,HSTATE_PURR(r13)
+       std     r6,HSTATE_SPURR(r13)
+       /* ... and load the guest's values */
+       ld      r7,VCPU_PURR(r4)
+       ld      r8,VCPU_SPURR(r4)
+       mtspr   SPRN_PURR,r7
+       mtspr   SPRN_SPURR,r8
+       b       31f
+
+       /*
+        * PPC970 host -> guest partition switch code.
+        * We have to lock against concurrent tlbies,
+        * using native_tlbie_lock to lock against host tlbies
+        * and kvm->arch.tlbie_lock to lock against guest tlbies.
+        * We also have to invalidate the TLB since its
+        * entries aren't tagged with the LPID.
+        */
+30:    ld      r9,VCPU_KVM(r4)         /* pointer to struct kvm */
+
+       /* first take native_tlbie_lock */
+       .section ".toc","aw"
+toc_tlbie_lock:
+       .tc     native_tlbie_lock[TC],native_tlbie_lock
+       .previous
+       ld      r3,toc_tlbie_lock@toc(2)
+       lwz     r8,PACA_LOCK_TOKEN(r13)
+       /* spin until the lock word is 0, then store our token into it */
+24:    lwarx   r0,0,r3
+       cmpwi   r0,0
+       bne     24b
+       stwcx.  r8,0,r3
+       bne     24b
+       isync
+
+       ld      r7,KVM_LPCR(r9)         /* use kvm->arch.lpcr to store HID4 */
+       li      r0,0x18f
+       rotldi  r0,r0,HID4_LPID5_SH     /* all lpid bits in HID4 = 1 */
+       or      r0,r7,r0
+       ptesync
+       sync
+       mtspr   SPRN_HID4,r0            /* switch to reserved LPID */
+       isync
+       li      r0,0
+       stw     r0,0(r3)                /* drop native_tlbie_lock */
+
+       /* invalidate the whole TLB */
+       li      r0,256
+       mtctr   r0
+       li      r6,0
+25:    tlbiel  r6
+       addi    r6,r6,0x1000
+       bdnz    25b
+       ptesync
+
+       /* Take the guest's tlbie_lock */
+       addi    r3,r9,KVM_TLBIE_LOCK
+24:    lwarx   r0,0,r3
+       cmpwi   r0,0
+       bne     24b
+       stwcx.  r8,0,r3
+       bne     24b
+       isync
+       ld      r6,KVM_SDR1(r9)
+       mtspr   SPRN_SDR1,r6            /* switch to partition page table */
+
+       /* Set up HID4 with the guest's LPID etc. */
+       sync
+       mtspr   SPRN_HID4,r7
+       isync
+
+       /* drop the guest's tlbie_lock */
+       li      r0,0
+       stw     r0,0(r3)
+
+       /* Check if HDEC expires soon */
+       mfspr   r3,SPRN_HDEC
+       cmpwi   r3,10
+       li      r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+       mr      r9,r4                   /* hdec_soon is entered with r9 = vcpu, r12 = trap */
+       blt     hdec_soon
+
+       /* Enable HDEC interrupts */
+       mfspr   r0,SPRN_HID0
+       li      r3,1
+       rldimi  r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1
+       sync
+       mtspr   SPRN_HID0,r0
+       /* HID0 is read back six times after the write */
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+
+       /* Load up guest SLB entries */
+       /* (both the POWER7 and the PPC970 paths rejoin here) */
+31:    lwz     r5,VCPU_SLB_MAX(r4)
+       cmpwi   r5,0
+       beq     9f
+       mtctr   r5
+       addi    r6,r4,VCPU_SLB
+1:     ld      r8,VCPU_SLB_E(r6)
+       ld      r9,VCPU_SLB_V(r6)
+       slbmte  r9,r8
+       addi    r6,r6,VCPU_SLB_SIZE
+       bdnz    1b
+9:
+
+       /* Restore state of CTRL run bit; assume 1 on entry */
+       lwz     r5,VCPU_CTRL(r4)
+       andi.   r5,r5,1
+       bne     4f
+       /* guest had the run bit clear: clear it in the hardware too */
+       mfspr   r6,SPRN_CTRLF
+       clrrdi  r6,r6,1
+       mtspr   SPRN_CTRLT,r6
+4:
+       ld      r6, VCPU_CTR(r4)
+       lwz     r7, VCPU_XER(r4)
+
+       mtctr   r6
+       mtxer   r7
+
+       /* Move SRR0 and SRR1 into the respective regs */
+       ld      r6, VCPU_SRR0(r4)
+       ld      r7, VCPU_SRR1(r4)
+       mtspr   SPRN_SRR0, r6
+       mtspr   SPRN_SRR1, r7
+
+       ld      r10, VCPU_PC(r4)
+
+       /*
+        * Rotate the MSR_HV bit up to the top bit, mask it off, and
+        * rotate back (63 - MSR_HV_LG + 1 + MSR_HV_LG = 64 in total).
+        */
+       ld      r11, VCPU_MSR(r4)       /* r11 = vcpu->arch.msr & ~MSR_HV */
+       rldicl  r11, r11, 63 - MSR_HV_LG, 1
+       rotldi  r11, r11, 1 + MSR_HV_LG
+       ori     r11, r11, MSR_ME
+
+       /*
+        * Final step of guest entry.  On entry here:
+        *   r4  = vcpu pointer
+        *   r10 = guest PC   (goes into HSRR0)
+        *   r11 = guest MSR  (goes into HSRR1)
+        *   r13 = PACA
+        * Loads LR, CR and GPRs r0 - r13 from the vcpu and enters the
+        * guest with hrfid.
+        */
+fast_guest_return:
+       mtspr   SPRN_HSRR0,r10
+       mtspr   SPRN_HSRR1,r11
+
+       /* Activate guest mode, so faults get handled by KVM */
+       li      r9, KVM_GUEST_MODE_GUEST
+       stb     r9, HSTATE_IN_GUEST(r13)
+
+       /* Enter guest */
+
+       ld      r5, VCPU_LR(r4)
+       lwz     r6, VCPU_CR(r4)
+       mtlr    r5
+       mtcr    r6
+
+       ld      r0, VCPU_GPR(r0)(r4)
+       ld      r1, VCPU_GPR(r1)(r4)
+       ld      r2, VCPU_GPR(r2)(r4)
+       ld      r3, VCPU_GPR(r3)(r4)
+       ld      r5, VCPU_GPR(r5)(r4)
+       ld      r6, VCPU_GPR(r6)(r4)
+       ld      r7, VCPU_GPR(r7)(r4)
+       ld      r8, VCPU_GPR(r8)(r4)
+       ld      r9, VCPU_GPR(r9)(r4)
+       ld      r10, VCPU_GPR(r10)(r4)
+       ld      r11, VCPU_GPR(r11)(r4)
+       ld      r12, VCPU_GPR(r12)(r4)
+       ld      r13, VCPU_GPR(r13)(r4)
+
+       /* r4 is the base register for all the loads above, so it goes last */
+       ld      r4, VCPU_GPR(r4)(r4)
+
+       hrfid
+       b       .
+
+/******************************************************************************
+ *                                                                            *
+ *                               Exit code                                    *
+ *                                                                            *
+ *****************************************************************************/
+
+/*
+ * We come here from the first-level interrupt handlers.
+ */
+       .globl  kvmppc_interrupt
+kvmppc_interrupt:
+       /*
+        * Register contents:
+        * R12          = interrupt vector
+        * R13          = PACA
+        * guest CR, R12 saved in shadow VCPU SCRATCH1/0
+        * guest R13 saved in SPRN_SCRATCH0
+        */
+       /* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */
+       std     r9, HSTATE_HOST_R2(r13)
+       ld      r9, HSTATE_KVM_VCPU(r13)        /* r9 = vcpu pointer */
+
+       /* Save registers */
+
+       std     r0, VCPU_GPR(r0)(r9)
+       std     r1, VCPU_GPR(r1)(r9)
+       std     r2, VCPU_GPR(r2)(r9)
+       std     r3, VCPU_GPR(r3)(r9)
+       std     r4, VCPU_GPR(r4)(r9)
+       std     r5, VCPU_GPR(r5)(r9)
+       std     r6, VCPU_GPR(r6)(r9)
+       std     r7, VCPU_GPR(r7)(r9)
+       std     r8, VCPU_GPR(r8)(r9)
+       ld      r0, HSTATE_HOST_R2(r13)         /* guest r9, stashed there above */
+       std     r0, VCPU_GPR(r9)(r9)
+       std     r10, VCPU_GPR(r10)(r9)
+       std     r11, VCPU_GPR(r11)(r9)
+       ld      r3, HSTATE_SCRATCH0(r13)        /* guest r12 */
+       lwz     r4, HSTATE_SCRATCH1(r13)        /* guest CR */
+       std     r3, VCPU_GPR(r12)(r9)
+       stw     r4, VCPU_CR(r9)
+
+       /* Restore R1/R2 so we can handle faults */
+       ld      r1, HSTATE_HOST_R1(r13)
+       ld      r2, PACATOC(r13)
+
+       mfspr   r10, SPRN_SRR0
+       mfspr   r11, SPRN_SRR1
+       std     r10, VCPU_SRR0(r9)
+       std     r11, VCPU_SRR1(r9)
+       andi.   r0, r12, 2              /* need to read HSRR0/1? */
+       beq     1f
+       mfspr   r10, SPRN_HSRR0
+       mfspr   r11, SPRN_HSRR1
+       clrrdi  r12, r12, 2             /* clear the low two bits of the vector, dropping the flag tested above */
+1:     std     r10, VCPU_PC(r9)        /* r10/r11 = guest PC/MSR, from SRR0/1 or HSRR0/1 */
+       std     r11, VCPU_MSR(r9)
+
+       GET_SCRATCH0(r3)                /* guest r13 (see register contents above) */
+       mflr    r4
+       std     r3, VCPU_GPR(r13)(r9)
+       std     r4, VCPU_LR(r9)
+
+       /* Unset guest mode */
+       li      r0, KVM_GUEST_MODE_NONE
+       stb     r0, HSTATE_IN_GUEST(r13)
+
+       stw     r12,VCPU_TRAP(r9)
+
+       /* See if this is a leftover HDEC interrupt */
+       cmpwi   r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+       bne     2f
+       mfspr   r3,SPRN_HDEC
+       cmpwi   r3,0
+       bge     ignore_hdec             /* HDEC is not negative: go straight back to the guest */
+2:
+       /* See if this is something we can handle in real mode */
+       cmpwi   r12,BOOK3S_INTERRUPT_SYSCALL
+       beq     hcall_try_real_mode
+hcall_real_cont:
+
+       /* Check for mediated interrupts (could be done earlier really ...) */
+BEGIN_FTR_SECTION
+       cmpwi   r12,BOOK3S_INTERRUPT_EXTERNAL
+       bne+    1f
+       ld      r5,VCPU_KVM(r9)
+       ld      r5,KVM_LPCR(r5)
+       andi.   r0,r11,MSR_EE           /* r11 = guest MSR */
+       beq     1f
+       andi.   r0,r5,LPCR_MER
+       bne     bounce_ext_interrupt    /* guest EE and LPCR_MER both set */
+1:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+       /* Save DEC */
+       mfspr   r5,SPRN_DEC
+       mftb    r6
+       extsw   r5,r5                   /* DEC is treated as a signed 32-bit value */
+       add     r5,r5,r6                /* expiry = timebase now + DEC */
+       std     r5,VCPU_DEC_EXPIRES(r9)
+
+       /* Save HEIR (HV emulation assist reg) in last_inst
+          if this is an HEI (HV emulation interrupt, e40) */
+       li      r3,-1                   /* stored as last_inst when HEIR is not read */
+BEGIN_FTR_SECTION
+       cmpwi   r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST
+       bne     11f
+       mfspr   r3,SPRN_HEIR
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+11:    stw     r3,VCPU_LAST_INST(r9)
+
+       /* Save more register state  */
+       mfxer   r5
+       mfdar   r6
+       mfdsisr r7
+       mfctr   r8
+
+       stw     r5, VCPU_XER(r9)
+       std     r6, VCPU_DAR(r9)
+       stw     r7, VCPU_DSISR(r9)
+       std     r8, VCPU_CTR(r9)
+       /* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */
+BEGIN_FTR_SECTION
+       cmpwi   r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
+       beq     6f                      /* 6: reloads r6/r7 from HDAR/HDSISR and branches back to 7: */
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+7:     std     r6, VCPU_FAULT_DAR(r9)  /* r6/r7 = DAR/DSISR, or HDAR/HDSISR for an HDSI */
+       stw     r7, VCPU_FAULT_DSISR(r9)
+
+       /* Save guest CTRL register, set runlatch to 1 */
+       mfspr   r6,SPRN_CTRLF
+       stw     r6,VCPU_CTRL(r9)
+       andi.   r0,r6,1
+       bne     4f                      /* low bit already set: no need to write CTRLT */
+       ori     r6,r6,1
+       mtspr   SPRN_CTRLT,r6
+4:
+       /* Read the guest SLB and save it away */
+       lwz     r0,VCPU_SLB_NR(r9)      /* number of entries in SLB */
+       mtctr   r0
+       li      r6,0                    /* r6 = SLB index being read */
+       addi    r7,r9,VCPU_SLB          /* r7 = next slot in the vcpu's saved-SLB array */
+       li      r5,0                    /* r5 = number of valid entries saved */
+1:     slbmfee r8,r6
+       andis.  r0,r8,SLB_ESID_V@h
+       beq     2f                      /* entry not valid: skip it */
+       add     r8,r8,r6                /* put index in */
+       slbmfev r3,r6
+       std     r8,VCPU_SLB_E(r7)
+       std     r3,VCPU_SLB_V(r7)
+       addi    r7,r7,VCPU_SLB_SIZE
+       addi    r5,r5,1
+2:     addi    r6,r6,1
+       bdnz    1b
+       stw     r5,VCPU_SLB_MAX(r9)
+
+       /*
+        * Save the guest PURR/SPURR
+        */
+BEGIN_FTR_SECTION
+       mfspr   r5,SPRN_PURR
+       mfspr   r6,SPRN_SPURR
+       ld      r7,VCPU_PURR(r9)        /* previously saved values */
+       ld      r8,VCPU_SPURR(r9)
+       std     r5,VCPU_PURR(r9)
+       std     r6,VCPU_SPURR(r9)
+       subf    r5,r7,r5                /* r5/r6 = current minus previously saved PURR/SPURR */
+       subf    r6,r8,r6
+
+       /*
+        * Restore host PURR/SPURR and add guest times
+        * so that the time in the guest gets accounted.
+        */
+       ld      r3,HSTATE_PURR(r13)
+       ld      r4,HSTATE_SPURR(r13)
+       add     r3,r3,r5
+       add     r4,r4,r6
+       mtspr   SPRN_PURR,r3
+       mtspr   SPRN_SPURR,r4
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201)
+
+       /* Clear out SLB */
+       li      r5,0
+       slbmte  r5,r5                   /* write an all-zero entry (both operands are 0) */
+       slbia
+       ptesync
+
+hdec_soon:
+BEGIN_FTR_SECTION
+       b       32f                     /* ARCH_201 CPUs use the PPC970 switch code at 32: */
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+       /*
+        * POWER7 guest -> host partition switch code.
+        * We don't have to lock against tlbies but we do
+        * have to coordinate the hardware threads.
+        */
+       /* Increment the threads-exiting-guest count in the 0xff00
+          bits of vcore->entry_exit_count */
+       lwsync
+       ld      r5,HSTATE_KVM_VCORE(r13)
+       addi    r6,r5,VCORE_ENTRY_EXIT
+41:    lwarx   r3,0,r6
+       addi    r0,r3,0x100             /* r3 keeps the value from before the increment */
+       stwcx.  r0,0,r6
+       bne     41b
+
+       /*
+        * At this point we have an interrupt that we have to pass
+        * up to the kernel or qemu; we can't handle it in real mode.
+        * Thus we have to do a partition switch, so we have to
+        * collect the other threads, if we are the first thread
+        * to take an interrupt.  To do this, we set the HDEC to 0,
+        * which causes an HDEC interrupt in all threads within 2ns
+        * because the HDEC register is shared between all 4 threads.
+        * However, we don't need to bother if this is an HDEC
+        * interrupt, since the other threads will already be on their
+        * way here in that case.
+        */
+       cmpwi   r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+       beq     40f
+       cmpwi   r3,0x100        /* Are we the first here? */
+       bge     40f
+       cmpwi   r3,1            /* old count <= 1: skip poking HDEC */
+       ble     40f
+       li      r0,0
+       mtspr   SPRN_HDEC,r0
+40:
+
+       /* Secondary threads wait for primary to do partition switch */
+       ld      r4,VCPU_KVM(r9)         /* pointer to struct kvm */
+       ld      r5,HSTATE_KVM_VCORE(r13)
+       lwz     r3,VCPU_PTID(r9)
+       cmpwi   r3,0
+       beq     15f                     /* ptid 0 takes the primary path */
+       HMT_LOW
+13:    lbz     r3,VCORE_IN_GUEST(r5)
+       cmpwi   r3,0
+       bne     13b                     /* spin until the primary stores 0 to VCORE_IN_GUEST below */
+       HMT_MEDIUM
+       b       16f
+
+       /* Primary thread waits for all the secondaries to exit guest */
+15:    lwz     r3,VCORE_ENTRY_EXIT(r5)
+       srwi    r0,r3,8                 /* r0 = word >> 8 */
+       clrldi  r3,r3,56                /* r3 = low 8 bits of the word */
+       cmpw    r3,r0
+       bne     15b
+       isync
+
+       /* Primary thread switches back to host partition */
+       ld      r6,KVM_HOST_SDR1(r4)
+       lwz     r7,KVM_HOST_LPID(r4)
+       li      r8,LPID_RSVD            /* switch to reserved LPID */
+       mtspr   SPRN_LPID,r8
+       ptesync
+       mtspr   SPRN_SDR1,r6            /* switch to partition page table */
+       mtspr   SPRN_LPID,r7
+       isync
+       li      r0,0
+       stb     r0,VCORE_IN_GUEST(r5)   /* releases the secondaries spinning at 13: */
+       lis     r8,0x7fff               /* MAX_INT@h */
+       mtspr   SPRN_HDEC,r8
+
+16:    ld      r8,KVM_HOST_LPCR(r4)
+       mtspr   SPRN_LPCR,r8
+       isync
+       b       33f                     /* skip the PPC970 code; 33: reloads the host SLB */
+
+       /*
+        * PPC970 guest -> host partition switch code.
+        * We have to lock against concurrent tlbies, and
+        * we have to flush the whole TLB.
+        */
+32:    ld      r4,VCPU_KVM(r9)         /* pointer to struct kvm */
+
+       /* Take the guest's tlbie_lock */
+       lwz     r8,PACA_LOCK_TOKEN(r13) /* r8 = value written into both lock words below */
+       addi    r3,r4,KVM_TLBIE_LOCK
+24:    lwarx   r0,0,r3
+       cmpwi   r0,0
+       bne     24b                     /* lock word non-zero: retry */
+       stwcx.  r8,0,r3
+       bne     24b
+       isync
+
+       ld      r7,KVM_HOST_LPCR(r4)    /* use kvm->arch.host_lpcr for HID4 */
+       li      r0,0x18f
+       rotldi  r0,r0,HID4_LPID5_SH     /* all lpid bits in HID4 = 1 */
+       or      r0,r7,r0
+       ptesync
+       sync
+       mtspr   SPRN_HID4,r0            /* switch to reserved LPID */
+       isync
+       li      r0,0
+       stw     r0,0(r3)                /* drop guest tlbie_lock */
+
+       /* invalidate the whole TLB */
+       li      r0,256                  /* 256 tlbiel operations, stepping r6 by 0x1000 */
+       mtctr   r0
+       li      r6,0
+25:    tlbiel  r6
+       addi    r6,r6,0x1000
+       bdnz    25b
+       ptesync
+
+       /* take native_tlbie_lock */
+       ld      r3,toc_tlbie_lock@toc(2)
+24:    lwarx   r0,0,r3
+       cmpwi   r0,0
+       bne     24b
+       stwcx.  r8,0,r3
+       bne     24b
+       isync
+
+       ld      r6,KVM_HOST_SDR1(r4)
+       mtspr   SPRN_SDR1,r6            /* switch to host page table */
+
+       /* Set up host HID4 value */
+       sync
+       mtspr   SPRN_HID4,r7            /* r7 still holds KVM_HOST_LPCR loaded above */
+       isync
+       li      r0,0
+       stw     r0,0(r3)                /* drop native_tlbie_lock */
+
+       lis     r8,0x7fff               /* MAX_INT@h */
+       mtspr   SPRN_HDEC,r8
+
+       /* Disable HDEC interrupts */
+       mfspr   r0,SPRN_HID0
+       li      r3,0
+       rldimi  r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1      /* insert a 0 bit at HID0_HDICE_SH */
+       sync
+       mtspr   SPRN_HID0,r0
+       mfspr   r0,SPRN_HID0            /* NOTE(review): repeated read-backs presumably required after writing HID0 on PPC970 - confirm */
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+
+       /* load host SLB entries */
+33:    ld      r8,PACA_SLBSHADOWPTR(r13)
+
+       .rept   SLB_NUM_BOLTED
+       ld      r5,SLBSHADOW_SAVEAREA(r8)
+       ld      r6,SLBSHADOW_SAVEAREA+8(r8)
+       andis.  r7,r5,SLB_ESID_V@h
+       beq     1f                      /* shadow entry not valid: skip the slbmte */
+       slbmte  r6,r5
+1:     addi    r8,r8,16                /* each shadow entry is two doublewords */
+       .endr
+
+       /* Save and reset AMR and UAMOR before turning on the MMU */
+BEGIN_FTR_SECTION
+       mfspr   r5,SPRN_AMR
+       mfspr   r6,SPRN_UAMOR
+       std     r5,VCPU_AMR(r9)
+       std     r6,VCPU_UAMOR(r9)
+       li      r6,0
+       mtspr   SPRN_AMR,r6             /* only AMR is written here; UAMOR is saved but left as is */
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+       /* Restore host DABR and DABRX */
+       ld      r5,HSTATE_DABR(r13)
+       li      r6,7                    /* DABRX is set to the constant 7 */
+       mtspr   SPRN_DABR,r5
+       mtspr   SPRN_DABRX,r6
+
+       /* Switch DSCR back to host value; guest DSCR is saved in the vcpu (r9) */
+BEGIN_FTR_SECTION
+       mfspr   r8, SPRN_DSCR           /* r8 = guest DSCR */
+       ld      r7, HSTATE_DSCR(r13)    /* r7 = host DSCR value (not a pointer) */
+       std     r8, VCPU_DSCR(r9)       /* base must be the vcpu pointer, not r7 */
+       mtspr   SPRN_DSCR, r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+       /* Save non-volatile GPRs */
+       std     r14, VCPU_GPR(r14)(r9)
+       std     r15, VCPU_GPR(r15)(r9)
+       std     r16, VCPU_GPR(r16)(r9)
+       std     r17, VCPU_GPR(r17)(r9)
+       std     r18, VCPU_GPR(r18)(r9)
+       std     r19, VCPU_GPR(r19)(r9)
+       std     r20, VCPU_GPR(r20)(r9)
+       std     r21, VCPU_GPR(r21)(r9)
+       std     r22, VCPU_GPR(r22)(r9)
+       std     r23, VCPU_GPR(r23)(r9)
+       std     r24, VCPU_GPR(r24)(r9)
+       std     r25, VCPU_GPR(r25)(r9)
+       std     r26, VCPU_GPR(r26)(r9)
+       std     r27, VCPU_GPR(r27)(r9)
+       std     r28, VCPU_GPR(r28)(r9)
+       std     r29, VCPU_GPR(r29)(r9)
+       std     r30, VCPU_GPR(r30)(r9)
+       std     r31, VCPU_GPR(r31)(r9)
+
+       /* Save SPRGs */
+       mfspr   r3, SPRN_SPRG0
+       mfspr   r4, SPRN_SPRG1
+       mfspr   r5, SPRN_SPRG2
+       mfspr   r6, SPRN_SPRG3
+       std     r3, VCPU_SPRG0(r9)
+       std     r4, VCPU_SPRG1(r9)
+       std     r5, VCPU_SPRG2(r9)
+       std     r6, VCPU_SPRG3(r9)
+
+       /* Increment yield count if they have a VPA */
+       ld      r8, VCPU_VPA(r9)        /* do they have a VPA? */
+       cmpdi   r8, 0                   /* cr0.eq from this compare is tested again after label 25 */
+       beq     25f
+       lwz     r3, LPPACA_YIELDCOUNT(r8)
+       addi    r3, r3, 1
+       stw     r3, LPPACA_YIELDCOUNT(r8)
+25:
+       /* Save PMU registers if requested */
+       /* r8 and cr0.eq are live here */
+       li      r3, 1
+       sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
+       mfspr   r4, SPRN_MMCR0          /* save MMCR0 */
+       mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable ints */
+       isync
+       beq     21f                     /* if no VPA, save PMU stuff anyway */
+       lbz     r7, LPPACA_PMCINUSE(r8)
+       cmpwi   r7, 0                   /* did they ask for PMU stuff to be saved? */
+       bne     21f
+       std     r3, VCPU_MMCR(r9)       /* if not, set saved MMCR0 to FC */
+       b       22f                     /* and skip saving MMCR1/MMCRA and the PMCs */
+21:    mfspr   r5, SPRN_MMCR1
+       mfspr   r6, SPRN_MMCRA
+       std     r4, VCPU_MMCR(r9)       /* r4 = MMCR0 as read before the freeze */
+       std     r5, VCPU_MMCR + 8(r9)
+       std     r6, VCPU_MMCR + 16(r9)
+       mfspr   r3, SPRN_PMC1
+       mfspr   r4, SPRN_PMC2
+       mfspr   r5, SPRN_PMC3
+       mfspr   r6, SPRN_PMC4
+       mfspr   r7, SPRN_PMC5
+       mfspr   r8, SPRN_PMC6
+BEGIN_FTR_SECTION
+       mfspr   r10, SPRN_PMC7          /* PMC7/8 are only read in the ARCH_201 section */
+       mfspr   r11, SPRN_PMC8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+       stw     r3, VCPU_PMC(r9)
+       stw     r4, VCPU_PMC + 4(r9)
+       stw     r5, VCPU_PMC + 8(r9)
+       stw     r6, VCPU_PMC + 12(r9)
+       stw     r7, VCPU_PMC + 16(r9)
+       stw     r8, VCPU_PMC + 20(r9)
+BEGIN_FTR_SECTION
+       stw     r10, VCPU_PMC + 24(r9)
+       stw     r11, VCPU_PMC + 28(r9)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+22:
+       /* save FP state */
+       mr      r3, r9                  /* r3 = vcpu pointer argument */
+       bl      .kvmppc_save_fp         /* overwrites r9; the vcpu is read through r3 below */
+
+       /* Secondary threads go off to take a nap on POWER7 */
+BEGIN_FTR_SECTION
+       lwz     r0,VCPU_PTID(r3)
+       cmpwi   r0,0
+       bne     secondary_nap
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+       /*
+        * Reload DEC.  HDEC interrupts were disabled when
+        * we reloaded the host's LPCR value.
+        */
+       ld      r3, HSTATE_DECEXP(r13)
+       mftb    r4
+       subf    r4, r4, r3              /* DEC = saved expiry - timebase now */
+       mtspr   SPRN_DEC, r4
+
+       /* Reload the host's PMU registers */
+       ld      r3, PACALPPACAPTR(r13)  /* is the host using the PMU? */
+       lbz     r4, LPPACA_PMCINUSE(r3)
+       cmpwi   r4, 0
+       beq     23f                     /* skip if not */
+       lwz     r3, HSTATE_PMC(r13)
+       lwz     r4, HSTATE_PMC + 4(r13)
+       lwz     r5, HSTATE_PMC + 8(r13)
+       lwz     r6, HSTATE_PMC + 12(r13)
+       lwz     r8, HSTATE_PMC + 16(r13)
+       lwz     r9, HSTATE_PMC + 20(r13)
+BEGIN_FTR_SECTION
+       lwz     r10, HSTATE_PMC + 24(r13)
+       lwz     r11, HSTATE_PMC + 28(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+       mtspr   SPRN_PMC1, r3
+       mtspr   SPRN_PMC2, r4
+       mtspr   SPRN_PMC3, r5
+       mtspr   SPRN_PMC4, r6
+       mtspr   SPRN_PMC5, r8
+       mtspr   SPRN_PMC6, r9
+BEGIN_FTR_SECTION
+       mtspr   SPRN_PMC7, r10
+       mtspr   SPRN_PMC8, r11
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+       ld      r3, HSTATE_MMCR(r13)
+       ld      r4, HSTATE_MMCR + 8(r13)
+       ld      r5, HSTATE_MMCR + 16(r13)
+       mtspr   SPRN_MMCR1, r4
+       mtspr   SPRN_MMCRA, r5
+       mtspr   SPRN_MMCR0, r3          /* MMCR0 is written last, after MMCR1/MMCRA */
+       isync
+23:
+       /*
+        * For external and machine check interrupts, we need
+        * to call the Linux handler to process the interrupt.
+        * We do that by jumping to the interrupt vector address
+        * which we have in r12.  The [h]rfid at the end of the
+        * handler will return to the book3s_hv_interrupts.S code.
+        * For other interrupts we do the rfid to get back
+        * to the book3s_interrupts.S code here.
+        */
+       ld      r8, HSTATE_VMHANDLER(r13)
+       ld      r7, HSTATE_HOST_MSR(r13)
+
+       cmpwi   r12, BOOK3S_INTERRUPT_EXTERNAL
+       beq     11f
+       cmpwi   r12, BOOK3S_INTERRUPT_MACHINE_CHECK     /* cr0.eq is consumed by beqctr below */
+
+       /* RFI into the highmem handler, or branch to interrupt handler */
+12:    mfmsr   r6
+       mtctr   r12                     /* ctr = interrupt vector address */
+       li      r0, MSR_RI
+       andc    r6, r6, r0
+       mtmsrd  r6, 1                   /* Clear RI in MSR */
+       mtsrr0  r8                      /* SRR0/1 = HSTATE_VMHANDLER / HSTATE_HOST_MSR */
+       mtsrr1  r7
+       beqctr                          /* cr0.eq set: branch to the vector in ctr */
+       RFI
+
+11:
+BEGIN_FTR_SECTION
+       b       12b                     /* ARCH_201: cr0.eq is still set from the EXTERNAL compare */
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+       mtspr   SPRN_HSRR0, r8
+       mtspr   SPRN_HSRR1, r7
+       ba      0x500
+
+6:     mfspr   r6,SPRN_HDAR            /* HDSI path: branched to from the FAULT_DAR/DSISR save */
+       mfspr   r7,SPRN_HDSISR
+       b       7b
+
+/*
+ * Try to handle an hcall in real mode.
+ * Returns to the guest if we handle it, or continues on up to
+ * the kernel if we can't (i.e. if we don't have a handler for
+ * it, or if the handler returns H_TOO_HARD).
+ */
+       .globl  hcall_try_real_mode
+hcall_try_real_mode:
+       ld      r3,VCPU_GPR(r3)(r9)     /* r3 = guest r3, used as the table index below */
+       andi.   r0,r11,MSR_PR           /* r11 = guest MSR */
+       bne     hcall_real_cont         /* MSR_PR set: not handled here */
+       clrrdi  r3,r3,2                 /* clear low 2 bits: byte offset of a .long table entry */
+       cmpldi  r3,hcall_real_table_end - hcall_real_table
+       bge     hcall_real_cont         /* beyond the end of the table */
+       LOAD_REG_ADDR(r4, hcall_real_table)
+       lwzx    r3,r3,r4                /* entry = handler address minus hcall_real_table, or 0 */
+       cmpwi   r3,0
+       beq     hcall_real_cont         /* no handler for this entry */
+       add     r3,r3,r4
+       mtctr   r3
+       mr      r3,r9           /* get vcpu pointer */
+       ld      r4,VCPU_GPR(r4)(r9)     /* r4 = guest r4, passed to the handler */
+       bctrl
+       cmpdi   r3,H_TOO_HARD
+       beq     hcall_real_fallback
+       ld      r4,HSTATE_KVM_VCPU(r13) /* reload the vcpu pointer after the call */
+       std     r3,VCPU_GPR(r3)(r4)     /* handler's return value becomes guest r3 */
+       ld      r10,VCPU_PC(r4)
+       ld      r11,VCPU_MSR(r4)
+       b       fast_guest_return
+
+       /* We've attempted a real mode hcall, but it has been punted back
+        * to userspace.  We need to restore some clobbered volatiles
+        * before resuming the pass-it-to-qemu path */
+hcall_real_fallback:
+       li      r12,BOOK3S_INTERRUPT_SYSCALL    /* r12 = trap vector again */
+       ld      r9, HSTATE_KVM_VCPU(r13)        /* r9 = vcpu pointer again */
+       ld      r11, VCPU_MSR(r9)               /* r11 = guest MSR again */
+
+       b       hcall_real_cont
+
+       .globl  hcall_real_table
+hcall_real_table:
+       .long   0               /* 0 - unused */
+       .long   .kvmppc_h_remove - hcall_real_table     /* 0x04 */
+       .long   .kvmppc_h_enter - hcall_real_table      /* 0x08 */
+       .long   .kvmppc_h_read - hcall_real_table       /* 0x0c */
+       .long   0               /* 0x10 - H_CLEAR_MOD */
+       .long   0               /* 0x14 - H_CLEAR_REF */
+       .long   .kvmppc_h_protect - hcall_real_table    /* 0x18 */
+       .long   0               /* 0x1c - H_GET_TCE */
+       .long   .kvmppc_h_put_tce - hcall_real_table    /* 0x20 */
+       .long   0               /* 0x24 - H_SET_SPRG0 */
+       .long   .kvmppc_h_set_dabr - hcall_real_table   /* 0x28 */
+       .long   0               /* 0x2c */
+       .long   0               /* 0x30 */
+       .long   0               /* 0x34 */
+       .long   0               /* 0x38 */
+       .long   0               /* 0x3c */
+       .long   0               /* 0x40 */
+       .long   0               /* 0x44 */
+       .long   0               /* 0x48 */
+       .long   0               /* 0x4c */
+       .long   0               /* 0x50 */
+       .long   0               /* 0x54 */
+       .long   0               /* 0x58 */
+       .long   0               /* 0x5c */
+       .long   0               /* 0x60 */
+       .long   0               /* 0x64 */
+       .long   0               /* 0x68 */
+       .long   0               /* 0x6c */
+       .long   0               /* 0x70 */
+       .long   0               /* 0x74 */
+       .long   0               /* 0x78 */
+       .long   0               /* 0x7c */
+       .long   0               /* 0x80 */
+       .long   0               /* 0x84 */
+       .long   0               /* 0x88 */
+       .long   0               /* 0x8c */
+       .long   0               /* 0x90 */
+       .long   0               /* 0x94 */
+       .long   0               /* 0x98 */
+       .long   0               /* 0x9c */
+       .long   0               /* 0xa0 */
+       .long   0               /* 0xa4 */
+       .long   0               /* 0xa8 */
+       .long   0               /* 0xac */
+       .long   0               /* 0xb0 */
+       .long   0               /* 0xb4 */
+       .long   0               /* 0xb8 */
+       .long   0               /* 0xbc */
+       .long   0               /* 0xc0 */
+       .long   0               /* 0xc4 */
+       .long   0               /* 0xc8 */
+       .long   0               /* 0xcc */
+       .long   0               /* 0xd0 */
+       .long   0               /* 0xd4 */
+       .long   0               /* 0xd8 */
+       .long   0               /* 0xdc */
+       .long   0               /* 0xe0 */
+       .long   0               /* 0xe4 */
+       .long   0               /* 0xe8 */
+       .long   0               /* 0xec */
+       .long   0               /* 0xf0 */
+       .long   0               /* 0xf4 */
+       .long   0               /* 0xf8 */
+       .long   0               /* 0xfc */
+       .long   0               /* 0x100 */
+       .long   0               /* 0x104 */
+       .long   0               /* 0x108 */
+       .long   0               /* 0x10c */
+       .long   0               /* 0x110 */
+       .long   0               /* 0x114 */
+       .long   0               /* 0x118 */
+       .long   0               /* 0x11c */
+       .long   0               /* 0x120 */
+       .long   .kvmppc_h_bulk_remove - hcall_real_table        /* 0x124 */
+hcall_real_table_end:
+
+ignore_hdec:
+       mr      r4,r9                   /* r4 = vcpu pointer for fast_guest_return */
+       b       fast_guest_return       /* r10/r11 still hold the guest PC/MSR saved at entry */
+
+bounce_ext_interrupt:
+       mr      r4,r9                   /* r4 = vcpu pointer for fast_guest_return */
+       mtspr   SPRN_SRR0,r10           /* r10/r11 = guest PC/MSR saved at entry */
+       mtspr   SPRN_SRR1,r11
+       li      r10,BOOK3S_INTERRUPT_EXTERNAL   /* presumably the new guest PC: the external interrupt vector */
+       LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME);        /* presumably the new guest MSR */
+       b       fast_guest_return
+
+_GLOBAL(kvmppc_h_set_dabr)             /* real-mode hcall handler: r3 = vcpu pointer, r4 = new DABR value */
+       std     r4,VCPU_DABR(r3)        /* record the value in the vcpu */
+       mtspr   SPRN_DABR,r4            /* and write it to the DABR register */
+       li      r3,0                    /* return 0 (presumably H_SUCCESS) */
+       blr
+
+secondary_too_late:
+       ld      r5,HSTATE_KVM_VCORE(r13)
+       HMT_LOW
+13:    lbz     r3,VCORE_IN_GUEST(r5)
+       cmpwi   r3,0
+       bne     13b                     /* spin until VCORE_IN_GUEST reads 0 */
+       HMT_MEDIUM
+       ld      r11,PACA_SLBSHADOWPTR(r13)
+
+       .rept   SLB_NUM_BOLTED          /* reload SLB entries from the SLB shadow save area */
+       ld      r5,SLBSHADOW_SAVEAREA(r11)
+       ld      r6,SLBSHADOW_SAVEAREA+8(r11)
+       andis.  r7,r5,SLB_ESID_V@h
+       beq     1f                      /* shadow entry not valid: skip the slbmte */
+       slbmte  r6,r5
+1:     addi    r11,r11,16
+       .endr
+       b       50f                     /* continue into the secondary_nap code */
+
+secondary_nap:
+       /* Clear any pending IPI */
+50:    ld      r5, HSTATE_XICS_PHYS(r13)
+       li      r0, 0xff
+       li      r6, XICS_QIRR
+       stbcix  r0, r5, r6              /* store 0xff at HSTATE_XICS_PHYS + XICS_QIRR */
+
+       /* increment the nap count and then go to nap mode */
+       ld      r4, HSTATE_KVM_VCORE(r13)
+       addi    r4, r4, VCORE_NAP_COUNT
+       lwsync                          /* make previous updates visible */
+51:    lwarx   r3, 0, r4
+       addi    r3, r3, 1
+       stwcx.  r3, 0, r4
+       bne     51b
+       isync
+
+       mfspr   r4, SPRN_LPCR
+       li      r0, LPCR_PECE
+       andc    r4, r4, r0              /* clear all LPCR_PECE bits ... */
+       ori     r4, r4, LPCR_PECE0      /* exit nap on interrupt */
+       mtspr   SPRN_LPCR, r4
+       li      r0, 0
+       std     r0, HSTATE_SCRATCH0(r13)
+       ptesync
+       ld      r0, HSTATE_SCRATCH0(r13)        /* NOTE(review): store/ptesync/load/compare presumably forces prior work to complete before nap - confirm */
+1:     cmpd    r0, r0
+       bne     1b
+       nap
+       b       .                       /* spin here if execution continues past nap */
+
+/*
+ * Save away FP, VMX and VSX registers, FPSCR, VSCR and VRSAVE into the vcpu.
+ * r3 = vcpu pointer; uses r6, r8, r9 as scratch and restores the entry MSR
+ */
+_GLOBAL(kvmppc_save_fp)
+       mfmsr   r9                      /* r9 = MSR on entry, written back before returning */
+       ori     r8,r9,MSR_FP
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+       oris    r8,r8,MSR_VEC@h
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+       oris    r8,r8,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+       mtmsrd  r8                      /* set MSR_FP (plus VEC/VSX where present) */
+       isync
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+       reg = 0
+       .rept   32
+       li      r6,reg*16+VCPU_VSRS     /* 16 bytes per VSR slot */
+       stxvd2x reg,r6,r3
+       reg = reg + 1
+       .endr
+FTR_SECTION_ELSE
+#endif
+       reg = 0
+       .rept   32
+       stfd    reg,reg*8+VCPU_FPRS(r3) /* 8 bytes per FPR slot */
+       reg = reg + 1
+       .endr
+#ifdef CONFIG_VSX
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
+#endif
+       mffs    fr0                     /* fr0 was saved above, so it can carry the FPSCR */
+       stfd    fr0,VCPU_FPSCR(r3)
+
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+       reg = 0
+       .rept   32
+       li      r6,reg*16+VCPU_VRS
+       stvx    reg,r6,r3
+       reg = reg + 1
+       .endr
+       mfvscr  vr0                     /* vr0 was saved above, so it can carry the VSCR */
+       li      r6,VCPU_VSCR
+       stvx    vr0,r6,r3
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+       mfspr   r6,SPRN_VRSAVE
+       stw     r6,VCPU_VRSAVE(r3)
+       mtmsrd  r9                      /* restore the MSR read on entry */
+       isync
+       blr
+
+/*
+ * Load up FP, VMX and VSX registers, FPSCR, VSCR and VRSAVE from the vcpu.
+ * r4 = vcpu pointer; uses r7, r8, r9 as scratch; the entry MSR is not restored
+ */
+       .globl  kvmppc_load_fp
+kvmppc_load_fp:
+       mfmsr   r9                      /* read but never written back in this routine */
+       ori     r8,r9,MSR_FP
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+       oris    r8,r8,MSR_VEC@h
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+       oris    r8,r8,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+       mtmsrd  r8                      /* set MSR_FP (plus VEC/VSX where present) */
+       isync
+       lfd     fr0,VCPU_FPSCR(r4)      /* FPSCR goes through fr0 before fr0 itself is loaded below */
+       MTFSF_L(fr0)
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+       reg = 0
+       .rept   32
+       li      r7,reg*16+VCPU_VSRS     /* 16 bytes per VSR slot */
+       lxvd2x  reg,r7,r4
+       reg = reg + 1
+       .endr
+FTR_SECTION_ELSE
+#endif
+       reg = 0
+       .rept   32
+       lfd     reg,reg*8+VCPU_FPRS(r4) /* 8 bytes per FPR slot */
+       reg = reg + 1
+       .endr
+#ifdef CONFIG_VSX
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
+#endif
+
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+       li      r7,VCPU_VSCR
+       lvx     vr0,r7,r4               /* VSCR goes through vr0 before vr0 itself is loaded below */
+       mtvscr  vr0
+       reg = 0
+       .rept   32
+       li      r7,reg*16+VCPU_VRS
+       lvx     reg,r7,r4
+       reg = reg + 1
+       .endr
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+       lwz     r7,VCPU_VRSAVE(r4)
+       mtspr   SPRN_VRSAVE,r7
+       blr
index 2f0bc92..c54b0e3 100644 (file)
@@ -29,8 +29,7 @@
 #define ULONG_SIZE             8
 #define FUNC(name)             GLUE(.,name)
 
-#define GET_SHADOW_VCPU(reg)    \
-        addi    reg, r13, PACA_KVM_SVCPU
+#define GET_SHADOW_VCPU_R13
 
 #define DISABLE_INTERRUPTS     \
        mfmsr   r0;             \
@@ -43,8 +42,8 @@
 #define ULONG_SIZE              4
 #define FUNC(name)             name
 
-#define GET_SHADOW_VCPU(reg)    \
-        lwz     reg, (THREAD + THREAD_KVM_SVCPU)(r2)
+#define GET_SHADOW_VCPU_R13    \
+       lwz     r13, (THREAD + THREAD_KVM_SVCPU)(r2)
 
 #define DISABLE_INTERRUPTS     \
        mfmsr   r0;             \
@@ -85,7 +84,7 @@
  *  r3: kvm_run pointer
  *  r4: vcpu pointer
  */
-_GLOBAL(__kvmppc_vcpu_entry)
+_GLOBAL(__kvmppc_vcpu_run)
 
 kvm_start_entry:
        /* Write correct stack frame */
@@ -107,17 +106,11 @@ kvm_start_entry:
        /* Load non-volatile guest state from the vcpu */
        VCPU_LOAD_NVGPRS(r4)
 
-       GET_SHADOW_VCPU(r5)
-
-       /* Save R1/R2 in the PACA */
-       PPC_STL r1, SVCPU_HOST_R1(r5)
-       PPC_STL r2, SVCPU_HOST_R2(r5)
+kvm_start_lightweight:
 
-       /* XXX swap in/out on load? */
+       GET_SHADOW_VCPU_R13
        PPC_LL  r3, VCPU_HIGHMEM_HANDLER(r4)
-       PPC_STL r3, SVCPU_VMHANDLER(r5)
-
-kvm_start_lightweight:
+       PPC_STL r3, HSTATE_VMHANDLER(r13)
 
        PPC_LL  r10, VCPU_SHADOW_MSR(r4)        /* r10 = vcpu->arch.shadow_msr */
 
index 79751d8..41cb001 100644 (file)
@@ -21,7 +21,6 @@
 #include <linux/kvm_host.h>
 #include <linux/hash.h>
 #include <linux/slab.h>
-#include "trace.h"
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -29,6 +28,8 @@
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
 
+#include "trace.h"
+
 #define PTE_SIZE       12
 
 static struct kmem_cache *hpte_cache;
@@ -58,30 +59,31 @@ static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage)
 void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
        u64 index;
+       struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 
        trace_kvm_book3s_mmu_map(pte);
 
-       spin_lock(&vcpu->arch.mmu_lock);
+       spin_lock(&vcpu3s->mmu_lock);
 
        /* Add to ePTE list */
        index = kvmppc_mmu_hash_pte(pte->pte.eaddr);
-       hlist_add_head_rcu(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]);
+       hlist_add_head_rcu(&pte->list_pte, &vcpu3s->hpte_hash_pte[index]);
 
        /* Add to ePTE_long list */
        index = kvmppc_mmu_hash_pte_long(pte->pte.eaddr);
        hlist_add_head_rcu(&pte->list_pte_long,
-                          &vcpu->arch.hpte_hash_pte_long[index]);
+                          &vcpu3s->hpte_hash_pte_long[index]);
 
        /* Add to vPTE list */
        index = kvmppc_mmu_hash_vpte(pte->pte.vpage);
-       hlist_add_head_rcu(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]);
+       hlist_add_head_rcu(&pte->list_vpte, &vcpu3s->hpte_hash_vpte[index]);
 
        /* Add to vPTE_long list */
        index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage);
        hlist_add_head_rcu(&pte->list_vpte_long,
-                          &vcpu->arch.hpte_hash_vpte_long[index]);
+                          &vcpu3s->hpte_hash_vpte_long[index]);
 
-       spin_unlock(&vcpu->arch.mmu_lock);
+       spin_unlock(&vcpu3s->mmu_lock);
 }
 
 static void free_pte_rcu(struct rcu_head *head)
@@ -92,16 +94,18 @@ static void free_pte_rcu(struct rcu_head *head)
 
 static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
+       struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+
        trace_kvm_book3s_mmu_invalidate(pte);
 
        /* Different for 32 and 64 bit */
        kvmppc_mmu_invalidate_pte(vcpu, pte);
 
-       spin_lock(&vcpu->arch.mmu_lock);
+       spin_lock(&vcpu3s->mmu_lock);
 
        /* pte already invalidated in between? */
        if (hlist_unhashed(&pte->list_pte)) {
-               spin_unlock(&vcpu->arch.mmu_lock);
+               spin_unlock(&vcpu3s->mmu_lock);
                return;
        }
 
@@ -115,14 +119,15 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
        else
                kvm_release_pfn_clean(pte->pfn);
 
-       spin_unlock(&vcpu->arch.mmu_lock);
+       spin_unlock(&vcpu3s->mmu_lock);
 
-       vcpu->arch.hpte_cache_count--;
+       vcpu3s->hpte_cache_count--;
        call_rcu(&pte->rcu_head, free_pte_rcu);
 }
 
 static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
 {
+       struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
        struct hpte_cache *pte;
        struct hlist_node *node;
        int i;
@@ -130,7 +135,7 @@ static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
        rcu_read_lock();
 
        for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
-               struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
+               struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i];
 
                hlist_for_each_entry_rcu(pte, node, list, list_vpte_long)
                        invalidate_pte(vcpu, pte);
@@ -141,12 +146,13 @@ static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
 
 static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
 {
+       struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
        struct hlist_head *list;
        struct hlist_node *node;
        struct hpte_cache *pte;
 
        /* Find the list of entries in the map */
-       list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
+       list = &vcpu3s->hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
 
        rcu_read_lock();
 
@@ -160,12 +166,13 @@ static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
 
 static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea)
 {
+       struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
        struct hlist_head *list;
        struct hlist_node *node;
        struct hpte_cache *pte;
 
        /* Find the list of entries in the map */
-       list = &vcpu->arch.hpte_hash_pte_long[
+       list = &vcpu3s->hpte_hash_pte_long[
                        kvmppc_mmu_hash_pte_long(guest_ea)];
 
        rcu_read_lock();
@@ -203,12 +210,13 @@ void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
 /* Flush with mask 0xfffffffff */
 static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
 {
+       struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
        struct hlist_head *list;
        struct hlist_node *node;
        struct hpte_cache *pte;
        u64 vp_mask = 0xfffffffffULL;
 
-       list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
+       list = &vcpu3s->hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
 
        rcu_read_lock();
 
@@ -223,12 +231,13 @@ static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
 /* Flush with mask 0xffffff000 */
 static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
 {
+       struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
        struct hlist_head *list;
        struct hlist_node *node;
        struct hpte_cache *pte;
        u64 vp_mask = 0xffffff000ULL;
 
-       list = &vcpu->arch.hpte_hash_vpte_long[
+       list = &vcpu3s->hpte_hash_vpte_long[
                kvmppc_mmu_hash_vpte_long(guest_vp)];
 
        rcu_read_lock();
@@ -261,6 +270,7 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
 
 void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
 {
+       struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
        struct hlist_node *node;
        struct hpte_cache *pte;
        int i;
@@ -270,7 +280,7 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
        rcu_read_lock();
 
        for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
-               struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
+               struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i];
 
                hlist_for_each_entry_rcu(pte, node, list, list_vpte_long)
                        if ((pte->pte.raddr >= pa_start) &&
@@ -283,12 +293,13 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
 
 struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
 {
+       struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
        struct hpte_cache *pte;
 
        pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL);
-       vcpu->arch.hpte_cache_count++;
+       vcpu3s->hpte_cache_count++;
 
-       if (vcpu->arch.hpte_cache_count == HPTEG_CACHE_NUM)
+       if (vcpu3s->hpte_cache_count == HPTEG_CACHE_NUM)
                kvmppc_mmu_pte_flush_all(vcpu);
 
        return pte;
@@ -309,17 +320,19 @@ static void kvmppc_mmu_hpte_init_hash(struct hlist_head *hash_list, int len)
 
 int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
 {
+       struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+
        /* init hpte lookup hashes */
-       kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte,
-                                 ARRAY_SIZE(vcpu->arch.hpte_hash_pte));
-       kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte_long,
-                                 ARRAY_SIZE(vcpu->arch.hpte_hash_pte_long));
-       kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte,
-                                 ARRAY_SIZE(vcpu->arch.hpte_hash_vpte));
-       kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long,
-                                 ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long));
-
-       spin_lock_init(&vcpu->arch.mmu_lock);
+       kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte,
+                                 ARRAY_SIZE(vcpu3s->hpte_hash_pte));
+       kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte_long,
+                                 ARRAY_SIZE(vcpu3s->hpte_hash_pte_long));
+       kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte,
+                                 ARRAY_SIZE(vcpu3s->hpte_hash_vpte));
+       kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_long,
+                                 ARRAY_SIZE(vcpu3s->hpte_hash_vpte_long));
+
+       spin_lock_init(&vcpu3s->mmu_lock);
 
        return 0;
 }
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
new file mode 100644 (file)
index 0000000..0c0d3f2
--- /dev/null
@@ -0,0 +1,1029 @@
+/*
+ * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *    Alexander Graf <agraf@suse.de>
+ *    Kevin Wolf <mail@kevin-wolf.de>
+ *    Paul Mackerras <paulus@samba.org>
+ *
+ * Description:
+ * Functions relating to running KVM on Book 3S processors where
+ * we don't have access to hypervisor mode, and we run the guest
+ * in problem state (user mode).
+ *
+ * This file is derived from arch/powerpc/kvm/44x.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu_context.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+
+#include "trace.h"
+
+/* #define EXIT_DEBUG */
+/* #define DEBUG_EXT */
+
+static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
+                            ulong msr);
+
+/* Some compatibility defines */
+#ifdef CONFIG_PPC_BOOK3S_32
+#define MSR_USER32 MSR_USER
+#define MSR_USER64 MSR_USER
+#define HW_PAGE_SIZE PAGE_SIZE
+#endif
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+       memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb));
+       memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu,
+              sizeof(get_paca()->shadow_vcpu));
+       to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max;
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_32
+       current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu;
+#endif
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+       memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb));
+       memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu,
+              sizeof(get_paca()->shadow_vcpu));
+       to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max;
+#endif
+
+       kvmppc_giveup_ext(vcpu, MSR_FP);
+       kvmppc_giveup_ext(vcpu, MSR_VEC);
+       kvmppc_giveup_ext(vcpu, MSR_VSX);
+}
+
+static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
+{
+       ulong smsr = vcpu->arch.shared->msr;
+
+       /* Guest MSR values */
+       smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE;
+       /* Process MSR values */
+       smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
+       /* External providers the guest reserved */
+       smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext);
+       /* 64-bit Process MSR values */
+#ifdef CONFIG_PPC_BOOK3S_64
+       smsr |= MSR_ISF | MSR_HV;
+#endif
+       vcpu->arch.shadow_msr = smsr;
+}
+
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
+{
+       ulong old_msr = vcpu->arch.shared->msr;
+
+#ifdef EXIT_DEBUG
+       printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
+#endif
+
+       msr &= to_book3s(vcpu)->msr_mask;
+       vcpu->arch.shared->msr = msr;
+       kvmppc_recalc_shadow_msr(vcpu);
+
+       if (msr & MSR_POW) {
+               if (!vcpu->arch.pending_exceptions) {
+                       kvm_vcpu_block(vcpu);
+                       vcpu->stat.halt_wakeup++;
+
+                       /* Unset POW bit after we woke up */
+                       msr &= ~MSR_POW;
+                       vcpu->arch.shared->msr = msr;
+               }
+       }
+
+       if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) !=
+                  (old_msr & (MSR_PR|MSR_IR|MSR_DR))) {
+               kvmppc_mmu_flush_segments(vcpu);
+               kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
+
+               /* Preload magic page segment when in kernel mode */
+               if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) {
+                       struct kvm_vcpu_arch *a = &vcpu->arch;
+
+                       if (msr & MSR_DR)
+                               kvmppc_mmu_map_segment(vcpu, a->magic_page_ea);
+                       else
+                               kvmppc_mmu_map_segment(vcpu, a->magic_page_pa);
+               }
+       }
+
+       /* Preload FPU if it's enabled */
+       if (vcpu->arch.shared->msr & MSR_FP)
+               kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
+}
+
+void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
+{
+       u32 host_pvr;
+
+       vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB;
+       vcpu->arch.pvr = pvr;
+#ifdef CONFIG_PPC_BOOK3S_64
+       if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
+               kvmppc_mmu_book3s_64_init(vcpu);
+               to_book3s(vcpu)->hior = 0xfff00000;
+               to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
+       } else
+#endif
+       {
+               kvmppc_mmu_book3s_32_init(vcpu);
+               to_book3s(vcpu)->hior = 0;
+               to_book3s(vcpu)->msr_mask = 0xffffffffULL;
+       }
+
+       /* If we are in hypervisor level on 970, we can tell the CPU to
+        * treat DCBZ as 32 bytes store */
+       vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32;
+       if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) &&
+           !strcmp(cur_cpu_spec->platform, "ppc970"))
+               vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
+
+       /* Cell performs badly if MSR_FEx are set. So let's hope nobody
+          really needs them in a VM on Cell and force disable them. */
+       if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be"))
+               to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1);
+
+#ifdef CONFIG_PPC_BOOK3S_32
+       /* 32 bit Book3S always has 32 byte dcbz */
+       vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
+#endif
+
+       /* On some CPUs we can execute paired single operations natively */
+       asm ( "mfpvr %0" : "=r"(host_pvr));
+       switch (host_pvr) {
+       case 0x00080200:        /* lonestar 2.0 */
+       case 0x00088202:        /* lonestar 2.2 */
+       case 0x70000100:        /* gekko 1.0 */
+       case 0x00080100:        /* gekko 2.0 */
+       case 0x00083203:        /* gekko 2.3a */
+       case 0x00083213:        /* gekko 2.3b */
+       case 0x00083204:        /* gekko 2.4 */
+       case 0x00083214:        /* gekko 2.4e (8SE) - retail HW2 */
+       case 0x00087200:        /* broadway */
+               vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS;
+               /* Enable HID2.PSE - in case we need it later */
+               mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29));
+       }
+}
+
+/* Book3s_32 CPUs always have a 32-byte cache line size, which Linux assumes.
+ * To make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz
+ * to emulate a 32-byte dcbz length.
+ *
+ * The Book3s_64 inventors also realized this case and implemented a special bit
+ * in the HID5 register, which is a hypervisor resource. Thus we can't use it.
+ *
+ * My approach here is to patch the dcbz instruction on executing pages.
+ */
+static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
+{
+       struct page *hpage;
+       u64 hpage_offset;
+       u32 *page;
+       int i;
+
+       hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
+       if (is_error_page(hpage)) {
+               kvm_release_page_clean(hpage);
+               return;
+       }
+
+       hpage_offset = pte->raddr & ~PAGE_MASK;
+       hpage_offset &= ~0xFFFULL;
+       hpage_offset /= 4;
+
+       get_page(hpage);
+       page = kmap_atomic(hpage, KM_USER0);
+
+       /* patch dcbz into reserved instruction, so we trap */
+       for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++)
+               if ((page[i] & 0xff0007ff) == INS_DCBZ)
+                       page[i] &= 0xfffffff7;
+
+       kunmap_atomic(page, KM_USER0);
+       put_page(hpage);
+}
+
+static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+       ulong mp_pa = vcpu->arch.magic_page_pa;
+
+       if (unlikely(mp_pa) &&
+           unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) {
+               return 1;
+       }
+
+       return kvm_is_visible_gfn(vcpu->kvm, gfn);
+}
+
+int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                           ulong eaddr, int vec)
+{
+       bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE);
+       int r = RESUME_GUEST;
+       int relocated;
+       int page_found = 0;
+       struct kvmppc_pte pte;
+       bool is_mmio = false;
+       bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false;
+       bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false;
+       u64 vsid;
+
+       relocated = data ? dr : ir;
+
+       /* Resolve real address if translation turned on */
+       if (relocated) {
+               page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data);
+       } else {
+               pte.may_execute = true;
+               pte.may_read = true;
+               pte.may_write = true;
+               pte.raddr = eaddr & KVM_PAM;
+               pte.eaddr = eaddr;
+               pte.vpage = eaddr >> 12;
+       }
+
+       switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
+       case 0:
+               pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12));
+               break;
+       case MSR_DR:
+       case MSR_IR:
+               vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
+
+               if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR)
+                       pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12));
+               else
+                       pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12));
+               pte.vpage |= vsid;
+
+               if (vsid == -1)
+                       page_found = -EINVAL;
+               break;
+       }
+
+       if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+          (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
+               /*
+                * If we do the dcbz hack, we have to NX on every execution,
+                * so we can patch the executing code. This renders our guest
+                * NX-less.
+                */
+               pte.may_execute = !data;
+       }
+
+       if (page_found == -ENOENT) {
+               /* Page not found in guest PTE entries */
+               vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
+               vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
+               vcpu->arch.shared->msr |=
+                       (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
+               kvmppc_book3s_queue_irqprio(vcpu, vec);
+       } else if (page_found == -EPERM) {
+               /* Storage protection */
+               vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
+               vcpu->arch.shared->dsisr =
+                       to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
+               vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
+               vcpu->arch.shared->msr |=
+                       (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
+               kvmppc_book3s_queue_irqprio(vcpu, vec);
+       } else if (page_found == -EINVAL) {
+               /* Page not found in guest SLB */
+               vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
+               kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
+       } else if (!is_mmio &&
+                  kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) {
+               /* The guest's PTE is not mapped yet. Map on the host */
+               kvmppc_mmu_map_page(vcpu, &pte);
+               if (data)
+                       vcpu->stat.sp_storage++;
+               else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+                       (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32)))
+                       kvmppc_patch_dcbz(vcpu, &pte);
+       } else {
+               /* MMIO */
+               vcpu->stat.mmio_exits++;
+               vcpu->arch.paddr_accessed = pte.raddr;
+               r = kvmppc_emulate_mmio(run, vcpu);
+               if ( r == RESUME_HOST_NV )
+                       r = RESUME_HOST;
+       }
+
+       return r;
+}
+
+static inline int get_fpr_index(int i)
+{
+#ifdef CONFIG_VSX
+       i *= 2;
+#endif
+       return i;
+}
+
+/* Give up external provider (FPU, Altivec, VSX) */
+void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
+{
+       struct thread_struct *t = &current->thread;
+       u64 *vcpu_fpr = vcpu->arch.fpr;
+#ifdef CONFIG_VSX
+       u64 *vcpu_vsx = vcpu->arch.vsr;
+#endif
+       u64 *thread_fpr = (u64*)t->fpr;
+       int i;
+
+       if (!(vcpu->arch.guest_owned_ext & msr))
+               return;
+
+#ifdef DEBUG_EXT
+       printk(KERN_INFO "Giving up ext 0x%lx\n", msr);
+#endif
+
+       switch (msr) {
+       case MSR_FP:
+               giveup_fpu(current);
+               for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
+                       vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
+
+               vcpu->arch.fpscr = t->fpscr.val;
+               break;
+       case MSR_VEC:
+#ifdef CONFIG_ALTIVEC
+               giveup_altivec(current);
+               memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
+               vcpu->arch.vscr = t->vscr;
+#endif
+               break;
+       case MSR_VSX:
+#ifdef CONFIG_VSX
+               __giveup_vsx(current);
+               for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
+                       vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
+#endif
+               break;
+       default:
+               BUG();
+       }
+
+       vcpu->arch.guest_owned_ext &= ~msr;
+       current->thread.regs->msr &= ~msr;
+       kvmppc_recalc_shadow_msr(vcpu);
+}
+
+static int kvmppc_read_inst(struct kvm_vcpu *vcpu)
+{
+       ulong srr0 = kvmppc_get_pc(vcpu);
+       u32 last_inst = kvmppc_get_last_inst(vcpu);
+       int ret;
+
+       ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
+       if (ret == -ENOENT) {
+               ulong msr = vcpu->arch.shared->msr;
+
+               msr = kvmppc_set_field(msr, 33, 33, 1);
+               msr = kvmppc_set_field(msr, 34, 36, 0);
+               vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0);
+               kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
+               return EMULATE_AGAIN;
+       }
+
+       return EMULATE_DONE;
+}
+
+static int kvmppc_check_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr)
+{
+
+       /* Need to do paired single emulation? */
+       if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE))
+               return EMULATE_DONE;
+
+       /* Read out the instruction */
+       if (kvmppc_read_inst(vcpu) == EMULATE_DONE)
+               /* Need to emulate */
+               return EMULATE_FAIL;
+
+       return EMULATE_AGAIN;
+}
+
+/* Handle external providers (FPU, Altivec, VSX) */
+static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
+                            ulong msr)
+{
+       struct thread_struct *t = &current->thread;
+       u64 *vcpu_fpr = vcpu->arch.fpr;
+#ifdef CONFIG_VSX
+       u64 *vcpu_vsx = vcpu->arch.vsr;
+#endif
+       u64 *thread_fpr = (u64*)t->fpr;
+       int i;
+
+       /* When we have paired singles, we emulate in software */
+       if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)
+               return RESUME_GUEST;
+
+       if (!(vcpu->arch.shared->msr & msr)) {
+               kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+               return RESUME_GUEST;
+       }
+
+       /* We already own the ext */
+       if (vcpu->arch.guest_owned_ext & msr) {
+               return RESUME_GUEST;
+       }
+
+#ifdef DEBUG_EXT
+       printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
+#endif
+
+       current->thread.regs->msr |= msr;
+
+       switch (msr) {
+       case MSR_FP:
+               for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
+                       thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
+
+               t->fpscr.val = vcpu->arch.fpscr;
+               t->fpexc_mode = 0;
+               kvmppc_load_up_fpu();
+               break;
+       case MSR_VEC:
+#ifdef CONFIG_ALTIVEC
+               memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr));
+               t->vscr = vcpu->arch.vscr;
+               t->vrsave = -1;
+               kvmppc_load_up_altivec();
+#endif
+               break;
+       case MSR_VSX:
+#ifdef CONFIG_VSX
+               for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
+                       thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
+               kvmppc_load_up_vsx();
+#endif
+               break;
+       default:
+               BUG();
+       }
+
+       vcpu->arch.guest_owned_ext |= msr;
+
+       kvmppc_recalc_shadow_msr(vcpu);
+
+       return RESUME_GUEST;
+}
+
+int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                       unsigned int exit_nr)
+{
+       int r = RESUME_HOST;
+
+       vcpu->stat.sum_exits++;
+
+       run->exit_reason = KVM_EXIT_UNKNOWN;
+       run->ready_for_interrupt_injection = 1;
+
+       trace_kvm_book3s_exit(exit_nr, vcpu);
+       kvm_resched(vcpu);
+       switch (exit_nr) {
+       case BOOK3S_INTERRUPT_INST_STORAGE:
+               vcpu->stat.pf_instruc++;
+
+#ifdef CONFIG_PPC_BOOK3S_32
+               /* We set segments as unused segments when invalidating them. So
+                * treat the respective fault as segment fault. */
+               if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT]
+                   == SR_INVALID) {
+                       kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
+                       r = RESUME_GUEST;
+                       break;
+               }
+#endif
+
+               /* only care about PTEG not found errors, but leave NX alone */
+               if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) {
+                       r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr);
+                       vcpu->stat.sp_instruc++;
+               } else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+                         (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
+                       /*
+                        * XXX If we do the dcbz hack we use the NX bit to flush&patch the page,
+                        *     so we can't use the NX bit inside the guest. Let's cross our fingers,
+                        *     that no guest that needs the dcbz hack does NX.
+                        */
+                       kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
+                       r = RESUME_GUEST;
+               } else {
+                       vcpu->arch.shared->msr |=
+                               to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
+                       kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+                       r = RESUME_GUEST;
+               }
+               break;
+       case BOOK3S_INTERRUPT_DATA_STORAGE:
+       {
+               ulong dar = kvmppc_get_fault_dar(vcpu);
+               vcpu->stat.pf_storage++;
+
+#ifdef CONFIG_PPC_BOOK3S_32
+               /* We set segments as unused segments when invalidating them. So
+                * treat the respective fault as segment fault. */
+               if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) {
+                       kvmppc_mmu_map_segment(vcpu, dar);
+                       r = RESUME_GUEST;
+                       break;
+               }
+#endif
+
+               /* The only case we need to handle is missing shadow PTEs */
+               if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) {
+                       r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
+               } else {
+                       vcpu->arch.shared->dar = dar;
+                       vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
+                       kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+                       r = RESUME_GUEST;
+               }
+               break;
+       }
+       case BOOK3S_INTERRUPT_DATA_SEGMENT:
+               if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) {
+                       vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
+                       kvmppc_book3s_queue_irqprio(vcpu,
+                               BOOK3S_INTERRUPT_DATA_SEGMENT);
+               }
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_INST_SEGMENT:
+               if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) {
+                       kvmppc_book3s_queue_irqprio(vcpu,
+                               BOOK3S_INTERRUPT_INST_SEGMENT);
+               }
+               r = RESUME_GUEST;
+               break;
+       /* We're good on these - the host merely wanted to get our attention */
+       case BOOK3S_INTERRUPT_DECREMENTER:
+               vcpu->stat.dec_exits++;
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_EXTERNAL:
+               vcpu->stat.ext_intr_exits++;
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_PERFMON:
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_PROGRAM:
+       {
+               enum emulation_result er;
+               ulong flags;
+
+program_interrupt:
+               flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull;
+
+               if (vcpu->arch.shared->msr & MSR_PR) {
+#ifdef EXIT_DEBUG
+                       printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
+#endif
+                       if ((kvmppc_get_last_inst(vcpu) & 0xff0007ff) !=
+                           (INS_DCBZ & 0xfffffff7)) {
+                               kvmppc_core_queue_program(vcpu, flags);
+                               r = RESUME_GUEST;
+                               break;
+                       }
+               }
+
+               vcpu->stat.emulated_inst_exits++;
+               er = kvmppc_emulate_instruction(run, vcpu);
+               switch (er) {
+               case EMULATE_DONE:
+                       r = RESUME_GUEST_NV;
+                       break;
+               case EMULATE_AGAIN:
+                       r = RESUME_GUEST;
+                       break;
+               case EMULATE_FAIL:
+                       printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
+                              __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
+                       kvmppc_core_queue_program(vcpu, flags);
+                       r = RESUME_GUEST;
+                       break;
+               case EMULATE_DO_MMIO:
+                       run->exit_reason = KVM_EXIT_MMIO;
+                       r = RESUME_HOST_NV;
+                       break;
+               default:
+                       BUG();
+               }
+               break;
+       }
+       case BOOK3S_INTERRUPT_SYSCALL:
+               if (vcpu->arch.osi_enabled &&
+                   (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
+                   (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
+                       /* MOL hypercalls */
+                       u64 *gprs = run->osi.gprs;
+                       int i;
+
+                       run->exit_reason = KVM_EXIT_OSI;
+                       for (i = 0; i < 32; i++)
+                               gprs[i] = kvmppc_get_gpr(vcpu, i);
+                       vcpu->arch.osi_needed = 1;
+                       r = RESUME_HOST_NV;
+               } else if (!(vcpu->arch.shared->msr & MSR_PR) &&
+                   (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
+                       /* KVM PV hypercalls */
+                       kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
+                       r = RESUME_GUEST;
+               } else {
+                       /* Guest syscalls */
+                       vcpu->stat.syscall_exits++;
+                       kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+                       r = RESUME_GUEST;
+               }
+               break;
+       case BOOK3S_INTERRUPT_FP_UNAVAIL:
+       case BOOK3S_INTERRUPT_ALTIVEC:
+       case BOOK3S_INTERRUPT_VSX:
+       {
+               int ext_msr = 0;
+
+               switch (exit_nr) {
+               case BOOK3S_INTERRUPT_FP_UNAVAIL: ext_msr = MSR_FP;  break;
+               case BOOK3S_INTERRUPT_ALTIVEC:    ext_msr = MSR_VEC; break;
+               case BOOK3S_INTERRUPT_VSX:        ext_msr = MSR_VSX; break;
+               }
+
+               switch (kvmppc_check_ext(vcpu, exit_nr)) {
+               case EMULATE_DONE:
+                       /* everything ok - let's enable the ext */
+                       r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr);
+                       break;
+               case EMULATE_FAIL:
+                       /* we need to emulate this instruction */
+                       goto program_interrupt;
+                       break;
+               default:
+                       /* nothing to worry about - go again */
+                       break;
+               }
+               break;
+       }
+       case BOOK3S_INTERRUPT_ALIGNMENT:
+               if (kvmppc_read_inst(vcpu) == EMULATE_DONE) {
+                       vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu,
+                               kvmppc_get_last_inst(vcpu));
+                       vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu,
+                               kvmppc_get_last_inst(vcpu));
+                       kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+               }
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_MACHINE_CHECK:
+       case BOOK3S_INTERRUPT_TRACE:
+               kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+               r = RESUME_GUEST;
+               break;
+       default:
+               /* Ugh - bork here! What did we get? */
+               printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
+                       exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1);
+               r = RESUME_HOST;
+               BUG();
+               break;
+       }
+
+
+       if (!(r & RESUME_HOST)) {
+               /* To avoid clobbering exit_reason, only check for signals if
+                * we aren't already exiting to userspace for some other
+                * reason. */
+               if (signal_pending(current)) {
+#ifdef EXIT_DEBUG
+                       printk(KERN_EMERG "KVM: Going back to host\n");
+#endif
+                       vcpu->stat.signal_exits++;
+                       run->exit_reason = KVM_EXIT_INTR;
+                       r = -EINTR;
+               } else {
+                       /* In case an interrupt came in that was triggered
+                        * from userspace (like DEC), we need to check what
+                        * to inject now! */
+                       kvmppc_core_deliver_interrupts(vcpu);
+               }
+       }
+
+       trace_kvm_book3s_reenter(r, vcpu);
+
+       return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+       struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+       int i;
+
+       sregs->pvr = vcpu->arch.pvr;
+
+       sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
+       if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
+               for (i = 0; i < 64; i++) {
+                       sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige | i;
+                       sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
+               }
+       } else {
+               for (i = 0; i < 16; i++)
+                       sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i];
+
+               for (i = 0; i < 8; i++) {
+                       sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw;
+                       sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Load PVR, SDR1 and segment state from @sregs into the vcpu.
+ *
+ * The PVR is applied first; the BOOK3S_HFLAG_SLB test that follows then
+ * selects between replaying the 64 SLB entries through mmu.slbmte() and
+ * replaying the 16 segment registers plus 8 BAT pairs.  The shadow MMU
+ * is flushed afterwards.  Always returns 0.
+ */
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+       struct kvmppc_vcpu_book3s *book3s = to_book3s(vcpu);
+       int n;
+
+       kvmppc_set_pvr(vcpu, sregs->pvr);
+       book3s->sdr1 = sregs->u.s.sdr1;
+
+       if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
+               for (n = 0; n < 64; n++)
+                       vcpu->arch.mmu.slbmte(vcpu,
+                                             sregs->u.s.ppc64.slb[n].slbv,
+                                             sregs->u.s.ppc64.slb[n].slbe);
+       } else {
+               for (n = 0; n < 16; n++)
+                       vcpu->arch.mmu.mtsrin(vcpu, n, sregs->u.s.ppc32.sr[n]);
+
+               /* Each BAT is written as two 32-bit halves: low, then high */
+               for (n = 0; n < 8; n++) {
+                       kvmppc_set_bat(vcpu, &book3s->ibat[n], false,
+                                      (u32)sregs->u.s.ppc32.ibat[n]);
+                       kvmppc_set_bat(vcpu, &book3s->ibat[n], true,
+                                      (u32)(sregs->u.s.ppc32.ibat[n] >> 32));
+                       kvmppc_set_bat(vcpu, &book3s->dbat[n], false,
+                                      (u32)sregs->u.s.ppc32.dbat[n]);
+                       kvmppc_set_bat(vcpu, &book3s->dbat[n], true,
+                                      (u32)(sregs->u.s.ppc32.dbat[n] >> 32));
+               }
+       }
+
+       /* Segment state changed, so drop every shadow PTE */
+       kvmppc_mmu_pte_flush(vcpu, 0, 0);
+
+       return 0;
+}
+
+/* Processor compatibility hook: performs no checks and always returns 0. */
+int kvmppc_core_check_processor_compat(void)
+{
+       return 0;
+}
+
+/*
+ * Allocate and initialise a Book3S vcpu with index @id for @kvm.
+ *
+ * Returns the embedded struct kvm_vcpu on success.  On failure an
+ * ERR_PTR() carrying a negative errno is returned and everything
+ * allocated up to that point is released again.
+ */
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+       struct kvmppc_vcpu_book3s *vcpu_book3s;
+       struct kvm_vcpu *vcpu;
+       int err = -ENOMEM;
+       unsigned long p;
+
+       vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s));
+       if (!vcpu_book3s)
+               goto out;
+
+       vcpu_book3s->shadow_vcpu = kzalloc(sizeof(*vcpu_book3s->shadow_vcpu),
+                                          GFP_KERNEL);
+       if (!vcpu_book3s->shadow_vcpu)
+               goto free_vcpu;
+
+       vcpu = &vcpu_book3s->vcpu;
+       err = kvm_vcpu_init(vcpu, kvm, id);
+       if (err)
+               goto free_shadow_vcpu;
+
+       /*
+        * kvm_vcpu_init() left err at 0.  Re-arm it so that a failed page
+        * allocation is reported as -ENOMEM rather than ERR_PTR(0) == NULL.
+        */
+       err = -ENOMEM;
+       p = __get_free_page(GFP_KERNEL|__GFP_ZERO);
+       if (!p)
+               goto uninit_vcpu;
+       /* the real shared page fills the last 4k of our page */
+       vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096);
+
+       vcpu->arch.host_retip = kvm_return_point;
+       vcpu->arch.host_msr = mfmsr();
+#ifdef CONFIG_PPC_BOOK3S_64
+       /* default to book3s_64 (970fx) */
+       vcpu->arch.pvr = 0x3C0301;
+#else
+       /* default to book3s_32 (750) */
+       vcpu->arch.pvr = 0x84202;
+#endif
+       kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
+       vcpu->arch.slb_nr = 64;
+
+       /* remember where some real-mode handlers are */
+       vcpu->arch.trampoline_lowmem = __pa(kvmppc_handler_lowmem_trampoline);
+       vcpu->arch.trampoline_enter = __pa(kvmppc_handler_trampoline_enter);
+       vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
+#ifdef CONFIG_PPC_BOOK3S_64
+       vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
+#else
+       vcpu->arch.rmcall = (ulong)kvmppc_rmcall;
+#endif
+
+       vcpu->arch.shadow_msr = MSR_USER64;
+
+       err = kvmppc_mmu_init(vcpu);
+       if (err < 0)
+               goto free_shared_page;
+
+       return vcpu;
+
+free_shared_page:
+       /* p is the page-aligned base that vcpu->arch.shared points into */
+       free_page(p);
+uninit_vcpu:
+       kvm_vcpu_uninit(vcpu);
+free_shadow_vcpu:
+       kfree(vcpu_book3s->shadow_vcpu);
+free_vcpu:
+       vfree(vcpu_book3s);
+out:
+       return ERR_PTR(err);
+}
+
+/*
+ * Release a vcpu: the page backing arch.shared, the generic vcpu state,
+ * the shadow vcpu and finally the containing Book3S vcpu structure.
+ */
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_vcpu_book3s *book3s = to_book3s(vcpu);
+       unsigned long shared_page;
+
+       /* arch.shared may point into the page; mask back to its base */
+       shared_page = (unsigned long)vcpu->arch.shared & PAGE_MASK;
+
+       free_page(shared_page);
+       kvm_vcpu_uninit(vcpu);
+       kfree(book3s->shadow_vcpu);
+       vfree(book3s);
+}
+
+/*
+ * Enter the guest on @vcpu via __kvmppc_vcpu_run().
+ *
+ * Before entry the calling task's FP registers, FPSCR and fpexc_mode (and,
+ * with CONFIG_ALTIVEC, its vector registers, VSCR and VRSAVE when used_vr
+ * is set) are copied from current->thread into locals on the stack; they
+ * are copied back after the guest returns.  The used_vr/used_vsr flags are
+ * restored as well.
+ *
+ * Returns -EINTR with exit_reason KVM_EXIT_INTR if a signal is already
+ * pending, otherwise the return value of __kvmppc_vcpu_run().
+ */
+int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+       int ret;
+       double fpr[32][TS_FPRWIDTH];
+       unsigned int fpscr;
+       int fpexc_mode;
+#ifdef CONFIG_ALTIVEC
+       vector128 vr[32];
+       vector128 vscr;
+       unsigned long uninitialized_var(vrsave);
+       int used_vr;
+#endif
+#ifdef CONFIG_VSX
+       int used_vsr;
+#endif
+       ulong ext_msr;
+
+       /* No need to go into the guest when all we do is going out */
+       if (signal_pending(current)) {
+               kvm_run->exit_reason = KVM_EXIT_INTR;
+               return -EINTR;
+       }
+
+       /* Save FPU state in stack */
+       /* giveup_fpu() is only called when MSR_FP is set for the task */
+       if (current->thread.regs->msr & MSR_FP)
+               giveup_fpu(current);
+       memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr));
+       fpscr = current->thread.fpscr.val;
+       fpexc_mode = current->thread.fpexc_mode;
+
+#ifdef CONFIG_ALTIVEC
+       /* Save Altivec state in stack */
+       /* vr/vscr/vrsave are only captured when used_vr was set on entry */
+       used_vr = current->thread.used_vr;
+       if (used_vr) {
+               if (current->thread.regs->msr & MSR_VEC)
+                       giveup_altivec(current);
+               memcpy(vr, current->thread.vr, sizeof(current->thread.vr));
+               vscr = current->thread.vscr;
+               vrsave = current->thread.vrsave;
+       }
+#endif
+
+#ifdef CONFIG_VSX
+       /* Save VSX state in stack */
+       /* Only the used_vsr flag is kept in a local here */
+       used_vsr = current->thread.used_vsr;
+       if (used_vsr && (current->thread.regs->msr & MSR_VSX))
+                       __giveup_vsx(current);
+#endif
+
+       /* Remember the MSR with disabled extensions */
+       ext_msr = current->thread.regs->msr;
+
+       /* Preload FPU if it's enabled */
+       if (vcpu->arch.shared->msr & MSR_FP)
+               kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
+
+       kvm_guest_enter();
+
+       ret = __kvmppc_vcpu_run(kvm_run, vcpu);
+
+       kvm_guest_exit();
+
+       /*
+        * NOTE(review): interrupts are disabled here and not re-enabled
+        * anywhere in this function - presumably a caller takes care of
+        * that; confirm.
+        */
+       local_irq_disable();
+
+       /* Put back the task MSR value recorded before guest entry */
+       current->thread.regs->msr = ext_msr;
+
+       /* Make sure we save the guest FPU/Altivec/VSX state */
+       kvmppc_giveup_ext(vcpu, MSR_FP);
+       kvmppc_giveup_ext(vcpu, MSR_VEC);
+       kvmppc_giveup_ext(vcpu, MSR_VSX);
+
+       /* Restore FPU state from stack */
+       memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
+       current->thread.fpscr.val = fpscr;
+       current->thread.fpexc_mode = fpexc_mode;
+
+#ifdef CONFIG_ALTIVEC
+       /* Restore Altivec state from stack */
+       /*
+        * Copy back only if Altivec was in use before entry (so the locals
+        * were filled in) and the thread is still flagged as using it.
+        */
+       if (used_vr && current->thread.used_vr) {
+               memcpy(current->thread.vr, vr, sizeof(current->thread.vr));
+               current->thread.vscr = vscr;
+               current->thread.vrsave = vrsave;
+       }
+       current->thread.used_vr = used_vr;
+#endif
+
+#ifdef CONFIG_VSX
+       current->thread.used_vsr = used_vsr;
+#endif
+
+       return ret;
+}
+
+/* Memory-region prepare hook: nothing to do here, always returns 0. */
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+                                     struct kvm_userspace_memory_region *mem)
+{
+       return 0;
+}
+
+/* Memory-region commit hook: intentionally empty. */
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem)
+{
+}
+
+/* VM init hook: no per-VM setup is done here, always returns 0. */
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+       return 0;
+}
+
+/* VM destroy hook: intentionally empty. */
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+}
+
+/*
+ * Module init: register with the generic KVM core, then set up the
+ * HPTE cache infrastructure.
+ *
+ * Returns 0 on success or the error from whichever step failed.  If the
+ * second step fails, the KVM registration is undone again, because the
+ * module exit handler does not run when init returns an error.
+ */
+static int kvmppc_book3s_init(void)
+{
+       int r;
+
+       r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
+                    THIS_MODULE);
+       if (r)
+               return r;
+
+       r = kvmppc_mmu_hpte_sysinit();
+       if (r)
+               kvm_exit();
+
+       return r;
+}
+
+/*
+ * Module exit: tear down in the reverse order of kvmppc_book3s_init() -
+ * first the HPTE infrastructure, then the generic KVM registration.
+ */
+static void kvmppc_book3s_exit(void)
+{
+       kvmppc_mmu_hpte_sysexit();
+       kvm_exit();
+}
+
+module_init(kvmppc_book3s_init);
+module_exit(kvmppc_book3s_exit);
index 1a1b344..c1f877c 100644 (file)
 #if defined(CONFIG_PPC_BOOK3S_64)
 
 #define LOAD_SHADOW_VCPU(reg)  GET_PACA(reg)                                   
-#define SHADOW_VCPU_OFF                PACA_KVM_SVCPU
 #define MSR_NOIRQ              MSR_KERNEL & ~(MSR_IR | MSR_DR)
 #define FUNC(name)             GLUE(.,name)
 
+/*
+ * Return from the interrupt with SRR0 advanced by 4, so execution resumes
+ * at the instruction following the one SRR0 pointed to.
+ */
+kvmppc_skip_interrupt:
+       /*
+        * Here all GPRs are unchanged from when the interrupt happened
+        * except for r13, which is saved in SPRG_SCRATCH0.
+        */
+       mfspr   r13, SPRN_SRR0          /* r13 = interrupted address */
+       addi    r13, r13, 4             /* step over one instruction */
+       mtspr   SPRN_SRR0, r13
+       GET_SCRATCH0(r13)               /* reload r13 from the scratch SPRG */
+       rfid
+       b       .                       /* not reached after rfid */
+
+/*
+ * Same as kvmppc_skip_interrupt, but for interrupts that use HSRR0 and
+ * return with hrfid.
+ */
+kvmppc_skip_Hinterrupt:
+       /*
+        * Here all GPRs are unchanged from when the interrupt happened
+        * except for r13, which is saved in SPRG_SCRATCH0.
+        */
+       mfspr   r13, SPRN_HSRR0         /* r13 = interrupted address */
+       addi    r13, r13, 4             /* step over one instruction */
+       mtspr   SPRN_HSRR0, r13
+       GET_SCRATCH0(r13)               /* reload r13 from the scratch SPRG */
+       hrfid
+       b       .                       /* not reached after hrfid */
+
 #elif defined(CONFIG_PPC_BOOK3S_32)
 
-#define LOAD_SHADOW_VCPU(reg)                                          \
-       mfspr   reg, SPRN_SPRG_THREAD;                                  \
-       lwz     reg, THREAD_KVM_SVCPU(reg);                             \
-       /* PPC32 can have a NULL pointer - let's check for that */      \
-       mtspr   SPRN_SPRG_SCRATCH1, r12;        /* Save r12 */          \
-       mfcr    r12;                                                    \
-       cmpwi   reg, 0;                                                 \
-       bne     1f;                                                     \
-       mfspr   reg, SPRN_SPRG_SCRATCH0;                                \
-       mtcr    r12;                                                    \
-       mfspr   r12, SPRN_SPRG_SCRATCH1;                                \
-       b       kvmppc_resume_\intno;                                   \
-1:;                                                                    \
-       mtcr    r12;                                                    \
-       mfspr   r12, SPRN_SPRG_SCRATCH1;                                \
-       tophys(reg, reg)
-
-#define SHADOW_VCPU_OFF                0
 #define MSR_NOIRQ              MSR_KERNEL
 #define FUNC(name)             name
 
-#endif
-
 .macro INTERRUPT_TRAMPOLINE intno
 
 .global kvmppc_trampoline_\intno
 kvmppc_trampoline_\intno:
 
-       SET_SCRATCH0(r13)               /* Save r13 */
+       mtspr   SPRN_SPRG_SCRATCH0, r13         /* Save r13 */
 
        /*
         * First thing to do is to find out if we're coming
@@ -78,19 +81,28 @@ kvmppc_trampoline_\intno:
         *
         * To distinguish, we check a magic byte in the PACA/current
         */
-       LOAD_SHADOW_VCPU(r13)
-       PPC_STL r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
+       mfspr   r13, SPRN_SPRG_THREAD
+       lwz     r13, THREAD_KVM_SVCPU(r13)
+       /* PPC32 can have a NULL pointer - let's check for that */
+       mtspr   SPRN_SPRG_SCRATCH1, r12         /* Save r12 */
        mfcr    r12
-       stw     r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
-       lbz     r12, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
+       cmpwi   r13, 0
+       bne     1f
+2:     mtcr    r12
+       mfspr   r12, SPRN_SPRG_SCRATCH1
+       mfspr   r13, SPRN_SPRG_SCRATCH0         /* r13 = original r13 */
+       b       kvmppc_resume_\intno            /* Get back original handler */
+
+1:     tophys(r13, r13)
+       stw     r12, HSTATE_SCRATCH1(r13)
+       mfspr   r12, SPRN_SPRG_SCRATCH1
+       stw     r12, HSTATE_SCRATCH0(r13)
+       lbz     r12, HSTATE_IN_GUEST(r13)
        cmpwi   r12, KVM_GUEST_MODE_NONE
        bne     ..kvmppc_handler_hasmagic_\intno
        /* No KVM guest? Then jump back to the Linux handler! */
-       lwz     r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
-       mtcr    r12
-       PPC_LL  r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
-       GET_SCRATCH0(r13)                       /* r13 = original r13 */
-       b       kvmppc_resume_\intno            /* Get back original handler */
+       lwz     r12, HSTATE_SCRATCH1(r13)
+       b       2b
 
        /* Now we know we're handling a KVM guest */
 ..kvmppc_handler_hasmagic_\intno:
@@ -112,9 +124,6 @@ INTERRUPT_TRAMPOLINE        BOOK3S_INTERRUPT_MACHINE_CHECK
 INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_DATA_STORAGE
 INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_INST_STORAGE
 INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_EXTERNAL
-#ifdef CONFIG_PPC_BOOK3S_64
-INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_EXTERNAL_HV
-#endif
 INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_ALIGNMENT
 INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_PROGRAM
 INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_FP_UNAVAIL
@@ -124,14 +133,6 @@ INTERRUPT_TRAMPOLINE       BOOK3S_INTERRUPT_TRACE
 INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_PERFMON
 INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_ALTIVEC
 
-/* Those are only available on 64 bit machines */
-
-#ifdef CONFIG_PPC_BOOK3S_64
-INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_DATA_SEGMENT
-INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_INST_SEGMENT
-INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_VSX
-#endif
-
 /*
  * Bring us back to the faulting code, but skip the
  * faulting instruction.
@@ -143,8 +144,8 @@ INTERRUPT_TRAMPOLINE        BOOK3S_INTERRUPT_VSX
  *
  * R12            = free
  * R13            = Shadow VCPU (PACA)
- * SVCPU.SCRATCH0 = guest R12
- * SVCPU.SCRATCH1 = guest CR
+ * HSTATE.SCRATCH0 = guest R12
+ * HSTATE.SCRATCH1 = guest CR
  * SPRG_SCRATCH0  = guest R13
  *
  */
@@ -156,13 +157,14 @@ kvmppc_handler_skip_ins:
        mtsrr0  r12
 
        /* Clean up all state */
-       lwz     r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
+       lwz     r12, HSTATE_SCRATCH1(r13)
        mtcr    r12
-       PPC_LL  r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
+       PPC_LL  r12, HSTATE_SCRATCH0(r13)
        GET_SCRATCH0(r13)
 
        /* And get back into the code */
        RFI
+#endif
 
 /*
  * This trampoline brings us back to a real mode handler
@@ -251,12 +253,4 @@ define_load_up(altivec)
 define_load_up(vsx)
 #endif
 
-.global kvmppc_trampoline_lowmem
-kvmppc_trampoline_lowmem:
-       PPC_LONG kvmppc_handler_lowmem_trampoline - CONFIG_KERNEL_START
-
-.global kvmppc_trampoline_enter
-kvmppc_trampoline_enter:
-       PPC_LONG kvmppc_handler_trampoline_enter - CONFIG_KERNEL_START
-
 #include "book3s_segment.S"
index 4512642..aed32e5 100644 (file)
@@ -22,7 +22,7 @@
 #if defined(CONFIG_PPC_BOOK3S_64)
 
 #define GET_SHADOW_VCPU(reg)    \
-       addi    reg, r13, PACA_KVM_SVCPU
+       mr      reg, r13
 
 #elif defined(CONFIG_PPC_BOOK3S_32)
 
@@ -71,6 +71,10 @@ kvmppc_handler_trampoline_enter:
        /* r3 = shadow vcpu */
        GET_SHADOW_VCPU(r3)
 
+       /* Save R1/R2 in the PACA (64-bit) or shadow_vcpu (32-bit) */
+       PPC_STL r1, HSTATE_HOST_R1(r3)
+       PPC_STL r2, HSTATE_HOST_R2(r3)
+
        /* Move SRR0 and SRR1 into the respective regs */
        PPC_LL  r9, SVCPU_PC(r3)
        mtsrr0  r9
@@ -78,36 +82,36 @@ kvmppc_handler_trampoline_enter:
 
        /* Activate guest mode, so faults get handled by KVM */
        li      r11, KVM_GUEST_MODE_GUEST
-       stb     r11, SVCPU_IN_GUEST(r3)
+       stb     r11, HSTATE_IN_GUEST(r3)
 
        /* Switch to guest segment. This is subarch specific. */
        LOAD_GUEST_SEGMENTS
 
        /* Enter guest */
 
-       PPC_LL  r4, (SVCPU_CTR)(r3)
-       PPC_LL  r5, (SVCPU_LR)(r3)
-       lwz     r6, (SVCPU_CR)(r3)
-       lwz     r7, (SVCPU_XER)(r3)
+       PPC_LL  r4, SVCPU_CTR(r3)
+       PPC_LL  r5, SVCPU_LR(r3)
+       lwz     r6, SVCPU_CR(r3)
+       lwz     r7, SVCPU_XER(r3)
 
        mtctr   r4
        mtlr    r5
        mtcr    r6
        mtxer   r7
 
-       PPC_LL  r0, (SVCPU_R0)(r3)
-       PPC_LL  r1, (SVCPU_R1)(r3)
-       PPC_LL  r2, (SVCPU_R2)(r3)
-       PPC_LL  r4, (SVCPU_R4)(r3)
-       PPC_LL  r5, (SVCPU_R5)(r3)
-       PPC_LL  r6, (SVCPU_R6)(r3)
-       PPC_LL  r7, (SVCPU_R7)(r3)
-       PPC_LL  r8, (SVCPU_R8)(r3)
-       PPC_LL  r9, (SVCPU_R9)(r3)
-       PPC_LL  r10, (SVCPU_R10)(r3)
-       PPC_LL  r11, (SVCPU_R11)(r3)
-       PPC_LL  r12, (SVCPU_R12)(r3)
-       PPC_LL  r13, (SVCPU_R13)(r3)
+       PPC_LL  r0, SVCPU_R0(r3)
+       PPC_LL  r1, SVCPU_R1(r3)
+       PPC_LL  r2, SVCPU_R2(r3)
+       PPC_LL  r4, SVCPU_R4(r3)
+       PPC_LL  r5, SVCPU_R5(r3)
+       PPC_LL  r6, SVCPU_R6(r3)
+       PPC_LL  r7, SVCPU_R7(r3)
+       PPC_LL  r8, SVCPU_R8(r3)
+       PPC_LL  r9, SVCPU_R9(r3)
+       PPC_LL  r10, SVCPU_R10(r3)
+       PPC_LL  r11, SVCPU_R11(r3)
+       PPC_LL  r12, SVCPU_R12(r3)
+       PPC_LL  r13, SVCPU_R13(r3)
 
        PPC_LL  r3, (SVCPU_R3)(r3)
 
@@ -125,56 +129,63 @@ kvmppc_handler_trampoline_enter_end:
 .global kvmppc_handler_trampoline_exit
 kvmppc_handler_trampoline_exit:
 
+.global kvmppc_interrupt
+kvmppc_interrupt:
+
        /* Register usage at this point:
         *
         * SPRG_SCRATCH0  = guest R13
         * R12            = exit handler id
-        * R13            = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64]
-        * SVCPU.SCRATCH0 = guest R12
-        * SVCPU.SCRATCH1 = guest CR
+        * R13            = shadow vcpu (32-bit) or PACA (64-bit)
+        * HSTATE.SCRATCH0 = guest R12
+        * HSTATE.SCRATCH1 = guest CR
         *
         */
 
        /* Save registers */
 
-       PPC_STL r0, (SHADOW_VCPU_OFF + SVCPU_R0)(r13)
-       PPC_STL r1, (SHADOW_VCPU_OFF + SVCPU_R1)(r13)
-       PPC_STL r2, (SHADOW_VCPU_OFF + SVCPU_R2)(r13)
-       PPC_STL r3, (SHADOW_VCPU_OFF + SVCPU_R3)(r13)
-       PPC_STL r4, (SHADOW_VCPU_OFF + SVCPU_R4)(r13)
-       PPC_STL r5, (SHADOW_VCPU_OFF + SVCPU_R5)(r13)
-       PPC_STL r6, (SHADOW_VCPU_OFF + SVCPU_R6)(r13)
-       PPC_STL r7, (SHADOW_VCPU_OFF + SVCPU_R7)(r13)
-       PPC_STL r8, (SHADOW_VCPU_OFF + SVCPU_R8)(r13)
-       PPC_STL r9, (SHADOW_VCPU_OFF + SVCPU_R9)(r13)
-       PPC_STL r10, (SHADOW_VCPU_OFF + SVCPU_R10)(r13)
-       PPC_STL r11, (SHADOW_VCPU_OFF + SVCPU_R11)(r13)
+       PPC_STL r0, SVCPU_R0(r13)
+       PPC_STL r1, SVCPU_R1(r13)
+       PPC_STL r2, SVCPU_R2(r13)
+       PPC_STL r3, SVCPU_R3(r13)
+       PPC_STL r4, SVCPU_R4(r13)
+       PPC_STL r5, SVCPU_R5(r13)
+       PPC_STL r6, SVCPU_R6(r13)
+       PPC_STL r7, SVCPU_R7(r13)
+       PPC_STL r8, SVCPU_R8(r13)
+       PPC_STL r9, SVCPU_R9(r13)
+       PPC_STL r10, SVCPU_R10(r13)
+       PPC_STL r11, SVCPU_R11(r13)
 
        /* Restore R1/R2 so we can handle faults */
-       PPC_LL  r1, (SHADOW_VCPU_OFF + SVCPU_HOST_R1)(r13)
-       PPC_LL  r2, (SHADOW_VCPU_OFF + SVCPU_HOST_R2)(r13)
+       PPC_LL  r1, HSTATE_HOST_R1(r13)
+       PPC_LL  r2, HSTATE_HOST_R2(r13)
 
        /* Save guest PC and MSR */
+#ifdef CONFIG_PPC64
+BEGIN_FTR_SECTION
        andi.   r0,r12,0x2
        beq     1f
        mfspr   r3,SPRN_HSRR0
        mfspr   r4,SPRN_HSRR1
        andi.   r12,r12,0x3ffd
        b       2f
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+#endif
 1:     mfsrr0  r3
        mfsrr1  r4
 2:
-       PPC_STL r3, (SHADOW_VCPU_OFF + SVCPU_PC)(r13)
-       PPC_STL r4, (SHADOW_VCPU_OFF + SVCPU_SHADOW_SRR1)(r13)
+       PPC_STL r3, SVCPU_PC(r13)
+       PPC_STL r4, SVCPU_SHADOW_SRR1(r13)
 
        /* Get scratch'ed off registers */
        GET_SCRATCH0(r9)
-       PPC_LL  r8, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
-       lwz     r7, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
+       PPC_LL  r8, HSTATE_SCRATCH0(r13)
+       lwz     r7, HSTATE_SCRATCH1(r13)
 
-       PPC_STL r9, (SHADOW_VCPU_OFF + SVCPU_R13)(r13)
-       PPC_STL r8, (SHADOW_VCPU_OFF + SVCPU_R12)(r13)
-       stw     r7, (SHADOW_VCPU_OFF + SVCPU_CR)(r13)
+       PPC_STL r9, SVCPU_R13(r13)
+       PPC_STL r8, SVCPU_R12(r13)
+       stw     r7, SVCPU_CR(r13)
 
        /* Save more register state  */
 
@@ -184,11 +195,11 @@ kvmppc_handler_trampoline_exit:
        mfctr   r8
        mflr    r9
 
-       stw     r5, (SHADOW_VCPU_OFF + SVCPU_XER)(r13)
-       PPC_STL r6, (SHADOW_VCPU_OFF + SVCPU_FAULT_DAR)(r13)
-       stw     r7, (SHADOW_VCPU_OFF + SVCPU_FAULT_DSISR)(r13)
-       PPC_STL r8, (SHADOW_VCPU_OFF + SVCPU_CTR)(r13)
-       PPC_STL r9, (SHADOW_VCPU_OFF + SVCPU_LR)(r13)
+       stw     r5, SVCPU_XER(r13)
+       PPC_STL r6, SVCPU_FAULT_DAR(r13)
+       stw     r7, SVCPU_FAULT_DSISR(r13)
+       PPC_STL r8, SVCPU_CTR(r13)
+       PPC_STL r9, SVCPU_LR(r13)
 
        /*
         * In order for us to easily get the last instruction,
@@ -218,7 +229,7 @@ ld_last_inst:
        /* Set guest mode to 'jump over instruction' so if lwz faults
         * we'll just continue at the next IP. */
        li      r9, KVM_GUEST_MODE_SKIP
-       stb     r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
+       stb     r9, HSTATE_IN_GUEST(r13)
 
        /*    1) enable paging for data */
        mfmsr   r9
@@ -232,13 +243,13 @@ ld_last_inst:
        sync
 
 #endif
-       stw     r0, (SHADOW_VCPU_OFF + SVCPU_LAST_INST)(r13)
+       stw     r0, SVCPU_LAST_INST(r13)
 
 no_ld_last_inst:
 
        /* Unset guest mode */
        li      r9, KVM_GUEST_MODE_NONE
-       stb     r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
+       stb     r9, HSTATE_IN_GUEST(r13)
 
        /* Switch back to host MMU */
        LOAD_HOST_SEGMENTS
@@ -248,7 +259,7 @@ no_ld_last_inst:
         * R1       = host R1
         * R2       = host R2
         * R12      = exit handler id
-        * R13      = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64]
+        * R13      = shadow vcpu (32-bit) or PACA (64-bit)
         * SVCPU.*  = guest *
         *
         */
@@ -258,7 +269,7 @@ no_ld_last_inst:
        ori     r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME     /* Enable paging */
        mtsrr1  r7
        /* Load highmem handler address */
-       PPC_LL  r8, (SHADOW_VCPU_OFF + SVCPU_VMHANDLER)(r13)
+       PPC_LL  r8, HSTATE_VMHANDLER(r13)
        mtsrr0  r8
 
        RFI
index 8462b3a..ee45fa0 100644 (file)
@@ -13,6 +13,7 @@
  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  *
  * Copyright IBM Corp. 2007
+ * Copyright 2010-2011 Freescale Semiconductor, Inc.
  *
  * Authors: Hollis Blanchard <hollisb@us.ibm.com>
  *          Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
@@ -78,6 +79,60 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
        }
 }
 
+#ifdef CONFIG_SPE
+/*
+ * Save the guest's SPE state with kvmppc_save_guest_spe() and clear
+ * MSR_SPE in the vcpu's shadow MSR.  The whole sequence runs with
+ * preemption disabled.
+ */
+void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu)
+{
+       preempt_disable();
+       /* presumably makes SPE usable by the host for the save - confirm */
+       enable_kernel_spe();
+       kvmppc_save_guest_spe(vcpu);
+       vcpu->arch.shadow_msr &= ~MSR_SPE;
+       preempt_enable();
+}
+
+/*
+ * Load the guest's SPE state with kvmppc_load_guest_spe() and set
+ * MSR_SPE in the vcpu's shadow MSR.  The whole sequence runs with
+ * preemption disabled.
+ */
+static void kvmppc_vcpu_enable_spe(struct kvm_vcpu *vcpu)
+{
+       preempt_disable();
+       /* presumably makes SPE usable by the host for the load - confirm */
+       enable_kernel_spe();
+       kvmppc_load_guest_spe(vcpu);
+       vcpu->arch.shadow_msr |= MSR_SPE;
+       preempt_enable();
+}
+
+/*
+ * Bring the shadow MSR's SPE bit in line with the guest-visible MSR:
+ * enable SPE when the guest has it set but the shadow does not, disable
+ * it in the opposite case, and do nothing when both already agree.
+ */
+static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu)
+{
+       int guest_spe = !!(vcpu->arch.shared->msr & MSR_SPE);
+       int shadow_spe = !!(vcpu->arch.shadow_msr & MSR_SPE);
+
+       if (guest_spe == shadow_spe)
+               return;
+
+       if (guest_spe)
+               kvmppc_vcpu_enable_spe(vcpu);
+       else
+               kvmppc_vcpu_disable_spe(vcpu);
+}
+#else
+/* Without CONFIG_SPE there is no SPE state to synchronise: no-op. */
+static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu)
+{
+}
+#endif
+
+/*
+ * Helper for "full" guest MSR writes; callers that only change
+ * EE/CE/ME/DE/RI do not need it.
+ *
+ * Stores @new_msr as the guest MSR, passes the previous value to the MMU
+ * code, blocks the vcpu if MSR_WE is now set, and finally resynchronises
+ * the SPE state.
+ */
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
+{
+       const u32 prev_msr = vcpu->arch.shared->msr;
+
+       vcpu->arch.shared->msr = new_msr;
+       kvmppc_mmu_msr_notify(vcpu, prev_msr);
+
+       if (vcpu->arch.shared->msr & MSR_WE) {
+               kvm_vcpu_block(vcpu);
+               kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
+       }
+
+       kvmppc_vcpu_sync_spe(vcpu);
+}
+
 static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu,
                                        unsigned int priority)
 {
@@ -257,6 +312,19 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
                vcpu->arch.shared->int_pending = 0;
 }
 
+/*
+ * Enter the guest on @vcpu with interrupts disabled around the
+ * kvm_guest_enter()/kvm_guest_exit() window, and hand back whatever
+ * __kvmppc_vcpu_run() returned.
+ */
+int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+       int rc;
+
+       local_irq_disable();
+
+       kvm_guest_enter();
+       rc = __kvmppc_vcpu_run(kvm_run, vcpu);
+       kvm_guest_exit();
+
+       local_irq_enable();
+
+       return rc;
+}
+
 /**
  * kvmppc_handle_exit
  *
@@ -344,10 +412,16 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                r = RESUME_GUEST;
                break;
 
-       case BOOKE_INTERRUPT_SPE_UNAVAIL:
-               kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_UNAVAIL);
+#ifdef CONFIG_SPE
+       case BOOKE_INTERRUPT_SPE_UNAVAIL: {
+               if (vcpu->arch.shared->msr & MSR_SPE)
+                       kvmppc_vcpu_enable_spe(vcpu);
+               else
+                       kvmppc_booke_queue_irqprio(vcpu,
+                                                  BOOKE_IRQPRIO_SPE_UNAVAIL);
                r = RESUME_GUEST;
                break;
+       }
 
        case BOOKE_INTERRUPT_SPE_FP_DATA:
                kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_DATA);
@@ -358,6 +432,28 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_ROUND);
                r = RESUME_GUEST;
                break;
+#else
+       case BOOKE_INTERRUPT_SPE_UNAVAIL:
+               /*
+                * Guest wants SPE, but host kernel doesn't support it.  Send
+                * an "unimplemented operation" program check to the guest.
+                */
+               kvmppc_core_queue_program(vcpu, ESR_PUO | ESR_SPV);
+               r = RESUME_GUEST;
+               break;
+
+       /*
+        * These really should never happen without CONFIG_SPE,
+        * as we should never enable the real MSR[SPE] in the guest.
+        */
+       case BOOKE_INTERRUPT_SPE_FP_DATA:
+       case BOOKE_INTERRUPT_SPE_FP_ROUND:
+               printk(KERN_CRIT "%s: unexpected SPE interrupt %u at %08lx\n",
+                      __func__, exit_nr, vcpu->arch.pc);
+               run->hw.hardware_exit_reason = exit_nr;
+               r = RESUME_HOST;
+               break;
+#endif
 
        case BOOKE_INTERRUPT_DATA_STORAGE:
                kvmppc_core_queue_data_storage(vcpu, vcpu->arch.fault_dear,
@@ -392,6 +488,17 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                gpa_t gpaddr;
                gfn_t gfn;
 
+#ifdef CONFIG_KVM_E500
+               if (!(vcpu->arch.shared->msr & MSR_PR) &&
+                   (eaddr & PAGE_MASK) == vcpu->arch.magic_page_ea) {
+                       kvmppc_map_magic(vcpu);
+                       kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS);
+                       r = RESUME_GUEST;
+
+                       break;
+               }
+#endif
+
                /* Check the guest TLB. */
                gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr);
                if (gtlb_index < 0) {
@@ -514,6 +621,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
        vcpu->arch.pc = 0;
        vcpu->arch.shared->msr = 0;
+       vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS;
        kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */
 
        vcpu->arch.shadow_pid = 1;
@@ -770,6 +878,26 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
        return -ENOTSUPP;
 }
 
+/* Memory-region prepare hook: nothing to do here, always returns 0. */
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+                                     struct kvm_userspace_memory_region *mem)
+{
+       return 0;
+}
+
+/* Memory-region commit hook: intentionally empty. */
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem)
+{
+}
+
+/* VM init hook: no per-VM setup is done here, always returns 0. */
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+       return 0;
+}
+
+/* VM destroy hook: intentionally empty. */
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+}
+
 int __init kvmppc_booke_init(void)
 {
        unsigned long ivor[16];
index 492bb70..8e1fe33 100644 (file)
 
 extern unsigned long kvmppc_booke_handlers;
 
-/* Helper function for "full" MSR writes. No need to call this if only EE is
- * changing. */
-static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
-{
-       if ((new_msr & MSR_PR) != (vcpu->arch.shared->msr & MSR_PR))
-               kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR);
-
-       vcpu->arch.shared->msr = new_msr;
-
-       if (vcpu->arch.shared->msr & MSR_WE) {
-               kvm_vcpu_block(vcpu);
-               kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
-       };
-}
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr);
+void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr);
 
 int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                             unsigned int inst, int *advance);
 int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt);
 int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs);
 
+/* low-level asm code to transfer guest state */
+void kvmppc_load_guest_spe(struct kvm_vcpu *vcpu);
+void kvmppc_save_guest_spe(struct kvm_vcpu *vcpu);
+
+/* high-level function, manages flags, host state */
+void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu);
+
 #endif /* __KVM_BOOKE_H__ */
index b58ccae..42f2fb1 100644 (file)
@@ -13,6 +13,7 @@
  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  *
  * Copyright IBM Corp. 2007
+ * Copyright 2011 Freescale Semiconductor, Inc.
  *
  * Authors: Hollis Blanchard <hollisb@us.ibm.com>
  */
@@ -24,8 +25,6 @@
 #include <asm/page.h>
 #include <asm/asm-offsets.h>
 
-#define KVMPPC_MSR_MASK (MSR_CE|MSR_EE|MSR_PR|MSR_DE|MSR_ME|MSR_IS|MSR_DS)
-
 #define VCPU_GPR(n)     (VCPU_GPRS + (n * 4))
 
 /* The host stack layout: */
@@ -192,6 +191,12 @@ _GLOBAL(kvmppc_resume_host)
        lwz     r3, VCPU_HOST_PID(r4)
        mtspr   SPRN_PID, r3
 
+#ifdef CONFIG_FSL_BOOKE
+       /* we cheat and know that Linux doesn't use PID1 which is always 0 */
+       lis     r3, 0
+       mtspr   SPRN_PID1, r3
+#endif
+
        /* Restore host IVPR before re-enabling interrupts. We cheat and know
         * that Linux IVPR is always 0xc0000000. */
        lis     r3, 0xc000
@@ -241,6 +246,14 @@ _GLOBAL(kvmppc_resume_host)
 heavyweight_exit:
        /* Not returning to guest. */
 
+#ifdef CONFIG_SPE
+       /* save guest SPEFSCR and load host SPEFSCR */
+       mfspr   r9, SPRN_SPEFSCR
+       stw     r9, VCPU_SPEFSCR(r4)
+       lwz     r9, VCPU_HOST_SPEFSCR(r4)
+       mtspr   SPRN_SPEFSCR, r9
+#endif
+
        /* We already saved guest volatile register state; now save the
         * non-volatiles. */
        stw     r15, VCPU_GPR(r15)(r4)
@@ -342,6 +355,14 @@ _GLOBAL(__kvmppc_vcpu_run)
        lwz     r30, VCPU_GPR(r30)(r4)
        lwz     r31, VCPU_GPR(r31)(r4)
 
+#ifdef CONFIG_SPE
+       /* save host SPEFSCR and load guest SPEFSCR */
+       mfspr   r3, SPRN_SPEFSCR
+       stw     r3, VCPU_HOST_SPEFSCR(r4)
+       lwz     r3, VCPU_SPEFSCR(r4)
+       mtspr   SPRN_SPEFSCR, r3
+#endif
+
 lightweight_exit:
        stw     r2, HOST_R2(r1)
 
@@ -350,6 +371,11 @@ lightweight_exit:
        lwz     r3, VCPU_SHADOW_PID(r4)
        mtspr   SPRN_PID, r3
 
+#ifdef CONFIG_FSL_BOOKE
+       lwz     r3, VCPU_SHADOW_PID1(r4)
+       mtspr   SPRN_PID1, r3
+#endif
+
 #ifdef CONFIG_44x
        iccci   0, 0 /* XXX hack */
 #endif
@@ -405,20 +431,17 @@ lightweight_exit:
 
        /* Finish loading guest volatiles and jump to guest. */
        lwz     r3, VCPU_CTR(r4)
+       lwz     r5, VCPU_CR(r4)
+       lwz     r6, VCPU_PC(r4)
+       lwz     r7, VCPU_SHADOW_MSR(r4)
        mtctr   r3
-       lwz     r3, VCPU_CR(r4)
-       mtcr    r3
+       mtcr    r5
+       mtsrr0  r6
+       mtsrr1  r7
        lwz     r5, VCPU_GPR(r5)(r4)
        lwz     r6, VCPU_GPR(r6)(r4)
        lwz     r7, VCPU_GPR(r7)(r4)
        lwz     r8, VCPU_GPR(r8)(r4)
-       lwz     r3, VCPU_PC(r4)
-       mtsrr0  r3
-       lwz     r3, VCPU_SHARED(r4)
-       lwz     r3, (VCPU_SHARED_MSR + 4)(r3)
-       oris    r3, r3, KVMPPC_MSR_MASK@h
-       ori     r3, r3, KVMPPC_MSR_MASK@l
-       mtsrr1  r3
 
        /* Clear any debug events which occurred since we disabled MSR[DE].
         * XXX This gives us a 3-instruction window in which a breakpoint
@@ -430,3 +453,24 @@ lightweight_exit:
        lwz     r3, VCPU_GPR(r3)(r4)
        lwz     r4, VCPU_GPR(r4)(r4)
        rfi
+
+#ifdef CONFIG_SPE
+/*
+ * void kvmppc_save_guest_spe(struct kvm_vcpu *vcpu)
+ *
+ * Store the SPE registers and the accumulator into the vcpu.
+ * r3 presumably carries the vcpu pointer (first argument) - confirm.
+ */
+_GLOBAL(kvmppc_save_guest_spe)
+       cmpi    0,r3,0                  /* nothing to do for a zero r3 */
+       beqlr-
+       SAVE_32EVRS(0, r4, r3, VCPU_EVR)
+       evxor   evr6, evr6, evr6        /* evr6 = 0 */
+       /* presumably adds 0*0 to the accumulator, leaving acc in evr6 */
+       evmwumiaa evr6, evr6, evr6
+       li      r4,VCPU_ACC
+       evstddx evr6, r4, r3            /* save acc */
+       blr
+
+/*
+ * void kvmppc_load_guest_spe(struct kvm_vcpu *vcpu)
+ *
+ * Load the accumulator and the SPE registers from the vcpu.
+ * r3 presumably carries the vcpu pointer (first argument) - confirm.
+ */
+_GLOBAL(kvmppc_load_guest_spe)
+       cmpi    0,r3,0                  /* nothing to do for a zero r3 */
+       beqlr-
+       li      r4,VCPU_ACC
+       evlddx  evr6,r4,r3              /* evr6 = value saved at VCPU_ACC */
+       evmra   evr6,evr6               /* load acc */
+       REST_32EVRS(0, r4, r3, VCPU_EVR)
+       blr
+#endif
index 318dbc6..797a744 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved.
+ * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
  *
  * Author: Yu Liu, <yu.liu@freescale.com>
  *
@@ -41,6 +41,11 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+/*
+ * Called when the vcpu is put: release its TLB state and, with CONFIG_SPE,
+ * save and disable guest SPE if the shadow MSR currently has it enabled.
+ */
 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
        kvmppc_e500_tlb_put(vcpu);
+
+#ifdef CONFIG_SPE
+       if (vcpu->arch.shadow_msr & MSR_SPE)
+               kvmppc_vcpu_disable_spe(vcpu);
+#endif
 }
 
 int kvmppc_core_check_processor_compat(void)
index 69cd665..d48ae39 100644 (file)
@@ -81,8 +81,12 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
                kvmppc_set_pid(vcpu, spr_val);
                break;
        case SPRN_PID1:
+               if (spr_val != 0)
+                       return EMULATE_FAIL;
                vcpu_e500->pid[1] = spr_val; break;
        case SPRN_PID2:
+               if (spr_val != 0)
+                       return EMULATE_FAIL;
                vcpu_e500->pid[2] = spr_val; break;
        case SPRN_MAS0:
                vcpu_e500->mas0 = spr_val; break;
index b18fe35..13c432e 100644 (file)
 
 #define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1)
 
+struct id {
+       unsigned long val;
+       struct id **pentry;
+};
+
+#define NUM_TIDS 256
+
+/*
+ * This table provides mappings from:
+ * (guestAS,guestTID,guestPR) --> ID of physical cpu
+ * guestAS     [0..1]
+ * guestTID    [0..255]
+ * guestPR     [0..1]
+ * ID          [1..255]
+ * Each vcpu keeps one vcpu_id_table.
+ */
+struct vcpu_id_table {
+       struct id id[2][NUM_TIDS][2];
+};
+
+/*
+ * This table provides reverse mappings of vcpu_id_table:
+ * ID --> address of vcpu_id_table item.
+ * Each physical core has one pcpu_id_table.
+ */
+struct pcpu_id_table {
+       struct id *entry[NUM_TIDS];
+};
+
+static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids);
+
+/* This variable keeps last used shadow ID on local core.
+ * The valid range of shadow ID is [1..255] */
+static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid);
+
 static unsigned int tlb1_entry_num;
 
+/*
+ * Allocate a free shadow id and setup a valid sid mapping in given entry.
+ * A mapping is only valid when vcpu_id_table and pcpu_id_table match.
+ *
+ * The caller must have preemption disabled, and keep it that way until
+ * it has finished with the returned shadow id (either written into the
+ * TLB or arch.shadow_pid, or discarded).
+ */
+static inline int local_sid_setup_one(struct id *entry)
+{
+       unsigned long sid;
+       int ret = -1;
+
+       sid = ++(__get_cpu_var(pcpu_last_used_sid));
+       if (sid < NUM_TIDS) {
+               __get_cpu_var(pcpu_sids).entry[sid] = entry;
+               entry->val = sid;
+               entry->pentry = &__get_cpu_var(pcpu_sids).entry[sid];
+               ret = sid;
+       }
+
+       /*
+        * If sid == NUM_TIDS, we've run out of sids.  We return -1, and
+        * the caller will invalidate everything and start over.
+        *
+        * sid > NUM_TIDS indicates a race, which we disable preemption to
+        * avoid.
+        */
+       WARN_ON(sid > NUM_TIDS);
+
+       return ret;
+}
+
+/*
+ * Check if given entry contains a valid shadow id mapping.
+ * An ID mapping is considered valid only if
+ * both vcpu and pcpu know this mapping.
+ *
+ * The caller must have preemption disabled, and keep it that way until
+ * it has finished with the returned shadow id (either written into the
+ * TLB or arch.shadow_pid, or discarded).
+ */
+static inline int local_sid_lookup(struct id *entry)
+{
+       if (entry && entry->val != 0 &&
+           __get_cpu_var(pcpu_sids).entry[entry->val] == entry &&
+           entry->pentry == &__get_cpu_var(pcpu_sids).entry[entry->val])
+               return entry->val;
+       return -1;
+}
+
+/* Invalidate all id mappings on local core */
+static inline void local_sid_destroy_all(void)
+{
+       preempt_disable();
+       __get_cpu_var(pcpu_last_used_sid) = 0;
+       memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids)));
+       preempt_enable();
+}
+
+static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+       vcpu_e500->idt = kzalloc(sizeof(struct vcpu_id_table), GFP_KERNEL);
+       return vcpu_e500->idt;
+}
+
+static void kvmppc_e500_id_table_free(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+       kfree(vcpu_e500->idt);
+}
+
+/* Invalidate all mappings on vcpu */
+static void kvmppc_e500_id_table_reset_all(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+       memset(vcpu_e500->idt, 0, sizeof(struct vcpu_id_table));
+
+       /* Update shadow pid when mappings are changed */
+       kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+}
+
+/* Invalidate one ID mapping on vcpu */
+static inline void kvmppc_e500_id_table_reset_one(
+                              struct kvmppc_vcpu_e500 *vcpu_e500,
+                              int as, int pid, int pr)
+{
+       struct vcpu_id_table *idt = vcpu_e500->idt;
+
+       BUG_ON(as >= 2);
+       BUG_ON(pid >= NUM_TIDS);
+       BUG_ON(pr >= 2);
+
+       idt->id[as][pid][pr].val = 0;
+       idt->id[as][pid][pr].pentry = NULL;
+
+       /* Update shadow pid when mappings are changed */
+       kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+}
+
+/*
+ * Map guest (vcpu,AS,ID,PR) to physical core shadow id.
+ * This function first looks up whether a valid mapping exists and,
+ * if not, creates a new one.
+ *
+ * The caller must have preemption disabled, and keep it that way until
+ * it has finished with the returned shadow id (either written into the
+ * TLB or arch.shadow_pid, or discarded).
+ */
+static unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500,
+                                       unsigned int as, unsigned int gid,
+                                       unsigned int pr, int avoid_recursion)
+{
+       struct vcpu_id_table *idt = vcpu_e500->idt;
+       int sid;
+
+       BUG_ON(as >= 2);
+       BUG_ON(gid >= NUM_TIDS);
+       BUG_ON(pr >= 2);
+
+       sid = local_sid_lookup(&idt->id[as][gid][pr]);
+
+       while (sid <= 0) {
+               /* No mapping yet */
+               sid = local_sid_setup_one(&idt->id[as][gid][pr]);
+               if (sid <= 0) {
+                       _tlbil_all();
+                       local_sid_destroy_all();
+               }
+
+               /* Update shadow pid when mappings are changed */
+               if (!avoid_recursion)
+                       kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+       }
+
+       return sid;
+}
+
+/* Map guest pid to shadow.
+ * We use PID to hold the shadow of the current guest non-zero PID,
+ * and PID1 to hold the shadow of guest PID zero, so that guest
+ * TLB entries with TID=0 can be accessed at any time. */
+void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+       preempt_disable();
+       vcpu_e500->vcpu.arch.shadow_pid = kvmppc_e500_get_sid(vcpu_e500,
+                       get_cur_as(&vcpu_e500->vcpu),
+                       get_cur_pid(&vcpu_e500->vcpu),
+                       get_cur_pr(&vcpu_e500->vcpu), 1);
+       vcpu_e500->vcpu.arch.shadow_pid1 = kvmppc_e500_get_sid(vcpu_e500,
+                       get_cur_as(&vcpu_e500->vcpu), 0,
+                       get_cur_pr(&vcpu_e500->vcpu), 1);
+       preempt_enable();
+}
+
 void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
 {
        struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
@@ -41,25 +229,14 @@ void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
 
        for (tlbsel = 0; tlbsel < 2; tlbsel++) {
                printk("Guest TLB%d:\n", tlbsel);
-               for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) {
-                       tlbe = &vcpu_e500->guest_tlb[tlbsel][i];
+               for (i = 0; i < vcpu_e500->gtlb_size[tlbsel]; i++) {
+                       tlbe = &vcpu_e500->gtlb_arch[tlbsel][i];
                        if (tlbe->mas1 & MAS1_VALID)
                                printk(" G[%d][%3d] |  %08X | %08X | %08X | %08X |\n",
                                        tlbsel, i, tlbe->mas1, tlbe->mas2,
                                        tlbe->mas3, tlbe->mas7);
                }
        }
-
-       for (tlbsel = 0; tlbsel < 2; tlbsel++) {
-               printk("Shadow TLB%d:\n", tlbsel);
-               for (i = 0; i < vcpu_e500->shadow_tlb_size[tlbsel]; i++) {
-                       tlbe = &vcpu_e500->shadow_tlb[tlbsel][i];
-                       if (tlbe->mas1 & MAS1_VALID)
-                               printk(" S[%d][%3d] |  %08X | %08X | %08X | %08X |\n",
-                                       tlbsel, i, tlbe->mas1, tlbe->mas2,
-                                       tlbe->mas3, tlbe->mas7);
-               }
-       }
 }
 
 static inline unsigned int tlb0_get_next_victim(
@@ -67,16 +244,17 @@ static inline unsigned int tlb0_get_next_victim(
 {
        unsigned int victim;
 
-       victim = vcpu_e500->guest_tlb_nv[0]++;
-       if (unlikely(vcpu_e500->guest_tlb_nv[0] >= KVM_E500_TLB0_WAY_NUM))
-               vcpu_e500->guest_tlb_nv[0] = 0;
+       victim = vcpu_e500->gtlb_nv[0]++;
+       if (unlikely(vcpu_e500->gtlb_nv[0] >= KVM_E500_TLB0_WAY_NUM))
+               vcpu_e500->gtlb_nv[0] = 0;
 
        return victim;
 }
 
 static inline unsigned int tlb1_max_shadow_size(void)
 {
-       return tlb1_entry_num - tlbcam_index;
+       /* reserve one entry for magic page */
+       return tlb1_entry_num - tlbcam_index - 1;
 }
 
 static inline int tlbe_is_writable(struct tlbe *tlbe)
@@ -112,72 +290,149 @@ static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode)
 /*
  * writing shadow tlb entry to host TLB
  */
-static inline void __write_host_tlbe(struct tlbe *stlbe)
+static inline void __write_host_tlbe(struct tlbe *stlbe, uint32_t mas0)
 {
+       unsigned long flags;
+
+       local_irq_save(flags);
+       mtspr(SPRN_MAS0, mas0);
        mtspr(SPRN_MAS1, stlbe->mas1);
        mtspr(SPRN_MAS2, stlbe->mas2);
        mtspr(SPRN_MAS3, stlbe->mas3);
        mtspr(SPRN_MAS7, stlbe->mas7);
-       __asm__ __volatile__ ("tlbwe\n" : : );
+       asm volatile("isync; tlbwe" : : : "memory");
+       local_irq_restore(flags);
 }
 
 static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
-               int tlbsel, int esel)
+               int tlbsel, int esel, struct tlbe *stlbe)
 {
-       struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
-
-       local_irq_disable();
        if (tlbsel == 0) {
-               __write_host_tlbe(stlbe);
+               __write_host_tlbe(stlbe,
+                                 MAS0_TLBSEL(0) |
+                                 MAS0_ESEL(esel & (KVM_E500_TLB0_WAY_NUM - 1)));
        } else {
-               unsigned register mas0;
-
-               mas0 = mfspr(SPRN_MAS0);
-
-               mtspr(SPRN_MAS0, MAS0_TLBSEL(1) | MAS0_ESEL(to_htlb1_esel(esel)));
-               __write_host_tlbe(stlbe);
-
-               mtspr(SPRN_MAS0, mas0);
+               __write_host_tlbe(stlbe,
+                                 MAS0_TLBSEL(1) |
+                                 MAS0_ESEL(to_htlb1_esel(esel)));
        }
-       local_irq_enable();
+       trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
+                            stlbe->mas3, stlbe->mas7);
+}
+
+void kvmppc_map_magic(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+       struct tlbe magic;
+       ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
+       unsigned int stid;
+       pfn_t pfn;
+
+       pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT;
+       get_page(pfn_to_page(pfn));
+
+       preempt_disable();
+       stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0);
+
+       magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) |
+                    MAS1_TSIZE(BOOK3E_PAGESZ_4K);
+       magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M;
+       magic.mas3 = (pfn << PAGE_SHIFT) |
+                    MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR;
+       magic.mas7 = pfn >> (32 - PAGE_SHIFT);
+
+       __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index));
+       preempt_enable();
 }
 
 void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu)
 {
        struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-       int i;
-       unsigned register mas0;
-
-       /* Load all valid TLB1 entries to reduce guest tlb miss fault */
-       local_irq_disable();
-       mas0 = mfspr(SPRN_MAS0);
-       for (i = 0; i < tlb1_max_shadow_size(); i++) {
-               struct tlbe *stlbe = &vcpu_e500->shadow_tlb[1][i];
-
-               if (get_tlb_v(stlbe)) {
-                       mtspr(SPRN_MAS0, MAS0_TLBSEL(1)
-                                       | MAS0_ESEL(to_htlb1_esel(i)));
-                       __write_host_tlbe(stlbe);
-               }
-       }
-       mtspr(SPRN_MAS0, mas0);
-       local_irq_enable();
+
+       /* Shadow PID may be expired on local core */
+       kvmppc_e500_recalc_shadow_pid(vcpu_e500);
 }
 
 void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu)
 {
-       _tlbil_all();
+}
+
+static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
+                                        int tlbsel, int esel)
+{
+       struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
+       struct vcpu_id_table *idt = vcpu_e500->idt;
+       unsigned int pr, tid, ts, pid;
+       u32 val, eaddr;
+       unsigned long flags;
+
+       ts = get_tlb_ts(gtlbe);
+       tid = get_tlb_tid(gtlbe);
+
+       preempt_disable();
+
+       /* One guest ID may be mapped to two shadow IDs */
+       for (pr = 0; pr < 2; pr++) {
+               /*
+                * The shadow PID can have a valid mapping on at most one
+                * host CPU.  In the common case, it will be valid on this
+                * CPU, in which case (for TLB0) we do a local invalidation
+                * of the specific address.
+                *
+                * If the shadow PID is not valid on the current host CPU, or
+                * if we're invalidating a TLB1 entry, we invalidate the
+                * entire shadow PID.
+                */
+               if (tlbsel == 1 ||
+                   (pid = local_sid_lookup(&idt->id[ts][tid][pr])) <= 0) {
+                       kvmppc_e500_id_table_reset_one(vcpu_e500, ts, tid, pr);
+                       continue;
+               }
+
+               /*
+                * The guest is invalidating a TLB0 entry which is in a PID
+                * that has a valid shadow mapping on this host CPU.  We
+                * search host TLB0 to invalidate its shadow TLB entry,
+                * similar to __tlbil_va except that we need to look in AS1.
+                */
+               val = (pid << MAS6_SPID_SHIFT) | MAS6_SAS;
+               eaddr = get_tlb_eaddr(gtlbe);
+
+               local_irq_save(flags);
+
+               mtspr(SPRN_MAS6, val);
+               asm volatile("tlbsx 0, %[eaddr]" : : [eaddr] "r" (eaddr));
+               val = mfspr(SPRN_MAS1);
+               if (val & MAS1_VALID) {
+                       mtspr(SPRN_MAS1, val & ~MAS1_VALID);
+                       asm volatile("tlbwe");
+               }
+
+               local_irq_restore(flags);
+       }
+
+       preempt_enable();
 }
 
 /* Search the guest TLB for a matching entry. */
 static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
                gva_t eaddr, int tlbsel, unsigned int pid, int as)
 {
+       int size = vcpu_e500->gtlb_size[tlbsel];
+       int set_base;
        int i;
 
-       /* XXX Replace loop with fancy data structures. */
-       for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) {
-               struct tlbe *tlbe = &vcpu_e500->guest_tlb[tlbsel][i];
+       if (tlbsel == 0) {
+               int mask = size / KVM_E500_TLB0_WAY_NUM - 1;
+               set_base = (eaddr >> PAGE_SHIFT) & mask;
+               set_base *= KVM_E500_TLB0_WAY_NUM;
+               size = KVM_E500_TLB0_WAY_NUM;
+       } else {
+               set_base = 0;
+       }
+
+       for (i = 0; i < size; i++) {
+               struct tlbe *tlbe = &vcpu_e500->gtlb_arch[tlbsel][set_base + i];
                unsigned int tid;
 
                if (eaddr < get_tlb_eaddr(tlbe))
@@ -196,66 +451,32 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
                if (get_tlb_ts(tlbe) != as && as != -1)
                        continue;
 
-               return i;
+               return set_base + i;
        }
 
        return -1;
 }
 
-static void kvmppc_e500_shadow_release(struct kvmppc_vcpu_e500 *vcpu_e500,
-               int tlbsel, int esel)
-{
-       struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
-       struct page *page = vcpu_e500->shadow_pages[tlbsel][esel];
-
-       if (page) {
-               vcpu_e500->shadow_pages[tlbsel][esel] = NULL;
-
-               if (get_tlb_v(stlbe)) {
-                       if (tlbe_is_writable(stlbe))
-                               kvm_release_page_dirty(page);
-                       else
-                               kvm_release_page_clean(page);
-               }
-       }
-}
-
-static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
-               int tlbsel, int esel)
+static inline void kvmppc_e500_priv_setup(struct tlbe_priv *priv,
+                                         struct tlbe *gtlbe,
+                                         pfn_t pfn)
 {
-       struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
+       priv->pfn = pfn;
+       priv->flags = E500_TLB_VALID;
 
-       kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel);
-       stlbe->mas1 = 0;
-       trace_kvm_stlb_inval(index_of(tlbsel, esel));
+       if (tlbe_is_writable(gtlbe))
+               priv->flags |= E500_TLB_DIRTY;
 }
 
-static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
-               gva_t eaddr, gva_t eend, u32 tid)
+static inline void kvmppc_e500_priv_release(struct tlbe_priv *priv)
 {
-       unsigned int pid = tid & 0xff;
-       unsigned int i;
-
-       /* XXX Replace loop with fancy data structures. */
-       for (i = 0; i < vcpu_e500->guest_tlb_size[1]; i++) {
-               struct tlbe *stlbe = &vcpu_e500->shadow_tlb[1][i];
-               unsigned int tid;
-
-               if (!get_tlb_v(stlbe))
-                       continue;
-
-               if (eend < get_tlb_eaddr(stlbe))
-                       continue;
+       if (priv->flags & E500_TLB_VALID) {
+               if (priv->flags & E500_TLB_DIRTY)
+                       kvm_release_pfn_dirty(priv->pfn);
+               else
+                       kvm_release_pfn_clean(priv->pfn);
 
-               if (eaddr > get_tlb_end(stlbe))
-                       continue;
-
-               tid = get_tlb_tid(stlbe);
-               if (tid && (tid != pid))
-                       continue;
-
-               kvmppc_e500_stlbe_invalidate(vcpu_e500, 1, i);
-               write_host_tlbe(vcpu_e500, 1, i);
+               priv->flags = 0;
        }
 }
 
@@ -273,7 +494,7 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
        tsized = (vcpu_e500->mas4 >> 7) & 0x1f;
 
        vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
-               | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]);
+               | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
        vcpu_e500->mas1 = MAS1_VALID | (as ? MAS1_TS : 0)
                | MAS1_TID(vcpu_e500->pid[pidsel])
                | MAS1_TSIZE(tsized);
@@ -286,56 +507,154 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
        vcpu_e500->mas7 = 0;
 }
 
-static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
-       u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel)
+static inline void kvmppc_e500_setup_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
+                                          struct tlbe *gtlbe, int tsize,
+                                          struct tlbe_priv *priv,
+                                          u64 gvaddr, struct tlbe *stlbe)
 {
-       struct page *new_page;
-       struct tlbe *stlbe;
-       hpa_t hpaddr;
-
-       stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
-
-       /* Get reference to new page. */
-       new_page = gfn_to_page(vcpu_e500->vcpu.kvm, gfn);
-       if (is_error_page(new_page)) {
-               printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n",
-                               (long)gfn);
-               kvm_release_page_clean(new_page);
-               return;
-       }
-       hpaddr = page_to_phys(new_page);
-
-       /* Drop reference to old page. */
-       kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel);
+       pfn_t pfn = priv->pfn;
+       unsigned int stid;
 
-       vcpu_e500->shadow_pages[tlbsel][esel] = new_page;
+       stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe),
+                                  get_tlb_tid(gtlbe),
+                                  get_cur_pr(&vcpu_e500->vcpu), 0);
 
-       /* Force TS=1 IPROT=0 TSIZE=4KB for all guest mappings. */
-       stlbe->mas1 = MAS1_TSIZE(BOOK3E_PAGESZ_4K)
-               | MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID;
+       /* Force TS=1 IPROT=0 for all guest mappings. */
+       stlbe->mas1 = MAS1_TSIZE(tsize)
+               | MAS1_TID(stid) | MAS1_TS | MAS1_VALID;
        stlbe->mas2 = (gvaddr & MAS2_EPN)
                | e500_shadow_mas2_attrib(gtlbe->mas2,
                                vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
-       stlbe->mas3 = (hpaddr & MAS3_RPN)
+       stlbe->mas3 = ((pfn << PAGE_SHIFT) & MAS3_RPN)
                | e500_shadow_mas3_attrib(gtlbe->mas3,
                                vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
-       stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN;
+       stlbe->mas7 = (pfn >> (32 - PAGE_SHIFT)) & MAS7_RPN;
+}
 
-       trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
-                            stlbe->mas3, stlbe->mas7);
+
+static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
+       u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel,
+       struct tlbe *stlbe)
+{
+       struct kvm_memory_slot *slot;
+       unsigned long pfn, hva;
+       int pfnmap = 0;
+       int tsize = BOOK3E_PAGESZ_4K;
+       struct tlbe_priv *priv;
+
+       /*
+        * Translate guest physical to true physical, acquiring
+        * a page reference if it is normal, non-reserved memory.
+        *
+        * gfn_to_memslot() must succeed because otherwise we wouldn't
+        * have gotten this far.  Eventually we should just pass the slot
+        * pointer through from the first lookup.
+        */
+       slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn);
+       hva = gfn_to_hva_memslot(slot, gfn);
+
+       if (tlbsel == 1) {
+               struct vm_area_struct *vma;
+               down_read(&current->mm->mmap_sem);
+
+               vma = find_vma(current->mm, hva);
+               if (vma && hva >= vma->vm_start &&
+                   (vma->vm_flags & VM_PFNMAP)) {
+                       /*
+                        * This VMA is a physically contiguous region (e.g.
+                        * /dev/mem) that bypasses normal Linux page
+                        * management.  Find the overlap between the
+                        * vma and the memslot.
+                        */
+
+                       unsigned long start, end;
+                       unsigned long slot_start, slot_end;
+
+                       pfnmap = 1;
+
+                       start = vma->vm_pgoff;
+                       end = start +
+                             ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
+
+                       pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
+
+                       slot_start = pfn - (gfn - slot->base_gfn);
+                       slot_end = slot_start + slot->npages;
+
+                       if (start < slot_start)
+                               start = slot_start;
+                       if (end > slot_end)
+                               end = slot_end;
+
+                       tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
+                               MAS1_TSIZE_SHIFT;
+
+                       /*
+                        * e500 doesn't implement the lowest tsize bit,
+                        * or 1K pages.
+                        */
+                       tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
+
+                       /*
+                        * Now find the largest tsize (up to what the guest
+                        * requested) that will cover gfn, stay within the
+                        * range, and for which gfn and pfn are mutually
+                        * aligned.
+                        */
+
+                       for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
+                               unsigned long gfn_start, gfn_end, tsize_pages;
+                               tsize_pages = 1 << (tsize - 2);
+
+                               gfn_start = gfn & ~(tsize_pages - 1);
+                               gfn_end = gfn_start + tsize_pages;
+
+                               if (gfn_start + pfn - gfn < start)
+                                       continue;
+                               if (gfn_end + pfn - gfn > end)
+                                       continue;
+                               if ((gfn & (tsize_pages - 1)) !=
+                                   (pfn & (tsize_pages - 1)))
+                                       continue;
+
+                               gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
+                               pfn &= ~(tsize_pages - 1);
+                               break;
+                       }
+               }
+
+               up_read(&current->mm->mmap_sem);
+       }
+
+       if (likely(!pfnmap)) {
+               pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn);
+               if (is_error_pfn(pfn)) {
+                       printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
+                                       (long)gfn);
+                       kvm_release_pfn_clean(pfn);
+                       return;
+               }
+       }
+
+       /* Drop old priv and setup new one. */
+       priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
+       kvmppc_e500_priv_release(priv);
+       kvmppc_e500_priv_setup(priv, gtlbe, pfn);
+
+       kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, priv, gvaddr, stlbe);
 }
 
 /* XXX only map the one-one case, for now use TLB0 */
-static int kvmppc_e500_stlbe_map(struct kvmppc_vcpu_e500 *vcpu_e500,
-               int tlbsel, int esel)
+static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500,
+                               int esel, struct tlbe *stlbe)
 {
        struct tlbe *gtlbe;
 
-       gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel];
+       gtlbe = &vcpu_e500->gtlb_arch[0][esel];
 
        kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe),
                        get_tlb_raddr(gtlbe) >> PAGE_SHIFT,
-                       gtlbe, tlbsel, esel);
+                       gtlbe, 0, esel, stlbe);
 
        return esel;
 }
@@ -344,53 +663,37 @@ static int kvmppc_e500_stlbe_map(struct kvmppc_vcpu_e500 *vcpu_e500,
  * the shadow TLB. */
 /* XXX for both one-one and one-to-many , for now use TLB1 */
 static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
-               u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe)
+               u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, struct tlbe *stlbe)
 {
        unsigned int victim;
 
-       victim = vcpu_e500->guest_tlb_nv[1]++;
+       victim = vcpu_e500->gtlb_nv[1]++;
 
-       if (unlikely(vcpu_e500->guest_tlb_nv[1] >= tlb1_max_shadow_size()))
-               vcpu_e500->guest_tlb_nv[1] = 0;
+       if (unlikely(vcpu_e500->gtlb_nv[1] >= tlb1_max_shadow_size()))
+               vcpu_e500->gtlb_nv[1] = 0;
 
-       kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim);
+       kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim, stlbe);
 
        return victim;
 }
 
-/* Invalidate all guest kernel mappings when enter usermode,
- * so that when they fault back in they will get the
- * proper permission bits. */
-void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
+void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)
 {
-       if (usermode) {
-               struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-               int i;
-
-               /* XXX Replace loop with fancy data structures. */
-               for (i = 0; i < tlb1_max_shadow_size(); i++)
-                       kvmppc_e500_stlbe_invalidate(vcpu_e500, 1, i);
+       struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 
-               _tlbil_all();
-       }
+       /* Recalc shadow pid since MSR changes */
+       kvmppc_e500_recalc_shadow_pid(vcpu_e500);
 }
 
-static int kvmppc_e500_gtlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
-               int tlbsel, int esel)
+static inline int kvmppc_e500_gtlbe_invalidate(
+                               struct kvmppc_vcpu_e500 *vcpu_e500,
+                               int tlbsel, int esel)
 {
-       struct tlbe *gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel];
+       struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
 
        if (unlikely(get_tlb_iprot(gtlbe)))
                return -1;
 
-       if (tlbsel == 1) {
-               kvmppc_e500_tlb1_invalidate(vcpu_e500, get_tlb_eaddr(gtlbe),
-                               get_tlb_end(gtlbe),
-                               get_tlb_tid(gtlbe));
-       } else {
-               kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel);
-       }
-
        gtlbe->mas1 = 0;
 
        return 0;
@@ -401,13 +704,14 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value)
        int esel;
 
        if (value & MMUCSR0_TLB0FI)
-               for (esel = 0; esel < vcpu_e500->guest_tlb_size[0]; esel++)
+               for (esel = 0; esel < vcpu_e500->gtlb_size[0]; esel++)
                        kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel);
        if (value & MMUCSR0_TLB1FI)
-               for (esel = 0; esel < vcpu_e500->guest_tlb_size[1]; esel++)
+               for (esel = 0; esel < vcpu_e500->gtlb_size[1]; esel++)
                        kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel);
 
-       _tlbil_all();
+       /* Invalidate all vcpu id mappings */
+       kvmppc_e500_id_table_reset_all(vcpu_e500);
 
        return EMULATE_DONE;
 }
@@ -428,7 +732,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
 
        if (ia) {
                /* invalidate all entries */
-               for (esel = 0; esel < vcpu_e500->guest_tlb_size[tlbsel]; esel++)
+               for (esel = 0; esel < vcpu_e500->gtlb_size[tlbsel]; esel++)
                        kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
        } else {
                ea &= 0xfffff000;
@@ -438,7 +742,8 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
                        kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
        }
 
-       _tlbil_all();
+       /* Invalidate all vcpu id mappings */
+       kvmppc_e500_id_table_reset_all(vcpu_e500);
 
        return EMULATE_DONE;
 }
@@ -452,9 +757,9 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu)
        tlbsel = get_tlb_tlbsel(vcpu_e500);
        esel = get_tlb_esel(vcpu_e500, tlbsel);
 
-       gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel];
+       gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
        vcpu_e500->mas0 &= ~MAS0_NV(~0);
-       vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]);
+       vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
        vcpu_e500->mas1 = gtlbe->mas1;
        vcpu_e500->mas2 = gtlbe->mas2;
        vcpu_e500->mas3 = gtlbe->mas3;
@@ -477,14 +782,14 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
        for (tlbsel = 0; tlbsel < 2; tlbsel++) {
                esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as);
                if (esel >= 0) {
-                       gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel];
+                       gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
                        break;
                }
        }
 
        if (gtlbe) {
                vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel)
-                       | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]);
+                       | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
                vcpu_e500->mas1 = gtlbe->mas1;
                vcpu_e500->mas2 = gtlbe->mas2;
                vcpu_e500->mas3 = gtlbe->mas3;
@@ -497,7 +802,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
                victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0;
 
                vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
-                       | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]);
+                       | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
                vcpu_e500->mas1 = (vcpu_e500->mas6 & MAS6_SPID0)
                        | (vcpu_e500->mas6 & (MAS6_SAS ? MAS1_TS : 0))
                        | (vcpu_e500->mas4 & MAS4_TSIZED(~0));
@@ -514,23 +819,16 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
 int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 {
        struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-       u64 eaddr;
-       u64 raddr;
-       u32 tid;
        struct tlbe *gtlbe;
-       int tlbsel, esel, stlbsel, sesel;
+       int tlbsel, esel;
 
        tlbsel = get_tlb_tlbsel(vcpu_e500);
        esel = get_tlb_esel(vcpu_e500, tlbsel);
 
-       gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel];
+       gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
 
-       if (get_tlb_v(gtlbe) && tlbsel == 1) {
-               eaddr = get_tlb_eaddr(gtlbe);
-               tid = get_tlb_tid(gtlbe);
-               kvmppc_e500_tlb1_invalidate(vcpu_e500, eaddr,
-                               get_tlb_end(gtlbe), tid);
-       }
+       if (get_tlb_v(gtlbe))
+               kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel);
 
        gtlbe->mas1 = vcpu_e500->mas1;
        gtlbe->mas2 = vcpu_e500->mas2;
@@ -542,6 +840,12 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 
        /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
        if (tlbe_is_host_safe(vcpu, gtlbe)) {
+               struct tlbe stlbe;
+               int stlbsel, sesel;
+               u64 eaddr;
+               u64 raddr;
+
+               preempt_disable();
                switch (tlbsel) {
                case 0:
                        /* TLB0 */
@@ -549,7 +853,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
                        gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K);
 
                        stlbsel = 0;
-                       sesel = kvmppc_e500_stlbe_map(vcpu_e500, 0, esel);
+                       sesel = kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
 
                        break;
 
@@ -564,13 +868,14 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
                         * are mapped on the fly. */
                        stlbsel = 1;
                        sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr,
-                                       raddr >> PAGE_SHIFT, gtlbe);
+                                       raddr >> PAGE_SHIFT, gtlbe, &stlbe);
                        break;
 
                default:
                        BUG();
                }
-               write_host_tlbe(vcpu_e500, stlbsel, sesel);
+               write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe);
+               preempt_enable();
        }
 
        kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS);
@@ -610,7 +915,7 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index,
 {
        struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
        struct tlbe *gtlbe =
-               &vcpu_e500->guest_tlb[tlbsel_of(index)][esel_of(index)];
+               &vcpu_e500->gtlb_arch[tlbsel_of(index)][esel_of(index)];
        u64 pgmask = get_tlb_bytes(gtlbe) - 1;
 
        return get_tlb_raddr(gtlbe) | (eaddr & pgmask);
@@ -618,38 +923,37 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index,
 
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 {
-       struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-       int tlbsel, i;
-
-       for (tlbsel = 0; tlbsel < 2; tlbsel++)
-               for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++)
-                       kvmppc_e500_shadow_release(vcpu_e500, tlbsel, i);
-
-       /* discard all guest mapping */
-       _tlbil_all();
 }
 
 void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
                        unsigned int index)
 {
        struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+       struct tlbe_priv *priv;
+       struct tlbe *gtlbe, stlbe;
        int tlbsel = tlbsel_of(index);
        int esel = esel_of(index);
        int stlbsel, sesel;
 
+       gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
+
+       preempt_disable();
        switch (tlbsel) {
        case 0:
                stlbsel = 0;
                sesel = esel;
+               priv = &vcpu_e500->gtlb_priv[stlbsel][sesel];
+
+               kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, BOOK3E_PAGESZ_4K,
+                                       priv, eaddr, &stlbe);
                break;
 
        case 1: {
                gfn_t gfn = gpaddr >> PAGE_SHIFT;
-               struct tlbe *gtlbe
-                       = &vcpu_e500->guest_tlb[tlbsel][esel];
 
                stlbsel = 1;
-               sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, gtlbe);
+               sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn,
+                                            gtlbe, &stlbe);
                break;
        }
 
@@ -657,7 +961,9 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
                BUG();
                break;
        }
-       write_host_tlbe(vcpu_e500, stlbsel, sesel);
+
+       write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe);
+       preempt_enable();
 }
 
 int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu,
@@ -679,8 +985,10 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid)
 {
        struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 
-       vcpu_e500->pid[0] = vcpu->arch.shadow_pid =
-               vcpu->arch.pid = pid;
+       if (vcpu->arch.pid != pid) {
+               vcpu_e500->pid[0] = vcpu->arch.pid = pid;
+               kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+       }
 }
 
 void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500)
@@ -688,14 +996,14 @@ void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500)
        struct tlbe *tlbe;
 
        /* Insert large initial mapping for guest. */
-       tlbe = &vcpu_e500->guest_tlb[1][0];
+       tlbe = &vcpu_e500->gtlb_arch[1][0];
        tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M);
        tlbe->mas2 = 0;
        tlbe->mas3 = E500_TLB_SUPER_PERM_MASK;
        tlbe->mas7 = 0;
 
        /* 4K map for serial output. Used by kernel wrapper. */
-       tlbe = &vcpu_e500->guest_tlb[1][1];
+       tlbe = &vcpu_e500->gtlb_arch[1][1];
        tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K);
        tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G;
        tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK;
@@ -706,68 +1014,64 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
        tlb1_entry_num = mfspr(SPRN_TLB1CFG) & 0xFFF;
 
-       vcpu_e500->guest_tlb_size[0] = KVM_E500_TLB0_SIZE;
-       vcpu_e500->guest_tlb[0] =
+       vcpu_e500->gtlb_size[0] = KVM_E500_TLB0_SIZE;
+       vcpu_e500->gtlb_arch[0] =
                kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL);
-       if (vcpu_e500->guest_tlb[0] == NULL)
+       if (vcpu_e500->gtlb_arch[0] == NULL)
                goto err_out;
 
-       vcpu_e500->shadow_tlb_size[0] = KVM_E500_TLB0_SIZE;
-       vcpu_e500->shadow_tlb[0] =
-               kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL);
-       if (vcpu_e500->shadow_tlb[0] == NULL)
-               goto err_out_guest0;
-
-       vcpu_e500->guest_tlb_size[1] = KVM_E500_TLB1_SIZE;
-       vcpu_e500->guest_tlb[1] =
+       vcpu_e500->gtlb_size[1] = KVM_E500_TLB1_SIZE;
+       vcpu_e500->gtlb_arch[1] =
                kzalloc(sizeof(struct tlbe) * KVM_E500_TLB1_SIZE, GFP_KERNEL);
-       if (vcpu_e500->guest_tlb[1] == NULL)
-               goto err_out_shadow0;
+       if (vcpu_e500->gtlb_arch[1] == NULL)
+               goto err_out_guest0;
 
-       vcpu_e500->shadow_tlb_size[1] = tlb1_entry_num;
-       vcpu_e500->shadow_tlb[1] =
-               kzalloc(sizeof(struct tlbe) * tlb1_entry_num, GFP_KERNEL);
-       if (vcpu_e500->shadow_tlb[1] == NULL)
+       vcpu_e500->gtlb_priv[0] = (struct tlbe_priv *)
+               kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB0_SIZE, GFP_KERNEL);
+       if (vcpu_e500->gtlb_priv[0] == NULL)
                goto err_out_guest1;
+       vcpu_e500->gtlb_priv[1] = (struct tlbe_priv *)
+               kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB1_SIZE, GFP_KERNEL);
 
-       vcpu_e500->shadow_pages[0] = (struct page **)
-               kzalloc(sizeof(struct page *) * KVM_E500_TLB0_SIZE, GFP_KERNEL);
-       if (vcpu_e500->shadow_pages[0] == NULL)
-               goto err_out_shadow1;
+       if (vcpu_e500->gtlb_priv[1] == NULL)
+               goto err_out_priv0;
 
-       vcpu_e500->shadow_pages[1] = (struct page **)
-               kzalloc(sizeof(struct page *) * tlb1_entry_num, GFP_KERNEL);
-       if (vcpu_e500->shadow_pages[1] == NULL)
-               goto err_out_page0;
+       if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL)
+               goto err_out_priv1;
 
        /* Init TLB configuration register */
        vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL;
-       vcpu_e500->tlb0cfg |= vcpu_e500->guest_tlb_size[0];
+       vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_size[0];
        vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL;
-       vcpu_e500->tlb1cfg |= vcpu_e500->guest_tlb_size[1];
+       vcpu_e500->tlb1cfg |= vcpu_e500->gtlb_size[1];
 
        return 0;
 
-err_out_page0:
-       kfree(vcpu_e500->shadow_pages[0]);
-err_out_shadow1:
-       kfree(vcpu_e500->shadow_tlb[1]);
+err_out_priv1:
+       kfree(vcpu_e500->gtlb_priv[1]);
+err_out_priv0:
+       kfree(vcpu_e500->gtlb_priv[0]);
 err_out_guest1:
-       kfree(vcpu_e500->guest_tlb[1]);
-err_out_shadow0:
-       kfree(vcpu_e500->shadow_tlb[0]);
+       kfree(vcpu_e500->gtlb_arch[1]);
 err_out_guest0:
-       kfree(vcpu_e500->guest_tlb[0]);
+       kfree(vcpu_e500->gtlb_arch[0]);
 err_out:
        return -1;
 }
 
 void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
-       kfree(vcpu_e500->shadow_pages[1]);
-       kfree(vcpu_e500->shadow_pages[0]);
-       kfree(vcpu_e500->shadow_tlb[1]);
-       kfree(vcpu_e500->guest_tlb[1]);
-       kfree(vcpu_e500->shadow_tlb[0]);
-       kfree(vcpu_e500->guest_tlb[0]);
+       int stlbsel, i;
+
+       /* release all privs */
+       for (stlbsel = 0; stlbsel < 2; stlbsel++)
+               for (i = 0; i < vcpu_e500->gtlb_size[stlbsel]; i++) {
+                       struct tlbe_priv *priv =
+                               &vcpu_e500->gtlb_priv[stlbsel][i];
+                       kvmppc_e500_priv_release(priv);
+               }
+
+       kvmppc_e500_id_table_free(vcpu_e500);
+       kfree(vcpu_e500->gtlb_arch[1]);
+       kfree(vcpu_e500->gtlb_arch[0]);
 }
index 458946b..59b88e9 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved.
+ * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
  *
  * Author: Yu Liu, yu.liu@freescale.com
  *
@@ -55,6 +55,7 @@ extern void kvmppc_e500_tlb_load(struct kvm_vcpu *, int);
 extern int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *);
 extern void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *);
 extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *);
+extern void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *);
 
 /* TLB helper functions */
 static inline unsigned int get_tlb_size(const struct tlbe *tlbe)
@@ -110,6 +111,16 @@ static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu)
        return vcpu->arch.pid & 0xff;
 }
 
+static inline unsigned int get_cur_as(struct kvm_vcpu *vcpu)
+{
+       return !!(vcpu->arch.shared->msr & (MSR_IS | MSR_DS));
+}
+
+static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu)
+{
+       return !!(vcpu->arch.shared->msr & MSR_PR);
+}
+
 static inline unsigned int get_cur_spid(
                const struct kvmppc_vcpu_e500 *vcpu_e500)
 {
index 616dd51..a107c9b 100644 (file)
@@ -30,6 +30,7 @@
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
 #include <asm/tlbflush.h>
+#include <asm/cputhreads.h>
 #include "timing.h"
 #include "../mm/mmu_decl.h"
 
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
+#ifndef CONFIG_KVM_BOOK3S_64_HV
        return !(v->arch.shared->msr & MSR_WE) ||
               !!(v->arch.pending_exceptions);
+#else
+       return !(v->arch.ceded) || !!(v->arch.pending_exceptions);
+#endif
 }
 
 int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
@@ -73,7 +78,8 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
        }
        case HC_VENDOR_KVM | KVM_HC_FEATURES:
                r = HC_EV_SUCCESS;
-#if defined(CONFIG_PPC_BOOK3S) /* XXX Missing magic page on BookE */
+#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500)
+               /* XXX Missing magic page on 44x */
                r2 |= (1 << KVM_FEATURE_MAGIC_PAGE);
 #endif
 
@@ -147,7 +153,7 @@ void kvm_arch_check_processor_compat(void *rtn)
 
 int kvm_arch_init_vm(struct kvm *kvm)
 {
-       return 0;
+       return kvmppc_core_init_vm(kvm);
 }
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
@@ -163,6 +169,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
                kvm->vcpus[i] = NULL;
 
        atomic_set(&kvm->online_vcpus, 0);
+
+       kvmppc_core_destroy_vm(kvm);
+
        mutex_unlock(&kvm->lock);
 }
 
@@ -180,10 +189,13 @@ int kvm_dev_ioctl_check_extension(long ext)
 #else
        case KVM_CAP_PPC_SEGSTATE:
 #endif
-       case KVM_CAP_PPC_PAIRED_SINGLES:
        case KVM_CAP_PPC_UNSET_IRQ:
        case KVM_CAP_PPC_IRQ_LEVEL:
        case KVM_CAP_ENABLE_CAP:
+               r = 1;
+               break;
+#ifndef CONFIG_KVM_BOOK3S_64_HV
+       case KVM_CAP_PPC_PAIRED_SINGLES:
        case KVM_CAP_PPC_OSI:
        case KVM_CAP_PPC_GET_PVINFO:
                r = 1;
@@ -191,6 +203,21 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_COALESCED_MMIO:
                r = KVM_COALESCED_MMIO_PAGE_OFFSET;
                break;
+#endif
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+       case KVM_CAP_SPAPR_TCE:
+               r = 1;
+               break;
+       case KVM_CAP_PPC_SMT:
+               r = threads_per_core;
+               break;
+       case KVM_CAP_PPC_RMA:
+               r = 1;
+               /* PPC970 requires an RMA */
+               if (cpu_has_feature(CPU_FTR_ARCH_201))
+                       r = 2;
+               break;
+#endif
        default:
                r = 0;
                break;
@@ -211,7 +238,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                    struct kvm_userspace_memory_region *mem,
                                    int user_alloc)
 {
-       return 0;
+       return kvmppc_core_prepare_memory_region(kvm, mem);
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
@@ -219,7 +246,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                struct kvm_memory_slot old,
                int user_alloc)
 {
-       return;
+       kvmppc_core_commit_memory_region(kvm, mem);
 }
 
 
@@ -287,6 +314,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
        tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu);
        vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
+       vcpu->arch.dec_expires = ~(u64)0;
 
 #ifdef CONFIG_KVM_EXIT_TIMING
        mutex_init(&vcpu->arch.exit_timing_lock);
@@ -313,6 +341,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
 #endif
        kvmppc_core_vcpu_load(vcpu, cpu);
+       vcpu->cpu = smp_processor_id();
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -321,6 +350,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 #ifdef CONFIG_BOOKE
        vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
 #endif
+       vcpu->cpu = -1;
 }
 
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
@@ -492,15 +522,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                for (i = 0; i < 32; i++)
                        kvmppc_set_gpr(vcpu, i, gprs[i]);
                vcpu->arch.osi_needed = 0;
+       } else if (vcpu->arch.hcall_needed) {
+               int i;
+
+               kvmppc_set_gpr(vcpu, 3, run->papr_hcall.ret);
+               for (i = 0; i < 9; ++i)
+                       kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]);
+               vcpu->arch.hcall_needed = 0;
        }
 
        kvmppc_core_deliver_interrupts(vcpu);
 
-       local_irq_disable();
-       kvm_guest_enter();
-       r = __kvmppc_vcpu_run(run, vcpu);
-       kvm_guest_exit();
-       local_irq_enable();
+       r = kvmppc_vcpu_run(run, vcpu);
 
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
@@ -518,6 +551,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
        if (waitqueue_active(&vcpu->wq)) {
                wake_up_interruptible(&vcpu->wq);
                vcpu->stat.halt_wakeup++;
+       } else if (vcpu->cpu != -1) {
+               smp_send_reschedule(vcpu->cpu);
        }
 
        return 0;
@@ -633,6 +668,29 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
                break;
        }
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+       case KVM_CREATE_SPAPR_TCE: {
+               struct kvm_create_spapr_tce create_tce;
+               struct kvm *kvm = filp->private_data;
+
+               r = -EFAULT;
+               if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
+                       goto out;
+               r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
+               goto out;
+       }
+
+       case KVM_ALLOCATE_RMA: {
+               struct kvm *kvm = filp->private_data;
+               struct kvm_allocate_rma rma;
+
+               r = kvm_vm_ioctl_allocate_rma(kvm, &rma);
+               if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma)))
+                       r = -EFAULT;
+               break;
+       }
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
        default:
                r = -ENOTTY;
        }
index 319177d..07b6110 100644 (file)
@@ -56,15 +56,6 @@ static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type)
 {
        u64 old;
 
-       do_div(duration, tb_ticks_per_usec);
-       if (unlikely(duration > 0xFFFFFFFF)) {
-               printk(KERN_ERR"%s - duration too big -> overflow"
-                       " duration %lld type %d exit #%d\n",
-                       __func__, duration, type,
-                       vcpu->arch.timing_count_type[type]);
-               return;
-       }
-
        mutex_lock(&vcpu->arch.exit_timing_lock);
 
        vcpu->arch.timing_count_type[type]++;
index 3aca1b0..b135d3d 100644 (file)
@@ -103,7 +103,7 @@ TRACE_EVENT(kvm_gtlb_write,
  *                         Book3S trace points                           *
  *************************************************************************/
 
-#ifdef CONFIG_PPC_BOOK3S
+#ifdef CONFIG_KVM_BOOK3S_PR
 
 TRACE_EVENT(kvm_book3s_exit,
        TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
@@ -252,7 +252,7 @@ TRACE_EVENT(kvm_book3s_mmu_flush,
        ),
 
        TP_fast_assign(
-               __entry->count          = vcpu->arch.hpte_cache_count;
+               __entry->count          = to_book3s(vcpu)->hpte_cache_count;
                __entry->p1             = p1;
                __entry->p2             = p2;
                __entry->type           = type;
index dfd7648..90039bc 100644 (file)
@@ -37,7 +37,7 @@
 
 #define HPTE_LOCK_BIT 3
 
-static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
+DEFINE_RAW_SPINLOCK(native_tlbie_lock);
 
 static inline void __tlbie(unsigned long va, int psize, int ssize)
 {
@@ -51,7 +51,7 @@ static inline void __tlbie(unsigned long va, int psize, int ssize)
                va &= ~0xffful;
                va |= ssize << 8;
                asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
-                            : : "r" (va), "r"(0), "i" (CPU_FTR_HVMODE_206)
+                            : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
                             : "memory");
                break;
        default:
@@ -61,7 +61,7 @@ static inline void __tlbie(unsigned long va, int psize, int ssize)
                va |= ssize << 8;
                va |= 1; /* L */
                asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2)
-                            : : "r" (va), "r"(0), "i" (CPU_FTR_HVMODE_206)
+                            : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
                             : "memory");
                break;
        }
index 29c02f3..f519ee1 100644 (file)
@@ -167,7 +167,7 @@ BEGIN_FTR_SECTION
        std     r12,PACA_EXGEN+EX_R13(r13)
        EXCEPTION_PROLOG_ISERIES_1
 FTR_SECTION_ELSE
-       EXCEPTION_PROLOG_1(PACA_EXGEN)
+       EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0)
        EXCEPTION_PROLOG_ISERIES_1
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_SLB)
        b       data_access_common
index bae3fba..50271b5 100644 (file)
@@ -39,7 +39,7 @@
 label##_iSeries:                                                       \
        HMT_MEDIUM;                                                     \
        mtspr   SPRN_SPRG_SCRATCH0,r13; /* save r13 */                  \
-       EXCEPTION_PROLOG_1(area);                                       \
+       EXCEPTION_PROLOG_1(area, NOTEST, 0);                            \
        EXCEPTION_PROLOG_ISERIES_1;                                     \
        b       label##_common
 
@@ -48,7 +48,7 @@ label##_iSeries:                                                      \
 label##_iSeries:                                                       \
        HMT_MEDIUM;                                                     \
        mtspr   SPRN_SPRG_SCRATCH0,r13; /* save r13 */                  \
-       EXCEPTION_PROLOG_1(PACA_EXGEN);                                 \
+       EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0);                      \
        lbz     r10,PACASOFTIRQEN(r13);                                 \
        cmpwi   0,r10,0;                                                \
        beq-    label##_iSeries_masked;                                 \
index 1f15ad4..ba382b5 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/cpu.h>
 #include <linux/of.h>
 #include <linux/spinlock.h>
+#include <linux/module.h>
 
 #include <asm/prom.h>
 #include <asm/io.h>
@@ -24,6 +25,7 @@
 #include <asm/irq.h>
 #include <asm/errno.h>
 #include <asm/xics.h>
+#include <asm/kvm_ppc.h>
 
 struct icp_ipl {
        union {
@@ -139,6 +141,12 @@ static void icp_native_cause_ipi(int cpu, unsigned long data)
        icp_native_set_qirr(cpu, IPI_PRIORITY);
 }
 
+void xics_wake_cpu(int cpu)
+{
+       icp_native_set_qirr(cpu, IPI_PRIORITY);
+}
+EXPORT_SYMBOL_GPL(xics_wake_cpu);
+
 static irqreturn_t icp_native_ipi_action(int irq, void *dev_id)
 {
        int cpu = smp_processor_id();
@@ -185,6 +193,7 @@ static int __init icp_native_map_one_cpu(int hw_id, unsigned long addr,
        }
 
        icp_native_regs[cpu] = ioremap(addr, size);
+       kvmppc_set_xics_phys(cpu, addr);
        if (!icp_native_regs[cpu]) {
                pr_warning("icp_native: Failed ioremap for CPU %d, "
                           "interrupt server #0x%x, addr %#lx\n",
index 5ed8d64..0317a35 100644 (file)
@@ -1,15 +1,12 @@
 /*
  * Cryptographic API.
  *
- * s390 implementation of the SHA256 Secure Hash Algorithm.
+ * s390 implementation of the SHA256 and SHA224 Secure Hash Algorithm.
  *
  * s390 Version:
- *   Copyright IBM Corp. 2005,2007
+ *   Copyright IBM Corp. 2005,2011
  *   Author(s): Jan Glauber (jang@de.ibm.com)
  *
- * Derived from "crypto/sha256_generic.c"
- * and "arch/s390/crypto/sha1_s390.c"
- *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
  * Software Foundation; either version 2 of the License, or (at your option)
@@ -65,7 +62,7 @@ static int sha256_import(struct shash_desc *desc, const void *in)
        return 0;
 }
 
-static struct shash_alg alg = {
+static struct shash_alg sha256_alg = {
        .digestsize     =       SHA256_DIGEST_SIZE,
        .init           =       sha256_init,
        .update         =       s390_sha_update,
@@ -84,22 +81,69 @@ static struct shash_alg alg = {
        }
 };
 
-static int sha256_s390_init(void)
+static int sha224_init(struct shash_desc *desc)
 {
+       struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
+
+       sctx->state[0] = SHA224_H0;
+       sctx->state[1] = SHA224_H1;
+       sctx->state[2] = SHA224_H2;
+       sctx->state[3] = SHA224_H3;
+       sctx->state[4] = SHA224_H4;
+       sctx->state[5] = SHA224_H5;
+       sctx->state[6] = SHA224_H6;
+       sctx->state[7] = SHA224_H7;
+       sctx->count = 0;
+       sctx->func = KIMD_SHA_256;
+
+       return 0;
+}
+
+static struct shash_alg sha224_alg = {
+       .digestsize     =       SHA224_DIGEST_SIZE,
+       .init           =       sha224_init,
+       .update         =       s390_sha_update,
+       .final          =       s390_sha_final,
+       .export         =       sha256_export,
+       .import         =       sha256_import,
+       .descsize       =       sizeof(struct s390_sha_ctx),
+       .statesize      =       sizeof(struct sha256_state),
+       .base           =       {
+               .cra_name       =       "sha224",
+               .cra_driver_name=       "sha224-s390",
+               .cra_priority   =       CRYPT_S390_PRIORITY,
+               .cra_flags      =       CRYPTO_ALG_TYPE_SHASH,
+               .cra_blocksize  =       SHA224_BLOCK_SIZE,
+               .cra_module     =       THIS_MODULE,
+       }
+};
+
+static int __init sha256_s390_init(void)
+{
+       int ret;
+
        if (!crypt_s390_func_available(KIMD_SHA_256, CRYPT_S390_MSA))
                return -EOPNOTSUPP;
-
-       return crypto_register_shash(&alg);
+       ret = crypto_register_shash(&sha256_alg);
+       if (ret < 0)
+               goto out;
+       ret = crypto_register_shash(&sha224_alg);
+       if (ret < 0)
+               crypto_unregister_shash(&sha256_alg);
+out:
+       return ret;
 }
 
 static void __exit sha256_s390_fini(void)
 {
-       crypto_unregister_shash(&alg);
+       crypto_unregister_shash(&sha224_alg);
+       crypto_unregister_shash(&sha256_alg);
 }
 
 module_init(sha256_s390_init);
 module_exit(sha256_s390_fini);
 
 MODULE_ALIAS("sha256");
+MODULE_ALIAS("sha224");
 MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm");
+MODULE_DESCRIPTION("SHA256 and SHA224 Secure Hash Algorithm");
index b212754..a67e014 100644 (file)
@@ -529,6 +529,18 @@ menuconfig PARAVIRT_GUEST
 
 if PARAVIRT_GUEST
 
+config PARAVIRT_TIME_ACCOUNTING
+       bool "Paravirtual steal time accounting"
+       select PARAVIRT
+       default n
+       ---help---
+         Select this option to enable fine granularity task steal time
+         accounting. Time spent executing other tasks in parallel with
+         the current vCPU is discounted from the vCPU power. To account for
+         that, there can be a small performance impact.
+
+         If in doubt, say N here.
+
 source "arch/x86/xen/Kconfig"
 
 config KVM_CLOCK
index 7a6e68e..976aa64 100644 (file)
@@ -245,7 +245,7 @@ static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
        crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child)
                               & CRYPTO_TFM_RES_MASK);
 
-       return 0;
+       return err;
 }
 
 static int ghash_async_init_tfm(struct crypto_tfm *tfm)
index 0049211..6040d11 100644 (file)
@@ -229,7 +229,26 @@ struct read_cache {
        unsigned long end;
 };
 
-struct decode_cache {
+struct x86_emulate_ctxt {
+       struct x86_emulate_ops *ops;
+
+       /* Register state before/after emulation. */
+       unsigned long eflags;
+       unsigned long eip; /* eip before instruction emulation */
+       /* Emulated execution mode, represented by an X86EMUL_MODE value. */
+       int mode;
+
+       /* interruptibility state, as a result of execution of STI or MOV SS */
+       int interruptibility;
+
+       bool guest_mode; /* guest running a nested guest */
+       bool perm_ok; /* do not check permissions if true */
+       bool only_vendor_specific_insn;
+
+       bool have_exception;
+       struct x86_exception exception;
+
+       /* decode cache */
        u8 twobyte;
        u8 b;
        u8 intercept;
@@ -246,8 +265,6 @@ struct decode_cache {
        unsigned int d;
        int (*execute)(struct x86_emulate_ctxt *ctxt);
        int (*check_perm)(struct x86_emulate_ctxt *ctxt);
-       unsigned long regs[NR_VCPU_REGS];
-       unsigned long eip;
        /* modrm */
        u8 modrm;
        u8 modrm_mod;
@@ -255,34 +272,14 @@ struct decode_cache {
        u8 modrm_rm;
        u8 modrm_seg;
        bool rip_relative;
+       unsigned long _eip;
+       /* Fields above regs are cleared together. */
+       unsigned long regs[NR_VCPU_REGS];
        struct fetch_cache fetch;
        struct read_cache io_read;
        struct read_cache mem_read;
 };
 
-struct x86_emulate_ctxt {
-       struct x86_emulate_ops *ops;
-
-       /* Register state before/after emulation. */
-       unsigned long eflags;
-       unsigned long eip; /* eip before instruction emulation */
-       /* Emulated execution mode, represented by an X86EMUL_MODE value. */
-       int mode;
-
-       /* interruptibility state, as a result of execution of STI or MOV SS */
-       int interruptibility;
-
-       bool guest_mode; /* guest running a nested guest */
-       bool perm_ok; /* do not check permissions if true */
-       bool only_vendor_specific_insn;
-
-       bool have_exception;
-       struct x86_exception exception;
-
-       /* decode cache */
-       struct decode_cache decode;
-};
-
 /* Repeat String Operation Prefix */
 #define REPE_PREFIX    0xf3
 #define REPNE_PREFIX   0xf2
@@ -373,6 +370,5 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
                         u16 tss_selector, int reason,
                         bool has_error_code, u32 error_code);
-int emulate_int_real(struct x86_emulate_ctxt *ctxt,
-                    struct x86_emulate_ops *ops, int irq);
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
 #endif /* _ASM_X86_KVM_X86_EMULATE_H */
index d2ac8e2..dd51c83 100644 (file)
@@ -48,7 +48,7 @@
        (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
                          | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
                          | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
-                         | X86_CR4_OSXSAVE \
+                         | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \
                          | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -205,6 +205,7 @@ union kvm_mmu_page_role {
                unsigned invalid:1;
                unsigned nxe:1;
                unsigned cr0_wp:1;
+               unsigned smep_andnot_wp:1;
        };
 };
 
@@ -227,15 +228,17 @@ struct kvm_mmu_page {
         * in this shadow page.
         */
        DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
-       bool multimapped;         /* More than one parent_pte? */
        bool unsync;
        int root_count;          /* Currently serving as active root */
        unsigned int unsync_children;
-       union {
-               u64 *parent_pte;               /* !multimapped */
-               struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
-       };
+       unsigned long parent_ptes;      /* Reverse mapping for parent_pte */
        DECLARE_BITMAP(unsync_child_bitmap, 512);
+
+#ifdef CONFIG_X86_32
+       int clear_spte_count;
+#endif
+
+       struct rcu_head rcu;
 };
 
 struct kvm_pv_mmu_op_buffer {
@@ -269,8 +272,6 @@ struct kvm_mmu {
        gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
                            struct x86_exception *exception);
        gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
-       void (*prefetch_page)(struct kvm_vcpu *vcpu,
-                             struct kvm_mmu_page *page);
        int (*sync_page)(struct kvm_vcpu *vcpu,
                         struct kvm_mmu_page *sp);
        void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
@@ -346,8 +347,7 @@ struct kvm_vcpu_arch {
         * put it here to avoid allocation */
        struct kvm_pv_mmu_op_buffer mmu_op_buffer;
 
-       struct kvm_mmu_memory_cache mmu_pte_chain_cache;
-       struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
+       struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
        struct kvm_mmu_memory_cache mmu_page_cache;
        struct kvm_mmu_memory_cache mmu_page_header_cache;
 
@@ -393,6 +393,15 @@ struct kvm_vcpu_arch {
        unsigned int hw_tsc_khz;
        unsigned int time_offset;
        struct page *time_page;
+
+       struct {
+               u64 msr_val;
+               u64 last_steal;
+               u64 accum_steal;
+               struct gfn_to_hva_cache stime;
+               struct kvm_steal_time steal;
+       } st;
+
        u64 last_guest_tsc;
        u64 last_kernel_ns;
        u64 last_tsc_nsec;
@@ -419,6 +428,11 @@ struct kvm_vcpu_arch {
        u64 mcg_ctl;
        u64 *mce_banks;
 
+       /* Cache MMIO info */
+       u64 mmio_gva;
+       unsigned access;
+       gfn_t mmio_gfn;
+
        /* used for guest single stepping over the given code position */
        unsigned long singlestep_rip;
 
@@ -441,6 +455,7 @@ struct kvm_arch {
        unsigned int n_used_mmu_pages;
        unsigned int n_requested_mmu_pages;
        unsigned int n_max_mmu_pages;
+       unsigned int indirect_shadow_pages;
        atomic_t invlpg_counter;
        struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
        /*
@@ -477,6 +492,8 @@ struct kvm_arch {
        u64 hv_guest_os_id;
        u64 hv_hypercall;
 
+       atomic_t reader_counter;
+
        #ifdef CONFIG_KVM_MMU_AUDIT
        int audit_point;
        #endif
@@ -559,7 +576,7 @@ struct kvm_x86_ops {
        void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
        void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
        void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
-       void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
+       int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
        void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
        void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
        void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
@@ -636,7 +653,6 @@ void kvm_mmu_module_exit(void);
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
                u64 dirty_mask, u64 nx_mask, u64 x_mask);
 
@@ -830,11 +846,12 @@ enum {
 asmlinkage void kvm_spurious_fault(void);
 extern bool kvm_rebooting;
 
-#define __kvm_handle_fault_on_reboot(insn) \
+#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn)     \
        "666: " insn "\n\t" \
        "668: \n\t"                           \
        ".pushsection .fixup, \"ax\" \n" \
        "667: \n\t" \
+       cleanup_insn "\n\t"                   \
        "cmpb $0, kvm_rebooting \n\t"         \
        "jne 668b \n\t"                       \
        __ASM_SIZE(push) " $666b \n\t"        \
@@ -844,6 +861,9 @@ extern bool kvm_rebooting;
        _ASM_PTR " 666b, 667b \n\t" \
        ".popsection"
 
+#define __kvm_handle_fault_on_reboot(insn)             \
+       ____kvm_handle_fault_on_reboot(insn, "")        /* same fixup sequence with an empty cleanup_insn */
+
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
index a427bf7..734c376 100644 (file)
@@ -21,6 +21,7 @@
  */
 #define KVM_FEATURE_CLOCKSOURCE2        3
 #define KVM_FEATURE_ASYNC_PF           4
+#define KVM_FEATURE_STEAL_TIME         5
 
 /* The last 8 bits are used to indicate how to interpret the flags field
  * in pvclock structure. If no bits are set, all flags are ignored.
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
+#define KVM_MSR_ENABLED 1
 /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
 #define MSR_KVM_WALL_CLOCK_NEW  0x4b564d00
 #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
 #define MSR_KVM_ASYNC_PF_EN 0x4b564d02
+#define MSR_KVM_STEAL_TIME  0x4b564d03
+
+struct kvm_steal_time {                /* 8 + 4 + 4 + 12 * 4 = 64 bytes */
+       __u64 steal;            /* value handed back by the guest's kvm_steal_clock() */
+       __u32 version;          /* guest reader retries while this is odd or changes mid-read */
+       __u32 flags;
+       __u32 pad[12];          /* pads the record out to 64 bytes */
+};
+
+#define KVM_STEAL_ALIGNMENT_BITS 5
+#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) /* bits 63:6 */
+#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) /* bits 5:1; bit 0 is KVM_MSR_ENABLED */
 
 #define KVM_MAX_MMU_OP_BATCH           32
 
@@ -178,6 +192,7 @@ void __init kvm_guest_init(void);
 void kvm_async_pf_task_wait(u32 token);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
+extern void kvm_disable_steal_time(void);
 #else
 #define kvm_guest_init() do { } while (0)
 #define kvm_async_pf_task_wait(T) do {} while(0)
@@ -186,6 +201,11 @@ static inline u32 kvm_read_and_reset_pf_reason(void)
 {
        return 0;
 }
+
+static inline void kvm_disable_steal_time(void)        /* no-op stub used in the #else branch */
+{
+       return;
+}
 #endif
 
 #endif /* __KERNEL__ */
index d96bdb2..d52609a 100644 (file)
 #define MSR_IA32_VMX_VMCS_ENUM          0x0000048a
 #define MSR_IA32_VMX_PROCBASED_CTLS2    0x0000048b
 #define MSR_IA32_VMX_EPT_VPID_CAP       0x0000048c
+#define MSR_IA32_VMX_TRUE_PINBASED_CTLS  0x0000048d
+#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e
+#define MSR_IA32_VMX_TRUE_EXIT_CTLS      0x0000048f
+#define MSR_IA32_VMX_TRUE_ENTRY_CTLS     0x00000490
+
+/* VMX_BASIC bits and bitmasks */
+#define VMX_BASIC_VMCS_SIZE_SHIFT      32
+#define VMX_BASIC_64           0x0001000000000000LLU
+#define VMX_BASIC_MEM_TYPE_SHIFT       50
+#define VMX_BASIC_MEM_TYPE_MASK        0x003c000000000000LLU
+#define VMX_BASIC_MEM_TYPE_WB  6LLU
+#define VMX_BASIC_INOUT                0x0040000000000000LLU
 
 /* AMD-V MSRs */
 
index ebbc4d8..a7d2db9 100644 (file)
@@ -230,6 +230,15 @@ static inline unsigned long long paravirt_sched_clock(void)
        return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
 }
 
+struct jump_label_key;
+extern struct jump_label_key paravirt_steal_enabled;
+extern struct jump_label_key paravirt_steal_rq_enabled;
+
+static inline u64 paravirt_steal_clock(int cpu)        /* steal clock for @cpu, dispatched through pv_time_ops.steal_clock */
+{
+       return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu);
+}
+
 static inline unsigned long long paravirt_read_pmc(int counter)
 {
        return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
index 8288509..2c76521 100644 (file)
@@ -89,6 +89,7 @@ struct pv_lazy_ops {
 
 struct pv_time_ops {
        unsigned long long (*sched_clock)(void);
+       unsigned long long (*steal_clock)(int cpu);     /* invoked via paravirt_steal_clock(cpu) */
        unsigned long (*get_tsc_khz)(void);
 };
 
index 59ab4df..2dddb31 100644 (file)
@@ -59,6 +59,7 @@
 #define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */
 #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
 #define X86_CR4_VMXE   0x00002000 /* enable VMX virtualization */
+#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */
 #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
 #define X86_CR4_SMEP   0x00100000 /* enable SMEP support */
 
index 84471b8..2caf290 100644 (file)
@@ -132,6 +132,8 @@ enum vmcs_field {
        GUEST_IA32_PAT_HIGH             = 0x00002805,
        GUEST_IA32_EFER                 = 0x00002806,
        GUEST_IA32_EFER_HIGH            = 0x00002807,
+       GUEST_IA32_PERF_GLOBAL_CTRL     = 0x00002808,
+       GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809,
        GUEST_PDPTR0                    = 0x0000280a,
        GUEST_PDPTR0_HIGH               = 0x0000280b,
        GUEST_PDPTR1                    = 0x0000280c,
@@ -144,6 +146,8 @@ enum vmcs_field {
        HOST_IA32_PAT_HIGH              = 0x00002c01,
        HOST_IA32_EFER                  = 0x00002c02,
        HOST_IA32_EFER_HIGH             = 0x00002c03,
+       HOST_IA32_PERF_GLOBAL_CTRL      = 0x00002c04,
+       HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05,
        PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
        CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
        EXCEPTION_BITMAP                = 0x00004004,
@@ -426,4 +430,43 @@ struct vmx_msr_entry {
        u64 value;
 } __aligned(16);
 
+/*
+ * Exit Qualifications for entry failure during or after loading guest state
+ */
+#define ENTRY_FAIL_DEFAULT             0
+#define ENTRY_FAIL_PDPTE               2
+#define ENTRY_FAIL_NMI                 3
+#define ENTRY_FAIL_VMCS_LINK_PTR       4
+
+/*
+ * VM-instruction error numbers
+ */
+enum vm_instruction_error_number {     /* numbering has gaps: 14, 21 and 27 are not defined here */
+       VMXERR_VMCALL_IN_VMX_ROOT_OPERATION = 1,
+       VMXERR_VMCLEAR_INVALID_ADDRESS = 2,
+       VMXERR_VMCLEAR_VMXON_POINTER = 3,
+       VMXERR_VMLAUNCH_NONCLEAR_VMCS = 4,
+       VMXERR_VMRESUME_NONLAUNCHED_VMCS = 5,
+       VMXERR_VMRESUME_AFTER_VMXOFF = 6,
+       VMXERR_ENTRY_INVALID_CONTROL_FIELD = 7,
+       VMXERR_ENTRY_INVALID_HOST_STATE_FIELD = 8,
+       VMXERR_VMPTRLD_INVALID_ADDRESS = 9,
+       VMXERR_VMPTRLD_VMXON_POINTER = 10,
+       VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID = 11,
+       VMXERR_UNSUPPORTED_VMCS_COMPONENT = 12,
+       VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT = 13,
+       VMXERR_VMXON_IN_VMX_ROOT_OPERATION = 15,
+       VMXERR_ENTRY_INVALID_EXECUTIVE_VMCS_POINTER = 16,
+       VMXERR_ENTRY_NONLAUNCHED_EXECUTIVE_VMCS = 17,
+       VMXERR_ENTRY_EXECUTIVE_VMCS_POINTER_NOT_VMXON_POINTER = 18,
+       VMXERR_VMCALL_NONCLEAR_VMCS = 19,
+       VMXERR_VMCALL_INVALID_VM_EXIT_CONTROL_FIELDS = 20,
+       VMXERR_VMCALL_INCORRECT_MSEG_REVISION_ID = 22,
+       VMXERR_VMXOFF_UNDER_DUAL_MONITOR_TREATMENT_OF_SMIS_AND_SMM = 23,
+       VMXERR_VMCALL_INVALID_SMM_MONITOR_FEATURES = 24,
+       VMXERR_ENTRY_INVALID_VM_EXECUTION_CONTROL_FIELDS_IN_EXECUTIVE_VMCS = 25,
+       VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS = 26,
+       VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28,
+};
+
 #endif
index d240ea9..417777d 100644 (file)
@@ -39,6 +39,8 @@
 #include <linux/string.h>
 #include <linux/types.h>
 
+#include <trace/events/xen.h>
+
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
@@ -459,6 +461,8 @@ MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
 {
        mcl->op = __HYPERVISOR_fpu_taskswitch;
        mcl->args[0] = set;
+
+       trace_xen_mc_entry(mcl, 1);
 }
 
 static inline void
@@ -475,6 +479,8 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
                mcl->args[2] = new_val.pte >> 32;
                mcl->args[3] = flags;
        }
+
+       trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 3 : 4);
 }
 
 static inline void
@@ -485,6 +491,8 @@ MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd,
        mcl->args[0] = cmd;
        mcl->args[1] = (unsigned long)uop;
        mcl->args[2] = count;
+
+       trace_xen_mc_entry(mcl, 3);
 }
 
 static inline void
@@ -504,6 +512,8 @@ MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long v
                mcl->args[3] = flags;
                mcl->args[4] = domid;
        }
+
+       trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 4 : 5);
 }
 
 static inline void
@@ -520,6 +530,8 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
                mcl->args[2] = desc.a;
                mcl->args[3] = desc.b;
        }
+
+       trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 2 : 4);
 }
 
 static inline void
@@ -528,6 +540,8 @@ MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg)
        mcl->op = __HYPERVISOR_memory_op;
        mcl->args[0] = cmd;
        mcl->args[1] = (unsigned long)arg;
+
+       trace_xen_mc_entry(mcl, 2);
 }
 
 static inline void
@@ -539,6 +553,8 @@ MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
        mcl->args[1] = count;
        mcl->args[2] = (unsigned long)success_count;
        mcl->args[3] = domid;
+
+       trace_xen_mc_entry(mcl, 4);
 }
 
 static inline void
@@ -550,6 +566,8 @@ MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
        mcl->args[1] = count;
        mcl->args[2] = (unsigned long)success_count;
        mcl->args[3] = domid;
+
+       trace_xen_mc_entry(mcl, 4);
 }
 
 static inline void
@@ -558,6 +576,8 @@ MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries)
        mcl->op = __HYPERVISOR_set_gdt;
        mcl->args[0] = (unsigned long)frames;
        mcl->args[1] = entries;
+
+       trace_xen_mc_entry(mcl, 2);
 }
 
 static inline void
@@ -567,6 +587,8 @@ MULTI_stack_switch(struct multicall_entry *mcl,
        mcl->op = __HYPERVISOR_stack_switch;
        mcl->args[0] = ss;
        mcl->args[1] = esp;
+
+       trace_xen_mc_entry(mcl, 2);
 }
 
 #endif /* _ASM_X86_XEN_HYPERCALL_H */
diff --git a/arch/x86/include/asm/xen/trace_types.h b/arch/x86/include/asm/xen/trace_types.h
new file mode 100644 (file)
index 0000000..21e1874
--- /dev/null
@@ -0,0 +1,18 @@
+#ifndef _ASM_XEN_TRACE_TYPES_H
+#define _ASM_XEN_TRACE_TYPES_H
+
+enum xen_mc_flush_reason {     /* NOTE(review): presumably why a multicall batch was flushed -- confirm against the tracepoint users */
+       XEN_MC_FL_NONE,         /* explicit flush */
+       XEN_MC_FL_BATCH,        /* out of hypercall space */
+       XEN_MC_FL_ARGS,         /* out of argument space */
+       XEN_MC_FL_CALLBACK,     /* out of callback space */
+};
+
+enum xen_mc_extend_args {      /* NOTE(review): presumably the outcome of trying to extend a multicall's argument area -- confirm */
+       XEN_MC_XE_OK,
+       XEN_MC_XE_BAD_OP,
+       XEN_MC_XE_NO_SPACE
+};
+typedef void (*xen_mc_callback_fn_t)(void *);  /* callback taking a single opaque pointer */
+
+#endif /* _ASM_XEN_TRACE_TYPES_H */
index 33c07b0..a9c2116 100644 (file)
@@ -51,6 +51,15 @@ static int parse_no_kvmapf(char *arg)
 
 early_param("no-kvmapf", parse_no_kvmapf);
 
+static int steal_acc = 1;      /* cleared by "no-steal-acc"; consulted by activate_jump_labels() */
+static int parse_no_stealacc(char *arg)        /* early_param handler; @arg is ignored */
+{
+        steal_acc = 0;
+        return 0;
+}
+
+early_param("no-steal-acc", parse_no_stealacc);
+
 struct kvm_para_state {
        u8 mmu_queue[MMU_QUEUE_SIZE];
        int mmu_queue_len;
@@ -58,6 +67,8 @@ struct kvm_para_state {
 
 static DEFINE_PER_CPU(struct kvm_para_state, para_state);
 static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
+static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
+static int has_steal_clock = 0;
 
 static struct kvm_para_state *kvm_para_state(void)
 {
@@ -441,6 +452,21 @@ static void __init paravirt_ops_setup(void)
 #endif
 }
 
+static void kvm_register_steal_time(void)      /* point MSR_KVM_STEAL_TIME at the current CPU's steal_time record */
+{
+       int cpu = smp_processor_id();
+       struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
+
+       if (!has_steal_clock)   /* the caller, kvm_guest_cpu_init(), tests this as well */
+               return;
+
+       memset(st, 0, sizeof(*st));     /* start from an all-zero record */
+
+       wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED));       /* physical address with bit 0 (enable) set */
+       printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n",
+               cpu, __pa(st));
+}
+
 void __cpuinit kvm_guest_cpu_init(void)
 {
        if (!kvm_para_available())
@@ -457,6 +483,9 @@ void __cpuinit kvm_guest_cpu_init(void)
                printk(KERN_INFO"KVM setup async PF for cpu %d\n",
                       smp_processor_id());
        }
+
+       if (has_steal_clock)
+               kvm_register_steal_time();
 }
 
 static void kvm_pv_disable_apf(void *unused)
@@ -483,6 +512,31 @@ static struct notifier_block kvm_pv_reboot_nb = {
        .notifier_call = kvm_pv_reboot_notify,
 };
 
+static u64 kvm_steal_clock(int cpu)    /* installed as pv_time_ops.steal_clock; returns @cpu's steal field */
+{
+       u64 steal;
+       struct kvm_steal_time *src;
+       int version;
+
+       src = &per_cpu(steal_time, cpu);
+       do {
+               version = src->version;
+               rmb();          /* order the version load before the steal load */
+               steal = src->steal;
+               rmb();          /* order the steal load before re-reading version */
+       } while ((version & 1) || (version != src->version));   /* retry if version was odd or changed during the read */
+
+       return steal;
+}
+void kvm_disable_steal_time(void)      /* write 0 to MSR_KVM_STEAL_TIME, which clears the KVM_MSR_ENABLED bit */
+{
+       if (!has_steal_clock)   /* kvm_register_steal_time() never wrote the MSR in this case */
+               return;
+
+       wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
+}
+
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
@@ -500,6 +554,7 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)
 
 static void kvm_guest_cpu_offline(void *dummy)
 {
+       kvm_disable_steal_time();       /* clear MSR_KVM_STEAL_TIME before the async-PF teardown below */
        kvm_pv_disable_apf(NULL);
        apf_task_wake_all();
 }
@@ -548,6 +603,11 @@ void __init kvm_guest_init(void)
        if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
                x86_init.irqs.trap_init = kvm_apf_trap_init;
 
+       if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+               has_steal_clock = 1;
+               pv_time_ops.steal_clock = kvm_steal_clock;
+       }
+
 #ifdef CONFIG_SMP
        smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
        register_cpu_notifier(&kvm_cpu_notifier);
@@ -555,3 +615,15 @@ void __init kvm_guest_init(void)
        kvm_guest_cpu_init();
 #endif
 }
+
+static __init int activate_jump_labels(void)   /* arch_initcall: enable the steal-time jump labels; always returns 0 */
+{
+       if (has_steal_clock) {  /* set in kvm_guest_init() when KVM_FEATURE_STEAL_TIME is advertised */
+               jump_label_inc(&paravirt_steal_enabled);
+               if (steal_acc)  /* still 1 unless "no-steal-acc" was given */
+                       jump_label_inc(&paravirt_steal_rq_enabled);
+       }
+
+       return 0;
+}
+arch_initcall(activate_jump_labels);
index 6389a6b..c1a0188 100644 (file)
@@ -160,6 +160,7 @@ static void __cpuinit kvm_setup_secondary_clock(void)
 static void kvm_crash_shutdown(struct pt_regs *regs)
 {
        native_write_msr(msr_kvm_system_time, 0, 0);
+       kvm_disable_steal_time();       /* also clear MSR_KVM_STEAL_TIME before the native crash shutdown */
        native_machine_crash_shutdown(regs);
 }
 #endif
@@ -167,6 +168,7 @@ static void kvm_crash_shutdown(struct pt_regs *regs)
 static void kvm_shutdown(void)
 {
        native_write_msr(msr_kvm_system_time, 0, 0);
+       kvm_disable_steal_time();       /* also clear MSR_KVM_STEAL_TIME before the native shutdown */
        native_machine_shutdown();
 }
 
index 869e1ae..613a793 100644 (file)
@@ -202,6 +202,14 @@ static void native_flush_tlb_single(unsigned long addr)
        __native_flush_tlb_single(addr);
 }
 
+struct jump_label_key paravirt_steal_enabled;
+struct jump_label_key paravirt_steal_rq_enabled;
+
+static u64 native_steal_clock(int cpu) /* default pv_time_ops.steal_clock: ignores @cpu and reports 0 */
+{
+       return 0;
+}
+
 /* These are in entry.S */
 extern void native_iret(void);
 extern void native_irq_enable_sysexit(void);
@@ -307,6 +315,7 @@ struct pv_init_ops pv_init_ops = {
 
 struct pv_time_ops pv_time_ops = {
        .sched_clock = native_sched_clock,
+       .steal_clock = native_steal_clock,      /* default; kvm_guest_init() may replace it with kvm_steal_clock */
 };
 
 struct pv_irq_ops pv_irq_ops = {
index 65cf823..988724b 100644 (file)
@@ -31,6 +31,7 @@ config KVM
        select KVM_ASYNC_PF
        select USER_RETURN_NOTIFIER
        select KVM_MMIO
+       select TASK_DELAY_ACCT
        ---help---
          Support hosting fully virtualized guest machines using hardware
          virtualization extensions.  You will need a fairly recent
index adc9867..6f08bc9 100644 (file)
@@ -407,76 +407,59 @@ struct gprefix {
                }                                                       \
        } while (0)
 
-/* Fetch next part of the instruction being emulated. */
-#define insn_fetch(_type, _size, _eip)                                  \
-({     unsigned long _x;                                               \
-       rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size));            \
-       if (rc != X86EMUL_CONTINUE)                                     \
-               goto done;                                              \
-       (_eip) += (_size);                                              \
-       (_type)_x;                                                      \
-})
-
-#define insn_fetch_arr(_arr, _size, _eip)                              \
-({     rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size));           \
-       if (rc != X86EMUL_CONTINUE)                                     \
-               goto done;                                              \
-       (_eip) += (_size);                                              \
-})
-
 static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
                                     enum x86_intercept intercept,
                                     enum x86_intercept_stage stage)
 {
        struct x86_instruction_info info = {
                .intercept  = intercept,
-               .rep_prefix = ctxt->decode.rep_prefix,
-               .modrm_mod  = ctxt->decode.modrm_mod,
-               .modrm_reg  = ctxt->decode.modrm_reg,
-               .modrm_rm   = ctxt->decode.modrm_rm,
-               .src_val    = ctxt->decode.src.val64,
-               .src_bytes  = ctxt->decode.src.bytes,
-               .dst_bytes  = ctxt->decode.dst.bytes,
-               .ad_bytes   = ctxt->decode.ad_bytes,
+               .rep_prefix = ctxt->rep_prefix, /* decode fields are now read straight from ctxt, not ctxt->decode */
+               .modrm_mod  = ctxt->modrm_mod,
+               .modrm_reg  = ctxt->modrm_reg,
+               .modrm_rm   = ctxt->modrm_rm,
+               .src_val    = ctxt->src.val64,
+               .src_bytes  = ctxt->src.bytes,
+               .dst_bytes  = ctxt->dst.bytes,
+               .ad_bytes   = ctxt->ad_bytes,
                .next_rip   = ctxt->eip,
        };
 
        return ctxt->ops->intercept(ctxt, &info, stage);
 }
 
-static inline unsigned long ad_mask(struct decode_cache *c)
+static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)     /* callers in view only use this when ad_bytes != sizeof(unsigned long) */
 {
-       return (1UL << (c->ad_bytes << 3)) - 1;
+       return (1UL << (ctxt->ad_bytes << 3)) - 1;      /* mask with the low ad_bytes * 8 bits set */
 }
 
 /* Access/update address held in a register, based on addressing mode. */
 static inline unsigned long
-address_mask(struct decode_cache *c, unsigned long reg)
+address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
 {
-       if (c->ad_bytes == sizeof(unsigned long))
+       if (ctxt->ad_bytes == sizeof(unsigned long))    /* full-width address: nothing to mask */
                return reg;
        else
-               return reg & ad_mask(c);
+               return reg & ad_mask(ctxt);     /* keep only the low ad_bytes * 8 bits */
 }
 
 static inline unsigned long
-register_address(struct decode_cache *c, unsigned long reg)
+register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg)
 {
-       return address_mask(c, reg);
+       return address_mask(ctxt, reg); /* plain wrapper around address_mask() */
 }
 
 static inline void
-register_address_increment(struct decode_cache *c, unsigned long *reg, int inc)
+register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc)
 {
-       if (c->ad_bytes == sizeof(unsigned long))
+       if (ctxt->ad_bytes == sizeof(unsigned long))    /* full-width address: ordinary addition */
                *reg += inc;
        else
-               *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c));
+               *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt));        /* add within the masked low bits, keep the upper bits of *reg */
 }
 
-static inline void jmp_rel(struct decode_cache *c, int rel)
+static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
 {
-       register_address_increment(c, &c->eip, rel);
+       register_address_increment(ctxt, &ctxt->_eip, rel);     /* adds rel to ctxt->_eip, subject to the address-size masking */
 }
 
 static u32 desc_limit_scaled(struct desc_struct *desc)
@@ -486,28 +469,26 @@ static u32 desc_limit_scaled(struct desc_struct *desc)
        return desc->g ? (limit << 12) | 0xfff : limit;
 }
 
-static void set_seg_override(struct decode_cache *c, int seg)
+static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg)   /* record @seg as the override read back by seg_override() */
 {
-       c->has_seg_override = true;
-       c->seg_override = seg;
+       ctxt->has_seg_override = true;
+       ctxt->seg_override = seg;
 }
 }
 
-static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
-                             struct x86_emulate_ops *ops, int seg)
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)  /* the ops table is now taken from ctxt->ops */
 {
        if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
                return 0;
 
-       return ops->get_cached_segment_base(ctxt, seg);
+       return ctxt->ops->get_cached_segment_base(ctxt, seg);
 }
 
-static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
-                            struct decode_cache *c)
+static unsigned seg_override(struct x86_emulate_ctxt *ctxt)
 {
-       if (!c->has_seg_override)
+       if (!ctxt->has_seg_override)    /* no override recorded: report 0 */
                return 0;
 
-       return c->seg_override;
+       return ctxt->seg_override;      /* value stored by set_seg_override() */
 }
 
 static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
@@ -579,7 +560,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
                     unsigned size, bool write, bool fetch,
                     ulong *linear)
 {
-       struct decode_cache *c = &ctxt->decode;
        struct desc_struct desc;
        bool usable;
        ulong la;
@@ -587,7 +567,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
        u16 sel;
        unsigned cpl, rpl;
 
-       la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
+       la = seg_base(ctxt, addr.seg) + addr.ea;
        switch (ctxt->mode) {
        case X86EMUL_MODE_REAL:
                break;
@@ -637,7 +617,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
                }
                break;
        }
-       if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8)
+       if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
                la &= (u32)-1;
        *linear = la;
        return X86EMUL_CONTINUE;
@@ -671,11 +651,10 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
        return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
 }
 
-static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
-                             struct x86_emulate_ops *ops,
+static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt,
                              unsigned long eip, u8 *dest)
 {
-       struct fetch_cache *fc = &ctxt->decode.fetch;
+       struct fetch_cache *fc = &ctxt->fetch;
        int rc;
        int size, cur_size;
 
@@ -687,8 +666,8 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
                rc = __linearize(ctxt, addr, size, false, true, &linear);
                if (rc != X86EMUL_CONTINUE)
                        return rc;
-               rc = ops->fetch(ctxt, linear, fc->data + cur_size,
-                               size, &ctxt->exception);
+               rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
+                                     size, &ctxt->exception);
                if (rc != X86EMUL_CONTINUE)
                        return rc;
                fc->end += size;
@@ -698,7 +677,6 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
 }
 
 static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
-                        struct x86_emulate_ops *ops,
                         unsigned long eip, void *dest, unsigned size)
 {
        int rc;
@@ -707,13 +685,30 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
        if (eip + size - ctxt->eip > 15)
                return X86EMUL_UNHANDLEABLE;
        while (size--) {
-               rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
+               rc = do_insn_fetch_byte(ctxt, eip++, dest++);
                if (rc != X86EMUL_CONTINUE)
                        return rc;
        }
        return X86EMUL_CONTINUE;
 }
 
+/* Fetch next part of the instruction being emulated; both macros need "ctxt", "rc" and a "done" label in the calling function and advance _eip on success. */
+#define insn_fetch(_type, _size, _eip)                                 \
+({     unsigned long _x;                                               \
+       rc = do_insn_fetch(ctxt, (_eip), &_x, (_size));                 \
+       if (rc != X86EMUL_CONTINUE)                                     \
+               goto done;                                              \
+       (_eip) += (_size);                                              \
+       (_type)_x;                                                      \
+})
+
+#define insn_fetch_arr(_arr, _size, _eip)                              \
+({     rc = do_insn_fetch(ctxt, (_eip), _arr, (_size));                \
+       if (rc != X86EMUL_CONTINUE)                                     \
+               goto done;                                              \
+       (_eip) += (_size);                                              \
+})
+
 /*
  * Given the 'reg' portion of a ModRM byte, and a register block, return a
  * pointer into the block that addresses the relevant register.
@@ -857,16 +852,15 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
 
 static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
                                    struct operand *op,
-                                   struct decode_cache *c,
                                    int inhibit_bytereg)
 {
-       unsigned reg = c->modrm_reg;
-       int highbyte_regs = c->rex_prefix == 0;
+       unsigned reg = ctxt->modrm_reg;
+       int highbyte_regs = ctxt->rex_prefix == 0;
 
-       if (!(c->d & ModRM))
-               reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
+       if (!(ctxt->d & ModRM))
+               reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
 
-       if (c->d & Sse) {
+       if (ctxt->d & Sse) {
                op->type = OP_XMM;
                op->bytes = 16;
                op->addr.xmm = reg;
@@ -875,49 +869,47 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
        }
 
        op->type = OP_REG;
-       if ((c->d & ByteOp) && !inhibit_bytereg) {
-               op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
+       if ((ctxt->d & ByteOp) && !inhibit_bytereg) {
+               op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
                op->bytes = 1;
        } else {
-               op->addr.reg = decode_register(reg, c->regs, 0);
-               op->bytes = c->op_bytes;
+               op->addr.reg = decode_register(reg, ctxt->regs, 0);
+               op->bytes = ctxt->op_bytes;
        }
        fetch_register_operand(op);
        op->orig_val = op->val;
 }
 
 static int decode_modrm(struct x86_emulate_ctxt *ctxt,
-                       struct x86_emulate_ops *ops,
                        struct operand *op)
 {
-       struct decode_cache *c = &ctxt->decode;
        u8 sib;
        int index_reg = 0, base_reg = 0, scale;
        int rc = X86EMUL_CONTINUE;
        ulong modrm_ea = 0;
 
-       if (c->rex_prefix) {
-               c->modrm_reg = (c->rex_prefix & 4) << 1;        /* REX.R */
-               index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
-               c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */
+       if (ctxt->rex_prefix) {
+               ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1;  /* REX.R */
+               index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */
+               ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
        }
 
-       c->modrm = insn_fetch(u8, 1, c->eip);
-       c->modrm_mod |= (c->modrm & 0xc0) >> 6;
-       c->modrm_reg |= (c->modrm & 0x38) >> 3;
-       c->modrm_rm |= (c->modrm & 0x07);
-       c->modrm_seg = VCPU_SREG_DS;
+       ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+       ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
+       ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
+       ctxt->modrm_rm |= (ctxt->modrm & 0x07);
+       ctxt->modrm_seg = VCPU_SREG_DS;
 
-       if (c->modrm_mod == 3) {
+       if (ctxt->modrm_mod == 3) {
                op->type = OP_REG;
-               op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-               op->addr.reg = decode_register(c->modrm_rm,
-                                              c->regs, c->d & ByteOp);
-               if (c->d & Sse) {
+               op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+               op->addr.reg = decode_register(ctxt->modrm_rm,
+                                              ctxt->regs, ctxt->d & ByteOp);
+               if (ctxt->d & Sse) {
                        op->type = OP_XMM;
                        op->bytes = 16;
-                       op->addr.xmm = c->modrm_rm;
-                       read_sse_reg(ctxt, &op->vec_val, c->modrm_rm);
+                       op->addr.xmm = ctxt->modrm_rm;
+                       read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
                        return rc;
                }
                fetch_register_operand(op);
@@ -926,26 +918,26 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 
        op->type = OP_MEM;
 
-       if (c->ad_bytes == 2) {
-               unsigned bx = c->regs[VCPU_REGS_RBX];
-               unsigned bp = c->regs[VCPU_REGS_RBP];
-               unsigned si = c->regs[VCPU_REGS_RSI];
-               unsigned di = c->regs[VCPU_REGS_RDI];
+       if (ctxt->ad_bytes == 2) {
+               unsigned bx = ctxt->regs[VCPU_REGS_RBX];
+               unsigned bp = ctxt->regs[VCPU_REGS_RBP];
+               unsigned si = ctxt->regs[VCPU_REGS_RSI];
+               unsigned di = ctxt->regs[VCPU_REGS_RDI];
 
                /* 16-bit ModR/M decode. */
-               switch (c->modrm_mod) {
+               switch (ctxt->modrm_mod) {
                case 0:
-                       if (c->modrm_rm == 6)
-                               modrm_ea += insn_fetch(u16, 2, c->eip);
+                       if (ctxt->modrm_rm == 6)
+                               modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
                        break;
                case 1:
-                       modrm_ea += insn_fetch(s8, 1, c->eip);
+                       modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
                        break;
                case 2:
-                       modrm_ea += insn_fetch(u16, 2, c->eip);
+                       modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
                        break;
                }
-               switch (c->modrm_rm) {
+               switch (ctxt->modrm_rm) {
                case 0:
                        modrm_ea += bx + si;
                        break;
@@ -965,46 +957,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
                        modrm_ea += di;
                        break;
                case 6:
-                       if (c->modrm_mod != 0)
+                       if (ctxt->modrm_mod != 0)
                                modrm_ea += bp;
                        break;
                case 7:
                        modrm_ea += bx;
                        break;
                }
-               if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
-                   (c->modrm_rm == 6 && c->modrm_mod != 0))
-                       c->modrm_seg = VCPU_SREG_SS;
+               if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 ||
+                   (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0))
+                       ctxt->modrm_seg = VCPU_SREG_SS;
                modrm_ea = (u16)modrm_ea;
        } else {
                /* 32/64-bit ModR/M decode. */
-               if ((c->modrm_rm & 7) == 4) {
-                       sib = insn_fetch(u8, 1, c->eip);
+               if ((ctxt->modrm_rm & 7) == 4) {
+                       sib = insn_fetch(u8, 1, ctxt->_eip);
                        index_reg |= (sib >> 3) & 7;
                        base_reg |= sib & 7;
                        scale = sib >> 6;
 
-                       if ((base_reg & 7) == 5 && c->modrm_mod == 0)
-                               modrm_ea += insn_fetch(s32, 4, c->eip);
+                       if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
+                               modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
                        else
-                               modrm_ea += c->regs[base_reg];
+                               modrm_ea += ctxt->regs[base_reg];
                        if (index_reg != 4)
-                               modrm_ea += c->regs[index_reg] << scale;
-               } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
+                               modrm_ea += ctxt->regs[index_reg] << scale;
+               } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
                        if (ctxt->mode == X86EMUL_MODE_PROT64)
-                               c->rip_relative = 1;
+                               ctxt->rip_relative = 1;
                } else
-                       modrm_ea += c->regs[c->modrm_rm];
-               switch (c->modrm_mod) {
+                       modrm_ea += ctxt->regs[ctxt->modrm_rm];
+               switch (ctxt->modrm_mod) {
                case 0:
-                       if (c->modrm_rm == 5)
-                               modrm_ea += insn_fetch(s32, 4, c->eip);
+                       if (ctxt->modrm_rm == 5)
+                               modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
                        break;
                case 1:
-                       modrm_ea += insn_fetch(s8, 1, c->eip);
+                       modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
                        break;
                case 2:
-                       modrm_ea += insn_fetch(s32, 4, c->eip);
+                       modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
                        break;
                }
        }
@@ -1014,53 +1006,50 @@ done:
 }
 
 static int decode_abs(struct x86_emulate_ctxt *ctxt,
-                     struct x86_emulate_ops *ops,
                      struct operand *op)
 {
-       struct decode_cache *c = &ctxt->decode;
        int rc = X86EMUL_CONTINUE;
 
        op->type = OP_MEM;
-       switch (c->ad_bytes) {
+       switch (ctxt->ad_bytes) {
        case 2:
-               op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
+               op->addr.mem.ea = insn_fetch(u16, 2, ctxt->_eip);
                break;
        case 4:
-               op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
+               op->addr.mem.ea = insn_fetch(u32, 4, ctxt->_eip);
                break;
        case 8:
-               op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
+               op->addr.mem.ea = insn_fetch(u64, 8, ctxt->_eip);
                break;
        }
 done:
        return rc;
 }
 
-static void fetch_bit_operand(struct decode_cache *c)
+static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
 {
        long sv = 0, mask;
 
-       if (c->dst.type == OP_MEM && c->src.type == OP_REG) {
-               mask = ~(c->dst.bytes * 8 - 1);
+       if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) {
+               mask = ~(ctxt->dst.bytes * 8 - 1);
 
-               if (c->src.bytes == 2)
-                       sv = (s16)c->src.val & (s16)mask;
-               else if (c->src.bytes == 4)
-                       sv = (s32)c->src.val & (s32)mask;
+               if (ctxt->src.bytes == 2)
+                       sv = (s16)ctxt->src.val & (s16)mask;
+               else if (ctxt->src.bytes == 4)
+                       sv = (s32)ctxt->src.val & (s32)mask;
 
-               c->dst.addr.mem.ea += (sv >> 3);
+               ctxt->dst.addr.mem.ea += (sv >> 3);
        }
 
        /* only subword offset */
-       c->src.val &= (c->dst.bytes << 3) - 1;
+       ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
 }
 
 static int read_emulated(struct x86_emulate_ctxt *ctxt,
-                        struct x86_emulate_ops *ops,
                         unsigned long addr, void *dest, unsigned size)
 {
        int rc;
-       struct read_cache *mc = &ctxt->decode.mem_read;
+       struct read_cache *mc = &ctxt->mem_read;
 
        while (size) {
                int n = min(size, 8u);
@@ -1068,8 +1057,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
                if (mc->pos < mc->end)
                        goto read_cached;
 
-               rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
-                                       &ctxt->exception);
+               rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
+                                             &ctxt->exception);
                if (rc != X86EMUL_CONTINUE)
                        return rc;
                mc->end += n;
@@ -1094,7 +1083,7 @@ static int segmented_read(struct x86_emulate_ctxt *ctxt,
        rc = linearize(ctxt, addr, size, false, &linear);
        if (rc != X86EMUL_CONTINUE)
                return rc;
-       return read_emulated(ctxt, ctxt->ops, linear, data, size);
+       return read_emulated(ctxt, linear, data, size);
 }
 
 static int segmented_write(struct x86_emulate_ctxt *ctxt,
@@ -1128,26 +1117,24 @@ static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
 }
 
 static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
-                          struct x86_emulate_ops *ops,
                           unsigned int size, unsigned short port,
                           void *dest)
 {
-       struct read_cache *rc = &ctxt->decode.io_read;
+       struct read_cache *rc = &ctxt->io_read;
 
        if (rc->pos == rc->end) { /* refill pio read ahead */
-               struct decode_cache *c = &ctxt->decode;
                unsigned int in_page, n;
-               unsigned int count = c->rep_prefix ?
-                       address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
+               unsigned int count = ctxt->rep_prefix ?
+                       address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1;
                in_page = (ctxt->eflags & EFLG_DF) ?
-                       offset_in_page(c->regs[VCPU_REGS_RDI]) :
-                       PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
+                       offset_in_page(ctxt->regs[VCPU_REGS_RDI]) :
+                       PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]);
                n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
                        count);
                if (n == 0)
                        n = 1;
                rc->pos = rc->end = 0;
-               if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n))
+               if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n))
                        return 0;
                rc->end = n * size;
        }
@@ -1158,9 +1145,10 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 }
 
 static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
-                                    struct x86_emulate_ops *ops,
                                     u16 selector, struct desc_ptr *dt)
 {
+       struct x86_emulate_ops *ops = ctxt->ops;
+
        if (selector & 1 << 2) {
                struct desc_struct desc;
                u16 sel;
@@ -1177,48 +1165,42 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
 
 /* allowed just for 8 bytes segments */
 static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-                                  struct x86_emulate_ops *ops,
                                   u16 selector, struct desc_struct *desc)
 {
        struct desc_ptr dt;
        u16 index = selector >> 3;
-       int ret;
        ulong addr;
 
-       get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+       get_descriptor_table_ptr(ctxt, selector, &dt);
 
        if (dt.size < index * 8 + 7)
                return emulate_gp(ctxt, selector & 0xfffc);
-       addr = dt.address + index * 8;
-       ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
 
-       return ret;
+       addr = dt.address + index * 8;
+       return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
+                                  &ctxt->exception);
 }
 
 /* allowed just for 8 bytes segments */
 static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-                                   struct x86_emulate_ops *ops,
                                    u16 selector, struct desc_struct *desc)
 {
        struct desc_ptr dt;
        u16 index = selector >> 3;
        ulong addr;
-       int ret;
 
-       get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+       get_descriptor_table_ptr(ctxt, selector, &dt);
 
        if (dt.size < index * 8 + 7)
                return emulate_gp(ctxt, selector & 0xfffc);
 
        addr = dt.address + index * 8;
-       ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
-
-       return ret;
+       return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc,
+                                   &ctxt->exception);
 }
 
 /* Does not support long mode */
 static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-                                  struct x86_emulate_ops *ops,
                                   u16 selector, int seg)
 {
        struct desc_struct seg_desc;
@@ -1253,7 +1235,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
        if (null_selector) /* for NULL selector skip all following checks */
                goto load;
 
-       ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
+       ret = read_segment_descriptor(ctxt, selector, &seg_desc);
        if (ret != X86EMUL_CONTINUE)
                return ret;
 
@@ -1271,7 +1253,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 
        rpl = selector & 3;
        dpl = seg_desc.dpl;
-       cpl = ops->cpl(ctxt);
+       cpl = ctxt->ops->cpl(ctxt);
 
        switch (seg) {
        case VCPU_SREG_SS:
@@ -1322,12 +1304,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
        if (seg_desc.s) {
                /* mark segment as accessed */
                seg_desc.type |= 1;
-               ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
+               ret = write_segment_descriptor(ctxt, selector, &seg_desc);
                if (ret != X86EMUL_CONTINUE)
                        return ret;
        }
 load:
-       ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
+       ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
        return X86EMUL_CONTINUE;
 exception:
        emulate_exception(ctxt, err_vec, err_code, true);
@@ -1356,29 +1338,28 @@ static void write_register_operand(struct operand *op)
 static int writeback(struct x86_emulate_ctxt *ctxt)
 {
        int rc;
-       struct decode_cache *c = &ctxt->decode;
 
-       switch (c->dst.type) {
+       switch (ctxt->dst.type) {
        case OP_REG:
-               write_register_operand(&c->dst);
+               write_register_operand(&ctxt->dst);
                break;
        case OP_MEM:
-               if (c->lock_prefix)
+               if (ctxt->lock_prefix)
                        rc = segmented_cmpxchg(ctxt,
-                                              c->dst.addr.mem,
-                                              &c->dst.orig_val,
-                                              &c->dst.val,
-                                              c->dst.bytes);
+                                              ctxt->dst.addr.mem,
+                                              &ctxt->dst.orig_val,
+                                              &ctxt->dst.val,
+                                              ctxt->dst.bytes);
                else
                        rc = segmented_write(ctxt,
-                                            c->dst.addr.mem,
-                                            &c->dst.val,
-                                            c->dst.bytes);
+                                            ctxt->dst.addr.mem,
+                                            &ctxt->dst.val,
+                                            ctxt->dst.bytes);
                if (rc != X86EMUL_CONTINUE)
                        return rc;
                break;
        case OP_XMM:
-               write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm);
+               write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
                break;
        case OP_NONE:
                /* no writeback */
@@ -1391,50 +1372,45 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
 
 static int em_push(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
        struct segmented_address addr;
 
-       register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
-       addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+       register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes);
+       addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
        addr.seg = VCPU_SREG_SS;
 
        /* Disable writeback. */
-       c->dst.type = OP_NONE;
-       return segmented_write(ctxt, addr, &c->src.val, c->op_bytes);
+       ctxt->dst.type = OP_NONE;
+       return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes);
 }
 
 static int emulate_pop(struct x86_emulate_ctxt *ctxt,
                       void *dest, int len)
 {
-       struct decode_cache *c = &ctxt->decode;
        int rc;
        struct segmented_address addr;
 
-       addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+       addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
        addr.seg = VCPU_SREG_SS;
        rc = segmented_read(ctxt, addr, dest, len);
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
-       register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
+       register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len);
        return rc;
 }
 
 static int em_pop(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       return emulate_pop(ctxt, &c->dst.val, c->op_bytes);
+       return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
 }
 
 static int emulate_popf(struct x86_emulate_ctxt *ctxt,
-                      struct x86_emulate_ops *ops,
-                      void *dest, int len)
+                       void *dest, int len)
 {
        int rc;
        unsigned long val, change_mask;
        int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
-       int cpl = ops->cpl(ctxt);
+       int cpl = ctxt->ops->cpl(ctxt);
 
        rc = emulate_pop(ctxt, &val, len);
        if (rc != X86EMUL_CONTINUE)
@@ -1470,49 +1446,41 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 
 static int em_popf(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       c->dst.type = OP_REG;
-       c->dst.addr.reg = &ctxt->eflags;
-       c->dst.bytes = c->op_bytes;
-       return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
+       ctxt->dst.type = OP_REG;
+       ctxt->dst.addr.reg = &ctxt->eflags;
+       ctxt->dst.bytes = ctxt->op_bytes;
+       return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
 }
 
-static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
-                            struct x86_emulate_ops *ops, int seg)
+static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       c->src.val = get_segment_selector(ctxt, seg);
+       ctxt->src.val = get_segment_selector(ctxt, seg);
 
        return em_push(ctxt);
 }
 
-static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
-                            struct x86_emulate_ops *ops, int seg)
+static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg)
 {
-       struct decode_cache *c = &ctxt->decode;
        unsigned long selector;
        int rc;
 
-       rc = emulate_pop(ctxt, &selector, c->op_bytes);
+       rc = emulate_pop(ctxt, &selector, ctxt->op_bytes);
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
-       rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
+       rc = load_segment_descriptor(ctxt, (u16)selector, seg);
        return rc;
 }
 
 static int em_pusha(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-       unsigned long old_esp = c->regs[VCPU_REGS_RSP];
+       unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP];
        int rc = X86EMUL_CONTINUE;
        int reg = VCPU_REGS_RAX;
 
        while (reg <= VCPU_REGS_RDI) {
                (reg == VCPU_REGS_RSP) ?
-               (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
+               (ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]);
 
                rc = em_push(ctxt);
                if (rc != X86EMUL_CONTINUE)
@@ -1526,26 +1494,23 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt)
 
 static int em_pushf(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       c->src.val =  (unsigned long)ctxt->eflags;
+       ctxt->src.val =  (unsigned long)ctxt->eflags;
        return em_push(ctxt);
 }
 
 static int em_popa(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
        int rc = X86EMUL_CONTINUE;
        int reg = VCPU_REGS_RDI;
 
        while (reg >= VCPU_REGS_RAX) {
                if (reg == VCPU_REGS_RSP) {
-                       register_address_increment(c, &c->regs[VCPU_REGS_RSP],
-                                                       c->op_bytes);
+                       register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP],
+                                                       ctxt->op_bytes);
                        --reg;
                }
 
-               rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes);
+               rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes);
                if (rc != X86EMUL_CONTINUE)
                        break;
                --reg;
@@ -1553,10 +1518,9 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
        return rc;
 }
 
-int emulate_int_real(struct x86_emulate_ctxt *ctxt,
-                              struct x86_emulate_ops *ops, int irq)
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
 {
-       struct decode_cache *c = &ctxt->decode;
+       struct x86_emulate_ops *ops = ctxt->ops;
        int rc;
        struct desc_ptr dt;
        gva_t cs_addr;
@@ -1564,19 +1528,19 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
        u16 cs, eip;
 
        /* TODO: Add limit checks */
-       c->src.val = ctxt->eflags;
+       ctxt->src.val = ctxt->eflags;
        rc = em_push(ctxt);
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
        ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
 
-       c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
+       ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
        rc = em_push(ctxt);
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
-       c->src.val = c->eip;
+       ctxt->src.val = ctxt->_eip;
        rc = em_push(ctxt);
        if (rc != X86EMUL_CONTINUE)
                return rc;
@@ -1594,21 +1558,20 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
-       rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS);
+       rc = load_segment_descriptor(ctxt, cs, VCPU_SREG_CS);
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
-       c->eip = eip;
+       ctxt->_eip = eip;
 
        return rc;
 }
 
-static int emulate_int(struct x86_emulate_ctxt *ctxt,
-                      struct x86_emulate_ops *ops, int irq)
+static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
 {
        switch(ctxt->mode) {
        case X86EMUL_MODE_REAL:
-               return emulate_int_real(ctxt, ops, irq);
+               return emulate_int_real(ctxt, irq);
        case X86EMUL_MODE_VM86:
        case X86EMUL_MODE_PROT16:
        case X86EMUL_MODE_PROT32:
@@ -1619,10 +1582,8 @@ static int emulate_int(struct x86_emulate_ctxt *ctxt,
        }
 }
 
-static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
-                            struct x86_emulate_ops *ops)
+static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
        int rc = X86EMUL_CONTINUE;
        unsigned long temp_eip = 0;
        unsigned long temp_eflags = 0;
@@ -1634,7 +1595,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
 
        /* TODO: Add stack limit check */
 
-       rc = emulate_pop(ctxt, &temp_eip, c->op_bytes);
+       rc = emulate_pop(ctxt, &temp_eip, ctxt->op_bytes);
 
        if (rc != X86EMUL_CONTINUE)
                return rc;
@@ -1642,27 +1603,27 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
        if (temp_eip & ~0xffff)
                return emulate_gp(ctxt, 0);
 
-       rc = emulate_pop(ctxt, &cs, c->op_bytes);
+       rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
 
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
-       rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes);
+       rc = emulate_pop(ctxt, &temp_eflags, ctxt->op_bytes);
 
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
-       rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
+       rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
 
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
-       c->eip = temp_eip;
+       ctxt->_eip = temp_eip;
 
 
-       if (c->op_bytes == 4)
+       if (ctxt->op_bytes == 4)
                ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
-       else if (c->op_bytes == 2) {
+       else if (ctxt->op_bytes == 2) {
                ctxt->eflags &= ~0xffff;
                ctxt->eflags |= temp_eflags;
        }
@@ -1673,12 +1634,11 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
        return rc;
 }
 
-static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
-                                   struct x86_emulate_ops* ops)
+static int em_iret(struct x86_emulate_ctxt *ctxt)
 {
        switch(ctxt->mode) {
        case X86EMUL_MODE_REAL:
-               return emulate_iret_real(ctxt, ops);
+               return emulate_iret_real(ctxt);
        case X86EMUL_MODE_VM86:
        case X86EMUL_MODE_PROT16:
        case X86EMUL_MODE_PROT32:
@@ -1691,53 +1651,49 @@ static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
 
 static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
        int rc;
        unsigned short sel;
 
-       memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+       memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
 
-       rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS);
+       rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS);
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
-       c->eip = 0;
-       memcpy(&c->eip, c->src.valptr, c->op_bytes);
+       ctxt->_eip = 0;
+       memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
        return X86EMUL_CONTINUE;
 }
 
 static int em_grp1a(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       return emulate_pop(ctxt, &c->dst.val, c->dst.bytes);
+       return emulate_pop(ctxt, &ctxt->dst.val, ctxt->dst.bytes);
 }
 
 static int em_grp2(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-       switch (c->modrm_reg) {
+       switch (ctxt->modrm_reg) {
        case 0: /* rol */
-               emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
+               emulate_2op_SrcB("rol", ctxt->src, ctxt->dst, ctxt->eflags);
                break;
        case 1: /* ror */
-               emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
+               emulate_2op_SrcB("ror", ctxt->src, ctxt->dst, ctxt->eflags);
                break;
        case 2: /* rcl */
-               emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
+               emulate_2op_SrcB("rcl", ctxt->src, ctxt->dst, ctxt->eflags);
                break;
        case 3: /* rcr */
-               emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
+               emulate_2op_SrcB("rcr", ctxt->src, ctxt->dst, ctxt->eflags);
                break;
        case 4: /* sal/shl */
        case 6: /* sal/shl */
-               emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
+               emulate_2op_SrcB("sal", ctxt->src, ctxt->dst, ctxt->eflags);
                break;
        case 5: /* shr */
-               emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
+               emulate_2op_SrcB("shr", ctxt->src, ctxt->dst, ctxt->eflags);
                break;
        case 7: /* sar */
-               emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
+               emulate_2op_SrcB("sar", ctxt->src, ctxt->dst, ctxt->eflags);
                break;
        }
        return X86EMUL_CONTINUE;
@@ -1745,33 +1701,32 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt)
 
 static int em_grp3(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-       unsigned long *rax = &c->regs[VCPU_REGS_RAX];
-       unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
+       unsigned long *rax = &ctxt->regs[VCPU_REGS_RAX];
+       unsigned long *rdx = &ctxt->regs[VCPU_REGS_RDX];
        u8 de = 0;
 
-       switch (c->modrm_reg) {
+       switch (ctxt->modrm_reg) {
        case 0 ... 1:   /* test */
-               emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+               emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
                break;
        case 2: /* not */
-               c->dst.val = ~c->dst.val;
+               ctxt->dst.val = ~ctxt->dst.val;
                break;
        case 3: /* neg */
-               emulate_1op("neg", c->dst, ctxt->eflags);
+               emulate_1op("neg", ctxt->dst, ctxt->eflags);
                break;
        case 4: /* mul */
-               emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags);
+               emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags);
                break;
        case 5: /* imul */
-               emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags);
+               emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, ctxt->eflags);
                break;
        case 6: /* div */
-               emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx,
+               emulate_1op_rax_rdx_ex("div", ctxt->src, *rax, *rdx,
                                       ctxt->eflags, de);
                break;
        case 7: /* idiv */
-               emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx,
+               emulate_1op_rax_rdx_ex("idiv", ctxt->src, *rax, *rdx,
                                       ctxt->eflags, de);
                break;
        default:
@@ -1784,26 +1739,25 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt)
 
 static int em_grp45(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
        int rc = X86EMUL_CONTINUE;
 
-       switch (c->modrm_reg) {
+       switch (ctxt->modrm_reg) {
        case 0: /* inc */
-               emulate_1op("inc", c->dst, ctxt->eflags);
+               emulate_1op("inc", ctxt->dst, ctxt->eflags);
                break;
        case 1: /* dec */
-               emulate_1op("dec", c->dst, ctxt->eflags);
+               emulate_1op("dec", ctxt->dst, ctxt->eflags);
                break;
        case 2: /* call near abs */ {
                long int old_eip;
-               old_eip = c->eip;
-               c->eip = c->src.val;
-               c->src.val = old_eip;
+               old_eip = ctxt->_eip;
+               ctxt->_eip = ctxt->src.val;
+               ctxt->src.val = old_eip;
                rc = em_push(ctxt);
                break;
        }
        case 4: /* jmp abs */
-               c->eip = c->src.val;
+               ctxt->_eip = ctxt->src.val;
                break;
        case 5: /* jmp far */
                rc = em_jmp_far(ctxt);
@@ -1817,68 +1771,70 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
 
 static int em_grp9(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-       u64 old = c->dst.orig_val64;
+       u64 old = ctxt->dst.orig_val64;
 
-       if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
-           ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
-               c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
-               c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
+       if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) ||
+           ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) {
+               ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
+               ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
                ctxt->eflags &= ~EFLG_ZF;
        } else {
-               c->dst.val64 = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
-                       (u32) c->regs[VCPU_REGS_RBX];
+               ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) |
+                       (u32) ctxt->regs[VCPU_REGS_RBX];
 
                ctxt->eflags |= EFLG_ZF;
        }
        return X86EMUL_CONTINUE;
 }
 
-static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
-                          struct x86_emulate_ops *ops)
+static int em_ret(struct x86_emulate_ctxt *ctxt)
+{
+       ctxt->dst.type = OP_REG;
+       ctxt->dst.addr.reg = &ctxt->_eip;
+       ctxt->dst.bytes = ctxt->op_bytes;
+       return em_pop(ctxt);
+}
+
+static int em_ret_far(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
        int rc;
        unsigned long cs;
 
-       rc = emulate_pop(ctxt, &c->eip, c->op_bytes);
+       rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes);
        if (rc != X86EMUL_CONTINUE)
                return rc;
-       if (c->op_bytes == 4)
-               c->eip = (u32)c->eip;
-       rc = emulate_pop(ctxt, &cs, c->op_bytes);
+       if (ctxt->op_bytes == 4)
+               ctxt->_eip = (u32)ctxt->_eip;
+       rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
        if (rc != X86EMUL_CONTINUE)
                return rc;
-       rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
+       rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
        return rc;
 }
 
-static int emulate_load_segment(struct x86_emulate_ctxt *ctxt,
-                          struct x86_emulate_ops *ops, int seg)
+static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg)
 {
-       struct decode_cache *c = &ctxt->decode;
        unsigned short sel;
        int rc;
 
-       memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+       memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
 
-       rc = load_segment_descriptor(ctxt, ops, sel, seg);
+       rc = load_segment_descriptor(ctxt, sel, seg);
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
-       c->dst.val = c->src.val;
+       ctxt->dst.val = ctxt->src.val;
        return rc;
 }
 
-static inline void
+static void
 setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
-                       struct x86_emulate_ops *ops, struct desc_struct *cs,
-                       struct desc_struct *ss)
+                       struct desc_struct *cs, struct desc_struct *ss)
 {
        u16 selector;
 
        memset(cs, 0, sizeof(struct desc_struct));
-       ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
+       ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
        memset(ss, 0, sizeof(struct desc_struct));
 
        cs->l = 0;              /* will be adjusted later */
@@ -1901,10 +1857,9 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
        ss->p = 1;
 }
 
-static int
-emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+static int em_syscall(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
+       struct x86_emulate_ops *ops = ctxt->ops;
        struct desc_struct cs, ss;
        u64 msr_data;
        u16 cs_sel, ss_sel;
@@ -1916,7 +1871,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
                return emulate_ud(ctxt);
 
        ops->get_msr(ctxt, MSR_EFER, &efer);
-       setup_syscalls_segments(ctxt, ops, &cs, &ss);
+       setup_syscalls_segments(ctxt, &cs, &ss);
 
        ops->get_msr(ctxt, MSR_STAR, &msr_data);
        msr_data >>= 32;
@@ -1930,15 +1885,15 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
        ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
 
-       c->regs[VCPU_REGS_RCX] = c->eip;
+       ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip;
        if (efer & EFER_LMA) {
 #ifdef CONFIG_X86_64
-               c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
+               ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
 
                ops->get_msr(ctxt,
                             ctxt->mode == X86EMUL_MODE_PROT64 ?
                             MSR_LSTAR : MSR_CSTAR, &msr_data);
-               c->eip = msr_data;
+               ctxt->_eip = msr_data;
 
                ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
                ctxt->eflags &= ~(msr_data | EFLG_RF);
@@ -1946,7 +1901,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        } else {
                /* legacy mode */
                ops->get_msr(ctxt, MSR_STAR, &msr_data);
-               c->eip = (u32)msr_data;
+               ctxt->_eip = (u32)msr_data;
 
                ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
        }
@@ -1954,16 +1909,15 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        return X86EMUL_CONTINUE;
 }
 
-static int
-emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+static int em_sysenter(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
+       struct x86_emulate_ops *ops = ctxt->ops;
        struct desc_struct cs, ss;
        u64 msr_data;
        u16 cs_sel, ss_sel;
        u64 efer = 0;
 
-       ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+       ops->get_msr(ctxt, MSR_EFER, &efer);
        /* inject #GP if in real mode */
        if (ctxt->mode == X86EMUL_MODE_REAL)
                return emulate_gp(ctxt, 0);
@@ -1974,7 +1928,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        if (ctxt->mode == X86EMUL_MODE_PROT64)
                return emulate_ud(ctxt);
 
-       setup_syscalls_segments(ctxt, ops, &cs, &ss);
+       setup_syscalls_segments(ctxt, &cs, &ss);
 
        ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
        switch (ctxt->mode) {
@@ -2002,31 +1956,30 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
 
        ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
-       c->eip = msr_data;
+       ctxt->_eip = msr_data;
 
        ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
-       c->regs[VCPU_REGS_RSP] = msr_data;
+       ctxt->regs[VCPU_REGS_RSP] = msr_data;
 
        return X86EMUL_CONTINUE;
 }
 
-static int
-emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+static int em_sysexit(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
+       struct x86_emulate_ops *ops = ctxt->ops;
        struct desc_struct cs, ss;
        u64 msr_data;
        int usermode;
-       u16 cs_sel, ss_sel;
+       u16 cs_sel = 0, ss_sel = 0;
 
        /* inject #GP if in real mode or Virtual 8086 mode */
        if (ctxt->mode == X86EMUL_MODE_REAL ||
            ctxt->mode == X86EMUL_MODE_VM86)
                return emulate_gp(ctxt, 0);
 
-       setup_syscalls_segments(ctxt, ops, &cs, &ss);
+       setup_syscalls_segments(ctxt, &cs, &ss);
 
-       if ((c->rex_prefix & 0x8) != 0x0)
+       if ((ctxt->rex_prefix & 0x8) != 0x0)
                usermode = X86EMUL_MODE_PROT64;
        else
                usermode = X86EMUL_MODE_PROT32;
@@ -2056,14 +2009,13 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
        ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
 
-       c->eip = c->regs[VCPU_REGS_RDX];
-       c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
+       ctxt->_eip = ctxt->regs[VCPU_REGS_RDX];
+       ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX];
 
        return X86EMUL_CONTINUE;
 }
 
-static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
-                             struct x86_emulate_ops *ops)
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
 {
        int iopl;
        if (ctxt->mode == X86EMUL_MODE_REAL)
@@ -2071,13 +2023,13 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
        if (ctxt->mode == X86EMUL_MODE_VM86)
                return true;
        iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
-       return ops->cpl(ctxt) > iopl;
+       return ctxt->ops->cpl(ctxt) > iopl;
 }
 
 static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
-                                           struct x86_emulate_ops *ops,
                                            u16 port, u16 len)
 {
+       struct x86_emulate_ops *ops = ctxt->ops;
        struct desc_struct tr_seg;
        u32 base3;
        int r;
@@ -2108,14 +2060,13 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
 }
 
 static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
-                                struct x86_emulate_ops *ops,
                                 u16 port, u16 len)
 {
        if (ctxt->perm_ok)
                return true;
 
-       if (emulator_bad_iopl(ctxt, ops))
-               if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
+       if (emulator_bad_iopl(ctxt))
+               if (!emulator_io_port_access_allowed(ctxt, port, len))
                        return false;
 
        ctxt->perm_ok = true;
@@ -2124,21 +2075,18 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
 }
 
 static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
-                               struct x86_emulate_ops *ops,
                                struct tss_segment_16 *tss)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       tss->ip = c->eip;
+       tss->ip = ctxt->_eip;
        tss->flag = ctxt->eflags;
-       tss->ax = c->regs[VCPU_REGS_RAX];
-       tss->cx = c->regs[VCPU_REGS_RCX];
-       tss->dx = c->regs[VCPU_REGS_RDX];
-       tss->bx = c->regs[VCPU_REGS_RBX];
-       tss->sp = c->regs[VCPU_REGS_RSP];
-       tss->bp = c->regs[VCPU_REGS_RBP];
-       tss->si = c->regs[VCPU_REGS_RSI];
-       tss->di = c->regs[VCPU_REGS_RDI];
+       tss->ax = ctxt->regs[VCPU_REGS_RAX];
+       tss->cx = ctxt->regs[VCPU_REGS_RCX];
+       tss->dx = ctxt->regs[VCPU_REGS_RDX];
+       tss->bx = ctxt->regs[VCPU_REGS_RBX];
+       tss->sp = ctxt->regs[VCPU_REGS_RSP];
+       tss->bp = ctxt->regs[VCPU_REGS_RBP];
+       tss->si = ctxt->regs[VCPU_REGS_RSI];
+       tss->di = ctxt->regs[VCPU_REGS_RDI];
 
        tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
        tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2148,22 +2096,20 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
 }
 
 static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
-                                struct x86_emulate_ops *ops,
                                 struct tss_segment_16 *tss)
 {
-       struct decode_cache *c = &ctxt->decode;
        int ret;
 
-       c->eip = tss->ip;
+       ctxt->_eip = tss->ip;
        ctxt->eflags = tss->flag | 2;
-       c->regs[VCPU_REGS_RAX] = tss->ax;
-       c->regs[VCPU_REGS_RCX] = tss->cx;
-       c->regs[VCPU_REGS_RDX] = tss->dx;
-       c->regs[VCPU_REGS_RBX] = tss->bx;
-       c->regs[VCPU_REGS_RSP] = tss->sp;
-       c->regs[VCPU_REGS_RBP] = tss->bp;
-       c->regs[VCPU_REGS_RSI] = tss->si;
-       c->regs[VCPU_REGS_RDI] = tss->di;
+       ctxt->regs[VCPU_REGS_RAX] = tss->ax;
+       ctxt->regs[VCPU_REGS_RCX] = tss->cx;
+       ctxt->regs[VCPU_REGS_RDX] = tss->dx;
+       ctxt->regs[VCPU_REGS_RBX] = tss->bx;
+       ctxt->regs[VCPU_REGS_RSP] = tss->sp;
+       ctxt->regs[VCPU_REGS_RBP] = tss->bp;
+       ctxt->regs[VCPU_REGS_RSI] = tss->si;
+       ctxt->regs[VCPU_REGS_RDI] = tss->di;
 
        /*
         * SDM says that segment selectors are loaded before segment
@@ -2179,19 +2125,19 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
         * Now load segment descriptors. If fault happenes at this stage
         * it is handled in a context of new task
         */
-       ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR);
+       ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+       ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+       ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+       ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+       ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
        if (ret != X86EMUL_CONTINUE)
                return ret;
 
@@ -2199,10 +2145,10 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
 }
 
 static int task_switch_16(struct x86_emulate_ctxt *ctxt,
-                         struct x86_emulate_ops *ops,
                          u16 tss_selector, u16 old_tss_sel,
                          ulong old_tss_base, struct desc_struct *new_desc)
 {
+       struct x86_emulate_ops *ops = ctxt->ops;
        struct tss_segment_16 tss_seg;
        int ret;
        u32 new_tss_base = get_desc_base(new_desc);
@@ -2213,7 +2159,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
                /* FIXME: need to provide precise fault address */
                return ret;
 
-       save_state_to_tss16(ctxt, ops, &tss_seg);
+       save_state_to_tss16(ctxt, &tss_seg);
 
        ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
                             &ctxt->exception);
@@ -2239,26 +2185,23 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
                        return ret;
        }
 
-       return load_state_from_tss16(ctxt, ops, &tss_seg);
+       return load_state_from_tss16(ctxt, &tss_seg);
 }
 
 static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
-                               struct x86_emulate_ops *ops,
                                struct tss_segment_32 *tss)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       tss->cr3 = ops->get_cr(ctxt, 3);
-       tss->eip = c->eip;
+       tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
+       tss->eip = ctxt->_eip;
        tss->eflags = ctxt->eflags;
-       tss->eax = c->regs[VCPU_REGS_RAX];
-       tss->ecx = c->regs[VCPU_REGS_RCX];
-       tss->edx = c->regs[VCPU_REGS_RDX];
-       tss->ebx = c->regs[VCPU_REGS_RBX];
-       tss->esp = c->regs[VCPU_REGS_RSP];
-       tss->ebp = c->regs[VCPU_REGS_RBP];
-       tss->esi = c->regs[VCPU_REGS_RSI];
-       tss->edi = c->regs[VCPU_REGS_RDI];
+       tss->eax = ctxt->regs[VCPU_REGS_RAX];
+       tss->ecx = ctxt->regs[VCPU_REGS_RCX];
+       tss->edx = ctxt->regs[VCPU_REGS_RDX];
+       tss->ebx = ctxt->regs[VCPU_REGS_RBX];
+       tss->esp = ctxt->regs[VCPU_REGS_RSP];
+       tss->ebp = ctxt->regs[VCPU_REGS_RBP];
+       tss->esi = ctxt->regs[VCPU_REGS_RSI];
+       tss->edi = ctxt->regs[VCPU_REGS_RDI];
 
        tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
        tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2270,24 +2213,22 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
 }
 
 static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
-                                struct x86_emulate_ops *ops,
                                 struct tss_segment_32 *tss)
 {
-       struct decode_cache *c = &ctxt->decode;
        int ret;
 
-       if (ops->set_cr(ctxt, 3, tss->cr3))
+       if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
                return emulate_gp(ctxt, 0);
-       c->eip = tss->eip;
+       ctxt->_eip = tss->eip;
        ctxt->eflags = tss->eflags | 2;
-       c->regs[VCPU_REGS_RAX] = tss->eax;
-       c->regs[VCPU_REGS_RCX] = tss->ecx;
-       c->regs[VCPU_REGS_RDX] = tss->edx;
-       c->regs[VCPU_REGS_RBX] = tss->ebx;
-       c->regs[VCPU_REGS_RSP] = tss->esp;
-       c->regs[VCPU_REGS_RBP] = tss->ebp;
-       c->regs[VCPU_REGS_RSI] = tss->esi;
-       c->regs[VCPU_REGS_RDI] = tss->edi;
+       ctxt->regs[VCPU_REGS_RAX] = tss->eax;
+       ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
+       ctxt->regs[VCPU_REGS_RDX] = tss->edx;
+       ctxt->regs[VCPU_REGS_RBX] = tss->ebx;
+       ctxt->regs[VCPU_REGS_RSP] = tss->esp;
+       ctxt->regs[VCPU_REGS_RBP] = tss->ebp;
+       ctxt->regs[VCPU_REGS_RSI] = tss->esi;
+       ctxt->regs[VCPU_REGS_RDI] = tss->edi;
 
        /*
         * SDM says that segment selectors are loaded before segment
@@ -2305,25 +2246,25 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
         * Now load segment descriptors. If fault happenes at this stage
         * it is handled in a context of new task
         */
-       ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR);
+       ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+       ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+       ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+       ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+       ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS);
+       ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS);
+       ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS);
        if (ret != X86EMUL_CONTINUE)
                return ret;
 
@@ -2331,10 +2272,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 }
 
 static int task_switch_32(struct x86_emulate_ctxt *ctxt,
-                         struct x86_emulate_ops *ops,
                          u16 tss_selector, u16 old_tss_sel,
                          ulong old_tss_base, struct desc_struct *new_desc)
 {
+       struct x86_emulate_ops *ops = ctxt->ops;
        struct tss_segment_32 tss_seg;
        int ret;
        u32 new_tss_base = get_desc_base(new_desc);
@@ -2345,7 +2286,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
                /* FIXME: need to provide precise fault address */
                return ret;
 
-       save_state_to_tss32(ctxt, ops, &tss_seg);
+       save_state_to_tss32(ctxt, &tss_seg);
 
        ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
                             &ctxt->exception);
@@ -2371,14 +2312,14 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
                        return ret;
        }
 
-       return load_state_from_tss32(ctxt, ops, &tss_seg);
+       return load_state_from_tss32(ctxt, &tss_seg);
 }
 
 static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
-                                  struct x86_emulate_ops *ops,
                                   u16 tss_selector, int reason,
                                   bool has_error_code, u32 error_code)
 {
+       struct x86_emulate_ops *ops = ctxt->ops;
        struct desc_struct curr_tss_desc, next_tss_desc;
        int ret;
        u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
@@ -2388,10 +2329,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 
        /* FIXME: old_tss_base == ~0 ? */
 
-       ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc);
+       ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
        if (ret != X86EMUL_CONTINUE)
                return ret;
-       ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc);
+       ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
        if (ret != X86EMUL_CONTINUE)
                return ret;
 
@@ -2413,8 +2354,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 
        if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
                curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
-               write_segment_descriptor(ctxt, ops, old_tss_sel,
-                                        &curr_tss_desc);
+               write_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
        }
 
        if (reason == TASK_SWITCH_IRET)
@@ -2426,10 +2366,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
                old_tss_sel = 0xffff;
 
        if (next_tss_desc.type & 8)
-               ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel,
+               ret = task_switch_32(ctxt, tss_selector, old_tss_sel,
                                     old_tss_base, &next_tss_desc);
        else
-               ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
+               ret = task_switch_16(ctxt, tss_selector, old_tss_sel,
                                     old_tss_base, &next_tss_desc);
        if (ret != X86EMUL_CONTINUE)
                return ret;
@@ -2439,19 +2379,16 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 
        if (reason != TASK_SWITCH_IRET) {
                next_tss_desc.type |= (1 << 1); /* set busy flag */
-               write_segment_descriptor(ctxt, ops, tss_selector,
-                                        &next_tss_desc);
+               write_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
        }
 
        ops->set_cr(ctxt, 0,  ops->get_cr(ctxt, 0) | X86_CR0_TS);
        ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
 
        if (has_error_code) {
-               struct decode_cache *c = &ctxt->decode;
-
-               c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
-               c->lock_prefix = 0;
-               c->src.val = (unsigned long) error_code;
+               ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
+               ctxt->lock_prefix = 0;
+               ctxt->src.val = (unsigned long) error_code;
                ret = em_push(ctxt);
        }
 
@@ -2462,18 +2399,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
                         u16 tss_selector, int reason,
                         bool has_error_code, u32 error_code)
 {
-       struct x86_emulate_ops *ops = ctxt->ops;
-       struct decode_cache *c = &ctxt->decode;
        int rc;
 
-       c->eip = ctxt->eip;
-       c->dst.type = OP_NONE;
+       ctxt->_eip = ctxt->eip;
+       ctxt->dst.type = OP_NONE;
 
-       rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
+       rc = emulator_do_task_switch(ctxt, tss_selector, reason,
                                     has_error_code, error_code);
 
        if (rc == X86EMUL_CONTINUE)
-               ctxt->eip = c->eip;
+               ctxt->eip = ctxt->_eip;
 
        return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
 }
@@ -2481,22 +2416,20 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
                            int reg, struct operand *op)
 {
-       struct decode_cache *c = &ctxt->decode;
        int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
 
-       register_address_increment(c, &c->regs[reg], df * op->bytes);
-       op->addr.mem.ea = register_address(c, c->regs[reg]);
+       register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes);
+       op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]);
        op->addr.mem.seg = seg;
 }
 
 static int em_das(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
        u8 al, old_al;
        bool af, cf, old_cf;
 
        cf = ctxt->eflags & X86_EFLAGS_CF;
-       al = c->dst.val;
+       al = ctxt->dst.val;
 
        old_al = al;
        old_cf = cf;
@@ -2514,12 +2447,12 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
                cf = true;
        }
 
-       c->dst.val = al;
+       ctxt->dst.val = al;
        /* Set PF, ZF, SF */
-       c->src.type = OP_IMM;
-       c->src.val = 0;
-       c->src.bytes = 1;
-       emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+       ctxt->src.type = OP_IMM;
+       ctxt->src.val = 0;
+       ctxt->src.bytes = 1;
+       emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags);
        ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
        if (cf)
                ctxt->eflags |= X86_EFLAGS_CF;
@@ -2530,175 +2463,189 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
 
 static int em_call_far(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
        u16 sel, old_cs;
        ulong old_eip;
        int rc;
 
        old_cs = get_segment_selector(ctxt, VCPU_SREG_CS);
-       old_eip = c->eip;
+       old_eip = ctxt->_eip;
 
-       memcpy(&sel, c->src.valptr + c->op_bytes, 2);
-       if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS))
+       memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
+       if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS))
                return X86EMUL_CONTINUE;
 
-       c->eip = 0;
-       memcpy(&c->eip, c->src.valptr, c->op_bytes);
+       ctxt->_eip = 0;
+       memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
 
-       c->src.val = old_cs;
+       ctxt->src.val = old_cs;
        rc = em_push(ctxt);
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
-       c->src.val = old_eip;
+       ctxt->src.val = old_eip;
        return em_push(ctxt);
 }
 
 static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
        int rc;
 
-       c->dst.type = OP_REG;
-       c->dst.addr.reg = &c->eip;
-       c->dst.bytes = c->op_bytes;
-       rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes);
+       ctxt->dst.type = OP_REG;
+       ctxt->dst.addr.reg = &ctxt->_eip;
+       ctxt->dst.bytes = ctxt->op_bytes;
+       rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
        if (rc != X86EMUL_CONTINUE)
                return rc;
-       register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val);
+       register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val);
        return X86EMUL_CONTINUE;
 }
 
 static int em_add(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+       emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags);
        return X86EMUL_CONTINUE;
 }
 
 static int em_or(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+       emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags);
        return X86EMUL_CONTINUE;
 }
 
 static int em_adc(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
+       emulate_2op_SrcV("adc", ctxt->src, ctxt->dst, ctxt->eflags);
        return X86EMUL_CONTINUE;
 }
 
 static int em_sbb(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
+       emulate_2op_SrcV("sbb", ctxt->src, ctxt->dst, ctxt->eflags);
        return X86EMUL_CONTINUE;
 }
 
 static int em_and(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
+       emulate_2op_SrcV("and", ctxt->src, ctxt->dst, ctxt->eflags);
        return X86EMUL_CONTINUE;
 }
 
 static int em_sub(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
+       emulate_2op_SrcV("sub", ctxt->src, ctxt->dst, ctxt->eflags);
        return X86EMUL_CONTINUE;
 }
 
 static int em_xor(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
+       emulate_2op_SrcV("xor", ctxt->src, ctxt->dst, ctxt->eflags);
        return X86EMUL_CONTINUE;
 }
 
 static int em_cmp(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+       emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags);
        /* Disable writeback. */
-       c->dst.type = OP_NONE;
+       ctxt->dst.type = OP_NONE;
        return X86EMUL_CONTINUE;
 }
 
-static int em_imul(struct x86_emulate_ctxt *ctxt)
+static int em_test(struct x86_emulate_ctxt *ctxt)
+{
+       emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
+       return X86EMUL_CONTINUE;
+}
+
+static int em_xchg(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
+       /* Write back the register source. */
+       ctxt->src.val = ctxt->dst.val;
+       write_register_operand(&ctxt->src);
 
-       emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags);
+       /* Write back the memory destination with implicit LOCK prefix. */
+       ctxt->dst.val = ctxt->src.orig_val;
+       ctxt->lock_prefix = 1;
        return X86EMUL_CONTINUE;
 }
 
-static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
+static int em_imul(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
+       emulate_2op_SrcV_nobyte("imul", ctxt->src, ctxt->dst, ctxt->eflags);
+       return X86EMUL_CONTINUE;
+}
 
-       c->dst.val = c->src2.val;
+static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
+{
+       ctxt->dst.val = ctxt->src2.val;
        return em_imul(ctxt);
 }
 
 static int em_cwd(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       c->dst.type = OP_REG;
-       c->dst.bytes = c->src.bytes;
-       c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
-       c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1);
+       ctxt->dst.type = OP_REG;
+       ctxt->dst.bytes = ctxt->src.bytes;
+       ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+       ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1);
 
        return X86EMUL_CONTINUE;
 }
 
 static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
        u64 tsc = 0;
 
        ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
-       c->regs[VCPU_REGS_RAX] = (u32)tsc;
-       c->regs[VCPU_REGS_RDX] = tsc >> 32;
+       ctxt->regs[VCPU_REGS_RAX] = (u32)tsc;
+       ctxt->regs[VCPU_REGS_RDX] = tsc >> 32;
        return X86EMUL_CONTINUE;
 }
 
 static int em_mov(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-       c->dst.val = c->src.val;
+       ctxt->dst.val = ctxt->src.val;
        return X86EMUL_CONTINUE;
 }
 
+static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
+{
+       if (ctxt->modrm_reg > VCPU_SREG_GS)
+               return emulate_ud(ctxt);
+
+       ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg);
+       return X86EMUL_CONTINUE;
+}
+
+static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
+{
+       u16 sel = ctxt->src.val;
+
+       if (ctxt->modrm_reg == VCPU_SREG_CS || ctxt->modrm_reg > VCPU_SREG_GS)
+               return emulate_ud(ctxt);
+
+       if (ctxt->modrm_reg == VCPU_SREG_SS)
+               ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+
+       /* Disable writeback. */
+       ctxt->dst.type = OP_NONE;
+       return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
+}
+
 static int em_movdqu(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-       memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes);
+       memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes);
        return X86EMUL_CONTINUE;
 }
 
 static int em_invlpg(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
        int rc;
        ulong linear;
 
-       rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear);
+       rc = linearize(ctxt, ctxt->src.addr.mem, 1, false, &linear);
        if (rc == X86EMUL_CONTINUE)
                ctxt->ops->invlpg(ctxt, linear);
        /* Disable writeback. */
-       c->dst.type = OP_NONE;
+       ctxt->dst.type = OP_NONE;
        return X86EMUL_CONTINUE;
 }
 
@@ -2714,10 +2661,9 @@ static int em_clts(struct x86_emulate_ctxt *ctxt)
 
 static int em_vmcall(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
        int rc;
 
-       if (c->modrm_mod != 3 || c->modrm_rm != 1)
+       if (ctxt->modrm_mod != 3 || ctxt->modrm_rm != 1)
                return X86EMUL_UNHANDLEABLE;
 
        rc = ctxt->ops->fix_hypercall(ctxt);
@@ -2725,73 +2671,104 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt)
                return rc;
 
        /* Let the processor re-execute the fixed hypercall */
-       c->eip = ctxt->eip;
+       ctxt->_eip = ctxt->eip;
        /* Disable writeback. */
-       c->dst.type = OP_NONE;
+       ctxt->dst.type = OP_NONE;
        return X86EMUL_CONTINUE;
 }
 
 static int em_lgdt(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
        struct desc_ptr desc_ptr;
        int rc;
 
-       rc = read_descriptor(ctxt, c->src.addr.mem,
+       rc = read_descriptor(ctxt, ctxt->src.addr.mem,
                             &desc_ptr.size, &desc_ptr.address,
-                            c->op_bytes);
+                            ctxt->op_bytes);
        if (rc != X86EMUL_CONTINUE)
                return rc;
        ctxt->ops->set_gdt(ctxt, &desc_ptr);
        /* Disable writeback. */
-       c->dst.type = OP_NONE;
+       ctxt->dst.type = OP_NONE;
        return X86EMUL_CONTINUE;
 }
 
 static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
        int rc;
 
        rc = ctxt->ops->fix_hypercall(ctxt);
 
        /* Disable writeback. */
-       c->dst.type = OP_NONE;
+       ctxt->dst.type = OP_NONE;
        return rc;
 }
 
 static int em_lidt(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
        struct desc_ptr desc_ptr;
        int rc;
 
-       rc = read_descriptor(ctxt, c->src.addr.mem,
+       rc = read_descriptor(ctxt, ctxt->src.addr.mem,
                             &desc_ptr.size, &desc_ptr.address,
-                            c->op_bytes);
+                            ctxt->op_bytes);
        if (rc != X86EMUL_CONTINUE)
                return rc;
        ctxt->ops->set_idt(ctxt, &desc_ptr);
        /* Disable writeback. */
-       c->dst.type = OP_NONE;
+       ctxt->dst.type = OP_NONE;
        return X86EMUL_CONTINUE;
 }
 
 static int em_smsw(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       c->dst.bytes = 2;
-       c->dst.val = ctxt->ops->get_cr(ctxt, 0);
+       ctxt->dst.bytes = 2;
+       ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
        return X86EMUL_CONTINUE;
 }
 
 static int em_lmsw(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
        ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
-                         | (c->src.val & 0x0f));
-       c->dst.type = OP_NONE;
+                         | (ctxt->src.val & 0x0f));
+       ctxt->dst.type = OP_NONE;
+       return X86EMUL_CONTINUE;
+}
+
+static int em_loop(struct x86_emulate_ctxt *ctxt)
+{
+       register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
+       if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) &&
+           (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
+               jmp_rel(ctxt, ctxt->src.val);
+
+       return X86EMUL_CONTINUE;
+}
+
+static int em_jcxz(struct x86_emulate_ctxt *ctxt)
+{
+       if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0)
+               jmp_rel(ctxt, ctxt->src.val);
+
+       return X86EMUL_CONTINUE;
+}
+
+static int em_cli(struct x86_emulate_ctxt *ctxt)
+{
+       if (emulator_bad_iopl(ctxt))
+               return emulate_gp(ctxt, 0);
+
+       ctxt->eflags &= ~X86_EFLAGS_IF;
+       return X86EMUL_CONTINUE;
+}
+
+static int em_sti(struct x86_emulate_ctxt *ctxt)
+{
+       if (emulator_bad_iopl(ctxt))
+               return emulate_gp(ctxt, 0);
+
+       ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
+       ctxt->eflags |= X86_EFLAGS_IF;
        return X86EMUL_CONTINUE;
 }
 
@@ -2809,9 +2786,7 @@ static bool valid_cr(int nr)
 
 static int check_cr_read(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       if (!valid_cr(c->modrm_reg))
+       if (!valid_cr(ctxt->modrm_reg))
                return emulate_ud(ctxt);
 
        return X86EMUL_CONTINUE;
@@ -2819,9 +2794,8 @@ static int check_cr_read(struct x86_emulate_ctxt *ctxt)
 
 static int check_cr_write(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-       u64 new_val = c->src.val64;
-       int cr = c->modrm_reg;
+       u64 new_val = ctxt->src.val64;
+       int cr = ctxt->modrm_reg;
        u64 efer = 0;
 
        static u64 cr_reserved_bits[] = {
@@ -2898,8 +2872,7 @@ static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
 
 static int check_dr_read(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-       int dr = c->modrm_reg;
+       int dr = ctxt->modrm_reg;
        u64 cr4;
 
        if (dr > 7)
@@ -2917,9 +2890,8 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt)
 
 static int check_dr_write(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-       u64 new_val = c->src.val64;
-       int dr = c->modrm_reg;
+       u64 new_val = ctxt->src.val64;
+       int dr = ctxt->modrm_reg;
 
        if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
                return emulate_gp(ctxt, 0);
@@ -2941,7 +2913,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt)
 
 static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
 {
-       u64 rax = ctxt->decode.regs[VCPU_REGS_RAX];
+       u64 rax = ctxt->regs[VCPU_REGS_RAX];
 
        /* Valid physical address? */
        if (rax & 0xffff000000000000ULL)
@@ -2963,7 +2935,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
 static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
 {
        u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
-       u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX];
+       u64 rcx = ctxt->regs[VCPU_REGS_RCX];
 
        if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
            (rcx > 3))
@@ -2974,10 +2946,8 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
 
 static int check_perm_in(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       c->dst.bytes = min(c->dst.bytes, 4u);
-       if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes))
+       ctxt->dst.bytes = min(ctxt->dst.bytes, 4u);
+       if (!emulator_io_permited(ctxt, ctxt->src.val, ctxt->dst.bytes))
                return emulate_gp(ctxt, 0);
 
        return X86EMUL_CONTINUE;
@@ -2985,10 +2955,8 @@ static int check_perm_in(struct x86_emulate_ctxt *ctxt)
 
 static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-
-       c->src.bytes = min(c->src.bytes, 4u);
-       if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes))
+       ctxt->src.bytes = min(ctxt->src.bytes, 4u);
+       if (!emulator_io_permited(ctxt, ctxt->dst.val, ctxt->src.bytes))
                return emulate_gp(ctxt, 0);
 
        return X86EMUL_CONTINUE;
@@ -3165,12 +3133,15 @@ static struct opcode opcode_table[256] = {
        G(DstMem | SrcImm | ModRM | Group, group1),
        G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
        G(DstMem | SrcImmByte | ModRM | Group, group1),
-       D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock),
+       I2bv(DstMem | SrcReg | ModRM, em_test),
+       I2bv(DstMem | SrcReg | ModRM | Lock, em_xchg),
        /* 0x88 - 0x8F */
        I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
        I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
-       D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
-       D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
+       I(DstMem | SrcNone | ModRM | Mov, em_mov_rm_sreg),
+       D(ModRM | SrcMem | NoAccess | DstReg),
+       I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm),
+       G(0, group1A),
        /* 0x90 - 0x97 */
        DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
        /* 0x98 - 0x9F */
@@ -3184,7 +3155,7 @@ static struct opcode opcode_table[256] = {
        I2bv(SrcSI | DstDI | Mov | String, em_mov),
        I2bv(SrcSI | DstDI | String, em_cmp),
        /* 0xA8 - 0xAF */
-       D2bv(DstAcc | SrcImm),
+       I2bv(DstAcc | SrcImm, em_test),
        I2bv(SrcAcc | DstDI | Mov | String, em_mov),
        I2bv(SrcSI | DstAcc | Mov | String, em_mov),
        I2bv(SrcAcc | DstDI | String, em_cmp),
@@ -3195,25 +3166,26 @@ static struct opcode opcode_table[256] = {
        /* 0xC0 - 0xC7 */
        D2bv(DstMem | SrcImmByte | ModRM),
        I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
-       D(ImplicitOps | Stack),
+       I(ImplicitOps | Stack, em_ret),
        D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
        G(ByteOp, group11), G(0, group11),
        /* 0xC8 - 0xCF */
-       N, N, N, D(ImplicitOps | Stack),
+       N, N, N, I(ImplicitOps | Stack, em_ret_far),
        D(ImplicitOps), DI(SrcImmByte, intn),
-       D(ImplicitOps | No64), DI(ImplicitOps, iret),
+       D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
        /* 0xD0 - 0xD7 */
        D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
        N, N, N, N,
        /* 0xD8 - 0xDF */
        N, N, N, N, N, N, N, N,
        /* 0xE0 - 0xE7 */
-       X4(D(SrcImmByte)),
+       X3(I(SrcImmByte, em_loop)),
+       I(SrcImmByte, em_jcxz),
        D2bvIP(SrcImmUByte | DstAcc, in,  check_perm_in),
        D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),
        /* 0xE8 - 0xEF */
        D(SrcImm | Stack), D(SrcImm | ImplicitOps),
-       D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
+       I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps),
        D2bvIP(SrcDX | DstAcc, in,  check_perm_in),
        D2bvIP(SrcAcc | DstDX, out, check_perm_out),
        /* 0xF0 - 0xF7 */
@@ -3221,14 +3193,16 @@ static struct opcode opcode_table[256] = {
        DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
        G(ByteOp, group3), G(0, group3),
        /* 0xF8 - 0xFF */
-       D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps),
+       D(ImplicitOps), D(ImplicitOps),
+       I(ImplicitOps, em_cli), I(ImplicitOps, em_sti),
        D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
 };
 
 static struct opcode twobyte_table[256] = {
        /* 0x00 - 0x0F */
        G(0, group6), GD(0, &group7), N, N,
-       N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N,
+       N, I(ImplicitOps | VendorSpecific, em_syscall),
+       II(ImplicitOps | Priv, em_clts, clts), N,
        DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
        N, D(ImplicitOps | ModRM), N, N,
        /* 0x10 - 0x1F */
@@ -3245,7 +3219,8 @@ static struct opcode twobyte_table[256] = {
        IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
        DI(ImplicitOps | Priv, rdmsr),
        DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
-       D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific),
+       I(ImplicitOps | VendorSpecific, em_sysenter),
+       I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
        N, N,
        N, N, N, N, N, N, N, N,
        /* 0x40 - 0x4F */
@@ -3313,11 +3288,11 @@ static struct opcode twobyte_table[256] = {
 #undef I2bv
 #undef I6ALU
 
-static unsigned imm_size(struct decode_cache *c)
+static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
 {
        unsigned size;
 
-       size = (c->d & ByteOp) ? 1 : c->op_bytes;
+       size = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
        if (size == 8)
                size = 4;
        return size;
@@ -3326,23 +3301,21 @@ static unsigned imm_size(struct decode_cache *c)
 static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
                      unsigned size, bool sign_extension)
 {
-       struct decode_cache *c = &ctxt->decode;
-       struct x86_emulate_ops *ops = ctxt->ops;
        int rc = X86EMUL_CONTINUE;
 
        op->type = OP_IMM;
        op->bytes = size;
-       op->addr.mem.ea = c->eip;
+       op->addr.mem.ea = ctxt->_eip;
        /* NB. Immediates are sign-extended as necessary. */
        switch (op->bytes) {
        case 1:
-               op->val = insn_fetch(s8, 1, c->eip);
+               op->val = insn_fetch(s8, 1, ctxt->_eip);
                break;
        case 2:
-               op->val = insn_fetch(s16, 2, c->eip);
+               op->val = insn_fetch(s16, 2, ctxt->_eip);
                break;
        case 4:
-               op->val = insn_fetch(s32, 4, c->eip);
+               op->val = insn_fetch(s32, 4, ctxt->_eip);
                break;
        }
        if (!sign_extension) {
@@ -3362,11 +3335,8 @@ done:
        return rc;
 }
 
-int
-x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 {
-       struct x86_emulate_ops *ops = ctxt->ops;
-       struct decode_cache *c = &ctxt->decode;
        int rc = X86EMUL_CONTINUE;
        int mode = ctxt->mode;
        int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
@@ -3374,11 +3344,11 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
        struct opcode opcode;
        struct operand memop = { .type = OP_NONE }, *memopp = NULL;
 
-       c->eip = ctxt->eip;
-       c->fetch.start = c->eip;
-       c->fetch.end = c->fetch.start + insn_len;
+       ctxt->_eip = ctxt->eip;
+       ctxt->fetch.start = ctxt->_eip;
+       ctxt->fetch.end = ctxt->fetch.start + insn_len;
        if (insn_len > 0)
-               memcpy(c->fetch.data, insn, insn_len);
+               memcpy(ctxt->fetch.data, insn, insn_len);
 
        switch (mode) {
        case X86EMUL_MODE_REAL:
@@ -3399,46 +3369,46 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
                return -1;
        }
 
-       c->op_bytes = def_op_bytes;
-       c->ad_bytes = def_ad_bytes;
+       ctxt->op_bytes = def_op_bytes;
+       ctxt->ad_bytes = def_ad_bytes;
 
        /* Legacy prefixes. */
        for (;;) {
-               switch (c->b = insn_fetch(u8, 1, c->eip)) {
+               switch (ctxt->b = insn_fetch(u8, 1, ctxt->_eip)) {
                case 0x66:      /* operand-size override */
                        op_prefix = true;
                        /* switch between 2/4 bytes */
-                       c->op_bytes = def_op_bytes ^ 6;
+                       ctxt->op_bytes = def_op_bytes ^ 6;
                        break;
                case 0x67:      /* address-size override */
                        if (mode == X86EMUL_MODE_PROT64)
                                /* switch between 4/8 bytes */
-                               c->ad_bytes = def_ad_bytes ^ 12;
+                               ctxt->ad_bytes = def_ad_bytes ^ 12;
                        else
                                /* switch between 2/4 bytes */
-                               c->ad_bytes = def_ad_bytes ^ 6;
+                               ctxt->ad_bytes = def_ad_bytes ^ 6;
                        break;
                case 0x26:      /* ES override */
                case 0x2e:      /* CS override */
                case 0x36:      /* SS override */
                case 0x3e:      /* DS override */
-                       set_seg_override(c, (c->b >> 3) & 3);
+                       set_seg_override(ctxt, (ctxt->b >> 3) & 3);
                        break;
                case 0x64:      /* FS override */
                case 0x65:      /* GS override */
-                       set_seg_override(c, c->b & 7);
+                       set_seg_override(ctxt, ctxt->b & 7);
                        break;
                case 0x40 ... 0x4f: /* REX */
                        if (mode != X86EMUL_MODE_PROT64)
                                goto done_prefixes;
-                       c->rex_prefix = c->b;
+                       ctxt->rex_prefix = ctxt->b;
                        continue;
                case 0xf0:      /* LOCK */
-                       c->lock_prefix = 1;
+                       ctxt->lock_prefix = 1;
                        break;
                case 0xf2:      /* REPNE/REPNZ */
                case 0xf3:      /* REP/REPE/REPZ */
-                       c->rep_prefix = c->b;
+                       ctxt->rep_prefix = ctxt->b;
                        break;
                default:
                        goto done_prefixes;
@@ -3446,50 +3416,50 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 
                /* Any legacy prefix after a REX prefix nullifies its effect. */
 
-               c->rex_prefix = 0;
+               ctxt->rex_prefix = 0;
        }
 
 done_prefixes:
 
        /* REX prefix. */
-       if (c->rex_prefix & 8)
-               c->op_bytes = 8;        /* REX.W */
+       if (ctxt->rex_prefix & 8)
+               ctxt->op_bytes = 8;     /* REX.W */
 
        /* Opcode byte(s). */
-       opcode = opcode_table[c->b];
+       opcode = opcode_table[ctxt->b];
        /* Two-byte opcode? */
-       if (c->b == 0x0f) {
-               c->twobyte = 1;
-               c->b = insn_fetch(u8, 1, c->eip);
-               opcode = twobyte_table[c->b];
+       if (ctxt->b == 0x0f) {
+               ctxt->twobyte = 1;
+               ctxt->b = insn_fetch(u8, 1, ctxt->_eip);
+               opcode = twobyte_table[ctxt->b];
        }
-       c->d = opcode.flags;
+       ctxt->d = opcode.flags;
 
-       while (c->d & GroupMask) {
-               switch (c->d & GroupMask) {
+       while (ctxt->d & GroupMask) {
+               switch (ctxt->d & GroupMask) {
                case Group:
-                       c->modrm = insn_fetch(u8, 1, c->eip);
-                       --c->eip;
-                       goffset = (c->modrm >> 3) & 7;
+                       ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+                       --ctxt->_eip;
+                       goffset = (ctxt->modrm >> 3) & 7;
                        opcode = opcode.u.group[goffset];
                        break;
                case GroupDual:
-                       c->modrm = insn_fetch(u8, 1, c->eip);
-                       --c->eip;
-                       goffset = (c->modrm >> 3) & 7;
-                       if ((c->modrm >> 6) == 3)
+                       ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+                       --ctxt->_eip;
+                       goffset = (ctxt->modrm >> 3) & 7;
+                       if ((ctxt->modrm >> 6) == 3)
                                opcode = opcode.u.gdual->mod3[goffset];
                        else
                                opcode = opcode.u.gdual->mod012[goffset];
                        break;
                case RMExt:
-                       goffset = c->modrm & 7;
+                       goffset = ctxt->modrm & 7;
                        opcode = opcode.u.group[goffset];
                        break;
                case Prefix:
-                       if (c->rep_prefix && op_prefix)
+                       if (ctxt->rep_prefix && op_prefix)
                                return X86EMUL_UNHANDLEABLE;
-                       simd_prefix = op_prefix ? 0x66 : c->rep_prefix;
+                       simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix;
                        switch (simd_prefix) {
                        case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
                        case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
@@ -3501,61 +3471,61 @@ done_prefixes:
                        return X86EMUL_UNHANDLEABLE;
                }
 
-               c->d &= ~GroupMask;
-               c->d |= opcode.flags;
+               ctxt->d &= ~GroupMask;
+               ctxt->d |= opcode.flags;
        }
 
-       c->execute = opcode.u.execute;
-       c->check_perm = opcode.check_perm;
-       c->intercept = opcode.intercept;
+       ctxt->execute = opcode.u.execute;
+       ctxt->check_perm = opcode.check_perm;
+       ctxt->intercept = opcode.intercept;
 
        /* Unrecognised? */
-       if (c->d == 0 || (c->d & Undefined))
+       if (ctxt->d == 0 || (ctxt->d & Undefined))
                return -1;
 
-       if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
+       if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
                return -1;
 
-       if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
-               c->op_bytes = 8;
+       if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
+               ctxt->op_bytes = 8;
 
-       if (c->d & Op3264) {
+       if (ctxt->d & Op3264) {
                if (mode == X86EMUL_MODE_PROT64)
-                       c->op_bytes = 8;
+                       ctxt->op_bytes = 8;
                else
-                       c->op_bytes = 4;
+                       ctxt->op_bytes = 4;
        }
 
-       if (c->d & Sse)
-               c->op_bytes = 16;
+       if (ctxt->d & Sse)
+               ctxt->op_bytes = 16;
 
        /* ModRM and SIB bytes. */
-       if (c->d & ModRM) {
-               rc = decode_modrm(ctxt, ops, &memop);
-               if (!c->has_seg_override)
-                       set_seg_override(c, c->modrm_seg);
-       } else if (c->d & MemAbs)
-               rc = decode_abs(ctxt, ops, &memop);
+       if (ctxt->d & ModRM) {
+               rc = decode_modrm(ctxt, &memop);
+               if (!ctxt->has_seg_override)
+                       set_seg_override(ctxt, ctxt->modrm_seg);
+       } else if (ctxt->d & MemAbs)
+               rc = decode_abs(ctxt, &memop);
        if (rc != X86EMUL_CONTINUE)
                goto done;
 
-       if (!c->has_seg_override)
-               set_seg_override(c, VCPU_SREG_DS);
+       if (!ctxt->has_seg_override)
+               set_seg_override(ctxt, VCPU_SREG_DS);
 
-       memop.addr.mem.seg = seg_override(ctxt, c);
+       memop.addr.mem.seg = seg_override(ctxt);
 
-       if (memop.type == OP_MEM && c->ad_bytes != 8)
+       if (memop.type == OP_MEM && ctxt->ad_bytes != 8)
                memop.addr.mem.ea = (u32)memop.addr.mem.ea;
 
        /*
         * Decode and fetch the source operand: register, memory
         * or immediate.
         */
-       switch (c->d & SrcMask) {
+       switch (ctxt->d & SrcMask) {
        case SrcNone:
                break;
        case SrcReg:
-               decode_register_operand(ctxt, &c->src, c, 0);
+               decode_register_operand(ctxt, &ctxt->src, 0);
                break;
        case SrcMem16:
                memop.bytes = 2;
@@ -3564,60 +3534,60 @@ done_prefixes:
                memop.bytes = 4;
                goto srcmem_common;
        case SrcMem:
-               memop.bytes = (c->d & ByteOp) ? 1 :
-                                                          c->op_bytes;
+               memop.bytes = (ctxt->d & ByteOp) ? 1 :
+                                                          ctxt->op_bytes;
        srcmem_common:
-               c->src = memop;
-               memopp = &c->src;
+               ctxt->src = memop;
+               memopp = &ctxt->src;
                break;
        case SrcImmU16:
-               rc = decode_imm(ctxt, &c->src, 2, false);
+               rc = decode_imm(ctxt, &ctxt->src, 2, false);
                break;
        case SrcImm:
-               rc = decode_imm(ctxt, &c->src, imm_size(c), true);
+               rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), true);
                break;
        case SrcImmU:
-               rc = decode_imm(ctxt, &c->src, imm_size(c), false);
+               rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), false);
                break;
        case SrcImmByte:
-               rc = decode_imm(ctxt, &c->src, 1, true);
+               rc = decode_imm(ctxt, &ctxt->src, 1, true);
                break;
        case SrcImmUByte:
-               rc = decode_imm(ctxt, &c->src, 1, false);
+               rc = decode_imm(ctxt, &ctxt->src, 1, false);
                break;
        case SrcAcc:
-               c->src.type = OP_REG;
-               c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-               c->src.addr.reg = &c->regs[VCPU_REGS_RAX];
-               fetch_register_operand(&c->src);
+               ctxt->src.type = OP_REG;
+               ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+               ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
+               fetch_register_operand(&ctxt->src);
                break;
        case SrcOne:
-               c->src.bytes = 1;
-               c->src.val = 1;
+               ctxt->src.bytes = 1;
+               ctxt->src.val = 1;
                break;
        case SrcSI:
-               c->src.type = OP_MEM;
-               c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-               c->src.addr.mem.ea =
-                       register_address(c, c->regs[VCPU_REGS_RSI]);
-               c->src.addr.mem.seg = seg_override(ctxt, c);
-               c->src.val = 0;
+               ctxt->src.type = OP_MEM;
+               ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+               ctxt->src.addr.mem.ea =
+                       register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
+               ctxt->src.addr.mem.seg = seg_override(ctxt);
+               ctxt->src.val = 0;
                break;
        case SrcImmFAddr:
-               c->src.type = OP_IMM;
-               c->src.addr.mem.ea = c->eip;
-               c->src.bytes = c->op_bytes + 2;
-               insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
+               ctxt->src.type = OP_IMM;
+               ctxt->src.addr.mem.ea = ctxt->_eip;
+               ctxt->src.bytes = ctxt->op_bytes + 2;
+               insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt->_eip);
                break;
        case SrcMemFAddr:
-               memop.bytes = c->op_bytes + 2;
+               memop.bytes = ctxt->op_bytes + 2;
                goto srcmem_common;
                break;
        case SrcDX:
-               c->src.type = OP_REG;
-               c->src.bytes = 2;
-               c->src.addr.reg = &c->regs[VCPU_REGS_RDX];
-               fetch_register_operand(&c->src);
+               ctxt->src.type = OP_REG;
+               ctxt->src.bytes = 2;
+               ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+               fetch_register_operand(&ctxt->src);
                break;
        }
 
@@ -3628,22 +3598,22 @@ done_prefixes:
         * Decode and fetch the second source operand: register, memory
         * or immediate.
         */
-       switch (c->d & Src2Mask) {
+       switch (ctxt->d & Src2Mask) {
        case Src2None:
                break;
        case Src2CL:
-               c->src2.bytes = 1;
-               c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
+               ctxt->src2.bytes = 1;
+               ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0x8;
                break;
        case Src2ImmByte:
-               rc = decode_imm(ctxt, &c->src2, 1, true);
+               rc = decode_imm(ctxt, &ctxt->src2, 1, true);
                break;
        case Src2One:
-               c->src2.bytes = 1;
-               c->src2.val = 1;
+               ctxt->src2.bytes = 1;
+               ctxt->src2.val = 1;
                break;
        case Src2Imm:
-               rc = decode_imm(ctxt, &c->src2, imm_size(c), true);
+               rc = decode_imm(ctxt, &ctxt->src2, imm_size(ctxt), true);
                break;
        }
 
@@ -3651,68 +3621,66 @@ done_prefixes:
                goto done;
 
        /* Decode and fetch the destination operand: register or memory. */
-       switch (c->d & DstMask) {
+       switch (ctxt->d & DstMask) {
        case DstReg:
-               decode_register_operand(ctxt, &c->dst, c,
-                        c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
+               decode_register_operand(ctxt, &ctxt->dst,
+                        ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
                break;
        case DstImmUByte:
-               c->dst.type = OP_IMM;
-               c->dst.addr.mem.ea = c->eip;
-               c->dst.bytes = 1;
-               c->dst.val = insn_fetch(u8, 1, c->eip);
+               ctxt->dst.type = OP_IMM;
+               ctxt->dst.addr.mem.ea = ctxt->_eip;
+               ctxt->dst.bytes = 1;
+               ctxt->dst.val = insn_fetch(u8, 1, ctxt->_eip);
                break;
        case DstMem:
        case DstMem64:
-               c->dst = memop;
-               memopp = &c->dst;
-               if ((c->d & DstMask) == DstMem64)
-                       c->dst.bytes = 8;
+               ctxt->dst = memop;
+               memopp = &ctxt->dst;
+               if ((ctxt->d & DstMask) == DstMem64)
+                       ctxt->dst.bytes = 8;
                else
-                       c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-               if (c->d & BitOp)
-                       fetch_bit_operand(c);
-               c->dst.orig_val = c->dst.val;
+                       ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+               if (ctxt->d & BitOp)
+                       fetch_bit_operand(ctxt);
+               ctxt->dst.orig_val = ctxt->dst.val;
                break;
        case DstAcc:
-               c->dst.type = OP_REG;
-               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-               c->dst.addr.reg = &c->regs[VCPU_REGS_RAX];
-               fetch_register_operand(&c->dst);
-               c->dst.orig_val = c->dst.val;
+               ctxt->dst.type = OP_REG;
+               ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+               ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
+               fetch_register_operand(&ctxt->dst);
+               ctxt->dst.orig_val = ctxt->dst.val;
                break;
        case DstDI:
-               c->dst.type = OP_MEM;
-               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-               c->dst.addr.mem.ea =
-                       register_address(c, c->regs[VCPU_REGS_RDI]);
-               c->dst.addr.mem.seg = VCPU_SREG_ES;
-               c->dst.val = 0;
+               ctxt->dst.type = OP_MEM;
+               ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+               ctxt->dst.addr.mem.ea =
+                       register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
+               ctxt->dst.addr.mem.seg = VCPU_SREG_ES;
+               ctxt->dst.val = 0;
                break;
        case DstDX:
-               c->dst.type = OP_REG;
-               c->dst.bytes = 2;
-               c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
-               fetch_register_operand(&c->dst);
+               ctxt->dst.type = OP_REG;
+               ctxt->dst.bytes = 2;
+               ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+               fetch_register_operand(&ctxt->dst);
                break;
        case ImplicitOps:
                /* Special instructions do their own operand decoding. */
        default:
-               c->dst.type = OP_NONE; /* Disable writeback. */
+               ctxt->dst.type = OP_NONE; /* Disable writeback. */
                break;
        }
 
 done:
-       if (memopp && memopp->type == OP_MEM && c->rip_relative)
-               memopp->addr.mem.ea += c->eip;
+       if (memopp && memopp->type == OP_MEM && ctxt->rip_relative)
+               memopp->addr.mem.ea += ctxt->_eip;
 
        return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
 }
 
 static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
 {
-       struct decode_cache *c = &ctxt->decode;
-
        /* The second termination condition only applies for REPE
         * and REPNE. Test if the repeat string operation prefix is
         * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
@@ -3720,304 +3688,232 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
         *      - if REPE/REPZ and ZF = 0 then done
         *      - if REPNE/REPNZ and ZF = 1 then done
         */
-       if (((c->b == 0xa6) || (c->b == 0xa7) ||
-            (c->b == 0xae) || (c->b == 0xaf))
-           && (((c->rep_prefix == REPE_PREFIX) &&
+       if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) ||
+            (ctxt->b == 0xae) || (ctxt->b == 0xaf))
+           && (((ctxt->rep_prefix == REPE_PREFIX) &&
                 ((ctxt->eflags & EFLG_ZF) == 0))
-               || ((c->rep_prefix == REPNE_PREFIX) &&
+               || ((ctxt->rep_prefix == REPNE_PREFIX) &&
                    ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
                return true;
 
        return false;
 }
 
-int
-x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 {
        struct x86_emulate_ops *ops = ctxt->ops;
        u64 msr_data;
-       struct decode_cache *c = &ctxt->decode;
        int rc = X86EMUL_CONTINUE;
-       int saved_dst_type = c->dst.type;
-       int irq; /* Used for int 3, int, and into */
+       int saved_dst_type = ctxt->dst.type;
 
-       ctxt->decode.mem_read.pos = 0;
+       ctxt->mem_read.pos = 0;
 
-       if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
+       if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) {
                rc = emulate_ud(ctxt);
                goto done;
        }
 
        /* LOCK prefix is allowed only with some instructions */
-       if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
+       if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) {
                rc = emulate_ud(ctxt);
                goto done;
        }
 
-       if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
+       if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) {
                rc = emulate_ud(ctxt);
                goto done;
        }
 
-       if ((c->d & Sse)
+       if ((ctxt->d & Sse)
            && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)
                || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
                rc = emulate_ud(ctxt);
                goto done;
        }
 
-       if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+       if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
                rc = emulate_nm(ctxt);
                goto done;
        }
 
-       if (unlikely(ctxt->guest_mode) && c->intercept) {
-               rc = emulator_check_intercept(ctxt, c->intercept,
+       if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
+               rc = emulator_check_intercept(ctxt, ctxt->intercept,
                                              X86_ICPT_PRE_EXCEPT);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
        }
 
        /* Privileged instruction can be executed only in CPL=0 */
-       if ((c->d & Priv) && ops->cpl(ctxt)) {
+       if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
                rc = emulate_gp(ctxt, 0);
                goto done;
        }
 
        /* Instruction can only be executed in protected mode */
-       if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
+       if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
                rc = emulate_ud(ctxt);
                goto done;
        }
 
        /* Do instruction specific permission checks */
-       if (c->check_perm) {
-               rc = c->check_perm(ctxt);
+       if (ctxt->check_perm) {
+               rc = ctxt->check_perm(ctxt);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
        }
 
-       if (unlikely(ctxt->guest_mode) && c->intercept) {
-               rc = emulator_check_intercept(ctxt, c->intercept,
+       if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
+               rc = emulator_check_intercept(ctxt, ctxt->intercept,
                                              X86_ICPT_POST_EXCEPT);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
        }
 
-       if (c->rep_prefix && (c->d & String)) {
+       if (ctxt->rep_prefix && (ctxt->d & String)) {
                /* All REP prefixes have the same first termination condition */
-               if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
-                       ctxt->eip = c->eip;
+               if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) {
+                       ctxt->eip = ctxt->_eip;
                        goto done;
                }
        }
 
-       if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
-               rc = segmented_read(ctxt, c->src.addr.mem,
-                                   c->src.valptr, c->src.bytes);
+       if ((ctxt->src.type == OP_MEM) && !(ctxt->d & NoAccess)) {
+               rc = segmented_read(ctxt, ctxt->src.addr.mem,
+                                   ctxt->src.valptr, ctxt->src.bytes);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
-               c->src.orig_val64 = c->src.val64;
+               ctxt->src.orig_val64 = ctxt->src.val64;
        }
 
-       if (c->src2.type == OP_MEM) {
-               rc = segmented_read(ctxt, c->src2.addr.mem,
-                                   &c->src2.val, c->src2.bytes);
+       if (ctxt->src2.type == OP_MEM) {
+               rc = segmented_read(ctxt, ctxt->src2.addr.mem,
+                                   &ctxt->src2.val, ctxt->src2.bytes);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
        }
 
-       if ((c->d & DstMask) == ImplicitOps)
+       if ((ctxt->d & DstMask) == ImplicitOps)
                goto special_insn;
 
 
-       if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
+       if ((ctxt->dst.type == OP_MEM) && !(ctxt->d & Mov)) {
                /* optimisation - avoid slow emulated read if Mov */
-               rc = segmented_read(ctxt, c->dst.addr.mem,
-                                  &c->dst.val, c->dst.bytes);
+               rc = segmented_read(ctxt, ctxt->dst.addr.mem,
+                                  &ctxt->dst.val, ctxt->dst.bytes);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
        }
-       c->dst.orig_val = c->dst.val;
+       ctxt->dst.orig_val = ctxt->dst.val;
 
 special_insn:
 
-       if (unlikely(ctxt->guest_mode) && c->intercept) {
-               rc = emulator_check_intercept(ctxt, c->intercept,
+       if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
+               rc = emulator_check_intercept(ctxt, ctxt->intercept,
                                              X86_ICPT_POST_MEMACCESS);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
        }
 
-       if (c->execute) {
-               rc = c->execute(ctxt);
+       if (ctxt->execute) {
+               rc = ctxt->execute(ctxt);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
                goto writeback;
        }
 
-       if (c->twobyte)
+       if (ctxt->twobyte)
                goto twobyte_insn;
 
-       switch (c->b) {
+       switch (ctxt->b) {
        case 0x06:              /* push es */
-               rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
+               rc = emulate_push_sreg(ctxt, VCPU_SREG_ES);
                break;
        case 0x07:              /* pop es */
-               rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
+               rc = emulate_pop_sreg(ctxt, VCPU_SREG_ES);
                break;
        case 0x0e:              /* push cs */
-               rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
+               rc = emulate_push_sreg(ctxt, VCPU_SREG_CS);
                break;
        case 0x16:              /* push ss */
-               rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
+               rc = emulate_push_sreg(ctxt, VCPU_SREG_SS);
                break;
        case 0x17:              /* pop ss */
-               rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
+               rc = emulate_pop_sreg(ctxt, VCPU_SREG_SS);
                break;
        case 0x1e:              /* push ds */
-               rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
+               rc = emulate_push_sreg(ctxt, VCPU_SREG_DS);
                break;
        case 0x1f:              /* pop ds */
-               rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
+               rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS);
                break;
        case 0x40 ... 0x47: /* inc r16/r32 */
-               emulate_1op("inc", c->dst, ctxt->eflags);
+               emulate_1op("inc", ctxt->dst, ctxt->eflags);
                break;
        case 0x48 ... 0x4f: /* dec r16/r32 */
-               emulate_1op("dec", c->dst, ctxt->eflags);
+               emulate_1op("dec", ctxt->dst, ctxt->eflags);
                break;
        case 0x63:              /* movsxd */
                if (ctxt->mode != X86EMUL_MODE_PROT64)
                        goto cannot_emulate;
-               c->dst.val = (s32) c->src.val;
+               ctxt->dst.val = (s32) ctxt->src.val;
                break;
        case 0x6c:              /* insb */
        case 0x6d:              /* insw/insd */
-               c->src.val = c->regs[VCPU_REGS_RDX];
+               ctxt->src.val = ctxt->regs[VCPU_REGS_RDX];
                goto do_io_in;
        case 0x6e:              /* outsb */
        case 0x6f:              /* outsw/outsd */
-               c->dst.val = c->regs[VCPU_REGS_RDX];
+               ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX];
                goto do_io_out;
                break;
        case 0x70 ... 0x7f: /* jcc (short) */
-               if (test_cc(c->b, ctxt->eflags))
-                       jmp_rel(c, c->src.val);
-               break;
-       case 0x84 ... 0x85:
-       test:
-               emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
-               break;
-       case 0x86 ... 0x87:     /* xchg */
-       xchg:
-               /* Write back the register source. */
-               c->src.val = c->dst.val;
-               write_register_operand(&c->src);
-               /*
-                * Write back the memory destination with implicit LOCK
-                * prefix.
-                */
-               c->dst.val = c->src.orig_val;
-               c->lock_prefix = 1;
-               break;
-       case 0x8c:  /* mov r/m, sreg */
-               if (c->modrm_reg > VCPU_SREG_GS) {
-                       rc = emulate_ud(ctxt);
-                       goto done;
-               }
-               c->dst.val = get_segment_selector(ctxt, c->modrm_reg);
+               if (test_cc(ctxt->b, ctxt->eflags))
+                       jmp_rel(ctxt, ctxt->src.val);
                break;
        case 0x8d: /* lea r16/r32, m */
-               c->dst.val = c->src.addr.mem.ea;
+               ctxt->dst.val = ctxt->src.addr.mem.ea;
                break;
-       case 0x8e: { /* mov seg, r/m16 */
-               uint16_t sel;
-
-               sel = c->src.val;
-
-               if (c->modrm_reg == VCPU_SREG_CS ||
-                   c->modrm_reg > VCPU_SREG_GS) {
-                       rc = emulate_ud(ctxt);
-                       goto done;
-               }
-
-               if (c->modrm_reg == VCPU_SREG_SS)
-                       ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
-
-               rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
-
-               c->dst.type = OP_NONE;  /* Disable writeback. */
-               break;
-       }
        case 0x8f:              /* pop (sole member of Grp1a) */
                rc = em_grp1a(ctxt);
                break;
        case 0x90 ... 0x97: /* nop / xchg reg, rax */
-               if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
+               if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX])
                        break;
-               goto xchg;
+               rc = em_xchg(ctxt);
+               break;
        case 0x98: /* cbw/cwde/cdqe */
-               switch (c->op_bytes) {
-               case 2: c->dst.val = (s8)c->dst.val; break;
-               case 4: c->dst.val = (s16)c->dst.val; break;
-               case 8: c->dst.val = (s32)c->dst.val; break;
+               switch (ctxt->op_bytes) {
+               case 2: ctxt->dst.val = (s8)ctxt->dst.val; break;
+               case 4: ctxt->dst.val = (s16)ctxt->dst.val; break;
+               case 8: ctxt->dst.val = (s32)ctxt->dst.val; break;
                }
                break;
-       case 0xa8 ... 0xa9:     /* test ax, imm */
-               goto test;
        case 0xc0 ... 0xc1:
                rc = em_grp2(ctxt);
                break;
-       case 0xc3: /* ret */
-               c->dst.type = OP_REG;
-               c->dst.addr.reg = &c->eip;
-               c->dst.bytes = c->op_bytes;
-               rc = em_pop(ctxt);
-               break;
        case 0xc4:              /* les */
-               rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES);
+               rc = emulate_load_segment(ctxt, VCPU_SREG_ES);
                break;
        case 0xc5:              /* lds */
-               rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS);
-               break;
-       case 0xcb:              /* ret far */
-               rc = emulate_ret_far(ctxt, ops);
+               rc = emulate_load_segment(ctxt, VCPU_SREG_DS);
                break;
        case 0xcc:              /* int3 */
-               irq = 3;
-               goto do_interrupt;
+               rc = emulate_int(ctxt, 3);
+               break;
        case 0xcd:              /* int n */
-               irq = c->src.val;
-       do_interrupt:
-               rc = emulate_int(ctxt, ops, irq);
+               rc = emulate_int(ctxt, ctxt->src.val);
                break;
        case 0xce:              /* into */
-               if (ctxt->eflags & EFLG_OF) {
-                       irq = 4;
-                       goto do_interrupt;
-               }
-               break;
-       case 0xcf:              /* iret */
-               rc = emulate_iret(ctxt, ops);
+               if (ctxt->eflags & EFLG_OF)
+                       rc = emulate_int(ctxt, 4);
                break;
        case 0xd0 ... 0xd1:     /* Grp2 */
                rc = em_grp2(ctxt);
                break;
        case 0xd2 ... 0xd3:     /* Grp2 */
-               c->src.val = c->regs[VCPU_REGS_RCX];
+               ctxt->src.val = ctxt->regs[VCPU_REGS_RCX];
                rc = em_grp2(ctxt);
                break;
-       case 0xe0 ... 0xe2:     /* loop/loopz/loopnz */
-               register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
-               if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 &&
-                   (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
-                       jmp_rel(c, c->src.val);
-               break;
-       case 0xe3:      /* jcxz/jecxz/jrcxz */
-               if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
-                       jmp_rel(c, c->src.val);
-               break;
        case 0xe4:      /* inb */
        case 0xe5:      /* in */
                goto do_io_in;
@@ -4025,35 +3921,30 @@ special_insn:
        case 0xe7: /* out */
                goto do_io_out;
        case 0xe8: /* call (near) */ {
-               long int rel = c->src.val;
-               c->src.val = (unsigned long) c->eip;
-               jmp_rel(c, rel);
+               long int rel = ctxt->src.val;
+               ctxt->src.val = (unsigned long) ctxt->_eip;
+               jmp_rel(ctxt, rel);
                rc = em_push(ctxt);
                break;
        }
        case 0xe9: /* jmp rel */
-               goto jmp;
-       case 0xea: /* jmp far */
-               rc = em_jmp_far(ctxt);
-               break;
-       case 0xeb:
-             jmp:              /* jmp rel short */
-               jmp_rel(c, c->src.val);
-               c->dst.type = OP_NONE; /* Disable writeback. */
+       case 0xeb: /* jmp rel short */
+               jmp_rel(ctxt, ctxt->src.val);
+               ctxt->dst.type = OP_NONE; /* Disable writeback. */
                break;
        case 0xec: /* in al,dx */
        case 0xed: /* in (e/r)ax,dx */
        do_io_in:
-               if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
-                                    &c->dst.val))
+               if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
+                                    &ctxt->dst.val))
                        goto done; /* IO is needed */
                break;
        case 0xee: /* out dx,al */
        case 0xef: /* out dx,(e/r)ax */
        do_io_out:
-               ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val,
-                                     &c->src.val, 1);
-               c->dst.type = OP_NONE;  /* Disable writeback. */
+               ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
+                                     &ctxt->src.val, 1);
+               ctxt->dst.type = OP_NONE;       /* Disable writeback. */
                break;
        case 0xf4:              /* hlt */
                ctxt->ops->halt(ctxt);
@@ -4071,22 +3962,6 @@ special_insn:
        case 0xf9: /* stc */
                ctxt->eflags |= EFLG_CF;
                break;
-       case 0xfa: /* cli */
-               if (emulator_bad_iopl(ctxt, ops)) {
-                       rc = emulate_gp(ctxt, 0);
-                       goto done;
-               } else
-                       ctxt->eflags &= ~X86_EFLAGS_IF;
-               break;
-       case 0xfb: /* sti */
-               if (emulator_bad_iopl(ctxt, ops)) {
-                       rc = emulate_gp(ctxt, 0);
-                       goto done;
-               } else {
-                       ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
-                       ctxt->eflags |= X86_EFLAGS_IF;
-               }
-               break;
        case 0xfc: /* cld */
                ctxt->eflags &= ~EFLG_DF;
                break;
@@ -4115,40 +3990,40 @@ writeback:
         * restore dst type in case the decoding will be reused
         * (happens for string instruction )
         */
-       c->dst.type = saved_dst_type;
+       ctxt->dst.type = saved_dst_type;
 
-       if ((c->d & SrcMask) == SrcSI)
-               string_addr_inc(ctxt, seg_override(ctxt, c),
-                               VCPU_REGS_RSI, &c->src);
+       if ((ctxt->d & SrcMask) == SrcSI)
+               string_addr_inc(ctxt, seg_override(ctxt),
+                               VCPU_REGS_RSI, &ctxt->src);
 
-       if ((c->d & DstMask) == DstDI)
+       if ((ctxt->d & DstMask) == DstDI)
                string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
-                               &c->dst);
+                               &ctxt->dst);
 
-       if (c->rep_prefix && (c->d & String)) {
-               struct read_cache *r = &ctxt->decode.io_read;
-               register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
+       if (ctxt->rep_prefix && (ctxt->d & String)) {
+               struct read_cache *r = &ctxt->io_read;
+               register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
 
                if (!string_insn_completed(ctxt)) {
                        /*
                         * Re-enter guest when pio read ahead buffer is empty
                         * or, if it is not used, after each 1024 iteration.
                         */
-                       if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) &&
+                       if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) &&
                            (r->end == 0 || r->end != r->pos)) {
                                /*
                                 * Reset read cache. Usually happens before
                                 * decode, but since instruction is restarted
                                 * we have to do it here.
                                 */
-                               ctxt->decode.mem_read.end = 0;
+                               ctxt->mem_read.end = 0;
                                return EMULATION_RESTART;
                        }
                        goto done; /* skip rip writeback */
                }
        }
 
-       ctxt->eip = c->eip;
+       ctxt->eip = ctxt->_eip;
 
 done:
        if (rc == X86EMUL_PROPAGATE_FAULT)
@@ -4159,13 +4034,7 @@ done:
        return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
 
 twobyte_insn:
-       switch (c->b) {
-       case 0x05:              /* syscall */
-               rc = emulate_syscall(ctxt, ops);
-               break;
-       case 0x06:
-               rc = em_clts(ctxt);
-               break;
+       switch (ctxt->b) {
        case 0x09:              /* wbinvd */
                (ctxt->ops->wbinvd)(ctxt);
                break;
@@ -4174,21 +4043,21 @@ twobyte_insn:
        case 0x18:              /* Grp16 (prefetch/nop) */
                break;
        case 0x20: /* mov cr, reg */
-               c->dst.val = ops->get_cr(ctxt, c->modrm_reg);
+               ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg);
                break;
        case 0x21: /* mov from dr to reg */
-               ops->get_dr(ctxt, c->modrm_reg, &c->dst.val);
+               ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
                break;
        case 0x22: /* mov reg, cr */
-               if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) {
+               if (ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) {
                        emulate_gp(ctxt, 0);
                        rc = X86EMUL_PROPAGATE_FAULT;
                        goto done;
                }
-               c->dst.type = OP_NONE;
+               ctxt->dst.type = OP_NONE;
                break;
        case 0x23: /* mov from reg to dr */
-               if (ops->set_dr(ctxt, c->modrm_reg, c->src.val &
+               if (ops->set_dr(ctxt, ctxt->modrm_reg, ctxt->src.val &
                                ((ctxt->mode == X86EMUL_MODE_PROT64) ?
                                 ~0ULL : ~0U)) < 0) {
                        /* #UD condition is already handled by the code above */
@@ -4197,13 +4066,13 @@ twobyte_insn:
                        goto done;
                }
 
-               c->dst.type = OP_NONE;  /* no writeback */
+               ctxt->dst.type = OP_NONE;       /* no writeback */
                break;
        case 0x30:
                /* wrmsr */
-               msr_data = (u32)c->regs[VCPU_REGS_RAX]
-                       | ((u64)c->regs[VCPU_REGS_RDX] << 32);
-               if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) {
+               msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
+                       | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
+               if (ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) {
                        emulate_gp(ctxt, 0);
                        rc = X86EMUL_PROPAGATE_FAULT;
                        goto done;
@@ -4212,64 +4081,58 @@ twobyte_insn:
                break;
        case 0x32:
                /* rdmsr */
-               if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) {
+               if (ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) {
                        emulate_gp(ctxt, 0);
                        rc = X86EMUL_PROPAGATE_FAULT;
                        goto done;
                } else {
-                       c->regs[VCPU_REGS_RAX] = (u32)msr_data;
-                       c->regs[VCPU_REGS_RDX] = msr_data >> 32;
+                       ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
+                       ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
                }
                rc = X86EMUL_CONTINUE;
                break;
-       case 0x34:              /* sysenter */
-               rc = emulate_sysenter(ctxt, ops);
-               break;
-       case 0x35:              /* sysexit */
-               rc = emulate_sysexit(ctxt, ops);
-               break;
        case 0x40 ... 0x4f:     /* cmov */
-               c->dst.val = c->dst.orig_val = c->src.val;
-               if (!test_cc(c->b, ctxt->eflags))
-                       c->dst.type = OP_NONE; /* no writeback */
+               ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val;
+               if (!test_cc(ctxt->b, ctxt->eflags))
+                       ctxt->dst.type = OP_NONE; /* no writeback */
                break;
        case 0x80 ... 0x8f: /* jnz rel, etc*/
-               if (test_cc(c->b, ctxt->eflags))
-                       jmp_rel(c, c->src.val);
+               if (test_cc(ctxt->b, ctxt->eflags))
+                       jmp_rel(ctxt, ctxt->src.val);
                break;
        case 0x90 ... 0x9f:     /* setcc r/m8 */
-               c->dst.val = test_cc(c->b, ctxt->eflags);
+               ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
                break;
        case 0xa0:        /* push fs */
-               rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
+               rc = emulate_push_sreg(ctxt, VCPU_SREG_FS);
                break;
        case 0xa1:       /* pop fs */
-               rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
+               rc = emulate_pop_sreg(ctxt, VCPU_SREG_FS);
                break;
        case 0xa3:
              bt:               /* bt */
-               c->dst.type = OP_NONE;
+               ctxt->dst.type = OP_NONE;
                /* only subword offset */
-               c->src.val &= (c->dst.bytes << 3) - 1;
-               emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
+               ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
+               emulate_2op_SrcV_nobyte("bt", ctxt->src, ctxt->dst, ctxt->eflags);
                break;
        case 0xa4: /* shld imm8, r, r/m */
        case 0xa5: /* shld cl, r, r/m */
-               emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
+               emulate_2op_cl("shld", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags);
                break;
        case 0xa8:      /* push gs */
-               rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
+               rc = emulate_push_sreg(ctxt, VCPU_SREG_GS);
                break;
        case 0xa9:      /* pop gs */
-               rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
+               rc = emulate_pop_sreg(ctxt, VCPU_SREG_GS);
                break;
        case 0xab:
              bts:              /* bts */
-               emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
+               emulate_2op_SrcV_nobyte("bts", ctxt->src, ctxt->dst, ctxt->eflags);
                break;
        case 0xac: /* shrd imm8, r, r/m */
        case 0xad: /* shrd cl, r, r/m */
-               emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags);
+               emulate_2op_cl("shrd", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags);
                break;
        case 0xae:              /* clflush */
                break;
@@ -4278,38 +4141,38 @@ twobyte_insn:
                 * Save real source value, then compare EAX against
                 * destination.
                 */
-               c->src.orig_val = c->src.val;
-               c->src.val = c->regs[VCPU_REGS_RAX];
-               emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+               ctxt->src.orig_val = ctxt->src.val;
+               ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
+               emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags);
                if (ctxt->eflags & EFLG_ZF) {
                        /* Success: write back to memory. */
-                       c->dst.val = c->src.orig_val;
+                       ctxt->dst.val = ctxt->src.orig_val;
                } else {
                        /* Failure: write the value we saw to EAX. */
-                       c->dst.type = OP_REG;
-                       c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+                       ctxt->dst.type = OP_REG;
+                       ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
                }
                break;
        case 0xb2:              /* lss */
-               rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS);
+               rc = emulate_load_segment(ctxt, VCPU_SREG_SS);
                break;
        case 0xb3:
              btr:              /* btr */
-               emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
+               emulate_2op_SrcV_nobyte("btr", ctxt->src, ctxt->dst, ctxt->eflags);
                break;
        case 0xb4:              /* lfs */
-               rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS);
+               rc = emulate_load_segment(ctxt, VCPU_SREG_FS);
                break;
        case 0xb5:              /* lgs */
-               rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS);
+               rc = emulate_load_segment(ctxt, VCPU_SREG_GS);
                break;
        case 0xb6 ... 0xb7:     /* movzx */
-               c->dst.bytes = c->op_bytes;
-               c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
-                                                      : (u16) c->src.val;
+               ctxt->dst.bytes = ctxt->op_bytes;
+               ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
+                                                      : (u16) ctxt->src.val;
                break;
        case 0xba:              /* Grp8 */
-               switch (c->modrm_reg & 3) {
+               switch (ctxt->modrm_reg & 3) {
                case 0:
                        goto bt;
                case 1:
@@ -4322,47 +4185,47 @@ twobyte_insn:
                break;
        case 0xbb:
              btc:              /* btc */
-               emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
+               emulate_2op_SrcV_nobyte("btc", ctxt->src, ctxt->dst, ctxt->eflags);
                break;
        case 0xbc: {            /* bsf */
                u8 zf;
                __asm__ ("bsf %2, %0; setz %1"
-                        : "=r"(c->dst.val), "=q"(zf)
-                        : "r"(c->src.val));
+                        : "=r"(ctxt->dst.val), "=q"(zf)
+                        : "r"(ctxt->src.val));
                ctxt->eflags &= ~X86_EFLAGS_ZF;
                if (zf) {
                        ctxt->eflags |= X86_EFLAGS_ZF;
-                       c->dst.type = OP_NONE;  /* Disable writeback. */
+                       ctxt->dst.type = OP_NONE;       /* Disable writeback. */
                }
                break;
        }
        case 0xbd: {            /* bsr */
                u8 zf;
                __asm__ ("bsr %2, %0; setz %1"
-                        : "=r"(c->dst.val), "=q"(zf)
-                        : "r"(c->src.val));
+                        : "=r"(ctxt->dst.val), "=q"(zf)
+                        : "r"(ctxt->src.val));
                ctxt->eflags &= ~X86_EFLAGS_ZF;
                if (zf) {
                        ctxt->eflags |= X86_EFLAGS_ZF;
-                       c->dst.type = OP_NONE;  /* Disable writeback. */
+                       ctxt->dst.type = OP_NONE;       /* Disable writeback. */
                }
                break;
        }
        case 0xbe ... 0xbf:     /* movsx */
-               c->dst.bytes = c->op_bytes;
-               c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
-                                                       (s16) c->src.val;
+               ctxt->dst.bytes = ctxt->op_bytes;
+               ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val :
+                                                       (s16) ctxt->src.val;
                break;
        case 0xc0 ... 0xc1:     /* xadd */
-               emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+               emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags);
                /* Write back the register source. */
-               c->src.val = c->dst.orig_val;
-               write_register_operand(&c->src);
+               ctxt->src.val = ctxt->dst.orig_val;
+               write_register_operand(&ctxt->src);
                break;
        case 0xc3:              /* movnti */
-               c->dst.bytes = c->op_bytes;
-               c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
-                                                       (u64) c->src.val;
+               ctxt->dst.bytes = ctxt->op_bytes;
+               ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
+                                                       (u64) ctxt->src.val;
                break;
        case 0xc7:              /* Grp9 (cmpxchg8b) */
                rc = em_grp9(ctxt);
index aee3862..9335e1b 100644 (file)
@@ -148,7 +148,7 @@ module_param(oos_shadow, bool, 0644);
 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
                        | PT64_NX_MASK)
 
-#define RMAP_EXT 4
+#define PTE_LIST_EXT 4
 
 #define ACC_EXEC_MASK    1
 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
@@ -164,16 +164,16 @@ module_param(oos_shadow, bool, 0644);
 
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
-struct kvm_rmap_desc {
-       u64 *sptes[RMAP_EXT];
-       struct kvm_rmap_desc *more;
+struct pte_list_desc {
+       u64 *sptes[PTE_LIST_EXT];
+       struct pte_list_desc *more;
 };
 
 struct kvm_shadow_walk_iterator {
        u64 addr;
        hpa_t shadow_addr;
-       int level;
        u64 *sptep;
+       int level;
        unsigned index;
 };
 
@@ -182,32 +182,68 @@ struct kvm_shadow_walk_iterator {
             shadow_walk_okay(&(_walker));                      \
             shadow_walk_next(&(_walker)))
 
-typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
+#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)    \
+       for (shadow_walk_init(&(_walker), _vcpu, _addr);                \
+            shadow_walk_okay(&(_walker)) &&                            \
+               ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });  \
+            __shadow_walk_next(&(_walker), spte))
 
-static struct kmem_cache *pte_chain_cache;
-static struct kmem_cache *rmap_desc_cache;
+static struct kmem_cache *pte_list_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
 
-static u64 __read_mostly shadow_trap_nonpresent_pte;
-static u64 __read_mostly shadow_notrap_nonpresent_pte;
 static u64 __read_mostly shadow_nx_mask;
 static u64 __read_mostly shadow_x_mask;        /* mutual exclusive with nx_mask */
 static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mmio_mask;
 
-static inline u64 rsvd_bits(int s, int e)
+static void mmu_spte_set(u64 *sptep, u64 spte);
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
 {
-       return ((1ULL << (e - s + 1)) - 1) << s;
+       shadow_mmio_mask = mmio_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+
+static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+{
+       access &= ACC_WRITE_MASK | ACC_USER_MASK;
+
+       trace_mark_mmio_spte(sptep, gfn, access);
+       mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
+}
+
+static bool is_mmio_spte(u64 spte)
+{
+       return (spte & shadow_mmio_mask) == shadow_mmio_mask;
+}
+
+static gfn_t get_mmio_spte_gfn(u64 spte)
+{
+       return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+}
+
+static unsigned get_mmio_spte_access(u64 spte)
+{
+       return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
 }
 
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
+static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
 {
-       shadow_trap_nonpresent_pte = trap_pte;
-       shadow_notrap_nonpresent_pte = notrap_pte;
+       if (unlikely(is_noslot_pfn(pfn))) {
+               mark_mmio_spte(sptep, gfn, access);
+               return true;
+       }
+
+       return false;
+}
+
+static inline u64 rsvd_bits(int s, int e)
+{
+       return ((1ULL << (e - s + 1)) - 1) << s;
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
 
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
                u64 dirty_mask, u64 nx_mask, u64 x_mask)
@@ -220,11 +256,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
-static bool is_write_protection(struct kvm_vcpu *vcpu)
-{
-       return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
-}
-
 static int is_cpuid_PSE36(void)
 {
        return 1;
@@ -237,8 +268,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-       return pte != shadow_trap_nonpresent_pte
-               && pte != shadow_notrap_nonpresent_pte;
+       return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
 }
 
 static int is_large_pte(u64 pte)
@@ -246,11 +276,6 @@ static int is_large_pte(u64 pte)
        return pte & PT_PAGE_SIZE_MASK;
 }
 
-static int is_writable_pte(unsigned long pte)
-{
-       return pte & PT_WRITABLE_MASK;
-}
-
 static int is_dirty_gpte(unsigned long pte)
 {
        return pte & PT_DIRTY_MASK;
@@ -282,26 +307,154 @@ static gfn_t pse36_gfn_delta(u32 gpte)
        return (gpte & PT32_DIR_PSE36_MASK) << shift;
 }
 
+#ifdef CONFIG_X86_64
 static void __set_spte(u64 *sptep, u64 spte)
 {
-       set_64bit(sptep, spte);
+       *sptep = spte;
 }
 
-static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
 {
-#ifdef CONFIG_X86_64
-       return xchg(sptep, new_spte);
+       *sptep = spte;
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+       return xchg(sptep, spte);
+}
+
+static u64 __get_spte_lockless(u64 *sptep)
+{
+       return ACCESS_ONCE(*sptep);
+}
+
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+       /* It is valid if the spte is zapped. */
+       return spte == 0ull;
+}
 #else
-       u64 old_spte;
+union split_spte {
+       struct {
+               u32 spte_low;
+               u32 spte_high;
+       };
+       u64 spte;
+};
 
-       do {
-               old_spte = *sptep;
-       } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+static void count_spte_clear(u64 *sptep, u64 spte)
+{
+       struct kvm_mmu_page *sp =  page_header(__pa(sptep));
 
-       return old_spte;
-#endif
+       if (is_shadow_present_pte(spte))
+               return;
+
+       /* Ensure the spte is completely set before we increase the count */
+       smp_wmb();
+       sp->clear_spte_count++;
+}
+
+static void __set_spte(u64 *sptep, u64 spte)
+{
+       union split_spte *ssptep, sspte;
+
+       ssptep = (union split_spte *)sptep;
+       sspte = (union split_spte)spte;
+
+       ssptep->spte_high = sspte.spte_high;
+
+       /*
+        * If we map the spte from nonpresent to present, we should store
+        * the high bits first, then set the present bit, so the cpu cannot
+        * fetch this spte while we are setting the spte.
+        */
+       smp_wmb();
+
+       ssptep->spte_low = sspte.spte_low;
 }
 
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
+{
+       union split_spte *ssptep, sspte;
+
+       ssptep = (union split_spte *)sptep;
+       sspte = (union split_spte)spte;
+
+       ssptep->spte_low = sspte.spte_low;
+
+       /*
+        * If we map the spte from present to nonpresent, we should clear
+        * the present bit first to avoid the vcpu fetching the old high bits.
+        */
+       smp_wmb();
+
+       ssptep->spte_high = sspte.spte_high;
+       count_spte_clear(sptep, spte);
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+       union split_spte *ssptep, sspte, orig;
+
+       ssptep = (union split_spte *)sptep;
+       sspte = (union split_spte)spte;
+
+       /* xchg acts as a barrier before the setting of the high bits */
+       orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
+       orig.spte_high = ssptep->spte_high = sspte.spte_high;
+       count_spte_clear(sptep, spte);
+
+       return orig.spte;
+}
+
+/*
+ * The idea of using the light way to get the spte on x86_32 guest is from
+ * gup_get_pte(arch/x86/mm/gup.c).
+ * The difference is we can not catch the spte tlb flush if we leave
+ * guest mode, so we emulate it by increasing clear_spte_count when spte
+ * is cleared.
+ */
+static u64 __get_spte_lockless(u64 *sptep)
+{
+       struct kvm_mmu_page *sp =  page_header(__pa(sptep));
+       union split_spte spte, *orig = (union split_spte *)sptep;
+       int count;
+
+retry:
+       count = sp->clear_spte_count;
+       smp_rmb();
+
+       spte.spte_low = orig->spte_low;
+       smp_rmb();
+
+       spte.spte_high = orig->spte_high;
+       smp_rmb();
+
+       if (unlikely(spte.spte_low != orig->spte_low ||
+             count != sp->clear_spte_count))
+               goto retry;
+
+       return spte.spte;
+}
+
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+       union split_spte sspte = (union split_spte)spte;
+       u32 high_mmio_mask = shadow_mmio_mask >> 32;
+
+       /* It is valid if the spte is zapped. */
+       if (spte == 0ull)
+               return true;
+
+       /* It is valid if the spte is being zapped. */
+       if (sspte.spte_low == 0ull &&
+           (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
+               return true;
+
+       return false;
+}
+#endif
+
 static bool spte_has_volatile_bits(u64 spte)
 {
        if (!shadow_accessed_mask)
@@ -322,12 +475,30 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
        return (old_spte & bit_mask) && !(new_spte & bit_mask);
 }
 
-static void update_spte(u64 *sptep, u64 new_spte)
+/* Rules for using mmu_spte_set:
+ * Set the sptep from nonpresent to present.
+ * Note: the sptep being assigned *must* be either not present
+ * or in a state where the hardware will not attempt to update
+ * the spte.
+ */
+static void mmu_spte_set(u64 *sptep, u64 new_spte)
+{
+       WARN_ON(is_shadow_present_pte(*sptep));
+       __set_spte(sptep, new_spte);
+}
+
+/* Rules for using mmu_spte_update:
+ * Update the state bits; it means the mapped pfn is not changed.
+ */
+static void mmu_spte_update(u64 *sptep, u64 new_spte)
 {
        u64 mask, old_spte = *sptep;
 
        WARN_ON(!is_rmap_spte(new_spte));
 
+       if (!is_shadow_present_pte(old_spte))
+               return mmu_spte_set(sptep, new_spte);
+
        new_spte |= old_spte & shadow_dirty_mask;
 
        mask = shadow_accessed_mask;
@@ -335,9 +506,9 @@ static void update_spte(u64 *sptep, u64 new_spte)
                mask |= shadow_dirty_mask;
 
        if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
-               __set_spte(sptep, new_spte);
+               __update_clear_spte_fast(sptep, new_spte);
        else
-               old_spte = __xchg_spte(sptep, new_spte);
+               old_spte = __update_clear_spte_slow(sptep, new_spte);
 
        if (!shadow_accessed_mask)
                return;
@@ -348,6 +519,64 @@ static void update_spte(u64 *sptep, u64 new_spte)
                kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 }
 
+/*
+ * Rules for using mmu_spte_clear_track_bits:
+ * It sets the sptep from present to nonpresent, and tracks the
+ * state bits; it is used to clear the last level sptep.
+ */
+static int mmu_spte_clear_track_bits(u64 *sptep)
+{
+       pfn_t pfn;
+       u64 old_spte = *sptep;
+
+       if (!spte_has_volatile_bits(old_spte))
+               __update_clear_spte_fast(sptep, 0ull);
+       else
+               old_spte = __update_clear_spte_slow(sptep, 0ull);
+
+       if (!is_rmap_spte(old_spte))
+               return 0;
+
+       pfn = spte_to_pfn(old_spte);
+       if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
+               kvm_set_pfn_accessed(pfn);
+       if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
+               kvm_set_pfn_dirty(pfn);
+       return 1;
+}
+
+/*
+ * Rules for using mmu_spte_clear_no_track:
+ * Directly clear spte without caring about the state bits of sptep;
+ * it is used to set the upper level spte.
+ */
+static void mmu_spte_clear_no_track(u64 *sptep)
+{
+       __update_clear_spte_fast(sptep, 0ull);
+}
+
+static u64 mmu_spte_get_lockless(u64 *sptep)
+{
+       return __get_spte_lockless(sptep);
+}
+
+static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
+{
+       rcu_read_lock();
+       atomic_inc(&vcpu->kvm->arch.reader_counter);
+
+       /* Increase the counter before walking shadow page table */
+       smp_mb__after_atomic_inc();
+}
+
+static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
+{
+       /* Decrease the counter after walking shadow page table finished */
+       smp_mb__before_atomic_dec();
+       atomic_dec(&vcpu->kvm->arch.reader_counter);
+       rcu_read_unlock();
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
                                  struct kmem_cache *base_cache, int min)
 {
@@ -397,12 +626,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
 {
        int r;
 
-       r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
-                                  pte_chain_cache, 4);
-       if (r)
-               goto out;
-       r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
-                                  rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
+       r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+                                  pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
        if (r)
                goto out;
        r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -416,8 +641,8 @@ out:
 
 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
-       mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
-       mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
+       mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+                               pte_list_desc_cache);
        mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
        mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
                                mmu_page_header_cache);
@@ -433,26 +658,15 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
        return p;
 }
 
-static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
-{
-       return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
-                                     sizeof(struct kvm_pte_chain));
-}
-
-static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
+static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
 {
-       kmem_cache_free(pte_chain_cache, pc);
+       return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache,
+                                     sizeof(struct pte_list_desc));
 }
 
-static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
+static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
 {
-       return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
-                                     sizeof(struct kvm_rmap_desc));
-}
-
-static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
-{
-       kmem_cache_free(rmap_desc_cache, rd);
+       kmem_cache_free(pte_list_desc_cache, pte_list_desc);
 }
 
 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
@@ -498,6 +712,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
                linfo = lpage_info_slot(gfn, slot, i);
                linfo->write_count += 1;
        }
+       kvm->arch.indirect_shadow_pages++;
 }
 
 static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -513,6 +728,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
                linfo->write_count -= 1;
                WARN_ON(linfo->write_count < 0);
        }
+       kvm->arch.indirect_shadow_pages--;
 }
 
 static int has_wrprotected_page(struct kvm *kvm,
@@ -588,67 +804,42 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 }
 
 /*
- * Take gfn and return the reverse mapping to it.
- */
-
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
-{
-       struct kvm_memory_slot *slot;
-       struct kvm_lpage_info *linfo;
-
-       slot = gfn_to_memslot(kvm, gfn);
-       if (likely(level == PT_PAGE_TABLE_LEVEL))
-               return &slot->rmap[gfn - slot->base_gfn];
-
-       linfo = lpage_info_slot(gfn, slot, level);
-
-       return &linfo->rmap_pde;
-}
-
-/*
- * Reverse mapping data structures:
+ * Pte mapping structures:
  *
- * If rmapp bit zero is zero, then rmapp point to the shadw page table entry
- * that points to page_address(page).
+ * If pte_list bit zero is zero, then pte_list points to the spte.
  *
- * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
- * containing more mappings.
+ * If pte_list bit zero is one, then (pte_list & ~1) points to a struct
+ * pte_list_desc containing more mappings.
  *
- * Returns the number of rmap entries before the spte was added or zero if
+ * Returns the number of pte entries before the spte was added or zero if
  * the spte was not added.
  *
  */
-static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
+                       unsigned long *pte_list)
 {
-       struct kvm_mmu_page *sp;
-       struct kvm_rmap_desc *desc;
-       unsigned long *rmapp;
+       struct pte_list_desc *desc;
        int i, count = 0;
 
-       if (!is_rmap_spte(*spte))
-               return count;
-       sp = page_header(__pa(spte));
-       kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
-       rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
-       if (!*rmapp) {
-               rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
-               *rmapp = (unsigned long)spte;
-       } else if (!(*rmapp & 1)) {
-               rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
-               desc = mmu_alloc_rmap_desc(vcpu);
-               desc->sptes[0] = (u64 *)*rmapp;
+       if (!*pte_list) {
+               rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
+               *pte_list = (unsigned long)spte;
+       } else if (!(*pte_list & 1)) {
+               rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
+               desc = mmu_alloc_pte_list_desc(vcpu);
+               desc->sptes[0] = (u64 *)*pte_list;
                desc->sptes[1] = spte;
-               *rmapp = (unsigned long)desc | 1;
+               *pte_list = (unsigned long)desc | 1;
                ++count;
        } else {
-               rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
-               desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-               while (desc->sptes[RMAP_EXT-1] && desc->more) {
+               rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
+               desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+               while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
                        desc = desc->more;
-                       count += RMAP_EXT;
+                       count += PTE_LIST_EXT;
                }
-               if (desc->sptes[RMAP_EXT-1]) {
-                       desc->more = mmu_alloc_rmap_desc(vcpu);
+               if (desc->sptes[PTE_LIST_EXT-1]) {
+                       desc->more = mmu_alloc_pte_list_desc(vcpu);
                        desc = desc->more;
                }
                for (i = 0; desc->sptes[i]; ++i)
@@ -658,59 +849,78 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
        return count;
 }
 
-static void rmap_desc_remove_entry(unsigned long *rmapp,
-                                  struct kvm_rmap_desc *desc,
-                                  int i,
-                                  struct kvm_rmap_desc *prev_desc)
+static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
+{
+       struct pte_list_desc *desc;
+       u64 *prev_spte;
+       int i;
+
+       if (!*pte_list)
+               return NULL;
+       else if (!(*pte_list & 1)) {
+               if (!spte)
+                       return (u64 *)*pte_list;
+               return NULL;
+       }
+       desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+       prev_spte = NULL;
+       while (desc) {
+               for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
+                       if (prev_spte == spte)
+                               return desc->sptes[i];
+                       prev_spte = desc->sptes[i];
+               }
+               desc = desc->more;
+       }
+       return NULL;
+}
+
+static void
+pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
+                          int i, struct pte_list_desc *prev_desc)
 {
        int j;
 
-       for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
+       for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
                ;
        desc->sptes[i] = desc->sptes[j];
        desc->sptes[j] = NULL;
        if (j != 0)
                return;
        if (!prev_desc && !desc->more)
-               *rmapp = (unsigned long)desc->sptes[0];
+               *pte_list = (unsigned long)desc->sptes[0];
        else
                if (prev_desc)
                        prev_desc->more = desc->more;
                else
-                       *rmapp = (unsigned long)desc->more | 1;
-       mmu_free_rmap_desc(desc);
+                       *pte_list = (unsigned long)desc->more | 1;
+       mmu_free_pte_list_desc(desc);
 }
 
-static void rmap_remove(struct kvm *kvm, u64 *spte)
+static void pte_list_remove(u64 *spte, unsigned long *pte_list)
 {
-       struct kvm_rmap_desc *desc;
-       struct kvm_rmap_desc *prev_desc;
-       struct kvm_mmu_page *sp;
-       gfn_t gfn;
-       unsigned long *rmapp;
+       struct pte_list_desc *desc;
+       struct pte_list_desc *prev_desc;
        int i;
 
-       sp = page_header(__pa(spte));
-       gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
-       rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
-       if (!*rmapp) {
-               printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
+       if (!*pte_list) {
+               printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
                BUG();
-       } else if (!(*rmapp & 1)) {
-               rmap_printk("rmap_remove:  %p 1->0\n", spte);
-               if ((u64 *)*rmapp != spte) {
-                       printk(KERN_ERR "rmap_remove:  %p 1->BUG\n", spte);
+       } else if (!(*pte_list & 1)) {
+               rmap_printk("pte_list_remove:  %p 1->0\n", spte);
+               if ((u64 *)*pte_list != spte) {
+                       printk(KERN_ERR "pte_list_remove:  %p 1->BUG\n", spte);
                        BUG();
                }
-               *rmapp = 0;
+               *pte_list = 0;
        } else {
-               rmap_printk("rmap_remove:  %p many->many\n", spte);
-               desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+               rmap_printk("pte_list_remove:  %p many->many\n", spte);
+               desc = (struct pte_list_desc *)(*pte_list & ~1ul);
                prev_desc = NULL;
                while (desc) {
-                       for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
+                       for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
                                if (desc->sptes[i] == spte) {
-                                       rmap_desc_remove_entry(rmapp,
+                                       pte_list_desc_remove_entry(pte_list,
                                                               desc, i,
                                                               prev_desc);
                                        return;
@@ -718,62 +928,80 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
                        prev_desc = desc;
                        desc = desc->more;
                }
-               pr_err("rmap_remove: %p many->many\n", spte);
+               pr_err("pte_list_remove: %p many->many\n", spte);
                BUG();
        }
 }
 
-static int set_spte_track_bits(u64 *sptep, u64 new_spte)
+typedef void (*pte_list_walk_fn) (u64 *spte);
+static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
 {
-       pfn_t pfn;
-       u64 old_spte = *sptep;
+       struct pte_list_desc *desc;
+       int i;
 
-       if (!spte_has_volatile_bits(old_spte))
-               __set_spte(sptep, new_spte);
-       else
-               old_spte = __xchg_spte(sptep, new_spte);
+       if (!*pte_list)
+               return;
 
-       if (!is_rmap_spte(old_spte))
-               return 0;
+       if (!(*pte_list & 1))
+               return fn((u64 *)*pte_list);
 
-       pfn = spte_to_pfn(old_spte);
-       if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
-               kvm_set_pfn_accessed(pfn);
-       if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
-               kvm_set_pfn_dirty(pfn);
-       return 1;
+       desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+       while (desc) {
+               for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
+                       fn(desc->sptes[i]);
+               desc = desc->more;
+       }
 }
 
-static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+/*
+ * Take gfn and return the reverse mapping to it.
+ */
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 {
-       if (set_spte_track_bits(sptep, new_spte))
-               rmap_remove(kvm, sptep);
+       struct kvm_memory_slot *slot;
+       struct kvm_lpage_info *linfo;
+
+       slot = gfn_to_memslot(kvm, gfn);
+       if (likely(level == PT_PAGE_TABLE_LEVEL))
+               return &slot->rmap[gfn - slot->base_gfn];
+
+       linfo = lpage_info_slot(gfn, slot, level);
+
+       return &linfo->rmap_pde;
+}
+
+static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+{
+       struct kvm_mmu_page *sp;
+       unsigned long *rmapp;
+
+       sp = page_header(__pa(spte));
+       kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
+       rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
+       return pte_list_add(vcpu, spte, rmapp);
 }
 
 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 {
-       struct kvm_rmap_desc *desc;
-       u64 *prev_spte;
-       int i;
+       return pte_list_next(rmapp, spte);
+}
 
-       if (!*rmapp)
-               return NULL;
-       else if (!(*rmapp & 1)) {
-               if (!spte)
-                       return (u64 *)*rmapp;
-               return NULL;
-       }
-       desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-       prev_spte = NULL;
-       while (desc) {
-               for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
-                       if (prev_spte == spte)
-                               return desc->sptes[i];
-                       prev_spte = desc->sptes[i];
-               }
-               desc = desc->more;
-       }
-       return NULL;
+static void rmap_remove(struct kvm *kvm, u64 *spte)
+{
+       struct kvm_mmu_page *sp;
+       gfn_t gfn;
+       unsigned long *rmapp;
+
+       sp = page_header(__pa(spte));
+       gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
+       rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
+       pte_list_remove(spte, rmapp);
+}
+
+static void drop_spte(struct kvm *kvm, u64 *sptep)
+{
+       if (mmu_spte_clear_track_bits(sptep))
+               rmap_remove(kvm, sptep);
 }
 
 static int rmap_write_protect(struct kvm *kvm, u64 gfn)
@@ -790,7 +1018,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
                BUG_ON(!(*spte & PT_PRESENT_MASK));
                rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
                if (is_writable_pte(*spte)) {
-                       update_spte(spte, *spte & ~PT_WRITABLE_MASK);
+                       mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
                        write_protected = 1;
                }
                spte = rmap_next(kvm, rmapp, spte);
@@ -807,8 +1035,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
                        BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
                        pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
                        if (is_writable_pte(*spte)) {
-                               drop_spte(kvm, spte,
-                                         shadow_trap_nonpresent_pte);
+                               drop_spte(kvm, spte);
                                --kvm->stat.lpages;
                                spte = NULL;
                                write_protected = 1;
@@ -829,7 +1056,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
        while ((spte = rmap_next(kvm, rmapp, NULL))) {
                BUG_ON(!(*spte & PT_PRESENT_MASK));
                rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
-               drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+               drop_spte(kvm, spte);
                need_tlb_flush = 1;
        }
        return need_tlb_flush;
@@ -851,7 +1078,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
                rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
                need_flush = 1;
                if (pte_write(*ptep)) {
-                       drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+                       drop_spte(kvm, spte);
                        spte = rmap_next(kvm, rmapp, NULL);
                } else {
                        new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -860,7 +1087,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
                        new_spte &= ~PT_WRITABLE_MASK;
                        new_spte &= ~SPTE_HOST_WRITEABLE;
                        new_spte &= ~shadow_accessed_mask;
-                       set_spte_track_bits(spte, new_spte);
+                       mmu_spte_clear_track_bits(spte);
+                       mmu_spte_set(spte, new_spte);
                        spte = rmap_next(kvm, rmapp, spte);
                }
        }
@@ -1032,151 +1260,89 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
        percpu_counter_add(&kvm_total_used_mmu_pages, nr);
 }
 
-static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+/*
+ * Remove the sp from the shadow page cache. After calling it,
+ * the sp can no longer be found in the cache, but the shadow
+ * page table is still valid.
+ * It should be called under the protection of the mmu lock.
+ */
+static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
 {
        ASSERT(is_empty_shadow_page(sp->spt));
        hlist_del(&sp->hash_link);
-       list_del(&sp->link);
-       free_page((unsigned long)sp->spt);
        if (!sp->role.direct)
                free_page((unsigned long)sp->gfns);
-       kmem_cache_free(mmu_page_header_cache, sp);
-       kvm_mod_used_mmu_pages(kvm, -1);
 }
 
-static unsigned kvm_page_table_hashfn(gfn_t gfn)
+/*
+ * Free the shadow page table and the sp; this can be done
+ * outside the protection of the mmu lock.
+ */
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 {
-       return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
+       list_del(&sp->link);
+       free_page((unsigned long)sp->spt);
+       kmem_cache_free(mmu_page_header_cache, sp);
 }
 
-static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
-                                              u64 *parent_pte, int direct)
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
 {
-       struct kvm_mmu_page *sp;
-
-       sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
-       sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
-       if (!direct)
-               sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
-                                                 PAGE_SIZE);
-       set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
-       list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
-       bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
-       sp->multimapped = 0;
-       sp->parent_pte = parent_pte;
-       kvm_mod_used_mmu_pages(vcpu->kvm, +1);
-       return sp;
+       return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
 }
 
 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
                                    struct kvm_mmu_page *sp, u64 *parent_pte)
 {
-       struct kvm_pte_chain *pte_chain;
-       struct hlist_node *node;
-       int i;
-
        if (!parent_pte)
                return;
-       if (!sp->multimapped) {
-               u64 *old = sp->parent_pte;
 
-               if (!old) {
-                       sp->parent_pte = parent_pte;
-                       return;
-               }
-               sp->multimapped = 1;
-               pte_chain = mmu_alloc_pte_chain(vcpu);
-               INIT_HLIST_HEAD(&sp->parent_ptes);
-               hlist_add_head(&pte_chain->link, &sp->parent_ptes);
-               pte_chain->parent_ptes[0] = old;
-       }
-       hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
-               if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
-                       continue;
-               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
-                       if (!pte_chain->parent_ptes[i]) {
-                               pte_chain->parent_ptes[i] = parent_pte;
-                               return;
-                       }
-       }
-       pte_chain = mmu_alloc_pte_chain(vcpu);
-       BUG_ON(!pte_chain);
-       hlist_add_head(&pte_chain->link, &sp->parent_ptes);
-       pte_chain->parent_ptes[0] = parent_pte;
+       pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
 }
 
 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
                                       u64 *parent_pte)
 {
-       struct kvm_pte_chain *pte_chain;
-       struct hlist_node *node;
-       int i;
-
-       if (!sp->multimapped) {
-               BUG_ON(sp->parent_pte != parent_pte);
-               sp->parent_pte = NULL;
-               return;
-       }
-       hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
-               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-                       if (!pte_chain->parent_ptes[i])
-                               break;
-                       if (pte_chain->parent_ptes[i] != parent_pte)
-                               continue;
-                       while (i + 1 < NR_PTE_CHAIN_ENTRIES
-                               && pte_chain->parent_ptes[i + 1]) {
-                               pte_chain->parent_ptes[i]
-                                       = pte_chain->parent_ptes[i + 1];
-                               ++i;
-                       }
-                       pte_chain->parent_ptes[i] = NULL;
-                       if (i == 0) {
-                               hlist_del(&pte_chain->link);
-                               mmu_free_pte_chain(pte_chain);
-                               if (hlist_empty(&sp->parent_ptes)) {
-                                       sp->multimapped = 0;
-                                       sp->parent_pte = NULL;
-                               }
-                       }
-                       return;
-               }
-       BUG();
+       pte_list_remove(parent_pte, &sp->parent_ptes);
 }
 
-static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
+static void drop_parent_pte(struct kvm_mmu_page *sp,
+                           u64 *parent_pte)
 {
-       struct kvm_pte_chain *pte_chain;
-       struct hlist_node *node;
-       struct kvm_mmu_page *parent_sp;
-       int i;
-
-       if (!sp->multimapped && sp->parent_pte) {
-               parent_sp = page_header(__pa(sp->parent_pte));
-               fn(parent_sp, sp->parent_pte);
-               return;
-       }
-
-       hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
-               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-                       u64 *spte = pte_chain->parent_ptes[i];
+       mmu_page_remove_parent_pte(sp, parent_pte);
+       mmu_spte_clear_no_track(parent_pte);
+}
 
-                       if (!spte)
-                               break;
-                       parent_sp = page_header(__pa(spte));
-                       fn(parent_sp, spte);
-               }
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
+                                              u64 *parent_pte, int direct)
+{
+       struct kvm_mmu_page *sp;
+       sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache,
+                                       sizeof *sp);
+       sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+       if (!direct)
+               sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
+                                                 PAGE_SIZE);
+       set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+       list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+       bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
+       sp->parent_ptes = 0;
+       mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+       kvm_mod_used_mmu_pages(vcpu->kvm, +1);
+       return sp;
 }
 
-static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
+static void mark_unsync(u64 *spte);
 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
 {
-       mmu_parent_walk(sp, mark_unsync);
+       pte_list_walk(&sp->parent_ptes, mark_unsync);
 }
 
-static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
+static void mark_unsync(u64 *spte)
 {
+       struct kvm_mmu_page *sp;
        unsigned int index;
 
+       sp = page_header(__pa(spte));
        index = spte - sp->spt;
        if (__test_and_set_bit(index, sp->unsync_child_bitmap))
                return;
@@ -1185,15 +1351,6 @@ static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
        kvm_mmu_mark_parents_unsync(sp);
 }
 
-static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
-                                   struct kvm_mmu_page *sp)
-{
-       int i;
-
-       for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
-               sp->spt[i] = shadow_trap_nonpresent_pte;
-}
-
 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
                               struct kvm_mmu_page *sp)
 {
@@ -1475,6 +1632,14 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
        }
 }
 
+static void init_shadow_page_table(struct kvm_mmu_page *sp)
+{
+       int i;
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+               sp->spt[i] = 0ull;
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                                             gfn_t gfn,
                                             gva_t gaddr,
@@ -1537,10 +1702,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 
                account_shadowed(vcpu->kvm, gfn);
        }
-       if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
-               vcpu->arch.mmu.prefetch_page(vcpu, sp);
-       else
-               nonpaging_prefetch_page(vcpu, sp);
+       init_shadow_page_table(sp);
        trace_kvm_mmu_get_page(sp, true);
        return sp;
 }
@@ -1572,21 +1734,28 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
        if (iterator->level < PT_PAGE_TABLE_LEVEL)
                return false;
 
-       if (iterator->level == PT_PAGE_TABLE_LEVEL)
-               if (is_large_pte(*iterator->sptep))
-                       return false;
-
        iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
        iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
        return true;
 }
 
-static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
+                              u64 spte)
 {
-       iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
+       if (is_last_spte(spte, iterator->level)) {
+               iterator->level = 0;
+               return;
+       }
+
+       iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
        --iterator->level;
 }
 
+static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+{
+       return __shadow_walk_next(iterator, *iterator->sptep);
+}
+
 static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
 {
        u64 spte;
@@ -1594,13 +1763,13 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
        spte = __pa(sp->spt)
                | PT_PRESENT_MASK | PT_ACCESSED_MASK
                | PT_WRITABLE_MASK | PT_USER_MASK;
-       __set_spte(sptep, spte);
+       mmu_spte_set(sptep, spte);
 }
 
 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 {
        if (is_large_pte(*sptep)) {
-               drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+               drop_spte(vcpu->kvm, sptep);
                kvm_flush_remote_tlbs(vcpu->kvm);
        }
 }
@@ -1622,38 +1791,39 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                if (child->role.access == direct_access)
                        return;
 
-               mmu_page_remove_parent_pte(child, sptep);
-               __set_spte(sptep, shadow_trap_nonpresent_pte);
+               drop_parent_pte(child, sptep);
                kvm_flush_remote_tlbs(vcpu->kvm);
        }
 }
 
+static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+                            u64 *spte)
+{
+       u64 pte;
+       struct kvm_mmu_page *child;
+
+       pte = *spte;
+       if (is_shadow_present_pte(pte)) {
+               if (is_last_spte(pte, sp->role.level))
+                       drop_spte(kvm, spte);
+               else {
+                       child = page_header(pte & PT64_BASE_ADDR_MASK);
+                       drop_parent_pte(child, spte);
+               }
+       } else if (is_mmio_spte(pte))
+               mmu_spte_clear_no_track(spte);
+
+       if (is_large_pte(pte))
+               --kvm->stat.lpages;
+}
+
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
                                         struct kvm_mmu_page *sp)
 {
        unsigned i;
-       u64 *pt;
-       u64 ent;
-
-       pt = sp->spt;
-
-       for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-               ent = pt[i];
-
-               if (is_shadow_present_pte(ent)) {
-                       if (!is_last_spte(ent, sp->role.level)) {
-                               ent &= PT64_BASE_ADDR_MASK;
-                               mmu_page_remove_parent_pte(page_header(ent),
-                                                          &pt[i]);
-                       } else {
-                               if (is_large_pte(ent))
-                                       --kvm->stat.lpages;
-                               drop_spte(kvm, &pt[i],
-                                         shadow_trap_nonpresent_pte);
-                       }
-               }
-               pt[i] = shadow_trap_nonpresent_pte;
-       }
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+               mmu_page_zap_pte(kvm, sp, sp->spt + i);
 }
 
 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -1674,20 +1844,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
        u64 *parent_pte;
 
-       while (sp->multimapped || sp->parent_pte) {
-               if (!sp->multimapped)
-                       parent_pte = sp->parent_pte;
-               else {
-                       struct kvm_pte_chain *chain;
-
-                       chain = container_of(sp->parent_ptes.first,
-                                            struct kvm_pte_chain, link);
-                       parent_pte = chain->parent_ptes[0];
-               }
-               BUG_ON(!parent_pte);
-               kvm_mmu_put_page(sp, parent_pte);
-               __set_spte(parent_pte, shadow_trap_nonpresent_pte);
-       }
+       while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL)))
+               drop_parent_pte(sp, parent_pte);
 }
 
 static int mmu_zap_unsync_children(struct kvm *kvm,
@@ -1734,6 +1892,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
                /* Count self */
                ret++;
                list_move(&sp->link, invalid_list);
+               kvm_mod_used_mmu_pages(kvm, -1);
        } else {
                list_move(&sp->link, &kvm->arch.active_mmu_pages);
                kvm_reload_remote_mmus(kvm);
@@ -1744,6 +1903,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
        return ret;
 }
 
+static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
+{
+       struct kvm_mmu_page *sp;
+
+       list_for_each_entry(sp, invalid_list, link)
+               kvm_mmu_isolate_page(sp);
+}
+
+static void free_pages_rcu(struct rcu_head *head)
+{
+       struct kvm_mmu_page *next, *sp;
+
+       sp = container_of(head, struct kvm_mmu_page, rcu);
+       while (sp) {
+               if (!list_empty(&sp->link))
+                       next = list_first_entry(&sp->link,
+                                     struct kvm_mmu_page, link);
+               else
+                       next = NULL;
+               kvm_mmu_free_page(sp);
+               sp = next;
+       }
+}
+
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
                                    struct list_head *invalid_list)
 {
@@ -1754,10 +1937,21 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 
        kvm_flush_remote_tlbs(kvm);
 
+       if (atomic_read(&kvm->arch.reader_counter)) {
+               kvm_mmu_isolate_pages(invalid_list);
+               sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+               list_del_init(invalid_list);
+
+               trace_kvm_mmu_delay_free_pages(sp);
+               call_rcu(&sp->rcu, free_pages_rcu);
+               return;
+       }
+
        do {
                sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
                WARN_ON(!sp->role.invalid || sp->root_count);
-               kvm_mmu_free_page(kvm, sp);
+               kvm_mmu_isolate_page(sp);
+               kvm_mmu_free_page(sp);
        } while (!list_empty(invalid_list));
 
 }
@@ -1783,8 +1977,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
                        page = container_of(kvm->arch.active_mmu_pages.prev,
                                            struct kvm_mmu_page, link);
                        kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
-                       kvm_mmu_commit_zap_page(kvm, &invalid_list);
                }
+               kvm_mmu_commit_zap_page(kvm, &invalid_list);
                goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
        }
 
@@ -1833,20 +2027,6 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
        __set_bit(slot, sp->slot_bitmap);
 }
 
-static void mmu_convert_notrap(struct kvm_mmu_page *sp)
-{
-       int i;
-       u64 *pt = sp->spt;
-
-       if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
-               return;
-
-       for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-               if (pt[i] == shadow_notrap_nonpresent_pte)
-                       __set_spte(&pt[i], shadow_trap_nonpresent_pte);
-       }
-}
-
 /*
  * The function is based on mtrr_type_lookup() in
  * arch/x86/kernel/cpu/mtrr/generic.c
@@ -1959,7 +2139,6 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
        sp->unsync = 1;
 
        kvm_mmu_mark_parents_unsync(sp);
-       mmu_convert_notrap(sp);
 }
 
 static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
@@ -2002,13 +2181,16 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 
 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                    unsigned pte_access, int user_fault,
-                   int write_fault, int dirty, int level,
+                   int write_fault, int level,
                    gfn_t gfn, pfn_t pfn, bool speculative,
                    bool can_unsync, bool host_writable)
 {
        u64 spte, entry = *sptep;
        int ret = 0;
 
+       if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+               return 0;
+
        /*
         * We don't set the accessed bit, since we sometimes want to see
         * whether the guest actually used the pte (in order to detect
@@ -2017,8 +2199,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        spte = PT_PRESENT_MASK;
        if (!speculative)
                spte |= shadow_accessed_mask;
-       if (!dirty)
-               pte_access &= ~ACC_WRITE_MASK;
+
        if (pte_access & ACC_EXEC_MASK)
                spte |= shadow_x_mask;
        else
@@ -2045,15 +2226,24 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                if (level > PT_PAGE_TABLE_LEVEL &&
                    has_wrprotected_page(vcpu->kvm, gfn, level)) {
                        ret = 1;
-                       drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+                       drop_spte(vcpu->kvm, sptep);
                        goto done;
                }
 
                spte |= PT_WRITABLE_MASK;
 
                if (!vcpu->arch.mmu.direct_map
-                   && !(pte_access & ACC_WRITE_MASK))
+                   && !(pte_access & ACC_WRITE_MASK)) {
                        spte &= ~PT_USER_MASK;
+                       /*
+                        * If we converted a user page to a kernel page,
+                        * so that the kernel can write to it when cr0.wp=0,
+                        * then we should prevent the kernel from executing it
+                        * if SMEP is enabled.
+                        */
+                       if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+                               spte |= PT64_NX_MASK;
+               }
 
                /*
                 * Optimization: for pte sync, if spte was writable the hash
@@ -2078,7 +2268,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                mark_page_dirty(vcpu->kvm, gfn);
 
 set_pte:
-       update_spte(sptep, spte);
+       mmu_spte_update(sptep, spte);
        /*
         * If we overwrite a writable spte with a read-only one we
         * should flush remote TLBs. Otherwise rmap_write_protect
@@ -2093,8 +2283,8 @@ done:
 
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                         unsigned pt_access, unsigned pte_access,
-                        int user_fault, int write_fault, int dirty,
-                        int *ptwrite, int level, gfn_t gfn,
+                        int user_fault, int write_fault,
+                        int *emulate, int level, gfn_t gfn,
                         pfn_t pfn, bool speculative,
                         bool host_writable)
 {
@@ -2117,26 +2307,28 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                        u64 pte = *sptep;
 
                        child = page_header(pte & PT64_BASE_ADDR_MASK);
-                       mmu_page_remove_parent_pte(child, sptep);
-                       __set_spte(sptep, shadow_trap_nonpresent_pte);
+                       drop_parent_pte(child, sptep);
                        kvm_flush_remote_tlbs(vcpu->kvm);
                } else if (pfn != spte_to_pfn(*sptep)) {
                        pgprintk("hfn old %llx new %llx\n",
                                 spte_to_pfn(*sptep), pfn);
-                       drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+                       drop_spte(vcpu->kvm, sptep);
                        kvm_flush_remote_tlbs(vcpu->kvm);
                } else
                        was_rmapped = 1;
        }
 
        if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
-                     dirty, level, gfn, pfn, speculative, true,
+                     level, gfn, pfn, speculative, true,
                      host_writable)) {
                if (write_fault)
-                       *ptwrite = 1;
+                       *emulate = 1;
                kvm_mmu_flush_tlb(vcpu);
        }
 
+       if (unlikely(is_mmio_spte(*sptep) && emulate))
+               *emulate = 1;
+
        pgprintk("%s: setting spte %llx\n", __func__, *sptep);
        pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
                 is_large_pte(*sptep)? "2MB" : "4kB",
@@ -2145,11 +2337,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        if (!was_rmapped && is_large_pte(*sptep))
                ++vcpu->kvm->stat.lpages;
 
-       page_header_update_slot(vcpu->kvm, sptep, gfn);
-       if (!was_rmapped) {
-               rmap_count = rmap_add(vcpu, sptep, gfn);
-               if (rmap_count > RMAP_RECYCLE_THRESHOLD)
-                       rmap_recycle(vcpu, sptep, gfn);
+       if (is_shadow_present_pte(*sptep)) {
+               page_header_update_slot(vcpu->kvm, sptep, gfn);
+               if (!was_rmapped) {
+                       rmap_count = rmap_add(vcpu, sptep, gfn);
+                       if (rmap_count > RMAP_RECYCLE_THRESHOLD)
+                               rmap_recycle(vcpu, sptep, gfn);
+               }
        }
        kvm_release_pfn_clean(pfn);
        if (speculative) {
@@ -2170,8 +2364,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 
        slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
        if (!slot) {
-               get_page(bad_page);
-               return page_to_pfn(bad_page);
+               get_page(fault_page);
+               return page_to_pfn(fault_page);
        }
 
        hva = gfn_to_hva_memslot(slot, gfn);
@@ -2198,7 +2392,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 
        for (i = 0; i < ret; i++, gfn++, start++)
                mmu_set_spte(vcpu, start, ACC_ALL,
-                            access, 0, 0, 1, NULL,
+                            access, 0, 0, NULL,
                             sp->role.level, gfn,
                             page_to_pfn(pages[i]), true, true);
 
@@ -2217,7 +2411,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
        spte = sp->spt + i;
 
        for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
-               if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
+               if (is_shadow_present_pte(*spte) || spte == sptep) {
                        if (!start)
                                continue;
                        if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
@@ -2254,7 +2448,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 {
        struct kvm_shadow_walk_iterator iterator;
        struct kvm_mmu_page *sp;
-       int pt_write = 0;
+       int emulate = 0;
        gfn_t pseudo_gfn;
 
        for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
@@ -2262,14 +2456,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
                        unsigned pte_access = ACC_ALL;
 
                        mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
-                                    0, write, 1, &pt_write,
+                                    0, write, &emulate,
                                     level, gfn, pfn, prefault, map_writable);
                        direct_pte_prefetch(vcpu, iterator.sptep);
                        ++vcpu->stat.pf_fixed;
                        break;
                }
 
-               if (*iterator.sptep == shadow_trap_nonpresent_pte) {
+               if (!is_shadow_present_pte(*iterator.sptep)) {
                        u64 base_addr = iterator.addr;
 
                        base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
@@ -2283,14 +2477,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
                                return -ENOMEM;
                        }
 
-                       __set_spte(iterator.sptep,
-                                  __pa(sp->spt)
-                                  | PT_PRESENT_MASK | PT_WRITABLE_MASK
-                                  | shadow_user_mask | shadow_x_mask
-                                  | shadow_accessed_mask);
+                       mmu_spte_set(iterator.sptep,
+                                    __pa(sp->spt)
+                                    | PT_PRESENT_MASK | PT_WRITABLE_MASK
+                                    | shadow_user_mask | shadow_x_mask
+                                    | shadow_accessed_mask);
                }
        }
-       return pt_write;
+       return emulate;
 }
 
 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
@@ -2306,16 +2500,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
        send_sig_info(SIGBUS, &info, tsk);
 }
 
-static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
 {
        kvm_release_pfn_clean(pfn);
        if (is_hwpoison_pfn(pfn)) {
-               kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
+               kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
                return 0;
-       } else if (is_fault_pfn(pfn))
-               return -EFAULT;
+       }
 
-       return 1;
+       return -EFAULT;
 }
 
 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
@@ -2360,6 +2553,30 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
        }
 }
 
+static bool mmu_invalid_pfn(pfn_t pfn)
+{
+       return unlikely(is_invalid_pfn(pfn));
+}
+
+static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+                               pfn_t pfn, unsigned access, int *ret_val)
+{
+       bool ret = true;
+
+       /* The pfn is invalid, report the error! */
+       if (unlikely(is_invalid_pfn(pfn))) {
+               *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
+               goto exit;
+       }
+
+       if (unlikely(is_noslot_pfn(pfn)))
+               vcpu_cache_mmio_info(vcpu, gva, gfn, access);
+
+       ret = false;
+exit:
+       return ret;
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                         gva_t gva, pfn_t *pfn, bool write, bool *writable);
 
@@ -2394,9 +2611,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
        if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
                return 0;
 
-       /* mmio */
-       if (is_error_pfn(pfn))
-               return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
+       if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
+               return r;
 
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2623,6 +2839,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
        if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
                return;
 
+       vcpu_clear_mmio_info(vcpu, ~0ul);
        trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
        if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
                hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -2667,6 +2884,94 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
        return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
 }
 
+static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+       if (direct)
+               return vcpu_match_mmio_gpa(vcpu, addr);
+
+       return vcpu_match_mmio_gva(vcpu, addr);
+}
+
+
+/*
+ * On direct hosts, the last spte only allows two states
+ * for mmio page fault:
+ *   - It is the mmio spte
+ *   - It is zapped or it is being zapped.
+ *
+ * This function completely checks the spte when the last spte
+ * is not the mmio spte.
+ */
+static bool check_direct_spte_mmio_pf(u64 spte)
+{
+       return __check_direct_spte_mmio_pf(spte);
+}
+
+static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
+{
+       struct kvm_shadow_walk_iterator iterator;
+       u64 spte = 0ull;
+
+       walk_shadow_page_lockless_begin(vcpu);
+       for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
+               if (!is_shadow_present_pte(spte))
+                       break;
+       walk_shadow_page_lockless_end(vcpu);
+
+       return spte;
+}
+
+/*
+ * If it is a real mmio page fault, return 1 and emulate the instruction
+ * directly; return 0 to let the CPU fault again on the address; return
+ * -1 if a bug is detected.
+ */
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+       u64 spte;
+
+       if (quickly_check_mmio_pf(vcpu, addr, direct))
+               return 1;
+
+       spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
+
+       if (is_mmio_spte(spte)) {
+               gfn_t gfn = get_mmio_spte_gfn(spte);
+               unsigned access = get_mmio_spte_access(spte);
+
+               if (direct)
+                       addr = 0;
+
+               trace_handle_mmio_page_fault(addr, gfn, access);
+               vcpu_cache_mmio_info(vcpu, addr, gfn, access);
+               return 1;
+       }
+
+       /*
+        * It's ok if the gva is remapped by other cpus on shadow guest,
+        * it's a BUG if the gfn is not a mmio page.
+        */
+       if (direct && !check_direct_spte_mmio_pf(spte))
+               return -1;
+
+       /*
+        * If the page table is zapped by other cpus, let CPU fault again on
+        * the address.
+        */
+       return 0;
+}
+EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
+
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
+                                 u32 error_code, bool direct)
+{
+       int ret;
+
+       ret = handle_mmio_page_fault_common(vcpu, addr, direct);
+       WARN_ON(ret < 0);
+       return ret;
+}
+
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
                                u32 error_code, bool prefault)
 {
@@ -2674,6 +2979,10 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
        int r;
 
        pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
+
+       if (unlikely(error_code & PFERR_RSVD_MASK))
+               return handle_mmio_page_fault(vcpu, gva, error_code, true);
+
        r = mmu_topup_memory_caches(vcpu);
        if (r)
                return r;
@@ -2750,6 +3059,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
        ASSERT(vcpu);
        ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
+       if (unlikely(error_code & PFERR_RSVD_MASK))
+               return handle_mmio_page_fault(vcpu, gpa, error_code, true);
+
        r = mmu_topup_memory_caches(vcpu);
        if (r)
                return r;
@@ -2767,9 +3079,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
        if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
                return 0;
 
-       /* mmio */
-       if (is_error_pfn(pfn))
-               return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
+       if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
+               return r;
+
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu, mmu_seq))
                goto out_unlock;
@@ -2800,7 +3112,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
        context->page_fault = nonpaging_page_fault;
        context->gva_to_gpa = nonpaging_gva_to_gpa;
        context->free = nonpaging_free;
-       context->prefetch_page = nonpaging_prefetch_page;
        context->sync_page = nonpaging_sync_page;
        context->invlpg = nonpaging_invlpg;
        context->update_pte = nonpaging_update_pte;
@@ -2848,6 +3159,23 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
        return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
 }
 
+static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
+                          int *nr_present)
+{
+       if (unlikely(is_mmio_spte(*sptep))) {
+               if (gfn != get_mmio_spte_gfn(*sptep)) {
+                       mmu_spte_clear_no_track(sptep);
+                       return true;
+               }
+
+               (*nr_present)++;
+               mark_mmio_spte(sptep, gfn, access);
+               return true;
+       }
+
+       return false;
+}
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
@@ -2930,7 +3258,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
        context->new_cr3 = paging_new_cr3;
        context->page_fault = paging64_page_fault;
        context->gva_to_gpa = paging64_gva_to_gpa;
-       context->prefetch_page = paging64_prefetch_page;
        context->sync_page = paging64_sync_page;
        context->invlpg = paging64_invlpg;
        context->update_pte = paging64_update_pte;
@@ -2959,7 +3286,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
        context->page_fault = paging32_page_fault;
        context->gva_to_gpa = paging32_gva_to_gpa;
        context->free = paging_free;
-       context->prefetch_page = paging32_prefetch_page;
        context->sync_page = paging32_sync_page;
        context->invlpg = paging32_invlpg;
        context->update_pte = paging32_update_pte;
@@ -2984,7 +3310,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
        context->new_cr3 = nonpaging_new_cr3;
        context->page_fault = tdp_page_fault;
        context->free = nonpaging_free;
-       context->prefetch_page = nonpaging_prefetch_page;
        context->sync_page = nonpaging_sync_page;
        context->invlpg = nonpaging_invlpg;
        context->update_pte = nonpaging_update_pte;
@@ -3023,6 +3348,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 {
        int r;
+       bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
@@ -3037,6 +3363,8 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 
        vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
        vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
+       vcpu->arch.mmu.base_role.smep_andnot_wp
+               = smep && !is_write_protection(vcpu);
 
        return r;
 }
@@ -3141,27 +3469,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 
-static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
-                                 struct kvm_mmu_page *sp,
-                                 u64 *spte)
-{
-       u64 pte;
-       struct kvm_mmu_page *child;
-
-       pte = *spte;
-       if (is_shadow_present_pte(pte)) {
-               if (is_last_spte(pte, sp->role.level))
-                       drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
-               else {
-                       child = page_header(pte & PT64_BASE_ADDR_MASK);
-                       mmu_page_remove_parent_pte(child, spte);
-               }
-       }
-       __set_spte(spte, shadow_trap_nonpresent_pte);
-       if (is_large_pte(pte))
-               --vcpu->kvm->stat.lpages;
-}
-
 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
                                  struct kvm_mmu_page *sp, u64 *spte,
                                  const void *new)
@@ -3233,6 +3540,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        int level, npte, invlpg_counter, r, flooded = 0;
        bool remote_flush, local_flush, zap_page;
 
+       /*
+        * If we don't have indirect shadow pages, it means no page is
+        * write-protected, so we can exit simply.
+        */
+       if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
+               return;
+
        zap_page = remote_flush = local_flush = false;
        offset = offset_in_page(gpa);
 
@@ -3336,7 +3650,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                spte = &sp->spt[page_offset / sizeof(*spte)];
                while (npte--) {
                        entry = *spte;
-                       mmu_pte_write_zap_pte(vcpu, sp, spte);
+                       mmu_page_zap_pte(vcpu->kvm, sp, spte);
                        if (gentry &&
                              !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
                              & mask.word))
@@ -3380,9 +3694,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
                sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
                                  struct kvm_mmu_page, link);
                kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
-               kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
                ++vcpu->kvm->stat.mmu_recycled;
        }
+       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
@@ -3506,15 +3820,15 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
                                continue;
 
                        if (is_large_pte(pt[i])) {
-                               drop_spte(kvm, &pt[i],
-                                         shadow_trap_nonpresent_pte);
+                               drop_spte(kvm, &pt[i]);
                                --kvm->stat.lpages;
                                continue;
                        }
 
                        /* avoid RMW */
                        if (is_writable_pte(pt[i]))
-                               update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
+                               mmu_spte_update(&pt[i],
+                                               pt[i] & ~PT_WRITABLE_MASK);
                }
        }
        kvm_flush_remote_tlbs(kvm);
@@ -3590,25 +3904,18 @@ static struct shrinker mmu_shrinker = {
 
 static void mmu_destroy_caches(void)
 {
-       if (pte_chain_cache)
-               kmem_cache_destroy(pte_chain_cache);
-       if (rmap_desc_cache)
-               kmem_cache_destroy(rmap_desc_cache);
+       if (pte_list_desc_cache)
+               kmem_cache_destroy(pte_list_desc_cache);
        if (mmu_page_header_cache)
                kmem_cache_destroy(mmu_page_header_cache);
 }
 
 int kvm_mmu_module_init(void)
 {
-       pte_chain_cache = kmem_cache_create("kvm_pte_chain",
-                                           sizeof(struct kvm_pte_chain),
-                                           0, 0, NULL);
-       if (!pte_chain_cache)
-               goto nomem;
-       rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
-                                           sizeof(struct kvm_rmap_desc),
+       pte_list_desc_cache = kmem_cache_create("pte_list_desc",
+                                           sizeof(struct pte_list_desc),
                                            0, 0, NULL);
-       if (!rmap_desc_cache)
+       if (!pte_list_desc_cache)
                goto nomem;
 
        mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
@@ -3775,16 +4082,17 @@ out:
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 {
        struct kvm_shadow_walk_iterator iterator;
+       u64 spte;
        int nr_sptes = 0;
 
-       spin_lock(&vcpu->kvm->mmu_lock);
-       for_each_shadow_entry(vcpu, addr, iterator) {
-               sptes[iterator.level-1] = *iterator.sptep;
+       walk_shadow_page_lockless_begin(vcpu);
+       for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+               sptes[iterator.level-1] = spte;
                nr_sptes++;
-               if (!is_shadow_present_pte(*iterator.sptep))
+               if (!is_shadow_present_pte(spte))
                        break;
        }
-       spin_unlock(&vcpu->kvm->mmu_lock);
+       walk_shadow_page_lockless_end(vcpu);
 
        return nr_sptes;
 }
index 7086ca8..e374db9 100644 (file)
@@ -49,6 +49,8 @@
 #define PFERR_FETCH_MASK (1U << 4)
 
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
@@ -76,4 +78,27 @@ static inline int is_present_gpte(unsigned long pte)
        return pte & PT_PRESENT_MASK;
 }
 
+static inline int is_writable_pte(unsigned long pte)
+{
+       return pte & PT_WRITABLE_MASK;
+}
+
+static inline bool is_write_protection(struct kvm_vcpu *vcpu)
+{
+       return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
+}
+
+static inline bool check_write_user_access(struct kvm_vcpu *vcpu,
+                                          bool write_fault, bool user_fault,
+                                          unsigned long pte)
+{
+       if (unlikely(write_fault && !is_writable_pte(pte)
+             && (user_fault || is_write_protection(vcpu))))
+               return false;
+
+       if (unlikely(user_fault && !(pte & PT_USER_MASK)))
+               return false;
+
+       return true;
+}
 #endif
index 5f6223b..2460a26 100644 (file)
@@ -99,18 +99,6 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
                                     "level = %d\n", sp, level);
                        return;
                }
-
-               if (*sptep == shadow_notrap_nonpresent_pte) {
-                       audit_printk(vcpu->kvm, "notrap spte in unsync "
-                                    "sp: %p\n", sp);
-                       return;
-               }
-       }
-
-       if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
-               audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
-                            sp);
-               return;
        }
 
        if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
index b60b4fd..eed67f3 100644 (file)
@@ -196,6 +196,54 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
        TP_ARGS(sp)
 );
 
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages,
+       TP_PROTO(struct kvm_mmu_page *sp),
+
+       TP_ARGS(sp)
+);
+
+TRACE_EVENT(
+       mark_mmio_spte,
+       TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
+       TP_ARGS(sptep, gfn, access),
+
+       TP_STRUCT__entry(
+               __field(void *, sptep)
+               __field(gfn_t, gfn)
+               __field(unsigned, access)
+       ),
+
+       TP_fast_assign(
+               __entry->sptep = sptep;
+               __entry->gfn = gfn;
+               __entry->access = access;
+       ),
+
+       TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn,
+                 __entry->access)
+);
+
+TRACE_EVENT(
+       handle_mmio_page_fault,
+       TP_PROTO(u64 addr, gfn_t gfn, unsigned access),
+       TP_ARGS(addr, gfn, access),
+
+       TP_STRUCT__entry(
+               __field(u64, addr)
+               __field(gfn_t, gfn)
+               __field(unsigned, access)
+       ),
+
+       TP_fast_assign(
+               __entry->addr = addr;
+               __entry->gfn = gfn;
+               __entry->access = access;
+       ),
+
+       TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
+                 __entry->access)
+);
+
 TRACE_EVENT(
        kvm_mmu_audit,
        TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
index 9d03ad4..507e2b8 100644 (file)
@@ -101,11 +101,15 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
        return (ret != orig_pte);
 }
 
-static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
+                                  bool last)
 {
        unsigned access;
 
        access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+       if (last && !is_dirty_gpte(gpte))
+               access &= ~ACC_WRITE_MASK;
+
 #if PTTYPE == 64
        if (vcpu->arch.mmu.nx)
                access &= ~(gpte >> PT64_NX_SHIFT);
@@ -113,6 +117,24 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
        return access;
 }
 
+static bool FNAME(is_last_gpte)(struct guest_walker *walker,
+                               struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+                               pt_element_t gpte)
+{
+       if (walker->level == PT_PAGE_TABLE_LEVEL)
+               return true;
+
+       if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) &&
+           (PTTYPE == 64 || is_pse(vcpu)))
+               return true;
+
+       if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) &&
+           (mmu->root_level == PT64_ROOT_LEVEL))
+               return true;
+
+       return false;
+}
+
 /*
  * Fetch a guest pte for a guest virtual address
  */
@@ -125,18 +147,17 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
        gfn_t table_gfn;
        unsigned index, pt_access, uninitialized_var(pte_access);
        gpa_t pte_gpa;
-       bool eperm, present, rsvd_fault;
-       int offset, write_fault, user_fault, fetch_fault;
-
-       write_fault = access & PFERR_WRITE_MASK;
-       user_fault = access & PFERR_USER_MASK;
-       fetch_fault = access & PFERR_FETCH_MASK;
+       bool eperm;
+       int offset;
+       const int write_fault = access & PFERR_WRITE_MASK;
+       const int user_fault  = access & PFERR_USER_MASK;
+       const int fetch_fault = access & PFERR_FETCH_MASK;
+       u16 errcode = 0;
 
        trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
                                     fetch_fault);
-walk:
-       present = true;
-       eperm = rsvd_fault = false;
+retry_walk:
+       eperm = false;
        walker->level = mmu->root_level;
        pte           = mmu->get_cr3(vcpu);
 
@@ -144,10 +165,8 @@ walk:
        if (walker->level == PT32E_ROOT_LEVEL) {
                pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
                trace_kvm_mmu_paging_element(pte, walker->level);
-               if (!is_present_gpte(pte)) {
-                       present = false;
+               if (!is_present_gpte(pte))
                        goto error;
-               }
                --walker->level;
        }
 #endif
@@ -170,42 +189,31 @@ walk:
 
                real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
                                              PFERR_USER_MASK|PFERR_WRITE_MASK);
-               if (unlikely(real_gfn == UNMAPPED_GVA)) {
-                       present = false;
-                       break;
-               }
+               if (unlikely(real_gfn == UNMAPPED_GVA))
+                       goto error;
                real_gfn = gpa_to_gfn(real_gfn);
 
                host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
-               if (unlikely(kvm_is_error_hva(host_addr))) {
-                       present = false;
-                       break;
-               }
+               if (unlikely(kvm_is_error_hva(host_addr)))
+                       goto error;
 
                ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
-               if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) {
-                       present = false;
-                       break;
-               }
+               if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
+                       goto error;
 
                trace_kvm_mmu_paging_element(pte, walker->level);
 
-               if (unlikely(!is_present_gpte(pte))) {
-                       present = false;
-                       break;
-               }
+               if (unlikely(!is_present_gpte(pte)))
+                       goto error;
 
                if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
                                              walker->level))) {
-                       rsvd_fault = true;
-                       break;
+                       errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
+                       goto error;
                }
 
-               if (unlikely(write_fault && !is_writable_pte(pte)
-                            && (user_fault || is_write_protection(vcpu))))
-                       eperm = true;
-
-               if (unlikely(user_fault && !(pte & PT_USER_MASK)))
+               if (!check_write_user_access(vcpu, write_fault, user_fault,
+                                         pte))
                        eperm = true;
 
 #if PTTYPE == 64
@@ -213,39 +221,35 @@ walk:
                        eperm = true;
 #endif
 
-               if (!eperm && !rsvd_fault
-                   && unlikely(!(pte & PT_ACCESSED_MASK))) {
+               if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
                        int ret;
                        trace_kvm_mmu_set_accessed_bit(table_gfn, index,
                                                       sizeof(pte));
                        ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
                                                  pte, pte|PT_ACCESSED_MASK);
-                       if (unlikely(ret < 0)) {
-                               present = false;
-                               break;
-                       } else if (ret)
-                               goto walk;
+                       if (unlikely(ret < 0))
+                               goto error;
+                       else if (ret)
+                               goto retry_walk;
 
                        mark_page_dirty(vcpu->kvm, table_gfn);
                        pte |= PT_ACCESSED_MASK;
                }
 
-               pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
-
                walker->ptes[walker->level - 1] = pte;
 
-               if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
-                   ((walker->level == PT_DIRECTORY_LEVEL) &&
-                               is_large_pte(pte) &&
-                               (PTTYPE == 64 || is_pse(vcpu))) ||
-                   ((walker->level == PT_PDPE_LEVEL) &&
-                               is_large_pte(pte) &&
-                               mmu->root_level == PT64_ROOT_LEVEL)) {
+               if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) {
                        int lvl = walker->level;
                        gpa_t real_gpa;
                        gfn_t gfn;
                        u32 ac;
 
+                       /* check if the kernel is fetching from user page */
+                       if (unlikely(pte_access & PT_USER_MASK) &&
+                           kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+                               if (fetch_fault && !user_fault)
+                                       eperm = true;
+
                        gfn = gpte_to_gfn_lvl(pte, lvl);
                        gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
 
@@ -266,12 +270,14 @@ walk:
                        break;
                }
 
-               pt_access = pte_access;
+               pt_access &= FNAME(gpte_access)(vcpu, pte, false);
                --walker->level;
        }
 
-       if (unlikely(!present || eperm || rsvd_fault))
+       if (unlikely(eperm)) {
+               errcode |= PFERR_PRESENT_MASK;
                goto error;
+       }
 
        if (write_fault && unlikely(!is_dirty_gpte(pte))) {
                int ret;
@@ -279,17 +285,17 @@ walk:
                trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
                ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
                                          pte, pte|PT_DIRTY_MASK);
-               if (unlikely(ret < 0)) {
-                       present = false;
+               if (unlikely(ret < 0))
                        goto error;
-               else if (ret)
-                       goto walk;
+               else if (ret)
+                       goto retry_walk;
 
                mark_page_dirty(vcpu->kvm, table_gfn);
                pte |= PT_DIRTY_MASK;
                walker->ptes[walker->level - 1] = pte;
        }
 
+       pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true);
        walker->pt_access = pt_access;
        walker->pte_access = pte_access;
        pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
@@ -297,19 +303,14 @@ walk:
        return 1;
 
 error:
+       errcode |= write_fault | user_fault;
+       if (fetch_fault && (mmu->nx ||
+                           kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
+               errcode |= PFERR_FETCH_MASK;
+
        walker->fault.vector = PF_VECTOR;
        walker->fault.error_code_valid = true;
-       walker->fault.error_code = 0;
-       if (present)
-               walker->fault.error_code |= PFERR_PRESENT_MASK;
-
-       walker->fault.error_code |= write_fault | user_fault;
-
-       if (fetch_fault && mmu->nx)
-               walker->fault.error_code |= PFERR_FETCH_MASK;
-       if (rsvd_fault)
-               walker->fault.error_code |= PFERR_RSVD_MASK;
-
+       walker->fault.error_code = errcode;
        walker->fault.address = addr;
        walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
 
@@ -336,16 +337,11 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
                                    struct kvm_mmu_page *sp, u64 *spte,
                                    pt_element_t gpte)
 {
-       u64 nonpresent = shadow_trap_nonpresent_pte;
-
        if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
                goto no_present;
 
-       if (!is_present_gpte(gpte)) {
-               if (!sp->unsync)
-                       nonpresent = shadow_notrap_nonpresent_pte;
+       if (!is_present_gpte(gpte))
                goto no_present;
-       }
 
        if (!(gpte & PT_ACCESSED_MASK))
                goto no_present;
@@ -353,7 +349,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
        return false;
 
 no_present:
-       drop_spte(vcpu->kvm, spte, nonpresent);
+       drop_spte(vcpu->kvm, spte);
        return true;
 }
 
@@ -369,9 +365,9 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                return;
 
        pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
-       pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+       pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
        pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
-       if (is_error_pfn(pfn)) {
+       if (mmu_invalid_pfn(pfn)) {
                kvm_release_pfn_clean(pfn);
                return;
        }
@@ -381,7 +377,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
         * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
         */
        mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
-                    is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
+                    NULL, PT_PAGE_TABLE_LEVEL,
                     gpte_to_gfn(gpte), pfn, true, true);
 }
 
@@ -432,12 +428,11 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
                unsigned pte_access;
                gfn_t gfn;
                pfn_t pfn;
-               bool dirty;
 
                if (spte == sptep)
                        continue;
 
-               if (*spte != shadow_trap_nonpresent_pte)
+               if (is_shadow_present_pte(*spte))
                        continue;
 
                gpte = gptep[i];
@@ -445,18 +440,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
                if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
                        continue;
 
-               pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+               pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
+                                                                 true);
                gfn = gpte_to_gfn(gpte);
-               dirty = is_dirty_gpte(gpte);
                pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
-                                     (pte_access & ACC_WRITE_MASK) && dirty);
-               if (is_error_pfn(pfn)) {
+                                     pte_access & ACC_WRITE_MASK);
+               if (mmu_invalid_pfn(pfn)) {
                        kvm_release_pfn_clean(pfn);
                        break;
                }
 
                mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
-                            dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
+                            NULL, PT_PAGE_TABLE_LEVEL, gfn,
                             pfn, true, true);
        }
 }
@@ -467,12 +462,11 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                         struct guest_walker *gw,
                         int user_fault, int write_fault, int hlevel,
-                        int *ptwrite, pfn_t pfn, bool map_writable,
+                        int *emulate, pfn_t pfn, bool map_writable,
                         bool prefault)
 {
        unsigned access = gw->pt_access;
        struct kvm_mmu_page *sp = NULL;
-       bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
        int top_level;
        unsigned direct_access;
        struct kvm_shadow_walk_iterator it;
@@ -480,9 +474,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
        if (!is_present_gpte(gw->ptes[gw->level - 1]))
                return NULL;
 
-       direct_access = gw->pt_access & gw->pte_access;
-       if (!dirty)
-               direct_access &= ~ACC_WRITE_MASK;
+       direct_access = gw->pte_access;
 
        top_level = vcpu->arch.mmu.root_level;
        if (top_level == PT32E_ROOT_LEVEL)
@@ -540,8 +532,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                link_shadow_page(it.sptep, sp);
        }
 
-       mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
-                    user_fault, write_fault, dirty, ptwrite, it.level,
+       mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
+                    user_fault, write_fault, emulate, it.level,
                     gw->gfn, pfn, prefault, map_writable);
        FNAME(pte_prefetch)(vcpu, gw, it.sptep);
 
@@ -575,7 +567,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
        int user_fault = error_code & PFERR_USER_MASK;
        struct guest_walker walker;
        u64 *sptep;
-       int write_pt = 0;
+       int emulate = 0;
        int r;
        pfn_t pfn;
        int level = PT_PAGE_TABLE_LEVEL;
@@ -585,6 +577,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 
        pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 
+       if (unlikely(error_code & PFERR_RSVD_MASK))
+               return handle_mmio_page_fault(vcpu, addr, error_code,
+                                             mmu_is_nested(vcpu));
+
        r = mmu_topup_memory_caches(vcpu);
        if (r)
                return r;
@@ -623,9 +619,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
                         &map_writable))
                return 0;
 
-       /* mmio */
-       if (is_error_pfn(pfn))
-               return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
+       if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
+                               walker.gfn, pfn, walker.pte_access, &r))
+               return r;
 
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -636,19 +632,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
        if (!force_pt_level)
                transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
        sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-                            level, &write_pt, pfn, map_writable, prefault);
+                            level, &emulate, pfn, map_writable, prefault);
        (void)sptep;
-       pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
-                sptep, *sptep, write_pt);
+       pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
+                sptep, *sptep, emulate);
 
-       if (!write_pt)
+       if (!emulate)
                vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
 
        ++vcpu->stat.pf_fixed;
        trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
        spin_unlock(&vcpu->kvm->mmu_lock);
 
-       return write_pt;
+       return emulate;
 
 out_unlock:
        spin_unlock(&vcpu->kvm->mmu_lock);
@@ -665,6 +661,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
        u64 *sptep;
        int need_flush = 0;
 
+       vcpu_clear_mmio_info(vcpu, gva);
+
        spin_lock(&vcpu->kvm->mmu_lock);
 
        for_each_shadow_entry(vcpu, gva, iterator) {
@@ -688,11 +686,11 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
                        if (is_shadow_present_pte(*sptep)) {
                                if (is_large_pte(*sptep))
                                        --vcpu->kvm->stat.lpages;
-                               drop_spte(vcpu->kvm, sptep,
-                                         shadow_trap_nonpresent_pte);
+                               drop_spte(vcpu->kvm, sptep);
                                need_flush = 1;
-                       } else
-                               __set_spte(sptep, shadow_trap_nonpresent_pte);
+                       } else if (is_mmio_spte(*sptep))
+                               mmu_spte_clear_no_track(sptep);
+
                        break;
                }
 
@@ -752,36 +750,6 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
        return gpa;
 }
 
-static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
-                                struct kvm_mmu_page *sp)
-{
-       int i, j, offset, r;
-       pt_element_t pt[256 / sizeof(pt_element_t)];
-       gpa_t pte_gpa;
-
-       if (sp->role.direct
-           || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
-               nonpaging_prefetch_page(vcpu, sp);
-               return;
-       }
-
-       pte_gpa = gfn_to_gpa(sp->gfn);
-       if (PTTYPE == 32) {
-               offset = sp->role.quadrant << PT64_LEVEL_BITS;
-               pte_gpa += offset * sizeof(pt_element_t);
-       }
-
-       for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
-               r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
-               pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
-               for (j = 0; j < ARRAY_SIZE(pt); ++j)
-                       if (r || is_present_gpte(pt[j]))
-                               sp->spt[i+j] = shadow_trap_nonpresent_pte;
-                       else
-                               sp->spt[i+j] = shadow_notrap_nonpresent_pte;
-       }
-}
-
 /*
  * Using the cached information from sp->gfns is safe because:
  * - The spte has a reference to the struct page, so the pfn for a given gfn
@@ -817,7 +785,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                gpa_t pte_gpa;
                gfn_t gfn;
 
-               if (!is_shadow_present_pte(sp->spt[i]))
+               if (!sp->spt[i])
                        continue;
 
                pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
@@ -826,26 +794,30 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                                          sizeof(pt_element_t)))
                        return -EINVAL;
 
-               gfn = gpte_to_gfn(gpte);
-
                if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
                        vcpu->kvm->tlbs_dirty++;
                        continue;
                }
 
+               gfn = gpte_to_gfn(gpte);
+               pte_access = sp->role.access;
+               pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
+
+               if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
+                       continue;
+
                if (gfn != sp->gfns[i]) {
-                       drop_spte(vcpu->kvm, &sp->spt[i],
-                                     shadow_trap_nonpresent_pte);
+                       drop_spte(vcpu->kvm, &sp->spt[i]);
                        vcpu->kvm->tlbs_dirty++;
                        continue;
                }
 
                nr_present++;
-               pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+
                host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
 
                set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
-                        is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
+                        PT_PAGE_TABLE_LEVEL, gfn,
                         spte_to_pfn(sp->spt[i]), true, false,
                         host_writable);
        }
index 506e4fe..475d1c9 100644 (file)
@@ -1496,11 +1496,14 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        update_cr0_intercept(svm);
 }
 
-static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
        unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
        unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
 
+       if (cr4 & X86_CR4_VMXE)
+               return 1;
+
        if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
                svm_flush_tlb(vcpu);
 
@@ -1510,6 +1513,7 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        cr4 |= host_cr4_mce;
        to_svm(vcpu)->vmcb->save.cr4 = cr4;
        mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+       return 0;
 }
 
 static void svm_set_segment(struct kvm_vcpu *vcpu,
index db93276..3ff898c 100644 (file)
@@ -675,12 +675,12 @@ TRACE_EVENT(kvm_emulate_insn,
                ),
 
        TP_fast_assign(
-               __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start;
+               __entry->rip = vcpu->arch.emulate_ctxt.fetch.start;
                __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
-               __entry->len = vcpu->arch.emulate_ctxt.decode.eip
-                              - vcpu->arch.emulate_ctxt.decode.fetch.start;
+               __entry->len = vcpu->arch.emulate_ctxt._eip
+                              - vcpu->arch.emulate_ctxt.fetch.start;
                memcpy(__entry->insn,
-                      vcpu->arch.emulate_ctxt.decode.fetch.data,
+                      vcpu->arch.emulate_ctxt.fetch.data,
                       15);
                __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
                __entry->failed = failed;
@@ -698,6 +698,29 @@ TRACE_EVENT(kvm_emulate_insn,
 #define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
 #define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
 
+TRACE_EVENT(
+       vcpu_match_mmio,
+       TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match),
+       TP_ARGS(gva, gpa, write, gpa_match),
+
+       TP_STRUCT__entry(
+               __field(gva_t, gva)
+               __field(gpa_t, gpa)
+               __field(bool, write)
+               __field(bool, gpa_match)
+               ),
+
+       TP_fast_assign(
+               __entry->gva = gva;
+               __entry->gpa = gpa;
+               __entry->write = write;
+               __entry->gpa_match = gpa_match
+               ),
+
+       TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa,
+                 __entry->write ? "Write" : "Read",
+                 __entry->gpa_match ? "GPA" : "GVA")
+);
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
index d48ec60..e65a158 100644 (file)
 #include "trace.h"
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
+#define __ex_clear(x, reg) \
+       ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
 
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-static int __read_mostly bypass_guest_pf = 1;
-module_param(bypass_guest_pf, bool, S_IRUGO);
-
 static int __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
@@ -72,6 +71,14 @@ module_param(vmm_exclusive, bool, S_IRUGO);
 static int __read_mostly yield_on_hlt = 1;
 module_param(yield_on_hlt, bool, S_IRUGO);
 
+/*
+ * If nested=1, nested virtualization is supported, i.e., a guest may use
+ * VMX and be a hypervisor for its own guests. If nested=0, guests may not
+ * use VMX instructions.
+ */
+static int __read_mostly nested = 0;
+module_param(nested, bool, S_IRUGO);
+
 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST                          \
        (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
 #define KVM_GUEST_CR0_MASK                                             \
@@ -109,6 +116,7 @@ static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
 module_param(ple_window, int, S_IRUGO);
 
 #define NR_AUTOLOAD_MSRS 1
+#define VMCS02_POOL_SIZE 1
 
 struct vmcs {
        u32 revision_id;
@@ -116,17 +124,237 @@ struct vmcs {
        char data[0];
 };
 
+/*
+ * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
+ * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
+ * loaded on this CPU (so we can clear them if the CPU goes down).
+ */
+struct loaded_vmcs {
+       struct vmcs *vmcs;
+       int cpu;
+       int launched;
+       struct list_head loaded_vmcss_on_cpu_link;
+};
+
 struct shared_msr_entry {
        unsigned index;
        u64 data;
        u64 mask;
 };
 
+/*
+ * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
+ * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
+ * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
+ * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
+ * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
+ * More than one of these structures may exist, if L1 runs multiple L2 guests.
+ * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
+ * underlying hardware which will be used to run L2.
+ * This structure is packed to ensure that its layout is identical across
+ * machines (necessary for live migration).
+ * If there are changes in this struct, VMCS12_REVISION must be changed.
+ */
+typedef u64 natural_width;
+struct __packed vmcs12 {
+       /* According to the Intel spec, a VMCS region must start with the
+        * following two fields. Then follow implementation-specific data.
+        */
+       u32 revision_id;
+       u32 abort;
+
+       u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
+       u32 padding[7]; /* room for future expansion */
+
+       u64 io_bitmap_a;
+       u64 io_bitmap_b;
+       u64 msr_bitmap;
+       u64 vm_exit_msr_store_addr;
+       u64 vm_exit_msr_load_addr;
+       u64 vm_entry_msr_load_addr;
+       u64 tsc_offset;
+       u64 virtual_apic_page_addr;
+       u64 apic_access_addr;
+       u64 ept_pointer;
+       u64 guest_physical_address;
+       u64 vmcs_link_pointer;
+       u64 guest_ia32_debugctl;
+       u64 guest_ia32_pat;
+       u64 guest_ia32_efer;
+       u64 guest_ia32_perf_global_ctrl;
+       u64 guest_pdptr0;
+       u64 guest_pdptr1;
+       u64 guest_pdptr2;
+       u64 guest_pdptr3;
+       u64 host_ia32_pat;
+       u64 host_ia32_efer;
+       u64 host_ia32_perf_global_ctrl;
+       u64 padding64[8]; /* room for future expansion */
+       /*
+        * To allow migration of L1 (complete with its L2 guests) between
+        * machines of different natural widths (32 or 64 bit), we cannot have
+        * unsigned long fields with no explicit size. We use u64 (aliased
+        * natural_width) instead. Luckily, x86 is little-endian.
+        */
+       natural_width cr0_guest_host_mask;
+       natural_width cr4_guest_host_mask;
+       natural_width cr0_read_shadow;
+       natural_width cr4_read_shadow;
+       natural_width cr3_target_value0;
+       natural_width cr3_target_value1;
+       natural_width cr3_target_value2;
+       natural_width cr3_target_value3;
+       natural_width exit_qualification;
+       natural_width guest_linear_address;
+       natural_width guest_cr0;
+       natural_width guest_cr3;
+       natural_width guest_cr4;
+       natural_width guest_es_base;
+       natural_width guest_cs_base;
+       natural_width guest_ss_base;
+       natural_width guest_ds_base;
+       natural_width guest_fs_base;
+       natural_width guest_gs_base;
+       natural_width guest_ldtr_base;
+       natural_width guest_tr_base;
+       natural_width guest_gdtr_base;
+       natural_width guest_idtr_base;
+       natural_width guest_dr7;
+       natural_width guest_rsp;
+       natural_width guest_rip;
+       natural_width guest_rflags;
+       natural_width guest_pending_dbg_exceptions;
+       natural_width guest_sysenter_esp;
+       natural_width guest_sysenter_eip;
+       natural_width host_cr0;
+       natural_width host_cr3;
+       natural_width host_cr4;
+       natural_width host_fs_base;
+       natural_width host_gs_base;
+       natural_width host_tr_base;
+       natural_width host_gdtr_base;
+       natural_width host_idtr_base;
+       natural_width host_ia32_sysenter_esp;
+       natural_width host_ia32_sysenter_eip;
+       natural_width host_rsp;
+       natural_width host_rip;
+       natural_width paddingl[8]; /* room for future expansion */
+       u32 pin_based_vm_exec_control;
+       u32 cpu_based_vm_exec_control;
+       u32 exception_bitmap;
+       u32 page_fault_error_code_mask;
+       u32 page_fault_error_code_match;
+       u32 cr3_target_count;
+       u32 vm_exit_controls;
+       u32 vm_exit_msr_store_count;
+       u32 vm_exit_msr_load_count;
+       u32 vm_entry_controls;
+       u32 vm_entry_msr_load_count;
+       u32 vm_entry_intr_info_field;
+       u32 vm_entry_exception_error_code;
+       u32 vm_entry_instruction_len;
+       u32 tpr_threshold;
+       u32 secondary_vm_exec_control;
+       u32 vm_instruction_error;
+       u32 vm_exit_reason;
+       u32 vm_exit_intr_info;
+       u32 vm_exit_intr_error_code;
+       u32 idt_vectoring_info_field;
+       u32 idt_vectoring_error_code;
+       u32 vm_exit_instruction_len;
+       u32 vmx_instruction_info;
+       u32 guest_es_limit;
+       u32 guest_cs_limit;
+       u32 guest_ss_limit;
+       u32 guest_ds_limit;
+       u32 guest_fs_limit;
+       u32 guest_gs_limit;
+       u32 guest_ldtr_limit;
+       u32 guest_tr_limit;
+       u32 guest_gdtr_limit;
+       u32 guest_idtr_limit;
+       u32 guest_es_ar_bytes;
+       u32 guest_cs_ar_bytes;
+       u32 guest_ss_ar_bytes;
+       u32 guest_ds_ar_bytes;
+       u32 guest_fs_ar_bytes;
+       u32 guest_gs_ar_bytes;
+       u32 guest_ldtr_ar_bytes;
+       u32 guest_tr_ar_bytes;
+       u32 guest_interruptibility_info;
+       u32 guest_activity_state;
+       u32 guest_sysenter_cs;
+       u32 host_ia32_sysenter_cs;
+       u32 padding32[8]; /* room for future expansion */
+       u16 virtual_processor_id;
+       u16 guest_es_selector;
+       u16 guest_cs_selector;
+       u16 guest_ss_selector;
+       u16 guest_ds_selector;
+       u16 guest_fs_selector;
+       u16 guest_gs_selector;
+       u16 guest_ldtr_selector;
+       u16 guest_tr_selector;
+       u16 host_es_selector;
+       u16 host_cs_selector;
+       u16 host_ss_selector;
+       u16 host_ds_selector;
+       u16 host_fs_selector;
+       u16 host_gs_selector;
+       u16 host_tr_selector;
+};
+
+/*
+ * VMCS12_REVISION is an arbitrary id that should be changed if the content or
+ * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
+ * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
+ */
+#define VMCS12_REVISION 0x11e57ed0
+
+/*
+ * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
+ * and any VMCS region. Although only sizeof(struct vmcs12) is used by the
+ * current implementation, 4K are reserved to avoid future complications.
+ */
+#define VMCS12_SIZE 0x1000
+
+/* Used to remember the last vmcs02 used for some recently used vmcs12s */
+struct vmcs02_list {
+       struct list_head list;
+       gpa_t vmptr;
+       struct loaded_vmcs vmcs02;
+};
+
+/*
+ * The nested_vmx structure is part of vcpu_vmx, and holds information we need
+ * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
+ */
+struct nested_vmx {
+       /* Has the level1 guest done vmxon? */
+       bool vmxon;
+
+       /* The guest-physical address of the current VMCS L1 keeps for L2 */
+       gpa_t current_vmptr;
+       /* The host-usable pointer to the above */
+       struct page *current_vmcs12_page;
+       struct vmcs12 *current_vmcs12;
+
+       /* vmcs02_list cache of VMCSs recently used to run L2 guests */
+       struct list_head vmcs02_pool;
+       int vmcs02_num;
+       u64 vmcs01_tsc_offset;
+       /* L2 must run next, and mustn't decide to exit to L1. */
+       bool nested_run_pending;
+       /*
+        * Guest pages referred to in vmcs02 with host-physical pointers, so
+        * we must keep them pinned while L2 runs.
+        */
+       struct page *apic_access_page;
+};
+
 struct vcpu_vmx {
        struct kvm_vcpu       vcpu;
-       struct list_head      local_vcpus_link;
        unsigned long         host_rsp;
-       int                   launched;
        u8                    fail;
        u8                    cpl;
        bool                  nmi_known_unmasked;
@@ -140,7 +368,14 @@ struct vcpu_vmx {
        u64                   msr_host_kernel_gs_base;
        u64                   msr_guest_kernel_gs_base;
 #endif
-       struct vmcs          *vmcs;
+       /*
+        * loaded_vmcs points to the VMCS currently used in this vcpu. For a
+        * non-nested (L1) guest, it always points to vmcs01. For a nested
+        * guest (L2), it points to a different VMCS.
+        */
+       struct loaded_vmcs    vmcs01;
+       struct loaded_vmcs   *loaded_vmcs;
+       bool                  __launched; /* temporary, used in vmx_vcpu_run */
        struct msr_autoload {
                unsigned nr;
                struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
@@ -176,6 +411,9 @@ struct vcpu_vmx {
        u32 exit_reason;
 
        bool rdtscp_enabled;
+
+       /* Support for a guest hypervisor (nested VMX) */
+       struct nested_vmx nested;
 };
 
 enum segment_cache_field {
@@ -192,6 +430,174 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
        return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
+#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
+#define FIELD(number, name)    [number] = VMCS12_OFFSET(name)
+#define FIELD64(number, name)  [number] = VMCS12_OFFSET(name), \
+                               [number##_HIGH] = VMCS12_OFFSET(name)+4
+
+static unsigned short vmcs_field_to_offset_table[] = {
+       FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
+       FIELD(GUEST_ES_SELECTOR, guest_es_selector),
+       FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
+       FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
+       FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
+       FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
+       FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
+       FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
+       FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
+       FIELD(HOST_ES_SELECTOR, host_es_selector),
+       FIELD(HOST_CS_SELECTOR, host_cs_selector),
+       FIELD(HOST_SS_SELECTOR, host_ss_selector),
+       FIELD(HOST_DS_SELECTOR, host_ds_selector),
+       FIELD(HOST_FS_SELECTOR, host_fs_selector),
+       FIELD(HOST_GS_SELECTOR, host_gs_selector),
+       FIELD(HOST_TR_SELECTOR, host_tr_selector),
+       FIELD64(IO_BITMAP_A, io_bitmap_a),
+       FIELD64(IO_BITMAP_B, io_bitmap_b),
+       FIELD64(MSR_BITMAP, msr_bitmap),
+       FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
+       FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
+       FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
+       FIELD64(TSC_OFFSET, tsc_offset),
+       FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
+       FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
+       FIELD64(EPT_POINTER, ept_pointer),
+       FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
+       FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
+       FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
+       FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
+       FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
+       FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
+       FIELD64(GUEST_PDPTR0, guest_pdptr0),
+       FIELD64(GUEST_PDPTR1, guest_pdptr1),
+       FIELD64(GUEST_PDPTR2, guest_pdptr2),
+       FIELD64(GUEST_PDPTR3, guest_pdptr3),
+       FIELD64(HOST_IA32_PAT, host_ia32_pat),
+       FIELD64(HOST_IA32_EFER, host_ia32_efer),
+       FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
+       FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
+       FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
+       FIELD(EXCEPTION_BITMAP, exception_bitmap),
+       FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
+       FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
+       FIELD(CR3_TARGET_COUNT, cr3_target_count),
+       FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
+       FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
+       FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
+       FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
+       FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
+       FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
+       FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
+       FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
+       FIELD(TPR_THRESHOLD, tpr_threshold),
+       FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
+       FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
+       FIELD(VM_EXIT_REASON, vm_exit_reason),
+       FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
+       FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
+       FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
+       FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
+       FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
+       FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
+       FIELD(GUEST_ES_LIMIT, guest_es_limit),
+       FIELD(GUEST_CS_LIMIT, guest_cs_limit),
+       FIELD(GUEST_SS_LIMIT, guest_ss_limit),
+       FIELD(GUEST_DS_LIMIT, guest_ds_limit),
+       FIELD(GUEST_FS_LIMIT, guest_fs_limit),
+       FIELD(GUEST_GS_LIMIT, guest_gs_limit),
+       FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
+       FIELD(GUEST_TR_LIMIT, guest_tr_limit),
+       FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
+       FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
+       FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
+       FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
+       FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
+       FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
+       FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
+       FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
+       FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
+       FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
+       FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
+       FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
+       FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
+       FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
+       FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
+       FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
+       FIELD(CR0_READ_SHADOW, cr0_read_shadow),
+       FIELD(CR4_READ_SHADOW, cr4_read_shadow),
+       FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
+       FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
+       FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
+       FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
+       FIELD(EXIT_QUALIFICATION, exit_qualification),
+       FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
+       FIELD(GUEST_CR0, guest_cr0),
+       FIELD(GUEST_CR3, guest_cr3),
+       FIELD(GUEST_CR4, guest_cr4),
+       FIELD(GUEST_ES_BASE, guest_es_base),
+       FIELD(GUEST_CS_BASE, guest_cs_base),
+       FIELD(GUEST_SS_BASE, guest_ss_base),
+       FIELD(GUEST_DS_BASE, guest_ds_base),
+       FIELD(GUEST_FS_BASE, guest_fs_base),
+       FIELD(GUEST_GS_BASE, guest_gs_base),
+       FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
+       FIELD(GUEST_TR_BASE, guest_tr_base),
+       FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
+       FIELD(GUEST_IDTR_BASE, guest_idtr_base),
+       FIELD(GUEST_DR7, guest_dr7),
+       FIELD(GUEST_RSP, guest_rsp),
+       FIELD(GUEST_RIP, guest_rip),
+       FIELD(GUEST_RFLAGS, guest_rflags),
+       FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
+       FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
+       FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
+       FIELD(HOST_CR0, host_cr0),
+       FIELD(HOST_CR3, host_cr3),
+       FIELD(HOST_CR4, host_cr4),
+       FIELD(HOST_FS_BASE, host_fs_base),
+       FIELD(HOST_GS_BASE, host_gs_base),
+       FIELD(HOST_TR_BASE, host_tr_base),
+       FIELD(HOST_GDTR_BASE, host_gdtr_base),
+       FIELD(HOST_IDTR_BASE, host_idtr_base),
+       FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
+       FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
+       FIELD(HOST_RSP, host_rsp),
+       FIELD(HOST_RIP, host_rip),
+};
+static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
+
+static inline short vmcs_field_to_offset(unsigned long field)
+{
+       if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
+               return -1;
+       return vmcs_field_to_offset_table[field];
+}
+
+static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
+{
+       return to_vmx(vcpu)->nested.current_vmcs12;
+}
+
+static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+       struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
+       if (is_error_page(page)) {
+               kvm_release_page_clean(page);
+               return NULL;
+       }
+       return page;
+}
+
+static void nested_release_page(struct page *page)
+{
+       kvm_release_page_dirty(page);
+}
+
+static void nested_release_page_clean(struct page *page)
+{
+       kvm_release_page_clean(page);
+}
+
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
@@ -200,7 +606,11 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
-static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+/*
+ * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
+ * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
+ */
+static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
 static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
 
 static unsigned long *vmx_io_bitmap_a;
@@ -442,6 +852,35 @@ static inline bool report_flexpriority(void)
        return flexpriority_enabled;
 }
 
+static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
+{
+       return vmcs12->cpu_based_vm_exec_control & bit;
+}
+
+static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
+{
+       return (vmcs12->cpu_based_vm_exec_control &
+                       CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
+               (vmcs12->secondary_vm_exec_control & bit);
+}
+
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
+       struct kvm_vcpu *vcpu)
+{
+       return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
+}
+
+static inline bool is_exception(u32 intr_info)
+{
+       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+               == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+}
+
+static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
+static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
+                       struct vmcs12 *vmcs12,
+                       u32 reason, unsigned long qualification);
+
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
        int i;
@@ -501,6 +940,13 @@ static void vmcs_clear(struct vmcs *vmcs)
                       vmcs, phys_addr);
 }
 
+static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
+{
+       vmcs_clear(loaded_vmcs->vmcs);
+       loaded_vmcs->cpu = -1;
+       loaded_vmcs->launched = 0;
+}
+
 static void vmcs_load(struct vmcs *vmcs)
 {
        u64 phys_addr = __pa(vmcs);
@@ -510,29 +956,28 @@ static void vmcs_load(struct vmcs *vmcs)
                        : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
                        : "cc", "memory");
        if (error)
-               printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
+               printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
                       vmcs, phys_addr);
 }
 
-static void __vcpu_clear(void *arg)
+static void __loaded_vmcs_clear(void *arg)
 {
-       struct vcpu_vmx *vmx = arg;
+       struct loaded_vmcs *loaded_vmcs = arg;
        int cpu = raw_smp_processor_id();
 
-       if (vmx->vcpu.cpu == cpu)
-               vmcs_clear(vmx->vmcs);
-       if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
+       if (loaded_vmcs->cpu != cpu)
+               return; /* vcpu migration can race with cpu offline */
+       if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
                per_cpu(current_vmcs, cpu) = NULL;
-       list_del(&vmx->local_vcpus_link);
-       vmx->vcpu.cpu = -1;
-       vmx->launched = 0;
+       list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
+       loaded_vmcs_init(loaded_vmcs);
 }
 
-static void vcpu_clear(struct vcpu_vmx *vmx)
+static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
 {
-       if (vmx->vcpu.cpu == -1)
-               return;
-       smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
+       if (loaded_vmcs->cpu != -1)
+               smp_call_function_single(
+                       loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
 }
 
 static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -585,26 +1030,26 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
        }
 }
 
-static unsigned long vmcs_readl(unsigned long field)
+static __always_inline unsigned long vmcs_readl(unsigned long field)
 {
-       unsigned long value = 0;
+       unsigned long value;
 
-       asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
-                     : "+a"(value) : "d"(field) : "cc");
+       asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
+                     : "=a"(value) : "d"(field) : "cc");
        return value;
 }
 
-static u16 vmcs_read16(unsigned long field)
+static __always_inline u16 vmcs_read16(unsigned long field)
 {
        return vmcs_readl(field);
 }
 
-static u32 vmcs_read32(unsigned long field)
+static __always_inline u32 vmcs_read32(unsigned long field)
 {
        return vmcs_readl(field);
 }
 
-static u64 vmcs_read64(unsigned long field)
+static __always_inline u64 vmcs_read64(unsigned long field)
 {
 #ifdef CONFIG_X86_64
        return vmcs_readl(field);
@@ -731,6 +1176,15 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
                eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
        if (vcpu->fpu_active)
                eb &= ~(1u << NM_VECTOR);
+
+       /* When we are running a nested L2 guest and L1 specified for it a
+        * certain exception bitmap, we must trap the same exceptions and pass
+        * them to L1. When running L2, we will only handle the exceptions
+        * specified above if L1 did not want them.
+        */
+       if (is_guest_mode(vcpu))
+               eb |= get_vmcs12(vcpu)->exception_bitmap;
+
        vmcs_write32(EXCEPTION_BITMAP, eb);
 }
 
@@ -971,22 +1425,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
        if (!vmm_exclusive)
                kvm_cpu_vmxon(phys_addr);
-       else if (vcpu->cpu != cpu)
-               vcpu_clear(vmx);
+       else if (vmx->loaded_vmcs->cpu != cpu)
+               loaded_vmcs_clear(vmx->loaded_vmcs);
 
-       if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
-               per_cpu(current_vmcs, cpu) = vmx->vmcs;
-               vmcs_load(vmx->vmcs);
+       if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+               per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+               vmcs_load(vmx->loaded_vmcs->vmcs);
        }
 
-       if (vcpu->cpu != cpu) {
+       if (vmx->loaded_vmcs->cpu != cpu) {
                struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
                unsigned long sysenter_esp;
 
                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
                local_irq_disable();
-               list_add(&vmx->local_vcpus_link,
-                        &per_cpu(vcpus_on_cpu, cpu));
+               list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
+                        &per_cpu(loaded_vmcss_on_cpu, cpu));
                local_irq_enable();
 
                /*
@@ -998,6 +1452,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
                rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
                vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+               vmx->loaded_vmcs->cpu = cpu;
        }
 }
 
@@ -1005,7 +1460,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
        __vmx_load_host_state(to_vmx(vcpu));
        if (!vmm_exclusive) {
-               __vcpu_clear(to_vmx(vcpu));
+               __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
+               vcpu->cpu = -1;
                kvm_cpu_vmxoff();
        }
 }
@@ -1023,19 +1479,55 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
        vmcs_writel(GUEST_CR0, cr0);
        update_exception_bitmap(vcpu);
        vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+       if (is_guest_mode(vcpu))
+               vcpu->arch.cr0_guest_owned_bits &=
+                       ~get_vmcs12(vcpu)->cr0_guest_host_mask;
        vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 }
 
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
 
+/*
+ * nested_read_cr0/cr4: the crN value a nested guest sees on a read. Bits set
+ * in crN_guest_host_mask come from crN_read_shadow (its hypervisor's shadow);
+ * all remaining bits come from guest_crN, the value really used to run it.
+ */
+static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
+{
+       return (fields->cr0_read_shadow & fields->cr0_guest_host_mask) |
+               (fields->guest_cr0 & ~fields->cr0_guest_host_mask);
+}
+static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
+{
+       return (fields->cr4_read_shadow & fields->cr4_guest_host_mask) |
+               (fields->guest_cr4 & ~fields->cr4_guest_host_mask);
+}
+
 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
 {
+       /* Note that there is no vcpu->fpu_active = 0 here. The caller must
+        * set this *before* calling this function.
+        */
        vmx_decache_cr0_guest_bits(vcpu);
        vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
        update_exception_bitmap(vcpu);
        vcpu->arch.cr0_guest_owned_bits = 0;
        vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
-       vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
+       if (is_guest_mode(vcpu)) {
+               /*
+                * L1's specified read shadow might not contain the TS bit,
+                * so now that we turned on shadowing of this bit, we need to
+                * set this bit of the shadow. Like in nested_vmx_run we need
+                * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
+                * up-to-date here because we just decached cr0.TS (and we'll
+                * only update vmcs12->guest_cr0 on nested exit).
+                */
+               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+               vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
+                       (vcpu->arch.cr0 & X86_CR0_TS);
+               vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
+       } else
+               vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
 }
 
 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -1119,6 +1611,25 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
                vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
 }
 
+/*
+ * Decide whether a page fault KVM got while running L2 goes to L1 or L2.
+ * If L1 traps #PF (bit PF_VECTOR of vmcs12's exception bitmap), exit to L1
+ * and return 1 so the caller skips the injection; else return 0 and the
+ * caller injects into L2. Assumes the vmcs02 exit reason is a #PF exception
+ * (the only case in which KVM injects a #PF when L2 is running).
+ */
+static int nested_pf_handled(struct kvm_vcpu *vcpu)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+       /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
+       if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
+               return 0;
+
+       nested_vmx_vmexit(vcpu);
+       return 1;
+}
+
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
                                bool has_error_code, u32 error_code,
                                bool reinject)
@@ -1126,6 +1637,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
+       if (nr == PF_VECTOR && is_guest_mode(vcpu) &&
+               nested_pf_handled(vcpu))
+               return;
+
        if (has_error_code) {
                vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
                intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -1248,12 +1763,24 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
        vmcs_write64(TSC_OFFSET, offset);
+       if (is_guest_mode(vcpu))
+               /*
+                * We're here if L1 chose not to trap the TSC MSR. Since
+                * prepare_vmcs12() does not copy tsc_offset, we need to also
+                * set the vmcs12 field here.
+                */
+               get_vmcs12(vcpu)->tsc_offset = offset -
+                       to_vmx(vcpu)->nested.vmcs01_tsc_offset;
 }
 
 static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
 {
        u64 offset = vmcs_read64(TSC_OFFSET);
        vmcs_write64(TSC_OFFSET, offset + adjustment);
+       if (is_guest_mode(vcpu)) {
+               /* Even when running L2, the adjustment needs to apply to L1 */
+               to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
+       }
 }
 
 static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
@@ -1261,6 +1788,236 @@ static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
        return target_tsc - native_read_tsc();
 }
 
+static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
+{
+       struct kvm_cpuid_entry2 *leaf = kvm_find_cpuid_entry(vcpu, 1, 0);
+       return leaf ? !!(leaf->ecx & (1 << (X86_FEATURE_VMX & 31))) : false;
+}
+
+/*
+ * Whether this guest may use VMX instructions and MSRs (nested VMX): needs
+ * the "nested" module option on and the VMX bit set in the guest's cpuid.
+ */
+static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
+{
+       if (!nested)
+               return false;
+       return guest_cpuid_has_vmx(vcpu);
+}
+
+/*
+ * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
+ * returned for the various VMX controls MSRs when nested VMX is enabled.
+ * The same values should also be used to verify that vmcs12 control fields are
+ * valid during nested entry from L1 to L2.
+ * Each of these control msrs has a low and high 32-bit half: A low bit is on
+ * if the corresponding bit in the (32-bit) control field *must* be on, and a
+ * bit in the high half is on if the corresponding bit in the control field
+ * may be on. See also vmx_control_verify().
+ * TODO: allow these variables to be modified (downgraded) by module options
+ * or other means.
+ */
+static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
+static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
+static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
+static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
+static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
+static __init void nested_vmx_setup_ctls_msrs(void)
+{
+       /*
+        * Note that as a general rule, the high half of the MSRs (bits in
+        * the control fields which may be 1) should be initialized by the
+        * intersection of the underlying hardware's MSR (i.e., features which
+        * can be supported) and the list of features we want to expose -
+        * because they are known to be properly supported in our code.
+        * Also, usually, the low half of the MSRs (bits which must be 1) can
+        * be set to 0, meaning that L1 may turn off any of these bits. The
+        * reason is that if one of these bits is necessary, it will appear
+        * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
+        * fields of vmcs01 and vmcs02, will turn these bits off - and
+        * nested_vmx_exit_handled() will not pass related exits to L1.
+        * These rules have exceptions below.
+        */
+
+       /* pin-based controls */
+       /*
+        * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
+        * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
+        */
+       nested_vmx_pinbased_ctls_low = 0x16 ;
+       nested_vmx_pinbased_ctls_high = 0x16 |
+               PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
+               PIN_BASED_VIRTUAL_NMIS;
+
+       /* exit controls */
+       nested_vmx_exit_ctls_low = 0;
+       /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
+#ifdef CONFIG_X86_64
+       nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#else
+       nested_vmx_exit_ctls_high = 0;
+#endif
+
+       /* entry controls */
+       rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
+               nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
+       nested_vmx_entry_ctls_low = 0;
+       nested_vmx_entry_ctls_high &=
+               VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
+
+       /* cpu-based controls */
+       rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
+               nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
+       nested_vmx_procbased_ctls_low = 0;
+       nested_vmx_procbased_ctls_high &=
+               CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING |
+               CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
+               CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
+               CPU_BASED_CR3_STORE_EXITING |
+#ifdef CONFIG_X86_64
+               CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
+#endif
+               CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
+               CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
+               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+       /*
+        * We can allow some features even when not supported by the
+        * hardware. For example, L1 can specify an MSR bitmap - and we
+        * can use it to avoid exits to L1 - even when L0 runs L2
+        * without MSR bitmaps.
+        */
+       nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS;
+
+       /* secondary cpu-based controls */
+       rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
+               nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
+       nested_vmx_secondary_ctls_low = 0;
+       nested_vmx_secondary_ctls_high &=
+               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+}
+
+static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
+{
+       /* All 'low' bits must be set; no bit outside 'high' | 'low' may be. */
+       u32 missing = low & ~control;
+       u32 stray = control & ~(high | low);
+       return !missing && !stray;
+}
+
+static inline u64 vmx_control_msr(u32 low, u32 high)
+{
+       return low | ((u64)high << 32);
+}
+
+/*
+ * If we allow our guest to use VMX instructions (i.e., nested VMX), we should
+ * also let it use VMX-specific MSRs.
+ * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a
+ * VMX-specific MSR, or 0 when we haven't (and the caller should handle it
+ * like all other MSRs).
+ */
+static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+       if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC &&
+                    msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) {
+               /*
+                * According to the spec, processors which do not support VMX
+                * should throw a #GP(0) when VMX capability MSRs are read.
+                */
+               kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+               return 1;
+       }
+
+       switch (msr_index) {
+       case MSR_IA32_FEATURE_CONTROL:
+               *pdata = 0;
+               break;
+       case MSR_IA32_VMX_BASIC:
+               /*
+                * This MSR reports some information about VMX support. We
+                * should return information about the VMX we emulate for the
+                * guest, and the VMCS structure we give it - not about the
+                * VMX support of the underlying hardware.
+                */
+               *pdata = VMCS12_REVISION |
+                          ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
+                          (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+               break;
+       case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+       case MSR_IA32_VMX_PINBASED_CTLS:
+               *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
+                                       nested_vmx_pinbased_ctls_high);
+               break;
+       case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+       case MSR_IA32_VMX_PROCBASED_CTLS:
+               *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
+                                       nested_vmx_procbased_ctls_high);
+               break;
+       case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+       case MSR_IA32_VMX_EXIT_CTLS:
+               *pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
+                                       nested_vmx_exit_ctls_high);
+               break;
+       case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+       case MSR_IA32_VMX_ENTRY_CTLS:
+               *pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
+                                       nested_vmx_entry_ctls_high);
+               break;
+       case MSR_IA32_VMX_MISC:
+               *pdata = 0;
+               break;
+       /*
+        * These MSRs specify bits which the guest must keep fixed (on or off)
+        * while L1 is in VMXON mode (in L1's root mode, or running an L2).
+        * We picked the standard core2 setting.
+        */
+#define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
+#define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
+       case MSR_IA32_VMX_CR0_FIXED0:
+               *pdata = VMXON_CR0_ALWAYSON;
+               break;
+       case MSR_IA32_VMX_CR0_FIXED1:
+               *pdata = -1ULL;
+               break;
+       case MSR_IA32_VMX_CR4_FIXED0:
+               *pdata = VMXON_CR4_ALWAYSON;
+               break;
+       case MSR_IA32_VMX_CR4_FIXED1:
+               *pdata = -1ULL;
+               break;
+       case MSR_IA32_VMX_VMCS_ENUM:
+               *pdata = 0x1f;
+               break;
+       case MSR_IA32_VMX_PROCBASED_CTLS2:
+               *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
+                                       nested_vmx_secondary_ctls_high);
+               break;
+       case MSR_IA32_VMX_EPT_VPID_CAP:
+               /* Currently, no nested ept or nested vpid */
+               *pdata = 0;
+               break;
+       default:
+               return 0;
+       }
+
+       return 1;
+}
+
+static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+       /*
+        * Returns 1 if the write was consumed here, 0 to let the caller treat
+        * it like any other MSR. VMX capability MSRs need no special casing:
+        * left unhandled, handle_wrmsr will #GP(0), which is correct since
+        * they are read-only.
+        */
+       if (nested_vmx_allowed(vcpu) &&
+           msr_index == MSR_IA32_FEATURE_CONTROL)
+               /* TODO: the right thing. */
+               return 1;
+       return 0;
+}
+
 /*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
@@ -1309,6 +2066,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
                /* Otherwise falls through */
        default:
                vmx_load_host_state(to_vmx(vcpu));
+               if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
+                       return 0;
                msr = find_msr_entry(to_vmx(vcpu), msr_index);
                if (msr) {
                        vmx_load_host_state(to_vmx(vcpu));
@@ -1380,6 +2139,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
                        return 1;
                /* Otherwise falls through */
        default:
+               if (vmx_set_vmx_msr(vcpu, msr_index, data))
+                       break;
                msr = find_msr_entry(vmx, msr_index);
                if (msr) {
                        vmx_load_host_state(vmx);
@@ -1469,7 +2230,7 @@ static int hardware_enable(void *garbage)
        if (read_cr4() & X86_CR4_VMXE)
                return -EBUSY;
 
-       INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
+       INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
        rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
 
        test_bits = FEATURE_CONTROL_LOCKED;
@@ -1493,14 +2254,14 @@ static int hardware_enable(void *garbage)
        return 0;
 }
 
-static void vmclear_local_vcpus(void)
+static void vmclear_local_loaded_vmcss(void)
 {
        int cpu = raw_smp_processor_id();
-       struct vcpu_vmx *vmx, *n;
+       struct loaded_vmcs *v, *n;
 
-       list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
-                                local_vcpus_link)
-               __vcpu_clear(vmx);
+       list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+                                loaded_vmcss_on_cpu_link)
+               __loaded_vmcs_clear(v);
 }
 
 
@@ -1515,7 +2276,7 @@ static void kvm_cpu_vmxoff(void)
 static void hardware_disable(void *garbage)
 {
        if (vmm_exclusive) {
-               vmclear_local_vcpus();
+               vmclear_local_loaded_vmcss();
                kvm_cpu_vmxoff();
        }
        write_cr4(read_cr4() & ~X86_CR4_VMXE);
@@ -1696,6 +2457,18 @@ static void free_vmcs(struct vmcs *vmcs)
        free_pages((unsigned long)vmcs, vmcs_config.order);
 }
 
+/*
+ * VMCLEAR a VMCS on the CPU where it was last loaded, then free it (if any)
+ */
+static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+       if (loaded_vmcs->vmcs) {
+               loaded_vmcs_clear(loaded_vmcs);
+               free_vmcs(loaded_vmcs->vmcs);
+               loaded_vmcs->vmcs = NULL;
+       }
+}
+
 static void free_kvm_area(void)
 {
        int cpu;
@@ -1756,6 +2529,9 @@ static __init int hardware_setup(void)
        if (!cpu_has_vmx_ple())
                ple_gap = 0;
 
+       if (nested)
+               nested_vmx_setup_ctls_msrs();
+
        return alloc_kvm_area();
 }
 
@@ -2041,7 +2817,7 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
                  (unsigned long *)&vcpu->arch.regs_dirty);
 }
 
-static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 
 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
                                        unsigned long cr0,
@@ -2139,11 +2915,23 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
        vmcs_writel(GUEST_CR3, guest_cr3);
 }
 
-static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
        unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
                    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
 
+       if (cr4 & X86_CR4_VMXE) {
+               /*
+                * To use VMXON (and later other VMX instructions), a guest
+                * must first be able to turn on cr4.VMXE (see handle_vmon()).
+                * So basically the check on whether to allow nested VMX
+                * is here.
+                */
+               if (!nested_vmx_allowed(vcpu))
+                       return 1;
+       } else if (to_vmx(vcpu)->nested.vmxon)
+               return 1;
+
        vcpu->arch.cr4 = cr4;
        if (enable_ept) {
                if (!is_paging(vcpu)) {
@@ -2156,6 +2944,7 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 
        vmcs_writel(CR4_READ_SHADOW, cr4);
        vmcs_writel(GUEST_CR4, hw_cr4);
+       return 0;
 }
 
 static void vmx_get_segment(struct kvm_vcpu *vcpu,
@@ -2721,33 +3510,58 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
 }
 
 /*
- * Sets up the vmcs for emulated real mode.
+ * Set up the vmcs's constant host-state fields, i.e., host-state fields that
+ * will not change in the lifetime of the guest.
+ * Note that host-state that does change is set elsewhere. E.g., host-state
+ * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
  */
-static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
+static void vmx_set_constant_host_state(void)
 {
-       u32 host_sysenter_cs, msr_low, msr_high;
-       u32 junk;
-       u64 host_pat;
-       unsigned long a;
+       u32 low32, high32;
+       unsigned long tmpl;
        struct desc_ptr dt;
-       int i;
-       unsigned long kvm_vmx_return;
-       u32 exec_control;
 
-       /* I/O */
-       vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
-       vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
+       vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS);  /* 22.2.3 */
+       vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
+       vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
 
-       if (cpu_has_vmx_msr_bitmap())
-               vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
+       vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
+       vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+       vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+       vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+       vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
 
-       vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
+       native_store_idt(&dt);
+       vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
 
-       /* Control */
-       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
-               vmcs_config.pin_based_exec_ctrl);
+       asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
+       vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
 
-       exec_control = vmcs_config.cpu_based_exec_ctrl;
+       rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
+       vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
+       rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
+       vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
+
+       if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
+               rdmsr(MSR_IA32_CR_PAT, low32, high32);
+               vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
+       }
+}
+
+static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
+{
+       vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
+       if (enable_ept)
+               vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
+       if (is_guest_mode(&vmx->vcpu))
+               vmx->vcpu.arch.cr4_guest_owned_bits &=
+                       ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
+       vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
+}
+
+static u32 vmx_exec_control(struct vcpu_vmx *vmx)
+{
+       u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
        if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
                exec_control &= ~CPU_BASED_TPR_SHADOW;
 #ifdef CONFIG_X86_64
@@ -2759,45 +3573,80 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
                exec_control |= CPU_BASED_CR3_STORE_EXITING |
                                CPU_BASED_CR3_LOAD_EXITING  |
                                CPU_BASED_INVLPG_EXITING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
-
-       if (cpu_has_secondary_exec_ctrls()) {
-               exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
-               if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
-                       exec_control &=
-                               ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-               if (vmx->vpid == 0)
-                       exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
-               if (!enable_ept) {
-                       exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
-                       enable_unrestricted_guest = 0;
-               }
-               if (!enable_unrestricted_guest)
-                       exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
-               if (!ple_gap)
-                       exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
-               vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
-       }
+       return exec_control;
+}
 
-       if (ple_gap) {
-               vmcs_write32(PLE_GAP, ple_gap);
-               vmcs_write32(PLE_WINDOW, ple_window);
+static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
+{
+       u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+       if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+               exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+       if (vmx->vpid == 0)
+               exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
+       if (!enable_ept) {
+               exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+               enable_unrestricted_guest = 0;
        }
+       if (!enable_unrestricted_guest)
+               exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+       if (!ple_gap)
+               exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+       return exec_control;
+}
 
-       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
-       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
-       vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
+static void ept_set_mmio_spte_mask(void)
+{
+       /*
+        * EPT Misconfigurations can be generated if the value of bits 2:0
+        * of an EPT paging-structure entry is 110b (write/execute).
+        * Also, magic bits (0xffull << 49) are set to quickly identify mmio
+        * sptes.
+        */
+       kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
+}
 
-       vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS);  /* 22.2.3 */
-       vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
-       vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
+/*
+ * Sets up the vmcs for emulated real mode.
+ */
+static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
+{
+#ifdef CONFIG_X86_64
+       unsigned long a;
+#endif
+       int i;
+
+       /* I/O */
+       vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
+       vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
+
+       if (cpu_has_vmx_msr_bitmap())
+               vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
+
+       vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
+
+       /* Control */
+       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+               vmcs_config.pin_based_exec_ctrl);
+
+       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
+
+       if (cpu_has_secondary_exec_ctrls()) {
+               vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+                               vmx_secondary_exec_control(vmx));
+       }
+
+       if (ple_gap) {
+               vmcs_write32(PLE_GAP, ple_gap);
+               vmcs_write32(PLE_WINDOW, ple_window);
+       }
+
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+       vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
 
-       vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
-       vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
-       vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
        vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
        vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
-       vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+       vmx_set_constant_host_state();
 #ifdef CONFIG_X86_64
        rdmsrl(MSR_FS_BASE, a);
        vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
@@ -2808,32 +3657,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
 #endif
 
-       vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
-
-       native_store_idt(&dt);
-       vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
-
-       asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
-       vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
        vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
        vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
 
-       rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
-       vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
-       rdmsrl(MSR_IA32_SYSENTER_ESP, a);
-       vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
-       rdmsrl(MSR_IA32_SYSENTER_EIP, a);
-       vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
-
-       if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
-               rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
-               host_pat = msr_low | ((u64) msr_high << 32);
-               vmcs_write64(HOST_IA32_PAT, host_pat);
-       }
        if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+               u32 msr_low, msr_high;
+               u64 host_pat;
                rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
                host_pat = msr_low | ((u64) msr_high << 32);
                /* Write the default value follow host pat */
@@ -2863,10 +3695,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
 
        vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
-       vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
-       if (enable_ept)
-               vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
-       vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
+       set_cr4_guest_host_mask(vmx);
 
        kvm_write_tsc(&vmx->vcpu, 0);
 
@@ -2990,9 +3819,25 @@ out:
        return ret;
 }
 
+/*
+ * Report whether L1 set PIN_BASED_EXT_INTR_MASK in vmcs12, i.e. asked for an
+ * exit on external interrupts. Most existing hypervisors always do.
+ */
+static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       return vmcs12->pin_based_vm_exec_control & PIN_BASED_EXT_INTR_MASK;
+}
+
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
        u32 cpu_based_vm_exec_control;
+       if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+               /* We can get here when nested_run_pending caused
+                * vmx_interrupt_allowed() to return false. In this case, do
+                * nothing - the interrupt will be injected later.
+                */
+               return;
 
        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
@@ -3049,6 +3894,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+       if (is_guest_mode(vcpu))
+               return;
+
        if (!cpu_has_virtual_nmis()) {
                /*
                 * Tracking the NMI-blocked state in software is built upon
@@ -3115,6 +3963,17 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
+       if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
+               struct vmcs12 *vmcs12;
+               if (to_vmx(vcpu)->nested.nested_run_pending)
+                       return 0;
+               nested_vmx_vmexit(vcpu);
+               vmcs12 = get_vmcs12(vcpu);
+               vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
+               vmcs12->vm_exit_intr_info = 0;
+               /* fall through to normal code, but now in L1, not L2 */
+       }
+
        return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
                !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                        (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
@@ -3356,6 +4215,58 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
        hypercall[2] = 0xc1;
 }
 
+/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
+static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
+{
+       if (to_vmx(vcpu)->nested.vmxon &&
+           ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
+               return 1;
+
+       if (is_guest_mode(vcpu)) {
+               /*
+                * We get here when L2 changed cr0 in a way that did not change
+                * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
+                * but did change L0 shadowed bits. This can currently happen
+                * with the TS bit: L0 may want to leave TS on (for lazy fpu
+                * loading) while pretending to allow the guest to change it.
+                */
+               if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) |
+                        (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits)))
+                       return 1;
+               vmcs_writel(CR0_READ_SHADOW, val);
+               return 0;
+       } else
+               return kvm_set_cr0(vcpu, val);
+}
+
+static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
+{
+       if (is_guest_mode(vcpu)) {
+               if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) |
+                        (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits)))
+                       return 1;
+               vmcs_writel(CR4_READ_SHADOW, val);
+               return 0;
+       } else
+               return kvm_set_cr4(vcpu, val);
+}
+
+/* called to set cr0 as appropriate for clts instruction exit. */
+static void handle_clts(struct kvm_vcpu *vcpu)
+{
+       if (is_guest_mode(vcpu)) {
+               /*
+                * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
+                * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
+                * just pretend it's off (also in arch.cr0 for fpu_activate).
+                */
+               vmcs_writel(CR0_READ_SHADOW,
+                       vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
+               vcpu->arch.cr0 &= ~X86_CR0_TS;
+       } else
+               vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+}
+
 static int handle_cr(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification, val;
@@ -3372,7 +4283,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
                trace_kvm_cr_write(cr, val);
                switch (cr) {
                case 0:
-                       err = kvm_set_cr0(vcpu, val);
+                       err = handle_set_cr0(vcpu, val);
                        kvm_complete_insn_gp(vcpu, err);
                        return 1;
                case 3:
@@ -3380,7 +4291,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
                        kvm_complete_insn_gp(vcpu, err);
                        return 1;
                case 4:
-                       err = kvm_set_cr4(vcpu, val);
+                       err = handle_set_cr4(vcpu, val);
                        kvm_complete_insn_gp(vcpu, err);
                        return 1;
                case 8: {
@@ -3398,7 +4309,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
                };
                break;
        case 2: /* clts */
-               vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+               handle_clts(vcpu);
                trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
                skip_emulated_instruction(vcpu);
                vmx_fpu_activate(vcpu);
@@ -3574,12 +4485,6 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
        return 1;
 }
 
-static int handle_vmx_insn(struct kvm_vcpu *vcpu)
-{
-       kvm_queue_exception(vcpu, UD_VECTOR);
-       return 1;
-}
-
 static int handle_invd(struct kvm_vcpu *vcpu)
 {
        return emulate_instruction(vcpu, 0) == EMULATE_DONE;
@@ -3777,11 +4682,19 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 {
        u64 sptes[4];
-       int nr_sptes, i;
+       int nr_sptes, i, ret;
        gpa_t gpa;
 
        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 
+       ret = handle_mmio_page_fault_common(vcpu, gpa, true);
+       if (likely(ret == 1))
+               return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
+                                             EMULATE_DONE;
+       if (unlikely(!ret))
+               return 1;
+
+       /* This is a real EPT misconfiguration */
        printk(KERN_ERR "EPT: Misconfiguration.\n");
        printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
 
@@ -3866,151 +4779,1028 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu)
 }
 
 /*
- * The exit handlers return 1 if the exit was handled fully and guest execution
- * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
- * to be done to userspace and return 0.
+ * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
+ * We could reuse a single VMCS for all the L2 guests, but we also want the
+ * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
+ * allows keeping them loaded on the processor, and in the future will allow
+ * optimizations where prepare_vmcs02 doesn't need to set all the fields on
+ * every entry if they never change.
+ * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
+ * (>=0) with a vmcs02 for each recently loaded vmcs12, most recent first.
+ *
+ * The following functions allocate and free a vmcs02 in this pool.
  */
-static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
-       [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
-       [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
-       [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
-       [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
-       [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
-       [EXIT_REASON_CR_ACCESS]               = handle_cr,
-       [EXIT_REASON_DR_ACCESS]               = handle_dr,
-       [EXIT_REASON_CPUID]                   = handle_cpuid,
-       [EXIT_REASON_MSR_READ]                = handle_rdmsr,
-       [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
-       [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
-       [EXIT_REASON_HLT]                     = handle_halt,
-       [EXIT_REASON_INVD]                    = handle_invd,
-       [EXIT_REASON_INVLPG]                  = handle_invlpg,
-       [EXIT_REASON_VMCALL]                  = handle_vmcall,
-       [EXIT_REASON_VMCLEAR]                 = handle_vmx_insn,
-       [EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
-       [EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
-       [EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
-       [EXIT_REASON_VMREAD]                  = handle_vmx_insn,
-       [EXIT_REASON_VMRESUME]                = handle_vmx_insn,
-       [EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
-       [EXIT_REASON_VMOFF]                   = handle_vmx_insn,
-       [EXIT_REASON_VMON]                    = handle_vmx_insn,
-       [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
-       [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
-       [EXIT_REASON_WBINVD]                  = handle_wbinvd,
-       [EXIT_REASON_XSETBV]                  = handle_xsetbv,
-       [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
-       [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
-       [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
-       [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
-       [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
-       [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_invalid_op,
-       [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
-};
-
-static const int kvm_vmx_max_exit_handlers =
-       ARRAY_SIZE(kvm_vmx_exit_handlers);
 
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
+static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
 {
-       *info1 = vmcs_readl(EXIT_QUALIFICATION);
-       *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
+       struct vmcs02_list *item;
+       list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
+               if (item->vmptr == vmx->nested.current_vmptr) {
+                       list_move(&item->list, &vmx->nested.vmcs02_pool);
+                       return &item->vmcs02;
+               }
+
+       if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
+               /* Recycle the least recently used VMCS. */
+               item = list_entry(vmx->nested.vmcs02_pool.prev,
+                       struct vmcs02_list, list);
+               item->vmptr = vmx->nested.current_vmptr;
+               list_move(&item->list, &vmx->nested.vmcs02_pool);
+               return &item->vmcs02;
+       }
+
+       /* Create a new VMCS */
+       item = (struct vmcs02_list *)
+               kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
+       if (!item)
+               return NULL;
+       item->vmcs02.vmcs = alloc_vmcs();
+       if (!item->vmcs02.vmcs) {
+               kfree(item);
+               return NULL;
+       }
+       loaded_vmcs_init(&item->vmcs02);
+       item->vmptr = vmx->nested.current_vmptr;
+       list_add(&(item->list), &(vmx->nested.vmcs02_pool));
+       vmx->nested.vmcs02_num++;
+       return &item->vmcs02;
+}
+
+/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
+static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
+{
+       struct vmcs02_list *item;
+       list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
+               if (item->vmptr == vmptr) {
+                       free_loaded_vmcs(&item->vmcs02);
+                       list_del(&item->list);
+                       kfree(item);
+                       vmx->nested.vmcs02_num--;
+                       return;
+               }
 }
 
 /*
- * The guest has exited.  See if we can fix it or if we need userspace
- * assistance.
+ * Free all VMCSs saved for this vcpu, except the one pointed to by
+ * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one
+ * currently used, if running L2), and vmcs01 when running L2.
  */
-static int vmx_handle_exit(struct kvm_vcpu *vcpu)
+static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
+{
+       struct vmcs02_list *item, *n;
+       list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
+               if (vmx->loaded_vmcs != &item->vmcs02)
+                       free_loaded_vmcs(&item->vmcs02);
+               list_del(&item->list);
+               kfree(item);
+       }
+       vmx->nested.vmcs02_num = 0;
+
+       if (vmx->loaded_vmcs != &vmx->vmcs01)
+               free_loaded_vmcs(&vmx->vmcs01);
+}
+
+/*
+ * Emulate the VMXON instruction.
+ * Currently, we just remember that VMX is active, and do not save or even
+ * inspect the argument to VMXON (the so-called "VMXON pointer") because we
+ * do not currently need to store anything in that guest-allocated memory
+ * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
+ * argument is different from the VMXON pointer (which the spec says they do).
+ */
+static int handle_vmon(struct kvm_vcpu *vcpu)
 {
+       struct kvm_segment cs;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       u32 exit_reason = vmx->exit_reason;
-       u32 vectoring_info = vmx->idt_vectoring_info;
 
-       trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
+       /* The Intel VMX Instruction Reference lists a bunch of bits that
+        * are prerequisite to running VMXON, most notably cr4.VMXE must be
+        * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
+        * Otherwise, we should fail with #UD. We test these now:
+        */
+       if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
+           !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
+           (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
 
-       /* If guest state is invalid, start emulating */
-       if (vmx->emulation_required && emulate_invalid_guest_state)
-               return handle_invalid_guest_state(vcpu);
+       vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+       if (is_long_mode(vcpu) && !cs.l) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
 
-       if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
-               vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
-               vcpu->run->fail_entry.hardware_entry_failure_reason
-                       = exit_reason;
-               return 0;
+       if (vmx_get_cpl(vcpu)) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
        }
 
-       if (unlikely(vmx->fail)) {
-               vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
-               vcpu->run->fail_entry.hardware_entry_failure_reason
-                       = vmcs_read32(VM_INSTRUCTION_ERROR);
+       INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
+       vmx->nested.vmcs02_num = 0;
+
+       vmx->nested.vmxon = true;
+
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
+
+/*
+ * Intel's VMX Instruction Reference specifies a common set of prerequisites
+ * for running VMX instructions (except VMXON, whose prerequisites are
+ * slightly different). It also specifies what exception to inject otherwise.
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
+{
+       struct kvm_segment cs;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (!vmx->nested.vmxon) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
                return 0;
        }
 
-       if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
-                       (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
-                       exit_reason != EXIT_REASON_EPT_VIOLATION &&
-                       exit_reason != EXIT_REASON_TASK_SWITCH))
-               printk(KERN_WARNING "%s: unexpected, valid vectoring info "
-                      "(0x%x) and exit reason is 0x%x\n",
-                      __func__, vectoring_info, exit_reason);
-
-       if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
-               if (vmx_interrupt_allowed(vcpu)) {
-                       vmx->soft_vnmi_blocked = 0;
-               } else if (vmx->vnmi_blocked_time > 1000000000LL &&
-                          vcpu->arch.nmi_pending) {
-                       /*
-                        * This CPU don't support us in finding the end of an
-                        * NMI-blocked window if the guest runs with IRQs
-                        * disabled. So we pull the trigger after 1 s of
-                        * futile waiting, but inform the user about this.
-                        */
-                       printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
-                              "state on VCPU %d after 1 s timeout\n",
-                              __func__, vcpu->vcpu_id);
-                       vmx->soft_vnmi_blocked = 0;
-               }
+       vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+       if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
+           (is_long_mode(vcpu) && !cs.l)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 0;
        }
 
-       if (exit_reason < kvm_vmx_max_exit_handlers
-           && kvm_vmx_exit_handlers[exit_reason])
-               return kvm_vmx_exit_handlers[exit_reason](vcpu);
-       else {
-               vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
-               vcpu->run->hw.hardware_exit_reason = exit_reason;
+       if (vmx_get_cpl(vcpu)) {
+               kvm_inject_gp(vcpu, 0);
+               return 0;
        }
-       return 0;
+
+       return 1;
 }
 
-static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+/*
+ * Free whatever needs to be freed from vmx->nested when L1 goes down, or
+ * just stops using VMX.
+ */
+static void free_nested(struct vcpu_vmx *vmx)
 {
-       if (irr == -1 || tpr < irr) {
-               vmcs_write32(TPR_THRESHOLD, 0);
+       if (!vmx->nested.vmxon)
                return;
+       vmx->nested.vmxon = false;
+       if (vmx->nested.current_vmptr != -1ull) {
+               kunmap(vmx->nested.current_vmcs12_page);
+               nested_release_page(vmx->nested.current_vmcs12_page);
+               vmx->nested.current_vmptr = -1ull;
+               vmx->nested.current_vmcs12 = NULL;
+       }
+       /* Unpin physical memory we referred to in current vmcs02 */
+       if (vmx->nested.apic_access_page) {
+               nested_release_page(vmx->nested.apic_access_page);
+               vmx->nested.apic_access_page = 0;
        }
 
-       vmcs_write32(TPR_THRESHOLD, irr);
+       nested_free_all_saved_vmcss(vmx);
 }
 
-static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
+/* Emulate the VMXOFF instruction */
+static int handle_vmoff(struct kvm_vcpu *vcpu)
 {
-       u32 exit_intr_info;
+       if (!nested_vmx_check_permission(vcpu))
+               return 1;
+       free_nested(to_vmx(vcpu));
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
 
-       if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
-             || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI))
-               return;
+/*
+ * Decode the memory-address operand of a vmx instruction, as recorded on an
+ * exit caused by such an instruction (run by a guest hypervisor).
+ * On success, returns 0. When the operand is invalid, returns 1 and throws
+ * #UD or #GP.
+ */
+static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
+                                unsigned long exit_qualification,
+                                u32 vmx_instruction_info, gva_t *ret)
+{
+       /*
+        * According to Vol. 3B, "Information for VM Exits Due to Instruction
+        * Execution", on an exit, vmx_instruction_info holds most of the
+        * addressing components of the operand. Only the displacement part
+        * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
+        * For how an actual address is calculated from all these components,
+        * refer to Vol. 1, "Operand Addressing".
+        */
+       int  scaling = vmx_instruction_info & 3;
+       int  addr_size = (vmx_instruction_info >> 7) & 7;
+       bool is_reg = vmx_instruction_info & (1u << 10);
+       int  seg_reg = (vmx_instruction_info >> 15) & 7;
+       int  index_reg = (vmx_instruction_info >> 18) & 0xf;
+       bool index_is_valid = !(vmx_instruction_info & (1u << 22));
+       int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
+       bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
+
+       if (is_reg) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
 
-       vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-       exit_intr_info = vmx->exit_intr_info;
+       /* Addr = segment_base + offset */
+       /* offset = base + [index * scale] + displacement */
+       *ret = vmx_get_segment_base(vcpu, seg_reg);
+       if (base_is_valid)
+               *ret += kvm_register_read(vcpu, base_reg);
+       if (index_is_valid)
+               *ret += kvm_register_read(vcpu, index_reg)<<scaling;
+       *ret += exit_qualification; /* holds the displacement */
 
-       /* Handle machine checks before interrupts are enabled */
-       if (is_machine_check(exit_intr_info))
-               kvm_machine_check();
+       if (addr_size == 1) /* 32 bit */
+               *ret &= 0xffffffff;
 
-       /* We need to handle NMIs before interrupts are enabled */
-       if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
-           (exit_intr_info & INTR_INFO_VALID_MASK)) {
+       /*
+        * TODO: throw #GP (and return 1) in various cases that the VM*
+        * instructions require it - e.g., offset beyond segment limit,
+        * unusable or unreadable/unwritable segment, non-canonical 64-bit
+        * address, and so on. Currently these are not checked.
+        */
+       return 0;
+}
+
+/*
+ * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
+ * set the success or error code of an emulated VMX instruction, as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions".
+ */
+static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
+{
+       vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
+                       & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+                           X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+}
+
+static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+{
+       vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+                       & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+                           X86_EFLAGS_SF | X86_EFLAGS_OF))
+                       | X86_EFLAGS_CF);
+}
+
+static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
+                                       u32 vm_instruction_error)
+{
+       if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
+               /*
+                * failValid writes the error number to the current VMCS, which
+                * can't be done if there isn't a current VMCS.
+                */
+               nested_vmx_failInvalid(vcpu);
+               return;
+       }
+       vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+                       & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+                           X86_EFLAGS_SF | X86_EFLAGS_OF))
+                       | X86_EFLAGS_ZF);
+       get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
+}
+
+/* Emulate the VMCLEAR instruction */
+static int handle_vmclear(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       gva_t gva;
+       gpa_t vmptr;
+       struct vmcs12 *vmcs12;
+       struct page *page;
+       struct x86_exception e;
+
+       if (!nested_vmx_check_permission(vcpu))
+               return 1;
+
+       if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+                       vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
+               return 1;
+
+       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
+                               sizeof(vmptr), &e)) {
+               kvm_inject_page_fault(vcpu, &e);
+               return 1;
+       }
+
+       if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
+               nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
+               skip_emulated_instruction(vcpu);
+               return 1;
+       }
+
+       if (vmptr == vmx->nested.current_vmptr) {
+               kunmap(vmx->nested.current_vmcs12_page);
+               nested_release_page(vmx->nested.current_vmcs12_page);
+               vmx->nested.current_vmptr = -1ull;
+               vmx->nested.current_vmcs12 = NULL;
+       }
+
+       page = nested_get_page(vcpu, vmptr);
+       if (page == NULL) {
+               /*
+                * For accurate processor emulation, VMCLEAR beyond available
+                * physical memory should do nothing at all. However, it is
+                * possible that a nested vmx bug, not a guest hypervisor bug,
+                * resulted in this case, so let's shut down before doing any
+                * more damage:
+                */
+               kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+               return 1;
+       }
+       vmcs12 = kmap(page);
+       vmcs12->launch_state = 0;
+       kunmap(page);
+       nested_release_page(page);
+
+       nested_free_vmcs02(vmx, vmptr);
+
+       skip_emulated_instruction(vcpu);
+       nested_vmx_succeed(vcpu);
+       return 1;
+}
+
+static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
+
+/* Emulate the VMLAUNCH instruction */
+static int handle_vmlaunch(struct kvm_vcpu *vcpu)
+{
+       return nested_vmx_run(vcpu, true);
+}
+
+/* Emulate the VMRESUME instruction */
+static int handle_vmresume(struct kvm_vcpu *vcpu)
+{
+
+       return nested_vmx_run(vcpu, false);
+}
+
+enum vmcs_field_type {
+       VMCS_FIELD_TYPE_U16 = 0,
+       VMCS_FIELD_TYPE_U64 = 1,
+       VMCS_FIELD_TYPE_U32 = 2,
+       VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
+};
+
+static inline int vmcs_field_type(unsigned long field)
+{
+       if (0x1 & field)        /* the *_HIGH fields are all 32 bit */
+               return VMCS_FIELD_TYPE_U32;
+       return (field >> 13) & 0x3 ;
+}
+
+static inline int vmcs_field_readonly(unsigned long field)
+{
+       return (((field >> 10) & 0x3) == 1);
+}
+
+/*
+ * Read a vmcs12 field. Since these can have varying lengths and we return
+ * one type, we chose the biggest type (u64) and zero-extend the return value
+ * to that size. Note that the caller, handle_vmread, might need to use only
+ * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
+ * 64-bit fields are to be returned).
+ */
+static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
+                                       unsigned long field, u64 *ret)
+{
+       short offset = vmcs_field_to_offset(field);
+       char *p;
+
+       if (offset < 0)
+               return 0;
+
+       p = ((char *)(get_vmcs12(vcpu))) + offset;
+
+       switch (vmcs_field_type(field)) {
+       case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+               *ret = *((natural_width *)p);
+               return 1;
+       case VMCS_FIELD_TYPE_U16:
+               *ret = *((u16 *)p);
+               return 1;
+       case VMCS_FIELD_TYPE_U32:
+               *ret = *((u32 *)p);
+               return 1;
+       case VMCS_FIELD_TYPE_U64:
+               *ret = *((u64 *)p);
+               return 1;
+       default:
+               return 0; /* can never happen. */
+       }
+}
+
+/*
+ * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
+ * used before) all generate the same failure when it is missing.
+ */
+static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       if (vmx->nested.current_vmptr == -1ull) {
+               nested_vmx_failInvalid(vcpu);
+               skip_emulated_instruction(vcpu);
+               return 0;
+       }
+       return 1;
+}
+
+static int handle_vmread(struct kvm_vcpu *vcpu)
+{
+       unsigned long field;
+       u64 field_value;
+       unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+       gva_t gva = 0;
+
+       if (!nested_vmx_check_permission(vcpu) ||
+           !nested_vmx_check_vmcs12(vcpu))
+               return 1;
+
+       /* Decode instruction info and find the field to read */
+       field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+       /* Read the field, zero-extended to a u64 field_value */
+       if (!vmcs12_read_any(vcpu, field, &field_value)) {
+               nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+               skip_emulated_instruction(vcpu);
+               return 1;
+       }
+       /*
+        * Now copy part of this value to register or memory, as requested.
+        * Note that the number of bits actually copied is 32 or 64 depending
+        * on the guest's mode (32 or 64 bit), not on the given field's length.
+        */
+       if (vmx_instruction_info & (1u << 10)) {
+               kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
+                       field_value);
+       } else {
+               if (get_vmx_mem_address(vcpu, exit_qualification,
+                               vmx_instruction_info, &gva))
+                       return 1;
+               /* _system ok, as nested_vmx_check_permission verified cpl=0 */
+               kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
+                            &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
+       }
+
+       nested_vmx_succeed(vcpu);
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
+
+
+static int handle_vmwrite(struct kvm_vcpu *vcpu)
+{
+       unsigned long field;
+       gva_t gva;
+       unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+       char *p;
+       short offset;
+       /* The value to write might be 32 or 64 bits, depending on L1's long
+        * mode, and eventually we need to write that into a field of several
+        * possible lengths. The code below first zero-extends the value to 64
+        * bit (field_value), and then copies only the appropriate number of
+        * bits into the vmcs12 field.
+        */
+       u64 field_value = 0;
+       struct x86_exception e;
+
+       if (!nested_vmx_check_permission(vcpu) ||
+           !nested_vmx_check_vmcs12(vcpu))
+               return 1;
+
+       if (vmx_instruction_info & (1u << 10))
+               field_value = kvm_register_read(vcpu,
+                       (((vmx_instruction_info) >> 3) & 0xf));
+       else {
+               if (get_vmx_mem_address(vcpu, exit_qualification,
+                               vmx_instruction_info, &gva))
+                       return 1;
+               if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
+                          &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) {
+                       kvm_inject_page_fault(vcpu, &e);
+                       return 1;
+               }
+       }
+
+
+       field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+       if (vmcs_field_readonly(field)) {
+               nested_vmx_failValid(vcpu,
+                       VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
+               skip_emulated_instruction(vcpu);
+               return 1;
+       }
+
+       offset = vmcs_field_to_offset(field);
+       if (offset < 0) {
+               nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+               skip_emulated_instruction(vcpu);
+               return 1;
+       }
+       p = ((char *) get_vmcs12(vcpu)) + offset;
+
+       switch (vmcs_field_type(field)) {
+       case VMCS_FIELD_TYPE_U16:
+               *(u16 *)p = field_value;
+               break;
+       case VMCS_FIELD_TYPE_U32:
+               *(u32 *)p = field_value;
+               break;
+       case VMCS_FIELD_TYPE_U64:
+               *(u64 *)p = field_value;
+               break;
+       case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+               *(natural_width *)p = field_value;
+               break;
+       default:
+               nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+               skip_emulated_instruction(vcpu);
+               return 1;
+       }
+
+       nested_vmx_succeed(vcpu);
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
+
+/*
+ * Emulate the VMPTRLD instruction.
+ *
+ * Reads the 64-bit operand from guest virtual memory and, if it names a
+ * page-aligned vmcs12 with the expected revision id, maps that page and makes
+ * it the vcpu's current vmcs12 (unmapping and releasing the previous one, if
+ * any). Always returns 1; failures are reported to the guest through
+ * nested_vmx_failValid()/nested_vmx_failInvalid() or an injected page fault.
+ */
+static int handle_vmptrld(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct x86_exception fault;
+       struct vmcs12 *candidate;
+       struct page *pg;
+       unsigned long qual;
+       u32 insn_info;
+       gpa_t vmptr;
+       gva_t gva;
+
+       if (!nested_vmx_check_permission(vcpu))
+               return 1;
+
+       qual = vmcs_readl(EXIT_QUALIFICATION);
+       insn_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+       if (get_vmx_mem_address(vcpu, qual, insn_info, &gva))
+               return 1;
+
+       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
+                               sizeof(vmptr), &fault)) {
+               kvm_inject_page_fault(vcpu, &fault);
+               return 1;
+       }
+
+       if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
+               nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
+               goto done;
+       }
+
+       /* Loading the vmcs12 that is already current: nothing to remap. */
+       if (vmx->nested.current_vmptr == vmptr)
+               goto succeed;
+
+       pg = nested_get_page(vcpu, vmptr);
+       if (pg == NULL) {
+               nested_vmx_failInvalid(vcpu);
+               goto done;
+       }
+
+       candidate = kmap(pg);
+       if (candidate->revision_id != VMCS12_REVISION) {
+               /* Undo the mapping before reporting the bad revision id. */
+               kunmap(pg);
+               nested_release_page_clean(pg);
+               nested_vmx_failValid(vcpu,
+                       VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+               goto done;
+       }
+
+       /* -1ull marks "no current vmcs12"; otherwise drop the old mapping. */
+       if (vmx->nested.current_vmptr != -1ull) {
+               kunmap(vmx->nested.current_vmcs12_page);
+               nested_release_page(vmx->nested.current_vmcs12_page);
+       }
+
+       vmx->nested.current_vmptr = vmptr;
+       vmx->nested.current_vmcs12 = candidate;
+       vmx->nested.current_vmcs12_page = pg;
+
+succeed:
+       nested_vmx_succeed(vcpu);
+done:
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
+
+/*
+ * Emulate the VMPTRST instruction: store the vcpu's current vmcs12 pointer
+ * (nested.current_vmptr) as a 64-bit value at the guest virtual address
+ * decoded from the instruction. Always returns 1; a failed write is turned
+ * into an injected page fault.
+ */
+static int handle_vmptrst(struct kvm_vcpu *vcpu)
+{
+       unsigned long qual = vmcs_readl(EXIT_QUALIFICATION);
+       u32 insn_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+       struct x86_exception fault;
+       gva_t dest_gva;
+       void *src;
+
+       if (!nested_vmx_check_permission(vcpu) ||
+           get_vmx_mem_address(vcpu, qual, insn_info, &dest_gva))
+               return 1;
+
+       src = (void *)&to_vmx(vcpu)->nested.current_vmptr;
+
+       /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */
+       if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, dest_gva,
+                                       src, sizeof(u64), &fault)) {
+               kvm_inject_page_fault(vcpu, &fault);
+               return 1;
+       }
+
+       nested_vmx_succeed(vcpu);
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
+
+/*
+ * The exit handlers return 1 if the exit was handled fully and guest execution
+ * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
+ * to be done to userspace and return 0.
+ *
+ * The table is indexed directly by the hardware exit reason. Because it is
+ * built with designated initializers, any exit reason that is not listed
+ * below is left as a NULL slot; vmx_handle_exit() checks both the array bound
+ * (kvm_vmx_max_exit_handlers) and the slot for NULL before dispatching, and
+ * reports KVM_EXIT_UNKNOWN to userspace otherwise.
+ */
+static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
+       [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
+       [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
+       [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
+       [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
+       [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
+       [EXIT_REASON_CR_ACCESS]               = handle_cr,
+       [EXIT_REASON_DR_ACCESS]               = handle_dr,
+       [EXIT_REASON_CPUID]                   = handle_cpuid,
+       [EXIT_REASON_MSR_READ]                = handle_rdmsr,
+       [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
+       [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
+       [EXIT_REASON_HLT]                     = handle_halt,
+       [EXIT_REASON_INVD]                    = handle_invd,
+       [EXIT_REASON_INVLPG]                  = handle_invlpg,
+       [EXIT_REASON_VMCALL]                  = handle_vmcall,
+       [EXIT_REASON_VMCLEAR]                 = handle_vmclear,
+       [EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
+       [EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
+       [EXIT_REASON_VMPTRST]                 = handle_vmptrst,
+       [EXIT_REASON_VMREAD]                  = handle_vmread,
+       [EXIT_REASON_VMRESUME]                = handle_vmresume,
+       [EXIT_REASON_VMWRITE]                 = handle_vmwrite,
+       [EXIT_REASON_VMOFF]                   = handle_vmoff,
+       [EXIT_REASON_VMON]                    = handle_vmon,
+       [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
+       [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
+       [EXIT_REASON_WBINVD]                  = handle_wbinvd,
+       [EXIT_REASON_XSETBV]                  = handle_xsetbv,
+       [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
+       [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
+       [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
+       [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
+       [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
+       [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_invalid_op,
+       [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
+};
+
+static const int kvm_vmx_max_exit_handlers =
+       ARRAY_SIZE(kvm_vmx_exit_handlers);
+
+/*
+ * Return 1 if we should exit from L2 to L1 to handle an MSR access,
+ * rather than handle it ourselves in L0. I.e., check whether L1 expressed
+ * disinterest in the current event (read or write a specific MSR) by using an
+ * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
+ *
+ * @vcpu:        vcpu whose RCX holds the MSR index being accessed
+ * @vmcs12:      L1's vmcs for the running L2 guest
+ * @exit_reason: EXIT_REASON_MSR_READ or EXIT_REASON_MSR_WRITE
+ *
+ * The exit is also reflected to L1 when L1 does not use MSR bitmaps, when the
+ * MSR index falls outside both bitmap ranges, or when the bitmap byte cannot
+ * be read from guest memory (so the decision is never based on an
+ * uninitialized value).
+ */
+static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
+       struct vmcs12 *vmcs12, u32 exit_reason)
+{
+       u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+       unsigned char b;
+       gpa_t bitmap;
+
+       if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+               return 1;
+
+       /*
+        * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
+        * for the four combinations of read/write and low/high MSR numbers.
+        * First we need to figure out which of the four to use:
+        */
+       bitmap = vmcs12->msr_bitmap;
+       if (exit_reason == EXIT_REASON_MSR_WRITE)
+               bitmap += 2048;
+       if (msr_index >= 0xc0000000) {
+               msr_index -= 0xc0000000;
+               bitmap += 1024;
+       }
+
+       if (msr_index >= 1024*8)
+               return 1; /* let L1 handle the wrong parameter */
+
+       /*
+        * Then read the msr_index'th bit from this bitmap. If the guest
+        * physical address is not readable, let L1 handle the exit instead
+        * of testing a byte that was never filled in.
+        */
+       if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
+               return 1;
+
+       return 1 & (b >> (msr_index & 7));
+}
+
+/*
+ * Return 1 if a CR-access exit taken while running L2 must be reflected to
+ * L1, or 0 if L0 should handle it itself. The decision is taken from L1's
+ * own intercept settings in vmcs12 (guest/host masks, read shadows, CR3
+ * target values and the CR3/CR8 load/store exiting controls).
+ *
+ * The exit qualification is decoded as: bits 3:0 = control register number,
+ * bits 5:4 = access type, bits 11:8 = general purpose register operand.
+ */
+static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
+       struct vmcs12 *vmcs12)
+{
+       unsigned long qual = vmcs_readl(EXIT_QUALIFICATION);
+       int cr_num = qual & 15;
+       int gpr = (qual >> 8) & 15;
+       int access = (qual >> 4) & 3;
+       unsigned long val = kvm_register_read(vcpu, gpr);
+
+       switch (access) {
+       case 0: /* mov to cr */
+               switch (cr_num) {
+               case 0:
+                       /* L1 owns a bit that would differ from its shadow */
+                       return (vmcs12->cr0_guest_host_mask &
+                               (val ^ vmcs12->cr0_read_shadow)) != 0;
+               case 3:
+                       /* A write of one of L1's CR3 targets never exits */
+                       if ((vmcs12->cr3_target_count >= 1 &&
+                            vmcs12->cr3_target_value0 == val) ||
+                           (vmcs12->cr3_target_count >= 2 &&
+                            vmcs12->cr3_target_value1 == val) ||
+                           (vmcs12->cr3_target_count >= 3 &&
+                            vmcs12->cr3_target_value2 == val) ||
+                           (vmcs12->cr3_target_count >= 4 &&
+                            vmcs12->cr3_target_value3 == val))
+                               return 0;
+                       return nested_cpu_has(vmcs12,
+                                             CPU_BASED_CR3_LOAD_EXITING);
+               case 4:
+                       return (vmcs12->cr4_guest_host_mask &
+                               (vmcs12->cr4_read_shadow ^ val)) != 0;
+               case 8:
+                       return nested_cpu_has(vmcs12,
+                                             CPU_BASED_CR8_LOAD_EXITING);
+               }
+               return 0;
+       case 1: /* mov from cr */
+               switch (cr_num) {
+               case 3:
+                       return (vmcs12->cpu_based_vm_exec_control &
+                               CPU_BASED_CR3_STORE_EXITING) != 0;
+               case 8:
+                       return (vmcs12->cpu_based_vm_exec_control &
+                               CPU_BASED_CR8_STORE_EXITING) != 0;
+               }
+               return 0;
+       case 2: /* clts */
+               return (vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
+                      (vmcs12->cr0_read_shadow & X86_CR0_TS);
+       case 3: /* lmsw */
+               /*
+                * lmsw can change bits 1..3 of cr0, and only set bit 0 of
+                * cr0. Other attempted changes are ignored, with no exit.
+                */
+               if (vmcs12->cr0_guest_host_mask & 0xe &
+                   (val ^ vmcs12->cr0_read_shadow))
+                       return 1;
+               return (vmcs12->cr0_guest_host_mask & 0x1) &&
+                      !(vmcs12->cr0_read_shadow & 0x1) &&
+                      (val & 0x1);
+       }
+       return 0;
+}
+
+/*
+ * Decide who handles an exit taken while running L2: return 1 to reflect it
+ * to L1, or 0 to handle it in L0 and then continue L2. Only call this when
+ * in is_guest_mode (L2).
+ *
+ * Nothing is reflected while a nested run is still pending, and a failed
+ * VM entry (vmx->fail) is always reflected after being logged.
+ */
+static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
+{
+       u32 reason = vmcs_read32(VM_EXIT_REASON);
+       u32 intr = vmcs_read32(VM_EXIT_INTR_INFO);
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct vmcs12 *v12 = get_vmcs12(vcpu);
+
+       if (vmx->nested.nested_run_pending)
+               return 0;
+
+       if (unlikely(vmx->fail)) {
+               printk(KERN_INFO "%s failed vm entry %x\n",
+                      __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+               return 1;
+       }
+
+       switch (reason) {
+       /* Exits that are always handled by L0 */
+       case EXIT_REASON_EXTERNAL_INTERRUPT:
+       case EXIT_REASON_MCE_DURING_VMENTRY:
+       case EXIT_REASON_EPT_VIOLATION:
+       case EXIT_REASON_EPT_MISCONFIG:
+               return 0;
+
+       case EXIT_REASON_EXCEPTION_NMI:
+               if (!is_exception(intr))
+                       return 0;
+               /* Page faults follow enable_ept, not L1's exception bitmap */
+               if (is_page_fault(intr))
+                       return enable_ept;
+               return v12->exception_bitmap &
+                               (1u << (intr & INTR_INFO_VECTOR_MASK));
+
+       /* Exits that depend on a primary execution control set by L1 */
+       case EXIT_REASON_HLT:
+               return nested_cpu_has(v12, CPU_BASED_HLT_EXITING);
+       case EXIT_REASON_INVLPG:
+               return nested_cpu_has(v12, CPU_BASED_INVLPG_EXITING);
+       case EXIT_REASON_RDPMC:
+               return nested_cpu_has(v12, CPU_BASED_RDPMC_EXITING);
+       case EXIT_REASON_RDTSC:
+               return nested_cpu_has(v12, CPU_BASED_RDTSC_EXITING);
+       case EXIT_REASON_DR_ACCESS:
+               return nested_cpu_has(v12, CPU_BASED_MOV_DR_EXITING);
+       case EXIT_REASON_MWAIT_INSTRUCTION:
+               return nested_cpu_has(v12, CPU_BASED_MWAIT_EXITING);
+       case EXIT_REASON_MONITOR_INSTRUCTION:
+               return nested_cpu_has(v12, CPU_BASED_MONITOR_EXITING);
+       case EXIT_REASON_PAUSE_INSTRUCTION:
+               return nested_cpu_has(v12, CPU_BASED_PAUSE_EXITING) ||
+                       nested_cpu_has2(v12,
+                               SECONDARY_EXEC_PAUSE_LOOP_EXITING);
+
+       /* Exits that depend on a secondary execution control set by L1 */
+       case EXIT_REASON_APIC_ACCESS:
+               return nested_cpu_has2(v12,
+                       SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+       case EXIT_REASON_WBINVD:
+               return nested_cpu_has2(v12, SECONDARY_EXEC_WBINVD_EXITING);
+
+       /* Exits that need a closer look at L1's intercept settings */
+       case EXIT_REASON_CR_ACCESS:
+               return nested_vmx_exit_handled_cr(vcpu, v12);
+       case EXIT_REASON_MSR_READ:
+       case EXIT_REASON_MSR_WRITE:
+               return nested_vmx_exit_handled_msr(vcpu, v12, reason);
+
+       /*
+        * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit
+        * (aka Interrupt Window Exiting) only when L1 turned it on,
+        * so if we got a PENDING_INTERRUPT exit, this must be for L1.
+        * Same for NMI Window Exiting.
+        */
+       case EXIT_REASON_PENDING_INTERRUPT:
+       case EXIT_REASON_NMI_WINDOW:
+       /*
+        * VMX instructions trap unconditionally. This allows L1 to
+        * emulate them for its L2 guest, i.e., allows 3-level nesting!
+        */
+       case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
+       case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
+       case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
+       case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
+       case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+       /* TODO: support IO bitmaps */
+       case EXIT_REASON_IO_INSTRUCTION:
+       /* Remaining exits, including unknown ones, always go to L1 */
+       case EXIT_REASON_TRIPLE_FAULT:
+       case EXIT_REASON_TASK_SWITCH:
+       case EXIT_REASON_CPUID:
+       case EXIT_REASON_INVD:
+       case EXIT_REASON_INVALID_STATE:
+       case EXIT_REASON_TPR_BELOW_THRESHOLD:
+       case EXIT_REASON_XSETBV:
+       default:
+               return 1;
+       }
+}
+
+/*
+ * Report the two VMCS exit fields for the last exit: *info1 receives the
+ * exit qualification and *info2 the VM-exit interruption information.
+ */
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+       unsigned long qual = vmcs_readl(EXIT_QUALIFICATION);
+
+       *info1 = qual;
+       *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
+}
+
+/*
+ * The guest has exited.  See if we can fix it or if we need userspace
+ * assistance.
+ *
+ * Returns the handler's result (1 = resume the guest, 0 = go to userspace
+ * with vcpu->run filled in). Exits taken in guest mode that belong to L1 are
+ * turned into a nested vmexit before any L0 handler runs.
+ */
+static int vmx_handle_exit(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u32 reason = vmx->exit_reason;
+       u32 vec_info = vmx->idt_vectoring_info;
+
+       trace_kvm_exit(reason, vcpu, KVM_ISA_VMX);
+
+       /* If guest state is invalid, start emulating */
+       if (vmx->emulation_required && emulate_invalid_guest_state)
+               return handle_invalid_guest_state(vcpu);
+
+       /*
+        * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
+        * we did not inject a still-pending event to L1 now because of
+        * nested_run_pending, we need to re-enable this bit.
+        */
+       if (vmx->nested.nested_run_pending)
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+       /* A run is pending only right after L1 executed VMLAUNCH/VMRESUME */
+       vmx->nested.nested_run_pending =
+               !is_guest_mode(vcpu) &&
+               (reason == EXIT_REASON_VMLAUNCH ||
+                reason == EXIT_REASON_VMRESUME);
+
+       if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
+               nested_vmx_vmexit(vcpu);
+               return 1;
+       }
+
+       if (reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+               vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+               vcpu->run->fail_entry.hardware_entry_failure_reason = reason;
+               return 0;
+       }
+
+       if (unlikely(vmx->fail)) {
+               vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+               vcpu->run->fail_entry.hardware_entry_failure_reason =
+                       vmcs_read32(VM_INSTRUCTION_ERROR);
+               return 0;
+       }
+
+       /* Valid vectoring info is only expected for these three exits */
+       if (vec_info & VECTORING_INFO_VALID_MASK) {
+               if (reason != EXIT_REASON_EXCEPTION_NMI &&
+                   reason != EXIT_REASON_EPT_VIOLATION &&
+                   reason != EXIT_REASON_TASK_SWITCH)
+                       printk(KERN_WARNING "%s: unexpected, valid vectoring info "
+                              "(0x%x) and exit reason is 0x%x\n",
+                              __func__, vec_info, reason);
+       }
+
+       if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
+           !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
+                                       get_vmcs12(vcpu), vcpu)))) {
+               if (vmx_interrupt_allowed(vcpu)) {
+                       vmx->soft_vnmi_blocked = 0;
+               } else if (vmx->vnmi_blocked_time > 1000000000LL &&
+                          vcpu->arch.nmi_pending) {
+                       /*
+                        * This CPU doesn't help us find the end of an
+                        * NMI-blocked window if the guest runs with IRQs
+                        * disabled. So we pull the trigger after 1 s of
+                        * futile waiting, but inform the user about this.
+                        */
+                       printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
+                              "state on VCPU %d after 1 s timeout\n",
+                              __func__, vcpu->vcpu_id);
+                       vmx->soft_vnmi_blocked = 0;
+               }
+       }
+
+       /* No handler registered for this exit reason: punt to userspace */
+       if (reason >= kvm_vmx_max_exit_handlers ||
+           !kvm_vmx_exit_handlers[reason]) {
+               vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+               vcpu->run->hw.hardware_exit_reason = reason;
+               return 0;
+       }
+
+       return kvm_vmx_exit_handlers[reason](vcpu);
+}
+
+/*
+ * Program TPR_THRESHOLD from the current tpr and the pending irr: the
+ * threshold is 0 when there is no pending interrupt (irr == -1) or when
+ * tpr is already below irr, and irr otherwise. @vcpu is not used here.
+ */
+static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+{
+       int threshold = irr;
+
+       if (irr == -1 || tpr < irr)
+               threshold = 0;
+
+       vmcs_write32(TPR_THRESHOLD, threshold);
+}
+
+static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
+{
+       u32 exit_intr_info;
+
+       if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
+             || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI))
+               return;
+
+       vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+       exit_intr_info = vmx->exit_intr_info;
+
+       /* Handle machine checks before interrupts are enabled */
+       if (is_machine_check(exit_intr_info))
+               kvm_machine_check();
+
+       /* We need to handle NMIs before interrupts are enabled */
+       if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
+           (exit_intr_info & INTR_INFO_VALID_MASK)) {
                kvm_before_handle_nmi(&vmx->vcpu);
                asm("int $2");
                kvm_after_handle_nmi(&vmx->vcpu);
@@ -4118,6 +5908,8 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
 
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
+       if (is_guest_mode(&vmx->vcpu))
+               return;
        __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
                                  VM_EXIT_INSTRUCTION_LEN,
                                  IDT_VECTORING_ERROR_CODE);
@@ -4125,6 +5917,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 
 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
 {
+       if (is_guest_mode(vcpu))
+               return;
        __vmx_complete_interrupts(to_vmx(vcpu),
                                  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
                                  VM_ENTRY_INSTRUCTION_LEN,
@@ -4145,6 +5939,21 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+       if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
+               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+               if (vmcs12->idt_vectoring_info_field &
+                               VECTORING_INFO_VALID_MASK) {
+                       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+                               vmcs12->idt_vectoring_info_field);
+                       vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+                               vmcs12->vm_exit_instruction_len);
+                       if (vmcs12->idt_vectoring_info_field &
+                                       VECTORING_INFO_DELIVER_CODE_MASK)
+                               vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+                                       vmcs12->idt_vectoring_error_code);
+               }
+       }
+
        /* Record the guest's net vcpu time for enforced NMI injections. */
        if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
                vmx->entry_time = ktime_get();
@@ -4167,6 +5976,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
                vmx_set_interrupt_shadow(vcpu, 0);
 
+       vmx->__launched = vmx->loaded_vmcs->launched;
        asm(
                /* Store host registers */
                "push %%"R"dx; push %%"R"bp;"
@@ -4237,7 +6047,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
                "pop  %%"R"bp; pop  %%"R"dx \n\t"
                "setbe %c[fail](%0) \n\t"
              : : "c"(vmx), "d"((unsigned long)HOST_RSP),
-               [launched]"i"(offsetof(struct vcpu_vmx, launched)),
+               [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
                [fail]"i"(offsetof(struct vcpu_vmx, fail)),
                [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
                [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
@@ -4276,8 +6086,19 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
        vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
+       if (is_guest_mode(vcpu)) {
+               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+               vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
+               if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
+                       vmcs12->idt_vectoring_error_code =
+                               vmcs_read32(IDT_VECTORING_ERROR_CODE);
+                       vmcs12->vm_exit_instruction_len =
+                               vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+               }
+       }
+
        asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
-       vmx->launched = 1;
+       vmx->loaded_vmcs->launched = 1;
 
        vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
 
@@ -4289,41 +6110,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 #undef R
 #undef Q
 
-static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-       if (vmx->vmcs) {
-               vcpu_clear(vmx);
-               free_vmcs(vmx->vmcs);
-               vmx->vmcs = NULL;
-       }
-}
-
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
        free_vpid(vmx);
-       vmx_free_vmcs(vcpu);
+       free_nested(vmx);
+       free_loaded_vmcs(vmx->loaded_vmcs);
        kfree(vmx->guest_msrs);
        kvm_vcpu_uninit(vcpu);
        kmem_cache_free(kvm_vcpu_cache, vmx);
 }
 
-static inline void vmcs_init(struct vmcs *vmcs)
-{
-       u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
-
-       if (!vmm_exclusive)
-               kvm_cpu_vmxon(phys_addr);
-
-       vmcs_clear(vmcs);
-
-       if (!vmm_exclusive)
-               kvm_cpu_vmxoff();
-}
-
 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 {
        int err;
@@ -4345,11 +6143,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
                goto uninit_vcpu;
        }
 
-       vmx->vmcs = alloc_vmcs();
-       if (!vmx->vmcs)
+       vmx->loaded_vmcs = &vmx->vmcs01;
+       vmx->loaded_vmcs->vmcs = alloc_vmcs();
+       if (!vmx->loaded_vmcs->vmcs)
                goto free_msrs;
-
-       vmcs_init(vmx->vmcs);
+       if (!vmm_exclusive)
+               kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
+       loaded_vmcs_init(vmx->loaded_vmcs);
+       if (!vmm_exclusive)
+               kvm_cpu_vmxoff();
 
        cpu = get_cpu();
        vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4375,10 +6177,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
                        goto free_vmcs;
        }
 
+       vmx->nested.current_vmptr = -1ull;
+       vmx->nested.current_vmcs12 = NULL;
+
        return &vmx->vcpu;
 
 free_vmcs:
-       free_vmcs(vmx->vmcs);
+       free_vmcs(vmx->loaded_vmcs->vmcs);
 free_msrs:
        kfree(vmx->guest_msrs);
 uninit_vcpu:
@@ -4512,6 +6317,650 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 {
+       if (func == 1 && nested)
+               entry->ecx |= bit(X86_FEATURE_VMX);
+}
+
+/*
+ * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
+ * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
+ * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
+ * guest in a way that will both be appropriate to L1's requests, and our
+ * needs. In addition to modifying the active vmcs (which is vmcs02), this
+ * function also has additional necessary side-effects, like setting various
+ * vcpu->arch fields.
+ *
+ * NOTE(review): the secondary-controls branch declares its own block-scoped
+ * exec_control, shadowing the function-level variable of the same name; the
+ * outer one is first assigned only after that branch.
+ */
+static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u32 exec_control;
+
+       vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
+       vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
+       vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
+       vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
+       vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
+       vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
+       vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
+       vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
+       vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
+       vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
+       vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
+       vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
+       vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
+       vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
+       vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
+       vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
+       vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
+       vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
+       vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
+       vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+       vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
+       vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
+       vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
+       vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
+       vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
+       vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
+       vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
+       vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
+       vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
+       vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
+       vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
+       vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
+       vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
+       vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
+       vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
+       vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+
+       vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+               vmcs12->vm_entry_intr_info_field);
+       vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+               vmcs12->vm_entry_exception_error_code);
+       vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+               vmcs12->vm_entry_instruction_len);
+       vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+               vmcs12->guest_interruptibility_info);
+       vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
+       vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
+       vmcs_writel(GUEST_DR7, vmcs12->guest_dr7);
+       vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
+       vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+               vmcs12->guest_pending_dbg_exceptions);
+       vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
+       vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+
+       vmcs_write64(VMCS_LINK_POINTER, -1ull);
+
+       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+               (vmcs_config.pin_based_exec_ctrl |
+                vmcs12->pin_based_vm_exec_control));
+
+       /*
+        * Whether page-faults are trapped is determined by a combination of
+        * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
+        * If enable_ept, L0 doesn't care about page faults and we should
+        * set all of these to L1's desires. However, if !enable_ept, L0 does
+        * care about (at least some) page faults, and because it is not easy
+        * (if at all possible?) to merge L0 and L1's desires, we simply ask
+        * to exit on each and every L2 page fault. This is done by setting
+        * MASK=MATCH=0 and (see below) EB.PF=1.
+        * Note that below we don't need special code to set EB.PF beyond the
+        * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
+        * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
+        * !enable_ept, EB.PF is 1, so the "or" will always be 1.
+        *
+        * A problem with this approach (when !enable_ept) is that L1 may be
+        * injected with more page faults than it asked for. This could have
+        * caused problems, but in practice existing hypervisors don't care.
+        * To fix this, we will need to emulate the PFEC checking (on the L1
+        * page tables), using walk_addr(), when injecting PFs to L1.
+        */
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+               enable_ept ? vmcs12->page_fault_error_code_mask : 0);
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+               enable_ept ? vmcs12->page_fault_error_code_match : 0);
+
+       if (cpu_has_secondary_exec_ctrls()) {
+               u32 exec_control = vmx_secondary_exec_control(vmx);
+               if (!vmx->rdtscp_enabled)
+                       exec_control &= ~SECONDARY_EXEC_RDTSCP;
+               /* Take the following fields only from vmcs12 */
+               exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+               if (nested_cpu_has(vmcs12,
+                               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
+                       exec_control |= vmcs12->secondary_vm_exec_control;
+
+               if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
+                       /*
+                        * Translate L1 physical address to host physical
+                        * address for vmcs02. Keep the page pinned, so this
+                        * physical address remains valid. We keep a reference
+                        * to it so we can release it later.
+                        */
+                       if (vmx->nested.apic_access_page) /* shouldn't happen */
+                               nested_release_page(vmx->nested.apic_access_page);
+                       vmx->nested.apic_access_page =
+                               nested_get_page(vcpu, vmcs12->apic_access_addr);
+                       /*
+                        * If translation failed, no matter: This feature asks
+                        * to exit when accessing the given address, and if it
+                        * can never be accessed, this feature won't do
+                        * anything anyway.
+                        */
+                       if (!vmx->nested.apic_access_page)
+                               exec_control &=
+                                 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+                       else
+                               vmcs_write64(APIC_ACCESS_ADDR,
+                                 page_to_phys(vmx->nested.apic_access_page));
+               }
+
+               vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+       }
+
+
+       /*
+        * Set host-state according to L0's settings (vmcs12 is irrelevant here)
+        * Some constant fields are set here by vmx_set_constant_host_state().
+        * Other fields are different per CPU, and will be set later when
+        * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
+        */
+       vmx_set_constant_host_state();
+
+       /*
+        * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
+        * entry, but only if the current (host) sp changed from the value
+        * we wrote last (vmx->host_rsp). This cache is no longer relevant
+        * if we switch vmcs, and rather than hold a separate cache per vmcs,
+        * here we just force the write to happen on entry.
+        */
+       vmx->host_rsp = 0;
+
+       exec_control = vmx_exec_control(vmx); /* L0's desires */
+       exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+       exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+       exec_control &= ~CPU_BASED_TPR_SHADOW;
+       exec_control |= vmcs12->cpu_based_vm_exec_control;
+       /*
+        * Merging of IO and MSR bitmaps not currently supported.
+        * Rather, exit every time.
+        */
+       exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
+       exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
+       exec_control |= CPU_BASED_UNCOND_IO_EXITING;
+
+       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+       /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
+        * bitwise-or of what L1 wants to trap for L2, and what we want to
+        * trap. Note that CR0.TS also needs updating - we do this later.
+        */
+       update_exception_bitmap(vcpu);
+       vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
+       vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+       /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
+       vmcs_write32(VM_EXIT_CONTROLS,
+               vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
+       vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
+               (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
+
+       if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
+               vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
+       else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+               vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
+
+
+       set_cr4_guest_host_mask(vmx);
+
+       vmcs_write64(TSC_OFFSET,
+               vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
+
+       if (enable_vpid) {
+               /*
+                * Trivially support vpid by letting L2s share their parent
+                * L1's vpid. TODO: move to a more elaborate solution, giving
+                * each L2 its own vpid and exposing the vpid feature to L1.
+                */
+               vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+               vmx_flush_tlb(vcpu);
+       }
+
+       if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
+               vcpu->arch.efer = vmcs12->guest_ia32_efer;
+       if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
+               vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+       else
+               vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+       /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
+       vmx_set_efer(vcpu, vcpu->arch.efer);
+
+       /*
+        * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
+        * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
+        * The CR0_READ_SHADOW is what L2 should have expected to read given
+        * the specifications by L1; It's not enough to take
+        * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
+        * have more bits than L1 expected.
+        */
+       vmx_set_cr0(vcpu, vmcs12->guest_cr0);
+       vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
+
+       vmx_set_cr4(vcpu, vmcs12->guest_cr4);
+       vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
+
+       /* shadow page tables on either EPT or shadow page tables */
+       kvm_set_cr3(vcpu, vmcs12->guest_cr3);
+       kvm_mmu_reset_context(vcpu);
+
+       kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
+       kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
+}
+
+/*
+ * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
+ * for running an L2 nested guest.
+ *
+ * @launch distinguishes the two instructions: it is compared against
+ * vmcs12->launch_state and selects which VMXERR_* code is reported on a
+ * mismatch (VMLAUNCH_NONCLEAR_VMCS when set, VMRESUME_NONLAUNCHED_VMCS
+ * otherwise).
+ *
+ * Returns 1 on every path, including the emulated failure paths, except
+ * -ENOMEM when nested_get_current_vmcs02() yields no vmcs02.
+ */
+static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
+{
+       struct vmcs12 *vmcs12;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int cpu;
+       struct loaded_vmcs *vmcs02;
+
+       if (!nested_vmx_check_permission(vcpu) ||
+           !nested_vmx_check_vmcs12(vcpu))
+               return 1;
+
+       skip_emulated_instruction(vcpu);
+       vmcs12 = get_vmcs12(vcpu);
+
+       /*
+        * The nested entry process starts with enforcing various prerequisites
+        * on vmcs12 as required by the Intel SDM, and act appropriately when
+        * they fail: As the SDM explains, some conditions should cause the
+        * instruction to fail, while others will cause the instruction to seem
+        * to succeed, but return an EXIT_REASON_INVALID_STATE.
+        * To speed up the normal (success) code path, we should avoid checking
+        * for misconfigurations which will anyway be caught by the processor
+        * when using the merged vmcs02.
+        */
+       if (vmcs12->launch_state == launch) {
+               nested_vmx_failValid(vcpu,
+                       launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
+                              : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
+               return 1;
+       }
+
+       if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
+                       !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) {
+               /*TODO: Also verify bits beyond physical address width are 0*/
+               nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+               return 1;
+       }
+
+       if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+                       !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) {
+               /*TODO: Also verify bits beyond physical address width are 0*/
+               nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+               return 1;
+       }
+
+       if (vmcs12->vm_entry_msr_load_count > 0 ||
+           vmcs12->vm_exit_msr_load_count > 0 ||
+           vmcs12->vm_exit_msr_store_count > 0) {
+               if (printk_ratelimit())
+                       printk(KERN_WARNING
+                         "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__);
+               nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+               return 1;
+       }
+
+       if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
+             nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) ||
+           !vmx_control_verify(vmcs12->secondary_vm_exec_control,
+             nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
+           !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
+             nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
+           !vmx_control_verify(vmcs12->vm_exit_controls,
+             nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) ||
+           !vmx_control_verify(vmcs12->vm_entry_controls,
+             nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high))
+       {
+               nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+               return 1;
+       }
+
+       if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
+           ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
+               nested_vmx_failValid(vcpu,
+                       VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
+               return 1;
+       }
+
+       if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
+           ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
+               nested_vmx_entry_failure(vcpu, vmcs12,
+                       EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+               return 1;
+       }
+       if (vmcs12->vmcs_link_pointer != -1ull) {
+               nested_vmx_entry_failure(vcpu, vmcs12,
+                       EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
+               return 1;
+       }
+
+       /*
+        * We're finally done with prerequisite checking, and can start with
+        * the nested entry.
+        */
+
+       vmcs02 = nested_get_current_vmcs02(vmx);
+       if (!vmcs02)
+               return -ENOMEM;
+
+       enter_guest_mode(vcpu);
+
+       vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
+
+       cpu = get_cpu();
+       vmx->loaded_vmcs = vmcs02;
+       vmx_vcpu_put(vcpu);
+       vmx_vcpu_load(vcpu, cpu);
+       vcpu->cpu = cpu;
+       put_cpu();
+
+       vmcs12->launch_state = 1;
+
+       prepare_vmcs02(vcpu, vmcs12);
+
+       /*
+        * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
+        * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
+        * returned as far as L1 is concerned. It will only return (and set
+        * the success flag) when L2 exits (see nested_vmx_vmexit()).
+        */
+       return 1;
+}
+
+/*
+ * Compute the value to store in vmcs12.guest_cr0 on a nested exit from L2 to
+ * L1. vmcs12.guest_cr0 may be stale because L2 can change some cr0 bits
+ * directly (CR0_GUEST_HOST_MASK), and vmcs02 GUEST_CR0 alone is not the
+ * answer either. Each bit is taken from one of three places:
+ *  1. Bits trapped by neither L0 nor L1 were written directly by L2, so
+ *     they are read from vmcs02 GUEST_CR0. (Checking that L0 didn't trap the
+ *     bit suffices, because whatever L1 traps, L0 traps too.)
+ *  2. Bits L1 asked to trap (and hence L0 trapped as well) cannot have been
+ *     changed by L2, and L1 knows that, so the old vmcs12.guest_cr0 bit is
+ *     kept. vmcs02 GUEST_CR0 is irrelevant for them, since L0 may have set a
+ *     bit it traps to anything.
+ *  3. Bits trapped by L0 but not by L1: L1 believes the guest may have
+ *     changed them, but L0 did not necessarily let the change reach
+ *     GUEST_CR0 and kept it in vmcs02 CR0_READ_SHADOW instead, so they are
+ *     read from there.
+ */
+static inline unsigned long
+vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+       unsigned long l2_owned = vcpu->arch.cr0_guest_owned_bits;
+       unsigned long l1_trapped = vmcs12->cr0_guest_host_mask;
+       unsigned long cr0;
+
+       /* 1: bits nobody trapped */
+       cr0 = vmcs_readl(GUEST_CR0) & l2_owned;
+       /* 2: bits L1 trapped */
+       cr0 |= vmcs12->guest_cr0 & l1_trapped;
+       /* 3: bits only L0 trapped */
+       cr0 |= vmcs_readl(CR0_READ_SHADOW) & ~(l1_trapped | l2_owned);
+
+       return cr0;
+}
+
+/*
+ * CR4 counterpart of vmcs12_guest_cr0(): assemble the value for
+ * vmcs12.guest_cr4 from vmcs02 GUEST_CR4 (guest-owned bits), the old
+ * vmcs12.guest_cr4 (bits in vmcs12's cr4_guest_host_mask) and vmcs02
+ * CR4_READ_SHADOW (all remaining bits).
+ */
+static inline unsigned long
+vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+       unsigned long l2_owned = vcpu->arch.cr4_guest_owned_bits;
+       unsigned long l1_trapped = vmcs12->cr4_guest_host_mask;
+       unsigned long cr4;
+
+       /* 1: guest-owned bits */
+       cr4 = vmcs_readl(GUEST_CR4) & l2_owned;
+       /* 2: bits in vmcs12's mask */
+       cr4 |= vmcs12->guest_cr4 & l1_trapped;
+       /* 3: everything else */
+       cr4 |= vmcs_readl(CR4_READ_SHADOW) & ~(l1_trapped | l2_owned);
+
+       return cr4;
+}
+
+/*
+ * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
+ * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
+ * and this function updates it to reflect the changes to the guest state while
+ * L2 was running (and perhaps made some exits which were handled directly by L0
+ * without going back to L1), and to reflect the exit reason.
+ * Note that we do not have to copy here all VMCS fields, just those that
+ * could have changed by the L2 guest or the exit - i.e., the guest-state and
+ * exit-information fields only. Other fields are modified by L1 with VMWRITE,
+ * which already writes to vmcs12 directly.
+ *
+ * The values are read from the currently loaded vmcs (via vmcs_read*) and
+ * from the vcpu register cache, and are stored into @vmcs12.
+ */
+void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+       /* update guest state fields: */
+       vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
+       vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
+
+       kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+       vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+       vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
+       vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+
+       vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+       vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+       vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+       vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+       vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+       vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+       vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+       vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+       vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+       vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+       vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+       vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+       vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+       vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+       vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+       vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+       vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+       vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+       vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+       vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+       vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+       vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+       vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+       vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+       vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+       vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+       vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+       vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+       vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+       vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+       vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+       vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+       vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+       vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+       vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+       vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+
+       vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
+       vmcs12->guest_interruptibility_info =
+               vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+       vmcs12->guest_pending_dbg_exceptions =
+               vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+
+       /* TODO: These cannot have changed unless we have MSR bitmaps and
+        * the relevant bit asks not to trap the change */
+       vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+       /*
+        * VM_EXIT_SAVE_IA32_PAT is a VM-exit control, so it has to be looked
+        * up in vm_exit_controls, not in vm_entry_controls.
+        */
+       if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
+               vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+       vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+       vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+       vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+
+       /* update exit information fields: */
+
+       vmcs12->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
+       vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+       vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+       vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+       vmcs12->idt_vectoring_info_field =
+               vmcs_read32(IDT_VECTORING_INFO_FIELD);
+       vmcs12->idt_vectoring_error_code =
+               vmcs_read32(IDT_VECTORING_ERROR_CODE);
+       vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+       vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+
+       /* clear vm-entry fields which are to be cleared on exit */
+       if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+               vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
+}
+
+/*
+ * A part of what we need to do when the nested L2 guest exits and we want to
+ * run its L1 parent, is to reset L1's guest state to the host state specified
+ * in vmcs12.
+ * This function is to be called not only on normal nested exit, but also on
+ * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
+ * Failures During or After Loading Guest State").
+ * This function should be called when the active VMCS is L1's (vmcs01).
+ */
+void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+       if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
+               vcpu->arch.efer = vmcs12->host_ia32_efer;
+       if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+               vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+       else
+               vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+       vmx_set_efer(vcpu, vcpu->arch.efer);
+
+       kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
+       kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
+       /*
+        * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
+        * actually changed, because it depends on the current state of
+        * fpu_active (which may have changed).
+        * Note that vmx_set_cr0 refers to efer set above.
+        * NOTE(review): the call below is kvm_set_cr0(), not vmx_set_cr0()
+        * itself; presumably it ends up in vmx_set_cr0() - confirm.
+        */
+       kvm_set_cr0(vcpu, vmcs12->host_cr0);
+       /*
+        * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
+        * to apply the same changes to L1's vmcs. We just set cr0 correctly,
+        * but we also need to update cr0_guest_host_mask and exception_bitmap.
+        */
+       update_exception_bitmap(vcpu);
+       vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
+       vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+       /*
+        * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
+        * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask();
+        */
+       vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+       kvm_set_cr4(vcpu, vmcs12->host_cr4);
+
+       /* shadow page tables on either EPT or shadow page tables */
+       kvm_set_cr3(vcpu, vmcs12->host_cr3);
+       kvm_mmu_reset_context(vcpu);
+
+       if (enable_vpid) {
+               /*
+                * Trivially support vpid by letting L2s share their parent
+                * L1's vpid. TODO: move to a more elaborate solution, giving
+                * each L2 its own vpid and exposing the vpid feature to L1.
+                */
+               vmx_flush_tlb(vcpu);
+       }
+
+
+       vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
+       vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
+       vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
+       vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
+       vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
+       vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base);
+       vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base);
+       vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
+       vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
+       vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
+       vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
+       vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
+       vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
+       vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
+       vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
+
+       if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
+               vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
+       if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+               vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
+                       vmcs12->host_ia32_perf_global_ctrl);
+}
+
+/*
+ * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
+ * and modify vmcs12 to make it see what it would expect to see there if
+ * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
+ *
+ * Sequence: leave guest mode, sync vmcs12 from the loaded vmcs, switch the
+ * loaded vmcs back to vmcs01, load L1's state from vmcs12's host-state
+ * fields, and finally set the VMLAUNCH/VMRESUME result L1 will observe.
+ */
+static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int cpu;
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+       leave_guest_mode(vcpu);
+       /* must run before the vmcs switch below: it reads the loaded vmcs */
+       prepare_vmcs12(vcpu, vmcs12);
+
+       cpu = get_cpu();
+       vmx->loaded_vmcs = &vmx->vmcs01;
+       vmx_vcpu_put(vcpu);
+       vmx_vcpu_load(vcpu, cpu);
+       vcpu->cpu = cpu;
+       put_cpu();
+
+       /* if no vmcs02 cache requested, remove the one we used */
+       if (VMCS02_POOL_SIZE == 0)
+               nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
+
+       load_vmcs12_host_state(vcpu, vmcs12);
+
+       /* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */
+       vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+
+       /* This is needed for same reason as it was needed in prepare_vmcs02 */
+       vmx->host_rsp = 0;
+
+       /* Unpin physical memory we referred to in vmcs02 */
+       if (vmx->nested.apic_access_page) {
+               nested_release_page(vmx->nested.apic_access_page);
+               /* pointer field: clear it with NULL, not the integer 0 */
+               vmx->nested.apic_access_page = NULL;
+       }
+
+       /*
+        * Exiting from L2 to L1, we're now back to L1 which thinks it just
+        * finished a VMLAUNCH or VMRESUME instruction, so we need to set the
+        * success or failure flag accordingly.
+        */
+       if (unlikely(vmx->fail)) {
+               vmx->fail = 0;
+               nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
+       } else
+               nested_vmx_succeed(vcpu);
+}
+
+/*
+ * Emulate L1's failed attempt to enter L2. Such a failure is a subset of a
+ * normal exit (see 23.7 "VM-entry failures during or after loading guest
+ * state", which also lists the acceptable exit-reason and exit-qualification
+ * values). Only to be used before L2 has actually run and while vmcs01 is
+ * current: no leave_guest_mode() and no vmcs switch happens here.
+ */
+static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
+                       struct vmcs12 *vmcs12,
+                       u32 reason, unsigned long qualification)
+{
+       /* L1 continues from the host state it recorded in vmcs12 */
+       load_vmcs12_host_state(vcpu, vmcs12);
+
+       /* report the exit, flagged as a failed VM entry */
+       vmcs12->exit_qualification = qualification;
+       vmcs12->vm_exit_reason = VMX_EXIT_REASONS_FAILED_VMENTRY | reason;
+
+       nested_vmx_succeed(vcpu);
 }
 
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
@@ -4670,16 +7119,13 @@ static int __init vmx_init(void)
        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
 
        if (enable_ept) {
-               bypass_guest_pf = 0;
                kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
                                VMX_EPT_EXECUTABLE_MASK);
+               ept_set_mmio_spte_mask();
                kvm_enable_tdp();
        } else
                kvm_disable_tdp();
 
-       if (bypass_guest_pf)
-               kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
-
        return 0;
 
 out3:
index 77c9d86..84a28ea 100644 (file)
@@ -347,6 +347,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
        vcpu->arch.cr2 = fault->address;
        kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
 }
+EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 
 void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 {
@@ -579,6 +580,22 @@ static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
        return best && (best->ecx & bit(X86_FEATURE_XSAVE));
 }
 
+static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
+{
+       struct kvm_cpuid_entry2 *best;
+
+       best = kvm_find_cpuid_entry(vcpu, 7, 0);
+       return best && (best->ebx & bit(X86_FEATURE_SMEP));
+}
+
+static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
+{
+       struct kvm_cpuid_entry2 *best;
+
+       best = kvm_find_cpuid_entry(vcpu, 7, 0);
+       return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
+}
+
 static void update_cpuid(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpuid_entry2 *best;
@@ -598,14 +615,20 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
        unsigned long old_cr4 = kvm_read_cr4(vcpu);
-       unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
-
+       unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
+                                  X86_CR4_PAE | X86_CR4_SMEP;
        if (cr4 & CR4_RESERVED_BITS)
                return 1;
 
        if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
                return 1;
 
+       if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
+               return 1;
+
+       if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS))
+               return 1;
+
        if (is_long_mode(vcpu)) {
                if (!(cr4 & X86_CR4_PAE))
                        return 1;
@@ -615,11 +638,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                                   kvm_read_cr3(vcpu)))
                return 1;
 
-       if (cr4 & X86_CR4_VMXE)
+       if (kvm_x86_ops->set_cr4(vcpu, cr4))
                return 1;
 
-       kvm_x86_ops->set_cr4(vcpu, cr4);
-
        if ((cr4 ^ old_cr4) & pdptr_bits)
                kvm_mmu_reset_context(vcpu);
 
@@ -787,12 +808,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN    8
+#define KVM_SAVE_MSRS_BEGIN    9
 static u32 msrs_to_save[] = {
        MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
        MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
        HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
-       HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
+       HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
        MSR_STAR,
 #ifdef CONFIG_X86_64
@@ -1388,7 +1409,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                        return 1;
                kvm_x86_ops->patch_hypercall(vcpu, instructions);
                ((unsigned char *)instructions)[3] = 0xc3; /* ret */
-               if (copy_to_user((void __user *)addr, instructions, 4))
+               if (__copy_to_user((void __user *)addr, instructions, 4))
                        return 1;
                kvm->arch.hv_hypercall = data;
                break;
@@ -1415,7 +1436,7 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                                  HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
                if (kvm_is_error_hva(addr))
                        return 1;
-               if (clear_user((void __user *)addr, PAGE_SIZE))
+               if (__clear_user((void __user *)addr, PAGE_SIZE))
                        return 1;
                vcpu->arch.hv_vapic = data;
                break;
@@ -1467,6 +1488,35 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
        }
 }
 
+static void accumulate_steal_time(struct kvm_vcpu *vcpu)
+{
+       u64 delta;
+
+       if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+               return;
+
+       delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
+       vcpu->arch.st.last_steal = current->sched_info.run_delay;
+       vcpu->arch.st.accum_steal = delta;
+}
+
+static void record_steal_time(struct kvm_vcpu *vcpu)
+{
+       if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+               return;
+
+       if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+               &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
+               return;
+
+       vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
+       vcpu->arch.st.steal.version += 2;
+       vcpu->arch.st.accum_steal = 0;
+
+       kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+               &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+}
+
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
        switch (msr) {
@@ -1549,6 +1599,33 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                if (kvm_pv_enable_async_pf(vcpu, data))
                        return 1;
                break;
+       case MSR_KVM_STEAL_TIME:
+
+               if (unlikely(!sched_info_on()))
+                       return 1;
+
+               if (data & KVM_STEAL_RESERVED_MASK)
+                       return 1;
+
+               if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
+                                                       data & KVM_STEAL_VALID_BITS))
+                       return 1;
+
+               vcpu->arch.st.msr_val = data;
+
+               if (!(data & KVM_MSR_ENABLED))
+                       break;
+
+               vcpu->arch.st.last_steal = current->sched_info.run_delay;
+
+               preempt_disable();
+               accumulate_steal_time(vcpu);
+               preempt_enable();
+
+               kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
+
+               break;
+
        case MSR_IA32_MCG_CTL:
        case MSR_IA32_MCG_STATUS:
        case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1834,6 +1911,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case MSR_KVM_ASYNC_PF_EN:
                data = vcpu->arch.apf.msr_val;
                break;
+       case MSR_KVM_STEAL_TIME:
+               data = vcpu->arch.st.msr_val;
+               break;
        case MSR_IA32_P5_MC_ADDR:
        case MSR_IA32_P5_MC_TYPE:
        case MSR_IA32_MCG_CAP:
@@ -2145,6 +2225,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                        kvm_migrate_timers(vcpu);
                vcpu->cpu = cpu;
        }
+
+       accumulate_steal_time(vcpu);
+       kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -2283,6 +2366,13 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        entry->flags = 0;
 }
 
+static bool supported_xcr0_bit(unsigned bit)
+{
+       u64 mask = ((u64)1 << bit);
+
+       return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
+}
+
 #define F(x) bit(X86_FEATURE_##x)
 
 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
@@ -2328,7 +2418,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                0 /* Reserved, DCA */ | F(XMM4_1) |
                F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
                0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
-               F(F16C);
+               F(F16C) | F(RDRAND);
        /* cpuid 0x80000001.ecx */
        const u32 kvm_supported_word6_x86_features =
                F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
@@ -2342,6 +2432,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
                F(PMM) | F(PMM_EN);
 
+       /* cpuid 7.0.ebx */
+       const u32 kvm_supported_word9_x86_features =
+               F(SMEP) | F(FSGSBASE) | F(ERMS);
+
        /* all calls to cpuid_count() should be made on the same cpu */
        get_cpu();
        do_cpuid_1_ent(entry, function, index);
@@ -2376,7 +2470,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                }
                break;
        }
-       /* function 4 and 0xb have additional index. */
+       /* function 4 has additional index. */
        case 4: {
                int i, cache_type;
 
@@ -2393,6 +2487,22 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                }
                break;
        }
+       case 7: {
+               entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+               /* Mask ebx against host capability word 9 */
+               if (index == 0) {
+                       entry->ebx &= kvm_supported_word9_x86_features;
+                       cpuid_mask(&entry->ebx, 9);
+               } else
+                       entry->ebx = 0;
+               entry->eax = 0;
+               entry->ecx = 0;
+               entry->edx = 0;
+               break;
+       }
+       case 9:
+               break;
+       /* function 0xb has additional index. */
        case 0xb: {
                int i, level_type;
 
@@ -2410,16 +2520,17 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                break;
        }
        case 0xd: {
-               int i;
+               int idx, i;
 
                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-               for (i = 1; *nent < maxnent && i < 64; ++i) {
-                       if (entry[i].eax == 0)
+               for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
+                       do_cpuid_1_ent(&entry[i], function, idx);
+                       if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
                                continue;
-                       do_cpuid_1_ent(&entry[i], function, i);
                        entry[i].flags |=
                               KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                        ++*nent;
+                       ++i;
                }
                break;
        }
@@ -2438,6 +2549,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                             (1 << KVM_FEATURE_CLOCKSOURCE2) |
                             (1 << KVM_FEATURE_ASYNC_PF) |
                             (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+
+               if (sched_info_on())
+                       entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
+
                entry->ebx = 0;
                entry->ecx = 0;
                entry->edx = 0;
@@ -2451,6 +2566,24 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                entry->ecx &= kvm_supported_word6_x86_features;
                cpuid_mask(&entry->ecx, 6);
                break;
+       case 0x80000008: {
+               unsigned g_phys_as = (entry->eax >> 16) & 0xff;
+               unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
+               unsigned phys_as = entry->eax & 0xff;
+
+               if (!g_phys_as)
+                       g_phys_as = phys_as;
+               entry->eax = g_phys_as | (virt_as << 8);
+               entry->ebx = entry->edx = 0;
+               break;
+       }
+       case 0x80000019:
+               entry->ecx = entry->edx = 0;
+               break;
+       case 0x8000001a:
+               break;
+       case 0x8000001d:
+               break;
        /*Add support for Centaur's CPUID instruction*/
        case 0xC0000000:
                /*Just support up to 0xC0000004 now*/
@@ -2460,10 +2593,16 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                entry->edx &= kvm_supported_word5_x86_features;
                cpuid_mask(&entry->edx, 5);
                break;
+       case 3: /* Processor serial number */
+       case 5: /* MONITOR/MWAIT */
+       case 6: /* Thermal management */
+       case 0xA: /* Architectural Performance Monitoring */
+       case 0x80000007: /* Advanced power management */
        case 0xC0000002:
        case 0xC0000003:
        case 0xC0000004:
-               /*Now nothing to do, reserved for the future*/
+       default:
+               entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
                break;
        }
 
@@ -3817,7 +3956,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
                                          exception);
 }
 
-static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
+int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
                               gva_t addr, void *val, unsigned int bytes,
                               struct x86_exception *exception)
 {
@@ -3827,6 +3966,7 @@ static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
                                          exception);
 }
+EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
 
 static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
                                      gva_t addr, void *val, unsigned int bytes,
@@ -3836,7 +3976,7 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
 }
 
-static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
+int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
                                       gva_t addr, void *val,
                                       unsigned int bytes,
                                       struct x86_exception *exception)
@@ -3868,6 +4008,42 @@ static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 out:
        return r;
 }
+EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
+
+static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+                               gpa_t *gpa, struct x86_exception *exception,
+                               bool write)
+{
+       u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+
+       if (vcpu_match_mmio_gva(vcpu, gva) &&
+                 check_write_user_access(vcpu, write, access,
+                 vcpu->arch.access)) {
+               *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
+                                       (gva & (PAGE_SIZE - 1));
+               trace_vcpu_match_mmio(gva, *gpa, write, false);
+               return 1;
+       }
+
+       if (write)
+               access |= PFERR_WRITE_MASK;
+
+       *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+
+       if (*gpa == UNMAPPED_GVA)
+               return -1;
+
+       /* For APIC access vmexit */
+       if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+               return 1;
+
+       if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
+               trace_vcpu_match_mmio(gva, *gpa, write, true);
+               return 1;
+       }
+
+       return 0;
+}
 
 static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
                                  unsigned long addr,
@@ -3876,8 +4052,8 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
                                  struct x86_exception *exception)
 {
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-       gpa_t                 gpa;
-       int handled;
+       gpa_t gpa;
+       int handled, ret;
 
        if (vcpu->mmio_read_completed) {
                memcpy(val, vcpu->mmio_data, bytes);
@@ -3887,13 +4063,12 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
                return X86EMUL_CONTINUE;
        }
 
-       gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
+       ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false);
 
-       if (gpa == UNMAPPED_GVA)
+       if (ret < 0)
                return X86EMUL_PROPAGATE_FAULT;
 
-       /* For APIC access vmexit */
-       if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+       if (ret)
                goto mmio;
 
        if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
@@ -3944,16 +4119,16 @@ static int emulator_write_emulated_onepage(unsigned long addr,
                                           struct x86_exception *exception,
                                           struct kvm_vcpu *vcpu)
 {
-       gpa_t                 gpa;
-       int handled;
+       gpa_t gpa;
+       int handled, ret;
 
-       gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
+       ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true);
 
-       if (gpa == UNMAPPED_GVA)
+       if (ret < 0)
                return X86EMUL_PROPAGATE_FAULT;
 
        /* For APIC access vmexit */
-       if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+       if (ret)
                goto mmio;
 
        if (emulator_write_phys(vcpu, gpa, val, bytes))
@@ -4473,9 +4648,24 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
                kvm_queue_exception(vcpu, ctxt->exception.vector);
 }
 
+static void init_decode_cache(struct x86_emulate_ctxt *ctxt,
+                             const unsigned long *regs)
+{
+       memset(&ctxt->twobyte, 0,
+              (void *)&ctxt->regs - (void *)&ctxt->twobyte);
+       memcpy(ctxt->regs, regs, sizeof(ctxt->regs));
+
+       ctxt->fetch.start = 0;
+       ctxt->fetch.end = 0;
+       ctxt->io_read.pos = 0;
+       ctxt->io_read.end = 0;
+       ctxt->mem_read.pos = 0;
+       ctxt->mem_read.end = 0;
+}
+
 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 {
-       struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
        int cs_db, cs_l;
 
        /*
@@ -4488,40 +4678,38 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 
        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 
-       vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
-       vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
-       vcpu->arch.emulate_ctxt.mode =
-               (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
-               (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
-               ? X86EMUL_MODE_VM86 : cs_l
-               ? X86EMUL_MODE_PROT64 : cs_db
-               ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-       vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu);
-       memset(c, 0, sizeof(struct decode_cache));
-       memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+       ctxt->eflags = kvm_get_rflags(vcpu);
+       ctxt->eip = kvm_rip_read(vcpu);
+       ctxt->mode = (!is_protmode(vcpu))               ? X86EMUL_MODE_REAL :
+                    (ctxt->eflags & X86_EFLAGS_VM)     ? X86EMUL_MODE_VM86 :
+                    cs_l                               ? X86EMUL_MODE_PROT64 :
+                    cs_db                              ? X86EMUL_MODE_PROT32 :
+                                                         X86EMUL_MODE_PROT16;
+       ctxt->guest_mode = is_guest_mode(vcpu);
+
+       init_decode_cache(ctxt, vcpu->arch.regs);
        vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
 }
 
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
 {
-       struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
        int ret;
 
        init_emulate_ctxt(vcpu);
 
-       vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
-       vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
-       vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip +
-                                                                inc_eip;
-       ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
+       ctxt->op_bytes = 2;
+       ctxt->ad_bytes = 2;
+       ctxt->_eip = ctxt->eip + inc_eip;
+       ret = emulate_int_real(ctxt, irq);
 
        if (ret != X86EMUL_CONTINUE)
                return EMULATE_FAIL;
 
-       vcpu->arch.emulate_ctxt.eip = c->eip;
-       memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
-       kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
-       kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+       ctxt->eip = ctxt->_eip;
+       memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
+       kvm_rip_write(vcpu, ctxt->eip);
+       kvm_set_rflags(vcpu, ctxt->eflags);
 
        if (irq == NMI_VECTOR)
                vcpu->arch.nmi_pending = false;
@@ -4582,21 +4770,21 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
                            int insn_len)
 {
        int r;
-       struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
        bool writeback = true;
 
        kvm_clear_exception_queue(vcpu);
 
        if (!(emulation_type & EMULTYPE_NO_DECODE)) {
                init_emulate_ctxt(vcpu);
-               vcpu->arch.emulate_ctxt.interruptibility = 0;
-               vcpu->arch.emulate_ctxt.have_exception = false;
-               vcpu->arch.emulate_ctxt.perm_ok = false;
+               ctxt->interruptibility = 0;
+               ctxt->have_exception = false;
+               ctxt->perm_ok = false;
 
-               vcpu->arch.emulate_ctxt.only_vendor_specific_insn
+               ctxt->only_vendor_specific_insn
                        = emulation_type & EMULTYPE_TRAP_UD;
 
-               r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
+               r = x86_decode_insn(ctxt, insn, insn_len);
 
                trace_kvm_emulate_insn_start(vcpu);
                ++vcpu->stat.insn_emulation;
@@ -4612,7 +4800,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
        }
 
        if (emulation_type & EMULTYPE_SKIP) {
-               kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
+               kvm_rip_write(vcpu, ctxt->_eip);
                return EMULATE_DONE;
        }
 
@@ -4620,11 +4808,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
           changes registers values  during IO operation */
        if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
                vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
-               memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+               memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs);
        }
 
 restart:
-       r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
+       r = x86_emulate_insn(ctxt);
 
        if (r == EMULATION_INTERCEPTED)
                return EMULATE_DONE;
@@ -4636,7 +4824,7 @@ restart:
                return handle_emulation_failure(vcpu);
        }
 
-       if (vcpu->arch.emulate_ctxt.have_exception) {
+       if (ctxt->have_exception) {
                inject_emulated_exception(vcpu);
                r = EMULATE_DONE;
        } else if (vcpu->arch.pio.count) {
@@ -4655,13 +4843,12 @@ restart:
                r = EMULATE_DONE;
 
        if (writeback) {
-               toggle_interruptibility(vcpu,
-                               vcpu->arch.emulate_ctxt.interruptibility);
-               kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+               toggle_interruptibility(vcpu, ctxt->interruptibility);
+               kvm_set_rflags(vcpu, ctxt->eflags);
                kvm_make_request(KVM_REQ_EVENT, vcpu);
-               memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+               memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
-               kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+               kvm_rip_write(vcpu, ctxt->eip);
        } else
                vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
 
@@ -4878,6 +5065,30 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
 
+static void kvm_set_mmio_spte_mask(void)
+{
+       u64 mask;
+       int maxphyaddr = boot_cpu_data.x86_phys_bits;
+
+       /*
+        * Set the reserved bits and the present bit of a paging-structure
+        * entry to generate a page fault with PFER.RSV = 1.
+        */
+       mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
+       mask |= 1ull;
+
+#ifdef CONFIG_X86_64
+       /*
+        * If reserved bit is not supported, clear the present bit to disable
+        * mmio page fault.
+        */
+       if (maxphyaddr == 52)
+               mask &= ~1ull;
+#endif
+
+       kvm_mmu_set_mmio_spte_mask(mask);
+}
+
 int kvm_arch_init(void *opaque)
 {
        int r;
@@ -4904,10 +5115,10 @@ int kvm_arch_init(void *opaque)
        if (r)
                goto out;
 
+       kvm_set_mmio_spte_mask();
        kvm_init_msr_list();
 
        kvm_x86_ops = ops;
-       kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
                        PT_DIRTY_MASK, PT64_NX_MASK, 0);
 
@@ -5082,8 +5293,7 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
 
        kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
-       return emulator_write_emulated(&vcpu->arch.emulate_ctxt,
-                                      rip, instruction, 3, NULL);
+       return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
 }
 
 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
@@ -5384,6 +5594,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        r = 1;
                        goto out;
                }
+               if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
+                       record_steal_time(vcpu);
+
        }
 
        r = kvm_mmu_reload(vcpu);
@@ -5671,8 +5884,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
                 * that usually, but some bad designed PV devices (vmware
                 * backdoor interface) need this to work
                 */
-               struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
-               memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+               struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+               memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
        }
        regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
@@ -5801,21 +6014,20 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
                    bool has_error_code, u32 error_code)
 {
-       struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
        int ret;
 
        init_emulate_ctxt(vcpu);
 
-       ret = emulator_task_switch(&vcpu->arch.emulate_ctxt,
-                                  tss_selector, reason, has_error_code,
-                                  error_code);
+       ret = emulator_task_switch(ctxt, tss_selector, reason,
+                                  has_error_code, error_code);
 
        if (ret)
                return EMULATE_FAIL;
 
-       memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
-       kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
-       kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+       memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
+       kvm_rip_write(vcpu, ctxt->eip);
+       kvm_set_rflags(vcpu, ctxt->eflags);
        kvm_make_request(KVM_REQ_EVENT, vcpu);
        return EMULATE_DONE;
 }
@@ -6093,12 +6305,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        if (r == 0)
                r = kvm_mmu_setup(vcpu);
        vcpu_put(vcpu);
-       if (r < 0)
-               goto free_vcpu;
 
-       return 0;
-free_vcpu:
-       kvm_x86_ops->vcpu_free(vcpu);
        return r;
 }
 
@@ -6126,6 +6333,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 
        kvm_make_request(KVM_REQ_EVENT, vcpu);
        vcpu->arch.apf.msr_val = 0;
+       vcpu->arch.st.msr_val = 0;
 
        kvmclock_reset(vcpu);
 
index e407ed3..d36fe23 100644 (file)
@@ -75,10 +75,54 @@ static inline u32 bit(int bitno)
        return 1 << (bitno & 31);
 }
 
+static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
+                                       gva_t gva, gfn_t gfn, unsigned access)
+{
+       vcpu->arch.mmio_gva = gva & PAGE_MASK;
+       vcpu->arch.access = access;
+       vcpu->arch.mmio_gfn = gfn;
+}
+
+/*
+ * Clear the mmio cache info for the given gva.
+ * As a special case, if gva is ~0ul, clear all mmio cache info.
+ */
+static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
+{
+       if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
+               return;
+
+       vcpu->arch.mmio_gva = 0;
+}
+
+static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
+{
+       if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK))
+               return true;
+
+       return false;
+}
+
+static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+       if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
+               return true;
+
+       return false;
+}
+
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
 void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
 
+int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
+       gva_t addr, void *val, unsigned int bytes,
+       struct x86_exception *exception);
+
+int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
+       gva_t addr, void *val, unsigned int bytes,
+       struct x86_exception *exception);
+
 #endif
index a6575b9..ccf73b2 100644 (file)
@@ -13,7 +13,7 @@ CFLAGS_mmu.o                  := $(nostackp)
 obj-y          := enlighten.o setup.o multicalls.o mmu.o irq.o \
                        time.o xen-asm.o xen-asm_$(BITS).o \
                        grant-table.o suspend.o platform-pci-unplug.o \
-                       p2m.o
+                       p2m.o trace.o
 
 obj-$(CONFIG_SMP)              += smp.o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
index 5325742..974a528 100644 (file)
@@ -341,6 +341,8 @@ static void xen_set_ldt(const void *addr, unsigned entries)
        struct mmuext_op *op;
        struct multicall_space mcs = xen_mc_entry(sizeof(*op));
 
+       trace_xen_cpu_set_ldt(addr, entries);
+
        op = mcs.args;
        op->cmd = MMUEXT_SET_LDT;
        op->arg1.linear_addr = (unsigned long)addr;
@@ -496,6 +498,8 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
        xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
        u64 entry = *(u64 *)ptr;
 
+       trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);
+
        preempt_disable();
 
        xen_mc_flush();
@@ -565,6 +569,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
        unsigned long p = (unsigned long)&dt[entrynum];
        unsigned long start, end;
 
+       trace_xen_cpu_write_idt_entry(dt, entrynum, g);
+
        preempt_disable();
 
        start = __this_cpu_read(idt_desc.address);
@@ -619,6 +625,8 @@ static void xen_load_idt(const struct desc_ptr *desc)
        static DEFINE_SPINLOCK(lock);
        static struct trap_info traps[257];
 
+       trace_xen_cpu_load_idt(desc);
+
        spin_lock(&lock);
 
        __get_cpu_var(idt_desc) = *desc;
@@ -637,6 +645,8 @@ static void xen_load_idt(const struct desc_ptr *desc)
 static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
                                const void *desc, int type)
 {
+       trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
+
        preempt_disable();
 
        switch (type) {
@@ -665,6 +675,8 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
 static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
                                            const void *desc, int type)
 {
+       trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
+
        switch (type) {
        case DESC_LDT:
        case DESC_TSS:
@@ -684,7 +696,9 @@ static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
 static void xen_load_sp0(struct tss_struct *tss,
                         struct thread_struct *thread)
 {
-       struct multicall_space mcs = xen_mc_entry(0);
+       struct multicall_space mcs;
+
+       mcs = xen_mc_entry(0);
        MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
        xen_mc_issue(PARAVIRT_LAZY_CPU);
 }
index 0ccccb6..f987bde 100644 (file)
@@ -48,6 +48,8 @@
 #include <linux/memblock.h>
 #include <linux/seq_file.h>
 
+#include <trace/events/xen.h>
+
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/fixmap.h>
@@ -194,6 +196,8 @@ void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
        struct multicall_space mcs;
        struct mmu_update *u;
 
+       trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
+
        mcs = xen_mc_entry(sizeof(*u));
        u = mcs.args;
 
@@ -225,6 +229,24 @@ static void xen_extend_mmu_update(const struct mmu_update *update)
        *u = *update;
 }
 
+/*
+ * Queue one mmuext_op in the current multicall batch.  When
+ * xen_mc_extend_args() hands back an existing entry, that entry's
+ * args[1] is bumped (presumably its op count); otherwise a fresh entry
+ * is set up for a single op.  Either way *op is copied into the
+ * argument space that was returned.
+ */
+static void xen_extend_mmuext_op(const struct mmuext_op *op)
+{
+       struct multicall_space space;
+       struct mmuext_op *slot;
+
+       space = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*op));
+
+       if (space.mc == NULL) {
+               /* nothing to extend: start a new entry */
+               space = __xen_mc_entry(sizeof(*op));
+               MULTI_mmuext_op(space.mc, space.args, 1, NULL, DOMID_SELF);
+       } else {
+               space.mc->args[1]++;
+       }
+
+       slot = space.args;
+       *slot = *op;
+}
+
 static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 {
        struct mmu_update u;
@@ -245,6 +267,8 @@ static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 
 static void xen_set_pmd(pmd_t *ptr, pmd_t val)
 {
+       trace_xen_mmu_set_pmd(ptr, val);
+
        /* If page is not pinned, we can just update the entry
           directly */
        if (!xen_page_pinned(ptr)) {
@@ -282,22 +306,30 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
        return true;
 }
 
-static void xen_set_pte(pte_t *ptep, pte_t pteval)
+static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
 {
        if (!xen_batched_set_pte(ptep, pteval))
                native_set_pte(ptep, pteval);
 }
 
+static void xen_set_pte(pte_t *ptep, pte_t pteval)
+{
+       trace_xen_mmu_set_pte(ptep, pteval);
+       __xen_set_pte(ptep, pteval);
+}
+
 static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
                    pte_t *ptep, pte_t pteval)
 {
-       xen_set_pte(ptep, pteval);
+       trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
+       __xen_set_pte(ptep, pteval);
 }
 
 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
                                 unsigned long addr, pte_t *ptep)
 {
        /* Just return the pte as-is.  We preserve the bits on commit */
+       trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
        return *ptep;
 }
 
@@ -306,6 +338,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 {
        struct mmu_update u;
 
+       trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
        xen_mc_batch();
 
        u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
@@ -530,6 +563,8 @@ static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 
 static void xen_set_pud(pud_t *ptr, pud_t val)
 {
+       trace_xen_mmu_set_pud(ptr, val);
+
        /* If page is not pinned, we can just update the entry
           directly */
        if (!xen_page_pinned(ptr)) {
@@ -543,17 +578,20 @@ static void xen_set_pud(pud_t *ptr, pud_t val)
 #ifdef CONFIG_X86_PAE
 static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
+       trace_xen_mmu_set_pte_atomic(ptep, pte);
        set_64bit((u64 *)ptep, native_pte_val(pte));
 }
 
 static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
+       trace_xen_mmu_pte_clear(mm, addr, ptep);
        if (!xen_batched_set_pte(ptep, native_make_pte(0)))
                native_pte_clear(mm, addr, ptep);
 }
 
 static void xen_pmd_clear(pmd_t *pmdp)
 {
+       trace_xen_mmu_pmd_clear(pmdp);
        set_pmd(pmdp, __pmd(0));
 }
 #endif /* CONFIG_X86_PAE */
@@ -629,6 +667,8 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
 {
        pgd_t *user_ptr = xen_get_user_pgd(ptr);
 
+       trace_xen_mmu_set_pgd(ptr, user_ptr, val);
+
        /* If page is not pinned, we can just update the entry
           directly */
        if (!xen_page_pinned(ptr)) {
@@ -788,14 +828,12 @@ static void xen_pte_unlock(void *v)
 
 static void xen_do_pin(unsigned level, unsigned long pfn)
 {
-       struct mmuext_op *op;
-       struct multicall_space mcs;
+       struct mmuext_op op;
 
-       mcs = __xen_mc_entry(sizeof(*op));
-       op = mcs.args;
-       op->cmd = level;
-       op->arg1.mfn = pfn_to_mfn(pfn);
-       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+       op.cmd = level;
+       op.arg1.mfn = pfn_to_mfn(pfn);
+
+       xen_extend_mmuext_op(&op);
 }
 
 static int xen_pin_page(struct mm_struct *mm, struct page *page,
@@ -863,6 +901,8 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page,
    read-only, and can be pinned. */
 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
 {
+       trace_xen_mmu_pgd_pin(mm, pgd);
+
        xen_mc_batch();
 
        if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
@@ -988,6 +1028,8 @@ static int xen_unpin_page(struct mm_struct *mm, struct page *page,
 /* Release a pagetables pages back as normal RW */
 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
 {
+       trace_xen_mmu_pgd_unpin(mm, pgd);
+
        xen_mc_batch();
 
        xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
@@ -1196,6 +1238,8 @@ static void xen_flush_tlb(void)
        struct mmuext_op *op;
        struct multicall_space mcs;
 
+       trace_xen_mmu_flush_tlb(0);
+
        preempt_disable();
 
        mcs = xen_mc_entry(sizeof(*op));
@@ -1214,6 +1258,8 @@ static void xen_flush_tlb_single(unsigned long addr)
        struct mmuext_op *op;
        struct multicall_space mcs;
 
+       trace_xen_mmu_flush_tlb_single(addr);
+
        preempt_disable();
 
        mcs = xen_mc_entry(sizeof(*op));
@@ -1240,6 +1286,8 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
        } *args;
        struct multicall_space mcs;
 
+       trace_xen_mmu_flush_tlb_others(cpus, mm, va);
+
        if (cpumask_empty(cpus))
                return;         /* nothing to do */
 
@@ -1275,10 +1323,11 @@ static void set_current_cr3(void *v)
 
 static void __xen_write_cr3(bool kernel, unsigned long cr3)
 {
-       struct mmuext_op *op;
-       struct multicall_space mcs;
+       struct mmuext_op op;
        unsigned long mfn;
 
+       trace_xen_mmu_write_cr3(kernel, cr3);
+
        if (cr3)
                mfn = pfn_to_mfn(PFN_DOWN(cr3));
        else
@@ -1286,13 +1335,10 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3)
 
        WARN_ON(mfn == 0 && kernel);
 
-       mcs = __xen_mc_entry(sizeof(*op));
-
-       op = mcs.args;
-       op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
-       op->arg1.mfn = mfn;
+       op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
+       op.arg1.mfn = mfn;
 
-       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+       xen_extend_mmuext_op(&op);
 
        if (kernel) {
                percpu_write(xen_cr3, cr3);
@@ -1451,19 +1497,52 @@ static void __init xen_release_pmd_init(unsigned long pfn)
        make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
 }
 
+/*
+ * Add a single mmuext_op (cmd, applied to the mfn behind pfn) as a new
+ * multicall entry.  Nothing is issued here; callers are expected to
+ * bracket this with xen_mc_batch()/xen_mc_issue().
+ */
+static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+       struct multicall_space entry;
+       struct mmuext_op *ext;
+
+       entry = __xen_mc_entry(sizeof(*ext));
+
+       ext = entry.args;
+       ext->arg1.mfn = pfn_to_mfn(pfn);
+       ext->cmd = cmd;
+
+       MULTI_mmuext_op(entry.mc, ext, 1, NULL, DOMID_SELF);
+}
+
+/*
+ * Queue an update_va_mapping multicall that remaps the kernel virtual
+ * address of pfn with protection prot.  Nothing is issued here; callers
+ * are expected to bracket this with xen_mc_batch()/xen_mc_issue().
+ */
+static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
+{
+       struct multicall_space mcs;
+       /* same pfn -> virtual conversion the rest of this file uses */
+       unsigned long addr = (unsigned long)__va(PFN_PHYS(pfn));
+
+       mcs = __xen_mc_entry(0);
+       MULTI_update_va_mapping(mcs.mc, addr, pfn_pte(pfn, prot), 0);
+}
+
 /* This needs to make sure the new pte page is pinned iff its being
    attached to a pinned pagetable. */
-static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
+static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
+                                   unsigned level)
 {
-       struct page *page = pfn_to_page(pfn);
+       bool pinned = PagePinned(virt_to_page(mm->pgd));
+
+       trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
+
+       if (pinned) {
+               struct page *page = pfn_to_page(pfn);
 
-       if (PagePinned(virt_to_page(mm->pgd))) {
                SetPagePinned(page);
 
                if (!PageHighMem(page)) {
-                       make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
+                       xen_mc_batch();
+
+                       __set_pfn_prot(pfn, PAGE_KERNEL_RO);
+
                        if (level == PT_PTE && USE_SPLIT_PTLOCKS)
-                               pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+                               __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+
+                       xen_mc_issue(PARAVIRT_LAZY_MMU);
                } else {
                        /* make sure there are no stray mappings of
                           this page */
@@ -1483,15 +1562,23 @@ static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
 }
 
 /* This should never happen until we're OK to use struct page */
-static void xen_release_ptpage(unsigned long pfn, unsigned level)
+static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
 {
        struct page *page = pfn_to_page(pfn);
+       bool pinned = PagePinned(page);
 
-       if (PagePinned(page)) {
+       trace_xen_mmu_release_ptpage(pfn, level, pinned);
+
+       if (pinned) {
                if (!PageHighMem(page)) {
+                       xen_mc_batch();
+
                        if (level == PT_PTE && USE_SPLIT_PTLOCKS)
-                               pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
-                       make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+                               __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+
+                       __set_pfn_prot(pfn, PAGE_KERNEL);
+
+                       xen_mc_issue(PARAVIRT_LAZY_MMU);
                }
                ClearPagePinned(page);
        }
index 1b2b73f..0d82003 100644 (file)
 
 #define MC_BATCH       32
 
-#define MC_DEBUG       1
+#define MC_DEBUG       0
 
 #define MC_ARGS                (MC_BATCH * 16)
 
 
 struct mc_buffer {
+       unsigned mcidx, argidx, cbidx;
        struct multicall_entry entries[MC_BATCH];
 #if MC_DEBUG
        struct multicall_entry debug[MC_BATCH];
@@ -46,85 +47,15 @@ struct mc_buffer {
                void (*fn)(void *);
                void *data;
        } callbacks[MC_BATCH];
-       unsigned mcidx, argidx, cbidx;
 };
 
 static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
 DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
 
-/* flush reasons 0- slots, 1- args, 2- callbacks */
-enum flush_reasons
-{
-       FL_SLOTS,
-       FL_ARGS,
-       FL_CALLBACKS,
-
-       FL_N_REASONS
-};
-
-#ifdef CONFIG_XEN_DEBUG_FS
-#define NHYPERCALLS    40              /* not really */
-
-static struct {
-       unsigned histo[MC_BATCH+1];
-
-       unsigned issued;
-       unsigned arg_total;
-       unsigned hypercalls;
-       unsigned histo_hypercalls[NHYPERCALLS];
-
-       unsigned flush[FL_N_REASONS];
-} mc_stats;
-
-static u8 zero_stats;
-
-static inline void check_zero(void)
-{
-       if (unlikely(zero_stats)) {
-               memset(&mc_stats, 0, sizeof(mc_stats));
-               zero_stats = 0;
-       }
-}
-
-static void mc_add_stats(const struct mc_buffer *mc)
-{
-       int i;
-
-       check_zero();
-
-       mc_stats.issued++;
-       mc_stats.hypercalls += mc->mcidx;
-       mc_stats.arg_total += mc->argidx;
-
-       mc_stats.histo[mc->mcidx]++;
-       for(i = 0; i < mc->mcidx; i++) {
-               unsigned op = mc->entries[i].op;
-               if (op < NHYPERCALLS)
-                       mc_stats.histo_hypercalls[op]++;
-       }
-}
-
-static void mc_stats_flush(enum flush_reasons idx)
-{
-       check_zero();
-
-       mc_stats.flush[idx]++;
-}
-
-#else  /* !CONFIG_XEN_DEBUG_FS */
-
-static inline void mc_add_stats(const struct mc_buffer *mc)
-{
-}
-
-static inline void mc_stats_flush(enum flush_reasons idx)
-{
-}
-#endif /* CONFIG_XEN_DEBUG_FS */
-
 void xen_mc_flush(void)
 {
        struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+       struct multicall_entry *mc;
        int ret = 0;
        unsigned long flags;
        int i;
@@ -135,9 +66,26 @@ void xen_mc_flush(void)
           something in the middle */
        local_irq_save(flags);
 
-       mc_add_stats(b);
+       trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx);
+
+       switch (b->mcidx) {
+       case 0:
+               /* no-op */
+               BUG_ON(b->argidx != 0);
+               break;
+
+       case 1:
+               /* Singleton multicall - bypass multicall machinery
+                  and just do the call directly. */
+               mc = &b->entries[0];
+
+               mc->result = privcmd_call(mc->op,
+                                         mc->args[0], mc->args[1], mc->args[2], 
+                                         mc->args[3], mc->args[4]);
+               ret = mc->result < 0;
+               break;
 
-       if (b->mcidx) {
+       default:
 #if MC_DEBUG
                memcpy(b->debug, b->entries,
                       b->mcidx * sizeof(struct multicall_entry));
@@ -164,11 +112,10 @@ void xen_mc_flush(void)
                        }
                }
 #endif
+       }
 
-               b->mcidx = 0;
-               b->argidx = 0;
-       } else
-               BUG_ON(b->argidx != 0);
+       b->mcidx = 0;
+       b->argidx = 0;
 
        for (i = 0; i < b->cbidx; i++) {
                struct callback *cb = &b->callbacks[i];
@@ -188,18 +135,21 @@ struct multicall_space __xen_mc_entry(size_t args)
        struct multicall_space ret;
        unsigned argidx = roundup(b->argidx, sizeof(u64));
 
+       trace_xen_mc_entry_alloc(args);
+
        BUG_ON(preemptible());
        BUG_ON(b->argidx >= MC_ARGS);
 
-       if (b->mcidx == MC_BATCH ||
-           (argidx + args) >= MC_ARGS) {
-               mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
+       if (unlikely(b->mcidx == MC_BATCH ||
+                    (argidx + args) >= MC_ARGS)) {
+               trace_xen_mc_flush_reason((b->mcidx == MC_BATCH) ?
+                                         XEN_MC_FL_BATCH : XEN_MC_FL_ARGS);
                xen_mc_flush();
                argidx = roundup(b->argidx, sizeof(u64));
        }
 
        ret.mc = &b->entries[b->mcidx];
-#ifdef MC_DEBUG
+#if MC_DEBUG
        b->caller[b->mcidx] = __builtin_return_address(0);
 #endif
        b->mcidx++;
@@ -218,20 +168,25 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
        BUG_ON(preemptible());
        BUG_ON(b->argidx >= MC_ARGS);
 
-       if (b->mcidx == 0)
-               return ret;
-
-       if (b->entries[b->mcidx - 1].op != op)
-               return ret;
+       if (unlikely(b->mcidx == 0 ||
+                    b->entries[b->mcidx - 1].op != op)) {
+               trace_xen_mc_extend_args(op, size, XEN_MC_XE_BAD_OP);
+               goto out;
+       }
 
-       if ((b->argidx + size) >= MC_ARGS)
-               return ret;
+       if (unlikely((b->argidx + size) >= MC_ARGS)) {
+               trace_xen_mc_extend_args(op, size, XEN_MC_XE_NO_SPACE);
+               goto out;
+       }
 
        ret.mc = &b->entries[b->mcidx - 1];
        ret.args = &b->args[b->argidx];
        b->argidx += size;
 
        BUG_ON(b->argidx >= MC_ARGS);
+
+       trace_xen_mc_extend_args(op, size, XEN_MC_XE_OK);
+out:
        return ret;
 }
 
@@ -241,43 +196,13 @@ void xen_mc_callback(void (*fn)(void *), void *data)
        struct callback *cb;
 
        if (b->cbidx == MC_BATCH) {
-               mc_stats_flush(FL_CALLBACKS);
+               trace_xen_mc_flush_reason(XEN_MC_FL_CALLBACK);
                xen_mc_flush();
        }
 
+       trace_xen_mc_callback(fn, data);
+
        cb = &b->callbacks[b->cbidx++];
        cb->fn = fn;
        cb->data = data;
 }
-
-#ifdef CONFIG_XEN_DEBUG_FS
-
-static struct dentry *d_mc_debug;
-
-static int __init xen_mc_debugfs(void)
-{
-       struct dentry *d_xen = xen_init_debugfs();
-
-       if (d_xen == NULL)
-               return -ENOMEM;
-
-       d_mc_debug = debugfs_create_dir("multicalls", d_xen);
-
-       debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats);
-
-       debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued);
-       debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls);
-       debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total);
-
-       xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug,
-                                    mc_stats.histo, MC_BATCH);
-       xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug,
-                                    mc_stats.histo_hypercalls, NHYPERCALLS);
-       xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug,
-                                    mc_stats.flush, FL_N_REASONS);
-
-       return 0;
-}
-fs_initcall(xen_mc_debugfs);
-
-#endif /* CONFIG_XEN_DEBUG_FS */
index 4ec8035..dee79b7 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef _XEN_MULTICALLS_H
 #define _XEN_MULTICALLS_H
 
+#include <trace/events/xen.h>
+
 #include "xen-ops.h"
 
 /* Multicalls */
@@ -20,8 +22,10 @@ DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
 static inline void xen_mc_batch(void)
 {
        unsigned long flags;
+
        /* need to disable interrupts until this entry is complete */
        local_irq_save(flags);
+       trace_xen_mc_batch(paravirt_get_lazy_mode());
        __this_cpu_write(xen_mc_irq_flags, flags);
 }
 
@@ -37,6 +41,8 @@ void xen_mc_flush(void);
 /* Issue a multicall if we're not in a lazy mode */
 static inline void xen_mc_issue(unsigned mode)
 {
+       trace_xen_mc_issue(mode);
+
        if ((paravirt_get_lazy_mode() & mode) == 0)
                xen_mc_flush();
 
diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c
new file mode 100644 (file)
index 0000000..734beba
--- /dev/null
@@ -0,0 +1,61 @@
+#include <linux/ftrace.h>
+
+/*
+ * Hypercall number -> "(name)" string, indexed by __HYPERVISOR_*.
+ * Numbers with no entry below are left NULL by the designated
+ * initializers.  The table is never modified, so both the strings and
+ * the pointer array are const.
+ */
+#define N(x)   [__HYPERVISOR_##x] = "("#x")"
+static const char * const xen_hypercall_names[] = {
+       N(set_trap_table),
+       N(mmu_update),
+       N(set_gdt),
+       N(stack_switch),
+       N(set_callbacks),
+       N(fpu_taskswitch),
+       N(sched_op_compat),
+       N(dom0_op),
+       N(set_debugreg),
+       N(get_debugreg),
+       N(update_descriptor),
+       N(memory_op),
+       N(multicall),
+       N(update_va_mapping),
+       N(set_timer_op),
+       N(event_channel_op_compat),
+       N(xen_version),
+       N(console_io),
+       N(physdev_op_compat),
+       N(grant_table_op),
+       N(vm_assist),
+       N(update_va_mapping_otherdomain),
+       N(iret),
+       N(vcpu_op),
+       N(set_segment_base),
+       N(mmuext_op),
+       N(acm_op),
+       N(nmi_op),
+       N(sched_op),
+       N(callback_op),
+       N(xenoprof_op),
+       N(event_channel_op),
+       N(physdev_op),
+       N(hvm_op),
+
+/* Architecture-specific hypercall definitions. */
+       N(arch_0),
+       N(arch_1),
+       N(arch_2),
+       N(arch_3),
+       N(arch_4),
+       N(arch_5),
+       N(arch_6),
+       N(arch_7),
+};
+#undef N
+
+/*
+ * Look up the "(name)" string for hypercall number op.  Numbers past
+ * the end of the table, or with no table entry, yield "".
+ */
+static const char *xen_hypercall_name(unsigned op)
+{
+       const char *name = NULL;
+
+       if (op < ARRAY_SIZE(xen_hypercall_names))
+               name = xen_hypercall_names[op];
+
+       return name ? name : "";
+}
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/xen.h>
index 87b22ca..2af8155 100644 (file)
@@ -458,7 +458,7 @@ config CRYPTO_WP512
 
 config CRYPTO_GHASH_CLMUL_NI_INTEL
        tristate "GHASH digest algorithm (CLMUL-NI accelerated)"
-       depends on (X86 || UML_X86) && 64BIT
+       depends on X86 && 64BIT
        select CRYPTO_SHASH
        select CRYPTO_CRYPTD
        help
@@ -533,7 +533,7 @@ config CRYPTO_AES_X86_64
 
 config CRYPTO_AES_NI_INTEL
        tristate "AES cipher algorithms (AES-NI)"
-       depends on (X86 || UML_X86)
+       depends on X86
        select CRYPTO_AES_X86_64 if 64BIT
        select CRYPTO_AES_586 if !64BIT
        select CRYPTO_CRYPTD
index 62122a1..ef5356c 100644 (file)
@@ -68,8 +68,10 @@ static int hash_sendmsg(struct kiocb *unused, struct socket *sock,
                        int newlen;
 
                        newlen = af_alg_make_sg(&ctx->sgl, from, len, 0);
-                       if (newlen < 0)
+                       if (newlen < 0) {
+                               err = copied ? 0 : newlen;
                                goto unlock;
+                       }
 
                        ahash_request_set_crypt(&ctx->req, ctx->sgl.sg, NULL,
                                                newlen);
index 8be47e1..0d12a96 100644 (file)
@@ -1,4 +1,4 @@
-/* 
+/*
  * Cryptographic API
  *
  * ARC4 Cipher Algorithm
@@ -33,16 +33,15 @@ static int arc4_set_key(struct crypto_tfm *tfm, const u8 *in_key,
        ctx->x = 1;
        ctx->y = 0;
 
-       for(i = 0; i < 256; i++)
+       for (i = 0; i < 256; i++)
                ctx->S[i] = i;
 
-       for(i = 0; i < 256; i++)
-       {
+       for (i = 0; i < 256; i++) {
                u8 a = ctx->S[i];
                j = (j + in_key[k] + a) & 0xff;
                ctx->S[i] = ctx->S[j];
                ctx->S[j] = a;
-               if(++k >= key_len)
+               if (++k >= key_len)
                        k = 0;
        }
 
@@ -80,9 +79,9 @@ static struct crypto_alg arc4_alg = {
        .cra_u                  =       { .cipher = {
        .cia_min_keysize        =       ARC4_MIN_KEY_SIZE,
        .cia_max_keysize        =       ARC4_MAX_KEY_SIZE,
-       .cia_setkey             =       arc4_set_key,
-       .cia_encrypt            =       arc4_crypt,
-       .cia_decrypt            =       arc4_crypt } }
+       .cia_setkey             =       arc4_set_key,
+       .cia_encrypt            =       arc4_crypt,
+       .cia_decrypt            =       arc4_crypt } }
 };
 
 static int __init arc4_init(void)
index de9e55c..3f9ad28 100644 (file)
@@ -224,11 +224,11 @@ static int crc32c_cra_init(struct crypto_tfm *tfm)
 static struct shash_alg alg = {
        .digestsize             =       CHKSUM_DIGEST_SIZE,
        .setkey                 =       chksum_setkey,
-       .init                   =       chksum_init,
-       .update                 =       chksum_update,
-       .final                  =       chksum_final,
-       .finup                  =       chksum_finup,
-       .digest                 =       chksum_digest,
+       .init           =       chksum_init,
+       .update         =       chksum_update,
+       .final          =       chksum_final,
+       .finup          =       chksum_finup,
+       .digest         =       chksum_digest,
        .descsize               =       sizeof(struct chksum_desc_ctx),
        .base                   =       {
                .cra_name               =       "crc32c",
index df35e4c..5276607 100644 (file)
@@ -182,7 +182,7 @@ void gf128mul_lle(be128 *r, const be128 *b)
        for (i = 0; i < 7; ++i)
                gf128mul_x_lle(&p[i + 1], &p[i]);
 
-       memset(r, 0, sizeof(r));
+       memset(r, 0, sizeof(*r));
        for (i = 0;;) {
                u8 ch = ((u8 *)b)[15 - i];
 
@@ -220,7 +220,7 @@ void gf128mul_bbe(be128 *r, const be128 *b)
        for (i = 0; i < 7; ++i)
                gf128mul_x_bbe(&p[i + 1], &p[i]);
 
-       memset(r, 0, sizeof(r));
+       memset(r, 0, sizeof(*r));
        for (i = 0;;) {
                u8 ch = ((u8 *)b)[i];
 
index 0416091..00ae60e 100644 (file)
@@ -43,25 +43,26 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
        unsigned int partial, done;
        const u8 *src;
 
-       partial = sctx->count & 0x3f;
+       partial = sctx->count % SHA1_BLOCK_SIZE;
        sctx->count += len;
        done = 0;
        src = data;
 
-       if ((partial + len) > 63) {
+       if ((partial + len) >= SHA1_BLOCK_SIZE) {
                u32 temp[SHA_WORKSPACE_WORDS];
 
                if (partial) {
                        done = -partial;
-                       memcpy(sctx->buffer + partial, data, done + 64);
+                       memcpy(sctx->buffer + partial, data,
+                              done + SHA1_BLOCK_SIZE);
                        src = sctx->buffer;
                }
 
                do {
                        sha_transform(sctx->state, src, temp);
-                       done += 64;
+                       done += SHA1_BLOCK_SIZE;
                        src = data + done;
-               } while (done + 63 < len);
+               } while (done + SHA1_BLOCK_SIZE <= len);
 
                memset(temp, 0, sizeof(temp));
                partial = 0;
index 27e6061..27adc92 100644 (file)
@@ -2976,8 +2976,8 @@ static struct cipher_testvec cast6_dec_tv_template[] = {
 #define AES_CBC_DEC_TEST_VECTORS 4
 #define AES_LRW_ENC_TEST_VECTORS 8
 #define AES_LRW_DEC_TEST_VECTORS 8
-#define AES_XTS_ENC_TEST_VECTORS 4
-#define AES_XTS_DEC_TEST_VECTORS 4
+#define AES_XTS_ENC_TEST_VECTORS 5
+#define AES_XTS_DEC_TEST_VECTORS 5
 #define AES_CTR_ENC_TEST_VECTORS 3
 #define AES_CTR_DEC_TEST_VECTORS 3
 #define AES_OFB_ENC_TEST_VECTORS 1
@@ -3926,6 +3926,150 @@ static struct cipher_testvec aes_xts_enc_tv_template[] = {
                          "\x0a\x28\x2d\xf9\x20\x14\x7b\xea"
                          "\xbe\x42\x1e\xe5\x31\x9d\x05\x68",
                .rlen   = 512,
+       }, { /* XTS-AES 10, XTS-AES-256, data unit 512 bytes */
+               .key    = "\x27\x18\x28\x18\x28\x45\x90\x45"
+                         "\x23\x53\x60\x28\x74\x71\x35\x26"
+                         "\x62\x49\x77\x57\x24\x70\x93\x69"
+                         "\x99\x59\x57\x49\x66\x96\x76\x27"
+                         "\x31\x41\x59\x26\x53\x58\x97\x93"
+                         "\x23\x84\x62\x64\x33\x83\x27\x95"
+                         "\x02\x88\x41\x97\x16\x93\x99\x37"
+                         "\x51\x05\x82\x09\x74\x94\x45\x92",
+               .klen   = 64,
+               .iv     = "\xff\x00\x00\x00\x00\x00\x00\x00"
+                         "\x00\x00\x00\x00\x00\x00\x00\x00"
+                         "\x00\x00\x00\x00\x00\x00\x00\x00"
+                         "\x00\x00\x00\x00\x00\x00\x00\x00",
+               .input  = "\x00\x01\x02\x03\x04\x05\x06\x07"
+                         "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+                         "\x10\x11\x12\x13\x14\x15\x16\x17"
+                         "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
+                         "\x20\x21\x22\x23\x24\x25\x26\x27"
+                         "\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f"
+                         "\x30\x31\x32\x33\x34\x35\x36\x37"
+                         "\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f"
+                         "\x40\x41\x42\x43\x44\x45\x46\x47"
+                         "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f"
+                         "\x50\x51\x52\x53\x54\x55\x56\x57"
+                         "\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f"
+                         "\x60\x61\x62\x63\x64\x65\x66\x67"
+                         "\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f"
+                         "\x70\x71\x72\x73\x74\x75\x76\x77"
+                         "\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f"
+                         "\x80\x81\x82\x83\x84\x85\x86\x87"
+                         "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
+                         "\x90\x91\x92\x93\x94\x95\x96\x97"
+                         "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
+                         "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7"
+                         "\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
+                         "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
+                         "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
+                         "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7"
+                         "\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
+                         "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7"
+                         "\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
+                         "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7"
+                         "\xe8\xe9\xea\xeb\xec\xed\xee\xef"
+                         "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"
+                         "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff"
+                         "\x00\x01\x02\x03\x04\x05\x06\x07"
+                         "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+                         "\x10\x11\x12\x13\x14\x15\x16\x17"
+                         "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
+                         "\x20\x21\x22\x23\x24\x25\x26\x27"
+                         "\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f"
+                         "\x30\x31\x32\x33\x34\x35\x36\x37"
+                         "\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f"
+                         "\x40\x41\x42\x43\x44\x45\x46\x47"
+                         "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f"
+                         "\x50\x51\x52\x53\x54\x55\x56\x57"
+                         "\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f"
+                         "\x60\x61\x62\x63\x64\x65\x66\x67"
+                         "\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f"
+                         "\x70\x71\x72\x73\x74\x75\x76\x77"
+                         "\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f"
+                         "\x80\x81\x82\x83\x84\x85\x86\x87"
+                         "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
+                         "\x90\x91\x92\x93\x94\x95\x96\x97"
+                         "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
+                         "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7"
+                         "\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
+                         "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
+                         "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
+                         "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7"
+                         "\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
+                         "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7"
+                         "\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
+                         "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7"
+                         "\xe8\xe9\xea\xeb\xec\xed\xee\xef"
+                         "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"
+                         "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
+               .ilen   = 512,
+               .result = "\x1c\x3b\x3a\x10\x2f\x77\x03\x86"
+                         "\xe4\x83\x6c\x99\xe3\x70\xcf\x9b"
+                         "\xea\x00\x80\x3f\x5e\x48\x23\x57"
+                         "\xa4\xae\x12\xd4\x14\xa3\xe6\x3b"
+                         "\x5d\x31\xe2\x76\xf8\xfe\x4a\x8d"
+                         "\x66\xb3\x17\xf9\xac\x68\x3f\x44"
+                         "\x68\x0a\x86\xac\x35\xad\xfc\x33"
+                         "\x45\xbe\xfe\xcb\x4b\xb1\x88\xfd"
+                         "\x57\x76\x92\x6c\x49\xa3\x09\x5e"
+                         "\xb1\x08\xfd\x10\x98\xba\xec\x70"
+                         "\xaa\xa6\x69\x99\xa7\x2a\x82\xf2"
+                         "\x7d\x84\x8b\x21\xd4\xa7\x41\xb0"
+                         "\xc5\xcd\x4d\x5f\xff\x9d\xac\x89"
+                         "\xae\xba\x12\x29\x61\xd0\x3a\x75"
+                         "\x71\x23\xe9\x87\x0f\x8a\xcf\x10"
+                         "\x00\x02\x08\x87\x89\x14\x29\xca"
+                         "\x2a\x3e\x7a\x7d\x7d\xf7\xb1\x03"
+                         "\x55\x16\x5c\x8b\x9a\x6d\x0a\x7d"
+                         "\xe8\xb0\x62\xc4\x50\x0d\xc4\xcd"
+                         "\x12\x0c\x0f\x74\x18\xda\xe3\xd0"
+                         "\xb5\x78\x1c\x34\x80\x3f\xa7\x54"
+                         "\x21\xc7\x90\xdf\xe1\xde\x18\x34"
+                         "\xf2\x80\xd7\x66\x7b\x32\x7f\x6c"
+                         "\x8c\xd7\x55\x7e\x12\xac\x3a\x0f"
+                         "\x93\xec\x05\xc5\x2e\x04\x93\xef"
+                         "\x31\xa1\x2d\x3d\x92\x60\xf7\x9a"
+                         "\x28\x9d\x6a\x37\x9b\xc7\x0c\x50"
+                         "\x84\x14\x73\xd1\xa8\xcc\x81\xec"
+                         "\x58\x3e\x96\x45\xe0\x7b\x8d\x96"
+                         "\x70\x65\x5b\xa5\xbb\xcf\xec\xc6"
+                         "\xdc\x39\x66\x38\x0a\xd8\xfe\xcb"
+                         "\x17\xb6\xba\x02\x46\x9a\x02\x0a"
+                         "\x84\xe1\x8e\x8f\x84\x25\x20\x70"
+                         "\xc1\x3e\x9f\x1f\x28\x9b\xe5\x4f"
+                         "\xbc\x48\x14\x57\x77\x8f\x61\x60"
+                         "\x15\xe1\x32\x7a\x02\xb1\x40\xf1"
+                         "\x50\x5e\xb3\x09\x32\x6d\x68\x37"
+                         "\x8f\x83\x74\x59\x5c\x84\x9d\x84"
+                         "\xf4\xc3\x33\xec\x44\x23\x88\x51"
+                         "\x43\xcb\x47\xbd\x71\xc5\xed\xae"
+                         "\x9b\xe6\x9a\x2f\xfe\xce\xb1\xbe"
+                         "\xc9\xde\x24\x4f\xbe\x15\x99\x2b"
+                         "\x11\xb7\x7c\x04\x0f\x12\xbd\x8f"
+                         "\x6a\x97\x5a\x44\xa0\xf9\x0c\x29"
+                         "\xa9\xab\xc3\xd4\xd8\x93\x92\x72"
+                         "\x84\xc5\x87\x54\xcc\xe2\x94\x52"
+                         "\x9f\x86\x14\xdc\xd2\xab\xa9\x91"
+                         "\x92\x5f\xed\xc4\xae\x74\xff\xac"
+                         "\x6e\x33\x3b\x93\xeb\x4a\xff\x04"
+                         "\x79\xda\x9a\x41\x0e\x44\x50\xe0"
+                         "\xdd\x7a\xe4\xc6\xe2\x91\x09\x00"
+                         "\x57\x5d\xa4\x01\xfc\x07\x05\x9f"
+                         "\x64\x5e\x8b\x7e\x9b\xfd\xef\x33"
+                         "\x94\x30\x54\xff\x84\x01\x14\x93"
+                         "\xc2\x7b\x34\x29\xea\xed\xb4\xed"
+                         "\x53\x76\x44\x1a\x77\xed\x43\x85"
+                         "\x1a\xd7\x7f\x16\xf5\x41\xdf\xd2"
+                         "\x69\xd5\x0d\x6a\x5f\x14\xfb\x0a"
+                         "\xab\x1c\xbb\x4c\x15\x50\xbe\x97"
+                         "\xf7\xab\x40\x66\x19\x3c\x4c\xaa"
+                         "\x77\x3d\xad\x38\x01\x4b\xd2\x09"
+                         "\x2f\xa7\x55\xc8\x24\xbb\x5e\x54"
+                         "\xc4\xf3\x6f\xfd\xa9\xfc\xea\x70"
+                         "\xb9\xc6\xe6\x93\xe1\x48\xc1\x51",
+               .rlen   = 512,
        }
 };
 
@@ -4123,6 +4267,151 @@ static struct cipher_testvec aes_xts_dec_tv_template[] = {
                          "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"
                          "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
                .rlen   = 512,
+       }, { /* XTS-AES 10, XTS-AES-256, data unit 512 bytes */
+               .key    = "\x27\x18\x28\x18\x28\x45\x90\x45"
+                         "\x23\x53\x60\x28\x74\x71\x35\x26"
+                         "\x62\x49\x77\x57\x24\x70\x93\x69"
+                         "\x99\x59\x57\x49\x66\x96\x76\x27"
+                         "\x31\x41\x59\x26\x53\x58\x97\x93"
+                         "\x23\x84\x62\x64\x33\x83\x27\x95"
+                         "\x02\x88\x41\x97\x16\x93\x99\x37"
+                         "\x51\x05\x82\x09\x74\x94\x45\x92",
+               .klen   = 64,
+               .iv     = "\xff\x00\x00\x00\x00\x00\x00\x00"
+                         "\x00\x00\x00\x00\x00\x00\x00\x00",
+                         "\x00\x00\x00\x00\x00\x00\x00\x00",
+                         "\x00\x00\x00\x00\x00\x00\x00\x00",
+               .input  = "\x1c\x3b\x3a\x10\x2f\x77\x03\x86"
+                         "\xe4\x83\x6c\x99\xe3\x70\xcf\x9b"
+                         "\xea\x00\x80\x3f\x5e\x48\x23\x57"
+                         "\xa4\xae\x12\xd4\x14\xa3\xe6\x3b"
+                         "\x5d\x31\xe2\x76\xf8\xfe\x4a\x8d"
+                         "\x66\xb3\x17\xf9\xac\x68\x3f\x44"
+                         "\x68\x0a\x86\xac\x35\xad\xfc\x33"
+                         "\x45\xbe\xfe\xcb\x4b\xb1\x88\xfd"
+                         "\x57\x76\x92\x6c\x49\xa3\x09\x5e"
+                         "\xb1\x08\xfd\x10\x98\xba\xec\x70"
+                         "\xaa\xa6\x69\x99\xa7\x2a\x82\xf2"
+                         "\x7d\x84\x8b\x21\xd4\xa7\x41\xb0"
+                         "\xc5\xcd\x4d\x5f\xff\x9d\xac\x89"
+                         "\xae\xba\x12\x29\x61\xd0\x3a\x75"
+                         "\x71\x23\xe9\x87\x0f\x8a\xcf\x10"
+                         "\x00\x02\x08\x87\x89\x14\x29\xca"
+                         "\x2a\x3e\x7a\x7d\x7d\xf7\xb1\x03"
+                         "\x55\x16\x5c\x8b\x9a\x6d\x0a\x7d"
+                         "\xe8\xb0\x62\xc4\x50\x0d\xc4\xcd"
+                         "\x12\x0c\x0f\x74\x18\xda\xe3\xd0"
+                         "\xb5\x78\x1c\x34\x80\x3f\xa7\x54"
+                         "\x21\xc7\x90\xdf\xe1\xde\x18\x34"
+                         "\xf2\x80\xd7\x66\x7b\x32\x7f\x6c"
+                         "\x8c\xd7\x55\x7e\x12\xac\x3a\x0f"
+                         "\x93\xec\x05\xc5\x2e\x04\x93\xef"
+                         "\x31\xa1\x2d\x3d\x92\x60\xf7\x9a"
+                         "\x28\x9d\x6a\x37\x9b\xc7\x0c\x50"
+                         "\x84\x14\x73\xd1\xa8\xcc\x81\xec"
+                         "\x58\x3e\x96\x45\xe0\x7b\x8d\x96"
+                         "\x70\x65\x5b\xa5\xbb\xcf\xec\xc6"
+                         "\xdc\x39\x66\x38\x0a\xd8\xfe\xcb"
+                         "\x17\xb6\xba\x02\x46\x9a\x02\x0a"
+                         "\x84\xe1\x8e\x8f\x84\x25\x20\x70"
+                         "\xc1\x3e\x9f\x1f\x28\x9b\xe5\x4f"
+                         "\xbc\x48\x14\x57\x77\x8f\x61\x60"
+                         "\x15\xe1\x32\x7a\x02\xb1\x40\xf1"
+                         "\x50\x5e\xb3\x09\x32\x6d\x68\x37"
+                         "\x8f\x83\x74\x59\x5c\x84\x9d\x84"
+                         "\xf4\xc3\x33\xec\x44\x23\x88\x51"
+                         "\x43\xcb\x47\xbd\x71\xc5\xed\xae"
+                         "\x9b\xe6\x9a\x2f\xfe\xce\xb1\xbe"
+                         "\xc9\xde\x24\x4f\xbe\x15\x99\x2b"
+                         "\x11\xb7\x7c\x04\x0f\x12\xbd\x8f"
+                         "\x6a\x97\x5a\x44\xa0\xf9\x0c\x29"
+                         "\xa9\xab\xc3\xd4\xd8\x93\x92\x72"
+                         "\x84\xc5\x87\x54\xcc\xe2\x94\x52"
+                         "\x9f\x86\x14\xdc\xd2\xab\xa9\x91"
+                         "\x92\x5f\xed\xc4\xae\x74\xff\xac"
+                         "\x6e\x33\x3b\x93\xeb\x4a\xff\x04"
+                         "\x79\xda\x9a\x41\x0e\x44\x50\xe0"
+                         "\xdd\x7a\xe4\xc6\xe2\x91\x09\x00"
+                         "\x57\x5d\xa4\x01\xfc\x07\x05\x9f"
+                         "\x64\x5e\x8b\x7e\x9b\xfd\xef\x33"
+                         "\x94\x30\x54\xff\x84\x01\x14\x93"
+                         "\xc2\x7b\x34\x29\xea\xed\xb4\xed"
+                         "\x53\x76\x44\x1a\x77\xed\x43\x85"
+                         "\x1a\xd7\x7f\x16\xf5\x41\xdf\xd2"
+                         "\x69\xd5\x0d\x6a\x5f\x14\xfb\x0a"
+                         "\xab\x1c\xbb\x4c\x15\x50\xbe\x97"
+                         "\xf7\xab\x40\x66\x19\x3c\x4c\xaa"
+                         "\x77\x3d\xad\x38\x01\x4b\xd2\x09"
+                         "\x2f\xa7\x55\xc8\x24\xbb\x5e\x54"
+                         "\xc4\xf3\x6f\xfd\xa9\xfc\xea\x70"
+                         "\xb9\xc6\xe6\x93\xe1\x48\xc1\x51",
+               .ilen   = 512,
+               .result = "\x00\x01\x02\x03\x04\x05\x06\x07"
+                         "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+                         "\x10\x11\x12\x13\x14\x15\x16\x17"
+                         "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
+                         "\x20\x21\x22\x23\x24\x25\x26\x27"
+                         "\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f"
+                         "\x30\x31\x32\x33\x34\x35\x36\x37"
+                         "\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f"
+                         "\x40\x41\x42\x43\x44\x45\x46\x47"
+                         "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f"
+                         "\x50\x51\x52\x53\x54\x55\x56\x57"
+                         "\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f"
+                         "\x60\x61\x62\x63\x64\x65\x66\x67"
+                         "\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f"
+                         "\x70\x71\x72\x73\x74\x75\x76\x77"
+                         "\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f"
+                         "\x80\x81\x82\x83\x84\x85\x86\x87"
+                         "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
+                         "\x90\x91\x92\x93\x94\x95\x96\x97"
+                         "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
+                         "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7"
+                         "\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
+                         "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
+                         "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
+                         "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7"
+                         "\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
+                         "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7"
+                         "\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
+                         "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7"
+                         "\xe8\xe9\xea\xeb\xec\xed\xee\xef"
+                         "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"
+                         "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff"
+                         "\x00\x01\x02\x03\x04\x05\x06\x07"
+                         "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+                         "\x10\x11\x12\x13\x14\x15\x16\x17"
+                         "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
+                         "\x20\x21\x22\x23\x24\x25\x26\x27"
+                         "\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f"
+                         "\x30\x31\x32\x33\x34\x35\x36\x37"
+                         "\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f"
+                         "\x40\x41\x42\x43\x44\x45\x46\x47"
+                         "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f"
+                         "\x50\x51\x52\x53\x54\x55\x56\x57"
+                         "\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f"
+                         "\x60\x61\x62\x63\x64\x65\x66\x67"
+                         "\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f"
+                         "\x70\x71\x72\x73\x74\x75\x76\x77"
+                         "\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f"
+                         "\x80\x81\x82\x83\x84\x85\x86\x87"
+                         "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
+                         "\x90\x91\x92\x93\x94\x95\x96\x97"
+                         "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
+                         "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7"
+                         "\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
+                         "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
+                         "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
+                         "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7"
+                         "\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
+                         "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7"
+                         "\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
+                         "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7"
+                         "\xe8\xe9\xea\xeb\xec\xed\xee\xef"
+                         "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"
+                         "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
+               .rlen   = 512,
+
        }
 };
 
index a60043b..1d2ebc7 100644 (file)
@@ -210,3 +210,15 @@ config HW_RANDOM_PICOXCELL
          module will be called picoxcell-rng.
 
          If unsure, say Y.
+
+config HW_RANDOM_PPC4XX
+       tristate "PowerPC 4xx generic true random number generator support"
+       depends on HW_RANDOM && PPC && 4xx
+       ---help---
+        This driver provides the kernel-side support for the TRNG hardware
+        found in the security function of some PowerPC 4xx SoCs.
+
+        To compile this driver as a module, choose M here: the
+        module will be called ppc4xx-rng.
+
+        If unsure, say N.
index 3db4eb8..c88f244 100644 (file)
@@ -20,3 +20,4 @@ obj-$(CONFIG_HW_RANDOM_MXC_RNGA) += mxc-rnga.o
 obj-$(CONFIG_HW_RANDOM_OCTEON) += octeon-rng.o
 obj-$(CONFIG_HW_RANDOM_NOMADIK) += nomadik-rng.o
 obj-$(CONFIG_HW_RANDOM_PICOXCELL) += picoxcell-rng.o
+obj-$(CONFIG_HW_RANDOM_PPC4XX) += ppc4xx-rng.o
index dd1d143..52e08ca 100644 (file)
@@ -55,7 +55,7 @@ static int nmk_rng_probe(struct amba_device *dev, const struct amba_id *id)
 
        ret = amba_request_regions(dev, dev->dev.init_name);
        if (ret)
-               return ret;
+               goto out_clk;
        ret = -ENOMEM;
        base = ioremap(dev->res.start, resource_size(&dev->res));
        if (!base)
@@ -70,6 +70,7 @@ out_unmap:
        iounmap(base);
 out_release:
        amba_release_regions(dev);
+out_clk:
        clk_disable(rng_clk);
        clk_put(rng_clk);
        return ret;
index 2cc755a..b757fac 100644 (file)
@@ -113,8 +113,10 @@ static int __devinit omap_rng_probe(struct platform_device *pdev)
 
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 
-       if (!res)
-               return -ENOENT;
+       if (!res) {
+               ret = -ENOENT;
+               goto err_region;
+       }
 
        if (!request_mem_region(res->start, resource_size(res), pdev->name)) {
                ret = -EBUSY;
diff --git a/drivers/char/hw_random/ppc4xx-rng.c b/drivers/char/hw_random/ppc4xx-rng.c
new file mode 100644 (file)
index 0000000..b8afa6a
--- /dev/null
@@ -0,0 +1,156 @@
+/*
+ * Generic PowerPC 44x RNG driver
+ *
+ * Copyright 2011 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+#include <linux/hw_random.h>
+#include <linux/delay.h>
+#include <linux/of_platform.h>
+#include <asm/io.h>
+
+#define PPC4XX_TRNG_DEV_CTRL 0x60080
+
+#define PPC4XX_TRNGE 0x00020000
+#define PPC4XX_TRNG_CTRL 0x0008
+#define PPC4XX_TRNG_CTRL_DALM 0x20
+#define PPC4XX_TRNG_STAT 0x0004
+#define PPC4XX_TRNG_STAT_B 0x1
+#define PPC4XX_TRNG_DATA 0x0000
+
+#define MODULE_NAME "ppc4xx_rng"
+
+/*
+ * hwrng ->data_present() callback.
+ *
+ * Polls the busy bit (PPC4XX_TRNG_STAT_B) of the TRNG status register and
+ * reports data as present only once that bit is clear.
+ *
+ * @rng:  hwrng instance; ->priv holds the mapped TRNG register base
+ * @wait: non-zero to retry up to 20 times with a 10us delay between
+ *        attempts; zero to sample the status register exactly once
+ *
+ * Returns 1 if the TRNG is not busy, 0 otherwise.
+ */
+static int ppc4xx_rng_data_present(struct hwrng *rng, int wait)
+{
+       void __iomem *rng_regs = (void __iomem *) rng->priv;
+       int busy, i, present = 0;
+
+       for (i = 0; i < 20; i++) {
+               busy = (in_le32(rng_regs + PPC4XX_TRNG_STAT) & PPC4XX_TRNG_STAT_B);
+               if (!busy) {
+                       present = 1;
+                       break;
+               }
+               /*
+                * A non-blocking caller must get an honest "not ready"
+                * instead of being told to read while the unit is busy.
+                */
+               if (!wait)
+                       break;
+               udelay(10);
+       }
+       return present;
+}
+
+/*
+ * hwrng ->data_read() callback: fetch one 32-bit word from the TRNG data
+ * register into *data and return the number of bytes delivered (4).
+ */
+static int ppc4xx_rng_data_read(struct hwrng *rng, u32 *data)
+{
+       void __iomem *base = (void __iomem *) rng->priv;
+       u32 word = in_le32(base + PPC4XX_TRNG_DATA);
+
+       *data = word;
+
+       return sizeof(word);
+}
+
+/*
+ * Set (enable != 0) or clear (enable == 0) the PPC4XX_TRNGE bit in the
+ * device control register of the "amcc,ppc4xx-crypto" node.
+ *
+ * The crypto node is looked up and mapped only for the duration of the
+ * read-modify-write.  Returns 0 on success, -ENODEV if the node cannot be
+ * found or mapped.
+ */
+static int ppc4xx_rng_enable(int enable)
+{
+       struct device_node *crypto_np;
+       void __iomem *crypto_base;
+       u32 dev_ctrl;
+       int rc = -ENODEV;
+
+       crypto_np = of_find_compatible_node(NULL, NULL, "amcc,ppc4xx-crypto");
+       if (!crypto_np)
+               return rc;
+
+       crypto_base = of_iomap(crypto_np, 0);
+       if (crypto_base) {
+               dev_ctrl = in_le32(crypto_base + PPC4XX_TRNG_DEV_CTRL);
+               dev_ctrl = enable ? (dev_ctrl | PPC4XX_TRNGE) :
+                                   (dev_ctrl & ~PPC4XX_TRNGE);
+               out_le32(crypto_base + PPC4XX_TRNG_DEV_CTRL, dev_ctrl);
+
+               iounmap(crypto_base);
+               rc = 0;
+       }
+
+       /* Drop the reference taken by of_find_compatible_node() */
+       of_node_put(crypto_np);
+
+       return rc;
+}
+
+/* The single hwrng instance; ->priv is set to the register base in probe */
+static struct hwrng ppc4xx_rng = {
+       .data_read = ppc4xx_rng_data_read,
+       .data_present = ppc4xx_rng_data_present,
+       .name = MODULE_NAME,
+};
+
+/*
+ * Map the TRNG registers, switch the TRNG on in the crypto core, program
+ * the TRNG control register and register with the hw_random core.
+ *
+ * Every failure path undoes what was set up before it, so a failed probe
+ * leaves neither an I/O mapping nor an enabled TRNG behind.
+ *
+ * Returns 0 on success, -ENODEV if the registers cannot be mapped, or the
+ * error from ppc4xx_rng_enable() / hwrng_register().
+ */
+static int __devinit ppc4xx_rng_probe(struct platform_device *dev)
+{
+       void __iomem *rng_regs;
+       int err = 0;
+
+       rng_regs = of_iomap(dev->dev.of_node, 0);
+       if (!rng_regs)
+               return -ENODEV;
+
+       err = ppc4xx_rng_enable(1);
+       if (err)
+               goto out_unmap;
+
+       out_le32(rng_regs + PPC4XX_TRNG_CTRL, PPC4XX_TRNG_CTRL_DALM);
+       ppc4xx_rng.priv = (unsigned long) rng_regs;
+
+       err = hwrng_register(&ppc4xx_rng);
+       if (err)
+               goto out_disable;
+
+       return 0;
+
+out_disable:
+       /* Do not leave a stale register pointer or a running TRNG behind */
+       ppc4xx_rng.priv = 0;
+       ppc4xx_rng_enable(0);
+out_unmap:
+       iounmap(rng_regs);
+
+       return err;
+}
+
+/*
+ * Tear down in reverse order of probe: unregister from the hw_random core,
+ * switch the TRNG off, then release the register mapping.
+ */
+static int __devexit ppc4xx_rng_remove(struct platform_device *dev)
+{
+       /* Fetch the mapping before anything is torn down */
+       void __iomem *base = (void __iomem *) ppc4xx_rng.priv;
+
+       hwrng_unregister(&ppc4xx_rng);
+
+       ppc4xx_rng_enable(0);
+
+       iounmap(base);
+
+       return 0;
+}
+
+/* Device-tree "compatible" strings this driver binds to */
+static struct of_device_id ppc4xx_rng_match[] = {
+       { .compatible = "ppc4xx-rng" },
+       { .compatible = "amcc,ppc460ex-rng" },
+       { .compatible = "amcc,ppc440epx-rng" },
+       { /* sentinel */ }
+};
+
+/*
+ * Platform driver glue.  ppc4xx_rng_remove() is marked __devexit, so it is
+ * referenced through __devexit_p() rather than by its bare name.
+ */
+static struct platform_driver ppc4xx_rng_driver = {
+       .driver = {
+               .name = MODULE_NAME,
+               .owner = THIS_MODULE,
+               .of_match_table = ppc4xx_rng_match,
+       },
+       .probe = ppc4xx_rng_probe,
+       .remove = __devexit_p(ppc4xx_rng_remove),
+};
+
+/* Module entry point: register the platform driver */
+static int __init ppc4xx_rng_init(void)
+{
+       int rc = platform_driver_register(&ppc4xx_rng_driver);
+
+       return rc;
+}
+module_init(ppc4xx_rng_init);
+
+/* Module exit point: unregister the platform driver */
+static void __exit ppc4xx_rng_exit(void)
+{
+       platform_driver_unregister(&ppc4xx_rng_driver);
+
+       return;
+}
+module_exit(ppc4xx_rng_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Josh Boyer <jwboyer@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("HW RNG driver for PPC 4xx processors");
index a94e930..a8428e6 100644 (file)
@@ -100,8 +100,7 @@ static int __devinit timeriomem_rng_probe(struct platform_device *pdev)
 
        timeriomem_rng_data = pdev->dev.platform_data;
 
-       timeriomem_rng_data->address = ioremap(res->start,
-                                               res->end - res->start + 1);
+       timeriomem_rng_data->address = ioremap(res->start, resource_size(res));
        if (!timeriomem_rng_data->address)
                return -EIO;
 
index 1891252..1d103f9 100644 (file)
@@ -51,6 +51,7 @@ static void crypto4xx_hw_init(struct crypto4xx_device *dev)
        union ce_io_threshold io_threshold;
        u32 rand_num;
        union ce_pe_dma_cfg pe_dma_cfg;
+       u32 device_ctrl;
 
        writel(PPC4XX_BYTE_ORDER, dev->ce_base + CRYPTO4XX_BYTE_ORDER_CFG);
        /* setup pe dma, include reset sg, pdr and pe, then release reset */
@@ -84,7 +85,9 @@ static void crypto4xx_hw_init(struct crypto4xx_device *dev)
        writel(ring_size.w, dev->ce_base + CRYPTO4XX_RING_SIZE);
        ring_ctrl.w = 0;
        writel(ring_ctrl.w, dev->ce_base + CRYPTO4XX_RING_CTRL);
-       writel(PPC4XX_DC_3DES_EN, dev->ce_base + CRYPTO4XX_DEVICE_CTRL);
+       device_ctrl = readl(dev->ce_base + CRYPTO4XX_DEVICE_CTRL);
+       device_ctrl |= PPC4XX_DC_3DES_EN;
+       writel(device_ctrl, dev->ce_base + CRYPTO4XX_DEVICE_CTRL);
        writel(dev->gdr_pa, dev->ce_base + CRYPTO4XX_GATH_RING_BASE);
        writel(dev->sdr_pa, dev->ce_base + CRYPTO4XX_SCAT_RING_BASE);
        part_ring_size.w = 0;
index 676d957..4159265 100644 (file)
 #define CAAM_MAX_IV_LENGTH             16
 
 /* length of descriptors text */
-#define DESC_AEAD_SHARED_TEXT_LEN      4
-#define DESC_AEAD_ENCRYPT_TEXT_LEN     21
-#define DESC_AEAD_DECRYPT_TEXT_LEN     24
-#define DESC_AEAD_GIVENCRYPT_TEXT_LEN  27
+#define DESC_JOB_IO_LEN                        (CAAM_CMD_SZ * 3 + CAAM_PTR_SZ * 3) /* 3 commands + 3 pointers; added to each shared descriptor length in the fit checks */
+
+#define DESC_AEAD_BASE                 (4 * CAAM_CMD_SZ) /* common part of every AEAD length below */
+#define DESC_AEAD_ENC_LEN              (DESC_AEAD_BASE + 16 * CAAM_CMD_SZ)
+#define DESC_AEAD_DEC_LEN              (DESC_AEAD_BASE + 21 * CAAM_CMD_SZ)
+#define DESC_AEAD_GIVENC_LEN           (DESC_AEAD_ENC_LEN + 7 * CAAM_CMD_SZ) /* largest of the three AEAD lengths */
+
+#define DESC_ABLKCIPHER_BASE           (3 * CAAM_CMD_SZ) /* common part of both ablkcipher lengths below */
+#define DESC_ABLKCIPHER_ENC_LEN                (DESC_ABLKCIPHER_BASE + \
+                                        20 * CAAM_CMD_SZ)
+#define DESC_ABLKCIPHER_DEC_LEN                (DESC_ABLKCIPHER_BASE + \
+                                        15 * CAAM_CMD_SZ)
+
+#define DESC_MAX_USED_BYTES            (DESC_AEAD_GIVENC_LEN + \
+                                        CAAM_MAX_KEY_SIZE) /* largest length above plus a maximum-size key */
+#define DESC_MAX_USED_LEN              (DESC_MAX_USED_BYTES / CAAM_CMD_SZ) /* same, in CAAM_CMD_SZ units; sizes the sh_desc_* arrays in struct caam_ctx */
 
 #ifdef DEBUG
 /* for print_hex_dumps with line references */
 #define debug(format, arg...)
 #endif
 
+/*
+ * Append the class 1 decrypt operation in two variants and let a
+ * conditional jump choose between them: when the JUMP_COND_SHRD test
+ * succeeds the variant with OP_ALG_AAI_DK set is reached, otherwise the
+ * plain decrypt operation is used and the DK variant is jumped over.
+ */
+static inline void append_dec_op1(u32 *desc, u32 type)
+{
+       const u32 dec_op = type | OP_ALG_AS_INITFINAL | OP_ALG_DECRYPT;
+       u32 *to_dk_variant, *past_dk_variant;
+
+       to_dk_variant = append_jump(desc, JUMP_TEST_ALL | JUMP_COND_SHRD);
+
+       /* Not shared: plain decrypt, then skip the DK variant */
+       append_operation(desc, dec_op);
+       past_dk_variant = append_jump(desc, JUMP_TEST_ALL);
+
+       /* Shared: decrypt with the DK bit set */
+       set_jump_tgt_here(desc, to_dk_variant);
+       append_operation(desc, dec_op | OP_ALG_AAI_DK);
+
+       set_jump_tgt_here(desc, past_dk_variant);
+}
+
+/*
+ * Finish a decrypt shared descriptor: first a class 1 jump whose target is
+ * the very next command (i.e. wait for class 1 key loading to complete),
+ * then the load command that allows errors to propagate.
+ */
+static inline void append_dec_shr_done(u32 *desc)
+{
+       u32 *wait_class1 = append_jump(desc, JUMP_CLASS_CLASS1 | JUMP_TEST_ALL);
+
+       set_jump_tgt_here(desc, wait_class1);
+
+       append_cmd(desc, SET_OK_PROP_ERRORS | CMD_LOAD);
+}
+
+/*
+ * AEAD payload transfer: one variable-length sequence FIFO load feeding
+ * both classes (data comes from req->src) followed by a variable-length
+ * sequence FIFO store of the message data (goes to req->dst).
+ * @msg_type selects the FIFO load data type.
+ */
+static inline void aead_append_src_dst(u32 *desc, u32 msg_type)
+{
+       const u32 load_opts = FIFOLD_CLASS_BOTH | KEY_VLF | msg_type |
+                             FIFOLD_TYPE_LASTBOTH;
+       const u32 store_opts = FIFOST_TYPE_MESSAGE_DATA | KEY_VLF;
+
+       append_seq_fifo_load(desc, 0, load_opts);
+       append_seq_fifo_store(desc, 0, store_opts);
+}
+
+/*
+ * AEAD encrypt/decrypt IV handling: sequence-load @ivsize bytes into the
+ * class 1 context, then move them from there to the class 2 input FIFO so
+ * that both classes see the IV.
+ */
+static inline void aead_append_ld_iv(u32 *desc, int ivsize)
+{
+       const u32 load_iv = CMD_SEQ_LOAD | LDST_SRCDST_BYTE_CONTEXT |
+                           LDST_CLASS_1_CCB | ivsize;
+       const u32 copy_iv = MOVE_SRC_CLASS1CTX | MOVE_DEST_CLASS2INFIFO |
+                           ivsize;
+
+       append_cmd(desc, load_iv);
+       append_move(desc, copy_iv);
+}
+
+/*
+ * For ablkcipher encrypt and decrypt, read from req->src and
+ * write to req->dst.
+ *
+ * Both variable sequence lengths are set to SEQINLEN + REG0 before the
+ * class 1 FIFO load and the message-data FIFO store are appended.
+ *
+ * This is a regular function, so its statements carry no macro-style
+ * line continuations.
+ */
+static inline void ablkcipher_append_src_dst(u32 *desc)
+{
+       append_math_add(desc, VARSEQOUTLEN, SEQINLEN, REG0, CAAM_CMD_SZ);
+       append_math_add(desc, VARSEQINLEN, SEQINLEN, REG0, CAAM_CMD_SZ);
+       append_seq_fifo_load(desc, 0, FIFOLD_CLASS_CLASS1 |
+                            KEY_VLF | FIFOLD_TYPE_MSG | FIFOLD_TYPE_LAST1);
+       append_seq_fifo_store(desc, 0, FIFOST_TYPE_MESSAGE_DATA | KEY_VLF);
+}
+
+/*
+ * If all data, including src (with assoc and iv) or dst (with iv only) are
+ * contiguous
+ */
+#define GIV_SRC_CONTIG         1
+#define GIV_DST_CONTIG         (1 << 1)
+
 /*
  * per-session context
  */
 struct caam_ctx {
        struct device *jrdev;
-       u32 *sh_desc;
-       dma_addr_t shared_desc_phys;
+       u32 sh_desc_enc[DESC_MAX_USED_LEN]; /* encrypt shared descriptor, built by aead_set_sh_desc() */
+       u32 sh_desc_dec[DESC_MAX_USED_LEN]; /* decrypt shared descriptor */
+       u32 sh_desc_givenc[DESC_MAX_USED_LEN]; /* givencrypt (IV-generating encrypt) shared descriptor */
+       dma_addr_t sh_desc_enc_dma; /* DMA_TO_DEVICE mapping of sh_desc_enc */
+       dma_addr_t sh_desc_dec_dma; /* DMA_TO_DEVICE mapping of sh_desc_dec */
+       dma_addr_t sh_desc_givenc_dma; /* DMA_TO_DEVICE mapping of sh_desc_givenc */
        u32 class1_alg_type;
        u32 class2_alg_type;
        u32 alg_op;
-       u8 *key;
-       dma_addr_t key_phys;
+       u8 key[CAAM_MAX_KEY_SIZE]; /* split key (split_key_pad_len bytes) followed by the class 1 key (enckeylen bytes) */
+       dma_addr_t key_dma; /* bus address of key[], used when the keys are not inlined */
        unsigned int enckeylen;
        unsigned int split_key_len;
        unsigned int split_key_pad_len;
        unsigned int authsize;
 };
 
-static int aead_authenc_setauthsize(struct crypto_aead *authenc,
+/*
+ * Append the two key commands of an AEAD shared descriptor: the class 2
+ * split key first, then the class 1 key that follows it at offset
+ * split_key_pad_len.  With @keys_fit_inline set the key bytes are embedded
+ * as immediate data from ctx->key; otherwise the commands reference the
+ * keys through ctx->key_dma.
+ */
+static void append_key_aead(u32 *desc, struct caam_ctx *ctx,
+                           int keys_fit_inline)
+{
+       const u32 split_key_opts = CLASS_2 | KEY_DEST_MDHA_SPLIT | KEY_ENC;
+       const u32 enc_key_opts = CLASS_1 | KEY_DEST_CLASS_REG;
+
+       if (!keys_fit_inline) {
+               append_key(desc, ctx->key_dma, ctx->split_key_len,
+                          split_key_opts);
+               append_key(desc, ctx->key_dma + ctx->split_key_pad_len,
+                          ctx->enckeylen, enc_key_opts);
+               return;
+       }
+
+       append_key_as_imm(desc, ctx->key, ctx->split_key_pad_len,
+                         ctx->split_key_len, split_key_opts);
+       append_key_as_imm(desc, (void *)ctx->key + ctx->split_key_pad_len,
+                         ctx->enckeylen, ctx->enckeylen, enc_key_opts);
+}
+
+/*
+ * Start an AEAD shared descriptor: header (HDR_SHARE_WAIT), the key
+ * commands guarded by a jump that skips them when the descriptor is
+ * already shared, and the load command enabling error propagation.
+ */
+static void init_sh_desc_key_aead(u32 *desc, struct caam_ctx *ctx,
+                                 int keys_fit_inline)
+{
+       u32 *skip_keys;
+
+       init_sh_desc(desc, HDR_SHARE_WAIT);
+
+       /* Key commands are jumped over if already shared */
+       skip_keys = append_jump(desc,
+                               JUMP_JSL | JUMP_TEST_ALL | JUMP_COND_SHRD);
+       append_key_aead(desc, ctx, keys_fit_inline);
+       set_jump_tgt_here(desc, skip_keys);
+
+       /* Propagate errors from shared to job descriptor */
+       append_cmd(desc, SET_OK_PROP_ERRORS | CMD_LOAD);
+}
+
+static int aead_set_sh_desc(struct crypto_aead *aead)
+{
+       struct aead_tfm *tfm = &aead->base.crt_aead;
+       struct caam_ctx *ctx = crypto_aead_ctx(aead);
+       struct device *jrdev = ctx->jrdev;
+       bool keys_fit_inline = 0;
+       u32 *key_jump_cmd, *jump_cmd;
+       u32 geniv, moveiv;
+       u32 *desc;
+
+       if (!ctx->enckeylen || !ctx->authsize)
+               return 0;
+
+       /*
+        * Job Descriptor and Shared Descriptors
+        * must all fit into the 64-word Descriptor h/w Buffer
+        */
+       if (DESC_AEAD_ENC_LEN + DESC_JOB_IO_LEN +
+           ctx->split_key_pad_len + ctx->enckeylen <=
+           CAAM_DESC_BYTES_MAX)
+               keys_fit_inline = 1;
+
+       /* aead_encrypt shared descriptor */
+       desc = ctx->sh_desc_enc;
+
+       init_sh_desc_key_aead(desc, ctx, keys_fit_inline);
+
+       /* Class 2 operation */
+       append_operation(desc, ctx->class2_alg_type |
+                        OP_ALG_AS_INITFINAL | OP_ALG_ENCRYPT);
+
+       /* cryptlen = seqoutlen - authsize */
+       append_math_sub_imm_u32(desc, REG3, SEQOUTLEN, IMM, ctx->authsize);
+
+       /* assoclen + cryptlen = seqinlen - ivsize */
+       append_math_sub_imm_u32(desc, REG2, SEQINLEN, IMM, tfm->ivsize);
+
+       /* assoclen + cryptlen = (assoclen + cryptlen) - cryptlen */
+       append_math_sub(desc, VARSEQINLEN, REG2, REG3, CAAM_CMD_SZ);
+
+       /* read assoc before reading payload */
+       append_seq_fifo_load(desc, 0, FIFOLD_CLASS_CLASS2 | FIFOLD_TYPE_MSG |
+                            KEY_VLF);
+       aead_append_ld_iv(desc, tfm->ivsize);
+
+       /* Class 1 operation */
+       append_operation(desc, ctx->class1_alg_type |
+                        OP_ALG_AS_INITFINAL | OP_ALG_ENCRYPT);
+
+       /* Read and write cryptlen bytes */
+       append_math_add(desc, VARSEQINLEN, ZERO, REG3, CAAM_CMD_SZ);
+       append_math_add(desc, VARSEQOUTLEN, ZERO, REG3, CAAM_CMD_SZ);
+       aead_append_src_dst(desc, FIFOLD_TYPE_MSG1OUT2);
+
+       /* Write ICV */
+       append_seq_store(desc, ctx->authsize, LDST_CLASS_2_CCB |
+                        LDST_SRCDST_BYTE_CONTEXT);
+
+       ctx->sh_desc_enc_dma = dma_map_single(jrdev, desc,
+                                             desc_bytes(desc),
+                                             DMA_TO_DEVICE);
+       if (dma_mapping_error(jrdev, ctx->sh_desc_enc_dma)) {
+               dev_err(jrdev, "unable to map shared descriptor\n");
+               return -ENOMEM;
+       }
+#ifdef DEBUG
+       print_hex_dump(KERN_ERR, "aead enc shdesc@"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, desc,
+                      desc_bytes(desc), 1);
+#endif
+
+       /*
+        * Job Descriptor and Shared Descriptors
+        * must all fit into the 64-word Descriptor h/w Buffer
+        */
+       if (DESC_AEAD_DEC_LEN + DESC_JOB_IO_LEN +
+           ctx->split_key_pad_len + ctx->enckeylen <=
+           CAAM_DESC_BYTES_MAX)
+               keys_fit_inline = 1;
+
+       desc = ctx->sh_desc_dec;
+
+       /* aead_decrypt shared descriptor */
+       init_sh_desc(desc, HDR_SHARE_WAIT);
+
+       /* Skip if already shared */
+       key_jump_cmd = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL |
+                                  JUMP_COND_SHRD);
+
+       append_key_aead(desc, ctx, keys_fit_inline);
+
+       /* Only propagate error immediately if shared */
+       jump_cmd = append_jump(desc, JUMP_TEST_ALL);
+       set_jump_tgt_here(desc, key_jump_cmd);
+       append_cmd(desc, SET_OK_PROP_ERRORS | CMD_LOAD);
+       set_jump_tgt_here(desc, jump_cmd);
+
+       /* Class 2 operation */
+       append_operation(desc, ctx->class2_alg_type |
+                        OP_ALG_AS_INITFINAL | OP_ALG_DECRYPT | OP_ALG_ICV_ON);
+
+       /* assoclen + cryptlen = seqinlen - ivsize */
+       append_math_sub_imm_u32(desc, REG3, SEQINLEN, IMM,
+                               ctx->authsize + tfm->ivsize)
+       /* assoclen = (assoclen + cryptlen) - cryptlen */
+       append_math_sub(desc, REG2, SEQOUTLEN, REG0, CAAM_CMD_SZ);
+       append_math_sub(desc, VARSEQINLEN, REG3, REG2, CAAM_CMD_SZ);
+
+       /* read assoc before reading payload */
+       append_seq_fifo_load(desc, 0, FIFOLD_CLASS_CLASS2 | FIFOLD_TYPE_MSG |
+                            KEY_VLF);
+
+       aead_append_ld_iv(desc, tfm->ivsize);
+
+       append_dec_op1(desc, ctx->class1_alg_type);
+
+       /* Read and write cryptlen bytes */
+       append_math_add(desc, VARSEQINLEN, ZERO, REG2, CAAM_CMD_SZ);
+       append_math_add(desc, VARSEQOUTLEN, ZERO, REG2, CAAM_CMD_SZ);
+       aead_append_src_dst(desc, FIFOLD_TYPE_MSG);
+
+       /* Load ICV */
+       append_seq_fifo_load(desc, ctx->authsize, FIFOLD_CLASS_CLASS2 |
+                            FIFOLD_TYPE_LAST2 | FIFOLD_TYPE_ICV);
+       append_dec_shr_done(desc);
+
+       ctx->sh_desc_dec_dma = dma_map_single(jrdev, desc,
+                                             desc_bytes(desc),
+                                             DMA_TO_DEVICE);
+       if (dma_mapping_error(jrdev, ctx->sh_desc_dec_dma)) {
+               dev_err(jrdev, "unable to map shared descriptor\n");
+               return -ENOMEM;
+       }
+#ifdef DEBUG
+       print_hex_dump(KERN_ERR, "aead dec shdesc@"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, desc,
+                      desc_bytes(desc), 1);
+#endif
+
+       /*
+        * Job Descriptor and Shared Descriptors
+        * must all fit into the 64-word Descriptor h/w Buffer
+        */
+       if (DESC_AEAD_GIVENC_LEN + DESC_JOB_IO_LEN +
+           ctx->split_key_pad_len + ctx->enckeylen <=
+           CAAM_DESC_BYTES_MAX)
+               keys_fit_inline = 1;
+
+       /* aead_givencrypt shared descriptor */
+       desc = ctx->sh_desc_givenc;
+
+       init_sh_desc_key_aead(desc, ctx, keys_fit_inline);
+
+       /* Generate IV */
+       geniv = NFIFOENTRY_STYPE_PAD | NFIFOENTRY_DEST_DECO |
+               NFIFOENTRY_DTYPE_MSG | NFIFOENTRY_LC1 |
+               NFIFOENTRY_PTYPE_RND | (tfm->ivsize << NFIFOENTRY_DLEN_SHIFT);
+       append_load_imm_u32(desc, geniv, LDST_CLASS_IND_CCB |
+                           LDST_SRCDST_WORD_INFO_FIFO | LDST_IMM);
+       append_cmd(desc, CMD_LOAD | DISABLE_AUTO_INFO_FIFO);
+       append_move(desc, MOVE_SRC_INFIFO |
+                   MOVE_DEST_CLASS1CTX | (tfm->ivsize << MOVE_LEN_SHIFT));
+       append_cmd(desc, CMD_LOAD | ENABLE_AUTO_INFO_FIFO);
+
+       /* Copy IV to class 1 context */
+       append_move(desc, MOVE_SRC_CLASS1CTX |
+                   MOVE_DEST_OUTFIFO | (tfm->ivsize << MOVE_LEN_SHIFT));
+
+       /* Return to encryption */
+       append_operation(desc, ctx->class2_alg_type |
+                        OP_ALG_AS_INITFINAL | OP_ALG_ENCRYPT);
+
+       /* ivsize + cryptlen = seqoutlen - authsize */
+       append_math_sub_imm_u32(desc, REG3, SEQOUTLEN, IMM, ctx->authsize);
+
+       /* assoclen = seqinlen - (ivsize + cryptlen) */
+       append_math_sub(desc, VARSEQINLEN, SEQINLEN, REG3, CAAM_CMD_SZ);
+
+       /* read assoc before reading payload */
+       append_seq_fifo_load(desc, 0, FIFOLD_CLASS_CLASS2 | FIFOLD_TYPE_MSG |
+                            KEY_VLF);
+
+       /* Copy iv from class 1 ctx to class 2 fifo*/
+       moveiv = NFIFOENTRY_STYPE_OFIFO | NFIFOENTRY_DEST_CLASS2 |
+                NFIFOENTRY_DTYPE_MSG | (tfm->ivsize << NFIFOENTRY_DLEN_SHIFT);
+       append_load_imm_u32(desc, moveiv, LDST_CLASS_IND_CCB |
+                           LDST_SRCDST_WORD_INFO_FIFO | LDST_IMM);
+       append_load_imm_u32(desc, tfm->ivsize, LDST_CLASS_2_CCB |
+                           LDST_SRCDST_WORD_DATASZ_REG | LDST_IMM);
+
+       /* Class 1 operation */
+       append_operation(desc, ctx->class1_alg_type |
+                        OP_ALG_AS_INITFINAL | OP_ALG_ENCRYPT);
+
+       /* Will write ivsize + cryptlen */
+       append_math_add(desc, VARSEQOUTLEN, SEQINLEN, REG0, CAAM_CMD_SZ);
+
+       /* Not need to reload iv */
+       append_seq_fifo_load(desc, tfm->ivsize,
+                            FIFOLD_CLASS_SKIP);
+
+       /* Will read cryptlen */
+       append_math_add(desc, VARSEQINLEN, SEQINLEN, REG0, CAAM_CMD_SZ);
+       aead_append_src_dst(desc, FIFOLD_TYPE_MSG1OUT2);
+
+       /* Write ICV */
+       append_seq_store(desc, ctx->authsize, LDST_CLASS_2_CCB |
+                        LDST_SRCDST_BYTE_CONTEXT);
+
+       ctx->sh_desc_givenc_dma = dma_map_single(jrdev, desc,
+                                                desc_bytes(desc),
+                                                DMA_TO_DEVICE);
+       if (dma_mapping_error(jrdev, ctx->sh_desc_givenc_dma)) {
+               dev_err(jrdev, "unable to map shared descriptor\n");
+               return -ENOMEM;
+       }
+#ifdef DEBUG
+       print_hex_dump(KERN_ERR, "aead givenc shdesc@"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, desc,
+                      desc_bytes(desc), 1);
+#endif
+
+       return 0;
+}
+
+/*
+ * Record the new ICV length and rebuild the shared descriptors, which
+ * embed it.  Returns 0 on success or the error from aead_set_sh_desc().
+ */
+static int aead_setauthsize(struct crypto_aead *authenc,
                                     unsigned int authsize)
 {
         struct caam_ctx *ctx = crypto_aead_ctx(authenc);
 
         ctx->authsize = authsize;
 
-        return 0;
+       /* A descriptor that failed to map must not be reported as usable */
+       return aead_set_sh_desc(authenc);
 }
@@ -117,6 +465,7 @@ static void split_key_done(struct device *dev, u32 *desc, u32 err,
 #ifdef DEBUG
        dev_err(dev, "%s %d: err 0x%x\n", __func__, __LINE__, err);
 #endif
+
        if (err) {
                char tmp[CAAM_ERROR_STR_MAX];
 
@@ -220,73 +569,7 @@ static u32 gen_split_key(struct caam_ctx *ctx, const u8 *key_in, u32 authkeylen)
        return ret;
 }
 
-static int build_sh_desc_ipsec(struct caam_ctx *ctx)
-{
-       struct device *jrdev = ctx->jrdev;
-       u32 *sh_desc;
-       u32 *jump_cmd;
-       bool keys_fit_inline = 0;
-
-       /*
-        * largest Job Descriptor and its Shared Descriptor
-        * must both fit into the 64-word Descriptor h/w Buffer
-        */
-       if ((DESC_AEAD_GIVENCRYPT_TEXT_LEN +
-            DESC_AEAD_SHARED_TEXT_LEN) * CAAM_CMD_SZ +
-           ctx->split_key_pad_len + ctx->enckeylen <= CAAM_DESC_BYTES_MAX)
-               keys_fit_inline = 1;
-
-       /* build shared descriptor for this session */
-       sh_desc = kmalloc(CAAM_CMD_SZ * DESC_AEAD_SHARED_TEXT_LEN +
-                         (keys_fit_inline ?
-                          ctx->split_key_pad_len + ctx->enckeylen :
-                          CAAM_PTR_SZ * 2), GFP_DMA | GFP_KERNEL);
-       if (!sh_desc) {
-               dev_err(jrdev, "could not allocate shared descriptor\n");
-               return -ENOMEM;
-       }
-
-       init_sh_desc(sh_desc, HDR_SAVECTX | HDR_SHARE_SERIAL);
-
-       jump_cmd = append_jump(sh_desc, CLASS_BOTH | JUMP_TEST_ALL |
-                              JUMP_COND_SHRD | JUMP_COND_SELF);
-
-       /*
-        * process keys, starting with class 2/authentication.
-        */
-       if (keys_fit_inline) {
-               append_key_as_imm(sh_desc, ctx->key, ctx->split_key_pad_len,
-                                 ctx->split_key_len,
-                                 CLASS_2 | KEY_DEST_MDHA_SPLIT | KEY_ENC);
-
-               append_key_as_imm(sh_desc, (void *)ctx->key +
-                                 ctx->split_key_pad_len, ctx->enckeylen,
-                                 ctx->enckeylen, CLASS_1 | KEY_DEST_CLASS_REG);
-       } else {
-               append_key(sh_desc, ctx->key_phys, ctx->split_key_len, CLASS_2 |
-                          KEY_DEST_MDHA_SPLIT | KEY_ENC);
-               append_key(sh_desc, ctx->key_phys + ctx->split_key_pad_len,
-                          ctx->enckeylen, CLASS_1 | KEY_DEST_CLASS_REG);
-       }
-
-       /* update jump cmd now that we are at the jump target */
-       set_jump_tgt_here(sh_desc, jump_cmd);
-
-       ctx->shared_desc_phys = dma_map_single(jrdev, sh_desc,
-                                              desc_bytes(sh_desc),
-                                              DMA_TO_DEVICE);
-       if (dma_mapping_error(jrdev, ctx->shared_desc_phys)) {
-               dev_err(jrdev, "unable to map shared descriptor\n");
-               kfree(sh_desc);
-               return -ENOMEM;
-       }
-
-       ctx->sh_desc = sh_desc;
-
-       return 0;
-}
-
-static int aead_authenc_setkey(struct crypto_aead *aead,
+static int aead_setkey(struct crypto_aead *aead,
                               const u8 *key, unsigned int keylen)
 {
        /* Sizes for MDHA pads (*not* keys): MD5, SHA1, 224, 256, 384, 512 */
@@ -326,27 +609,19 @@ static int aead_authenc_setkey(struct crypto_aead *aead,
        print_hex_dump(KERN_ERR, "key in @"xstr(__LINE__)": ",
                       DUMP_PREFIX_ADDRESS, 16, 4, key, keylen, 1);
 #endif
-       ctx->key = kmalloc(ctx->split_key_pad_len + enckeylen,
-                          GFP_KERNEL | GFP_DMA);
-       if (!ctx->key) {
-               dev_err(jrdev, "could not allocate key output memory\n");
-               return -ENOMEM;
-       }
 
        ret = gen_split_key(ctx, key, authkeylen);
        if (ret) {
-               kfree(ctx->key);
                goto badkey;
        }
 
        /* postpend encryption key to auth split key */
        memcpy(ctx->key + ctx->split_key_pad_len, key + authkeylen, enckeylen);
 
-       ctx->key_phys = dma_map_single(jrdev, ctx->key, ctx->split_key_pad_len +
+       ctx->key_dma = dma_map_single(jrdev, ctx->key, ctx->split_key_pad_len +
                                       enckeylen, DMA_TO_DEVICE);
-       if (dma_mapping_error(jrdev, ctx->key_phys)) {
+       if (dma_mapping_error(jrdev, ctx->key_dma)) {
                dev_err(jrdev, "unable to map key i/o memory\n");
-               kfree(ctx->key);
                return -ENOMEM;
        }
 #ifdef DEBUG
@@ -357,11 +632,10 @@ static int aead_authenc_setkey(struct crypto_aead *aead,
 
        ctx->enckeylen = enckeylen;
 
-       ret = build_sh_desc_ipsec(ctx);
+       ret = aead_set_sh_desc(aead);
        if (ret) {
-               dma_unmap_single(jrdev, ctx->key_phys, ctx->split_key_pad_len +
+               dma_unmap_single(jrdev, ctx->key_dma, ctx->split_key_pad_len +
                                 enckeylen, DMA_TO_DEVICE);
-               kfree(ctx->key);
        }
 
        return ret;
@@ -370,6 +644,119 @@ badkey:
        return -EINVAL;
 }
 
+/*
+ * ablkcipher_setkey - store the cipher key and build both shared descriptors
+ * @ablkcipher: transform whose caam_ctx receives the key and descriptors
+ * @key: raw cipher key
+ * @keylen: length of @key in bytes
+ *
+ * Copies @key into ctx->key and dma maps it, then constructs the encrypt
+ * (ctx->sh_desc_enc) and decrypt (ctx->sh_desc_dec) shared descriptors,
+ * appending the key with append_key_as_imm(), and dma maps each of them.
+ *
+ * Returns 0 on success or -ENOMEM if any of the dma mappings fails.
+ */
+static int ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
+                            const u8 *key, unsigned int keylen)
+{
+       struct caam_ctx *ctx = crypto_ablkcipher_ctx(ablkcipher);
+       struct ablkcipher_tfm *tfm = &ablkcipher->base.crt_ablkcipher;
+       struct device *jrdev = ctx->jrdev;
+       int ret = 0;
+       u32 *key_jump_cmd, *jump_cmd;
+       u32 *desc;
+
+#ifdef DEBUG
+       print_hex_dump(KERN_ERR, "key in @"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, key, keylen, 1);
+#endif
+
+       memcpy(ctx->key, key, keylen);
+       ctx->key_dma = dma_map_single(jrdev, ctx->key, keylen,
+                                     DMA_TO_DEVICE);
+       if (dma_mapping_error(jrdev, ctx->key_dma)) {
+               dev_err(jrdev, "unable to map key i/o memory\n");
+               return -ENOMEM;
+       }
+       ctx->enckeylen = keylen;
+
+       /* ablkcipher_encrypt shared descriptor */
+       desc = ctx->sh_desc_enc;
+       init_sh_desc(desc, HDR_SHARE_WAIT);
+       /* Skip if already shared */
+       key_jump_cmd = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL |
+                                  JUMP_COND_SHRD);
+
+       /* Load class1 key only */
+       append_key_as_imm(desc, (void *)ctx->key, ctx->enckeylen,
+                         ctx->enckeylen, CLASS_1 |
+                         KEY_DEST_CLASS_REG);
+
+       set_jump_tgt_here(desc, key_jump_cmd);
+
+       /* Propagate errors from shared to job descriptor */
+       append_cmd(desc, SET_OK_PROP_ERRORS | CMD_LOAD);
+
+       /* Load iv */
+       append_cmd(desc, CMD_SEQ_LOAD | LDST_SRCDST_BYTE_CONTEXT |
+                  LDST_CLASS_1_CCB | tfm->ivsize);
+
+       /* Load operation */
+       append_operation(desc, ctx->class1_alg_type |
+                        OP_ALG_AS_INITFINAL | OP_ALG_ENCRYPT);
+
+       /* Perform operation */
+       ablkcipher_append_src_dst(desc);
+
+       ctx->sh_desc_enc_dma = dma_map_single(jrdev, desc,
+                                             desc_bytes(desc),
+                                             DMA_TO_DEVICE);
+       if (dma_mapping_error(jrdev, ctx->sh_desc_enc_dma)) {
+               dev_err(jrdev, "unable to map shared descriptor\n");
+               return -ENOMEM;
+       }
+#ifdef DEBUG
+       print_hex_dump(KERN_ERR, "ablkcipher enc shdesc@"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, desc,
+                      desc_bytes(desc), 1);
+#endif
+       /* ablkcipher_decrypt shared descriptor */
+       desc = ctx->sh_desc_dec;
+
+       init_sh_desc(desc, HDR_SHARE_WAIT);
+       /* Skip if already shared */
+       key_jump_cmd = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL |
+                                  JUMP_COND_SHRD);
+
+       /* Load class1 key only */
+       append_key_as_imm(desc, (void *)ctx->key, ctx->enckeylen,
+                         ctx->enckeylen, CLASS_1 |
+                         KEY_DEST_CLASS_REG);
+
+       /* For aead, only propagate error immediately if shared */
+       jump_cmd = append_jump(desc, JUMP_TEST_ALL);
+       set_jump_tgt_here(desc, key_jump_cmd);
+       append_cmd(desc, SET_OK_PROP_ERRORS | CMD_LOAD);
+       set_jump_tgt_here(desc, jump_cmd);
+
+       /* load IV */
+       append_cmd(desc, CMD_SEQ_LOAD | LDST_SRCDST_BYTE_CONTEXT |
+                  LDST_CLASS_1_CCB | tfm->ivsize);
+
+       /* Choose operation */
+       append_dec_op1(desc, ctx->class1_alg_type);
+
+       /* Perform operation */
+       ablkcipher_append_src_dst(desc);
+
+       /* Wait for key to load before allowing propagating error */
+       append_dec_shr_done(desc);
+
+       ctx->sh_desc_dec_dma = dma_map_single(jrdev, desc,
+                                             desc_bytes(desc),
+                                             DMA_TO_DEVICE);
+       /* check the handle just mapped (decrypt), not the encrypt one */
+       if (dma_mapping_error(jrdev, ctx->sh_desc_dec_dma)) {
+               dev_err(jrdev, "unable to map shared descriptor\n");
+               return -ENOMEM;
+       }
+
+#ifdef DEBUG
+       print_hex_dump(KERN_ERR, "ablkcipher dec shdesc@"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, desc,
+                      desc_bytes(desc), 1);
+#endif
+
+       return ret;
+}
+
 struct link_tbl_entry {
        u64 ptr;
        u32 len;
@@ -379,64 +766,109 @@ struct link_tbl_entry {
 };
 
 /*
- * ipsec_esp_edesc - s/w-extended ipsec_esp descriptor
+ * aead_edesc - s/w-extended aead descriptor
+ * @assoc_nents: number of segments in associated data (SPI+Seq) scatterlist
  * @src_nents: number of segments in input scatterlist
  * @dst_nents: number of segments in output scatterlist
- * @assoc_nents: number of segments in associated data (SPI+Seq) scatterlist
+ * @iv_dma: dma address of iv for checking continuity and link table
  * @desc: h/w descriptor (variable length; must not exceed MAX_CAAM_DESCSIZE)
  * @link_tbl_bytes: length of dma mapped link_tbl space
  * @link_tbl_dma: bus physical mapped address of h/w link table
  * @hw_desc: the h/w job descriptor followed by any referenced link tables
  */
-struct ipsec_esp_edesc {
+struct aead_edesc {
        int assoc_nents;
        int src_nents;
        int dst_nents;
+       dma_addr_t iv_dma;
        int link_tbl_bytes;
        dma_addr_t link_tbl_dma;
        struct link_tbl_entry *link_tbl;
        u32 hw_desc[0];
 };
 
-static void ipsec_esp_unmap(struct device *dev,
-                           struct ipsec_esp_edesc *edesc,
-                           struct aead_request *areq)
-{
-       dma_unmap_sg(dev, areq->assoc, edesc->assoc_nents, DMA_TO_DEVICE);
+/*
+ * ablkcipher_edesc - s/w-extended ablkcipher descriptor
+ * @src_nents: number of segments in input scatterlist
+ * @dst_nents: number of segments in output scatterlist
+ * @iv_dma: dma address of iv for checking continuity and link table
+ * @link_tbl_bytes: length of dma mapped link_tbl space
+ * @link_tbl_dma: bus physical mapped address of h/w link table
+ * @link_tbl: pointer to the link table entries (struct link_tbl_entry)
+ * @hw_desc: the h/w job descriptor (variable length; must not exceed
+ *           MAX_CAAM_DESCSIZE) followed by any referenced link tables
+ */
+struct ablkcipher_edesc {
+       int src_nents;
+       int dst_nents;
+       dma_addr_t iv_dma;
+       int link_tbl_bytes;
+       dma_addr_t link_tbl_dma;
+       struct link_tbl_entry *link_tbl;
+       u32 hw_desc[0];
+};
 
-       if (unlikely(areq->dst != areq->src)) {
-               dma_unmap_sg(dev, areq->src, edesc->src_nents,
-                            DMA_TO_DEVICE);
-               dma_unmap_sg(dev, areq->dst, edesc->dst_nents,
-                            DMA_FROM_DEVICE);
+static void caam_unmap(struct device *dev, struct scatterlist *src,
+                      struct scatterlist *dst, int src_nents, int dst_nents,
+                      dma_addr_t iv_dma, int ivsize, dma_addr_t link_tbl_dma,
+                      int link_tbl_bytes)
+{
+       if (unlikely(dst != src)) {
+               dma_unmap_sg(dev, src, src_nents, DMA_TO_DEVICE);
+               dma_unmap_sg(dev, dst, dst_nents, DMA_FROM_DEVICE);
        } else {
-               dma_unmap_sg(dev, areq->src, edesc->src_nents,
-                            DMA_BIDIRECTIONAL);
+               dma_unmap_sg(dev, src, src_nents, DMA_BIDIRECTIONAL);
        }
 
-       if (edesc->link_tbl_bytes)
-               dma_unmap_single(dev, edesc->link_tbl_dma,
-                                edesc->link_tbl_bytes,
+       if (iv_dma)
+               dma_unmap_single(dev, iv_dma, ivsize, DMA_TO_DEVICE);
+       if (link_tbl_bytes)
+               dma_unmap_single(dev, link_tbl_dma, link_tbl_bytes,
                                 DMA_TO_DEVICE);
 }
 
-/*
- * ipsec_esp descriptor callbacks
- */
-static void ipsec_esp_encrypt_done(struct device *jrdev, u32 *desc, u32 err,
+/*
+ * aead_unmap - release the dma mappings held for an aead request
+ * @dev: device the buffers were mapped for
+ * @edesc: extended descriptor recording segment counts and dma handles
+ * @req: aead request whose assoc/src/dst scatterlists are unmapped
+ *
+ * Unmaps the associated data here and hands src, dst, iv and the link
+ * table to caam_unmap().
+ */
+static void aead_unmap(struct device *dev,
+                      struct aead_edesc *edesc,
+                      struct aead_request *req)
+{
+       struct crypto_aead *aead = crypto_aead_reqtfm(req);
+       int ivsize = crypto_aead_ivsize(aead);
+
+       /*
+        * A segment count of 0 stands for a single contiguous buffer in
+        * this file (see the "nents ? : 1" uses), and that buffer is still
+        * dma mapped, so at least one entry has to be unmapped.
+        */
+       dma_unmap_sg(dev, req->assoc, edesc->assoc_nents ? : 1,
+                    DMA_TO_DEVICE);
+
+       caam_unmap(dev, req->src, req->dst,
+                  edesc->src_nents, edesc->dst_nents,
+                  edesc->iv_dma, ivsize, edesc->link_tbl_dma,
+                  edesc->link_tbl_bytes);
+}
+
+/*
+ * Undo the dma mappings of an ablkcipher request: everything (src/dst
+ * scatterlists, iv, link table) is delegated to caam_unmap().
+ */
+static void ablkcipher_unmap(struct device *dev,
+                            struct ablkcipher_edesc *edesc,
+                            struct ablkcipher_request *req)
+{
+       const int iv_len =
+               crypto_ablkcipher_ivsize(crypto_ablkcipher_reqtfm(req));
+
+       caam_unmap(dev, req->src, req->dst, edesc->src_nents,
+                  edesc->dst_nents, edesc->iv_dma, iv_len,
+                  edesc->link_tbl_dma, edesc->link_tbl_bytes);
+}
+
+static void aead_encrypt_done(struct device *jrdev, u32 *desc, u32 err,
                                   void *context)
 {
-       struct aead_request *areq = context;
-       struct ipsec_esp_edesc *edesc;
+       struct aead_request *req = context;
+       struct aead_edesc *edesc;
 #ifdef DEBUG
-       struct crypto_aead *aead = crypto_aead_reqtfm(areq);
-       int ivsize = crypto_aead_ivsize(aead);
+       struct crypto_aead *aead = crypto_aead_reqtfm(req);
        struct caam_ctx *ctx = crypto_aead_ctx(aead);
+       int ivsize = crypto_aead_ivsize(aead);
 
        dev_err(jrdev, "%s %d: err 0x%x\n", __func__, __LINE__, err);
 #endif
-       edesc = (struct ipsec_esp_edesc *)((char *)desc -
-                offsetof(struct ipsec_esp_edesc, hw_desc));
+
+       edesc = (struct aead_edesc *)((char *)desc -
+                offsetof(struct aead_edesc, hw_desc));
 
        if (err) {
                char tmp[CAAM_ERROR_STR_MAX];
@@ -444,39 +876,50 @@ static void ipsec_esp_encrypt_done(struct device *jrdev, u32 *desc, u32 err,
                dev_err(jrdev, "%08x: %s\n", err, caam_jr_strstatus(tmp, err));
        }
 
-       ipsec_esp_unmap(jrdev, edesc, areq);
+       aead_unmap(jrdev, edesc, req);
 
 #ifdef DEBUG
        print_hex_dump(KERN_ERR, "assoc  @"xstr(__LINE__)": ",
-                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(areq->assoc),
-                      areq->assoclen , 1);
+                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(req->assoc),
+                      req->assoclen , 1);
        print_hex_dump(KERN_ERR, "dstiv  @"xstr(__LINE__)": ",
-                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(areq->src) - ivsize,
+                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(req->src) - ivsize,
                       edesc->src_nents ? 100 : ivsize, 1);
        print_hex_dump(KERN_ERR, "dst    @"xstr(__LINE__)": ",
-                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(areq->src),
-                      edesc->src_nents ? 100 : areq->cryptlen +
+                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(req->src),
+                      edesc->src_nents ? 100 : req->cryptlen +
                       ctx->authsize + 4, 1);
 #endif
 
        kfree(edesc);
 
-       aead_request_complete(areq, err);
+       aead_request_complete(req, err);
 }
 
-static void ipsec_esp_decrypt_done(struct device *jrdev, u32 *desc, u32 err,
+static void aead_decrypt_done(struct device *jrdev, u32 *desc, u32 err,
                                   void *context)
 {
-       struct aead_request *areq = context;
-       struct ipsec_esp_edesc *edesc;
+       struct aead_request *req = context;
+       struct aead_edesc *edesc;
 #ifdef DEBUG
-       struct crypto_aead *aead = crypto_aead_reqtfm(areq);
+       struct crypto_aead *aead = crypto_aead_reqtfm(req);
        struct caam_ctx *ctx = crypto_aead_ctx(aead);
+       int ivsize = crypto_aead_ivsize(aead);
 
        dev_err(jrdev, "%s %d: err 0x%x\n", __func__, __LINE__, err);
 #endif
-       edesc = (struct ipsec_esp_edesc *)((char *)desc -
-                offsetof(struct ipsec_esp_edesc, hw_desc));
+
+       edesc = (struct aead_edesc *)((char *)desc -
+                offsetof(struct aead_edesc, hw_desc));
+
+#ifdef DEBUG
+       print_hex_dump(KERN_ERR, "dstiv  @"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, req->iv,
+                      ivsize, 1);
+       print_hex_dump(KERN_ERR, "dst    @"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(req->dst),
+                      req->cryptlen, 1);
+#endif
 
        if (err) {
                char tmp[CAAM_ERROR_STR_MAX];
@@ -484,7 +927,7 @@ static void ipsec_esp_decrypt_done(struct device *jrdev, u32 *desc, u32 err,
                dev_err(jrdev, "%08x: %s\n", err, caam_jr_strstatus(tmp, err));
        }
 
-       ipsec_esp_unmap(jrdev, edesc, areq);
+       aead_unmap(jrdev, edesc, req);
 
        /*
         * verify hw auth check passed else return -EBADMSG
@@ -495,255 +938,413 @@ static void ipsec_esp_decrypt_done(struct device *jrdev, u32 *desc, u32 err,
 #ifdef DEBUG
        print_hex_dump(KERN_ERR, "iphdrout@"xstr(__LINE__)": ",
                       DUMP_PREFIX_ADDRESS, 16, 4,
-                      ((char *)sg_virt(areq->assoc) - sizeof(struct iphdr)),
-                      sizeof(struct iphdr) + areq->assoclen +
-                      ((areq->cryptlen > 1500) ? 1500 : areq->cryptlen) +
+                      ((char *)sg_virt(req->assoc) - sizeof(struct iphdr)),
+                      sizeof(struct iphdr) + req->assoclen +
+                      ((req->cryptlen > 1500) ? 1500 : req->cryptlen) +
                       ctx->authsize + 36, 1);
        if (!err && edesc->link_tbl_bytes) {
-               struct scatterlist *sg = sg_last(areq->src, edesc->src_nents);
+               struct scatterlist *sg = sg_last(req->src, edesc->src_nents);
                print_hex_dump(KERN_ERR, "sglastout@"xstr(__LINE__)": ",
                               DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(sg),
                        sg->length + ctx->authsize + 16, 1);
        }
 #endif
+
        kfree(edesc);
 
-       aead_request_complete(areq, err);
+       aead_request_complete(req, err);
+}
+
+/*
+ * ablkcipher_encrypt_done - completion callback for ablkcipher encrypt jobs
+ * @jrdev: device passed to dev_err() for logging
+ * @desc: the completed h/w job descriptor (hw_desc member of the edesc)
+ * @err: job status; a non-zero value is logged via caam_jr_strstatus()
+ * @context: the originating struct ablkcipher_request
+ *
+ * Unmaps the request's dma resources, frees the extended descriptor and
+ * completes the request with @err.
+ */
+static void ablkcipher_encrypt_done(struct device *jrdev, u32 *desc, u32 err,
+                                  void *context)
+{
+       struct ablkcipher_request *req = context;
+       struct ablkcipher_edesc *edesc;
+#ifdef DEBUG
+       struct crypto_ablkcipher *ablkcipher = crypto_ablkcipher_reqtfm(req);
+       int ivsize = crypto_ablkcipher_ivsize(ablkcipher);
+
+       dev_err(jrdev, "%s %d: err 0x%x\n", __func__, __LINE__, err);
+#endif
+
+       /* step back from the embedded hw_desc to the enclosing edesc */
+       edesc = (struct ablkcipher_edesc *)((char *)desc -
+                offsetof(struct ablkcipher_edesc, hw_desc));
+
+       if (err) {
+               char tmp[CAAM_ERROR_STR_MAX];
+
+               dev_err(jrdev, "%08x: %s\n", err, caam_jr_strstatus(tmp, err));
+       }
+
+#ifdef DEBUG
+       /* req->info holds ivsize bytes only; never dump past it */
+       print_hex_dump(KERN_ERR, "dstiv  @"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, req->info,
+                      ivsize, 1);
+       print_hex_dump(KERN_ERR, "dst    @"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(req->src),
+                      edesc->dst_nents > 1 ? 100 : req->nbytes, 1);
+#endif
+
+       ablkcipher_unmap(jrdev, edesc, req);
+       kfree(edesc);
+
+       ablkcipher_request_complete(req, err);
+}
+
+/*
+ * Completion callback for ablkcipher decrypt jobs: log a non-zero job
+ * status, unmap the request's dma resources, free the extended descriptor
+ * and complete the request with @err.
+ */
+static void ablkcipher_decrypt_done(struct device *jrdev, u32 *desc, u32 err,
+                                   void *context)
+{
+       struct ablkcipher_request *areq = context;
+       /* desc is the hw_desc member; step back to the enclosing edesc */
+       struct ablkcipher_edesc *edesc = (struct ablkcipher_edesc *)
+               ((char *)desc - offsetof(struct ablkcipher_edesc, hw_desc));
+#ifdef DEBUG
+       int iv_len = crypto_ablkcipher_ivsize(crypto_ablkcipher_reqtfm(areq));
+
+       dev_err(jrdev, "%s %d: err 0x%x\n", __func__, __LINE__, err);
+#endif
+
+       if (err) {
+               char status[CAAM_ERROR_STR_MAX];
+
+               dev_err(jrdev, "%08x: %s\n", err,
+                       caam_jr_strstatus(status, err));
+       }
+
+#ifdef DEBUG
+       print_hex_dump(KERN_ERR, "dstiv  @"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, areq->info,
+                      iv_len, 1);
+       print_hex_dump(KERN_ERR, "dst    @"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(areq->src),
+                      edesc->dst_nents > 1 ? 100 : areq->nbytes, 1);
+#endif
+
+       ablkcipher_unmap(jrdev, edesc, areq);
+       kfree(edesc);
+
+       ablkcipher_request_complete(areq, err);
+}
+
+/*
+ * Fill in a single h/w link table entry with the given dma address,
+ * length and offset; the remaining fields are cleared.
+ */
+static void sg_to_link_tbl_one(struct link_tbl_entry *link_tbl_ptr,
+                              dma_addr_t dma, u32 len, u32 offset)
+{
+       /* caller-supplied values */
+       link_tbl_ptr->ptr = dma;
+       link_tbl_ptr->len = len;
+       link_tbl_ptr->offset = offset;
+
+       /* fields this driver leaves at zero */
+       link_tbl_ptr->buf_pool_id = 0;
+       link_tbl_ptr->reserved = 0;
+#ifdef DEBUG
+       print_hex_dump(KERN_ERR, "link_tbl_ptr@"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, link_tbl_ptr,
+                      sizeof(struct link_tbl_entry), 1);
+#endif
 }
 
 /*
  * convert scatterlist to h/w link table format
- * scatterlist must have been previously dma mapped
+ * but does not have final bit; instead, returns last entry
  */
-static void sg_to_link_tbl(struct scatterlist *sg, int sg_count,
-                          struct link_tbl_entry *link_tbl_ptr, u32 offset)
+static struct link_tbl_entry *sg_to_link_tbl(struct scatterlist *sg,
+                                            int sg_count, struct link_tbl_entry
+                                            *link_tbl_ptr, u32 offset)
 {
        while (sg_count) {
-               link_tbl_ptr->ptr = sg_dma_address(sg);
-               link_tbl_ptr->len = sg_dma_len(sg);
-               link_tbl_ptr->reserved = 0;
-               link_tbl_ptr->buf_pool_id = 0;
-               link_tbl_ptr->offset = offset;
+               sg_to_link_tbl_one(link_tbl_ptr, sg_dma_address(sg),
+                                  sg_dma_len(sg), offset);
                link_tbl_ptr++;
                sg = sg_next(sg);
                sg_count--;
        }
+       return link_tbl_ptr - 1;
+}
+
+/*
+ * Same conversion as sg_to_link_tbl(), but the last entry written is
+ * additionally flagged as the end of the link table.
+ * The scatterlist must have been previously dma mapped.
+ */
+static void sg_to_link_tbl_last(struct scatterlist *sg, int sg_count,
+                               struct link_tbl_entry *link_tbl_ptr, u32 offset)
+{
+       struct link_tbl_entry *last;
+
+       last = sg_to_link_tbl(sg, sg_count, link_tbl_ptr, offset);
+
+       /* set Final bit (marks end of link table) */
+       last->len |= 0x40000000;
+}
+
+/*
+ * Fill in aead job descriptor
+ */
+static void init_aead_job(u32 *sh_desc, dma_addr_t ptr,
+                         struct aead_edesc *edesc,
+                         struct aead_request *req,
+                         bool all_contig, bool encrypt)
+{
+       struct crypto_aead *aead = crypto_aead_reqtfm(req);
+       struct caam_ctx *ctx = crypto_aead_ctx(aead);
+       int ivsize = crypto_aead_ivsize(aead);
+       int authsize = ctx->authsize;
+       u32 *desc = edesc->hw_desc;
+       u32 out_options = 0, in_options;
+       dma_addr_t dst_dma, src_dma;
+       int len, link_tbl_index = 0;
+
+#ifdef DEBUG
+       debug("assoclen %d cryptlen %d authsize %d\n",
+             req->assoclen, req->cryptlen, authsize);
+       print_hex_dump(KERN_ERR, "assoc  @"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(req->assoc),
+                      req->assoclen , 1);
+       print_hex_dump(KERN_ERR, "presciv@"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, req->iv,
+                      edesc->src_nents ? 100 : ivsize, 1);
+       print_hex_dump(KERN_ERR, "src    @"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(req->src),
+                       edesc->src_nents ? 100 : req->cryptlen, 1);
+       print_hex_dump(KERN_ERR, "shrdesc@"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, sh_desc,
+                      desc_bytes(sh_desc), 1);
+#endif
+
+       len = desc_len(sh_desc);
+       init_job_desc_shared(desc, ptr, len, HDR_SHARE_DEFER | HDR_REVERSE);
+
+       if (all_contig) {
+               src_dma = sg_dma_address(req->assoc);
+               in_options = 0;
+       } else {
+               src_dma = edesc->link_tbl_dma;
+               link_tbl_index += (edesc->assoc_nents ? : 1) + 1 +
+                                 (edesc->src_nents ? : 1);
+               in_options = LDST_SGF;
+       }
+       if (encrypt)
+               append_seq_in_ptr(desc, src_dma, req->assoclen + ivsize +
+                                 req->cryptlen - authsize, in_options);
+       else
+               append_seq_in_ptr(desc, src_dma, req->assoclen + ivsize +
+                                 req->cryptlen, in_options);
 
-       /* set Final bit (marks end of link table) */
-       link_tbl_ptr--;
-       link_tbl_ptr->len |= 0x40000000;
+       if (likely(req->src == req->dst)) {
+               if (all_contig) {
+                       dst_dma = sg_dma_address(req->src);
+               } else {
+                       dst_dma = src_dma + sizeof(struct link_tbl_entry) *
+                                 ((edesc->assoc_nents ? : 1) + 1);
+                       out_options = LDST_SGF;
+               }
+       } else {
+               if (!edesc->dst_nents) {
+                       dst_dma = sg_dma_address(req->dst);
+               } else {
+                       dst_dma = edesc->link_tbl_dma +
+                                 link_tbl_index *
+                                 sizeof(struct link_tbl_entry);
+                       out_options = LDST_SGF;
+               }
+       }
+       if (encrypt)
+               append_seq_out_ptr(desc, dst_dma, req->cryptlen, out_options);
+       else
+               append_seq_out_ptr(desc, dst_dma, req->cryptlen - authsize,
+                                  out_options);
 }
 
 /*
- * fill in and submit ipsec_esp job descriptor
+ * Fill in aead givencrypt job descriptor
  */
-static int ipsec_esp(struct ipsec_esp_edesc *edesc, struct aead_request *areq,
-                    u32 encrypt,
-                    void (*callback) (struct device *dev, u32 *desc,
-                                      u32 err, void *context))
+static void init_aead_giv_job(u32 *sh_desc, dma_addr_t ptr,
+                             struct aead_edesc *edesc,
+                             struct aead_request *req,
+                             int contig)
 {
-       struct crypto_aead *aead = crypto_aead_reqtfm(areq);
+       struct crypto_aead *aead = crypto_aead_reqtfm(req);
        struct caam_ctx *ctx = crypto_aead_ctx(aead);
-       struct device *jrdev = ctx->jrdev;
-       u32 *desc = edesc->hw_desc, options;
-       int ret, sg_count, assoc_sg_count;
        int ivsize = crypto_aead_ivsize(aead);
        int authsize = ctx->authsize;
-       dma_addr_t ptr, dst_dma, src_dma;
-#ifdef DEBUG
-       u32 *sh_desc = ctx->sh_desc;
+       u32 *desc = edesc->hw_desc;
+       u32 out_options = 0, in_options;
+       dma_addr_t dst_dma, src_dma;
+       int len, link_tbl_index = 0;
 
+#ifdef DEBUG
        debug("assoclen %d cryptlen %d authsize %d\n",
-             areq->assoclen, areq->cryptlen, authsize);
+             req->assoclen, req->cryptlen, authsize);
        print_hex_dump(KERN_ERR, "assoc  @"xstr(__LINE__)": ",
-                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(areq->assoc),
-                      areq->assoclen , 1);
+                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(req->assoc),
+                      req->assoclen , 1);
        print_hex_dump(KERN_ERR, "presciv@"xstr(__LINE__)": ",
-                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(areq->src) - ivsize,
-                      edesc->src_nents ? 100 : ivsize, 1);
+                      DUMP_PREFIX_ADDRESS, 16, 4, req->iv, ivsize, 1);
        print_hex_dump(KERN_ERR, "src    @"xstr(__LINE__)": ",
-                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(areq->src),
-                       edesc->src_nents ? 100 : areq->cryptlen + authsize, 1);
+                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(req->src),
+                       edesc->src_nents > 1 ? 100 : req->cryptlen, 1);
        print_hex_dump(KERN_ERR, "shrdesc@"xstr(__LINE__)": ",
                       DUMP_PREFIX_ADDRESS, 16, 4, sh_desc,
                       desc_bytes(sh_desc), 1);
 #endif
-       assoc_sg_count = dma_map_sg(jrdev, areq->assoc, edesc->assoc_nents ?: 1,
-                                   DMA_TO_DEVICE);
-       if (areq->src == areq->dst)
-               sg_count = dma_map_sg(jrdev, areq->src, edesc->src_nents ? : 1,
-                                     DMA_BIDIRECTIONAL);
-       else
-               sg_count = dma_map_sg(jrdev, areq->src, edesc->src_nents ? : 1,
-                                     DMA_TO_DEVICE);
 
-       /* start auth operation */
-       append_operation(desc, ctx->class2_alg_type | OP_ALG_AS_INITFINAL |
-                        (encrypt ? : OP_ALG_ICV_ON));
+       len = desc_len(sh_desc);
+       init_job_desc_shared(desc, ptr, len, HDR_SHARE_DEFER | HDR_REVERSE);
 
-       /* Load FIFO with data for Class 2 CHA */
-       options = FIFOLD_CLASS_CLASS2 | FIFOLD_TYPE_MSG;
-       if (!edesc->assoc_nents) {
-               ptr = sg_dma_address(areq->assoc);
+       if (contig & GIV_SRC_CONTIG) {
+               src_dma = sg_dma_address(req->assoc);
+               in_options = 0;
        } else {
-               sg_to_link_tbl(areq->assoc, edesc->assoc_nents,
-                              edesc->link_tbl, 0);
-               ptr = edesc->link_tbl_dma;
-               options |= LDST_SGF;
+               src_dma = edesc->link_tbl_dma;
+               link_tbl_index += edesc->assoc_nents + 1 + edesc->src_nents;
+               in_options = LDST_SGF;
        }
-       append_fifo_load(desc, ptr, areq->assoclen, options);
-
-       /* copy iv from cipher/class1 input context to class2 infifo */
-       append_move(desc, MOVE_SRC_CLASS1CTX | MOVE_DEST_CLASS2INFIFO | ivsize);
-
-       if (!encrypt) {
-               u32 *jump_cmd, *uncond_jump_cmd;
-
-               /* JUMP if shared */
-               jump_cmd = append_jump(desc, JUMP_TEST_ALL | JUMP_COND_SHRD);
+       append_seq_in_ptr(desc, src_dma, req->assoclen + ivsize +
+                         req->cryptlen - authsize, in_options);
 
-               /* start class 1 (cipher) operation, non-shared version */
-               append_operation(desc, ctx->class1_alg_type |
-                                OP_ALG_AS_INITFINAL);
-
-               uncond_jump_cmd = append_jump(desc, 0);
-
-               set_jump_tgt_here(desc, jump_cmd);
-
-               /* start class 1 (cipher) operation, shared version */
-               append_operation(desc, ctx->class1_alg_type |
-                                OP_ALG_AS_INITFINAL | OP_ALG_AAI_DK);
-               set_jump_tgt_here(desc, uncond_jump_cmd);
-       } else
-               append_operation(desc, ctx->class1_alg_type |
-                                OP_ALG_AS_INITFINAL | encrypt);
-
-       /* load payload & instruct to class2 to snoop class 1 if encrypting */
-       options = 0;
-       if (!edesc->src_nents) {
-               src_dma = sg_dma_address(areq->src);
+       if (contig & GIV_DST_CONTIG) {
+               dst_dma = edesc->iv_dma;
        } else {
-               sg_to_link_tbl(areq->src, edesc->src_nents, edesc->link_tbl +
-                              edesc->assoc_nents, 0);
-               src_dma = edesc->link_tbl_dma + edesc->assoc_nents *
-                         sizeof(struct link_tbl_entry);
-               options |= LDST_SGF;
-       }
-       append_seq_in_ptr(desc, src_dma, areq->cryptlen + authsize, options);
-       append_seq_fifo_load(desc, areq->cryptlen, FIFOLD_CLASS_BOTH |
-                            FIFOLD_TYPE_LASTBOTH |
-                            (encrypt ? FIFOLD_TYPE_MSG1OUT2
-                                     : FIFOLD_TYPE_MSG));
-
-       /* specify destination */
-       if (areq->src == areq->dst) {
-               dst_dma = src_dma;
-       } else {
-               sg_count = dma_map_sg(jrdev, areq->dst, edesc->dst_nents ? : 1,
-                                     DMA_FROM_DEVICE);
-               if (!edesc->dst_nents) {
-                       dst_dma = sg_dma_address(areq->dst);
-                       options = 0;
+               if (likely(req->src == req->dst)) {
+                       dst_dma = src_dma + sizeof(struct link_tbl_entry) *
+                                 edesc->assoc_nents;
+                       out_options = LDST_SGF;
                } else {
-                       sg_to_link_tbl(areq->dst, edesc->dst_nents,
-                                      edesc->link_tbl + edesc->assoc_nents +
-                                      edesc->src_nents, 0);
-                       dst_dma = edesc->link_tbl_dma + (edesc->assoc_nents +
-                                 edesc->src_nents) *
+                       dst_dma = edesc->link_tbl_dma +
+                                 link_tbl_index *
                                  sizeof(struct link_tbl_entry);
-                       options = LDST_SGF;
+                       out_options = LDST_SGF;
                }
        }
-       append_seq_out_ptr(desc, dst_dma, areq->cryptlen + authsize, options);
-       append_seq_fifo_store(desc, areq->cryptlen, FIFOST_TYPE_MESSAGE_DATA);
 
-       /* ICV */
-       if (encrypt)
-               append_seq_store(desc, authsize, LDST_CLASS_2_CCB |
-                                LDST_SRCDST_BYTE_CONTEXT);
-       else
-               append_seq_fifo_load(desc, authsize, FIFOLD_CLASS_CLASS2 |
-                                    FIFOLD_TYPE_LAST2 | FIFOLD_TYPE_ICV);
+       append_seq_out_ptr(desc, dst_dma, ivsize + req->cryptlen, out_options);
+}
+
+/*
+ * Fill in ablkcipher job descriptor
+ */
+static void init_ablkcipher_job(u32 *sh_desc, dma_addr_t ptr,
+                               struct ablkcipher_edesc *edesc,
+                               struct ablkcipher_request *req,
+                               bool iv_contig)
+{
+       struct crypto_ablkcipher *ablkcipher = crypto_ablkcipher_reqtfm(req);
+       int ivsize = crypto_ablkcipher_ivsize(ablkcipher);
+       u32 *desc = edesc->hw_desc;
+       u32 out_options = 0, in_options;
+       dma_addr_t dst_dma, src_dma;
+       int len, link_tbl_index = 0;
 
 #ifdef DEBUG
-       debug("job_desc_len %d\n", desc_len(desc));
-       print_hex_dump(KERN_ERR, "jobdesc@"xstr(__LINE__)": ",
-                      DUMP_PREFIX_ADDRESS, 16, 4, desc, desc_bytes(desc) , 1);
-       print_hex_dump(KERN_ERR, "jdlinkt@"xstr(__LINE__)": ",
-                      DUMP_PREFIX_ADDRESS, 16, 4, edesc->link_tbl,
-                       edesc->link_tbl_bytes, 1);
+       print_hex_dump(KERN_ERR, "presciv@"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, req->info,
+                      ivsize, 1);
+       print_hex_dump(KERN_ERR, "src    @"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(req->src),
+                      edesc->src_nents ? 100 : req->nbytes, 1);
 #endif
 
-       ret = caam_jr_enqueue(jrdev, desc, callback, areq);
-       if (!ret)
-               ret = -EINPROGRESS;
-       else {
-               ipsec_esp_unmap(jrdev, edesc, areq);
-               kfree(edesc);
+       len = desc_len(sh_desc);
+       init_job_desc_shared(desc, ptr, len, HDR_SHARE_DEFER | HDR_REVERSE);
+
+       if (iv_contig) {
+               src_dma = edesc->iv_dma;
+               in_options = 0;
+       } else {
+               src_dma = edesc->link_tbl_dma;
+               link_tbl_index += (iv_contig ? 0 : 1) + edesc->src_nents;
+               in_options = LDST_SGF;
        }
+       append_seq_in_ptr(desc, src_dma, req->nbytes + ivsize, in_options);
 
-       return ret;
+       if (likely(req->src == req->dst)) {
+               if (!edesc->src_nents && iv_contig) {
+                       dst_dma = sg_dma_address(req->src);
+               } else {
+                       dst_dma = edesc->link_tbl_dma +
+                               sizeof(struct link_tbl_entry);
+                       out_options = LDST_SGF;
+               }
+       } else {
+               if (!edesc->dst_nents) {
+                       dst_dma = sg_dma_address(req->dst);
+               } else {
+                       dst_dma = edesc->link_tbl_dma +
+                               link_tbl_index * sizeof(struct link_tbl_entry);
+                       out_options = LDST_SGF;
+               }
+       }
+       append_seq_out_ptr(desc, dst_dma, req->nbytes, out_options);
 }
 
 /*
  * derive number of elements in scatterlist
  */
-static int sg_count(struct scatterlist *sg_list, int nbytes, int *chained)
+static int sg_count(struct scatterlist *sg_list, int nbytes)
 {
        struct scatterlist *sg = sg_list;
        int sg_nents = 0;
 
-       *chained = 0;
        while (nbytes > 0) {
                sg_nents++;
                nbytes -= sg->length;
                if (!sg_is_last(sg) && (sg + 1)->length == 0)
-                       *chained = 1;
+                       BUG(); /* Not support chaining */
                sg = scatterwalk_sg_next(sg);
        }
 
+       if (likely(sg_nents == 1))
+               return 0;
+
        return sg_nents;
 }
 
 /*
- * allocate and map the ipsec_esp extended descriptor
+ * allocate and map the aead extended descriptor
  */
-static struct ipsec_esp_edesc *ipsec_esp_edesc_alloc(struct aead_request *areq,
-                                                    int desc_bytes)
+static struct aead_edesc *aead_edesc_alloc(struct aead_request *req,
+                                          int desc_bytes, bool *all_contig_ptr)
 {
-       struct crypto_aead *aead = crypto_aead_reqtfm(areq);
+       struct crypto_aead *aead = crypto_aead_reqtfm(req);
        struct caam_ctx *ctx = crypto_aead_ctx(aead);
        struct device *jrdev = ctx->jrdev;
-       gfp_t flags = areq->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL :
-                     GFP_ATOMIC;
-       int assoc_nents, src_nents, dst_nents = 0, chained, link_tbl_bytes;
-       struct ipsec_esp_edesc *edesc;
-
-       assoc_nents = sg_count(areq->assoc, areq->assoclen, &chained);
-       BUG_ON(chained);
-       if (likely(assoc_nents == 1))
-               assoc_nents = 0;
-
-       src_nents = sg_count(areq->src, areq->cryptlen + ctx->authsize,
-                            &chained);
-       BUG_ON(chained);
-       if (src_nents == 1)
-               src_nents = 0;
-
-       if (unlikely(areq->dst != areq->src)) {
-               dst_nents = sg_count(areq->dst, areq->cryptlen + ctx->authsize,
-                                    &chained);
-               BUG_ON(chained);
-               if (dst_nents == 1)
-                       dst_nents = 0;
+       gfp_t flags = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
+                      CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
+       int assoc_nents, src_nents, dst_nents = 0;
+       struct aead_edesc *edesc;
+       dma_addr_t iv_dma = 0;
+       int sgc;
+       bool all_contig = true;
+       int ivsize = crypto_aead_ivsize(aead);
+       int link_tbl_index, link_tbl_len = 0, link_tbl_bytes;
+
+       assoc_nents = sg_count(req->assoc, req->assoclen);
+       src_nents = sg_count(req->src, req->cryptlen);
+
+       if (unlikely(req->dst != req->src))
+               dst_nents = sg_count(req->dst, req->cryptlen);
+
+       sgc = dma_map_sg(jrdev, req->assoc, assoc_nents ? : 1,
+                        DMA_BIDIRECTIONAL);
+       if (likely(req->src == req->dst)) {
+               sgc = dma_map_sg(jrdev, req->src, src_nents ? : 1,
+                                DMA_BIDIRECTIONAL);
+       } else {
+               sgc = dma_map_sg(jrdev, req->src, src_nents ? : 1,
+                                DMA_TO_DEVICE);
+               sgc = dma_map_sg(jrdev, req->dst, dst_nents ? : 1,
+                                DMA_FROM_DEVICE);
        }
 
-       link_tbl_bytes = (assoc_nents + src_nents + dst_nents) *
-                        sizeof(struct link_tbl_entry);
-       debug("link_tbl_bytes %d\n", link_tbl_bytes);
+       /* Check if data are contiguous */
+       iv_dma = dma_map_single(jrdev, req->iv, ivsize, DMA_TO_DEVICE);
+       if (assoc_nents || sg_dma_address(req->assoc) + req->assoclen !=
+           iv_dma || src_nents || iv_dma + ivsize !=
+           sg_dma_address(req->src)) {
+               all_contig = false;
+               assoc_nents = assoc_nents ? : 1;
+               src_nents = src_nents ? : 1;
+               link_tbl_len = assoc_nents + 1 + src_nents;
+       }
+       link_tbl_len += dst_nents;
+
+       link_tbl_bytes = link_tbl_len * sizeof(struct link_tbl_entry);
 
        /* allocate space for base edesc and hw desc commands, link tables */
-       edesc = kmalloc(sizeof(struct ipsec_esp_edesc) + desc_bytes +
+       edesc = kmalloc(sizeof(struct aead_edesc) + desc_bytes +
                        link_tbl_bytes, GFP_DMA | flags);
        if (!edesc) {
                dev_err(jrdev, "could not allocate extended descriptor\n");
@@ -753,142 +1354,450 @@ static struct ipsec_esp_edesc *ipsec_esp_edesc_alloc(struct aead_request *areq,
        edesc->assoc_nents = assoc_nents;
        edesc->src_nents = src_nents;
        edesc->dst_nents = dst_nents;
-       edesc->link_tbl = (void *)edesc + sizeof(struct ipsec_esp_edesc) +
+       edesc->iv_dma = iv_dma;
+       edesc->link_tbl_bytes = link_tbl_bytes;
+       edesc->link_tbl = (void *)edesc + sizeof(struct aead_edesc) +
                          desc_bytes;
        edesc->link_tbl_dma = dma_map_single(jrdev, edesc->link_tbl,
                                             link_tbl_bytes, DMA_TO_DEVICE);
-       edesc->link_tbl_bytes = link_tbl_bytes;
+       *all_contig_ptr = all_contig;
+
+       link_tbl_index = 0;
+       if (!all_contig) {
+               sg_to_link_tbl(req->assoc,
+                              (assoc_nents ? : 1),
+                              edesc->link_tbl +
+                              link_tbl_index, 0);
+               link_tbl_index += assoc_nents ? : 1;
+               sg_to_link_tbl_one(edesc->link_tbl + link_tbl_index,
+                                  iv_dma, ivsize, 0);
+               link_tbl_index += 1;
+               sg_to_link_tbl_last(req->src,
+                                   (src_nents ? : 1),
+                                   edesc->link_tbl +
+                                   link_tbl_index, 0);
+               link_tbl_index += src_nents ? : 1;
+       }
+       if (dst_nents) {
+               sg_to_link_tbl_last(req->dst, dst_nents,
+                                   edesc->link_tbl + link_tbl_index, 0);
+       }
 
        return edesc;
 }
 
-static int aead_authenc_encrypt(struct aead_request *areq)
+static int aead_encrypt(struct aead_request *req)
 {
-       struct ipsec_esp_edesc *edesc;
-       struct crypto_aead *aead = crypto_aead_reqtfm(areq);
+       struct aead_edesc *edesc;
+       struct crypto_aead *aead = crypto_aead_reqtfm(req);
        struct caam_ctx *ctx = crypto_aead_ctx(aead);
        struct device *jrdev = ctx->jrdev;
-       int ivsize = crypto_aead_ivsize(aead);
+       bool all_contig;
        u32 *desc;
-       dma_addr_t iv_dma;
+       int ret = 0;
+
+       req->cryptlen += ctx->authsize;
 
        /* allocate extended descriptor */
-       edesc = ipsec_esp_edesc_alloc(areq, DESC_AEAD_ENCRYPT_TEXT_LEN *
-                                     CAAM_CMD_SZ);
+       edesc = aead_edesc_alloc(req, DESC_JOB_IO_LEN *
+                                CAAM_CMD_SZ, &all_contig);
        if (IS_ERR(edesc))
                return PTR_ERR(edesc);
 
-       desc = edesc->hw_desc;
-
-       /* insert shared descriptor pointer */
-       init_job_desc_shared(desc, ctx->shared_desc_phys,
-                            desc_len(ctx->sh_desc), HDR_SHARE_DEFER);
-
-       iv_dma = dma_map_single(jrdev, areq->iv, ivsize, DMA_TO_DEVICE);
-       /* check dma error */
+       /* Create and submit job descriptor */
+       init_aead_job(ctx->sh_desc_enc, ctx->sh_desc_enc_dma, edesc, req,
+                     all_contig, true);
+#ifdef DEBUG
+       print_hex_dump(KERN_ERR, "aead jobdesc@"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, edesc->hw_desc,
+                      desc_bytes(edesc->hw_desc), 1);
+#endif
 
-       append_load(desc, iv_dma, ivsize,
-                   LDST_CLASS_1_CCB | LDST_SRCDST_BYTE_CONTEXT);
+       desc = edesc->hw_desc;
+       ret = caam_jr_enqueue(jrdev, desc, aead_encrypt_done, req);
+       if (!ret) {
+               ret = -EINPROGRESS;
+       } else {
+               aead_unmap(jrdev, edesc, req);
+               kfree(edesc);
+       }
 
-       return ipsec_esp(edesc, areq, OP_ALG_ENCRYPT, ipsec_esp_encrypt_done);
+       return ret;
 }
 
-static int aead_authenc_decrypt(struct aead_request *req)
+static int aead_decrypt(struct aead_request *req)
 {
+       struct aead_edesc *edesc;
        struct crypto_aead *aead = crypto_aead_reqtfm(req);
-       int ivsize = crypto_aead_ivsize(aead);
        struct caam_ctx *ctx = crypto_aead_ctx(aead);
        struct device *jrdev = ctx->jrdev;
-       struct ipsec_esp_edesc *edesc;
+       bool all_contig;
        u32 *desc;
-       dma_addr_t iv_dma;
-
-       req->cryptlen -= ctx->authsize;
+       int ret = 0;
 
        /* allocate extended descriptor */
-       edesc = ipsec_esp_edesc_alloc(req, DESC_AEAD_DECRYPT_TEXT_LEN *
-                                     CAAM_CMD_SZ);
+       edesc = aead_edesc_alloc(req, DESC_JOB_IO_LEN *
+                                CAAM_CMD_SZ, &all_contig);
        if (IS_ERR(edesc))
                return PTR_ERR(edesc);
 
+#ifdef DEBUG
+       print_hex_dump(KERN_ERR, "dec src@"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(req->src),
+                      req->cryptlen, 1);
+#endif
+
+       /* Create and submit job descriptor*/
+       init_aead_job(ctx->sh_desc_dec,
+                     ctx->sh_desc_dec_dma, edesc, req, all_contig, false);
+#ifdef DEBUG
+       print_hex_dump(KERN_ERR, "aead jobdesc@"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, edesc->hw_desc,
+                      desc_bytes(edesc->hw_desc), 1);
+#endif
+
        desc = edesc->hw_desc;
+       ret = caam_jr_enqueue(jrdev, desc, aead_decrypt_done, req);
+       if (!ret) {
+               ret = -EINPROGRESS;
+       } else {
+               aead_unmap(jrdev, edesc, req);
+               kfree(edesc);
+       }
 
-       /* insert shared descriptor pointer */
-       init_job_desc_shared(desc, ctx->shared_desc_phys,
-                            desc_len(ctx->sh_desc), HDR_SHARE_DEFER);
+       return ret;
+}
 
-       iv_dma = dma_map_single(jrdev, req->iv, ivsize, DMA_TO_DEVICE);
-       /* check dma error */
+/*
+ * allocate and map the aead extended descriptor for aead givencrypt
+ */
+static struct aead_edesc *aead_giv_edesc_alloc(struct aead_givcrypt_request
+                                              *greq, int desc_bytes,
+                                              u32 *contig_ptr)
+{
+       struct aead_request *req = &greq->areq;
+       struct crypto_aead *aead = crypto_aead_reqtfm(req);
+       struct caam_ctx *ctx = crypto_aead_ctx(aead);
+       struct device *jrdev = ctx->jrdev;
+       gfp_t flags = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
+                      CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
+       int assoc_nents, src_nents, dst_nents = 0;
+       struct aead_edesc *edesc;
+       dma_addr_t iv_dma = 0;
+       int sgc;
+       u32 contig = GIV_SRC_CONTIG | GIV_DST_CONTIG;
+       int ivsize = crypto_aead_ivsize(aead);
+       int link_tbl_index, link_tbl_len = 0, link_tbl_bytes;
+
+       assoc_nents = sg_count(req->assoc, req->assoclen);
+       src_nents = sg_count(req->src, req->cryptlen);
+
+       if (unlikely(req->dst != req->src))
+               dst_nents = sg_count(req->dst, req->cryptlen);
+
+       sgc = dma_map_sg(jrdev, req->assoc, assoc_nents ? : 1,
+                        DMA_BIDIRECTIONAL);
+       if (likely(req->src == req->dst)) {
+               sgc = dma_map_sg(jrdev, req->src, src_nents ? : 1,
+                                DMA_BIDIRECTIONAL);
+       } else {
+               sgc = dma_map_sg(jrdev, req->src, src_nents ? : 1,
+                                DMA_TO_DEVICE);
+               sgc = dma_map_sg(jrdev, req->dst, dst_nents ? : 1,
+                                DMA_FROM_DEVICE);
+       }
+
+       /* Check if data are contiguous */
+       iv_dma = dma_map_single(jrdev, greq->giv, ivsize, DMA_TO_DEVICE);
+       if (assoc_nents || sg_dma_address(req->assoc) + req->assoclen !=
+           iv_dma || src_nents || iv_dma + ivsize != sg_dma_address(req->src))
+               contig &= ~GIV_SRC_CONTIG;
+       if (dst_nents || iv_dma + ivsize != sg_dma_address(req->dst))
+               contig &= ~GIV_DST_CONTIG;
+               if (unlikely(req->src != req->dst)) {
+                       dst_nents = dst_nents ? : 1;
+                       link_tbl_len += 1;
+               }
+       if (!(contig & GIV_SRC_CONTIG)) {
+               assoc_nents = assoc_nents ? : 1;
+               src_nents = src_nents ? : 1;
+               link_tbl_len += assoc_nents + 1 + src_nents;
+               if (likely(req->src == req->dst))
+                       contig &= ~GIV_DST_CONTIG;
+       }
+       link_tbl_len += dst_nents;
+
+       link_tbl_bytes = link_tbl_len * sizeof(struct link_tbl_entry);
+
+       /* allocate space for base edesc and hw desc commands, link tables */
+       edesc = kmalloc(sizeof(struct aead_edesc) + desc_bytes +
+                       link_tbl_bytes, GFP_DMA | flags);
+       if (!edesc) {
+               dev_err(jrdev, "could not allocate extended descriptor\n");
+               return ERR_PTR(-ENOMEM);
+       }
 
-       append_load(desc, iv_dma, ivsize,
-                   LDST_CLASS_1_CCB | LDST_SRCDST_BYTE_CONTEXT);
+       edesc->assoc_nents = assoc_nents;
+       edesc->src_nents = src_nents;
+       edesc->dst_nents = dst_nents;
+       edesc->iv_dma = iv_dma;
+       edesc->link_tbl_bytes = link_tbl_bytes;
+       edesc->link_tbl = (void *)edesc + sizeof(struct aead_edesc) +
+                         desc_bytes;
+       edesc->link_tbl_dma = dma_map_single(jrdev, edesc->link_tbl,
+                                            link_tbl_bytes, DMA_TO_DEVICE);
+       *contig_ptr = contig;
+
+       link_tbl_index = 0;
+       if (!(contig & GIV_SRC_CONTIG)) {
+               sg_to_link_tbl(req->assoc, assoc_nents,
+                              edesc->link_tbl +
+                              link_tbl_index, 0);
+               link_tbl_index += assoc_nents;
+               sg_to_link_tbl_one(edesc->link_tbl + link_tbl_index,
+                                  iv_dma, ivsize, 0);
+               link_tbl_index += 1;
+               sg_to_link_tbl_last(req->src, src_nents,
+                                   edesc->link_tbl +
+                                   link_tbl_index, 0);
+               link_tbl_index += src_nents;
+       }
+       if (unlikely(req->src != req->dst && !(contig & GIV_DST_CONTIG))) {
+               sg_to_link_tbl_one(edesc->link_tbl + link_tbl_index,
+                                  iv_dma, ivsize, 0);
+               link_tbl_index += 1;
+               sg_to_link_tbl_last(req->dst, dst_nents,
+                                   edesc->link_tbl + link_tbl_index, 0);
+       }
 
-       return ipsec_esp(edesc, req, !OP_ALG_ENCRYPT, ipsec_esp_decrypt_done);
+       return edesc;
 }
 
-static int aead_authenc_givencrypt(struct aead_givcrypt_request *req)
+static int aead_givencrypt(struct aead_givcrypt_request *areq)
 {
-       struct aead_request *areq = &req->areq;
-       struct ipsec_esp_edesc *edesc;
-       struct crypto_aead *aead = crypto_aead_reqtfm(areq);
+       struct aead_request *req = &areq->areq;
+       struct aead_edesc *edesc;
+       struct crypto_aead *aead = crypto_aead_reqtfm(req);
        struct caam_ctx *ctx = crypto_aead_ctx(aead);
        struct device *jrdev = ctx->jrdev;
-       int ivsize = crypto_aead_ivsize(aead);
-       dma_addr_t iv_dma;
+       u32 contig;
        u32 *desc;
+       int ret = 0;
 
-       iv_dma = dma_map_single(jrdev, req->giv, ivsize, DMA_FROM_DEVICE);
-
-       debug("%s: giv %p\n", __func__, req->giv);
+       req->cryptlen += ctx->authsize;
 
        /* allocate extended descriptor */
-       edesc = ipsec_esp_edesc_alloc(areq, DESC_AEAD_GIVENCRYPT_TEXT_LEN *
-                                     CAAM_CMD_SZ);
+       edesc = aead_giv_edesc_alloc(areq, DESC_JOB_IO_LEN *
+                                    CAAM_CMD_SZ, &contig);
+
        if (IS_ERR(edesc))
                return PTR_ERR(edesc);
 
+#ifdef DEBUG
+       print_hex_dump(KERN_ERR, "giv src@"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(req->src),
+                      req->cryptlen, 1);
+#endif
+
+       /* Create and submit job descriptor*/
+       init_aead_giv_job(ctx->sh_desc_givenc,
+                         ctx->sh_desc_givenc_dma, edesc, req, contig);
+#ifdef DEBUG
+       print_hex_dump(KERN_ERR, "aead jobdesc@"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, edesc->hw_desc,
+                      desc_bytes(edesc->hw_desc), 1);
+#endif
+
        desc = edesc->hw_desc;
+       ret = caam_jr_enqueue(jrdev, desc, aead_encrypt_done, req);
+       if (!ret) {
+               ret = -EINPROGRESS;
+       } else {
+               aead_unmap(jrdev, edesc, req);
+               kfree(edesc);
+       }
 
-       /* insert shared descriptor pointer */
-       init_job_desc_shared(desc, ctx->shared_desc_phys,
-                            desc_len(ctx->sh_desc), HDR_SHARE_DEFER);
+       return ret;
+}
 
-       /*
-        * LOAD IMM Info FIFO
-        * to DECO, Last, Padding, Random, Message, 16 bytes
-        */
-       append_load_imm_u32(desc, NFIFOENTRY_DEST_DECO | NFIFOENTRY_LC1 |
-                           NFIFOENTRY_STYPE_PAD | NFIFOENTRY_DTYPE_MSG |
-                           NFIFOENTRY_PTYPE_RND | ivsize,
-                           LDST_SRCDST_WORD_INFO_FIFO);
+/*
+ * allocate and map the ablkcipher extended descriptor for ablkcipher
+ */
+static struct ablkcipher_edesc *ablkcipher_edesc_alloc(struct ablkcipher_request
+                                                      *req, int desc_bytes,
+                                                      bool *iv_contig_out)
+{
+       struct crypto_ablkcipher *ablkcipher = crypto_ablkcipher_reqtfm(req);
+       struct caam_ctx *ctx = crypto_ablkcipher_ctx(ablkcipher);
+       struct device *jrdev = ctx->jrdev;
+       gfp_t flags = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
+                                         CRYPTO_TFM_REQ_MAY_SLEEP)) ?
+                      GFP_KERNEL : GFP_ATOMIC;
+       int src_nents, dst_nents = 0, link_tbl_bytes;
+       struct ablkcipher_edesc *edesc;
+       dma_addr_t iv_dma = 0;
+       bool iv_contig = false;
+       int sgc;
+       int ivsize = crypto_ablkcipher_ivsize(ablkcipher);
+       int link_tbl_index;
+
+       src_nents = sg_count(req->src, req->nbytes);
+
+       if (unlikely(req->dst != req->src))
+               dst_nents = sg_count(req->dst, req->nbytes);
+
+       if (likely(req->src == req->dst)) {
+               sgc = dma_map_sg(jrdev, req->src, src_nents ? : 1,
+                                DMA_BIDIRECTIONAL);
+       } else {
+               sgc = dma_map_sg(jrdev, req->src, src_nents ? : 1,
+                                DMA_TO_DEVICE);
+               sgc = dma_map_sg(jrdev, req->dst, dst_nents ? : 1,
+                                DMA_FROM_DEVICE);
+       }
 
        /*
-        * disable info fifo entries since the above serves as the entry
-        * this way, the MOVE command won't generate an entry.
-        * Note that this isn't required in more recent versions of
-        * SEC as a MOVE that doesn't do info FIFO entries is available.
+        * Check if iv can be contiguous with source and destination.
+        * If so, include it. If not, create scatterlist.
         */
-       append_cmd(desc, CMD_LOAD | DISABLE_AUTO_INFO_FIFO);
+       iv_dma = dma_map_single(jrdev, req->info, ivsize, DMA_TO_DEVICE);
+       if (!src_nents && iv_dma + ivsize == sg_dma_address(req->src))
+               iv_contig = true;
+       else
+               src_nents = src_nents ? : 1;
+       link_tbl_bytes = ((iv_contig ? 0 : 1) + src_nents + dst_nents) *
+                        sizeof(struct link_tbl_entry);
+
+       /* allocate space for base edesc and hw desc commands, link tables */
+       edesc = kmalloc(sizeof(struct ablkcipher_edesc) + desc_bytes +
+                       link_tbl_bytes, GFP_DMA | flags);
+       if (!edesc) {
+               dev_err(jrdev, "could not allocate extended descriptor\n");
+               return ERR_PTR(-ENOMEM);
+       }
 
-       /* MOVE DECO Alignment -> C1 Context 16 bytes */
-       append_move(desc, MOVE_SRC_INFIFO | MOVE_DEST_CLASS1CTX | ivsize);
+       edesc->src_nents = src_nents;
+       edesc->dst_nents = dst_nents;
+       edesc->link_tbl_bytes = link_tbl_bytes;
+       edesc->link_tbl = (void *)edesc + sizeof(struct ablkcipher_edesc) +
+                         desc_bytes;
 
-       /* re-enable info fifo entries */
-       append_cmd(desc, CMD_LOAD | ENABLE_AUTO_INFO_FIFO);
+       link_tbl_index = 0;
+       if (!iv_contig) {
+               sg_to_link_tbl_one(edesc->link_tbl, iv_dma, ivsize, 0);
+               sg_to_link_tbl_last(req->src, src_nents,
+                                   edesc->link_tbl + 1, 0);
+               link_tbl_index += 1 + src_nents;
+       }
+
+       if (unlikely(dst_nents)) {
+               sg_to_link_tbl_last(req->dst, dst_nents,
+                       edesc->link_tbl + link_tbl_index, 0);
+       }
+
+       edesc->link_tbl_dma = dma_map_single(jrdev, edesc->link_tbl,
+                                            link_tbl_bytes, DMA_TO_DEVICE);
+       edesc->iv_dma = iv_dma;
+
+#ifdef DEBUG
+       print_hex_dump(KERN_ERR, "ablkcipher link_tbl@"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, edesc->link_tbl,
+                      link_tbl_bytes, 1);
+#endif
+
+       *iv_contig_out = iv_contig;
+       return edesc;
+}
+
+static int ablkcipher_encrypt(struct ablkcipher_request *req)
+{
+       struct ablkcipher_edesc *edesc;
+       struct crypto_ablkcipher *ablkcipher = crypto_ablkcipher_reqtfm(req);
+       struct caam_ctx *ctx = crypto_ablkcipher_ctx(ablkcipher);
+       struct device *jrdev = ctx->jrdev;
+       bool iv_contig;
+       u32 *desc;
+       int ret = 0;
+
+       /* allocate extended descriptor */
+       edesc = ablkcipher_edesc_alloc(req, DESC_JOB_IO_LEN *
+                                      CAAM_CMD_SZ, &iv_contig);
+       if (IS_ERR(edesc))
+               return PTR_ERR(edesc);
+
+       /* Create and submit job descriptor*/
+       init_ablkcipher_job(ctx->sh_desc_enc,
+               ctx->sh_desc_enc_dma, edesc, req, iv_contig);
+#ifdef DEBUG
+       print_hex_dump(KERN_ERR, "ablkcipher jobdesc@"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, edesc->hw_desc,
+                      desc_bytes(edesc->hw_desc), 1);
+#endif
+       desc = edesc->hw_desc;
+       ret = caam_jr_enqueue(jrdev, desc, ablkcipher_encrypt_done, req);
+
+       if (!ret) {
+               ret = -EINPROGRESS;
+       } else {
+               ablkcipher_unmap(jrdev, edesc, req);
+               kfree(edesc);
+       }
+
+       return ret;
+}
+
+static int ablkcipher_decrypt(struct ablkcipher_request *req)
+{
+       struct ablkcipher_edesc *edesc;
+       struct crypto_ablkcipher *ablkcipher = crypto_ablkcipher_reqtfm(req);
+       struct caam_ctx *ctx = crypto_ablkcipher_ctx(ablkcipher);
+       struct device *jrdev = ctx->jrdev;
+       bool iv_contig;
+       u32 *desc;
+       int ret = 0;
+
+       /* allocate extended descriptor */
+       edesc = ablkcipher_edesc_alloc(req, DESC_JOB_IO_LEN *
+                                      CAAM_CMD_SZ, &iv_contig);
+       if (IS_ERR(edesc))
+               return PTR_ERR(edesc);
 
-       /* MOVE C1 Context -> OFIFO 16 bytes */
-       append_move(desc, MOVE_SRC_CLASS1CTX | MOVE_DEST_OUTFIFO | ivsize);
+       /* Create and submit job descriptor*/
+       init_ablkcipher_job(ctx->sh_desc_dec,
+               ctx->sh_desc_dec_dma, edesc, req, iv_contig);
+       desc = edesc->hw_desc;
+#ifdef DEBUG
+       print_hex_dump(KERN_ERR, "ablkcipher jobdesc@"xstr(__LINE__)": ",
+                      DUMP_PREFIX_ADDRESS, 16, 4, edesc->hw_desc,
+                      desc_bytes(edesc->hw_desc), 1);
+#endif
 
-       append_fifo_store(desc, iv_dma, ivsize, FIFOST_TYPE_MESSAGE_DATA);
+       ret = caam_jr_enqueue(jrdev, desc, ablkcipher_decrypt_done, req);
+       if (!ret) {
+               ret = -EINPROGRESS;
+       } else {
+               ablkcipher_unmap(jrdev, edesc, req);
+               kfree(edesc);
+       }
 
-       return ipsec_esp(edesc, areq, OP_ALG_ENCRYPT, ipsec_esp_encrypt_done);
+       return ret;
 }
 
+#define template_aead          template_u.aead
+#define template_ablkcipher    template_u.ablkcipher
 struct caam_alg_template {
        char name[CRYPTO_MAX_ALG_NAME];
        char driver_name[CRYPTO_MAX_ALG_NAME];
        unsigned int blocksize;
-       struct aead_alg aead;
+       u32 type;
+       union {
+               struct ablkcipher_alg ablkcipher;
+               struct aead_alg aead;
+               struct blkcipher_alg blkcipher;
+               struct cipher_alg cipher;
+               struct compress_alg compress;
+               struct rng_alg rng;
+       } template_u;
        u32 class1_alg_type;
        u32 class2_alg_type;
        u32 alg_op;
@@ -900,12 +1809,13 @@ static struct caam_alg_template driver_algs[] = {
                .name = "authenc(hmac(sha1),cbc(aes))",
                .driver_name = "authenc-hmac-sha1-cbc-aes-caam",
                .blocksize = AES_BLOCK_SIZE,
-               .aead = {
-                       .setkey = aead_authenc_setkey,
-                       .setauthsize = aead_authenc_setauthsize,
-                       .encrypt = aead_authenc_encrypt,
-                       .decrypt = aead_authenc_decrypt,
-                       .givencrypt = aead_authenc_givencrypt,
+               .type = CRYPTO_ALG_TYPE_AEAD,
+               .template_aead = {
+                       .setkey = aead_setkey,
+                       .setauthsize = aead_setauthsize,
+                       .encrypt = aead_encrypt,
+                       .decrypt = aead_decrypt,
+                       .givencrypt = aead_givencrypt,
                        .geniv = "<built-in>",
                        .ivsize = AES_BLOCK_SIZE,
                        .maxauthsize = SHA1_DIGEST_SIZE,
@@ -918,12 +1828,13 @@ static struct caam_alg_template driver_algs[] = {
                .name = "authenc(hmac(sha256),cbc(aes))",
                .driver_name = "authenc-hmac-sha256-cbc-aes-caam",
                .blocksize = AES_BLOCK_SIZE,
-               .aead = {
-                       .setkey = aead_authenc_setkey,
-                       .setauthsize = aead_authenc_setauthsize,
-                       .encrypt = aead_authenc_encrypt,
-                       .decrypt = aead_authenc_decrypt,
-                       .givencrypt = aead_authenc_givencrypt,
+               .type = CRYPTO_ALG_TYPE_AEAD,
+               .template_aead = {
+                       .setkey = aead_setkey,
+                       .setauthsize = aead_setauthsize,
+                       .encrypt = aead_encrypt,
+                       .decrypt = aead_decrypt,
+                       .givencrypt = aead_givencrypt,
                        .geniv = "<built-in>",
                        .ivsize = AES_BLOCK_SIZE,
                        .maxauthsize = SHA256_DIGEST_SIZE,
@@ -937,12 +1848,13 @@ static struct caam_alg_template driver_algs[] = {
                .name = "authenc(hmac(sha512),cbc(aes))",
                .driver_name = "authenc-hmac-sha512-cbc-aes-caam",
                .blocksize = AES_BLOCK_SIZE,
-               .aead = {
-                       .setkey = aead_authenc_setkey,
-                       .setauthsize = aead_authenc_setauthsize,
-                       .encrypt = aead_authenc_encrypt,
-                       .decrypt = aead_authenc_decrypt,
-                       .givencrypt = aead_authenc_givencrypt,
+               .type = CRYPTO_ALG_TYPE_AEAD,
+               .template_aead = {
+                       .setkey = aead_setkey,
+                       .setauthsize = aead_setauthsize,
+                       .encrypt = aead_encrypt,
+                       .decrypt = aead_decrypt,
+                       .givencrypt = aead_givencrypt,
                        .geniv = "<built-in>",
                        .ivsize = AES_BLOCK_SIZE,
                        .maxauthsize = SHA512_DIGEST_SIZE,
@@ -956,12 +1868,13 @@ static struct caam_alg_template driver_algs[] = {
                .name = "authenc(hmac(sha1),cbc(des3_ede))",
                .driver_name = "authenc-hmac-sha1-cbc-des3_ede-caam",
                .blocksize = DES3_EDE_BLOCK_SIZE,
-               .aead = {
-                       .setkey = aead_authenc_setkey,
-                       .setauthsize = aead_authenc_setauthsize,
-                       .encrypt = aead_authenc_encrypt,
-                       .decrypt = aead_authenc_decrypt,
-                       .givencrypt = aead_authenc_givencrypt,
+               .type = CRYPTO_ALG_TYPE_AEAD,
+               .template_aead = {
+                       .setkey = aead_setkey,
+                       .setauthsize = aead_setauthsize,
+                       .encrypt = aead_encrypt,
+                       .decrypt = aead_decrypt,
+                       .givencrypt = aead_givencrypt,
                        .geniv = "<built-in>",
                        .ivsize = DES3_EDE_BLOCK_SIZE,
                        .maxauthsize = SHA1_DIGEST_SIZE,
@@ -974,12 +1887,13 @@ static struct caam_alg_template driver_algs[] = {
                .name = "authenc(hmac(sha256),cbc(des3_ede))",
                .driver_name = "authenc-hmac-sha256-cbc-des3_ede-caam",
                .blocksize = DES3_EDE_BLOCK_SIZE,
-               .aead = {
-                       .setkey = aead_authenc_setkey,
-                       .setauthsize = aead_authenc_setauthsize,
-                       .encrypt = aead_authenc_encrypt,
-                       .decrypt = aead_authenc_decrypt,
-                       .givencrypt = aead_authenc_givencrypt,
+               .type = CRYPTO_ALG_TYPE_AEAD,
+               .template_aead = {
+                       .setkey = aead_setkey,
+                       .setauthsize = aead_setauthsize,
+                       .encrypt = aead_encrypt,
+                       .decrypt = aead_decrypt,
+                       .givencrypt = aead_givencrypt,
                        .geniv = "<built-in>",
                        .ivsize = DES3_EDE_BLOCK_SIZE,
                        .maxauthsize = SHA256_DIGEST_SIZE,
@@ -993,12 +1907,13 @@ static struct caam_alg_template driver_algs[] = {
                .name = "authenc(hmac(sha512),cbc(des3_ede))",
                .driver_name = "authenc-hmac-sha512-cbc-des3_ede-caam",
                .blocksize = DES3_EDE_BLOCK_SIZE,
-               .aead = {
-                       .setkey = aead_authenc_setkey,
-                       .setauthsize = aead_authenc_setauthsize,
-                       .encrypt = aead_authenc_encrypt,
-                       .decrypt = aead_authenc_decrypt,
-                       .givencrypt = aead_authenc_givencrypt,
+               .type = CRYPTO_ALG_TYPE_AEAD,
+               .template_aead = {
+                       .setkey = aead_setkey,
+                       .setauthsize = aead_setauthsize,
+                       .encrypt = aead_encrypt,
+                       .decrypt = aead_decrypt,
+                       .givencrypt = aead_givencrypt,
                        .geniv = "<built-in>",
                        .ivsize = DES3_EDE_BLOCK_SIZE,
                        .maxauthsize = SHA512_DIGEST_SIZE,
@@ -1012,12 +1927,13 @@ static struct caam_alg_template driver_algs[] = {
                .name = "authenc(hmac(sha1),cbc(des))",
                .driver_name = "authenc-hmac-sha1-cbc-des-caam",
                .blocksize = DES_BLOCK_SIZE,
-               .aead = {
-                       .setkey = aead_authenc_setkey,
-                       .setauthsize = aead_authenc_setauthsize,
-                       .encrypt = aead_authenc_encrypt,
-                       .decrypt = aead_authenc_decrypt,
-                       .givencrypt = aead_authenc_givencrypt,
+               .type = CRYPTO_ALG_TYPE_AEAD,
+               .template_aead = {
+                       .setkey = aead_setkey,
+                       .setauthsize = aead_setauthsize,
+                       .encrypt = aead_encrypt,
+                       .decrypt = aead_decrypt,
+                       .givencrypt = aead_givencrypt,
                        .geniv = "<built-in>",
                        .ivsize = DES_BLOCK_SIZE,
                        .maxauthsize = SHA1_DIGEST_SIZE,
@@ -1030,12 +1946,13 @@ static struct caam_alg_template driver_algs[] = {
                .name = "authenc(hmac(sha256),cbc(des))",
                .driver_name = "authenc-hmac-sha256-cbc-des-caam",
                .blocksize = DES_BLOCK_SIZE,
-               .aead = {
-                       .setkey = aead_authenc_setkey,
-                       .setauthsize = aead_authenc_setauthsize,
-                       .encrypt = aead_authenc_encrypt,
-                       .decrypt = aead_authenc_decrypt,
-                       .givencrypt = aead_authenc_givencrypt,
+               .type = CRYPTO_ALG_TYPE_AEAD,
+               .template_aead = {
+                       .setkey = aead_setkey,
+                       .setauthsize = aead_setauthsize,
+                       .encrypt = aead_encrypt,
+                       .decrypt = aead_decrypt,
+                       .givencrypt = aead_givencrypt,
                        .geniv = "<built-in>",
                        .ivsize = DES_BLOCK_SIZE,
                        .maxauthsize = SHA256_DIGEST_SIZE,
@@ -1049,12 +1966,13 @@ static struct caam_alg_template driver_algs[] = {
                .name = "authenc(hmac(sha512),cbc(des))",
                .driver_name = "authenc-hmac-sha512-cbc-des-caam",
                .blocksize = DES_BLOCK_SIZE,
-               .aead = {
-                       .setkey = aead_authenc_setkey,
-                       .setauthsize = aead_authenc_setauthsize,
-                       .encrypt = aead_authenc_encrypt,
-                       .decrypt = aead_authenc_decrypt,
-                       .givencrypt = aead_authenc_givencrypt,
+               .type = CRYPTO_ALG_TYPE_AEAD,
+               .template_aead = {
+                       .setkey = aead_setkey,
+                       .setauthsize = aead_setauthsize,
+                       .encrypt = aead_encrypt,
+                       .decrypt = aead_decrypt,
+                       .givencrypt = aead_givencrypt,
                        .geniv = "<built-in>",
                        .ivsize = DES_BLOCK_SIZE,
                        .maxauthsize = SHA512_DIGEST_SIZE,
@@ -1064,6 +1982,55 @@ static struct caam_alg_template driver_algs[] = {
                                   OP_ALG_AAI_HMAC_PRECOMP,
                .alg_op = OP_ALG_ALGSEL_SHA512 | OP_ALG_AAI_HMAC,
        },
+       /* ablkcipher algorithm templates */
+       {
+               .name = "cbc(aes)",
+               .driver_name = "cbc-aes-caam",
+               .blocksize = AES_BLOCK_SIZE,
+               .type = CRYPTO_ALG_TYPE_ABLKCIPHER,
+               .template_ablkcipher = {
+                       .setkey = ablkcipher_setkey,
+                       .encrypt = ablkcipher_encrypt,
+                       .decrypt = ablkcipher_decrypt,
+                       .geniv = "eseqiv",
+                       .min_keysize = AES_MIN_KEY_SIZE,
+                       .max_keysize = AES_MAX_KEY_SIZE,
+                       .ivsize = AES_BLOCK_SIZE,
+                       },
+               .class1_alg_type = OP_ALG_ALGSEL_AES | OP_ALG_AAI_CBC,
+       },
+       {
+               .name = "cbc(des3_ede)",
+               .driver_name = "cbc-3des-caam",
+               .blocksize = DES3_EDE_BLOCK_SIZE,
+               .type = CRYPTO_ALG_TYPE_ABLKCIPHER,
+               .template_ablkcipher = {
+                       .setkey = ablkcipher_setkey,
+                       .encrypt = ablkcipher_encrypt,
+                       .decrypt = ablkcipher_decrypt,
+                       .geniv = "eseqiv",
+                       .min_keysize = DES3_EDE_KEY_SIZE,
+                       .max_keysize = DES3_EDE_KEY_SIZE,
+                       .ivsize = DES3_EDE_BLOCK_SIZE,
+                       },
+               .class1_alg_type = OP_ALG_ALGSEL_3DES | OP_ALG_AAI_CBC,
+       },
+       {
+               .name = "cbc(des)",
+               .driver_name = "cbc-des-caam",
+               .blocksize = DES_BLOCK_SIZE,
+               .type = CRYPTO_ALG_TYPE_ABLKCIPHER,
+               .template_ablkcipher = {
+                       .setkey = ablkcipher_setkey,
+                       .encrypt = ablkcipher_encrypt,
+                       .decrypt = ablkcipher_decrypt,
+                       .geniv = "eseqiv",
+                       .min_keysize = DES_KEY_SIZE,
+                       .max_keysize = DES_KEY_SIZE,
+                       .ivsize = DES_BLOCK_SIZE,
+                       },
+               .class1_alg_type = OP_ALG_ALGSEL_DES | OP_ALG_AAI_CBC,
+       }
 };
 
 struct caam_crypto_alg {
@@ -1102,16 +2069,19 @@ static void caam_cra_exit(struct crypto_tfm *tfm)
 {
        struct caam_ctx *ctx = crypto_tfm_ctx(tfm);
 
-       if (!dma_mapping_error(ctx->jrdev, ctx->shared_desc_phys))
-               dma_unmap_single(ctx->jrdev, ctx->shared_desc_phys,
-                                desc_bytes(ctx->sh_desc), DMA_TO_DEVICE);
-       kfree(ctx->sh_desc);
-
-       if (!dma_mapping_error(ctx->jrdev, ctx->key_phys))
-               dma_unmap_single(ctx->jrdev, ctx->key_phys,
-                                ctx->split_key_pad_len + ctx->enckeylen,
+       if (ctx->sh_desc_enc_dma &&
+           !dma_mapping_error(ctx->jrdev, ctx->sh_desc_enc_dma))
+               dma_unmap_single(ctx->jrdev, ctx->sh_desc_enc_dma,
+                                desc_bytes(ctx->sh_desc_enc), DMA_TO_DEVICE);
+       if (ctx->sh_desc_dec_dma &&
+           !dma_mapping_error(ctx->jrdev, ctx->sh_desc_dec_dma))
+               dma_unmap_single(ctx->jrdev, ctx->sh_desc_dec_dma,
+                                desc_bytes(ctx->sh_desc_dec), DMA_TO_DEVICE);
+       if (ctx->sh_desc_givenc_dma &&
+           !dma_mapping_error(ctx->jrdev, ctx->sh_desc_givenc_dma))
+               dma_unmap_single(ctx->jrdev, ctx->sh_desc_givenc_dma,
+                                desc_bytes(ctx->sh_desc_givenc),
                                 DMA_TO_DEVICE);
-       kfree(ctx->key);
 }
 
 static void __exit caam_algapi_exit(void)
@@ -1175,12 +2145,20 @@ static struct caam_crypto_alg *caam_alg_alloc(struct device *ctrldev,
        alg->cra_init = caam_cra_init;
        alg->cra_exit = caam_cra_exit;
        alg->cra_priority = CAAM_CRA_PRIORITY;
-       alg->cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC;
        alg->cra_blocksize = template->blocksize;
        alg->cra_alignmask = 0;
-       alg->cra_type = &crypto_aead_type;
        alg->cra_ctxsize = sizeof(struct caam_ctx);
-       alg->cra_u.aead = template->aead;
+       alg->cra_flags = CRYPTO_ALG_ASYNC | template->type;
+       switch (template->type) {
+       case CRYPTO_ALG_TYPE_ABLKCIPHER:
+               alg->cra_type = &crypto_ablkcipher_type;
+               alg->cra_ablkcipher = template->template_ablkcipher;
+               break;
+       case CRYPTO_ALG_TYPE_AEAD:
+               alg->cra_type = &crypto_aead_type;
+               alg->cra_aead = template->template_aead;
+               break;
+       }
 
        t_alg->class1_alg_type = template->class1_alg_type;
        t_alg->class2_alg_type = template->class2_alg_type;
index 9504503..d38f2af 100644 (file)
@@ -31,5 +31,6 @@
 #include <crypto/aead.h>
 #include <crypto/authenc.h>
 #include <crypto/scatterwalk.h>
+#include <crypto/internal/skcipher.h>
 
 #endif /* !defined(CAAM_COMPAT_H) */
index 9009713..fc2d9ed 100644 (file)
@@ -52,9 +52,11 @@ static int caam_probe(struct platform_device *pdev)
        struct caam_ctrl __iomem *ctrl;
        struct caam_full __iomem *topregs;
        struct caam_drv_private *ctrlpriv;
-       struct caam_perfmon *perfmon;
        struct caam_deco **deco;
        u32 deconum;
+#ifdef CONFIG_DEBUG_FS
+       struct caam_perfmon *perfmon;
+#endif
 
        ctrlpriv = kzalloc(sizeof(struct caam_drv_private), GFP_KERNEL);
        if (!ctrlpriv)
index 4691580..0991323 100644 (file)
@@ -9,7 +9,7 @@
 #define IMMEDIATE (1 << 23)
 #define CAAM_CMD_SZ sizeof(u32)
 #define CAAM_PTR_SZ sizeof(dma_addr_t)
-#define CAAM_DESC_BYTES_MAX (CAAM_CMD_SZ * 64)
+#define CAAM_DESC_BYTES_MAX (CAAM_CMD_SZ * MAX_CAAM_DESCSIZE)
 
 #ifdef DEBUG
 #define PRINT_POS do { printk(KERN_DEBUG "%02d: %s\n", desc_len(desc),\
@@ -18,6 +18,9 @@
 #define PRINT_POS
 #endif
 
+#define SET_OK_PROP_ERRORS (IMMEDIATE | LDST_CLASS_DECO | \
+                           LDST_SRCDST_WORD_DECOCTRL | \
+                           (LDOFF_CHG_SHARE_OK_PROP << LDST_OFFSET_SHIFT))
 #define DISABLE_AUTO_INFO_FIFO (IMMEDIATE | LDST_CLASS_DECO | \
                                LDST_SRCDST_WORD_DECOCTRL | \
                                (LDOFF_DISABLE_AUTO_NFIFO << LDST_OFFSET_SHIFT))
@@ -203,3 +206,56 @@ static inline void append_##cmd##_imm_##type(u32 *desc, type immediate, \
        append_cmd(desc, immediate); \
 }
 APPEND_CMD_RAW_IMM(load, LOAD, u32);
+
+/*
+ * Append a MATH command. dest/src_0/src_1 are token-pasted onto MATH_DEST_,
+ * MATH_SRC0_ and MATH_SRC1_, so pass only the suffix. Caller supplies the ';'.
+ */
+#define APPEND_MATH(op, desc, dest, src_0, src_1, len) \
+append_cmd(desc, CMD_MATH | MATH_FUN_##op | MATH_DEST_##dest | \
+          MATH_SRC0_##src_0 | MATH_SRC1_##src_1 | (u32)((len) & MATH_LEN_MASK))
+
+#define append_math_add(desc, dest, src0, src1, len) \
+       APPEND_MATH(ADD, desc, dest, src0, src1, len)
+#define append_math_sub(desc, dest, src0, src1, len) \
+       APPEND_MATH(SUB, desc, dest, src0, src1, len)
+#define append_math_add_c(desc, dest, src0, src1, len) \
+       APPEND_MATH(ADDC, desc, dest, src0, src1, len)
+#define append_math_sub_b(desc, dest, src0, src1, len) \
+       APPEND_MATH(SUBB, desc, dest, src0, src1, len)
+#define append_math_and(desc, dest, src0, src1, len) \
+       APPEND_MATH(AND, desc, dest, src0, src1, len)
+#define append_math_or(desc, dest, src0, src1, len) \
+       APPEND_MATH(OR, desc, dest, src0, src1, len)
+#define append_math_xor(desc, dest, src0, src1, len) \
+       APPEND_MATH(XOR, desc, dest, src0, src1, len)
+#define append_math_lshift(desc, dest, src0, src1, len) \
+       APPEND_MATH(LSHIFT, desc, dest, src0, src1, len)
+#define append_math_rshift(desc, dest, src0, src1, len) \
+       APPEND_MATH(RSHIFT, desc, dest, src0, src1, len)
+
+/* Exactly one source is IMM; data is appended as a u32 word after the command */
+#define APPEND_MATH_IMM_u32(op, desc, dest, src_0, src_1, data) \
+do { \
+       APPEND_MATH(op, desc, dest, src_0, src_1, CAAM_CMD_SZ); \
+       append_cmd(desc, data); \
+} while (0)
+
+#define append_math_add_imm_u32(desc, dest, src0, src1, data) \
+       APPEND_MATH_IMM_u32(ADD, desc, dest, src0, src1, data)
+#define append_math_sub_imm_u32(desc, dest, src0, src1, data) \
+       APPEND_MATH_IMM_u32(SUB, desc, dest, src0, src1, data)
+#define append_math_add_c_imm_u32(desc, dest, src0, src1, data) \
+       APPEND_MATH_IMM_u32(ADDC, desc, dest, src0, src1, data)
+#define append_math_sub_b_imm_u32(desc, dest, src0, src1, data) \
+       APPEND_MATH_IMM_u32(SUBB, desc, dest, src0, src1, data)
+#define append_math_and_imm_u32(desc, dest, src0, src1, data) \
+       APPEND_MATH_IMM_u32(AND, desc, dest, src0, src1, data)
+#define append_math_or_imm_u32(desc, dest, src0, src1, data) \
+       APPEND_MATH_IMM_u32(OR, desc, dest, src0, src1, data)
+#define append_math_xor_imm_u32(desc, dest, src0, src1, data) \
+       APPEND_MATH_IMM_u32(XOR, desc, dest, src0, src1, data)
+#define append_math_lshift_imm_u32(desc, dest, src0, src1, data) \
+       APPEND_MATH_IMM_u32(LSHIFT, desc, dest, src0, src1, data)
+#define append_math_rshift_imm_u32(desc, dest, src0, src1, data) \
+       APPEND_MATH_IMM_u32(RSHIFT, desc, dest, src0, src1, data)
index ba8f1ea..6399a8f 100644 (file)
 
 #define DEFAULT_TIMEOUT_INTERVAL       HZ
 
-#define FLAGS_FINUP            0x0002
-#define FLAGS_FINAL            0x0004
-#define FLAGS_SG               0x0008
-#define FLAGS_SHA1             0x0010
-#define FLAGS_DMA_ACTIVE       0x0020
-#define FLAGS_OUTPUT_READY     0x0040
-#define FLAGS_INIT             0x0100
-#define FLAGS_CPU              0x0200
-#define FLAGS_HMAC             0x0400
-#define FLAGS_ERROR            0x0800
-#define FLAGS_BUSY             0x1000
+/* dd->flags bit numbers (mostly device state); FLAGS_CPU also in ctx->flags */
+#define FLAGS_BUSY             0
+#define FLAGS_FINAL            1
+#define FLAGS_DMA_ACTIVE       2
+#define FLAGS_OUTPUT_READY     3
+#define FLAGS_INIT             4
+#define FLAGS_CPU              5
+#define FLAGS_DMA_READY                6
+/* context flags: bit numbers for ctx->flags/tctx->flags, tested via BIT() */
+#define FLAGS_FINUP            16
+#define FLAGS_SG               17
+#define FLAGS_SHA1             18
+#define FLAGS_HMAC             19
+#define FLAGS_ERROR            20
 
 #define OP_UPDATE      1
 #define OP_FINAL       2
@@ -144,7 +147,6 @@ struct omap_sham_dev {
        int                     dma;
        int                     dma_lch;
        struct tasklet_struct   done_task;
-       struct tasklet_struct   queue_task;
 
        unsigned long           flags;
        struct crypto_queue     queue;
@@ -223,7 +225,7 @@ static void omap_sham_copy_ready_hash(struct ahash_request *req)
        if (!hash)
                return;
 
-       if (likely(ctx->flags & FLAGS_SHA1)) {
+       if (likely(ctx->flags & BIT(FLAGS_SHA1))) {
                /* SHA1 results are in big endian */
                for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(u32); i++)
                        hash[i] = be32_to_cpu(in[i]);
@@ -238,7 +240,7 @@ static int omap_sham_hw_init(struct omap_sham_dev *dd)
 {
        clk_enable(dd->iclk);
 
-       if (!(dd->flags & FLAGS_INIT)) {
+       if (!test_bit(FLAGS_INIT, &dd->flags)) {
                omap_sham_write_mask(dd, SHA_REG_MASK,
                        SHA_REG_MASK_SOFTRESET, SHA_REG_MASK_SOFTRESET);
 
@@ -246,7 +248,7 @@ static int omap_sham_hw_init(struct omap_sham_dev *dd)
                                        SHA_REG_SYSSTATUS_RESETDONE))
                        return -ETIMEDOUT;
 
-               dd->flags |= FLAGS_INIT;
+               set_bit(FLAGS_INIT, &dd->flags);
                dd->err = 0;
        }
 
@@ -269,7 +271,7 @@ static void omap_sham_write_ctrl(struct omap_sham_dev *dd, size_t length,
         * Setting ALGO_CONST only for the first iteration
         * and CLOSE_HASH only for the last one.
         */
-       if (ctx->flags & FLAGS_SHA1)
+       if (ctx->flags & BIT(FLAGS_SHA1))
                val |= SHA_REG_CTRL_ALGO;
        if (!ctx->digcnt)
                val |= SHA_REG_CTRL_ALGO_CONST;
@@ -301,7 +303,9 @@ static int omap_sham_xmit_cpu(struct omap_sham_dev *dd, const u8 *buf,
                return -ETIMEDOUT;
 
        if (final)
-               ctx->flags |= FLAGS_FINAL; /* catch last interrupt */
+               set_bit(FLAGS_FINAL, &dd->flags); /* catch last interrupt */
+
+       set_bit(FLAGS_CPU, &dd->flags);
 
        len32 = DIV_ROUND_UP(length, sizeof(u32));
 
@@ -334,9 +338,9 @@ static int omap_sham_xmit_dma(struct omap_sham_dev *dd, dma_addr_t dma_addr,
        ctx->digcnt += length;
 
        if (final)
-               ctx->flags |= FLAGS_FINAL; /* catch last interrupt */
+               set_bit(FLAGS_FINAL, &dd->flags); /* catch last interrupt */
 
-       dd->flags |= FLAGS_DMA_ACTIVE;
+       set_bit(FLAGS_DMA_ACTIVE, &dd->flags);
 
        omap_start_dma(dd->dma_lch);
 
@@ -392,7 +396,7 @@ static int omap_sham_xmit_dma_map(struct omap_sham_dev *dd,
                return -EINVAL;
        }
 
-       ctx->flags &= ~FLAGS_SG;
+       ctx->flags &= ~BIT(FLAGS_SG);
 
        /* next call does not fail... so no unmap in the case of error */
        return omap_sham_xmit_dma(dd, ctx->dma_addr, length, final);
@@ -406,7 +410,7 @@ static int omap_sham_update_dma_slow(struct omap_sham_dev *dd)
 
        omap_sham_append_sg(ctx);
 
-       final = (ctx->flags & FLAGS_FINUP) && !ctx->total;
+       final = (ctx->flags & BIT(FLAGS_FINUP)) && !ctx->total;
 
        dev_dbg(dd->dev, "slow: bufcnt: %u, digcnt: %d, final: %d\n",
                                         ctx->bufcnt, ctx->digcnt, final);
@@ -452,7 +456,7 @@ static int omap_sham_update_dma_start(struct omap_sham_dev *dd)
        length = min(ctx->total, sg->length);
 
        if (sg_is_last(sg)) {
-               if (!(ctx->flags & FLAGS_FINUP)) {
+               if (!(ctx->flags & BIT(FLAGS_FINUP))) {
                        /* not last sg must be SHA1_MD5_BLOCK_SIZE aligned */
                        tail = length & (SHA1_MD5_BLOCK_SIZE - 1);
                        /* without finup() we need one block to close hash */
@@ -467,12 +471,12 @@ static int omap_sham_update_dma_start(struct omap_sham_dev *dd)
                return -EINVAL;
        }
 
-       ctx->flags |= FLAGS_SG;
+       ctx->flags |= BIT(FLAGS_SG);
 
        ctx->total -= length;
        ctx->offset = length; /* offset where to start slow */
 
-       final = (ctx->flags & FLAGS_FINUP) && !ctx->total;
+       final = (ctx->flags & BIT(FLAGS_FINUP)) && !ctx->total;
 
        /* next call does not fail... so no unmap in the case of error */
        return omap_sham_xmit_dma(dd, sg_dma_address(ctx->sg), length, final);
@@ -495,7 +499,7 @@ static int omap_sham_update_dma_stop(struct omap_sham_dev *dd)
        struct omap_sham_reqctx *ctx = ahash_request_ctx(dd->req);
 
        omap_stop_dma(dd->dma_lch);
-       if (ctx->flags & FLAGS_SG) {
+       if (ctx->flags & BIT(FLAGS_SG)) {
                dma_unmap_sg(dd->dev, ctx->sg, 1, DMA_TO_DEVICE);
                if (ctx->sg->length == ctx->offset) {
                        ctx->sg = sg_next(ctx->sg);
@@ -537,18 +541,18 @@ static int omap_sham_init(struct ahash_request *req)
                crypto_ahash_digestsize(tfm));
 
        if (crypto_ahash_digestsize(tfm) == SHA1_DIGEST_SIZE)
-               ctx->flags |= FLAGS_SHA1;
+               ctx->flags |= BIT(FLAGS_SHA1);
 
        ctx->bufcnt = 0;
        ctx->digcnt = 0;
        ctx->buflen = BUFLEN;
 
-       if (tctx->flags & FLAGS_HMAC) {
+       if (tctx->flags & BIT(FLAGS_HMAC)) {
                struct omap_sham_hmac_ctx *bctx = tctx->base;
 
                memcpy(ctx->buffer, bctx->ipad, SHA1_MD5_BLOCK_SIZE);
                ctx->bufcnt = SHA1_MD5_BLOCK_SIZE;
-               ctx->flags |= FLAGS_HMAC;
+               ctx->flags |= BIT(FLAGS_HMAC);
        }
 
        return 0;
@@ -562,9 +566,9 @@ static int omap_sham_update_req(struct omap_sham_dev *dd)
        int err;
 
        dev_dbg(dd->dev, "update_req: total: %u, digcnt: %d, finup: %d\n",
-                ctx->total, ctx->digcnt, (ctx->flags & FLAGS_FINUP) != 0);
+                ctx->total, ctx->digcnt, (ctx->flags & BIT(FLAGS_FINUP)) != 0);
 
-       if (ctx->flags & FLAGS_CPU)
+       if (ctx->flags & BIT(FLAGS_CPU))
                err = omap_sham_update_cpu(dd);
        else
                err = omap_sham_update_dma_start(dd);
@@ -624,7 +628,7 @@ static int omap_sham_finish(struct ahash_request *req)
 
        if (ctx->digcnt) {
                omap_sham_copy_ready_hash(req);
-               if (ctx->flags & FLAGS_HMAC)
+               if (ctx->flags & BIT(FLAGS_HMAC))
                        err = omap_sham_finish_hmac(req);
        }
 
@@ -639,18 +643,23 @@ static void omap_sham_finish_req(struct ahash_request *req, int err)
        struct omap_sham_dev *dd = ctx->dd;
 
        if (!err) {
-               omap_sham_copy_hash(ctx->dd->req, 1);
-               if (ctx->flags & FLAGS_FINAL)
+               omap_sham_copy_hash(req, 1);
+               if (test_bit(FLAGS_FINAL, &dd->flags))
                        err = omap_sham_finish(req);
        } else {
-               ctx->flags |= FLAGS_ERROR;
+               ctx->flags |= BIT(FLAGS_ERROR);
        }
 
+       /* atomic operation is not needed here */
+       dd->flags &= ~(BIT(FLAGS_BUSY) | BIT(FLAGS_FINAL) | BIT(FLAGS_CPU) |
+                       BIT(FLAGS_DMA_READY) | BIT(FLAGS_OUTPUT_READY));
        clk_disable(dd->iclk);
-       dd->flags &= ~FLAGS_BUSY;
 
        if (req->base.complete)
                req->base.complete(&req->base, err);
+
+       /* handle new request */
+       tasklet_schedule(&dd->done_task);
 }
 
 static int omap_sham_handle_queue(struct omap_sham_dev *dd,
@@ -658,21 +667,20 @@ static int omap_sham_handle_queue(struct omap_sham_dev *dd,
 {
        struct crypto_async_request *async_req, *backlog;
        struct omap_sham_reqctx *ctx;
-       struct ahash_request *prev_req;
        unsigned long flags;
        int err = 0, ret = 0;
 
        spin_lock_irqsave(&dd->lock, flags);
        if (req)
                ret = ahash_enqueue_request(&dd->queue, req);
-       if (dd->flags & FLAGS_BUSY) {
+       if (test_bit(FLAGS_BUSY, &dd->flags)) {
                spin_unlock_irqrestore(&dd->lock, flags);
                return ret;
        }
        backlog = crypto_get_backlog(&dd->queue);
        async_req = crypto_dequeue_request(&dd->queue);
        if (async_req)
-               dd->flags |= FLAGS_BUSY;
+               set_bit(FLAGS_BUSY, &dd->flags);
        spin_unlock_irqrestore(&dd->lock, flags);
 
        if (!async_req)
@@ -682,16 +690,12 @@ static int omap_sham_handle_queue(struct omap_sham_dev *dd,
                backlog->complete(backlog, -EINPROGRESS);
 
        req = ahash_request_cast(async_req);
-
-       prev_req = dd->req;
        dd->req = req;
-
        ctx = ahash_request_ctx(req);
 
        dev_dbg(dd->dev, "handling new req, op: %lu, nbytes: %d\n",
                                                ctx->op, req->nbytes);
 
-
        err = omap_sham_hw_init(dd);
        if (err)
                goto err1;
@@ -712,18 +716,16 @@ static int omap_sham_handle_queue(struct omap_sham_dev *dd,
 
        if (ctx->op == OP_UPDATE) {
                err = omap_sham_update_req(dd);
-               if (err != -EINPROGRESS && (ctx->flags & FLAGS_FINUP))
+               if (err != -EINPROGRESS && (ctx->flags & BIT(FLAGS_FINUP)))
                        /* no final() after finup() */
                        err = omap_sham_final_req(dd);
        } else if (ctx->op == OP_FINAL) {
                err = omap_sham_final_req(dd);
        }
 err1:
-       if (err != -EINPROGRESS) {
+       if (err != -EINPROGRESS)
                /* done_task will not finish it, so do it here */
                omap_sham_finish_req(req, err);
-               tasklet_schedule(&dd->queue_task);
-       }
 
        dev_dbg(dd->dev, "exit, err: %d\n", err);
 
@@ -752,7 +754,7 @@ static int omap_sham_update(struct ahash_request *req)
        ctx->sg = req->src;
        ctx->offset = 0;
 
-       if (ctx->flags & FLAGS_FINUP) {
+       if (ctx->flags & BIT(FLAGS_FINUP)) {
                if ((ctx->digcnt + ctx->bufcnt + ctx->total) < 9) {
                        /*
                        * OMAP HW accel works only with buffers >= 9
@@ -765,7 +767,7 @@ static int omap_sham_update(struct ahash_request *req)
                        /*
                        * faster to use CPU for short transfers
                        */
-                       ctx->flags |= FLAGS_CPU;
+                       ctx->flags |= BIT(FLAGS_CPU);
                }
        } else if (ctx->bufcnt + ctx->total < ctx->buflen) {
                omap_sham_append_sg(ctx);
@@ -802,9 +804,9 @@ static int omap_sham_final(struct ahash_request *req)
 {
        struct omap_sham_reqctx *ctx = ahash_request_ctx(req);
 
-       ctx->flags |= FLAGS_FINUP;
+       ctx->flags |= BIT(FLAGS_FINUP);
 
-       if (ctx->flags & FLAGS_ERROR)
+       if (ctx->flags & BIT(FLAGS_ERROR))
                return 0; /* uncompleted hash is not needed */
 
        /* OMAP HW accel works only with buffers >= 9 */
@@ -823,7 +825,7 @@ static int omap_sham_finup(struct ahash_request *req)
        struct omap_sham_reqctx *ctx = ahash_request_ctx(req);
        int err1, err2;
 
-       ctx->flags |= FLAGS_FINUP;
+       ctx->flags |= BIT(FLAGS_FINUP);
 
        err1 = omap_sham_update(req);
        if (err1 == -EINPROGRESS || err1 == -EBUSY)
@@ -895,7 +897,7 @@ static int omap_sham_cra_init_alg(struct crypto_tfm *tfm, const char *alg_base)
 
        if (alg_base) {
                struct omap_sham_hmac_ctx *bctx = tctx->base;
-               tctx->flags |= FLAGS_HMAC;
+               tctx->flags |= BIT(FLAGS_HMAC);
                bctx->shash = crypto_alloc_shash(alg_base, 0,
                                                CRYPTO_ALG_NEED_FALLBACK);
                if (IS_ERR(bctx->shash)) {
@@ -932,7 +934,7 @@ static void omap_sham_cra_exit(struct crypto_tfm *tfm)
        crypto_free_shash(tctx->fallback);
        tctx->fallback = NULL;
 
-       if (tctx->flags & FLAGS_HMAC) {
+       if (tctx->flags & BIT(FLAGS_HMAC)) {
                struct omap_sham_hmac_ctx *bctx = tctx->base;
                crypto_free_shash(bctx->shash);
        }
@@ -1036,51 +1038,46 @@ static struct ahash_alg algs[] = {
 static void omap_sham_done_task(unsigned long data)
 {
        struct omap_sham_dev *dd = (struct omap_sham_dev *)data;
-       struct ahash_request *req = dd->req;
-       struct omap_sham_reqctx *ctx = ahash_request_ctx(req);
-       int ready = 0, err = 0;
+       int err = 0;
 
-       if (ctx->flags & FLAGS_OUTPUT_READY) {
-               ctx->flags &= ~FLAGS_OUTPUT_READY;
-               ready = 1;
+       if (!test_bit(FLAGS_BUSY, &dd->flags)) {
+               omap_sham_handle_queue(dd, NULL);
+               return;
        }
 
-       if (dd->flags & FLAGS_DMA_ACTIVE) {
-               dd->flags &= ~FLAGS_DMA_ACTIVE;
-               omap_sham_update_dma_stop(dd);
-               if (!dd->err)
+       if (test_bit(FLAGS_CPU, &dd->flags)) {
+               if (test_and_clear_bit(FLAGS_OUTPUT_READY, &dd->flags))
+                       goto finish;
+       } else if (test_bit(FLAGS_DMA_READY, &dd->flags)) {
+               if (test_and_clear_bit(FLAGS_DMA_ACTIVE, &dd->flags)) {
+                       omap_sham_update_dma_stop(dd);
+                       if (dd->err) {
+                               err = dd->err;
+                               goto finish;
+                       }
+               }
+               if (test_and_clear_bit(FLAGS_OUTPUT_READY, &dd->flags)) {
+                       /* hash or semi-hash ready */
+                       clear_bit(FLAGS_DMA_READY, &dd->flags);
                        err = omap_sham_update_dma_start(dd);
+                       if (err != -EINPROGRESS)
+                               goto finish;
+               }
        }
 
-       err = dd->err ? : err;
-
-       if (err != -EINPROGRESS && (ready || err)) {
-               dev_dbg(dd->dev, "update done: err: %d\n", err);
-               /* finish curent request */
-               omap_sham_finish_req(req, err);
-               /* start new request */
-               omap_sham_handle_queue(dd, NULL);
-       }
-}
-
-static void omap_sham_queue_task(unsigned long data)
-{
-       struct omap_sham_dev *dd = (struct omap_sham_dev *)data;
+       return;
 
-       omap_sham_handle_queue(dd, NULL);
+finish:
+       dev_dbg(dd->dev, "update done: err: %d\n", err);
+       /* finish current request */
+       omap_sham_finish_req(dd->req, err);
 }
 
 static irqreturn_t omap_sham_irq(int irq, void *dev_id)
 {
        struct omap_sham_dev *dd = dev_id;
-       struct omap_sham_reqctx *ctx = ahash_request_ctx(dd->req);
-
-       if (!ctx) {
-               dev_err(dd->dev, "unknown interrupt.\n");
-               return IRQ_HANDLED;
-       }
 
-       if (unlikely(ctx->flags & FLAGS_FINAL))
+       if (unlikely(test_bit(FLAGS_FINAL, &dd->flags)))
                /* final -> allow device to go to power-saving mode */
                omap_sham_write_mask(dd, SHA_REG_CTRL, 0, SHA_REG_CTRL_LENGTH);
 
@@ -1088,8 +1085,12 @@ static irqreturn_t omap_sham_irq(int irq, void *dev_id)
                                 SHA_REG_CTRL_OUTPUT_READY);
        omap_sham_read(dd, SHA_REG_CTRL);
 
-       ctx->flags |= FLAGS_OUTPUT_READY;
-       dd->err = 0;
+       if (!test_bit(FLAGS_BUSY, &dd->flags)) {
+               dev_warn(dd->dev, "Interrupt when no active requests.\n");
+               return IRQ_HANDLED;
+       }
+
+       set_bit(FLAGS_OUTPUT_READY, &dd->flags);
        tasklet_schedule(&dd->done_task);
 
        return IRQ_HANDLED;
@@ -1102,9 +1103,10 @@ static void omap_sham_dma_callback(int lch, u16 ch_status, void *data)
        if (ch_status != OMAP_DMA_BLOCK_IRQ) {
                pr_err("omap-sham DMA error status: 0x%hx\n", ch_status);
                dd->err = -EIO;
-               dd->flags &= ~FLAGS_INIT; /* request to re-initialize */
+               clear_bit(FLAGS_INIT, &dd->flags);/* request to re-initialize */
        }
 
+       set_bit(FLAGS_DMA_READY, &dd->flags);
        tasklet_schedule(&dd->done_task);
 }
 
@@ -1151,7 +1153,6 @@ static int __devinit omap_sham_probe(struct platform_device *pdev)
        INIT_LIST_HEAD(&dd->list);
        spin_lock_init(&dd->lock);
        tasklet_init(&dd->done_task, omap_sham_done_task, (unsigned long)dd);
-       tasklet_init(&dd->queue_task, omap_sham_queue_task, (unsigned long)dd);
        crypto_init_queue(&dd->queue, OMAP_SHAM_QUEUE_LENGTH);
 
        dd->irq = -1;
@@ -1260,7 +1261,6 @@ static int __devexit omap_sham_remove(struct platform_device *pdev)
        for (i = 0; i < ARRAY_SIZE(algs); i++)
                crypto_unregister_ahash(&algs[i]);
        tasklet_kill(&dd->done_task);
-       tasklet_kill(&dd->queue_task);
        iounmap(dd->io_base);
        clk_put(dd->iclk);
        omap_sham_dma_cleanup(dd);
index 854e263..8a0bb41 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * talitos - Freescale Integrated Security Engine (SEC) device driver
  *
- * Copyright (c) 2008-2010 Freescale Semiconductor, Inc.
+ * Copyright (c) 2008-2011 Freescale Semiconductor, Inc.
  *
  * Scatterlist Crypto API glue code copied from files with the following:
  * Copyright (c) 2006-2007 Herbert Xu <herbert@gondor.apana.org.au>
@@ -282,6 +282,7 @@ static int init_device(struct device *dev)
 /**
  * talitos_submit - submits a descriptor to the device for processing
  * @dev:       the SEC device to be used
+ * @ch:                the SEC device channel to be used
  * @desc:      the descriptor to be processed by the device
  * @callback:  whom to call when processing is complete
  * @context:   a handle for use by caller (optional)
@@ -290,7 +291,7 @@ static int init_device(struct device *dev)
  * callback must check err and feedback in descriptor header
  * for device processing status.
  */
-static int talitos_submit(struct device *dev, struct talitos_desc *desc,
+static int talitos_submit(struct device *dev, int ch, struct talitos_desc *desc,
                          void (*callback)(struct device *dev,
                                           struct talitos_desc *desc,
                                           void *context, int error),
@@ -298,15 +299,9 @@ static int talitos_submit(struct device *dev, struct talitos_desc *desc,
 {
        struct talitos_private *priv = dev_get_drvdata(dev);
        struct talitos_request *request;
-       unsigned long flags, ch;
+       unsigned long flags;
        int head;
 
-       /* select done notification */
-       desc->hdr |= DESC_HDR_DONE_NOTIFY;
-
-       /* emulate SEC's round-robin channel fifo polling scheme */
-       ch = atomic_inc_return(&priv->last_chan) & (priv->num_channels - 1);
-
        spin_lock_irqsave(&priv->chan[ch].head_lock, flags);
 
        if (!atomic_inc_not_zero(&priv->chan[ch].submit_count)) {
@@ -706,6 +701,7 @@ static void talitos_unregister_rng(struct device *dev)
 
 struct talitos_ctx {
        struct device *dev;
+       int ch;
        __be32 desc_hdr_template;
        u8 key[TALITOS_MAX_KEY_SIZE];
        u8 iv[TALITOS_MAX_IV_LENGTH];
@@ -1117,7 +1113,7 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq,
        map_single_talitos_ptr(dev, &desc->ptr[6], ivsize, ctx->iv, 0,
                               DMA_FROM_DEVICE);
 
-       ret = talitos_submit(dev, desc, callback, areq);
+       ret = talitos_submit(dev, ctx->ch, desc, callback, areq);
        if (ret != -EINPROGRESS) {
                ipsec_esp_unmap(dev, edesc, areq);
                kfree(edesc);
@@ -1382,22 +1378,11 @@ static int ablkcipher_setkey(struct crypto_ablkcipher *cipher,
                             const u8 *key, unsigned int keylen)
 {
        struct talitos_ctx *ctx = crypto_ablkcipher_ctx(cipher);
-       struct ablkcipher_alg *alg = crypto_ablkcipher_alg(cipher);
-
-       if (keylen > TALITOS_MAX_KEY_SIZE)
-               goto badkey;
-
-       if (keylen < alg->min_keysize || keylen > alg->max_keysize)
-               goto badkey;
 
        memcpy(&ctx->key, key, keylen);
        ctx->keylen = keylen;
 
        return 0;
-
-badkey:
-       crypto_ablkcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
-       return -EINVAL;
 }
 
 static void common_nonsnoop_unmap(struct device *dev,
@@ -1433,7 +1418,6 @@ static void ablkcipher_done(struct device *dev,
 
 static int common_nonsnoop(struct talitos_edesc *edesc,
                           struct ablkcipher_request *areq,
-                          u8 *giv,
                           void (*callback) (struct device *dev,
                                             struct talitos_desc *desc,
                                             void *context, int error))
@@ -1453,7 +1437,7 @@ static int common_nonsnoop(struct talitos_edesc *edesc,
 
        /* cipher iv */
        ivsize = crypto_ablkcipher_ivsize(cipher);
-       map_single_talitos_ptr(dev, &desc->ptr[1], ivsize, giv ?: areq->info, 0,
+       map_single_talitos_ptr(dev, &desc->ptr[1], ivsize, areq->info, 0,
                               DMA_TO_DEVICE);
 
        /* cipher key */
@@ -1524,7 +1508,7 @@ static int common_nonsnoop(struct talitos_edesc *edesc,
        to_talitos_ptr(&desc->ptr[6], 0);
        desc->ptr[6].j_extent = 0;
 
-       ret = talitos_submit(dev, desc, callback, areq);
+       ret = talitos_submit(dev, ctx->ch, desc, callback, areq);
        if (ret != -EINPROGRESS) {
                common_nonsnoop_unmap(dev, edesc, areq);
                kfree(edesc);
@@ -1556,7 +1540,7 @@ static int ablkcipher_encrypt(struct ablkcipher_request *areq)
        /* set encrypt */
        edesc->desc.hdr = ctx->desc_hdr_template | DESC_HDR_MODE0_ENCRYPT;
 
-       return common_nonsnoop(edesc, areq, NULL, ablkcipher_done);
+       return common_nonsnoop(edesc, areq, ablkcipher_done);
 }
 
 static int ablkcipher_decrypt(struct ablkcipher_request *areq)
@@ -1572,7 +1556,7 @@ static int ablkcipher_decrypt(struct ablkcipher_request *areq)
 
        edesc->desc.hdr = ctx->desc_hdr_template | DESC_HDR_DIR_INBOUND;
 
-       return common_nonsnoop(edesc, areq, NULL, ablkcipher_done);
+       return common_nonsnoop(edesc, areq, ablkcipher_done);
 }
 
 static void common_nonsnoop_hash_unmap(struct device *dev,
@@ -1703,7 +1687,7 @@ static int common_nonsnoop_hash(struct talitos_edesc *edesc,
        /* last DWORD empty */
        desc->ptr[6] = zero_entry;
 
-       ret = talitos_submit(dev, desc, callback, areq);
+       ret = talitos_submit(dev, ctx->ch, desc, callback, areq);
        if (ret != -EINPROGRESS) {
                common_nonsnoop_hash_unmap(dev, edesc, areq);
                kfree(edesc);
@@ -2244,6 +2228,7 @@ static int talitos_cra_init(struct crypto_tfm *tfm)
        struct crypto_alg *alg = tfm->__crt_alg;
        struct talitos_crypto_alg *talitos_alg;
        struct talitos_ctx *ctx = crypto_tfm_ctx(tfm);
+       struct talitos_private *priv;
 
        if ((alg->cra_flags & CRYPTO_ALG_TYPE_MASK) == CRYPTO_ALG_TYPE_AHASH)
                talitos_alg = container_of(__crypto_ahash_alg(alg),
@@ -2256,9 +2241,17 @@ static int talitos_cra_init(struct crypto_tfm *tfm)
        /* update context with ptr to dev */
        ctx->dev = talitos_alg->dev;
 
+       /* assign SEC channel to tfm in round-robin fashion */
+       priv = dev_get_drvdata(ctx->dev);
+       ctx->ch = atomic_inc_return(&priv->last_chan) &
+                 (priv->num_channels - 1);
+
        /* copy descriptor header template value */
        ctx->desc_hdr_template = talitos_alg->algt.desc_hdr_template;
 
+       /* select done notification */
+       ctx->desc_hdr_template |= DESC_HDR_DONE_NOTIFY;
+
        return 0;
 }
 
index ec7b060..3701b62 100644 (file)
@@ -1,4 +1,5 @@
 #include <linux/kernel.h>
+#include <linux/of.h>
 #include <linux/of_pci.h>
 #include <asm/prom.h>
 
index 55ef181..2c366b5 100644 (file)
@@ -161,6 +161,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_NMI              16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI              18
+#define KVM_EXIT_PAPR_HCALL      19
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -264,6 +265,11 @@ struct kvm_run {
                struct {
                        __u64 gprs[32];
                } osi;
+               struct {
+                       __u64 nr;
+                       __u64 ret;
+                       __u64 args[9];
+               } papr_hcall;
                /* Fix the size of the union. */
                char padding[256];
        };
@@ -544,6 +550,9 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_TSC_CONTROL 60
 #define KVM_CAP_GET_TSC_KHZ 61
 #define KVM_CAP_PPC_BOOKE_SREGS 62
+#define KVM_CAP_SPAPR_TCE 63
+#define KVM_CAP_PPC_SMT 64
+#define KVM_CAP_PPC_RMA        65
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -746,6 +755,9 @@ struct kvm_clock_data {
 /* Available with KVM_CAP_XCRS */
 #define KVM_GET_XCRS             _IOR(KVMIO,  0xa6, struct kvm_xcrs)
 #define KVM_SET_XCRS             _IOW(KVMIO,  0xa7, struct kvm_xcrs)
+#define KVM_CREATE_SPAPR_TCE     _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
+/* Available with KVM_CAP_PPC_RMA */
+#define KVM_ALLOCATE_RMA         _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU    (1 << 0)
 
@@ -773,20 +785,14 @@ struct kvm_assigned_pci_dev {
 
 struct kvm_assigned_irq {
        __u32 assigned_dev_id;
-       __u32 host_irq;
+       __u32 host_irq; /* ignored (legacy field) */
        __u32 guest_irq;
        __u32 flags;
        union {
-               struct {
-                       __u32 addr_lo;
-                       __u32 addr_hi;
-                       __u32 data;
-               } guest_msi;
                __u32 reserved[12];
        };
 };
 
-
 struct kvm_assigned_msix_nr {
        __u32 assigned_dev_id;
        __u16 entry_nr;
index 31ebb59..eabb21a 100644 (file)
@@ -47,6 +47,7 @@
 #define KVM_REQ_DEACTIVATE_FPU    10
 #define KVM_REQ_EVENT             11
 #define KVM_REQ_APF_HALT          12
+#define KVM_REQ_STEAL_UPDATE      13
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID    0
 
@@ -326,12 +327,17 @@ static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
 static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
 
 extern struct page *bad_page;
+extern struct page *fault_page;
+
 extern pfn_t bad_pfn;
+extern pfn_t fault_pfn;
 
 int is_error_page(struct page *page);
 int is_error_pfn(pfn_t pfn);
 int is_hwpoison_pfn(pfn_t pfn);
 int is_fault_pfn(pfn_t pfn);
+int is_noslot_pfn(pfn_t pfn);
+int is_invalid_pfn(pfn_t pfn);
 int kvm_is_error_hva(unsigned long addr);
 int kvm_set_memory_region(struct kvm *kvm,
                          struct kvm_userspace_memory_region *mem,
@@ -381,6 +387,8 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
                          unsigned long len);
 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
+int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+                          void *data, unsigned long len);
 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
                         int offset, int len);
 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h
new file mode 100644 (file)
index 0000000..44d8dec
--- /dev/null
@@ -0,0 +1,504 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM xen
+
+#if !defined(_TRACE_XEN_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_XEN_H
+
+#include <linux/tracepoint.h>
+#include <asm/paravirt_types.h>
+#include <asm/xen/trace_types.h>
+
+/* Multicalls */
+DECLARE_EVENT_CLASS(xen_mc__batch,
+           TP_PROTO(enum paravirt_lazy_mode mode),
+           TP_ARGS(mode),
+           TP_STRUCT__entry(
+                   __field(enum paravirt_lazy_mode, mode)
+                   ),
+           TP_fast_assign(__entry->mode = mode),
+           TP_printk("start batch LAZY_%s",
+                     (__entry->mode == PARAVIRT_LAZY_MMU) ? "MMU" :
+                     (__entry->mode == PARAVIRT_LAZY_CPU) ? "CPU" : "NONE")
+       );
+#define DEFINE_XEN_MC_BATCH(name)                      \
+       DEFINE_EVENT(xen_mc__batch, name,               \
+               TP_PROTO(enum paravirt_lazy_mode mode), \
+                    TP_ARGS(mode))
+
+DEFINE_XEN_MC_BATCH(xen_mc_batch);
+DEFINE_XEN_MC_BATCH(xen_mc_issue);
+
+TRACE_EVENT(xen_mc_entry,
+           TP_PROTO(struct multicall_entry *mc, unsigned nargs),
+           TP_ARGS(mc, nargs),
+           TP_STRUCT__entry(
+                   __field(unsigned int, op)
+                   __field(unsigned int, nargs)
+                   __array(unsigned long, args, 6)
+                   ),
+           TP_fast_assign(__entry->op = mc->op;
+                          __entry->nargs = nargs;
+                          memcpy(__entry->args, mc->args, sizeof(unsigned long) * nargs);
+                          memset(__entry->args + nargs, 0, sizeof(unsigned long) * (6 - nargs));
+                   ),
+           TP_printk("op %u%s args [%lx, %lx, %lx, %lx, %lx, %lx]",
+                     __entry->op, xen_hypercall_name(__entry->op),
+                     __entry->args[0], __entry->args[1], __entry->args[2],
+                     __entry->args[3], __entry->args[4], __entry->args[5])
+       );
+
+TRACE_EVENT(xen_mc_entry_alloc,
+           TP_PROTO(size_t args),
+           TP_ARGS(args),
+           TP_STRUCT__entry(
+                   __field(size_t, args)
+                   ),
+           TP_fast_assign(__entry->args = args),
+           TP_printk("alloc entry %zu arg bytes", __entry->args)
+       );
+
+TRACE_EVENT(xen_mc_callback,
+           TP_PROTO(xen_mc_callback_fn_t fn, void *data),
+           TP_ARGS(fn, data),
+           TP_STRUCT__entry(
+                   __field(xen_mc_callback_fn_t, fn)
+                   __field(void *, data)
+                   ),
+           TP_fast_assign(
+                   __entry->fn = fn;
+                   __entry->data = data;
+                   ),
+           TP_printk("callback %pf, data %p",
+                     __entry->fn, __entry->data)
+       );
+
+TRACE_EVENT(xen_mc_flush_reason,
+           TP_PROTO(enum xen_mc_flush_reason reason),
+           TP_ARGS(reason),
+           TP_STRUCT__entry(
+                   __field(enum xen_mc_flush_reason, reason)
+                   ),
+           TP_fast_assign(__entry->reason = reason),
+           TP_printk("flush reason %s",
+                     (__entry->reason == XEN_MC_FL_NONE) ? "NONE" :
+                     (__entry->reason == XEN_MC_FL_BATCH) ? "BATCH" :
+                     (__entry->reason == XEN_MC_FL_ARGS) ? "ARGS" :
+                     (__entry->reason == XEN_MC_FL_CALLBACK) ? "CALLBACK" : "??")
+       );
+
+TRACE_EVENT(xen_mc_flush,
+           TP_PROTO(unsigned mcidx, unsigned argidx, unsigned cbidx),
+           TP_ARGS(mcidx, argidx, cbidx),
+           TP_STRUCT__entry(
+                   __field(unsigned, mcidx)
+                   __field(unsigned, argidx)
+                   __field(unsigned, cbidx)
+                   ),
+           TP_fast_assign(__entry->mcidx = mcidx;
+                          __entry->argidx = argidx;
+                          __entry->cbidx = cbidx),
+           TP_printk("flushing %u hypercalls, %u arg bytes, %u callbacks",
+                     __entry->mcidx, __entry->argidx, __entry->cbidx)
+       );
+
+TRACE_EVENT(xen_mc_extend_args,
+           TP_PROTO(unsigned long op, size_t args, enum xen_mc_extend_args res),
+           TP_ARGS(op, args, res),
+           TP_STRUCT__entry(
+                   __field(unsigned int, op)
+                   __field(size_t, args)
+                   __field(enum xen_mc_extend_args, res)
+                   ),
+           TP_fast_assign(__entry->op = op;
+                          __entry->args = args;
+                          __entry->res = res),
+           TP_printk("extending op %u%s by %zu bytes res %s",
+                     __entry->op, xen_hypercall_name(__entry->op),
+                     __entry->args,
+                     __entry->res == XEN_MC_XE_OK ? "OK" :
+                     __entry->res == XEN_MC_XE_BAD_OP ? "BAD_OP" :
+                     __entry->res == XEN_MC_XE_NO_SPACE ? "NO_SPACE" : "???")
+       );
+
+/* mmu */
+DECLARE_EVENT_CLASS(xen_mmu__set_pte,
+           TP_PROTO(pte_t *ptep, pte_t pteval),
+           TP_ARGS(ptep, pteval),
+           TP_STRUCT__entry(
+                   __field(pte_t *, ptep)
+                   __field(pteval_t, pteval)
+                   ),
+           TP_fast_assign(__entry->ptep = ptep;
+                          __entry->pteval = pteval.pte),
+           TP_printk("ptep %p pteval %0*llx (raw %0*llx)",
+                     __entry->ptep,
+                     (int)sizeof(pteval_t) * 2, (unsigned long long)pte_val(native_make_pte(__entry->pteval)),
+                     (int)sizeof(pteval_t) * 2, (unsigned long long)__entry->pteval)
+       );
+
+#define DEFINE_XEN_MMU_SET_PTE(name)                           \
+       DEFINE_EVENT(xen_mmu__set_pte, name,                    \
+                    TP_PROTO(pte_t *ptep, pte_t pteval),       \
+                    TP_ARGS(ptep, pteval))
+
+DEFINE_XEN_MMU_SET_PTE(xen_mmu_set_pte);
+DEFINE_XEN_MMU_SET_PTE(xen_mmu_set_pte_atomic);
+
+TRACE_EVENT(xen_mmu_set_domain_pte,
+           TP_PROTO(pte_t *ptep, pte_t pteval, unsigned domid),
+           TP_ARGS(ptep, pteval, domid),
+           TP_STRUCT__entry(
+                   __field(pte_t *, ptep)
+                   __field(pteval_t, pteval)
+                   __field(unsigned, domid)
+                   ),
+           TP_fast_assign(__entry->ptep = ptep;
+                          __entry->pteval = pteval.pte;
+                          __entry->domid = domid),
+           TP_printk("ptep %p pteval %0*llx (raw %0*llx) domid %u",
+                     __entry->ptep,
+                     (int)sizeof(pteval_t) * 2, (unsigned long long)pte_val(native_make_pte(__entry->pteval)),
+                     (int)sizeof(pteval_t) * 2, (unsigned long long)__entry->pteval,
+                     __entry->domid)
+       );
+
+TRACE_EVENT(xen_mmu_set_pte_at,
+           TP_PROTO(struct mm_struct *mm, unsigned long addr,
+                    pte_t *ptep, pte_t pteval),
+           TP_ARGS(mm, addr, ptep, pteval),
+           TP_STRUCT__entry(
+                   __field(struct mm_struct *, mm)
+                   __field(unsigned long, addr)
+                   __field(pte_t *, ptep)
+                   __field(pteval_t, pteval)
+                   ),
+           TP_fast_assign(__entry->mm = mm;
+                          __entry->addr = addr;
+                          __entry->ptep = ptep;
+                          __entry->pteval = pteval.pte),
+           TP_printk("mm %p addr %lx ptep %p pteval %0*llx (raw %0*llx)",
+                     __entry->mm, __entry->addr, __entry->ptep,
+                     (int)sizeof(pteval_t) * 2, (unsigned long long)pte_val(native_make_pte(__entry->pteval)),
+                     (int)sizeof(pteval_t) * 2, (unsigned long long)__entry->pteval)
+       );
+
+TRACE_EVENT(xen_mmu_pte_clear,
+           TP_PROTO(struct mm_struct *mm, unsigned long addr, pte_t *ptep),
+           TP_ARGS(mm, addr, ptep),
+           TP_STRUCT__entry(
+                   __field(struct mm_struct *, mm)
+                   __field(unsigned long, addr)
+                   __field(pte_t *, ptep)
+                   ),
+           TP_fast_assign(__entry->mm = mm;
+                          __entry->addr = addr;
+                          __entry->ptep = ptep),
+           TP_printk("mm %p addr %lx ptep %p",
+                     __entry->mm, __entry->addr, __entry->ptep)
+       );
+
+TRACE_EVENT(xen_mmu_set_pmd,
+           TP_PROTO(pmd_t *pmdp, pmd_t pmdval),
+           TP_ARGS(pmdp, pmdval),
+           TP_STRUCT__entry(
+                   __field(pmd_t *, pmdp)
+                   __field(pmdval_t, pmdval)
+                   ),
+           TP_fast_assign(__entry->pmdp = pmdp;
+                          __entry->pmdval = pmdval.pmd),
+           TP_printk("pmdp %p pmdval %0*llx (raw %0*llx)",
+                     __entry->pmdp,
+                     (int)sizeof(pmdval_t) * 2, (unsigned long long)pmd_val(native_make_pmd(__entry->pmdval)),
+                     (int)sizeof(pmdval_t) * 2, (unsigned long long)__entry->pmdval)
+       );
+
+TRACE_EVENT(xen_mmu_pmd_clear,
+           TP_PROTO(pmd_t *pmdp),
+           TP_ARGS(pmdp),
+           TP_STRUCT__entry(
+                   __field(pmd_t *, pmdp)
+                   ),
+           TP_fast_assign(__entry->pmdp = pmdp),
+           TP_printk("pmdp %p", __entry->pmdp)
+       );
+
+#if PAGETABLE_LEVELS >= 4
+
+TRACE_EVENT(xen_mmu_set_pud,
+           TP_PROTO(pud_t *pudp, pud_t pudval),
+           TP_ARGS(pudp, pudval),
+           TP_STRUCT__entry(
+                   __field(pud_t *, pudp)
+                   __field(pudval_t, pudval)
+                   ),
+           TP_fast_assign(__entry->pudp = pudp;
+                          __entry->pudval = native_pud_val(pudval)),
+           TP_printk("pudp %p pudval %0*llx (raw %0*llx)",
+                     __entry->pudp,
+                     (int)sizeof(pudval_t) * 2, (unsigned long long)pud_val(native_make_pud(__entry->pudval)),
+                     (int)sizeof(pudval_t) * 2, (unsigned long long)__entry->pudval)
+       );
+
+TRACE_EVENT(xen_mmu_set_pgd,
+           TP_PROTO(pgd_t *pgdp, pgd_t *user_pgdp, pgd_t pgdval),
+           TP_ARGS(pgdp, user_pgdp, pgdval),
+           TP_STRUCT__entry(
+                   __field(pgd_t *, pgdp)
+                   __field(pgd_t *, user_pgdp)
+                   __field(pgdval_t, pgdval)
+                   ),
+           TP_fast_assign(__entry->pgdp = pgdp;
+                          __entry->user_pgdp = user_pgdp;
+                          __entry->pgdval = pgdval.pgd),
+           TP_printk("pgdp %p user_pgdp %p pgdval %0*llx (raw %0*llx)",
+                     __entry->pgdp, __entry->user_pgdp,
+                     (int)sizeof(pgdval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->pgdval)),
+                     (int)sizeof(pgdval_t) * 2, (unsigned long long)__entry->pgdval)
+       );
+
+TRACE_EVENT(xen_mmu_pud_clear,
+           TP_PROTO(pud_t *pudp),
+           TP_ARGS(pudp),
+           TP_STRUCT__entry(
+                   __field(pud_t *, pudp)
+                   ),
+           TP_fast_assign(__entry->pudp = pudp),
+           TP_printk("pudp %p", __entry->pudp)
+       );
+#else
+
+TRACE_EVENT(xen_mmu_set_pud,
+           TP_PROTO(pud_t *pudp, pud_t pudval),
+           TP_ARGS(pudp, pudval),
+           TP_STRUCT__entry(
+                   __field(pud_t *, pudp)
+                   __field(pudval_t, pudval)
+                   ),
+           TP_fast_assign(__entry->pudp = pudp;
+                          __entry->pudval = native_pud_val(pudval)),
+           TP_printk("pudp %p pudval %0*llx (raw %0*llx)",
+                     __entry->pudp,
+                     (int)sizeof(pudval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->pudval)),
+                     (int)sizeof(pudval_t) * 2, (unsigned long long)__entry->pudval)
+       );
+
+#endif
+
+TRACE_EVENT(xen_mmu_pgd_clear,
+           TP_PROTO(pgd_t *pgdp),
+           TP_ARGS(pgdp),
+           TP_STRUCT__entry(
+                   __field(pgd_t *, pgdp)
+                   ),
+           TP_fast_assign(__entry->pgdp = pgdp),
+           TP_printk("pgdp %p", __entry->pgdp)
+       );
+
+DECLARE_EVENT_CLASS(xen_mmu_ptep_modify_prot,
+           TP_PROTO(struct mm_struct *mm, unsigned long addr,
+                    pte_t *ptep, pte_t pteval),
+           TP_ARGS(mm, addr, ptep, pteval),
+           TP_STRUCT__entry(
+                   __field(struct mm_struct *, mm)
+                   __field(unsigned long, addr)
+                   __field(pte_t *, ptep)
+                   __field(pteval_t, pteval)
+                   ),
+           TP_fast_assign(__entry->mm = mm;
+                          __entry->addr = addr;
+                          __entry->ptep = ptep;
+                          __entry->pteval = pteval.pte),
+           TP_printk("mm %p addr %lx ptep %p pteval %0*llx (raw %0*llx)",
+                     __entry->mm, __entry->addr, __entry->ptep,
+                     (int)sizeof(pteval_t) * 2, (unsigned long long)pte_val(native_make_pte(__entry->pteval)),
+                     (int)sizeof(pteval_t) * 2, (unsigned long long)__entry->pteval)
+       );
+#define DEFINE_XEN_MMU_PTEP_MODIFY_PROT(name)                          \
+       DEFINE_EVENT(xen_mmu_ptep_modify_prot, name,                    \
+                    TP_PROTO(struct mm_struct *mm, unsigned long addr, \
+                             pte_t *ptep, pte_t pteval),               \
+                    TP_ARGS(mm, addr, ptep, pteval))
+
+DEFINE_XEN_MMU_PTEP_MODIFY_PROT(xen_mmu_ptep_modify_prot_start);
+DEFINE_XEN_MMU_PTEP_MODIFY_PROT(xen_mmu_ptep_modify_prot_commit);
+
+TRACE_EVENT(xen_mmu_alloc_ptpage,
+           TP_PROTO(struct mm_struct *mm, unsigned long pfn, unsigned level, bool pinned),
+           TP_ARGS(mm, pfn, level, pinned),
+           TP_STRUCT__entry(
+                   __field(struct mm_struct *, mm)
+                   __field(unsigned long, pfn)
+                   __field(unsigned, level)
+                   __field(bool, pinned)
+                   ),
+           TP_fast_assign(__entry->mm = mm;
+                          __entry->pfn = pfn;
+                          __entry->level = level;
+                          __entry->pinned = pinned),
+           TP_printk("mm %p  pfn %lx  level %d  %spinned",
+                     __entry->mm, __entry->pfn, __entry->level,
+                     __entry->pinned ? "" : "un")
+       );
+
+TRACE_EVENT(xen_mmu_release_ptpage,
+           TP_PROTO(unsigned long pfn, unsigned level, bool pinned),
+           TP_ARGS(pfn, level, pinned),
+           TP_STRUCT__entry(
+                   __field(unsigned long, pfn)
+                   __field(unsigned, level)
+                   __field(bool, pinned)
+                   ),
+           TP_fast_assign(__entry->pfn = pfn;
+                          __entry->level = level;
+                          __entry->pinned = pinned),
+           TP_printk("pfn %lx  level %d  %spinned",
+                     __entry->pfn, __entry->level,
+                     __entry->pinned ? "" : "un")
+       );
+
+DECLARE_EVENT_CLASS(xen_mmu_pgd,
+           TP_PROTO(struct mm_struct *mm, pgd_t *pgd),
+           TP_ARGS(mm, pgd),
+           TP_STRUCT__entry(
+                   __field(struct mm_struct *, mm)
+                   __field(pgd_t *, pgd)
+                   ),
+           TP_fast_assign(__entry->mm = mm;
+                          __entry->pgd = pgd),
+           TP_printk("mm %p pgd %p", __entry->mm, __entry->pgd)
+       );
+#define DEFINE_XEN_MMU_PGD_EVENT(name)                         \
+       DEFINE_EVENT(xen_mmu_pgd, name,                         \
+               TP_PROTO(struct mm_struct *mm, pgd_t *pgd),     \
+                    TP_ARGS(mm, pgd))
+
+DEFINE_XEN_MMU_PGD_EVENT(xen_mmu_pgd_pin);
+DEFINE_XEN_MMU_PGD_EVENT(xen_mmu_pgd_unpin);
+
+TRACE_EVENT(xen_mmu_flush_tlb,
+           TP_PROTO(int x),
+           TP_ARGS(x),
+           TP_STRUCT__entry(__array(char, x, 0)),
+           TP_fast_assign((void)x),
+           TP_printk("%s", "")
+       );
+
+TRACE_EVENT(xen_mmu_flush_tlb_single,
+           TP_PROTO(unsigned long addr),
+           TP_ARGS(addr),
+           TP_STRUCT__entry(
+                   __field(unsigned long, addr)
+                   ),
+           TP_fast_assign(__entry->addr = addr),
+           TP_printk("addr %lx", __entry->addr)
+       );
+
+TRACE_EVENT(xen_mmu_flush_tlb_others,
+           TP_PROTO(const struct cpumask *cpus, struct mm_struct *mm,
+                    unsigned long addr),
+           TP_ARGS(cpus, mm, addr),
+           TP_STRUCT__entry(
+                   __field(unsigned, ncpus)
+                   __field(struct mm_struct *, mm)
+                   __field(unsigned long, addr)
+                   ),
+           TP_fast_assign(__entry->ncpus = cpumask_weight(cpus);
+                          __entry->mm = mm;
+                          __entry->addr = addr),
+           TP_printk("ncpus %d mm %p addr %lx",
+                     __entry->ncpus, __entry->mm, __entry->addr)
+       );
+
+TRACE_EVENT(xen_mmu_write_cr3,
+           TP_PROTO(bool kernel, unsigned long cr3),
+           TP_ARGS(kernel, cr3),
+           TP_STRUCT__entry(
+                   __field(bool, kernel)
+                   __field(unsigned long, cr3)
+                   ),
+           TP_fast_assign(__entry->kernel = kernel;
+                          __entry->cr3 = cr3),
+           TP_printk("%s cr3 %lx",
+                     __entry->kernel ? "kernel" : "user", __entry->cr3)
+       );
+
+
+/* CPU */
+TRACE_EVENT(xen_cpu_write_ldt_entry,
+           TP_PROTO(struct desc_struct *dt, int entrynum, u64 desc),
+           TP_ARGS(dt, entrynum, desc),
+           TP_STRUCT__entry(
+                   __field(struct desc_struct *, dt)
+                   __field(int, entrynum)
+                   __field(u64, desc)
+                   ),
+           TP_fast_assign(__entry->dt = dt;
+                          __entry->entrynum = entrynum;
+                          __entry->desc = desc;
+                   ),
+           TP_printk("dt %p  entrynum %d  entry %016llx",
+                     __entry->dt, __entry->entrynum,
+                     (unsigned long long)__entry->desc)
+       );
+
+TRACE_EVENT(xen_cpu_write_idt_entry,
+           TP_PROTO(gate_desc *dt, int entrynum, const gate_desc *ent),
+           TP_ARGS(dt, entrynum, ent),
+           TP_STRUCT__entry(
+                   __field(gate_desc *, dt)
+                   __field(int, entrynum)
+                   ),
+           TP_fast_assign(__entry->dt = dt;
+                          __entry->entrynum = entrynum;
+                   ),
+           TP_printk("dt %p  entrynum %d",
+                     __entry->dt, __entry->entrynum)
+       );
+
+TRACE_EVENT(xen_cpu_load_idt,
+           TP_PROTO(const struct desc_ptr *desc),
+           TP_ARGS(desc),
+           TP_STRUCT__entry(
+                   __field(unsigned long, addr)
+                   ),
+           TP_fast_assign(__entry->addr = desc->address),
+           TP_printk("addr %lx", __entry->addr)
+       );
+
+TRACE_EVENT(xen_cpu_write_gdt_entry,
+           TP_PROTO(struct desc_struct *dt, int entrynum, const void *desc, int type),
+           TP_ARGS(dt, entrynum, desc, type),
+           TP_STRUCT__entry(
+                   __field(u64, desc)
+                   __field(struct desc_struct *, dt)
+                   __field(int, entrynum)
+                   __field(int, type)
+                   ),
+           TP_fast_assign(__entry->dt = dt;
+                          __entry->entrynum = entrynum;
+                          __entry->desc = *(u64 *)desc;
+                          __entry->type = type;
+                   ),
+           TP_printk("dt %p  entrynum %d  type %d  desc %016llx",
+                     __entry->dt, __entry->entrynum, __entry->type,
+                     (unsigned long long)__entry->desc)
+       );
+
+TRACE_EVENT(xen_cpu_set_ldt,
+           TP_PROTO(const void *addr, unsigned entries),
+           TP_ARGS(addr, entries),
+           TP_STRUCT__entry(
+                   __field(const void *, addr)
+                   __field(unsigned, entries)
+                   ),
+           TP_fast_assign(__entry->addr = addr;
+                          __entry->entries = entries),
+           TP_printk("addr %p  entries %u",
+                     __entry->addr, __entry->entries)
+       );
+
+
+#endif /*  _TRACE_XEN_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
index fc9eb09..18197ae 100644 (file)
@@ -890,6 +890,7 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
        case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
        }
 }
+EXPORT_SYMBOL_GPL(sigset_from_compat);
 
 asmlinkage long
 compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
index ead9b61..418b3f7 100644 (file)
 #include <linux/time.h>
 #include <linux/sysctl.h>
 #include <linux/delayacct.h>
+#include <linux/module.h>
 
 int delayacct_on __read_mostly = 1;    /* Delay accounting turned on/off */
+EXPORT_SYMBOL_GPL(delayacct_on);
 struct kmem_cache *delayacct_cache;
 
 static int __init delayacct_setup_disable(char *str)
index 9aaf567..751a7cc 100644 (file)
@@ -75,6 +75,9 @@
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 #include <asm/mutex.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
 
 #include "sched_cpupri.h"
 #include "workqueue_sched.h"
@@ -528,6 +531,12 @@ struct rq {
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
        u64 prev_irq_time;
 #endif
+#ifdef CONFIG_PARAVIRT
+       u64 prev_steal_time;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+       u64 prev_steal_time_rq;
+#endif
 
        /* calc_load related fields */
        unsigned long calc_load_update;
@@ -1921,10 +1930,28 @@ void account_system_vtime(struct task_struct *curr)
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
 
-static void update_rq_clock_task(struct rq *rq, s64 delta)
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_PARAVIRT
+/*
+ * Convert a steal-time amount into a count of whole TICK_NSEC periods
+ * (presumably nanoseconds, judging by the NSEC_PER_SEC/TICK_NSEC
+ * constants - confirm).
+ *
+ * Amounts above NSEC_PER_SEC are divided with div_u64(); smaller ones go
+ * through __iter_div_u64_rem(), whose remainder output is written into the
+ * local 'steal' and then discarded.
+ */
+static inline u64 steal_ticks(u64 steal)
 {
-       s64 irq_delta;
+       if (unlikely(steal > NSEC_PER_SEC))
+               return div_u64(steal, TICK_NSEC);
 
+       return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compiler should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+       s64 steal = 0, irq_delta = 0;
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
        irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 
        /*
@@ -1947,12 +1974,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 
        rq->prev_irq_time += irq_delta;
        delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+       if (static_branch((&paravirt_steal_rq_enabled))) {
+               u64 st;
+
+               steal = paravirt_steal_clock(cpu_of(rq));
+               steal -= rq->prev_steal_time_rq;
+
+               if (unlikely(steal > delta))
+                       steal = delta;
+
+               st = steal_ticks(steal);
+               steal = st * TICK_NSEC;
+
+               rq->prev_steal_time_rq += steal;
+
+               delta -= steal;
+       }
+#endif
+
        rq->clock_task += delta;
 
-       if (irq_delta && sched_feat(NONIRQ_POWER))
-               sched_rt_avg_update(rq, irq_delta);
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+       if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+               sched_rt_avg_update(rq, irq_delta + steal);
+#endif
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 static int irqtime_account_hi_update(void)
 {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@ -1987,12 +2037,7 @@ static int irqtime_account_si_update(void)
 
 #define sched_clock_irqtime    (0)
 
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-       rq->clock_task += delta;
-}
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif
 
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -3845,6 +3890,25 @@ void account_idle_time(cputime_t cputime)
                cpustat->idle = cputime64_add(cpustat->idle, cputime64);
 }
 
+/*
+ * Account the steal time reported by paravirt_steal_clock() for this CPU.
+ *
+ * The not-yet-accounted amount is converted to whole ticks, charged through
+ * account_steal_time(), and the runqueue's prev_steal_time is advanced by
+ * exactly those ticks so that any sub-tick remainder is kept for later.
+ *
+ * Returns true when at least one whole tick was charged as steal time.
+ * Returns false when nothing was charged, when the paravirt_steal_enabled
+ * branch is off, or when CONFIG_PARAVIRT is not set.
+ */
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+       if (static_branch(&paravirt_steal_enabled)) {
+               struct rq *rq = this_rq();
+               u64 ticks;
+
+               /* Whole ticks contained in the steal time seen since last update. */
+               ticks = steal_ticks(paravirt_steal_clock(smp_processor_id()) -
+                                   rq->prev_steal_time);
+
+               /* Advance by whole ticks only; the remainder carries over. */
+               rq->prev_steal_time += ticks * TICK_NSEC;
+
+               account_steal_time(ticks);
+               return ticks;
+       }
+#endif
+       return false;
+}
+
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3876,6 +3940,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
        cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 
+       if (steal_account_process_tick())
+               return;
+
        if (irqtime_account_hi_update()) {
                cpustat->irq = cputime64_add(cpustat->irq, tmp);
        } else if (irqtime_account_si_update()) {
@@ -3929,6 +3996,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
                return;
        }
 
+       if (steal_account_process_tick())
+               return;
+
        if (user_tick)
                account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
        else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
index 1e7066d..2e74677 100644 (file)
@@ -61,9 +61,9 @@ SCHED_FEAT(LB_BIAS, 1)
 SCHED_FEAT(OWNER_SPIN, 1)
 
 /*
- * Decrement CPU power based on irq activity
+ * Decrement CPU power based on time not spent running tasks
  */
-SCHED_FEAT(NONIRQ_POWER, 1)
+SCHED_FEAT(NONTASK_POWER, 1)
 
 /*
  * Queue remote wakeups on the target CPU and process them
index 6cc4b97..4e9eaeb 100644 (file)
@@ -617,7 +617,7 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
        if (adev->entries_nr == 0) {
                adev->entries_nr = entry_nr->entry_nr;
                if (adev->entries_nr == 0 ||
-                   adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) {
+                   adev->entries_nr > KVM_MAX_MSIX_PER_DEV) {
                        r = -EINVAL;
                        goto msix_nr_out;
                }
index 62a9caf..78c80f6 100644 (file)
 #include <linux/iommu.h>
 #include <linux/intel-iommu.h>
 
+/*
+ * Opt-in switch for device assignment on platforms without interrupt
+ * remapping support.  Declared bool so that the variable's type matches the
+ * "bool" parameter type registered with module_param_named() below.
+ * Readable by everyone, writable by the owner (S_IRUGO | S_IWUSR).
+ */
+static bool allow_unsafe_assigned_interrupts;
+module_param_named(allow_unsafe_assigned_interrupts,
+                  allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(allow_unsafe_assigned_interrupts,
+ "Enable device assignment on platforms without interrupt remapping support.");
+
 static int kvm_iommu_unmap_memslots(struct kvm *kvm);
 static void kvm_iommu_put_pages(struct kvm *kvm,
                                gfn_t base_gfn, unsigned long npages);
@@ -231,6 +237,18 @@ int kvm_iommu_map_guest(struct kvm *kvm)
        if (!kvm->arch.iommu_domain)
                return -ENOMEM;
 
+       /*
+        * Refuse device assignment when the domain does not report
+        * IOMMU_CAP_INTR_REMAP, unless the allow_unsafe_assigned_interrupts
+        * module option overrides the check.  On refusal the domain is
+        * freed and the pointer cleared before returning -EPERM.
+        */
+       if (!allow_unsafe_assigned_interrupts &&
+           !iommu_domain_has_cap(kvm->arch.iommu_domain,
+                                 IOMMU_CAP_INTR_REMAP)) {
+               printk(KERN_WARNING "%s: No interrupt remapping support,"
+                      " disallowing device assignment."
+                      " Re-enable with \"allow_unsafe_assigned_interrupts=1\""
+                      " module option.\n", __func__);
+               iommu_domain_free(kvm->arch.iommu_domain);
+               kvm->arch.iommu_domain = NULL;
+               return -EPERM;
+       }
+
        r = kvm_iommu_map_memslots(kvm);
        if (r)
                goto out_unmap;
index 96ebc06..aefdda3 100644 (file)
@@ -84,6 +84,10 @@ struct dentry *kvm_debugfs_dir;
 
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                           unsigned long arg);
+#ifdef CONFIG_COMPAT
+static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
+                                 unsigned long arg);
+#endif
 static int hardware_enable_all(void);
 static void hardware_disable_all(void);
 
@@ -97,8 +101,8 @@ static bool largepages_enabled = true;
 static struct page *hwpoison_page;
 static pfn_t hwpoison_pfn;
 
-static struct page *fault_page;
-static pfn_t fault_pfn;
+struct page *fault_page;
+pfn_t fault_pfn;
 
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
@@ -827,6 +831,13 @@ skip_lpage:
 
        kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
 
+       /*
+        * If the new memory slot is created, we need to clear all
+        * mmio sptes.
+        */
+       if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
+               kvm_arch_flush_shadow(kvm);
+
        kvm_free_physmem_slot(&old, &new);
        kfree(old_memslots);
 
@@ -927,6 +938,18 @@ int is_fault_pfn(pfn_t pfn)
 }
 EXPORT_SYMBOL_GPL(is_fault_pfn);
 
+/*
+ * Return nonzero when pfn equals the bad_pfn marker value.
+ * NOTE(review): the name suggests bad_pfn stands for "no memslot backs this
+ * gfn" - confirm against where bad_pfn is produced.
+ */
+int is_noslot_pfn(pfn_t pfn)
+{
+       return pfn == bad_pfn;
+}
+EXPORT_SYMBOL_GPL(is_noslot_pfn);
+
+/*
+ * Return nonzero when pfn equals either of the marker values hwpoison_pfn
+ * or fault_pfn; bad_pfn is not checked here.
+ */
+int is_invalid_pfn(pfn_t pfn)
+{
+       return pfn == hwpoison_pfn || pfn == fault_pfn;
+}
+EXPORT_SYMBOL_GPL(is_invalid_pfn);
+
 static inline unsigned long bad_hva(void)
 {
        return PAGE_OFFSET;
@@ -1345,7 +1368,7 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
        addr = gfn_to_hva(kvm, gfn);
        if (kvm_is_error_hva(addr))
                return -EFAULT;
-       r = copy_to_user((void __user *)addr + offset, data, len);
+       r = __copy_to_user((void __user *)addr + offset, data, len);
        if (r)
                return -EFAULT;
        mark_page_dirty(kvm, gfn);
@@ -1405,7 +1428,7 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
        if (kvm_is_error_hva(ghc->hva))
                return -EFAULT;
 
-       r = copy_to_user((void __user *)ghc->hva, data, len);
+       r = __copy_to_user((void __user *)ghc->hva, data, len);
        if (r)
                return -EFAULT;
        mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
@@ -1414,6 +1437,26 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
 
+/*
+ * Read len bytes of guest memory into data using a cached gpa->hva
+ * translation.
+ *
+ * If the memslot generation no longer matches the one stored in the cache,
+ * the cache is re-initialized from ghc->gpa before it is used.
+ *
+ * Returns 0 on success, or -EFAULT when the cached hva is an error hva or
+ * when the copy from user space does not complete.
+ *
+ * NOTE(review): the copy uses __copy_from_user(), presumably relying on the
+ * hva having been validated when the cache was set up - confirm.
+ */
+int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+                          void *data, unsigned long len)
+{
+       struct kvm_memslots *slots = kvm_memslots(kvm);
+
+       /* Stale cache: look the translation up again. */
+       if (ghc->generation != slots->generation)
+               kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
+
+       if (kvm_is_error_hva(ghc->hva))
+               return -EFAULT;
+
+       if (__copy_from_user(data, (void __user *)ghc->hva, len))
+               return -EFAULT;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
+
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
 {
        return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
@@ -1586,7 +1629,9 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
 static struct file_operations kvm_vcpu_fops = {
        .release        = kvm_vcpu_release,
        .unlocked_ioctl = kvm_vcpu_ioctl,
-       .compat_ioctl   = kvm_vcpu_ioctl,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl   = kvm_vcpu_compat_ioctl,
+#endif
        .mmap           = kvm_vcpu_mmap,
        .llseek         = noop_llseek,
 };
@@ -1615,18 +1660,18 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 
        r = kvm_arch_vcpu_setup(vcpu);
        if (r)
-               return r;
+               goto vcpu_destroy;
 
        mutex_lock(&kvm->lock);
        if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
                r = -EINVAL;
-               goto vcpu_destroy;
+               goto unlock_vcpu_destroy;
        }
 
        kvm_for_each_vcpu(r, v, kvm)
                if (v->vcpu_id == id) {
                        r = -EEXIST;
-                       goto vcpu_destroy;
+                       goto unlock_vcpu_destroy;
                }
 
        BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
@@ -1636,7 +1681,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
        r = create_vcpu_fd(vcpu);
        if (r < 0) {
                kvm_put_kvm(kvm);
-               goto vcpu_destroy;
+               goto unlock_vcpu_destroy;
        }
 
        kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
@@ -1650,8 +1695,9 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
        mutex_unlock(&kvm->lock);
        return r;
 
-vcpu_destroy:
+unlock_vcpu_destroy:
        mutex_unlock(&kvm->lock);
+vcpu_destroy:
        kvm_arch_vcpu_destroy(vcpu);
        return r;
 }
@@ -1874,6 +1920,50 @@ out:
        return r;
 }
 
+#ifdef CONFIG_COMPAT
+/*
+ * Compat entry point for vcpu ioctls.
+ *
+ * Only KVM_SET_SIGNAL_MASK is handled here: the signal set supplied by the
+ * caller is a compat_sigset_t and is converted with sigset_from_compat()
+ * before being handed to kvm_vcpu_ioctl_set_sigmask().  Every other ioctl
+ * is forwarded unchanged to kvm_vcpu_ioctl().
+ *
+ * Returns -EIO when called from an mm other than the one recorded in the
+ * VM, -EFAULT when a copy from user space fails, -EINVAL when the supplied
+ * length is not sizeof(compat_sigset_t), and otherwise the result of the
+ * underlying handler.
+ */
+static long kvm_vcpu_compat_ioctl(struct file *filp,
+                                 unsigned int ioctl, unsigned long arg)
+{
+       struct kvm_vcpu *vcpu = filp->private_data;
+       void __user *argp = compat_ptr(arg);
+       int r;
+
+       if (vcpu->kvm->mm != current->mm)
+               return -EIO;
+
+       switch (ioctl) {
+       case KVM_SET_SIGNAL_MASK: {
+               struct kvm_signal_mask __user *sigmask_arg = argp;
+               struct kvm_signal_mask kvm_sigmask;
+               compat_sigset_t csigset;
+               sigset_t sigset;
+
+               if (argp) {
+                       r = -EFAULT;
+                       if (copy_from_user(&kvm_sigmask, argp,
+                                          sizeof kvm_sigmask))
+                               goto out;
+                       r = -EINVAL;
+                       if (kvm_sigmask.len != sizeof csigset)
+                               goto out;
+                       r = -EFAULT;
+                       if (copy_from_user(&csigset, sigmask_arg->sigset,
+                                          sizeof csigset))
+                               goto out;
+                       /* Convert only a set that was actually copied in. */
+                       sigset_from_compat(&sigset, &csigset);
+                       r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
+               } else {
+                       /*
+                        * No mask supplied: csigset was never filled in, so
+                        * it must not be converted and installed.  Pass NULL
+                        * instead of a set built from uninitialized stack
+                        * data.
+                        * NOTE(review): kvm_vcpu_ioctl_set_sigmask() is
+                        * defined outside this hunk; confirm it treats NULL
+                        * as "no mask", as the native ioctl path relies on.
+                        */
+                       r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
+               }
+               break;
+       }
+       default:
+               r = kvm_vcpu_ioctl(filp, ioctl, arg);
+       }
+
+out:
+       return r;
+}
+#endif
+
 static long kvm_vm_ioctl(struct file *filp,
                           unsigned int ioctl, unsigned long arg)
 {