Merge branch 'kvm-updates/2.6.38' of git://git.kernel.org/pub/scm/virt/kvm/kvm
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 13 Jan 2011 18:14:24 +0000 (10:14 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 13 Jan 2011 18:14:24 +0000 (10:14 -0800)
* 'kvm-updates/2.6.38' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (142 commits)
  KVM: Initialize fpu state in preemptible context
  KVM: VMX: when entering real mode align segment base to 16 bytes
  KVM: MMU: handle 'map_writable' in set_spte() function
  KVM: MMU: audit: allow audit more guests at the same time
  KVM: Fetch guest cr3 from hardware on demand
  KVM: Replace reads of vcpu->arch.cr3 by an accessor
  KVM: MMU: only write protect mappings at pagetable level
  KVM: VMX: Correct asm constraint in vmcs_load()/vmcs_clear()
  KVM: MMU: Initialize base_role for tdp mmus
  KVM: VMX: Optimize atomic EFER load
  KVM: VMX: Add definitions for more vm entry/exit control bits
  KVM: SVM: copy instruction bytes from VMCB
  KVM: SVM: implement enhanced INVLPG intercept
  KVM: SVM: enhance mov DR intercept handler
  KVM: SVM: enhance MOV CR intercept handler
  KVM: SVM: add new SVM feature bit names
  KVM: cleanup emulate_instruction
  KVM: move complete_insn_gp() into x86.c
  KVM: x86: fix CR8 handling
  KVM guest: Fix kvm clock initialization when it's configured out
  ...

43 files changed:
Documentation/kernel-parameters.txt
Documentation/kvm/api.txt
Documentation/kvm/cpuid.txt
Documentation/kvm/msr.txt
arch/ia64/include/asm/kvm_host.h
arch/ia64/kvm/kvm-ia64.c
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/powerpc.c
arch/s390/kvm/kvm-s390.c
arch/x86/include/asm/kvm_emulate.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/kvm_para.h
arch/x86/include/asm/svm.h
arch/x86/include/asm/traps.h
arch/x86/include/asm/vmx.h
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/i387.c
arch/x86/kernel/kvm.c
arch/x86/kernel/kvmclock.c
arch/x86/kvm/Kconfig
arch/x86/kvm/Makefile
arch/x86/kvm/emulate.c
arch/x86/kvm/kvm_cache_regs.h
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu_audit.c
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/svm.c
arch/x86/kvm/trace.h
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
include/linux/kvm.h
include/linux/kvm_host.h
include/linux/kvm_types.h
include/trace/events/kvm.h
virt/kvm/Kconfig
virt/kvm/assigned-dev.c
virt/kvm/async_pf.c [new file with mode: 0644]
virt/kvm/async_pf.h [new file with mode: 0644]
virt/kvm/eventfd.c
virt/kvm/irq_comm.c
virt/kvm/kvm_main.c

index 338c96e..55fe759 100644 (file)
@@ -1705,6 +1705,9 @@ and is between 256 and 4096 characters. It is defined in the file
 
        no-kvmclock     [X86,KVM] Disable paravirtualized KVM clock driver
 
+       no-kvmapf       [X86,KVM] Disable paravirtualized asynchronous page
+                       fault handling.
+
        nolapic         [X86-32,APIC] Do not enable or use the local APIC.
 
        nolapic_timer   [X86-32,APIC] Do not use the local APIC timer.
index 50713e3..ad85797 100644 (file)
@@ -1085,6 +1085,184 @@ of 4 instructions that make up a hypercall.
 If any additional field gets added to this structure later on, a bit for that
 additional piece of information will be set in the flags bitmap.
 
+4.47 KVM_ASSIGN_PCI_DEVICE
+
+Capability: KVM_CAP_DEVICE_ASSIGNMENT
+Architectures: x86 ia64
+Type: vm ioctl
+Parameters: struct kvm_assigned_pci_dev (in)
+Returns: 0 on success, -1 on error
+
+Assigns a host PCI device to the VM.
+
+struct kvm_assigned_pci_dev {
+       __u32 assigned_dev_id;
+       __u32 busnr;
+       __u32 devfn;
+       __u32 flags;
+       __u32 segnr;
+       union {
+               __u32 reserved[11];
+       };
+};
+
+The PCI device is specified by the triple segnr, busnr, and devfn.
+Identification in succeeding service requests is done via assigned_dev_id. The
+following flags are specified:
+
+/* Depends on KVM_CAP_IOMMU */
+#define KVM_DEV_ASSIGN_ENABLE_IOMMU    (1 << 0)
+
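+As an illustration only (the vm_fd descriptor and all values here are
+assumed, not part of this API description), a user space VMM could assign
+the device at 0000:01:00.0 like this:
+
+struct kvm_assigned_pci_dev dev = {
+       .assigned_dev_id = 1,           /* caller-chosen handle */
+       .segnr = 0,                     /* PCI segment (domain) */
+       .busnr = 1,
+       .devfn = (0 << 3) | 0,          /* slot 0, function 0 */
+       .flags = KVM_DEV_ASSIGN_ENABLE_IOMMU,
+};
+
+if (ioctl(vm_fd, KVM_ASSIGN_PCI_DEVICE, &dev) < 0)
+       perror("KVM_ASSIGN_PCI_DEVICE");
+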
+4.48 KVM_DEASSIGN_PCI_DEVICE
+
+Capability: KVM_CAP_DEVICE_DEASSIGNMENT
+Architectures: x86 ia64
+Type: vm ioctl
+Parameters: struct kvm_assigned_pci_dev (in)
+Returns: 0 on success, -1 on error
+
+Ends PCI device assignment, releasing all associated resources.
+
+See KVM_ASSIGN_PCI_DEVICE for the data structure. Only assigned_dev_id is
+used in kvm_assigned_pci_dev to identify the device.
+
+4.49 KVM_ASSIGN_DEV_IRQ
+
+Capability: KVM_CAP_ASSIGN_DEV_IRQ
+Architectures: x86 ia64
+Type: vm ioctl
+Parameters: struct kvm_assigned_irq (in)
+Returns: 0 on success, -1 on error
+
+Assigns an IRQ to a passed-through device.
+
+struct kvm_assigned_irq {
+       __u32 assigned_dev_id;
+       __u32 host_irq;
+       __u32 guest_irq;
+       __u32 flags;
+       union {
+               struct {
+                       __u32 addr_lo;
+                       __u32 addr_hi;
+                       __u32 data;
+               } guest_msi;
+               __u32 reserved[12];
+       };
+};
+
+The following flags are defined:
+
+#define KVM_DEV_IRQ_HOST_INTX    (1 << 0)
+#define KVM_DEV_IRQ_HOST_MSI     (1 << 1)
+#define KVM_DEV_IRQ_HOST_MSIX    (1 << 2)
+
+#define KVM_DEV_IRQ_GUEST_INTX   (1 << 8)
+#define KVM_DEV_IRQ_GUEST_MSI    (1 << 9)
+#define KVM_DEV_IRQ_GUEST_MSIX   (1 << 10)
+
+It is not valid to specify multiple types per host or guest IRQ. However,
+the host and guest IRQ types can differ, and either can even be left null.
+
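+As a sketch (again assuming vm_fd and the device registered above with
+assigned_dev_id 1; the IRQ numbers are illustrative), routing the host
+INTx line to a guest INTx pin looks like:
+
+struct kvm_assigned_irq irq = {
+       .assigned_dev_id = 1,
+       .host_irq = 16,                 /* host IRQ of the device */
+       .guest_irq = 10,                /* guest GSI to trigger */
+       .flags = KVM_DEV_IRQ_HOST_INTX | KVM_DEV_IRQ_GUEST_INTX,
+};
+
+if (ioctl(vm_fd, KVM_ASSIGN_DEV_IRQ, &irq) < 0)
+       perror("KVM_ASSIGN_DEV_IRQ");
+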
+4.50 KVM_DEASSIGN_DEV_IRQ
+
+Capability: KVM_CAP_ASSIGN_DEV_IRQ
+Architectures: x86 ia64
+Type: vm ioctl
+Parameters: struct kvm_assigned_irq (in)
+Returns: 0 on success, -1 on error
+
+Ends an IRQ assignment to a passed-through device.
+
+See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is
+specified by assigned_dev_id; flags must correspond to the IRQ type
+specified on KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest
+IRQ is allowed.
+
+4.51 KVM_SET_GSI_ROUTING
+
+Capability: KVM_CAP_IRQ_ROUTING
+Architectures: x86 ia64
+Type: vm ioctl
+Parameters: struct kvm_irq_routing (in)
+Returns: 0 on success, -1 on error
+
+Sets the GSI routing table entries, overwriting any previously set entries.
+
+struct kvm_irq_routing {
+       __u32 nr;
+       __u32 flags;
+       struct kvm_irq_routing_entry entries[0];
+};
+
+No flags are specified so far; the corresponding field must be set to zero.
+
+struct kvm_irq_routing_entry {
+       __u32 gsi;
+       __u32 type;
+       __u32 flags;
+       __u32 pad;
+       union {
+               struct kvm_irq_routing_irqchip irqchip;
+               struct kvm_irq_routing_msi msi;
+               __u32 pad[8];
+       } u;
+};
+
+/* gsi routing entry types */
+#define KVM_IRQ_ROUTING_IRQCHIP 1
+#define KVM_IRQ_ROUTING_MSI 2
+
+No flags are specified so far; the corresponding field must be set to zero.
+
+struct kvm_irq_routing_irqchip {
+       __u32 irqchip;
+       __u32 pin;
+};
+
+struct kvm_irq_routing_msi {
+       __u32 address_lo;
+       __u32 address_hi;
+       __u32 data;
+       __u32 pad;
+};
+
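+Since entries[] is a variable-length tail, user space typically wraps the
+header in a larger object. A sketch (illustrative values, vm_fd assumed)
+that installs a single MSI route for GSI 24 follows; note that because the
+ioctl overwrites the whole table, real callers pass every route at once:
+
+struct {
+       struct kvm_irq_routing table;
+       struct kvm_irq_routing_entry entries[1];
+} route = {
+       .table = { .nr = 1 },
+       .entries = {{
+               .gsi = 24,
+               .type = KVM_IRQ_ROUTING_MSI,
+               .u.msi = {
+                       .address_lo = 0xfee00000,
+                       .data = 0x4041,
+               },
+       }},
+};
+
+if (ioctl(vm_fd, KVM_SET_GSI_ROUTING, &route) < 0)
+       perror("KVM_SET_GSI_ROUTING");
+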
+4.52 KVM_ASSIGN_SET_MSIX_NR
+
+Capability: KVM_CAP_DEVICE_MSIX
+Architectures: x86 ia64
+Type: vm ioctl
+Parameters: struct kvm_assigned_msix_nr (in)
+Returns: 0 on success, -1 on error
+
+Set the number of MSI-X interrupts for an assigned device. This service can
+only be called once in the lifetime of an assigned device.
+
+struct kvm_assigned_msix_nr {
+       __u32 assigned_dev_id;
+       __u16 entry_nr;
+       __u16 padding;
+};
+
+#define KVM_MAX_MSIX_PER_DEV           256
+
+4.53 KVM_ASSIGN_SET_MSIX_ENTRY
+
+Capability: KVM_CAP_DEVICE_MSIX
+Architectures: x86 ia64
+Type: vm ioctl
+Parameters: struct kvm_assigned_msix_entry (in)
+Returns: 0 on success, -1 on error
+
+Specifies the routing of an MSI-X assigned device interrupt to a GSI. Setting
+the GSI vector to zero means disabling the interrupt.
+
+struct kvm_assigned_msix_entry {
+       __u32 assigned_dev_id;
+       __u32 gsi;
+       __u16 entry; /* The index of entry in the MSI-X table */
+       __u16 padding[3];
+};
+
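+Putting the two MSI-X ioctls together (a sketch with illustrative values,
+vm_fd assumed): size the table once, then wire each vector to a GSI:
+
+struct kvm_assigned_msix_nr nr = {
+       .assigned_dev_id = 1,
+       .entry_nr = 3,                  /* device uses 3 MSI-X vectors */
+};
+struct kvm_assigned_msix_entry entry = {
+       .assigned_dev_id = 1,
+       .gsi = 24,                      /* 0 would disable this vector */
+       .entry = 0,                     /* first MSI-X table entry */
+};
+
+ioctl(vm_fd, KVM_ASSIGN_SET_MSIX_NR, &nr);
+ioctl(vm_fd, KVM_ASSIGN_SET_MSIX_ENTRY, &entry);
+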
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
index 14a12ea..8820685 100644 (file)
@@ -36,6 +36,9 @@ KVM_FEATURE_MMU_OP                 ||     2 || deprecated.
 KVM_FEATURE_CLOCKSOURCE2           ||     3 || kvmclock available at msrs
                                    ||       || 0x4b564d00 and 0x4b564d01
 ------------------------------------------------------------------------------
+KVM_FEATURE_ASYNC_PF               ||     4 || async pf can be enabled by
+                                   ||       || writing to msr 0x4b564d02
+------------------------------------------------------------------------------
 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||    24 || host will warn if no guest-side
                                    ||       || per-cpu warps are expected in
                                    ||       || kvmclock.
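+
+Guests normally probe these bits through the helpers in <asm/kvm_para.h>
+rather than by raw cpuid; a minimal, illustrative sketch, where
+setup_async_pf() is a hypothetical placeholder:
+
+       if (kvm_para_available() &&
+           kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
+               setup_async_pf();       /* may write MSR 0x4b564d02 */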
index 8ddcfe8..d079aed 100644 (file)
@@ -3,7 +3,6 @@ Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010
 =====================================================
 
 KVM makes use of some custom MSRs to service some requests.
-At present, this facility is only used by kvmclock.
 
 Custom MSRs have a range reserved for them, that goes from
 0x4b564d00 to 0x4b564dff. There are MSRs outside this area,
@@ -151,3 +150,38 @@ MSR_KVM_SYSTEM_TIME: 0x12
                        return PRESENT;
                } else
                        return NON_PRESENT;
+
+MSR_KVM_ASYNC_PF_EN: 0x4b564d02
+       data: Bits 63-6 hold the 64-byte aligned physical address of a
+       64 byte memory area which must be in guest RAM and must be
+       zeroed. Bits 5-2 are reserved and should be zero. Bit 0 is 1
+       when asynchronous page faults are enabled on the vcpu, 0 when
+       disabled. Bit 1 is 1 if asynchronous page faults can be injected
+       when the vcpu is in cpl == 0.
+
+       The first 4 bytes of the 64 byte memory location will be
+       written to by the hypervisor at the time of asynchronous page
+       fault (APF) injection to indicate the type of asynchronous page
+       fault. A value of 1 means that the page referred to by the page
+       fault is not present. A value of 2 means that the page is now
+       available. Disabling interrupts inhibits APFs. The guest must
+       not enable interrupts before the reason is read, or it may be
+       overwritten by another APF. Since APF uses the same exception
+       vector as a regular page fault, the guest must reset the reason
+       to 0 before it does anything that can generate a normal page
+       fault. If the APF reason is 0 during a page fault, it is a
+       regular page fault.
+
+       During delivery of a type 1 APF, cr2 contains a token that will
+       be used to notify the guest when the missing page becomes
+       available. When the page becomes available, a type 2 APF is sent
+       with cr2 set to the token associated with the page. There is a
+       special token, 0xffffffff, which tells the vcpu to wake up all
+       processes waiting for APFs; no individual type 2 APFs will be
+       sent in that case.
+
+       If APF is disabled while there are outstanding APFs, they will
+       not be delivered.
+
+       Currently a type 2 APF will always be delivered on the same vcpu
+       as the type 1 was, but the guest should not rely on that.
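+
+       As a guest-side sketch (this mirrors what arch/x86/kernel/kvm.c
+       does elsewhere in this merge; variable names are illustrative),
+       enabling async PF on a cpu amounts to:
+
+               static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data,
+                                     apf_reason) __aligned(64);
+
+               u64 pa = __pa(&__get_cpu_var(apf_reason));
+               /* bit 1: also allow injection while in cpl == 0 */
+               pa |= KVM_ASYNC_PF_SEND_ALWAYS;
+               wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);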
index 2f229e5..2689ee5 100644 (file)
@@ -590,6 +590,10 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
 void kvm_sal_emul(struct kvm_vcpu *vcpu);
 
+#define __KVM_HAVE_ARCH_VM_ALLOC 1
+struct kvm *kvm_arch_alloc_vm(void);
+void kvm_arch_free_vm(struct kvm *kvm);
+
 #endif /* __ASSEMBLY__*/
 
 #endif
index f56a631..70d224d 100644 (file)
@@ -749,7 +749,7 @@ out:
        return r;
 }
 
-static struct kvm *kvm_alloc_kvm(void)
+struct kvm *kvm_arch_alloc_vm(void)
 {
 
        struct kvm *kvm;
@@ -760,7 +760,7 @@ static struct kvm *kvm_alloc_kvm(void)
        vm_base = __get_free_pages(GFP_KERNEL, get_order(KVM_VM_DATA_SIZE));
 
        if (!vm_base)
-               return ERR_PTR(-ENOMEM);
+               return NULL;
 
        memset((void *)vm_base, 0, KVM_VM_DATA_SIZE);
        kvm = (struct kvm *)(vm_base +
@@ -806,10 +806,12 @@ static void kvm_build_io_pmt(struct kvm *kvm)
 #define GUEST_PHYSICAL_RR4     0x2739
 #define VMM_INIT_RR            0x1660
 
-static void kvm_init_vm(struct kvm *kvm)
+int kvm_arch_init_vm(struct kvm *kvm)
 {
        BUG_ON(!kvm);
 
+       kvm->arch.is_sn2 = ia64_platform_is("sn2");
+
        kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0;
        kvm->arch.metaphysical_rr4 = GUEST_PHYSICAL_RR4;
        kvm->arch.vmm_init_rr = VMM_INIT_RR;
@@ -823,21 +825,8 @@ static void kvm_init_vm(struct kvm *kvm)
 
        /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
        set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
-}
-
-struct  kvm *kvm_arch_create_vm(void)
-{
-       struct kvm *kvm = kvm_alloc_kvm();
-
-       if (IS_ERR(kvm))
-               return ERR_PTR(-ENOMEM);
-
-       kvm->arch.is_sn2 = ia64_platform_is("sn2");
-
-       kvm_init_vm(kvm);
-
-       return kvm;
 
+       return 0;
 }
 
 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm,
@@ -962,7 +951,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
                        goto out;
                r = kvm_setup_default_irq_routing(kvm);
                if (r) {
+                       mutex_lock(&kvm->slots_lock);
                        kvm_ioapic_destroy(kvm);
+                       mutex_unlock(&kvm->slots_lock);
                        goto out;
                }
                break;
@@ -1357,7 +1348,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
        return -EINVAL;
 }
 
-static void free_kvm(struct kvm *kvm)
+void kvm_arch_free_vm(struct kvm *kvm)
 {
        unsigned long vm_base = kvm->arch.vm_base;
 
@@ -1399,9 +1390,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 #endif
        kfree(kvm->arch.vioapic);
        kvm_release_vm_pages(kvm);
-       kvm_free_physmem(kvm);
-       cleanup_srcu_struct(&kvm->srcu);
-       free_kvm(kvm);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
index e316847..badc983 100644 (file)
@@ -1307,12 +1307,10 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
        int err = -ENOMEM;
        unsigned long p;
 
-       vcpu_book3s = vmalloc(sizeof(struct kvmppc_vcpu_book3s));
+       vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s));
        if (!vcpu_book3s)
                goto out;
 
-       memset(vcpu_book3s, 0, sizeof(struct kvmppc_vcpu_book3s));
-
        vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *)
                kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL);
        if (!vcpu_book3s->shadow_vcpu)
index 38f756f..9975846 100644 (file)
@@ -145,18 +145,12 @@ void kvm_arch_check_processor_compat(void *rtn)
        *(int *)rtn = kvmppc_core_check_processor_compat();
 }
 
-struct kvm *kvm_arch_create_vm(void)
+int kvm_arch_init_vm(struct kvm *kvm)
 {
-       struct kvm *kvm;
-
-       kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
-       if (!kvm)
-               return ERR_PTR(-ENOMEM);
-
-       return kvm;
+       return 0;
 }
 
-static void kvmppc_free_vcpus(struct kvm *kvm)
+void kvm_arch_destroy_vm(struct kvm *kvm)
 {
        unsigned int i;
        struct kvm_vcpu *vcpu;
@@ -176,14 +170,6 @@ void kvm_arch_sync_events(struct kvm *kvm)
 {
 }
 
-void kvm_arch_destroy_vm(struct kvm *kvm)
-{
-       kvmppc_free_vcpus(kvm);
-       kvm_free_physmem(kvm);
-       cleanup_srcu_struct(&kvm->srcu);
-       kfree(kvm);
-}
-
 int kvm_dev_ioctl_check_extension(long ext)
 {
        int r;
index 985d825..bade533 100644 (file)
@@ -164,24 +164,18 @@ long kvm_arch_vm_ioctl(struct file *filp,
        return r;
 }
 
-struct kvm *kvm_arch_create_vm(void)
+int kvm_arch_init_vm(struct kvm *kvm)
 {
-       struct kvm *kvm;
        int rc;
        char debug_name[16];
 
        rc = s390_enable_sie();
        if (rc)
-               goto out_nokvm;
-
-       rc = -ENOMEM;
-       kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
-       if (!kvm)
-               goto out_nokvm;
+               goto out_err;
 
        kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL);
        if (!kvm->arch.sca)
-               goto out_nosca;
+               goto out_err;
 
        sprintf(debug_name, "kvm-%u", current->pid);
 
@@ -195,13 +189,11 @@ struct kvm *kvm_arch_create_vm(void)
        debug_register_view(kvm->arch.dbf, &debug_sprintf_view);
        VM_EVENT(kvm, 3, "%s", "vm created");
 
-       return kvm;
+       return 0;
 out_nodbf:
        free_page((unsigned long)(kvm->arch.sca));
-out_nosca:
-       kfree(kvm);
-out_nokvm:
-       return ERR_PTR(rc);
+out_err:
+       return rc;
 }
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -240,11 +232,8 @@ void kvm_arch_sync_events(struct kvm *kvm)
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
        kvm_free_vcpus(kvm);
-       kvm_free_physmem(kvm);
        free_page((unsigned long)(kvm->arch.sca));
        debug_unregister(kvm->arch.dbf);
-       cleanup_srcu_struct(&kvm->srcu);
-       kfree(kvm);
 }
 
 /* Section: vcpu related */
index b36c6b3..8e37deb 100644 (file)
 
 struct x86_emulate_ctxt;
 
+struct x86_exception {
+       u8 vector;
+       bool error_code_valid;
+       u16 error_code;
+       bool nested_page_fault;
+       u64 address; /* cr2 or nested page fault gpa */
+};
+
 /*
  * x86_emulate_ops:
  *
@@ -64,7 +72,8 @@ struct x86_emulate_ops {
         *  @bytes: [IN ] Number of bytes to read from memory.
         */
        int (*read_std)(unsigned long addr, void *val,
-                       unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+                       unsigned int bytes, struct kvm_vcpu *vcpu,
+                       struct x86_exception *fault);
 
        /*
         * write_std: Write bytes of standard (non-emulated/special) memory.
@@ -74,7 +83,8 @@ struct x86_emulate_ops {
         *  @bytes: [IN ] Number of bytes to write to memory.
         */
        int (*write_std)(unsigned long addr, void *val,
-                        unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+                        unsigned int bytes, struct kvm_vcpu *vcpu,
+                        struct x86_exception *fault);
        /*
         * fetch: Read bytes of standard (non-emulated/special) memory.
         *        Used for instruction fetch.
@@ -83,7 +93,8 @@ struct x86_emulate_ops {
         *  @bytes: [IN ] Number of bytes to read from memory.
         */
        int (*fetch)(unsigned long addr, void *val,
-                       unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+                    unsigned int bytes, struct kvm_vcpu *vcpu,
+                    struct x86_exception *fault);
 
        /*
         * read_emulated: Read bytes from emulated/special memory area.
@@ -94,7 +105,7 @@ struct x86_emulate_ops {
        int (*read_emulated)(unsigned long addr,
                             void *val,
                             unsigned int bytes,
-                            unsigned int *error,
+                            struct x86_exception *fault,
                             struct kvm_vcpu *vcpu);
 
        /*
@@ -107,7 +118,7 @@ struct x86_emulate_ops {
        int (*write_emulated)(unsigned long addr,
                              const void *val,
                              unsigned int bytes,
-                             unsigned int *error,
+                             struct x86_exception *fault,
                              struct kvm_vcpu *vcpu);
 
        /*
@@ -122,7 +133,7 @@ struct x86_emulate_ops {
                                const void *old,
                                const void *new,
                                unsigned int bytes,
-                               unsigned int *error,
+                               struct x86_exception *fault,
                                struct kvm_vcpu *vcpu);
 
        int (*pio_in_emulated)(int size, unsigned short port, void *val,
@@ -159,7 +170,10 @@ struct operand {
        };
        union {
                unsigned long *reg;
-               unsigned long mem;
+               struct segmented_address {
+                       ulong ea;
+                       unsigned seg;
+               } mem;
        } addr;
        union {
                unsigned long val;
@@ -226,9 +240,8 @@ struct x86_emulate_ctxt {
 
        bool perm_ok; /* do not check permissions if true */
 
-       int exception; /* exception that happens during emulation or -1 */
-       u32 error_code; /* error code for exception */
-       bool error_code_valid;
+       bool have_exception;
+       struct x86_exception exception;
 
        /* decode cache */
        struct decode_cache decode;
@@ -252,7 +265,7 @@ struct x86_emulate_ctxt {
 #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
 #endif
 
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt);
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
 #define EMULATION_FAILED -1
 #define EMULATION_OK 0
 #define EMULATION_RESTART 1
index f702f82..aa75f21 100644 (file)
 #define KVM_NR_FIXED_MTRR_REGION 88
 #define KVM_NR_VAR_MTRR 8
 
+#define ASYNC_PF_PER_VCPU 64
+
 extern spinlock_t kvm_lock;
 extern struct list_head vm_list;
 
 struct kvm_vcpu;
 struct kvm;
+struct kvm_async_pf;
 
 enum kvm_reg {
        VCPU_REGS_RAX = 0,
@@ -114,6 +117,7 @@ enum kvm_reg {
 
 enum kvm_reg_ex {
        VCPU_EXREG_PDPTR = NR_VCPU_REGS,
+       VCPU_EXREG_CR3,
 };
 
 enum {
@@ -238,16 +242,18 @@ struct kvm_mmu {
        void (*new_cr3)(struct kvm_vcpu *vcpu);
        void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
        unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
-       int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
-       void (*inject_page_fault)(struct kvm_vcpu *vcpu);
+       int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
+                         bool prefault);
+       void (*inject_page_fault)(struct kvm_vcpu *vcpu,
+                                 struct x86_exception *fault);
        void (*free)(struct kvm_vcpu *vcpu);
        gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
-                           u32 *error);
+                           struct x86_exception *exception);
        gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
        void (*prefetch_page)(struct kvm_vcpu *vcpu,
                              struct kvm_mmu_page *page);
        int (*sync_page)(struct kvm_vcpu *vcpu,
-                        struct kvm_mmu_page *sp, bool clear_unsync);
+                        struct kvm_mmu_page *sp);
        void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
        hpa_t root_hpa;
        int root_level;
@@ -315,16 +321,6 @@ struct kvm_vcpu_arch {
         */
        struct kvm_mmu *walk_mmu;
 
-       /*
-        * This struct is filled with the necessary information to propagate a
-        * page fault into the guest
-        */
-       struct {
-               u64      address;
-               unsigned error_code;
-               bool     nested;
-       } fault;
-
        /* only needed in kvm_pv_mmu_op() path, but it's hot so
         * put it here to avoid allocation */
        struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@@ -412,6 +408,15 @@ struct kvm_vcpu_arch {
        u64 hv_vapic;
 
        cpumask_var_t wbinvd_dirty_mask;
+
+       struct {
+               bool halted;
+               gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
+               struct gfn_to_hva_cache data;
+               u64 msr_val;
+               u32 id;
+               bool send_user_only;
+       } apf;
 };
 
 struct kvm_arch {
@@ -456,6 +461,10 @@ struct kvm_arch {
        /* fields used by HYPER-V emulation */
        u64 hv_guest_os_id;
        u64 hv_hypercall;
+
+       #ifdef CONFIG_KVM_MMU_AUDIT
+       int audit_point;
+       #endif
 };
 
 struct kvm_vm_stat {
@@ -529,6 +538,7 @@ struct kvm_x86_ops {
                            struct kvm_segment *var, int seg);
        void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
        void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
+       void (*decache_cr3)(struct kvm_vcpu *vcpu);
        void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
        void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
        void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -582,9 +592,17 @@ struct kvm_x86_ops {
 
        void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
+       void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
        const struct trace_print_flags *exit_reasons_str;
 };
 
+struct kvm_arch_async_pf {
+       u32 token;
+       gfn_t gfn;
+       unsigned long cr3;
+       bool direct_map;
+};
+
 extern struct kvm_x86_ops *kvm_x86_ops;
 
 int kvm_mmu_module_init(void);
@@ -594,7 +612,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
-void kvm_mmu_set_base_ptes(u64 base_pte);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
                u64 dirty_mask, u64 nx_mask, u64 x_mask);
 
@@ -623,8 +640,15 @@ enum emulation_result {
 #define EMULTYPE_NO_DECODE         (1 << 0)
 #define EMULTYPE_TRAP_UD           (1 << 1)
 #define EMULTYPE_SKIP              (1 << 2)
-int emulate_instruction(struct kvm_vcpu *vcpu,
-                       unsigned long cr2, u16 error_code, int emulation_type);
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
+                           int emulation_type, void *insn, int insn_len);
+
+static inline int emulate_instruction(struct kvm_vcpu *vcpu,
+                       int emulation_type)
+{
+       return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
+}
+
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 
@@ -650,7 +674,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
@@ -668,11 +692,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                            gfn_t gfn, void *data, int offset, int len,
                            u32 access);
-void kvm_propagate_fault(struct kvm_vcpu *vcpu);
+void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
 
 int kvm_pic_set_irq(void *opaque, int irq, int level);
@@ -690,16 +714,21 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+                             struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
+                              struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+                              struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+                               struct x86_exception *exception);
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
 int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
 
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+                      void *insn, int insn_len);
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
 
 void kvm_enable_tdp(void);
@@ -766,20 +795,25 @@ enum {
 #define HF_VINTR_MASK          (1 << 2)
 #define HF_NMI_MASK            (1 << 3)
 #define HF_IRET_MASK           (1 << 4)
+#define HF_GUEST_MASK          (1 << 5) /* VCPU is in guest-mode */
 
 /*
  * Hardware virtualization extension instructions may fault if a
  * reboot turns off virtualization while processes are running.
  * Trap the fault and ignore the instruction if that happens.
  */
-asmlinkage void kvm_handle_fault_on_reboot(void);
+asmlinkage void kvm_spurious_fault(void);
+extern bool kvm_rebooting;
 
 #define __kvm_handle_fault_on_reboot(insn) \
        "666: " insn "\n\t" \
+       "668: \n\t"                           \
        ".pushsection .fixup, \"ax\" \n" \
        "667: \n\t" \
+       "cmpb $0, kvm_rebooting \n\t"         \
+       "jne 668b \n\t"                       \
        __ASM_SIZE(push) " $666b \n\t"        \
-       "jmp kvm_handle_fault_on_reboot \n\t" \
+       "call kvm_spurious_fault \n\t"        \
        ".popsection \n\t" \
        ".pushsection __ex_table, \"a\" \n\t" \
        _ASM_PTR " 666b, 667b \n\t" \
@@ -799,4 +833,15 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
 
 bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
 
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+                                    struct kvm_async_pf *work);
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+                                struct kvm_async_pf *work);
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
+                              struct kvm_async_pf *work);
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
+extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+
+void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
+
 #endif /* _ASM_X86_KVM_HOST_H */
index 7b562b6..a427bf7 100644 (file)
@@ -20,6 +20,7 @@
  * are available. The use of 0x11 and 0x12 is deprecated
  */
 #define KVM_FEATURE_CLOCKSOURCE2        3
+#define KVM_FEATURE_ASYNC_PF           4
 
 /* The last 8 bits are used to indicate how to interpret the flags field
  * in pvclock structure. If no bits are set, all flags are ignored.
 /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
 #define MSR_KVM_WALL_CLOCK_NEW  0x4b564d00
 #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
+#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
 
 #define KVM_MAX_MMU_OP_BATCH           32
 
+#define KVM_ASYNC_PF_ENABLED                   (1 << 0)
+#define KVM_ASYNC_PF_SEND_ALWAYS               (1 << 1)
+
 /* Operations for KVM_HC_MMU_OP */
 #define KVM_MMU_OP_WRITE_PTE            1
 #define KVM_MMU_OP_FLUSH_TLB           2
@@ -61,10 +66,20 @@ struct kvm_mmu_op_release_pt {
        __u64 pt_phys;
 };
 
+#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
+#define KVM_PV_REASON_PAGE_READY 2
+
+struct kvm_vcpu_pv_apf_data {
+       __u32 reason;
+       __u8 pad[60];
+       __u32 enabled;
+};
+
 #ifdef __KERNEL__
 #include <asm/processor.h>
 
 extern void kvmclock_init(void);
+extern int kvm_register_clock(char *txt);
 
 
 /* This instruction is vmcall.  On non-VT architectures, it will generate a
@@ -160,8 +175,17 @@ static inline unsigned int kvm_arch_para_features(void)
 
 #ifdef CONFIG_KVM_GUEST
 void __init kvm_guest_init(void);
+void kvm_async_pf_task_wait(u32 token);
+void kvm_async_pf_task_wake(u32 token);
+u32 kvm_read_and_reset_pf_reason(void);
 #else
 #define kvm_guest_init() do { } while (0)
+#define kvm_async_pf_task_wait(T) do {} while(0)
+#define kvm_async_pf_task_wake(T) do {} while(0)
+static inline u32 kvm_read_and_reset_pf_reason(void)
+{
+       return 0;
+}
 #endif
 
 #endif /* __KERNEL__ */
index 0e83105..f2b83bc 100644 (file)
@@ -47,14 +47,13 @@ enum {
        INTERCEPT_MONITOR,
        INTERCEPT_MWAIT,
        INTERCEPT_MWAIT_COND,
+       INTERCEPT_XSETBV,
 };
 
 
 struct __attribute__ ((__packed__)) vmcb_control_area {
-       u16 intercept_cr_read;
-       u16 intercept_cr_write;
-       u16 intercept_dr_read;
-       u16 intercept_dr_write;
+       u32 intercept_cr;
+       u32 intercept_dr;
        u32 intercept_exceptions;
        u64 intercept;
        u8 reserved_1[42];
@@ -81,14 +80,19 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
        u32 event_inj_err;
        u64 nested_cr3;
        u64 lbr_ctl;
-       u64 reserved_5;
+       u32 clean;
+       u32 reserved_5;
        u64 next_rip;
-       u8 reserved_6[816];
+       u8 insn_len;
+       u8 insn_bytes[15];
+       u8 reserved_6[800];
 };
 
 
 #define TLB_CONTROL_DO_NOTHING 0
 #define TLB_CONTROL_FLUSH_ALL_ASID 1
+#define TLB_CONTROL_FLUSH_ASID 3
+#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
 
 #define V_TPR_MASK 0x0f
 
@@ -204,19 +208,31 @@ struct __attribute__ ((__packed__)) vmcb {
 #define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
 #define SVM_SELECTOR_CODE_MASK (1 << 3)
 
-#define INTERCEPT_CR0_MASK 1
-#define INTERCEPT_CR3_MASK (1 << 3)
-#define INTERCEPT_CR4_MASK (1 << 4)
-#define INTERCEPT_CR8_MASK (1 << 8)
-
-#define INTERCEPT_DR0_MASK 1
-#define INTERCEPT_DR1_MASK (1 << 1)
-#define INTERCEPT_DR2_MASK (1 << 2)
-#define INTERCEPT_DR3_MASK (1 << 3)
-#define INTERCEPT_DR4_MASK (1 << 4)
-#define INTERCEPT_DR5_MASK (1 << 5)
-#define INTERCEPT_DR6_MASK (1 << 6)
-#define INTERCEPT_DR7_MASK (1 << 7)
+#define INTERCEPT_CR0_READ     0
+#define INTERCEPT_CR3_READ     3
+#define INTERCEPT_CR4_READ     4
+#define INTERCEPT_CR8_READ     8
+#define INTERCEPT_CR0_WRITE    (16 + 0)
+#define INTERCEPT_CR3_WRITE    (16 + 3)
+#define INTERCEPT_CR4_WRITE    (16 + 4)
+#define INTERCEPT_CR8_WRITE    (16 + 8)
+
+#define INTERCEPT_DR0_READ     0
+#define INTERCEPT_DR1_READ     1
+#define INTERCEPT_DR2_READ     2
+#define INTERCEPT_DR3_READ     3
+#define INTERCEPT_DR4_READ     4
+#define INTERCEPT_DR5_READ     5
+#define INTERCEPT_DR6_READ     6
+#define INTERCEPT_DR7_READ     7
+#define INTERCEPT_DR0_WRITE    (16 + 0)
+#define INTERCEPT_DR1_WRITE    (16 + 1)
+#define INTERCEPT_DR2_WRITE    (16 + 2)
+#define INTERCEPT_DR3_WRITE    (16 + 3)
+#define INTERCEPT_DR4_WRITE    (16 + 4)
+#define INTERCEPT_DR5_WRITE    (16 + 5)
+#define INTERCEPT_DR6_WRITE    (16 + 6)
+#define INTERCEPT_DR7_WRITE    (16 + 7)
 
 #define SVM_EVTINJ_VEC_MASK 0xff
 
@@ -246,6 +262,8 @@ struct __attribute__ ((__packed__)) vmcb {
 #define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
 #define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
 
+#define SVM_EXITINFO_REG_MASK 0x0F
+
 #define        SVM_EXIT_READ_CR0       0x000
 #define        SVM_EXIT_READ_CR3       0x003
 #define        SVM_EXIT_READ_CR4       0x004
@@ -316,6 +334,7 @@ struct __attribute__ ((__packed__)) vmcb {
 #define SVM_EXIT_MONITOR       0x08a
 #define SVM_EXIT_MWAIT         0x08b
 #define SVM_EXIT_MWAIT_COND    0x08c
+#define SVM_EXIT_XSETBV                0x08d
 #define SVM_EXIT_NPF           0x400
 
 #define SVM_EXIT_ERR           -1
index f66cda5..0310da6 100644 (file)
@@ -30,6 +30,7 @@ asmlinkage void segment_not_present(void);
 asmlinkage void stack_segment(void);
 asmlinkage void general_protection(void);
 asmlinkage void page_fault(void);
+asmlinkage void async_page_fault(void);
 asmlinkage void spurious_interrupt_bug(void);
 asmlinkage void coprocessor_error(void);
 asmlinkage void alignment_check(void);
index 9f0cbd9..84471b8 100644 (file)
 #define PIN_BASED_NMI_EXITING                   0x00000008
 #define PIN_BASED_VIRTUAL_NMIS                  0x00000020
 
+#define VM_EXIT_SAVE_DEBUG_CONTROLS             0x00000002
 #define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
+#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL      0x00001000
 #define VM_EXIT_ACK_INTR_ON_EXIT                0x00008000
 #define VM_EXIT_SAVE_IA32_PAT                  0x00040000
 #define VM_EXIT_LOAD_IA32_PAT                  0x00080000
+#define VM_EXIT_SAVE_IA32_EFER                  0x00100000
+#define VM_EXIT_LOAD_IA32_EFER                  0x00200000
+#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER       0x00400000
 
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS            0x00000002
 #define VM_ENTRY_IA32E_MODE                     0x00000200
 #define VM_ENTRY_SMM                            0x00000400
 #define VM_ENTRY_DEACT_DUAL_MONITOR             0x00000800
+#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL     0x00002000
 #define VM_ENTRY_LOAD_IA32_PAT                 0x00004000
+#define VM_ENTRY_LOAD_IA32_EFER                 0x00008000
 
 /* VMCS Encodings */
 enum vmcs_field {
@@ -239,6 +247,7 @@ enum vmcs_field {
 #define EXIT_REASON_TASK_SWITCH         9
 #define EXIT_REASON_CPUID               10
 #define EXIT_REASON_HLT                 12
+#define EXIT_REASON_INVD                13
 #define EXIT_REASON_INVLPG              14
 #define EXIT_REASON_RDPMC               15
 #define EXIT_REASON_RDTSC               16
@@ -296,6 +305,12 @@ enum vmcs_field {
 #define GUEST_INTR_STATE_SMI           0x00000004
 #define GUEST_INTR_STATE_NMI           0x00000008
 
+/* GUEST_ACTIVITY_STATE flags */
+#define GUEST_ACTIVITY_ACTIVE          0
+#define GUEST_ACTIVITY_HLT             1
+#define GUEST_ACTIVITY_SHUTDOWN                2
+#define GUEST_ACTIVITY_WAIT_SIPI       3
+
 /*
  * Exit Qualifications for MOV for Control Register Access
  */
index 591e601..c8b4efa 100644 (file)
@@ -1406,6 +1406,16 @@ ENTRY(general_protection)
        CFI_ENDPROC
 END(general_protection)
 
+#ifdef CONFIG_KVM_GUEST
+ENTRY(async_page_fault)
+       RING0_EC_FRAME
+       pushl $do_async_page_fault
+       CFI_ADJUST_CFA_OFFSET 4
+       jmp error_code
+       CFI_ENDPROC
+END(async_page_fault)
+#endif
+
 /*
  * End of kprobes section
  */
index d3b895f..aed1ffb 100644 (file)
@@ -1329,6 +1329,9 @@ errorentry xen_stack_segment do_stack_segment
 #endif
 errorentry general_protection do_general_protection
 errorentry page_fault do_page_fault
+#ifdef CONFIG_KVM_GUEST
+errorentry async_page_fault do_async_page_fault
+#endif
 #ifdef CONFIG_X86_MCE
 paranoidzeroentry machine_check *machine_check_vector(%rip)
 #endif
index 58bb239..e60c38c 100644 (file)
@@ -169,6 +169,7 @@ int init_fpu(struct task_struct *tsk)
        set_stopped_child_used_math(tsk);
        return 0;
 }
+EXPORT_SYMBOL_GPL(init_fpu);
 
 /*
  * The xstateregs_active() routine is the same as the fpregs_active() routine,
index 63b0ec8..8dc4466 100644 (file)
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/hardirq.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kprobes.h>
 #include <asm/timer.h>
+#include <asm/cpu.h>
+#include <asm/traps.h>
+#include <asm/desc.h>
+#include <asm/tlbflush.h>
 
 #define MMU_QUEUE_SIZE 1024
 
+static int kvmapf = 1;
+
+static int parse_no_kvmapf(char *arg)
+{
+        kvmapf = 0;
+        return 0;
+}
+
+early_param("no-kvmapf", parse_no_kvmapf);
+
 struct kvm_para_state {
        u8 mmu_queue[MMU_QUEUE_SIZE];
        int mmu_queue_len;
 };
 
 static DEFINE_PER_CPU(struct kvm_para_state, para_state);
+static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
 
 static struct kvm_para_state *kvm_para_state(void)
 {
@@ -50,6 +71,195 @@ static void kvm_io_delay(void)
 {
 }
 
+#define KVM_TASK_SLEEP_HASHBITS 8
+#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
+
+struct kvm_task_sleep_node {
+       struct hlist_node link;
+       wait_queue_head_t wq;
+       u32 token;
+       int cpu;
+       bool halted;
+       struct mm_struct *mm;
+};
+
+static struct kvm_task_sleep_head {
+       spinlock_t lock;
+       struct hlist_head list;
+} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
+
+static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
+                                                 u32 token)
+{
+       struct hlist_node *p;
+
+       hlist_for_each(p, &b->list) {
+               struct kvm_task_sleep_node *n =
+                       hlist_entry(p, typeof(*n), link);
+               if (n->token == token)
+                       return n;
+       }
+
+       return NULL;
+}
+
+void kvm_async_pf_task_wait(u32 token)
+{
+       u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+       struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+       struct kvm_task_sleep_node n, *e;
+       DEFINE_WAIT(wait);
+       int cpu, idle;
+
+       cpu = get_cpu();
+       idle = idle_cpu(cpu);
+       put_cpu();
+
+       spin_lock(&b->lock);
+       e = _find_apf_task(b, token);
+       if (e) {
+               /* dummy entry exists -> wake up was delivered ahead of PF */
+               hlist_del(&e->link);
+               kfree(e);
+               spin_unlock(&b->lock);
+               return;
+       }
+
+       n.token = token;
+       n.cpu = smp_processor_id();
+       n.mm = current->active_mm;
+       n.halted = idle || preempt_count() > 1;
+       atomic_inc(&n.mm->mm_count);
+       init_waitqueue_head(&n.wq);
+       hlist_add_head(&n.link, &b->list);
+       spin_unlock(&b->lock);
+
+       for (;;) {
+               if (!n.halted)
+                       prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+               if (hlist_unhashed(&n.link))
+                       break;
+
+               if (!n.halted) {
+                       local_irq_enable();
+                       schedule();
+                       local_irq_disable();
+               } else {
+                       /*
+                        * We cannot reschedule. So halt.
+                        */
+                       native_safe_halt();
+                       local_irq_disable();
+               }
+       }
+       if (!n.halted)
+               finish_wait(&n.wq, &wait);
+
+       return;
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
+
+static void apf_task_wake_one(struct kvm_task_sleep_node *n)
+{
+       hlist_del_init(&n->link);
+       if (!n->mm)
+               return;
+       mmdrop(n->mm);
+       if (n->halted)
+               smp_send_reschedule(n->cpu);
+       else if (waitqueue_active(&n->wq))
+               wake_up(&n->wq);
+}
+
+static void apf_task_wake_all(void)
+{
+       int i;
+
+       for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
+               struct hlist_node *p, *next;
+               struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
+               spin_lock(&b->lock);
+               hlist_for_each_safe(p, next, &b->list) {
+                       struct kvm_task_sleep_node *n =
+                               hlist_entry(p, typeof(*n), link);
+                       if (n->cpu == smp_processor_id())
+                               apf_task_wake_one(n);
+               }
+               spin_unlock(&b->lock);
+       }
+}
+
+void kvm_async_pf_task_wake(u32 token)
+{
+       u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+       struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+       struct kvm_task_sleep_node *n;
+
+       if (token == ~0) {
+               apf_task_wake_all();
+               return;
+       }
+
+again:
+       spin_lock(&b->lock);
+       n = _find_apf_task(b, token);
+       if (!n) {
+               /*
+                * async PF was not yet handled.
+                * Add dummy entry for the token.
+                */
+               n = kmalloc(sizeof(*n), GFP_ATOMIC);
+               if (!n) {
+                       /*
+                        * Allocation failed! Busy wait while other cpu
+                        * handles async PF.
+                        */
+                       spin_unlock(&b->lock);
+                       cpu_relax();
+                       goto again;
+               }
+               n->token = token;
+               n->cpu = smp_processor_id();
+               n->mm = NULL;
+               init_waitqueue_head(&n->wq);
+               hlist_add_head(&n->link, &b->list);
+       } else
+               apf_task_wake_one(n);
+       spin_unlock(&b->lock);
+       return;
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
+
+u32 kvm_read_and_reset_pf_reason(void)
+{
+       u32 reason = 0;
+
+       if (__get_cpu_var(apf_reason).enabled) {
+               reason = __get_cpu_var(apf_reason).reason;
+               __get_cpu_var(apf_reason).reason = 0;
+       }
+
+       return reason;
+}
+EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
+
+dotraplinkage void __kprobes
+do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+       switch (kvm_read_and_reset_pf_reason()) {
+       default:
+               do_page_fault(regs, error_code);
+               break;
+       case KVM_PV_REASON_PAGE_NOT_PRESENT:
+               /* page is swapped out by the host. */
+               kvm_async_pf_task_wait((u32)read_cr2());
+               break;
+       case KVM_PV_REASON_PAGE_READY:
+               kvm_async_pf_task_wake((u32)read_cr2());
+               break;
+       }
+}
+
 static void kvm_mmu_op(void *buffer, unsigned len)
 {
        int r;
@@ -231,10 +441,117 @@ static void __init paravirt_ops_setup(void)
 #endif
 }
 
+void __cpuinit kvm_guest_cpu_init(void)
+{
+       if (!kvm_para_available())
+               return;
+
+       if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
+               u64 pa = __pa(&__get_cpu_var(apf_reason));
+
+#ifdef CONFIG_PREEMPT
+               pa |= KVM_ASYNC_PF_SEND_ALWAYS;
+#endif
+               wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
+               __get_cpu_var(apf_reason).enabled = 1;
+               printk(KERN_INFO"KVM setup async PF for cpu %d\n",
+                      smp_processor_id());
+       }
+}
+
+static void kvm_pv_disable_apf(void *unused)
+{
+       if (!__get_cpu_var(apf_reason).enabled)
+               return;
+
+       wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
+       __get_cpu_var(apf_reason).enabled = 0;
+
+       printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
+              smp_processor_id());
+}
+
+static int kvm_pv_reboot_notify(struct notifier_block *nb,
+                               unsigned long code, void *unused)
+{
+       if (code == SYS_RESTART)
+               on_each_cpu(kvm_pv_disable_apf, NULL, 1);
+       return NOTIFY_DONE;
+}
+
+static struct notifier_block kvm_pv_reboot_nb = {
+       .notifier_call = kvm_pv_reboot_notify,
+};
+
+#ifdef CONFIG_SMP
+static void __init kvm_smp_prepare_boot_cpu(void)
+{
+#ifdef CONFIG_KVM_CLOCK
+       WARN_ON(kvm_register_clock("primary cpu clock"));
+#endif
+       kvm_guest_cpu_init();
+       native_smp_prepare_boot_cpu();
+}
+
+static void kvm_guest_cpu_online(void *dummy)
+{
+       kvm_guest_cpu_init();
+}
+
+static void kvm_guest_cpu_offline(void *dummy)
+{
+       kvm_pv_disable_apf(NULL);
+       apf_task_wake_all();
+}
+
+static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
+                                   unsigned long action, void *hcpu)
+{
+       int cpu = (unsigned long)hcpu;
+       switch (action) {
+       case CPU_ONLINE:
+       case CPU_DOWN_FAILED:
+       case CPU_ONLINE_FROZEN:
+               smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
+               break;
+       case CPU_DOWN_PREPARE:
+       case CPU_DOWN_PREPARE_FROZEN:
+               smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
+               break;
+       default:
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
+        .notifier_call  = kvm_cpu_notify,
+};
+#endif
+
+static void __init kvm_apf_trap_init(void)
+{
+       set_intr_gate(14, &async_page_fault);
+}
+
 void __init kvm_guest_init(void)
 {
+       int i;
+
        if (!kvm_para_available())
                return;
 
        paravirt_ops_setup();
+       register_reboot_notifier(&kvm_pv_reboot_nb);
+       for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
+               spin_lock_init(&async_pf_sleepers[i].lock);
+       if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
+               x86_init.irqs.trap_init = kvm_apf_trap_init;
+
+#ifdef CONFIG_SMP
+       smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+       register_cpu_notifier(&kvm_cpu_notifier);
+#else
+       kvm_guest_cpu_init();
+#endif
 }
index ca43ce3..f98d3ea 100644 (file)
@@ -125,7 +125,7 @@ static struct clocksource kvm_clock = {
        .flags = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
-static int kvm_register_clock(char *txt)
+int kvm_register_clock(char *txt)
 {
        int cpu = smp_processor_id();
        int low, high, ret;
@@ -152,14 +152,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
 }
 #endif
 
-#ifdef CONFIG_SMP
-static void __init kvm_smp_prepare_boot_cpu(void)
-{
-       WARN_ON(kvm_register_clock("primary cpu clock"));
-       native_smp_prepare_boot_cpu();
-}
-#endif
-
 /*
  * After the clock is registered, the host will keep writing to the
  * registered memory location. If the guest happens to shutdown, this memory
@@ -205,9 +197,6 @@ void __init kvmclock_init(void)
 #ifdef CONFIG_X86_LOCAL_APIC
        x86_cpuinit.setup_percpu_clockev =
                kvm_setup_secondary_clock;
-#endif
-#ifdef CONFIG_SMP
-       smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 #endif
        machine_ops.shutdown  = kvm_shutdown;
 #ifdef CONFIG_KEXEC
index ddc131f..50f6364 100644 (file)
@@ -28,6 +28,7 @@ config KVM
        select HAVE_KVM_IRQCHIP
        select HAVE_KVM_EVENTFD
        select KVM_APIC_ARCHITECTURE
+       select KVM_ASYNC_PF
        select USER_RETURN_NOTIFIER
        select KVM_MMIO
        ---help---
index 31a7035..f15501f 100644 (file)
@@ -1,5 +1,5 @@
 
-EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
+ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
 
 CFLAGS_x86.o := -I.
 CFLAGS_svm.o := -I.
@@ -9,6 +9,7 @@ kvm-y                   += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
                                coalesced_mmio.o irq_comm.o eventfd.o \
                                assigned-dev.o)
 kvm-$(CONFIG_IOMMU_API)        += $(addprefix ../../../virt/kvm/, iommu.o)
+kvm-$(CONFIG_KVM_ASYNC_PF)     += $(addprefix ../../../virt/kvm/, async_pf.o)
 
 kvm-y                  += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
                           i8254.o timer.o
index 38b6e8d..caf9667 100644 (file)
  * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
  */
 
-#ifndef __KERNEL__
-#include <stdio.h>
-#include <stdint.h>
-#include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf(_f , ## _a)
-#else
 #include <linux/kvm_host.h>
 #include "kvm_cache_regs.h"
-#define DPRINTF(x...) do {} while (0)
-#endif
 #include <linux/module.h>
 #include <asm/kvm_emulate.h>
 
@@ -418,9 +410,9 @@ address_mask(struct decode_cache *c, unsigned long reg)
 }
 
 static inline unsigned long
-register_address(struct decode_cache *c, unsigned long base, unsigned long reg)
+register_address(struct decode_cache *c, unsigned long reg)
 {
-       return base + address_mask(c, reg);
+       return address_mask(c, reg);
 }
 
 static inline void
@@ -452,60 +444,55 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
        return ops->get_cached_segment_base(seg, ctxt->vcpu);
 }
 
-static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
-                                      struct x86_emulate_ops *ops,
-                                      struct decode_cache *c)
+static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
+                            struct x86_emulate_ops *ops,
+                            struct decode_cache *c)
 {
        if (!c->has_seg_override)
                return 0;
 
-       return seg_base(ctxt, ops, c->seg_override);
+       return c->seg_override;
 }
 
-static unsigned long es_base(struct x86_emulate_ctxt *ctxt,
-                            struct x86_emulate_ops *ops)
+static ulong linear(struct x86_emulate_ctxt *ctxt,
+                   struct segmented_address addr)
 {
-       return seg_base(ctxt, ops, VCPU_SREG_ES);
-}
-
-static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
-                            struct x86_emulate_ops *ops)
-{
-       return seg_base(ctxt, ops, VCPU_SREG_SS);
-}
+       struct decode_cache *c = &ctxt->decode;
+       ulong la;
 
-static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
-                                     u32 error, bool valid)
-{
-       ctxt->exception = vec;
-       ctxt->error_code = error;
-       ctxt->error_code_valid = valid;
+       la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
+       if (c->ad_bytes != 8)
+               la &= (u32)-1;
+       return la;
 }
 
-static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
+static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
+                            u32 error, bool valid)
 {
-       emulate_exception(ctxt, GP_VECTOR, err, true);
+       ctxt->exception.vector = vec;
+       ctxt->exception.error_code = error;
+       ctxt->exception.error_code_valid = valid;
+       return X86EMUL_PROPAGATE_FAULT;
 }
 
-static void emulate_pf(struct x86_emulate_ctxt *ctxt)
+static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
 {
-       emulate_exception(ctxt, PF_VECTOR, 0, true);
+       return emulate_exception(ctxt, GP_VECTOR, err, true);
 }
 
-static void emulate_ud(struct x86_emulate_ctxt *ctxt)
+static int emulate_ud(struct x86_emulate_ctxt *ctxt)
 {
-       emulate_exception(ctxt, UD_VECTOR, 0, false);
+       return emulate_exception(ctxt, UD_VECTOR, 0, false);
 }
 
-static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
+static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
 {
-       emulate_exception(ctxt, TS_VECTOR, err, true);
+       return emulate_exception(ctxt, TS_VECTOR, err, true);
 }
 
 static int emulate_de(struct x86_emulate_ctxt *ctxt)
 {
-       emulate_exception(ctxt, DE_VECTOR, 0, false);
-       return X86EMUL_PROPAGATE_FAULT;
+       return emulate_exception(ctxt, DE_VECTOR, 0, false);
 }
 
 static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
@@ -520,7 +507,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
                cur_size = fc->end - fc->start;
                size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
                rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
-                               size, ctxt->vcpu, NULL);
+                               size, ctxt->vcpu, &ctxt->exception);
                if (rc != X86EMUL_CONTINUE)
                        return rc;
                fc->end += size;
@@ -564,7 +551,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
 
 static int read_descriptor(struct x86_emulate_ctxt *ctxt,
                           struct x86_emulate_ops *ops,
-                          ulong addr,
+                          struct segmented_address addr,
                           u16 *size, unsigned long *address, int op_bytes)
 {
        int rc;
@@ -572,10 +559,13 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
        if (op_bytes == 2)
                op_bytes = 3;
        *address = 0;
-       rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL);
+       rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2,
+                          ctxt->vcpu, &ctxt->exception);
        if (rc != X86EMUL_CONTINUE)
                return rc;
-       rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL);
+       addr.ea += 2;
+       rc = ops->read_std(linear(ctxt, addr), address, op_bytes,
+                          ctxt->vcpu, &ctxt->exception);
        return rc;
 }
 
@@ -768,7 +758,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
                        break;
                }
        }
-       op->addr.mem = modrm_ea;
+       op->addr.mem.ea = modrm_ea;
 done:
        return rc;
 }
@@ -783,13 +773,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
        op->type = OP_MEM;
        switch (c->ad_bytes) {
        case 2:
-               op->addr.mem = insn_fetch(u16, 2, c->eip);
+               op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
                break;
        case 4:
-               op->addr.mem = insn_fetch(u32, 4, c->eip);
+               op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
                break;
        case 8:
-               op->addr.mem = insn_fetch(u64, 8, c->eip);
+               op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
                break;
        }
 done:
@@ -808,7 +798,7 @@ static void fetch_bit_operand(struct decode_cache *c)
                else if (c->src.bytes == 4)
                        sv = (s32)c->src.val & (s32)mask;
 
-               c->dst.addr.mem += (sv >> 3);
+               c->dst.addr.mem.ea += (sv >> 3);
        }
 
        /* only subword offset */
@@ -821,7 +811,6 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
 {
        int rc;
        struct read_cache *mc = &ctxt->decode.mem_read;
-       u32 err;
 
        while (size) {
                int n = min(size, 8u);
@@ -829,10 +818,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
                if (mc->pos < mc->end)
                        goto read_cached;
 
-               rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
-                                       ctxt->vcpu);
-               if (rc == X86EMUL_PROPAGATE_FAULT)
-                       emulate_pf(ctxt);
+               rc = ops->read_emulated(addr, mc->data + mc->end, n,
+                                       &ctxt->exception, ctxt->vcpu);
                if (rc != X86EMUL_CONTINUE)
                        return rc;
                mc->end += n;
@@ -907,19 +894,15 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
        struct desc_ptr dt;
        u16 index = selector >> 3;
        int ret;
-       u32 err;
        ulong addr;
 
        get_descriptor_table_ptr(ctxt, ops, selector, &dt);
 
-       if (dt.size < index * 8 + 7) {
-               emulate_gp(ctxt, selector & 0xfffc);
-               return X86EMUL_PROPAGATE_FAULT;
-       }
+       if (dt.size < index * 8 + 7)
+               return emulate_gp(ctxt, selector & 0xfffc);
        addr = dt.address + index * 8;
-       ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,  &err);
-       if (ret == X86EMUL_PROPAGATE_FAULT)
-               emulate_pf(ctxt);
+       ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,
+                           &ctxt->exception);
 
        return ret;
 }
@@ -931,21 +914,17 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 {
        struct desc_ptr dt;
        u16 index = selector >> 3;
-       u32 err;
        ulong addr;
        int ret;
 
        get_descriptor_table_ptr(ctxt, ops, selector, &dt);
 
-       if (dt.size < index * 8 + 7) {
-               emulate_gp(ctxt, selector & 0xfffc);
-               return X86EMUL_PROPAGATE_FAULT;
-       }
+       if (dt.size < index * 8 + 7)
+               return emulate_gp(ctxt, selector & 0xfffc);
 
        addr = dt.address + index * 8;
-       ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
-       if (ret == X86EMUL_PROPAGATE_FAULT)
-               emulate_pf(ctxt);
+       ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu,
+                            &ctxt->exception);
 
        return ret;
 }
@@ -1092,7 +1071,6 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 {
        int rc;
        struct decode_cache *c = &ctxt->decode;
-       u32 err;
 
        switch (c->dst.type) {
        case OP_REG:
@@ -1101,21 +1079,19 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
        case OP_MEM:
                if (c->lock_prefix)
                        rc = ops->cmpxchg_emulated(
-                                       c->dst.addr.mem,
+                                       linear(ctxt, c->dst.addr.mem),
                                        &c->dst.orig_val,
                                        &c->dst.val,
                                        c->dst.bytes,
-                                       &err,
+                                       &ctxt->exception,
                                        ctxt->vcpu);
                else
                        rc = ops->write_emulated(
-                                       c->dst.addr.mem,
+                                       linear(ctxt, c->dst.addr.mem),
                                        &c->dst.val,
                                        c->dst.bytes,
-                                       &err,
+                                       &ctxt->exception,
                                        ctxt->vcpu);
-               if (rc == X86EMUL_PROPAGATE_FAULT)
-                       emulate_pf(ctxt);
                if (rc != X86EMUL_CONTINUE)
                        return rc;
                break;
@@ -1137,8 +1113,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
        c->dst.bytes = c->op_bytes;
        c->dst.val = c->src.val;
        register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
-       c->dst.addr.mem = register_address(c, ss_base(ctxt, ops),
-                                          c->regs[VCPU_REGS_RSP]);
+       c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+       c->dst.addr.mem.seg = VCPU_SREG_SS;
 }
 
 static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1147,10 +1123,11 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
 {
        struct decode_cache *c = &ctxt->decode;
        int rc;
+       struct segmented_address addr;
 
-       rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
-                                                      c->regs[VCPU_REGS_RSP]),
-                          dest, len);
+       addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+       addr.seg = VCPU_SREG_SS;
+       rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len);
        if (rc != X86EMUL_CONTINUE)
                return rc;
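
emulate_push() and emulate_pop() no longer add ss_base() into the address themselves; they record VCPU_SREG_SS in the operand and leave linearization to the access path. A hedged model of the two stack addressing conventions (the register file and index constants are illustrative, though SS = 2 and RSP = 4 match the kernel's enums):

	#include <stdint.h>

	enum { SREG_SS = 2 };
	#define REG_RSP 4

	struct seg_addr { uint64_t ea; int seg; };

	static struct seg_addr stack_slot(uint64_t regs[16], int op_bytes, int push)
	{
		struct seg_addr a;

		if (push)
			regs[REG_RSP] -= op_bytes;	/* pre-decrement on push */
		a.ea = regs[REG_RSP];
		a.seg = SREG_SS;			/* resolved later by linear() */
		if (!push)
			regs[REG_RSP] += op_bytes;	/* post-increment on pop */
		return a;
	}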
 
@@ -1184,10 +1161,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
                        change_mask |= EFLG_IF;
                break;
        case X86EMUL_MODE_VM86:
-               if (iopl < 3) {
-                       emulate_gp(ctxt, 0);
-                       return X86EMUL_PROPAGATE_FAULT;
-               }
+               if (iopl < 3)
+                       return emulate_gp(ctxt, 0);
                change_mask |= EFLG_IF;
                break;
        default: /* real mode */
@@ -1198,9 +1173,6 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
        *(unsigned long *)dest =
                (ctxt->eflags & ~change_mask) | (val & change_mask);
 
-       if (rc == X86EMUL_PROPAGATE_FAULT)
-               emulate_pf(ctxt);
-
        return rc;
 }
 
@@ -1287,7 +1259,6 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
        gva_t cs_addr;
        gva_t eip_addr;
        u16 cs, eip;
-       u32 err;
 
        /* TODO: Add limit checks */
        c->src.val = ctxt->eflags;
@@ -1317,11 +1288,11 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
        eip_addr = dt.address + (irq << 2);
        cs_addr = dt.address + (irq << 2) + 2;
 
-       rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err);
+       rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception);
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
-       rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err);
+       rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception);
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
@@ -1370,10 +1341,8 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
-       if (temp_eip & ~0xffff) {
-               emulate_gp(ctxt, 0);
-               return X86EMUL_PROPAGATE_FAULT;
-       }
+       if (temp_eip & ~0xffff)
+               return emulate_gp(ctxt, 0);
 
        rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
 
@@ -1624,10 +1593,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 
        /* syscall is not available in real mode */
        if (ctxt->mode == X86EMUL_MODE_REAL ||
-           ctxt->mode == X86EMUL_MODE_VM86) {
-               emulate_ud(ctxt);
-               return X86EMUL_PROPAGATE_FAULT;
-       }
+           ctxt->mode == X86EMUL_MODE_VM86)
+               return emulate_ud(ctxt);
 
        setup_syscalls_segments(ctxt, ops, &cs, &ss);
 
@@ -1678,34 +1645,26 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        u16 cs_sel, ss_sel;
 
        /* inject #GP if in real mode */
-       if (ctxt->mode == X86EMUL_MODE_REAL) {
-               emulate_gp(ctxt, 0);
-               return X86EMUL_PROPAGATE_FAULT;
-       }
+       if (ctxt->mode == X86EMUL_MODE_REAL)
+               return emulate_gp(ctxt, 0);
 
        /* XXX sysenter/sysexit have not been tested in 64bit mode.
        * Therefore, we inject an #UD.
        */
-       if (ctxt->mode == X86EMUL_MODE_PROT64) {
-               emulate_ud(ctxt);
-               return X86EMUL_PROPAGATE_FAULT;
-       }
+       if (ctxt->mode == X86EMUL_MODE_PROT64)
+               return emulate_ud(ctxt);
 
        setup_syscalls_segments(ctxt, ops, &cs, &ss);
 
        ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
        switch (ctxt->mode) {
        case X86EMUL_MODE_PROT32:
-               if ((msr_data & 0xfffc) == 0x0) {
-                       emulate_gp(ctxt, 0);
-                       return X86EMUL_PROPAGATE_FAULT;
-               }
+               if ((msr_data & 0xfffc) == 0x0)
+                       return emulate_gp(ctxt, 0);
                break;
        case X86EMUL_MODE_PROT64:
-               if (msr_data == 0x0) {
-                       emulate_gp(ctxt, 0);
-                       return X86EMUL_PROPAGATE_FAULT;
-               }
+               if (msr_data == 0x0)
+                       return emulate_gp(ctxt, 0);
                break;
        }
 
@@ -1745,10 +1704,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 
        /* inject #GP if in real mode or Virtual 8086 mode */
        if (ctxt->mode == X86EMUL_MODE_REAL ||
-           ctxt->mode == X86EMUL_MODE_VM86) {
-               emulate_gp(ctxt, 0);
-               return X86EMUL_PROPAGATE_FAULT;
-       }
+           ctxt->mode == X86EMUL_MODE_VM86)
+               return emulate_gp(ctxt, 0);
 
        setup_syscalls_segments(ctxt, ops, &cs, &ss);
 
@@ -1763,18 +1720,14 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        switch (usermode) {
        case X86EMUL_MODE_PROT32:
                cs_sel = (u16)(msr_data + 16);
-               if ((msr_data & 0xfffc) == 0x0) {
-                       emulate_gp(ctxt, 0);
-                       return X86EMUL_PROPAGATE_FAULT;
-               }
+               if ((msr_data & 0xfffc) == 0x0)
+                       return emulate_gp(ctxt, 0);
                ss_sel = (u16)(msr_data + 24);
                break;
        case X86EMUL_MODE_PROT64:
                cs_sel = (u16)(msr_data + 32);
-               if (msr_data == 0x0) {
-                       emulate_gp(ctxt, 0);
-                       return X86EMUL_PROPAGATE_FAULT;
-               }
+               if (msr_data == 0x0)
+                       return emulate_gp(ctxt, 0);
                ss_sel = cs_sel + 8;
                cs.d = 0;
                cs.l = 1;
@@ -1934,33 +1887,27 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 {
        struct tss_segment_16 tss_seg;
        int ret;
-       u32 err, new_tss_base = get_desc_base(new_desc);
+       u32 new_tss_base = get_desc_base(new_desc);
 
        ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
-                           &err);
-       if (ret == X86EMUL_PROPAGATE_FAULT) {
+                           &ctxt->exception);
+       if (ret != X86EMUL_CONTINUE)
                /* FIXME: need to provide precise fault address */
-               emulate_pf(ctxt);
                return ret;
-       }
 
        save_state_to_tss16(ctxt, ops, &tss_seg);
 
        ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
-                            &err);
-       if (ret == X86EMUL_PROPAGATE_FAULT) {
+                            &ctxt->exception);
+       if (ret != X86EMUL_CONTINUE)
                /* FIXME: need to provide precise fault address */
-               emulate_pf(ctxt);
                return ret;
-       }
 
        ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
-                           &err);
-       if (ret == X86EMUL_PROPAGATE_FAULT) {
+                           &ctxt->exception);
+       if (ret != X86EMUL_CONTINUE)
                /* FIXME: need to provide precise fault address */
-               emulate_pf(ctxt);
                return ret;
-       }
 
        if (old_tss_sel != 0xffff) {
                tss_seg.prev_task_link = old_tss_sel;
@@ -1968,12 +1915,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
                ret = ops->write_std(new_tss_base,
                                     &tss_seg.prev_task_link,
                                     sizeof tss_seg.prev_task_link,
-                                    ctxt->vcpu, &err);
-               if (ret == X86EMUL_PROPAGATE_FAULT) {
+                                    ctxt->vcpu, &ctxt->exception);
+               if (ret != X86EMUL_CONTINUE)
                        /* FIXME: need to provide precise fault address */
-                       emulate_pf(ctxt);
                        return ret;
-               }
        }
 
        return load_state_from_tss16(ctxt, ops, &tss_seg);
@@ -2013,10 +1958,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
        struct decode_cache *c = &ctxt->decode;
        int ret;
 
-       if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
-               emulate_gp(ctxt, 0);
-               return X86EMUL_PROPAGATE_FAULT;
-       }
+       if (ops->set_cr(3, tss->cr3, ctxt->vcpu))
+               return emulate_gp(ctxt, 0);
        c->eip = tss->eip;
        ctxt->eflags = tss->eflags | 2;
        c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2076,33 +2019,27 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 {
        struct tss_segment_32 tss_seg;
        int ret;
-       u32 err, new_tss_base = get_desc_base(new_desc);
+       u32 new_tss_base = get_desc_base(new_desc);
 
        ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
-                           &err);
-       if (ret == X86EMUL_PROPAGATE_FAULT) {
+                           &ctxt->exception);
+       if (ret != X86EMUL_CONTINUE)
                /* FIXME: need to provide precise fault address */
-               emulate_pf(ctxt);
                return ret;
-       }
 
        save_state_to_tss32(ctxt, ops, &tss_seg);
 
        ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
-                            &err);
-       if (ret == X86EMUL_PROPAGATE_FAULT) {
+                            &ctxt->exception);
+       if (ret != X86EMUL_CONTINUE)
                /* FIXME: need to provide precise fault address */
-               emulate_pf(ctxt);
                return ret;
-       }
 
        ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
-                           &err);
-       if (ret == X86EMUL_PROPAGATE_FAULT) {
+                           &ctxt->exception);
+       if (ret != X86EMUL_CONTINUE)
                /* FIXME: need to provide precise fault address */
-               emulate_pf(ctxt);
                return ret;
-       }
 
        if (old_tss_sel != 0xffff) {
                tss_seg.prev_task_link = old_tss_sel;
@@ -2110,12 +2047,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
                ret = ops->write_std(new_tss_base,
                                     &tss_seg.prev_task_link,
                                     sizeof tss_seg.prev_task_link,
-                                    ctxt->vcpu, &err);
-               if (ret == X86EMUL_PROPAGATE_FAULT) {
+                                    ctxt->vcpu, &ctxt->exception);
+               if (ret != X86EMUL_CONTINUE)
                        /* FIXME: need to provide precise fault address */
-                       emulate_pf(ctxt);
                        return ret;
-               }
        }
 
        return load_state_from_tss32(ctxt, ops, &tss_seg);
@@ -2146,10 +2081,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 
        if (reason != TASK_SWITCH_IRET) {
                if ((tss_selector & 3) > next_tss_desc.dpl ||
-                   ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
-                       emulate_gp(ctxt, 0);
-                       return X86EMUL_PROPAGATE_FAULT;
-               }
+                   ops->cpl(ctxt->vcpu) > next_tss_desc.dpl)
+                       return emulate_gp(ctxt, 0);
        }
 
        desc_limit = desc_limit_scaled(&next_tss_desc);
@@ -2231,14 +2164,15 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
        return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 }
 
-static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
                            int reg, struct operand *op)
 {
        struct decode_cache *c = &ctxt->decode;
        int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
 
        register_address_increment(c, &c->regs[reg], df * op->bytes);
-       op->addr.mem = register_address(c,  base, c->regs[reg]);
+       op->addr.mem.ea = register_address(c, c->regs[reg]);
+       op->addr.mem.seg = seg;
 }
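
string_addr_inc() likewise takes a segment rather than a precomputed base; the EFLAGS direction flag still chooses whether RSI/RDI advance or retreat by one element. In miniature (names are mine):

	#include <stdint.h>

	#define EFLG_DF (1u << 10)

	struct str_op { uint64_t ea; int seg; };

	static void string_addr_inc_sketch(uint32_t eflags, uint64_t *reg,
					   int bytes, int seg, struct str_op *op)
	{
		int df = (eflags & EFLG_DF) ? -1 : 1;

		*reg += (int64_t)df * bytes;	/* step RSI/RDI by one element */
		op->ea = *reg;			/* effective address only ... */
		op->seg = seg;			/* ... base applied in linear() */
	}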
 
 static int em_push(struct x86_emulate_ctxt *ctxt)
@@ -2369,10 +2303,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
        struct decode_cache *c = &ctxt->decode;
        u64 tsc = 0;
 
-       if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) {
-               emulate_gp(ctxt, 0);
-               return X86EMUL_PROPAGATE_FAULT;
-       }
+       if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD))
+               return emulate_gp(ctxt, 0);
        ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
        c->regs[VCPU_REGS_RAX] = (u32)tsc;
        c->regs[VCPU_REGS_RDX] = tsc >> 32;
@@ -2647,7 +2579,7 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
 
        op->type = OP_IMM;
        op->bytes = size;
-       op->addr.mem = c->eip;
+       op->addr.mem.ea = c->eip;
        /* NB. Immediates are sign-extended as necessary. */
        switch (op->bytes) {
        case 1:
@@ -2678,7 +2610,7 @@ done:
 }
 
 int
-x86_decode_insn(struct x86_emulate_ctxt *ctxt)
+x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 {
        struct x86_emulate_ops *ops = ctxt->ops;
        struct decode_cache *c = &ctxt->decode;
@@ -2689,7 +2621,10 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt)
        struct operand memop = { .type = OP_NONE };
 
        c->eip = ctxt->eip;
-       c->fetch.start = c->fetch.end = c->eip;
+       c->fetch.start = c->eip;
+       c->fetch.end = c->fetch.start + insn_len;
+       if (insn_len > 0)
+               memcpy(c->fetch.data, insn, insn_len);
        ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
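
This hunk is the emulator half of "KVM: SVM: copy instruction bytes from VMCB" in the merge summary: when hardware has already captured the faulting instruction, x86_decode_insn() pre-seeds the fetch cache with those bytes, and guest-memory fetches only happen past insn_len. A standalone sketch of the pre-seed (field names mirror the patch; insn_len is at most 15, the x86 instruction-length limit):

	#include <stdint.h>
	#include <string.h>

	struct fetch_cache {
		uint8_t data[15];	/* max x86 instruction length */
		uint64_t start, end;
	};

	static void seed_fetch_cache(struct fetch_cache *fc, uint64_t rip,
				     const void *insn, int insn_len)
	{
		fc->start = rip;
		fc->end = fc->start + insn_len;
		if (insn_len > 0)	/* bytes came from hardware, no guest read */
			memcpy(fc->data, insn, insn_len);
	}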
 
        switch (mode) {
@@ -2803,10 +2738,8 @@ done_prefixes:
        c->execute = opcode.u.execute;
 
        /* Unrecognised? */
-       if (c->d == 0 || (c->d & Undefined)) {
-               DPRINTF("Cannot emulate %02x\n", c->b);
+       if (c->d == 0 || (c->d & Undefined))
                return -1;
-       }
 
        if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
                c->op_bytes = 8;
@@ -2831,14 +2764,13 @@ done_prefixes:
        if (!c->has_seg_override)
                set_seg_override(c, VCPU_SREG_DS);
 
-       if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d))
-               memop.addr.mem += seg_override_base(ctxt, ops, c);
+       memop.addr.mem.seg = seg_override(ctxt, ops, c);
 
        if (memop.type == OP_MEM && c->ad_bytes != 8)
-               memop.addr.mem = (u32)memop.addr.mem;
+               memop.addr.mem.ea = (u32)memop.addr.mem.ea;
 
        if (memop.type == OP_MEM && c->rip_relative)
-               memop.addr.mem += c->eip;
+               memop.addr.mem.ea += c->eip;
 
        /*
         * Decode and fetch the source operand: register, memory
@@ -2890,14 +2822,14 @@ done_prefixes:
        case SrcSI:
                c->src.type = OP_MEM;
                c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-               c->src.addr.mem =
-                       register_address(c,  seg_override_base(ctxt, ops, c),
-                                        c->regs[VCPU_REGS_RSI]);
+               c->src.addr.mem.ea =
+                       register_address(c, c->regs[VCPU_REGS_RSI]);
+               c->src.addr.mem.seg = seg_override(ctxt, ops, c);
                c->src.val = 0;
                break;
        case SrcImmFAddr:
                c->src.type = OP_IMM;
-               c->src.addr.mem = c->eip;
+               c->src.addr.mem.ea = c->eip;
                c->src.bytes = c->op_bytes + 2;
                insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
                break;
@@ -2944,7 +2876,7 @@ done_prefixes:
                break;
        case DstImmUByte:
                c->dst.type = OP_IMM;
-               c->dst.addr.mem = c->eip;
+               c->dst.addr.mem.ea = c->eip;
                c->dst.bytes = 1;
                c->dst.val = insn_fetch(u8, 1, c->eip);
                break;
@@ -2969,9 +2901,9 @@ done_prefixes:
        case DstDI:
                c->dst.type = OP_MEM;
                c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-               c->dst.addr.mem =
-                       register_address(c, es_base(ctxt, ops),
-                                        c->regs[VCPU_REGS_RDI]);
+               c->dst.addr.mem.ea =
+                       register_address(c, c->regs[VCPU_REGS_RDI]);
+               c->dst.addr.mem.seg = VCPU_SREG_ES;
                c->dst.val = 0;
                break;
        case ImplicitOps:
@@ -3020,24 +2952,24 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
        ctxt->decode.mem_read.pos = 0;
 
        if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
-               emulate_ud(ctxt);
+               rc = emulate_ud(ctxt);
                goto done;
        }
 
        /* LOCK prefix is allowed only with some instructions */
        if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
-               emulate_ud(ctxt);
+               rc = emulate_ud(ctxt);
                goto done;
        }
 
        if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
-               emulate_ud(ctxt);
+               rc = emulate_ud(ctxt);
                goto done;
        }
 
        /* Privileged instruction can be executed only in CPL=0 */
        if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
-               emulate_gp(ctxt, 0);
+               rc = emulate_gp(ctxt, 0);
                goto done;
        }
 
@@ -3050,7 +2982,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
        }
 
        if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
-               rc = read_emulated(ctxt, ops, c->src.addr.mem,
+               rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem),
                                        c->src.valptr, c->src.bytes);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
@@ -3058,7 +2990,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
        }
 
        if (c->src2.type == OP_MEM) {
-               rc = read_emulated(ctxt, ops, c->src2.addr.mem,
+               rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem),
                                        &c->src2.val, c->src2.bytes);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
@@ -3070,7 +3002,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 
        if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
                /* optimisation - avoid slow emulated read if Mov */
-               rc = read_emulated(ctxt, ops, c->dst.addr.mem,
+               rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem),
                                   &c->dst.val, c->dst.bytes);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
@@ -3215,13 +3147,13 @@ special_insn:
                break;
        case 0x8c:  /* mov r/m, sreg */
                if (c->modrm_reg > VCPU_SREG_GS) {
-                       emulate_ud(ctxt);
+                       rc = emulate_ud(ctxt);
                        goto done;
                }
                c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
                break;
        case 0x8d: /* lea r16/r32, m */
-               c->dst.val = c->src.addr.mem;
+               c->dst.val = c->src.addr.mem.ea;
                break;
        case 0x8e: { /* mov seg, r/m16 */
                uint16_t sel;
@@ -3230,7 +3162,7 @@ special_insn:
 
                if (c->modrm_reg == VCPU_SREG_CS ||
                    c->modrm_reg > VCPU_SREG_GS) {
-                       emulate_ud(ctxt);
+                       rc = emulate_ud(ctxt);
                        goto done;
                }
 
@@ -3268,7 +3200,6 @@ special_insn:
                break;
        case 0xa6 ... 0xa7:     /* cmps */
                c->dst.type = OP_NONE; /* Disable writeback. */
-               DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem);
                goto cmp;
        case 0xa8 ... 0xa9:     /* test ax, imm */
                goto test;
@@ -3363,7 +3294,7 @@ special_insn:
        do_io_in:
                c->dst.bytes = min(c->dst.bytes, 4u);
                if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
-                       emulate_gp(ctxt, 0);
+                       rc = emulate_gp(ctxt, 0);
                        goto done;
                }
                if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
@@ -3377,7 +3308,7 @@ special_insn:
                c->src.bytes = min(c->src.bytes, 4u);
                if (!emulator_io_permited(ctxt, ops, c->dst.val,
                                          c->src.bytes)) {
-                       emulate_gp(ctxt, 0);
+                       rc = emulate_gp(ctxt, 0);
                        goto done;
                }
                ops->pio_out_emulated(c->src.bytes, c->dst.val,
@@ -3402,14 +3333,14 @@ special_insn:
                break;
        case 0xfa: /* cli */
                if (emulator_bad_iopl(ctxt, ops)) {
-                       emulate_gp(ctxt, 0);
+                       rc = emulate_gp(ctxt, 0);
                        goto done;
                } else
                        ctxt->eflags &= ~X86_EFLAGS_IF;
                break;
        case 0xfb: /* sti */
                if (emulator_bad_iopl(ctxt, ops)) {
-                       emulate_gp(ctxt, 0);
+                       rc = emulate_gp(ctxt, 0);
                        goto done;
                } else {
                        ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
@@ -3449,11 +3380,11 @@ writeback:
        c->dst.type = saved_dst_type;
 
        if ((c->d & SrcMask) == SrcSI)
-               string_addr_inc(ctxt, seg_override_base(ctxt, ops, c),
+               string_addr_inc(ctxt, seg_override(ctxt, ops, c),
                                VCPU_REGS_RSI, &c->src);
 
        if ((c->d & DstMask) == DstDI)
-               string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI,
+               string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
                                &c->dst);
 
        if (c->rep_prefix && (c->d & String)) {
@@ -3482,6 +3413,8 @@ writeback:
        ctxt->eip = c->eip;
 
 done:
+       if (rc == X86EMUL_PROPAGATE_FAULT)
+               ctxt->have_exception = true;
        return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
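
The two lines added at the done: label define the caller's contract: on EMULATION_OK with have_exception set, the caller injects ctxt->exception into the guest rather than retiring the instruction. A rough caller-side sketch (plumbing elided, names illustrative):

	/* Hedged sketch of the caller-side contract around have_exception. */
	enum { EMULATION_OK, EMULATION_FAILED };

	struct exc { int vector; };
	struct emu { int have_exception; struct exc exception; };

	static void inject_exception_stub(struct exc *e) { (void)e; }

	static int handle_emulation_result(struct emu *ctxt, int r)
	{
		if (r == EMULATION_FAILED)
			return -1;		/* punt, e.g. to userspace */
		if (ctxt->have_exception)
			inject_exception_stub(&ctxt->exception);
		return 0;
	}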
 
 twobyte_insn:
@@ -3544,9 +3477,11 @@ twobyte_insn:
                        break;
                case 5: /* not defined */
                        emulate_ud(ctxt);
+                       rc = X86EMUL_PROPAGATE_FAULT;
                        goto done;
                case 7: /* invlpg*/
-                       emulate_invlpg(ctxt->vcpu, c->src.addr.mem);
+                       emulate_invlpg(ctxt->vcpu,
+                                      linear(ctxt, c->src.addr.mem));
                        /* Disable writeback. */
                        c->dst.type = OP_NONE;
                        break;
@@ -3573,6 +3508,7 @@ twobyte_insn:
                case 5 ... 7:
                case 9 ... 15:
                        emulate_ud(ctxt);
+                       rc = X86EMUL_PROPAGATE_FAULT;
                        goto done;
                }
                c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3581,6 +3517,7 @@ twobyte_insn:
                if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
                    (c->modrm_reg == 4 || c->modrm_reg == 5)) {
                        emulate_ud(ctxt);
+                       rc = X86EMUL_PROPAGATE_FAULT;
                        goto done;
                }
                ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
@@ -3588,6 +3525,7 @@ twobyte_insn:
        case 0x22: /* mov reg, cr */
                if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
                        emulate_gp(ctxt, 0);
+                       rc = X86EMUL_PROPAGATE_FAULT;
                        goto done;
                }
                c->dst.type = OP_NONE;
@@ -3596,6 +3534,7 @@ twobyte_insn:
                if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
                    (c->modrm_reg == 4 || c->modrm_reg == 5)) {
                        emulate_ud(ctxt);
+                       rc = X86EMUL_PROPAGATE_FAULT;
                        goto done;
                }
 
@@ -3604,6 +3543,7 @@ twobyte_insn:
                                 ~0ULL : ~0U), ctxt->vcpu) < 0) {
                        /* #UD condition is already handled by the code above */
                        emulate_gp(ctxt, 0);
+                       rc = X86EMUL_PROPAGATE_FAULT;
                        goto done;
                }
 
@@ -3615,6 +3555,7 @@ twobyte_insn:
                        | ((u64)c->regs[VCPU_REGS_RDX] << 32);
                if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
                        emulate_gp(ctxt, 0);
+                       rc = X86EMUL_PROPAGATE_FAULT;
                        goto done;
                }
                rc = X86EMUL_CONTINUE;
@@ -3623,6 +3564,7 @@ twobyte_insn:
                /* rdmsr */
                if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
                        emulate_gp(ctxt, 0);
+                       rc = X86EMUL_PROPAGATE_FAULT;
                        goto done;
                } else {
                        c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -3785,6 +3727,5 @@ twobyte_insn:
        goto writeback;
 
 cannot_emulate:
-       DPRINTF("Cannot emulate %02x\n", c->b);
        return -1;
 }
index 975bb45..3377d53 100644 (file)
@@ -73,6 +73,13 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
        return vcpu->arch.cr4 & mask;
 }
 
+static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
+{
+       if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+               kvm_x86_ops->decache_cr3(vcpu);
+       return vcpu->arch.cr3;
+}
+
 static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
 {
        return kvm_read_cr4_bits(vcpu, ~0UL);
@@ -84,4 +91,19 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
                | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
 }
 
+static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.hflags |= HF_GUEST_MASK;
+}
+
+static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.hflags &= ~HF_GUEST_MASK;
+}
+
+static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.hflags & HF_GUEST_MASK;
+}
+
 #endif
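
kvm_read_cr3() applies the same lazy-caching idiom as the other register accessors, per "KVM: Fetch guest cr3 from hardware on demand" in the merge summary: vcpu->arch.cr3 is refreshed via decache_cr3() only when the VCPU_EXREG_CR3 availability bit is clear. The guest-mode helpers below it are plain hflags accessors. The idiom in generic form (names are stand-ins):

	#include <stdbool.h>
	#include <stdint.h>

	struct cached_reg {
		uint64_t value;
		bool avail;		/* analogous to the regs_avail bit */
	};

	/* Stand-in for kvm_x86_ops->decache_cr3(). */
	static void decache_from_hw(struct cached_reg *r)
	{
		r->value = 0;		/* would read the real register here */
		r->avail = true;
	}

	static uint64_t lazy_read(struct cached_reg *r)
	{
		if (!r->avail)		/* touch hardware only on first use */
			decache_from_hw(r);
		return r->value;
	}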
index 413f897..93cf9d0 100644 (file)
@@ -277,7 +277,8 @@ static void apic_update_ppr(struct kvm_lapic *apic)
 
        if (old_ppr != ppr) {
                apic_set_reg(apic, APIC_PROCPRI, ppr);
-               kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+               if (ppr < old_ppr)
+                       kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
        }
 }
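
The lapic change is a narrow optimization: KVM_REQ_EVENT forces interrupt re-evaluation on the next entry, and only a falling processor priority can unmask a pending interrupt, so a rising PPR no longer raises the request. Schematically (a sketch, with the request hook abstracted to a callback):

	static void apic_update_ppr_sketch(unsigned *stored_ppr, unsigned new_ppr,
					   void (*request_event)(void))
	{
		unsigned old_ppr = *stored_ppr;

		if (old_ppr != new_ppr) {
			*stored_ppr = new_ppr;
			if (new_ppr < old_ppr)	/* may unmask a pending IRQ */
				request_event();
		}
	}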
 
index fbb04ae..9cafbb4 100644 (file)
  *
  */
 
+#include "irq.h"
 #include "mmu.h"
 #include "x86.h"
 #include "kvm_cache_regs.h"
+#include "x86.h"
 
 #include <linux/kvm_host.h>
 #include <linux/types.h>
@@ -194,7 +196,6 @@ static struct percpu_counter kvm_total_used_mmu_pages;
 
 static u64 __read_mostly shadow_trap_nonpresent_pte;
 static u64 __read_mostly shadow_notrap_nonpresent_pte;
-static u64 __read_mostly shadow_base_present_pte;
 static u64 __read_mostly shadow_nx_mask;
 static u64 __read_mostly shadow_x_mask;        /* mutual exclusive with nx_mask */
 static u64 __read_mostly shadow_user_mask;
@@ -213,12 +214,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
 
-void kvm_mmu_set_base_ptes(u64 base_pte)
-{
-       shadow_base_present_pte = base_pte;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
-
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
                u64 dirty_mask, u64 nx_mask, u64 x_mask)
 {
@@ -482,46 +477,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
 }
 
 /*
- * Return the pointer to the largepage write count for a given
- * gfn, handling slots that are not large page aligned.
+ * Return the pointer to the large page information for a given gfn,
+ * handling slots that are not large page aligned.
  */
-static int *slot_largepage_idx(gfn_t gfn,
-                              struct kvm_memory_slot *slot,
-                              int level)
+static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
+                                             struct kvm_memory_slot *slot,
+                                             int level)
 {
        unsigned long idx;
 
        idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
              (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
-       return &slot->lpage_info[level - 2][idx].write_count;
+       return &slot->lpage_info[level - 2][idx];
 }
 
 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 {
        struct kvm_memory_slot *slot;
-       int *write_count;
+       struct kvm_lpage_info *linfo;
        int i;
 
        slot = gfn_to_memslot(kvm, gfn);
        for (i = PT_DIRECTORY_LEVEL;
             i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-               write_count   = slot_largepage_idx(gfn, slot, i);
-               *write_count += 1;
+               linfo = lpage_info_slot(gfn, slot, i);
+               linfo->write_count += 1;
        }
 }
 
 static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 {
        struct kvm_memory_slot *slot;
-       int *write_count;
+       struct kvm_lpage_info *linfo;
        int i;
 
        slot = gfn_to_memslot(kvm, gfn);
        for (i = PT_DIRECTORY_LEVEL;
             i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-               write_count   = slot_largepage_idx(gfn, slot, i);
-               *write_count -= 1;
-               WARN_ON(*write_count < 0);
+               linfo = lpage_info_slot(gfn, slot, i);
+               linfo->write_count -= 1;
+               WARN_ON(linfo->write_count < 0);
        }
 }
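
lpage_info_slot() centralizes index arithmetic that was previously open-coded at each large-page site: both the gfn and the slot base are shifted down to large-page granularity before subtracting, which stays correct for slots whose base gfn is not large-page aligned. A runnable demonstration of the index computation (the shift follows KVM_HPAGE_GFN_SHIFT, 9 bits per level above 4K; helper names are mine):

	#include <stdio.h>
	#include <stdint.h>

	static unsigned hpage_gfn_shift(int level) { return 9 * (level - 1); }

	static uint64_t lpage_index(uint64_t gfn, uint64_t base_gfn, int level)
	{
		return (gfn >> hpage_gfn_shift(level)) -
		       (base_gfn >> hpage_gfn_shift(level));
	}

	int main(void)
	{
		/* Slot base 0x201 is not 2 MiB aligned; indexing still works. */
		printf("%llu\n", (unsigned long long)lpage_index(0x400, 0x201, 2));
		return 0;
	}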
 
@@ -530,12 +525,12 @@ static int has_wrprotected_page(struct kvm *kvm,
                                int level)
 {
        struct kvm_memory_slot *slot;
-       int *largepage_idx;
+       struct kvm_lpage_info *linfo;
 
        slot = gfn_to_memslot(kvm, gfn);
        if (slot) {
-               largepage_idx = slot_largepage_idx(gfn, slot, level);
-               return *largepage_idx;
+               linfo = lpage_info_slot(gfn, slot, level);
+               return linfo->write_count;
        }
 
        return 1;
@@ -590,16 +585,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 {
        struct kvm_memory_slot *slot;
-       unsigned long idx;
+       struct kvm_lpage_info *linfo;
 
        slot = gfn_to_memslot(kvm, gfn);
        if (likely(level == PT_PAGE_TABLE_LEVEL))
                return &slot->rmap[gfn - slot->base_gfn];
 
-       idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
-               (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+       linfo = lpage_info_slot(gfn, slot, level);
 
-       return &slot->lpage_info[level - 2][idx].rmap_pde;
+       return &linfo->rmap_pde;
 }
 
 /*
@@ -887,19 +881,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
                end = start + (memslot->npages << PAGE_SHIFT);
                if (hva >= start && hva < end) {
                        gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+                       gfn_t gfn = memslot->base_gfn + gfn_offset;
 
                        ret = handler(kvm, &memslot->rmap[gfn_offset], data);
 
                        for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
-                               unsigned long idx;
-                               int sh;
-
-                               sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
-                               idx = ((memslot->base_gfn+gfn_offset) >> sh) -
-                                       (memslot->base_gfn >> sh);
-                               ret |= handler(kvm,
-                                       &memslot->lpage_info[j][idx].rmap_pde,
-                                       data);
+                               struct kvm_lpage_info *linfo;
+
+                               linfo = lpage_info_slot(gfn, memslot,
+                                                       PT_DIRECTORY_LEVEL + j);
+                               ret |= handler(kvm, &linfo->rmap_pde, data);
                        }
                        trace_kvm_age_page(hva, memslot, ret);
                        retval |= ret;
@@ -1161,7 +1152,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 }
 
 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
-                              struct kvm_mmu_page *sp, bool clear_unsync)
+                              struct kvm_mmu_page *sp)
 {
        return 1;
 }
@@ -1291,7 +1282,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
        if (clear_unsync)
                kvm_unlink_unsync_page(vcpu->kvm, sp);
 
-       if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
+       if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
                kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
                return 1;
        }
@@ -1332,12 +1323,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
                        continue;
 
                WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+               kvm_unlink_unsync_page(vcpu->kvm, s);
                if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
-                       (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
+                       (vcpu->arch.mmu.sync_page(vcpu, s))) {
                        kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
                        continue;
                }
-               kvm_unlink_unsync_page(vcpu->kvm, s);
                flush = true;
        }
 
@@ -1963,9 +1954,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                    unsigned pte_access, int user_fault,
                    int write_fault, int dirty, int level,
                    gfn_t gfn, pfn_t pfn, bool speculative,
-                   bool can_unsync, bool reset_host_protection)
+                   bool can_unsync, bool host_writable)
 {
-       u64 spte;
+       u64 spte, entry = *sptep;
        int ret = 0;
 
        /*
@@ -1973,7 +1964,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
         * whether the guest actually used the pte (in order to detect
         * demand paging).
         */
-       spte = shadow_base_present_pte;
+       spte = PT_PRESENT_MASK;
        if (!speculative)
                spte |= shadow_accessed_mask;
        if (!dirty)
@@ -1990,8 +1981,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
                        kvm_is_mmio_pfn(pfn));
 
-       if (reset_host_protection)
+       if (host_writable)
                spte |= SPTE_HOST_WRITEABLE;
+       else
+               pte_access &= ~ACC_WRITE_MASK;
 
        spte |= (u64)pfn << PAGE_SHIFT;
 
@@ -2036,6 +2029,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 set_pte:
        update_spte(sptep, spte);
+       /*
+        * If we overwrite a writable spte with a read-only one we
+        * should flush remote TLBs. Otherwise rmap_write_protect
+        * will find a read-only spte, even though the writable spte
+        * might be cached on a CPU's TLB.
+        */
+       if (is_writable_pte(entry) && !is_writable_pte(*sptep))
+               kvm_flush_remote_tlbs(vcpu->kvm);
 done:
        return ret;
 }
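
The flush added before done: pairs with the "entry = *sptep" snapshot taken at the top of this function: if the old spte was writable and the new one is not, another CPU may still hold the writable translation in its TLB, so remote TLBs must be flushed before rmap_write_protect() can trust what it reads. In outline (a sketch; the flush hook stands in for kvm_flush_remote_tlbs()):

	#include <stdint.h>
	#include <stdbool.h>

	#define PT_WRITABLE_MASK (1ull << 1)

	static bool writable(uint64_t pte) { return pte & PT_WRITABLE_MASK; }

	static void set_spte_sketch(uint64_t *sptep, uint64_t new_spte,
				    void (*flush)(void))
	{
		uint64_t entry = *sptep;	/* snapshot before the update */

		*sptep = new_spte;
		if (writable(entry) && !writable(*sptep))
			flush();	/* stale writable TLB entries possible */
	}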
@@ -2045,7 +2046,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                         int user_fault, int write_fault, int dirty,
                         int *ptwrite, int level, gfn_t gfn,
                         pfn_t pfn, bool speculative,
-                        bool reset_host_protection)
+                        bool host_writable)
 {
        int was_rmapped = 0;
        int rmap_count;
@@ -2080,7 +2081,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
        if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
                      dirty, level, gfn, pfn, speculative, true,
-                     reset_host_protection)) {
+                     host_writable)) {
                if (write_fault)
                        *ptwrite = 1;
                kvm_mmu_flush_tlb(vcpu);
@@ -2211,7 +2212,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 }
 
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-                       int level, gfn_t gfn, pfn_t pfn)
+                       int map_writable, int level, gfn_t gfn, pfn_t pfn,
+                       bool prefault)
 {
        struct kvm_shadow_walk_iterator iterator;
        struct kvm_mmu_page *sp;
@@ -2220,9 +2222,11 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 
        for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
                if (iterator.level == level) {
-                       mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
+                       unsigned pte_access = ACC_ALL;
+
+                       mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
                                     0, write, 1, &pt_write,
-                                    level, gfn, pfn, false, true);
+                                    level, gfn, pfn, prefault, map_writable);
                        direct_pte_prefetch(vcpu, iterator.sptep);
                        ++vcpu->stat.pf_fixed;
                        break;
@@ -2277,12 +2281,17 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
        return 1;
 }
 
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+                        gva_t gva, pfn_t *pfn, bool write, bool *writable);
+
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
+                        bool prefault)
 {
        int r;
        int level;
        pfn_t pfn;
        unsigned long mmu_seq;
+       bool map_writable;
 
        level = mapping_level(vcpu, gfn);
 
@@ -2297,7 +2306,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
-       pfn = gfn_to_pfn(vcpu->kvm, gfn);
+
+       if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
+               return 0;
 
        /* mmio */
        if (is_error_pfn(pfn))
@@ -2307,7 +2318,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
        if (mmu_notifier_retry(vcpu, mmu_seq))
                goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
-       r = __direct_map(vcpu, v, write, level, gfn, pfn);
+       r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
+                        prefault);
        spin_unlock(&vcpu->kvm->mmu_lock);
 
 
@@ -2530,6 +2542,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
                hpa_t root = vcpu->arch.mmu.root_hpa;
                sp = page_header(root);
                mmu_sync_children(vcpu, sp);
+               trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
                return;
        }
        for (i = 0; i < 4; ++i) {
@@ -2552,23 +2565,24 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 }
 
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
-                                 u32 access, u32 *error)
+                                 u32 access, struct x86_exception *exception)
 {
-       if (error)
-               *error = 0;
+       if (exception)
+               exception->error_code = 0;
        return vaddr;
 }
 
 static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
-                                        u32 access, u32 *error)
+                                        u32 access,
+                                        struct x86_exception *exception)
 {
-       if (error)
-               *error = 0;
+       if (exception)
+               exception->error_code = 0;
        return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
 }
 
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
-                               u32 error_code)
+                               u32 error_code, bool prefault)
 {
        gfn_t gfn;
        int r;
@@ -2584,17 +2598,67 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
        gfn = gva >> PAGE_SHIFT;
 
        return nonpaging_map(vcpu, gva & PAGE_MASK,
-                            error_code & PFERR_WRITE_MASK, gfn);
+                            error_code & PFERR_WRITE_MASK, gfn, prefault);
+}
+
+static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
+{
+       struct kvm_arch_async_pf arch;
+
+       arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
+       arch.gfn = gfn;
+       arch.direct_map = vcpu->arch.mmu.direct_map;
+       arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
+
+       return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
 }
 
-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
-                               u32 error_code)
+static bool can_do_async_pf(struct kvm_vcpu *vcpu)
+{
+       if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
+                    kvm_event_needs_reinjection(vcpu)))
+               return false;
+
+       return kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+                        gva_t gva, pfn_t *pfn, bool write, bool *writable)
+{
+       bool async;
+
+       *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
+
+       if (!async)
+               return false; /* *pfn has correct page already */
+
+       put_page(pfn_to_page(*pfn));
+
+       if (!prefault && can_do_async_pf(vcpu)) {
+               trace_kvm_try_async_get_page(gva, gfn);
+               if (kvm_find_async_pf_gfn(vcpu, gfn)) {
+                       trace_kvm_async_pf_doublefault(gva, gfn);
+                       kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+                       return true;
+               } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
+                       return true;
+       }
+
+       *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
+
+       return false;
+}
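
try_async_pf() is the heart of the new asynchronous page fault path: gfn_to_pfn_async() refuses to block, and when the page is not resident the fault is either turned into host-side async work (with the vcpu halted or the guest told the page is not ready) or, if that is impossible, completed synchronously via gfn_to_pfn_prot(). The control flow, roughly, with every helper stubbed:

	/* Hedged control-flow sketch of try_async_pf(); helpers are stubs. */
	#include <stdbool.h>
	#include <stdint.h>

	static void pfn_lookup_nonblocking(uint64_t gfn, uint64_t *pfn, bool *async)
	{ (void)gfn; *pfn = 0; *async = false; }
	static bool can_async(void) { return true; }
	static bool queue_async_work(uint64_t gfn) { (void)gfn; return true; }
	static uint64_t pfn_lookup_blocking(uint64_t gfn) { (void)gfn; return 0; }

	/* Returns true if the fault was deferred and the caller must bail out. */
	static bool try_async_pf_sketch(bool prefault, uint64_t gfn, uint64_t *pfn)
	{
		bool async;

		pfn_lookup_nonblocking(gfn, pfn, &async);
		if (!async)
			return false;	/* resident: *pfn is usable right away */

		if (!prefault && can_async() && queue_async_work(gfn))
			return true;	/* guest told "not ready", work queued */

		*pfn = pfn_lookup_blocking(gfn);  /* synchronous fault-in */
		return false;
	}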
+
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
+                         bool prefault)
 {
        pfn_t pfn;
        int r;
        int level;
        gfn_t gfn = gpa >> PAGE_SHIFT;
        unsigned long mmu_seq;
+       int write = error_code & PFERR_WRITE_MASK;
+       bool map_writable;
 
        ASSERT(vcpu);
        ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -2609,15 +2673,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
-       pfn = gfn_to_pfn(vcpu->kvm, gfn);
+
+       if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+               return 0;
+
+       /* mmio */
        if (is_error_pfn(pfn))
                return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu, mmu_seq))
                goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
-       r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
-                        level, gfn, pfn);
+       r = __direct_map(vcpu, gpa, write, map_writable,
+                        level, gfn, pfn, prefault);
        spin_unlock(&vcpu->kvm->mmu_lock);
 
        return r;
@@ -2659,18 +2727,19 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
 {
-       pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
+       pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
        mmu_free_roots(vcpu);
 }
 
 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
 {
-       return vcpu->arch.cr3;
+       return kvm_read_cr3(vcpu);
 }
 
-static void inject_page_fault(struct kvm_vcpu *vcpu)
+static void inject_page_fault(struct kvm_vcpu *vcpu,
+                             struct x86_exception *fault)
 {
-       vcpu->arch.mmu.inject_page_fault(vcpu);
+       vcpu->arch.mmu.inject_page_fault(vcpu, fault);
 }
 
 static void paging_free(struct kvm_vcpu *vcpu)
@@ -2816,6 +2885,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu *context = vcpu->arch.walk_mmu;
 
+       context->base_role.word = 0;
        context->new_cr3 = nonpaging_new_cr3;
        context->page_fault = tdp_page_fault;
        context->free = nonpaging_free;
@@ -3008,9 +3078,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
                return;
         }
 
-       if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
-               return;
-
        ++vcpu->kvm->stat.mmu_pte_updated;
        if (!sp->role.cr4_pae)
                paging32_update_pte(vcpu, sp, spte, new);
@@ -3264,12 +3331,13 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
        }
 }
 
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
+                      void *insn, int insn_len)
 {
        int r;
        enum emulation_result er;
 
-       r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
+       r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
        if (r < 0)
                goto out;
 
@@ -3282,7 +3350,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
        if (r)
                goto out;
 
-       er = emulate_instruction(vcpu, cr2, error_code, 0);
+       er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
 
        switch (er) {
        case EMULATE_DONE:
@@ -3377,11 +3445,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
                if (!test_bit(slot, sp->slot_bitmap))
                        continue;
 
+               if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+                       continue;
+
                pt = sp->spt;
                for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
                        /* avoid RMW */
                        if (is_writable_pte(pt[i]))
-                               pt[i] &= ~PT_WRITABLE_MASK;
+                               update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
        }
        kvm_flush_remote_tlbs(kvm);
 }
@@ -3463,13 +3534,6 @@ static void mmu_destroy_caches(void)
                kmem_cache_destroy(mmu_page_header_cache);
 }
 
-void kvm_mmu_module_exit(void)
-{
-       mmu_destroy_caches();
-       percpu_counter_destroy(&kvm_total_used_mmu_pages);
-       unregister_shrinker(&mmu_shrinker);
-}
-
 int kvm_mmu_module_init(void)
 {
        pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -3566,7 +3630,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
 
 static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 {
-       (void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
+       (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
        return 1;
 }
 
@@ -3662,12 +3726,6 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
 
-#ifdef CONFIG_KVM_MMU_AUDIT
-#include "mmu_audit.c"
-#else
-static void mmu_audit_disable(void) { }
-#endif
-
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 {
        ASSERT(vcpu);
@@ -3675,5 +3733,18 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
        destroy_kvm_mmu(vcpu);
        free_mmu_pages(vcpu);
        mmu_free_memory_caches(vcpu);
+}
+
+#ifdef CONFIG_KVM_MMU_AUDIT
+#include "mmu_audit.c"
+#else
+static void mmu_audit_disable(void) { }
+#endif
+
+void kvm_mmu_module_exit(void)
+{
+       mmu_destroy_caches();
+       percpu_counter_destroy(&kvm_total_used_mmu_pages);
+       unregister_shrinker(&mmu_shrinker);
        mmu_audit_disable();
 }
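
The move of kvm_mmu_module_exit() below the conditional include is about visibility, not behavior: mmu_audit.c is compiled into mmu.c by textual #include when CONFIG_KVM_MMU_AUDIT is set, and the exit path must see either its real mmu_audit_disable() or the empty stub. The layout in miniature (a sketch; the real include brings in the full audit code):

	#ifdef CONFIG_KVM_MMU_AUDIT
	/* #include "mmu_audit.c" would define the real mmu_audit_disable() */
	static void mmu_audit_disable(void) { /* tracepoint teardown */ }
	#else
	static void mmu_audit_disable(void) { }
	#endif

	void kvm_mmu_module_exit_sketch(void)
	{
		/* caches, counters, shrinker torn down first ... */
		mmu_audit_disable();	/* ... audit hooks last */
	}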
index ba2bcdd..5f6223b 100644 (file)
 
 #include <linux/ratelimit.h>
 
-static int audit_point;
-
-#define audit_printk(fmt, args...)             \
+#define audit_printk(kvm, fmt, args...)                \
        printk(KERN_ERR "audit: (%s) error: "   \
-               fmt, audit_point_name[audit_point], ##args)
+               fmt, audit_point_name[kvm->arch.audit_point], ##args)
 
 typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
 
@@ -97,18 +95,21 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 
        if (sp->unsync) {
                if (level != PT_PAGE_TABLE_LEVEL) {
-                       audit_printk("unsync sp: %p level = %d\n", sp, level);
+                       audit_printk(vcpu->kvm, "unsync sp: %p "
+                                    "level = %d\n", sp, level);
                        return;
                }
 
                if (*sptep == shadow_notrap_nonpresent_pte) {
-                       audit_printk("notrap spte in unsync sp: %p\n", sp);
+                       audit_printk(vcpu->kvm, "notrap spte in unsync "
+                                    "sp: %p\n", sp);
                        return;
                }
        }
 
        if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
-               audit_printk("notrap spte in direct sp: %p\n", sp);
+               audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
+                            sp);
                return;
        }
 
@@ -125,8 +126,9 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 
        hpa =  pfn << PAGE_SHIFT;
        if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
-               audit_printk("levels %d pfn %llx hpa %llx ent %llxn",
-                                  vcpu->arch.mmu.root_level, pfn, hpa, *sptep);
+               audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
+                            "ent %llx\n", vcpu->arch.mmu.root_level, pfn,
+                            hpa, *sptep);
 }
 
 static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
@@ -142,8 +144,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
        if (!gfn_to_memslot(kvm, gfn)) {
                if (!printk_ratelimit())
                        return;
-               audit_printk("no memslot for gfn %llx\n", gfn);
-               audit_printk("index %ld of sp (gfn=%llx)\n",
+               audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
+               audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
                       (long int)(sptep - rev_sp->spt), rev_sp->gfn);
                dump_stack();
                return;
@@ -153,7 +155,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
        if (!*rmapp) {
                if (!printk_ratelimit())
                        return;
-               audit_printk("no rmap for writable spte %llx\n", *sptep);
+               audit_printk(kvm, "no rmap for writable spte %llx\n",
+                            *sptep);
                dump_stack();
        }
 }
@@ -168,8 +171,9 @@ static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 {
        struct kvm_mmu_page *sp = page_header(__pa(sptep));
 
-       if (audit_point == AUDIT_POST_SYNC && sp->unsync)
-               audit_printk("meet unsync sp(%p) after sync root.\n", sp);
+       if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
+               audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
+                            "root.\n", sp);
 }
 
 static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -202,8 +206,9 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
        spte = rmap_next(kvm, rmapp, NULL);
        while (spte) {
                if (is_writable_pte(*spte))
-                       audit_printk("shadow page has writable mappings: gfn "
-                                    "%llx role %x\n", sp->gfn, sp->role.word);
+                       audit_printk(kvm, "shadow page has writable "
+                                    "mappings: gfn %llx role %x\n",
+                                    sp->gfn, sp->role.word);
                spte = rmap_next(kvm, rmapp, spte);
        }
 }
@@ -238,7 +243,7 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
        if (!__ratelimit(&ratelimit_state))
                return;
 
-       audit_point = point;
+       vcpu->kvm->arch.audit_point = point;
        audit_all_active_sps(vcpu->kvm);
        audit_vcpu_spte(vcpu);
 }
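
audit_printk() now takes the kvm instance, so the audit point lives in
kvm->arch instead of a file-scope global; that is what allows auditing more
than one guest at the same time. A minimal sketch of the resulting usage,
assuming only the audit_point_name[] table from this file (example_audit() is
illustrative):

    #define audit_printk(kvm, fmt, args...)                        \
            printk(KERN_ERR "audit: (%s) error: " fmt,             \
                   audit_point_name[(kvm)->arch.audit_point], ##args)

    static void example_audit(struct kvm *kvm, int point)
    {
            kvm->arch.audit_point = point;  /* per-VM, no global to race on */
            audit_printk(kvm, "gfn %llx\n", 0ULL);
    }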
index cd7a833..53210f1 100644 (file)
@@ -72,7 +72,7 @@ struct guest_walker {
        unsigned pt_access;
        unsigned pte_access;
        gfn_t gfn;
-       u32 error_code;
+       struct x86_exception fault;
 };
 
 static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
@@ -266,21 +266,23 @@ walk:
        return 1;
 
 error:
-       walker->error_code = 0;
+       walker->fault.vector = PF_VECTOR;
+       walker->fault.error_code_valid = true;
+       walker->fault.error_code = 0;
        if (present)
-               walker->error_code |= PFERR_PRESENT_MASK;
+               walker->fault.error_code |= PFERR_PRESENT_MASK;
 
-       walker->error_code |= write_fault | user_fault;
+       walker->fault.error_code |= write_fault | user_fault;
 
        if (fetch_fault && mmu->nx)
-               walker->error_code |= PFERR_FETCH_MASK;
+               walker->fault.error_code |= PFERR_FETCH_MASK;
        if (rsvd_fault)
-               walker->error_code |= PFERR_RSVD_MASK;
+               walker->fault.error_code |= PFERR_RSVD_MASK;
 
-       vcpu->arch.fault.address    = addr;
-       vcpu->arch.fault.error_code = walker->error_code;
+       walker->fault.address = addr;
+       walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
 
-       trace_kvm_mmu_walker_error(walker->error_code);
+       trace_kvm_mmu_walker_error(walker->fault.error_code);
        return 0;
 }
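
The walker now reports faults through struct x86_exception instead of a bare
u32 error code, carrying the vector, the faulting address and whether the
fault happened during a nested walk. A hedged sketch of populating one, using
only the field names visible in the hunk above (fill_walker_fault() itself is
hypothetical):

    static void fill_walker_fault(struct x86_exception *fault, gva_t addr,
                                  u32 error_code, bool nested)
    {
            fault->vector            = PF_VECTOR;
            fault->error_code_valid  = true;
            fault->error_code        = error_code;
            fault->address           = addr;
            fault->nested_page_fault = nested;
    }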
 
@@ -299,25 +301,42 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
                                        addr, access);
 }
 
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+                                   struct kvm_mmu_page *sp, u64 *spte,
+                                   pt_element_t gpte)
+{
+       u64 nonpresent = shadow_trap_nonpresent_pte;
+
+       if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+               goto no_present;
+
+       if (!is_present_gpte(gpte)) {
+               if (!sp->unsync)
+                       nonpresent = shadow_notrap_nonpresent_pte;
+               goto no_present;
+       }
+
+       if (!(gpte & PT_ACCESSED_MASK))
+               goto no_present;
+
+       return false;
+
+no_present:
+       drop_spte(vcpu->kvm, spte, nonpresent);
+       return true;
+}
+
 static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                              u64 *spte, const void *pte)
 {
        pt_element_t gpte;
        unsigned pte_access;
        pfn_t pfn;
-       u64 new_spte;
 
        gpte = *(const pt_element_t *)pte;
-       if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
-               if (!is_present_gpte(gpte)) {
-                       if (sp->unsync)
-                               new_spte = shadow_trap_nonpresent_pte;
-                       else
-                               new_spte = shadow_notrap_nonpresent_pte;
-                       __set_spte(spte, new_spte);
-               }
+       if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
                return;
-       }
+
        pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
        pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
        if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
@@ -329,7 +348,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                return;
        kvm_get_pfn(pfn);
        /*
-        * we call mmu_set_spte() with reset_host_protection = true beacuse that
+        * we call mmu_set_spte() with host_writable = true because
         * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
         */
        mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
@@ -364,7 +383,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
                                u64 *sptep)
 {
        struct kvm_mmu_page *sp;
-       struct kvm_mmu *mmu = &vcpu->arch.mmu;
        pt_element_t *gptep = gw->prefetch_ptes;
        u64 *spte;
        int i;
@@ -395,14 +413,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 
                gpte = gptep[i];
 
-               if (!is_present_gpte(gpte) ||
-                     is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
-                       if (!sp->unsync)
-                               __set_spte(spte, shadow_notrap_nonpresent_pte);
-                       continue;
-               }
-
-               if (!(gpte & PT_ACCESSED_MASK))
+               if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
                        continue;
 
                pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
@@ -427,7 +438,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                         struct guest_walker *gw,
                         int user_fault, int write_fault, int hlevel,
-                        int *ptwrite, pfn_t pfn)
+                        int *ptwrite, pfn_t pfn, bool map_writable,
+                        bool prefault)
 {
        unsigned access = gw->pt_access;
        struct kvm_mmu_page *sp = NULL;
@@ -501,7 +513,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
        mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
                     user_fault, write_fault, dirty, ptwrite, it.level,
-                    gw->gfn, pfn, false, true);
+                    gw->gfn, pfn, prefault, map_writable);
        FNAME(pte_prefetch)(vcpu, gw, it.sptep);
 
        return it.sptep;
@@ -527,8 +539,8 @@ out_gpte_changed:
  *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
  *           a negative value on error.
  */
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
-                              u32 error_code)
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
+                            bool prefault)
 {
        int write_fault = error_code & PFERR_WRITE_MASK;
        int user_fault = error_code & PFERR_USER_MASK;
@@ -539,6 +551,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        pfn_t pfn;
        int level = PT_PAGE_TABLE_LEVEL;
        unsigned long mmu_seq;
+       bool map_writable;
 
        pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 
@@ -556,8 +569,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
         */
        if (!r) {
                pgprintk("%s: guest page fault\n", __func__);
-               inject_page_fault(vcpu);
-               vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+               if (!prefault) {
+                       inject_page_fault(vcpu, &walker.fault);
+                       /* reset fork detector */
+                       vcpu->arch.last_pt_write_count = 0;
+               }
                return 0;
        }
 
@@ -568,7 +584,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
-       pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
+
+       if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
+                        &map_writable))
+               return 0;
 
        /* mmio */
        if (is_error_pfn(pfn))
@@ -581,7 +600,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
        kvm_mmu_free_some_pages(vcpu);
        sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-                            level, &write_pt, pfn);
+                            level, &write_pt, pfn, map_writable, prefault);
        (void)sptep;
        pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
                 sptep, *sptep, write_pt);
@@ -661,7 +680,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 }
 
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
-                              u32 *error)
+                              struct x86_exception *exception)
 {
        struct guest_walker walker;
        gpa_t gpa = UNMAPPED_GVA;
@@ -672,14 +691,15 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
        if (r) {
                gpa = gfn_to_gpa(walker.gfn);
                gpa |= vaddr & ~PAGE_MASK;
-       } else if (error)
-               *error = walker.error_code;
+       } else if (exception)
+               *exception = walker.fault;
 
        return gpa;
 }
 
 static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
-                                     u32 access, u32 *error)
+                                     u32 access,
+                                     struct x86_exception *exception)
 {
        struct guest_walker walker;
        gpa_t gpa = UNMAPPED_GVA;
@@ -690,8 +710,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
        if (r) {
                gpa = gfn_to_gpa(walker.gfn);
                gpa |= vaddr & ~PAGE_MASK;
-       } else if (error)
-               *error = walker.error_code;
+       } else if (exception)
+               *exception = walker.fault;
 
        return gpa;
 }
@@ -730,12 +750,19 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
  * Using the cached information from sp->gfns is safe because:
  * - The spte has a reference to the struct page, so the pfn for a given gfn
  *   can't change unless all sptes pointing to it are nuked first.
+ *
+ * Note:
+ *   We should flush all TLBs if a spte is dropped, even though the guest is
+ *   responsible for it. If we don't, kvm_mmu_notifier_invalidate_page and
+ *   kvm_mmu_notifier_invalidate_range_start may find that the page is no
+ *   longer mapped by the guest, skip the TLB flush, and leave the guest
+ *   able to access the freed pages.
+ *   We increase kvm->tlbs_dirty to delay the TLB flush in this case.
  */
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-                           bool clear_unsync)
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
        int i, offset, nr_present;
-       bool reset_host_protection;
+       bool host_writable;
        gpa_t first_pte_gpa;
 
        offset = nr_present = 0;
@@ -764,31 +791,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                        return -EINVAL;
 
                gfn = gpte_to_gfn(gpte);
-               if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
-                     || gfn != sp->gfns[i] || !is_present_gpte(gpte)
-                     || !(gpte & PT_ACCESSED_MASK)) {
-                       u64 nonpresent;
 
-                       if (is_present_gpte(gpte) || !clear_unsync)
-                               nonpresent = shadow_trap_nonpresent_pte;
-                       else
-                               nonpresent = shadow_notrap_nonpresent_pte;
-                       drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
+               if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+                       vcpu->kvm->tlbs_dirty++;
+                       continue;
+               }
+
+               if (gfn != sp->gfns[i]) {
+                       drop_spte(vcpu->kvm, &sp->spt[i],
+                                     shadow_trap_nonpresent_pte);
+                       vcpu->kvm->tlbs_dirty++;
                        continue;
                }
 
                nr_present++;
                pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
-               if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
-                       pte_access &= ~ACC_WRITE_MASK;
-                       reset_host_protection = 0;
-               } else {
-                       reset_host_protection = 1;
-               }
+               host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
+
                set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
                         is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
                         spte_to_pfn(sp->spt[i]), true, false,
-                        reset_host_protection);
+                        host_writable);
        }
 
        return !nr_present;
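
When sync_page() drops a spte it now increments kvm->tlbs_dirty instead of
flushing right away, deferring the flush as the Note above describes. A
sketch of that contract, assuming the tlbs_dirty counter introduced by this
series (drop_and_defer() is illustrative shorthand):

    static void drop_and_defer(struct kvm *kvm, u64 *sptep, u64 nonpresent)
    {
            drop_spte(kvm, sptep, nonpresent);
            kvm->tlbs_dirty++;      /* flush is deferred, never skipped */
    }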
index b81a9b7..25bd1bc 100644 (file)
@@ -31,6 +31,7 @@
 
 #include <asm/tlbflush.h>
 #include <asm/desc.h>
+#include <asm/kvm_para.h>
 
 #include <asm/virtext.h>
 #include "trace.h"
@@ -50,6 +51,10 @@ MODULE_LICENSE("GPL");
 #define SVM_FEATURE_LBRV           (1 <<  1)
 #define SVM_FEATURE_SVML           (1 <<  2)
 #define SVM_FEATURE_NRIP           (1 <<  3)
+#define SVM_FEATURE_TSC_RATE       (1 <<  4)
+#define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
+#define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
+#define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
 #define SVM_FEATURE_PAUSE_FILTER   (1 << 10)
 
 #define NESTED_EXIT_HOST       0       /* Exit handled on host level */
@@ -97,10 +102,8 @@ struct nested_state {
        unsigned long vmexit_rax;
 
        /* cache for intercepts of the guest */
-       u16 intercept_cr_read;
-       u16 intercept_cr_write;
-       u16 intercept_dr_read;
-       u16 intercept_dr_write;
+       u32 intercept_cr;
+       u32 intercept_dr;
        u32 intercept_exceptions;
        u64 intercept;
 
@@ -123,7 +126,12 @@ struct vcpu_svm {
        u64 next_rip;
 
        u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
-       u64 host_gs_base;
+       struct {
+               u16 fs;
+               u16 gs;
+               u16 ldt;
+               u64 gs_base;
+       } host;
 
        u32 *msrpm;
 
@@ -133,6 +141,7 @@ struct vcpu_svm {
 
        unsigned int3_injected;
        unsigned long int3_rip;
+       u32 apf_reason;
 };
 
 #define MSR_INVALID                    0xffffffffU
@@ -180,14 +189,151 @@ static int nested_svm_vmexit(struct vcpu_svm *svm);
 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
                                      bool has_error_code, u32 error_code);
 
+enum {
+       VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
+                           pause filter count */
+       VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
+       VMCB_ASID,       /* ASID */
+       VMCB_INTR,       /* int_ctl, int_vector */
+       VMCB_NPT,        /* npt_en, nCR3, gPAT */
+       VMCB_CR,         /* CR0, CR3, CR4, EFER */
+       VMCB_DR,         /* DR6, DR7 */
+       VMCB_DT,         /* GDT, IDT */
+       VMCB_SEG,        /* CS, DS, SS, ES, CPL */
+       VMCB_CR2,        /* CR2 only */
+       VMCB_LBR,        /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
+       VMCB_DIRTY_MAX,
+};
+
+/* TPR and CR2 are always written before VMRUN */
+#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
+
+static inline void mark_all_dirty(struct vmcb *vmcb)
+{
+       vmcb->control.clean = 0;
+}
+
+static inline void mark_all_clean(struct vmcb *vmcb)
+{
+       vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
+                              & ~VMCB_ALWAYS_DIRTY_MASK;
+}
+
+static inline void mark_dirty(struct vmcb *vmcb, int bit)
+{
+       vmcb->control.clean &= ~(1 << bit);
+}
+
 static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 {
        return container_of(vcpu, struct vcpu_svm, vcpu);
 }
 
-static inline bool is_nested(struct vcpu_svm *svm)
+static void recalc_intercepts(struct vcpu_svm *svm)
+{
+       struct vmcb_control_area *c, *h;
+       struct nested_state *g;
+
+       mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+       if (!is_guest_mode(&svm->vcpu))
+               return;
+
+       c = &svm->vmcb->control;
+       h = &svm->nested.hsave->control;
+       g = &svm->nested;
+
+       c->intercept_cr = h->intercept_cr | g->intercept_cr;
+       c->intercept_dr = h->intercept_dr | g->intercept_dr;
+       c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
+       c->intercept = h->intercept | g->intercept;
+}
+
+static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
+{
+       if (is_guest_mode(&svm->vcpu))
+               return svm->nested.hsave;
+       else
+               return svm->vmcb;
+}
+
+static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+       struct vmcb *vmcb = get_host_vmcb(svm);
+
+       vmcb->control.intercept_cr |= (1U << bit);
+
+       recalc_intercepts(svm);
+}
+
+static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+       struct vmcb *vmcb = get_host_vmcb(svm);
+
+       vmcb->control.intercept_cr &= ~(1U << bit);
+
+       recalc_intercepts(svm);
+}
+
+static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+       struct vmcb *vmcb = get_host_vmcb(svm);
+
+       return vmcb->control.intercept_cr & (1U << bit);
+}
+
+static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
+{
+       struct vmcb *vmcb = get_host_vmcb(svm);
+
+       vmcb->control.intercept_dr |= (1U << bit);
+
+       recalc_intercepts(svm);
+}
+
+static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
+{
+       struct vmcb *vmcb = get_host_vmcb(svm);
+
+       vmcb->control.intercept_dr &= ~(1U << bit);
+
+       recalc_intercepts(svm);
+}
+
+static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
+{
+       struct vmcb *vmcb = get_host_vmcb(svm);
+
+       vmcb->control.intercept_exceptions |= (1U << bit);
+
+       recalc_intercepts(svm);
+}
+
+static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
 {
-       return svm->nested.vmcb;
+       struct vmcb *vmcb = get_host_vmcb(svm);
+
+       vmcb->control.intercept_exceptions &= ~(1U << bit);
+
+       recalc_intercepts(svm);
+}
+
+static inline void set_intercept(struct vcpu_svm *svm, int bit)
+{
+       struct vmcb *vmcb = get_host_vmcb(svm);
+
+       vmcb->control.intercept |= (1ULL << bit);
+
+       recalc_intercepts(svm);
+}
+
+static inline void clr_intercept(struct vcpu_svm *svm, int bit)
+{
+       struct vmcb *vmcb = get_host_vmcb(svm);
+
+       vmcb->control.intercept &= ~(1ULL << bit);
+
+       recalc_intercepts(svm);
 }
 
 static inline void enable_gif(struct vcpu_svm *svm)
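
The helpers above implement the VMCB clean-bits protocol: hardware may cache
any VMCB field group whose clean bit is set, so software clears the matching
bit whenever it writes that group. A sketch of the intended lifecycle, using
only the helpers added in this hunk (vmcb_clean_lifecycle() is illustrative):

    static void vmcb_clean_lifecycle(struct vmcb *vmcb)
    {
            mark_all_dirty(vmcb);        /* new or migrated VMCB: reload all */
            /* ... VMRUN ... */
            mark_all_clean(vmcb);        /* hardware may cache from here on */

            vmcb->save.dr7 = 0;          /* touch debug state ... */
            mark_dirty(vmcb, VMCB_DR);   /* ... and invalidate its clean bit */
    }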
@@ -264,11 +410,6 @@ static u32 svm_msrpm_offset(u32 msr)
 
 #define MAX_INST_SIZE 15
 
-static inline u32 svm_has(u32 feat)
-{
-       return svm_features & feat;
-}
-
 static inline void clgi(void)
 {
        asm volatile (__ex(SVM_CLGI));
@@ -284,16 +425,6 @@ static inline void invlpga(unsigned long addr, u32 asid)
        asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
 }
 
-static inline void force_new_asid(struct kvm_vcpu *vcpu)
-{
-       to_svm(vcpu)->asid_generation--;
-}
-
-static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
-{
-       force_new_asid(vcpu);
-}
-
 static int get_npt_level(void)
 {
 #ifdef CONFIG_X86_64
@@ -310,6 +441,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
                efer &= ~EFER_LME;
 
        to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
+       mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
 }
 
 static int is_external_interrupt(u32 info)
@@ -347,7 +479,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
                svm->next_rip = svm->vmcb->control.next_rip;
 
        if (!svm->next_rip) {
-               if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
+               if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
                                EMULATE_DONE)
                        printk(KERN_DEBUG "%s: NOP\n", __func__);
                return;
@@ -374,7 +506,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
            nested_svm_check_exception(svm, nr, has_error_code, error_code))
                return;
 
-       if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) {
+       if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
                unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
 
                /*
@@ -670,7 +802,7 @@ static __init int svm_hardware_setup(void)
 
        svm_features = cpuid_edx(SVM_CPUID_FUNC);
 
-       if (!svm_has(SVM_FEATURE_NPT))
+       if (!boot_cpu_has(X86_FEATURE_NPT))
                npt_enabled = false;
 
        if (npt_enabled && !npt) {
@@ -725,13 +857,15 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
        struct vcpu_svm *svm = to_svm(vcpu);
        u64 g_tsc_offset = 0;
 
-       if (is_nested(svm)) {
+       if (is_guest_mode(vcpu)) {
                g_tsc_offset = svm->vmcb->control.tsc_offset -
                               svm->nested.hsave->control.tsc_offset;
                svm->nested.hsave->control.tsc_offset = offset;
        }
 
        svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
+
+       mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 }
 
 static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
@@ -739,8 +873,9 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
        struct vcpu_svm *svm = to_svm(vcpu);
 
        svm->vmcb->control.tsc_offset += adjustment;
-       if (is_nested(svm))
+       if (is_guest_mode(vcpu))
                svm->nested.hsave->control.tsc_offset += adjustment;
+       mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 }
 
 static void init_vmcb(struct vcpu_svm *svm)
@@ -749,62 +884,62 @@ static void init_vmcb(struct vcpu_svm *svm)
        struct vmcb_save_area *save = &svm->vmcb->save;
 
        svm->vcpu.fpu_active = 1;
+       svm->vcpu.arch.hflags = 0;
 
-       control->intercept_cr_read =    INTERCEPT_CR0_MASK |
-                                       INTERCEPT_CR3_MASK |
-                                       INTERCEPT_CR4_MASK;
-
-       control->intercept_cr_write =   INTERCEPT_CR0_MASK |
-                                       INTERCEPT_CR3_MASK |
-                                       INTERCEPT_CR4_MASK |
-                                       INTERCEPT_CR8_MASK;
-
-       control->intercept_dr_read =    INTERCEPT_DR0_MASK |
-                                       INTERCEPT_DR1_MASK |
-                                       INTERCEPT_DR2_MASK |
-                                       INTERCEPT_DR3_MASK |
-                                       INTERCEPT_DR4_MASK |
-                                       INTERCEPT_DR5_MASK |
-                                       INTERCEPT_DR6_MASK |
-                                       INTERCEPT_DR7_MASK;
-
-       control->intercept_dr_write =   INTERCEPT_DR0_MASK |
-                                       INTERCEPT_DR1_MASK |
-                                       INTERCEPT_DR2_MASK |
-                                       INTERCEPT_DR3_MASK |
-                                       INTERCEPT_DR4_MASK |
-                                       INTERCEPT_DR5_MASK |
-                                       INTERCEPT_DR6_MASK |
-                                       INTERCEPT_DR7_MASK;
-
-       control->intercept_exceptions = (1 << PF_VECTOR) |
-                                       (1 << UD_VECTOR) |
-                                       (1 << MC_VECTOR);
-
-
-       control->intercept =    (1ULL << INTERCEPT_INTR) |
-                               (1ULL << INTERCEPT_NMI) |
-                               (1ULL << INTERCEPT_SMI) |
-                               (1ULL << INTERCEPT_SELECTIVE_CR0) |
-                               (1ULL << INTERCEPT_CPUID) |
-                               (1ULL << INTERCEPT_INVD) |
-                               (1ULL << INTERCEPT_HLT) |
-                               (1ULL << INTERCEPT_INVLPG) |
-                               (1ULL << INTERCEPT_INVLPGA) |
-                               (1ULL << INTERCEPT_IOIO_PROT) |
-                               (1ULL << INTERCEPT_MSR_PROT) |
-                               (1ULL << INTERCEPT_TASK_SWITCH) |
-                               (1ULL << INTERCEPT_SHUTDOWN) |
-                               (1ULL << INTERCEPT_VMRUN) |
-                               (1ULL << INTERCEPT_VMMCALL) |
-                               (1ULL << INTERCEPT_VMLOAD) |
-                               (1ULL << INTERCEPT_VMSAVE) |
-                               (1ULL << INTERCEPT_STGI) |
-                               (1ULL << INTERCEPT_CLGI) |
-                               (1ULL << INTERCEPT_SKINIT) |
-                               (1ULL << INTERCEPT_WBINVD) |
-                               (1ULL << INTERCEPT_MONITOR) |
-                               (1ULL << INTERCEPT_MWAIT);
+       set_cr_intercept(svm, INTERCEPT_CR0_READ);
+       set_cr_intercept(svm, INTERCEPT_CR3_READ);
+       set_cr_intercept(svm, INTERCEPT_CR4_READ);
+       set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+       set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
+       set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
+       set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+       set_dr_intercept(svm, INTERCEPT_DR0_READ);
+       set_dr_intercept(svm, INTERCEPT_DR1_READ);
+       set_dr_intercept(svm, INTERCEPT_DR2_READ);
+       set_dr_intercept(svm, INTERCEPT_DR3_READ);
+       set_dr_intercept(svm, INTERCEPT_DR4_READ);
+       set_dr_intercept(svm, INTERCEPT_DR5_READ);
+       set_dr_intercept(svm, INTERCEPT_DR6_READ);
+       set_dr_intercept(svm, INTERCEPT_DR7_READ);
+
+       set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
+       set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
+       set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
+       set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
+       set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
+       set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
+       set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
+       set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
+
+       set_exception_intercept(svm, PF_VECTOR);
+       set_exception_intercept(svm, UD_VECTOR);
+       set_exception_intercept(svm, MC_VECTOR);
+
+       set_intercept(svm, INTERCEPT_INTR);
+       set_intercept(svm, INTERCEPT_NMI);
+       set_intercept(svm, INTERCEPT_SMI);
+       set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+       set_intercept(svm, INTERCEPT_CPUID);
+       set_intercept(svm, INTERCEPT_INVD);
+       set_intercept(svm, INTERCEPT_HLT);
+       set_intercept(svm, INTERCEPT_INVLPG);
+       set_intercept(svm, INTERCEPT_INVLPGA);
+       set_intercept(svm, INTERCEPT_IOIO_PROT);
+       set_intercept(svm, INTERCEPT_MSR_PROT);
+       set_intercept(svm, INTERCEPT_TASK_SWITCH);
+       set_intercept(svm, INTERCEPT_SHUTDOWN);
+       set_intercept(svm, INTERCEPT_VMRUN);
+       set_intercept(svm, INTERCEPT_VMMCALL);
+       set_intercept(svm, INTERCEPT_VMLOAD);
+       set_intercept(svm, INTERCEPT_VMSAVE);
+       set_intercept(svm, INTERCEPT_STGI);
+       set_intercept(svm, INTERCEPT_CLGI);
+       set_intercept(svm, INTERCEPT_SKINIT);
+       set_intercept(svm, INTERCEPT_WBINVD);
+       set_intercept(svm, INTERCEPT_MONITOR);
+       set_intercept(svm, INTERCEPT_MWAIT);
+       set_intercept(svm, INTERCEPT_XSETBV);
 
        control->iopm_base_pa = iopm_base;
        control->msrpm_base_pa = __pa(svm->msrpm);
@@ -855,25 +990,27 @@ static void init_vmcb(struct vcpu_svm *svm)
        if (npt_enabled) {
                /* Setup VMCB for Nested Paging */
                control->nested_ctl = 1;
-               control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
-                                       (1ULL << INTERCEPT_INVLPG));
-               control->intercept_exceptions &= ~(1 << PF_VECTOR);
-               control->intercept_cr_read &= ~INTERCEPT_CR3_MASK;
-               control->intercept_cr_write &= ~INTERCEPT_CR3_MASK;
+               clr_intercept(svm, INTERCEPT_TASK_SWITCH);
+               clr_intercept(svm, INTERCEPT_INVLPG);
+               clr_exception_intercept(svm, PF_VECTOR);
+               clr_cr_intercept(svm, INTERCEPT_CR3_READ);
+               clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
                save->g_pat = 0x0007040600070406ULL;
                save->cr3 = 0;
                save->cr4 = 0;
        }
-       force_new_asid(&svm->vcpu);
+       svm->asid_generation = 0;
 
        svm->nested.vmcb = 0;
        svm->vcpu.arch.hflags = 0;
 
-       if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
+       if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
                control->pause_filter_count = 3000;
-               control->intercept |= (1ULL << INTERCEPT_PAUSE);
+               set_intercept(svm, INTERCEPT_PAUSE);
        }
 
+       mark_all_dirty(svm->vmcb);
+
        enable_gif(svm);
 }
 
@@ -990,8 +1127,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
        if (unlikely(cpu != vcpu->cpu)) {
                svm->asid_generation = 0;
+               mark_all_dirty(svm->vmcb);
        }
 
+#ifdef CONFIG_X86_64
+       rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
+#endif
+       savesegment(fs, svm->host.fs);
+       savesegment(gs, svm->host.gs);
+       svm->host.ldt = kvm_read_ldt();
+
        for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
                rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 }
@@ -1002,6 +1147,14 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
        int i;
 
        ++vcpu->stat.host_state_reload;
+       kvm_load_ldt(svm->host.ldt);
+#ifdef CONFIG_X86_64
+       loadsegment(fs, svm->host.fs);
+       load_gs_index(svm->host.gs);
+       wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+#else
+       loadsegment(gs, svm->host.gs);
+#endif
        for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
                wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 }
@@ -1021,7 +1174,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
        switch (reg) {
        case VCPU_EXREG_PDPTR:
                BUG_ON(!npt_enabled);
-               load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
+               load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
                break;
        default:
                BUG();
@@ -1030,12 +1183,12 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 
 static void svm_set_vintr(struct vcpu_svm *svm)
 {
-       svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
+       set_intercept(svm, INTERCEPT_VINTR);
 }
 
 static void svm_clear_vintr(struct vcpu_svm *svm)
 {
-       svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
+       clr_intercept(svm, INTERCEPT_VINTR);
 }
 
 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@ -1150,6 +1303,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 
        svm->vmcb->save.idtr.limit = dt->size;
        svm->vmcb->save.idtr.base = dt->address;
+       mark_dirty(svm->vmcb, VMCB_DT);
 }
 
 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
@@ -1166,19 +1320,23 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 
        svm->vmcb->save.gdtr.limit = dt->size;
        svm->vmcb->save.gdtr.base = dt->address;
+       mark_dirty(svm->vmcb, VMCB_DT);
 }
 
 static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 {
 }
 
+static void svm_decache_cr3(struct kvm_vcpu *vcpu)
+{
+}
+
 static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 {
 }
 
 static void update_cr0_intercept(struct vcpu_svm *svm)
 {
-       struct vmcb *vmcb = svm->vmcb;
        ulong gcr0 = svm->vcpu.arch.cr0;
        u64 *hcr0 = &svm->vmcb->save.cr0;
 
@@ -1188,27 +1346,14 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
                *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
                        | (gcr0 & SVM_CR0_SELECTIVE_MASK);
 
+       mark_dirty(svm->vmcb, VMCB_CR);
 
        if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
-               vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
-               vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
-               if (is_nested(svm)) {
-                       struct vmcb *hsave = svm->nested.hsave;
-
-                       hsave->control.intercept_cr_read  &= ~INTERCEPT_CR0_MASK;
-                       hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
-                       vmcb->control.intercept_cr_read  |= svm->nested.intercept_cr_read;
-                       vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
-               }
+               clr_cr_intercept(svm, INTERCEPT_CR0_READ);
+               clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
        } else {
-               svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
-               svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
-               if (is_nested(svm)) {
-                       struct vmcb *hsave = svm->nested.hsave;
-
-                       hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
-                       hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
-               }
+               set_cr_intercept(svm, INTERCEPT_CR0_READ);
+               set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
        }
 }
 
@@ -1216,7 +1361,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       if (is_nested(svm)) {
+       if (is_guest_mode(vcpu)) {
                /*
                 * We are here because we run in nested mode, the host kvm
                 * intercepts cr0 writes but the l1 hypervisor does not.
@@ -1268,6 +1413,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
         */
        cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
        svm->vmcb->save.cr0 = cr0;
+       mark_dirty(svm->vmcb, VMCB_CR);
        update_cr0_intercept(svm);
 }
 
@@ -1277,13 +1423,14 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
 
        if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
-               force_new_asid(vcpu);
+               svm_flush_tlb(vcpu);
 
        vcpu->arch.cr4 = cr4;
        if (!npt_enabled)
                cr4 |= X86_CR4_PAE;
        cr4 |= host_cr4_mce;
        to_svm(vcpu)->vmcb->save.cr4 = cr4;
+       mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
 }
 
 static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -1312,26 +1459,25 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
                        = (svm->vmcb->save.cs.attrib
                           >> SVM_SELECTOR_DPL_SHIFT) & 3;
 
+       mark_dirty(svm->vmcb, VMCB_SEG);
 }
 
 static void update_db_intercept(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       svm->vmcb->control.intercept_exceptions &=
-               ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
+       clr_exception_intercept(svm, DB_VECTOR);
+       clr_exception_intercept(svm, BP_VECTOR);
 
        if (svm->nmi_singlestep)
-               svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
+               set_exception_intercept(svm, DB_VECTOR);
 
        if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
                if (vcpu->guest_debug &
                    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
-                       svm->vmcb->control.intercept_exceptions |=
-                               1 << DB_VECTOR;
+                       set_exception_intercept(svm, DB_VECTOR);
                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
-                       svm->vmcb->control.intercept_exceptions |=
-                               1 << BP_VECTOR;
+                       set_exception_intercept(svm, BP_VECTOR);
        } else
                vcpu->guest_debug = 0;
 }
@@ -1345,21 +1491,9 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
        else
                svm->vmcb->save.dr7 = vcpu->arch.dr7;
 
-       update_db_intercept(vcpu);
-}
-
-static void load_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
-       wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
-#endif
-}
+       mark_dirty(svm->vmcb, VMCB_DR);
 
-static void save_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
-       rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
-#endif
+       update_db_intercept(vcpu);
 }
 
 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
@@ -1372,6 +1506,8 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
 
        svm->asid_generation = sd->asid_generation;
        svm->vmcb->control.asid = sd->next_asid++;
+
+       mark_dirty(svm->vmcb, VMCB_ASID);
 }
 
 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
@@ -1379,20 +1515,40 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
        struct vcpu_svm *svm = to_svm(vcpu);
 
        svm->vmcb->save.dr7 = value;
+       mark_dirty(svm->vmcb, VMCB_DR);
 }
 
 static int pf_interception(struct vcpu_svm *svm)
 {
-       u64 fault_address;
+       u64 fault_address = svm->vmcb->control.exit_info_2;
        u32 error_code;
+       int r = 1;
 
-       fault_address  = svm->vmcb->control.exit_info_2;
-       error_code = svm->vmcb->control.exit_info_1;
-
-       trace_kvm_page_fault(fault_address, error_code);
-       if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
-               kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
-       return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
+       switch (svm->apf_reason) {
+       default:
+               error_code = svm->vmcb->control.exit_info_1;
+
+               trace_kvm_page_fault(fault_address, error_code);
+               if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
+                       kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
+               r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+                       svm->vmcb->control.insn_bytes,
+                       svm->vmcb->control.insn_len);
+               break;
+       case KVM_PV_REASON_PAGE_NOT_PRESENT:
+               svm->apf_reason = 0;
+               local_irq_disable();
+               kvm_async_pf_task_wait(fault_address);
+               local_irq_enable();
+               break;
+       case KVM_PV_REASON_PAGE_READY:
+               svm->apf_reason = 0;
+               local_irq_disable();
+               kvm_async_pf_task_wake(fault_address);
+               local_irq_enable();
+               break;
+       }
+       return r;
 }
 
 static int db_interception(struct vcpu_svm *svm)
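
pf_interception() now dispatches on svm->apf_reason: an ordinary #PF goes to
the MMU as before, while the two paravirt reasons park or wake the faulting
guest task with interrupts disabled. A stripped-down sketch of the dispatch
(handle_mmu_fault() is hypothetical shorthand for the kvm_mmu_page_fault()
path above):

    switch (apf_reason) {
    case 0:                               /* ordinary #PF */
            handle_mmu_fault(fault_address);
            break;
    case KVM_PV_REASON_PAGE_NOT_PRESENT:  /* host is still paging it in */
            kvm_async_pf_task_wait(fault_address);
            break;
    case KVM_PV_REASON_PAGE_READY:        /* host finished paging it in */
            kvm_async_pf_task_wake(fault_address);
            break;
    }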
@@ -1440,7 +1596,7 @@ static int ud_interception(struct vcpu_svm *svm)
 {
        int er;
 
-       er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD);
+       er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
        if (er != EMULATE_DONE)
                kvm_queue_exception(&svm->vcpu, UD_VECTOR);
        return 1;
@@ -1449,21 +1605,8 @@ static int ud_interception(struct vcpu_svm *svm)
 static void svm_fpu_activate(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       u32 excp;
-
-       if (is_nested(svm)) {
-               u32 h_excp, n_excp;
-
-               h_excp  = svm->nested.hsave->control.intercept_exceptions;
-               n_excp  = svm->nested.intercept_exceptions;
-               h_excp &= ~(1 << NM_VECTOR);
-               excp    = h_excp | n_excp;
-       } else {
-               excp  = svm->vmcb->control.intercept_exceptions;
-               excp &= ~(1 << NM_VECTOR);
-       }
 
-       svm->vmcb->control.intercept_exceptions = excp;
+       clr_exception_intercept(svm, NM_VECTOR);
 
        svm->vcpu.fpu_active = 1;
        update_cr0_intercept(svm);
@@ -1570,7 +1713,7 @@ static int io_interception(struct vcpu_svm *svm)
        string = (io_info & SVM_IOIO_STR_MASK) != 0;
        in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
        if (string || in)
-               return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+               return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 
        port = io_info >> 16;
        size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1624,17 +1767,19 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
        struct vcpu_svm *svm = to_svm(vcpu);
 
        svm->vmcb->control.nested_cr3 = root;
-       force_new_asid(vcpu);
+       mark_dirty(svm->vmcb, VMCB_NPT);
+       svm_flush_tlb(vcpu);
 }
 
-static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu)
+static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
+                                      struct x86_exception *fault)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
        svm->vmcb->control.exit_code = SVM_EXIT_NPF;
        svm->vmcb->control.exit_code_hi = 0;
-       svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code;
-       svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address;
+       svm->vmcb->control.exit_info_1 = fault->error_code;
+       svm->vmcb->control.exit_info_2 = fault->address;
 
        nested_svm_vmexit(svm);
 }
@@ -1680,7 +1825,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 {
        int vmexit;
 
-       if (!is_nested(svm))
+       if (!is_guest_mode(&svm->vcpu))
                return 0;
 
        svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
@@ -1698,7 +1843,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 /* This function returns true if it is safe to enable the irq window */
 static inline bool nested_svm_intr(struct vcpu_svm *svm)
 {
-       if (!is_nested(svm))
+       if (!is_guest_mode(&svm->vcpu))
                return true;
 
        if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
@@ -1737,7 +1882,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
 /* This function returns true if it is safe to enable the nmi window */
 static inline bool nested_svm_nmi(struct vcpu_svm *svm)
 {
-       if (!is_nested(svm))
+       if (!is_guest_mode(&svm->vcpu))
                return true;
 
        if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
@@ -1836,8 +1981,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
                        return NESTED_EXIT_HOST;
                break;
        case SVM_EXIT_EXCP_BASE + PF_VECTOR:
-               /* When we're shadowing, trap PFs */
-               if (!npt_enabled)
+               /* When we're shadowing, trap PFs, but not async PF */
+               if (!npt_enabled && svm->apf_reason == 0)
                        return NESTED_EXIT_HOST;
                break;
        case SVM_EXIT_EXCP_BASE + NM_VECTOR:
@@ -1865,27 +2010,15 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
        case SVM_EXIT_IOIO:
                vmexit = nested_svm_intercept_ioio(svm);
                break;
-       case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
-               u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
-               if (svm->nested.intercept_cr_read & cr_bits)
+       case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
+               u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
+               if (svm->nested.intercept_cr & bit)
                        vmexit = NESTED_EXIT_DONE;
                break;
        }
-       case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
-               u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
-               if (svm->nested.intercept_cr_write & cr_bits)
-                       vmexit = NESTED_EXIT_DONE;
-               break;
-       }
-       case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
-               u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
-               if (svm->nested.intercept_dr_read & dr_bits)
-                       vmexit = NESTED_EXIT_DONE;
-               break;
-       }
-       case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
-               u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
-               if (svm->nested.intercept_dr_write & dr_bits)
+       case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
+               u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
+               if (svm->nested.intercept_dr & bit)
                        vmexit = NESTED_EXIT_DONE;
                break;
        }
@@ -1893,6 +2026,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
                u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
                if (svm->nested.intercept_exceptions & excp_bits)
                        vmexit = NESTED_EXIT_DONE;
+               /* an async page fault always causes a vmexit */
+               else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
+                        svm->apf_reason != 0)
+                       vmexit = NESTED_EXIT_DONE;
                break;
        }
        case SVM_EXIT_ERR: {
@@ -1926,10 +2063,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
        struct vmcb_control_area *dst  = &dst_vmcb->control;
        struct vmcb_control_area *from = &from_vmcb->control;
 
-       dst->intercept_cr_read    = from->intercept_cr_read;
-       dst->intercept_cr_write   = from->intercept_cr_write;
-       dst->intercept_dr_read    = from->intercept_dr_read;
-       dst->intercept_dr_write   = from->intercept_dr_write;
+       dst->intercept_cr         = from->intercept_cr;
+       dst->intercept_dr         = from->intercept_dr;
        dst->intercept_exceptions = from->intercept_exceptions;
        dst->intercept            = from->intercept;
        dst->iopm_base_pa         = from->iopm_base_pa;
@@ -1970,7 +2105,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
        if (!nested_vmcb)
                return 1;
 
-       /* Exit nested SVM mode */
+       /* Exit Guest-Mode */
+       leave_guest_mode(&svm->vcpu);
        svm->nested.vmcb = 0;
 
        /* Give the current vmcb to the guest */
@@ -1984,7 +2120,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
        nested_vmcb->save.idtr   = vmcb->save.idtr;
        nested_vmcb->save.efer   = svm->vcpu.arch.efer;
        nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
-       nested_vmcb->save.cr3    = svm->vcpu.arch.cr3;
+       nested_vmcb->save.cr3    = kvm_read_cr3(&svm->vcpu);
        nested_vmcb->save.cr2    = vmcb->save.cr2;
        nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
        nested_vmcb->save.rflags = vmcb->save.rflags;
@@ -2061,6 +2197,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
        svm->vmcb->save.cpl = 0;
        svm->vmcb->control.exit_int_info = 0;
 
+       mark_all_dirty(svm->vmcb);
+
        nested_svm_unmap(page);
 
        nested_svm_uninit_mmu_context(&svm->vcpu);
@@ -2148,8 +2286,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
                               nested_vmcb->control.event_inj,
                               nested_vmcb->control.nested_ctl);
 
-       trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read,
-                                   nested_vmcb->control.intercept_cr_write,
+       trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
+                                   nested_vmcb->control.intercept_cr >> 16,
                                    nested_vmcb->control.intercept_exceptions,
                                    nested_vmcb->control.intercept);
 
@@ -2177,7 +2315,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
        if (npt_enabled)
                hsave->save.cr3    = vmcb->save.cr3;
        else
-               hsave->save.cr3    = svm->vcpu.arch.cr3;
+               hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
 
        copy_vmcb_control_area(hsave, vmcb);
 
@@ -2229,14 +2367,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
        svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
 
        /* cache intercepts */
-       svm->nested.intercept_cr_read    = nested_vmcb->control.intercept_cr_read;
-       svm->nested.intercept_cr_write   = nested_vmcb->control.intercept_cr_write;
-       svm->nested.intercept_dr_read    = nested_vmcb->control.intercept_dr_read;
-       svm->nested.intercept_dr_write   = nested_vmcb->control.intercept_dr_write;
+       svm->nested.intercept_cr         = nested_vmcb->control.intercept_cr;
+       svm->nested.intercept_dr         = nested_vmcb->control.intercept_dr;
        svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
        svm->nested.intercept            = nested_vmcb->control.intercept;
 
-       force_new_asid(&svm->vcpu);
+       svm_flush_tlb(&svm->vcpu);
        svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
        if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
                svm->vcpu.arch.hflags |= HF_VINTR_MASK;
@@ -2245,29 +2381,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 
        if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
                /* We only want the cr8 intercept bits of the guest */
-               svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK;
-               svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+               clr_cr_intercept(svm, INTERCEPT_CR8_READ);
+               clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
        }
 
        /* We don't want to see VMMCALLs from a nested guest */
-       svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL);
-
-       /*
-        * We don't want a nested guest to be more powerful than the guest, so
-        * all intercepts are ORed
-        */
-       svm->vmcb->control.intercept_cr_read |=
-               nested_vmcb->control.intercept_cr_read;
-       svm->vmcb->control.intercept_cr_write |=
-               nested_vmcb->control.intercept_cr_write;
-       svm->vmcb->control.intercept_dr_read |=
-               nested_vmcb->control.intercept_dr_read;
-       svm->vmcb->control.intercept_dr_write |=
-               nested_vmcb->control.intercept_dr_write;
-       svm->vmcb->control.intercept_exceptions |=
-               nested_vmcb->control.intercept_exceptions;
-
-       svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
+       clr_intercept(svm, INTERCEPT_VMMCALL);
 
        svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
        svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
@@ -2278,11 +2397,21 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 
        nested_svm_unmap(page);
 
-       /* nested_vmcb is our indicator if nested SVM is activated */
+       /* Enter Guest-Mode */
+       enter_guest_mode(&svm->vcpu);
+
+       /*
+        * Merge guest and host intercepts - must be called with the vcpu in
+        * guest-mode to take effect here
+        */
+       recalc_intercepts(svm);
+
        svm->nested.vmcb = vmcb_gpa;
 
        enable_gif(svm);
 
+       mark_all_dirty(svm->vmcb);
+
        return true;
 }
 
@@ -2400,6 +2529,8 @@ static int clgi_interception(struct vcpu_svm *svm)
        svm_clear_vintr(svm);
        svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
 
+       mark_dirty(svm->vmcb, VMCB_INTR);
+
        return 1;
 }
 
@@ -2426,6 +2557,19 @@ static int skinit_interception(struct vcpu_svm *svm)
        return 1;
 }
 
+static int xsetbv_interception(struct vcpu_svm *svm)
+{
+       u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
+       u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+
+       if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
+               svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+               skip_emulated_instruction(&svm->vcpu);
+       }
+
+       return 1;
+}
+
 static int invalid_op_interception(struct vcpu_svm *svm)
 {
        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
@@ -2507,19 +2651,92 @@ static int cpuid_interception(struct vcpu_svm *svm)
 static int iret_interception(struct vcpu_svm *svm)
 {
        ++svm->vcpu.stat.nmi_window_exits;
-       svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
+       clr_intercept(svm, INTERCEPT_IRET);
        svm->vcpu.arch.hflags |= HF_IRET_MASK;
        return 1;
 }
 
 static int invlpg_interception(struct vcpu_svm *svm)
 {
-       return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
+       if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+               return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+
+       kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
+       skip_emulated_instruction(&svm->vcpu);
+       return 1;
 }
 
 static int emulate_on_interception(struct vcpu_svm *svm)
 {
-       return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
+       return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+}
+
+#define CR_VALID (1ULL << 63)
+
+static int cr_interception(struct vcpu_svm *svm)
+{
+       int reg, cr;
+       unsigned long val;
+       int err;
+
+       if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+               return emulate_on_interception(svm);
+
+       if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
+               return emulate_on_interception(svm);
+
+       reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+       cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
+
+       err = 0;
+       if (cr >= 16) { /* mov to cr */
+               cr -= 16;
+               val = kvm_register_read(&svm->vcpu, reg);
+               switch (cr) {
+               case 0:
+                       err = kvm_set_cr0(&svm->vcpu, val);
+                       break;
+               case 3:
+                       err = kvm_set_cr3(&svm->vcpu, val);
+                       break;
+               case 4:
+                       err = kvm_set_cr4(&svm->vcpu, val);
+                       break;
+               case 8:
+                       err = kvm_set_cr8(&svm->vcpu, val);
+                       break;
+               default:
+                       WARN(1, "unhandled write to CR%d", cr);
+                       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+                       return 1;
+               }
+       } else { /* mov from cr */
+               switch (cr) {
+               case 0:
+                       val = kvm_read_cr0(&svm->vcpu);
+                       break;
+               case 2:
+                       val = svm->vcpu.arch.cr2;
+                       break;
+               case 3:
+                       val = kvm_read_cr3(&svm->vcpu);
+                       break;
+               case 4:
+                       val = kvm_read_cr4(&svm->vcpu);
+                       break;
+               case 8:
+                       val = kvm_get_cr8(&svm->vcpu);
+                       break;
+               default:
+                       WARN(1, "unhandled read from CR%d", cr);
+                       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+                       return 1;
+               }
+               kvm_register_write(&svm->vcpu, reg, val);
+       }
+       kvm_complete_insn_gp(&svm->vcpu, err);
+
+       return 1;
 }
 
 static int cr0_write_interception(struct vcpu_svm *svm)
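
cr_interception() relies on the decode-assist contract: bit 63 of exit_info_1
(CR_VALID) marks a hardware-decoded MOV CRx, the low bits
(SVM_EXITINFO_REG_MASK) name the GPR, and exit codes at offset 16 and above
from SVM_EXIT_READ_CR0 are writes. A sketch of that decode step in isolation
(decode_cr_exit() is illustrative):

    static bool decode_cr_exit(u64 exit_info_1, u32 exit_code,
                               int *reg, int *cr, bool *write)
    {
            if (!(exit_info_1 & CR_VALID))
                    return false;        /* no assist: fall back to emulation */
            *reg   = exit_info_1 & SVM_EXITINFO_REG_MASK;
            *cr    = exit_code - SVM_EXIT_READ_CR0;
            *write = *cr >= 16;
            if (*write)
                    *cr -= 16;
            return true;
    }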
@@ -2527,7 +2744,7 @@ static int cr0_write_interception(struct vcpu_svm *svm)
        struct kvm_vcpu *vcpu = &svm->vcpu;
        int r;
 
-       r = emulate_instruction(&svm->vcpu, 0, 0, 0);
+       r = cr_interception(svm);
 
        if (svm->nested.vmexit_rip) {
                kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
@@ -2536,22 +2753,47 @@ static int cr0_write_interception(struct vcpu_svm *svm)
                svm->nested.vmexit_rip = 0;
        }
 
-       return r == EMULATE_DONE;
+       return r;
+}
+
+static int dr_interception(struct vcpu_svm *svm)
+{
+       int reg, dr;
+       unsigned long val;
+       int err;
+
+       if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
+               return emulate_on_interception(svm);
+
+       reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+       dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
+
+       if (dr >= 16) { /* mov to DRn */
+               val = kvm_register_read(&svm->vcpu, reg);
+               kvm_set_dr(&svm->vcpu, dr - 16, val);
+       } else {
+               err = kvm_get_dr(&svm->vcpu, dr, &val);
+               if (!err)
+                       kvm_register_write(&svm->vcpu, reg, val);
+       }
+
+       return 1;
 }
 
 static int cr8_write_interception(struct vcpu_svm *svm)
 {
        struct kvm_run *kvm_run = svm->vcpu.run;
+       int r;
 
        u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
        /* instruction emulation calls kvm_set_cr8() */
-       emulate_instruction(&svm->vcpu, 0, 0, 0);
+       r = cr_interception(svm);
        if (irqchip_in_kernel(svm->vcpu.kvm)) {
-               svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
-               return 1;
+               clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+               return r;
        }
        if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
-               return 1;
+               return r;
        kvm_run->exit_reason = KVM_EXIT_SET_TPR;
        return 0;
 }
@@ -2562,14 +2804,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 
        switch (ecx) {
        case MSR_IA32_TSC: {
-               u64 tsc_offset;
+               struct vmcb *vmcb = get_host_vmcb(svm);
 
-               if (is_nested(svm))
-                       tsc_offset = svm->nested.hsave->control.tsc_offset;
-               else
-                       tsc_offset = svm->vmcb->control.tsc_offset;
-
-               *data = tsc_offset + native_read_tsc();
+               *data = vmcb->control.tsc_offset + native_read_tsc();
                break;
        }
        case MSR_STAR:
@@ -2714,7 +2951,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
                svm->vmcb->save.sysenter_esp = data;
                break;
        case MSR_IA32_DEBUGCTLMSR:
-               if (!svm_has(SVM_FEATURE_LBRV)) {
+               if (!boot_cpu_has(X86_FEATURE_LBRV)) {
                        pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
                                        __func__, data);
                        break;
@@ -2723,6 +2960,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
                        return 1;
 
                svm->vmcb->save.dbgctl = data;
+               mark_dirty(svm->vmcb, VMCB_LBR);
                if (data & (1ULL<<0))
                        svm_enable_lbrv(svm);
                else
@@ -2775,6 +3013,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
        svm_clear_vintr(svm);
        svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+       mark_dirty(svm->vmcb, VMCB_INTR);
        /*
         * If user space is waiting to inject interrupts, exit as soon as

         * possible
@@ -2797,31 +3036,31 @@ static int pause_interception(struct vcpu_svm *svm)
 }
 
 static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
-       [SVM_EXIT_READ_CR0]                     = emulate_on_interception,
-       [SVM_EXIT_READ_CR3]                     = emulate_on_interception,
-       [SVM_EXIT_READ_CR4]                     = emulate_on_interception,
-       [SVM_EXIT_READ_CR8]                     = emulate_on_interception,
+       [SVM_EXIT_READ_CR0]                     = cr_interception,
+       [SVM_EXIT_READ_CR3]                     = cr_interception,
+       [SVM_EXIT_READ_CR4]                     = cr_interception,
+       [SVM_EXIT_READ_CR8]                     = cr_interception,
        [SVM_EXIT_CR0_SEL_WRITE]                = emulate_on_interception,
        [SVM_EXIT_WRITE_CR0]                    = cr0_write_interception,
-       [SVM_EXIT_WRITE_CR3]                    = emulate_on_interception,
-       [SVM_EXIT_WRITE_CR4]                    = emulate_on_interception,
+       [SVM_EXIT_WRITE_CR3]                    = cr_interception,
+       [SVM_EXIT_WRITE_CR4]                    = cr_interception,
        [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
-       [SVM_EXIT_READ_DR0]                     = emulate_on_interception,
-       [SVM_EXIT_READ_DR1]                     = emulate_on_interception,
-       [SVM_EXIT_READ_DR2]                     = emulate_on_interception,
-       [SVM_EXIT_READ_DR3]                     = emulate_on_interception,
-       [SVM_EXIT_READ_DR4]                     = emulate_on_interception,
-       [SVM_EXIT_READ_DR5]                     = emulate_on_interception,
-       [SVM_EXIT_READ_DR6]                     = emulate_on_interception,
-       [SVM_EXIT_READ_DR7]                     = emulate_on_interception,
-       [SVM_EXIT_WRITE_DR0]                    = emulate_on_interception,
-       [SVM_EXIT_WRITE_DR1]                    = emulate_on_interception,
-       [SVM_EXIT_WRITE_DR2]                    = emulate_on_interception,
-       [SVM_EXIT_WRITE_DR3]                    = emulate_on_interception,
-       [SVM_EXIT_WRITE_DR4]                    = emulate_on_interception,
-       [SVM_EXIT_WRITE_DR5]                    = emulate_on_interception,
-       [SVM_EXIT_WRITE_DR6]                    = emulate_on_interception,
-       [SVM_EXIT_WRITE_DR7]                    = emulate_on_interception,
+       [SVM_EXIT_READ_DR0]                     = dr_interception,
+       [SVM_EXIT_READ_DR1]                     = dr_interception,
+       [SVM_EXIT_READ_DR2]                     = dr_interception,
+       [SVM_EXIT_READ_DR3]                     = dr_interception,
+       [SVM_EXIT_READ_DR4]                     = dr_interception,
+       [SVM_EXIT_READ_DR5]                     = dr_interception,
+       [SVM_EXIT_READ_DR6]                     = dr_interception,
+       [SVM_EXIT_READ_DR7]                     = dr_interception,
+       [SVM_EXIT_WRITE_DR0]                    = dr_interception,
+       [SVM_EXIT_WRITE_DR1]                    = dr_interception,
+       [SVM_EXIT_WRITE_DR2]                    = dr_interception,
+       [SVM_EXIT_WRITE_DR3]                    = dr_interception,
+       [SVM_EXIT_WRITE_DR4]                    = dr_interception,
+       [SVM_EXIT_WRITE_DR5]                    = dr_interception,
+       [SVM_EXIT_WRITE_DR6]                    = dr_interception,
+       [SVM_EXIT_WRITE_DR7]                    = dr_interception,
        [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
        [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
        [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
@@ -2854,6 +3093,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_WBINVD]                       = emulate_on_interception,
        [SVM_EXIT_MONITOR]                      = invalid_op_interception,
        [SVM_EXIT_MWAIT]                        = invalid_op_interception,
+       [SVM_EXIT_XSETBV]                       = xsetbv_interception,
        [SVM_EXIT_NPF]                          = pf_interception,
 };
 
@@ -2864,10 +3104,10 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
        struct vmcb_save_area *save = &svm->vmcb->save;
 
        pr_err("VMCB Control Area:\n");
-       pr_err("cr_read:            %04x\n", control->intercept_cr_read);
-       pr_err("cr_write:           %04x\n", control->intercept_cr_write);
-       pr_err("dr_read:            %04x\n", control->intercept_dr_read);
-       pr_err("dr_write:           %04x\n", control->intercept_dr_write);
+       pr_err("cr_read:            %04x\n", control->intercept_cr & 0xffff);
+       pr_err("cr_write:           %04x\n", control->intercept_cr >> 16);
+       pr_err("dr_read:            %04x\n", control->intercept_dr & 0xffff);
+       pr_err("dr_write:           %04x\n", control->intercept_dr >> 16);
        pr_err("exceptions:         %08x\n", control->intercept_exceptions);
        pr_err("intercepts:         %016llx\n", control->intercept);
        pr_err("pause filter count: %d\n", control->pause_filter_count);
@@ -2950,15 +3190,23 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
 
 }
 
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+       struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+       *info1 = control->exit_info_1;
+       *info2 = control->exit_info_2;
+}
+
 static int handle_exit(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        struct kvm_run *kvm_run = vcpu->run;
        u32 exit_code = svm->vmcb->control.exit_code;
 
-       trace_kvm_exit(exit_code, vcpu);
+       trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
 
-       if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
+       if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
                vcpu->arch.cr0 = svm->vmcb->save.cr0;
        if (npt_enabled)
                vcpu->arch.cr3 = svm->vmcb->save.cr3;
@@ -2970,7 +3218,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
                return 1;
        }
 
-       if (is_nested(svm)) {
+       if (is_guest_mode(vcpu)) {
                int vmexit;
 
                trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
@@ -3033,7 +3281,6 @@ static void pre_svm_run(struct vcpu_svm *svm)
 
        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
 
-       svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
        /* FIXME: handle wraparound of asid_generation */
        if (svm->asid_generation != sd->asid_generation)
                new_asid(svm, sd);
@@ -3045,7 +3292,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 
        svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
        vcpu->arch.hflags |= HF_NMI_MASK;
-       svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
+       set_intercept(svm, INTERCEPT_IRET);
        ++vcpu->stat.nmi_injections;
 }
 
@@ -3058,6 +3305,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
        control->int_ctl &= ~V_INTR_PRIO_MASK;
        control->int_ctl |= V_IRQ_MASK |
                ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
+       mark_dirty(svm->vmcb, VMCB_INTR);
 }
 
 static void svm_set_irq(struct kvm_vcpu *vcpu)
@@ -3077,14 +3325,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+       if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
                return;
 
        if (irr == -1)
                return;
 
        if (tpr >= irr)
-               svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
+               set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 }
 
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -3112,10 +3360,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 
        if (masked) {
                svm->vcpu.arch.hflags |= HF_NMI_MASK;
-               svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
+               set_intercept(svm, INTERCEPT_IRET);
        } else {
                svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
-               svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
+               clr_intercept(svm, INTERCEPT_IRET);
        }
 }
 
@@ -3131,7 +3379,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
 
        ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
 
-       if (is_nested(svm))
+       if (is_guest_mode(vcpu))
                return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
 
        return ret;
@@ -3177,7 +3425,12 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
 
 static void svm_flush_tlb(struct kvm_vcpu *vcpu)
 {
-       force_new_asid(vcpu);
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
+               svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+       else
+               svm->asid_generation--;
 }
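
Note: when the CPU lacks FLUSHBYASID there is no way to flush a single guest ASID directly; decrementing asid_generation instead makes the check in pre_svm_run() fail on the next entry, so new_asid() hands out a fresh ASID, which is equivalent to a flush. A toy model of that generation trick (names illustrative, not from the patch):

struct asid_state { int vcpu_gen, cpu_gen, next_asid, asid; };

static void flush_tlb_fallback(struct asid_state *s)
{
        s->vcpu_gen--;                  /* force a generation mismatch */
}

static int effective_asid(struct asid_state *s)
{
        if (s->vcpu_gen != s->cpu_gen) {
                s->asid = s->next_asid++;   /* stale: allocate a fresh ASID */
                s->vcpu_gen = s->cpu_gen;
        }
        return s->asid;
}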
 
 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
@@ -3188,10 +3441,10 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+       if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
                return;
 
-       if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
+       if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
                int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
                kvm_set_cr8(vcpu, cr8);
        }
@@ -3202,7 +3455,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        u64 cr8;
 
-       if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+       if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
                return;
 
        cr8 = kvm_get_cr8(vcpu);
@@ -3289,9 +3542,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
 static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       u16 fs_selector;
-       u16 gs_selector;
-       u16 ldt_selector;
 
        svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
        svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -3308,10 +3558,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 
        sync_lapic_to_cr8(vcpu);
 
-       save_host_msrs(vcpu);
-       savesegment(fs, fs_selector);
-       savesegment(gs, gs_selector);
-       ldt_selector = kvm_read_ldt();
        svm->vmcb->save.cr2 = vcpu->arch.cr2;
 
        clgi();
@@ -3389,19 +3635,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
                );
 
-       vcpu->arch.cr2 = svm->vmcb->save.cr2;
-       vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
-       vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
-       vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
-
-       load_host_msrs(vcpu);
-       kvm_load_ldt(ldt_selector);
-       loadsegment(fs, fs_selector);
 #ifdef CONFIG_X86_64
-       load_gs_index(gs_selector);
-       wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+       wrmsrl(MSR_GS_BASE, svm->host.gs_base);
 #else
-       loadsegment(gs, gs_selector);
+       loadsegment(fs, svm->host.fs);
 #endif
 
        reload_tss(vcpu);
@@ -3410,10 +3647,21 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 
        stgi();
 
+       vcpu->arch.cr2 = svm->vmcb->save.cr2;
+       vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+       vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+       vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+
        sync_cr8_to_lapic(vcpu);
 
        svm->next_rip = 0;
 
+       svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+       /* If the exit was due to #PF, check for the async PF reason */
+       if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
+               svm->apf_reason = kvm_read_and_reset_pf_reason();
+
        if (npt_enabled) {
                vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
                vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
@@ -3426,6 +3674,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
        if (unlikely(svm->vmcb->control.exit_code ==
                     SVM_EXIT_EXCP_BASE + MC_VECTOR))
                svm_handle_mce(svm);
+
+       mark_all_clean(svm->vmcb);
 }
 
 #undef R
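
Note: the mark_dirty()/mark_all_clean() calls sprinkled through this file implement the VMCB clean bits, a bitmap telling the CPU which VMCB state was untouched since the last VMRUN and can stay cached. The helpers are introduced earlier in this series; roughly (field and mask names assumed):

static inline void mark_dirty(struct vmcb *vmcb, int bit)
{
        vmcb->control.clean &= ~(1 << bit);  /* force hw to reload this state */
}

static inline void mark_all_clean(struct vmcb *vmcb)
{
        /* set every defined clean bit once VMCB and hardware state agree */
        vmcb->control.clean = VMCB_ALL_CLEAN_MASK;
}

Since mark_all_clean() runs at the tail of svm_vcpu_run(), any exit handler that later touches the VMCB must re-mark the corresponding bit dirty.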
@@ -3435,7 +3685,8 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
        struct vcpu_svm *svm = to_svm(vcpu);
 
        svm->vmcb->save.cr3 = root;
-       force_new_asid(vcpu);
+       mark_dirty(svm->vmcb, VMCB_CR);
+       svm_flush_tlb(vcpu);
 }
 
 static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
@@ -3443,11 +3694,13 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
        struct vcpu_svm *svm = to_svm(vcpu);
 
        svm->vmcb->control.nested_cr3 = root;
+       mark_dirty(svm->vmcb, VMCB_NPT);
 
        /* Also sync guest cr3 here in case we live migrate */
-       svm->vmcb->save.cr3 = vcpu->arch.cr3;
+       svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
+       mark_dirty(svm->vmcb, VMCB_CR);
 
-       force_new_asid(vcpu);
+       svm_flush_tlb(vcpu);
 }
 
 static int is_disabled(void)
@@ -3494,10 +3747,6 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
 static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 {
        switch (func) {
-       case 0x00000001:
-               /* Mask out xsave bit as long as it is not supported by SVM */
-               entry->ecx &= ~(bit(X86_FEATURE_XSAVE));
-               break;
        case 0x80000001:
                if (nested)
                        entry->ecx |= (1 << 2); /* Set SVM bit */
@@ -3511,7 +3760,7 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
                                   additional features */
 
                /* Support next_rip if host supports it */
-               if (svm_has(SVM_FEATURE_NRIP))
+               if (boot_cpu_has(X86_FEATURE_NRIPS))
                        entry->edx |= SVM_FEATURE_NRIP;
 
                /* Support NPT for the guest if enabled */
@@ -3571,6 +3820,7 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
        { SVM_EXIT_WBINVD,                      "wbinvd" },
        { SVM_EXIT_MONITOR,                     "monitor" },
        { SVM_EXIT_MWAIT,                       "mwait" },
+       { SVM_EXIT_XSETBV,                      "xsetbv" },
        { SVM_EXIT_NPF,                         "npf" },
        { -1, NULL }
 };
@@ -3594,9 +3844,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
-       if (is_nested(svm))
-               svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
+       set_exception_intercept(svm, NM_VECTOR);
        update_cr0_intercept(svm);
 }
 
@@ -3627,6 +3875,7 @@ static struct kvm_x86_ops svm_x86_ops = {
        .get_cpl = svm_get_cpl,
        .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
        .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
+       .decache_cr3 = svm_decache_cr3,
        .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
        .set_cr0 = svm_set_cr0,
        .set_cr3 = svm_set_cr3,
@@ -3667,7 +3916,9 @@ static struct kvm_x86_ops svm_x86_ops = {
        .get_tdp_level = get_npt_level,
        .get_mt_mask = svm_get_mt_mask,
 
+       .get_exit_info = svm_get_exit_info,
        .exit_reasons_str = svm_exit_reasons_str,
+
        .get_lpage_level = svm_get_lpage_level,
 
        .cpuid_update = svm_cpuid_update,
index a6544b8..1357d7c 100644 (file)
@@ -178,27 +178,36 @@ TRACE_EVENT(kvm_apic,
 #define trace_kvm_apic_read(reg, val)          trace_kvm_apic(0, reg, val)
 #define trace_kvm_apic_write(reg, val)         trace_kvm_apic(1, reg, val)
 
+#define KVM_ISA_VMX   1
+#define KVM_ISA_SVM   2
+
 /*
  * Tracepoint for kvm guest exit:
  */
 TRACE_EVENT(kvm_exit,
-       TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu),
-       TP_ARGS(exit_reason, vcpu),
+       TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
+       TP_ARGS(exit_reason, vcpu, isa),
 
        TP_STRUCT__entry(
                __field(        unsigned int,   exit_reason     )
                __field(        unsigned long,  guest_rip       )
+               __field(        u32,            isa             )
+               __field(        u64,            info1           )
+               __field(        u64,            info2           )
        ),
 
        TP_fast_assign(
                __entry->exit_reason    = exit_reason;
                __entry->guest_rip      = kvm_rip_read(vcpu);
+               __entry->isa            = isa;
+               kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
+                                          &__entry->info2);
        ),
 
-       TP_printk("reason %s rip 0x%lx",
+       TP_printk("reason %s rip 0x%lx info %llx %llx",
                 ftrace_print_symbols_seq(p, __entry->exit_reason,
                                          kvm_x86_ops->exit_reasons_str),
-                __entry->guest_rip)
+                __entry->guest_rip, __entry->info1, __entry->info2)
 );
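
Note: with the isa field plus the two info words, a single kvm_exit event is now self-describing across vendors: on VMX, get_exit_info() fills in EXIT_QUALIFICATION and VM_EXIT_INTR_INFO; on SVM, exit_info_1 and exit_info_2 (see the vmx_get_exit_info()/svm_get_exit_info() hunks in this series). A hypothetical consumer keyed on the field:

static const char *kvm_exit_isa_name(u32 isa)
{
        switch (isa) {
        case KVM_ISA_VMX: return "vmx";
        case KVM_ISA_SVM: return "svm";
        default:          return "unknown";
        }
}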
 
 /*
index 81fcbe9..bf89ec2 100644 (file)
@@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 static int __read_mostly vmm_exclusive = 1;
 module_param(vmm_exclusive, bool, S_IRUGO);
 
+static int __read_mostly yield_on_hlt = 1;
+module_param(yield_on_hlt, bool, S_IRUGO);
+
 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST                          \
        (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
 #define KVM_GUEST_CR0_MASK                                             \
@@ -177,6 +180,7 @@ static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
+static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -188,6 +192,8 @@ static unsigned long *vmx_io_bitmap_b;
 static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 
+static bool cpu_has_load_ia32_efer;
+
 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 static DEFINE_SPINLOCK(vmx_vpid_lock);
 
@@ -472,7 +478,7 @@ static void vmcs_clear(struct vmcs *vmcs)
        u8 error;
 
        asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
-                     : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+                     : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
                      : "cc", "memory");
        if (error)
                printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
@@ -485,7 +491,7 @@ static void vmcs_load(struct vmcs *vmcs)
        u8 error;
 
        asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
-                       : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+                       : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
                        : "cc", "memory");
        if (error)
                printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
@@ -565,10 +571,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
 
 static unsigned long vmcs_readl(unsigned long field)
 {
-       unsigned long value;
+       unsigned long value = 0;
 
        asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
-                     : "=a"(value) : "d"(field) : "cc");
+                     : "+a"(value) : "d"(field) : "cc");
        return value;
 }
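
Note: two independent fixes in the reads above. setna can only store to a byte register or memory, so the "=g" output constraint (which may pick a non-byte-addressable register) is tightened to "=qm"; and vmcs_readl() now pre-initializes value and uses "+a", so a failed VMREAD leaves a defined zero rather than stack garbage. A standalone illustration of the constraint rule (hypothetical helper, not kernel code):

/* setcc writes one byte: the destination must satisfy "q" or "m". */
static inline int below(unsigned long a, unsigned long b)
{
        unsigned char flag;

        asm ("cmp %2, %1; setb %0"
             : "=qm"(flag) : "r"(a), "r"(b) : "cc");
        return flag;
}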
 
@@ -661,6 +667,12 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
        unsigned i;
        struct msr_autoload *m = &vmx->msr_autoload;
 
+       if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
+               vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
+               vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
+               return;
+       }
+
        for (i = 0; i < m->nr; ++i)
                if (m->guest[i].index == msr)
                        break;
@@ -680,6 +692,14 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
        unsigned i;
        struct msr_autoload *m = &vmx->msr_autoload;
 
+       if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
+               vmcs_write64(GUEST_IA32_EFER, guest_val);
+               vmcs_write64(HOST_IA32_EFER, host_val);
+               vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
+               vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
+               return;
+       }
+
        for (i = 0; i < m->nr; ++i)
                if (m->guest[i].index == msr)
                        break;
@@ -1009,6 +1029,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
        vmx_set_interrupt_shadow(vcpu, 0);
 }
 
+static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
+{
+       /*
+        * Ensure that we clear the HLT state in the VMCS.  We don't need to
+        * explicitly skip the instruction because if the HLT state is set,
+        * then the instruction is already executing and RIP has already been
+        * advanced.
+        */
+       if (!yield_on_hlt &&
+           vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
+               vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+}
+
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
                                bool has_error_code, u32 error_code,
                                bool reinject)
@@ -1035,6 +1066,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
                intr_info |= INTR_TYPE_HARD_EXCEPTION;
 
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+       vmx_clear_hlt(vcpu);
 }
 
 static bool vmx_rdtscp_supported(void)
@@ -1305,8 +1337,11 @@ static __init int vmx_disabled_by_bios(void)
                        && tboot_enabled())
                        return 1;
                if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
-                       && !tboot_enabled())
+                       && !tboot_enabled()) {
+                       printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
+                               " activate TXT before enabling KVM\n");
                        return 1;
+               }
        }
 
        return 0;
@@ -1400,6 +1435,14 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
        return 0;
 }
 
+static __init bool allow_1_setting(u32 msr, u32 ctl)
+{
+       u32 vmx_msr_low, vmx_msr_high;
+
+       rdmsr(msr, vmx_msr_low, vmx_msr_high);
+       return vmx_msr_high & ctl;
+}
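
Note: the VMX capability MSRs encode allowed settings in two halves: the low 32 bits are the allowed-0 settings (bits that must be 1) and the high 32 bits the allowed-1 settings (bits that may be 1), so testing vmx_msr_high & ctl asks whether a control can be enabled at all. A sketch of how a final control value is derived from such an MSR (mirrors what adjust_vmx_controls() does with the two halves):

static u32 apply_vmx_capability(u32 wanted, u64 cap_msr)
{
        u32 must_be_one = (u32)cap_msr;         /* allowed-0 settings */
        u32 may_be_one  = (u32)(cap_msr >> 32); /* allowed-1 settings */

        return (wanted | must_be_one) & may_be_one;
}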
+
 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 {
        u32 vmx_msr_low, vmx_msr_high;
@@ -1416,7 +1459,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                                &_pin_based_exec_control) < 0)
                return -EIO;
 
-       min = CPU_BASED_HLT_EXITING |
+       min =
 #ifdef CONFIG_X86_64
              CPU_BASED_CR8_LOAD_EXITING |
              CPU_BASED_CR8_STORE_EXITING |
@@ -1429,6 +1472,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
              CPU_BASED_MWAIT_EXITING |
              CPU_BASED_MONITOR_EXITING |
              CPU_BASED_INVLPG_EXITING;
+
+       if (yield_on_hlt)
+               min |= CPU_BASED_HLT_EXITING;
+
        opt = CPU_BASED_TPR_SHADOW |
              CPU_BASED_USE_MSR_BITMAPS |
              CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1510,6 +1557,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
        vmcs_conf->vmexit_ctrl         = _vmexit_control;
        vmcs_conf->vmentry_ctrl        = _vmentry_control;
 
+       cpu_has_load_ia32_efer =
+               allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
+                               VM_ENTRY_LOAD_IA32_EFER)
+               && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
+                                  VM_EXIT_LOAD_IA32_EFER);
+
        return 0;
 }
 
@@ -1683,9 +1736,13 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
        save->limit = vmcs_read32(sf->limit);
        save->ar = vmcs_read32(sf->ar_bytes);
        vmcs_write16(sf->selector, save->base >> 4);
-       vmcs_write32(sf->base, save->base & 0xfffff);
+       vmcs_write32(sf->base, save->base & 0xffff0);
        vmcs_write32(sf->limit, 0xffff);
        vmcs_write32(sf->ar_bytes, 0xf3);
+       if (save->base & 0xf)
+               printk_once(KERN_WARNING "kvm: segment base is not paragraph"
+                           " aligned when entering protected mode (seg=%d)",
+                           seg);
 }
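
Note: real-mode segmentation can only express bases that are multiples of 16 (base = selector * 16), which is why the mask changes from 0xfffff to 0xffff0 and a warning fires when precision is lost. A worked example with an assumed base:

static void rmode_seg_example(void)
{
        unsigned long base = 0x12345;   /* assumed, not paragraph aligned */

        unsigned short selector = base >> 4;        /* 0x1234 */
        unsigned long vm86_base = base & 0xffff0;   /* 0x12340 == selector * 16 */

        /* base & 0xf == 0x5: the lost nibble is what the printk_once reports */
        (void)selector; (void)vm86_base;
}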
 
 static void enter_rmode(struct kvm_vcpu *vcpu)
@@ -1814,6 +1871,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
        vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
 }
 
+static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
+{
+       if (enable_ept && is_paging(vcpu))
+               vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+       __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+}
+
 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 {
        ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
@@ -1857,6 +1921,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
                                        unsigned long cr0,
                                        struct kvm_vcpu *vcpu)
 {
+       vmx_decache_cr3(vcpu);
        if (!(cr0 & X86_CR0_PG)) {
                /* From paging/starting to nonpaging */
                vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1937,7 +2002,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
        if (enable_ept) {
                eptp = construct_eptp(cr3);
                vmcs_write64(EPT_POINTER, eptp);
-               guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
+               guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
                        vcpu->kvm->arch.ept_identity_map_addr;
                ept_load_pdptrs(vcpu);
        }
@@ -2725,7 +2790,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        vmcs_writel(GUEST_IDTR_BASE, 0);
        vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
 
-       vmcs_write32(GUEST_ACTIVITY_STATE, 0);
+       vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
        vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
        vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
 
@@ -2787,6 +2852,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
                return;
        }
 
+       if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+               enable_irq_window(vcpu);
+               return;
+       }
        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -2814,6 +2883,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
        } else
                intr |= INTR_TYPE_EXT_INTR;
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+       vmx_clear_hlt(vcpu);
 }
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2841,6 +2911,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
        }
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+       vmx_clear_hlt(vcpu);
 }
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -2849,7 +2920,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
                return 0;
 
        return  !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-                       (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI));
+                 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
+                  | GUEST_INTR_STATE_NMI));
 }
 
 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -2910,7 +2982,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
         * Cause the #SS fault with 0 error code in VM86 mode.
         */
        if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
-               if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
+               if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
                        return 1;
        /*
         * Forward all other exceptions that are valid in real mode.
@@ -3007,7 +3079,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
        }
 
        if (is_invalid_opcode(intr_info)) {
-               er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
+               er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
                if (er != EMULATE_DONE)
                        kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
@@ -3026,7 +3098,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 
                if (kvm_event_needs_reinjection(vcpu))
                        kvm_mmu_unprotect_page_virt(vcpu, cr2);
-               return kvm_mmu_page_fault(vcpu, cr2, error_code);
+               return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
        }
 
        if (vmx->rmode.vm86_active &&
@@ -3098,7 +3170,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
        ++vcpu->stat.io_exits;
 
        if (string || in)
-               return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+               return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 
        port = exit_qualification >> 16;
        size = (exit_qualification & 7) + 1;
@@ -3118,14 +3190,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
        hypercall[2] = 0xc1;
 }
 
-static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
-{
-       if (err)
-               kvm_inject_gp(vcpu, 0);
-       else
-               skip_emulated_instruction(vcpu);
-}
-
 static int handle_cr(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification, val;
@@ -3143,21 +3207,21 @@ static int handle_cr(struct kvm_vcpu *vcpu)
                switch (cr) {
                case 0:
                        err = kvm_set_cr0(vcpu, val);
-                       complete_insn_gp(vcpu, err);
+                       kvm_complete_insn_gp(vcpu, err);
                        return 1;
                case 3:
                        err = kvm_set_cr3(vcpu, val);
-                       complete_insn_gp(vcpu, err);
+                       kvm_complete_insn_gp(vcpu, err);
                        return 1;
                case 4:
                        err = kvm_set_cr4(vcpu, val);
-                       complete_insn_gp(vcpu, err);
+                       kvm_complete_insn_gp(vcpu, err);
                        return 1;
                case 8: {
                                u8 cr8_prev = kvm_get_cr8(vcpu);
                                u8 cr8 = kvm_register_read(vcpu, reg);
-                               kvm_set_cr8(vcpu, cr8);
-                               skip_emulated_instruction(vcpu);
+                               err = kvm_set_cr8(vcpu, cr8);
+                               kvm_complete_insn_gp(vcpu, err);
                                if (irqchip_in_kernel(vcpu->kvm))
                                        return 1;
                                if (cr8_prev <= cr8)
@@ -3176,8 +3240,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
        case 1: /*mov from cr*/
                switch (cr) {
                case 3:
-                       kvm_register_write(vcpu, reg, vcpu->arch.cr3);
-                       trace_kvm_cr_read(cr, vcpu->arch.cr3);
+                       val = kvm_read_cr3(vcpu);
+                       kvm_register_write(vcpu, reg, val);
+                       trace_kvm_cr_read(cr, val);
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 8:
@@ -3349,6 +3414,11 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static int handle_invd(struct kvm_vcpu *vcpu)
+{
+       return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3377,7 +3447,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
 
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
-       return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+       return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 }
 
 static int handle_task_switch(struct kvm_vcpu *vcpu)
@@ -3476,7 +3546,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 
        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
        trace_kvm_page_fault(gpa, exit_qualification);
-       return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
+       return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
 }
 
 static u64 ept_rsvd_mask(u64 spte, int level)
@@ -3592,7 +3662,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
                    && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
                        return handle_interrupt_window(&vmx->vcpu);
 
-               err = emulate_instruction(vcpu, 0, 0, 0);
+               err = emulate_instruction(vcpu, 0);
 
                if (err == EMULATE_DO_MMIO) {
                        ret = 0;
@@ -3649,6 +3719,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
        [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
        [EXIT_REASON_HLT]                     = handle_halt,
+       [EXIT_REASON_INVD]                    = handle_invd,
        [EXIT_REASON_INVLPG]                  = handle_invlpg,
        [EXIT_REASON_VMCALL]                  = handle_vmcall,
        [EXIT_REASON_VMCLEAR]                 = handle_vmx_insn,
@@ -3676,6 +3747,12 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 static const int kvm_vmx_max_exit_handlers =
        ARRAY_SIZE(kvm_vmx_exit_handlers);
 
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+       *info1 = vmcs_readl(EXIT_QUALIFICATION);
+       *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
+}
+
 /*
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
@@ -3686,17 +3763,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
        u32 exit_reason = vmx->exit_reason;
        u32 vectoring_info = vmx->idt_vectoring_info;
 
-       trace_kvm_exit(exit_reason, vcpu);
+       trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
 
        /* If guest state is invalid, start emulating */
        if (vmx->emulation_required && emulate_invalid_guest_state)
                return handle_invalid_guest_state(vcpu);
 
-       /* Access CR3 don't cause VMExit in paging mode, so we need
-        * to sync with guest real CR3. */
-       if (enable_ept && is_paging(vcpu))
-               vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
-
        if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -4013,7 +4085,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
              );
 
        vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
-                                 | (1 << VCPU_EXREG_PDPTR));
+                                 | (1 << VCPU_EXREG_PDPTR)
+                                 | (1 << VCPU_EXREG_CR3));
        vcpu->arch.regs_dirty = 0;
 
        vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
@@ -4280,6 +4353,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .get_cpl = vmx_get_cpl,
        .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
        .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
+       .decache_cr3 = vmx_decache_cr3,
        .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
        .set_cr0 = vmx_set_cr0,
        .set_cr3 = vmx_set_cr3,
@@ -4320,7 +4394,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .get_tdp_level = get_ept_level,
        .get_mt_mask = vmx_get_mt_mask,
 
+       .get_exit_info = vmx_get_exit_info,
        .exit_reasons_str = vmx_exit_reasons_str,
+
        .get_lpage_level = vmx_get_lpage_level,
 
        .cpuid_update = vmx_cpuid_update,
@@ -4396,8 +4472,6 @@ static int __init vmx_init(void)
 
        if (enable_ept) {
                bypass_guest_pf = 0;
-               kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
-                       VMX_EPT_WRITABLE_MASK);
                kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
                                VMX_EPT_EXECUTABLE_MASK);
                kvm_enable_tdp();
index 46a368c..bcc0efc 100644 (file)
@@ -43,6 +43,7 @@
 #include <linux/slab.h>
 #include <linux/perf_event.h>
 #include <linux/uaccess.h>
+#include <linux/hash.h>
 #include <trace/events/kvm.h>
 
 #define CREATE_TRACE_POINTS
@@ -155,6 +156,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 
 u64 __read_mostly host_xcr0;
 
+static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
+{
+       int i;
+       for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
+               vcpu->arch.apf.gfns[i] = ~0;
+}
+
 static void kvm_on_user_return(struct user_return_notifier *urn)
 {
        unsigned slot;
@@ -326,23 +334,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 }
 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
+void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
 {
-       unsigned error_code = vcpu->arch.fault.error_code;
+       if (err)
+               kvm_inject_gp(vcpu, 0);
+       else
+               kvm_x86_ops->skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
 
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+{
        ++vcpu->stat.pf_guest;
-       vcpu->arch.cr2 = vcpu->arch.fault.address;
-       kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
+       vcpu->arch.cr2 = fault->address;
+       kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
 }
 
-void kvm_propagate_fault(struct kvm_vcpu *vcpu)
+void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 {
-       if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested)
-               vcpu->arch.nested_mmu.inject_page_fault(vcpu);
+       if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
+               vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
        else
-               vcpu->arch.mmu.inject_page_fault(vcpu);
-
-       vcpu->arch.fault.nested = false;
+               vcpu->arch.mmu.inject_page_fault(vcpu, fault);
 }
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -460,8 +473,8 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
                      (unsigned long *)&vcpu->arch.regs_avail))
                return true;
 
-       gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT;
-       offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1);
+       gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
+       offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
        r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
                                       PFERR_USER_MASK | PFERR_WRITE_MASK);
        if (r < 0)
@@ -506,12 +519,15 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
                } else
 #endif
                if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
-                                                vcpu->arch.cr3))
+                                                kvm_read_cr3(vcpu)))
                        return 1;
        }
 
        kvm_x86_ops->set_cr0(vcpu, cr0);
 
+       if ((cr0 ^ old_cr0) & X86_CR0_PG)
+               kvm_clear_async_pf_completion_queue(vcpu);
+
        if ((cr0 ^ old_cr0) & update_bits)
                kvm_mmu_reset_context(vcpu);
        return 0;
@@ -595,7 +611,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                        return 1;
        } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
                   && ((cr4 ^ old_cr4) & pdptr_bits)
-                  && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))
+                  && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
+                                  kvm_read_cr3(vcpu)))
                return 1;
 
        if (cr4 & X86_CR4_VMXE)
@@ -615,7 +632,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
 
 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
-       if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+       if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
                kvm_mmu_sync_roots(vcpu);
                kvm_mmu_flush_tlb(vcpu);
                return 0;
@@ -650,12 +667,13 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
        if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
                return 1;
        vcpu->arch.cr3 = cr3;
+       __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
        vcpu->arch.mmu.new_cr3(vcpu);
        return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr3);
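
Note: setting VCPU_EXREG_CR3 in regs_avail marks the cached vcpu->arch.cr3 valid; readers go through kvm_read_cr3(), which decaches from hardware only when the bit is clear. The accessor lives in kvm_cache_regs.h in this series; its likely shape:

static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
{
        if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
                kvm_x86_ops->decache_cr3(vcpu); /* e.g. vmx_decache_cr3() */
        return vcpu->arch.cr3;
}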
 
-int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
        if (cr8 & CR8_RESERVED_BITS)
                return 1;
@@ -665,12 +683,6 @@ int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
                vcpu->arch.cr8 = cr8;
        return 0;
 }
-
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
-{
-       if (__kvm_set_cr8(vcpu, cr8))
-               kvm_inject_gp(vcpu, 0);
-}
 EXPORT_SYMBOL_GPL(kvm_set_cr8);
 
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
@@ -775,12 +787,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN    7
+#define KVM_SAVE_MSRS_BEGIN    8
 static u32 msrs_to_save[] = {
        MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
        MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
        HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
-       HV_X64_MSR_APIC_ASSIST_PAGE,
+       HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
        MSR_STAR,
 #ifdef CONFIG_X86_64
@@ -830,7 +842,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
        kvm_x86_ops->set_efer(vcpu, efer);
 
        vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
-       kvm_mmu_reset_context(vcpu);
 
        /* Update reserved bits */
        if ((efer ^ old_efer) & EFER_NX)
@@ -1418,6 +1429,30 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        return 0;
 }
 
+static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
+{
+       gpa_t gpa = data & ~0x3f;
+
+       /* Bits 2:5 are reserved, should be zero */
+       if (data & 0x3c)
+               return 1;
+
+       vcpu->arch.apf.msr_val = data;
+
+       if (!(data & KVM_ASYNC_PF_ENABLED)) {
+               kvm_clear_async_pf_completion_queue(vcpu);
+               kvm_async_pf_hash_reset(vcpu);
+               return 0;
+       }
+
+       if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
+               return 1;
+
+       vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
+       kvm_async_pf_wakeup_all(vcpu);
+       return 0;
+}
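
Note: the MSR packs everything into one 64-bit value: because the shared "reason" word must be 64-byte aligned, its gpa occupies bits 6 and up, leaving the low bits for flags. A standalone decoding sketch (flag positions assumed from the 0x3c reserved-bit check, with enable at bit 0 and send-always at bit 1):

#include <stdbool.h>
#include <stdint.h>

struct apf_cfg { bool enabled, send_always; uint64_t gpa; };

static int decode_async_pf_msr(uint64_t data, struct apf_cfg *cfg)
{
        if (data & 0x3c)                 /* bits 2:5 reserved -> fail */
                return -1;
        cfg->enabled     = data & (1ULL << 0);
        cfg->send_always = data & (1ULL << 1);
        cfg->gpa         = data & ~0x3fULL;  /* 64-byte aligned region */
        return 0;
}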
+
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
        switch (msr) {
@@ -1499,6 +1534,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                }
                break;
        }
+       case MSR_KVM_ASYNC_PF_EN:
+               if (kvm_pv_enable_async_pf(vcpu, data))
+                       return 1;
+               break;
        case MSR_IA32_MCG_CTL:
        case MSR_IA32_MCG_STATUS:
        case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1775,6 +1814,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case MSR_KVM_SYSTEM_TIME_NEW:
                data = vcpu->arch.time;
                break;
+       case MSR_KVM_ASYNC_PF_EN:
+               data = vcpu->arch.apf.msr_val;
+               break;
        case MSR_IA32_P5_MC_ADDR:
        case MSR_IA32_P5_MC_TYPE:
        case MSR_IA32_MCG_CAP:
@@ -1904,6 +1946,7 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_NOP_IO_DELAY:
        case KVM_CAP_MP_STATE:
        case KVM_CAP_SYNC_MMU:
+       case KVM_CAP_USER_NMI:
        case KVM_CAP_REINJECT_CONTROL:
        case KVM_CAP_IRQ_INJECT_STATUS:
        case KVM_CAP_ASSIGN_DEV_IRQ:
@@ -1922,6 +1965,7 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_DEBUGREGS:
        case KVM_CAP_X86_ROBUST_SINGLESTEP:
        case KVM_CAP_XSAVE:
+       case KVM_CAP_ASYNC_PF:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
@@ -2185,6 +2229,11 @@ out:
        return r;
 }
 
+static void cpuid_mask(u32 *word, int wordnum)
+{
+       *word &= boot_cpu_data.x86_capability[wordnum];
+}
+
 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                           u32 index)
 {
@@ -2259,7 +2308,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                break;
        case 1:
                entry->edx &= kvm_supported_word0_x86_features;
+               cpuid_mask(&entry->edx, 0);
                entry->ecx &= kvm_supported_word4_x86_features;
+               cpuid_mask(&entry->ecx, 4);
                /* we support x2apic emulation even if host does not support
                 * it since we emulate x2apic in software */
                entry->ecx |= F(X2APIC);
@@ -2350,7 +2401,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                break;
        case 0x80000001:
                entry->edx &= kvm_supported_word1_x86_features;
+               cpuid_mask(&entry->edx, 1);
                entry->ecx &= kvm_supported_word6_x86_features;
+               cpuid_mask(&entry->ecx, 6);
                break;
        }
 
@@ -3169,20 +3222,18 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
                struct kvm_memslots *slots, *old_slots;
                unsigned long *dirty_bitmap;
 
-               r = -ENOMEM;
-               dirty_bitmap = vmalloc(n);
-               if (!dirty_bitmap)
-                       goto out;
+               dirty_bitmap = memslot->dirty_bitmap_head;
+               if (memslot->dirty_bitmap == dirty_bitmap)
+                       dirty_bitmap += n / sizeof(long);
                memset(dirty_bitmap, 0, n);
 
                r = -ENOMEM;
                slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
-               if (!slots) {
-                       vfree(dirty_bitmap);
+               if (!slots)
                        goto out;
-               }
                memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
                slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
+               slots->generation++;
 
                old_slots = kvm->memslots;
                rcu_assign_pointer(kvm->memslots, slots);
@@ -3195,11 +3246,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
                spin_unlock(&kvm->mmu_lock);
 
                r = -EFAULT;
-               if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) {
-                       vfree(dirty_bitmap);
+               if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
                        goto out;
-               }
-               vfree(dirty_bitmap);
        } else {
                r = -EFAULT;
                if (clear_user(log->dirty_bitmap, n))
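
Note: the vmalloc()/vfree() pair per GET_DIRTY_LOG call is gone: dirty_bitmap_head now points at two bitmaps of n bytes laid out back to back, and the ioctl zeroes whichever half is not currently published, then swaps it in under RCU. The selection logic, pulled out of the hunk above for clarity (helper name illustrative):

static unsigned long *pick_spare_bitmap(struct kvm_memory_slot *memslot,
                                        unsigned long n)
{
        unsigned long *bitmap = memslot->dirty_bitmap_head;

        /* the published half is in use; hand back the other one */
        if (memslot->dirty_bitmap == bitmap)
                bitmap += n / sizeof(long);

        return bitmap;
}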
@@ -3266,8 +3314,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (vpic) {
                        r = kvm_ioapic_init(kvm);
                        if (r) {
+                               mutex_lock(&kvm->slots_lock);
                                kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
                                                          &vpic->dev);
+                               mutex_unlock(&kvm->slots_lock);
                                kfree(vpic);
                                goto create_irqchip_unlock;
                        }
@@ -3278,10 +3328,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
                smp_wmb();
                r = kvm_setup_default_irq_routing(kvm);
                if (r) {
+                       mutex_lock(&kvm->slots_lock);
                        mutex_lock(&kvm->irq_lock);
                        kvm_ioapic_destroy(kvm);
                        kvm_destroy_pic(kvm);
                        mutex_unlock(&kvm->irq_lock);
+                       mutex_unlock(&kvm->slots_lock);
                }
        create_irqchip_unlock:
                mutex_unlock(&kvm->lock);
@@ -3557,63 +3609,63 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
 static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
 {
        gpa_t t_gpa;
-       u32 error;
+       struct x86_exception exception;
 
        BUG_ON(!mmu_is_nested(vcpu));
 
        /* NPT walks are always user-walks */
        access |= PFERR_USER_MASK;
-       t_gpa  = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error);
-       if (t_gpa == UNMAPPED_GVA)
-               vcpu->arch.fault.nested = true;
+       t_gpa  = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
 
        return t_gpa;
 }
 
-gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+                             struct x86_exception *exception)
 {
        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
-       return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
+       return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
 
- gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+ gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
+                               struct x86_exception *exception)
 {
        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
        access |= PFERR_FETCH_MASK;
-       return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
+       return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
 
-gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+                              struct x86_exception *exception)
 {
        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
        access |= PFERR_WRITE_MASK;
-       return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
+       return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
 
 /* used to access any guest's mapped memory without checking CPL */
-gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+                               struct x86_exception *exception)
 {
-       return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error);
+       return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
 }
 
 static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
                                      struct kvm_vcpu *vcpu, u32 access,
-                                     u32 *error)
+                                     struct x86_exception *exception)
 {
        void *data = val;
        int r = X86EMUL_CONTINUE;
 
        while (bytes) {
                gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
-                                                           error);
+                                                           exception);
                unsigned offset = addr & (PAGE_SIZE-1);
                unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
                int ret;
 
-               if (gpa == UNMAPPED_GVA) {
-                       r = X86EMUL_PROPAGATE_FAULT;
-                       goto out;
-               }
+               if (gpa == UNMAPPED_GVA)
+                       return X86EMUL_PROPAGATE_FAULT;
                ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
                if (ret < 0) {
                        r = X86EMUL_IO_NEEDED;
@@ -3630,31 +3682,35 @@ out:
 
 /* used for instruction fetching */
 static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
-                               struct kvm_vcpu *vcpu, u32 *error)
+                               struct kvm_vcpu *vcpu,
+                               struct x86_exception *exception)
 {
        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
-                                         access | PFERR_FETCH_MASK, error);
+                                         access | PFERR_FETCH_MASK,
+                                         exception);
 }
 
 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
-                              struct kvm_vcpu *vcpu, u32 *error)
+                              struct kvm_vcpu *vcpu,
+                              struct x86_exception *exception)
 {
        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
-                                         error);
+                                         exception);
 }
 
 static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
-                              struct kvm_vcpu *vcpu, u32 *error)
+                                     struct kvm_vcpu *vcpu,
+                                     struct x86_exception *exception)
 {
-       return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
+       return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
 }
 
 static int kvm_write_guest_virt_system(gva_t addr, void *val,
                                       unsigned int bytes,
                                       struct kvm_vcpu *vcpu,
-                                      u32 *error)
+                                      struct x86_exception *exception)
 {
        void *data = val;
        int r = X86EMUL_CONTINUE;
@@ -3662,15 +3718,13 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
        while (bytes) {
                gpa_t gpa =  vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
                                                             PFERR_WRITE_MASK,
-                                                            error);
+                                                            exception);
                unsigned offset = addr & (PAGE_SIZE-1);
                unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
                int ret;
 
-               if (gpa == UNMAPPED_GVA) {
-                       r = X86EMUL_PROPAGATE_FAULT;
-                       goto out;
-               }
+               if (gpa == UNMAPPED_GVA)
+                       return X86EMUL_PROPAGATE_FAULT;
                ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
                if (ret < 0) {
                        r = X86EMUL_IO_NEEDED;
@@ -3688,7 +3742,7 @@ out:
 static int emulator_read_emulated(unsigned long addr,
                                  void *val,
                                  unsigned int bytes,
-                                 unsigned int *error_code,
+                                 struct x86_exception *exception,
                                  struct kvm_vcpu *vcpu)
 {
        gpa_t                 gpa;
@@ -3701,7 +3755,7 @@ static int emulator_read_emulated(unsigned long addr,
                return X86EMUL_CONTINUE;
        }
 
-       gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code);
+       gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
 
        if (gpa == UNMAPPED_GVA)
                return X86EMUL_PROPAGATE_FAULT;
@@ -3710,8 +3764,8 @@ static int emulator_read_emulated(unsigned long addr,
        if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
                goto mmio;
 
-       if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
-                               == X86EMUL_CONTINUE)
+       if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception)
+           == X86EMUL_CONTINUE)
                return X86EMUL_CONTINUE;
 
 mmio:
@@ -3735,7 +3789,7 @@ mmio:
 }
 
 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
-                         const void *val, int bytes)
+                       const void *val, int bytes)
 {
        int ret;
 
@@ -3749,12 +3803,12 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 static int emulator_write_emulated_onepage(unsigned long addr,
                                           const void *val,
                                           unsigned int bytes,
-                                          unsigned int *error_code,
+                                          struct x86_exception *exception,
                                           struct kvm_vcpu *vcpu)
 {
        gpa_t                 gpa;
 
-       gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code);
+       gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
 
        if (gpa == UNMAPPED_GVA)
                return X86EMUL_PROPAGATE_FAULT;
@@ -3787,7 +3841,7 @@ mmio:
 int emulator_write_emulated(unsigned long addr,
                            const void *val,
                            unsigned int bytes,
-                           unsigned int *error_code,
+                           struct x86_exception *exception,
                            struct kvm_vcpu *vcpu)
 {
        /* Crossing a page boundary? */
@@ -3795,7 +3849,7 @@ int emulator_write_emulated(unsigned long addr,
                int rc, now;
 
                now = -addr & ~PAGE_MASK;
-               rc = emulator_write_emulated_onepage(addr, val, now, error_code,
+               rc = emulator_write_emulated_onepage(addr, val, now, exception,
                                                     vcpu);
                if (rc != X86EMUL_CONTINUE)
                        return rc;
@@ -3803,7 +3857,7 @@ int emulator_write_emulated(unsigned long addr,
                val += now;
                bytes -= now;
        }
-       return emulator_write_emulated_onepage(addr, val, bytes, error_code,
+       return emulator_write_emulated_onepage(addr, val, bytes, exception,
                                               vcpu);
 }
 
@@ -3821,7 +3875,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
                                     const void *old,
                                     const void *new,
                                     unsigned int bytes,
-                                    unsigned int *error_code,
+                                    struct x86_exception *exception,
                                     struct kvm_vcpu *vcpu)
 {
        gpa_t gpa;
@@ -3879,7 +3933,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 emul_write:
        printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
 
-       return emulator_write_emulated(addr, new, bytes, error_code, vcpu);
+       return emulator_write_emulated(addr, new, bytes, exception, vcpu);
 }
 
 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3904,7 +3958,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
        if (vcpu->arch.pio.count)
                goto data_avail;
 
-       trace_kvm_pio(0, port, size, 1);
+       trace_kvm_pio(0, port, size, count);
 
        vcpu->arch.pio.port = port;
        vcpu->arch.pio.in = 1;
@@ -3932,7 +3986,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
                              const void *val, unsigned int count,
                              struct kvm_vcpu *vcpu)
 {
-       trace_kvm_pio(1, port, size, 1);
+       trace_kvm_pio(1, port, size, count);
 
        vcpu->arch.pio.port = port;
        vcpu->arch.pio.in = 0;
@@ -3973,13 +4027,15 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
                return X86EMUL_CONTINUE;
 
        if (kvm_x86_ops->has_wbinvd_exit()) {
-               preempt_disable();
+               int cpu = get_cpu();
+
+               cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
                smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
                                wbinvd_ipi, NULL, 1);
-               preempt_enable();
+               put_cpu();
                cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
-       }
-       wbinvd();
+       } else
+               wbinvd();
        return X86EMUL_CONTINUE;
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
@@ -4019,7 +4075,7 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
                value = vcpu->arch.cr2;
                break;
        case 3:
-               value = vcpu->arch.cr3;
+               value = kvm_read_cr3(vcpu);
                break;
        case 4:
                value = kvm_read_cr4(vcpu);
@@ -4053,7 +4109,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
                res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
                break;
        case 8:
-               res = __kvm_set_cr8(vcpu, val & 0xfUL);
+               res = kvm_set_cr8(vcpu, val);
                break;
        default:
                vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
@@ -4206,12 +4262,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
 static void inject_emulated_exception(struct kvm_vcpu *vcpu)
 {
        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
-       if (ctxt->exception == PF_VECTOR)
-               kvm_propagate_fault(vcpu);
-       else if (ctxt->error_code_valid)
-               kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
+       if (ctxt->exception.vector == PF_VECTOR)
+               kvm_propagate_fault(vcpu, &ctxt->exception);
+       else if (ctxt->exception.error_code_valid)
+               kvm_queue_exception_e(vcpu, ctxt->exception.vector,
+                                     ctxt->exception.error_code);
        else
-               kvm_queue_exception(vcpu, ctxt->exception);
+               kvm_queue_exception(vcpu, ctxt->exception.vector);
 }
 
 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
@@ -4267,13 +4324,19 @@ EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
 
 static int handle_emulation_failure(struct kvm_vcpu *vcpu)
 {
+       int r = EMULATE_DONE;
+
        ++vcpu->stat.insn_emulation_fail;
        trace_kvm_emulate_insn_failed(vcpu);
-       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-       vcpu->run->internal.ndata = 0;
+       if (!is_guest_mode(vcpu)) {
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+               vcpu->run->internal.ndata = 0;
+               r = EMULATE_FAIL;
+       }
        kvm_queue_exception(vcpu, UD_VECTOR);
-       return EMULATE_FAIL;
+
+       return r;
 }
 
 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
@@ -4302,10 +4365,11 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
        return false;
 }
 
-int emulate_instruction(struct kvm_vcpu *vcpu,
-                       unsigned long cr2,
-                       u16 error_code,
-                       int emulation_type)
+int x86_emulate_instruction(struct kvm_vcpu *vcpu,
+                           unsigned long cr2,
+                           int emulation_type,
+                           void *insn,
+                           int insn_len)
 {
        int r;
        struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
@@ -4323,10 +4387,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
        if (!(emulation_type & EMULTYPE_NO_DECODE)) {
                init_emulate_ctxt(vcpu);
                vcpu->arch.emulate_ctxt.interruptibility = 0;
-               vcpu->arch.emulate_ctxt.exception = -1;
+               vcpu->arch.emulate_ctxt.have_exception = false;
                vcpu->arch.emulate_ctxt.perm_ok = false;
 
-               r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
+               r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
                if (r == X86EMUL_PROPAGATE_FAULT)
                        goto done;
 
@@ -4389,7 +4453,7 @@ restart:
        }
 
 done:
-       if (vcpu->arch.emulate_ctxt.exception >= 0) {
+       if (vcpu->arch.emulate_ctxt.have_exception) {
                inject_emulated_exception(vcpu);
                r = EMULATE_DONE;
        } else if (vcpu->arch.pio.count) {
@@ -4413,7 +4477,7 @@ done:
 
        return r;
 }
-EXPORT_SYMBOL_GPL(emulate_instruction);
+EXPORT_SYMBOL_GPL(x86_emulate_instruction);
 
 int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
 {
@@ -4653,7 +4717,6 @@ int kvm_arch_init(void *opaque)
 
        kvm_x86_ops = ops;
        kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
-       kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
                        PT_DIRTY_MASK, PT64_NX_MASK, 0);
 
@@ -5116,6 +5179,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        vcpu->fpu_active = 0;
                        kvm_x86_ops->fpu_deactivate(vcpu);
                }
+               if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
+                       /* Page is swapped out. Do a synthetic halt. */
+                       vcpu->arch.apf.halted = true;
+                       r = 1;
+                       goto out;
+               }
        }
 
        r = kvm_mmu_reload(vcpu);
@@ -5244,7 +5313,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 
        r = 1;
        while (r > 0) {
-               if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
+               if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+                   !vcpu->arch.apf.halted)
                        r = vcpu_enter_guest(vcpu);
                else {
                        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@@ -5257,6 +5327,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                                        vcpu->arch.mp_state =
                                                KVM_MP_STATE_RUNNABLE;
                                case KVM_MP_STATE_RUNNABLE:
+                                       vcpu->arch.apf.halted = false;
                                        break;
                                case KVM_MP_STATE_SIPI_RECEIVED:
                                default:
@@ -5278,6 +5349,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                        vcpu->run->exit_reason = KVM_EXIT_INTR;
                        ++vcpu->stat.request_irq_exits;
                }
+
+               kvm_check_async_pf_completion(vcpu);
+
                if (signal_pending(current)) {
                        r = -EINTR;
                        vcpu->run->exit_reason = KVM_EXIT_INTR;
@@ -5302,6 +5376,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        int r;
        sigset_t sigsaved;
 
+       if (!tsk_used_math(current) && init_fpu(current))
+               return -ENOMEM;
+
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
@@ -5313,8 +5390,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        }
 
        /* re-sync apic's tpr */
-       if (!irqchip_in_kernel(vcpu->kvm))
-               kvm_set_cr8(vcpu, kvm_run->cr8);
+       if (!irqchip_in_kernel(vcpu->kvm)) {
+               if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
+                       r = -EINVAL;
+                       goto out;
+               }
+       }
 
        if (vcpu->arch.pio.count || vcpu->mmio_needed) {
                if (vcpu->mmio_needed) {
@@ -5323,7 +5404,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                        vcpu->mmio_needed = 0;
                }
                vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-               r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
+               r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
                srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
                if (r != EMULATE_DONE) {
                        r = 0;
@@ -5436,7 +5517,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 
        sregs->cr0 = kvm_read_cr0(vcpu);
        sregs->cr2 = vcpu->arch.cr2;
-       sregs->cr3 = vcpu->arch.cr3;
+       sregs->cr3 = kvm_read_cr3(vcpu);
        sregs->cr4 = kvm_read_cr4(vcpu);
        sregs->cr8 = kvm_get_cr8(vcpu);
        sregs->efer = vcpu->arch.efer;
@@ -5504,8 +5585,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        kvm_x86_ops->set_gdt(vcpu, &dt);
 
        vcpu->arch.cr2 = sregs->cr2;
-       mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
+       mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
        vcpu->arch.cr3 = sregs->cr3;
+       __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
 
        kvm_set_cr8(vcpu, sregs->cr8);
 
@@ -5522,7 +5604,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        if (sregs->cr4 & X86_CR4_OSXSAVE)
                update_cpuid(vcpu);
        if (!is_long_mode(vcpu) && is_pae(vcpu)) {
-               load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
+               load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
                mmu_reset_needed = 1;
        }
 
@@ -5773,6 +5855,8 @@ free_vcpu:
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
+       vcpu->arch.apf.msr_val = 0;
+
        vcpu_load(vcpu);
        kvm_mmu_unload(vcpu);
        vcpu_put(vcpu);
@@ -5792,6 +5876,11 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
        vcpu->arch.dr7 = DR7_FIXED_1;
 
        kvm_make_request(KVM_REQ_EVENT, vcpu);
+       vcpu->arch.apf.msr_val = 0;
+
+       kvm_clear_async_pf_completion_queue(vcpu);
+       kvm_async_pf_hash_reset(vcpu);
+       vcpu->arch.apf.halted = false;
 
        return kvm_x86_ops->vcpu_reset(vcpu);
 }
@@ -5881,6 +5970,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
                goto fail_free_mce_banks;
 
+       kvm_async_pf_hash_reset(vcpu);
+
        return 0;
 fail_free_mce_banks:
        kfree(vcpu->arch.mce_banks);
@@ -5906,13 +5997,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
        free_page((unsigned long)vcpu->arch.pio_data);
 }
 
-struct  kvm *kvm_arch_create_vm(void)
+int kvm_arch_init_vm(struct kvm *kvm)
 {
-       struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
-
-       if (!kvm)
-               return ERR_PTR(-ENOMEM);
-
        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
        INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
@@ -5921,7 +6007,7 @@ struct  kvm *kvm_arch_create_vm(void)
 
        spin_lock_init(&kvm->arch.tsc_write_lock);
 
-       return kvm;
+       return 0;
 }
 
 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -5939,8 +6025,10 @@ static void kvm_free_vcpus(struct kvm *kvm)
        /*
         * Unpin any mmu pages first.
         */
-       kvm_for_each_vcpu(i, vcpu, kvm)
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               kvm_clear_async_pf_completion_queue(vcpu);
                kvm_unload_vcpu_mmu(vcpu);
+       }
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_arch_vcpu_free(vcpu);
 
@@ -5964,13 +6052,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        kfree(kvm->arch.vpic);
        kfree(kvm->arch.vioapic);
        kvm_free_vcpus(kvm);
-       kvm_free_physmem(kvm);
        if (kvm->arch.apic_access_page)
                put_page(kvm->arch.apic_access_page);
        if (kvm->arch.ept_identity_pagetable)
                put_page(kvm->arch.ept_identity_pagetable);
-       cleanup_srcu_struct(&kvm->srcu);
-       kfree(kvm);
 }
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -6051,7 +6136,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
-       return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
+       return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+               !vcpu->arch.apf.halted)
+               || !list_empty_careful(&vcpu->async_pf.done)
                || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
                || vcpu->arch.nmi_pending ||
                (kvm_arch_interrupt_allowed(vcpu) &&
@@ -6110,6 +6197,147 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 }
 EXPORT_SYMBOL_GPL(kvm_set_rflags);
 
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
+{
+       int r;
+
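+       /*
+        * The page is resident now; replay the fault so the guest does
+        * not take it again, but only if the mmu context still matches
+        * the one the fault was taken in.
+        */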
+       if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
+             is_error_page(work->page))
+               return;
+
+       r = kvm_mmu_reload(vcpu);
+       if (unlikely(r))
+               return;
+
+       if (!vcpu->arch.mmu.direct_map &&
+             work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
+               return;
+
+       vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
+}
+
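+/*
+ * Outstanding async PF gfns live in a small per-vcpu open-addressed
+ * hash table with linear probing; ~0 marks an empty slot.
+ */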
+static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
+{
+       return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
+}
+
+static inline u32 kvm_async_pf_next_probe(u32 key)
+{
+       return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
+}
+
+static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+       u32 key = kvm_async_pf_hash_fn(gfn);
+
+       while (vcpu->arch.apf.gfns[key] != ~0)
+               key = kvm_async_pf_next_probe(key);
+
+       vcpu->arch.apf.gfns[key] = gfn;
+}
+
+static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+       int i;
+       u32 key = kvm_async_pf_hash_fn(gfn);
+
+       for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
+                    (vcpu->arch.apf.gfns[key] != gfn &&
+                     vcpu->arch.apf.gfns[key] != ~0); i++)
+               key = kvm_async_pf_next_probe(key);
+
+       return key;
+}
+
+bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+       return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
+}
+
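+/*
+ * Standard open-addressing deletion: clear the slot, then backshift
+ * any later entry in the probe chain whose home slot falls outside
+ * the cyclic range (i, j], so lookups still terminate correctly.
+ */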
+static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+       u32 i, j, k;
+
+       i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
+       while (true) {
+               vcpu->arch.apf.gfns[i] = ~0;
+               do {
+                       j = kvm_async_pf_next_probe(j);
+                       if (vcpu->arch.apf.gfns[j] == ~0)
+                               return;
+                       k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
+                       /*
+                        * k lies cyclically in ]i,j]
+                        * |    i.k.j |
+                        * |....j i.k.| or  |.k..j i...|
+                        */
+               } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
+               vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
+               i = j;
+       }
+}
+
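+/* Report a fault reason through the guest-provided async PF data area. */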
+static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
+{
+       return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
+                                     sizeof(val));
+}
+
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+                                    struct kvm_async_pf *work)
+{
+       struct x86_exception fault;
+
+       trace_kvm_async_pf_not_present(work->arch.token, work->gva);
+       kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
+
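+       /*
+        * If async PF delivery is disabled, or the guest asked for
+        * user-mode-only delivery while it is running in kernel mode,
+        * fall back to a synthetic halt; otherwise inject a #PF whose
+        * fault address carries the token.
+        */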
+       if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
+           (vcpu->arch.apf.send_user_only &&
+            kvm_x86_ops->get_cpl(vcpu) == 0))
+               kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+       else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
+               fault.vector = PF_VECTOR;
+               fault.error_code_valid = true;
+               fault.error_code = 0;
+               fault.nested_page_fault = false;
+               fault.address = work->arch.token;
+               kvm_inject_page_fault(vcpu, &fault);
+       }
+}
+
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+                                struct kvm_async_pf *work)
+{
+       struct x86_exception fault;
+
+       trace_kvm_async_pf_ready(work->arch.token, work->gva);
+       if (is_error_page(work->page))
+               work->arch.token = ~0; /* broadcast wakeup */
+       else
+               kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
+
+       if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
+           !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
+               fault.vector = PF_VECTOR;
+               fault.error_code_valid = true;
+               fault.error_code = 0;
+               fault.nested_page_fault = false;
+               fault.address = work->arch.token;
+               kvm_inject_page_fault(vcpu, &fault);
+       }
+       vcpu->arch.apf.halted = false;
+}
+
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
+{
+       if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
+               return true;
+       else
+               return !kvm_event_needs_reinjection(vcpu) &&
+                       kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
index 919ae53..ea2dc1a 100644 (file)
@@ -540,6 +540,7 @@ struct kvm_ppc_pvinfo {
 #endif
 #define KVM_CAP_PPC_GET_PVINFO 57
 #define KVM_CAP_PPC_IRQ_LEVEL 58
+#define KVM_CAP_ASYNC_PF 59
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
index a055742..b5021db 100644 (file)
@@ -16,6 +16,8 @@
 #include <linux/mm.h>
 #include <linux/preempt.h>
 #include <linux/msi.h>
+#include <linux/slab.h>
+#include <linux/rcupdate.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -40,6 +42,7 @@
 #define KVM_REQ_KICK               9
 #define KVM_REQ_DEACTIVATE_FPU    10
 #define KVM_REQ_EVENT             11
+#define KVM_REQ_APF_HALT          12
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID    0
 
@@ -74,6 +77,27 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                              struct kvm_io_device *dev);
 
+#ifdef CONFIG_KVM_ASYNC_PF
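+/*
+ * One in-flight async page fault: tracked on the vcpu's queue list
+ * while outstanding, and linked onto the done list by the workqueue
+ * handler once the page has been brought in.
+ */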
+struct kvm_async_pf {
+       struct work_struct work;
+       struct list_head link;
+       struct list_head queue;
+       struct kvm_vcpu *vcpu;
+       struct mm_struct *mm;
+       gva_t gva;
+       unsigned long addr;
+       struct kvm_arch_async_pf arch;
+       struct page *page;
+       bool done;
+};
+
+void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
+void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
+int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+                      struct kvm_arch_async_pf *arch);
+int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
+#endif
+
 struct kvm_vcpu {
        struct kvm *kvm;
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -104,6 +128,15 @@ struct kvm_vcpu {
        gpa_t mmio_phys_addr;
 #endif
 
+#ifdef CONFIG_KVM_ASYNC_PF
+       struct {
+               u32 queued;
+               struct list_head queue;
+               struct list_head done;
+               spinlock_t lock;
+       } async_pf;
+#endif
+
        struct kvm_vcpu_arch arch;
 };
 
@@ -113,16 +146,19 @@ struct kvm_vcpu {
  */
 #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
 
+struct kvm_lpage_info {
+       unsigned long rmap_pde;
+       int write_count;
+};
+
 struct kvm_memory_slot {
        gfn_t base_gfn;
        unsigned long npages;
        unsigned long flags;
        unsigned long *rmap;
        unsigned long *dirty_bitmap;
-       struct {
-               unsigned long rmap_pde;
-               int write_count;
-       } *lpage_info[KVM_NR_PAGE_SIZES - 1];
+       unsigned long *dirty_bitmap_head;
+       struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
        unsigned long userspace_addr;
        int user_alloc;
        int id;
@@ -169,6 +205,7 @@ struct kvm_irq_routing_table {};
 
 struct kvm_memslots {
        int nmemslots;
+       u64 generation;
        struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
                                        KVM_PRIVATE_MEM_SLOTS];
 };
@@ -206,6 +243,10 @@ struct kvm {
 
        struct mutex irq_lock;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
+       /*
+        * Update side is protected by irq_lock and,
+        * if configured, irqfds.lock.
+        */
        struct kvm_irq_routing_table __rcu *irq_routing;
        struct hlist_head mask_notifier_list;
        struct hlist_head irq_ack_notifier_list;
@@ -216,6 +257,7 @@ struct kvm {
        unsigned long mmu_notifier_seq;
        long mmu_notifier_count;
 #endif
+       long tlbs_dirty;
 };
 
 /* The guest did something we don't support. */
@@ -302,7 +344,11 @@ void kvm_set_page_accessed(struct page *page);
 
 pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
+pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
+                      bool write_fault, bool *writable);
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
+pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
+                     bool *writable);
 pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
                         struct kvm_memory_slot *slot, gfn_t gfn);
 int memslot_id(struct kvm *kvm, gfn_t gfn);
@@ -321,18 +367,25 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
                         int offset, int len);
 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
                    unsigned long len);
+int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+                          void *data, unsigned long len);
+int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+                             gpa_t gpa);
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
 void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
+void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                            gfn_t gfn);
 
 void kvm_vcpu_block(struct kvm_vcpu *vcpu);
 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
 void kvm_resched(struct kvm_vcpu *vcpu);
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
+
 void kvm_flush_remote_tlbs(struct kvm *kvm);
 void kvm_reload_remote_mmus(struct kvm *kvm);
 
@@ -398,7 +451,19 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
 
 void kvm_free_physmem(struct kvm *kvm);
 
-struct  kvm *kvm_arch_create_vm(void);
+#ifndef __KVM_HAVE_ARCH_VM_ALLOC
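+/*
+ * Default VM allocation/freeing; an arch can override these by
+ * defining __KVM_HAVE_ARCH_VM_ALLOC.
+ */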
+static inline struct kvm *kvm_arch_alloc_vm(void)
+{
+       return kzalloc(sizeof(struct kvm), GFP_KERNEL);
+}
+
+static inline void kvm_arch_free_vm(struct kvm *kvm)
+{
+       kfree(kvm);
+}
+#endif
+
+int kvm_arch_init_vm(struct kvm *kvm);
 void kvm_arch_destroy_vm(struct kvm *kvm);
 void kvm_free_all_assigned_devices(struct kvm *kvm);
 void kvm_arch_sync_events(struct kvm *kvm);
@@ -414,16 +479,8 @@ struct kvm_irq_ack_notifier {
        void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
 };
 
-#define KVM_ASSIGNED_MSIX_PENDING              0x1
-struct kvm_guest_msix_entry {
-       u32 vector;
-       u16 entry;
-       u16 flags;
-};
-
 struct kvm_assigned_dev_kernel {
        struct kvm_irq_ack_notifier ack_notifier;
-       struct work_struct interrupt_work;
        struct list_head list;
        int assigned_dev_id;
        int host_segnr;
@@ -434,13 +491,14 @@ struct kvm_assigned_dev_kernel {
        bool host_irq_disabled;
        struct msix_entry *host_msix_entries;
        int guest_irq;
-       struct kvm_guest_msix_entry *guest_msix_entries;
+       struct msix_entry *guest_msix_entries;
        unsigned long irq_requested_type;
        int irq_source_id;
        int flags;
        struct pci_dev *dev;
        struct kvm *kvm;
-       spinlock_t assigned_dev_lock;
+       spinlock_t intx_lock;
+       char irq_name[32];
 };
 
 struct kvm_irq_mask_notifier {
@@ -462,6 +520,8 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
                                   unsigned long *deliver_bitmask);
 #endif
 int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
+               int irq_source_id, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
                                   struct kvm_irq_ack_notifier *kian);
@@ -603,17 +663,28 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
 void kvm_eventfd_init(struct kvm *kvm);
 int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
 void kvm_irqfd_release(struct kvm *kvm);
+void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
 int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
 
 #else
 
 static inline void kvm_eventfd_init(struct kvm *kvm) {}
+
 static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
 {
        return -EINVAL;
 }
 
 static inline void kvm_irqfd_release(struct kvm *kvm) {}
+
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+static inline void kvm_irq_routing_update(struct kvm *kvm,
+                                         struct kvm_irq_routing_table *irq_rt)
+{
+       rcu_assign_pointer(kvm->irq_routing, irq_rt);
+}
+#endif
+
 static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 {
        return -ENOSYS;
index 7ac0d4e..fa7cc72 100644 (file)
@@ -67,4 +67,11 @@ struct kvm_lapic_irq {
        u32 dest_id;
 };
 
+struct gfn_to_hva_cache {
+       u64 generation;
+       gpa_t gpa;
+       unsigned long hva;
+       struct kvm_memory_slot *memslot;
+};
+
 #endif /* __KVM_TYPES_H__ */
index 6dd3a51..46e3cd8 100644 (file)
@@ -6,6 +6,36 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
 
+#define ERSN(x) { KVM_EXIT_##x, "KVM_EXIT_" #x }
+
+#define kvm_trace_exit_reason                                          \
+       ERSN(UNKNOWN), ERSN(EXCEPTION), ERSN(IO), ERSN(HYPERCALL),      \
+       ERSN(DEBUG), ERSN(HLT), ERSN(MMIO), ERSN(IRQ_WINDOW_OPEN),      \
+       ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR),    \
+       ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\
+       ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI)
+
+TRACE_EVENT(kvm_userspace_exit,
+           TP_PROTO(__u32 reason, int errno),
+           TP_ARGS(reason, errno),
+
+       TP_STRUCT__entry(
+               __field(        __u32,          reason          )
+               __field(        int,            errno           )
+       ),
+
+       TP_fast_assign(
+               __entry->reason         = reason;
+               __entry->errno          = errno;
+       ),
+
+       TP_printk("reason %s (%d)",
+                 __entry->errno < 0 ?
+                 (__entry->errno == -EINTR ? "restart" : "error") :
+                 __print_symbolic(__entry->reason, kvm_trace_exit_reason),
+                 __entry->errno < 0 ? -__entry->errno : __entry->reason)
+);
+
 #if defined(__KVM_HAVE_IOAPIC)
 TRACE_EVENT(kvm_set_irq,
        TP_PROTO(unsigned int gsi, int level, int irq_source_id),
@@ -185,6 +215,97 @@ TRACE_EVENT(kvm_age_page,
                  __entry->referenced ? "YOUNG" : "OLD")
 );
 
+#ifdef CONFIG_KVM_ASYNC_PF
+DECLARE_EVENT_CLASS(kvm_async_get_page_class,
+
+       TP_PROTO(u64 gva, u64 gfn),
+
+       TP_ARGS(gva, gfn),
+
+       TP_STRUCT__entry(
+               __field(__u64, gva)
+               __field(u64, gfn)
+       ),
+
+       TP_fast_assign(
+               __entry->gva = gva;
+               __entry->gfn = gfn;
+       ),
+
+       TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn)
+);
+
+DEFINE_EVENT(kvm_async_get_page_class, kvm_try_async_get_page,
+
+       TP_PROTO(u64 gva, u64 gfn),
+
+       TP_ARGS(gva, gfn)
+);
+
+DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_doublefault,
+
+       TP_PROTO(u64 gva, u64 gfn),
+
+       TP_ARGS(gva, gfn)
+);
+
+DECLARE_EVENT_CLASS(kvm_async_pf_nopresent_ready,
+
+       TP_PROTO(u64 token, u64 gva),
+
+       TP_ARGS(token, gva),
+
+       TP_STRUCT__entry(
+               __field(__u64, token)
+               __field(__u64, gva)
+       ),
+
+       TP_fast_assign(
+               __entry->token = token;
+               __entry->gva = gva;
+       ),
+
+       TP_printk("token %#llx gva %#llx", __entry->token, __entry->gva)
+
+);
+
+DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_not_present,
+
+       TP_PROTO(u64 token, u64 gva),
+
+       TP_ARGS(token, gva)
+);
+
+DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_ready,
+
+       TP_PROTO(u64 token, u64 gva),
+
+       TP_ARGS(token, gva)
+);
+
+TRACE_EVENT(kvm_async_pf_completed,
+       TP_PROTO(unsigned long address, struct page *page, u64 gva),
+       TP_ARGS(address, page, gva),
+
+       TP_STRUCT__entry(
+               __field(unsigned long, address)
+               __field(pfn_t, pfn)
+               __field(u64, gva)
+               ),
+
+       TP_fast_assign(
+               __entry->address = address;
+               __entry->pfn = page ? page_to_pfn(page) : 0;
+               __entry->gva = gva;
+               ),
+
+       TP_printk("gva %#llx address %#lx pfn %#llx",  __entry->gva,
+                 __entry->address, __entry->pfn)
+);
+
+#endif
+
 #endif /* _TRACE_KVM_MAIN_H */
 
 /* This part must be outside protection */
index 7f1178f..f63ccb0 100644 (file)
@@ -15,3 +15,6 @@ config KVM_APIC_ARCHITECTURE
 
 config KVM_MMIO
        bool
+
+config KVM_ASYNC_PF
+       bool
index 7c98928..ae72ae6 100644 (file)
@@ -55,58 +55,31 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
        return index;
 }
 
-static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
+static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
 {
-       struct kvm_assigned_dev_kernel *assigned_dev;
-       int i;
+       struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+       u32 vector;
+       int index;
 
-       assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
-                                   interrupt_work);
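+       /* For INTx, mask the (possibly shared) host line until the guest acks. */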
+       if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
+               spin_lock(&assigned_dev->intx_lock);
+               disable_irq_nosync(irq);
+               assigned_dev->host_irq_disabled = true;
+               spin_unlock(&assigned_dev->intx_lock);
+       }
 
-       spin_lock_irq(&assigned_dev->assigned_dev_lock);
        if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
-               struct kvm_guest_msix_entry *guest_entries =
-                       assigned_dev->guest_msix_entries;
-               for (i = 0; i < assigned_dev->entries_nr; i++) {
-                       if (!(guest_entries[i].flags &
-                                       KVM_ASSIGNED_MSIX_PENDING))
-                               continue;
-                       guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
+               index = find_index_from_host_irq(assigned_dev, irq);
+               if (index >= 0) {
+                       vector = assigned_dev->
+                                       guest_msix_entries[index].vector;
                        kvm_set_irq(assigned_dev->kvm,
-                                   assigned_dev->irq_source_id,
-                                   guest_entries[i].vector, 1);
+                                   assigned_dev->irq_source_id, vector, 1);
                }
        } else
                kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
                            assigned_dev->guest_irq, 1);
 
-       spin_unlock_irq(&assigned_dev->assigned_dev_lock);
-}
-
-static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
-{
-       unsigned long flags;
-       struct kvm_assigned_dev_kernel *assigned_dev =
-               (struct kvm_assigned_dev_kernel *) dev_id;
-
-       spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
-       if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
-               int index = find_index_from_host_irq(assigned_dev, irq);
-               if (index < 0)
-                       goto out;
-               assigned_dev->guest_msix_entries[index].flags |=
-                       KVM_ASSIGNED_MSIX_PENDING;
-       }
-
-       schedule_work(&assigned_dev->interrupt_work);
-
-       if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
-               disable_irq_nosync(irq);
-               assigned_dev->host_irq_disabled = true;
-       }
-
-out:
-       spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
        return IRQ_HANDLED;
 }
 
@@ -114,7 +87,6 @@ out:
 static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
        struct kvm_assigned_dev_kernel *dev;
-       unsigned long flags;
 
        if (kian->gsi == -1)
                return;
@@ -127,12 +99,12 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
        /* The guest irq may be shared so this ack may be
         * from another device.
         */
-       spin_lock_irqsave(&dev->assigned_dev_lock, flags);
+       spin_lock(&dev->intx_lock);
        if (dev->host_irq_disabled) {
                enable_irq(dev->host_irq);
                dev->host_irq_disabled = false;
        }
-       spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
+       spin_unlock(&dev->intx_lock);
 }
 
 static void deassign_guest_irq(struct kvm *kvm,
@@ -141,6 +113,9 @@ static void deassign_guest_irq(struct kvm *kvm,
        kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
        assigned_dev->ack_notifier.gsi = -1;
 
+       kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+                   assigned_dev->guest_irq, 0);
+
        if (assigned_dev->irq_source_id != -1)
                kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
        assigned_dev->irq_source_id = -1;
@@ -152,28 +127,19 @@ static void deassign_host_irq(struct kvm *kvm,
                              struct kvm_assigned_dev_kernel *assigned_dev)
 {
        /*
-        * In kvm_free_device_irq, cancel_work_sync return true if:
-        * 1. work is scheduled, and then cancelled.
-        * 2. work callback is executed.
-        *
-        * The first one ensured that the irq is disabled and no more events
-        * would happen. But for the second one, the irq may be enabled (e.g.
-        * for MSI). So we disable irq here to prevent further events.
+        * We disable irq here to prevent further events.
         *
         * Note that this may result in a nested disable if the interrupt type
         * is INTx, but that is OK since we are going to free it anyway.
         *
         * If this function is part of VM destruction, please ensure that the
         * kvm state is still valid by now, since we probably also have to wait
-        * interrupt_work done.
+        * on a currently running IRQ handler.
         */
        if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
                int i;
                for (i = 0; i < assigned_dev->entries_nr; i++)
-                       disable_irq_nosync(assigned_dev->
-                                          host_msix_entries[i].vector);
-
-               cancel_work_sync(&assigned_dev->interrupt_work);
+                       disable_irq(assigned_dev->host_msix_entries[i].vector);
 
                for (i = 0; i < assigned_dev->entries_nr; i++)
                        free_irq(assigned_dev->host_msix_entries[i].vector,
@@ -185,8 +151,7 @@ static void deassign_host_irq(struct kvm *kvm,
                pci_disable_msix(assigned_dev->dev);
        } else {
                /* Deal with MSI and INTx */
-               disable_irq_nosync(assigned_dev->host_irq);
-               cancel_work_sync(&assigned_dev->interrupt_work);
+               disable_irq(assigned_dev->host_irq);
 
                free_irq(assigned_dev->host_irq, (void *)assigned_dev);
 
@@ -232,7 +197,8 @@ static void kvm_free_assigned_device(struct kvm *kvm,
 {
        kvm_free_assigned_irq(kvm, assigned_dev);
 
-       pci_reset_function(assigned_dev->dev);
+       __pci_reset_function(assigned_dev->dev);
+       pci_restore_state(assigned_dev->dev);
 
        pci_release_regions(assigned_dev->dev);
        pci_disable_device(assigned_dev->dev);
@@ -265,8 +231,8 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
         * on the same interrupt line is not a happy situation: there
         * are going to be long delays in accepting, acking, etc.
         */
-       if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
-                       0, "kvm_assigned_intx_device", (void *)dev))
+       if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
+                                IRQF_ONESHOT, dev->irq_name, (void *)dev))
                return -EIO;
        return 0;
 }
@@ -284,8 +250,8 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
        }
 
        dev->host_irq = dev->dev->irq;
-       if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
-                       "kvm_assigned_msi_device", (void *)dev)) {
+       if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
+                                0, dev->irq_name, (void *)dev)) {
                pci_disable_msi(dev->dev);
                return -EIO;
        }
@@ -310,10 +276,9 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
                return r;
 
        for (i = 0; i < dev->entries_nr; i++) {
-               r = request_irq(dev->host_msix_entries[i].vector,
-                               kvm_assigned_dev_intr, 0,
-                               "kvm_assigned_msix_device",
-                               (void *)dev);
+               r = request_threaded_irq(dev->host_msix_entries[i].vector,
+                                        NULL, kvm_assigned_dev_thread,
+                                        0, dev->irq_name, (void *)dev);
                if (r)
                        goto err;
        }
@@ -370,6 +335,9 @@ static int assign_host_irq(struct kvm *kvm,
        if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
                return r;
 
+       snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
+                pci_name(dev->dev));
+
        switch (host_irq_type) {
        case KVM_DEV_IRQ_HOST_INTX:
                r = assigned_device_enable_host_intx(kvm, dev);
@@ -547,6 +515,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
        }
 
        pci_reset_function(dev);
+       pci_save_state(dev);
 
        match->assigned_dev_id = assigned_dev->assigned_dev_id;
        match->host_segnr = assigned_dev->segnr;
@@ -554,12 +523,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
        match->host_devfn = assigned_dev->devfn;
        match->flags = assigned_dev->flags;
        match->dev = dev;
-       spin_lock_init(&match->assigned_dev_lock);
+       spin_lock_init(&match->intx_lock);
        match->irq_source_id = -1;
        match->kvm = kvm;
        match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
-       INIT_WORK(&match->interrupt_work,
-                 kvm_assigned_dev_interrupt_work_handler);
 
        list_add(&match->list, &kvm->arch.assigned_dev_head);
 
@@ -579,6 +546,7 @@ out:
        mutex_unlock(&kvm->lock);
        return r;
 out_list_del:
+       pci_restore_state(dev);
        list_del(&match->list);
        pci_release_regions(dev);
 out_disable:
@@ -651,9 +619,9 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
                        r = -ENOMEM;
                        goto msix_nr_out;
                }
-               adev->guest_msix_entries = kzalloc(
-                               sizeof(struct kvm_guest_msix_entry) *
-                               entry_nr->entry_nr, GFP_KERNEL);
+               adev->guest_msix_entries =
+                       kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
+                               GFP_KERNEL);
                if (!adev->guest_msix_entries) {
                        kfree(adev->host_msix_entries);
                        r = -ENOMEM;
@@ -706,7 +674,7 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
                                  unsigned long arg)
 {
        void __user *argp = (void __user *)arg;
-       int r = -ENOTTY;
+       int r;
 
        switch (ioctl) {
        case KVM_ASSIGN_PCI_DEVICE: {
@@ -724,7 +692,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
                r = -EOPNOTSUPP;
                break;
        }
-#ifdef KVM_CAP_ASSIGN_DEV_IRQ
        case KVM_ASSIGN_DEV_IRQ: {
                struct kvm_assigned_irq assigned_irq;
 
@@ -747,8 +714,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
                        goto out;
                break;
        }
-#endif
-#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
        case KVM_DEASSIGN_PCI_DEVICE: {
                struct kvm_assigned_pci_dev assigned_dev;
 
@@ -760,7 +725,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
                        goto out;
                break;
        }
-#endif
 #ifdef KVM_CAP_IRQ_ROUTING
        case KVM_SET_GSI_ROUTING: {
                struct kvm_irq_routing routing;
@@ -813,6 +777,9 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
                break;
        }
 #endif
+       default:
+               r = -ENOTTY;
+               break;
        }
 out:
        return r;
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
new file mode 100644 (file)
index 0000000..74268b4
--- /dev/null
@@ -0,0 +1,216 @@
+/*
+ * kvm asynchronous fault support
+ *
+ * Copyright 2010 Red Hat, Inc.
+ *
+ * Author:
+ *      Gleb Natapov <gleb@redhat.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/mmu_context.h>
+
+#include "async_pf.h"
+#include <trace/events/kvm.h>
+
+static struct kmem_cache *async_pf_cache;
+
+int kvm_async_pf_init(void)
+{
+       async_pf_cache = KMEM_CACHE(kvm_async_pf, 0);
+
+       if (!async_pf_cache)
+               return -ENOMEM;
+
+       return 0;
+}
+
+void kvm_async_pf_deinit(void)
+{
+       if (async_pf_cache)
+               kmem_cache_destroy(async_pf_cache);
+       async_pf_cache = NULL;
+}
+
+void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu)
+{
+       INIT_LIST_HEAD(&vcpu->async_pf.done);
+       INIT_LIST_HEAD(&vcpu->async_pf.queue);
+       spin_lock_init(&vcpu->async_pf.lock);
+}
+
+static void async_pf_execute(struct work_struct *work)
+{
+       struct page *page = NULL;
+       struct kvm_async_pf *apf =
+               container_of(work, struct kvm_async_pf, work);
+       struct mm_struct *mm = apf->mm;
+       struct kvm_vcpu *vcpu = apf->vcpu;
+       unsigned long addr = apf->addr;
+       gva_t gva = apf->gva;
+
+       might_sleep();
+
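+       /* Fault the page in on the guest's behalf, under its mm. */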
+       use_mm(mm);
+       down_read(&mm->mmap_sem);
+       get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL);
+       up_read(&mm->mmap_sem);
+       unuse_mm(mm);
+
+       spin_lock(&vcpu->async_pf.lock);
+       list_add_tail(&apf->link, &vcpu->async_pf.done);
+       apf->page = page;
+       apf->done = true;
+       spin_unlock(&vcpu->async_pf.lock);
+
+       /*
+        * apf may be freed by kvm_check_async_pf_completion() after
+        * this point
+        */
+
+       trace_kvm_async_pf_completed(addr, page, gva);
+
+       if (waitqueue_active(&vcpu->wq))
+               wake_up_interruptible(&vcpu->wq);
+
+       mmdrop(mm);
+       kvm_put_kvm(vcpu->kvm);
+}
+
+void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
+{
+       /* cancel outstanding work queue items */
+       while (!list_empty(&vcpu->async_pf.queue)) {
+               struct kvm_async_pf *work =
+                       list_entry(vcpu->async_pf.queue.next,
+                                  typeof(*work), queue);
+               cancel_work_sync(&work->work);
+               list_del(&work->queue);
+               if (!work->done) /* work was canceled */
+                       kmem_cache_free(async_pf_cache, work);
+       }
+
+       spin_lock(&vcpu->async_pf.lock);
+       while (!list_empty(&vcpu->async_pf.done)) {
+               struct kvm_async_pf *work =
+                       list_entry(vcpu->async_pf.done.next,
+                                  typeof(*work), link);
+               list_del(&work->link);
+               if (work->page)
+                       put_page(work->page);
+               kmem_cache_free(async_pf_cache, work);
+       }
+       spin_unlock(&vcpu->async_pf.lock);
+
+       vcpu->async_pf.queued = 0;
+}
+
+void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
+{
+       struct kvm_async_pf *work;
+
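+       /* Drain completed faults to the arch code while injection is allowed. */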
+       while (!list_empty_careful(&vcpu->async_pf.done) &&
+             kvm_arch_can_inject_async_page_present(vcpu)) {
+               spin_lock(&vcpu->async_pf.lock);
+               work = list_first_entry(&vcpu->async_pf.done, typeof(*work),
+                                             link);
+               list_del(&work->link);
+               spin_unlock(&vcpu->async_pf.lock);
+
+               if (work->page)
+                       kvm_arch_async_page_ready(vcpu, work);
+               kvm_arch_async_page_present(vcpu, work);
+
+               list_del(&work->queue);
+               vcpu->async_pf.queued--;
+               if (work->page)
+                       put_page(work->page);
+               kmem_cache_free(async_pf_cache, work);
+       }
+}
+
+int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+                      struct kvm_arch_async_pf *arch)
+{
+       struct kvm_async_pf *work;
+
+       if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU)
+               return 0;
+
+       /* setup delayed work */
+
+       /*
+        * Allocate with GFP_NOWAIT: if we would have to sleep here
+        * anyway, we may as well sleep faulting the page in instead.
+        */
+       work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT);
+       if (!work)
+               return 0;
+
+       work->page = NULL;
+       work->done = false;
+       work->vcpu = vcpu;
+       work->gva = gva;
+       work->addr = gfn_to_hva(vcpu->kvm, gfn);
+       work->arch = *arch;
+       work->mm = current->mm;
+       atomic_inc(&work->mm->mm_count);
+       kvm_get_kvm(work->vcpu->kvm);
+
+       /*
+        * This can't really happen, otherwise gfn_to_pfn_async
+        * would succeed.
+        */
+       if (unlikely(kvm_is_error_hva(work->addr)))
+               goto retry_sync;
+
+       INIT_WORK(&work->work, async_pf_execute);
+       if (!schedule_work(&work->work))
+               goto retry_sync;
+
+       list_add_tail(&work->queue, &vcpu->async_pf.queue);
+       vcpu->async_pf.queued++;
+       kvm_arch_async_page_not_present(vcpu, work);
+       return 1;
+retry_sync:
+       kvm_put_kvm(work->vcpu->kvm);
+       mmdrop(work->mm);
+       kmem_cache_free(async_pf_cache, work);
+       return 0;
+}
+
+int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
+{
+       struct kvm_async_pf *work;
+
+       if (!list_empty_careful(&vcpu->async_pf.done))
+               return 0;
+
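+       /*
+        * Queue a completion backed by bad_page; the arch side treats
+        * an error page as a broadcast wakeup (token ~0).
+        */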
+       work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC);
+       if (!work)
+               return -ENOMEM;
+
+       work->page = bad_page;
+       get_page(bad_page);
+       INIT_LIST_HEAD(&work->queue); /* for list_del to work */
+
+       spin_lock(&vcpu->async_pf.lock);
+       list_add_tail(&work->link, &vcpu->async_pf.done);
+       spin_unlock(&vcpu->async_pf.lock);
+
+       vcpu->async_pf.queued++;
+       return 0;
+}
diff --git a/virt/kvm/async_pf.h b/virt/kvm/async_pf.h
new file mode 100644 (file)
index 0000000..e7ef644
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * kvm asynchronous fault support
+ *
+ * Copyright 2010 Red Hat, Inc.
+ *
+ * Author:
+ *      Gleb Natapov <gleb@redhat.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __KVM_ASYNC_PF_H__
+#define __KVM_ASYNC_PF_H__
+
+#ifdef CONFIG_KVM_ASYNC_PF
+int kvm_async_pf_init(void);
+void kvm_async_pf_deinit(void);
+void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu);
+#else
+#define kvm_async_pf_init() (0)
+#define kvm_async_pf_deinit() do { } while (0)
+#define kvm_async_pf_vcpu_init(C) do { } while (0)
+#endif
+
+#endif
index c1f1e3c..2ca4535 100644 (file)
  */
 
 struct _irqfd {
-       struct kvm               *kvm;
-       struct eventfd_ctx       *eventfd;
-       int                       gsi;
-       struct list_head          list;
-       poll_table                pt;
-       wait_queue_t              wait;
-       struct work_struct        inject;
-       struct work_struct        shutdown;
+       /* Used for MSI fast-path */
+       struct kvm *kvm;
+       wait_queue_t wait;
+       /* Update side is protected by irqfds.lock */
+       struct kvm_kernel_irq_routing_entry __rcu *irq_entry;
+       /* Used for level IRQ fast-path */
+       int gsi;
+       struct work_struct inject;
+       /* Used for setup/shutdown */
+       struct eventfd_ctx *eventfd;
+       struct list_head list;
+       poll_table pt;
+       struct work_struct shutdown;
 };
 
 static struct workqueue_struct *irqfd_cleanup_wq;
@@ -125,14 +130,22 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
 {
        struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
        unsigned long flags = (unsigned long)key;
+       struct kvm_kernel_irq_routing_entry *irq;
+       struct kvm *kvm = irqfd->kvm;
 
-       if (flags & POLLIN)
+       if (flags & POLLIN) {
+               rcu_read_lock();
+               irq = rcu_dereference(irqfd->irq_entry);
                /* An event has been signaled, inject an interrupt */
-               schedule_work(&irqfd->inject);
+               if (irq)
+                       kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1);
+               else
+                       schedule_work(&irqfd->inject);
+               rcu_read_unlock();
+       }
 
        if (flags & POLLHUP) {
                /* The eventfd is closing, detach from KVM */
-               struct kvm *kvm = irqfd->kvm;
                unsigned long flags;
 
                spin_lock_irqsave(&kvm->irqfds.lock, flags);
@@ -163,9 +176,31 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
        add_wait_queue(wqh, &irqfd->wait);
 }
 
+/* Must be called under irqfds.lock */
+static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd,
+                        struct kvm_irq_routing_table *irq_rt)
+{
+       struct kvm_kernel_irq_routing_entry *e;
+       struct hlist_node *n;
+
+       if (irqfd->gsi >= irq_rt->nr_rt_entries) {
+               rcu_assign_pointer(irqfd->irq_entry, NULL);
+               return;
+       }
+
+       hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) {
+               /* Only MSI is fast-pathed; anything else clears the
+                * entry and falls back to the injection workqueue. */
+               if (e->type == KVM_IRQ_ROUTING_MSI)
+                       rcu_assign_pointer(irqfd->irq_entry, e);
+               else
+                       rcu_assign_pointer(irqfd->irq_entry, NULL);
+       }
+}
+
 static int
 kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
 {
+       struct kvm_irq_routing_table *irq_rt;
        struct _irqfd *irqfd, *tmp;
        struct file *file = NULL;
        struct eventfd_ctx *eventfd = NULL;
@@ -215,6 +250,10 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
                goto fail;
        }
 
+       irq_rt = rcu_dereference_protected(kvm->irq_routing,
+                                          lockdep_is_held(&kvm->irqfds.lock));
+       irqfd_update(kvm, irqfd, irq_rt);
+
        events = file->f_op->poll(file, &irqfd->pt);
 
        list_add_tail(&irqfd->list, &kvm->irqfds.items);
@@ -271,8 +310,17 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
        spin_lock_irq(&kvm->irqfds.lock);
 
        list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
-               if (irqfd->eventfd == eventfd && irqfd->gsi == gsi)
+               if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
+                       /*
+                        * This rcu_assign_pointer is needed for when
+                        * another thread calls kvm_irq_routing_update()
+                        * before we flush the workqueue below. It is
+                        * paired with the synchronize_rcu() done by that
+                        * function's caller.
+                        */
+                       rcu_assign_pointer(irqfd->irq_entry, NULL);
                        irqfd_deactivate(irqfd);
+               }
        }
 
        spin_unlock_irq(&kvm->irqfds.lock);
@@ -321,6 +369,25 @@ kvm_irqfd_release(struct kvm *kvm)
 
 }
 
+/*
+ * Change irq_routing and update every irqfd's cached routing entry.
+ * Callers must invoke synchronize_rcu() afterwards, before freeing the
+ * old routing table.
+ */
+void kvm_irq_routing_update(struct kvm *kvm,
+                           struct kvm_irq_routing_table *irq_rt)
+{
+       struct _irqfd *irqfd;
+
+       spin_lock_irq(&kvm->irqfds.lock);
+
+       rcu_assign_pointer(kvm->irq_routing, irq_rt);
+
+       list_for_each_entry(irqfd, &kvm->irqfds.items, list)
+               irqfd_update(kvm, irqfd, irq_rt);
+
+       spin_unlock_irq(&kvm->irqfds.lock);
+}
+
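Condensed, the caller-side contract reads as follows (it is exactly what
the irq_comm.c hunk below implements):

	mutex_lock(&kvm->irq_lock);
	old = kvm->irq_routing;
	kvm_irq_routing_update(kvm, new);	/* publish under irqfds.lock */
	mutex_unlock(&kvm->irq_lock);
	synchronize_rcu();			/* wait out irqfd_wakeup() readers */
	kfree(old);				/* only now is the old table unused */
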
 /*
  * create a host-wide workqueue for issuing deferred shutdown requests
  * aggregated from all vm* instances. We need our own isolated single-thread
index 8edca91..9f614b4 100644 (file)
@@ -114,8 +114,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
        return r;
 }
 
-static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
-                      struct kvm *kvm, int irq_source_id, int level)
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+               struct kvm *kvm, int irq_source_id, int level)
 {
        struct kvm_lapic_irq irq;
 
@@ -409,8 +409,9 @@ int kvm_set_irq_routing(struct kvm *kvm,
 
        mutex_lock(&kvm->irq_lock);
        old = kvm->irq_routing;
-       rcu_assign_pointer(kvm->irq_routing, new);
+       kvm_irq_routing_update(kvm, new);
        mutex_unlock(&kvm->irq_lock);
+
        synchronize_rcu();
 
        new = old;
index 5225052..7f68625 100644 (file)
@@ -55,6 +55,7 @@
 #include <asm-generic/bitops/le.h>
 
 #include "coalesced_mmio.h"
+#include "async_pf.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
@@ -89,7 +90,8 @@ static void hardware_disable_all(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 
-static bool kvm_rebooting;
+bool kvm_rebooting;
+EXPORT_SYMBOL_GPL(kvm_rebooting);
 
 static bool largepages_enabled = true;
 
@@ -167,8 +169,12 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
+       int dirty_count = kvm->tlbs_dirty;
+
+       smp_mb();
        if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
                ++kvm->stat.remote_tlb_flush;
+       cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
 }
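
The cmpxchg() only clears the counter if it still holds the sampled value,
so dirtiness added while the flush IPIs were in flight is kept for the next
flush rather than lost. A hedged sketch of the producer side (in this
series it lives in the shadow paging code, not this file; zap_spte() is an
illustrative name):

	zap_spte(kvm, sptep);	/* remote TLBs may now hold a stale entry */
	kvm->tlbs_dirty++;	/* record it; flushed or folded in later */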
 
 void kvm_reload_remote_mmus(struct kvm *kvm)
@@ -186,6 +192,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
        vcpu->kvm = kvm;
        vcpu->vcpu_id = id;
        init_waitqueue_head(&vcpu->wq);
+       kvm_async_pf_vcpu_init(vcpu);
 
        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
        if (!page) {
@@ -247,7 +254,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
        idx = srcu_read_lock(&kvm->srcu);
        spin_lock(&kvm->mmu_lock);
        kvm->mmu_notifier_seq++;
-       need_tlb_flush = kvm_unmap_hva(kvm, address);
+       need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
        spin_unlock(&kvm->mmu_lock);
        srcu_read_unlock(&kvm->srcu, idx);
 
@@ -291,6 +298,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
        kvm->mmu_notifier_count++;
        for (; start < end; start += PAGE_SIZE)
                need_tlb_flush |= kvm_unmap_hva(kvm, start);
+       need_tlb_flush |= kvm->tlbs_dirty;
        spin_unlock(&kvm->mmu_lock);
        srcu_read_unlock(&kvm->srcu, idx);
 
@@ -381,11 +389,15 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
 
 static struct kvm *kvm_create_vm(void)
 {
-       int r = 0, i;
-       struct kvm *kvm = kvm_arch_create_vm();
+       int r, i;
+       struct kvm *kvm = kvm_arch_alloc_vm();
 
-       if (IS_ERR(kvm))
-               goto out;
+       if (!kvm)
+               return ERR_PTR(-ENOMEM);
+
+       r = kvm_arch_init_vm(kvm);
+       if (r)
+               goto out_err_nodisable;
 
        r = hardware_enable_all();
        if (r)
@@ -399,23 +411,19 @@ static struct kvm *kvm_create_vm(void)
        r = -ENOMEM;
        kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
        if (!kvm->memslots)
-               goto out_err;
+               goto out_err_nosrcu;
        if (init_srcu_struct(&kvm->srcu))
-               goto out_err;
+               goto out_err_nosrcu;
        for (i = 0; i < KVM_NR_BUSES; i++) {
                kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
                                        GFP_KERNEL);
-               if (!kvm->buses[i]) {
-                       cleanup_srcu_struct(&kvm->srcu);
+               if (!kvm->buses[i])
                        goto out_err;
-               }
        }
 
        r = kvm_init_mmu_notifier(kvm);
-       if (r) {
-               cleanup_srcu_struct(&kvm->srcu);
+       if (r)
                goto out_err;
-       }
 
        kvm->mm = current->mm;
        atomic_inc(&kvm->mm->mm_count);
@@ -429,19 +437,35 @@ static struct kvm *kvm_create_vm(void)
        spin_lock(&kvm_lock);
        list_add(&kvm->vm_list, &vm_list);
        spin_unlock(&kvm_lock);
-out:
+
        return kvm;
 
 out_err:
+       cleanup_srcu_struct(&kvm->srcu);
+out_err_nosrcu:
        hardware_disable_all();
 out_err_nodisable:
        for (i = 0; i < KVM_NR_BUSES; i++)
                kfree(kvm->buses[i]);
        kfree(kvm->memslots);
-       kfree(kvm);
+       kvm_arch_free_vm(kvm);
        return ERR_PTR(r);
 }
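
For context, a hedged sketch of the three hooks this refactor introduces
(the trivial alloc/free defaults are assumed to be static inlines in the
common header; an architecture supplies kvm_arch_init_vm()):

	static inline struct kvm *kvm_arch_alloc_vm(void)
	{
		return kzalloc(sizeof(struct kvm), GFP_KERNEL);
	}

	static inline void kvm_arch_free_vm(struct kvm *kvm)
	{
		kfree(kvm);
	}

	int kvm_arch_init_vm(struct kvm *kvm)
	{
		/* arch-private lists, locks, ... */
		return 0;
	}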
 
+static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
+{
+       if (!memslot->dirty_bitmap)
+               return;
+
+       if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
+               vfree(memslot->dirty_bitmap_head);
+       else
+               kfree(memslot->dirty_bitmap_head);
+
+       memslot->dirty_bitmap = NULL;
+       memslot->dirty_bitmap_head = NULL;
+}
+
 /*
  * Free any memory in @free but not in @dont.
  */
@@ -454,7 +478,7 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
                vfree(free->rmap);
 
        if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
-               vfree(free->dirty_bitmap);
+               kvm_destroy_dirty_bitmap(free);
 
 
        for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
@@ -465,7 +489,6 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
        }
 
        free->npages = 0;
-       free->dirty_bitmap = NULL;
        free->rmap = NULL;
 }
 
@@ -499,6 +522,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
        kvm_arch_flush_shadow(kvm);
 #endif
        kvm_arch_destroy_vm(kvm);
+       kvm_free_physmem(kvm);
+       cleanup_srcu_struct(&kvm->srcu);
+       kvm_arch_free_vm(kvm);
        hardware_disable_all();
        mmdrop(mm);
 }
@@ -527,6 +553,27 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
        return 0;
 }
 
+/*
+ * Allocation size is twice as large as the actual dirty bitmap size.
+ * This makes it possible to do double buffering: see x86's
+ * kvm_vm_ioctl_get_dirty_log().
+ */
+static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
+{
+       unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
+
+       if (dirty_bytes > PAGE_SIZE)
+               memslot->dirty_bitmap = vzalloc(dirty_bytes);
+       else
+               memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
+
+       if (!memslot->dirty_bitmap)
+               return -ENOMEM;
+
+       memslot->dirty_bitmap_head = memslot->dirty_bitmap;
+       return 0;
+}
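
A hedged sketch of the buffer flip this double-sized allocation enables,
modeled loosely on the x86 GET_DIRTY_LOG path (simplified, not verbatim):

	unsigned long n = kvm_dirty_bitmap_bytes(memslot);
	unsigned long *next = memslot->dirty_bitmap_head;

	if (memslot->dirty_bitmap == next)	/* first half live? */
		next += n / sizeof(long);	/* switch to the second */
	memset(next, 0, n);
	/* publish 'next' as the live bitmap, then copy the retired half
	   to userspace without stalling dirty logging */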
+
 /*
  * Allocate some memory and give it an address in the guest physical address
  * space.
@@ -604,13 +651,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
        /* Allocate if a slot is being created */
 #ifndef CONFIG_S390
        if (npages && !new.rmap) {
-               new.rmap = vmalloc(npages * sizeof(*new.rmap));
+               new.rmap = vzalloc(npages * sizeof(*new.rmap));
 
                if (!new.rmap)
                        goto out_free;
 
-               memset(new.rmap, 0, npages * sizeof(*new.rmap));
-
                new.user_alloc = user_alloc;
                new.userspace_addr = mem->userspace_addr;
        }
@@ -633,14 +678,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
                             >> KVM_HPAGE_GFN_SHIFT(level));
                lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
 
-               new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i]));
+               new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
 
                if (!new.lpage_info[i])
                        goto out_free;
 
-               memset(new.lpage_info[i], 0,
-                      lpages * sizeof(*new.lpage_info[i]));
-
                if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
                        new.lpage_info[i][0].write_count = 1;
                if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
@@ -661,12 +703,8 @@ skip_lpage:
 
        /* Allocate page dirty bitmap if needed */
        if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
-               unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new);
-
-               new.dirty_bitmap = vmalloc(dirty_bytes);
-               if (!new.dirty_bitmap)
+               if (kvm_create_dirty_bitmap(&new) < 0)
                        goto out_free;
-               memset(new.dirty_bitmap, 0, dirty_bytes);
                /* destroy any largepage mappings for dirty tracking */
                if (old.npages)
                        flush_shadow = 1;
@@ -685,6 +723,7 @@ skip_lpage:
                memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
                if (mem->slot >= slots->nmemslots)
                        slots->nmemslots = mem->slot + 1;
+               slots->generation++;
                slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
 
                old_memslots = kvm->memslots;
@@ -719,6 +758,7 @@ skip_lpage:
        memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
        if (mem->slot >= slots->nmemslots)
                slots->nmemslots = mem->slot + 1;
+       slots->generation++;
 
        /* actual memory is freed via old in kvm_free_physmem_slot below */
        if (!npages) {
@@ -849,10 +889,10 @@ int kvm_is_error_hva(unsigned long addr)
 }
 EXPORT_SYMBOL_GPL(kvm_is_error_hva);
 
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
+                                               gfn_t gfn)
 {
        int i;
-       struct kvm_memslots *slots = kvm_memslots(kvm);
 
        for (i = 0; i < slots->nmemslots; ++i) {
                struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -863,6 +903,11 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
        }
        return NULL;
 }
+
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+{
+       return __gfn_to_memslot(kvm_memslots(kvm), gfn);
+}
 EXPORT_SYMBOL_GPL(gfn_to_memslot);
 
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
@@ -925,12 +970,9 @@ int memslot_id(struct kvm *kvm, gfn_t gfn)
        return memslot - slots->memslots;
 }
 
-static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn,
+static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
                                     gfn_t *nr_pages)
 {
-       struct kvm_memory_slot *slot;
-
-       slot = gfn_to_memslot(kvm, gfn);
        if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
                return bad_hva();
 
@@ -942,28 +984,61 @@ static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn,
 
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
-       return gfn_to_hva_many(kvm, gfn, NULL);
+       return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
+static pfn_t get_fault_pfn(void)
+{
+       get_page(fault_page);
+       return fault_pfn;
+}
+
+static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
+                       bool *async, bool write_fault, bool *writable)
 {
        struct page *page[1];
-       int npages;
+       int npages = 0;
        pfn_t pfn;
 
-       if (atomic)
+       /* we can do it either atomically or asynchronously, not both */
+       BUG_ON(atomic && async);
+
+       BUG_ON(!write_fault && !writable);
+
+       if (writable)
+               *writable = true;
+
+       if (atomic || async)
                npages = __get_user_pages_fast(addr, 1, 1, page);
-       else {
+
+       if (unlikely(npages != 1) && !atomic) {
                might_sleep();
-               npages = get_user_pages_fast(addr, 1, 1, page);
+
+               if (writable)
+                       *writable = write_fault;
+
+               npages = get_user_pages_fast(addr, 1, write_fault, page);
+
+               /* map read fault as writable if possible */
+               if (unlikely(!write_fault) && npages == 1) {
+                       struct page *wpage[1];
+
+                       npages = __get_user_pages_fast(addr, 1, 1, wpage);
+                       if (npages == 1) {
+                               *writable = true;
+                               put_page(page[0]);
+                               page[0] = wpage[0];
+                       }
+                       npages = 1;
+               }
        }
 
        if (unlikely(npages != 1)) {
                struct vm_area_struct *vma;
 
                if (atomic)
-                       goto return_fault_page;
+                       return get_fault_pfn();
 
                down_read(&current->mm->mmap_sem);
                if (is_hwpoison_address(addr)) {
@@ -972,19 +1047,20 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
                        return page_to_pfn(hwpoison_page);
                }
 
-               vma = find_vma(current->mm, addr);
-
-               if (vma == NULL || addr < vma->vm_start ||
-                   !(vma->vm_flags & VM_PFNMAP)) {
-                       up_read(&current->mm->mmap_sem);
-return_fault_page:
-                       get_page(fault_page);
-                       return page_to_pfn(fault_page);
+               vma = find_vma_intersection(current->mm, addr, addr+1);
+
+               if (vma == NULL)
+                       pfn = get_fault_pfn();
+               else if ((vma->vm_flags & VM_PFNMAP)) {
+                       pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+                               vma->vm_pgoff;
+                       BUG_ON(!kvm_is_mmio_pfn(pfn));
+               } else {
+                       if (async && (vma->vm_flags & VM_WRITE))
+                               *async = true;
+                       pfn = get_fault_pfn();
                }
-
-               pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
                up_read(&current->mm->mmap_sem);
-               BUG_ON(!kvm_is_mmio_pfn(pfn));
        } else
                pfn = page_to_pfn(page[0]);
 
@@ -993,40 +1069,58 @@ return_fault_page:
 
 pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
 {
-       return hva_to_pfn(kvm, addr, true);
+       return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
 }
 EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
 
-static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic)
+static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
+                         bool write_fault, bool *writable)
 {
        unsigned long addr;
 
+       if (async)
+               *async = false;
+
        addr = gfn_to_hva(kvm, gfn);
        if (kvm_is_error_hva(addr)) {
                get_page(bad_page);
                return page_to_pfn(bad_page);
        }
 
-       return hva_to_pfn(kvm, addr, atomic);
+       return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
 }
 
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
 {
-       return __gfn_to_pfn(kvm, gfn, true);
+       return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
 
+pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
+                      bool write_fault, bool *writable)
+{
+       return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
+
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 {
-       return __gfn_to_pfn(kvm, gfn, false);
+       return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn);
 
+pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
+                     bool *writable)
+{
+       return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
+
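A hedged usage sketch for the new writable out-parameter (illustrative
caller, not from this patch): a read fault can learn that the host mapping
came back writable and map the spte writable up front, saving a later
write fault:

	bool writable;
	pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, false /* read fault */,
				    &writable);
	if (!is_error_pfn(pfn) && writable)
		/* make the spte writable immediately */ ;
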
 pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
                         struct kvm_memory_slot *slot, gfn_t gfn)
 {
        unsigned long addr = gfn_to_hva_memslot(slot, gfn);
-       return hva_to_pfn(kvm, addr, false);
+       return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
 }
 
 int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
@@ -1035,7 +1129,7 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
        unsigned long addr;
        gfn_t entry;
 
-       addr = gfn_to_hva_many(kvm, gfn, &entry);
+       addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
        if (kvm_is_error_hva(addr))
                return -1;
 
@@ -1219,9 +1313,51 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
        return 0;
 }
 
+int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+                             gpa_t gpa)
+{
+       struct kvm_memslots *slots = kvm_memslots(kvm);
+       int offset = offset_in_page(gpa);
+       gfn_t gfn = gpa >> PAGE_SHIFT;
+
+       ghc->gpa = gpa;
+       ghc->generation = slots->generation;
+       ghc->memslot = __gfn_to_memslot(slots, gfn);
+       ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
+       if (!kvm_is_error_hva(ghc->hva))
+               ghc->hva += offset;
+       else
+               return -EFAULT;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
+
+int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+                          void *data, unsigned long len)
+{
+       struct kvm_memslots *slots = kvm_memslots(kvm);
+       int r;
+
+       if (slots->generation != ghc->generation)
+               kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
+
+       if (kvm_is_error_hva(ghc->hva))
+               return -EFAULT;
+
+       r = copy_to_user((void __user *)ghc->hva, data, len);
+       if (r)
+               return -EFAULT;
+       mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
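
A hedged usage sketch (the u64 payload is illustrative): initialize the
cache once against a fixed gpa, then write through it on hot paths; the
generation check above revalidates the cached hva transparently after
memslot changes:

	struct gfn_to_hva_cache ghc;
	u64 val;				/* illustrative payload */

	if (kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa))
		return -EFAULT;			/* gpa not in any memslot */
	/* ... later, on a hot path ... */
	kvm_write_guest_cached(kvm, &ghc, &val, sizeof(val));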
+
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
 {
-       return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
+       return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
+                                   offset, len);
 }
 EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
 
@@ -1244,11 +1380,9 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
 }
 EXPORT_SYMBOL_GPL(kvm_clear_guest);
 
-void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
+void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                            gfn_t gfn)
 {
-       struct kvm_memory_slot *memslot;
-
-       memslot = gfn_to_memslot(kvm, gfn);
        if (memslot && memslot->dirty_bitmap) {
                unsigned long rel_gfn = gfn - memslot->base_gfn;
 
@@ -1256,6 +1390,14 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
        }
 }
 
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
+{
+       struct kvm_memory_slot *memslot;
+
+       memslot = gfn_to_memslot(kvm, gfn);
+       mark_page_dirty_in_slot(kvm, memslot, gfn);
+}
+
 /*
  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
  */
@@ -1457,6 +1599,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
                if (arg)
                        goto out;
                r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
+               trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
                break;
        case KVM_GET_REGS: {
                struct kvm_regs *kvm_regs;
@@ -1824,7 +1967,7 @@ static struct file_operations kvm_vm_fops = {
 
 static int kvm_dev_ioctl_create_vm(void)
 {
-       int fd, r;
+       int r;
        struct kvm *kvm;
 
        kvm = kvm_create_vm();
@@ -1837,11 +1980,11 @@ static int kvm_dev_ioctl_create_vm(void)
                return r;
        }
 #endif
-       fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
-       if (fd < 0)
+       r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
+       if (r < 0)
                kvm_put_kvm(kvm);
 
-       return fd;
+       return r;
 }
 
 static long kvm_dev_ioctl_check_extension_generic(long arg)
@@ -1922,7 +2065,7 @@ static struct miscdevice kvm_dev = {
        &kvm_chardev_ops,
 };
 
-static void hardware_enable(void *junk)
+static void hardware_enable_nolock(void *junk)
 {
        int cpu = raw_smp_processor_id();
        int r;
@@ -1942,7 +2085,14 @@ static void hardware_enable(void *junk)
        }
 }
 
-static void hardware_disable(void *junk)
+static void hardware_enable(void *junk)
+{
+       spin_lock(&kvm_lock);
+       hardware_enable_nolock(junk);
+       spin_unlock(&kvm_lock);
+}
+
+static void hardware_disable_nolock(void *junk)
 {
        int cpu = raw_smp_processor_id();
 
@@ -1952,13 +2102,20 @@ static void hardware_disable(void *junk)
        kvm_arch_hardware_disable(NULL);
 }
 
+static void hardware_disable(void *junk)
+{
+       spin_lock(&kvm_lock);
+       hardware_disable_nolock(junk);
+       spin_unlock(&kvm_lock);
+}
+
 static void hardware_disable_all_nolock(void)
 {
        BUG_ON(!kvm_usage_count);
 
        kvm_usage_count--;
        if (!kvm_usage_count)
-               on_each_cpu(hardware_disable, NULL, 1);
+               on_each_cpu(hardware_disable_nolock, NULL, 1);
 }
 
 static void hardware_disable_all(void)
@@ -1977,7 +2134,7 @@ static int hardware_enable_all(void)
        kvm_usage_count++;
        if (kvm_usage_count == 1) {
                atomic_set(&hardware_enable_failed, 0);
-               on_each_cpu(hardware_enable, NULL, 1);
+               on_each_cpu(hardware_enable_nolock, NULL, 1);
 
                if (atomic_read(&hardware_enable_failed)) {
                        hardware_disable_all_nolock();
@@ -2008,27 +2165,19 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
        case CPU_STARTING:
                printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
                       cpu);
-               spin_lock(&kvm_lock);
                hardware_enable(NULL);
-               spin_unlock(&kvm_lock);
                break;
        }
        return NOTIFY_OK;
 }
 
 
-asmlinkage void kvm_handle_fault_on_reboot(void)
+asmlinkage void kvm_spurious_fault(void)
 {
-       if (kvm_rebooting) {
-               /* spin while reset goes on */
-               local_irq_enable();
-               while (true)
-                       cpu_relax();
-       }
        /* Fault while not rebooting.  We want the trace. */
        BUG();
 }
-EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);
+EXPORT_SYMBOL_GPL(kvm_spurious_fault);
 
 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
                      void *v)
@@ -2041,7 +2190,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
         */
        printk(KERN_INFO "kvm: exiting hardware virtualization\n");
        kvm_rebooting = true;
-       on_each_cpu(hardware_disable, NULL, 1);
+       on_each_cpu(hardware_disable_nolock, NULL, 1);
        return NOTIFY_OK;
 }
 
@@ -2211,7 +2360,7 @@ static void kvm_exit_debug(void)
 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
 {
        if (kvm_usage_count)
-               hardware_disable(NULL);
+               hardware_disable_nolock(NULL);
        return 0;
 }
 
@@ -2219,7 +2368,7 @@ static int kvm_resume(struct sys_device *dev)
 {
        if (kvm_usage_count) {
                WARN_ON(spin_is_locked(&kvm_lock));
-               hardware_enable(NULL);
+               hardware_enable_nolock(NULL);
        }
        return 0;
 }
@@ -2336,6 +2485,10 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
                goto out_free_5;
        }
 
+       r = kvm_async_pf_init();
+       if (r)
+               goto out_free;
+
        kvm_chardev_ops.owner = module;
        kvm_vm_fops.owner = module;
        kvm_vcpu_fops.owner = module;
@@ -2343,7 +2496,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
        r = misc_register(&kvm_dev);
        if (r) {
                printk(KERN_ERR "kvm: misc device register failed\n");
-               goto out_free;
+               goto out_unreg;
        }
 
        kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -2353,6 +2506,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 
        return 0;
 
+out_unreg:
+       kvm_async_pf_deinit();
 out_free:
        kmem_cache_destroy(kvm_vcpu_cache);
 out_free_5:
@@ -2385,11 +2540,12 @@ void kvm_exit(void)
        kvm_exit_debug();
        misc_deregister(&kvm_dev);
        kmem_cache_destroy(kvm_vcpu_cache);
+       kvm_async_pf_deinit();
        sysdev_unregister(&kvm_sysdev);
        sysdev_class_unregister(&kvm_sysdev_class);
        unregister_reboot_notifier(&kvm_reboot_notifier);
        unregister_cpu_notifier(&kvm_cpu_notifier);
-       on_each_cpu(hardware_disable, NULL, 1);
+       on_each_cpu(hardware_disable_nolock, NULL, 1);
        kvm_arch_hardware_unsetup();
        kvm_arch_exit();
        free_cpumask_var(cpus_hardware_enabled);