Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm
author Linus Torvalds <torvalds@woody.linux-foundation.org>
Tue, 17 Jul 2007 18:50:26 +0000 (11:50 -0700)
committer Linus Torvalds <torvalds@woody.linux-foundation.org>
Tue, 17 Jul 2007 18:50:26 +0000 (11:50 -0700)
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (80 commits)
  KVM: Use CPU_DYING for disabling virtualization
  KVM: Tune hotplug/suspend IPIs
  KVM: Keep track of which cpus have virtualization enabled
  SMP: Allow smp_call_function_single() to current cpu
  i386: Allow smp_call_function_single() to current cpu
  x86_64: Allow smp_call_function_single() to current cpu
  HOTPLUG: Adapt thermal throttle to CPU_DYING
  HOTPLUG: Adapt cpuset hotplug callback to CPU_DYING
  HOTPLUG: Add CPU_DYING notifier
  KVM: Clean up #includes
  KVM: Remove kvmfs in favor of the anonymous inodes source
  KVM: SVM: Reliably detect if SVM was disabled by BIOS
  KVM: VMX: Remove unnecessary code in vmx_tlb_flush()
  KVM: MMU: Fix Wrong tlb flush order
  KVM: VMX: Reinitialize the real-mode tss when entering real mode
  KVM: Avoid useless memory write when possible
  KVM: Fix x86 emulator writeback
  KVM: Add support for in-kernel pio handlers
  KVM: VMX: Fix interrupt checking on lightweight exit
  KVM: Adds support for in-kernel mmio handlers
  ...

18 files changed:
arch/i386/kernel/cpu/mcheck/therm_throt.c
arch/i386/kernel/smpcommon.c
arch/x86_64/kernel/smp.c
drivers/kvm/Kconfig
drivers/kvm/kvm.h
drivers/kvm/kvm_main.c
drivers/kvm/mmu.c
drivers/kvm/paging_tmpl.h
drivers/kvm/svm.c
drivers/kvm/svm.h
drivers/kvm/vmx.c
drivers/kvm/x86_emulate.c
fs/anon_inodes.c
include/linux/magic.h
include/linux/notifier.h
include/linux/smp.h
kernel/cpu.c
kernel/cpuset.c

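diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c
index 7ba7c3a..1203dc5 100644
--- a/arch/i386/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c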
@@ -134,19 +134,21 @@ static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb,
        int err;
 
        sys_dev = get_cpu_sysdev(cpu);
-       mutex_lock(&therm_cpu_lock);
        switch (action) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
+               mutex_lock(&therm_cpu_lock);
                err = thermal_throttle_add_dev(sys_dev);
+               mutex_unlock(&therm_cpu_lock);
                WARN_ON(err);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
+               mutex_lock(&therm_cpu_lock);
                thermal_throttle_remove_dev(sys_dev);
+               mutex_unlock(&therm_cpu_lock);
                break;
        }
-       mutex_unlock(&therm_cpu_lock);
        return NOTIFY_OK;
 }
 
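diff --git a/arch/i386/kernel/smpcommon.c b/arch/i386/kernel/smpcommon.c
index 1868ae1..bbfe85a 100644
--- a/arch/i386/kernel/smpcommon.c
+++ b/arch/i386/kernel/smpcommon.c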
@@ -47,7 +47,7 @@ int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
 EXPORT_SYMBOL(smp_call_function);
 
 /**
- * smp_call_function_single - Run a function on another CPU
+ * smp_call_function_single - Run a function on a specific CPU
  * @cpu: The target CPU.  Cannot be the calling CPU.
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
@@ -66,9 +66,11 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
        int ret;
        int me = get_cpu();
        if (cpu == me) {
-               WARN_ON(1);
+               local_irq_disable();
+               func(info);
+               local_irq_enable();
                put_cpu();
-               return -EBUSY;
+               return 0;
        }
 
        ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
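diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 2ff4685..0694940 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c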
@@ -357,7 +357,7 @@ __smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 }
 
 /*
- * smp_call_function_single - Run a function on another CPU
+ * smp_call_function_single - Run a function on a specific CPU
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
  * @nonatomic: Currently unused.
@@ -374,14 +374,18 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
 {
        /* prevent preemption and reschedule on another processor */
        int me = get_cpu();
+
+       /* Can deadlock when called with interrupts disabled */
+       WARN_ON(irqs_disabled());
+
        if (cpu == me) {
+               local_irq_disable();
+               func(info);
+               local_irq_enable();
                put_cpu();
                return 0;
        }
 
-       /* Can deadlock when called with interrupts disabled */
-       WARN_ON(irqs_disabled());
-
        spin_lock_bh(&call_lock);
        __smp_call_function_single(cpu, func, info, nonatomic, wait);
        spin_unlock_bh(&call_lock);
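diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig
index e8e37d8..33fa28a 100644
--- a/drivers/kvm/Kconfig
+++ b/drivers/kvm/Kconfig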
@@ -1,12 +1,17 @@
 #
 # KVM configuration
 #
-menu "Virtualization"
+menuconfig VIRTUALIZATION
+       bool "Virtualization"
        depends on X86
+       default y
+
+if VIRTUALIZATION
 
 config KVM
        tristate "Kernel-based Virtual Machine (KVM) support"
        depends on X86 && EXPERIMENTAL
+       depends on X86_CMPXCHG64 || 64BIT
        ---help---
          Support hosting fully virtualized guest machines using hardware
          virtualization extensions.  You will need a fairly recent
@@ -35,4 +40,4 @@ config KVM_AMD
          Provides support for KVM on AMD processors equipped with the AMD-V
          (SVM) extensions.
 
-endmenu
+endif # VIRTUALIZATION
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 152312c..a7c5e6b 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -10,6 +10,8 @@
 #include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
 #include <linux/mm.h>
 #include <asm/signal.h>
 
@@ -18,6 +20,7 @@
 #include <linux/kvm_para.h>
 
 #define CR0_PE_MASK (1ULL << 0)
+#define CR0_MP_MASK (1ULL << 1)
 #define CR0_TS_MASK (1ULL << 3)
 #define CR0_NE_MASK (1ULL << 5)
 #define CR0_WP_MASK (1ULL << 16)
@@ -42,7 +45,8 @@
        (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \
         | CR0_NW_MASK | CR0_CD_MASK)
 #define KVM_VM_CR0_ALWAYS_ON \
-       (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK)
+       (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK | CR0_TS_MASK \
+        | CR0_MP_MASK)
 #define KVM_GUEST_CR4_MASK \
        (CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK)
 #define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK)
 #define INVALID_PAGE (~(hpa_t)0)
 #define UNMAPPED_GVA (~(gpa_t)0)
 
-#define KVM_MAX_VCPUS 1
+#define KVM_MAX_VCPUS 4
 #define KVM_ALIAS_SLOTS 4
 #define KVM_MEMORY_SLOTS 4
-#define KVM_NUM_MMU_PAGES 256
+#define KVM_NUM_MMU_PAGES 1024
 #define KVM_MIN_FREE_MMU_PAGES 5
 #define KVM_REFILL_PAGES 25
 #define KVM_MAX_CPUID_ENTRIES 40
 
 #define KVM_PIO_PAGE_OFFSET 1
 
+/*
+ * vcpu->requests bit members
+ */
+#define KVM_TLB_FLUSH 0
+
 /*
  * Address types:
  *
@@ -137,7 +146,7 @@ struct kvm_mmu_page {
        gfn_t gfn;
        union kvm_mmu_page_role role;
 
-       hpa_t page_hpa;
+       u64 *spt;
        unsigned long slot_bitmap; /* One bit set per slot which has memory
                                    * in this shadow page.
                                    */
@@ -232,6 +241,7 @@ struct kvm_pio_request {
        struct page *guest_pages[2];
        unsigned guest_page_offset;
        int in;
+       int port;
        int size;
        int string;
        int down;
@@ -252,8 +262,70 @@ struct kvm_stat {
        u32 halt_exits;
        u32 request_irq_exits;
        u32 irq_exits;
+       u32 light_exits;
+       u32 efer_reload;
+};
+
+struct kvm_io_device {
+       void (*read)(struct kvm_io_device *this,
+                    gpa_t addr,
+                    int len,
+                    void *val);
+       void (*write)(struct kvm_io_device *this,
+                     gpa_t addr,
+                     int len,
+                     const void *val);
+       int (*in_range)(struct kvm_io_device *this, gpa_t addr);
+       void (*destructor)(struct kvm_io_device *this);
+
+       void             *private;
+};
+
+static inline void kvm_iodevice_read(struct kvm_io_device *dev,
+                                    gpa_t addr,
+                                    int len,
+                                    void *val)
+{
+       dev->read(dev, addr, len, val);
+}
+
+static inline void kvm_iodevice_write(struct kvm_io_device *dev,
+                                     gpa_t addr,
+                                     int len,
+                                     const void *val)
+{
+       dev->write(dev, addr, len, val);
+}
+
+static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
+{
+       return dev->in_range(dev, addr);
+}
+
+static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
+{
+       if (dev->destructor)
+               dev->destructor(dev);
+}
+
+/*
+ * It would be nice to use something smarter than a linear search, TBD...
+ * Thankfully we dont expect many devices to register (famous last words :),
+ * so until then it will suffice.  At least its abstracted so we can change
+ * in one place.
+ */
+struct kvm_io_bus {
+       int                   dev_count;
+#define NR_IOBUS_DEVS 6
+       struct kvm_io_device *devs[NR_IOBUS_DEVS];
 };
 
+void kvm_io_bus_init(struct kvm_io_bus *bus);
+void kvm_io_bus_destroy(struct kvm_io_bus *bus);
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
+void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
+                            struct kvm_io_device *dev);
+
 struct kvm_vcpu {
        struct kvm *kvm;
        union {
@@ -266,6 +338,8 @@ struct kvm_vcpu {
        u64 host_tsc;
        struct kvm_run *run;
        int interrupt_window_open;
+       int guest_mode;
+       unsigned long requests;
        unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
 #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
        unsigned long irq_pending[NR_IRQ_WORDS];
@@ -285,15 +359,20 @@ struct kvm_vcpu {
        u64 apic_base;
        u64 ia32_misc_enable_msr;
        int nmsrs;
+       int save_nmsrs;
+       int msr_offset_efer;
+#ifdef CONFIG_X86_64
+       int msr_offset_kernel_gs_base;
+#endif
        struct vmx_msr_entry *guest_msrs;
        struct vmx_msr_entry *host_msrs;
 
-       struct list_head free_pages;
-       struct kvm_mmu_page page_header_buf[KVM_NUM_MMU_PAGES];
        struct kvm_mmu mmu;
 
        struct kvm_mmu_memory_cache mmu_pte_chain_cache;
        struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
+       struct kvm_mmu_memory_cache mmu_page_cache;
+       struct kvm_mmu_memory_cache mmu_page_header_cache;
 
        gfn_t last_pt_write_gfn;
        int   last_pt_write_count;
@@ -305,6 +384,11 @@ struct kvm_vcpu {
        char *guest_fx_image;
        int fpu_active;
        int guest_fpu_loaded;
+       struct vmx_host_state {
+               int loaded;
+               u16 fs_sel, gs_sel, ldt_sel;
+               int fs_gs_ldt_reload_needed;
+       } vmx_host_state;
 
        int mmio_needed;
        int mmio_read_completed;
@@ -331,6 +415,7 @@ struct kvm_vcpu {
                        u32 ar;
                } tr, es, ds, fs, gs;
        } rmode;
+       int halt_request; /* real mode on Intel only */
 
        int cpuid_nent;
        struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES];
@@ -362,12 +447,15 @@ struct kvm {
        struct list_head active_mmu_pages;
        int n_free_mmu_pages;
        struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
+       int nvcpus;
        struct kvm_vcpu vcpus[KVM_MAX_VCPUS];
        int memory_config_version;
        int busy;
        unsigned long rmap_overflow;
        struct list_head vm_list;
        struct file *filp;
+       struct kvm_io_bus mmio_bus;
+       struct kvm_io_bus pio_bus;
 };
 
 struct descriptor_table {
@@ -488,6 +576,7 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
                  int size, unsigned long count, int string, int down,
                  gva_t address, int rep, unsigned port);
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
+int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
 int emulate_clts(struct kvm_vcpu *vcpu);
 int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr,
@@ -511,6 +600,7 @@ void save_msrs(struct vmx_msr_entry *e, int n);
 void kvm_resched(struct kvm_vcpu *vcpu);
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
+void kvm_flush_remote_tlbs(struct kvm *kvm);
 
 int kvm_read_guest(struct kvm_vcpu *vcpu,
               gva_t addr,
@@ -524,10 +614,12 @@ int kvm_write_guest(struct kvm_vcpu *vcpu,
 
 unsigned long segment_base(u16 selector);
 
-void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes);
-void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes);
+void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+                      const u8 *old, const u8 *new, int bytes);
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
 void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 
 int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
 
@@ -539,6 +631,14 @@ static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
        return vcpu->mmu.page_fault(vcpu, gva, error_code);
 }
 
+static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
+{
+       if (likely(vcpu->mmu.root_hpa != INVALID_PAGE))
+               return 0;
+
+       return kvm_mmu_load(vcpu);
+}
+
 static inline int is_long_mode(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_X86_64
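diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 8f1f07a..1b206f1 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c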
  */
 
 #include "kvm.h"
+#include "x86_emulate.h"
+#include "segment_descriptor.h"
 
 #include <linux/kvm.h>
 #include <linux/module.h>
 #include <linux/errno.h>
-#include <linux/magic.h>
-#include <asm/processor.h>
 #include <linux/percpu.h>
 #include <linux/gfp.h>
-#include <asm/msr.h>
 #include <linux/mm.h>
 #include <linux/miscdevice.h>
 #include <linux/vmalloc.h>
-#include <asm/uaccess.h>
 #include <linux/reboot.h>
-#include <asm/io.h>
 #include <linux/debugfs.h>
 #include <linux/highmem.h>
 #include <linux/file.h>
-#include <asm/desc.h>
 #include <linux/sysdev.h>
 #include <linux/cpu.h>
-#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/mount.h>
 #include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/smp.h>
+#include <linux/anon_inodes.h>
 
-#include "x86_emulate.h"
-#include "segment_descriptor.h"
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <asm/desc.h>
 
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
@@ -51,8 +50,12 @@ MODULE_LICENSE("GPL");
 static DEFINE_SPINLOCK(kvm_lock);
 static LIST_HEAD(vm_list);
 
+static cpumask_t cpus_hardware_enabled;
+
 struct kvm_arch_ops *kvm_arch_ops;
 
+static void hardware_disable(void *ignored);
+
 #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
 
 static struct kvm_stats_debugfs_item {
@@ -72,13 +75,13 @@ static struct kvm_stats_debugfs_item {
        { "halt_exits", STAT_OFFSET(halt_exits) },
        { "request_irq", STAT_OFFSET(request_irq_exits) },
        { "irq_exits", STAT_OFFSET(irq_exits) },
+       { "light_exits", STAT_OFFSET(light_exits) },
+       { "efer_reload", STAT_OFFSET(efer_reload) },
        { NULL }
 };
 
 static struct dentry *debugfs_dir;
 
-struct vfsmount *kvmfs_mnt;
-
 #define MAX_IO_MSRS 256
 
 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
@@ -100,55 +103,6 @@ struct segment_descriptor_64 {
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                           unsigned long arg);
 
-static struct inode *kvmfs_inode(struct file_operations *fops)
-{
-       int error = -ENOMEM;
-       struct inode *inode = new_inode(kvmfs_mnt->mnt_sb);
-
-       if (!inode)
-               goto eexit_1;
-
-       inode->i_fop = fops;
-
-       /*
-        * Mark the inode dirty from the very beginning,
-        * that way it will never be moved to the dirty
-        * list because mark_inode_dirty() will think
-        * that it already _is_ on the dirty list.
-        */
-       inode->i_state = I_DIRTY;
-       inode->i_mode = S_IRUSR | S_IWUSR;
-       inode->i_uid = current->fsuid;
-       inode->i_gid = current->fsgid;
-       inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-       return inode;
-
-eexit_1:
-       return ERR_PTR(error);
-}
-
-static struct file *kvmfs_file(struct inode *inode, void *private_data)
-{
-       struct file *file = get_empty_filp();
-
-       if (!file)
-               return ERR_PTR(-ENFILE);
-
-       file->f_path.mnt = mntget(kvmfs_mnt);
-       file->f_path.dentry = d_alloc_anon(inode);
-       if (!file->f_path.dentry)
-               return ERR_PTR(-ENOMEM);
-       file->f_mapping = inode->i_mapping;
-
-       file->f_pos = 0;
-       file->f_flags = O_RDWR;
-       file->f_op = inode->i_fop;
-       file->f_mode = FMODE_READ | FMODE_WRITE;
-       file->f_version = 0;
-       file->private_data = private_data;
-       return file;
-}
-
 unsigned long segment_base(u16 selector)
 {
        struct descriptor_table gdt;
@@ -307,6 +261,48 @@ static void vcpu_put(struct kvm_vcpu *vcpu)
        mutex_unlock(&vcpu->mutex);
 }
 
+static void ack_flush(void *_completed)
+{
+       atomic_t *completed = _completed;
+
+       atomic_inc(completed);
+}
+
+void kvm_flush_remote_tlbs(struct kvm *kvm)
+{
+       int i, cpu, needed;
+       cpumask_t cpus;
+       struct kvm_vcpu *vcpu;
+       atomic_t completed;
+
+       atomic_set(&completed, 0);
+       cpus_clear(cpus);
+       needed = 0;
+       for (i = 0; i < kvm->nvcpus; ++i) {
+               vcpu = &kvm->vcpus[i];
+               if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
+                       continue;
+               cpu = vcpu->cpu;
+               if (cpu != -1 && cpu != raw_smp_processor_id())
+                       if (!cpu_isset(cpu, cpus)) {
+                               cpu_set(cpu, cpus);
+                               ++needed;
+                       }
+       }
+
+       /*
+        * We really want smp_call_function_mask() here.  But that's not
+        * available, so ipi all cpus in parallel and wait for them
+        * to complete.
+        */
+       for (cpu = first_cpu(cpus); cpu != NR_CPUS; cpu = next_cpu(cpu, cpus))
+               smp_call_function_single(cpu, ack_flush, &completed, 1, 0);
+       while (atomic_read(&completed) != needed) {
+               cpu_relax();
+               barrier();
+       }
+}
+
 static struct kvm *kvm_create_vm(void)
 {
        struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
@@ -315,8 +311,13 @@ static struct kvm *kvm_create_vm(void)
        if (!kvm)
                return ERR_PTR(-ENOMEM);
 
+       kvm_io_bus_init(&kvm->pio_bus);
        spin_lock_init(&kvm->lock);
        INIT_LIST_HEAD(&kvm->active_mmu_pages);
+       spin_lock(&kvm_lock);
+       list_add(&kvm->vm_list, &vm_list);
+       spin_unlock(&kvm_lock);
+       kvm_io_bus_init(&kvm->mmio_bus);
        for (i = 0; i < KVM_MAX_VCPUS; ++i) {
                struct kvm_vcpu *vcpu = &kvm->vcpus[i];
 
@@ -324,10 +325,6 @@ static struct kvm *kvm_create_vm(void)
                vcpu->cpu = -1;
                vcpu->kvm = kvm;
                vcpu->mmu.root_hpa = INVALID_PAGE;
-               INIT_LIST_HEAD(&vcpu->free_pages);
-               spin_lock(&kvm_lock);
-               list_add(&kvm->vm_list, &vm_list);
-               spin_unlock(&kvm_lock);
        }
        return kvm;
 }
@@ -380,6 +377,16 @@ static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
                }
 }
 
+static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
+{
+       if (!vcpu->vmcs)
+               return;
+
+       vcpu_load(vcpu);
+       kvm_mmu_unload(vcpu);
+       vcpu_put(vcpu);
+}
+
 static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
 {
        if (!vcpu->vmcs)
@@ -400,6 +407,11 @@ static void kvm_free_vcpus(struct kvm *kvm)
 {
        unsigned int i;
 
+       /*
+        * Unpin any mmu pages first.
+        */
+       for (i = 0; i < KVM_MAX_VCPUS; ++i)
+               kvm_unload_vcpu_mmu(&kvm->vcpus[i]);
        for (i = 0; i < KVM_MAX_VCPUS; ++i)
                kvm_free_vcpu(&kvm->vcpus[i]);
 }
@@ -414,6 +426,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
        spin_lock(&kvm_lock);
        list_del(&kvm->vm_list);
        spin_unlock(&kvm_lock);
+       kvm_io_bus_destroy(&kvm->pio_bus);
+       kvm_io_bus_destroy(&kvm->mmio_bus);
        kvm_free_vcpus(kvm);
        kvm_free_physmem(kvm);
        kfree(kvm);
@@ -969,7 +983,7 @@ EXPORT_SYMBOL_GPL(gfn_to_page);
 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
 {
        int i;
-       struct kvm_memory_slot *memslot = NULL;
+       struct kvm_memory_slot *memslot;
        unsigned long rel_gfn;
 
        for (i = 0; i < kvm->nmemslots; ++i) {
@@ -978,7 +992,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
                if (gfn >= memslot->base_gfn
                    && gfn < memslot->base_gfn + memslot->npages) {
 
-                       if (!memslot || !memslot->dirty_bitmap)
+                       if (!memslot->dirty_bitmap)
                                return;
 
                        rel_gfn = gfn - memslot->base_gfn;
@@ -1037,12 +1051,31 @@ static int emulator_write_std(unsigned long addr,
        return X86EMUL_UNHANDLEABLE;
 }
 
+static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
+                                               gpa_t addr)
+{
+       /*
+        * Note that its important to have this wrapper function because
+        * in the very near future we will be checking for MMIOs against
+        * the LAPIC as well as the general MMIO bus
+        */
+       return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
+}
+
+static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
+                                              gpa_t addr)
+{
+       return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
+}
+
 static int emulator_read_emulated(unsigned long addr,
                                  void *val,
                                  unsigned int bytes,
                                  struct x86_emulate_ctxt *ctxt)
 {
-       struct kvm_vcpu *vcpu = ctxt->vcpu;
+       struct kvm_vcpu      *vcpu = ctxt->vcpu;
+       struct kvm_io_device *mmio_dev;
+       gpa_t                 gpa;
 
        if (vcpu->mmio_read_completed) {
                memcpy(val, vcpu->mmio_data, bytes);
@@ -1051,18 +1084,26 @@ static int emulator_read_emulated(unsigned long addr,
        } else if (emulator_read_std(addr, val, bytes, ctxt)
                   == X86EMUL_CONTINUE)
                return X86EMUL_CONTINUE;
-       else {
-               gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
 
-               if (gpa == UNMAPPED_GVA)
-                       return X86EMUL_PROPAGATE_FAULT;
-               vcpu->mmio_needed = 1;
-               vcpu->mmio_phys_addr = gpa;
-               vcpu->mmio_size = bytes;
-               vcpu->mmio_is_write = 0;
+       gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+       if (gpa == UNMAPPED_GVA)
+               return X86EMUL_PROPAGATE_FAULT;
 
-               return X86EMUL_UNHANDLEABLE;
+       /*
+        * Is this MMIO handled locally?
+        */
+       mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
+       if (mmio_dev) {
+               kvm_iodevice_read(mmio_dev, gpa, bytes, val);
+               return X86EMUL_CONTINUE;
        }
+
+       vcpu->mmio_needed = 1;
+       vcpu->mmio_phys_addr = gpa;
+       vcpu->mmio_size = bytes;
+       vcpu->mmio_is_write = 0;
+
+       return X86EMUL_UNHANDLEABLE;
 }
 
 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -1070,18 +1111,20 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 {
        struct page *page;
        void *virt;
+       unsigned offset = offset_in_page(gpa);
 
        if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
                return 0;
        page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
        if (!page)
                return 0;
-       kvm_mmu_pre_write(vcpu, gpa, bytes);
        mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
        virt = kmap_atomic(page, KM_USER0);
-       memcpy(virt + offset_in_page(gpa), val, bytes);
+       if (memcmp(virt + offset_in_page(gpa), val, bytes)) {
+               kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes);
+               memcpy(virt + offset_in_page(gpa), val, bytes);
+       }
        kunmap_atomic(virt, KM_USER0);
-       kvm_mmu_post_write(vcpu, gpa, bytes);
        return 1;
 }
 
@@ -1090,8 +1133,9 @@ static int emulator_write_emulated(unsigned long addr,
                                   unsigned int bytes,
                                   struct x86_emulate_ctxt *ctxt)
 {
-       struct kvm_vcpu *vcpu = ctxt->vcpu;
-       gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+       struct kvm_vcpu      *vcpu = ctxt->vcpu;
+       struct kvm_io_device *mmio_dev;
+       gpa_t                 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
 
        if (gpa == UNMAPPED_GVA) {
                kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
@@ -1101,6 +1145,15 @@ static int emulator_write_emulated(unsigned long addr,
        if (emulator_write_phys(vcpu, gpa, val, bytes))
                return X86EMUL_CONTINUE;
 
+       /*
+        * Is this MMIO handled locally?
+        */
+       mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
+       if (mmio_dev) {
+               kvm_iodevice_write(mmio_dev, gpa, bytes, val);
+               return X86EMUL_CONTINUE;
+       }
+
        vcpu->mmio_needed = 1;
        vcpu->mmio_phys_addr = gpa;
        vcpu->mmio_size = bytes;
@@ -1269,6 +1322,17 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 }
 EXPORT_SYMBOL_GPL(emulate_instruction);
 
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->irq_summary)
+               return 1;
+
+       vcpu->run->exit_reason = KVM_EXIT_HLT;
+       ++vcpu->stat.halt_exits;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+
 int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
        unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
@@ -1469,6 +1533,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case MSR_IA32_MC0_MISC+16:
        case MSR_IA32_UCODE_REV:
        case MSR_IA32_PERF_STATUS:
+       case MSR_IA32_EBL_CR_POWERON:
                /* MTRR registers */
        case 0xfe:
        case 0x200 ... 0x2ff:
@@ -1727,6 +1792,20 @@ static int complete_pio(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+void kernel_pio(struct kvm_io_device *pio_dev, struct kvm_vcpu *vcpu)
+{
+       /* TODO: String I/O for in kernel device */
+
+       if (vcpu->pio.in)
+               kvm_iodevice_read(pio_dev, vcpu->pio.port,
+                                 vcpu->pio.size,
+                                 vcpu->pio_data);
+       else
+               kvm_iodevice_write(pio_dev, vcpu->pio.port,
+                                  vcpu->pio.size,
+                                  vcpu->pio_data);
+}
+
 int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
                  int size, unsigned long count, int string, int down,
                  gva_t address, int rep, unsigned port)
@@ -1735,6 +1814,7 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
        int i;
        int nr_pages = 1;
        struct page *page;
+       struct kvm_io_device *pio_dev;
 
        vcpu->run->exit_reason = KVM_EXIT_IO;
        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -1746,17 +1826,27 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
        vcpu->pio.cur_count = count;
        vcpu->pio.size = size;
        vcpu->pio.in = in;
+       vcpu->pio.port = port;
        vcpu->pio.string = string;
        vcpu->pio.down = down;
        vcpu->pio.guest_page_offset = offset_in_page(address);
        vcpu->pio.rep = rep;
 
+       pio_dev = vcpu_find_pio_dev(vcpu, port);
        if (!string) {
                kvm_arch_ops->cache_regs(vcpu);
                memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
                kvm_arch_ops->decache_regs(vcpu);
+               if (pio_dev) {
+                       kernel_pio(pio_dev, vcpu);
+                       complete_pio(vcpu);
+                       return 1;
+               }
                return 0;
        }
+       /* TODO: String I/O for in kernel device */
+       if (pio_dev)
+               printk(KERN_ERR "kvm_setup_pio: no string io support\n");
 
        if (!count) {
                kvm_arch_ops->skip_emulated_instruction(vcpu);
@@ -2273,34 +2363,12 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
        struct inode *inode;
        struct file *file;
 
+       r = anon_inode_getfd(&fd, &inode, &file,
+                            "kvm-vcpu", &kvm_vcpu_fops, vcpu);
+       if (r)
+               return r;
        atomic_inc(&vcpu->kvm->filp->f_count);
-       inode = kvmfs_inode(&kvm_vcpu_fops);
-       if (IS_ERR(inode)) {
-               r = PTR_ERR(inode);
-               goto out1;
-       }
-
-       file = kvmfs_file(inode, vcpu);
-       if (IS_ERR(file)) {
-               r = PTR_ERR(file);
-               goto out2;
-       }
-
-       r = get_unused_fd();
-       if (r < 0)
-               goto out3;
-       fd = r;
-       fd_install(fd, file);
-
        return fd;
-
-out3:
-       fput(file);
-out2:
-       iput(inode);
-out1:
-       fput(vcpu->kvm->filp);
-       return r;
 }
 
 /*
@@ -2363,6 +2431,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
        if (r < 0)
                goto out_free_vcpus;
 
+       spin_lock(&kvm_lock);
+       if (n >= kvm->nvcpus)
+               kvm->nvcpus = n + 1;
+       spin_unlock(&kvm_lock);
+
        return r;
 
 out_free_vcpus:
@@ -2376,6 +2449,27 @@ out:
        return r;
 }
 
+static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+{
+       u64 efer;
+       int i;
+       struct kvm_cpuid_entry *e, *entry;
+
+       rdmsrl(MSR_EFER, efer);
+       entry = NULL;
+       for (i = 0; i < vcpu->cpuid_nent; ++i) {
+               e = &vcpu->cpuid_entries[i];
+               if (e->function == 0x80000001) {
+                       entry = e;
+                       break;
+               }
+       }
+       if (entry && (entry->edx & EFER_NX) && !(efer & EFER_NX)) {
+               entry->edx &= ~(1 << 20);
+               printk(KERN_INFO ": guest NX capability removed\n");
+       }
+}
+
 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
                                    struct kvm_cpuid *cpuid,
                                    struct kvm_cpuid_entry __user *entries)
@@ -2390,6 +2484,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
                           cpuid->nent * sizeof(struct kvm_cpuid_entry)))
                goto out;
        vcpu->cpuid_nent = cpuid->nent;
+       cpuid_fix_nx_cap(vcpu);
        return 0;
 
 out:
@@ -2738,41 +2833,18 @@ static int kvm_dev_ioctl_create_vm(void)
        struct file *file;
        struct kvm *kvm;
 
-       inode = kvmfs_inode(&kvm_vm_fops);
-       if (IS_ERR(inode)) {
-               r = PTR_ERR(inode);
-               goto out1;
-       }
-
        kvm = kvm_create_vm();
-       if (IS_ERR(kvm)) {
-               r = PTR_ERR(kvm);
-               goto out2;
+       if (IS_ERR(kvm))
+               return PTR_ERR(kvm);
+       r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
+       if (r) {
+               kvm_destroy_vm(kvm);
+               return r;
        }
 
-       file = kvmfs_file(inode, kvm);
-       if (IS_ERR(file)) {
-               r = PTR_ERR(file);
-               goto out3;
-       }
        kvm->filp = file;
 
-       r = get_unused_fd();
-       if (r < 0)
-               goto out4;
-       fd = r;
-       fd_install(fd, file);
-
        return fd;
-
-out4:
-       fput(file);
-out3:
-       kvm_destroy_vm(kvm);
-out2:
-       iput(inode);
-out1:
-       return r;
 }
 
 static long kvm_dev_ioctl(struct file *filp,
@@ -2862,7 +2934,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
                 * in vmx root mode.
                 */
                printk(KERN_INFO "kvm: exiting hardware virtualization\n");
-               on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
+               on_each_cpu(hardware_disable, NULL, 0, 1);
        }
        return NOTIFY_OK;
 }
@@ -2905,33 +2977,88 @@ static void decache_vcpus_on_cpu(int cpu)
        spin_unlock(&kvm_lock);
 }
 
+/*
+ * Callback in the form taken by on_each_cpu()/smp_call_function_single();
+ * the argument is ignored.  Marks the calling cpu in cpus_hardware_enabled
+ * and invokes the arch hardware_enable hook, unless the cpu is already
+ * marked, in which case nothing is done.
+ */
+static void hardware_enable(void *junk)
+{
+       int this_cpu = raw_smp_processor_id();
+
+       if (!cpu_isset(this_cpu, cpus_hardware_enabled)) {
+               cpu_set(this_cpu, cpus_hardware_enabled);
+               kvm_arch_ops->hardware_enable(NULL);
+       }
+}
+
+/*
+ * Counterpart of hardware_enable(); the argument is ignored.  If the
+ * calling cpu is marked in cpus_hardware_enabled, clear the mark, call
+ * decache_vcpus_on_cpu() for this cpu and then invoke the arch
+ * hardware_disable hook.  Does nothing when the cpu is not marked.
+ */
+static void hardware_disable(void *junk)
+{
+       int this_cpu = raw_smp_processor_id();
+
+       if (cpu_isset(this_cpu, cpus_hardware_enabled)) {
+               cpu_clear(this_cpu, cpus_hardware_enabled);
+               decache_vcpus_on_cpu(this_cpu);
+               kvm_arch_ops->hardware_disable(NULL);
+       }
+}
+
+/*
+ * CPU notifier callback (installed through kvm_cpu_notifier).  The cpu
+ * number is passed in @v, cast back to an integer.
+ *
+ * CPU_DYING / CPU_UP_CANCELED (and their _FROZEN variants): log a message
+ * and ask for hardware_disable() to be run on that cpu through
+ * smp_call_function_single().
+ * CPU_ONLINE / CPU_ONLINE_FROZEN: log a message and ask for
+ * hardware_enable() to be run on that cpu the same way.
+ *
+ * Any other event value is ignored.  Always returns NOTIFY_OK.
+ */
 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
                            void *v)
 {
        int cpu = (long)v;
 
        switch (val) {
-       case CPU_DOWN_PREPARE:
-       case CPU_DOWN_PREPARE_FROZEN:
+       case CPU_DYING:
+       case CPU_DYING_FROZEN:
        case CPU_UP_CANCELED:
        case CPU_UP_CANCELED_FROZEN:
                printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
                       cpu);
-               decache_vcpus_on_cpu(cpu);
-               smp_call_function_single(cpu, kvm_arch_ops->hardware_disable,
-                                        NULL, 0, 1);
+               smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
                break;
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
                printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
                       cpu);
-               smp_call_function_single(cpu, kvm_arch_ops->hardware_enable,
-                                        NULL, 0, 1);
+               smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
                break;
        }
        return NOTIFY_OK;
 }
 
+/*
+ * Reset an I/O bus to its empty state by zeroing the whole structure,
+ * which leaves dev_count at 0.
+ */
+void kvm_io_bus_init(struct kvm_io_bus *bus)
+{
+       memset(bus, 0, sizeof(*bus));
+}
+
+/*
+ * Call kvm_iodevice_destructor() on each of the first dev_count entries
+ * of bus->devs, in index order.  The bus structure itself is left as is.
+ */
+void kvm_io_bus_destroy(struct kvm_io_bus *bus)
+{
+       int idx;
+
+       for (idx = 0; idx < bus->dev_count; ++idx)
+               kvm_iodevice_destructor(bus->devs[idx]);
+}
+
+/*
+ * Walk the registered devices in index order and return the first one
+ * whose in_range() callback accepts @addr, or NULL when none does.
+ */
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
+{
+       struct kvm_io_device *dev;
+       int idx;
+
+       for (idx = 0; idx < bus->dev_count; ++idx) {
+               dev = bus->devs[idx];
+               if (dev->in_range(dev, addr))
+                       return dev;
+       }
+       return NULL;
+}
+
+/*
+ * Append @dev to the bus's device array and bump dev_count.  Hits BUG_ON()
+ * if the array already holds NR_IOBUS_DEVS entries.
+ */
+void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
+{
+       BUG_ON(bus->dev_count >= NR_IOBUS_DEVS);
+
+       bus->devs[bus->dev_count] = dev;
+       bus->dev_count++;
+}
+
 static struct notifier_block kvm_cpu_notifier = {
        .notifier_call = kvm_cpu_hotplug,
        .priority = 20, /* must be > scheduler priority */
@@ -2983,14 +3110,13 @@ static void kvm_exit_debug(void)
 
+/*
+ * Suspend hook: calls hardware_disable() directly, so it acts on the cpu
+ * this function runs on.  @dev and @state are not used.  Always returns 0.
+ */
 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
 {
-       decache_vcpus_on_cpu(raw_smp_processor_id());
-       on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
+       hardware_disable(NULL);
        return 0;
 }
 
+/*
+ * Resume hook: calls hardware_enable() directly, so it acts on the cpu
+ * this function runs on.  @dev is not used.  Always returns 0.
+ */
 static int kvm_resume(struct sys_device *dev)
 {
-       on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
+       hardware_enable(NULL);
        return 0;
 }
 
@@ -3007,18 +3133,6 @@ static struct sys_device kvm_sysdev = {
 
 hpa_t bad_page_address;
 
-static int kvmfs_get_sb(struct file_system_type *fs_type, int flags,
-                       const char *dev_name, void *data, struct vfsmount *mnt)
-{
-       return get_sb_pseudo(fs_type, "kvm:", NULL, KVMFS_SUPER_MAGIC, mnt);
-}
-
-static struct file_system_type kvm_fs_type = {
-       .name           = "kvmfs",
-       .get_sb         = kvmfs_get_sb,
-       .kill_sb        = kill_anon_super,
-};
-
 int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
 {
        int r;
@@ -3043,7 +3157,7 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
        if (r < 0)
                goto out;
 
-       on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
+       on_each_cpu(hardware_enable, NULL, 0, 1);
        r = register_cpu_notifier(&kvm_cpu_notifier);
        if (r)
                goto out_free_1;
@@ -3075,7 +3189,7 @@ out_free_2:
        unregister_reboot_notifier(&kvm_reboot_notifier);
        unregister_cpu_notifier(&kvm_cpu_notifier);
 out_free_1:
-       on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
+       on_each_cpu(hardware_disable, NULL, 0, 1);
        kvm_arch_ops->hardware_unsetup();
 out:
        kvm_arch_ops = NULL;
@@ -3089,7 +3203,7 @@ void kvm_exit_arch(void)
        sysdev_class_unregister(&kvm_sysdev_class);
        unregister_reboot_notifier(&kvm_reboot_notifier);
        unregister_cpu_notifier(&kvm_cpu_notifier);
-       on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
+       on_each_cpu(hardware_disable, NULL, 0, 1);
        kvm_arch_ops->hardware_unsetup();
        kvm_arch_ops = NULL;
 }
@@ -3103,14 +3217,6 @@ static __init int kvm_init(void)
        if (r)
                goto out4;
 
-       r = register_filesystem(&kvm_fs_type);
-       if (r)
-               goto out3;
-
-       kvmfs_mnt = kern_mount(&kvm_fs_type);
-       r = PTR_ERR(kvmfs_mnt);
-       if (IS_ERR(kvmfs_mnt))
-               goto out2;
        kvm_init_debug();
 
        kvm_init_msr_list();
@@ -3127,10 +3233,6 @@ static __init int kvm_init(void)
 
 out:
        kvm_exit_debug();
-       mntput(kvmfs_mnt);
-out2:
-       unregister_filesystem(&kvm_fs_type);
-out3:
        kvm_mmu_module_exit();
 out4:
        return r;
@@ -3140,8 +3242,6 @@ static __exit void kvm_exit(void)
 {
        kvm_exit_debug();
        __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
-       mntput(kvmfs_mnt);
-       unregister_filesystem(&kvm_fs_type);
        kvm_mmu_module_exit();
 }
 
index e8e2281..b297a6b 100644 (file)
  * the COPYING file in the top-level directory.
  *
  */
+
+#include "vmx.h"
+#include "kvm.h"
+
 #include <linux/types.h>
 #include <linux/string.h>
-#include <asm/page.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/module.h>
 
-#include "vmx.h"
-#include "kvm.h"
+#include <asm/page.h>
+#include <asm/cmpxchg.h>
 
 #undef MMU_DEBUG
 
@@ -90,25 +93,11 @@ static int dbg = 1;
 #define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
 
 
-#define PT32_PTE_COPY_MASK \
-       (PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_GLOBAL_MASK)
-
-#define PT64_PTE_COPY_MASK (PT64_NX_MASK | PT32_PTE_COPY_MASK)
-
 #define PT_FIRST_AVAIL_BITS_SHIFT 9
 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
 
-#define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 
-#define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1)
-#define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT)
-
-#define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1)
-#define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT))
-
-#define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT)
-
 #define VALID_PAGE(x) ((x) != INVALID_PAGE)
 
 #define PT64_LEVEL_BITS 9
@@ -165,6 +154,8 @@ struct kvm_rmap_desc {
 
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
+static struct kmem_cache *mmu_page_cache;
+static struct kmem_cache *mmu_page_header_cache;
 
 static int is_write_protection(struct kvm_vcpu *vcpu)
 {
@@ -202,6 +193,15 @@ static int is_rmap_pte(u64 pte)
                == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
 }
 
+/*
+ * Store @spte into *@sptep through set_64bit().  Only the pointer cast
+ * differs between configurations: unsigned long on CONFIG_X86_64,
+ * unsigned long long otherwise.
+ * NOTE(review): presumably used so the 64-bit entry is written in one
+ * piece -- confirm against the set_64bit() definition.
+ */
+static void set_shadow_pte(u64 *sptep, u64 spte)
+{
+#ifndef CONFIG_X86_64
+       set_64bit((unsigned long long *)sptep, spte);
+#else
+       set_64bit((unsigned long *)sptep, spte);
+#endif
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
                                  struct kmem_cache *base_cache, int min,
                                  gfp_t gfp_flags)
@@ -235,6 +235,14 @@ static int __mmu_topup_memory_caches(struct kvm_vcpu *vcpu, gfp_t gfp_flags)
                goto out;
        r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
                                   rmap_desc_cache, 1, gfp_flags);
+       if (r)
+               goto out;
+       r = mmu_topup_memory_cache(&vcpu->mmu_page_cache,
+                                  mmu_page_cache, 4, gfp_flags);
+       if (r)
+               goto out;
+       r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
+                                  mmu_page_header_cache, 4, gfp_flags);
 out:
        return r;
 }
@@ -258,6 +266,8 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
        mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
        mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
+       mmu_free_memory_cache(&vcpu->mmu_page_cache);
+       mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
 }
 
 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
@@ -433,19 +443,18 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
                BUG_ON(!(*spte & PT_WRITABLE_MASK));
                rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
                rmap_remove(vcpu, spte);
-               kvm_arch_ops->tlb_flush(vcpu);
-               *spte &= ~(u64)PT_WRITABLE_MASK;
+               set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
+               kvm_flush_remote_tlbs(vcpu->kvm);
        }
 }
 
 #ifdef MMU_DEBUG
-static int is_empty_shadow_page(hpa_t page_hpa)
+static int is_empty_shadow_page(u64 *spt)
 {
        u64 *pos;
        u64 *end;
 
-       for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u64);
-                     pos != end; pos++)
+       for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
                if (*pos != 0) {
                        printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
                               pos, *pos);
@@ -455,13 +464,13 @@ static int is_empty_shadow_page(hpa_t page_hpa)
 }
 #endif
 
-static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
+/*
+ * Release one shadow page: unlink @page_head from the list it is on, pass
+ * its page table (page_head->spt) and the header itself to
+ * mmu_memory_cache_free() for the vcpu's mmu_page_cache and
+ * mmu_page_header_cache respectively, and increment the VM's
+ * n_free_mmu_pages counter.  The page table is ASSERTed to be empty first.
+ */
+static void kvm_mmu_free_page(struct kvm_vcpu *vcpu,
+                             struct kvm_mmu_page *page_head)
 {
-       struct kvm_mmu_page *page_head = page_header(page_hpa);
-
-       ASSERT(is_empty_shadow_page(page_hpa));
-       page_head->page_hpa = page_hpa;
-       list_move(&page_head->link, &vcpu->free_pages);
+       ASSERT(is_empty_shadow_page(page_head->spt));
+       list_del(&page_head->link);
+       mmu_memory_cache_free(&vcpu->mmu_page_cache, page_head->spt);
+       mmu_memory_cache_free(&vcpu->mmu_page_header_cache, page_head);
        ++vcpu->kvm->n_free_mmu_pages;
 }
 
@@ -475,12 +484,15 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 {
        struct kvm_mmu_page *page;
 
-       if (list_empty(&vcpu->free_pages))
+       if (!vcpu->kvm->n_free_mmu_pages)
                return NULL;
 
-       page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link);
-       list_move(&page->link, &vcpu->kvm->active_mmu_pages);
-       ASSERT(is_empty_shadow_page(page->page_hpa));
+       page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
+                                     sizeof *page);
+       page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
+       set_page_private(virt_to_page(page->spt), (unsigned long)page);
+       list_add(&page->link, &vcpu->kvm->active_mmu_pages);
+       ASSERT(is_empty_shadow_page(page->spt));
        page->slot_bitmap = 0;
        page->multimapped = 0;
        page->parent_pte = parent_pte;
@@ -638,7 +650,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
        u64 *pt;
        u64 ent;
 
-       pt = __va(page->page_hpa);
+       pt = page->spt;
 
        if (page->role.level == PT_PAGE_TABLE_LEVEL) {
                for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
@@ -646,7 +658,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
                                rmap_remove(vcpu, &pt[i]);
                        pt[i] = 0;
                }
-               kvm_arch_ops->tlb_flush(vcpu);
+               kvm_flush_remote_tlbs(vcpu->kvm);
                return;
        }
 
@@ -659,6 +671,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
                ent &= PT64_BASE_ADDR_MASK;
                mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]);
        }
+       kvm_flush_remote_tlbs(vcpu->kvm);
 }
 
 static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
@@ -685,12 +698,12 @@ static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu,
                }
                BUG_ON(!parent_pte);
                kvm_mmu_put_page(vcpu, page, parent_pte);
-               *parent_pte = 0;
+               set_shadow_pte(parent_pte, 0);
        }
        kvm_mmu_page_unlink_children(vcpu, page);
        if (!page->root_count) {
                hlist_del(&page->hash_link);
-               kvm_mmu_free_page(vcpu, page->page_hpa);
+               kvm_mmu_free_page(vcpu, page);
        } else
                list_move(&page->link, &vcpu->kvm->active_mmu_pages);
 }
@@ -717,6 +730,17 @@ static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
        return r;
 }
 
+/*
+ * Zap every shadow page that kvm_mmu_lookup_page() finds for @gfn,
+ * repeating the lookup until it returns NULL.
+ */
+static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+       struct kvm_mmu_page *shadow;
+
+       for (;;) {
+               shadow = kvm_mmu_lookup_page(vcpu, gfn);
+               if (shadow == NULL)
+                       break;
+               pgprintk("%s: zap %lx %x\n",
+                        __FUNCTION__, gfn, shadow->role.word);
+               kvm_mmu_zap_page(vcpu, shadow);
+       }
+}
+
 static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
 {
        int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
@@ -805,7 +829,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
                                return -ENOMEM;
                        }
 
-                       table[index] = new_table->page_hpa | PT_PRESENT_MASK
+                       table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
                                | PT_WRITABLE_MASK | PT_USER_MASK;
                }
                table_addr = table[index] & PT64_BASE_ADDR_MASK;
@@ -817,11 +841,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
        int i;
        struct kvm_mmu_page *page;
 
+       if (!VALID_PAGE(vcpu->mmu.root_hpa))
+               return;
 #ifdef CONFIG_X86_64
        if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
                hpa_t root = vcpu->mmu.root_hpa;
 
-               ASSERT(VALID_PAGE(root));
                page = page_header(root);
                --page->root_count;
                vcpu->mmu.root_hpa = INVALID_PAGE;
@@ -832,7 +857,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
                hpa_t root = vcpu->mmu.pae_root[i];
 
                if (root) {
-                       ASSERT(VALID_PAGE(root));
                        root &= PT64_BASE_ADDR_MASK;
                        page = page_header(root);
                        --page->root_count;
@@ -857,7 +881,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
                ASSERT(!VALID_PAGE(root));
                page = kvm_mmu_get_page(vcpu, root_gfn, 0,
                                        PT64_ROOT_LEVEL, 0, 0, NULL);
-               root = page->page_hpa;
+               root = __pa(page->spt);
                ++page->root_count;
                vcpu->mmu.root_hpa = root;
                return;
@@ -878,7 +902,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
                page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
                                        PT32_ROOT_LEVEL, !is_paging(vcpu),
                                        0, NULL);
-               root = page->page_hpa;
+               root = __pa(page->spt);
                ++page->root_count;
                vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
        }
@@ -928,9 +952,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
        context->free = nonpaging_free;
        context->root_level = 0;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
-       mmu_alloc_roots(vcpu);
-       ASSERT(VALID_PAGE(context->root_hpa));
-       kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
+       context->root_hpa = INVALID_PAGE;
        return 0;
 }
 
@@ -944,59 +966,6 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu)
 {
        pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
        mmu_free_roots(vcpu);
-       if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
-               kvm_mmu_free_some_pages(vcpu);
-       mmu_alloc_roots(vcpu);
-       kvm_mmu_flush_tlb(vcpu);
-       kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
-}
-
-static inline void set_pte_common(struct kvm_vcpu *vcpu,
-                            u64 *shadow_pte,
-                            gpa_t gaddr,
-                            int dirty,
-                            u64 access_bits,
-                            gfn_t gfn)
-{
-       hpa_t paddr;
-
-       *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET;
-       if (!dirty)
-               access_bits &= ~PT_WRITABLE_MASK;
-
-       paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
-
-       *shadow_pte |= access_bits;
-
-       if (is_error_hpa(paddr)) {
-               *shadow_pte |= gaddr;
-               *shadow_pte |= PT_SHADOW_IO_MARK;
-               *shadow_pte &= ~PT_PRESENT_MASK;
-               return;
-       }
-
-       *shadow_pte |= paddr;
-
-       if (access_bits & PT_WRITABLE_MASK) {
-               struct kvm_mmu_page *shadow;
-
-               shadow = kvm_mmu_lookup_page(vcpu, gfn);
-               if (shadow) {
-                       pgprintk("%s: found shadow page for %lx, marking ro\n",
-                                __FUNCTION__, gfn);
-                       access_bits &= ~PT_WRITABLE_MASK;
-                       if (is_writeble_pte(*shadow_pte)) {
-                                   *shadow_pte &= ~PT_WRITABLE_MASK;
-                                   kvm_arch_ops->tlb_flush(vcpu);
-                       }
-               }
-       }
-
-       if (access_bits & PT_WRITABLE_MASK)
-               mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
-
-       page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
-       rmap_add(vcpu, shadow_pte);
 }
 
 static void inject_page_fault(struct kvm_vcpu *vcpu,
@@ -1006,23 +975,6 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
        kvm_arch_ops->inject_page_fault(vcpu, addr, err_code);
 }
 
-static inline int fix_read_pf(u64 *shadow_ent)
-{
-       if ((*shadow_ent & PT_SHADOW_USER_MASK) &&
-           !(*shadow_ent & PT_USER_MASK)) {
-               /*
-                * If supervisor write protect is disabled, we shadow kernel
-                * pages as user pages so we can trap the write access.
-                */
-               *shadow_ent |= PT_USER_MASK;
-               *shadow_ent &= ~PT_WRITABLE_MASK;
-
-               return 1;
-
-       }
-       return 0;
-}
-
 static void paging_free(struct kvm_vcpu *vcpu)
 {
        nonpaging_free(vcpu);
@@ -1047,10 +999,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
        context->free = paging_free;
        context->root_level = level;
        context->shadow_root_level = level;
-       mmu_alloc_roots(vcpu);
-       ASSERT(VALID_PAGE(context->root_hpa));
-       kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
-                   (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
+       context->root_hpa = INVALID_PAGE;
        return 0;
 }
 
@@ -1069,10 +1018,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
        context->free = paging_free;
        context->root_level = PT32_ROOT_LEVEL;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
-       mmu_alloc_roots(vcpu);
-       ASSERT(VALID_PAGE(context->root_hpa));
-       kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
-                   (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
+       context->root_hpa = INVALID_PAGE;
        return 0;
 }
 
@@ -1106,19 +1052,34 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
 }
 
+/*
+ * Tear down the vcpu's MMU context with destroy_kvm_mmu() and set it up
+ * again; the return value is whatever init_kvm_mmu() returns.
+ */
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
+{
+       destroy_kvm_mmu(vcpu);
+       return init_kvm_mmu(vcpu);
+}
+
+/*
+ * Make the vcpu's shadow MMU usable.  Everything runs under
+ * vcpu->kvm->lock:
+ *   1. top up the vcpu's MMU memory caches; on failure return that error
+ *      without allocating roots;
+ *   2. allocate the shadow roots with mmu_alloc_roots();
+ *   3. pass vcpu->mmu.root_hpa to kvm_arch_ops->set_cr3();
+ *   4. call kvm_mmu_flush_tlb().
+ *
+ * Returns the result of mmu_topup_memory_caches(), i.e. 0 when the whole
+ * sequence ran.
+ */
+int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
        int r;
 
-       destroy_kvm_mmu(vcpu);
-       r = init_kvm_mmu(vcpu);
-       if (r < 0)
-               goto out;
+       spin_lock(&vcpu->kvm->lock);
        r = mmu_topup_memory_caches(vcpu);
+       if (r)
+               goto out;
+       mmu_alloc_roots(vcpu);
+       kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
+       kvm_mmu_flush_tlb(vcpu);
 out:
+       spin_unlock(&vcpu->kvm->lock);
        return r;
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_load);
+
+/*
+ * Release the vcpu's shadow roots by calling mmu_free_roots(), which
+ * returns immediately when vcpu->mmu.root_hpa is not a valid page.
+ */
+void kvm_mmu_unload(struct kvm_vcpu *vcpu)
+{
+       mmu_free_roots(vcpu);
+}
 
-static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu,
+static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
                                  struct kvm_mmu_page *page,
                                  u64 *spte)
 {
@@ -1135,9 +1096,25 @@ static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu,
                }
        }
        *spte = 0;
+       kvm_flush_remote_tlbs(vcpu->kvm);
+}
+
+/*
+ * Forward a guest pte write to the update_pte routine that matches the
+ * shadow page's guest paging mode: paging32_update_pte() when
+ * role.glevels is PT32_ROOT_LEVEL, paging64_update_pte() otherwise.
+ * Shadow pages whose role.level is not PT_PAGE_TABLE_LEVEL are left
+ * untouched.
+ */
+static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
+                                 struct kvm_mmu_page *page,
+                                 u64 *spte,
+                                 const void *new, int bytes)
+{
+       if (page->role.level == PT_PAGE_TABLE_LEVEL) {
+               if (page->role.glevels != PT32_ROOT_LEVEL)
+                       paging64_update_pte(vcpu, page, spte, new, bytes);
+               else
+                       paging32_update_pte(vcpu, page, spte, new, bytes);
+       }
 }
 
-void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
+void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+                      const u8 *old, const u8 *new, int bytes)
 {
        gfn_t gfn = gpa >> PAGE_SHIFT;
        struct kvm_mmu_page *page;
@@ -1149,6 +1126,7 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
        unsigned pte_size;
        unsigned page_offset;
        unsigned misaligned;
+       unsigned quadrant;
        int level;
        int flooded = 0;
        int npte;
@@ -1169,6 +1147,7 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
                        continue;
                pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
                misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+               misaligned |= bytes < 4;
                if (misaligned || flooded) {
                        /*
                         * Misaligned accesses are too much trouble to fix
@@ -1200,21 +1179,20 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
                                page_offset <<= 1;
                                npte = 2;
                        }
+                       quadrant = page_offset >> PAGE_SHIFT;
                        page_offset &= ~PAGE_MASK;
+                       if (quadrant != page->role.quadrant)
+                               continue;
                }
-               spte = __va(page->page_hpa);
-               spte += page_offset / sizeof(*spte);
+               spte = &page->spt[page_offset / sizeof(*spte)];
                while (npte--) {
-                       mmu_pre_write_zap_pte(vcpu, page, spte);
+                       mmu_pte_write_zap_pte(vcpu, page, spte);
+                       mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
                        ++spte;
                }
        }
 }
 
-void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
-{
-}
-
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 {
        gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
@@ -1243,13 +1221,6 @@ static void free_mmu_pages(struct kvm_vcpu *vcpu)
                                    struct kvm_mmu_page, link);
                kvm_mmu_zap_page(vcpu, page);
        }
-       while (!list_empty(&vcpu->free_pages)) {
-               page = list_entry(vcpu->free_pages.next,
-                                 struct kvm_mmu_page, link);
-               list_del(&page->link);
-               __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT));
-               page->page_hpa = INVALID_PAGE;
-       }
        free_page((unsigned long)vcpu->mmu.pae_root);
 }
 
@@ -1260,18 +1231,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 
        ASSERT(vcpu);
 
-       for (i = 0; i < KVM_NUM_MMU_PAGES; i++) {
-               struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i];
-
-               INIT_LIST_HEAD(&page_header->link);
-               if ((page = alloc_page(GFP_KERNEL)) == NULL)
-                       goto error_1;
-               set_page_private(page, (unsigned long)page_header);
-               page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
-               memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
-               list_add(&page_header->link, &vcpu->free_pages);
-               ++vcpu->kvm->n_free_mmu_pages;
-       }
+       vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;
 
        /*
         * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
@@ -1296,7 +1256,6 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 {
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
-       ASSERT(list_empty(&vcpu->free_pages));
 
        return alloc_mmu_pages(vcpu);
 }
@@ -1305,7 +1264,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 {
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
-       ASSERT(!list_empty(&vcpu->free_pages));
 
        return init_kvm_mmu(vcpu);
 }
@@ -1331,7 +1289,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot)
                if (!test_bit(slot, &page->slot_bitmap))
                        continue;
 
-               pt = __va(page->page_hpa);
+               pt = page->spt;
                for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
                        /* avoid RMW */
                        if (pt[i] & PT_WRITABLE_MASK) {
@@ -1354,7 +1312,7 @@ void kvm_mmu_zap_all(struct kvm_vcpu *vcpu)
        }
 
        mmu_free_memory_caches(vcpu);
-       kvm_arch_ops->tlb_flush(vcpu);
+       kvm_flush_remote_tlbs(vcpu->kvm);
        init_kvm_mmu(vcpu);
 }
 
@@ -1364,6 +1322,10 @@ void kvm_mmu_module_exit(void)
                kmem_cache_destroy(pte_chain_cache);
        if (rmap_desc_cache)
                kmem_cache_destroy(rmap_desc_cache);
+       if (mmu_page_cache)
+               kmem_cache_destroy(mmu_page_cache);
+       if (mmu_page_header_cache)
+               kmem_cache_destroy(mmu_page_header_cache);
 }
 
 int kvm_mmu_module_init(void)
@@ -1379,6 +1341,18 @@ int kvm_mmu_module_init(void)
        if (!rmap_desc_cache)
                goto nomem;
 
+       mmu_page_cache = kmem_cache_create("kvm_mmu_page",
+                                          PAGE_SIZE,
+                                          PAGE_SIZE, 0, NULL, NULL);
+       if (!mmu_page_cache)
+               goto nomem;
+
+       mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
+                                                 sizeof(struct kvm_mmu_page),
+                                                 0, 0, NULL, NULL);
+       if (!mmu_page_header_cache)
+               goto nomem;
+
        return 0;
 
 nomem:
@@ -1482,7 +1456,7 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu)
        int i;
 
        list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
-               u64 *pt = __va(page->page_hpa);
+               u64 *pt = page->spt;
 
                if (page->role.level != PT_PAGE_TABLE_LEVEL)
                        continue;
index 73ffbff..a7c5cb0 100644 (file)
@@ -31,7 +31,6 @@
        #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
        #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
        #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
-       #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK
        #ifdef CONFIG_X86_64
        #define PT_MAX_FULL_LEVELS 4
        #else
@@ -46,7 +45,6 @@
        #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
        #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
        #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
-       #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK
        #define PT_MAX_FULL_LEVELS 2
 #else
        #error Invalid PTTYPE value
@@ -192,40 +190,143 @@ static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
        mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]);
 }
 
-static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte,
-                          u64 *shadow_pte, u64 access_bits, gfn_t gfn)
+static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
+                                 u64 *shadow_pte,
+                                 gpa_t gaddr,
+                                 pt_element_t *gpte,
+                                 u64 access_bits,
+                                 int user_fault,
+                                 int write_fault,
+                                 int *ptwrite,
+                                 struct guest_walker *walker,
+                                 gfn_t gfn)
 {
-       ASSERT(*shadow_pte == 0);
-       access_bits &= guest_pte;
-       *shadow_pte = (guest_pte & PT_PTE_COPY_MASK);
-       set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK,
-                      guest_pte & PT_DIRTY_MASK, access_bits, gfn);
+       hpa_t paddr;
+       int dirty = *gpte & PT_DIRTY_MASK;
+       u64 spte = *shadow_pte;
+       int was_rmapped = is_rmap_pte(spte);
+
+       pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
+                " user_fault %d gfn %lx\n",
+                __FUNCTION__, spte, (u64)*gpte, access_bits,
+                write_fault, user_fault, gfn);
+
+       if (write_fault && !dirty) {
+               *gpte |= PT_DIRTY_MASK;
+               dirty = 1;
+               FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
+       }
+
+       spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
+       spte |= *gpte & PT64_NX_MASK;
+       if (!dirty)
+               access_bits &= ~PT_WRITABLE_MASK;
+
+       paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
+
+       spte |= PT_PRESENT_MASK;
+       if (access_bits & PT_USER_MASK)
+               spte |= PT_USER_MASK;
+
+       if (is_error_hpa(paddr)) {
+               spte |= gaddr;
+               spte |= PT_SHADOW_IO_MARK;
+               spte &= ~PT_PRESENT_MASK;
+               set_shadow_pte(shadow_pte, spte);
+               return;
+       }
+
+       spte |= paddr;
+
+       if ((access_bits & PT_WRITABLE_MASK)
+           || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
+               struct kvm_mmu_page *shadow;
+
+               spte |= PT_WRITABLE_MASK;
+               if (user_fault) {
+                       mmu_unshadow(vcpu, gfn);
+                       goto unshadowed;
+               }
+
+               shadow = kvm_mmu_lookup_page(vcpu, gfn);
+               if (shadow) {
+                       pgprintk("%s: found shadow page for %lx, marking ro\n",
+                                __FUNCTION__, gfn);
+                       access_bits &= ~PT_WRITABLE_MASK;
+                       if (is_writeble_pte(spte)) {
+                               spte &= ~PT_WRITABLE_MASK;
+                               kvm_arch_ops->tlb_flush(vcpu);
+                       }
+                       if (write_fault)
+                               *ptwrite = 1;
+               }
+       }
+
+unshadowed:
+
+       if (access_bits & PT_WRITABLE_MASK)
+               mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
+
+       set_shadow_pte(shadow_pte, spte);
+       page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
+       if (!was_rmapped)
+               rmap_add(vcpu, shadow_pte);
 }
 
-static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde,
-                          u64 *shadow_pte, u64 access_bits, gfn_t gfn)
+static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t *gpte,
+                          u64 *shadow_pte, u64 access_bits,
+                          int user_fault, int write_fault, int *ptwrite,
+                          struct guest_walker *walker, gfn_t gfn)
+{
+       access_bits &= *gpte;
+       FNAME(set_pte_common)(vcpu, shadow_pte, *gpte & PT_BASE_ADDR_MASK,
+                             gpte, access_bits, user_fault, write_fault,
+                             ptwrite, walker, gfn);
+}
+
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
+                             u64 *spte, const void *pte, int bytes)
+{
+       pt_element_t gpte;
+
+       if (bytes < sizeof(pt_element_t))
+               return;
+       gpte = *(const pt_element_t *)pte;
+       if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
+               return;
+       pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
+       FNAME(set_pte)(vcpu, &gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
+                      0, NULL, NULL,
+                      (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT);
+}
+
+static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t *gpde,
+                          u64 *shadow_pte, u64 access_bits,
+                          int user_fault, int write_fault, int *ptwrite,
+                          struct guest_walker *walker, gfn_t gfn)
 {
        gpa_t gaddr;
 
-       ASSERT(*shadow_pte == 0);
-       access_bits &= guest_pde;
+       access_bits &= *gpde;
        gaddr = (gpa_t)gfn << PAGE_SHIFT;
        if (PTTYPE == 32 && is_cpuid_PSE36())
-               gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) <<
+               gaddr |= (*gpde & PT32_DIR_PSE36_MASK) <<
                        (32 - PT32_DIR_PSE36_SHIFT);
-       *shadow_pte = guest_pde & PT_PTE_COPY_MASK;
-       set_pte_common(vcpu, shadow_pte, gaddr,
-                      guest_pde & PT_DIRTY_MASK, access_bits, gfn);
+       FNAME(set_pte_common)(vcpu, shadow_pte, gaddr,
+                             gpde, access_bits, user_fault, write_fault,
+                             ptwrite, walker, gfn);
 }
 
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  */
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
-                             struct guest_walker *walker)
+                        struct guest_walker *walker,
+                        int user_fault, int write_fault, int *ptwrite)
 {
        hpa_t shadow_addr;
        int level;
+       u64 *shadow_ent;
        u64 *prev_shadow_ent = NULL;
        pt_element_t *guest_ent = walker->ptep;
 
@@ -242,37 +343,23 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
        for (; ; level--) {
                u32 index = SHADOW_PT_INDEX(addr, level);
-               u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index;
                struct kvm_mmu_page *shadow_page;
                u64 shadow_pte;
                int metaphysical;
                gfn_t table_gfn;
                unsigned hugepage_access = 0;
 
+               shadow_ent = ((u64 *)__va(shadow_addr)) + index;
                if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
                        if (level == PT_PAGE_TABLE_LEVEL)
-                               return shadow_ent;
+                               break;
                        shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
                        prev_shadow_ent = shadow_ent;
                        continue;
                }
 
-               if (level == PT_PAGE_TABLE_LEVEL) {
-
-                       if (walker->level == PT_DIRECTORY_LEVEL) {
-                               if (prev_shadow_ent)
-                                       *prev_shadow_ent |= PT_SHADOW_PS_MARK;
-                               FNAME(set_pde)(vcpu, *guest_ent, shadow_ent,
-                                              walker->inherited_ar,
-                                              walker->gfn);
-                       } else {
-                               ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
-                               FNAME(set_pte)(vcpu, *guest_ent, shadow_ent,
-                                              walker->inherited_ar,
-                                              walker->gfn);
-                       }
-                       return shadow_ent;
-               }
+               if (level == PT_PAGE_TABLE_LEVEL)
+                       break;
 
                if (level - 1 == PT_PAGE_TABLE_LEVEL
                    && walker->level == PT_DIRECTORY_LEVEL) {
@@ -289,90 +376,24 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
                                               metaphysical, hugepage_access,
                                               shadow_ent);
-               shadow_addr = shadow_page->page_hpa;
+               shadow_addr = __pa(shadow_page->spt);
                shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
                        | PT_WRITABLE_MASK | PT_USER_MASK;
                *shadow_ent = shadow_pte;
                prev_shadow_ent = shadow_ent;
        }
-}
 
-/*
- * The guest faulted for write.  We need to
- *
- * - check write permissions
- * - update the guest pte dirty bit
- * - update our own dirty page tracking structures
- */
-static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu,
-                              u64 *shadow_ent,
-                              struct guest_walker *walker,
-                              gva_t addr,
-                              int user,
-                              int *write_pt)
-{
-       pt_element_t *guest_ent;
-       int writable_shadow;
-       gfn_t gfn;
-       struct kvm_mmu_page *page;
-
-       if (is_writeble_pte(*shadow_ent))
-               return !user || (*shadow_ent & PT_USER_MASK);
-
-       writable_shadow = *shadow_ent & PT_SHADOW_WRITABLE_MASK;
-       if (user) {
-               /*
-                * User mode access.  Fail if it's a kernel page or a read-only
-                * page.
-                */
-               if (!(*shadow_ent & PT_SHADOW_USER_MASK) || !writable_shadow)
-                       return 0;
-               ASSERT(*shadow_ent & PT_USER_MASK);
-       } else
-               /*
-                * Kernel mode access.  Fail if it's a read-only page and
-                * supervisor write protection is enabled.
-                */
-               if (!writable_shadow) {
-                       if (is_write_protection(vcpu))
-                               return 0;
-                       *shadow_ent &= ~PT_USER_MASK;
-               }
-
-       guest_ent = walker->ptep;
-
-       if (!is_present_pte(*guest_ent)) {
-               *shadow_ent = 0;
-               return 0;
+       if (walker->level == PT_DIRECTORY_LEVEL) {
+               FNAME(set_pde)(vcpu, guest_ent, shadow_ent,
+                              walker->inherited_ar, user_fault, write_fault,
+                              ptwrite, walker, walker->gfn);
+       } else {
+               ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
+               FNAME(set_pte)(vcpu, guest_ent, shadow_ent,
+                              walker->inherited_ar, user_fault, write_fault,
+                              ptwrite, walker, walker->gfn);
        }
-
-       gfn = walker->gfn;
-
-       if (user) {
-               /*
-                * Usermode page faults won't be for page table updates.
-                */
-               while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
-                       pgprintk("%s: zap %lx %x\n",
-                                __FUNCTION__, gfn, page->role.word);
-                       kvm_mmu_zap_page(vcpu, page);
-               }
-       } else if (kvm_mmu_lookup_page(vcpu, gfn)) {
-               pgprintk("%s: found shadow page for %lx, marking ro\n",
-                        __FUNCTION__, gfn);
-               mark_page_dirty(vcpu->kvm, gfn);
-               FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
-               *guest_ent |= PT_DIRTY_MASK;
-               *write_pt = 1;
-               return 0;
-       }
-       mark_page_dirty(vcpu->kvm, gfn);
-       *shadow_ent |= PT_WRITABLE_MASK;
-       FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
-       *guest_ent |= PT_DIRTY_MASK;
-       rmap_add(vcpu, shadow_ent);
-
-       return 1;
+       return shadow_ent;
 }
 
 /*
@@ -397,7 +418,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        int fetch_fault = error_code & PFERR_FETCH_MASK;
        struct guest_walker walker;
        u64 *shadow_pte;
-       int fixed;
        int write_pt = 0;
        int r;
 
@@ -421,27 +441,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
                pgprintk("%s: guest page fault\n", __FUNCTION__);
                inject_page_fault(vcpu, addr, walker.error_code);
                FNAME(release_walker)(&walker);
+               vcpu->last_pt_write_count = 0; /* reset fork detector */
                return 0;
        }
 
-       shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
-       pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__,
-                shadow_pte, *shadow_pte);
-
-       /*
-        * Update the shadow pte.
-        */
-       if (write_fault)
-               fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr,
-                                           user_fault, &write_pt);
-       else
-               fixed = fix_read_pf(shadow_pte);
-
-       pgprintk("%s: updated shadow pte %p %llx\n", __FUNCTION__,
-                shadow_pte, *shadow_pte);
+       shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+                                 &write_pt);
+       pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
+                shadow_pte, *shadow_pte, write_pt);
 
        FNAME(release_walker)(&walker);
 
+       if (!write_pt)
+               vcpu->last_pt_write_count = 0; /* reset fork detector */
+
        /*
         * mmio: emulate if accessible, otherwise its a guest fault.
         */
@@ -478,7 +491,5 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
 #undef PT_INDEX
 #undef SHADOW_PT_INDEX
 #undef PT_LEVEL_MASK
-#undef PT_PTE_COPY_MASK
-#undef PT_NON_PTE_COPY_MASK
 #undef PT_DIR_BASE_ADDR_MASK
 #undef PT_MAX_FULL_LEVELS
index fa17d6d..bc818cc 100644 (file)
  *
  */
 
+#include "kvm_svm.h"
+#include "x86_emulate.h"
+
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 #include <linux/profile.h>
 #include <linux/sched.h>
-#include <asm/desc.h>
 
-#include "kvm_svm.h"
-#include "x86_emulate.h"
+#include <asm/desc.h>
 
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
@@ -378,7 +379,7 @@ static __init int svm_hardware_setup(void)
        int cpu;
        struct page *iopm_pages;
        struct page *msrpm_pages;
-       void *msrpm_va;
+       void *iopm_va, *msrpm_va;
        int r;
 
        kvm_emulator_want_group7_invlpg();
@@ -387,8 +388,10 @@ static __init int svm_hardware_setup(void)
 
        if (!iopm_pages)
                return -ENOMEM;
-       memset(page_address(iopm_pages), 0xff,
-                                       PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
+
+       iopm_va = page_address(iopm_pages);
+       memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
+       clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
        iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
 
 
@@ -579,7 +582,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
                goto out2;
 
        vcpu->svm->vmcb = page_address(page);
-       memset(vcpu->svm->vmcb, 0, PAGE_SIZE);
+       clear_page(vcpu->svm->vmcb);
        vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
        vcpu->svm->asid_generation = 0;
        memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs));
@@ -587,9 +590,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 
        fx_init(vcpu);
        vcpu->fpu_active = 1;
-       vcpu->apic_base = 0xfee00000 |
-                       /*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
-                       MSR_IA32_APICBASE_ENABLE;
+       vcpu->apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+       if (vcpu == &vcpu->kvm->vcpus[0])
+               vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
 
        return 0;
 
@@ -955,7 +958,7 @@ static int shutdown_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         * VMCB is undefined after a SHUTDOWN intercept
         * so reinitialize it.
         */
-       memset(vcpu->svm->vmcb, 0, PAGE_SIZE);
+       clear_page(vcpu->svm->vmcb);
        init_vmcb(vcpu->svm->vmcb);
 
        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
@@ -1113,12 +1116,7 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1;
        skip_emulated_instruction(vcpu);
-       if (vcpu->irq_summary)
-               return 1;
-
-       kvm_run->exit_reason = KVM_EXIT_HLT;
-       ++vcpu->stat.halt_exits;
-       return 0;
+       return kvm_emulate_halt(vcpu);
 }
 
 static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -1473,6 +1471,11 @@ static void load_db_regs(unsigned long *db_regs)
        asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
 }
 
+static void svm_flush_tlb(struct kvm_vcpu *vcpu)
+{
+       force_new_asid(vcpu);
+}
+
 static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        u16 fs_selector;
@@ -1481,11 +1484,20 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        int r;
 
 again:
+       r = kvm_mmu_reload(vcpu);
+       if (unlikely(r))
+               return r;
+
        if (!vcpu->mmio_read_completed)
                do_interrupt_requests(vcpu, kvm_run);
 
        clgi();
 
+       vcpu->guest_mode = 1;
+       if (vcpu->requests)
+               if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
+                   svm_flush_tlb(vcpu);
+
        pre_svm_run(vcpu);
 
        save_host_msrs(vcpu);
@@ -1617,6 +1629,8 @@ again:
 #endif
                : "cc", "memory" );
 
+       vcpu->guest_mode = 0;
+
        if (vcpu->fpu_active) {
                fx_save(vcpu->guest_fx_image);
                fx_restore(vcpu->host_fx_image);
@@ -1681,11 +1695,6 @@ again:
        return r;
 }
 
-static void svm_flush_tlb(struct kvm_vcpu *vcpu)
-{
-       force_new_asid(vcpu);
-}
-
 static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
        vcpu->svm->vmcb->save.cr3 = root;
@@ -1727,6 +1736,12 @@ static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
 
 static int is_disabled(void)
 {
+       u64 vm_cr;
+
+       rdmsrl(MSR_VM_CR, vm_cr);
+       if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
+               return 1;
+
        return 0;
 }
 
index 5e93814..3b1b0f3 100644 (file)
@@ -175,8 +175,11 @@ struct __attribute__ ((__packed__)) vmcb {
 #define SVM_CPUID_FUNC 0x8000000a
 
 #define MSR_EFER_SVME_MASK (1ULL << 12)
+#define MSR_VM_CR       0xc0010114
 #define MSR_VM_HSAVE_PA 0xc0010117ULL
 
+#define SVM_VM_CR_SVM_DISABLE 4
+
 #define SVM_SELECTOR_S_SHIFT 4
 #define SVM_SELECTOR_DPL_SHIFT 5
 #define SVM_SELECTOR_P_SHIFT 7
index c1ac106..80628f6 100644 (file)
 
 #include "kvm.h"
 #include "vmx.h"
+#include "segment_descriptor.h"
+
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/profile.h>
 #include <linux/sched.h>
+
 #include <asm/io.h>
 #include <asm/desc.h>
 
-#include "segment_descriptor.h"
-
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
+static int init_rmode_tss(struct kvm *kvm);
+
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 
+static struct page *vmx_io_bitmap_a;
+static struct page *vmx_io_bitmap_b;
+
 #ifdef CONFIG_X86_64
 #define HOST_IS_64 1
 #else
 #define HOST_IS_64 0
 #endif
+#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE)
 
 static struct vmcs_descriptor {
        int size;
@@ -82,18 +89,17 @@ static const u32 vmx_msr_index[] = {
 };
 #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
 
-#ifdef CONFIG_X86_64
-static unsigned msr_offset_kernel_gs_base;
-#define NR_64BIT_MSRS 4
-/*
- * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt
- * mechanism (cpu bug AA24)
- */
-#define NR_BAD_MSRS 2
-#else
-#define NR_64BIT_MSRS 0
-#define NR_BAD_MSRS 0
-#endif
+static inline u64 msr_efer_save_restore_bits(struct vmx_msr_entry msr)
+{
+       return (u64)msr.data & EFER_SAVE_RESTORE_BITS;
+}
+
+static inline int msr_efer_need_save_restore(struct kvm_vcpu *vcpu)
+{
+       int efer_offset = vcpu->msr_offset_efer;
+       return msr_efer_save_restore_bits(vcpu->host_msrs[efer_offset]) !=
+               msr_efer_save_restore_bits(vcpu->guest_msrs[efer_offset]);
+}
 
 static inline int is_page_fault(u32 intr_info)
 {
@@ -115,13 +121,23 @@ static inline int is_external_interrupt(u32 intr_info)
                == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
 }
 
-static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr)
+static int __find_msr_index(struct kvm_vcpu *vcpu, u32 msr)
 {
        int i;
 
        for (i = 0; i < vcpu->nmsrs; ++i)
                if (vcpu->guest_msrs[i].index == msr)
-                       return &vcpu->guest_msrs[i];
+                       return i;
+       return -1;
+}
+
+static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr)
+{
+       int i;
+
+       i = __find_msr_index(vcpu, msr);
+       if (i >= 0)
+               return &vcpu->guest_msrs[i];
        return NULL;
 }
 
@@ -147,6 +163,7 @@ static void __vcpu_clear(void *arg)
                vmcs_clear(vcpu->vmcs);
        if (per_cpu(current_vmcs, cpu) == vcpu->vmcs)
                per_cpu(current_vmcs, cpu) = NULL;
+       rdtscll(vcpu->host_tsc);
 }
 
 static void vcpu_clear(struct kvm_vcpu *vcpu)
@@ -234,6 +251,127 @@ static void vmcs_set_bits(unsigned long field, u32 mask)
        vmcs_writel(field, vmcs_readl(field) | mask);
 }
 
+static void update_exception_bitmap(struct kvm_vcpu *vcpu)
+{
+       u32 eb;
+
+       eb = 1u << PF_VECTOR;
+       if (!vcpu->fpu_active)
+               eb |= 1u << NM_VECTOR;
+       if (vcpu->guest_debug.enabled)
+               eb |= 1u << 1;
+       if (vcpu->rmode.active)
+               eb = ~0;
+       vmcs_write32(EXCEPTION_BITMAP, eb);
+}
+
+static void reload_tss(void)
+{
+#ifndef CONFIG_X86_64
+
+       /*
+        * VT restores TR but not its size.  Useless.
+        */
+       struct descriptor_table gdt;
+       struct segment_descriptor *descs;
+
+       get_gdt(&gdt);
+       descs = (void *)gdt.base;
+       descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
+       load_TR_desc();
+#endif
+}
+
+static void load_transition_efer(struct kvm_vcpu *vcpu)
+{
+       u64 trans_efer;
+       int efer_offset = vcpu->msr_offset_efer;
+
+       trans_efer = vcpu->host_msrs[efer_offset].data;
+       trans_efer &= ~EFER_SAVE_RESTORE_BITS;
+       trans_efer |= msr_efer_save_restore_bits(
+                               vcpu->guest_msrs[efer_offset]);
+       wrmsrl(MSR_EFER, trans_efer);
+       vcpu->stat.efer_reload++;
+}
+
+static void vmx_save_host_state(struct kvm_vcpu *vcpu)
+{
+       struct vmx_host_state *hs = &vcpu->vmx_host_state;
+
+       if (hs->loaded)
+               return;
+
+       hs->loaded = 1;
+       /*
+        * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
+        * allow segment selectors with cpl > 0 or ti == 1.
+        */
+       hs->ldt_sel = read_ldt();
+       hs->fs_gs_ldt_reload_needed = hs->ldt_sel;
+       hs->fs_sel = read_fs();
+       if (!(hs->fs_sel & 7))
+               vmcs_write16(HOST_FS_SELECTOR, hs->fs_sel);
+       else {
+               vmcs_write16(HOST_FS_SELECTOR, 0);
+               hs->fs_gs_ldt_reload_needed = 1;
+       }
+       hs->gs_sel = read_gs();
+       if (!(hs->gs_sel & 7))
+               vmcs_write16(HOST_GS_SELECTOR, hs->gs_sel);
+       else {
+               vmcs_write16(HOST_GS_SELECTOR, 0);
+               hs->fs_gs_ldt_reload_needed = 1;
+       }
+
+#ifdef CONFIG_X86_64
+       vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
+       vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
+#else
+       vmcs_writel(HOST_FS_BASE, segment_base(hs->fs_sel));
+       vmcs_writel(HOST_GS_BASE, segment_base(hs->gs_sel));
+#endif
+
+#ifdef CONFIG_X86_64
+       if (is_long_mode(vcpu)) {
+               save_msrs(vcpu->host_msrs + vcpu->msr_offset_kernel_gs_base, 1);
+       }
+#endif
+       load_msrs(vcpu->guest_msrs, vcpu->save_nmsrs);
+       if (msr_efer_need_save_restore(vcpu))
+               load_transition_efer(vcpu);
+}
+
+static void vmx_load_host_state(struct kvm_vcpu *vcpu)
+{
+       struct vmx_host_state *hs = &vcpu->vmx_host_state;
+
+       if (!hs->loaded)
+               return;
+
+       hs->loaded = 0;
+       if (hs->fs_gs_ldt_reload_needed) {
+               load_ldt(hs->ldt_sel);
+               load_fs(hs->fs_sel);
+               /*
+                * If we have to reload gs, we must take care to
+                * preserve our gs base.
+                */
+               local_irq_disable();
+               load_gs(hs->gs_sel);
+#ifdef CONFIG_X86_64
+               wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
+#endif
+               local_irq_enable();
+
+               reload_tss();
+       }
+       save_msrs(vcpu->guest_msrs, vcpu->save_nmsrs);
+       load_msrs(vcpu->host_msrs, vcpu->save_nmsrs);
+       if (msr_efer_need_save_restore(vcpu))
+               load_msrs(vcpu->host_msrs + vcpu->msr_offset_efer, 1);
+}
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
@@ -242,6 +380,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu)
 {
        u64 phys_addr = __pa(vcpu->vmcs);
        int cpu;
+       u64 tsc_this, delta;
 
        cpu = get_cpu();
 
@@ -275,15 +414,43 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu)
 
                rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
                vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+
+               /*
+                * Make sure the time stamp counter is monotonic.
+                */
+               rdtscll(tsc_this);
+               delta = vcpu->host_tsc - tsc_this;
+               vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
        }
 }
 
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
+       vmx_load_host_state(vcpu);
        kvm_put_guest_fpu(vcpu);
        put_cpu();
 }
 
+static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->fpu_active)
+               return;
+       vcpu->fpu_active = 1;
+       vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK);
+       if (vcpu->cr0 & CR0_TS_MASK)
+               vmcs_set_bits(GUEST_CR0, CR0_TS_MASK);
+       update_exception_bitmap(vcpu);
+}
+
+static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
+{
+       if (!vcpu->fpu_active)
+               return;
+       vcpu->fpu_active = 0;
+       vmcs_set_bits(GUEST_CR0, CR0_TS_MASK);
+       update_exception_bitmap(vcpu);
+}
+
 static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
 {
        vcpu_clear(vcpu);
@@ -331,6 +498,20 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
                     INTR_INFO_VALID_MASK);
 }
 
+/*
+ * Swap entries 'from' and 'to' in both the guest and host MSR arrays.
+ */
+void move_msr_up(struct kvm_vcpu *vcpu, int from, int to)
+{
+       struct vmx_msr_entry tmp;
+       tmp = vcpu->guest_msrs[to];
+       vcpu->guest_msrs[to] = vcpu->guest_msrs[from];
+       vcpu->guest_msrs[from] = tmp;
+       tmp = vcpu->host_msrs[to];
+       vcpu->host_msrs[to] = vcpu->host_msrs[from];
+       vcpu->host_msrs[from] = tmp;
+}
+
 /*
  * Set up the vmcs to automatically save and restore system
  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
@@ -338,35 +519,41 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
  */
 static void setup_msrs(struct kvm_vcpu *vcpu)
 {
-       int nr_skip, nr_good_msrs;
-
-       if (is_long_mode(vcpu))
-               nr_skip = NR_BAD_MSRS;
-       else
-               nr_skip = NR_64BIT_MSRS;
-       nr_good_msrs = vcpu->nmsrs - nr_skip;
+       int save_nmsrs;
 
-       /*
-        * MSR_K6_STAR is only needed on long mode guests, and only
-        * if efer.sce is enabled.
-        */
-       if (find_msr_entry(vcpu, MSR_K6_STAR)) {
-               --nr_good_msrs;
+       save_nmsrs = 0;
 #ifdef CONFIG_X86_64
-               if (is_long_mode(vcpu) && (vcpu->shadow_efer & EFER_SCE))
-                       ++nr_good_msrs;
-#endif
+       if (is_long_mode(vcpu)) {
+               int index;
+
+               index = __find_msr_index(vcpu, MSR_SYSCALL_MASK);
+               if (index >= 0)
+                       move_msr_up(vcpu, index, save_nmsrs++);
+               index = __find_msr_index(vcpu, MSR_LSTAR);
+               if (index >= 0)
+                       move_msr_up(vcpu, index, save_nmsrs++);
+               index = __find_msr_index(vcpu, MSR_CSTAR);
+               if (index >= 0)
+                       move_msr_up(vcpu, index, save_nmsrs++);
+               index = __find_msr_index(vcpu, MSR_KERNEL_GS_BASE);
+               if (index >= 0)
+                       move_msr_up(vcpu, index, save_nmsrs++);
+               /*
+                * MSR_K6_STAR is only needed on long mode guests, and only
+                * if efer.sce is enabled.
+                */
+               index = __find_msr_index(vcpu, MSR_K6_STAR);
+               if ((index >= 0) && (vcpu->shadow_efer & EFER_SCE))
+                       move_msr_up(vcpu, index, save_nmsrs++);
        }
+#endif
+       vcpu->save_nmsrs = save_nmsrs;
 
-       vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR,
-                   virt_to_phys(vcpu->guest_msrs + nr_skip));
-       vmcs_writel(VM_EXIT_MSR_STORE_ADDR,
-                   virt_to_phys(vcpu->guest_msrs + nr_skip));
-       vmcs_writel(VM_EXIT_MSR_LOAD_ADDR,
-                   virt_to_phys(vcpu->host_msrs + nr_skip));
-       vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
-       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);  /* 22.2.2 */
-       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
+#ifdef CONFIG_X86_64
+       vcpu->msr_offset_kernel_gs_base =
+               __find_msr_index(vcpu, MSR_KERNEL_GS_BASE);
+#endif
+       vcpu->msr_offset_efer = __find_msr_index(vcpu, MSR_EFER);
 }
 
 /*
@@ -394,23 +581,6 @@ static void guest_write_tsc(u64 guest_tsc)
        vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
 }
 
-static void reload_tss(void)
-{
-#ifndef CONFIG_X86_64
-
-       /*
-        * VT restores TR but not its size.  Useless.
-        */
-       struct descriptor_table gdt;
-       struct segment_descriptor *descs;
-
-       get_gdt(&gdt);
-       descs = (void *)gdt.base;
-       descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
-       load_TR_desc();
-#endif
-}
-
 /*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
@@ -470,10 +640,15 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 {
        struct vmx_msr_entry *msr;
+       int ret = 0;
+
        switch (msr_index) {
 #ifdef CONFIG_X86_64
        case MSR_EFER:
-               return kvm_set_msr_common(vcpu, msr_index, data);
+               ret = kvm_set_msr_common(vcpu, msr_index, data);
+               if (vcpu->vmx_host_state.loaded)
+                       load_transition_efer(vcpu);
+               break;
        case MSR_FS_BASE:
                vmcs_writel(GUEST_FS_BASE, data);
                break;
@@ -497,14 +672,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
                msr = find_msr_entry(vcpu, msr_index);
                if (msr) {
                        msr->data = data;
+                       if (vcpu->vmx_host_state.loaded)
+                               load_msrs(vcpu->guest_msrs, vcpu->save_nmsrs);
                        break;
                }
-               return kvm_set_msr_common(vcpu, msr_index, data);
-               msr->data = data;
-               break;
+               ret = kvm_set_msr_common(vcpu, msr_index, data);
        }
 
-       return 0;
+       return ret;
 }
 
 /*
@@ -530,10 +705,8 @@ static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
 static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
 {
        unsigned long dr7 = 0x400;
-       u32 exception_bitmap;
        int old_singlestep;
 
-       exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
        old_singlestep = vcpu->guest_debug.singlestep;
 
        vcpu->guest_debug.enabled = dbg->enabled;
@@ -549,13 +722,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
                        dr7 |= 0 << (i*4+16); /* execution breakpoint */
                }
 
-               exception_bitmap |= (1u << 1);  /* Trap debug exceptions */
-
                vcpu->guest_debug.singlestep = dbg->singlestep;
-       } else {
-               exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */
+       } else
                vcpu->guest_debug.singlestep = 0;
-       }
 
        if (old_singlestep && !vcpu->guest_debug.singlestep) {
                unsigned long flags;
@@ -565,7 +734,7 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
                vmcs_writel(GUEST_RFLAGS, flags);
        }
 
-       vmcs_write32(EXCEPTION_BITMAP, exception_bitmap);
+       update_exception_bitmap(vcpu);
        vmcs_writel(GUEST_DR7, dr7);
 
        return 0;
@@ -679,14 +848,6 @@ static __exit void hardware_unsetup(void)
        free_kvm_area();
 }
 
-static void update_exception_bitmap(struct kvm_vcpu *vcpu)
-{
-       if (vcpu->rmode.active)
-               vmcs_write32(EXCEPTION_BITMAP, ~0);
-       else
-               vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
-}
-
 static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
 {
        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -793,6 +954,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
        fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds);
        fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
        fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
+
+       init_rmode_tss(vcpu->kvm);
 }
 
 #ifdef CONFIG_X86_64
@@ -837,6 +1000,8 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 
 static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
+       vmx_fpu_deactivate(vcpu);
+
        if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
                enter_pmode(vcpu);
 
@@ -852,26 +1017,20 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        }
 #endif
 
-       if (!(cr0 & CR0_TS_MASK)) {
-               vcpu->fpu_active = 1;
-               vmcs_clear_bits(EXCEPTION_BITMAP, CR0_TS_MASK);
-       }
-
        vmcs_writel(CR0_READ_SHADOW, cr0);
        vmcs_writel(GUEST_CR0,
                    (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
        vcpu->cr0 = cr0;
+
+       if (!(cr0 & CR0_TS_MASK) || !(cr0 & CR0_PE_MASK))
+               vmx_fpu_activate(vcpu);
 }
 
 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
        vmcs_writel(GUEST_CR3, cr3);
-
-       if (!(vcpu->cr0 & CR0_TS_MASK)) {
-               vcpu->fpu_active = 0;
-               vmcs_set_bits(GUEST_CR0, CR0_TS_MASK);
-               vmcs_set_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR);
-       }
+       if (vcpu->cr0 & CR0_PE_MASK)
+               vmx_fpu_deactivate(vcpu);
 }
 
 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -937,23 +1096,11 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
        var->unusable = (ar >> 16) & 1;
 }
 
-static void vmx_set_segment(struct kvm_vcpu *vcpu,
-                           struct kvm_segment *var, int seg)
+static u32 vmx_segment_access_rights(struct kvm_segment *var)
 {
-       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
        u32 ar;
 
-       vmcs_writel(sf->base, var->base);
-       vmcs_write32(sf->limit, var->limit);
-       vmcs_write16(sf->selector, var->selector);
-       if (vcpu->rmode.active && var->s) {
-               /*
-                * Hack real-mode segments into vm86 compatibility.
-                */
-               if (var->base == 0xffff0000 && var->selector == 0xf000)
-                       vmcs_writel(sf->base, 0xf0000);
-               ar = 0xf3;
-       } else if (var->unusable)
+       if (var->unusable)
                ar = 1 << 16;
        else {
                ar = var->type & 15;
@@ -967,6 +1114,35 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
        }
        if (ar == 0) /* a 0 value means unusable */
                ar = AR_UNUSABLE_MASK;
+
+       return ar;
+}
+
+static void vmx_set_segment(struct kvm_vcpu *vcpu,
+                           struct kvm_segment *var, int seg)
+{
+       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+       u32 ar;
+
+       if (vcpu->rmode.active && seg == VCPU_SREG_TR) {
+               vcpu->rmode.tr.selector = var->selector;
+               vcpu->rmode.tr.base = var->base;
+               vcpu->rmode.tr.limit = var->limit;
+               vcpu->rmode.tr.ar = vmx_segment_access_rights(var);
+               return;
+       }
+       vmcs_writel(sf->base, var->base);
+       vmcs_write32(sf->limit, var->limit);
+       vmcs_write16(sf->selector, var->selector);
+       if (vcpu->rmode.active && var->s) {
+               /*
+                * Hack real-mode segments into vm86 compatibility.
+                */
+               if (var->base == 0xffff0000 && var->selector == 0xf000)
+                       vmcs_writel(sf->base, 0xf0000);
+               ar = 0xf3;
+       } else
+               ar = vmx_segment_access_rights(var);
        vmcs_write32(sf->ar_bytes, ar);
 }
 
@@ -1018,16 +1194,16 @@ static int init_rmode_tss(struct kvm* kvm)
        }
 
        page = kmap_atomic(p1, KM_USER0);
-       memset(page, 0, PAGE_SIZE);
+       clear_page(page);
        *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
        kunmap_atomic(page, KM_USER0);
 
        page = kmap_atomic(p2, KM_USER0);
-       memset(page, 0, PAGE_SIZE);
+       clear_page(page);
        kunmap_atomic(page, KM_USER0);
 
        page = kmap_atomic(p3, KM_USER0);
-       memset(page, 0, PAGE_SIZE);
+       clear_page(page);
        *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
        kunmap_atomic(page, KM_USER0);
 
@@ -1066,7 +1242,7 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
        struct descriptor_table dt;
        int i;
        int ret = 0;
-       extern asmlinkage void kvm_vmx_return(void);
+       unsigned long kvm_vmx_return;
 
        if (!init_rmode_tss(vcpu->kvm)) {
                ret = -ENOMEM;
@@ -1076,9 +1252,9 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
        memset(vcpu->regs, 0, sizeof(vcpu->regs));
        vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
        vcpu->cr8 = 0;
-       vcpu->apic_base = 0xfee00000 |
-                       /*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
-                       MSR_IA32_APICBASE_ENABLE;
+       vcpu->apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+       if (vcpu == &vcpu->kvm->vcpus[0])
+               vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
 
        fx_init(vcpu);
 
@@ -1129,8 +1305,8 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
        vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
 
        /* I/O */
-       vmcs_write64(IO_BITMAP_A, 0);
-       vmcs_write64(IO_BITMAP_B, 0);
+       vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
+       vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
 
        guest_write_tsc(0);
 
@@ -1150,12 +1326,11 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
                               CPU_BASED_HLT_EXITING         /* 20.6.2 */
                               | CPU_BASED_CR8_LOAD_EXITING    /* 20.6.2 */
                               | CPU_BASED_CR8_STORE_EXITING   /* 20.6.2 */
-                              | CPU_BASED_UNCOND_IO_EXITING   /* 20.6.2 */
+                              | CPU_BASED_ACTIVATE_IO_BITMAP  /* 20.6.2 */
                               | CPU_BASED_MOV_DR_EXITING
                               | CPU_BASED_USE_TSC_OFFSETING   /* 21.3 */
                        );
 
-       vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
        vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
@@ -1185,8 +1360,11 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
        get_idt(&dt);
        vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
 
-
-       vmcs_writel(HOST_RIP, (unsigned long)kvm_vmx_return); /* 22.2.5 */
+       asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
+       vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
+       vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
 
        rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
        vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
@@ -1210,10 +1388,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
                vcpu->host_msrs[j].reserved = 0;
                vcpu->host_msrs[j].data = data;
                vcpu->guest_msrs[j] = vcpu->host_msrs[j];
-#ifdef CONFIG_X86_64
-               if (index == MSR_KERNEL_GS_BASE)
-                       msr_offset_kernel_gs_base = j;
-#endif
                ++vcpu->nmsrs;
        }
 
@@ -1241,6 +1415,8 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
 #ifdef CONFIG_X86_64
        vmx_set_efer(vcpu, 0);
 #endif
+       vmx_fpu_activate(vcpu);
+       update_exception_bitmap(vcpu);
 
        return 0;
 
@@ -1365,7 +1541,11 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
        if (!vcpu->rmode.active)
                return 0;
 
-       if (vec == GP_VECTOR && err_code == 0)
+       /*
+        * Instructions with the address-size override prefix (opcode 0x67)
+        * cause a #SS fault with error code 0 in VM86 mode.
+        */
+       if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
                if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE)
                        return 1;
        return 0;
@@ -1400,10 +1580,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        }
 
        if (is_no_device(intr_info)) {
-               vcpu->fpu_active = 1;
-               vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR);
-               if (!(vcpu->cr0 & CR0_TS_MASK))
-                       vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK);
+               vmx_fpu_activate(vcpu);
                return 1;
        }
 
@@ -1445,8 +1622,13 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
        if (vcpu->rmode.active &&
            handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
-                                                               error_code))
+                                                               error_code)) {
+               if (vcpu->halt_request) {
+                       vcpu->halt_request = 0;
+                       return kvm_emulate_halt(vcpu);
+               }
                return 1;
+       }
 
        if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
                kvm_run->exit_reason = KVM_EXIT_DEBUG;
@@ -1595,11 +1777,10 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                break;
        case 2: /* clts */
                vcpu_load_rsp_rip(vcpu);
-               vcpu->fpu_active = 1;
-               vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR);
-               vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK);
+               vmx_fpu_deactivate(vcpu);
                vcpu->cr0 &= ~CR0_TS_MASK;
                vmcs_writel(CR0_READ_SHADOW, vcpu->cr0);
+               vmx_fpu_activate(vcpu);
                skip_emulated_instruction(vcpu);
                return 1;
        case 1: /*mov from cr*/
@@ -1734,12 +1915,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
 static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        skip_emulated_instruction(vcpu);
-       if (vcpu->irq_summary)
-               return 1;
-
-       kvm_run->exit_reason = KVM_EXIT_HLT;
-       ++vcpu->stat.halt_exits;
-       return 0;
+       return kvm_emulate_halt(vcpu);
 }
 
 static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -1770,7 +1946,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
 };
 
 static const int kvm_vmx_max_exit_handlers =
-       sizeof(kvm_vmx_exit_handlers) / sizeof(*kvm_vmx_exit_handlers);
+       ARRAY_SIZE(kvm_vmx_exit_handlers);
 
 /*
  * The guest has exited.  See if we can fix it or if we need userspace
@@ -1810,61 +1986,44 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
                (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
 }
 
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+{
+}
+
 static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        u8 fail;
-       u16 fs_sel, gs_sel, ldt_sel;
-       int fs_gs_ldt_reload_needed;
        int r;
 
-again:
-       /*
-        * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
-        * allow segment selectors with cpl > 0 or ti == 1.
-        */
-       fs_sel = read_fs();
-       gs_sel = read_gs();
-       ldt_sel = read_ldt();
-       fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
-       if (!fs_gs_ldt_reload_needed) {
-               vmcs_write16(HOST_FS_SELECTOR, fs_sel);
-               vmcs_write16(HOST_GS_SELECTOR, gs_sel);
-       } else {
-               vmcs_write16(HOST_FS_SELECTOR, 0);
-               vmcs_write16(HOST_GS_SELECTOR, 0);
-       }
-
-#ifdef CONFIG_X86_64
-       vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
-       vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
-#else
-       vmcs_writel(HOST_FS_BASE, segment_base(fs_sel));
-       vmcs_writel(HOST_GS_BASE, segment_base(gs_sel));
-#endif
+preempted:
+       if (vcpu->guest_debug.enabled)
+               kvm_guest_debug_pre(vcpu);
 
+again:
        if (!vcpu->mmio_read_completed)
                do_interrupt_requests(vcpu, kvm_run);
 
-       if (vcpu->guest_debug.enabled)
-               kvm_guest_debug_pre(vcpu);
-
+       vmx_save_host_state(vcpu);
        kvm_load_guest_fpu(vcpu);
 
+       r = kvm_mmu_reload(vcpu);
+       if (unlikely(r))
+               goto out;
+
        /*
         * Loading guest fpu may have cleared host cr0.ts
         */
        vmcs_writel(HOST_CR0, read_cr0());
 
-#ifdef CONFIG_X86_64
-       if (is_long_mode(vcpu)) {
-               save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1);
-               load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
-       }
-#endif
+       local_irq_disable();
+
+       vcpu->guest_mode = 1;
+       if (vcpu->requests)
+               if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
+                   vmx_flush_tlb(vcpu);
 
        asm (
                /* Store host registers */
-               "pushf \n\t"
 #ifdef CONFIG_X86_64
                "push %%rax; push %%rbx; push %%rdx;"
                "push %%rsi; push %%rdi; push %%rbp;"
@@ -1909,12 +2068,11 @@ again:
                "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
 #endif
                /* Enter guest mode */
-               "jne launched \n\t"
+               "jne .Llaunched \n\t"
                ASM_VMX_VMLAUNCH "\n\t"
-               "jmp kvm_vmx_return \n\t"
-               "launched: " ASM_VMX_VMRESUME "\n\t"
-               ".globl kvm_vmx_return \n\t"
-               "kvm_vmx_return: "
+               "jmp .Lkvm_vmx_return \n\t"
+               ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
+               ".Lkvm_vmx_return: "
                /* Save guest registers, load host registers, keep flags */
 #ifdef CONFIG_X86_64
                "xchg %3,     (%%rsp) \n\t"
@@ -1957,7 +2115,6 @@ again:
                "pop %%ecx; popa \n\t"
 #endif
                "setbe %0 \n\t"
-               "popf \n\t"
              : "=q" (fail)
              : "r"(vcpu->launched), "d"((unsigned long)HOST_RSP),
                "c"(vcpu),
@@ -1981,84 +2138,61 @@ again:
                [cr2]"i"(offsetof(struct kvm_vcpu, cr2))
              : "cc", "memory" );
 
-       /*
-        * Reload segment selectors ASAP. (it's needed for a functional
-        * kernel: x86 relies on having __KERNEL_PDA in %fs and x86_64
-        * relies on having 0 in %gs for the CPU PDA to work.)
-        */
-       if (fs_gs_ldt_reload_needed) {
-               load_ldt(ldt_sel);
-               load_fs(fs_sel);
-               /*
-                * If we have to reload gs, we must take care to
-                * preserve our gs base.
-                */
-               local_irq_disable();
-               load_gs(gs_sel);
-#ifdef CONFIG_X86_64
-               wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
-#endif
-               local_irq_enable();
+       vcpu->guest_mode = 0;
+       local_irq_enable();
 
-               reload_tss();
-       }
        ++vcpu->stat.exits;
 
-#ifdef CONFIG_X86_64
-       if (is_long_mode(vcpu)) {
-               save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
-               load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
-       }
-#endif
-
        vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
 
        asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
 
-       if (fail) {
+       if (unlikely(fail)) {
                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                kvm_run->fail_entry.hardware_entry_failure_reason
                        = vmcs_read32(VM_INSTRUCTION_ERROR);
                r = 0;
-       } else {
-               /*
-                * Profile KVM exit RIPs:
-                */
-               if (unlikely(prof_on == KVM_PROFILING))
-                       profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP));
-
-               vcpu->launched = 1;
-               r = kvm_handle_exit(kvm_run, vcpu);
-               if (r > 0) {
-                       /* Give scheduler a change to reschedule. */
-                       if (signal_pending(current)) {
-                               ++vcpu->stat.signal_exits;
-                               post_kvm_run_save(vcpu, kvm_run);
-                               kvm_run->exit_reason = KVM_EXIT_INTR;
-                               return -EINTR;
-                       }
-
-                       if (dm_request_for_irq_injection(vcpu, kvm_run)) {
-                               ++vcpu->stat.request_irq_exits;
-                               post_kvm_run_save(vcpu, kvm_run);
-                               kvm_run->exit_reason = KVM_EXIT_INTR;
-                               return -EINTR;
-                       }
-
-                       kvm_resched(vcpu);
+               goto out;
+       }
+       /*
+        * Profile KVM exit RIPs:
+        */
+       if (unlikely(prof_on == KVM_PROFILING))
+               profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP));
+
+       vcpu->launched = 1;
+       r = kvm_handle_exit(kvm_run, vcpu);
+       if (r > 0) {
+               /* Give the scheduler a chance to reschedule. */
+               if (signal_pending(current)) {
+                       r = -EINTR;
+                       kvm_run->exit_reason = KVM_EXIT_INTR;
+                       ++vcpu->stat.signal_exits;
+                       goto out;
+               }
+
+               if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+                       r = -EINTR;
+                       kvm_run->exit_reason = KVM_EXIT_INTR;
+                       ++vcpu->stat.request_irq_exits;
+                       goto out;
+               }
+               if (!need_resched()) {
+                       ++vcpu->stat.light_exits;
                        goto again;
                }
        }
 
+out:
+       if (r > 0) {
+               kvm_resched(vcpu);
+               goto preempted;
+       }
+
        post_kvm_run_save(vcpu, kvm_run);
        return r;
 }
 
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
-{
-       vmcs_writel(GUEST_CR3, vmcs_readl(GUEST_CR3));
-}
-
 static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
                                  unsigned long addr,
                                  u32 err_code)
@@ -2122,7 +2256,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
        vmcs_clear(vmcs);
        vcpu->vmcs = vmcs;
        vcpu->launched = 0;
-       vcpu->fpu_active = 1;
 
        return 0;
 
@@ -2188,11 +2321,50 @@ static struct kvm_arch_ops vmx_arch_ops = {
 
 static int __init vmx_init(void)
 {
-       return kvm_init_arch(&vmx_arch_ops, THIS_MODULE);
+       void *iova;
+       int r;
+
+       vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+       if (!vmx_io_bitmap_a)
+               return -ENOMEM;
+
+       vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+       if (!vmx_io_bitmap_b) {
+               r = -ENOMEM;
+               goto out;
+       }
+
+       /*
+        * Allow direct access to the PC debug port (it is often used for I/O
+        * delays, but the vmexits simply slow things down).
+        */
+       iova = kmap(vmx_io_bitmap_a);
+       memset(iova, 0xff, PAGE_SIZE);
+       clear_bit(0x80, iova);
+       kunmap(vmx_io_bitmap_a);
+
+       iova = kmap(vmx_io_bitmap_b);
+       memset(iova, 0xff, PAGE_SIZE);
+       kunmap(vmx_io_bitmap_b);
+
+       r = kvm_init_arch(&vmx_arch_ops, THIS_MODULE);
+       if (r)
+               goto out1;
+
+       return 0;
+
+out1:
+       __free_page(vmx_io_bitmap_b);
+out:
+       __free_page(vmx_io_bitmap_a);
+       return r;
 }
 
 static void __exit vmx_exit(void)
 {
+       __free_page(vmx_io_bitmap_b);
+       __free_page(vmx_io_bitmap_a);
+
        kvm_exit_arch();
 }
 
index 7ade090..f60012d 100644 (file)
@@ -98,8 +98,11 @@ static u8 opcode_table[256] = {
        0, 0, 0, 0,
        /* 0x40 - 0x4F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0x50 - 0x5F */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0x50 - 0x57 */
+       0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0x58 - 0x5F */
+       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
        /* 0x60 - 0x6F */
        0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -128,9 +131,9 @@ static u8 opcode_table[256] = {
        /* 0xB0 - 0xBF */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xC0 - 0xC7 */
-       ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 0, 0,
-       0, 0, ByteOp | DstMem | SrcImm | ModRM | Mov,
-           DstMem | SrcImm | ModRM | Mov,
+       ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
+       0, ImplicitOps, 0, 0,
+       ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
        /* 0xC8 - 0xCF */
        0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xD0 - 0xD7 */
@@ -143,7 +146,8 @@ static u8 opcode_table[256] = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xF0 - 0xF7 */
        0, 0, 0, 0,
-       0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+       ImplicitOps, 0,
+       ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
        /* 0xF8 - 0xFF */
        0, 0, 0, 0,
        0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
@@ -152,7 +156,7 @@ static u8 opcode_table[256] = {
 static u16 twobyte_table[256] = {
        /* 0x00 - 0x0F */
        0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
-       0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
+       0, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
        /* 0x10 - 0x1F */
        0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
        /* 0x20 - 0x2F */
@@ -481,6 +485,7 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        int mode = ctxt->mode;
        unsigned long modrm_ea;
        int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0;
+       int no_wb = 0;
 
        /* Shadow copy of register state. Committed on successful emulation. */
        unsigned long _regs[NR_VCPU_REGS];
@@ -1047,7 +1052,7 @@ done_prefixes:
                                                      _regs[VCPU_REGS_RSP]),
                                     &dst.val, dst.bytes, ctxt)) != 0)
                                goto done;
-                       dst.val = dst.orig_val; /* skanky: disable writeback */
+                       no_wb = 1;
                        break;
                default:
                        goto cannot_emulate;
@@ -1056,7 +1061,7 @@ done_prefixes:
        }
 
 writeback:
-       if ((d & Mov) || (dst.orig_val != dst.val)) {
+       if (!no_wb) {
                switch (dst.type) {
                case OP_REG:
                        /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
@@ -1149,6 +1154,23 @@ special_insn:
        case 0xae ... 0xaf:     /* scas */
                DPRINTF("Urk! I don't handle SCAS.\n");
                goto cannot_emulate;
+       case 0xf4:              /* hlt */
+               ctxt->vcpu->halt_request = 1;
+               goto done;
+       case 0xc3: /* ret */
+               dst.ptr = &_eip;
+               goto pop_instruction;
+       case 0x58 ... 0x5f: /* pop reg */
+               dst.ptr = (unsigned long *)&_regs[b & 0x7];
+
+pop_instruction:
+               if ((rc = ops->read_std(register_address(ctxt->ss_base,
+                       _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt)) != 0)
+                       goto done;
+
+               register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
+               no_wb = 1; /* Disable writeback. */
+               break;
        }
        goto writeback;
 
@@ -1302,8 +1324,10 @@ twobyte_insn:
 
 twobyte_special_insn:
        /* Disable writeback. */
-       dst.orig_val = dst.val;
+       no_wb = 1;
        switch (b) {
+       case 0x09:              /* wbinvd */
+               break;
        case 0x0d:              /* GrpP (prefetch) */
        case 0x18:              /* Grp16 (prefetch/nop) */
                break;
index a260198..b4a7588 100644 (file)
@@ -139,6 +139,7 @@ err_put_filp:
        put_filp(file);
        return error;
 }
+EXPORT_SYMBOL_GPL(anon_inode_getfd);
 
 /*
  * A single inode exists for all anon_inode files. Contrary to pipes,
index 9d713c0..36cc20d 100644 (file)
@@ -13,7 +13,6 @@
 #define HPFS_SUPER_MAGIC       0xf995e849
 #define ISOFS_SUPER_MAGIC      0x9660
 #define JFFS2_SUPER_MAGIC      0x72b6
-#define KVMFS_SUPER_MAGIC      0x19700426
 #define ANON_INODE_FS_MAGIC    0x09041934
 
 #define MINIX_SUPER_MAGIC      0x137F          /* original minix fs */
index 9431101..576f2bb 100644 (file)
@@ -196,6 +196,8 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
 #define CPU_DEAD               0x0007 /* CPU (unsigned)v dead */
 #define CPU_LOCK_ACQUIRE       0x0008 /* Acquire all hotcpu locks */
 #define CPU_LOCK_RELEASE       0x0009 /* Release all hotcpu locks */
+#define CPU_DYING              0x000A /* CPU (unsigned)v not running any task,
+                                       * not handling interrupts, soon dead */
 
 /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend
  * operation in progress
@@ -208,6 +210,7 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
 #define CPU_DOWN_PREPARE_FROZEN        (CPU_DOWN_PREPARE | CPU_TASKS_FROZEN)
 #define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN)
 #define CPU_DEAD_FROZEN                (CPU_DEAD | CPU_TASKS_FROZEN)
+#define CPU_DYING_FROZEN       (CPU_DYING | CPU_TASKS_FROZEN)
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_NOTIFIER_H */
index 96ac21f..8039dac 100644 (file)
@@ -7,6 +7,7 @@
  */
 
 #include <linux/errno.h>
+#include <asm/system.h>
 
 extern void cpu_idle(void);
 
@@ -102,7 +103,11 @@ static inline void smp_send_reschedule(int cpu) { }
 static inline int smp_call_function_single(int cpuid, void (*func) (void *info),
                                           void *info, int retry, int wait)
 {
-       return -EBUSY;
+       WARN_ON(cpuid != 0);
+       local_irq_disable();
+       func(info);
+       local_irq_enable();
+       return 0;
 }
 
 #endif /* !SMP */
index 208cf34..181ae70 100644 (file)
@@ -103,11 +103,19 @@ static inline void check_for_tasks(int cpu)
        write_unlock_irq(&tasklist_lock);
 }
 
+struct take_cpu_down_param {
+       unsigned long mod;
+       void *hcpu;
+};
+
 /* Take this CPU down. */
-static int take_cpu_down(void *unused)
+static int take_cpu_down(void *_param)
 {
+       struct take_cpu_down_param *param = _param;
        int err;
 
+       raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
+                               param->hcpu);
        /* Ensure this CPU doesn't handle any more interrupts. */
        err = __cpu_disable();
        if (err < 0)
@@ -127,6 +135,10 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
        cpumask_t old_allowed, tmp;
        void *hcpu = (void *)(long)cpu;
        unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
+       struct take_cpu_down_param tcd_param = {
+               .mod = mod,
+               .hcpu = hcpu,
+       };
 
        if (num_online_cpus() == 1)
                return -EBUSY;
@@ -153,7 +165,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
        set_cpus_allowed(current, tmp);
 
        mutex_lock(&cpu_bitmask_lock);
-       p = __stop_machine_run(take_cpu_down, NULL, cpu);
+       p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
        mutex_unlock(&cpu_bitmask_lock);
 
        if (IS_ERR(p) || cpu_online(cpu)) {
index 824b1c0..b4796d8 100644 (file)
@@ -2138,6 +2138,9 @@ static void common_cpu_mem_hotplug_unplug(void)
 static int cpuset_handle_cpuhp(struct notifier_block *nb,
                                unsigned long phase, void *cpu)
 {
+       if (phase == CPU_DYING || phase == CPU_DYING_FROZEN)
+               return NOTIFY_DONE;
+
        common_cpu_mem_hotplug_unplug();
        return 0;
 }