KVM: Avoid useless memory write when possible
[pandora-kernel.git] / drivers / kvm / kvm_main.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #include "kvm.h"
19
20 #include <linux/kvm.h>
21 #include <linux/module.h>
22 #include <linux/errno.h>
23 #include <linux/magic.h>
24 #include <asm/processor.h>
25 #include <linux/percpu.h>
26 #include <linux/gfp.h>
27 #include <asm/msr.h>
28 #include <linux/mm.h>
29 #include <linux/miscdevice.h>
30 #include <linux/vmalloc.h>
31 #include <asm/uaccess.h>
32 #include <linux/reboot.h>
33 #include <asm/io.h>
34 #include <linux/debugfs.h>
35 #include <linux/highmem.h>
36 #include <linux/file.h>
37 #include <asm/desc.h>
38 #include <linux/sysdev.h>
39 #include <linux/cpu.h>
40 #include <linux/file.h>
41 #include <linux/fs.h>
42 #include <linux/mount.h>
43 #include <linux/sched.h>
44 #include <linux/cpumask.h>
45 #include <linux/smp.h>
46
47 #include "x86_emulate.h"
48 #include "segment_descriptor.h"
49
50 MODULE_AUTHOR("Qumranet");
51 MODULE_LICENSE("GPL");
52
53 static DEFINE_SPINLOCK(kvm_lock);
54 static LIST_HEAD(vm_list);
55
56 struct kvm_arch_ops *kvm_arch_ops;
57
58 #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
59
60 static struct kvm_stats_debugfs_item {
61         const char *name;
62         int offset;
63         struct dentry *dentry;
64 } debugfs_entries[] = {
65         { "pf_fixed", STAT_OFFSET(pf_fixed) },
66         { "pf_guest", STAT_OFFSET(pf_guest) },
67         { "tlb_flush", STAT_OFFSET(tlb_flush) },
68         { "invlpg", STAT_OFFSET(invlpg) },
69         { "exits", STAT_OFFSET(exits) },
70         { "io_exits", STAT_OFFSET(io_exits) },
71         { "mmio_exits", STAT_OFFSET(mmio_exits) },
72         { "signal_exits", STAT_OFFSET(signal_exits) },
73         { "irq_window", STAT_OFFSET(irq_window_exits) },
74         { "halt_exits", STAT_OFFSET(halt_exits) },
75         { "request_irq", STAT_OFFSET(request_irq_exits) },
76         { "irq_exits", STAT_OFFSET(irq_exits) },
77         { "light_exits", STAT_OFFSET(light_exits) },
78         { "efer_reload", STAT_OFFSET(efer_reload) },
79         { NULL }
80 };
81
82 static struct dentry *debugfs_dir;
83
84 struct vfsmount *kvmfs_mnt;
85
86 #define MAX_IO_MSRS 256
87
88 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
89 #define LMSW_GUEST_MASK 0x0eULL
90 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
91 #define CR8_RESEVED_BITS (~0x0fULL)
92 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
93
94 #ifdef CONFIG_X86_64
95 // LDT or TSS descriptor in the GDT. 16 bytes.
96 struct segment_descriptor_64 {
97         struct segment_descriptor s;
98         u32 base_higher;
99         u32 pad_zero;
100 };
101
102 #endif
103
104 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
105                            unsigned long arg);
106
107 static struct inode *kvmfs_inode(struct file_operations *fops)
108 {
109         int error = -ENOMEM;
110         struct inode *inode = new_inode(kvmfs_mnt->mnt_sb);
111
112         if (!inode)
113                 goto eexit_1;
114
115         inode->i_fop = fops;
116
117         /*
118          * Mark the inode dirty from the very beginning,
119          * that way it will never be moved to the dirty
120          * list because mark_inode_dirty() will think
121          * that it already _is_ on the dirty list.
122          */
123         inode->i_state = I_DIRTY;
124         inode->i_mode = S_IRUSR | S_IWUSR;
125         inode->i_uid = current->fsuid;
126         inode->i_gid = current->fsgid;
127         inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
128         return inode;
129
130 eexit_1:
131         return ERR_PTR(error);
132 }
133
134 static struct file *kvmfs_file(struct inode *inode, void *private_data)
135 {
136         struct file *file = get_empty_filp();
137
138         if (!file)
139                 return ERR_PTR(-ENFILE);
140
141         file->f_path.mnt = mntget(kvmfs_mnt);
142         file->f_path.dentry = d_alloc_anon(inode);
143         if (!file->f_path.dentry)
144                 return ERR_PTR(-ENOMEM);
145         file->f_mapping = inode->i_mapping;
146
147         file->f_pos = 0;
148         file->f_flags = O_RDWR;
149         file->f_op = inode->i_fop;
150         file->f_mode = FMODE_READ | FMODE_WRITE;
151         file->f_version = 0;
152         file->private_data = private_data;
153         return file;
154 }
155
156 unsigned long segment_base(u16 selector)
157 {
158         struct descriptor_table gdt;
159         struct segment_descriptor *d;
160         unsigned long table_base;
161         typedef unsigned long ul;
162         unsigned long v;
163
164         if (selector == 0)
165                 return 0;
166
167         asm ("sgdt %0" : "=m"(gdt));
168         table_base = gdt.base;
169
170         if (selector & 4) {           /* from ldt */
171                 u16 ldt_selector;
172
173                 asm ("sldt %0" : "=g"(ldt_selector));
174                 table_base = segment_base(ldt_selector);
175         }
176         d = (struct segment_descriptor *)(table_base + (selector & ~7));
177         v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
178 #ifdef CONFIG_X86_64
179         if (d->system == 0
180             && (d->type == 2 || d->type == 9 || d->type == 11))
181                 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
182 #endif
183         return v;
184 }
185 EXPORT_SYMBOL_GPL(segment_base);
186
187 static inline int valid_vcpu(int n)
188 {
189         return likely(n >= 0 && n < KVM_MAX_VCPUS);
190 }
191
192 int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
193                    void *dest)
194 {
195         unsigned char *host_buf = dest;
196         unsigned long req_size = size;
197
198         while (size) {
199                 hpa_t paddr;
200                 unsigned now;
201                 unsigned offset;
202                 hva_t guest_buf;
203
204                 paddr = gva_to_hpa(vcpu, addr);
205
206                 if (is_error_hpa(paddr))
207                         break;
208
209                 guest_buf = (hva_t)kmap_atomic(
210                                         pfn_to_page(paddr >> PAGE_SHIFT),
211                                         KM_USER0);
212                 offset = addr & ~PAGE_MASK;
213                 guest_buf |= offset;
214                 now = min(size, PAGE_SIZE - offset);
215                 memcpy(host_buf, (void*)guest_buf, now);
216                 host_buf += now;
217                 addr += now;
218                 size -= now;
219                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
220         }
221         return req_size - size;
222 }
223 EXPORT_SYMBOL_GPL(kvm_read_guest);
224
225 int kvm_write_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
226                     void *data)
227 {
228         unsigned char *host_buf = data;
229         unsigned long req_size = size;
230
231         while (size) {
232                 hpa_t paddr;
233                 unsigned now;
234                 unsigned offset;
235                 hva_t guest_buf;
236                 gfn_t gfn;
237
238                 paddr = gva_to_hpa(vcpu, addr);
239
240                 if (is_error_hpa(paddr))
241                         break;
242
243                 gfn = vcpu->mmu.gva_to_gpa(vcpu, addr) >> PAGE_SHIFT;
244                 mark_page_dirty(vcpu->kvm, gfn);
245                 guest_buf = (hva_t)kmap_atomic(
246                                 pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0);
247                 offset = addr & ~PAGE_MASK;
248                 guest_buf |= offset;
249                 now = min(size, PAGE_SIZE - offset);
250                 memcpy((void*)guest_buf, host_buf, now);
251                 host_buf += now;
252                 addr += now;
253                 size -= now;
254                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
255         }
256         return req_size - size;
257 }
258 EXPORT_SYMBOL_GPL(kvm_write_guest);
259
260 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
261 {
262         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
263                 return;
264
265         vcpu->guest_fpu_loaded = 1;
266         fx_save(vcpu->host_fx_image);
267         fx_restore(vcpu->guest_fx_image);
268 }
269 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
270
271 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
272 {
273         if (!vcpu->guest_fpu_loaded)
274                 return;
275
276         vcpu->guest_fpu_loaded = 0;
277         fx_save(vcpu->guest_fx_image);
278         fx_restore(vcpu->host_fx_image);
279 }
280 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
281
282 /*
283  * Switches to specified vcpu, until a matching vcpu_put()
284  */
285 static void vcpu_load(struct kvm_vcpu *vcpu)
286 {
287         mutex_lock(&vcpu->mutex);
288         kvm_arch_ops->vcpu_load(vcpu);
289 }
290
291 /*
292  * Switches to specified vcpu, until a matching vcpu_put(). Will return NULL
293  * if the slot is not populated.
294  */
295 static struct kvm_vcpu *vcpu_load_slot(struct kvm *kvm, int slot)
296 {
297         struct kvm_vcpu *vcpu = &kvm->vcpus[slot];
298
299         mutex_lock(&vcpu->mutex);
300         if (!vcpu->vmcs) {
301                 mutex_unlock(&vcpu->mutex);
302                 return NULL;
303         }
304         kvm_arch_ops->vcpu_load(vcpu);
305         return vcpu;
306 }
307
308 static void vcpu_put(struct kvm_vcpu *vcpu)
309 {
310         kvm_arch_ops->vcpu_put(vcpu);
311         mutex_unlock(&vcpu->mutex);
312 }
313
314 static void ack_flush(void *_completed)
315 {
316         atomic_t *completed = _completed;
317
318         atomic_inc(completed);
319 }
320
321 void kvm_flush_remote_tlbs(struct kvm *kvm)
322 {
323         int i, cpu, needed;
324         cpumask_t cpus;
325         struct kvm_vcpu *vcpu;
326         atomic_t completed;
327
328         atomic_set(&completed, 0);
329         cpus_clear(cpus);
330         needed = 0;
331         for (i = 0; i < kvm->nvcpus; ++i) {
332                 vcpu = &kvm->vcpus[i];
333                 if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
334                         continue;
335                 cpu = vcpu->cpu;
336                 if (cpu != -1 && cpu != raw_smp_processor_id())
337                         if (!cpu_isset(cpu, cpus)) {
338                                 cpu_set(cpu, cpus);
339                                 ++needed;
340                         }
341         }
342
343         /*
344          * We really want smp_call_function_mask() here.  But that's not
345          * available, so ipi all cpus in parallel and wait for them
346          * to complete.
347          */
348         for (cpu = first_cpu(cpus); cpu != NR_CPUS; cpu = next_cpu(cpu, cpus))
349                 smp_call_function_single(cpu, ack_flush, &completed, 1, 0);
350         while (atomic_read(&completed) != needed) {
351                 cpu_relax();
352                 barrier();
353         }
354 }
355
356 static struct kvm *kvm_create_vm(void)
357 {
358         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
359         int i;
360
361         if (!kvm)
362                 return ERR_PTR(-ENOMEM);
363
364         kvm_io_bus_init(&kvm->pio_bus);
365         spin_lock_init(&kvm->lock);
366         INIT_LIST_HEAD(&kvm->active_mmu_pages);
367         spin_lock(&kvm_lock);
368         list_add(&kvm->vm_list, &vm_list);
369         spin_unlock(&kvm_lock);
370         kvm_io_bus_init(&kvm->mmio_bus);
371         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
372                 struct kvm_vcpu *vcpu = &kvm->vcpus[i];
373
374                 mutex_init(&vcpu->mutex);
375                 vcpu->cpu = -1;
376                 vcpu->kvm = kvm;
377                 vcpu->mmu.root_hpa = INVALID_PAGE;
378         }
379         return kvm;
380 }
381
382 static int kvm_dev_open(struct inode *inode, struct file *filp)
383 {
384         return 0;
385 }
386
387 /*
388  * Free any memory in @free but not in @dont.
389  */
390 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
391                                   struct kvm_memory_slot *dont)
392 {
393         int i;
394
395         if (!dont || free->phys_mem != dont->phys_mem)
396                 if (free->phys_mem) {
397                         for (i = 0; i < free->npages; ++i)
398                                 if (free->phys_mem[i])
399                                         __free_page(free->phys_mem[i]);
400                         vfree(free->phys_mem);
401                 }
402
403         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
404                 vfree(free->dirty_bitmap);
405
406         free->phys_mem = NULL;
407         free->npages = 0;
408         free->dirty_bitmap = NULL;
409 }
410
411 static void kvm_free_physmem(struct kvm *kvm)
412 {
413         int i;
414
415         for (i = 0; i < kvm->nmemslots; ++i)
416                 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
417 }
418
419 static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
420 {
421         int i;
422
423         for (i = 0; i < 2; ++i)
424                 if (vcpu->pio.guest_pages[i]) {
425                         __free_page(vcpu->pio.guest_pages[i]);
426                         vcpu->pio.guest_pages[i] = NULL;
427                 }
428 }
429
430 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
431 {
432         if (!vcpu->vmcs)
433                 return;
434
435         vcpu_load(vcpu);
436         kvm_mmu_unload(vcpu);
437         vcpu_put(vcpu);
438 }
439
440 static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
441 {
442         if (!vcpu->vmcs)
443                 return;
444
445         vcpu_load(vcpu);
446         kvm_mmu_destroy(vcpu);
447         vcpu_put(vcpu);
448         kvm_arch_ops->vcpu_free(vcpu);
449         free_page((unsigned long)vcpu->run);
450         vcpu->run = NULL;
451         free_page((unsigned long)vcpu->pio_data);
452         vcpu->pio_data = NULL;
453         free_pio_guest_pages(vcpu);
454 }
455
456 static void kvm_free_vcpus(struct kvm *kvm)
457 {
458         unsigned int i;
459
460         /*
461          * Unpin any mmu pages first.
462          */
463         for (i = 0; i < KVM_MAX_VCPUS; ++i)
464                 kvm_unload_vcpu_mmu(&kvm->vcpus[i]);
465         for (i = 0; i < KVM_MAX_VCPUS; ++i)
466                 kvm_free_vcpu(&kvm->vcpus[i]);
467 }
468
469 static int kvm_dev_release(struct inode *inode, struct file *filp)
470 {
471         return 0;
472 }
473
474 static void kvm_destroy_vm(struct kvm *kvm)
475 {
476         spin_lock(&kvm_lock);
477         list_del(&kvm->vm_list);
478         spin_unlock(&kvm_lock);
479         kvm_io_bus_destroy(&kvm->pio_bus);
480         kvm_io_bus_destroy(&kvm->mmio_bus);
481         kvm_free_vcpus(kvm);
482         kvm_free_physmem(kvm);
483         kfree(kvm);
484 }
485
486 static int kvm_vm_release(struct inode *inode, struct file *filp)
487 {
488         struct kvm *kvm = filp->private_data;
489
490         kvm_destroy_vm(kvm);
491         return 0;
492 }
493
494 static void inject_gp(struct kvm_vcpu *vcpu)
495 {
496         kvm_arch_ops->inject_gp(vcpu, 0);
497 }
498
499 /*
500  * Load the pae pdptrs.  Return true is they are all valid.
501  */
502 static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
503 {
504         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
505         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
506         int i;
507         u64 pdpte;
508         u64 *pdpt;
509         int ret;
510         struct page *page;
511
512         spin_lock(&vcpu->kvm->lock);
513         page = gfn_to_page(vcpu->kvm, pdpt_gfn);
514         /* FIXME: !page - emulate? 0xff? */
515         pdpt = kmap_atomic(page, KM_USER0);
516
517         ret = 1;
518         for (i = 0; i < 4; ++i) {
519                 pdpte = pdpt[offset + i];
520                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) {
521                         ret = 0;
522                         goto out;
523                 }
524         }
525
526         for (i = 0; i < 4; ++i)
527                 vcpu->pdptrs[i] = pdpt[offset + i];
528
529 out:
530         kunmap_atomic(pdpt, KM_USER0);
531         spin_unlock(&vcpu->kvm->lock);
532
533         return ret;
534 }
535
536 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
537 {
538         if (cr0 & CR0_RESEVED_BITS) {
539                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
540                        cr0, vcpu->cr0);
541                 inject_gp(vcpu);
542                 return;
543         }
544
545         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
546                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
547                 inject_gp(vcpu);
548                 return;
549         }
550
551         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
552                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
553                        "and a clear PE flag\n");
554                 inject_gp(vcpu);
555                 return;
556         }
557
558         if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) {
559 #ifdef CONFIG_X86_64
560                 if ((vcpu->shadow_efer & EFER_LME)) {
561                         int cs_db, cs_l;
562
563                         if (!is_pae(vcpu)) {
564                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
565                                        "in long mode while PAE is disabled\n");
566                                 inject_gp(vcpu);
567                                 return;
568                         }
569                         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
570                         if (cs_l) {
571                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
572                                        "in long mode while CS.L == 1\n");
573                                 inject_gp(vcpu);
574                                 return;
575
576                         }
577                 } else
578 #endif
579                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
580                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
581                                "reserved bits\n");
582                         inject_gp(vcpu);
583                         return;
584                 }
585
586         }
587
588         kvm_arch_ops->set_cr0(vcpu, cr0);
589         vcpu->cr0 = cr0;
590
591         spin_lock(&vcpu->kvm->lock);
592         kvm_mmu_reset_context(vcpu);
593         spin_unlock(&vcpu->kvm->lock);
594         return;
595 }
596 EXPORT_SYMBOL_GPL(set_cr0);
597
598 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
599 {
600         set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
601 }
602 EXPORT_SYMBOL_GPL(lmsw);
603
604 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
605 {
606         if (cr4 & CR4_RESEVED_BITS) {
607                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
608                 inject_gp(vcpu);
609                 return;
610         }
611
612         if (is_long_mode(vcpu)) {
613                 if (!(cr4 & CR4_PAE_MASK)) {
614                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
615                                "in long mode\n");
616                         inject_gp(vcpu);
617                         return;
618                 }
619         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK)
620                    && !load_pdptrs(vcpu, vcpu->cr3)) {
621                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
622                 inject_gp(vcpu);
623         }
624
625         if (cr4 & CR4_VMXE_MASK) {
626                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
627                 inject_gp(vcpu);
628                 return;
629         }
630         kvm_arch_ops->set_cr4(vcpu, cr4);
631         spin_lock(&vcpu->kvm->lock);
632         kvm_mmu_reset_context(vcpu);
633         spin_unlock(&vcpu->kvm->lock);
634 }
635 EXPORT_SYMBOL_GPL(set_cr4);
636
637 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
638 {
639         if (is_long_mode(vcpu)) {
640                 if (cr3 & CR3_L_MODE_RESEVED_BITS) {
641                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
642                         inject_gp(vcpu);
643                         return;
644                 }
645         } else {
646                 if (cr3 & CR3_RESEVED_BITS) {
647                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
648                         inject_gp(vcpu);
649                         return;
650                 }
651                 if (is_paging(vcpu) && is_pae(vcpu) &&
652                     !load_pdptrs(vcpu, cr3)) {
653                         printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
654                                "reserved bits\n");
655                         inject_gp(vcpu);
656                         return;
657                 }
658         }
659
660         vcpu->cr3 = cr3;
661         spin_lock(&vcpu->kvm->lock);
662         /*
663          * Does the new cr3 value map to physical memory? (Note, we
664          * catch an invalid cr3 even in real-mode, because it would
665          * cause trouble later on when we turn on paging anyway.)
666          *
667          * A real CPU would silently accept an invalid cr3 and would
668          * attempt to use it - with largely undefined (and often hard
669          * to debug) behavior on the guest side.
670          */
671         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
672                 inject_gp(vcpu);
673         else
674                 vcpu->mmu.new_cr3(vcpu);
675         spin_unlock(&vcpu->kvm->lock);
676 }
677 EXPORT_SYMBOL_GPL(set_cr3);
678
679 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
680 {
681         if ( cr8 & CR8_RESEVED_BITS) {
682                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
683                 inject_gp(vcpu);
684                 return;
685         }
686         vcpu->cr8 = cr8;
687 }
688 EXPORT_SYMBOL_GPL(set_cr8);
689
690 void fx_init(struct kvm_vcpu *vcpu)
691 {
692         struct __attribute__ ((__packed__)) fx_image_s {
693                 u16 control; //fcw
694                 u16 status; //fsw
695                 u16 tag; // ftw
696                 u16 opcode; //fop
697                 u64 ip; // fpu ip
698                 u64 operand;// fpu dp
699                 u32 mxcsr;
700                 u32 mxcsr_mask;
701
702         } *fx_image;
703
704         fx_save(vcpu->host_fx_image);
705         fpu_init();
706         fx_save(vcpu->guest_fx_image);
707         fx_restore(vcpu->host_fx_image);
708
709         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
710         fx_image->mxcsr = 0x1f80;
711         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
712                0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
713 }
714 EXPORT_SYMBOL_GPL(fx_init);
715
716 static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot)
717 {
718         spin_lock(&vcpu->kvm->lock);
719         kvm_mmu_slot_remove_write_access(vcpu, slot);
720         spin_unlock(&vcpu->kvm->lock);
721 }
722
723 /*
724  * Allocate some memory and give it an address in the guest physical address
725  * space.
726  *
727  * Discontiguous memory is allowed, mostly for framebuffers.
728  */
729 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
730                                           struct kvm_memory_region *mem)
731 {
732         int r;
733         gfn_t base_gfn;
734         unsigned long npages;
735         unsigned long i;
736         struct kvm_memory_slot *memslot;
737         struct kvm_memory_slot old, new;
738         int memory_config_version;
739
740         r = -EINVAL;
741         /* General sanity checks */
742         if (mem->memory_size & (PAGE_SIZE - 1))
743                 goto out;
744         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
745                 goto out;
746         if (mem->slot >= KVM_MEMORY_SLOTS)
747                 goto out;
748         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
749                 goto out;
750
751         memslot = &kvm->memslots[mem->slot];
752         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
753         npages = mem->memory_size >> PAGE_SHIFT;
754
755         if (!npages)
756                 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
757
758 raced:
759         spin_lock(&kvm->lock);
760
761         memory_config_version = kvm->memory_config_version;
762         new = old = *memslot;
763
764         new.base_gfn = base_gfn;
765         new.npages = npages;
766         new.flags = mem->flags;
767
768         /* Disallow changing a memory slot's size. */
769         r = -EINVAL;
770         if (npages && old.npages && npages != old.npages)
771                 goto out_unlock;
772
773         /* Check for overlaps */
774         r = -EEXIST;
775         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
776                 struct kvm_memory_slot *s = &kvm->memslots[i];
777
778                 if (s == memslot)
779                         continue;
780                 if (!((base_gfn + npages <= s->base_gfn) ||
781                       (base_gfn >= s->base_gfn + s->npages)))
782                         goto out_unlock;
783         }
784         /*
785          * Do memory allocations outside lock.  memory_config_version will
786          * detect any races.
787          */
788         spin_unlock(&kvm->lock);
789
790         /* Deallocate if slot is being removed */
791         if (!npages)
792                 new.phys_mem = NULL;
793
794         /* Free page dirty bitmap if unneeded */
795         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
796                 new.dirty_bitmap = NULL;
797
798         r = -ENOMEM;
799
800         /* Allocate if a slot is being created */
801         if (npages && !new.phys_mem) {
802                 new.phys_mem = vmalloc(npages * sizeof(struct page *));
803
804                 if (!new.phys_mem)
805                         goto out_free;
806
807                 memset(new.phys_mem, 0, npages * sizeof(struct page *));
808                 for (i = 0; i < npages; ++i) {
809                         new.phys_mem[i] = alloc_page(GFP_HIGHUSER
810                                                      | __GFP_ZERO);
811                         if (!new.phys_mem[i])
812                                 goto out_free;
813                         set_page_private(new.phys_mem[i],0);
814                 }
815         }
816
817         /* Allocate page dirty bitmap if needed */
818         if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
819                 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
820
821                 new.dirty_bitmap = vmalloc(dirty_bytes);
822                 if (!new.dirty_bitmap)
823                         goto out_free;
824                 memset(new.dirty_bitmap, 0, dirty_bytes);
825         }
826
827         spin_lock(&kvm->lock);
828
829         if (memory_config_version != kvm->memory_config_version) {
830                 spin_unlock(&kvm->lock);
831                 kvm_free_physmem_slot(&new, &old);
832                 goto raced;
833         }
834
835         r = -EAGAIN;
836         if (kvm->busy)
837                 goto out_unlock;
838
839         if (mem->slot >= kvm->nmemslots)
840                 kvm->nmemslots = mem->slot + 1;
841
842         *memslot = new;
843         ++kvm->memory_config_version;
844
845         spin_unlock(&kvm->lock);
846
847         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
848                 struct kvm_vcpu *vcpu;
849
850                 vcpu = vcpu_load_slot(kvm, i);
851                 if (!vcpu)
852                         continue;
853                 if (new.flags & KVM_MEM_LOG_DIRTY_PAGES)
854                         do_remove_write_access(vcpu, mem->slot);
855                 kvm_mmu_reset_context(vcpu);
856                 vcpu_put(vcpu);
857         }
858
859         kvm_free_physmem_slot(&old, &new);
860         return 0;
861
862 out_unlock:
863         spin_unlock(&kvm->lock);
864 out_free:
865         kvm_free_physmem_slot(&new, &old);
866 out:
867         return r;
868 }
869
870 /*
871  * Get (and clear) the dirty memory log for a memory slot.
872  */
873 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
874                                       struct kvm_dirty_log *log)
875 {
876         struct kvm_memory_slot *memslot;
877         int r, i;
878         int n;
879         int cleared;
880         unsigned long any = 0;
881
882         spin_lock(&kvm->lock);
883
884         /*
885          * Prevent changes to guest memory configuration even while the lock
886          * is not taken.
887          */
888         ++kvm->busy;
889         spin_unlock(&kvm->lock);
890         r = -EINVAL;
891         if (log->slot >= KVM_MEMORY_SLOTS)
892                 goto out;
893
894         memslot = &kvm->memslots[log->slot];
895         r = -ENOENT;
896         if (!memslot->dirty_bitmap)
897                 goto out;
898
899         n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
900
901         for (i = 0; !any && i < n/sizeof(long); ++i)
902                 any = memslot->dirty_bitmap[i];
903
904         r = -EFAULT;
905         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
906                 goto out;
907
908         if (any) {
909                 cleared = 0;
910                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
911                         struct kvm_vcpu *vcpu;
912
913                         vcpu = vcpu_load_slot(kvm, i);
914                         if (!vcpu)
915                                 continue;
916                         if (!cleared) {
917                                 do_remove_write_access(vcpu, log->slot);
918                                 memset(memslot->dirty_bitmap, 0, n);
919                                 cleared = 1;
920                         }
921                         kvm_arch_ops->tlb_flush(vcpu);
922                         vcpu_put(vcpu);
923                 }
924         }
925
926         r = 0;
927
928 out:
929         spin_lock(&kvm->lock);
930         --kvm->busy;
931         spin_unlock(&kvm->lock);
932         return r;
933 }
934
935 /*
936  * Set a new alias region.  Aliases map a portion of physical memory into
937  * another portion.  This is useful for memory windows, for example the PC
938  * VGA region.
939  */
940 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
941                                          struct kvm_memory_alias *alias)
942 {
943         int r, n;
944         struct kvm_mem_alias *p;
945
946         r = -EINVAL;
947         /* General sanity checks */
948         if (alias->memory_size & (PAGE_SIZE - 1))
949                 goto out;
950         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
951                 goto out;
952         if (alias->slot >= KVM_ALIAS_SLOTS)
953                 goto out;
954         if (alias->guest_phys_addr + alias->memory_size
955             < alias->guest_phys_addr)
956                 goto out;
957         if (alias->target_phys_addr + alias->memory_size
958             < alias->target_phys_addr)
959                 goto out;
960
961         spin_lock(&kvm->lock);
962
963         p = &kvm->aliases[alias->slot];
964         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
965         p->npages = alias->memory_size >> PAGE_SHIFT;
966         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
967
968         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
969                 if (kvm->aliases[n - 1].npages)
970                         break;
971         kvm->naliases = n;
972
973         spin_unlock(&kvm->lock);
974
975         vcpu_load(&kvm->vcpus[0]);
976         spin_lock(&kvm->lock);
977         kvm_mmu_zap_all(&kvm->vcpus[0]);
978         spin_unlock(&kvm->lock);
979         vcpu_put(&kvm->vcpus[0]);
980
981         return 0;
982
983 out:
984         return r;
985 }
986
987 static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
988 {
989         int i;
990         struct kvm_mem_alias *alias;
991
992         for (i = 0; i < kvm->naliases; ++i) {
993                 alias = &kvm->aliases[i];
994                 if (gfn >= alias->base_gfn
995                     && gfn < alias->base_gfn + alias->npages)
996                         return alias->target_gfn + gfn - alias->base_gfn;
997         }
998         return gfn;
999 }
1000
1001 static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
1002 {
1003         int i;
1004
1005         for (i = 0; i < kvm->nmemslots; ++i) {
1006                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
1007
1008                 if (gfn >= memslot->base_gfn
1009                     && gfn < memslot->base_gfn + memslot->npages)
1010                         return memslot;
1011         }
1012         return NULL;
1013 }
1014
1015 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
1016 {
1017         gfn = unalias_gfn(kvm, gfn);
1018         return __gfn_to_memslot(kvm, gfn);
1019 }
1020
1021 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1022 {
1023         struct kvm_memory_slot *slot;
1024
1025         gfn = unalias_gfn(kvm, gfn);
1026         slot = __gfn_to_memslot(kvm, gfn);
1027         if (!slot)
1028                 return NULL;
1029         return slot->phys_mem[gfn - slot->base_gfn];
1030 }
1031 EXPORT_SYMBOL_GPL(gfn_to_page);
1032
1033 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1034 {
1035         int i;
1036         struct kvm_memory_slot *memslot;
1037         unsigned long rel_gfn;
1038
1039         for (i = 0; i < kvm->nmemslots; ++i) {
1040                 memslot = &kvm->memslots[i];
1041
1042                 if (gfn >= memslot->base_gfn
1043                     && gfn < memslot->base_gfn + memslot->npages) {
1044
1045                         if (!memslot->dirty_bitmap)
1046                                 return;
1047
1048                         rel_gfn = gfn - memslot->base_gfn;
1049
1050                         /* avoid RMW */
1051                         if (!test_bit(rel_gfn, memslot->dirty_bitmap))
1052                                 set_bit(rel_gfn, memslot->dirty_bitmap);
1053                         return;
1054                 }
1055         }
1056 }
1057
1058 static int emulator_read_std(unsigned long addr,
1059                              void *val,
1060                              unsigned int bytes,
1061                              struct x86_emulate_ctxt *ctxt)
1062 {
1063         struct kvm_vcpu *vcpu = ctxt->vcpu;
1064         void *data = val;
1065
1066         while (bytes) {
1067                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1068                 unsigned offset = addr & (PAGE_SIZE-1);
1069                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1070                 unsigned long pfn;
1071                 struct page *page;
1072                 void *page_virt;
1073
1074                 if (gpa == UNMAPPED_GVA)
1075                         return X86EMUL_PROPAGATE_FAULT;
1076                 pfn = gpa >> PAGE_SHIFT;
1077                 page = gfn_to_page(vcpu->kvm, pfn);
1078                 if (!page)
1079                         return X86EMUL_UNHANDLEABLE;
1080                 page_virt = kmap_atomic(page, KM_USER0);
1081
1082                 memcpy(data, page_virt + offset, tocopy);
1083
1084                 kunmap_atomic(page_virt, KM_USER0);
1085
1086                 bytes -= tocopy;
1087                 data += tocopy;
1088                 addr += tocopy;
1089         }
1090
1091         return X86EMUL_CONTINUE;
1092 }
1093
1094 static int emulator_write_std(unsigned long addr,
1095                               const void *val,
1096                               unsigned int bytes,
1097                               struct x86_emulate_ctxt *ctxt)
1098 {
1099         printk(KERN_ERR "emulator_write_std: addr %lx n %d\n",
1100                addr, bytes);
1101         return X86EMUL_UNHANDLEABLE;
1102 }
1103
1104 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1105                                                 gpa_t addr)
1106 {
1107         /*
1108          * Note that its important to have this wrapper function because
1109          * in the very near future we will be checking for MMIOs against
1110          * the LAPIC as well as the general MMIO bus
1111          */
1112         return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1113 }
1114
1115 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
1116                                                gpa_t addr)
1117 {
1118         return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
1119 }
1120
1121 static int emulator_read_emulated(unsigned long addr,
1122                                   void *val,
1123                                   unsigned int bytes,
1124                                   struct x86_emulate_ctxt *ctxt)
1125 {
1126         struct kvm_vcpu      *vcpu = ctxt->vcpu;
1127         struct kvm_io_device *mmio_dev;
1128         gpa_t                 gpa;
1129
1130         if (vcpu->mmio_read_completed) {
1131                 memcpy(val, vcpu->mmio_data, bytes);
1132                 vcpu->mmio_read_completed = 0;
1133                 return X86EMUL_CONTINUE;
1134         } else if (emulator_read_std(addr, val, bytes, ctxt)
1135                    == X86EMUL_CONTINUE)
1136                 return X86EMUL_CONTINUE;
1137
1138         gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1139         if (gpa == UNMAPPED_GVA)
1140                 return X86EMUL_PROPAGATE_FAULT;
1141
1142         /*
1143          * Is this MMIO handled locally?
1144          */
1145         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1146         if (mmio_dev) {
1147                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1148                 return X86EMUL_CONTINUE;
1149         }
1150
1151         vcpu->mmio_needed = 1;
1152         vcpu->mmio_phys_addr = gpa;
1153         vcpu->mmio_size = bytes;
1154         vcpu->mmio_is_write = 0;
1155
1156         return X86EMUL_UNHANDLEABLE;
1157 }
1158
1159 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1160                                const void *val, int bytes)
1161 {
1162         struct page *page;
1163         void *virt;
1164         unsigned offset = offset_in_page(gpa);
1165
1166         if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
1167                 return 0;
1168         page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1169         if (!page)
1170                 return 0;
1171         mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
1172         virt = kmap_atomic(page, KM_USER0);
1173         if (memcmp(virt + offset_in_page(gpa), val, bytes)) {
1174                 kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes);
1175                 memcpy(virt + offset_in_page(gpa), val, bytes);
1176         }
1177         kunmap_atomic(virt, KM_USER0);
1178         return 1;
1179 }
1180
1181 static int emulator_write_emulated(unsigned long addr,
1182                                    const void *val,
1183                                    unsigned int bytes,
1184                                    struct x86_emulate_ctxt *ctxt)
1185 {
1186         struct kvm_vcpu      *vcpu = ctxt->vcpu;
1187         struct kvm_io_device *mmio_dev;
1188         gpa_t                 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1189
1190         if (gpa == UNMAPPED_GVA) {
1191                 kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
1192                 return X86EMUL_PROPAGATE_FAULT;
1193         }
1194
1195         if (emulator_write_phys(vcpu, gpa, val, bytes))
1196                 return X86EMUL_CONTINUE;
1197
1198         /*
1199          * Is this MMIO handled locally?
1200          */
1201         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1202         if (mmio_dev) {
1203                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1204                 return X86EMUL_CONTINUE;
1205         }
1206
1207         vcpu->mmio_needed = 1;
1208         vcpu->mmio_phys_addr = gpa;
1209         vcpu->mmio_size = bytes;
1210         vcpu->mmio_is_write = 1;
1211         memcpy(vcpu->mmio_data, val, bytes);
1212
1213         return X86EMUL_CONTINUE;
1214 }
1215
1216 static int emulator_cmpxchg_emulated(unsigned long addr,
1217                                      const void *old,
1218                                      const void *new,
1219                                      unsigned int bytes,
1220                                      struct x86_emulate_ctxt *ctxt)
1221 {
1222         static int reported;
1223
1224         if (!reported) {
1225                 reported = 1;
1226                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1227         }
1228         return emulator_write_emulated(addr, new, bytes, ctxt);
1229 }
1230
1231 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1232 {
1233         return kvm_arch_ops->get_segment_base(vcpu, seg);
1234 }
1235
1236 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1237 {
1238         return X86EMUL_CONTINUE;
1239 }
1240
1241 int emulate_clts(struct kvm_vcpu *vcpu)
1242 {
1243         unsigned long cr0;
1244
1245         cr0 = vcpu->cr0 & ~CR0_TS_MASK;
1246         kvm_arch_ops->set_cr0(vcpu, cr0);
1247         return X86EMUL_CONTINUE;
1248 }
1249
1250 int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest)
1251 {
1252         struct kvm_vcpu *vcpu = ctxt->vcpu;
1253
1254         switch (dr) {
1255         case 0 ... 3:
1256                 *dest = kvm_arch_ops->get_dr(vcpu, dr);
1257                 return X86EMUL_CONTINUE;
1258         default:
1259                 printk(KERN_DEBUG "%s: unexpected dr %u\n",
1260                        __FUNCTION__, dr);
1261                 return X86EMUL_UNHANDLEABLE;
1262         }
1263 }
1264
1265 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1266 {
1267         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1268         int exception;
1269
1270         kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1271         if (exception) {
1272                 /* FIXME: better handling */
1273                 return X86EMUL_UNHANDLEABLE;
1274         }
1275         return X86EMUL_CONTINUE;
1276 }
1277
1278 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
1279 {
1280         static int reported;
1281         u8 opcodes[4];
1282         unsigned long rip = ctxt->vcpu->rip;
1283         unsigned long rip_linear;
1284
1285         rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS);
1286
1287         if (reported)
1288                 return;
1289
1290         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
1291
1292         printk(KERN_ERR "emulation failed but !mmio_needed?"
1293                " rip %lx %02x %02x %02x %02x\n",
1294                rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1295         reported = 1;
1296 }
1297
1298 struct x86_emulate_ops emulate_ops = {
1299         .read_std            = emulator_read_std,
1300         .write_std           = emulator_write_std,
1301         .read_emulated       = emulator_read_emulated,
1302         .write_emulated      = emulator_write_emulated,
1303         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1304 };
1305
1306 int emulate_instruction(struct kvm_vcpu *vcpu,
1307                         struct kvm_run *run,
1308                         unsigned long cr2,
1309                         u16 error_code)
1310 {
1311         struct x86_emulate_ctxt emulate_ctxt;
1312         int r;
1313         int cs_db, cs_l;
1314
1315         vcpu->mmio_fault_cr2 = cr2;
1316         kvm_arch_ops->cache_regs(vcpu);
1317
1318         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1319
1320         emulate_ctxt.vcpu = vcpu;
1321         emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu);
1322         emulate_ctxt.cr2 = cr2;
1323         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1324                 ? X86EMUL_MODE_REAL : cs_l
1325                 ? X86EMUL_MODE_PROT64 : cs_db
1326                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1327
1328         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1329                 emulate_ctxt.cs_base = 0;
1330                 emulate_ctxt.ds_base = 0;
1331                 emulate_ctxt.es_base = 0;
1332                 emulate_ctxt.ss_base = 0;
1333         } else {
1334                 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
1335                 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
1336                 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
1337                 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
1338         }
1339
1340         emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
1341         emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1342
1343         vcpu->mmio_is_write = 0;
1344         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1345
1346         if ((r || vcpu->mmio_is_write) && run) {
1347                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1348                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1349                 run->mmio.len = vcpu->mmio_size;
1350                 run->mmio.is_write = vcpu->mmio_is_write;
1351         }
1352
1353         if (r) {
1354                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1355                         return EMULATE_DONE;
1356                 if (!vcpu->mmio_needed) {
1357                         report_emulation_failure(&emulate_ctxt);
1358                         return EMULATE_FAIL;
1359                 }
1360                 return EMULATE_DO_MMIO;
1361         }
1362
1363         kvm_arch_ops->decache_regs(vcpu);
1364         kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1365
1366         if (vcpu->mmio_is_write) {
1367                 vcpu->mmio_needed = 0;
1368                 return EMULATE_DO_MMIO;
1369         }
1370
1371         return EMULATE_DONE;
1372 }
1373 EXPORT_SYMBOL_GPL(emulate_instruction);
1374
1375 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
1376 {
1377         if (vcpu->irq_summary)
1378                 return 1;
1379
1380         vcpu->run->exit_reason = KVM_EXIT_HLT;
1381         ++vcpu->stat.halt_exits;
1382         return 0;
1383 }
1384 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
1385
1386 int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1387 {
1388         unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
1389
1390         kvm_arch_ops->cache_regs(vcpu);
1391         ret = -KVM_EINVAL;
1392 #ifdef CONFIG_X86_64
1393         if (is_long_mode(vcpu)) {
1394                 nr = vcpu->regs[VCPU_REGS_RAX];
1395                 a0 = vcpu->regs[VCPU_REGS_RDI];
1396                 a1 = vcpu->regs[VCPU_REGS_RSI];
1397                 a2 = vcpu->regs[VCPU_REGS_RDX];
1398                 a3 = vcpu->regs[VCPU_REGS_RCX];
1399                 a4 = vcpu->regs[VCPU_REGS_R8];
1400                 a5 = vcpu->regs[VCPU_REGS_R9];
1401         } else
1402 #endif
1403         {
1404                 nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
1405                 a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
1406                 a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
1407                 a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
1408                 a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
1409                 a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
1410                 a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
1411         }
1412         switch (nr) {
1413         default:
1414                 run->hypercall.args[0] = a0;
1415                 run->hypercall.args[1] = a1;
1416                 run->hypercall.args[2] = a2;
1417                 run->hypercall.args[3] = a3;
1418                 run->hypercall.args[4] = a4;
1419                 run->hypercall.args[5] = a5;
1420                 run->hypercall.ret = ret;
1421                 run->hypercall.longmode = is_long_mode(vcpu);
1422                 kvm_arch_ops->decache_regs(vcpu);
1423                 return 0;
1424         }
1425         vcpu->regs[VCPU_REGS_RAX] = ret;
1426         kvm_arch_ops->decache_regs(vcpu);
1427         return 1;
1428 }
1429 EXPORT_SYMBOL_GPL(kvm_hypercall);
1430
1431 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1432 {
1433         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1434 }
1435
1436 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1437 {
1438         struct descriptor_table dt = { limit, base };
1439
1440         kvm_arch_ops->set_gdt(vcpu, &dt);
1441 }
1442
1443 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1444 {
1445         struct descriptor_table dt = { limit, base };
1446
1447         kvm_arch_ops->set_idt(vcpu, &dt);
1448 }
1449
1450 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1451                    unsigned long *rflags)
1452 {
1453         lmsw(vcpu, msw);
1454         *rflags = kvm_arch_ops->get_rflags(vcpu);
1455 }
1456
1457 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1458 {
1459         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
1460         switch (cr) {
1461         case 0:
1462                 return vcpu->cr0;
1463         case 2:
1464                 return vcpu->cr2;
1465         case 3:
1466                 return vcpu->cr3;
1467         case 4:
1468                 return vcpu->cr4;
1469         default:
1470                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1471                 return 0;
1472         }
1473 }
1474
1475 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1476                      unsigned long *rflags)
1477 {
1478         switch (cr) {
1479         case 0:
1480                 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1481                 *rflags = kvm_arch_ops->get_rflags(vcpu);
1482                 break;
1483         case 2:
1484                 vcpu->cr2 = val;
1485                 break;
1486         case 3:
1487                 set_cr3(vcpu, val);
1488                 break;
1489         case 4:
1490                 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1491                 break;
1492         default:
1493                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1494         }
1495 }
1496
1497 /*
1498  * Register the para guest with the host:
1499  */
1500 static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
1501 {
1502         struct kvm_vcpu_para_state *para_state;
1503         hpa_t para_state_hpa, hypercall_hpa;
1504         struct page *para_state_page;
1505         unsigned char *hypercall;
1506         gpa_t hypercall_gpa;
1507
1508         printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
1509         printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
1510
1511         /*
1512          * Needs to be page aligned:
1513          */
1514         if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
1515                 goto err_gp;
1516
1517         para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
1518         printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
1519         if (is_error_hpa(para_state_hpa))
1520                 goto err_gp;
1521
1522         mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
1523         para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
1524         para_state = kmap_atomic(para_state_page, KM_USER0);
1525
1526         printk(KERN_DEBUG "....  guest version: %d\n", para_state->guest_version);
1527         printk(KERN_DEBUG "....           size: %d\n", para_state->size);
1528
1529         para_state->host_version = KVM_PARA_API_VERSION;
1530         /*
1531          * We cannot support guests that try to register themselves
1532          * with a newer API version than the host supports:
1533          */
1534         if (para_state->guest_version > KVM_PARA_API_VERSION) {
1535                 para_state->ret = -KVM_EINVAL;
1536                 goto err_kunmap_skip;
1537         }
1538
1539         hypercall_gpa = para_state->hypercall_gpa;
1540         hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
1541         printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
1542         if (is_error_hpa(hypercall_hpa)) {
1543                 para_state->ret = -KVM_EINVAL;
1544                 goto err_kunmap_skip;
1545         }
1546
1547         printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
1548         vcpu->para_state_page = para_state_page;
1549         vcpu->para_state_gpa = para_state_gpa;
1550         vcpu->hypercall_gpa = hypercall_gpa;
1551
1552         mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
1553         hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
1554                                 KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
1555         kvm_arch_ops->patch_hypercall(vcpu, hypercall);
1556         kunmap_atomic(hypercall, KM_USER1);
1557
1558         para_state->ret = 0;
1559 err_kunmap_skip:
1560         kunmap_atomic(para_state, KM_USER0);
1561         return 0;
1562 err_gp:
1563         return 1;
1564 }
1565
1566 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1567 {
1568         u64 data;
1569
1570         switch (msr) {
1571         case 0xc0010010: /* SYSCFG */
1572         case 0xc0010015: /* HWCR */
1573         case MSR_IA32_PLATFORM_ID:
1574         case MSR_IA32_P5_MC_ADDR:
1575         case MSR_IA32_P5_MC_TYPE:
1576         case MSR_IA32_MC0_CTL:
1577         case MSR_IA32_MCG_STATUS:
1578         case MSR_IA32_MCG_CAP:
1579         case MSR_IA32_MC0_MISC:
1580         case MSR_IA32_MC0_MISC+4:
1581         case MSR_IA32_MC0_MISC+8:
1582         case MSR_IA32_MC0_MISC+12:
1583         case MSR_IA32_MC0_MISC+16:
1584         case MSR_IA32_UCODE_REV:
1585         case MSR_IA32_PERF_STATUS:
1586         case MSR_IA32_EBL_CR_POWERON:
1587                 /* MTRR registers */
1588         case 0xfe:
1589         case 0x200 ... 0x2ff:
1590                 data = 0;
1591                 break;
1592         case 0xcd: /* fsb frequency */
1593                 data = 3;
1594                 break;
1595         case MSR_IA32_APICBASE:
1596                 data = vcpu->apic_base;
1597                 break;
1598         case MSR_IA32_MISC_ENABLE:
1599                 data = vcpu->ia32_misc_enable_msr;
1600                 break;
1601 #ifdef CONFIG_X86_64
1602         case MSR_EFER:
1603                 data = vcpu->shadow_efer;
1604                 break;
1605 #endif
1606         default:
1607                 printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", msr);
1608                 return 1;
1609         }
1610         *pdata = data;
1611         return 0;
1612 }
1613 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1614
1615 /*
1616  * Reads an msr value (of 'msr_index') into 'pdata'.
1617  * Returns 0 on success, non-0 otherwise.
1618  * Assumes vcpu_load() was already called.
1619  */
1620 static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1621 {
1622         return kvm_arch_ops->get_msr(vcpu, msr_index, pdata);
1623 }
1624
1625 #ifdef CONFIG_X86_64
1626
1627 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1628 {
1629         if (efer & EFER_RESERVED_BITS) {
1630                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1631                        efer);
1632                 inject_gp(vcpu);
1633                 return;
1634         }
1635
1636         if (is_paging(vcpu)
1637             && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1638                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1639                 inject_gp(vcpu);
1640                 return;
1641         }
1642
1643         kvm_arch_ops->set_efer(vcpu, efer);
1644
1645         efer &= ~EFER_LMA;
1646         efer |= vcpu->shadow_efer & EFER_LMA;
1647
1648         vcpu->shadow_efer = efer;
1649 }
1650
1651 #endif
1652
1653 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1654 {
1655         switch (msr) {
1656 #ifdef CONFIG_X86_64
1657         case MSR_EFER:
1658                 set_efer(vcpu, data);
1659                 break;
1660 #endif
1661         case MSR_IA32_MC0_STATUS:
1662                 printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1663                        __FUNCTION__, data);
1664                 break;
1665         case MSR_IA32_MCG_STATUS:
1666                 printk(KERN_WARNING "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
1667                         __FUNCTION__, data);
1668                 break;
1669         case MSR_IA32_UCODE_REV:
1670         case MSR_IA32_UCODE_WRITE:
1671         case 0x200 ... 0x2ff: /* MTRRs */
1672                 break;
1673         case MSR_IA32_APICBASE:
1674                 vcpu->apic_base = data;
1675                 break;
1676         case MSR_IA32_MISC_ENABLE:
1677                 vcpu->ia32_misc_enable_msr = data;
1678                 break;
1679         /*
1680          * This is the 'probe whether the host is KVM' logic:
1681          */
1682         case MSR_KVM_API_MAGIC:
1683                 return vcpu_register_para(vcpu, data);
1684
1685         default:
1686                 printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr);
1687                 return 1;
1688         }
1689         return 0;
1690 }
1691 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1692
1693 /*
1694  * Writes msr value into the appropriate "register".
1695  * Returns 0 on success, non-0 otherwise.
1696  * Assumes vcpu_load() was already called.
1697  */
1698 static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1699 {
1700         return kvm_arch_ops->set_msr(vcpu, msr_index, data);
1701 }
1702
1703 void kvm_resched(struct kvm_vcpu *vcpu)
1704 {
1705         if (!need_resched())
1706                 return;
1707         vcpu_put(vcpu);
1708         cond_resched();
1709         vcpu_load(vcpu);
1710 }
1711 EXPORT_SYMBOL_GPL(kvm_resched);
1712
1713 void load_msrs(struct vmx_msr_entry *e, int n)
1714 {
1715         int i;
1716
1717         for (i = 0; i < n; ++i)
1718                 wrmsrl(e[i].index, e[i].data);
1719 }
1720 EXPORT_SYMBOL_GPL(load_msrs);
1721
1722 void save_msrs(struct vmx_msr_entry *e, int n)
1723 {
1724         int i;
1725
1726         for (i = 0; i < n; ++i)
1727                 rdmsrl(e[i].index, e[i].data);
1728 }
1729 EXPORT_SYMBOL_GPL(save_msrs);
1730
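/*
 * Answer a guest CPUID from the table userspace installed with
 * KVM_SET_CPUID.  An exact function match wins; otherwise the highest
 * entry in the same range (basic vs. extended) is used, and a leaf
 * with no candidate at all returns zeros.
 */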
1731 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1732 {
1733         int i;
1734         u32 function;
1735         struct kvm_cpuid_entry *e, *best;
1736
1737         kvm_arch_ops->cache_regs(vcpu);
1738         function = vcpu->regs[VCPU_REGS_RAX];
1739         vcpu->regs[VCPU_REGS_RAX] = 0;
1740         vcpu->regs[VCPU_REGS_RBX] = 0;
1741         vcpu->regs[VCPU_REGS_RCX] = 0;
1742         vcpu->regs[VCPU_REGS_RDX] = 0;
1743         best = NULL;
1744         for (i = 0; i < vcpu->cpuid_nent; ++i) {
1745                 e = &vcpu->cpuid_entries[i];
1746                 if (e->function == function) {
1747                         best = e;
1748                         break;
1749                 }
1750                 /*
1751                  * Both basic or both extended?  Remember the highest such entry as a fallback.
1752                  */
1753                 if (((e->function ^ function) & 0x80000000) == 0)
1754                         if (!best || e->function > best->function)
1755                                 best = e;
1756         }
1757         if (best) {
1758                 vcpu->regs[VCPU_REGS_RAX] = best->eax;
1759                 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1760                 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1761                 vcpu->regs[VCPU_REGS_RDX] = best->edx;
1762         }
1763         kvm_arch_ops->decache_regs(vcpu);
1764         kvm_arch_ops->skip_emulated_instruction(vcpu);
1765 }
1766 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
1767
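/*
 * Copy string I/O data between the vcpu's pio_data bounce page and the
 * pinned guest pages, in the direction given by pio.in.  The vcpu is
 * unloaded around the copy, presumably because vmap() may sleep.
 */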
1768 static int pio_copy_data(struct kvm_vcpu *vcpu)
1769 {
1770         void *p = vcpu->pio_data;
1771         void *q;
1772         unsigned bytes;
1773         int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
1774
1775         kvm_arch_ops->vcpu_put(vcpu);
1776         q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1777                  PAGE_KERNEL);
1778         if (!q) {
1779                 kvm_arch_ops->vcpu_load(vcpu);
1780                 free_pio_guest_pages(vcpu);
1781                 return -ENOMEM;
1782         }
1783         q += vcpu->pio.guest_page_offset;
1784         bytes = vcpu->pio.size * vcpu->pio.cur_count;
1785         if (vcpu->pio.in)
1786                 memcpy(q, p, bytes);
1787         else
1788                 memcpy(p, q, bytes);
1789         q -= vcpu->pio.guest_page_offset;
1790         vunmap(q);
1791         kvm_arch_ops->vcpu_load(vcpu);
1792         free_pio_guest_pages(vcpu);
1793         return 0;
1794 }
1795
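/*
 * Fold a finished (or partially finished) PIO transaction back into the
 * guest: RAX for a single 'in', RSI/RDI/RCX adjustments for string
 * operations.  The emulated instruction is only skipped once the whole
 * count has been transferred.
 */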
1796 static int complete_pio(struct kvm_vcpu *vcpu)
1797 {
1798         struct kvm_pio_request *io = &vcpu->pio;
1799         long delta;
1800         int r;
1801
1802         kvm_arch_ops->cache_regs(vcpu);
1803
1804         if (!io->string) {
1805                 if (io->in)
1806                         memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
1807                                io->size);
1808         } else {
1809                 if (io->in) {
1810                         r = pio_copy_data(vcpu);
1811                         if (r) {
1812                                 kvm_arch_ops->cache_regs(vcpu);
1813                                 return r;
1814                         }
1815                 }
1816
1817                 delta = 1;
1818                 if (io->rep) {
1819                         delta *= io->cur_count;
1820                         /*
1821                          * The size of the register should really depend on
1822                          * current address size.
1823                          */
1824                         vcpu->regs[VCPU_REGS_RCX] -= delta;
1825                 }
1826                 if (io->down)
1827                         delta = -delta;
1828                 delta *= io->size;
1829                 if (io->in)
1830                         vcpu->regs[VCPU_REGS_RDI] += delta;
1831                 else
1832                         vcpu->regs[VCPU_REGS_RSI] += delta;
1833         }
1834
1835         kvm_arch_ops->decache_regs(vcpu);
1836
1837         io->count -= io->cur_count;
1838         io->cur_count = 0;
1839
1840         if (!io->count)
1841                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1842         return 0;
1843 }
1844
1845 void kernel_pio(struct kvm_io_device *pio_dev, struct kvm_vcpu *vcpu)
1846 {
1847         /* TODO: String I/O for in kernel device */
1848
1849         if (vcpu->pio.in)
1850                 kvm_iodevice_read(pio_dev, vcpu->pio.port,
1851                                   vcpu->pio.size,
1852                                   vcpu->pio_data);
1853         else
1854                 kvm_iodevice_write(pio_dev, vcpu->pio.port,
1855                                    vcpu->pio.size,
1856                                    vcpu->pio_data);
1857 }
1858
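/*
 * Set up a port I/O exit.  The kvm_run area is always filled in for
 * userspace; a non-string access that hits an in-kernel device is
 * completed immediately, while a string access pins the guest page(s)
 * backing the data and is trimmed so one iteration never spans more
 * than two pages.  Returns 1 if the access was fully handled (or
 * faulted) in the kernel, 0 if we must exit to userspace.
 */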
1859 int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1860                   int size, unsigned long count, int string, int down,
1861                   gva_t address, int rep, unsigned port)
1862 {
1863         unsigned now, in_page;
1864         int i;
1865         int nr_pages = 1;
1866         struct page *page;
1867         struct kvm_io_device *pio_dev;
1868
1869         vcpu->run->exit_reason = KVM_EXIT_IO;
1870         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1871         vcpu->run->io.size = size;
1872         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1873         vcpu->run->io.count = count;
1874         vcpu->run->io.port = port;
1875         vcpu->pio.count = count;
1876         vcpu->pio.cur_count = count;
1877         vcpu->pio.size = size;
1878         vcpu->pio.in = in;
1879         vcpu->pio.port = port;
1880         vcpu->pio.string = string;
1881         vcpu->pio.down = down;
1882         vcpu->pio.guest_page_offset = offset_in_page(address);
1883         vcpu->pio.rep = rep;
1884
1885         pio_dev = vcpu_find_pio_dev(vcpu, port);
1886         if (!string) {
1887                 kvm_arch_ops->cache_regs(vcpu);
1888                 memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
1889                 kvm_arch_ops->decache_regs(vcpu);
1890                 if (pio_dev) {
1891                         kernel_pio(pio_dev, vcpu);
1892                         complete_pio(vcpu);
1893                         return 1;
1894                 }
1895                 return 0;
1896         }
1897         /* TODO: String I/O for in kernel device */
1898         if (pio_dev)
1899                 printk(KERN_ERR "kvm_setup_pio: no string io support\n");
1900
1901         if (!count) {
1902                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1903                 return 1;
1904         }
1905
1908         if (!down)
1909                 in_page = PAGE_SIZE - offset_in_page(address);
1910         else
1911                 in_page = offset_in_page(address) + size;
1912         now = min(count, (unsigned long)in_page / size);
1913         if (!now) {
1914                 /*
1915                  * String I/O straddles page boundary.  Pin two guest pages
1916                  * so that we satisfy atomicity constraints.  Do just one
1917                  * transaction to avoid complexity.
1918                  */
1919                 nr_pages = 2;
1920                 now = 1;
1921         }
1922         if (down) {
1923                 /*
1924                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
1925                  */
1926                 printk(KERN_ERR "kvm: guest string pio down\n");
1927                 inject_gp(vcpu);
1928                 return 1;
1929         }
1930         vcpu->run->io.count = now;
1931         vcpu->pio.cur_count = now;
1932
1933         for (i = 0; i < nr_pages; ++i) {
1934                 spin_lock(&vcpu->kvm->lock);
1935                 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
1936                 if (page)
1937                         get_page(page);
1938                 vcpu->pio.guest_pages[i] = page;
1939                 spin_unlock(&vcpu->kvm->lock);
1940                 if (!page) {
1941                         inject_gp(vcpu);
1942                         free_pio_guest_pages(vcpu);
1943                         return 1;
1944                 }
1945         }
1946
1947         if (!vcpu->pio.in)
1948                 return pio_copy_data(vcpu);
1949         return 0;
1950 }
1951 EXPORT_SYMBOL_GPL(kvm_setup_pio);
1952
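/*
 * The KVM_RUN handler.  Installs the guest signal mask for the duration
 * of the run, re-syncs CR8 from userspace, and completes any PIO, MMIO
 * or hypercall left over from the previous exit before re-entering the
 * guest.
 */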
1953 static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1954 {
1955         int r;
1956         sigset_t sigsaved;
1957
1958         vcpu_load(vcpu);
1959
1960         if (vcpu->sigset_active)
1961                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
1962
1963         /* re-sync apic's tpr */
1964         vcpu->cr8 = kvm_run->cr8;
1965
1966         if (vcpu->pio.cur_count) {
1967                 r = complete_pio(vcpu);
1968                 if (r)
1969                         goto out;
1970         }
1971
1972         if (vcpu->mmio_needed) {
1973                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
1974                 vcpu->mmio_read_completed = 1;
1975                 vcpu->mmio_needed = 0;
1976                 r = emulate_instruction(vcpu, kvm_run,
1977                                         vcpu->mmio_fault_cr2, 0);
1978                 if (r == EMULATE_DO_MMIO) {
1979                         /*
1980                          * Read-modify-write.  Back to userspace.
1981                          */
1982                         kvm_run->exit_reason = KVM_EXIT_MMIO;
1983                         r = 0;
1984                         goto out;
1985                 }
1986         }
1987
1988         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
1989                 kvm_arch_ops->cache_regs(vcpu);
1990                 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
1991                 kvm_arch_ops->decache_regs(vcpu);
1992         }
1993
1994         r = kvm_arch_ops->run(vcpu, kvm_run);
1995
1996 out:
1997         if (vcpu->sigset_active)
1998                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1999
2000         vcpu_put(vcpu);
2001         return r;
2002 }
2003
2004 static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
2005                                    struct kvm_regs *regs)
2006 {
2007         vcpu_load(vcpu);
2008
2009         kvm_arch_ops->cache_regs(vcpu);
2010
2011         regs->rax = vcpu->regs[VCPU_REGS_RAX];
2012         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
2013         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
2014         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
2015         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
2016         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
2017         regs->rsp = vcpu->regs[VCPU_REGS_RSP];
2018         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
2019 #ifdef CONFIG_X86_64
2020         regs->r8 = vcpu->regs[VCPU_REGS_R8];
2021         regs->r9 = vcpu->regs[VCPU_REGS_R9];
2022         regs->r10 = vcpu->regs[VCPU_REGS_R10];
2023         regs->r11 = vcpu->regs[VCPU_REGS_R11];
2024         regs->r12 = vcpu->regs[VCPU_REGS_R12];
2025         regs->r13 = vcpu->regs[VCPU_REGS_R13];
2026         regs->r14 = vcpu->regs[VCPU_REGS_R14];
2027         regs->r15 = vcpu->regs[VCPU_REGS_R15];
2028 #endif
2029
2030         regs->rip = vcpu->rip;
2031         regs->rflags = kvm_arch_ops->get_rflags(vcpu);
2032
2033         /*
2034          * Don't leak debug flags in case they were set for guest debugging
2035          */
2036         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2037                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2038
2039         vcpu_put(vcpu);
2040
2041         return 0;
2042 }
2043
2044 static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
2045                                    struct kvm_regs *regs)
2046 {
2047         vcpu_load(vcpu);
2048
2049         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
2050         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
2051         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
2052         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
2053         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
2054         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
2055         vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
2056         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
2057 #ifdef CONFIG_X86_64
2058         vcpu->regs[VCPU_REGS_R8] = regs->r8;
2059         vcpu->regs[VCPU_REGS_R9] = regs->r9;
2060         vcpu->regs[VCPU_REGS_R10] = regs->r10;
2061         vcpu->regs[VCPU_REGS_R11] = regs->r11;
2062         vcpu->regs[VCPU_REGS_R12] = regs->r12;
2063         vcpu->regs[VCPU_REGS_R13] = regs->r13;
2064         vcpu->regs[VCPU_REGS_R14] = regs->r14;
2065         vcpu->regs[VCPU_REGS_R15] = regs->r15;
2066 #endif
2067
2068         vcpu->rip = regs->rip;
2069         kvm_arch_ops->set_rflags(vcpu, regs->rflags);
2070
2071         kvm_arch_ops->decache_regs(vcpu);
2072
2073         vcpu_put(vcpu);
2074
2075         return 0;
2076 }
2077
2078 static void get_segment(struct kvm_vcpu *vcpu,
2079                         struct kvm_segment *var, int seg)
2080 {
2081         return kvm_arch_ops->get_segment(vcpu, var, seg);
2082 }
2083
2084 static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2085                                     struct kvm_sregs *sregs)
2086 {
2087         struct descriptor_table dt;
2088
2089         vcpu_load(vcpu);
2090
2091         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2092         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2093         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2094         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2095         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2096         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2097
2098         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2099         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2100
2101         kvm_arch_ops->get_idt(vcpu, &dt);
2102         sregs->idt.limit = dt.limit;
2103         sregs->idt.base = dt.base;
2104         kvm_arch_ops->get_gdt(vcpu, &dt);
2105         sregs->gdt.limit = dt.limit;
2106         sregs->gdt.base = dt.base;
2107
2108         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
2109         sregs->cr0 = vcpu->cr0;
2110         sregs->cr2 = vcpu->cr2;
2111         sregs->cr3 = vcpu->cr3;
2112         sregs->cr4 = vcpu->cr4;
2113         sregs->cr8 = vcpu->cr8;
2114         sregs->efer = vcpu->shadow_efer;
2115         sregs->apic_base = vcpu->apic_base;
2116
2117         memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
2118                sizeof sregs->interrupt_bitmap);
2119
2120         vcpu_put(vcpu);
2121
2122         return 0;
2123 }
2124
2125 static void set_segment(struct kvm_vcpu *vcpu,
2126                         struct kvm_segment *var, int seg)
2127 {
2128         return kvm_arch_ops->set_segment(vcpu, var, seg);
2129 }
2130
2131 static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2132                                     struct kvm_sregs *sregs)
2133 {
2134         int mmu_reset_needed = 0;
2135         int i;
2136         struct descriptor_table dt;
2137
2138         vcpu_load(vcpu);
2139
2140         dt.limit = sregs->idt.limit;
2141         dt.base = sregs->idt.base;
2142         kvm_arch_ops->set_idt(vcpu, &dt);
2143         dt.limit = sregs->gdt.limit;
2144         dt.base = sregs->gdt.base;
2145         kvm_arch_ops->set_gdt(vcpu, &dt);
2146
2147         vcpu->cr2 = sregs->cr2;
2148         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
2149         vcpu->cr3 = sregs->cr3;
2150
2151         vcpu->cr8 = sregs->cr8;
2152
2153         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
2154 #ifdef CONFIG_X86_64
2155         kvm_arch_ops->set_efer(vcpu, sregs->efer);
2156 #endif
2157         vcpu->apic_base = sregs->apic_base;
2158
2159         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
2160
2161         mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
2162         kvm_arch_ops->set_cr0(vcpu, sregs->cr0);
2163
2164         mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
2165         kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
2166         if (!is_long_mode(vcpu) && is_pae(vcpu))
2167                 load_pdptrs(vcpu, vcpu->cr3);
2168
2169         if (mmu_reset_needed)
2170                 kvm_mmu_reset_context(vcpu);
2171
2172         memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
2173                sizeof vcpu->irq_pending);
2174         vcpu->irq_summary = 0;
2175         for (i = 0; i < NR_IRQ_WORDS; ++i)
2176                 if (vcpu->irq_pending[i])
2177                         __set_bit(i, &vcpu->irq_summary);
2178
2179         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2180         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2181         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2182         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2183         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2184         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2185
2186         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2187         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2188
2189         vcpu_put(vcpu);
2190
2191         return 0;
2192 }
2193
2194 /*
2195  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
2196  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
2197  *
2198  * This list is modified at module load time to reflect the
2199  * capabilities of the host cpu.
2200  */
2201 static u32 msrs_to_save[] = {
2202         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
2203         MSR_K6_STAR,
2204 #ifdef CONFIG_X86_64
2205         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
2206 #endif
2207         MSR_IA32_TIME_STAMP_COUNTER,
2208 };
2209
2210 static unsigned num_msrs_to_save;
2211
2212 static u32 emulated_msrs[] = {
2213         MSR_IA32_MISC_ENABLE,
2214 };
2215
2216 static __init void kvm_init_msr_list(void)
2217 {
2218         u32 dummy[2];
2219         unsigned i, j;
2220
2221         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2222                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2223                         continue;
2224                 if (j < i)
2225                         msrs_to_save[j] = msrs_to_save[i];
2226                 j++;
2227         }
2228         num_msrs_to_save = j;
2229 }
2230
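/*
 * A rough userspace sketch (not part of this file) of how the saved-MSR
 * list is consumed; it assumes an open /dev/kvm descriptor named kvm_fd
 * and the ioctl definitions from <linux/kvm.h>:
 *
 *	struct kvm_msr_list probe = { .nmsrs = 0 }, *list;
 *
 *	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);
 *	list = malloc(sizeof(*list) + probe.nmsrs * sizeof(__u32));
 *	list->nmsrs = probe.nmsrs;
 *	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
 *
 * The first call deliberately undersizes the buffer: the handler in
 * kvm_dev_ioctl() below writes back the required count and then fails
 * with E2BIG, so the second call can be sized exactly.
 */
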
2231 /*
2232  * Adapt set_msr() to msr_io()'s calling convention
2233  */
2234 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2235 {
2236         return set_msr(vcpu, index, *data);
2237 }
2238
2239 /*
2240  * Read or write a bunch of msrs. All parameters are kernel addresses.
2241  *
2242  * @return number of msrs set successfully.
2243  */
2244 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2245                     struct kvm_msr_entry *entries,
2246                     int (*do_msr)(struct kvm_vcpu *vcpu,
2247                                   unsigned index, u64 *data))
2248 {
2249         int i;
2250
2251         vcpu_load(vcpu);
2252
2253         for (i = 0; i < msrs->nmsrs; ++i)
2254                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2255                         break;
2256
2257         vcpu_put(vcpu);
2258
2259         return i;
2260 }
2261
2262 /*
2263  * Read or write a bunch of msrs. Parameters are user addresses.
2264  *
2265  * @return number of msrs set successfully.
2266  */
2267 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2268                   int (*do_msr)(struct kvm_vcpu *vcpu,
2269                                 unsigned index, u64 *data),
2270                   int writeback)
2271 {
2272         struct kvm_msrs msrs;
2273         struct kvm_msr_entry *entries;
2274         int r, n;
2275         unsigned size;
2276
2277         r = -EFAULT;
2278         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2279                 goto out;
2280
2281         r = -E2BIG;
2282         if (msrs.nmsrs >= MAX_IO_MSRS)
2283                 goto out;
2284
2285         r = -ENOMEM;
2286         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2287         entries = vmalloc(size);
2288         if (!entries)
2289                 goto out;
2290
2291         r = -EFAULT;
2292         if (copy_from_user(entries, user_msrs->entries, size))
2293                 goto out_free;
2294
2295         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2296         if (r < 0)
2297                 goto out_free;
2298
2299         r = -EFAULT;
2300         if (writeback && copy_to_user(user_msrs->entries, entries, size))
2301                 goto out_free;
2302
2303         r = n;
2304
2305 out_free:
2306         vfree(entries);
2307 out:
2308         return r;
2309 }
2310
2311 /*
2312  * Translate a guest virtual address to a guest physical address.
2313  */
2314 static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2315                                     struct kvm_translation *tr)
2316 {
2317         unsigned long vaddr = tr->linear_address;
2318         gpa_t gpa;
2319
2320         vcpu_load(vcpu);
2321         spin_lock(&vcpu->kvm->lock);
2322         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2323         tr->physical_address = gpa;
2324         tr->valid = gpa != UNMAPPED_GVA;
2325         tr->writeable = 1;
2326         tr->usermode = 0;
2327         spin_unlock(&vcpu->kvm->lock);
2328         vcpu_put(vcpu);
2329
2330         return 0;
2331 }
2332
2333 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2334                                     struct kvm_interrupt *irq)
2335 {
2336         if (irq->irq < 0 || irq->irq >= 256)
2337                 return -EINVAL;
2338         vcpu_load(vcpu);
2339
2340         set_bit(irq->irq, vcpu->irq_pending);
2341         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
2342
2343         vcpu_put(vcpu);
2344
2345         return 0;
2346 }
2347
2348 static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2349                                       struct kvm_debug_guest *dbg)
2350 {
2351         int r;
2352
2353         vcpu_load(vcpu);
2354
2355         r = kvm_arch_ops->set_guest_debug(vcpu, dbg);
2356
2357         vcpu_put(vcpu);
2358
2359         return r;
2360 }
2361
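/*
 * Back the vcpu file's mmap: page 0 of the mapping is the kvm_run
 * structure and page KVM_PIO_PAGE_OFFSET is the PIO bounce buffer,
 * matching the two pages reported by KVM_GET_VCPU_MMAP_SIZE.
 */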
2362 static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2363                                     unsigned long address,
2364                                     int *type)
2365 {
2366         struct kvm_vcpu *vcpu = vma->vm_file->private_data;
2367         unsigned long pgoff;
2368         struct page *page;
2369
2370         *type = VM_FAULT_MINOR;
2371         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2372         if (pgoff == 0)
2373                 page = virt_to_page(vcpu->run);
2374         else if (pgoff == KVM_PIO_PAGE_OFFSET)
2375                 page = virt_to_page(vcpu->pio_data);
2376         else
2377                 return NOPAGE_SIGBUS;
2378         get_page(page);
2379         return page;
2380 }
2381
2382 static struct vm_operations_struct kvm_vcpu_vm_ops = {
2383         .nopage = kvm_vcpu_nopage,
2384 };
2385
2386 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2387 {
2388         vma->vm_ops = &kvm_vcpu_vm_ops;
2389         return 0;
2390 }
2391
2392 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
2393 {
2394         struct kvm_vcpu *vcpu = filp->private_data;
2395
2396         fput(vcpu->kvm->filp);
2397         return 0;
2398 }
2399
2400 static struct file_operations kvm_vcpu_fops = {
2401         .release        = kvm_vcpu_release,
2402         .unlocked_ioctl = kvm_vcpu_ioctl,
2403         .compat_ioctl   = kvm_vcpu_ioctl,
2404         .mmap           = kvm_vcpu_mmap,
2405 };
2406
2407 /*
2408  * Allocates an inode for the vcpu.
2409  */
2410 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2411 {
2412         int fd, r;
2413         struct inode *inode;
2414         struct file *file;
2415
2416         atomic_inc(&vcpu->kvm->filp->f_count);
2417         inode = kvmfs_inode(&kvm_vcpu_fops);
2418         if (IS_ERR(inode)) {
2419                 r = PTR_ERR(inode);
2420                 goto out1;
2421         }
2422
2423         file = kvmfs_file(inode, vcpu);
2424         if (IS_ERR(file)) {
2425                 r = PTR_ERR(file);
2426                 goto out2;
2427         }
2428
2429         r = get_unused_fd();
2430         if (r < 0)
2431                 goto out3;
2432         fd = r;
2433         fd_install(fd, file);
2434
2435         return fd;
2436
2437 out3:
2438         fput(file);
2439 out2:
2440         iput(inode);
2441 out1:
2442         fput(vcpu->kvm->filp);
2443         return r;
2444 }
2445
2446 /*
2447  * Creates some virtual cpus.  Good luck creating more than one.
2448  */
2449 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
2450 {
2451         int r;
2452         struct kvm_vcpu *vcpu;
2453         struct page *page;
2454
2455         r = -EINVAL;
2456         if (!valid_vcpu(n))
2457                 goto out;
2458
2459         vcpu = &kvm->vcpus[n];
2460
2461         mutex_lock(&vcpu->mutex);
2462
2463         if (vcpu->vmcs) {
2464                 mutex_unlock(&vcpu->mutex);
2465                 return -EEXIST;
2466         }
2467
2468         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2469         r = -ENOMEM;
2470         if (!page)
2471                 goto out_unlock;
2472         vcpu->run = page_address(page);
2473
2474         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2475         r = -ENOMEM;
2476         if (!page)
2477                 goto out_free_run;
2478         vcpu->pio_data = page_address(page);
2479
2480         vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf,
2481                                            FX_IMAGE_ALIGN);
2482         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
2483         vcpu->cr0 = 0x10;
2484
2485         r = kvm_arch_ops->vcpu_create(vcpu);
2486         if (r < 0)
2487                 goto out_free_vcpus;
2488
2489         r = kvm_mmu_create(vcpu);
2490         if (r < 0)
2491                 goto out_free_vcpus;
2492
2493         kvm_arch_ops->vcpu_load(vcpu);
2494         r = kvm_mmu_setup(vcpu);
2495         if (r >= 0)
2496                 r = kvm_arch_ops->vcpu_setup(vcpu);
2497         vcpu_put(vcpu);
2498
2499         if (r < 0)
2500                 goto out_free_vcpus;
2501
2502         r = create_vcpu_fd(vcpu);
2503         if (r < 0)
2504                 goto out_free_vcpus;
2505
2506         spin_lock(&kvm_lock);
2507         if (n >= kvm->nvcpus)
2508                 kvm->nvcpus = n + 1;
2509         spin_unlock(&kvm_lock);
2510
2511         return r;
2512
2513 out_free_vcpus:
2514         kvm_free_vcpu(vcpu);
2515 out_free_run:
2516         free_page((unsigned long)vcpu->run);
2517         vcpu->run = NULL;
2518 out_unlock:
2519         mutex_unlock(&vcpu->mutex);
2520 out:
2521         return r;
2522 }
2523
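/*
 * If the host runs with EFER.NX clear, hide the NX bit from the guest's
 * 0x80000001 leaf so the guest does not try to enable a feature the
 * host cannot back.
 */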
2524 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
2525 {
2526         u64 efer;
2527         int i;
2528         struct kvm_cpuid_entry *e, *entry;
2529
2530         rdmsrl(MSR_EFER, efer);
2531         entry = NULL;
2532         for (i = 0; i < vcpu->cpuid_nent; ++i) {
2533                 e = &vcpu->cpuid_entries[i];
2534                 if (e->function == 0x80000001) {
2535                         entry = e;
2536                         break;
2537                 }
2538         }
2539         if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
2540                 entry->edx &= ~(1 << 20);
2541                 printk(KERN_INFO "kvm: guest NX capability removed\n");
2542         }
2543 }
2544
2545 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2546                                     struct kvm_cpuid *cpuid,
2547                                     struct kvm_cpuid_entry __user *entries)
2548 {
2549         int r;
2550
2551         r = -E2BIG;
2552         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2553                 goto out;
2554         r = -EFAULT;
2555         if (copy_from_user(&vcpu->cpuid_entries, entries,
2556                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2557                 goto out;
2558         vcpu->cpuid_nent = cpuid->nent;
2559         cpuid_fix_nx_cap(vcpu);
2560         return 0;
2561
2562 out:
2563         return r;
2564 }
2565
2566 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2567 {
2568         if (sigset) {
2569                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2570                 vcpu->sigset_active = 1;
2571                 vcpu->sigset = *sigset;
2572         } else
2573                 vcpu->sigset_active = 0;
2574         return 0;
2575 }
2576
2577 /*
2578  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
2579  * we have asm/x86/processor.h
2580  */
2581 struct fxsave {
2582         u16     cwd;
2583         u16     swd;
2584         u16     twd;
2585         u16     fop;
2586         u64     rip;
2587         u64     rdp;
2588         u32     mxcsr;
2589         u32     mxcsr_mask;
2590         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
2591 #ifdef CONFIG_X86_64
2592         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
2593 #else
2594         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
2595 #endif
2596 };
2597
2598 static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2599 {
2600         struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image;
2601
2602         vcpu_load(vcpu);
2603
2604         memcpy(fpu->fpr, fxsave->st_space, 128);
2605         fpu->fcw = fxsave->cwd;
2606         fpu->fsw = fxsave->swd;
2607         fpu->ftwx = fxsave->twd;
2608         fpu->last_opcode = fxsave->fop;
2609         fpu->last_ip = fxsave->rip;
2610         fpu->last_dp = fxsave->rdp;
2611         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2612
2613         vcpu_put(vcpu);
2614
2615         return 0;
2616 }
2617
2618 static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2619 {
2620         struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image;
2621
2622         vcpu_load(vcpu);
2623
2624         memcpy(fxsave->st_space, fpu->fpr, 128);
2625         fxsave->cwd = fpu->fcw;
2626         fxsave->swd = fpu->fsw;
2627         fxsave->twd = fpu->ftwx;
2628         fxsave->fop = fpu->last_opcode;
2629         fxsave->rip = fpu->last_ip;
2630         fxsave->rdp = fpu->last_dp;
2631         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2632
2633         vcpu_put(vcpu);
2634
2635         return 0;
2636 }
2637
2638 static long kvm_vcpu_ioctl(struct file *filp,
2639                            unsigned int ioctl, unsigned long arg)
2640 {
2641         struct kvm_vcpu *vcpu = filp->private_data;
2642         void __user *argp = (void __user *)arg;
2643         int r = -EINVAL;
2644
2645         switch (ioctl) {
2646         case KVM_RUN:
2647                 r = -EINVAL;
2648                 if (arg)
2649                         goto out;
2650                 r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
2651                 break;
2652         case KVM_GET_REGS: {
2653                 struct kvm_regs kvm_regs;
2654
2655                 memset(&kvm_regs, 0, sizeof kvm_regs);
2656                 r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
2657                 if (r)
2658                         goto out;
2659                 r = -EFAULT;
2660                 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
2661                         goto out;
2662                 r = 0;
2663                 break;
2664         }
2665         case KVM_SET_REGS: {
2666                 struct kvm_regs kvm_regs;
2667
2668                 r = -EFAULT;
2669                 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
2670                         goto out;
2671                 r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
2672                 if (r)
2673                         goto out;
2674                 r = 0;
2675                 break;
2676         }
2677         case KVM_GET_SREGS: {
2678                 struct kvm_sregs kvm_sregs;
2679
2680                 memset(&kvm_sregs, 0, sizeof kvm_sregs);
2681                 r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
2682                 if (r)
2683                         goto out;
2684                 r = -EFAULT;
2685                 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
2686                         goto out;
2687                 r = 0;
2688                 break;
2689         }
2690         case KVM_SET_SREGS: {
2691                 struct kvm_sregs kvm_sregs;
2692
2693                 r = -EFAULT;
2694                 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
2695                         goto out;
2696                 r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
2697                 if (r)
2698                         goto out;
2699                 r = 0;
2700                 break;
2701         }
2702         case KVM_TRANSLATE: {
2703                 struct kvm_translation tr;
2704
2705                 r = -EFAULT;
2706                 if (copy_from_user(&tr, argp, sizeof tr))
2707                         goto out;
2708                 r = kvm_vcpu_ioctl_translate(vcpu, &tr);
2709                 if (r)
2710                         goto out;
2711                 r = -EFAULT;
2712                 if (copy_to_user(argp, &tr, sizeof tr))
2713                         goto out;
2714                 r = 0;
2715                 break;
2716         }
2717         case KVM_INTERRUPT: {
2718                 struct kvm_interrupt irq;
2719
2720                 r = -EFAULT;
2721                 if (copy_from_user(&irq, argp, sizeof irq))
2722                         goto out;
2723                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2724                 if (r)
2725                         goto out;
2726                 r = 0;
2727                 break;
2728         }
2729         case KVM_DEBUG_GUEST: {
2730                 struct kvm_debug_guest dbg;
2731
2732                 r = -EFAULT;
2733                 if (copy_from_user(&dbg, argp, sizeof dbg))
2734                         goto out;
2735                 r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
2736                 if (r)
2737                         goto out;
2738                 r = 0;
2739                 break;
2740         }
2741         case KVM_GET_MSRS:
2742                 r = msr_io(vcpu, argp, get_msr, 1);
2743                 break;
2744         case KVM_SET_MSRS:
2745                 r = msr_io(vcpu, argp, do_set_msr, 0);
2746                 break;
2747         case KVM_SET_CPUID: {
2748                 struct kvm_cpuid __user *cpuid_arg = argp;
2749                 struct kvm_cpuid cpuid;
2750
2751                 r = -EFAULT;
2752                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2753                         goto out;
2754                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2755                 if (r)
2756                         goto out;
2757                 break;
2758         }
2759         case KVM_SET_SIGNAL_MASK: {
2760                 struct kvm_signal_mask __user *sigmask_arg = argp;
2761                 struct kvm_signal_mask kvm_sigmask;
2762                 sigset_t sigset, *p;
2763
2764                 p = NULL;
2765                 if (argp) {
2766                         r = -EFAULT;
2767                         if (copy_from_user(&kvm_sigmask, argp,
2768                                            sizeof kvm_sigmask))
2769                                 goto out;
2770                         r = -EINVAL;
2771                         if (kvm_sigmask.len != sizeof sigset)
2772                                 goto out;
2773                         r = -EFAULT;
2774                         if (copy_from_user(&sigset, sigmask_arg->sigset,
2775                                            sizeof sigset))
2776                                 goto out;
2777                         p = &sigset;
2778                 }
2779                 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
2780                 break;
2781         }
2782         case KVM_GET_FPU: {
2783                 struct kvm_fpu fpu;
2784
2785                 memset(&fpu, 0, sizeof fpu);
2786                 r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
2787                 if (r)
2788                         goto out;
2789                 r = -EFAULT;
2790                 if (copy_to_user(argp, &fpu, sizeof fpu))
2791                         goto out;
2792                 r = 0;
2793                 break;
2794         }
2795         case KVM_SET_FPU: {
2796                 struct kvm_fpu fpu;
2797
2798                 r = -EFAULT;
2799                 if (copy_from_user(&fpu, argp, sizeof fpu))
2800                         goto out;
2801                 r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
2802                 if (r)
2803                         goto out;
2804                 r = 0;
2805                 break;
2806         }
2807         default:
2808                 ;
2809         }
2810 out:
2811         return r;
2812 }
2813
2814 static long kvm_vm_ioctl(struct file *filp,
2815                            unsigned int ioctl, unsigned long arg)
2816 {
2817         struct kvm *kvm = filp->private_data;
2818         void __user *argp = (void __user *)arg;
2819         int r = -EINVAL;
2820
2821         switch (ioctl) {
2822         case KVM_CREATE_VCPU:
2823                 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
2824                 if (r < 0)
2825                         goto out;
2826                 break;
2827         case KVM_SET_MEMORY_REGION: {
2828                 struct kvm_memory_region kvm_mem;
2829
2830                 r = -EFAULT;
2831                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2832                         goto out;
2833                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
2834                 if (r)
2835                         goto out;
2836                 break;
2837         }
2838         case KVM_GET_DIRTY_LOG: {
2839                 struct kvm_dirty_log log;
2840
2841                 r = -EFAULT;
2842                 if (copy_from_user(&log, argp, sizeof log))
2843                         goto out;
2844                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2845                 if (r)
2846                         goto out;
2847                 break;
2848         }
2849         case KVM_SET_MEMORY_ALIAS: {
2850                 struct kvm_memory_alias alias;
2851
2852                 r = -EFAULT;
2853                 if (copy_from_user(&alias, argp, sizeof alias))
2854                         goto out;
2855                 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
2856                 if (r)
2857                         goto out;
2858                 break;
2859         }
2860         default:
2861                 ;
2862         }
2863 out:
2864         return r;
2865 }
2866
2867 static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
2868                                   unsigned long address,
2869                                   int *type)
2870 {
2871         struct kvm *kvm = vma->vm_file->private_data;
2872         unsigned long pgoff;
2873         struct page *page;
2874
2875         *type = VM_FAULT_MINOR;
2876         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2877         page = gfn_to_page(kvm, pgoff);
2878         if (!page)
2879                 return NOPAGE_SIGBUS;
2880         get_page(page);
2881         return page;
2882 }
2883
2884 static struct vm_operations_struct kvm_vm_vm_ops = {
2885         .nopage = kvm_vm_nopage,
2886 };
2887
2888 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
2889 {
2890         vma->vm_ops = &kvm_vm_vm_ops;
2891         return 0;
2892 }
2893
2894 static struct file_operations kvm_vm_fops = {
2895         .release        = kvm_vm_release,
2896         .unlocked_ioctl = kvm_vm_ioctl,
2897         .compat_ioctl   = kvm_vm_ioctl,
2898         .mmap           = kvm_vm_mmap,
2899 };
2900
2901 static int kvm_dev_ioctl_create_vm(void)
2902 {
2903         int fd, r;
2904         struct inode *inode;
2905         struct file *file;
2906         struct kvm *kvm;
2907
2908         inode = kvmfs_inode(&kvm_vm_fops);
2909         if (IS_ERR(inode)) {
2910                 r = PTR_ERR(inode);
2911                 goto out1;
2912         }
2913
2914         kvm = kvm_create_vm();
2915         if (IS_ERR(kvm)) {
2916                 r = PTR_ERR(kvm);
2917                 goto out2;
2918         }
2919
2920         file = kvmfs_file(inode, kvm);
2921         if (IS_ERR(file)) {
2922                 r = PTR_ERR(file);
2923                 goto out3;
2924         }
2925         kvm->filp = file;
2926
2927         r = get_unused_fd();
2928         if (r < 0)
2929                 goto out4;
2930         fd = r;
2931         fd_install(fd, file);
2932
2933         return fd;
2934
2935 out4:
2936         fput(file);
2937 out3:
2938         kvm_destroy_vm(kvm);
2939 out2:
2940         iput(inode);
2941 out1:
2942         return r;
2943 }
2944
2945 static long kvm_dev_ioctl(struct file *filp,
2946                           unsigned int ioctl, unsigned long arg)
2947 {
2948         void __user *argp = (void __user *)arg;
2949         long r = -EINVAL;
2950
2951         switch (ioctl) {
2952         case KVM_GET_API_VERSION:
2953                 r = -EINVAL;
2954                 if (arg)
2955                         goto out;
2956                 r = KVM_API_VERSION;
2957                 break;
2958         case KVM_CREATE_VM:
2959                 r = -EINVAL;
2960                 if (arg)
2961                         goto out;
2962                 r = kvm_dev_ioctl_create_vm();
2963                 break;
2964         case KVM_GET_MSR_INDEX_LIST: {
2965                 struct kvm_msr_list __user *user_msr_list = argp;
2966                 struct kvm_msr_list msr_list;
2967                 unsigned n;
2968
2969                 r = -EFAULT;
2970                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
2971                         goto out;
2972                 n = msr_list.nmsrs;
2973                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
2974                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
2975                         goto out;
2976                 r = -E2BIG;
2977                 if (n < num_msrs_to_save)
2978                         goto out;
2979                 r = -EFAULT;
2980                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
2981                                  num_msrs_to_save * sizeof(u32)))
2982                         goto out;
2983                 if (copy_to_user(user_msr_list->indices
2984                                  + num_msrs_to_save,
2985                                  &emulated_msrs,
2986                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
2987                         goto out;
2988                 r = 0;
2989                 break;
2990         }
2991         case KVM_CHECK_EXTENSION:
2992                 /*
2993                  * No extensions defined at present.
2994                  */
2995                 r = 0;
2996                 break;
2997         case KVM_GET_VCPU_MMAP_SIZE:
2998                 r = -EINVAL;
2999                 if (arg)
3000                         goto out;
3001                 r = 2 * PAGE_SIZE;
3002                 break;
3003         default:
3004                 ;
3005         }
3006 out:
3007         return r;
3008 }
3009
3010 static struct file_operations kvm_chardev_ops = {
3011         .open           = kvm_dev_open,
3012         .release        = kvm_dev_release,
3013         .unlocked_ioctl = kvm_dev_ioctl,
3014         .compat_ioctl   = kvm_dev_ioctl,
3015 };
3016
3017 static struct miscdevice kvm_dev = {
3018         KVM_MINOR,
3019         "kvm",
3020         &kvm_chardev_ops,
3021 };
3022
3023 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
3024                        void *v)
3025 {
3026         if (val == SYS_RESTART) {
3027                 /*
3028                  * Some (well, at least mine) BIOSes hang on reboot if
3029                  * in vmx root mode.
3030                  */
3031                 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
3032                 on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
3033         }
3034         return NOTIFY_OK;
3035 }
3036
3037 static struct notifier_block kvm_reboot_notifier = {
3038         .notifier_call = kvm_reboot,
3039         .priority = 0,
3040 };
3041
3042 /*
3043  * Make sure that a cpu that is being hot-unplugged does not have any vcpus
3044  * cached on it.
3045  */
3046 static void decache_vcpus_on_cpu(int cpu)
3047 {
3048         struct kvm *vm;
3049         struct kvm_vcpu *vcpu;
3050         int i;
3051
3052         spin_lock(&kvm_lock);
3053         list_for_each_entry(vm, &vm_list, vm_list)
3054                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3055                         vcpu = &vm->vcpus[i];
3056                         /*
3057                          * If the vcpu is locked, then it is running on some
3058                          * other cpu and therefore it is not cached on the
3059                          * cpu in question.
3060                          *
3061                          * If it's not locked, check the last cpu it executed
3062                          * on.
3063                          */
3064                         if (mutex_trylock(&vcpu->mutex)) {
3065                                 if (vcpu->cpu == cpu) {
3066                                         kvm_arch_ops->vcpu_decache(vcpu);
3067                                         vcpu->cpu = -1;
3068                                 }
3069                                 mutex_unlock(&vcpu->mutex);
3070                         }
3071                 }
3072         spin_unlock(&kvm_lock);
3073 }
3074
3075 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
3076                            void *v)
3077 {
3078         int cpu = (long)v;
3079
3080         switch (val) {
3081         case CPU_DOWN_PREPARE:
3082         case CPU_DOWN_PREPARE_FROZEN:
3083         case CPU_UP_CANCELED:
3084         case CPU_UP_CANCELED_FROZEN:
3085                 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3086                        cpu);
3087                 decache_vcpus_on_cpu(cpu);
3088                 smp_call_function_single(cpu, kvm_arch_ops->hardware_disable,
3089                                          NULL, 0, 1);
3090                 break;
3091         case CPU_ONLINE:
3092         case CPU_ONLINE_FROZEN:
3093                 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
3094                        cpu);
3095                 smp_call_function_single(cpu, kvm_arch_ops->hardware_enable,
3096                                          NULL, 0, 1);
3097                 break;
3098         }
3099         return NOTIFY_OK;
3100 }
3101
3102 void kvm_io_bus_init(struct kvm_io_bus *bus)
3103 {
3104         memset(bus, 0, sizeof(*bus));
3105 }
3106
3107 void kvm_io_bus_destroy(struct kvm_io_bus *bus)
3108 {
3109         int i;
3110
3111         for (i = 0; i < bus->dev_count; i++) {
3112                 struct kvm_io_device *pos = bus->devs[i];
3113
3114                 kvm_iodevice_destructor(pos);
3115         }
3116 }
3117
3118 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
3119 {
3120         int i;
3121
3122         for (i = 0; i < bus->dev_count; i++) {
3123                 struct kvm_io_device *pos = bus->devs[i];
3124
3125                 if (pos->in_range(pos, addr))
3126                         return pos;
3127         }
3128
3129         return NULL;
3130 }
3131
3132 void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
3133 {
3134         BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
3135
3136         bus->devs[bus->dev_count++] = dev;
3137 }
3138
3139 static struct notifier_block kvm_cpu_notifier = {
3140         .notifier_call = kvm_cpu_hotplug,
3141         .priority = 20, /* must be > scheduler priority */
3142 };
3143
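/*
 * debugfs read callback: sum the per-vcpu counter found at the given
 * offset over every vcpu of every VM.
 */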
3144 static u64 stat_get(void *_offset)
3145 {
3146         unsigned offset = (long)_offset;
3147         u64 total = 0;
3148         struct kvm *kvm;
3149         struct kvm_vcpu *vcpu;
3150         int i;
3151
3152         spin_lock(&kvm_lock);
3153         list_for_each_entry(kvm, &vm_list, vm_list)
3154                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3155                         vcpu = &kvm->vcpus[i];
3156                         total += *(u32 *)((void *)vcpu + offset);
3157                 }
3158         spin_unlock(&kvm_lock);
3159         return total;
3160 }
3161
3162 static void stat_set(void *offset, u64 val)
3163 {
3164 }
3165
3166 DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, stat_set, "%llu\n");
3167
3168 static __init void kvm_init_debug(void)
3169 {
3170         struct kvm_stats_debugfs_item *p;
3171
3172         debugfs_dir = debugfs_create_dir("kvm", NULL);
3173         for (p = debugfs_entries; p->name; ++p)
3174                 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
3175                                                 (void *)(long)p->offset,
3176                                                 &stat_fops);
3177 }
3178
3179 static void kvm_exit_debug(void)
3180 {
3181         struct kvm_stats_debugfs_item *p;
3182
3183         for (p = debugfs_entries; p->name; ++p)
3184                 debugfs_remove(p->dentry);
3185         debugfs_remove(debugfs_dir);
3186 }
3187
3188 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
3189 {
3190         decache_vcpus_on_cpu(raw_smp_processor_id());
3191         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
3192         return 0;
3193 }
3194
3195 static int kvm_resume(struct sys_device *dev)
3196 {
3197         on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
3198         return 0;
3199 }
3200
3201 static struct sysdev_class kvm_sysdev_class = {
3202         set_kset_name("kvm"),
3203         .suspend = kvm_suspend,
3204         .resume = kvm_resume,
3205 };
3206
3207 static struct sys_device kvm_sysdev = {
3208         .id = 0,
3209         .cls = &kvm_sysdev_class,
3210 };
3211
3212 hpa_t bad_page_address;
3213
3214 static int kvmfs_get_sb(struct file_system_type *fs_type, int flags,
3215                         const char *dev_name, void *data, struct vfsmount *mnt)
3216 {
3217         return get_sb_pseudo(fs_type, "kvm:", NULL, KVMFS_SUPER_MAGIC, mnt);
3218 }
3219
3220 static struct file_system_type kvm_fs_type = {
3221         .name           = "kvmfs",
3222         .get_sb         = kvmfs_get_sb,
3223         .kill_sb        = kill_anon_super,
3224 };
3225
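/*
 * Entry point for the Intel and AMD backend modules: record their
 * kvm_arch_ops, perform hardware setup, enable virtualization on all
 * online cpus, wire up the cpu-hotplug, reboot and suspend hooks, and
 * finally register the /dev/kvm misc device.
 */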
3226 int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
3227 {
3228         int r;
3229
3230         if (kvm_arch_ops) {
3231                 printk(KERN_ERR "kvm: already loaded the other module\n");
3232                 return -EEXIST;
3233         }
3234
3235         if (!ops->cpu_has_kvm_support()) {
3236                 printk(KERN_ERR "kvm: no hardware support\n");
3237                 return -EOPNOTSUPP;
3238         }
3239         if (ops->disabled_by_bios()) {
3240                 printk(KERN_ERR "kvm: disabled by bios\n");
3241                 return -EOPNOTSUPP;
3242         }
3243
3244         kvm_arch_ops = ops;
3245
3246         r = kvm_arch_ops->hardware_setup();
3247         if (r < 0)
3248                 goto out;
3249
3250         on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
3251         r = register_cpu_notifier(&kvm_cpu_notifier);
3252         if (r)
3253                 goto out_free_1;
3254         register_reboot_notifier(&kvm_reboot_notifier);
3255
3256         r = sysdev_class_register(&kvm_sysdev_class);
3257         if (r)
3258                 goto out_free_2;
3259
3260         r = sysdev_register(&kvm_sysdev);
3261         if (r)
3262                 goto out_free_3;
3263
3264         kvm_chardev_ops.owner = module;
3265
3266         r = misc_register(&kvm_dev);
3267         if (r) {
3268                 printk (KERN_ERR "kvm: misc device register failed\n");
3269                 goto out_free;
3270         }
3271
3272         return r;
3273
3274 out_free:
3275         sysdev_unregister(&kvm_sysdev);
3276 out_free_3:
3277         sysdev_class_unregister(&kvm_sysdev_class);
3278 out_free_2:
3279         unregister_reboot_notifier(&kvm_reboot_notifier);
3280         unregister_cpu_notifier(&kvm_cpu_notifier);
3281 out_free_1:
3282         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
3283         kvm_arch_ops->hardware_unsetup();
3284 out:
3285         kvm_arch_ops = NULL;
3286         return r;
3287 }
3288
3289 void kvm_exit_arch(void)
3290 {
3291         misc_deregister(&kvm_dev);
3292         sysdev_unregister(&kvm_sysdev);
3293         sysdev_class_unregister(&kvm_sysdev_class);
3294         unregister_reboot_notifier(&kvm_reboot_notifier);
3295         unregister_cpu_notifier(&kvm_cpu_notifier);
3296         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
3297         kvm_arch_ops->hardware_unsetup();
3298         kvm_arch_ops = NULL;
3299 }
3300
3301 static __init int kvm_init(void)
3302 {
3303         static struct page *bad_page;
3304         int r;
3305
3306         r = kvm_mmu_module_init();
3307         if (r)
3308                 goto out4;
3309
3310         r = register_filesystem(&kvm_fs_type);
3311         if (r)
3312                 goto out3;
3313
3314         kvmfs_mnt = kern_mount(&kvm_fs_type);
3315         r = PTR_ERR(kvmfs_mnt);
3316         if (IS_ERR(kvmfs_mnt))
3317                 goto out2;
3318         kvm_init_debug();
3319
3320         kvm_init_msr_list();
3321
3322         if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
3323                 r = -ENOMEM;
3324                 goto out;
3325         }
3326
3327         bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
3328         memset(__va(bad_page_address), 0, PAGE_SIZE);
3329
3330         return 0;
3331
3332 out:
3333         kvm_exit_debug();
3334         mntput(kvmfs_mnt);
3335 out2:
3336         unregister_filesystem(&kvm_fs_type);
3337 out3:
3338         kvm_mmu_module_exit();
3339 out4:
3340         return r;
3341 }
3342
3343 static __exit void kvm_exit(void)
3344 {
3345         kvm_exit_debug();
3346         __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3347         mntput(kvmfs_mnt);
3348         unregister_filesystem(&kvm_fs_type);
3349         kvm_mmu_module_exit();
3350 }
3351
3352 module_init(kvm_init)
3353 module_exit(kvm_exit)
3354
3355 EXPORT_SYMBOL_GPL(kvm_init_arch);
3356 EXPORT_SYMBOL_GPL(kvm_exit_arch);