KVM: Add support for in-kernel mmio handlers
[pandora-kernel.git] / drivers / kvm / kvm_main.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #include "kvm.h"
19
20 #include <linux/kvm.h>
21 #include <linux/module.h>
22 #include <linux/errno.h>
23 #include <linux/magic.h>
24 #include <asm/processor.h>
25 #include <linux/percpu.h>
26 #include <linux/gfp.h>
27 #include <asm/msr.h>
28 #include <linux/mm.h>
29 #include <linux/miscdevice.h>
30 #include <linux/vmalloc.h>
31 #include <asm/uaccess.h>
32 #include <linux/reboot.h>
33 #include <asm/io.h>
34 #include <linux/debugfs.h>
35 #include <linux/highmem.h>
36 #include <linux/file.h>
37 #include <asm/desc.h>
38 #include <linux/sysdev.h>
39 #include <linux/cpu.h>
41 #include <linux/fs.h>
42 #include <linux/mount.h>
43 #include <linux/sched.h>
44 #include <linux/cpumask.h>
45 #include <linux/smp.h>
46
47 #include "x86_emulate.h"
48 #include "segment_descriptor.h"
49
50 MODULE_AUTHOR("Qumranet");
51 MODULE_LICENSE("GPL");
52
53 static DEFINE_SPINLOCK(kvm_lock);
54 static LIST_HEAD(vm_list);
55
56 struct kvm_arch_ops *kvm_arch_ops;
57
58 #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
59
60 static struct kvm_stats_debugfs_item {
61         const char *name;
62         int offset;
63         struct dentry *dentry;
64 } debugfs_entries[] = {
65         { "pf_fixed", STAT_OFFSET(pf_fixed) },
66         { "pf_guest", STAT_OFFSET(pf_guest) },
67         { "tlb_flush", STAT_OFFSET(tlb_flush) },
68         { "invlpg", STAT_OFFSET(invlpg) },
69         { "exits", STAT_OFFSET(exits) },
70         { "io_exits", STAT_OFFSET(io_exits) },
71         { "mmio_exits", STAT_OFFSET(mmio_exits) },
72         { "signal_exits", STAT_OFFSET(signal_exits) },
73         { "irq_window", STAT_OFFSET(irq_window_exits) },
74         { "halt_exits", STAT_OFFSET(halt_exits) },
75         { "request_irq", STAT_OFFSET(request_irq_exits) },
76         { "irq_exits", STAT_OFFSET(irq_exits) },
77         { "light_exits", STAT_OFFSET(light_exits) },
78         { "efer_reload", STAT_OFFSET(efer_reload) },
79         { NULL }
80 };
81
82 static struct dentry *debugfs_dir;
83
84 struct vfsmount *kvmfs_mnt;
85
86 #define MAX_IO_MSRS 256
87
88 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
89 #define LMSW_GUEST_MASK 0x0eULL
90 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
91 #define CR8_RESEVED_BITS (~0x0fULL)
92 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
93
94 #ifdef CONFIG_X86_64
95 /* LDT or TSS descriptor in the GDT. 16 bytes. */
96 struct segment_descriptor_64 {
97         struct segment_descriptor s;
98         u32 base_higher;
99         u32 pad_zero;
100 };
101
102 #endif
103
104 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
105                            unsigned long arg);
106
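/*
 * Allocate an inode on the kvmfs mount and attach @fops to it.
 * Returns ERR_PTR(-ENOMEM) if no inode could be allocated.
 */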
107 static struct inode *kvmfs_inode(struct file_operations *fops)
108 {
109         int error = -ENOMEM;
110         struct inode *inode = new_inode(kvmfs_mnt->mnt_sb);
111
112         if (!inode)
113                 goto eexit_1;
114
115         inode->i_fop = fops;
116
117         /*
118          * Mark the inode dirty from the very beginning;
119          * that way it will never be moved to the dirty
120          * list, because mark_inode_dirty() will think
121          * that it already _is_ on the dirty list.
122          */
123         inode->i_state = I_DIRTY;
124         inode->i_mode = S_IRUSR | S_IWUSR;
125         inode->i_uid = current->fsuid;
126         inode->i_gid = current->fsgid;
127         inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
128         return inode;
129
130 eexit_1:
131         return ERR_PTR(error);
132 }
133
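/*
 * Wrap @inode in an anonymous, read-write struct file on kvmfs,
 * stashing @private_data for later ioctls.
 */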
134 static struct file *kvmfs_file(struct inode *inode, void *private_data)
135 {
136         struct file *file = get_empty_filp();
137
138         if (!file)
139                 return ERR_PTR(-ENFILE);
140
141         file->f_path.mnt = mntget(kvmfs_mnt);
142         file->f_path.dentry = d_alloc_anon(inode);
143         if (!file->f_path.dentry)
144                 return ERR_PTR(-ENOMEM);
145         file->f_mapping = inode->i_mapping;
146
147         file->f_pos = 0;
148         file->f_flags = O_RDWR;
149         file->f_op = inode->i_fop;
150         file->f_mode = FMODE_READ | FMODE_WRITE;
151         file->f_version = 0;
152         file->private_data = private_data;
153         return file;
154 }
155
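/*
 * Return the linear base address of the segment referenced by @selector,
 * looking it up in the GDT, or in the current LDT for LDT-relative selectors.
 */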
156 unsigned long segment_base(u16 selector)
157 {
158         struct descriptor_table gdt;
159         struct segment_descriptor *d;
160         unsigned long table_base;
161         typedef unsigned long ul;
162         unsigned long v;
163
164         if (selector == 0)
165                 return 0;
166
167         asm ("sgdt %0" : "=m"(gdt));
168         table_base = gdt.base;
169
170         if (selector & 4) {           /* from ldt */
171                 u16 ldt_selector;
172
173                 asm ("sldt %0" : "=g"(ldt_selector));
174                 table_base = segment_base(ldt_selector);
175         }
176         d = (struct segment_descriptor *)(table_base + (selector & ~7));
177         v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
178 #ifdef CONFIG_X86_64
179         if (d->system == 0
180             && (d->type == 2 || d->type == 9 || d->type == 11))
181                 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
182 #endif
183         return v;
184 }
185 EXPORT_SYMBOL_GPL(segment_base);
186
187 static inline int valid_vcpu(int n)
188 {
189         return likely(n >= 0 && n < KVM_MAX_VCPUS);
190 }
191
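/*
 * Copy @size bytes from guest virtual address @addr into @dest, one page
 * at a time.  Stops early if a page cannot be translated; returns the
 * number of bytes actually copied.
 */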
192 int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
193                    void *dest)
194 {
195         unsigned char *host_buf = dest;
196         unsigned long req_size = size;
197
198         while (size) {
199                 hpa_t paddr;
200                 unsigned now;
201                 unsigned offset;
202                 hva_t guest_buf;
203
204                 paddr = gva_to_hpa(vcpu, addr);
205
206                 if (is_error_hpa(paddr))
207                         break;
208
209                 guest_buf = (hva_t)kmap_atomic(
210                                         pfn_to_page(paddr >> PAGE_SHIFT),
211                                         KM_USER0);
212                 offset = addr & ~PAGE_MASK;
213                 guest_buf |= offset;
214                 now = min(size, PAGE_SIZE - offset);
215                 memcpy(host_buf, (void*)guest_buf, now);
216                 host_buf += now;
217                 addr += now;
218                 size -= now;
219                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
220         }
221         return req_size - size;
222 }
223 EXPORT_SYMBOL_GPL(kvm_read_guest);
224
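/*
 * Copy @size bytes from @data into guest virtual address @addr, marking
 * each touched guest page dirty.  Stops early if a page cannot be
 * translated; returns the number of bytes actually copied.
 */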
225 int kvm_write_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
226                     void *data)
227 {
228         unsigned char *host_buf = data;
229         unsigned long req_size = size;
230
231         while (size) {
232                 hpa_t paddr;
233                 unsigned now;
234                 unsigned offset;
235                 hva_t guest_buf;
236                 gfn_t gfn;
237
238                 paddr = gva_to_hpa(vcpu, addr);
239
240                 if (is_error_hpa(paddr))
241                         break;
242
243                 gfn = vcpu->mmu.gva_to_gpa(vcpu, addr) >> PAGE_SHIFT;
244                 mark_page_dirty(vcpu->kvm, gfn);
245                 guest_buf = (hva_t)kmap_atomic(
246                                 pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0);
247                 offset = addr & ~PAGE_MASK;
248                 guest_buf |= offset;
249                 now = min(size, PAGE_SIZE - offset);
250                 memcpy((void*)guest_buf, host_buf, now);
251                 host_buf += now;
252                 addr += now;
253                 size -= now;
254                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
255         }
256         return req_size - size;
257 }
258 EXPORT_SYMBOL_GPL(kvm_write_guest);
259
260 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
261 {
262         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
263                 return;
264
265         vcpu->guest_fpu_loaded = 1;
266         fx_save(vcpu->host_fx_image);
267         fx_restore(vcpu->guest_fx_image);
268 }
269 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
270
271 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
272 {
273         if (!vcpu->guest_fpu_loaded)
274                 return;
275
276         vcpu->guest_fpu_loaded = 0;
277         fx_save(vcpu->guest_fx_image);
278         fx_restore(vcpu->host_fx_image);
279 }
280 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
281
282 /*
283  * Switches to specified vcpu, until a matching vcpu_put()
284  */
285 static void vcpu_load(struct kvm_vcpu *vcpu)
286 {
287         mutex_lock(&vcpu->mutex);
288         kvm_arch_ops->vcpu_load(vcpu);
289 }
290
291 /*
292  * Switches to specified vcpu, until a matching vcpu_put(). Will return NULL
293  * if the slot is not populated.
294  */
295 static struct kvm_vcpu *vcpu_load_slot(struct kvm *kvm, int slot)
296 {
297         struct kvm_vcpu *vcpu = &kvm->vcpus[slot];
298
299         mutex_lock(&vcpu->mutex);
300         if (!vcpu->vmcs) {
301                 mutex_unlock(&vcpu->mutex);
302                 return NULL;
303         }
304         kvm_arch_ops->vcpu_load(vcpu);
305         return vcpu;
306 }
307
308 static void vcpu_put(struct kvm_vcpu *vcpu)
309 {
310         kvm_arch_ops->vcpu_put(vcpu);
311         mutex_unlock(&vcpu->mutex);
312 }
313
314 static void ack_flush(void *_completed)
315 {
316         atomic_t *completed = _completed;
317
318         atomic_inc(completed);
319 }
320
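/*
 * Mark every vcpu as needing a TLB flush, then IPI the cpus that are
 * currently running a vcpu and wait for all of them to acknowledge.
 */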
321 void kvm_flush_remote_tlbs(struct kvm *kvm)
322 {
323         int i, cpu, needed;
324         cpumask_t cpus;
325         struct kvm_vcpu *vcpu;
326         atomic_t completed;
327
328         atomic_set(&completed, 0);
329         cpus_clear(cpus);
330         needed = 0;
331         for (i = 0; i < kvm->nvcpus; ++i) {
332                 vcpu = &kvm->vcpus[i];
333                 if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
334                         continue;
335                 cpu = vcpu->cpu;
336                 if (cpu != -1 && cpu != raw_smp_processor_id())
337                         if (!cpu_isset(cpu, cpus)) {
338                                 cpu_set(cpu, cpus);
339                                 ++needed;
340                         }
341         }
342
343         /*
344          * We really want smp_call_function_mask() here.  But that's not
345          * available, so ipi all cpus in parallel and wait for them
346          * to complete.
347          */
348         for (cpu = first_cpu(cpus); cpu != NR_CPUS; cpu = next_cpu(cpu, cpus))
349                 smp_call_function_single(cpu, ack_flush, &completed, 1, 0);
350         while (atomic_read(&completed) != needed) {
351                 cpu_relax();
352                 barrier();
353         }
354 }
355
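/*
 * Allocate and initialize a new vm: its lock, mmu page list and mmio bus,
 * plus the per-vcpu mutexes.  The vm is added to the global vm_list.
 */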
356 static struct kvm *kvm_create_vm(void)
357 {
358         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
359         int i;
360
361         if (!kvm)
362                 return ERR_PTR(-ENOMEM);
363
364         spin_lock_init(&kvm->lock);
365         INIT_LIST_HEAD(&kvm->active_mmu_pages);
366         spin_lock(&kvm_lock);
367         list_add(&kvm->vm_list, &vm_list);
368         spin_unlock(&kvm_lock);
369         kvm_io_bus_init(&kvm->mmio_bus);
370         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
371                 struct kvm_vcpu *vcpu = &kvm->vcpus[i];
372
373                 mutex_init(&vcpu->mutex);
374                 vcpu->cpu = -1;
375                 vcpu->kvm = kvm;
376                 vcpu->mmu.root_hpa = INVALID_PAGE;
377         }
378         return kvm;
379 }
380
381 static int kvm_dev_open(struct inode *inode, struct file *filp)
382 {
383         return 0;
384 }
385
386 /*
387  * Free any memory in @free but not in @dont.
388  */
389 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
390                                   struct kvm_memory_slot *dont)
391 {
392         int i;
393
394         if (!dont || free->phys_mem != dont->phys_mem)
395                 if (free->phys_mem) {
396                         for (i = 0; i < free->npages; ++i)
397                                 if (free->phys_mem[i])
398                                         __free_page(free->phys_mem[i]);
399                         vfree(free->phys_mem);
400                 }
401
402         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
403                 vfree(free->dirty_bitmap);
404
405         free->phys_mem = NULL;
406         free->npages = 0;
407         free->dirty_bitmap = NULL;
408 }
409
410 static void kvm_free_physmem(struct kvm *kvm)
411 {
412         int i;
413
414         for (i = 0; i < kvm->nmemslots; ++i)
415                 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
416 }
417
418 static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
419 {
420         int i;
421
422         for (i = 0; i < 2; ++i)
423                 if (vcpu->pio.guest_pages[i]) {
424                         __free_page(vcpu->pio.guest_pages[i]);
425                         vcpu->pio.guest_pages[i] = NULL;
426                 }
427 }
428
429 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
430 {
431         if (!vcpu->vmcs)
432                 return;
433
434         vcpu_load(vcpu);
435         kvm_mmu_unload(vcpu);
436         vcpu_put(vcpu);
437 }
438
439 static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
440 {
441         if (!vcpu->vmcs)
442                 return;
443
444         vcpu_load(vcpu);
445         kvm_mmu_destroy(vcpu);
446         vcpu_put(vcpu);
447         kvm_arch_ops->vcpu_free(vcpu);
448         free_page((unsigned long)vcpu->run);
449         vcpu->run = NULL;
450         free_page((unsigned long)vcpu->pio_data);
451         vcpu->pio_data = NULL;
452         free_pio_guest_pages(vcpu);
453 }
454
455 static void kvm_free_vcpus(struct kvm *kvm)
456 {
457         unsigned int i;
458
459         /*
460          * Unpin any mmu pages first.
461          */
462         for (i = 0; i < KVM_MAX_VCPUS; ++i)
463                 kvm_unload_vcpu_mmu(&kvm->vcpus[i]);
464         for (i = 0; i < KVM_MAX_VCPUS; ++i)
465                 kvm_free_vcpu(&kvm->vcpus[i]);
466 }
467
468 static int kvm_dev_release(struct inode *inode, struct file *filp)
469 {
470         return 0;
471 }
472
473 static void kvm_destroy_vm(struct kvm *kvm)
474 {
475         spin_lock(&kvm_lock);
476         list_del(&kvm->vm_list);
477         spin_unlock(&kvm_lock);
478         kvm_io_bus_destroy(&kvm->mmio_bus);
479         kvm_free_vcpus(kvm);
480         kvm_free_physmem(kvm);
481         kfree(kvm);
482 }
483
484 static int kvm_vm_release(struct inode *inode, struct file *filp)
485 {
486         struct kvm *kvm = filp->private_data;
487
488         kvm_destroy_vm(kvm);
489         return 0;
490 }
491
492 static void inject_gp(struct kvm_vcpu *vcpu)
493 {
494         kvm_arch_ops->inject_gp(vcpu, 0);
495 }
496
497 /*
498  * Load the pae pdptrs.  Return true if they are all valid.
499  */
500 static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
501 {
502         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
503         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
504         int i;
505         u64 pdpte;
506         u64 *pdpt;
507         int ret;
508         struct page *page;
509
510         spin_lock(&vcpu->kvm->lock);
511         page = gfn_to_page(vcpu->kvm, pdpt_gfn);
512         /* FIXME: !page - emulate? 0xff? */
513         pdpt = kmap_atomic(page, KM_USER0);
514
515         ret = 1;
516         for (i = 0; i < 4; ++i) {
517                 pdpte = pdpt[offset + i];
518                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) {
519                         ret = 0;
520                         goto out;
521                 }
522         }
523
524         for (i = 0; i < 4; ++i)
525                 vcpu->pdptrs[i] = pdpt[offset + i];
526
527 out:
528         kunmap_atomic(pdpt, KM_USER0);
529         spin_unlock(&vcpu->kvm->lock);
530
531         return ret;
532 }
533
534 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
535 {
536         if (cr0 & CR0_RESEVED_BITS) {
537                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
538                        cr0, vcpu->cr0);
539                 inject_gp(vcpu);
540                 return;
541         }
542
543         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
544                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
545                 inject_gp(vcpu);
546                 return;
547         }
548
549         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
550                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
551                        "and a clear PE flag\n");
552                 inject_gp(vcpu);
553                 return;
554         }
555
556         if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) {
557 #ifdef CONFIG_X86_64
558                 if ((vcpu->shadow_efer & EFER_LME)) {
559                         int cs_db, cs_l;
560
561                         if (!is_pae(vcpu)) {
562                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
563                                        "in long mode while PAE is disabled\n");
564                                 inject_gp(vcpu);
565                                 return;
566                         }
567                         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
568                         if (cs_l) {
569                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
570                                        "in long mode while CS.L == 1\n");
571                                 inject_gp(vcpu);
572                                 return;
573
574                         }
575                 } else
576 #endif
577                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
578                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
579                                "reserved bits\n");
580                         inject_gp(vcpu);
581                         return;
582                 }
583
584         }
585
586         kvm_arch_ops->set_cr0(vcpu, cr0);
587         vcpu->cr0 = cr0;
588
589         spin_lock(&vcpu->kvm->lock);
590         kvm_mmu_reset_context(vcpu);
591         spin_unlock(&vcpu->kvm->lock);
592         return;
593 }
594 EXPORT_SYMBOL_GPL(set_cr0);
595
596 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
597 {
598         set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
599 }
600 EXPORT_SYMBOL_GPL(lmsw);
601
602 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
603 {
604         if (cr4 & CR4_RESEVED_BITS) {
605                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
606                 inject_gp(vcpu);
607                 return;
608         }
609
610         if (is_long_mode(vcpu)) {
611                 if (!(cr4 & CR4_PAE_MASK)) {
612                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
613                                "in long mode\n");
614                         inject_gp(vcpu);
615                         return;
616                 }
617         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK)
618                    && !load_pdptrs(vcpu, vcpu->cr3)) {
619                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
620                 inject_gp(vcpu);
621         }
622
623         if (cr4 & CR4_VMXE_MASK) {
624                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
625                 inject_gp(vcpu);
626                 return;
627         }
628         kvm_arch_ops->set_cr4(vcpu, cr4);
629         spin_lock(&vcpu->kvm->lock);
630         kvm_mmu_reset_context(vcpu);
631         spin_unlock(&vcpu->kvm->lock);
632 }
633 EXPORT_SYMBOL_GPL(set_cr4);
634
635 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
636 {
637         if (is_long_mode(vcpu)) {
638                 if (cr3 & CR3_L_MODE_RESEVED_BITS) {
639                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
640                         inject_gp(vcpu);
641                         return;
642                 }
643         } else {
644                 if (cr3 & CR3_RESEVED_BITS) {
645                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
646                         inject_gp(vcpu);
647                         return;
648                 }
649                 if (is_paging(vcpu) && is_pae(vcpu) &&
650                     !load_pdptrs(vcpu, cr3)) {
651                         printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
652                                "reserved bits\n");
653                         inject_gp(vcpu);
654                         return;
655                 }
656         }
657
658         vcpu->cr3 = cr3;
659         spin_lock(&vcpu->kvm->lock);
660         /*
661          * Does the new cr3 value map to physical memory? (Note, we
662          * catch an invalid cr3 even in real-mode, because it would
663          * cause trouble later on when we turn on paging anyway.)
664          *
665          * A real CPU would silently accept an invalid cr3 and would
666          * attempt to use it - with largely undefined (and often hard
667          * to debug) behavior on the guest side.
668          */
669         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
670                 inject_gp(vcpu);
671         else
672                 vcpu->mmu.new_cr3(vcpu);
673         spin_unlock(&vcpu->kvm->lock);
674 }
675 EXPORT_SYMBOL_GPL(set_cr3);
676
677 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
678 {
679         if (cr8 & CR8_RESEVED_BITS) {
680                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
681                 inject_gp(vcpu);
682                 return;
683         }
684         vcpu->cr8 = cr8;
685 }
686 EXPORT_SYMBOL_GPL(set_cr8);
687
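/*
 * Initialize the guest fpu image to the processor's reset state (with a
 * default mxcsr) while preserving the host's fpu contents.
 */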
688 void fx_init(struct kvm_vcpu *vcpu)
689 {
690         struct __attribute__ ((__packed__)) fx_image_s {
691                 u16 control; //fcw
692                 u16 status; //fsw
693                 u16 tag; // ftw
694                 u16 opcode; //fop
695                 u64 ip; // fpu ip
696                 u64 operand;// fpu dp
697                 u32 mxcsr;
698                 u32 mxcsr_mask;
699
700         } *fx_image;
701
702         fx_save(vcpu->host_fx_image);
703         fpu_init();
704         fx_save(vcpu->guest_fx_image);
705         fx_restore(vcpu->host_fx_image);
706
707         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
708         fx_image->mxcsr = 0x1f80;
709         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
710                0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
711 }
712 EXPORT_SYMBOL_GPL(fx_init);
713
714 static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot)
715 {
716         spin_lock(&vcpu->kvm->lock);
717         kvm_mmu_slot_remove_write_access(vcpu, slot);
718         spin_unlock(&vcpu->kvm->lock);
719 }
720
721 /*
722  * Allocate some memory and give it an address in the guest physical address
723  * space.
724  *
725  * Discontiguous memory is allowed, mostly for framebuffers.
726  */
727 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
728                                           struct kvm_memory_region *mem)
729 {
730         int r;
731         gfn_t base_gfn;
732         unsigned long npages;
733         unsigned long i;
734         struct kvm_memory_slot *memslot;
735         struct kvm_memory_slot old, new;
736         int memory_config_version;
737
738         r = -EINVAL;
739         /* General sanity checks */
740         if (mem->memory_size & (PAGE_SIZE - 1))
741                 goto out;
742         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
743                 goto out;
744         if (mem->slot >= KVM_MEMORY_SLOTS)
745                 goto out;
746         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
747                 goto out;
748
749         memslot = &kvm->memslots[mem->slot];
750         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
751         npages = mem->memory_size >> PAGE_SHIFT;
752
753         if (!npages)
754                 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
755
756 raced:
757         spin_lock(&kvm->lock);
758
759         memory_config_version = kvm->memory_config_version;
760         new = old = *memslot;
761
762         new.base_gfn = base_gfn;
763         new.npages = npages;
764         new.flags = mem->flags;
765
766         /* Disallow changing a memory slot's size. */
767         r = -EINVAL;
768         if (npages && old.npages && npages != old.npages)
769                 goto out_unlock;
770
771         /* Check for overlaps */
772         r = -EEXIST;
773         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
774                 struct kvm_memory_slot *s = &kvm->memslots[i];
775
776                 if (s == memslot)
777                         continue;
778                 if (!((base_gfn + npages <= s->base_gfn) ||
779                       (base_gfn >= s->base_gfn + s->npages)))
780                         goto out_unlock;
781         }
782         /*
783          * Do memory allocations outside lock.  memory_config_version will
784          * detect any races.
785          */
786         spin_unlock(&kvm->lock);
787
788         /* Deallocate if slot is being removed */
789         if (!npages)
790                 new.phys_mem = NULL;
791
792         /* Free page dirty bitmap if unneeded */
793         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
794                 new.dirty_bitmap = NULL;
795
796         r = -ENOMEM;
797
798         /* Allocate if a slot is being created */
799         if (npages && !new.phys_mem) {
800                 new.phys_mem = vmalloc(npages * sizeof(struct page *));
801
802                 if (!new.phys_mem)
803                         goto out_free;
804
805                 memset(new.phys_mem, 0, npages * sizeof(struct page *));
806                 for (i = 0; i < npages; ++i) {
807                         new.phys_mem[i] = alloc_page(GFP_HIGHUSER
808                                                      | __GFP_ZERO);
809                         if (!new.phys_mem[i])
810                                 goto out_free;
811                         set_page_private(new.phys_mem[i], 0);
812                 }
813         }
814
815         /* Allocate page dirty bitmap if needed */
816         if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
817                 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
818
819                 new.dirty_bitmap = vmalloc(dirty_bytes);
820                 if (!new.dirty_bitmap)
821                         goto out_free;
822                 memset(new.dirty_bitmap, 0, dirty_bytes);
823         }
824
825         spin_lock(&kvm->lock);
826
827         if (memory_config_version != kvm->memory_config_version) {
828                 spin_unlock(&kvm->lock);
829                 kvm_free_physmem_slot(&new, &old);
830                 goto raced;
831         }
832
833         r = -EAGAIN;
834         if (kvm->busy)
835                 goto out_unlock;
836
837         if (mem->slot >= kvm->nmemslots)
838                 kvm->nmemslots = mem->slot + 1;
839
840         *memslot = new;
841         ++kvm->memory_config_version;
842
843         spin_unlock(&kvm->lock);
844
845         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
846                 struct kvm_vcpu *vcpu;
847
848                 vcpu = vcpu_load_slot(kvm, i);
849                 if (!vcpu)
850                         continue;
851                 if (new.flags & KVM_MEM_LOG_DIRTY_PAGES)
852                         do_remove_write_access(vcpu, mem->slot);
853                 kvm_mmu_reset_context(vcpu);
854                 vcpu_put(vcpu);
855         }
856
857         kvm_free_physmem_slot(&old, &new);
858         return 0;
859
860 out_unlock:
861         spin_unlock(&kvm->lock);
862 out_free:
863         kvm_free_physmem_slot(&new, &old);
864 out:
865         return r;
866 }
867
868 /*
869  * Get (and clear) the dirty memory log for a memory slot.
870  */
871 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
872                                       struct kvm_dirty_log *log)
873 {
874         struct kvm_memory_slot *memslot;
875         int r, i;
876         int n;
877         int cleared;
878         unsigned long any = 0;
879
880         spin_lock(&kvm->lock);
881
882         /*
883          * Prevent changes to guest memory configuration even while the lock
884          * is not taken.
885          */
886         ++kvm->busy;
887         spin_unlock(&kvm->lock);
888         r = -EINVAL;
889         if (log->slot >= KVM_MEMORY_SLOTS)
890                 goto out;
891
892         memslot = &kvm->memslots[log->slot];
893         r = -ENOENT;
894         if (!memslot->dirty_bitmap)
895                 goto out;
896
897         n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
898
899         for (i = 0; !any && i < n/sizeof(long); ++i)
900                 any = memslot->dirty_bitmap[i];
901
902         r = -EFAULT;
903         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
904                 goto out;
905
906         if (any) {
907                 cleared = 0;
908                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
909                         struct kvm_vcpu *vcpu;
910
911                         vcpu = vcpu_load_slot(kvm, i);
912                         if (!vcpu)
913                                 continue;
914                         if (!cleared) {
915                                 do_remove_write_access(vcpu, log->slot);
916                                 memset(memslot->dirty_bitmap, 0, n);
917                                 cleared = 1;
918                         }
919                         kvm_arch_ops->tlb_flush(vcpu);
920                         vcpu_put(vcpu);
921                 }
922         }
923
924         r = 0;
925
926 out:
927         spin_lock(&kvm->lock);
928         --kvm->busy;
929         spin_unlock(&kvm->lock);
930         return r;
931 }
932
933 /*
934  * Set a new alias region.  Aliases map a portion of physical memory into
935  * another portion.  This is useful for memory windows, for example the PC
936  * VGA region.
937  */
938 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
939                                          struct kvm_memory_alias *alias)
940 {
941         int r, n;
942         struct kvm_mem_alias *p;
943
944         r = -EINVAL;
945         /* General sanity checks */
946         if (alias->memory_size & (PAGE_SIZE - 1))
947                 goto out;
948         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
949                 goto out;
950         if (alias->slot >= KVM_ALIAS_SLOTS)
951                 goto out;
952         if (alias->guest_phys_addr + alias->memory_size
953             < alias->guest_phys_addr)
954                 goto out;
955         if (alias->target_phys_addr + alias->memory_size
956             < alias->target_phys_addr)
957                 goto out;
958
959         spin_lock(&kvm->lock);
960
961         p = &kvm->aliases[alias->slot];
962         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
963         p->npages = alias->memory_size >> PAGE_SHIFT;
964         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
965
966         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
967                 if (kvm->aliases[n - 1].npages)
968                         break;
969         kvm->naliases = n;
970
971         spin_unlock(&kvm->lock);
972
973         vcpu_load(&kvm->vcpus[0]);
974         spin_lock(&kvm->lock);
975         kvm_mmu_zap_all(&kvm->vcpus[0]);
976         spin_unlock(&kvm->lock);
977         vcpu_put(&kvm->vcpus[0]);
978
979         return 0;
980
981 out:
982         return r;
983 }
984
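/*
 * Translate a guest frame number through the alias table; gfns outside
 * any alias region are returned unchanged.
 */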
985 static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
986 {
987         int i;
988         struct kvm_mem_alias *alias;
989
990         for (i = 0; i < kvm->naliases; ++i) {
991                 alias = &kvm->aliases[i];
992                 if (gfn >= alias->base_gfn
993                     && gfn < alias->base_gfn + alias->npages)
994                         return alias->target_gfn + gfn - alias->base_gfn;
995         }
996         return gfn;
997 }
998
999 static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
1000 {
1001         int i;
1002
1003         for (i = 0; i < kvm->nmemslots; ++i) {
1004                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
1005
1006                 if (gfn >= memslot->base_gfn
1007                     && gfn < memslot->base_gfn + memslot->npages)
1008                         return memslot;
1009         }
1010         return NULL;
1011 }
1012
1013 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
1014 {
1015         gfn = unalias_gfn(kvm, gfn);
1016         return __gfn_to_memslot(kvm, gfn);
1017 }
1018
1019 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1020 {
1021         struct kvm_memory_slot *slot;
1022
1023         gfn = unalias_gfn(kvm, gfn);
1024         slot = __gfn_to_memslot(kvm, gfn);
1025         if (!slot)
1026                 return NULL;
1027         return slot->phys_mem[gfn - slot->base_gfn];
1028 }
1029 EXPORT_SYMBOL_GPL(gfn_to_page);
1030
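/*
 * Set the dirty bit for @gfn in the owning memslot's dirty bitmap, if
 * dirty logging is enabled for that slot.
 */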
1031 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1032 {
1033         int i;
1034         struct kvm_memory_slot *memslot;
1035         unsigned long rel_gfn;
1036
1037         for (i = 0; i < kvm->nmemslots; ++i) {
1038                 memslot = &kvm->memslots[i];
1039
1040                 if (gfn >= memslot->base_gfn
1041                     && gfn < memslot->base_gfn + memslot->npages) {
1042
1043                         if (!memslot->dirty_bitmap)
1044                                 return;
1045
1046                         rel_gfn = gfn - memslot->base_gfn;
1047
1048                         /* avoid RMW */
1049                         if (!test_bit(rel_gfn, memslot->dirty_bitmap))
1050                                 set_bit(rel_gfn, memslot->dirty_bitmap);
1051                         return;
1052                 }
1053         }
1054 }
1055
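/*
 * Emulator callback for ordinary (non-MMIO) reads: translate the guest
 * virtual address page by page and copy from guest memory.
 */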
1056 static int emulator_read_std(unsigned long addr,
1057                              void *val,
1058                              unsigned int bytes,
1059                              struct x86_emulate_ctxt *ctxt)
1060 {
1061         struct kvm_vcpu *vcpu = ctxt->vcpu;
1062         void *data = val;
1063
1064         while (bytes) {
1065                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1066                 unsigned offset = addr & (PAGE_SIZE-1);
1067                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1068                 unsigned long pfn;
1069                 struct page *page;
1070                 void *page_virt;
1071
1072                 if (gpa == UNMAPPED_GVA)
1073                         return X86EMUL_PROPAGATE_FAULT;
1074                 pfn = gpa >> PAGE_SHIFT;
1075                 page = gfn_to_page(vcpu->kvm, pfn);
1076                 if (!page)
1077                         return X86EMUL_UNHANDLEABLE;
1078                 page_virt = kmap_atomic(page, KM_USER0);
1079
1080                 memcpy(data, page_virt + offset, tocopy);
1081
1082                 kunmap_atomic(page_virt, KM_USER0);
1083
1084                 bytes -= tocopy;
1085                 data += tocopy;
1086                 addr += tocopy;
1087         }
1088
1089         return X86EMUL_CONTINUE;
1090 }
1091
1092 static int emulator_write_std(unsigned long addr,
1093                               const void *val,
1094                               unsigned int bytes,
1095                               struct x86_emulate_ctxt *ctxt)
1096 {
1097         printk(KERN_ERR "emulator_write_std: addr %lx n %d\n",
1098                addr, bytes);
1099         return X86EMUL_UNHANDLEABLE;
1100 }
1101
1102 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1103                                                 gpa_t addr)
1104 {
1105         /*
1106          * Note that it's important to have this wrapper function because
1107          * in the very near future we will be checking for MMIOs against
1108          * the LAPIC as well as the general MMIO bus
1109          */
1110         return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1111 }
1112
1113 static int emulator_read_emulated(unsigned long addr,
1114                                   void *val,
1115                                   unsigned int bytes,
1116                                   struct x86_emulate_ctxt *ctxt)
1117 {
1118         struct kvm_vcpu      *vcpu = ctxt->vcpu;
1119         struct kvm_io_device *mmio_dev;
1120         gpa_t                 gpa;
1121
1122         if (vcpu->mmio_read_completed) {
1123                 memcpy(val, vcpu->mmio_data, bytes);
1124                 vcpu->mmio_read_completed = 0;
1125                 return X86EMUL_CONTINUE;
1126         } else if (emulator_read_std(addr, val, bytes, ctxt)
1127                    == X86EMUL_CONTINUE)
1128                 return X86EMUL_CONTINUE;
1129
1130         gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1131         if (gpa == UNMAPPED_GVA)
1132                 return X86EMUL_PROPAGATE_FAULT;
1133
1134         /*
1135          * Is this MMIO handled locally?
1136          */
1137         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1138         if (mmio_dev) {
1139                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1140                 return X86EMUL_CONTINUE;
1141         }
1142
1143         vcpu->mmio_needed = 1;
1144         vcpu->mmio_phys_addr = gpa;
1145         vcpu->mmio_size = bytes;
1146         vcpu->mmio_is_write = 0;
1147
1148         return X86EMUL_UNHANDLEABLE;
1149 }
1150
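/*
 * Write @bytes of @val directly into guest physical memory at @gpa
 * (within a single page only), updating the shadow page tables and the
 * dirty log.  Returns 1 on success, 0 if the write cannot be done here.
 */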
1151 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1152                                const void *val, int bytes)
1153 {
1154         struct page *page;
1155         void *virt;
1156         unsigned offset = offset_in_page(gpa);
1157
1158         if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
1159                 return 0;
1160         page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1161         if (!page)
1162                 return 0;
1163         mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
1164         virt = kmap_atomic(page, KM_USER0);
1165         kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes);
1166         memcpy(virt + offset_in_page(gpa), val, bytes);
1167         kunmap_atomic(virt, KM_USER0);
1168         return 1;
1169 }
1170
1171 static int emulator_write_emulated(unsigned long addr,
1172                                    const void *val,
1173                                    unsigned int bytes,
1174                                    struct x86_emulate_ctxt *ctxt)
1175 {
1176         struct kvm_vcpu      *vcpu = ctxt->vcpu;
1177         struct kvm_io_device *mmio_dev;
1178         gpa_t                 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1179
1180         if (gpa == UNMAPPED_GVA) {
1181                 kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
1182                 return X86EMUL_PROPAGATE_FAULT;
1183         }
1184
1185         if (emulator_write_phys(vcpu, gpa, val, bytes))
1186                 return X86EMUL_CONTINUE;
1187
1188         /*
1189          * Is this MMIO handled locally?
1190          */
1191         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1192         if (mmio_dev) {
1193                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1194                 return X86EMUL_CONTINUE;
1195         }
1196
1197         vcpu->mmio_needed = 1;
1198         vcpu->mmio_phys_addr = gpa;
1199         vcpu->mmio_size = bytes;
1200         vcpu->mmio_is_write = 1;
1201         memcpy(vcpu->mmio_data, val, bytes);
1202
1203         return X86EMUL_CONTINUE;
1204 }
1205
1206 static int emulator_cmpxchg_emulated(unsigned long addr,
1207                                      const void *old,
1208                                      const void *new,
1209                                      unsigned int bytes,
1210                                      struct x86_emulate_ctxt *ctxt)
1211 {
1212         static int reported;
1213
1214         if (!reported) {
1215                 reported = 1;
1216                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1217         }
1218         return emulator_write_emulated(addr, new, bytes, ctxt);
1219 }
1220
1221 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1222 {
1223         return kvm_arch_ops->get_segment_base(vcpu, seg);
1224 }
1225
1226 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1227 {
1228         return X86EMUL_CONTINUE;
1229 }
1230
1231 int emulate_clts(struct kvm_vcpu *vcpu)
1232 {
1233         unsigned long cr0;
1234
1235         cr0 = vcpu->cr0 & ~CR0_TS_MASK;
1236         kvm_arch_ops->set_cr0(vcpu, cr0);
1237         return X86EMUL_CONTINUE;
1238 }
1239
1240 int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest)
1241 {
1242         struct kvm_vcpu *vcpu = ctxt->vcpu;
1243
1244         switch (dr) {
1245         case 0 ... 3:
1246                 *dest = kvm_arch_ops->get_dr(vcpu, dr);
1247                 return X86EMUL_CONTINUE;
1248         default:
1249                 printk(KERN_DEBUG "%s: unexpected dr %u\n",
1250                        __FUNCTION__, dr);
1251                 return X86EMUL_UNHANDLEABLE;
1252         }
1253 }
1254
1255 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1256 {
1257         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1258         int exception;
1259
1260         kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1261         if (exception) {
1262                 /* FIXME: better handling */
1263                 return X86EMUL_UNHANDLEABLE;
1264         }
1265         return X86EMUL_CONTINUE;
1266 }
1267
1268 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
1269 {
1270         static int reported;
1271         u8 opcodes[4];
1272         unsigned long rip = ctxt->vcpu->rip;
1273         unsigned long rip_linear;
1274
1275         rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS);
1276
1277         if (reported)
1278                 return;
1279
1280         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
1281
1282         printk(KERN_ERR "emulation failed but !mmio_needed?"
1283                " rip %lx %02x %02x %02x %02x\n",
1284                rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1285         reported = 1;
1286 }
1287
1288 struct x86_emulate_ops emulate_ops = {
1289         .read_std            = emulator_read_std,
1290         .write_std           = emulator_write_std,
1291         .read_emulated       = emulator_read_emulated,
1292         .write_emulated      = emulator_write_emulated,
1293         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1294 };
1295
1296 int emulate_instruction(struct kvm_vcpu *vcpu,
1297                         struct kvm_run *run,
1298                         unsigned long cr2,
1299                         u16 error_code)
1300 {
1301         struct x86_emulate_ctxt emulate_ctxt;
1302         int r;
1303         int cs_db, cs_l;
1304
1305         vcpu->mmio_fault_cr2 = cr2;
1306         kvm_arch_ops->cache_regs(vcpu);
1307
1308         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1309
1310         emulate_ctxt.vcpu = vcpu;
1311         emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu);
1312         emulate_ctxt.cr2 = cr2;
1313         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1314                 ? X86EMUL_MODE_REAL : cs_l
1315                 ? X86EMUL_MODE_PROT64 : cs_db
1316                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1317
1318         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1319                 emulate_ctxt.cs_base = 0;
1320                 emulate_ctxt.ds_base = 0;
1321                 emulate_ctxt.es_base = 0;
1322                 emulate_ctxt.ss_base = 0;
1323         } else {
1324                 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
1325                 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
1326                 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
1327                 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
1328         }
1329
1330         emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
1331         emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1332
1333         vcpu->mmio_is_write = 0;
1334         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1335
1336         if ((r || vcpu->mmio_is_write) && run) {
1337                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1338                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1339                 run->mmio.len = vcpu->mmio_size;
1340                 run->mmio.is_write = vcpu->mmio_is_write;
1341         }
1342
1343         if (r) {
1344                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1345                         return EMULATE_DONE;
1346                 if (!vcpu->mmio_needed) {
1347                         report_emulation_failure(&emulate_ctxt);
1348                         return EMULATE_FAIL;
1349                 }
1350                 return EMULATE_DO_MMIO;
1351         }
1352
1353         kvm_arch_ops->decache_regs(vcpu);
1354         kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1355
1356         if (vcpu->mmio_is_write) {
1357                 vcpu->mmio_needed = 0;
1358                 return EMULATE_DO_MMIO;
1359         }
1360
1361         return EMULATE_DONE;
1362 }
1363 EXPORT_SYMBOL_GPL(emulate_instruction);
1364
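/*
 * Emulate HLT: if an interrupt is pending, keep running the vcpu;
 * otherwise exit to userspace with KVM_EXIT_HLT.
 */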
1365 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
1366 {
1367         if (vcpu->irq_summary)
1368                 return 1;
1369
1370         vcpu->run->exit_reason = KVM_EXIT_HLT;
1371         ++vcpu->stat.halt_exits;
1372         return 0;
1373 }
1374 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
1375
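/*
 * Handle a guest hypercall.  The argument registers differ between long
 * mode and 32-bit guests; calls we do not recognize are handed to
 * userspace via the hypercall exit fields in struct kvm_run.
 */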
1376 int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1377 {
1378         unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
1379
1380         kvm_arch_ops->cache_regs(vcpu);
1381         ret = -KVM_EINVAL;
1382 #ifdef CONFIG_X86_64
1383         if (is_long_mode(vcpu)) {
1384                 nr = vcpu->regs[VCPU_REGS_RAX];
1385                 a0 = vcpu->regs[VCPU_REGS_RDI];
1386                 a1 = vcpu->regs[VCPU_REGS_RSI];
1387                 a2 = vcpu->regs[VCPU_REGS_RDX];
1388                 a3 = vcpu->regs[VCPU_REGS_RCX];
1389                 a4 = vcpu->regs[VCPU_REGS_R8];
1390                 a5 = vcpu->regs[VCPU_REGS_R9];
1391         } else
1392 #endif
1393         {
1394                 nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
1395                 a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
1396                 a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
1397                 a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
1398                 a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
1399                 a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
1400                 a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
1401         }
1402         switch (nr) {
1403         default:
1404                 run->hypercall.args[0] = a0;
1405                 run->hypercall.args[1] = a1;
1406                 run->hypercall.args[2] = a2;
1407                 run->hypercall.args[3] = a3;
1408                 run->hypercall.args[4] = a4;
1409                 run->hypercall.args[5] = a5;
1410                 run->hypercall.ret = ret;
1411                 run->hypercall.longmode = is_long_mode(vcpu);
1412                 kvm_arch_ops->decache_regs(vcpu);
1413                 return 0;
1414         }
1415         vcpu->regs[VCPU_REGS_RAX] = ret;
1416         kvm_arch_ops->decache_regs(vcpu);
1417         return 1;
1418 }
1419 EXPORT_SYMBOL_GPL(kvm_hypercall);
1420
1421 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1422 {
1423         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1424 }
1425
1426 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1427 {
1428         struct descriptor_table dt = { limit, base };
1429
1430         kvm_arch_ops->set_gdt(vcpu, &dt);
1431 }
1432
1433 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1434 {
1435         struct descriptor_table dt = { limit, base };
1436
1437         kvm_arch_ops->set_idt(vcpu, &dt);
1438 }
1439
1440 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1441                    unsigned long *rflags)
1442 {
1443         lmsw(vcpu, msw);
1444         *rflags = kvm_arch_ops->get_rflags(vcpu);
1445 }
1446
1447 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1448 {
1449         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
1450         switch (cr) {
1451         case 0:
1452                 return vcpu->cr0;
1453         case 2:
1454                 return vcpu->cr2;
1455         case 3:
1456                 return vcpu->cr3;
1457         case 4:
1458                 return vcpu->cr4;
1459         default:
1460                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1461                 return 0;
1462         }
1463 }
1464
1465 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1466                      unsigned long *rflags)
1467 {
1468         switch (cr) {
1469         case 0:
1470                 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1471                 *rflags = kvm_arch_ops->get_rflags(vcpu);
1472                 break;
1473         case 2:
1474                 vcpu->cr2 = val;
1475                 break;
1476         case 3:
1477                 set_cr3(vcpu, val);
1478                 break;
1479         case 4:
1480                 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1481                 break;
1482         default:
1483                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1484         }
1485 }
1486
1487 /*
1488  * Register the para guest with the host:
1489  */
1490 static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
1491 {
1492         struct kvm_vcpu_para_state *para_state;
1493         hpa_t para_state_hpa, hypercall_hpa;
1494         struct page *para_state_page;
1495         unsigned char *hypercall;
1496         gpa_t hypercall_gpa;
1497
1498         printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
1499         printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
1500
1501         /*
1502          * Needs to be page aligned:
1503          */
1504         if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
1505                 goto err_gp;
1506
1507         para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
1508         printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
1509         if (is_error_hpa(para_state_hpa))
1510                 goto err_gp;
1511
1512         mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
1513         para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
1514         para_state = kmap_atomic(para_state_page, KM_USER0);
1515
1516         printk(KERN_DEBUG "....  guest version: %d\n", para_state->guest_version);
1517         printk(KERN_DEBUG "....           size: %d\n", para_state->size);
1518
1519         para_state->host_version = KVM_PARA_API_VERSION;
1520         /*
1521          * We cannot support guests that try to register themselves
1522          * with a newer API version than the host supports:
1523          */
1524         if (para_state->guest_version > KVM_PARA_API_VERSION) {
1525                 para_state->ret = -KVM_EINVAL;
1526                 goto err_kunmap_skip;
1527         }
1528
1529         hypercall_gpa = para_state->hypercall_gpa;
1530         hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
1531         printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
1532         if (is_error_hpa(hypercall_hpa)) {
1533                 para_state->ret = -KVM_EINVAL;
1534                 goto err_kunmap_skip;
1535         }
1536
1537         printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
1538         vcpu->para_state_page = para_state_page;
1539         vcpu->para_state_gpa = para_state_gpa;
1540         vcpu->hypercall_gpa = hypercall_gpa;
1541
1542         mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
1543         hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
1544                                 KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
1545         kvm_arch_ops->patch_hypercall(vcpu, hypercall);
1546         kunmap_atomic(hypercall, KM_USER1);
1547
1548         para_state->ret = 0;
1549 err_kunmap_skip:
1550         kunmap_atomic(para_state, KM_USER0);
1551         return 0;
1552 err_gp:
1553         return 1;
1554 }
1555
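/*
 * Read an MSR whose handling is shared by the vmx and svm backends.
 * Returns 0 and fills *pdata on success, 1 for an unrecognized MSR.
 */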
1556 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1557 {
1558         u64 data;
1559
1560         switch (msr) {
1561         case 0xc0010010: /* SYSCFG */
1562         case 0xc0010015: /* HWCR */
1563         case MSR_IA32_PLATFORM_ID:
1564         case MSR_IA32_P5_MC_ADDR:
1565         case MSR_IA32_P5_MC_TYPE:
1566         case MSR_IA32_MC0_CTL:
1567         case MSR_IA32_MCG_STATUS:
1568         case MSR_IA32_MCG_CAP:
1569         case MSR_IA32_MC0_MISC:
1570         case MSR_IA32_MC0_MISC+4:
1571         case MSR_IA32_MC0_MISC+8:
1572         case MSR_IA32_MC0_MISC+12:
1573         case MSR_IA32_MC0_MISC+16:
1574         case MSR_IA32_UCODE_REV:
1575         case MSR_IA32_PERF_STATUS:
1576         case MSR_IA32_EBL_CR_POWERON:
1577                 /* MTRR registers */
1578         case 0xfe:
1579         case 0x200 ... 0x2ff:
1580                 data = 0;
1581                 break;
1582         case 0xcd: /* fsb frequency */
1583                 data = 3;
1584                 break;
1585         case MSR_IA32_APICBASE:
1586                 data = vcpu->apic_base;
1587                 break;
1588         case MSR_IA32_MISC_ENABLE:
1589                 data = vcpu->ia32_misc_enable_msr;
1590                 break;
1591 #ifdef CONFIG_X86_64
1592         case MSR_EFER:
1593                 data = vcpu->shadow_efer;
1594                 break;
1595 #endif
1596         default:
1597                 printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", msr);
1598                 return 1;
1599         }
1600         *pdata = data;
1601         return 0;
1602 }
1603 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1604
1605 /*
1606  * Reads an msr value (of 'msr_index') into 'pdata'.
1607  * Returns 0 on success, non-0 otherwise.
1608  * Assumes vcpu_load() was already called.
1609  */
1610 static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1611 {
1612         return kvm_arch_ops->get_msr(vcpu, msr_index, pdata);
1613 }
1614
1615 #ifdef CONFIG_X86_64
1616
1617 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1618 {
1619         if (efer & EFER_RESERVED_BITS) {
1620                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1621                        efer);
1622                 inject_gp(vcpu);
1623                 return;
1624         }
1625
1626         if (is_paging(vcpu)
1627             && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1628                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1629                 inject_gp(vcpu);
1630                 return;
1631         }
1632
1633         kvm_arch_ops->set_efer(vcpu, efer);
1634
1635         efer &= ~EFER_LMA;
1636         efer |= vcpu->shadow_efer & EFER_LMA;
1637
1638         vcpu->shadow_efer = efer;
1639 }
1640
1641 #endif
1642
1643 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1644 {
1645         switch (msr) {
1646 #ifdef CONFIG_X86_64
1647         case MSR_EFER:
1648                 set_efer(vcpu, data);
1649                 break;
1650 #endif
1651         case MSR_IA32_MC0_STATUS:
1652                 printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1653                        __FUNCTION__, data);
1654                 break;
1655         case MSR_IA32_MCG_STATUS:
1656                 printk(KERN_WARNING "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
1657                         __FUNCTION__, data);
1658                 break;
1659         case MSR_IA32_UCODE_REV:
1660         case MSR_IA32_UCODE_WRITE:
1661         case 0x200 ... 0x2ff: /* MTRRs */
1662                 break;
1663         case MSR_IA32_APICBASE:
1664                 vcpu->apic_base = data;
1665                 break;
1666         case MSR_IA32_MISC_ENABLE:
1667                 vcpu->ia32_misc_enable_msr = data;
1668                 break;
1669         /*
1670          * This is the 'probe whether the host is KVM' logic:
1671          */
1672         case MSR_KVM_API_MAGIC:
1673                 return vcpu_register_para(vcpu, data);
1674
1675         default:
1676                 printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr);
1677                 return 1;
1678         }
1679         return 0;
1680 }
1681 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1682
1683 /*
1684  * Writes msr value into the appropriate "register".
1685  * Returns 0 on success, non-0 otherwise.
1686  * Assumes vcpu_load() was already called.
1687  */
1688 static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1689 {
1690         return kvm_arch_ops->set_msr(vcpu, msr_index, data);
1691 }
1692
1693 void kvm_resched(struct kvm_vcpu *vcpu)
1694 {
1695         if (!need_resched())
1696                 return;
1697         vcpu_put(vcpu);
1698         cond_resched();
1699         vcpu_load(vcpu);
1700 }
1701 EXPORT_SYMBOL_GPL(kvm_resched);
1702
1703 void load_msrs(struct vmx_msr_entry *e, int n)
1704 {
1705         int i;
1706
1707         for (i = 0; i < n; ++i)
1708                 wrmsrl(e[i].index, e[i].data);
1709 }
1710 EXPORT_SYMBOL_GPL(load_msrs);
1711
1712 void save_msrs(struct vmx_msr_entry *e, int n)
1713 {
1714         int i;
1715
1716         for (i = 0; i < n; ++i)
1717                 rdmsrl(e[i].index, e[i].data);
1718 }
1719 EXPORT_SYMBOL_GPL(save_msrs);
1720
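/*
 * Emulate CPUID using the table userspace installed with KVM_SET_CPUID.
 * An exact leaf match wins; otherwise we fall back to the highest-numbered
 * entry in the same class (basic vs. extended leaves), and with no match
 * at all the output registers read as zero.
 */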
1721 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1722 {
1723         int i;
1724         u32 function;
1725         struct kvm_cpuid_entry *e, *best;
1726
1727         kvm_arch_ops->cache_regs(vcpu);
1728         function = vcpu->regs[VCPU_REGS_RAX];
1729         vcpu->regs[VCPU_REGS_RAX] = 0;
1730         vcpu->regs[VCPU_REGS_RBX] = 0;
1731         vcpu->regs[VCPU_REGS_RCX] = 0;
1732         vcpu->regs[VCPU_REGS_RDX] = 0;
1733         best = NULL;
1734         for (i = 0; i < vcpu->cpuid_nent; ++i) {
1735                 e = &vcpu->cpuid_entries[i];
1736                 if (e->function == function) {
1737                         best = e;
1738                         break;
1739                 }
1740                 /*
1741                  * Both basic or both extended?
1742                  */
1743                 if (((e->function ^ function) & 0x80000000) == 0)
1744                         if (!best || e->function > best->function)
1745                                 best = e;
1746         }
1747         if (best) {
1748                 vcpu->regs[VCPU_REGS_RAX] = best->eax;
1749                 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1750                 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1751                 vcpu->regs[VCPU_REGS_RDX] = best->edx;
1752         }
1753         kvm_arch_ops->decache_regs(vcpu);
1754         kvm_arch_ops->skip_emulated_instruction(vcpu);
1755 }
1756 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
1757
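/*
 * Copy string I/O data between the per-vcpu pio_data page and the pinned
 * guest pages.  The guest buffer may straddle a page boundary, hence the
 * vmap() of up to two pages; an IN copies towards the guest buffer, an
 * OUT copies from it.
 */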
1758 static int pio_copy_data(struct kvm_vcpu *vcpu)
1759 {
1760         void *p = vcpu->pio_data;
1761         void *q;
1762         unsigned bytes;
1763         int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
1764
1765         kvm_arch_ops->vcpu_put(vcpu);
1766         q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1767                  PAGE_KERNEL);
1768         if (!q) {
1769                 kvm_arch_ops->vcpu_load(vcpu);
1770                 free_pio_guest_pages(vcpu);
1771                 return -ENOMEM;
1772         }
1773         q += vcpu->pio.guest_page_offset;
1774         bytes = vcpu->pio.size * vcpu->pio.cur_count;
1775         if (vcpu->pio.in)
1776                 memcpy(q, p, bytes);
1777         else
1778                 memcpy(p, q, bytes);
1779         q -= vcpu->pio.guest_page_offset;
1780         vunmap(q);
1781         kvm_arch_ops->vcpu_load(vcpu);
1782         free_pio_guest_pages(vcpu);
1783         return 0;
1784 }
1785
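/*
 * Finish a port I/O operation once its data is available (copied in-kernel
 * or filled in by userspace): latch IN data into RAX or the guest buffer,
 * advance RSI/RDI and RCX for string/REP forms, and skip the emulated
 * instruction once the whole count has been transferred.
 */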
1786 static int complete_pio(struct kvm_vcpu *vcpu)
1787 {
1788         struct kvm_pio_request *io = &vcpu->pio;
1789         long delta;
1790         int r;
1791
1792         kvm_arch_ops->cache_regs(vcpu);
1793
1794         if (!io->string) {
1795                 if (io->in)
1796                         memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
1797                                io->size);
1798         } else {
1799                 if (io->in) {
1800                         r = pio_copy_data(vcpu);
1801                         if (r) {
1802                                 kvm_arch_ops->cache_regs(vcpu);
1803                                 return r;
1804                         }
1805                 }
1806
1807                 delta = 1;
1808                 if (io->rep) {
1809                         delta *= io->cur_count;
1810                         /*
1811                          * The size of the register should really depend on
1812                          * current address size.
1813                          */
1814                         vcpu->regs[VCPU_REGS_RCX] -= delta;
1815                 }
1816                 if (io->down)
1817                         delta = -delta;
1818                 delta *= io->size;
1819                 if (io->in)
1820                         vcpu->regs[VCPU_REGS_RDI] += delta;
1821                 else
1822                         vcpu->regs[VCPU_REGS_RSI] += delta;
1823         }
1824
1825         kvm_arch_ops->decache_regs(vcpu);
1826
1827         io->count -= io->cur_count;
1828         io->cur_count = 0;
1829
1830         if (!io->count)
1831                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1832         return 0;
1833 }
1834
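/*
 * Set up a port I/O exit to userspace.  The kvm_run area describes the
 * access; for string I/O the guest pages are pinned and the transaction is
 * clipped so a single round never crosses a page boundary (when one element
 * itself straddles a boundary, two pages are pinned and only that element
 * is transferred).
 */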
1835 int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1836                   int size, unsigned long count, int string, int down,
1837                   gva_t address, int rep, unsigned port)
1838 {
1839         unsigned now, in_page;
1840         int i;
1841         int nr_pages = 1;
1842         struct page *page;
1843
1844         vcpu->run->exit_reason = KVM_EXIT_IO;
1845         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1846         vcpu->run->io.size = size;
1847         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1848         vcpu->run->io.count = count;
1849         vcpu->run->io.port = port;
1850         vcpu->pio.count = count;
1851         vcpu->pio.cur_count = count;
1852         vcpu->pio.size = size;
1853         vcpu->pio.in = in;
1854         vcpu->pio.string = string;
1855         vcpu->pio.down = down;
1856         vcpu->pio.guest_page_offset = offset_in_page(address);
1857         vcpu->pio.rep = rep;
1858
1859         if (!string) {
1860                 kvm_arch_ops->cache_regs(vcpu);
1861                 memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
1862                 kvm_arch_ops->decache_regs(vcpu);
1863                 return 0;
1864         }
1865
1866         if (!count) {
1867                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1868                 return 1;
1869         }
1870
1871         now = min(count, PAGE_SIZE / size);
1872
1873         if (!down)
1874                 in_page = PAGE_SIZE - offset_in_page(address);
1875         else
1876                 in_page = offset_in_page(address) + size;
1877         now = min(count, (unsigned long)in_page / size);
1878         if (!now) {
1879                 /*
1880                  * String I/O straddles page boundary.  Pin two guest pages
1881                  * so that we satisfy atomicity constraints.  Do just one
1882                  * transaction to avoid complexity.
1883                  */
1884                 nr_pages = 2;
1885                 now = 1;
1886         }
1887         if (down) {
1888                 /*
1889                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
1890                  */
1891                 printk(KERN_ERR "kvm: guest string pio down\n");
1892                 inject_gp(vcpu);
1893                 return 1;
1894         }
1895         vcpu->run->io.count = now;
1896         vcpu->pio.cur_count = now;
1897
1898         for (i = 0; i < nr_pages; ++i) {
1899                 spin_lock(&vcpu->kvm->lock);
1900                 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
1901                 if (page)
1902                         get_page(page);
1903                 vcpu->pio.guest_pages[i] = page;
1904                 spin_unlock(&vcpu->kvm->lock);
1905                 if (!page) {
1906                         inject_gp(vcpu);
1907                         free_pio_guest_pages(vcpu);
1908                         return 1;
1909                 }
1910         }
1911
1912         if (!vcpu->pio.in)
1913                 return pio_copy_data(vcpu);
1914         return 0;
1915 }
1916 EXPORT_SYMBOL_GPL(kvm_setup_pio);
1917
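/*
 * KVM_RUN: finish whatever the previous exit left pending (outstanding
 * string PIO, an MMIO read, or a hypercall return value), then enter the
 * arch-specific run loop with the caller's signal mask installed.
 */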
1918 static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1919 {
1920         int r;
1921         sigset_t sigsaved;
1922
1923         vcpu_load(vcpu);
1924
1925         if (vcpu->sigset_active)
1926                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
1927
1928         /* re-sync apic's tpr */
1929         vcpu->cr8 = kvm_run->cr8;
1930
1931         if (vcpu->pio.cur_count) {
1932                 r = complete_pio(vcpu);
1933                 if (r)
1934                         goto out;
1935         }
1936
1937         if (vcpu->mmio_needed) {
1938                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
1939                 vcpu->mmio_read_completed = 1;
1940                 vcpu->mmio_needed = 0;
1941                 r = emulate_instruction(vcpu, kvm_run,
1942                                         vcpu->mmio_fault_cr2, 0);
1943                 if (r == EMULATE_DO_MMIO) {
1944                         /*
1945                          * Read-modify-write.  Back to userspace.
1946                          */
1947                         kvm_run->exit_reason = KVM_EXIT_MMIO;
1948                         r = 0;
1949                         goto out;
1950                 }
1951         }
1952
1953         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
1954                 kvm_arch_ops->cache_regs(vcpu);
1955                 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
1956                 kvm_arch_ops->decache_regs(vcpu);
1957         }
1958
1959         r = kvm_arch_ops->run(vcpu, kvm_run);
1960
1961 out:
1962         if (vcpu->sigset_active)
1963                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1964
1965         vcpu_put(vcpu);
1966         return r;
1967 }
1968
1969 static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
1970                                    struct kvm_regs *regs)
1971 {
1972         vcpu_load(vcpu);
1973
1974         kvm_arch_ops->cache_regs(vcpu);
1975
1976         regs->rax = vcpu->regs[VCPU_REGS_RAX];
1977         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
1978         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
1979         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
1980         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
1981         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
1982         regs->rsp = vcpu->regs[VCPU_REGS_RSP];
1983         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
1984 #ifdef CONFIG_X86_64
1985         regs->r8 = vcpu->regs[VCPU_REGS_R8];
1986         regs->r9 = vcpu->regs[VCPU_REGS_R9];
1987         regs->r10 = vcpu->regs[VCPU_REGS_R10];
1988         regs->r11 = vcpu->regs[VCPU_REGS_R11];
1989         regs->r12 = vcpu->regs[VCPU_REGS_R12];
1990         regs->r13 = vcpu->regs[VCPU_REGS_R13];
1991         regs->r14 = vcpu->regs[VCPU_REGS_R14];
1992         regs->r15 = vcpu->regs[VCPU_REGS_R15];
1993 #endif
1994
1995         regs->rip = vcpu->rip;
1996         regs->rflags = kvm_arch_ops->get_rflags(vcpu);
1997
1998         /*
1999          * Don't leak debug flags in case they were set for guest debugging
2000          */
2001         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2002                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2003
2004         vcpu_put(vcpu);
2005
2006         return 0;
2007 }
2008
2009 static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
2010                                    struct kvm_regs *regs)
2011 {
2012         vcpu_load(vcpu);
2013
2014         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
2015         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
2016         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
2017         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
2018         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
2019         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
2020         vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
2021         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
2022 #ifdef CONFIG_X86_64
2023         vcpu->regs[VCPU_REGS_R8] = regs->r8;
2024         vcpu->regs[VCPU_REGS_R9] = regs->r9;
2025         vcpu->regs[VCPU_REGS_R10] = regs->r10;
2026         vcpu->regs[VCPU_REGS_R11] = regs->r11;
2027         vcpu->regs[VCPU_REGS_R12] = regs->r12;
2028         vcpu->regs[VCPU_REGS_R13] = regs->r13;
2029         vcpu->regs[VCPU_REGS_R14] = regs->r14;
2030         vcpu->regs[VCPU_REGS_R15] = regs->r15;
2031 #endif
2032
2033         vcpu->rip = regs->rip;
2034         kvm_arch_ops->set_rflags(vcpu, regs->rflags);
2035
2036         kvm_arch_ops->decache_regs(vcpu);
2037
2038         vcpu_put(vcpu);
2039
2040         return 0;
2041 }
2042
2043 static void get_segment(struct kvm_vcpu *vcpu,
2044                         struct kvm_segment *var, int seg)
2045 {
2046         return kvm_arch_ops->get_segment(vcpu, var, seg);
2047 }
2048
2049 static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2050                                     struct kvm_sregs *sregs)
2051 {
2052         struct descriptor_table dt;
2053
2054         vcpu_load(vcpu);
2055
2056         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2057         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2058         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2059         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2060         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2061         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2062
2063         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2064         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2065
2066         kvm_arch_ops->get_idt(vcpu, &dt);
2067         sregs->idt.limit = dt.limit;
2068         sregs->idt.base = dt.base;
2069         kvm_arch_ops->get_gdt(vcpu, &dt);
2070         sregs->gdt.limit = dt.limit;
2071         sregs->gdt.base = dt.base;
2072
2073         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
2074         sregs->cr0 = vcpu->cr0;
2075         sregs->cr2 = vcpu->cr2;
2076         sregs->cr3 = vcpu->cr3;
2077         sregs->cr4 = vcpu->cr4;
2078         sregs->cr8 = vcpu->cr8;
2079         sregs->efer = vcpu->shadow_efer;
2080         sregs->apic_base = vcpu->apic_base;
2081
2082         memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
2083                sizeof sregs->interrupt_bitmap);
2084
2085         vcpu_put(vcpu);
2086
2087         return 0;
2088 }
2089
2090 static void set_segment(struct kvm_vcpu *vcpu,
2091                         struct kvm_segment *var, int seg)
2092 {
2093         return kvm_arch_ops->set_segment(vcpu, var, seg);
2094 }
2095
2096 static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2097                                     struct kvm_sregs *sregs)
2098 {
2099         int mmu_reset_needed = 0;
2100         int i;
2101         struct descriptor_table dt;
2102
2103         vcpu_load(vcpu);
2104
2105         dt.limit = sregs->idt.limit;
2106         dt.base = sregs->idt.base;
2107         kvm_arch_ops->set_idt(vcpu, &dt);
2108         dt.limit = sregs->gdt.limit;
2109         dt.base = sregs->gdt.base;
2110         kvm_arch_ops->set_gdt(vcpu, &dt);
2111
2112         vcpu->cr2 = sregs->cr2;
2113         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
2114         vcpu->cr3 = sregs->cr3;
2115
2116         vcpu->cr8 = sregs->cr8;
2117
2118         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
2119 #ifdef CONFIG_X86_64
2120         kvm_arch_ops->set_efer(vcpu, sregs->efer);
2121 #endif
2122         vcpu->apic_base = sregs->apic_base;
2123
2124         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
2125
2126         mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
2127         kvm_arch_ops->set_cr0(vcpu, sregs->cr0);
2128
2129         mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
2130         kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
2131         if (!is_long_mode(vcpu) && is_pae(vcpu))
2132                 load_pdptrs(vcpu, vcpu->cr3);
2133
2134         if (mmu_reset_needed)
2135                 kvm_mmu_reset_context(vcpu);
2136
2137         memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
2138                sizeof vcpu->irq_pending);
2139         vcpu->irq_summary = 0;
2140         for (i = 0; i < NR_IRQ_WORDS; ++i)
2141                 if (vcpu->irq_pending[i])
2142                         __set_bit(i, &vcpu->irq_summary);
2143
2144         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2145         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2146         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2147         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2148         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2149         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2150
2151         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2152         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2153
2154         vcpu_put(vcpu);
2155
2156         return 0;
2157 }
2158
2159 /*
2160  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
2161  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
2162  *
2163  * This list is modified at module load time to reflect the
2164  * capabilities of the host cpu.
2165  */
2166 static u32 msrs_to_save[] = {
2167         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
2168         MSR_K6_STAR,
2169 #ifdef CONFIG_X86_64
2170         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
2171 #endif
2172         MSR_IA32_TIME_STAMP_COUNTER,
2173 };
2174
2175 static unsigned num_msrs_to_save;
2176
2177 static u32 emulated_msrs[] = {
2178         MSR_IA32_MISC_ENABLE,
2179 };
2180
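/*
 * Shrink msrs_to_save to the MSRs the host actually implements: probe each
 * one with rdmsr_safe() and compact the survivors to the front of the array.
 */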
2181 static __init void kvm_init_msr_list(void)
2182 {
2183         u32 dummy[2];
2184         unsigned i, j;
2185
2186         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2187                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2188                         continue;
2189                 if (j < i)
2190                         msrs_to_save[j] = msrs_to_save[i];
2191                 j++;
2192         }
2193         num_msrs_to_save = j;
2194 }
2195
2196 /*
2197  * Adapt set_msr() to msr_io()'s calling convention
2198  */
2199 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2200 {
2201         return set_msr(vcpu, index, *data);
2202 }
2203
2204 /*
2205  * Read or write a bunch of msrs. All parameters are kernel addresses.
2206  *
2207  * @return number of msrs set successfully.
2208  */
2209 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2210                     struct kvm_msr_entry *entries,
2211                     int (*do_msr)(struct kvm_vcpu *vcpu,
2212                                   unsigned index, u64 *data))
2213 {
2214         int i;
2215
2216         vcpu_load(vcpu);
2217
2218         for (i = 0; i < msrs->nmsrs; ++i)
2219                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2220                         break;
2221
2222         vcpu_put(vcpu);
2223
2224         return i;
2225 }
2226
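/*
 * Illustrative only (not part of this file): userspace reaches these paths
 * through the KVM_GET_MSRS/KVM_SET_MSRS vcpu ioctls, roughly like this,
 * assuming vcpu_fd is an open vcpu file descriptor:
 *
 *	struct {
 *		struct kvm_msrs hdr;
 *		struct kvm_msr_entry entry;
 *	} m = {
 *		.hdr.nmsrs   = 1,
 *		.entry.index = 0x10,	(the IA32 time-stamp counter)
 *	};
 *
 *	if (ioctl(vcpu_fd, KVM_GET_MSRS, &m) == 1)
 *		printf("guest tsc: %llu\n", (unsigned long long)m.entry.data);
 *
 * The ioctl returns the number of entries processed, so a return value of
 * 1 here means the single requested MSR was read successfully.
 */
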
2227 /*
2228  * Read or write a bunch of msrs. Parameters are user addresses.
2229  *
2230  * @return number of msrs set successfully.
2231  */
2232 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2233                   int (*do_msr)(struct kvm_vcpu *vcpu,
2234                                 unsigned index, u64 *data),
2235                   int writeback)
2236 {
2237         struct kvm_msrs msrs;
2238         struct kvm_msr_entry *entries;
2239         int r, n;
2240         unsigned size;
2241
2242         r = -EFAULT;
2243         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2244                 goto out;
2245
2246         r = -E2BIG;
2247         if (msrs.nmsrs >= MAX_IO_MSRS)
2248                 goto out;
2249
2250         r = -ENOMEM;
2251         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2252         entries = vmalloc(size);
2253         if (!entries)
2254                 goto out;
2255
2256         r = -EFAULT;
2257         if (copy_from_user(entries, user_msrs->entries, size))
2258                 goto out_free;
2259
2260         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2261         if (r < 0)
2262                 goto out_free;
2263
2264         r = -EFAULT;
2265         if (writeback && copy_to_user(user_msrs->entries, entries, size))
2266                 goto out_free;
2267
2268         r = n;
2269
2270 out_free:
2271         vfree(entries);
2272 out:
2273         return r;
2274 }
2275
2276 /*
2277  * Translate a guest virtual address to a guest physical address.
2278  */
2279 static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2280                                     struct kvm_translation *tr)
2281 {
2282         unsigned long vaddr = tr->linear_address;
2283         gpa_t gpa;
2284
2285         vcpu_load(vcpu);
2286         spin_lock(&vcpu->kvm->lock);
2287         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2288         tr->physical_address = gpa;
2289         tr->valid = gpa != UNMAPPED_GVA;
2290         tr->writeable = 1;
2291         tr->usermode = 0;
2292         spin_unlock(&vcpu->kvm->lock);
2293         vcpu_put(vcpu);
2294
2295         return 0;
2296 }
2297
2298 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2299                                     struct kvm_interrupt *irq)
2300 {
2301         if (irq->irq < 0 || irq->irq >= 256)
2302                 return -EINVAL;
2303         vcpu_load(vcpu);
2304
2305         set_bit(irq->irq, vcpu->irq_pending);
2306         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
2307
2308         vcpu_put(vcpu);
2309
2310         return 0;
2311 }
2312
2313 static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2314                                       struct kvm_debug_guest *dbg)
2315 {
2316         int r;
2317
2318         vcpu_load(vcpu);
2319
2320         r = kvm_arch_ops->set_guest_debug(vcpu, dbg);
2321
2322         vcpu_put(vcpu);
2323
2324         return r;
2325 }
2326
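/*
 * Fault handler for the vcpu mmap: page 0 of the mapping is the kvm_run
 * structure and page KVM_PIO_PAGE_OFFSET is the PIO data page, which is
 * why KVM_GET_VCPU_MMAP_SIZE reports two pages.
 */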
2327 static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2328                                     unsigned long address,
2329                                     int *type)
2330 {
2331         struct kvm_vcpu *vcpu = vma->vm_file->private_data;
2332         unsigned long pgoff;
2333         struct page *page;
2334
2335         *type = VM_FAULT_MINOR;
2336         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2337         if (pgoff == 0)
2338                 page = virt_to_page(vcpu->run);
2339         else if (pgoff == KVM_PIO_PAGE_OFFSET)
2340                 page = virt_to_page(vcpu->pio_data);
2341         else
2342                 return NOPAGE_SIGBUS;
2343         get_page(page);
2344         return page;
2345 }
2346
2347 static struct vm_operations_struct kvm_vcpu_vm_ops = {
2348         .nopage = kvm_vcpu_nopage,
2349 };
2350
2351 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2352 {
2353         vma->vm_ops = &kvm_vcpu_vm_ops;
2354         return 0;
2355 }
2356
2357 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
2358 {
2359         struct kvm_vcpu *vcpu = filp->private_data;
2360
2361         fput(vcpu->kvm->filp);
2362         return 0;
2363 }
2364
2365 static struct file_operations kvm_vcpu_fops = {
2366         .release        = kvm_vcpu_release,
2367         .unlocked_ioctl = kvm_vcpu_ioctl,
2368         .compat_ioctl   = kvm_vcpu_ioctl,
2369         .mmap           = kvm_vcpu_mmap,
2370 };
2371
2372 /*
2373  * Allocates an inode for the vcpu.
2374  */
2375 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2376 {
2377         int fd, r;
2378         struct inode *inode;
2379         struct file *file;
2380
2381         atomic_inc(&vcpu->kvm->filp->f_count);
2382         inode = kvmfs_inode(&kvm_vcpu_fops);
2383         if (IS_ERR(inode)) {
2384                 r = PTR_ERR(inode);
2385                 goto out1;
2386         }
2387
2388         file = kvmfs_file(inode, vcpu);
2389         if (IS_ERR(file)) {
2390                 r = PTR_ERR(file);
2391                 goto out2;
2392         }
2393
2394         r = get_unused_fd();
2395         if (r < 0)
2396                 goto out3;
2397         fd = r;
2398         fd_install(fd, file);
2399
2400         return fd;
2401
2402 out3:
2403         fput(file);
2404 out2:
2405         iput(inode);
2406 out1:
2407         fput(vcpu->kvm->filp);
2408         return r;
2409 }
2410
2411 /*
2412  * Creates some virtual cpus.  Good luck creating more than one.
2413  */
2414 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
2415 {
2416         int r;
2417         struct kvm_vcpu *vcpu;
2418         struct page *page;
2419
2420         r = -EINVAL;
2421         if (!valid_vcpu(n))
2422                 goto out;
2423
2424         vcpu = &kvm->vcpus[n];
2425
2426         mutex_lock(&vcpu->mutex);
2427
2428         if (vcpu->vmcs) {
2429                 mutex_unlock(&vcpu->mutex);
2430                 return -EEXIST;
2431         }
2432
2433         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2434         r = -ENOMEM;
2435         if (!page)
2436                 goto out_unlock;
2437         vcpu->run = page_address(page);
2438
2439         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2440         r = -ENOMEM;
2441         if (!page)
2442                 goto out_free_run;
2443         vcpu->pio_data = page_address(page);
2444
2445         vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf,
2446                                            FX_IMAGE_ALIGN);
2447         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
2448         vcpu->cr0 = 0x10;
2449
2450         r = kvm_arch_ops->vcpu_create(vcpu);
2451         if (r < 0)
2452                 goto out_free_vcpus;
2453
2454         r = kvm_mmu_create(vcpu);
2455         if (r < 0)
2456                 goto out_free_vcpus;
2457
2458         kvm_arch_ops->vcpu_load(vcpu);
2459         r = kvm_mmu_setup(vcpu);
2460         if (r >= 0)
2461                 r = kvm_arch_ops->vcpu_setup(vcpu);
2462         vcpu_put(vcpu);
2463
2464         if (r < 0)
2465                 goto out_free_vcpus;
2466
2467         r = create_vcpu_fd(vcpu);
2468         if (r < 0)
2469                 goto out_free_vcpus;
2470
2471         spin_lock(&kvm_lock);
2472         if (n >= kvm->nvcpus)
2473                 kvm->nvcpus = n + 1;
2474         spin_unlock(&kvm_lock);
2475
2476         return r;
2477
2478 out_free_vcpus:
2479         kvm_free_vcpu(vcpu);
2480 out_free_run:
2481         free_page((unsigned long)vcpu->run);
2482         vcpu->run = NULL;
2483 out_unlock:
2484         mutex_unlock(&vcpu->mutex);
2485 out:
2486         return r;
2487 }
2488
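/*
 * If the host runs with EFER.NX clear, hide the NX bit (bit 20 of
 * CPUID.80000001H:EDX) from the guest's CPUID table so the guest does not
 * try to enable what the host has disabled.
 */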
2489 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
2490 {
2491         u64 efer;
2492         int i;
2493         struct kvm_cpuid_entry *e, *entry;
2494
2495         rdmsrl(MSR_EFER, efer);
2496         entry = NULL;
2497         for (i = 0; i < vcpu->cpuid_nent; ++i) {
2498                 e = &vcpu->cpuid_entries[i];
2499                 if (e->function == 0x80000001) {
2500                         entry = e;
2501                         break;
2502                 }
2503         }
2504         if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
2505                 entry->edx &= ~(1 << 20);
2506                 printk(KERN_INFO "kvm: guest NX capability removed\n");
2507         }
2508 }
2509
2510 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2511                                     struct kvm_cpuid *cpuid,
2512                                     struct kvm_cpuid_entry __user *entries)
2513 {
2514         int r;
2515
2516         r = -E2BIG;
2517         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2518                 goto out;
2519         r = -EFAULT;
2520         if (copy_from_user(&vcpu->cpuid_entries, entries,
2521                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2522                 goto out;
2523         vcpu->cpuid_nent = cpuid->nent;
2524         cpuid_fix_nx_cap(vcpu);
2525         return 0;
2526
2527 out:
2528         return r;
2529 }
2530
2531 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2532 {
2533         if (sigset) {
2534                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2535                 vcpu->sigset_active = 1;
2536                 vcpu->sigset = *sigset;
2537         } else
2538                 vcpu->sigset_active = 0;
2539         return 0;
2540 }
2541
2542 /*
2543  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
2544  * we have asm/x86/processor.h
2545  */
2546 struct fxsave {
2547         u16     cwd;
2548         u16     swd;
2549         u16     twd;
2550         u16     fop;
2551         u64     rip;
2552         u64     rdp;
2553         u32     mxcsr;
2554         u32     mxcsr_mask;
2555         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
2556 #ifdef CONFIG_X86_64
2557         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
2558 #else
2559         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
2560 #endif
2561 };
2562
2563 static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2564 {
2565         struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image;
2566
2567         vcpu_load(vcpu);
2568
2569         memcpy(fpu->fpr, fxsave->st_space, 128);
2570         fpu->fcw = fxsave->cwd;
2571         fpu->fsw = fxsave->swd;
2572         fpu->ftwx = fxsave->twd;
2573         fpu->last_opcode = fxsave->fop;
2574         fpu->last_ip = fxsave->rip;
2575         fpu->last_dp = fxsave->rdp;
2576         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2577
2578         vcpu_put(vcpu);
2579
2580         return 0;
2581 }
2582
2583 static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2584 {
2585         struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image;
2586
2587         vcpu_load(vcpu);
2588
2589         memcpy(fxsave->st_space, fpu->fpr, 128);
2590         fxsave->cwd = fpu->fcw;
2591         fxsave->swd = fpu->fsw;
2592         fxsave->twd = fpu->ftwx;
2593         fxsave->fop = fpu->last_opcode;
2594         fxsave->rip = fpu->last_ip;
2595         fxsave->rdp = fpu->last_dp;
2596         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2597
2598         vcpu_put(vcpu);
2599
2600         return 0;
2601 }
2602
2603 static long kvm_vcpu_ioctl(struct file *filp,
2604                            unsigned int ioctl, unsigned long arg)
2605 {
2606         struct kvm_vcpu *vcpu = filp->private_data;
2607         void __user *argp = (void __user *)arg;
2608         int r = -EINVAL;
2609
2610         switch (ioctl) {
2611         case KVM_RUN:
2612                 r = -EINVAL;
2613                 if (arg)
2614                         goto out;
2615                 r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
2616                 break;
2617         case KVM_GET_REGS: {
2618                 struct kvm_regs kvm_regs;
2619
2620                 memset(&kvm_regs, 0, sizeof kvm_regs);
2621                 r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
2622                 if (r)
2623                         goto out;
2624                 r = -EFAULT;
2625                 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
2626                         goto out;
2627                 r = 0;
2628                 break;
2629         }
2630         case KVM_SET_REGS: {
2631                 struct kvm_regs kvm_regs;
2632
2633                 r = -EFAULT;
2634                 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
2635                         goto out;
2636                 r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
2637                 if (r)
2638                         goto out;
2639                 r = 0;
2640                 break;
2641         }
2642         case KVM_GET_SREGS: {
2643                 struct kvm_sregs kvm_sregs;
2644
2645                 memset(&kvm_sregs, 0, sizeof kvm_sregs);
2646                 r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
2647                 if (r)
2648                         goto out;
2649                 r = -EFAULT;
2650                 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
2651                         goto out;
2652                 r = 0;
2653                 break;
2654         }
2655         case KVM_SET_SREGS: {
2656                 struct kvm_sregs kvm_sregs;
2657
2658                 r = -EFAULT;
2659                 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
2660                         goto out;
2661                 r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
2662                 if (r)
2663                         goto out;
2664                 r = 0;
2665                 break;
2666         }
2667         case KVM_TRANSLATE: {
2668                 struct kvm_translation tr;
2669
2670                 r = -EFAULT;
2671                 if (copy_from_user(&tr, argp, sizeof tr))
2672                         goto out;
2673                 r = kvm_vcpu_ioctl_translate(vcpu, &tr);
2674                 if (r)
2675                         goto out;
2676                 r = -EFAULT;
2677                 if (copy_to_user(argp, &tr, sizeof tr))
2678                         goto out;
2679                 r = 0;
2680                 break;
2681         }
2682         case KVM_INTERRUPT: {
2683                 struct kvm_interrupt irq;
2684
2685                 r = -EFAULT;
2686                 if (copy_from_user(&irq, argp, sizeof irq))
2687                         goto out;
2688                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2689                 if (r)
2690                         goto out;
2691                 r = 0;
2692                 break;
2693         }
2694         case KVM_DEBUG_GUEST: {
2695                 struct kvm_debug_guest dbg;
2696
2697                 r = -EFAULT;
2698                 if (copy_from_user(&dbg, argp, sizeof dbg))
2699                         goto out;
2700                 r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
2701                 if (r)
2702                         goto out;
2703                 r = 0;
2704                 break;
2705         }
2706         case KVM_GET_MSRS:
2707                 r = msr_io(vcpu, argp, get_msr, 1);
2708                 break;
2709         case KVM_SET_MSRS:
2710                 r = msr_io(vcpu, argp, do_set_msr, 0);
2711                 break;
2712         case KVM_SET_CPUID: {
2713                 struct kvm_cpuid __user *cpuid_arg = argp;
2714                 struct kvm_cpuid cpuid;
2715
2716                 r = -EFAULT;
2717                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2718                         goto out;
2719                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2720                 if (r)
2721                         goto out;
2722                 break;
2723         }
2724         case KVM_SET_SIGNAL_MASK: {
2725                 struct kvm_signal_mask __user *sigmask_arg = argp;
2726                 struct kvm_signal_mask kvm_sigmask;
2727                 sigset_t sigset, *p;
2728
2729                 p = NULL;
2730                 if (argp) {
2731                         r = -EFAULT;
2732                         if (copy_from_user(&kvm_sigmask, argp,
2733                                            sizeof kvm_sigmask))
2734                                 goto out;
2735                         r = -EINVAL;
2736                         if (kvm_sigmask.len != sizeof sigset)
2737                                 goto out;
2738                         r = -EFAULT;
2739                         if (copy_from_user(&sigset, sigmask_arg->sigset,
2740                                            sizeof sigset))
2741                                 goto out;
2742                         p = &sigset;
2743                 }
2744                 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
2745                 break;
2746         }
2747         case KVM_GET_FPU: {
2748                 struct kvm_fpu fpu;
2749
2750                 memset(&fpu, 0, sizeof fpu);
2751                 r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
2752                 if (r)
2753                         goto out;
2754                 r = -EFAULT;
2755                 if (copy_to_user(argp, &fpu, sizeof fpu))
2756                         goto out;
2757                 r = 0;
2758                 break;
2759         }
2760         case KVM_SET_FPU: {
2761                 struct kvm_fpu fpu;
2762
2763                 r = -EFAULT;
2764                 if (copy_from_user(&fpu, argp, sizeof fpu))
2765                         goto out;
2766                 r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
2767                 if (r)
2768                         goto out;
2769                 r = 0;
2770                 break;
2771         }
2772         default:
2773                 ;
2774         }
2775 out:
2776         return r;
2777 }
2778
2779 static long kvm_vm_ioctl(struct file *filp,
2780                            unsigned int ioctl, unsigned long arg)
2781 {
2782         struct kvm *kvm = filp->private_data;
2783         void __user *argp = (void __user *)arg;
2784         int r = -EINVAL;
2785
2786         switch (ioctl) {
2787         case KVM_CREATE_VCPU:
2788                 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
2789                 if (r < 0)
2790                         goto out;
2791                 break;
2792         case KVM_SET_MEMORY_REGION: {
2793                 struct kvm_memory_region kvm_mem;
2794
2795                 r = -EFAULT;
2796                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2797                         goto out;
2798                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
2799                 if (r)
2800                         goto out;
2801                 break;
2802         }
2803         case KVM_GET_DIRTY_LOG: {
2804                 struct kvm_dirty_log log;
2805
2806                 r = -EFAULT;
2807                 if (copy_from_user(&log, argp, sizeof log))
2808                         goto out;
2809                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2810                 if (r)
2811                         goto out;
2812                 break;
2813         }
2814         case KVM_SET_MEMORY_ALIAS: {
2815                 struct kvm_memory_alias alias;
2816
2817                 r = -EFAULT;
2818                 if (copy_from_user(&alias, argp, sizeof alias))
2819                         goto out;
2820                 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
2821                 if (r)
2822                         goto out;
2823                 break;
2824         }
2825         default:
2826                 ;
2827         }
2828 out:
2829         return r;
2830 }
2831
2832 static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
2833                                   unsigned long address,
2834                                   int *type)
2835 {
2836         struct kvm *kvm = vma->vm_file->private_data;
2837         unsigned long pgoff;
2838         struct page *page;
2839
2840         *type = VM_FAULT_MINOR;
2841         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2842         page = gfn_to_page(kvm, pgoff);
2843         if (!page)
2844                 return NOPAGE_SIGBUS;
2845         get_page(page);
2846         return page;
2847 }
2848
2849 static struct vm_operations_struct kvm_vm_vm_ops = {
2850         .nopage = kvm_vm_nopage,
2851 };
2852
2853 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
2854 {
2855         vma->vm_ops = &kvm_vm_vm_ops;
2856         return 0;
2857 }
2858
2859 static struct file_operations kvm_vm_fops = {
2860         .release        = kvm_vm_release,
2861         .unlocked_ioctl = kvm_vm_ioctl,
2862         .compat_ioctl   = kvm_vm_ioctl,
2863         .mmap           = kvm_vm_mmap,
2864 };
2865
2866 static int kvm_dev_ioctl_create_vm(void)
2867 {
2868         int fd, r;
2869         struct inode *inode;
2870         struct file *file;
2871         struct kvm *kvm;
2872
2873         inode = kvmfs_inode(&kvm_vm_fops);
2874         if (IS_ERR(inode)) {
2875                 r = PTR_ERR(inode);
2876                 goto out1;
2877         }
2878
2879         kvm = kvm_create_vm();
2880         if (IS_ERR(kvm)) {
2881                 r = PTR_ERR(kvm);
2882                 goto out2;
2883         }
2884
2885         file = kvmfs_file(inode, kvm);
2886         if (IS_ERR(file)) {
2887                 r = PTR_ERR(file);
2888                 goto out3;
2889         }
2890         kvm->filp = file;
2891
2892         r = get_unused_fd();
2893         if (r < 0)
2894                 goto out4;
2895         fd = r;
2896         fd_install(fd, file);
2897
2898         return fd;
2899
2900 out4:
2901         fput(file);
2902 out3:
2903         kvm_destroy_vm(kvm);
2904 out2:
2905         iput(inode);
2906 out1:
2907         return r;
2908 }
2909
2910 static long kvm_dev_ioctl(struct file *filp,
2911                           unsigned int ioctl, unsigned long arg)
2912 {
2913         void __user *argp = (void __user *)arg;
2914         long r = -EINVAL;
2915
2916         switch (ioctl) {
2917         case KVM_GET_API_VERSION:
2918                 r = -EINVAL;
2919                 if (arg)
2920                         goto out;
2921                 r = KVM_API_VERSION;
2922                 break;
2923         case KVM_CREATE_VM:
2924                 r = -EINVAL;
2925                 if (arg)
2926                         goto out;
2927                 r = kvm_dev_ioctl_create_vm();
2928                 break;
2929         case KVM_GET_MSR_INDEX_LIST: {
2930                 struct kvm_msr_list __user *user_msr_list = argp;
2931                 struct kvm_msr_list msr_list;
2932                 unsigned n;
2933
2934                 r = -EFAULT;
2935                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
2936                         goto out;
2937                 n = msr_list.nmsrs;
2938                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
2939                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
2940                         goto out;
2941                 r = -E2BIG;
2942                 if (n < msr_list.nmsrs)
2943                         goto out;
2944                 r = -EFAULT;
2945                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
2946                                  num_msrs_to_save * sizeof(u32)))
2947                         goto out;
2948                 if (copy_to_user(user_msr_list->indices
2949                                  + num_msrs_to_save,
2950                                  &emulated_msrs,
2951                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
2952                         goto out;
2953                 r = 0;
2954                 break;
2955         }
2956         case KVM_CHECK_EXTENSION:
2957                 /*
2958                  * No extensions defined at present.
2959                  */
2960                 r = 0;
2961                 break;
2962         case KVM_GET_VCPU_MMAP_SIZE:
2963                 r = -EINVAL;
2964                 if (arg)
2965                         goto out;
2966                 r = 2 * PAGE_SIZE;
2967                 break;
2968         default:
2969                 ;
2970         }
2971 out:
2972         return r;
2973 }
2974
2975 static struct file_operations kvm_chardev_ops = {
2976         .open           = kvm_dev_open,
2977         .release        = kvm_dev_release,
2978         .unlocked_ioctl = kvm_dev_ioctl,
2979         .compat_ioctl   = kvm_dev_ioctl,
2980 };
2981
2982 static struct miscdevice kvm_dev = {
2983         KVM_MINOR,
2984         "kvm",
2985         &kvm_chardev_ops,
2986 };
2987
2988 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2989                        void *v)
2990 {
2991         if (val == SYS_RESTART) {
2992                 /*
2993                  * Some (well, at least mine) BIOSes hang on reboot if
2994                  * in vmx root mode.
2995                  */
2996                 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2997                 on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
2998         }
2999         return NOTIFY_OK;
3000 }
3001
3002 static struct notifier_block kvm_reboot_notifier = {
3003         .notifier_call = kvm_reboot,
3004         .priority = 0,
3005 };
3006
3007 /*
3008  * Make sure that a cpu that is being hot-unplugged does not have any vcpus
3009  * cached on it.
3010  */
3011 static void decache_vcpus_on_cpu(int cpu)
3012 {
3013         struct kvm *vm;
3014         struct kvm_vcpu *vcpu;
3015         int i;
3016
3017         spin_lock(&kvm_lock);
3018         list_for_each_entry(vm, &vm_list, vm_list)
3019                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3020                         vcpu = &vm->vcpus[i];
3021                         /*
3022                          * If the vcpu is locked, then it is running on some
3023                          * other cpu and therefore it is not cached on the
3024                          * cpu in question.
3025                          *
3026                          * If it's not locked, check the last cpu it executed
3027                          * on.
3028                          */
3029                         if (mutex_trylock(&vcpu->mutex)) {
3030                                 if (vcpu->cpu == cpu) {
3031                                         kvm_arch_ops->vcpu_decache(vcpu);
3032                                         vcpu->cpu = -1;
3033                                 }
3034                                 mutex_unlock(&vcpu->mutex);
3035                         }
3036                 }
3037         spin_unlock(&kvm_lock);
3038 }
3039
3040 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
3041                            void *v)
3042 {
3043         int cpu = (long)v;
3044
3045         switch (val) {
3046         case CPU_DOWN_PREPARE:
3047         case CPU_DOWN_PREPARE_FROZEN:
3048         case CPU_UP_CANCELED:
3049         case CPU_UP_CANCELED_FROZEN:
3050                 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3051                        cpu);
3052                 decache_vcpus_on_cpu(cpu);
3053                 smp_call_function_single(cpu, kvm_arch_ops->hardware_disable,
3054                                          NULL, 0, 1);
3055                 break;
3056         case CPU_ONLINE:
3057         case CPU_ONLINE_FROZEN:
3058                 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
3059                        cpu);
3060                 smp_call_function_single(cpu, kvm_arch_ops->hardware_enable,
3061                                          NULL, 0, 1);
3062                 break;
3063         }
3064         return NOTIFY_OK;
3065 }
3066
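/*
 * kvm_io_bus: a small, fixed-size table of in-kernel I/O devices.  Lookup
 * is a linear scan that asks each registered device whether it claims the
 * given guest-physical address.
 */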
3067 void kvm_io_bus_init(struct kvm_io_bus *bus)
3068 {
3069         memset(bus, 0, sizeof(*bus));
3070 }
3071
3072 void kvm_io_bus_destroy(struct kvm_io_bus *bus)
3073 {
3074         int i;
3075
3076         for (i = 0; i < bus->dev_count; i++) {
3077                 struct kvm_io_device *pos = bus->devs[i];
3078
3079                 kvm_iodevice_destructor(pos);
3080         }
3081 }
3082
3083 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
3084 {
3085         int i;
3086
3087         for (i = 0; i < bus->dev_count; i++) {
3088                 struct kvm_io_device *pos = bus->devs[i];
3089
3090                 if (pos->in_range(pos, addr))
3091                         return pos;
3092         }
3093
3094         return NULL;
3095 }
3096
3097 void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
3098 {
3099         BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
3100
3101         bus->devs[bus->dev_count++] = dev;
3102 }
3103
3104 static struct notifier_block kvm_cpu_notifier = {
3105         .notifier_call = kvm_cpu_hotplug,
3106         .priority = 20, /* must be > scheduler priority */
3107 };
3108
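/*
 * debugfs backend: sum the per-vcpu counter at the given stat offset over
 * every vcpu of every VM.  The statistics are read-only, so writes are
 * silently ignored.
 */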
3109 static u64 stat_get(void *_offset)
3110 {
3111         unsigned offset = (long)_offset;
3112         u64 total = 0;
3113         struct kvm *kvm;
3114         struct kvm_vcpu *vcpu;
3115         int i;
3116
3117         spin_lock(&kvm_lock);
3118         list_for_each_entry(kvm, &vm_list, vm_list)
3119                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3120                         vcpu = &kvm->vcpus[i];
3121                         total += *(u32 *)((void *)vcpu + offset);
3122                 }
3123         spin_unlock(&kvm_lock);
3124         return total;
3125 }
3126
3127 static void stat_set(void *offset, u64 val)
3128 {
3129 }
3130
3131 DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, stat_set, "%llu\n");
3132
3133 static __init void kvm_init_debug(void)
3134 {
3135         struct kvm_stats_debugfs_item *p;
3136
3137         debugfs_dir = debugfs_create_dir("kvm", NULL);
3138         for (p = debugfs_entries; p->name; ++p)
3139                 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
3140                                                 (void *)(long)p->offset,
3141                                                 &stat_fops);
3142 }
3143
3144 static void kvm_exit_debug(void)
3145 {
3146         struct kvm_stats_debugfs_item *p;
3147
3148         for (p = debugfs_entries; p->name; ++p)
3149                 debugfs_remove(p->dentry);
3150         debugfs_remove(debugfs_dir);
3151 }
3152
3153 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
3154 {
3155         decache_vcpus_on_cpu(raw_smp_processor_id());
3156         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
3157         return 0;
3158 }
3159
3160 static int kvm_resume(struct sys_device *dev)
3161 {
3162         on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
3163         return 0;
3164 }
3165
3166 static struct sysdev_class kvm_sysdev_class = {
3167         set_kset_name("kvm"),
3168         .suspend = kvm_suspend,
3169         .resume = kvm_resume,
3170 };
3171
3172 static struct sys_device kvm_sysdev = {
3173         .id = 0,
3174         .cls = &kvm_sysdev_class,
3175 };
3176
3177 hpa_t bad_page_address;
3178
3179 static int kvmfs_get_sb(struct file_system_type *fs_type, int flags,
3180                         const char *dev_name, void *data, struct vfsmount *mnt)
3181 {
3182         return get_sb_pseudo(fs_type, "kvm:", NULL, KVMFS_SUPER_MAGIC, mnt);
3183 }
3184
3185 static struct file_system_type kvm_fs_type = {
3186         .name           = "kvmfs",
3187         .get_sb         = kvmfs_get_sb,
3188         .kill_sb        = kill_anon_super,
3189 };
3190
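/*
 * Called by the arch support modules (vmx/svm) to register their
 * kvm_arch_ops: set up and enable hardware virtualization on every CPU,
 * hook CPU hotplug, reboot and suspend/resume, and finally create the
 * /dev/kvm misc device.
 */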
3191 int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
3192 {
3193         int r;
3194
3195         if (kvm_arch_ops) {
3196                 printk(KERN_ERR "kvm: already loaded the other module\n");
3197                 return -EEXIST;
3198         }
3199
3200         if (!ops->cpu_has_kvm_support()) {
3201                 printk(KERN_ERR "kvm: no hardware support\n");
3202                 return -EOPNOTSUPP;
3203         }
3204         if (ops->disabled_by_bios()) {
3205                 printk(KERN_ERR "kvm: disabled by bios\n");
3206                 return -EOPNOTSUPP;
3207         }
3208
3209         kvm_arch_ops = ops;
3210
3211         r = kvm_arch_ops->hardware_setup();
3212         if (r < 0)
3213                 goto out;
3214
3215         on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
3216         r = register_cpu_notifier(&kvm_cpu_notifier);
3217         if (r)
3218                 goto out_free_1;
3219         register_reboot_notifier(&kvm_reboot_notifier);
3220
3221         r = sysdev_class_register(&kvm_sysdev_class);
3222         if (r)
3223                 goto out_free_2;
3224
3225         r = sysdev_register(&kvm_sysdev);
3226         if (r)
3227                 goto out_free_3;
3228
3229         kvm_chardev_ops.owner = module;
3230
3231         r = misc_register(&kvm_dev);
3232         if (r) {
3233                 printk(KERN_ERR "kvm: misc device register failed\n");
3234                 goto out_free;
3235         }
3236
3237         return r;
3238
3239 out_free:
3240         sysdev_unregister(&kvm_sysdev);
3241 out_free_3:
3242         sysdev_class_unregister(&kvm_sysdev_class);
3243 out_free_2:
3244         unregister_reboot_notifier(&kvm_reboot_notifier);
3245         unregister_cpu_notifier(&kvm_cpu_notifier);
3246 out_free_1:
3247         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
3248         kvm_arch_ops->hardware_unsetup();
3249 out:
3250         kvm_arch_ops = NULL;
3251         return r;
3252 }
3253
3254 void kvm_exit_arch(void)
3255 {
3256         misc_deregister(&kvm_dev);
3257         sysdev_unregister(&kvm_sysdev);
3258         sysdev_class_unregister(&kvm_sysdev_class);
3259         unregister_reboot_notifier(&kvm_reboot_notifier);
3260         unregister_cpu_notifier(&kvm_cpu_notifier);
3261         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
3262         kvm_arch_ops->hardware_unsetup();
3263         kvm_arch_ops = NULL;
3264 }
3265
3266 static __init int kvm_init(void)
3267 {
3268         static struct page *bad_page;
3269         int r;
3270
3271         r = kvm_mmu_module_init();
3272         if (r)
3273                 goto out4;
3274
3275         r = register_filesystem(&kvm_fs_type);
3276         if (r)
3277                 goto out3;
3278
3279         kvmfs_mnt = kern_mount(&kvm_fs_type);
3280         r = PTR_ERR(kvmfs_mnt);
3281         if (IS_ERR(kvmfs_mnt))
3282                 goto out2;
3283         kvm_init_debug();
3284
3285         kvm_init_msr_list();
3286
3287         if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
3288                 r = -ENOMEM;
3289                 goto out;
3290         }
3291
3292         bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
3293         memset(__va(bad_page_address), 0, PAGE_SIZE);
3294
3295         return 0;
3296
3297 out:
3298         kvm_exit_debug();
3299         mntput(kvmfs_mnt);
3300 out2:
3301         unregister_filesystem(&kvm_fs_type);
3302 out3:
3303         kvm_mmu_module_exit();
3304 out4:
3305         return r;
3306 }
3307
3308 static __exit void kvm_exit(void)
3309 {
3310         kvm_exit_debug();
3311         __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3312         mntput(kvmfs_mnt);
3313         unregister_filesystem(&kvm_fs_type);
3314         kvm_mmu_module_exit();
3315 }
3316
3317 module_init(kvm_init)
3318 module_exit(kvm_exit)
3319
3320 EXPORT_SYMBOL_GPL(kvm_init_arch);
3321 EXPORT_SYMBOL_GPL(kvm_exit_arch);