aca14139a680649efd202db303a3f2258373f31b
pandora-kernel.git: drivers/kvm/kvm_main.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #include "kvm.h"
19
20 #include <linux/kvm.h>
21 #include <linux/module.h>
22 #include <linux/errno.h>
23 #include <asm/processor.h>
24 #include <linux/percpu.h>
25 #include <linux/gfp.h>
26 #include <asm/msr.h>
27 #include <linux/mm.h>
28 #include <linux/miscdevice.h>
29 #include <linux/vmalloc.h>
30 #include <asm/uaccess.h>
31 #include <linux/reboot.h>
32 #include <asm/io.h>
33 #include <linux/debugfs.h>
34 #include <linux/highmem.h>
35 #include <linux/file.h>
36 #include <asm/desc.h>
37
38 #include "x86_emulate.h"
39 #include "segment_descriptor.h"
40
41 MODULE_AUTHOR("Qumranet");
42 MODULE_LICENSE("GPL");
43
44 struct kvm_arch_ops *kvm_arch_ops;
45 struct kvm_stat kvm_stat;
46 EXPORT_SYMBOL_GPL(kvm_stat);
47
48 static struct kvm_stats_debugfs_item {
49         const char *name;
50         u32 *data;
51         struct dentry *dentry;
52 } debugfs_entries[] = {
53         { "pf_fixed", &kvm_stat.pf_fixed },
54         { "pf_guest", &kvm_stat.pf_guest },
55         { "tlb_flush", &kvm_stat.tlb_flush },
56         { "invlpg", &kvm_stat.invlpg },
57         { "exits", &kvm_stat.exits },
58         { "io_exits", &kvm_stat.io_exits },
59         { "mmio_exits", &kvm_stat.mmio_exits },
60         { "signal_exits", &kvm_stat.signal_exits },
61         { "irq_window", &kvm_stat.irq_window_exits },
62         { "halt_exits", &kvm_stat.halt_exits },
63         { "request_irq", &kvm_stat.request_irq_exits },
64         { "irq_exits", &kvm_stat.irq_exits },
65         { 0, 0 }
66 };
67
68 static struct dentry *debugfs_dir;
69
70 #define MAX_IO_MSRS 256
71
72 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
73 #define LMSW_GUEST_MASK 0x0eULL
74 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
75 #define CR8_RESEVED_BITS (~0x0fULL)
76 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
77
78 #ifdef CONFIG_X86_64
79 /* LDT or TSS descriptor in the GDT. 16 bytes. */
80 struct segment_descriptor_64 {
81         struct segment_descriptor s;
82         u32 base_higher;
83         u32 pad_zero;
84 };
85
86 #endif
87
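/*
 * Return the base address of the segment named by @selector, read from
 * the live GDT (or from the LDT when the selector's TI bit is set).  On
 * x86_64 an LDT or TSS descriptor is 16 bytes, so the upper 32 bits of
 * the base are taken from the descriptor's second half.
 */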
88 unsigned long segment_base(u16 selector)
89 {
90         struct descriptor_table gdt;
91         struct segment_descriptor *d;
92         unsigned long table_base;
93         typedef unsigned long ul;
94         unsigned long v;
95
96         if (selector == 0)
97                 return 0;
98
99         asm ("sgdt %0" : "=m"(gdt));
100         table_base = gdt.base;
101
102         if (selector & 4) {           /* from ldt */
103                 u16 ldt_selector;
104
105                 asm ("sldt %0" : "=g"(ldt_selector));
106                 table_base = segment_base(ldt_selector);
107         }
108         d = (struct segment_descriptor *)(table_base + (selector & ~7));
109         v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
110 #ifdef CONFIG_X86_64
111         if (d->system == 0
112             && (d->type == 2 || d->type == 9 || d->type == 11))
113                 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
114 #endif
115         return v;
116 }
117 EXPORT_SYMBOL_GPL(segment_base);
118
119 static inline int valid_vcpu(int n)
120 {
121         return likely(n >= 0 && n < KVM_MAX_VCPUS);
122 }
123
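/*
 * Copy @size bytes starting at guest virtual address @addr into the host
 * buffer @dest, one guest page at a time: each page is translated with
 * gva_to_hpa() and temporarily mapped with kmap_atomic().  The copy stops
 * early if a translation fails; the return value is the number of bytes
 * actually copied.  kvm_write_guest() below is the mirror image.
 */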
124 int kvm_read_guest(struct kvm_vcpu *vcpu,
125                              gva_t addr,
126                              unsigned long size,
127                              void *dest)
128 {
129         unsigned char *host_buf = dest;
130         unsigned long req_size = size;
131
132         while (size) {
133                 hpa_t paddr;
134                 unsigned now;
135                 unsigned offset;
136                 hva_t guest_buf;
137
138                 paddr = gva_to_hpa(vcpu, addr);
139
140                 if (is_error_hpa(paddr))
141                         break;
142
143                 guest_buf = (hva_t)kmap_atomic(
144                                         pfn_to_page(paddr >> PAGE_SHIFT),
145                                         KM_USER0);
146                 offset = addr & ~PAGE_MASK;
147                 guest_buf |= offset;
148                 now = min(size, PAGE_SIZE - offset);
149                 memcpy(host_buf, (void*)guest_buf, now);
150                 host_buf += now;
151                 addr += now;
152                 size -= now;
153                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
154         }
155         return req_size - size;
156 }
157 EXPORT_SYMBOL_GPL(kvm_read_guest);
158
159 int kvm_write_guest(struct kvm_vcpu *vcpu,
160                              gva_t addr,
161                              unsigned long size,
162                              void *data)
163 {
164         unsigned char *host_buf = data;
165         unsigned long req_size = size;
166
167         while (size) {
168                 hpa_t paddr;
169                 unsigned now;
170                 unsigned offset;
171                 hva_t guest_buf;
172
173                 paddr = gva_to_hpa(vcpu, addr);
174
175                 if (is_error_hpa(paddr))
176                         break;
177
178                 guest_buf = (hva_t)kmap_atomic(
179                                 pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0);
180                 offset = addr & ~PAGE_MASK;
181                 guest_buf |= offset;
182                 now = min(size, PAGE_SIZE - offset);
183                 memcpy((void*)guest_buf, host_buf, now);
184                 host_buf += now;
185                 addr += now;
186                 size -= now;
187                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
188         }
189         return req_size - size;
190 }
191 EXPORT_SYMBOL_GPL(kvm_write_guest);
192
193 static int vcpu_slot(struct kvm_vcpu *vcpu)
194 {
195         return vcpu - vcpu->kvm->vcpus;
196 }
197
198 /*
199  * Switches to the specified vcpu, until a matching vcpu_put()
200  */
201 static struct kvm_vcpu *vcpu_load(struct kvm *kvm, int vcpu_slot)
202 {
203         struct kvm_vcpu *vcpu = &kvm->vcpus[vcpu_slot];
204
205         mutex_lock(&vcpu->mutex);
206         if (unlikely(!vcpu->vmcs)) {
207                 mutex_unlock(&vcpu->mutex);
208                 return 0;
209         }
210         return kvm_arch_ops->vcpu_load(vcpu);
211 }
212
213 static void vcpu_put(struct kvm_vcpu *vcpu)
214 {
215         kvm_arch_ops->vcpu_put(vcpu);
216         mutex_unlock(&vcpu->mutex);
217 }
218
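/*
 * Every open of the KVM character device creates a new virtual machine:
 * a zeroed struct kvm whose vcpus and memory slots are subsequently
 * configured through ioctls on the same file descriptor.
 */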
219 static int kvm_dev_open(struct inode *inode, struct file *filp)
220 {
221         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
222         int i;
223
224         if (!kvm)
225                 return -ENOMEM;
226
227         spin_lock_init(&kvm->lock);
228         INIT_LIST_HEAD(&kvm->active_mmu_pages);
229         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
230                 struct kvm_vcpu *vcpu = &kvm->vcpus[i];
231
232                 mutex_init(&vcpu->mutex);
233                 vcpu->mmu.root_hpa = INVALID_PAGE;
234                 INIT_LIST_HEAD(&vcpu->free_pages);
235         }
236         filp->private_data = kvm;
237         return 0;
238 }
239
240 /*
241  * Free any memory in @free but not in @dont.
242  */
243 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
244                                   struct kvm_memory_slot *dont)
245 {
246         int i;
247
248         if (!dont || free->phys_mem != dont->phys_mem)
249                 if (free->phys_mem) {
250                         for (i = 0; i < free->npages; ++i)
251                                 if (free->phys_mem[i])
252                                         __free_page(free->phys_mem[i]);
253                         vfree(free->phys_mem);
254                 }
255
256         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
257                 vfree(free->dirty_bitmap);
258
259         free->phys_mem = 0;
260         free->npages = 0;
261         free->dirty_bitmap = 0;
262 }
263
264 static void kvm_free_physmem(struct kvm *kvm)
265 {
266         int i;
267
268         for (i = 0; i < kvm->nmemslots; ++i)
269                 kvm_free_physmem_slot(&kvm->memslots[i], 0);
270 }
271
272 static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
273 {
274         kvm_arch_ops->vcpu_free(vcpu);
275         kvm_mmu_destroy(vcpu);
276 }
277
278 static void kvm_free_vcpus(struct kvm *kvm)
279 {
280         unsigned int i;
281
282         for (i = 0; i < KVM_MAX_VCPUS; ++i)
283                 kvm_free_vcpu(&kvm->vcpus[i]);
284 }
285
286 static int kvm_dev_release(struct inode *inode, struct file *filp)
287 {
288         struct kvm *kvm = filp->private_data;
289
290         kvm_free_vcpus(kvm);
291         kvm_free_physmem(kvm);
292         kfree(kvm);
293         return 0;
294 }
295
296 static void inject_gp(struct kvm_vcpu *vcpu)
297 {
298         kvm_arch_ops->inject_gp(vcpu, 0);
299 }
300
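/*
 * With PAE paging, CR3 points at a 32-byte-aligned block of four page
 * directory pointer entries.  Map the guest page holding that block and
 * check every present entry for reserved bits; returns nonzero if any
 * entry is invalid.
 */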
301 static int pdptrs_have_reserved_bits_set(struct kvm_vcpu *vcpu,
302                                          unsigned long cr3)
303 {
304         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
305         unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5;
306         int i;
307         u64 pdpte;
308         u64 *pdpt;
309         struct kvm_memory_slot *memslot;
310
311         spin_lock(&vcpu->kvm->lock);
312         memslot = gfn_to_memslot(vcpu->kvm, pdpt_gfn);
313         /* FIXME: !memslot - emulate? 0xff? */
314         pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0);
315
316         for (i = 0; i < 4; ++i) {
317                 pdpte = pdpt[offset + i];
318                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
319                         break;
320         }
321
322         kunmap_atomic(pdpt, KM_USER0);
323         spin_unlock(&vcpu->kvm->lock);
324
325         return i != 4;
326 }
327
328 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
329 {
330         if (cr0 & CR0_RESEVED_BITS) {
331                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
332                        cr0, vcpu->cr0);
333                 inject_gp(vcpu);
334                 return;
335         }
336
337         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
338                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
339                 inject_gp(vcpu);
340                 return;
341         }
342
343         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
344                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
345                        "and a clear PE flag\n");
346                 inject_gp(vcpu);
347                 return;
348         }
349
350         if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) {
351 #ifdef CONFIG_X86_64
352                 if ((vcpu->shadow_efer & EFER_LME)) {
353                         int cs_db, cs_l;
354
355                         if (!is_pae(vcpu)) {
356                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
357                                        "in long mode while PAE is disabled\n");
358                                 inject_gp(vcpu);
359                                 return;
360                         }
361                         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
362                         if (cs_l) {
363                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
364                                        "in long mode while CS.L == 1\n");
365                                 inject_gp(vcpu);
366                                 return;
367
368                         }
369                 } else
370 #endif
371                 if (is_pae(vcpu) &&
372                             pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
373                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
374                                "reserved bits\n");
375                         inject_gp(vcpu);
376                         return;
377                 }
378
379         }
380
381         kvm_arch_ops->set_cr0(vcpu, cr0);
382         vcpu->cr0 = cr0;
383
384         spin_lock(&vcpu->kvm->lock);
385         kvm_mmu_reset_context(vcpu);
386         spin_unlock(&vcpu->kvm->lock);
387         return;
388 }
389 EXPORT_SYMBOL_GPL(set_cr0);
390
391 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
392 {
393         set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
394 }
395 EXPORT_SYMBOL_GPL(lmsw);
396
397 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
398 {
399         if (cr4 & CR4_RESEVED_BITS) {
400                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
401                 inject_gp(vcpu);
402                 return;
403         }
404
405         if (is_long_mode(vcpu)) {
406                 if (!(cr4 & CR4_PAE_MASK)) {
407                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
408                                "in long mode\n");
409                         inject_gp(vcpu);
410                         return;
411                 }
412         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK)
413                    && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
414                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
415                 inject_gp(vcpu);
416         }
417
418         if (cr4 & CR4_VMXE_MASK) {
419                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
420                 inject_gp(vcpu);
421                 return;
422         }
423         kvm_arch_ops->set_cr4(vcpu, cr4);
424         spin_lock(&vcpu->kvm->lock);
425         kvm_mmu_reset_context(vcpu);
426         spin_unlock(&vcpu->kvm->lock);
427 }
428 EXPORT_SYMBOL_GPL(set_cr4);
429
430 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
431 {
432         if (is_long_mode(vcpu)) {
433                 if ( cr3 & CR3_L_MODE_RESEVED_BITS) {
434                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
435                         inject_gp(vcpu);
436                         return;
437                 }
438         } else {
439                 if (cr3 & CR3_RESEVED_BITS) {
440                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
441                         inject_gp(vcpu);
442                         return;
443                 }
444                 if (is_paging(vcpu) && is_pae(vcpu) &&
445                     pdptrs_have_reserved_bits_set(vcpu, cr3)) {
446                         printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
447                                "reserved bits\n");
448                         inject_gp(vcpu);
449                         return;
450                 }
451         }
452
453         vcpu->cr3 = cr3;
454         spin_lock(&vcpu->kvm->lock);
455         vcpu->mmu.new_cr3(vcpu);
456         spin_unlock(&vcpu->kvm->lock);
457 }
458 EXPORT_SYMBOL_GPL(set_cr3);
459
460 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
461 {
462         if ( cr8 & CR8_RESEVED_BITS) {
463                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
464                 inject_gp(vcpu);
465                 return;
466         }
467         vcpu->cr8 = cr8;
468 }
469 EXPORT_SYMBOL_GPL(set_cr8);
470
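/*
 * Build the initial FPU/SSE state images: save the host state, let
 * fpu_init() produce a freshly initialized state that is captured as the
 * guest image, then restore the host state.  MXCSR in the guest image is
 * forced to its reset value (0x1f80) and the remainder of the save area
 * is cleared.
 */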
471 void fx_init(struct kvm_vcpu *vcpu)
472 {
473         struct __attribute__ ((__packed__)) fx_image_s {
474                 u16 control; //fcw
475                 u16 status; //fsw
476                 u16 tag; // ftw
477                 u16 opcode; //fop
478                 u64 ip; // fpu ip
479                 u64 operand;// fpu dp
480                 u32 mxcsr;
481                 u32 mxcsr_mask;
482
483         } *fx_image;
484
485         fx_save(vcpu->host_fx_image);
486         fpu_init();
487         fx_save(vcpu->guest_fx_image);
488         fx_restore(vcpu->host_fx_image);
489
490         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
491         fx_image->mxcsr = 0x1f80;
492         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
493                0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
494 }
495 EXPORT_SYMBOL_GPL(fx_init);
496
497 /*
498  * Creates some virtual cpus.  Good luck creating more than one.
499  */
500 static int kvm_dev_ioctl_create_vcpu(struct kvm *kvm, int n)
501 {
502         int r;
503         struct kvm_vcpu *vcpu;
504
505         r = -EINVAL;
506         if (!valid_vcpu(n))
507                 goto out;
508
509         vcpu = &kvm->vcpus[n];
510
511         mutex_lock(&vcpu->mutex);
512
513         if (vcpu->vmcs) {
514                 mutex_unlock(&vcpu->mutex);
515                 return -EEXIST;
516         }
517
518         vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf,
519                                            FX_IMAGE_ALIGN);
520         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
521
522         vcpu->cpu = -1;  /* First load will set up TR */
523         vcpu->kvm = kvm;
524         r = kvm_arch_ops->vcpu_create(vcpu);
525         if (r < 0)
526                 goto out_free_vcpus;
527
528         r = kvm_mmu_create(vcpu);
529         if (r < 0)
530                 goto out_free_vcpus;
531
532         kvm_arch_ops->vcpu_load(vcpu);
533         r = kvm_mmu_setup(vcpu);
534         if (r >= 0)
535                 r = kvm_arch_ops->vcpu_setup(vcpu);
536         vcpu_put(vcpu);
537
538         if (r < 0)
539                 goto out_free_vcpus;
540
541         return 0;
542
543 out_free_vcpus:
544         kvm_free_vcpu(vcpu);
545         mutex_unlock(&vcpu->mutex);
546 out:
547         return r;
548 }
549
550 /*
551  * Allocate some memory and give it an address in the guest physical address
552  * space.
553  *
554  * Discontiguous memory is allowed, mostly for framebuffers.
555  */
556 static int kvm_dev_ioctl_set_memory_region(struct kvm *kvm,
557                                            struct kvm_memory_region *mem)
558 {
559         int r;
560         gfn_t base_gfn;
561         unsigned long npages;
562         unsigned long i;
563         struct kvm_memory_slot *memslot;
564         struct kvm_memory_slot old, new;
565         int memory_config_version;
566
567         r = -EINVAL;
568         /* General sanity checks */
569         if (mem->memory_size & (PAGE_SIZE - 1))
570                 goto out;
571         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
572                 goto out;
573         if (mem->slot >= KVM_MEMORY_SLOTS)
574                 goto out;
575         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
576                 goto out;
577
578         memslot = &kvm->memslots[mem->slot];
579         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
580         npages = mem->memory_size >> PAGE_SHIFT;
581
582         if (!npages)
583                 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
584
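        /*
         * The update is done in two passes: snapshot the slot under the
         * lock, drop the lock for the (possibly large) allocations, then
         * retake it and compare memory_config_version.  If another update
         * slipped in meanwhile, free the new allocation and retry here.
         */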
585 raced:
586         spin_lock(&kvm->lock);
587
588         memory_config_version = kvm->memory_config_version;
589         new = old = *memslot;
590
591         new.base_gfn = base_gfn;
592         new.npages = npages;
593         new.flags = mem->flags;
594
595         /* Disallow changing a memory slot's size. */
596         r = -EINVAL;
597         if (npages && old.npages && npages != old.npages)
598                 goto out_unlock;
599
600         /* Check for overlaps */
601         r = -EEXIST;
602         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
603                 struct kvm_memory_slot *s = &kvm->memslots[i];
604
605                 if (s == memslot)
606                         continue;
607                 if (!((base_gfn + npages <= s->base_gfn) ||
608                       (base_gfn >= s->base_gfn + s->npages)))
609                         goto out_unlock;
610         }
611         /*
612          * Do memory allocations outside lock.  memory_config_version will
613          * detect any races.
614          */
615         spin_unlock(&kvm->lock);
616
617         /* Deallocate if slot is being removed */
618         if (!npages)
619                 new.phys_mem = 0;
620
621         /* Free page dirty bitmap if unneeded */
622         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
623                 new.dirty_bitmap = 0;
624
625         r = -ENOMEM;
626
627         /* Allocate if a slot is being created */
628         if (npages && !new.phys_mem) {
629                 new.phys_mem = vmalloc(npages * sizeof(struct page *));
630
631                 if (!new.phys_mem)
632                         goto out_free;
633
634                 memset(new.phys_mem, 0, npages * sizeof(struct page *));
635                 for (i = 0; i < npages; ++i) {
636                         new.phys_mem[i] = alloc_page(GFP_HIGHUSER
637                                                      | __GFP_ZERO);
638                         if (!new.phys_mem[i])
639                                 goto out_free;
640                 }
641         }
642
643         /* Allocate page dirty bitmap if needed */
644         if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
645                 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
646
647                 new.dirty_bitmap = vmalloc(dirty_bytes);
648                 if (!new.dirty_bitmap)
649                         goto out_free;
650                 memset(new.dirty_bitmap, 0, dirty_bytes);
651         }
652
653         spin_lock(&kvm->lock);
654
655         if (memory_config_version != kvm->memory_config_version) {
656                 spin_unlock(&kvm->lock);
657                 kvm_free_physmem_slot(&new, &old);
658                 goto raced;
659         }
660
661         r = -EAGAIN;
662         if (kvm->busy)
663                 goto out_unlock;
664
665         if (mem->slot >= kvm->nmemslots)
666                 kvm->nmemslots = mem->slot + 1;
667
668         *memslot = new;
669         ++kvm->memory_config_version;
670
671         spin_unlock(&kvm->lock);
672
673         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
674                 struct kvm_vcpu *vcpu;
675
676                 vcpu = vcpu_load(kvm, i);
677                 if (!vcpu)
678                         continue;
679                 kvm_mmu_reset_context(vcpu);
680                 vcpu_put(vcpu);
681         }
682
683         kvm_free_physmem_slot(&old, &new);
684         return 0;
685
686 out_unlock:
687         spin_unlock(&kvm->lock);
688 out_free:
689         kvm_free_physmem_slot(&new, &old);
690 out:
691         return r;
692 }
693
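/*
 * A minimal sketch of how userspace populates guest memory through the
 * path above, assuming the KVM_SET_MEMORY_REGION ioctl from <linux/kvm.h>
 * dispatches to kvm_dev_ioctl_set_memory_region() (that case is outside
 * this excerpt); "kvm_fd" is a placeholder for the open device fd:
 *
 *	struct kvm_memory_region mem = {
 *		.slot            = 0,
 *		.flags           = KVM_MEM_LOG_DIRTY_PAGES,
 *		.guest_phys_addr = 0,
 *		.memory_size     = 16 * 1024 * 1024,
 *	};
 *
 *	if (ioctl(kvm_fd, KVM_SET_MEMORY_REGION, &mem) < 0)
 *		perror("KVM_SET_MEMORY_REGION");
 */
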
694 /*
695  * Get (and clear) the dirty memory log for a memory slot.
696  */
697 static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm,
698                                        struct kvm_dirty_log *log)
699 {
700         struct kvm_memory_slot *memslot;
701         int r, i;
702         int n;
703         unsigned long any = 0;
704
705         spin_lock(&kvm->lock);
706
707         /*
708          * Prevent changes to guest memory configuration even while the lock
709          * is not taken.
710          */
711         ++kvm->busy;
712         spin_unlock(&kvm->lock);
713         r = -EINVAL;
714         if (log->slot >= KVM_MEMORY_SLOTS)
715                 goto out;
716
717         memslot = &kvm->memslots[log->slot];
718         r = -ENOENT;
719         if (!memslot->dirty_bitmap)
720                 goto out;
721
722         n = ALIGN(memslot->npages, 8) / 8;
723
724         for (i = 0; !any && i < n; ++i)
725                 any = memslot->dirty_bitmap[i];
726
727         r = -EFAULT;
728         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
729                 goto out;
730
731
732         if (any) {
733                 spin_lock(&kvm->lock);
734                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
735                 spin_unlock(&kvm->lock);
736                 memset(memslot->dirty_bitmap, 0, n);
737                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
738                         struct kvm_vcpu *vcpu = vcpu_load(kvm, i);
739
740                         if (!vcpu)
741                                 continue;
742                         kvm_arch_ops->tlb_flush(vcpu);
743                         vcpu_put(vcpu);
744                 }
745         }
746
747         r = 0;
748
749 out:
750         spin_lock(&kvm->lock);
751         --kvm->busy;
752         spin_unlock(&kvm->lock);
753         return r;
754 }
755
756 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
757 {
758         int i;
759
760         for (i = 0; i < kvm->nmemslots; ++i) {
761                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
762
763                 if (gfn >= memslot->base_gfn
764                     && gfn < memslot->base_gfn + memslot->npages)
765                         return memslot;
766         }
767         return 0;
768 }
769 EXPORT_SYMBOL_GPL(gfn_to_memslot);
770
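/*
 * Note that the page containing @gfn has been written: set the matching
 * bit in its slot's dirty bitmap, if dirty logging is enabled for that
 * slot.  test_bit() is checked first so the common already-dirty case
 * avoids an atomic read-modify-write.
 */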
771 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
772 {
773         int i;
774         struct kvm_memory_slot *memslot = 0;
775         unsigned long rel_gfn;
776
777         for (i = 0; i < kvm->nmemslots; ++i) {
778                 memslot = &kvm->memslots[i];
779
780                 if (gfn >= memslot->base_gfn
781                     && gfn < memslot->base_gfn + memslot->npages) {
782
783                         if (!memslot || !memslot->dirty_bitmap)
784                                 return;
785
786                         rel_gfn = gfn - memslot->base_gfn;
787
788                         /* avoid RMW */
789                         if (!test_bit(rel_gfn, memslot->dirty_bitmap))
790                                 set_bit(rel_gfn, memslot->dirty_bitmap);
791                         return;
792                 }
793         }
794 }
795
796 static int emulator_read_std(unsigned long addr,
797                              unsigned long *val,
798                              unsigned int bytes,
799                              struct x86_emulate_ctxt *ctxt)
800 {
801         struct kvm_vcpu *vcpu = ctxt->vcpu;
802         void *data = val;
803
804         while (bytes) {
805                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
806                 unsigned offset = addr & (PAGE_SIZE-1);
807                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
808                 unsigned long pfn;
809                 struct kvm_memory_slot *memslot;
810                 void *page;
811
812                 if (gpa == UNMAPPED_GVA)
813                         return X86EMUL_PROPAGATE_FAULT;
814                 pfn = gpa >> PAGE_SHIFT;
815                 memslot = gfn_to_memslot(vcpu->kvm, pfn);
816                 if (!memslot)
817                         return X86EMUL_UNHANDLEABLE;
818                 page = kmap_atomic(gfn_to_page(memslot, pfn), KM_USER0);
819
820                 memcpy(data, page + offset, tocopy);
821
822                 kunmap_atomic(page, KM_USER0);
823
824                 bytes -= tocopy;
825                 data += tocopy;
826                 addr += tocopy;
827         }
828
829         return X86EMUL_CONTINUE;
830 }
831
832 static int emulator_write_std(unsigned long addr,
833                               unsigned long val,
834                               unsigned int bytes,
835                               struct x86_emulate_ctxt *ctxt)
836 {
837         printk(KERN_ERR "emulator_write_std: addr %lx n %d\n",
838                addr, bytes);
839         return X86EMUL_UNHANDLEABLE;
840 }
841
842 static int emulator_read_emulated(unsigned long addr,
843                                   unsigned long *val,
844                                   unsigned int bytes,
845                                   struct x86_emulate_ctxt *ctxt)
846 {
847         struct kvm_vcpu *vcpu = ctxt->vcpu;
848
849         if (vcpu->mmio_read_completed) {
850                 memcpy(val, vcpu->mmio_data, bytes);
851                 vcpu->mmio_read_completed = 0;
852                 return X86EMUL_CONTINUE;
853         } else if (emulator_read_std(addr, val, bytes, ctxt)
854                    == X86EMUL_CONTINUE)
855                 return X86EMUL_CONTINUE;
856         else {
857                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
858                 if (gpa == UNMAPPED_GVA)
859                         return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
860                 vcpu->mmio_needed = 1;
861                 vcpu->mmio_phys_addr = gpa;
862                 vcpu->mmio_size = bytes;
863                 vcpu->mmio_is_write = 0;
864
865                 return X86EMUL_UNHANDLEABLE;
866         }
867 }
868
869 static int emulator_write_emulated(unsigned long addr,
870                                    unsigned long val,
871                                    unsigned int bytes,
872                                    struct x86_emulate_ctxt *ctxt)
873 {
874         struct kvm_vcpu *vcpu = ctxt->vcpu;
875         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
876
877         if (gpa == UNMAPPED_GVA)
878                 return X86EMUL_PROPAGATE_FAULT;
879
880         vcpu->mmio_needed = 1;
881         vcpu->mmio_phys_addr = gpa;
882         vcpu->mmio_size = bytes;
883         vcpu->mmio_is_write = 1;
884         memcpy(vcpu->mmio_data, &val, bytes);
885
886         return X86EMUL_CONTINUE;
887 }
888
889 static int emulator_cmpxchg_emulated(unsigned long addr,
890                                      unsigned long old,
891                                      unsigned long new,
892                                      unsigned int bytes,
893                                      struct x86_emulate_ctxt *ctxt)
894 {
895         static int reported;
896
897         if (!reported) {
898                 reported = 1;
899                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
900         }
901         return emulator_write_emulated(addr, new, bytes, ctxt);
902 }
903
904 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
905 {
906         return kvm_arch_ops->get_segment_base(vcpu, seg);
907 }
908
909 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
910 {
911         spin_lock(&vcpu->kvm->lock);
912         vcpu->mmu.inval_page(vcpu, address);
913         spin_unlock(&vcpu->kvm->lock);
914         kvm_arch_ops->invlpg(vcpu, address);
915         return X86EMUL_CONTINUE;
916 }
917
918 int emulate_clts(struct kvm_vcpu *vcpu)
919 {
920         unsigned long cr0 = vcpu->cr0;
921
922         cr0 &= ~CR0_TS_MASK;
923         kvm_arch_ops->set_cr0(vcpu, cr0);
924         return X86EMUL_CONTINUE;
925 }
926
927 int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest)
928 {
929         struct kvm_vcpu *vcpu = ctxt->vcpu;
930
931         switch (dr) {
932         case 0 ... 3:
933                 *dest = kvm_arch_ops->get_dr(vcpu, dr);
934                 return X86EMUL_CONTINUE;
935         default:
936                 printk(KERN_DEBUG "%s: unexpected dr %u\n",
937                        __FUNCTION__, dr);
938                 return X86EMUL_UNHANDLEABLE;
939         }
940 }
941
942 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
943 {
944         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
945         int exception;
946
947         kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
948         if (exception) {
949                 /* FIXME: better handling */
950                 return X86EMUL_UNHANDLEABLE;
951         }
952         return X86EMUL_CONTINUE;
953 }
954
955 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
956 {
957         static int reported;
958         u8 opcodes[4];
959         unsigned long rip = ctxt->vcpu->rip;
960         unsigned long rip_linear;
961
962         rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS);
963
964         if (reported)
965                 return;
966
967         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
968
969         printk(KERN_ERR "emulation failed but !mmio_needed?"
970                " rip %lx %02x %02x %02x %02x\n",
971                rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
972         reported = 1;
973 }
974
975 struct x86_emulate_ops emulate_ops = {
976         .read_std            = emulator_read_std,
977         .write_std           = emulator_write_std,
978         .read_emulated       = emulator_read_emulated,
979         .write_emulated      = emulator_write_emulated,
980         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
981 };
982
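/*
 * Run the x86 emulator on the instruction the guest just faulted on.
 * The emulation context (segment bases, and 16/32/64-bit mode derived
 * from EFLAGS.VM and the CS descriptor's L and D bits) is rebuilt from
 * the current vcpu state.  If the access turns out to be MMIO, the
 * request is copied into @run and EMULATE_DO_MMIO tells the caller to
 * exit to userspace; otherwise EMULATE_DONE or EMULATE_FAIL is returned.
 */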
983 int emulate_instruction(struct kvm_vcpu *vcpu,
984                         struct kvm_run *run,
985                         unsigned long cr2,
986                         u16 error_code)
987 {
988         struct x86_emulate_ctxt emulate_ctxt;
989         int r;
990         int cs_db, cs_l;
991
992         kvm_arch_ops->cache_regs(vcpu);
993
994         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
995
996         emulate_ctxt.vcpu = vcpu;
997         emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu);
998         emulate_ctxt.cr2 = cr2;
999         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1000                 ? X86EMUL_MODE_REAL : cs_l
1001                 ? X86EMUL_MODE_PROT64 : cs_db
1002                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1003
1004         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1005                 emulate_ctxt.cs_base = 0;
1006                 emulate_ctxt.ds_base = 0;
1007                 emulate_ctxt.es_base = 0;
1008                 emulate_ctxt.ss_base = 0;
1009         } else {
1010                 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
1011                 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
1012                 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
1013                 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
1014         }
1015
1016         emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
1017         emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1018
1019         vcpu->mmio_is_write = 0;
1020         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1021
1022         if ((r || vcpu->mmio_is_write) && run) {
1023                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1024                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1025                 run->mmio.len = vcpu->mmio_size;
1026                 run->mmio.is_write = vcpu->mmio_is_write;
1027         }
1028
1029         if (r) {
1030                 if (!vcpu->mmio_needed) {
1031                         report_emulation_failure(&emulate_ctxt);
1032                         return EMULATE_FAIL;
1033                 }
1034                 return EMULATE_DO_MMIO;
1035         }
1036
1037         kvm_arch_ops->decache_regs(vcpu);
1038         kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1039
1040         if (vcpu->mmio_is_write)
1041                 return EMULATE_DO_MMIO;
1042
1043         return EMULATE_DONE;
1044 }
1045 EXPORT_SYMBOL_GPL(emulate_instruction);
1046
1047 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1048 {
1049         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1050 }
1051
1052 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1053 {
1054         struct descriptor_table dt = { limit, base };
1055
1056         kvm_arch_ops->set_gdt(vcpu, &dt);
1057 }
1058
1059 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1060 {
1061         struct descriptor_table dt = { limit, base };
1062
1063         kvm_arch_ops->set_idt(vcpu, &dt);
1064 }
1065
1066 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1067                    unsigned long *rflags)
1068 {
1069         lmsw(vcpu, msw);
1070         *rflags = kvm_arch_ops->get_rflags(vcpu);
1071 }
1072
1073 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1074 {
1075         switch (cr) {
1076         case 0:
1077                 return vcpu->cr0;
1078         case 2:
1079                 return vcpu->cr2;
1080         case 3:
1081                 return vcpu->cr3;
1082         case 4:
1083                 return vcpu->cr4;
1084         default:
1085                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1086                 return 0;
1087         }
1088 }
1089
1090 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1091                      unsigned long *rflags)
1092 {
1093         switch (cr) {
1094         case 0:
1095                 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1096                 *rflags = kvm_arch_ops->get_rflags(vcpu);
1097                 break;
1098         case 2:
1099                 vcpu->cr2 = val;
1100                 break;
1101         case 3:
1102                 set_cr3(vcpu, val);
1103                 break;
1104         case 4:
1105                 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1106                 break;
1107         default:
1108                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1109         }
1110 }
1111
1112 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1113 {
1114         u64 data;
1115
1116         switch (msr) {
1117         case 0xc0010010: /* SYSCFG */
1118         case 0xc0010015: /* HWCR */
1119         case MSR_IA32_PLATFORM_ID:
1120         case MSR_IA32_P5_MC_ADDR:
1121         case MSR_IA32_P5_MC_TYPE:
1122         case MSR_IA32_MC0_CTL:
1123         case MSR_IA32_MCG_STATUS:
1124         case MSR_IA32_MCG_CAP:
1125         case MSR_IA32_MC0_MISC:
1126         case MSR_IA32_MC0_MISC+4:
1127         case MSR_IA32_MC0_MISC+8:
1128         case MSR_IA32_MC0_MISC+12:
1129         case MSR_IA32_MC0_MISC+16:
1130         case MSR_IA32_UCODE_REV:
1131         case MSR_IA32_PERF_STATUS:
1132                 /* MTRR registers */
1133         case 0xfe:
1134         case 0x200 ... 0x2ff:
1135                 data = 0;
1136                 break;
1137         case 0xcd: /* fsb frequency */
1138                 data = 3;
1139                 break;
1140         case MSR_IA32_APICBASE:
1141                 data = vcpu->apic_base;
1142                 break;
1143 #ifdef CONFIG_X86_64
1144         case MSR_EFER:
1145                 data = vcpu->shadow_efer;
1146                 break;
1147 #endif
1148         default:
1149                 printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", msr);
1150                 return 1;
1151         }
1152         *pdata = data;
1153         return 0;
1154 }
1155 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1156
1157 /*
1158  * Reads an msr value (of 'msr_index') into 'pdata'.
1159  * Returns 0 on success, non-0 otherwise.
1160  * Assumes vcpu_load() was already called.
1161  */
1162 static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1163 {
1164         return kvm_arch_ops->get_msr(vcpu, msr_index, pdata);
1165 }
1166
1167 #ifdef CONFIG_X86_64
1168
1169 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1170 {
1171         if (efer & EFER_RESERVED_BITS) {
1172                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1173                        efer);
1174                 inject_gp(vcpu);
1175                 return;
1176         }
1177
1178         if (is_paging(vcpu)
1179             && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1180                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1181                 inject_gp(vcpu);
1182                 return;
1183         }
1184
1185         kvm_arch_ops->set_efer(vcpu, efer);
1186
1187         efer &= ~EFER_LMA;
1188         efer |= vcpu->shadow_efer & EFER_LMA;
1189
1190         vcpu->shadow_efer = efer;
1191 }
1192
1193 #endif
1194
1195 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1196 {
1197         switch (msr) {
1198 #ifdef CONFIG_X86_64
1199         case MSR_EFER:
1200                 set_efer(vcpu, data);
1201                 break;
1202 #endif
1203         case MSR_IA32_MC0_STATUS:
1204                 printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1205                        __FUNCTION__, data);
1206                 break;
1207         case MSR_IA32_UCODE_REV:
1208         case MSR_IA32_UCODE_WRITE:
1209         case 0x200 ... 0x2ff: /* MTRRs */
1210                 break;
1211         case MSR_IA32_APICBASE:
1212                 vcpu->apic_base = data;
1213                 break;
1214         default:
1215                 printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr);
1216                 return 1;
1217         }
1218         return 0;
1219 }
1220 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1221
1222 /*
1223  * Writes msr value into the appropriate "register".
1224  * Returns 0 on success, non-0 otherwise.
1225  * Assumes vcpu_load() was already called.
1226  */
1227 static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1228 {
1229         return kvm_arch_ops->set_msr(vcpu, msr_index, data);
1230 }
1231
1232 void kvm_resched(struct kvm_vcpu *vcpu)
1233 {
1234         vcpu_put(vcpu);
1235         cond_resched();
1236         /* Cannot fail -  no vcpu unplug yet. */
1237         vcpu_load(vcpu->kvm, vcpu_slot(vcpu));
1238 }
1239 EXPORT_SYMBOL_GPL(kvm_resched);
1240
1241 void load_msrs(struct vmx_msr_entry *e, int n)
1242 {
1243         int i;
1244
1245         for (i = 0; i < n; ++i)
1246                 wrmsrl(e[i].index, e[i].data);
1247 }
1248 EXPORT_SYMBOL_GPL(load_msrs);
1249
1250 void save_msrs(struct vmx_msr_entry *e, int n)
1251 {
1252         int i;
1253
1254         for (i = 0; i < n; ++i)
1255                 rdmsrl(e[i].index, e[i].data);
1256 }
1257 EXPORT_SYMBOL_GPL(save_msrs);
1258
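/*
 * KVM_RUN: re-enter the guest.  First complete whatever the previous exit
 * asked userspace to do - skip an instruction that userspace reports it
 * has emulated, or deliver the data of a completed MMIO read - then hand
 * off to the arch-specific run loop.
 */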
1259 static int kvm_dev_ioctl_run(struct kvm *kvm, struct kvm_run *kvm_run)
1260 {
1261         struct kvm_vcpu *vcpu;
1262         int r;
1263
1264         if (!valid_vcpu(kvm_run->vcpu))
1265                 return -EINVAL;
1266
1267         vcpu = vcpu_load(kvm, kvm_run->vcpu);
1268         if (!vcpu)
1269                 return -ENOENT;
1270
1271         if (kvm_run->emulated) {
1272                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1273                 kvm_run->emulated = 0;
1274         }
1275
1276         if (kvm_run->mmio_completed) {
1277                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
1278                 vcpu->mmio_read_completed = 1;
1279         }
1280
1281         vcpu->mmio_needed = 0;
1282
1283         r = kvm_arch_ops->run(vcpu, kvm_run);
1284
1285         vcpu_put(vcpu);
1286         return r;
1287 }
1288
1289 static int kvm_dev_ioctl_get_regs(struct kvm *kvm, struct kvm_regs *regs)
1290 {
1291         struct kvm_vcpu *vcpu;
1292
1293         if (!valid_vcpu(regs->vcpu))
1294                 return -EINVAL;
1295
1296         vcpu = vcpu_load(kvm, regs->vcpu);
1297         if (!vcpu)
1298                 return -ENOENT;
1299
1300         kvm_arch_ops->cache_regs(vcpu);
1301
1302         regs->rax = vcpu->regs[VCPU_REGS_RAX];
1303         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
1304         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
1305         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
1306         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
1307         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
1308         regs->rsp = vcpu->regs[VCPU_REGS_RSP];
1309         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
1310 #ifdef CONFIG_X86_64
1311         regs->r8 = vcpu->regs[VCPU_REGS_R8];
1312         regs->r9 = vcpu->regs[VCPU_REGS_R9];
1313         regs->r10 = vcpu->regs[VCPU_REGS_R10];
1314         regs->r11 = vcpu->regs[VCPU_REGS_R11];
1315         regs->r12 = vcpu->regs[VCPU_REGS_R12];
1316         regs->r13 = vcpu->regs[VCPU_REGS_R13];
1317         regs->r14 = vcpu->regs[VCPU_REGS_R14];
1318         regs->r15 = vcpu->regs[VCPU_REGS_R15];
1319 #endif
1320
1321         regs->rip = vcpu->rip;
1322         regs->rflags = kvm_arch_ops->get_rflags(vcpu);
1323
1324         /*
1325          * Don't leak debug flags in case they were set for guest debugging
1326          */
1327         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
1328                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1329
1330         vcpu_put(vcpu);
1331
1332         return 0;
1333 }
1334
1335 static int kvm_dev_ioctl_set_regs(struct kvm *kvm, struct kvm_regs *regs)
1336 {
1337         struct kvm_vcpu *vcpu;
1338
1339         if (!valid_vcpu(regs->vcpu))
1340                 return -EINVAL;
1341
1342         vcpu = vcpu_load(kvm, regs->vcpu);
1343         if (!vcpu)
1344                 return -ENOENT;
1345
1346         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
1347         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
1348         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
1349         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
1350         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
1351         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
1352         vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
1353         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
1354 #ifdef CONFIG_X86_64
1355         vcpu->regs[VCPU_REGS_R8] = regs->r8;
1356         vcpu->regs[VCPU_REGS_R9] = regs->r9;
1357         vcpu->regs[VCPU_REGS_R10] = regs->r10;
1358         vcpu->regs[VCPU_REGS_R11] = regs->r11;
1359         vcpu->regs[VCPU_REGS_R12] = regs->r12;
1360         vcpu->regs[VCPU_REGS_R13] = regs->r13;
1361         vcpu->regs[VCPU_REGS_R14] = regs->r14;
1362         vcpu->regs[VCPU_REGS_R15] = regs->r15;
1363 #endif
1364
1365         vcpu->rip = regs->rip;
1366         kvm_arch_ops->set_rflags(vcpu, regs->rflags);
1367
1368         kvm_arch_ops->decache_regs(vcpu);
1369
1370         vcpu_put(vcpu);
1371
1372         return 0;
1373 }
1374
1375 static void get_segment(struct kvm_vcpu *vcpu,
1376                         struct kvm_segment *var, int seg)
1377 {
1378         return kvm_arch_ops->get_segment(vcpu, var, seg);
1379 }
1380
1381 static int kvm_dev_ioctl_get_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
1382 {
1383         struct kvm_vcpu *vcpu;
1384         struct descriptor_table dt;
1385
1386         if (!valid_vcpu(sregs->vcpu))
1387                 return -EINVAL;
1388         vcpu = vcpu_load(kvm, sregs->vcpu);
1389         if (!vcpu)
1390                 return -ENOENT;
1391
1392         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1393         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1394         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1395         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1396         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1397         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1398
1399         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1400         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1401
1402         kvm_arch_ops->get_idt(vcpu, &dt);
1403         sregs->idt.limit = dt.limit;
1404         sregs->idt.base = dt.base;
1405         kvm_arch_ops->get_gdt(vcpu, &dt);
1406         sregs->gdt.limit = dt.limit;
1407         sregs->gdt.base = dt.base;
1408
1409         sregs->cr0 = vcpu->cr0;
1410         sregs->cr2 = vcpu->cr2;
1411         sregs->cr3 = vcpu->cr3;
1412         sregs->cr4 = vcpu->cr4;
1413         sregs->cr8 = vcpu->cr8;
1414         sregs->efer = vcpu->shadow_efer;
1415         sregs->apic_base = vcpu->apic_base;
1416
1417         memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
1418                sizeof sregs->interrupt_bitmap);
1419
1420         vcpu_put(vcpu);
1421
1422         return 0;
1423 }
1424
1425 static void set_segment(struct kvm_vcpu *vcpu,
1426                         struct kvm_segment *var, int seg)
1427 {
1428         return kvm_arch_ops->set_segment(vcpu, var, seg);
1429 }
1430
1431 static int kvm_dev_ioctl_set_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
1432 {
1433         struct kvm_vcpu *vcpu;
1434         int mmu_reset_needed = 0;
1435         int i;
1436         struct descriptor_table dt;
1437
1438         if (!valid_vcpu(sregs->vcpu))
1439                 return -EINVAL;
1440         vcpu = vcpu_load(kvm, sregs->vcpu);
1441         if (!vcpu)
1442                 return -ENOENT;
1443
1444         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1445         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1446         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1447         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1448         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1449         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1450
1451         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1452         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1453
1454         dt.limit = sregs->idt.limit;
1455         dt.base = sregs->idt.base;
1456         kvm_arch_ops->set_idt(vcpu, &dt);
1457         dt.limit = sregs->gdt.limit;
1458         dt.base = sregs->gdt.base;
1459         kvm_arch_ops->set_gdt(vcpu, &dt);
1460
1461         vcpu->cr2 = sregs->cr2;
1462         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
1463         vcpu->cr3 = sregs->cr3;
1464
1465         vcpu->cr8 = sregs->cr8;
1466
1467         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
1468 #ifdef CONFIG_X86_64
1469         kvm_arch_ops->set_efer(vcpu, sregs->efer);
1470 #endif
1471         vcpu->apic_base = sregs->apic_base;
1472
1473         mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
1474         kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0);
1475
1476         mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
1477         kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
1478
1479         if (mmu_reset_needed)
1480                 kvm_mmu_reset_context(vcpu);
1481
1482         memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
1483                sizeof vcpu->irq_pending);
1484         vcpu->irq_summary = 0;
1485         for (i = 0; i < NR_IRQ_WORDS; ++i)
1486                 if (vcpu->irq_pending[i])
1487                         __set_bit(i, &vcpu->irq_summary);
1488
1489         vcpu_put(vcpu);
1490
1491         return 0;
1492 }
1493
1494 /*
1495  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
1496  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
1497  *
1498  * This list is modified at module load time to reflect the
1499  * capabilities of the host cpu.
1500  */
1501 static u32 msrs_to_save[] = {
1502         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1503         MSR_K6_STAR,
1504 #ifdef CONFIG_X86_64
1505         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1506 #endif
1507         MSR_IA32_TIME_STAMP_COUNTER,
1508 };
1509
1510 static unsigned num_msrs_to_save;
1511
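/*
 * Probe each MSR in msrs_to_save with rdmsr_safe() and compact the array
 * down to the ones this host actually implements; num_msrs_to_save holds
 * the resulting count.
 */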
1512 static __init void kvm_init_msr_list(void)
1513 {
1514         u32 dummy[2];
1515         unsigned i, j;
1516
1517         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1518                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1519                         continue;
1520                 if (j < i)
1521                         msrs_to_save[j] = msrs_to_save[i];
1522                 j++;
1523         }
1524         num_msrs_to_save = j;
1525 }
1526
1527 /*
1528  * Adapt set_msr() to msr_io()'s calling convention
1529  */
1530 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1531 {
1532         return set_msr(vcpu, index, *data);
1533 }
1534
1535 /*
1536  * Read or write a bunch of msrs. All parameters are kernel addresses.
1537  *
1538  * @return number of msrs set successfully.
1539  */
1540 static int __msr_io(struct kvm *kvm, struct kvm_msrs *msrs,
1541                     struct kvm_msr_entry *entries,
1542                     int (*do_msr)(struct kvm_vcpu *vcpu,
1543                                   unsigned index, u64 *data))
1544 {
1545         struct kvm_vcpu *vcpu;
1546         int i;
1547
1548         if (!valid_vcpu(msrs->vcpu))
1549                 return -EINVAL;
1550
1551         vcpu = vcpu_load(kvm, msrs->vcpu);
1552         if (!vcpu)
1553                 return -ENOENT;
1554
1555         for (i = 0; i < msrs->nmsrs; ++i)
1556                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1557                         break;
1558
1559         vcpu_put(vcpu);
1560
1561         return i;
1562 }
1563
1564 /*
1565  * Read or write a bunch of msrs. Parameters are user addresses.
1566  *
1567  * @return number of msrs set successfully.
1568  */
1569 static int msr_io(struct kvm *kvm, struct kvm_msrs __user *user_msrs,
1570                   int (*do_msr)(struct kvm_vcpu *vcpu,
1571                                 unsigned index, u64 *data),
1572                   int writeback)
1573 {
1574         struct kvm_msrs msrs;
1575         struct kvm_msr_entry *entries;
1576         int r, n;
1577         unsigned size;
1578
1579         r = -EFAULT;
1580         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
1581                 goto out;
1582
1583         r = -E2BIG;
1584         if (msrs.nmsrs >= MAX_IO_MSRS)
1585                 goto out;
1586
1587         r = -ENOMEM;
1588         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
1589         entries = vmalloc(size);
1590         if (!entries)
1591                 goto out;
1592
1593         r = -EFAULT;
1594         if (copy_from_user(entries, user_msrs->entries, size))
1595                 goto out_free;
1596
1597         r = n = __msr_io(kvm, &msrs, entries, do_msr);
1598         if (r < 0)
1599                 goto out_free;
1600
1601         r = -EFAULT;
1602         if (writeback && copy_to_user(user_msrs->entries, entries, size))
1603                 goto out_free;
1604
1605         r = n;
1606
1607 out_free:
1608         vfree(entries);
1609 out:
1610         return r;
1611 }
1612
1613 /*
1614  * Translate a guest virtual address to a guest physical address.
1615  */
1616 static int kvm_dev_ioctl_translate(struct kvm *kvm, struct kvm_translation *tr)
1617 {
1618         unsigned long vaddr = tr->linear_address;
1619         struct kvm_vcpu *vcpu;
1620         gpa_t gpa;
1621
1622         vcpu = vcpu_load(kvm, tr->vcpu);
1623         if (!vcpu)
1624                 return -ENOENT;
1625         spin_lock(&kvm->lock);
1626         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
1627         tr->physical_address = gpa;
1628         tr->valid = gpa != UNMAPPED_GVA;
1629         tr->writeable = 1;
1630         tr->usermode = 0;
1631         spin_unlock(&kvm->lock);
1632         vcpu_put(vcpu);
1633
1634         return 0;
1635 }
1636
1637 static int kvm_dev_ioctl_interrupt(struct kvm *kvm, struct kvm_interrupt *irq)
1638 {
1639         struct kvm_vcpu *vcpu;
1640
1641         if (!valid_vcpu(irq->vcpu))
1642                 return -EINVAL;
1643         if (irq->irq < 0 || irq->irq >= 256)
1644                 return -EINVAL;
1645         vcpu = vcpu_load(kvm, irq->vcpu);
1646         if (!vcpu)
1647                 return -ENOENT;
1648
1649         set_bit(irq->irq, vcpu->irq_pending);
1650         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
1651
1652         vcpu_put(vcpu);
1653
1654         return 0;
1655 }
1656
1657 static int kvm_dev_ioctl_debug_guest(struct kvm *kvm,
1658                                      struct kvm_debug_guest *dbg)
1659 {
1660         struct kvm_vcpu *vcpu;
1661         int r;
1662
1663         if (!valid_vcpu(dbg->vcpu))
1664                 return -EINVAL;
1665         vcpu = vcpu_load(kvm, dbg->vcpu);
1666         if (!vcpu)
1667                 return -ENOENT;
1668
1669         r = kvm_arch_ops->set_guest_debug(vcpu, dbg);
1670
1671         vcpu_put(vcpu);
1672
1673         return r;
1674 }
1675
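/*
 * A minimal sketch of the userspace side of this interface, assuming the
 * device node is /dev/kvm (the miscdevice registration is outside this
 * excerpt); each open() yields one VM, and only kvm_run fields that
 * appear in this file are used.  Error handling is elided:
 *
 *	int fd = open("/dev/kvm", O_RDWR);
 *	struct kvm_run run = { .vcpu = 0 };
 *
 *	ioctl(fd, KVM_CREATE_VCPU, 0);
 *	for (;;) {
 *		ioctl(fd, KVM_RUN, &run);
 *		(inspect the exit, emulate any MMIO or I/O access, and set
 *		 run.mmio_completed / run.emulated before re-entering)
 *	}
 */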
1676 static long kvm_dev_ioctl(struct file *filp,
1677                           unsigned int ioctl, unsigned long arg)
1678 {
1679         struct kvm *kvm = filp->private_data;
1680         int r = -EINVAL;
1681
1682         switch (ioctl) {
1683         case KVM_GET_API_VERSION:
1684                 r = KVM_API_VERSION;
1685                 break;
1686         case KVM_CREATE_VCPU: {
1687                 r = kvm_dev_ioctl_create_vcpu(kvm, arg);
1688                 if (r)
1689                         goto out;
1690                 break;
1691         }
1692         case KVM_RUN: {
1693                 struct kvm_run kvm_run;
1694
1695                 r = -EFAULT;
1696                 if (copy_from_user(&kvm_run, (void *)arg, sizeof kvm_run))
1697                         goto out;
1698                 r = kvm_dev_ioctl_run(kvm, &kvm_run);
1699                 if (r < 0 &&  r != -EINTR)
1700                         goto out;
1701                 if (copy_to_user((void *)arg, &kvm_run, sizeof kvm_run)) {
1702                         r = -EFAULT;
1703                         goto out;
1704                 }
1705                 break;
1706         }
1707         case KVM_GET_REGS: {
1708                 struct kvm_regs kvm_regs;
1709
1710                 r = -EFAULT;
1711                 if (copy_from_user(&kvm_regs, (void *)arg, sizeof kvm_regs))
1712                         goto out;
1713                 r = kvm_dev_ioctl_get_regs(kvm, &kvm_regs);
1714                 if (r)
1715                         goto out;
1716                 r = -EFAULT;
1717                 if (copy_to_user((void *)arg, &kvm_regs, sizeof kvm_regs))
1718                         goto out;
1719                 r = 0;
1720                 break;
1721         }
1722         case KVM_SET_REGS: {
1723                 struct kvm_regs kvm_regs;
1724
1725                 r = -EFAULT;
1726                 if (copy_from_user(&kvm_regs, (void *)arg, sizeof kvm_regs))
1727                         goto out;
1728                 r = kvm_dev_ioctl_set_regs(kvm, &kvm_regs);
1729                 if (r)
1730                         goto out;
1731                 r = 0;
1732                 break;
1733         }
1734         case KVM_GET_SREGS: {
1735                 struct kvm_sregs kvm_sregs;
1736
1737                 r = -EFAULT;
1738                 if (copy_from_user(&kvm_sregs, (void *)arg, sizeof kvm_sregs))
1739                         goto out;
1740                 r = kvm_dev_ioctl_get_sregs(kvm, &kvm_sregs);
1741                 if (r)
1742                         goto out;
1743                 r = -EFAULT;
1744                 if (copy_to_user((void *)arg, &kvm_sregs, sizeof kvm_sregs))
1745                         goto out;
1746                 r = 0;
1747                 break;
1748         }
1749         case KVM_SET_SREGS: {
1750                 struct kvm_sregs kvm_sregs;
1751
1752                 r = -EFAULT;
1753                 if (copy_from_user(&kvm_sregs, (void *)arg, sizeof kvm_sregs))
1754                         goto out;
1755                 r = kvm_dev_ioctl_set_sregs(kvm, &kvm_sregs);
1756                 if (r)
1757                         goto out;
1758                 r = 0;
1759                 break;
1760         }
1761         case KVM_TRANSLATE: {
1762                 struct kvm_translation tr;
1763
1764                 r = -EFAULT;
1765                 if (copy_from_user(&tr, (void *)arg, sizeof tr))
1766                         goto out;
1767                 r = kvm_dev_ioctl_translate(kvm, &tr);
1768                 if (r)
1769                         goto out;
1770                 r = -EFAULT;
1771                 if (copy_to_user((void *)arg, &tr, sizeof tr))
1772                         goto out;
1773                 r = 0;
1774                 break;
1775         }
1776         case KVM_INTERRUPT: {
1777                 struct kvm_interrupt irq;
1778
1779                 r = -EFAULT;
1780                 if (copy_from_user(&irq, (void *)arg, sizeof irq))
1781                         goto out;
1782                 r = kvm_dev_ioctl_interrupt(kvm, &irq);
1783                 if (r)
1784                         goto out;
1785                 r = 0;
1786                 break;
1787         }
1788         case KVM_DEBUG_GUEST: {
1789                 struct kvm_debug_guest dbg;
1790
1791                 r = -EFAULT;
1792                 if (copy_from_user(&dbg, (void *)arg, sizeof dbg))
1793                         goto out;
1794                 r = kvm_dev_ioctl_debug_guest(kvm, &dbg);
1795                 if (r)
1796                         goto out;
1797                 r = 0;
1798                 break;
1799         }
1800         case KVM_SET_MEMORY_REGION: {
1801                 struct kvm_memory_region kvm_mem;
1802
1803                 r = -EFAULT;
1804                 if (copy_from_user(&kvm_mem, (void *)arg, sizeof kvm_mem))
1805                         goto out;
1806                 r = kvm_dev_ioctl_set_memory_region(kvm, &kvm_mem);
1807                 if (r)
1808                         goto out;
1809                 break;
1810         }
1811         case KVM_GET_DIRTY_LOG: {
1812                 struct kvm_dirty_log log;
1813
1814                 r = -EFAULT;
1815                 if (copy_from_user(&log, (void *)arg, sizeof log))
1816                         goto out;
1817                 r = kvm_dev_ioctl_get_dirty_log(kvm, &log);
1818                 if (r)
1819                         goto out;
1820                 break;
1821         }
1822         case KVM_GET_MSRS:
1823                 r = msr_io(kvm, (void __user *)arg, get_msr, 1);
1824                 break;
1825         case KVM_SET_MSRS:
1826                 r = msr_io(kvm, (void __user *)arg, do_set_msr, 0);
1827                 break;
1828         case KVM_GET_MSR_INDEX_LIST: {
1829                 struct kvm_msr_list __user *user_msr_list = (void __user *)arg;
1830                 struct kvm_msr_list msr_list;
1831                 unsigned n;
1832
1833                 r = -EFAULT;
1834                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
1835                         goto out;
1836                 n = msr_list.nmsrs;
1837                 msr_list.nmsrs = num_msrs_to_save;
1838                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1839                         goto out;
1840                 r = -E2BIG;
1841                 if (n < num_msrs_to_save)
1842                         goto out;
1843                 r = -EFAULT;
1844                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1845                                  num_msrs_to_save * sizeof(u32)))
1846                         goto out;
1847                 r = 0;
1848         }
1849         default:
1850                 ;
1851         }
1852 out:
1853         return r;
1854 }
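
/*
 * Editorial note: in this version of the API a single open() of /dev/kvm
 * represents one virtual machine (the struct kvm in filp->private_data),
 * and every operation above is an ioctl on that one file descriptor.
 * A minimal user-space driver loop looks roughly like this (sketch only;
 * error handling and register setup omitted, structure layouts as in this
 * era's <linux/kvm.h>):
 *
 *	int fd = open("/dev/kvm", O_RDWR);
 *
 *	if (ioctl(fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
 *		return -1;			// incompatible module
 *
 *	struct kvm_memory_region mem = {
 *		.slot            = 0,
 *		.guest_phys_addr = 0,
 *		.memory_size     = 16 << 20,	// 16 MiB of guest RAM
 *	};
 *	ioctl(fd, KVM_SET_MEMORY_REGION, &mem);
 *
 *	ioctl(fd, KVM_CREATE_VCPU, 0);		// vcpu slot 0
 *
 *	struct kvm_run run = { .vcpu = 0 };
 *	for (;;) {
 *		ioctl(fd, KVM_RUN, &run);
 *		// dispatch on run.exit_reason (KVM_EXIT_IO, KVM_EXIT_MMIO, ...)
 *	}
 */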
1855
1856 static struct page *kvm_dev_nopage(struct vm_area_struct *vma,
1857                                    unsigned long address,
1858                                    int *type)
1859 {
1860         struct kvm *kvm = vma->vm_file->private_data;
1861         unsigned long pgoff;
1862         struct kvm_memory_slot *slot;
1863         struct page *page;
1864
1865         *type = VM_FAULT_MINOR;
1866         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1867         slot = gfn_to_memslot(kvm, pgoff);
1868         if (!slot)
1869                 return NOPAGE_SIGBUS;
1870         page = gfn_to_page(slot, pgoff);
1871         if (!page)
1872                 return NOPAGE_SIGBUS;
1873         get_page(page);
1874         return page;
1875 }
1876
1877 static struct vm_operations_struct kvm_dev_vm_ops = {
1878         .nopage = kvm_dev_nopage,
1879 };
1880
1881 static int kvm_dev_mmap(struct file *file, struct vm_area_struct *vma)
1882 {
1883         vma->vm_ops = &kvm_dev_vm_ops;
1884         return 0;
1885 }
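
/*
 * Editorial note: guest physical memory is exposed to user space by
 * mmap()ing the /dev/kvm file descriptor.  The nopage handler above turns
 * the file offset into a guest frame number, looks up the backing memory
 * slot and returns its page; offsets outside every slot fault with
 * SIGBUS.  Sketch (illustrative):
 *
 *	void *ram = mmap(NULL, 16 << 20, PROT_READ | PROT_WRITE,
 *			 MAP_SHARED, fd, 0);	// offset 0 = guest phys 0
 */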
1886
1887 static struct file_operations kvm_chardev_ops = {
1888         .open           = kvm_dev_open,
1889         .release        = kvm_dev_release,
1890         .unlocked_ioctl = kvm_dev_ioctl,
1891         .compat_ioctl   = kvm_dev_ioctl,
1892         .mmap           = kvm_dev_mmap,
1893 };
1894
1895 static struct miscdevice kvm_dev = {
1896         .minor = MISC_DYNAMIC_MINOR,
1897         .name  = "kvm",
1898         .fops  = &kvm_chardev_ops,
1899 };
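
/*
 * Editorial note: this registers /dev/kvm as a misc character device
 * (major 10) with a dynamically assigned minor; the device node is
 * normally created by udev once misc_register() succeeds in
 * kvm_init_arch() below.
 */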
1900
1901 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
1902                        void *v)
1903 {
1904         if (val == SYS_RESTART) {
1905                 /*
1906          * Some BIOSes hang on reboot if the CPU is still in
1907          * VMX root mode, so disable virtualization on all CPUs first.
1908                  */
1909                 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
1910                 on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
1911         }
1912         return NOTIFY_OK;
1913 }
1914
1915 static struct notifier_block kvm_reboot_notifier = {
1916         .notifier_call = kvm_reboot,
1917         .priority = 0,
1918 };
1919
1920 static __init void kvm_init_debug(void)
1921 {
1922         struct kvm_stats_debugfs_item *p;
1923
1924         debugfs_dir = debugfs_create_dir("kvm", 0);
1925         for (p = debugfs_entries; p->name; ++p)
1926                 p->dentry = debugfs_create_u32(p->name, 0444, debugfs_dir,
1927                                                p->data);
1928 }
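
/*
 * Editorial note: each counter in debugfs_entries[] becomes a read-only
 * u32 file in the "kvm" directory at the debugfs root, i.e.
 * /sys/kernel/debug/kvm/<name> with the usual mount point.
 */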
1929
1930 static void kvm_exit_debug(void)
1931 {
1932         struct kvm_stats_debugfs_item *p;
1933
1934         for (p = debugfs_entries; p->name; ++p)
1935                 debugfs_remove(p->dentry);
1936         debugfs_remove(debugfs_dir);
1937 }
1938
1939 hpa_t bad_page_address;
1940
1941 int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
1942 {
1943         int r;
1944
1945         if (kvm_arch_ops) {
1946                 printk(KERN_ERR "kvm: an arch module is already loaded\n");
1947                 return -EEXIST;
1948         }
1949
1950         if (!ops->cpu_has_kvm_support()) {
1951                 printk(KERN_ERR "kvm: no hardware support\n");
1952                 return -EOPNOTSUPP;
1953         }
1954         if (ops->disabled_by_bios()) {
1955                 printk(KERN_ERR "kvm: disabled by bios\n");
1956                 return -EOPNOTSUPP;
1957         }
1958
1959         kvm_arch_ops = ops;
1960
1961         r = kvm_arch_ops->hardware_setup();
1962         if (r < 0)
1963                 return r;
1964
1965         on_each_cpu(kvm_arch_ops->hardware_enable, 0, 0, 1);
1966         register_reboot_notifier(&kvm_reboot_notifier);
1967
1968         kvm_chardev_ops.owner = module;
1969
1970         r = misc_register(&kvm_dev);
1971         if (r) {
1972                 printk(KERN_ERR "kvm: misc device register failed\n");
1973                 goto out_free;
1974         }
1975
1976         return r;
1977
1978 out_free:
1979         unregister_reboot_notifier(&kvm_reboot_notifier);
1980         on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
1981         kvm_arch_ops->hardware_unsetup();
1982         return r;
1983 }
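
/*
 * Editorial note: kvm_init_arch() is what a hardware back end calls from
 * its own module_init() to plug its kvm_arch_ops into this core module.
 * Sketch of such a back end (names illustrative, not taken from vmx.c or
 * svm.c):
 *
 *	static struct kvm_arch_ops my_arch_ops = {
 *		.cpu_has_kvm_support = my_has_support,
 *		.disabled_by_bios    = my_disabled_by_bios,
 *		.hardware_setup      = my_hardware_setup,
 *		.hardware_enable     = my_hardware_enable,
 *		.hardware_disable    = my_hardware_disable,
 *		.hardware_unsetup    = my_hardware_unsetup,
 *		// ... remaining ops ...
 *	};
 *
 *	static int __init my_module_init(void)
 *	{
 *		return kvm_init_arch(&my_arch_ops, THIS_MODULE);
 *	}
 */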
1984
1985 void kvm_exit_arch(void)
1986 {
1987         misc_deregister(&kvm_dev);
1988
1989         unregister_reboot_notifier(&kvm_reboot_notifier);
1990         on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
1991         kvm_arch_ops->hardware_unsetup();
1992         kvm_arch_ops = NULL;
1993 }
1994
1995 static __init int kvm_init(void)
1996 {
1997         static struct page *bad_page;
1998         int r = 0;
1999
2000         kvm_init_debug();
2001
2002         kvm_init_msr_list();
2003
2004         if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
2005                 r = -ENOMEM;
2006                 goto out;
2007         }
2008
2009         bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
2010         memset(__va(bad_page_address), 0, PAGE_SIZE);
2011
2012         return r;
2013
2014 out:
2015         kvm_exit_debug();
2016         return r;
2017 }
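
/*
 * Editorial note: bad_page is a single zeroed page whose host physical
 * address is exported as bad_page_address; elsewhere in the driver (not
 * shown in this listing) it serves as a safe fallback mapping for guest
 * frames that have no real backing.  It is freed again in kvm_exit().
 */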
2018
2019 static __exit void kvm_exit(void)
2020 {
2021         kvm_exit_debug();
2022         __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
2023 }
2024
2025 module_init(kvm_init)
2026 module_exit(kvm_exit)
2027
2028 EXPORT_SYMBOL_GPL(kvm_init_arch);
2029 EXPORT_SYMBOL_GPL(kvm_exit_arch);