KVM: Emulate local APIC in kernel
[pandora-kernel.git] / drivers / kvm / kvm_main.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #include "kvm.h"
19 #include "x86_emulate.h"
20 #include "segment_descriptor.h"
21 #include "irq.h"
22
23 #include <linux/kvm.h>
24 #include <linux/module.h>
25 #include <linux/errno.h>
26 #include <linux/percpu.h>
27 #include <linux/gfp.h>
28 #include <linux/mm.h>
29 #include <linux/miscdevice.h>
30 #include <linux/vmalloc.h>
31 #include <linux/reboot.h>
32 #include <linux/debugfs.h>
33 #include <linux/highmem.h>
34 #include <linux/file.h>
35 #include <linux/sysdev.h>
36 #include <linux/cpu.h>
37 #include <linux/sched.h>
38 #include <linux/cpumask.h>
39 #include <linux/smp.h>
40 #include <linux/anon_inodes.h>
41
42 #include <asm/processor.h>
43 #include <asm/msr.h>
44 #include <asm/io.h>
45 #include <asm/uaccess.h>
46 #include <asm/desc.h>
47
48 MODULE_AUTHOR("Qumranet");
49 MODULE_LICENSE("GPL");
50
51 static DEFINE_SPINLOCK(kvm_lock);
52 static LIST_HEAD(vm_list);
53
54 static cpumask_t cpus_hardware_enabled;
55
56 struct kvm_arch_ops *kvm_arch_ops;
57 struct kmem_cache *kvm_vcpu_cache;
58 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
59
60 static __read_mostly struct preempt_ops kvm_preempt_ops;
61
62 #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
63
64 static struct kvm_stats_debugfs_item {
65         const char *name;
66         int offset;
67         struct dentry *dentry;
68 } debugfs_entries[] = {
69         { "pf_fixed", STAT_OFFSET(pf_fixed) },
70         { "pf_guest", STAT_OFFSET(pf_guest) },
71         { "tlb_flush", STAT_OFFSET(tlb_flush) },
72         { "invlpg", STAT_OFFSET(invlpg) },
73         { "exits", STAT_OFFSET(exits) },
74         { "io_exits", STAT_OFFSET(io_exits) },
75         { "mmio_exits", STAT_OFFSET(mmio_exits) },
76         { "signal_exits", STAT_OFFSET(signal_exits) },
77         { "irq_window", STAT_OFFSET(irq_window_exits) },
78         { "halt_exits", STAT_OFFSET(halt_exits) },
79         { "request_irq", STAT_OFFSET(request_irq_exits) },
80         { "irq_exits", STAT_OFFSET(irq_exits) },
81         { "light_exits", STAT_OFFSET(light_exits) },
82         { "efer_reload", STAT_OFFSET(efer_reload) },
83         { NULL }
84 };
85
86 static struct dentry *debugfs_dir;
87
88 #define MAX_IO_MSRS 256
89
90 #define CR0_RESERVED_BITS                                               \
91         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
92                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
93                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
94 #define CR4_RESERVED_BITS                                               \
95         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
96                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
97                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
98                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
99
100 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
101 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
102
103 #ifdef CONFIG_X86_64
104 /* LDT or TSS descriptor in the GDT. 16 bytes. */
105 struct segment_descriptor_64 {
106         struct segment_descriptor s;
107         u32 base_higher;
108         u32 pad_zero;
109 };
110
111 #endif
112
113 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
114                            unsigned long arg);
115
116 unsigned long segment_base(u16 selector)
117 {
118         struct descriptor_table gdt;
119         struct segment_descriptor *d;
120         unsigned long table_base;
121         typedef unsigned long ul;
122         unsigned long v;
123
124         if (selector == 0)
125                 return 0;
126
127         asm ("sgdt %0" : "=m"(gdt));
128         table_base = gdt.base;
129
130         if (selector & 4) {           /* from ldt */
131                 u16 ldt_selector;
132
133                 asm ("sldt %0" : "=g"(ldt_selector));
134                 table_base = segment_base(ldt_selector);
135         }
136         d = (struct segment_descriptor *)(table_base + (selector & ~7));
137         v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
138 #ifdef CONFIG_X86_64
139         if (d->system == 0
140             && (d->type == 2 || d->type == 9 || d->type == 11))
141                 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
142 #endif
143         return v;
144 }
145 EXPORT_SYMBOL_GPL(segment_base);
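/*
 * Illustrative layout of the base address assembled above (standard x86
 * descriptor format; field names refer to struct segment_descriptor):
 *
 *   base_low    - bits  0..15 of the segment base
 *   base_mid    - bits 16..23
 *   base_high   - bits 24..31
 *   base_higher - bits 32..63 (16-byte system descriptors such as TSS/LDT,
 *                 64-bit only)
 */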
146
147 static inline int valid_vcpu(int n)
148 {
149         return likely(n >= 0 && n < KVM_MAX_VCPUS);
150 }
151
152 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
153 {
154         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
155                 return;
156
157         vcpu->guest_fpu_loaded = 1;
158         fx_save(&vcpu->host_fx_image);
159         fx_restore(&vcpu->guest_fx_image);
160 }
161 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
162
163 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
164 {
165         if (!vcpu->guest_fpu_loaded)
166                 return;
167
168         vcpu->guest_fpu_loaded = 0;
169         fx_save(&vcpu->guest_fx_image);
170         fx_restore(&vcpu->host_fx_image);
171 }
172 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
173
174 /*
175  * Switches to the specified vcpu, until a matching vcpu_put()
176  */
177 static void vcpu_load(struct kvm_vcpu *vcpu)
178 {
179         int cpu;
180
181         mutex_lock(&vcpu->mutex);
182         cpu = get_cpu();
183         preempt_notifier_register(&vcpu->preempt_notifier);
184         kvm_arch_ops->vcpu_load(vcpu, cpu);
185         put_cpu();
186 }
187
188 static void vcpu_put(struct kvm_vcpu *vcpu)
189 {
190         preempt_disable();
191         kvm_arch_ops->vcpu_put(vcpu);
192         preempt_notifier_unregister(&vcpu->preempt_notifier);
193         preempt_enable();
194         mutex_unlock(&vcpu->mutex);
195 }
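/*
 * Typical usage of the pair above (illustrative sketch only; see
 * kvm_unload_vcpu_mmu() below for an in-tree user):
 *
 *	vcpu_load(vcpu);
 *	... touch guest or vcpu state ...
 *	vcpu_put(vcpu);
 */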
196
197 static void ack_flush(void *_completed)
198 {
199         atomic_t *completed = _completed;
200
201         atomic_inc(completed);
202 }
203
204 void kvm_flush_remote_tlbs(struct kvm *kvm)
205 {
206         int i, cpu, needed;
207         cpumask_t cpus;
208         struct kvm_vcpu *vcpu;
209         atomic_t completed;
210
211         atomic_set(&completed, 0);
212         cpus_clear(cpus);
213         needed = 0;
214         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
215                 vcpu = kvm->vcpus[i];
216                 if (!vcpu)
217                         continue;
218                 if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
219                         continue;
220                 cpu = vcpu->cpu;
221                 if (cpu != -1 && cpu != raw_smp_processor_id())
222                         if (!cpu_isset(cpu, cpus)) {
223                                 cpu_set(cpu, cpus);
224                                 ++needed;
225                         }
226         }
227
228         /*
229          * We really want smp_call_function_mask() here.  But that's not
230          * available, so IPI all cpus in parallel and wait for them
231          * to complete.
232          */
233         for (cpu = first_cpu(cpus); cpu != NR_CPUS; cpu = next_cpu(cpu, cpus))
234                 smp_call_function_single(cpu, ack_flush, &completed, 1, 0);
235         while (atomic_read(&completed) != needed) {
236                 cpu_relax();
237                 barrier();
238         }
239 }
240
241 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
242 {
243         struct page *page;
244         int r;
245
246         mutex_init(&vcpu->mutex);
247         vcpu->cpu = -1;
248         vcpu->mmu.root_hpa = INVALID_PAGE;
249         vcpu->kvm = kvm;
250         vcpu->vcpu_id = id;
251
252         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
253         if (!page) {
254                 r = -ENOMEM;
255                 goto fail;
256         }
257         vcpu->run = page_address(page);
258
259         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
260         if (!page) {
261                 r = -ENOMEM;
262                 goto fail_free_run;
263         }
264         vcpu->pio_data = page_address(page);
265
266         r = kvm_mmu_create(vcpu);
267         if (r < 0)
268                 goto fail_free_pio_data;
269
270         return 0;
271
272 fail_free_pio_data:
273         free_page((unsigned long)vcpu->pio_data);
274 fail_free_run:
275         free_page((unsigned long)vcpu->run);
276 fail:
277         return -ENOMEM;
278 }
279 EXPORT_SYMBOL_GPL(kvm_vcpu_init);
280
281 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
282 {
283         kvm_mmu_destroy(vcpu);
284         kvm_free_apic(vcpu->apic);
285         free_page((unsigned long)vcpu->pio_data);
286         free_page((unsigned long)vcpu->run);
287 }
288 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
289
290 static struct kvm *kvm_create_vm(void)
291 {
292         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
293
294         if (!kvm)
295                 return ERR_PTR(-ENOMEM);
296
297         kvm_io_bus_init(&kvm->pio_bus);
298         mutex_init(&kvm->lock);
299         INIT_LIST_HEAD(&kvm->active_mmu_pages);
300         kvm_io_bus_init(&kvm->mmio_bus);
301         spin_lock(&kvm_lock);
302         list_add(&kvm->vm_list, &vm_list);
303         spin_unlock(&kvm_lock);
304         return kvm;
305 }
306
307 /*
308  * Free any memory in @free but not in @dont.
309  */
310 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
311                                   struct kvm_memory_slot *dont)
312 {
313         int i;
314
315         if (!dont || free->phys_mem != dont->phys_mem)
316                 if (free->phys_mem) {
317                         for (i = 0; i < free->npages; ++i)
318                                 if (free->phys_mem[i])
319                                         __free_page(free->phys_mem[i]);
320                         vfree(free->phys_mem);
321                 }
322
323         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
324                 vfree(free->dirty_bitmap);
325
326         free->phys_mem = NULL;
327         free->npages = 0;
328         free->dirty_bitmap = NULL;
329 }
330
331 static void kvm_free_physmem(struct kvm *kvm)
332 {
333         int i;
334
335         for (i = 0; i < kvm->nmemslots; ++i)
336                 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
337 }
338
339 static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
340 {
341         int i;
342
343         for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
344                 if (vcpu->pio.guest_pages[i]) {
345                         __free_page(vcpu->pio.guest_pages[i]);
346                         vcpu->pio.guest_pages[i] = NULL;
347                 }
348 }
349
350 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
351 {
352         vcpu_load(vcpu);
353         kvm_mmu_unload(vcpu);
354         vcpu_put(vcpu);
355 }
356
357 static void kvm_free_vcpus(struct kvm *kvm)
358 {
359         unsigned int i;
360
361         /*
362          * Unpin any mmu pages first.
363          */
364         for (i = 0; i < KVM_MAX_VCPUS; ++i)
365                 if (kvm->vcpus[i])
366                         kvm_unload_vcpu_mmu(kvm->vcpus[i]);
367         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
368                 if (kvm->vcpus[i]) {
369                         kvm_arch_ops->vcpu_free(kvm->vcpus[i]);
370                         kvm->vcpus[i] = NULL;
371                 }
372         }
373
374 }
375
376 static void kvm_destroy_vm(struct kvm *kvm)
377 {
378         spin_lock(&kvm_lock);
379         list_del(&kvm->vm_list);
380         spin_unlock(&kvm_lock);
381         kvm_io_bus_destroy(&kvm->pio_bus);
382         kvm_io_bus_destroy(&kvm->mmio_bus);
383         kfree(kvm->vpic);
384         kvm_free_vcpus(kvm);
385         kvm_free_physmem(kvm);
386         kfree(kvm);
387 }
388
389 static int kvm_vm_release(struct inode *inode, struct file *filp)
390 {
391         struct kvm *kvm = filp->private_data;
392
393         kvm_destroy_vm(kvm);
394         return 0;
395 }
396
397 static void inject_gp(struct kvm_vcpu *vcpu)
398 {
399         kvm_arch_ops->inject_gp(vcpu, 0);
400 }
401
402 /*
403  * Load the pae pdptrs.  Return true if they are all valid.
404  */
405 static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
406 {
407         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
408         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
409         int i;
410         u64 *pdpt;
411         int ret;
412         struct page *page;
413         u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
414
415         mutex_lock(&vcpu->kvm->lock);
416         page = gfn_to_page(vcpu->kvm, pdpt_gfn);
417         if (!page) {
418                 ret = 0;
419                 goto out;
420         }
421
422         pdpt = kmap_atomic(page, KM_USER0);
423         memcpy(pdpte, pdpt+offset, sizeof(pdpte));
424         kunmap_atomic(pdpt, KM_USER0);
425
426         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
427                 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
428                         ret = 0;
429                         goto out;
430                 }
431         }
432         ret = 1;
433
434         memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
435 out:
436         mutex_unlock(&vcpu->kvm->lock);
437
438         return ret;
439 }
440
441 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
442 {
443         if (cr0 & CR0_RESERVED_BITS) {
444                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
445                        cr0, vcpu->cr0);
446                 inject_gp(vcpu);
447                 return;
448         }
449
450         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
451                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
452                 inject_gp(vcpu);
453                 return;
454         }
455
456         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
457                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
458                        "and a clear PE flag\n");
459                 inject_gp(vcpu);
460                 return;
461         }
462
463         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
464 #ifdef CONFIG_X86_64
465                 if ((vcpu->shadow_efer & EFER_LME)) {
466                         int cs_db, cs_l;
467
468                         if (!is_pae(vcpu)) {
469                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
470                                        "in long mode while PAE is disabled\n");
471                                 inject_gp(vcpu);
472                                 return;
473                         }
474                         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
475                         if (cs_l) {
476                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
477                                        "in long mode while CS.L == 1\n");
478                                 inject_gp(vcpu);
479                                 return;
480
481                         }
482                 } else
483 #endif
484                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
485                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
486                                "reserved bits\n");
487                         inject_gp(vcpu);
488                         return;
489                 }
490
491         }
492
493         kvm_arch_ops->set_cr0(vcpu, cr0);
494         vcpu->cr0 = cr0;
495
496         mutex_lock(&vcpu->kvm->lock);
497         kvm_mmu_reset_context(vcpu);
498         mutex_unlock(&vcpu->kvm->lock);
499         return;
500 }
501 EXPORT_SYMBOL_GPL(set_cr0);
502
503 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
504 {
505         set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
506 }
507 EXPORT_SYMBOL_GPL(lmsw);
508
509 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
510 {
511         if (cr4 & CR4_RESERVED_BITS) {
512                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
513                 inject_gp(vcpu);
514                 return;
515         }
516
517         if (is_long_mode(vcpu)) {
518                 if (!(cr4 & X86_CR4_PAE)) {
519                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
520                                "in long mode\n");
521                         inject_gp(vcpu);
522                         return;
523                 }
524         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
525                    && !load_pdptrs(vcpu, vcpu->cr3)) {
526                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
527                 inject_gp(vcpu);
528                 return;
529         }
530
531         if (cr4 & X86_CR4_VMXE) {
532                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
533                 inject_gp(vcpu);
534                 return;
535         }
536         kvm_arch_ops->set_cr4(vcpu, cr4);
537         mutex_lock(&vcpu->kvm->lock);
538         kvm_mmu_reset_context(vcpu);
539         mutex_unlock(&vcpu->kvm->lock);
540 }
541 EXPORT_SYMBOL_GPL(set_cr4);
542
543 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
544 {
545         if (is_long_mode(vcpu)) {
546                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
547                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
548                         inject_gp(vcpu);
549                         return;
550                 }
551         } else {
552                 if (is_pae(vcpu)) {
553                         if (cr3 & CR3_PAE_RESERVED_BITS) {
554                                 printk(KERN_DEBUG
555                                        "set_cr3: #GP, reserved bits\n");
556                                 inject_gp(vcpu);
557                                 return;
558                         }
559                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
560                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
561                                        "reserved bits\n");
562                                 inject_gp(vcpu);
563                                 return;
564                         }
565                 } else {
566                         if (cr3 & CR3_NONPAE_RESERVED_BITS) {
567                                 printk(KERN_DEBUG
568                                        "set_cr3: #GP, reserved bits\n");
569                                 inject_gp(vcpu);
570                                 return;
571                         }
572                 }
573         }
574
575         mutex_lock(&vcpu->kvm->lock);
576         /*
577          * Does the new cr3 value map to physical memory? (Note, we
578          * catch an invalid cr3 even in real-mode, because it would
579          * cause trouble later on when we turn on paging anyway.)
580          *
581          * A real CPU would silently accept an invalid cr3 and would
582          * attempt to use it - with largely undefined (and often hard
583          * to debug) behavior on the guest side.
584          */
585         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
586                 inject_gp(vcpu);
587         else {
588                 vcpu->cr3 = cr3;
589                 vcpu->mmu.new_cr3(vcpu);
590         }
591         mutex_unlock(&vcpu->kvm->lock);
592 }
593 EXPORT_SYMBOL_GPL(set_cr3);
594
595 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
596 {
597         if (cr8 & CR8_RESERVED_BITS) {
598                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
599                 inject_gp(vcpu);
600                 return;
601         }
602         if (irqchip_in_kernel(vcpu->kvm))
603                 kvm_lapic_set_tpr(vcpu, cr8);
604         else
605                 vcpu->cr8 = cr8;
606 }
607 EXPORT_SYMBOL_GPL(set_cr8);
608
609 unsigned long get_cr8(struct kvm_vcpu *vcpu)
610 {
611         if (irqchip_in_kernel(vcpu->kvm))
612                 return kvm_lapic_get_cr8(vcpu);
613         else
614                 return vcpu->cr8;
615 }
616 EXPORT_SYMBOL_GPL(get_cr8);
617
618 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
619 {
620         if (irqchip_in_kernel(vcpu->kvm))
621                 return vcpu->apic_base;
622         else
623                 return vcpu->apic_base;
624 }
625 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
626
627 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
628 {
629         /* TODO: reserve bits check */
630         if (irqchip_in_kernel(vcpu->kvm))
631                 kvm_lapic_set_base(vcpu, data);
632         else
633                 vcpu->apic_base = data;
634 }
635 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
636
637 void fx_init(struct kvm_vcpu *vcpu)
638 {
639         unsigned after_mxcsr_mask;
640
641         /* Initialize guest FPU by resetting ours and saving into guest's */
642         preempt_disable();
643         fx_save(&vcpu->host_fx_image);
644         fpu_init();
645         fx_save(&vcpu->guest_fx_image);
646         fx_restore(&vcpu->host_fx_image);
647         preempt_enable();
648
649         after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
650         vcpu->guest_fx_image.mxcsr = 0x1f80;
651         memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask,
652                0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
653 }
654 EXPORT_SYMBOL_GPL(fx_init);
655
656 /*
657  * Allocate some memory and give it an address in the guest physical address
658  * space.
659  *
660  * Discontiguous memory is allowed, mostly for framebuffers.
661  */
662 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
663                                           struct kvm_memory_region *mem)
664 {
665         int r;
666         gfn_t base_gfn;
667         unsigned long npages;
668         unsigned long i;
669         struct kvm_memory_slot *memslot;
670         struct kvm_memory_slot old, new;
671         int memory_config_version;
672
673         r = -EINVAL;
674         /* General sanity checks */
675         if (mem->memory_size & (PAGE_SIZE - 1))
676                 goto out;
677         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
678                 goto out;
679         if (mem->slot >= KVM_MEMORY_SLOTS)
680                 goto out;
681         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
682                 goto out;
683
684         memslot = &kvm->memslots[mem->slot];
685         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
686         npages = mem->memory_size >> PAGE_SHIFT;
687
688         if (!npages)
689                 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
690
691 raced:
692         mutex_lock(&kvm->lock);
693
694         memory_config_version = kvm->memory_config_version;
695         new = old = *memslot;
696
697         new.base_gfn = base_gfn;
698         new.npages = npages;
699         new.flags = mem->flags;
700
701         /* Disallow changing a memory slot's size. */
702         r = -EINVAL;
703         if (npages && old.npages && npages != old.npages)
704                 goto out_unlock;
705
706         /* Check for overlaps */
707         r = -EEXIST;
708         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
709                 struct kvm_memory_slot *s = &kvm->memslots[i];
710
711                 if (s == memslot)
712                         continue;
713                 if (!((base_gfn + npages <= s->base_gfn) ||
714                       (base_gfn >= s->base_gfn + s->npages)))
715                         goto out_unlock;
716         }
717         /*
718          * Do memory allocations outside the lock.  memory_config_version will
719          * detect any races.
720          */
721         mutex_unlock(&kvm->lock);
722
723         /* Deallocate if slot is being removed */
724         if (!npages)
725                 new.phys_mem = NULL;
726
727         /* Free page dirty bitmap if unneeded */
728         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
729                 new.dirty_bitmap = NULL;
730
731         r = -ENOMEM;
732
733         /* Allocate if a slot is being created */
734         if (npages && !new.phys_mem) {
735                 new.phys_mem = vmalloc(npages * sizeof(struct page *));
736
737                 if (!new.phys_mem)
738                         goto out_free;
739
740                 memset(new.phys_mem, 0, npages * sizeof(struct page *));
741                 for (i = 0; i < npages; ++i) {
742                         new.phys_mem[i] = alloc_page(GFP_HIGHUSER
743                                                      | __GFP_ZERO);
744                         if (!new.phys_mem[i])
745                                 goto out_free;
746                         set_page_private(new.phys_mem[i], 0);
747                 }
748         }
749
750         /* Allocate page dirty bitmap if needed */
751         if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
752                 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
753
754                 new.dirty_bitmap = vmalloc(dirty_bytes);
755                 if (!new.dirty_bitmap)
756                         goto out_free;
757                 memset(new.dirty_bitmap, 0, dirty_bytes);
758         }
759
760         mutex_lock(&kvm->lock);
761
762         if (memory_config_version != kvm->memory_config_version) {
763                 mutex_unlock(&kvm->lock);
764                 kvm_free_physmem_slot(&new, &old);
765                 goto raced;
766         }
767
768         r = -EAGAIN;
769         if (kvm->busy)
770                 goto out_unlock;
771
772         if (mem->slot >= kvm->nmemslots)
773                 kvm->nmemslots = mem->slot + 1;
774
775         *memslot = new;
776         ++kvm->memory_config_version;
777
778         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
779         kvm_flush_remote_tlbs(kvm);
780
781         mutex_unlock(&kvm->lock);
782
783         kvm_free_physmem_slot(&old, &new);
784         return 0;
785
786 out_unlock:
787         mutex_unlock(&kvm->lock);
788 out_free:
789         kvm_free_physmem_slot(&new, &old);
790 out:
791         return r;
792 }
793
794 /*
795  * Get (and clear) the dirty memory log for a memory slot.
796  */
797 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
798                                       struct kvm_dirty_log *log)
799 {
800         struct kvm_memory_slot *memslot;
801         int r, i;
802         int n;
803         unsigned long any = 0;
804
805         mutex_lock(&kvm->lock);
806
807         /*
808          * Prevent changes to guest memory configuration even while the lock
809          * is not taken.
810          */
811         ++kvm->busy;
812         mutex_unlock(&kvm->lock);
813         r = -EINVAL;
814         if (log->slot >= KVM_MEMORY_SLOTS)
815                 goto out;
816
817         memslot = &kvm->memslots[log->slot];
818         r = -ENOENT;
819         if (!memslot->dirty_bitmap)
820                 goto out;
821
822         n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
823
824         for (i = 0; !any && i < n/sizeof(long); ++i)
825                 any = memslot->dirty_bitmap[i];
826
827         r = -EFAULT;
828         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
829                 goto out;
830
831         /* If nothing is dirty, don't bother messing with page tables. */
832         if (any) {
833                 mutex_lock(&kvm->lock);
834                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
835                 kvm_flush_remote_tlbs(kvm);
836                 memset(memslot->dirty_bitmap, 0, n);
837                 mutex_unlock(&kvm->lock);
838         }
839
840         r = 0;
841
842 out:
843         mutex_lock(&kvm->lock);
844         --kvm->busy;
845         mutex_unlock(&kvm->lock);
846         return r;
847 }
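/*
 * Illustrative use of the dirty log (not something this code enforces): a
 * userspace display front end would typically set KVM_MEM_LOG_DIRTY_PAGES on
 * the framebuffer slot and issue KVM_GET_DIRTY_LOG once per refresh,
 * repainting only the pages whose bits come back set.
 */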
848
849 /*
850  * Set a new alias region.  Aliases map a portion of physical memory into
851  * another portion.  This is useful for memory windows, for example the PC
852  * VGA region.
853  */
854 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
855                                          struct kvm_memory_alias *alias)
856 {
857         int r, n;
858         struct kvm_mem_alias *p;
859
860         r = -EINVAL;
861         /* General sanity checks */
862         if (alias->memory_size & (PAGE_SIZE - 1))
863                 goto out;
864         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
865                 goto out;
866         if (alias->slot >= KVM_ALIAS_SLOTS)
867                 goto out;
868         if (alias->guest_phys_addr + alias->memory_size
869             < alias->guest_phys_addr)
870                 goto out;
871         if (alias->target_phys_addr + alias->memory_size
872             < alias->target_phys_addr)
873                 goto out;
874
875         mutex_lock(&kvm->lock);
876
877         p = &kvm->aliases[alias->slot];
878         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
879         p->npages = alias->memory_size >> PAGE_SHIFT;
880         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
881
882         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
883                 if (kvm->aliases[n - 1].npages)
884                         break;
885         kvm->naliases = n;
886
887         kvm_mmu_zap_all(kvm);
888
889         mutex_unlock(&kvm->lock);
890
891         return 0;
892
893 out:
894         return r;
895 }
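/*
 * Example alias (illustrative values): a PC-style userspace could map the
 * legacy VGA window by setting guest_phys_addr = 0xa0000, memory_size =
 * 0x20000 and target_phys_addr pointing into its framebuffer slot, so that
 * guest accesses to 0xa0000..0xbffff hit the aliased framebuffer pages.
 */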
896
897 static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
898 {
899         int i;
900         struct kvm_mem_alias *alias;
901
902         for (i = 0; i < kvm->naliases; ++i) {
903                 alias = &kvm->aliases[i];
904                 if (gfn >= alias->base_gfn
905                     && gfn < alias->base_gfn + alias->npages)
906                         return alias->target_gfn + gfn - alias->base_gfn;
907         }
908         return gfn;
909 }
910
911 static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
912 {
913         int i;
914
915         for (i = 0; i < kvm->nmemslots; ++i) {
916                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
917
918                 if (gfn >= memslot->base_gfn
919                     && gfn < memslot->base_gfn + memslot->npages)
920                         return memslot;
921         }
922         return NULL;
923 }
924
925 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
926 {
927         gfn = unalias_gfn(kvm, gfn);
928         return __gfn_to_memslot(kvm, gfn);
929 }
930
931 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
932 {
933         struct kvm_memory_slot *slot;
934
935         gfn = unalias_gfn(kvm, gfn);
936         slot = __gfn_to_memslot(kvm, gfn);
937         if (!slot)
938                 return NULL;
939         return slot->phys_mem[gfn - slot->base_gfn];
940 }
941 EXPORT_SYMBOL_GPL(gfn_to_page);
942
943 /* WARNING: Does not work on aliased pages. */
944 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
945 {
946         struct kvm_memory_slot *memslot;
947
948         memslot = __gfn_to_memslot(kvm, gfn);
949         if (memslot && memslot->dirty_bitmap) {
950                 unsigned long rel_gfn = gfn - memslot->base_gfn;
951
952                 /* avoid RMW */
953                 if (!test_bit(rel_gfn, memslot->dirty_bitmap))
954                         set_bit(rel_gfn, memslot->dirty_bitmap);
955         }
956 }
957
958 int emulator_read_std(unsigned long addr,
959                              void *val,
960                              unsigned int bytes,
961                              struct kvm_vcpu *vcpu)
962 {
963         void *data = val;
964
965         while (bytes) {
966                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
967                 unsigned offset = addr & (PAGE_SIZE-1);
968                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
969                 unsigned long pfn;
970                 struct page *page;
971                 void *page_virt;
972
973                 if (gpa == UNMAPPED_GVA)
974                         return X86EMUL_PROPAGATE_FAULT;
975                 pfn = gpa >> PAGE_SHIFT;
976                 page = gfn_to_page(vcpu->kvm, pfn);
977                 if (!page)
978                         return X86EMUL_UNHANDLEABLE;
979                 page_virt = kmap_atomic(page, KM_USER0);
980
981                 memcpy(data, page_virt + offset, tocopy);
982
983                 kunmap_atomic(page_virt, KM_USER0);
984
985                 bytes -= tocopy;
986                 data += tocopy;
987                 addr += tocopy;
988         }
989
990         return X86EMUL_CONTINUE;
991 }
992 EXPORT_SYMBOL_GPL(emulator_read_std);
993
994 static int emulator_write_std(unsigned long addr,
995                               const void *val,
996                               unsigned int bytes,
997                               struct kvm_vcpu *vcpu)
998 {
999         pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes);
1000         return X86EMUL_UNHANDLEABLE;
1001 }
1002
1003 /*
1004  * Only the apic needs an MMIO device hook, so shortcut now.
1005  */
1006 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1007                                                 gpa_t addr)
1008 {
1009         struct kvm_io_device *dev;
1010
1011         if (vcpu->apic) {
1012                 dev = &vcpu->apic->dev;
1013                 if (dev->in_range(dev, addr))
1014                         return dev;
1015         }
1016         return NULL;
1017 }
1018
1019 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1020                                                 gpa_t addr)
1021 {
1022         struct kvm_io_device *dev;
1023
1024         dev = vcpu_find_pervcpu_dev(vcpu, addr);
1025         if (dev == NULL)
1026                 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1027         return dev;
1028 }
1029
1030 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
1031                                                gpa_t addr)
1032 {
1033         return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
1034 }
1035
1036 static int emulator_read_emulated(unsigned long addr,
1037                                   void *val,
1038                                   unsigned int bytes,
1039                                   struct kvm_vcpu *vcpu)
1040 {
1041         struct kvm_io_device *mmio_dev;
1042         gpa_t                 gpa;
1043
1044         if (vcpu->mmio_read_completed) {
1045                 memcpy(val, vcpu->mmio_data, bytes);
1046                 vcpu->mmio_read_completed = 0;
1047                 return X86EMUL_CONTINUE;
1048         } else if (emulator_read_std(addr, val, bytes, vcpu)
1049                    == X86EMUL_CONTINUE)
1050                 return X86EMUL_CONTINUE;
1051
1052         gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1053         if (gpa == UNMAPPED_GVA)
1054                 return X86EMUL_PROPAGATE_FAULT;
1055
1056         /*
1057          * Is this MMIO handled locally?
1058          */
1059         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1060         if (mmio_dev) {
1061                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1062                 return X86EMUL_CONTINUE;
1063         }
1064
1065         vcpu->mmio_needed = 1;
1066         vcpu->mmio_phys_addr = gpa;
1067         vcpu->mmio_size = bytes;
1068         vcpu->mmio_is_write = 0;
1069
1070         return X86EMUL_UNHANDLEABLE;
1071 }
1072
1073 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1074                                const void *val, int bytes)
1075 {
1076         struct page *page;
1077         void *virt;
1078
1079         if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
1080                 return 0;
1081         page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1082         if (!page)
1083                 return 0;
1084         mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
1085         virt = kmap_atomic(page, KM_USER0);
1086         kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1087         memcpy(virt + offset_in_page(gpa), val, bytes);
1088         kunmap_atomic(virt, KM_USER0);
1089         return 1;
1090 }
1091
1092 static int emulator_write_emulated_onepage(unsigned long addr,
1093                                            const void *val,
1094                                            unsigned int bytes,
1095                                            struct kvm_vcpu *vcpu)
1096 {
1097         struct kvm_io_device *mmio_dev;
1098         gpa_t                 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1099
1100         if (gpa == UNMAPPED_GVA) {
1101                 kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
1102                 return X86EMUL_PROPAGATE_FAULT;
1103         }
1104
1105         if (emulator_write_phys(vcpu, gpa, val, bytes))
1106                 return X86EMUL_CONTINUE;
1107
1108         /*
1109          * Is this MMIO handled locally?
1110          */
1111         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1112         if (mmio_dev) {
1113                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1114                 return X86EMUL_CONTINUE;
1115         }
1116
1117         vcpu->mmio_needed = 1;
1118         vcpu->mmio_phys_addr = gpa;
1119         vcpu->mmio_size = bytes;
1120         vcpu->mmio_is_write = 1;
1121         memcpy(vcpu->mmio_data, val, bytes);
1122
1123         return X86EMUL_CONTINUE;
1124 }
1125
1126 int emulator_write_emulated(unsigned long addr,
1127                                    const void *val,
1128                                    unsigned int bytes,
1129                                    struct kvm_vcpu *vcpu)
1130 {
1131         /* Crossing a page boundary? */
1132         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
1133                 int rc, now;
1134
1135                 now = -addr & ~PAGE_MASK;
1136                 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
1137                 if (rc != X86EMUL_CONTINUE)
1138                         return rc;
1139                 addr += now;
1140                 val += now;
1141                 bytes -= now;
1142         }
1143         return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
1144 }
1145 EXPORT_SYMBOL_GPL(emulator_write_emulated);
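/*
 * Worked example of the split above, assuming PAGE_SIZE == 4096: a write of
 * 8 bytes at addr 0x1ffd crosses a page boundary, so now = -0x1ffd & 0xfff
 * == 3; the first call writes 3 bytes covering 0x1ffd..0x1fff and the second
 * writes the remaining 5 bytes starting at 0x2000.
 */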
1146
1147 static int emulator_cmpxchg_emulated(unsigned long addr,
1148                                      const void *old,
1149                                      const void *new,
1150                                      unsigned int bytes,
1151                                      struct kvm_vcpu *vcpu)
1152 {
1153         static int reported;
1154
1155         if (!reported) {
1156                 reported = 1;
1157                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1158         }
1159         return emulator_write_emulated(addr, new, bytes, vcpu);
1160 }
1161
1162 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1163 {
1164         return kvm_arch_ops->get_segment_base(vcpu, seg);
1165 }
1166
1167 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1168 {
1169         return X86EMUL_CONTINUE;
1170 }
1171
1172 int emulate_clts(struct kvm_vcpu *vcpu)
1173 {
1174         unsigned long cr0;
1175
1176         cr0 = vcpu->cr0 & ~X86_CR0_TS;
1177         kvm_arch_ops->set_cr0(vcpu, cr0);
1178         return X86EMUL_CONTINUE;
1179 }
1180
1181 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1182 {
1183         struct kvm_vcpu *vcpu = ctxt->vcpu;
1184
1185         switch (dr) {
1186         case 0 ... 3:
1187                 *dest = kvm_arch_ops->get_dr(vcpu, dr);
1188                 return X86EMUL_CONTINUE;
1189         default:
1190                 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
1191                 return X86EMUL_UNHANDLEABLE;
1192         }
1193 }
1194
1195 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1196 {
1197         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1198         int exception;
1199
1200         kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1201         if (exception) {
1202                 /* FIXME: better handling */
1203                 return X86EMUL_UNHANDLEABLE;
1204         }
1205         return X86EMUL_CONTINUE;
1206 }
1207
1208 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
1209 {
1210         static int reported;
1211         u8 opcodes[4];
1212         unsigned long rip = ctxt->vcpu->rip;
1213         unsigned long rip_linear;
1214
1215         rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS);
1216
1217         if (reported)
1218                 return;
1219
1220         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt->vcpu);
1221
1222         printk(KERN_ERR "emulation failed but !mmio_needed?"
1223                " rip %lx %02x %02x %02x %02x\n",
1224                rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1225         reported = 1;
1226 }
1227
1228 struct x86_emulate_ops emulate_ops = {
1229         .read_std            = emulator_read_std,
1230         .write_std           = emulator_write_std,
1231         .read_emulated       = emulator_read_emulated,
1232         .write_emulated      = emulator_write_emulated,
1233         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1234 };
1235
1236 int emulate_instruction(struct kvm_vcpu *vcpu,
1237                         struct kvm_run *run,
1238                         unsigned long cr2,
1239                         u16 error_code)
1240 {
1241         struct x86_emulate_ctxt emulate_ctxt;
1242         int r;
1243         int cs_db, cs_l;
1244
1245         vcpu->mmio_fault_cr2 = cr2;
1246         kvm_arch_ops->cache_regs(vcpu);
1247
1248         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1249
1250         emulate_ctxt.vcpu = vcpu;
1251         emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu);
1252         emulate_ctxt.cr2 = cr2;
1253         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1254                 ? X86EMUL_MODE_REAL : cs_l
1255                 ? X86EMUL_MODE_PROT64 : cs_db
1256                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1257
1258         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1259                 emulate_ctxt.cs_base = 0;
1260                 emulate_ctxt.ds_base = 0;
1261                 emulate_ctxt.es_base = 0;
1262                 emulate_ctxt.ss_base = 0;
1263         } else {
1264                 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
1265                 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
1266                 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
1267                 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
1268         }
1269
1270         emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
1271         emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1272
1273         vcpu->mmio_is_write = 0;
1274         vcpu->pio.string = 0;
1275         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1276         if (vcpu->pio.string)
1277                 return EMULATE_DO_MMIO;
1278
1279         if ((r || vcpu->mmio_is_write) && run) {
1280                 run->exit_reason = KVM_EXIT_MMIO;
1281                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1282                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1283                 run->mmio.len = vcpu->mmio_size;
1284                 run->mmio.is_write = vcpu->mmio_is_write;
1285         }
1286
1287         if (r) {
1288                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1289                         return EMULATE_DONE;
1290                 if (!vcpu->mmio_needed) {
1291                         report_emulation_failure(&emulate_ctxt);
1292                         return EMULATE_FAIL;
1293                 }
1294                 return EMULATE_DO_MMIO;
1295         }
1296
1297         kvm_arch_ops->decache_regs(vcpu);
1298         kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1299
1300         if (vcpu->mmio_is_write) {
1301                 vcpu->mmio_needed = 0;
1302                 return EMULATE_DO_MMIO;
1303         }
1304
1305         return EMULATE_DONE;
1306 }
1307 EXPORT_SYMBOL_GPL(emulate_instruction);
1308
1309 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
1310 {
1311         if (vcpu->irq_summary ||
1312                 (irqchip_in_kernel(vcpu->kvm) && kvm_cpu_has_interrupt(vcpu)))
1313                 return 1;
1314
1315         vcpu->run->exit_reason = KVM_EXIT_HLT;
1316         ++vcpu->stat.halt_exits;
1317         return 0;
1318 }
1319 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
1320
1321 int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1322 {
1323         unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
1324
1325         kvm_arch_ops->cache_regs(vcpu);
1326         ret = -KVM_EINVAL;
1327 #ifdef CONFIG_X86_64
1328         if (is_long_mode(vcpu)) {
1329                 nr = vcpu->regs[VCPU_REGS_RAX];
1330                 a0 = vcpu->regs[VCPU_REGS_RDI];
1331                 a1 = vcpu->regs[VCPU_REGS_RSI];
1332                 a2 = vcpu->regs[VCPU_REGS_RDX];
1333                 a3 = vcpu->regs[VCPU_REGS_RCX];
1334                 a4 = vcpu->regs[VCPU_REGS_R8];
1335                 a5 = vcpu->regs[VCPU_REGS_R9];
1336         } else
1337 #endif
1338         {
1339                 nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
1340                 a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
1341                 a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
1342                 a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
1343                 a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
1344                 a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
1345                 a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
1346         }
1347         switch (nr) {
1348         default:
1349                 run->hypercall.nr = nr;
1350                 run->hypercall.args[0] = a0;
1351                 run->hypercall.args[1] = a1;
1352                 run->hypercall.args[2] = a2;
1353                 run->hypercall.args[3] = a3;
1354                 run->hypercall.args[4] = a4;
1355                 run->hypercall.args[5] = a5;
1356                 run->hypercall.ret = ret;
1357                 run->hypercall.longmode = is_long_mode(vcpu);
1358                 kvm_arch_ops->decache_regs(vcpu);
1359                 return 0;
1360         }
1361         vcpu->regs[VCPU_REGS_RAX] = ret;
1362         kvm_arch_ops->decache_regs(vcpu);
1363         return 1;
1364 }
1365 EXPORT_SYMBOL_GPL(kvm_hypercall);
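/*
 * Register convention implemented above: in long mode the hypercall number is
 * taken from RAX and the arguments from RDI, RSI, RDX, RCX, R8 and R9; in
 * 32-bit mode the number comes from RBX and the arguments from RAX, RCX, RDX,
 * RSI, RDI and RBP, each truncated to 32 bits.  The return value is placed
 * back in RAX.
 */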
1366
1367 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1368 {
1369         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1370 }
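/*
 * mk_cr_64() keeps the upper 32 bits of the current control register value
 * and substitutes the lower 32 bits, e.g. (illustrative values)
 * mk_cr_64(0x123400000033, 0x80000031) == 0x123480000031.
 */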
1371
1372 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1373 {
1374         struct descriptor_table dt = { limit, base };
1375
1376         kvm_arch_ops->set_gdt(vcpu, &dt);
1377 }
1378
1379 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1380 {
1381         struct descriptor_table dt = { limit, base };
1382
1383         kvm_arch_ops->set_idt(vcpu, &dt);
1384 }
1385
1386 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1387                    unsigned long *rflags)
1388 {
1389         lmsw(vcpu, msw);
1390         *rflags = kvm_arch_ops->get_rflags(vcpu);
1391 }
1392
1393 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1394 {
1395         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
1396         switch (cr) {
1397         case 0:
1398                 return vcpu->cr0;
1399         case 2:
1400                 return vcpu->cr2;
1401         case 3:
1402                 return vcpu->cr3;
1403         case 4:
1404                 return vcpu->cr4;
1405         default:
1406                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1407                 return 0;
1408         }
1409 }
1410
1411 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1412                      unsigned long *rflags)
1413 {
1414         switch (cr) {
1415         case 0:
1416                 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1417                 *rflags = kvm_arch_ops->get_rflags(vcpu);
1418                 break;
1419         case 2:
1420                 vcpu->cr2 = val;
1421                 break;
1422         case 3:
1423                 set_cr3(vcpu, val);
1424                 break;
1425         case 4:
1426                 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1427                 break;
1428         default:
1429                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1430         }
1431 }
1432
1433 /*
1434  * Register the para guest with the host:
1435  */
1436 static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
1437 {
1438         struct kvm_vcpu_para_state *para_state;
1439         hpa_t para_state_hpa, hypercall_hpa;
1440         struct page *para_state_page;
1441         unsigned char *hypercall;
1442         gpa_t hypercall_gpa;
1443
1444         printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
1445         printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
1446
1447         /*
1448          * Needs to be page aligned:
1449          */
1450         if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
1451                 goto err_gp;
1452
1453         para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
1454         printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
1455         if (is_error_hpa(para_state_hpa))
1456                 goto err_gp;
1457
1458         mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
1459         para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
1460         para_state = kmap(para_state_page);
1461
1462         printk(KERN_DEBUG "....  guest version: %d\n", para_state->guest_version);
1463         printk(KERN_DEBUG "....           size: %d\n", para_state->size);
1464
1465         para_state->host_version = KVM_PARA_API_VERSION;
1466         /*
1467          * We cannot support guests that try to register themselves
1468          * with a newer API version than the host supports:
1469          */
1470         if (para_state->guest_version > KVM_PARA_API_VERSION) {
1471                 para_state->ret = -KVM_EINVAL;
1472                 goto err_kunmap_skip;
1473         }
1474
1475         hypercall_gpa = para_state->hypercall_gpa;
1476         hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
1477         printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
1478         if (is_error_hpa(hypercall_hpa)) {
1479                 para_state->ret = -KVM_EINVAL;
1480                 goto err_kunmap_skip;
1481         }
1482
1483         printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
1484         vcpu->para_state_page = para_state_page;
1485         vcpu->para_state_gpa = para_state_gpa;
1486         vcpu->hypercall_gpa = hypercall_gpa;
1487
1488         mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
1489         hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
1490                                 KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
1491         kvm_arch_ops->patch_hypercall(vcpu, hypercall);
1492         kunmap_atomic(hypercall, KM_USER1);
1493
1494         para_state->ret = 0;
1495 err_kunmap_skip:
1496         kunmap(para_state_page);
1497         return 0;
1498 err_gp:
1499         return 1;
1500 }
1501
1502 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1503 {
1504         u64 data;
1505
1506         switch (msr) {
1507         case 0xc0010010: /* SYSCFG */
1508         case 0xc0010015: /* HWCR */
1509         case MSR_IA32_PLATFORM_ID:
1510         case MSR_IA32_P5_MC_ADDR:
1511         case MSR_IA32_P5_MC_TYPE:
1512         case MSR_IA32_MC0_CTL:
1513         case MSR_IA32_MCG_STATUS:
1514         case MSR_IA32_MCG_CAP:
1515         case MSR_IA32_MC0_MISC:
1516         case MSR_IA32_MC0_MISC+4:
1517         case MSR_IA32_MC0_MISC+8:
1518         case MSR_IA32_MC0_MISC+12:
1519         case MSR_IA32_MC0_MISC+16:
1520         case MSR_IA32_UCODE_REV:
1521         case MSR_IA32_PERF_STATUS:
1522         case MSR_IA32_EBL_CR_POWERON:
1523                 /* MTRR registers */
1524         case 0xfe:
1525         case 0x200 ... 0x2ff:
1526                 data = 0;
1527                 break;
1528         case 0xcd: /* fsb frequency */
1529                 data = 3;
1530                 break;
1531         case MSR_IA32_APICBASE:
1532                 data = kvm_get_apic_base(vcpu);
1533                 break;
1534         case MSR_IA32_MISC_ENABLE:
1535                 data = vcpu->ia32_misc_enable_msr;
1536                 break;
1537 #ifdef CONFIG_X86_64
1538         case MSR_EFER:
1539                 data = vcpu->shadow_efer;
1540                 break;
1541 #endif
1542         default:
1543                 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1544                 return 1;
1545         }
1546         *pdata = data;
1547         return 0;
1548 }
1549 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1550
1551 /*
1552  * Reads an msr value (of 'msr_index') into 'pdata'.
1553  * Returns 0 on success, non-0 otherwise.
1554  * Assumes vcpu_load() was already called.
1555  */
1556 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1557 {
1558         return kvm_arch_ops->get_msr(vcpu, msr_index, pdata);
1559 }
1560
1561 #ifdef CONFIG_X86_64
1562
1563 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1564 {
1565         if (efer & EFER_RESERVED_BITS) {
1566                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1567                        efer);
1568                 inject_gp(vcpu);
1569                 return;
1570         }
1571
1572         if (is_paging(vcpu)
1573             && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1574                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1575                 inject_gp(vcpu);
1576                 return;
1577         }
1578
1579         kvm_arch_ops->set_efer(vcpu, efer);
1580
1581         efer &= ~EFER_LMA;
1582         efer |= vcpu->shadow_efer & EFER_LMA;
1583
1584         vcpu->shadow_efer = efer;
1585 }
1586
1587 #endif
1588
1589 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1590 {
1591         switch (msr) {
1592 #ifdef CONFIG_X86_64
1593         case MSR_EFER:
1594                 set_efer(vcpu, data);
1595                 break;
1596 #endif
1597         case MSR_IA32_MC0_STATUS:
1598                 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1599                        __FUNCTION__, data);
1600                 break;
1601         case MSR_IA32_MCG_STATUS:
1602                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
1603                         __FUNCTION__, data);
1604                 break;
1605         case MSR_IA32_UCODE_REV:
1606         case MSR_IA32_UCODE_WRITE:
1607         case 0x200 ... 0x2ff: /* MTRRs */
1608                 break;
1609         case MSR_IA32_APICBASE:
1610                 kvm_set_apic_base(vcpu, data);
1611                 break;
1612         case MSR_IA32_MISC_ENABLE:
1613                 vcpu->ia32_misc_enable_msr = data;
1614                 break;
1615         /*
1616          * This is the 'probe whether the host is KVM' logic:
1617          */
1618         case MSR_KVM_API_MAGIC:
1619                 return vcpu_register_para(vcpu, data);
1620
1621         default:
1622                 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
1623                 return 1;
1624         }
1625         return 0;
1626 }
1627 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1628
1629 /*
1630  * Writes msr value into the appropriate "register".
1631  * Returns 0 on success, non-0 otherwise.
1632  * Assumes vcpu_load() was already called.
1633  */
1634 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1635 {
1636         return kvm_arch_ops->set_msr(vcpu, msr_index, data);
1637 }
1638
1639 void kvm_resched(struct kvm_vcpu *vcpu)
1640 {
1641         if (!need_resched())
1642                 return;
1643         cond_resched();
1644 }
1645 EXPORT_SYMBOL_GPL(kvm_resched);
1646
1647 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1648 {
1649         int i;
1650         u32 function;
1651         struct kvm_cpuid_entry *e, *best;
1652
1653         kvm_arch_ops->cache_regs(vcpu);
1654         function = vcpu->regs[VCPU_REGS_RAX];
1655         vcpu->regs[VCPU_REGS_RAX] = 0;
1656         vcpu->regs[VCPU_REGS_RBX] = 0;
1657         vcpu->regs[VCPU_REGS_RCX] = 0;
1658         vcpu->regs[VCPU_REGS_RDX] = 0;
1659         best = NULL;
1660         for (i = 0; i < vcpu->cpuid_nent; ++i) {
1661                 e = &vcpu->cpuid_entries[i];
1662                 if (e->function == function) {
1663                         best = e;
1664                         break;
1665                 }
1666                 /*
1667                  * Both basic or both extended?  Keep the highest one as a fallback.
1668                  */
1669                 if (((e->function ^ function) & 0x80000000) == 0)
1670                         if (!best || e->function > best->function)
1671                                 best = e;
1672         }
1673         if (best) {
1674                 vcpu->regs[VCPU_REGS_RAX] = best->eax;
1675                 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1676                 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1677                 vcpu->regs[VCPU_REGS_RDX] = best->edx;
1678         }
1679         kvm_arch_ops->decache_regs(vcpu);
1680         kvm_arch_ops->skip_emulated_instruction(vcpu);
1681 }
1682 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
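/*
 * Illustrative only: the table searched by kvm_emulate_cpuid() is supplied
 * by userspace through the KVM_SET_CPUID vcpu ioctl, roughly:
 *
 *        struct {
 *                struct kvm_cpuid cpuid;
 *                struct kvm_cpuid_entry entries[2];
 *        } c = { .cpuid = { .nent = 2 } };
 *
 *        ... fill c.entries[] from host CPUID output, masked as desired ...
 *        ioctl(vcpu_fd, KVM_SET_CPUID, &c);
 */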
1683
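/*
 * Copy the PIO payload between the shared pio_data page and the pinned
 * guest pages (direction depends on vcpu->pio.in), then drop the mapping
 * and the page references.
 */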
1684 static int pio_copy_data(struct kvm_vcpu *vcpu)
1685 {
1686         void *p = vcpu->pio_data;
1687         void *q;
1688         unsigned bytes;
1689         int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
1690
1691         q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1692                  PAGE_KERNEL);
1693         if (!q) {
1694                 free_pio_guest_pages(vcpu);
1695                 return -ENOMEM;
1696         }
1697         q += vcpu->pio.guest_page_offset;
1698         bytes = vcpu->pio.size * vcpu->pio.cur_count;
1699         if (vcpu->pio.in)
1700                 memcpy(q, p, bytes);
1701         else
1702                 memcpy(p, q, bytes);
1703         q -= vcpu->pio.guest_page_offset;
1704         vunmap(q);
1705         free_pio_guest_pages(vcpu);
1706         return 0;
1707 }
1708
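/*
 * Finish a (possibly partial) PIO request: copy 'in' data back to RAX or
 * guest memory, advance RSI/RDI and RCX for string operations, and skip
 * the emulated instruction once the whole count has been transferred.
 */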
1709 static int complete_pio(struct kvm_vcpu *vcpu)
1710 {
1711         struct kvm_pio_request *io = &vcpu->pio;
1712         long delta;
1713         int r;
1714
1715         kvm_arch_ops->cache_regs(vcpu);
1716
1717         if (!io->string) {
1718                 if (io->in)
1719                         memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
1720                                io->size);
1721         } else {
1722                 if (io->in) {
1723                         r = pio_copy_data(vcpu);
1724                         if (r) {
1725                                 kvm_arch_ops->cache_regs(vcpu);
1726                                 return r;
1727                         }
1728                 }
1729
1730                 delta = 1;
1731                 if (io->rep) {
1732                         delta *= io->cur_count;
1733                         /*
1734                          * The size of the register should really depend on
1735                          * current address size.
1736                          */
1737                         vcpu->regs[VCPU_REGS_RCX] -= delta;
1738                 }
1739                 if (io->down)
1740                         delta = -delta;
1741                 delta *= io->size;
1742                 if (io->in)
1743                         vcpu->regs[VCPU_REGS_RDI] += delta;
1744                 else
1745                         vcpu->regs[VCPU_REGS_RSI] += delta;
1746         }
1747
1748         kvm_arch_ops->decache_regs(vcpu);
1749
1750         io->count -= io->cur_count;
1751         io->cur_count = 0;
1752
1753         if (!io->count)
1754                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1755         return 0;
1756 }
1757
1758 static void kernel_pio(struct kvm_io_device *pio_dev,
1759                        struct kvm_vcpu *vcpu,
1760                        void *pd)
1761 {
1762         /* TODO: String I/O for in-kernel devices */
1763
1764         if (vcpu->pio.in)
1765                 kvm_iodevice_read(pio_dev, vcpu->pio.port,
1766                                   vcpu->pio.size,
1767                                   pd);
1768         else
1769                 kvm_iodevice_write(pio_dev, vcpu->pio.port,
1770                                    vcpu->pio.size,
1771                                    pd);
1772 }
1773
1774 static void pio_string_write(struct kvm_io_device *pio_dev,
1775                              struct kvm_vcpu *vcpu)
1776 {
1777         struct kvm_pio_request *io = &vcpu->pio;
1778         void *pd = vcpu->pio_data;
1779         int i;
1780
1781         for (i = 0; i < io->cur_count; i++) {
1782                 kvm_iodevice_write(pio_dev, io->port,
1783                                    io->size,
1784                                    pd);
1785                 pd += io->size;
1786         }
1787 }
1788
1789 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1790                   int size, unsigned port)
1791 {
1792         struct kvm_io_device *pio_dev;
1793
1794         vcpu->run->exit_reason = KVM_EXIT_IO;
1795         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1796         vcpu->run->io.size = vcpu->pio.size = size;
1797         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1798         vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1;
1799         vcpu->run->io.port = vcpu->pio.port = port;
1800         vcpu->pio.in = in;
1801         vcpu->pio.string = 0;
1802         vcpu->pio.down = 0;
1803         vcpu->pio.guest_page_offset = 0;
1804         vcpu->pio.rep = 0;
1805
1806         kvm_arch_ops->cache_regs(vcpu);
1807         memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
1808         kvm_arch_ops->decache_regs(vcpu);
1809
1810         pio_dev = vcpu_find_pio_dev(vcpu, port);
1811         if (pio_dev) {
1812                 kernel_pio(pio_dev, vcpu, vcpu->pio_data);
1813                 complete_pio(vcpu);
1814                 return 1;
1815         }
1816         return 0;
1817 }
1818 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
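/*
 * Illustrative userspace side (hypothetical VMM code): when KVM_RUN
 * returns with KVM_EXIT_IO, the bytes live in the vcpu's second mmap()ed
 * page, reachable through run->io.data_offset:
 *
 *        void *p = (char *)run + run->io.data_offset;
 *
 *        if (run->io.direction == KVM_EXIT_IO_OUT)
 *                handle_port_write(run->io.port, run->io.size,
 *                                  run->io.count, p);
 *        else
 *                ... fill p before the next KVM_RUN ...
 *
 * handle_port_write() is a made-up helper name.
 */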
1819
1820 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1821                   int size, unsigned long count, int down,
1822                   gva_t address, int rep, unsigned port)
1823 {
1824         unsigned now, in_page;
1825         int i, ret = 0;
1826         int nr_pages = 1;
1827         struct page *page;
1828         struct kvm_io_device *pio_dev;
1829
1830         vcpu->run->exit_reason = KVM_EXIT_IO;
1831         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1832         vcpu->run->io.size = vcpu->pio.size = size;
1833         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1834         vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count;
1835         vcpu->run->io.port = vcpu->pio.port = port;
1836         vcpu->pio.in = in;
1837         vcpu->pio.string = 1;
1838         vcpu->pio.down = down;
1839         vcpu->pio.guest_page_offset = offset_in_page(address);
1840         vcpu->pio.rep = rep;
1841
1842         if (!count) {
1843                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1844                 return 1;
1845         }
1846
1847         if (!down)
1848                 in_page = PAGE_SIZE - offset_in_page(address);
1849         else
1850                 in_page = offset_in_page(address) + size;
1851         now = min(count, (unsigned long)in_page / size);
1852         if (!now) {
1853                 /*
1854                  * String I/O straddles page boundary.  Pin two guest pages
1855                  * so that we satisfy atomicity constraints.  Do just one
1856                  * transaction to avoid complexity.
1857                  */
1858                 nr_pages = 2;
1859                 now = 1;
1860         }
1861         if (down) {
1862                 /*
1863                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
1864                  */
1865                 pr_unimpl(vcpu, "guest string pio down\n");
1866                 inject_gp(vcpu);
1867                 return 1;
1868         }
1869         vcpu->run->io.count = now;
1870         vcpu->pio.cur_count = now;
1871
1872         for (i = 0; i < nr_pages; ++i) {
1873                 mutex_lock(&vcpu->kvm->lock);
1874                 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
1875                 if (page)
1876                         get_page(page);
1877                 vcpu->pio.guest_pages[i] = page;
1878                 mutex_unlock(&vcpu->kvm->lock);
1879                 if (!page) {
1880                         inject_gp(vcpu);
1881                         free_pio_guest_pages(vcpu);
1882                         return 1;
1883                 }
1884         }
1885
1886         pio_dev = vcpu_find_pio_dev(vcpu, port);
1887         if (!vcpu->pio.in) {
1888                 /* string PIO write */
1889                 ret = pio_copy_data(vcpu);
1890                 if (ret >= 0 && pio_dev) {
1891                         pio_string_write(pio_dev, vcpu);
1892                         complete_pio(vcpu);
1893                         if (vcpu->pio.count == 0)
1894                                 ret = 1;
1895                 }
1896         } else if (pio_dev)
1897                 pr_unimpl(vcpu, "no string pio read support yet, "
1898                        "port %x size %d count %ld\n",
1899                         port, size, count);
1900
1901         return ret;
1902 }
1903 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
1904
1905 static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1906 {
1907         int r;
1908         sigset_t sigsaved;
1909
1910         vcpu_load(vcpu);
1911
1912         if (vcpu->sigset_active)
1913                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
1914
1915         /* re-sync apic's tpr */
1916         set_cr8(vcpu, kvm_run->cr8);
1917
1918         if (vcpu->pio.cur_count) {
1919                 r = complete_pio(vcpu);
1920                 if (r)
1921                         goto out;
1922         }
1923
1924         if (vcpu->mmio_needed) {
1925                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
1926                 vcpu->mmio_read_completed = 1;
1927                 vcpu->mmio_needed = 0;
1928                 r = emulate_instruction(vcpu, kvm_run,
1929                                         vcpu->mmio_fault_cr2, 0);
1930                 if (r == EMULATE_DO_MMIO) {
1931                         /*
1932                          * Read-modify-write.  Back to userspace.
1933                          */
1934                         r = 0;
1935                         goto out;
1936                 }
1937         }
1938
1939         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
1940                 kvm_arch_ops->cache_regs(vcpu);
1941                 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
1942                 kvm_arch_ops->decache_regs(vcpu);
1943         }
1944
1945         r = kvm_arch_ops->run(vcpu, kvm_run);
1946
1947 out:
1948         if (vcpu->sigset_active)
1949                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1950
1951         vcpu_put(vcpu);
1952         return r;
1953 }
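/*
 * Illustrative only: a minimal userspace loop driving this ioctl looks
 * roughly like
 *
 *        for (;;) {
 *                ioctl(vcpu_fd, KVM_RUN, 0);
 *                switch (run->exit_reason) {
 *                case KVM_EXIT_IO:   ... handle pio ...  break;
 *                case KVM_EXIT_MMIO: ... handle mmio ... break;
 *                ...
 *                }
 *        }
 *
 * The pio/mmio completion code at the top of kvm_vcpu_ioctl_run() picks
 * up whatever userspace filled in before the next KVM_RUN.
 */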
1954
1955 static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
1956                                    struct kvm_regs *regs)
1957 {
1958         vcpu_load(vcpu);
1959
1960         kvm_arch_ops->cache_regs(vcpu);
1961
1962         regs->rax = vcpu->regs[VCPU_REGS_RAX];
1963         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
1964         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
1965         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
1966         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
1967         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
1968         regs->rsp = vcpu->regs[VCPU_REGS_RSP];
1969         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
1970 #ifdef CONFIG_X86_64
1971         regs->r8 = vcpu->regs[VCPU_REGS_R8];
1972         regs->r9 = vcpu->regs[VCPU_REGS_R9];
1973         regs->r10 = vcpu->regs[VCPU_REGS_R10];
1974         regs->r11 = vcpu->regs[VCPU_REGS_R11];
1975         regs->r12 = vcpu->regs[VCPU_REGS_R12];
1976         regs->r13 = vcpu->regs[VCPU_REGS_R13];
1977         regs->r14 = vcpu->regs[VCPU_REGS_R14];
1978         regs->r15 = vcpu->regs[VCPU_REGS_R15];
1979 #endif
1980
1981         regs->rip = vcpu->rip;
1982         regs->rflags = kvm_arch_ops->get_rflags(vcpu);
1983
1984         /*
1985          * Don't leak debug flags in case they were set for guest debugging
1986          */
1987         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
1988                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1989
1990         vcpu_put(vcpu);
1991
1992         return 0;
1993 }
1994
1995 static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
1996                                    struct kvm_regs *regs)
1997 {
1998         vcpu_load(vcpu);
1999
2000         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
2001         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
2002         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
2003         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
2004         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
2005         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
2006         vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
2007         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
2008 #ifdef CONFIG_X86_64
2009         vcpu->regs[VCPU_REGS_R8] = regs->r8;
2010         vcpu->regs[VCPU_REGS_R9] = regs->r9;
2011         vcpu->regs[VCPU_REGS_R10] = regs->r10;
2012         vcpu->regs[VCPU_REGS_R11] = regs->r11;
2013         vcpu->regs[VCPU_REGS_R12] = regs->r12;
2014         vcpu->regs[VCPU_REGS_R13] = regs->r13;
2015         vcpu->regs[VCPU_REGS_R14] = regs->r14;
2016         vcpu->regs[VCPU_REGS_R15] = regs->r15;
2017 #endif
2018
2019         vcpu->rip = regs->rip;
2020         kvm_arch_ops->set_rflags(vcpu, regs->rflags);
2021
2022         kvm_arch_ops->decache_regs(vcpu);
2023
2024         vcpu_put(vcpu);
2025
2026         return 0;
2027 }
2028
2029 static void get_segment(struct kvm_vcpu *vcpu,
2030                         struct kvm_segment *var, int seg)
2031 {
2032         return kvm_arch_ops->get_segment(vcpu, var, seg);
2033 }
2034
2035 static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2036                                     struct kvm_sregs *sregs)
2037 {
2038         struct descriptor_table dt;
2039
2040         vcpu_load(vcpu);
2041
2042         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2043         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2044         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2045         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2046         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2047         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2048
2049         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2050         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2051
2052         kvm_arch_ops->get_idt(vcpu, &dt);
2053         sregs->idt.limit = dt.limit;
2054         sregs->idt.base = dt.base;
2055         kvm_arch_ops->get_gdt(vcpu, &dt);
2056         sregs->gdt.limit = dt.limit;
2057         sregs->gdt.base = dt.base;
2058
2059         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
2060         sregs->cr0 = vcpu->cr0;
2061         sregs->cr2 = vcpu->cr2;
2062         sregs->cr3 = vcpu->cr3;
2063         sregs->cr4 = vcpu->cr4;
2064         sregs->cr8 = get_cr8(vcpu);
2065         sregs->efer = vcpu->shadow_efer;
2066         sregs->apic_base = kvm_get_apic_base(vcpu);
2067
2068         memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
2069                sizeof sregs->interrupt_bitmap);
2070
2071         vcpu_put(vcpu);
2072
2073         return 0;
2074 }
2075
2076 static void set_segment(struct kvm_vcpu *vcpu,
2077                         struct kvm_segment *var, int seg)
2078 {
2079         return kvm_arch_ops->set_segment(vcpu, var, seg);
2080 }
2081
2082 static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2083                                     struct kvm_sregs *sregs)
2084 {
2085         int mmu_reset_needed = 0;
2086         int i;
2087         struct descriptor_table dt;
2088
2089         vcpu_load(vcpu);
2090
2091         dt.limit = sregs->idt.limit;
2092         dt.base = sregs->idt.base;
2093         kvm_arch_ops->set_idt(vcpu, &dt);
2094         dt.limit = sregs->gdt.limit;
2095         dt.base = sregs->gdt.base;
2096         kvm_arch_ops->set_gdt(vcpu, &dt);
2097
2098         vcpu->cr2 = sregs->cr2;
2099         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
2100         vcpu->cr3 = sregs->cr3;
2101
2102         set_cr8(vcpu, sregs->cr8);
2103
2104         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
2105 #ifdef CONFIG_X86_64
2106         kvm_arch_ops->set_efer(vcpu, sregs->efer);
2107 #endif
2108         kvm_set_apic_base(vcpu, sregs->apic_base);
2109
2110         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
2111
2112         mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
2113         kvm_arch_ops->set_cr0(vcpu, sregs->cr0);
2114
2115         mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
2116         kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
2117         if (!is_long_mode(vcpu) && is_pae(vcpu))
2118                 load_pdptrs(vcpu, vcpu->cr3);
2119
2120         if (mmu_reset_needed)
2121                 kvm_mmu_reset_context(vcpu);
2122
2123         memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
2124                sizeof vcpu->irq_pending);
2125         vcpu->irq_summary = 0;
2126         for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
2127                 if (vcpu->irq_pending[i])
2128                         __set_bit(i, &vcpu->irq_summary);
2129
2130         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2131         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2132         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2133         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2134         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2135         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2136
2137         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2138         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2139
2140         vcpu_put(vcpu);
2141
2142         return 0;
2143 }
2144
2145 /*
2146  * List of msr numbers which we expose to userspace through KVM_GET_MSRS,
2147  * KVM_SET_MSRS and KVM_GET_MSR_INDEX_LIST.
2148  *
2149  * This list is modified at module load time to reflect the
2150  * capabilities of the host cpu.
2151  */
2152 static u32 msrs_to_save[] = {
2153         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
2154         MSR_K6_STAR,
2155 #ifdef CONFIG_X86_64
2156         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
2157 #endif
2158         MSR_IA32_TIME_STAMP_COUNTER,
2159 };
2160
2161 static unsigned num_msrs_to_save;
2162
2163 static u32 emulated_msrs[] = {
2164         MSR_IA32_MISC_ENABLE,
2165 };
2166
2167 static __init void kvm_init_msr_list(void)
2168 {
2169         u32 dummy[2];
2170         unsigned i, j;
2171
2172         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2173                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2174                         continue;
2175                 if (j < i)
2176                         msrs_to_save[j] = msrs_to_save[i];
2177                 j++;
2178         }
2179         num_msrs_to_save = j;
2180 }
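/*
 * Illustrative only: userspace discovers the resulting list with
 * KVM_GET_MSR_INDEX_LIST on the /dev/kvm fd, roughly:
 *
 *        struct {
 *                struct kvm_msr_list list;
 *                __u32 indices[64];
 *        } m = { .list = { .nmsrs = 64 } };
 *
 *        ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &m);
 */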
2181
2182 /*
2183  * Adapt set_msr() to msr_io()'s calling convention
2184  */
2185 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2186 {
2187         return kvm_set_msr(vcpu, index, *data);
2188 }
2189
2190 /*
2191  * Read or write a bunch of msrs. All parameters are kernel addresses.
2192  *
2193  * @return number of msrs processed successfully.
2194  */
2195 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2196                     struct kvm_msr_entry *entries,
2197                     int (*do_msr)(struct kvm_vcpu *vcpu,
2198                                   unsigned index, u64 *data))
2199 {
2200         int i;
2201
2202         vcpu_load(vcpu);
2203
2204         for (i = 0; i < msrs->nmsrs; ++i)
2205                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2206                         break;
2207
2208         vcpu_put(vcpu);
2209
2210         return i;
2211 }
2212
2213 /*
2214  * Read or write a bunch of msrs. Parameters are user addresses.
2215  *
2216  * @return number of msrs processed successfully.
2217  */
2218 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2219                   int (*do_msr)(struct kvm_vcpu *vcpu,
2220                                 unsigned index, u64 *data),
2221                   int writeback)
2222 {
2223         struct kvm_msrs msrs;
2224         struct kvm_msr_entry *entries;
2225         int r, n;
2226         unsigned size;
2227
2228         r = -EFAULT;
2229         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2230                 goto out;
2231
2232         r = -E2BIG;
2233         if (msrs.nmsrs >= MAX_IO_MSRS)
2234                 goto out;
2235
2236         r = -ENOMEM;
2237         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2238         entries = vmalloc(size);
2239         if (!entries)
2240                 goto out;
2241
2242         r = -EFAULT;
2243         if (copy_from_user(entries, user_msrs->entries, size))
2244                 goto out_free;
2245
2246         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2247         if (r < 0)
2248                 goto out_free;
2249
2250         r = -EFAULT;
2251         if (writeback && copy_to_user(user_msrs->entries, entries, size))
2252                 goto out_free;
2253
2254         r = n;
2255
2256 out_free:
2257         vfree(entries);
2258 out:
2259         return r;
2260 }
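/*
 * Illustrative only: KVM_GET_MSRS/KVM_SET_MSRS take a struct kvm_msrs
 * header immediately followed by the entries, e.g. from userspace:
 *
 *        struct {
 *                struct kvm_msrs hdr;
 *                struct kvm_msr_entry e[1];
 *        } m = {
 *                .hdr = { .nmsrs = 1 },
 *                .e   = { { .index = MSR_IA32_SYSENTER_CS } },
 *        };
 *
 *        ioctl(vcpu_fd, KVM_SET_MSRS, &m);
 */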
2261
2262 /*
2263  * Translate a guest virtual address to a guest physical address.
2264  */
2265 static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2266                                     struct kvm_translation *tr)
2267 {
2268         unsigned long vaddr = tr->linear_address;
2269         gpa_t gpa;
2270
2271         vcpu_load(vcpu);
2272         mutex_lock(&vcpu->kvm->lock);
2273         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2274         tr->physical_address = gpa;
2275         tr->valid = gpa != UNMAPPED_GVA;
2276         tr->writeable = 1;
2277         tr->usermode = 0;
2278         mutex_unlock(&vcpu->kvm->lock);
2279         vcpu_put(vcpu);
2280
2281         return 0;
2282 }
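/*
 * Illustrative only: exercised from userspace (mainly for debugging) as
 *
 *        struct kvm_translation tr = { .linear_address = gva };
 *
 *        ioctl(vcpu_fd, KVM_TRANSLATE, &tr);
 *        if (tr.valid)
 *                ... tr.physical_address holds the guest-physical address ...
 */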
2283
2284 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2285                                     struct kvm_interrupt *irq)
2286 {
2287         if (irq->irq < 0 || irq->irq >= 256)
2288                 return -EINVAL;
2289         if (irqchip_in_kernel(vcpu->kvm))
2290                 return -ENXIO;
2291         vcpu_load(vcpu);
2292
2293         set_bit(irq->irq, vcpu->irq_pending);
2294         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
2295
2296         vcpu_put(vcpu);
2297
2298         return 0;
2299 }
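/*
 * Illustrative only (userspace irqchip model): the VMM injects vector n
 * with
 *
 *        struct kvm_interrupt irq = { .irq = n };
 *
 *        ioctl(vcpu_fd, KVM_INTERRUPT, &irq);
 *
 * typically once kvm_run reports that the guest is ready to accept an
 * interrupt.
 */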
2300
2301 static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2302                                       struct kvm_debug_guest *dbg)
2303 {
2304         int r;
2305
2306         vcpu_load(vcpu);
2307
2308         r = kvm_arch_ops->set_guest_debug(vcpu, dbg);
2309
2310         vcpu_put(vcpu);
2311
2312         return r;
2313 }
2314
2315 static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2316                                     unsigned long address,
2317                                     int *type)
2318 {
2319         struct kvm_vcpu *vcpu = vma->vm_file->private_data;
2320         unsigned long pgoff;
2321         struct page *page;
2322
2323         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2324         if (pgoff == 0)
2325                 page = virt_to_page(vcpu->run);
2326         else if (pgoff == KVM_PIO_PAGE_OFFSET)
2327                 page = virt_to_page(vcpu->pio_data);
2328         else
2329                 return NOPAGE_SIGBUS;
2330         get_page(page);
2331         if (type != NULL)
2332                 *type = VM_FAULT_MINOR;
2333
2334         return page;
2335 }
2336
2337 static struct vm_operations_struct kvm_vcpu_vm_ops = {
2338         .nopage = kvm_vcpu_nopage,
2339 };
2340
2341 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2342 {
2343         vma->vm_ops = &kvm_vcpu_vm_ops;
2344         return 0;
2345 }
2346
2347 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
2348 {
2349         struct kvm_vcpu *vcpu = filp->private_data;
2350
2351         fput(vcpu->kvm->filp);
2352         return 0;
2353 }
2354
2355 static struct file_operations kvm_vcpu_fops = {
2356         .release        = kvm_vcpu_release,
2357         .unlocked_ioctl = kvm_vcpu_ioctl,
2358         .compat_ioctl   = kvm_vcpu_ioctl,
2359         .mmap           = kvm_vcpu_mmap,
2360 };
2361
2362 /*
2363  * Allocates an inode for the vcpu.
2364  */
2365 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2366 {
2367         int fd, r;
2368         struct inode *inode;
2369         struct file *file;
2370
2371         r = anon_inode_getfd(&fd, &inode, &file,
2372                              "kvm-vcpu", &kvm_vcpu_fops, vcpu);
2373         if (r)
2374                 return r;
2375         atomic_inc(&vcpu->kvm->filp->f_count);
2376         return fd;
2377 }
2378
2379 /*
2380  * Creates some virtual cpus.  Good luck creating more than one.
2381  */
2382 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
2383 {
2384         int r;
2385         struct kvm_vcpu *vcpu;
2386
2387         if (!valid_vcpu(n))
2388                 return -EINVAL;
2389
2390         vcpu = kvm_arch_ops->vcpu_create(kvm, n);
2391         if (IS_ERR(vcpu))
2392                 return PTR_ERR(vcpu);
2393
2394         preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
2395
2396         /* We do fxsave: this must be aligned. */
2397         BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
2398
2399         vcpu_load(vcpu);
2400         r = kvm_mmu_setup(vcpu);
2401         vcpu_put(vcpu);
2402         if (r < 0)
2403                 goto free_vcpu;
2404
2405         mutex_lock(&kvm->lock);
2406         if (kvm->vcpus[n]) {
2407                 r = -EEXIST;
2408                 mutex_unlock(&kvm->lock);
2409                 goto mmu_unload;
2410         }
2411         kvm->vcpus[n] = vcpu;
2412         mutex_unlock(&kvm->lock);
2413
2414         /* Now it's all set up, let userspace reach it */
2415         r = create_vcpu_fd(vcpu);
2416         if (r < 0)
2417                 goto unlink;
2418         return r;
2419
2420 unlink:
2421         mutex_lock(&kvm->lock);
2422         kvm->vcpus[n] = NULL;
2423         mutex_unlock(&kvm->lock);
2424
2425 mmu_unload:
2426         vcpu_load(vcpu);
2427         kvm_mmu_unload(vcpu);
2428         vcpu_put(vcpu);
2429
2430 free_vcpu:
2431         kvm_arch_ops->vcpu_free(vcpu);
2432         return r;
2433 }
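/*
 * Illustrative userspace sequence (hypothetical VMM code):
 *
 *        vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 *        len = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *        run = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *                   vcpu_fd, 0);
 *
 * Page 0 of the mapping is the struct kvm_run area, page 1 the pio_data
 * area (see kvm_vcpu_nopage() above).
 */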
2434
2435 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
2436 {
2437         u64 efer;
2438         int i;
2439         struct kvm_cpuid_entry *e, *entry;
2440
2441         rdmsrl(MSR_EFER, efer);
2442         entry = NULL;
2443         for (i = 0; i < vcpu->cpuid_nent; ++i) {
2444                 e = &vcpu->cpuid_entries[i];
2445                 if (e->function == 0x80000001) {
2446                         entry = e;
2447                         break;
2448                 }
2449         }
2450         if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
2451                 entry->edx &= ~(1 << 20);
2452                 printk(KERN_INFO "kvm: guest NX capability removed\n");
2453         }
2454 }
2455
2456 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2457                                     struct kvm_cpuid *cpuid,
2458                                     struct kvm_cpuid_entry __user *entries)
2459 {
2460         int r;
2461
2462         r = -E2BIG;
2463         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2464                 goto out;
2465         r = -EFAULT;
2466         if (copy_from_user(&vcpu->cpuid_entries, entries,
2467                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2468                 goto out;
2469         vcpu->cpuid_nent = cpuid->nent;
2470         cpuid_fix_nx_cap(vcpu);
2471         return 0;
2472
2473 out:
2474         return r;
2475 }
2476
2477 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2478 {
2479         if (sigset) {
2480                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2481                 vcpu->sigset_active = 1;
2482                 vcpu->sigset = *sigset;
2483         } else
2484                 vcpu->sigset_active = 0;
2485         return 0;
2486 }
2487
2488 /*
2489  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
2490  * we have asm/x86/processor.h
2491  */
2492 struct fxsave {
2493         u16     cwd;
2494         u16     swd;
2495         u16     twd;
2496         u16     fop;
2497         u64     rip;
2498         u64     rdp;
2499         u32     mxcsr;
2500         u32     mxcsr_mask;
2501         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
2502 #ifdef CONFIG_X86_64
2503         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
2504 #else
2505         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
2506 #endif
2507 };
2508
2509 static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2510 {
2511         struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
2512
2513         vcpu_load(vcpu);
2514
2515         memcpy(fpu->fpr, fxsave->st_space, 128);
2516         fpu->fcw = fxsave->cwd;
2517         fpu->fsw = fxsave->swd;
2518         fpu->ftwx = fxsave->twd;
2519         fpu->last_opcode = fxsave->fop;
2520         fpu->last_ip = fxsave->rip;
2521         fpu->last_dp = fxsave->rdp;
2522         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2523
2524         vcpu_put(vcpu);
2525
2526         return 0;
2527 }
2528
2529 static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2530 {
2531         struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
2532
2533         vcpu_load(vcpu);
2534
2535         memcpy(fxsave->st_space, fpu->fpr, 128);
2536         fxsave->cwd = fpu->fcw;
2537         fxsave->swd = fpu->fsw;
2538         fxsave->twd = fpu->ftwx;
2539         fxsave->fop = fpu->last_opcode;
2540         fxsave->rip = fpu->last_ip;
2541         fxsave->rdp = fpu->last_dp;
2542         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2543
2544         vcpu_put(vcpu);
2545
2546         return 0;
2547 }
2548
2549 static long kvm_vcpu_ioctl(struct file *filp,
2550                            unsigned int ioctl, unsigned long arg)
2551 {
2552         struct kvm_vcpu *vcpu = filp->private_data;
2553         void __user *argp = (void __user *)arg;
2554         int r = -EINVAL;
2555
2556         switch (ioctl) {
2557         case KVM_RUN:
2558                 r = -EINVAL;
2559                 if (arg)
2560                         goto out;
2561                 r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
2562                 break;
2563         case KVM_GET_REGS: {
2564                 struct kvm_regs kvm_regs;
2565
2566                 memset(&kvm_regs, 0, sizeof kvm_regs);
2567                 r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
2568                 if (r)
2569                         goto out;
2570                 r = -EFAULT;
2571                 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
2572                         goto out;
2573                 r = 0;
2574                 break;
2575         }
2576         case KVM_SET_REGS: {
2577                 struct kvm_regs kvm_regs;
2578
2579                 r = -EFAULT;
2580                 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
2581                         goto out;
2582                 r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
2583                 if (r)
2584                         goto out;
2585                 r = 0;
2586                 break;
2587         }
2588         case KVM_GET_SREGS: {
2589                 struct kvm_sregs kvm_sregs;
2590
2591                 memset(&kvm_sregs, 0, sizeof kvm_sregs);
2592                 r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
2593                 if (r)
2594                         goto out;
2595                 r = -EFAULT;
2596                 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
2597                         goto out;
2598                 r = 0;
2599                 break;
2600         }
2601         case KVM_SET_SREGS: {
2602                 struct kvm_sregs kvm_sregs;
2603
2604                 r = -EFAULT;
2605                 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
2606                         goto out;
2607                 r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
2608                 if (r)
2609                         goto out;
2610                 r = 0;
2611                 break;
2612         }
2613         case KVM_TRANSLATE: {
2614                 struct kvm_translation tr;
2615
2616                 r = -EFAULT;
2617                 if (copy_from_user(&tr, argp, sizeof tr))
2618                         goto out;
2619                 r = kvm_vcpu_ioctl_translate(vcpu, &tr);
2620                 if (r)
2621                         goto out;
2622                 r = -EFAULT;
2623                 if (copy_to_user(argp, &tr, sizeof tr))
2624                         goto out;
2625                 r = 0;
2626                 break;
2627         }
2628         case KVM_INTERRUPT: {
2629                 struct kvm_interrupt irq;
2630
2631                 r = -EFAULT;
2632                 if (copy_from_user(&irq, argp, sizeof irq))
2633                         goto out;
2634                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2635                 if (r)
2636                         goto out;
2637                 r = 0;
2638                 break;
2639         }
2640         case KVM_DEBUG_GUEST: {
2641                 struct kvm_debug_guest dbg;
2642
2643                 r = -EFAULT;
2644                 if (copy_from_user(&dbg, argp, sizeof dbg))
2645                         goto out;
2646                 r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
2647                 if (r)
2648                         goto out;
2649                 r = 0;
2650                 break;
2651         }
2652         case KVM_GET_MSRS:
2653                 r = msr_io(vcpu, argp, kvm_get_msr, 1);
2654                 break;
2655         case KVM_SET_MSRS:
2656                 r = msr_io(vcpu, argp, do_set_msr, 0);
2657                 break;
2658         case KVM_SET_CPUID: {
2659                 struct kvm_cpuid __user *cpuid_arg = argp;
2660                 struct kvm_cpuid cpuid;
2661
2662                 r = -EFAULT;
2663                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2664                         goto out;
2665                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2666                 if (r)
2667                         goto out;
2668                 break;
2669         }
2670         case KVM_SET_SIGNAL_MASK: {
2671                 struct kvm_signal_mask __user *sigmask_arg = argp;
2672                 struct kvm_signal_mask kvm_sigmask;
2673                 sigset_t sigset, *p;
2674
2675                 p = NULL;
2676                 if (argp) {
2677                         r = -EFAULT;
2678                         if (copy_from_user(&kvm_sigmask, argp,
2679                                            sizeof kvm_sigmask))
2680                                 goto out;
2681                         r = -EINVAL;
2682                         if (kvm_sigmask.len != sizeof sigset)
2683                                 goto out;
2684                         r = -EFAULT;
2685                         if (copy_from_user(&sigset, sigmask_arg->sigset,
2686                                            sizeof sigset))
2687                                 goto out;
2688                         p = &sigset;
2689                 }
2690                 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
2691                 break;
2692         }
2693         case KVM_GET_FPU: {
2694                 struct kvm_fpu fpu;
2695
2696                 memset(&fpu, 0, sizeof fpu);
2697                 r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
2698                 if (r)
2699                         goto out;
2700                 r = -EFAULT;
2701                 if (copy_to_user(argp, &fpu, sizeof fpu))
2702                         goto out;
2703                 r = 0;
2704                 break;
2705         }
2706         case KVM_SET_FPU: {
2707                 struct kvm_fpu fpu;
2708
2709                 r = -EFAULT;
2710                 if (copy_from_user(&fpu, argp, sizeof fpu))
2711                         goto out;
2712                 r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
2713                 if (r)
2714                         goto out;
2715                 r = 0;
2716                 break;
2717         }
2718         default:
2719                 ;
2720         }
2721 out:
2722         return r;
2723 }
2724
2725 static long kvm_vm_ioctl(struct file *filp,
2726                            unsigned int ioctl, unsigned long arg)
2727 {
2728         struct kvm *kvm = filp->private_data;
2729         void __user *argp = (void __user *)arg;
2730         int r = -EINVAL;
2731
2732         switch (ioctl) {
2733         case KVM_CREATE_VCPU:
2734                 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
2735                 if (r < 0)
2736                         goto out;
2737                 break;
2738         case KVM_SET_MEMORY_REGION: {
2739                 struct kvm_memory_region kvm_mem;
2740
2741                 r = -EFAULT;
2742                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2743                         goto out;
2744                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
2745                 if (r)
2746                         goto out;
2747                 break;
2748         }
2749         case KVM_GET_DIRTY_LOG: {
2750                 struct kvm_dirty_log log;
2751
2752                 r = -EFAULT;
2753                 if (copy_from_user(&log, argp, sizeof log))
2754                         goto out;
2755                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2756                 if (r)
2757                         goto out;
2758                 break;
2759         }
2760         case KVM_SET_MEMORY_ALIAS: {
2761                 struct kvm_memory_alias alias;
2762
2763                 r = -EFAULT;
2764                 if (copy_from_user(&alias, argp, sizeof alias))
2765                         goto out;
2766                 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
2767                 if (r)
2768                         goto out;
2769                 break;
2770         }
2771         case KVM_CREATE_IRQCHIP:
2772                 r = -ENOMEM;
2773                 kvm->vpic = kvm_create_pic(kvm);
2774                 if (kvm->vpic)
2775                         r = 0;
2776                 else
2777                         goto out;
2778                 break;
2779         case KVM_IRQ_LINE: {
2780                 struct kvm_irq_level irq_event;
2781
2782                 r = -EFAULT;
2783                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
2784                         goto out;
2785                 if (irqchip_in_kernel(kvm)) {
2786                         if (irq_event.irq < 16)
2787                                 kvm_pic_set_irq(pic_irqchip(kvm),
2788                                         irq_event.irq,
2789                                         irq_event.level);
2790                         /* TODO: IOAPIC */
2791                         r = 0;
2792                 }
2793                 break;
2794         }
2795         default:
2796                 ;
2797         }
2798 out:
2799         return r;
2800 }
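/*
 * Illustrative only: with an in-kernel irqchip (KVM_CREATE_IRQCHIP) the
 * VMM drives a PIC input line through KVM_IRQ_LINE, e.g.
 *
 *        struct kvm_irq_level ev = { .irq = 4, .level = 1 };
 *
 *        ioctl(vm_fd, KVM_IRQ_LINE, &ev);
 */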
2801
2802 static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
2803                                   unsigned long address,
2804                                   int *type)
2805 {
2806         struct kvm *kvm = vma->vm_file->private_data;
2807         unsigned long pgoff;
2808         struct page *page;
2809
2810         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2811         page = gfn_to_page(kvm, pgoff);
2812         if (!page)
2813                 return NOPAGE_SIGBUS;
2814         get_page(page);
2815         if (type != NULL)
2816                 *type = VM_FAULT_MINOR;
2817
2818         return page;
2819 }
2820
2821 static struct vm_operations_struct kvm_vm_vm_ops = {
2822         .nopage = kvm_vm_nopage,
2823 };
2824
2825 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
2826 {
2827         vma->vm_ops = &kvm_vm_vm_ops;
2828         return 0;
2829 }
2830
2831 static struct file_operations kvm_vm_fops = {
2832         .release        = kvm_vm_release,
2833         .unlocked_ioctl = kvm_vm_ioctl,
2834         .compat_ioctl   = kvm_vm_ioctl,
2835         .mmap           = kvm_vm_mmap,
2836 };
2837
2838 static int kvm_dev_ioctl_create_vm(void)
2839 {
2840         int fd, r;
2841         struct inode *inode;
2842         struct file *file;
2843         struct kvm *kvm;
2844
2845         kvm = kvm_create_vm();
2846         if (IS_ERR(kvm))
2847                 return PTR_ERR(kvm);
2848         r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
2849         if (r) {
2850                 kvm_destroy_vm(kvm);
2851                 return r;
2852         }
2853
2854         kvm->filp = file;
2855
2856         return fd;
2857 }
2858
2859 static long kvm_dev_ioctl(struct file *filp,
2860                           unsigned int ioctl, unsigned long arg)
2861 {
2862         void __user *argp = (void __user *)arg;
2863         long r = -EINVAL;
2864
2865         switch (ioctl) {
2866         case KVM_GET_API_VERSION:
2867                 r = -EINVAL;
2868                 if (arg)
2869                         goto out;
2870                 r = KVM_API_VERSION;
2871                 break;
2872         case KVM_CREATE_VM:
2873                 r = -EINVAL;
2874                 if (arg)
2875                         goto out;
2876                 r = kvm_dev_ioctl_create_vm();
2877                 break;
2878         case KVM_GET_MSR_INDEX_LIST: {
2879                 struct kvm_msr_list __user *user_msr_list = argp;
2880                 struct kvm_msr_list msr_list;
2881                 unsigned n;
2882
2883                 r = -EFAULT;
2884                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
2885                         goto out;
2886                 n = msr_list.nmsrs;
2887                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
2888                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
2889                         goto out;
2890                 r = -E2BIG;
2891                 if (n < num_msrs_to_save)
2892                         goto out;
2893                 r = -EFAULT;
2894                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
2895                                  num_msrs_to_save * sizeof(u32)))
2896                         goto out;
2897                 if (copy_to_user(user_msr_list->indices
2898                                  + num_msrs_to_save,
2899                                  &emulated_msrs,
2900                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
2901                         goto out;
2902                 r = 0;
2903                 break;
2904         }
2905         case KVM_CHECK_EXTENSION: {
2906                 int ext = (long)argp;
2907
2908                 switch (ext) {
2909                 case KVM_CAP_IRQCHIP:
2910                         r = 1;
2911                         break;
2912                 default:
2913                         r = 0;
2914                         break;
2915                 }
2916                 break;
2917         }
2918         case KVM_GET_VCPU_MMAP_SIZE:
2919                 r = -EINVAL;
2920                 if (arg)
2921                         goto out;
2922                 r = 2 * PAGE_SIZE;
2923                 break;
2924         default:
2925                 ;
2926         }
2927 out:
2928         return r;
2929 }
2930
2931 static struct file_operations kvm_chardev_ops = {
2932         .unlocked_ioctl = kvm_dev_ioctl,
2933         .compat_ioctl   = kvm_dev_ioctl,
2934 };
2935
2936 static struct miscdevice kvm_dev = {
2937         KVM_MINOR,
2938         "kvm",
2939         &kvm_chardev_ops,
2940 };
2941
2942 /*
2943  * Make sure that a cpu that is being hot-unplugged does not have any vcpus
2944  * cached on it.
2945  */
2946 static void decache_vcpus_on_cpu(int cpu)
2947 {
2948         struct kvm *vm;
2949         struct kvm_vcpu *vcpu;
2950         int i;
2951
2952         spin_lock(&kvm_lock);
2953         list_for_each_entry(vm, &vm_list, vm_list)
2954                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2955                         vcpu = vm->vcpus[i];
2956                         if (!vcpu)
2957                                 continue;
2958                         /*
2959                          * If the vcpu is locked, then it is running on some
2960                          * other cpu and therefore it is not cached on the
2961                          * cpu in question.
2962                          *
2963                          * If it's not locked, check the last cpu it executed
2964                          * on.
2965                          */
2966                         if (mutex_trylock(&vcpu->mutex)) {
2967                                 if (vcpu->cpu == cpu) {
2968                                         kvm_arch_ops->vcpu_decache(vcpu);
2969                                         vcpu->cpu = -1;
2970                                 }
2971                                 mutex_unlock(&vcpu->mutex);
2972                         }
2973                 }
2974         spin_unlock(&kvm_lock);
2975 }
2976
2977 static void hardware_enable(void *junk)
2978 {
2979         int cpu = raw_smp_processor_id();
2980
2981         if (cpu_isset(cpu, cpus_hardware_enabled))
2982                 return;
2983         cpu_set(cpu, cpus_hardware_enabled);
2984         kvm_arch_ops->hardware_enable(NULL);
2985 }
2986
2987 static void hardware_disable(void *junk)
2988 {
2989         int cpu = raw_smp_processor_id();
2990
2991         if (!cpu_isset(cpu, cpus_hardware_enabled))
2992                 return;
2993         cpu_clear(cpu, cpus_hardware_enabled);
2994         decache_vcpus_on_cpu(cpu);
2995         kvm_arch_ops->hardware_disable(NULL);
2996 }
2997
2998 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2999                            void *v)
3000 {
3001         int cpu = (long)v;
3002
3003         switch (val) {
3004         case CPU_DYING:
3005         case CPU_DYING_FROZEN:
3006                 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3007                        cpu);
3008                 hardware_disable(NULL);
3009                 break;
3010         case CPU_UP_CANCELED:
3011         case CPU_UP_CANCELED_FROZEN:
3012                 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3013                        cpu);
3014                 smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
3015                 break;
3016         case CPU_ONLINE:
3017         case CPU_ONLINE_FROZEN:
3018                 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
3019                        cpu);
3020                 smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
3021                 break;
3022         }
3023         return NOTIFY_OK;
3024 }
3025
3026 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
3027                        void *v)
3028 {
3029         if (val == SYS_RESTART) {
3030                 /*
3031                  * Some (well, at least mine) BIOSes hang on reboot if
3032                  * in vmx root mode.
3033                  */
3034                 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
3035                 on_each_cpu(hardware_disable, NULL, 0, 1);
3036         }
3037         return NOTIFY_OK;
3038 }
3039
3040 static struct notifier_block kvm_reboot_notifier = {
3041         .notifier_call = kvm_reboot,
3042         .priority = 0,
3043 };
3044
3045 void kvm_io_bus_init(struct kvm_io_bus *bus)
3046 {
3047         memset(bus, 0, sizeof(*bus));
3048 }
3049
3050 void kvm_io_bus_destroy(struct kvm_io_bus *bus)
3051 {
3052         int i;
3053
3054         for (i = 0; i < bus->dev_count; i++) {
3055                 struct kvm_io_device *pos = bus->devs[i];
3056
3057                 kvm_iodevice_destructor(pos);
3058         }
3059 }
3060
3061 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
3062 {
3063         int i;
3064
3065         for (i = 0; i < bus->dev_count; i++) {
3066                 struct kvm_io_device *pos = bus->devs[i];
3067
3068                 if (pos->in_range(pos, addr))
3069                         return pos;
3070         }
3071
3072         return NULL;
3073 }
3074
3075 void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
3076 {
3077         BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
3078
3079         bus->devs[bus->dev_count++] = dev;
3080 }
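/*
 * Note: in-kernel devices (for instance the PIC created by
 * KVM_CREATE_IRQCHIP) are expected to register themselves on kvm->pio_bus
 * or kvm->mmio_bus via kvm_io_bus_register_dev(), so that
 * vcpu_find_pio_dev() and the MMIO lookup can route guest accesses to
 * them.
 */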
3081
3082 static struct notifier_block kvm_cpu_notifier = {
3083         .notifier_call = kvm_cpu_hotplug,
3084         .priority = 20, /* must be > scheduler priority */
3085 };
3086
3087 static u64 stat_get(void *_offset)
3088 {
3089         unsigned offset = (long)_offset;
3090         u64 total = 0;
3091         struct kvm *kvm;
3092         struct kvm_vcpu *vcpu;
3093         int i;
3094
3095         spin_lock(&kvm_lock);
3096         list_for_each_entry(kvm, &vm_list, vm_list)
3097                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3098                         vcpu = kvm->vcpus[i];
3099                         if (vcpu)
3100                                 total += *(u32 *)((void *)vcpu + offset);
3101                 }
3102         spin_unlock(&kvm_lock);
3103         return total;
3104 }
3105
3106 DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n");
3107
3108 static __init void kvm_init_debug(void)
3109 {
3110         struct kvm_stats_debugfs_item *p;
3111
3112         debugfs_dir = debugfs_create_dir("kvm", NULL);
3113         for (p = debugfs_entries; p->name; ++p)
3114                 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
3115                                                 (void *)(long)p->offset,
3116                                                 &stat_fops);
3117 }
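/*
 * Each entry above becomes a read-only file under debugfs (normally
 * /sys/kernel/debug/kvm/) whose value is the corresponding per-vcpu
 * counter summed over all vcpus of all VMs, as computed by stat_get().
 */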
3118
3119 static void kvm_exit_debug(void)
3120 {
3121         struct kvm_stats_debugfs_item *p;
3122
3123         for (p = debugfs_entries; p->name; ++p)
3124                 debugfs_remove(p->dentry);
3125         debugfs_remove(debugfs_dir);
3126 }
3127
3128 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
3129 {
3130         hardware_disable(NULL);
3131         return 0;
3132 }
3133
3134 static int kvm_resume(struct sys_device *dev)
3135 {
3136         hardware_enable(NULL);
3137         return 0;
3138 }
3139
3140 static struct sysdev_class kvm_sysdev_class = {
3141         set_kset_name("kvm"),
3142         .suspend = kvm_suspend,
3143         .resume = kvm_resume,
3144 };
3145
3146 static struct sys_device kvm_sysdev = {
3147         .id = 0,
3148         .cls = &kvm_sysdev_class,
3149 };
3150
3151 hpa_t bad_page_address;
3152
3153 static inline
3154 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
3155 {
3156         return container_of(pn, struct kvm_vcpu, preempt_notifier);
3157 }
3158
3159 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
3160 {
3161         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
3162
3163         kvm_arch_ops->vcpu_load(vcpu, cpu);
3164 }
3165
3166 static void kvm_sched_out(struct preempt_notifier *pn,
3167                           struct task_struct *next)
3168 {
3169         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
3170
3171         kvm_arch_ops->vcpu_put(vcpu);
3172 }
3173
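/*
 * kvm_init_arch()/kvm_exit_arch() are called from the vendor-specific
 * modules (the VMX and SVM backends), which pass in their kvm_arch_ops
 * and the size of their vcpu container, roughly:
 *
 *        return kvm_init_arch(&vmx_arch_ops, sizeof(struct vcpu_vmx),
 *                             THIS_MODULE);
 *
 * (struct and variable names here are the backend's own; shown only as
 * a sketch.)
 */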
3174 int kvm_init_arch(struct kvm_arch_ops *ops, unsigned int vcpu_size,
3175                   struct module *module)
3176 {
3177         int r;
3178         int cpu;
3179
3180         if (kvm_arch_ops) {
3181                 printk(KERN_ERR "kvm: already loaded the other module\n");
3182                 return -EEXIST;
3183         }
3184
3185         if (!ops->cpu_has_kvm_support()) {
3186                 printk(KERN_ERR "kvm: no hardware support\n");
3187                 return -EOPNOTSUPP;
3188         }
3189         if (ops->disabled_by_bios()) {
3190                 printk(KERN_ERR "kvm: disabled by bios\n");
3191                 return -EOPNOTSUPP;
3192         }
3193
3194         kvm_arch_ops = ops;
3195
3196         r = kvm_arch_ops->hardware_setup();
3197         if (r < 0)
3198                 goto out;
3199
3200         for_each_online_cpu(cpu) {
3201                 smp_call_function_single(cpu,
3202                                 kvm_arch_ops->check_processor_compatibility,
3203                                 &r, 0, 1);
3204                 if (r < 0)
3205                         goto out_free_0;
3206         }
3207
3208         on_each_cpu(hardware_enable, NULL, 0, 1);
3209         r = register_cpu_notifier(&kvm_cpu_notifier);
3210         if (r)
3211                 goto out_free_1;
3212         register_reboot_notifier(&kvm_reboot_notifier);
3213
3214         r = sysdev_class_register(&kvm_sysdev_class);
3215         if (r)
3216                 goto out_free_2;
3217
3218         r = sysdev_register(&kvm_sysdev);
3219         if (r)
3220                 goto out_free_3;
3221
3222         /* A kmem cache lets us meet the alignment requirements of fx_save. */
3223         kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
3224                                            __alignof__(struct kvm_vcpu), 0, 0);
3225         if (!kvm_vcpu_cache) {
3226                 r = -ENOMEM;
3227                 goto out_free_4;
3228         }
3229
3230         kvm_chardev_ops.owner = module;
3231
3232         r = misc_register(&kvm_dev);
3233         if (r) {
3234                 printk(KERN_ERR "kvm: misc device register failed\n");
3235                 goto out_free;
3236         }
3237
3238         kvm_preempt_ops.sched_in = kvm_sched_in;
3239         kvm_preempt_ops.sched_out = kvm_sched_out;
3240
3241         return r;
3242
3243 out_free:
3244         kmem_cache_destroy(kvm_vcpu_cache);
3245 out_free_4:
3246         sysdev_unregister(&kvm_sysdev);
3247 out_free_3:
3248         sysdev_class_unregister(&kvm_sysdev_class);
3249 out_free_2:
3250         unregister_reboot_notifier(&kvm_reboot_notifier);
3251         unregister_cpu_notifier(&kvm_cpu_notifier);
3252 out_free_1:
3253         on_each_cpu(hardware_disable, NULL, 0, 1);
3254 out_free_0:
3255         kvm_arch_ops->hardware_unsetup();
3256 out:
3257         kvm_arch_ops = NULL;
3258         return r;
3259 }
3260
3261 void kvm_exit_arch(void)
3262 {
3263         misc_deregister(&kvm_dev);
3264         kmem_cache_destroy(kvm_vcpu_cache);
3265         sysdev_unregister(&kvm_sysdev);
3266         sysdev_class_unregister(&kvm_sysdev_class);
3267         unregister_reboot_notifier(&kvm_reboot_notifier);
3268         unregister_cpu_notifier(&kvm_cpu_notifier);
3269         on_each_cpu(hardware_disable, NULL, 0, 1);
3270         kvm_arch_ops->hardware_unsetup();
3271         kvm_arch_ops = NULL;
3272 }
3273
3274 static __init int kvm_init(void)
3275 {
3276         static struct page *bad_page;
3277         int r;
3278
3279         r = kvm_mmu_module_init();
3280         if (r)
3281                 goto out4;
3282
3283         kvm_init_debug();
3284
3285         kvm_init_msr_list();
3286
3287         if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
3288                 r = -ENOMEM;
3289                 goto out;
3290         }
3291
3292         bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
3293         memset(__va(bad_page_address), 0, PAGE_SIZE);
3294
3295         return 0;
3296
3297 out:
3298         kvm_exit_debug();
3299         kvm_mmu_module_exit();
3300 out4:
3301         return r;
3302 }
3303
3304 static __exit void kvm_exit(void)
3305 {
3306         kvm_exit_debug();
3307         __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3308         kvm_mmu_module_exit();
3309 }
3310
3311 module_init(kvm_init)
3312 module_exit(kvm_exit)
3313
3314 EXPORT_SYMBOL_GPL(kvm_init_arch);
3315 EXPORT_SYMBOL_GPL(kvm_exit_arch);