drivers/kvm/kvm_main.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #include "kvm.h"
19
20 #include <linux/kvm.h>
21 #include <linux/module.h>
22 #include <linux/errno.h>
23 #include <asm/processor.h>
24 #include <linux/percpu.h>
25 #include <linux/gfp.h>
26 #include <asm/msr.h>
27 #include <linux/mm.h>
28 #include <linux/miscdevice.h>
29 #include <linux/vmalloc.h>
30 #include <asm/uaccess.h>
31 #include <linux/reboot.h>
32 #include <asm/io.h>
33 #include <linux/debugfs.h>
34 #include <linux/highmem.h>
35 #include <linux/file.h>
36 #include <asm/desc.h>
37
38 #include "x86_emulate.h"
39 #include "segment_descriptor.h"
40
41 MODULE_AUTHOR("Qumranet");
42 MODULE_LICENSE("GPL");
43
44 struct kvm_arch_ops *kvm_arch_ops;
45 struct kvm_stat kvm_stat;
46 EXPORT_SYMBOL_GPL(kvm_stat);
47
48 static struct kvm_stats_debugfs_item {
49         const char *name;
50         u32 *data;
51         struct dentry *dentry;
52 } debugfs_entries[] = {
53         { "pf_fixed", &kvm_stat.pf_fixed },
54         { "pf_guest", &kvm_stat.pf_guest },
55         { "tlb_flush", &kvm_stat.tlb_flush },
56         { "invlpg", &kvm_stat.invlpg },
57         { "exits", &kvm_stat.exits },
58         { "io_exits", &kvm_stat.io_exits },
59         { "mmio_exits", &kvm_stat.mmio_exits },
60         { "signal_exits", &kvm_stat.signal_exits },
61         { "irq_exits", &kvm_stat.irq_exits },
62         { 0, 0 }
63 };
64
65 static struct dentry *debugfs_dir;
66
67 #define MAX_IO_MSRS 256
68
69 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
70 #define LMSW_GUEST_MASK 0x0eULL
71 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
72 #define CR8_RESEVED_BITS (~0x0fULL)
73 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
74
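/*
 * Look up the guest MSR save slot for @msr in the vcpu's guest_msrs
 * array.  Returns NULL if the MSR is not shadowed for this vcpu.
 */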
75 struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr)
76 {
77         int i;
78
79         for (i = 0; i < vcpu->nmsrs; ++i)
80                 if (vcpu->guest_msrs[i].index == msr)
81                         return &vcpu->guest_msrs[i];
82         return 0;
83 }
84 EXPORT_SYMBOL_GPL(find_msr_entry);
85
86 #ifdef __x86_64__
87 // LDT or TSS descriptor in the GDT. 16 bytes.
88 struct segment_descriptor_64 {
89         struct segment_descriptor s;
90         u32 base_higher;
91         u32 pad_zero;
92 };
93
94 #endif
95
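/*
 * Return the linear base address of the segment named by @selector,
 * read from the host GDT (or from the LDT when the selector's TI bit
 * is set).
 */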
96 unsigned long segment_base(u16 selector)
97 {
98         struct descriptor_table gdt;
99         struct segment_descriptor *d;
100         unsigned long table_base;
101         typedef unsigned long ul;
102         unsigned long v;
103
104         if (selector == 0)
105                 return 0;
106
107         asm ("sgdt %0" : "=m"(gdt));
108         table_base = gdt.base;
109
110         if (selector & 4) {           /* from ldt */
111                 u16 ldt_selector;
112
113                 asm ("sldt %0" : "=g"(ldt_selector));
114                 table_base = segment_base(ldt_selector);
115         }
116         d = (struct segment_descriptor *)(table_base + (selector & ~7));
117         v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
118 #ifdef __x86_64__
119         if (d->system == 0
120             && (d->type == 2 || d->type == 9 || d->type == 11))
121                 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
122 #endif
123         return v;
124 }
125 EXPORT_SYMBOL_GPL(segment_base);
126
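/*
 * Copy @size bytes from guest virtual address @addr into the host
 * buffer @dest, translating and mapping one guest page at a time.
 * Returns the number of bytes actually copied (short if the guest
 * address is unmapped).
 */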
127 int kvm_read_guest(struct kvm_vcpu *vcpu,
128                              gva_t addr,
129                              unsigned long size,
130                              void *dest)
131 {
132         unsigned char *host_buf = dest;
133         unsigned long req_size = size;
134
135         while (size) {
136                 hpa_t paddr;
137                 unsigned now;
138                 unsigned offset;
139                 hva_t guest_buf;
140
141                 paddr = gva_to_hpa(vcpu, addr);
142
143                 if (is_error_hpa(paddr))
144                         break;
145
146                 guest_buf = (hva_t)kmap_atomic(
147                                         pfn_to_page(paddr >> PAGE_SHIFT),
148                                         KM_USER0);
149                 offset = addr & ~PAGE_MASK;
150                 guest_buf |= offset;
151                 now = min(size, PAGE_SIZE - offset);
152                 memcpy(host_buf, (void*)guest_buf, now);
153                 host_buf += now;
154                 addr += now;
155                 size -= now;
156                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
157         }
158         return req_size - size;
159 }
160 EXPORT_SYMBOL_GPL(kvm_read_guest);
161
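/*
 * Copy @size bytes from the host buffer @data to guest virtual
 * address @addr, page by page.  Returns the number of bytes actually
 * written.
 */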
162 int kvm_write_guest(struct kvm_vcpu *vcpu,
163                              gva_t addr,
164                              unsigned long size,
165                              void *data)
166 {
167         unsigned char *host_buf = data;
168         unsigned long req_size = size;
169
170         while (size) {
171                 hpa_t paddr;
172                 unsigned now;
173                 unsigned offset;
174                 hva_t guest_buf;
175
176                 paddr = gva_to_hpa(vcpu, addr);
177
178                 if (is_error_hpa(paddr))
179                         break;
180
181                 guest_buf = (hva_t)kmap_atomic(
182                                 pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0);
183                 offset = addr & ~PAGE_MASK;
184                 guest_buf |= offset;
185                 now = min(size, PAGE_SIZE - offset);
186                 memcpy((void*)guest_buf, host_buf, now);
187                 host_buf += now;
188                 addr += now;
189                 size -= now;
190                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
191         }
192         return req_size - size;
193 }
194 EXPORT_SYMBOL_GPL(kvm_write_guest);
195
196 static int vcpu_slot(struct kvm_vcpu *vcpu)
197 {
198         return vcpu - vcpu->kvm->vcpus;
199 }
200
201 /*
202  * Switches to specified vcpu, until a matching vcpu_put()
203  */
204 static struct kvm_vcpu *vcpu_load(struct kvm *kvm, int vcpu_slot)
205 {
206         struct kvm_vcpu *vcpu = &kvm->vcpus[vcpu_slot];
207
208         mutex_lock(&vcpu->mutex);
209         if (unlikely(!vcpu->vmcs)) {
210                 mutex_unlock(&vcpu->mutex);
211                 return 0;
212         }
213         return kvm_arch_ops->vcpu_load(vcpu);
214 }
215
216 static void vcpu_put(struct kvm_vcpu *vcpu)
217 {
218         kvm_arch_ops->vcpu_put(vcpu);
219         put_cpu();
220         mutex_unlock(&vcpu->mutex);
221 }
222
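/*
 * Each open of /dev/kvm creates a new virtual machine: allocate the
 * struct kvm, initialize its vcpu slots and MMU page list, and stash
 * it in the file's private data.
 */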
223 static int kvm_dev_open(struct inode *inode, struct file *filp)
224 {
225         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
226         int i;
227
228         if (!kvm)
229                 return -ENOMEM;
230
231         spin_lock_init(&kvm->lock);
232         INIT_LIST_HEAD(&kvm->active_mmu_pages);
233         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
234                 struct kvm_vcpu *vcpu = &kvm->vcpus[i];
235
236                 mutex_init(&vcpu->mutex);
237                 vcpu->mmu.root_hpa = INVALID_PAGE;
238                 INIT_LIST_HEAD(&vcpu->free_pages);
239         }
240         filp->private_data = kvm;
241         return 0;
242 }
243
244 /*
245  * Free any memory in @free but not in @dont.
246  */
247 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
248                                   struct kvm_memory_slot *dont)
249 {
250         int i;
251
252         if (!dont || free->phys_mem != dont->phys_mem)
253                 if (free->phys_mem) {
254                         for (i = 0; i < free->npages; ++i)
255                                 __free_page(free->phys_mem[i]);
256                         vfree(free->phys_mem);
257                 }
258
259         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
260                 vfree(free->dirty_bitmap);
261
262         free->phys_mem = 0;
263         free->npages = 0;
264         free->dirty_bitmap = 0;
265 }
266
267 static void kvm_free_physmem(struct kvm *kvm)
268 {
269         int i;
270
271         for (i = 0; i < kvm->nmemslots; ++i)
272                 kvm_free_physmem_slot(&kvm->memslots[i], 0);
273 }
274
275 static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
276 {
277         kvm_arch_ops->vcpu_free(vcpu);
278         kvm_mmu_destroy(vcpu);
279 }
280
281 static void kvm_free_vcpus(struct kvm *kvm)
282 {
283         unsigned int i;
284
285         for (i = 0; i < KVM_MAX_VCPUS; ++i)
286                 kvm_free_vcpu(&kvm->vcpus[i]);
287 }
288
289 static int kvm_dev_release(struct inode *inode, struct file *filp)
290 {
291         struct kvm *kvm = filp->private_data;
292
293         kvm_free_vcpus(kvm);
294         kvm_free_physmem(kvm);
295         kfree(kvm);
296         return 0;
297 }
298
299 static void inject_gp(struct kvm_vcpu *vcpu)
300 {
301         kvm_arch_ops->inject_gp(vcpu, 0);
302 }
303
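/*
 * Check the four PAE page-directory-pointer-table entries referenced
 * by @cr3; returns non-zero if any present entry has reserved bits set.
 */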
304 static int pdptrs_have_reserved_bits_set(struct kvm_vcpu *vcpu,
305                                          unsigned long cr3)
306 {
307         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
308         unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5;
309         int i;
310         u64 pdpte;
311         u64 *pdpt;
312         struct kvm_memory_slot *memslot;
313
314         spin_lock(&vcpu->kvm->lock);
315         memslot = gfn_to_memslot(vcpu->kvm, pdpt_gfn);
316         /* FIXME: !memslot - emulate? 0xff? */
317         pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0);
318
319         for (i = 0; i < 4; ++i) {
320                 pdpte = pdpt[offset + i];
321                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
322                         break;
323         }
324
325         kunmap_atomic(pdpt, KM_USER0);
326         spin_unlock(&vcpu->kvm->lock);
327
328         return i != 4;
329 }
330
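/*
 * Emulate a guest write to CR0: inject #GP for reserved bits and for
 * invalid NW/CD, PG/PE and long-mode combinations, otherwise propagate
 * the new value to the hardware and rebuild the shadow MMU context.
 */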
331 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
332 {
333         if (cr0 & CR0_RESEVED_BITS) {
334                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
335                        cr0, vcpu->cr0);
336                 inject_gp(vcpu);
337                 return;
338         }
339
340         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
341                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
342                 inject_gp(vcpu);
343                 return;
344         }
345
346         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
347                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
348                        "and a clear PE flag\n");
349                 inject_gp(vcpu);
350                 return;
351         }
352
353         if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) {
354 #ifdef __x86_64__
355                 if ((vcpu->shadow_efer & EFER_LME)) {
356                         int cs_db, cs_l;
357
358                         if (!is_pae(vcpu)) {
359                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
360                                        "in long mode while PAE is disabled\n");
361                                 inject_gp(vcpu);
362                                 return;
363                         }
364                         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
365                         if (cs_l) {
366                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
367                                        "in long mode while CS.L == 1\n");
368                                 inject_gp(vcpu);
369                                 return;
370
371                         }
372                 } else
373 #endif
374                 if (is_pae(vcpu) &&
375                             pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
376                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
377                                "reserved bits\n");
378                         inject_gp(vcpu);
379                         return;
380                 }
381
382         }
383
384         kvm_arch_ops->set_cr0(vcpu, cr0);
385         vcpu->cr0 = cr0;
386
387         spin_lock(&vcpu->kvm->lock);
388         kvm_mmu_reset_context(vcpu);
389         spin_unlock(&vcpu->kvm->lock);
390         return;
391 }
392 EXPORT_SYMBOL_GPL(set_cr0);
393
394 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
395 {
396         set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
397 }
398 EXPORT_SYMBOL_GPL(lmsw);
399
400 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
401 {
402         if (cr4 & CR4_RESEVED_BITS) {
403                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
404                 inject_gp(vcpu);
405                 return;
406         }
407
408         if (kvm_arch_ops->is_long_mode(vcpu)) {
409                 if (!(cr4 & CR4_PAE_MASK)) {
410                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
411                                "in long mode\n");
412                         inject_gp(vcpu);
413                         return;
414                 }
415         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK)
416                    && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
417                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
418                 inject_gp(vcpu);
419         }
420
421         if (cr4 & CR4_VMXE_MASK) {
422                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
423                 inject_gp(vcpu);
424                 return;
425         }
426         kvm_arch_ops->set_cr4(vcpu, cr4);
427         spin_lock(&vcpu->kvm->lock);
428         kvm_mmu_reset_context(vcpu);
429         spin_unlock(&vcpu->kvm->lock);
430 }
431 EXPORT_SYMBOL_GPL(set_cr4);
432
433 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
434 {
435         if (kvm_arch_ops->is_long_mode(vcpu)) {
436                 if ( cr3 & CR3_L_MODE_RESEVED_BITS) {
437                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
438                         inject_gp(vcpu);
439                         return;
440                 }
441         } else {
442                 if (cr3 & CR3_RESEVED_BITS) {
443                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
444                         inject_gp(vcpu);
445                         return;
446                 }
447                 if (is_paging(vcpu) && is_pae(vcpu) &&
448                     pdptrs_have_reserved_bits_set(vcpu, cr3)) {
449                         printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
450                                "reserved bits\n");
451                         inject_gp(vcpu);
452                         return;
453                 }
454         }
455
456         vcpu->cr3 = cr3;
457         spin_lock(&vcpu->kvm->lock);
458         vcpu->mmu.new_cr3(vcpu);
459         spin_unlock(&vcpu->kvm->lock);
460 }
461 EXPORT_SYMBOL_GPL(set_cr3);
462
463 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
464 {
465         if ( cr8 & CR8_RESEVED_BITS) {
466                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
467                 inject_gp(vcpu);
468                 return;
469         }
470         vcpu->cr8 = cr8;
471 }
472 EXPORT_SYMBOL_GPL(set_cr8);
473
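/*
 * Initialize the guest FPU/SSE image: save the host state, capture a
 * freshly reset FPU state into guest_fx_image, restore the host state,
 * and set a sane default MXCSR for the guest.
 */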
474 void fx_init(struct kvm_vcpu *vcpu)
475 {
476         struct __attribute__ ((__packed__)) fx_image_s {
477                 u16 control; //fcw
478                 u16 status; //fsw
479                 u16 tag; // ftw
480                 u16 opcode; //fop
481                 u64 ip; // fpu ip
482                 u64 operand;// fpu dp
483                 u32 mxcsr;
484                 u32 mxcsr_mask;
485
486         } *fx_image;
487
488         fx_save(vcpu->host_fx_image);
489         fpu_init();
490         fx_save(vcpu->guest_fx_image);
491         fx_restore(vcpu->host_fx_image);
492
493         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
494         fx_image->mxcsr = 0x1f80;
495         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
496                0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
497 }
498 EXPORT_SYMBOL_GPL(fx_init);
499
500 /*
501  * Creates some virtual cpus.  Good luck creating more than one.
502  */
503 static int kvm_dev_ioctl_create_vcpu(struct kvm *kvm, int n)
504 {
505         int r;
506         struct kvm_vcpu *vcpu;
507
508         r = -EINVAL;
509         if (n < 0 || n >= KVM_MAX_VCPUS)
510                 goto out;
511
512         vcpu = &kvm->vcpus[n];
513
514         mutex_lock(&vcpu->mutex);
515
516         if (vcpu->vmcs) {
517                 mutex_unlock(&vcpu->mutex);
518                 return -EEXIST;
519         }
520
521         vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf,
522                                            FX_IMAGE_ALIGN);
523         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
524
525         vcpu->cpu = -1;  /* First load will set up TR */
526         vcpu->kvm = kvm;
527         r = kvm_arch_ops->vcpu_create(vcpu);
528         if (r < 0)
529                 goto out_free_vcpus;
530
531         kvm_arch_ops->vcpu_load(vcpu);
532
533         r = kvm_arch_ops->vcpu_setup(vcpu);
534         if (r >= 0)
535                 r = kvm_mmu_init(vcpu);
536
537         vcpu_put(vcpu);
538
539         if (r < 0)
540                 goto out_free_vcpus;
541
542         return 0;
543
544 out_free_vcpus:
545         kvm_free_vcpu(vcpu);
546         mutex_unlock(&vcpu->mutex);
547 out:
548         return r;
549 }
550
551 /*
552  * Allocate some memory and give it an address in the guest physical address
553  * space.
554  *
555  * Discontiguous memory is allowed, mostly for framebuffers.
556  */
557 static int kvm_dev_ioctl_set_memory_region(struct kvm *kvm,
558                                            struct kvm_memory_region *mem)
559 {
560         int r;
561         gfn_t base_gfn;
562         unsigned long npages;
563         unsigned long i;
564         struct kvm_memory_slot *memslot;
565         struct kvm_memory_slot old, new;
566         int memory_config_version;
567
568         r = -EINVAL;
569         /* General sanity checks */
570         if (mem->memory_size & (PAGE_SIZE - 1))
571                 goto out;
572         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
573                 goto out;
574         if (mem->slot >= KVM_MEMORY_SLOTS)
575                 goto out;
576         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
577                 goto out;
578
579         memslot = &kvm->memslots[mem->slot];
580         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
581         npages = mem->memory_size >> PAGE_SHIFT;
582
583         if (!npages)
584                 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
585
586 raced:
587         spin_lock(&kvm->lock);
588
589         memory_config_version = kvm->memory_config_version;
590         new = old = *memslot;
591
592         new.base_gfn = base_gfn;
593         new.npages = npages;
594         new.flags = mem->flags;
595
596         /* Disallow changing a memory slot's size. */
597         r = -EINVAL;
598         if (npages && old.npages && npages != old.npages)
599                 goto out_unlock;
600
601         /* Check for overlaps */
602         r = -EEXIST;
603         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
604                 struct kvm_memory_slot *s = &kvm->memslots[i];
605
606                 if (s == memslot)
607                         continue;
608                 if (!((base_gfn + npages <= s->base_gfn) ||
609                       (base_gfn >= s->base_gfn + s->npages)))
610                         goto out_unlock;
611         }
612         /*
613          * Do memory allocations outside lock.  memory_config_version will
614          * detect any races.
615          */
616         spin_unlock(&kvm->lock);
617
618         /* Deallocate if slot is being removed */
619         if (!npages)
620                 new.phys_mem = 0;
621
622         /* Free page dirty bitmap if unneeded */
623         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
624                 new.dirty_bitmap = 0;
625
626         r = -ENOMEM;
627
628         /* Allocate if a slot is being created */
629         if (npages && !new.phys_mem) {
630                 new.phys_mem = vmalloc(npages * sizeof(struct page *));
631
632                 if (!new.phys_mem)
633                         goto out_free;
634
635                 memset(new.phys_mem, 0, npages * sizeof(struct page *));
636                 for (i = 0; i < npages; ++i) {
637                         new.phys_mem[i] = alloc_page(GFP_HIGHUSER
638                                                      | __GFP_ZERO);
639                         if (!new.phys_mem[i])
640                                 goto out_free;
641                 }
642         }
643
644         /* Allocate page dirty bitmap if needed */
645         if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
646                 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
647
648                 new.dirty_bitmap = vmalloc(dirty_bytes);
649                 if (!new.dirty_bitmap)
650                         goto out_free;
651                 memset(new.dirty_bitmap, 0, dirty_bytes);
652         }
653
654         spin_lock(&kvm->lock);
655
656         if (memory_config_version != kvm->memory_config_version) {
657                 spin_unlock(&kvm->lock);
658                 kvm_free_physmem_slot(&new, &old);
659                 goto raced;
660         }
661
662         r = -EAGAIN;
663         if (kvm->busy)
664                 goto out_unlock;
665
666         if (mem->slot >= kvm->nmemslots)
667                 kvm->nmemslots = mem->slot + 1;
668
669         *memslot = new;
670         ++kvm->memory_config_version;
671
672         spin_unlock(&kvm->lock);
673
674         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
675                 struct kvm_vcpu *vcpu;
676
677                 vcpu = vcpu_load(kvm, i);
678                 if (!vcpu)
679                         continue;
680                 kvm_mmu_reset_context(vcpu);
681                 vcpu_put(vcpu);
682         }
683
684         kvm_free_physmem_slot(&old, &new);
685         return 0;
686
687 out_unlock:
688         spin_unlock(&kvm->lock);
689 out_free:
690         kvm_free_physmem_slot(&new, &old);
691 out:
692         return r;
693 }
694
695 /*
696  * Get (and clear) the dirty memory log for a memory slot.
697  */
698 static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm,
699                                        struct kvm_dirty_log *log)
700 {
701         struct kvm_memory_slot *memslot;
702         int r, i;
703         int n;
704         unsigned long any = 0;
705
706         spin_lock(&kvm->lock);
707
708         /*
709          * Prevent changes to guest memory configuration even while the lock
710          * is not taken.
711          */
712         ++kvm->busy;
713         spin_unlock(&kvm->lock);
714         r = -EINVAL;
715         if (log->slot >= KVM_MEMORY_SLOTS)
716                 goto out;
717
718         memslot = &kvm->memslots[log->slot];
719         r = -ENOENT;
720         if (!memslot->dirty_bitmap)
721                 goto out;
722
723         n = ALIGN(memslot->npages, 8) / 8;
724
725         for (i = 0; !any && i < n; ++i)
726                 any = memslot->dirty_bitmap[i];
727
728         r = -EFAULT;
729         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
730                 goto out;
731
732
733         if (any) {
734                 spin_lock(&kvm->lock);
735                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
736                 spin_unlock(&kvm->lock);
737                 memset(memslot->dirty_bitmap, 0, n);
738                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
739                         struct kvm_vcpu *vcpu = vcpu_load(kvm, i);
740
741                         if (!vcpu)
742                                 continue;
743                         kvm_arch_ops->tlb_flush(vcpu);
744                         vcpu_put(vcpu);
745                 }
746         }
747
748         r = 0;
749
750 out:
751         spin_lock(&kvm->lock);
752         --kvm->busy;
753         spin_unlock(&kvm->lock);
754         return r;
755 }
756
757 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
758 {
759         int i;
760
761         for (i = 0; i < kvm->nmemslots; ++i) {
762                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
763
764                 if (gfn >= memslot->base_gfn
765                     && gfn < memslot->base_gfn + memslot->npages)
766                         return memslot;
767         }
768         return 0;
769 }
770 EXPORT_SYMBOL_GPL(gfn_to_memslot);
771
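/*
 * Record that guest frame @gfn has been written: set the corresponding
 * bit in its memslot's dirty bitmap when dirty logging is enabled.
 */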
772 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
773 {
774         int i;
775         struct kvm_memory_slot *memslot = 0;
776         unsigned long rel_gfn;
777
778         for (i = 0; i < kvm->nmemslots; ++i) {
779                 memslot = &kvm->memslots[i];
780
781                 if (gfn >= memslot->base_gfn
782                     && gfn < memslot->base_gfn + memslot->npages) {
783
784                         if (!memslot || !memslot->dirty_bitmap)
785                                 return;
786
787                         rel_gfn = gfn - memslot->base_gfn;
788
789                         /* avoid RMW */
790                         if (!test_bit(rel_gfn, memslot->dirty_bitmap))
791                                 set_bit(rel_gfn, memslot->dirty_bitmap);
792                         return;
793                 }
794         }
795 }
796
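/*
 * x86 emulator callback: read @bytes from guest virtual address @addr
 * through the vcpu's MMU, mapping each backing page in turn.
 */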
797 static int emulator_read_std(unsigned long addr,
798                              unsigned long *val,
799                              unsigned int bytes,
800                              struct x86_emulate_ctxt *ctxt)
801 {
802         struct kvm_vcpu *vcpu = ctxt->vcpu;
803         void *data = val;
804
805         while (bytes) {
806                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
807                 unsigned offset = addr & (PAGE_SIZE-1);
808                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
809                 unsigned long pfn;
810                 struct kvm_memory_slot *memslot;
811                 void *page;
812
813                 if (gpa == UNMAPPED_GVA)
814                         return X86EMUL_PROPAGATE_FAULT;
815                 pfn = gpa >> PAGE_SHIFT;
816                 memslot = gfn_to_memslot(vcpu->kvm, pfn);
817                 if (!memslot)
818                         return X86EMUL_UNHANDLEABLE;
819                 page = kmap_atomic(gfn_to_page(memslot, pfn), KM_USER0);
820
821                 memcpy(data, page + offset, tocopy);
822
823                 kunmap_atomic(page, KM_USER0);
824
825                 bytes -= tocopy;
826                 data += tocopy;
827                 addr += tocopy;
828         }
829
830         return X86EMUL_CONTINUE;
831 }
832
833 static int emulator_write_std(unsigned long addr,
834                               unsigned long val,
835                               unsigned int bytes,
836                               struct x86_emulate_ctxt *ctxt)
837 {
838         printk(KERN_ERR "emulator_write_std: addr %lx n %d\n",
839                addr, bytes);
840         return X86EMUL_UNHANDLEABLE;
841 }
842
843 static int emulator_read_emulated(unsigned long addr,
844                                   unsigned long *val,
845                                   unsigned int bytes,
846                                   struct x86_emulate_ctxt *ctxt)
847 {
848         struct kvm_vcpu *vcpu = ctxt->vcpu;
849
850         if (vcpu->mmio_read_completed) {
851                 memcpy(val, vcpu->mmio_data, bytes);
852                 vcpu->mmio_read_completed = 0;
853                 return X86EMUL_CONTINUE;
854         } else if (emulator_read_std(addr, val, bytes, ctxt)
855                    == X86EMUL_CONTINUE)
856                 return X86EMUL_CONTINUE;
857         else {
858                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
859                 if (gpa == UNMAPPED_GVA)
860                         return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
861                 vcpu->mmio_needed = 1;
862                 vcpu->mmio_phys_addr = gpa;
863                 vcpu->mmio_size = bytes;
864                 vcpu->mmio_is_write = 0;
865
866                 return X86EMUL_UNHANDLEABLE;
867         }
868 }
869
870 static int emulator_write_emulated(unsigned long addr,
871                                    unsigned long val,
872                                    unsigned int bytes,
873                                    struct x86_emulate_ctxt *ctxt)
874 {
875         struct kvm_vcpu *vcpu = ctxt->vcpu;
876         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
877
878         if (gpa == UNMAPPED_GVA)
879                 return X86EMUL_PROPAGATE_FAULT;
880
881         vcpu->mmio_needed = 1;
882         vcpu->mmio_phys_addr = gpa;
883         vcpu->mmio_size = bytes;
884         vcpu->mmio_is_write = 1;
885         memcpy(vcpu->mmio_data, &val, bytes);
886
887         return X86EMUL_CONTINUE;
888 }
889
890 static int emulator_cmpxchg_emulated(unsigned long addr,
891                                      unsigned long old,
892                                      unsigned long new,
893                                      unsigned int bytes,
894                                      struct x86_emulate_ctxt *ctxt)
895 {
896         static int reported;
897
898         if (!reported) {
899                 reported = 1;
900                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
901         }
902         return emulator_write_emulated(addr, new, bytes, ctxt);
903 }
904
905 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
906 {
907         return kvm_arch_ops->get_segment_base(vcpu, seg);
908 }
909
910 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
911 {
912         spin_lock(&vcpu->kvm->lock);
913         vcpu->mmu.inval_page(vcpu, address);
914         spin_unlock(&vcpu->kvm->lock);
915         kvm_arch_ops->invlpg(vcpu, address);
916         return X86EMUL_CONTINUE;
917 }
918
919 int emulate_clts(struct kvm_vcpu *vcpu)
920 {
921         unsigned long cr0 = vcpu->cr0;
922
923         cr0 &= ~CR0_TS_MASK;
924         kvm_arch_ops->set_cr0(vcpu, cr0);
925         return X86EMUL_CONTINUE;
926 }
927
928 int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest)
929 {
930         struct kvm_vcpu *vcpu = ctxt->vcpu;
931
932         switch (dr) {
933         case 0 ... 3:
934                 *dest = kvm_arch_ops->get_dr(vcpu, dr);
935                 return X86EMUL_CONTINUE;
936         default:
937                 printk(KERN_DEBUG "%s: unexpected dr %u\n",
938                        __FUNCTION__, dr);
939                 return X86EMUL_UNHANDLEABLE;
940         }
941 }
942
943 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
944 {
945         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
946         int exception;
947
948         kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
949         if (exception) {
950                 /* FIXME: better handling */
951                 return X86EMUL_UNHANDLEABLE;
952         }
953         return X86EMUL_CONTINUE;
954 }
955
956 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
957 {
958         static int reported;
959         u8 opcodes[4];
960         unsigned long rip = ctxt->vcpu->rip;
961         unsigned long rip_linear;
962
963         rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS);
964
965         if (reported)
966                 return;
967
968         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
969
970         printk(KERN_ERR "emulation failed but !mmio_needed?"
971                " rip %lx %02x %02x %02x %02x\n",
972                rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
973         reported = 1;
974 }
975
976 struct x86_emulate_ops emulate_ops = {
977         .read_std            = emulator_read_std,
978         .write_std           = emulator_write_std,
979         .read_emulated       = emulator_read_emulated,
980         .write_emulated      = emulator_write_emulated,
981         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
982 };
983
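/*
 * Invoke the x86 emulator on the instruction that caused the current
 * exit.  MMIO accesses are forwarded to userspace via @run; returns
 * EMULATE_DONE, EMULATE_DO_MMIO or EMULATE_FAIL.
 */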
984 int emulate_instruction(struct kvm_vcpu *vcpu,
985                         struct kvm_run *run,
986                         unsigned long cr2,
987                         u16 error_code)
988 {
989         struct x86_emulate_ctxt emulate_ctxt;
990         int r;
991         int cs_db, cs_l;
992
993         kvm_arch_ops->cache_regs(vcpu);
994
995         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
996
997         emulate_ctxt.vcpu = vcpu;
998         emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu);
999         emulate_ctxt.cr2 = cr2;
1000         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1001                 ? X86EMUL_MODE_REAL : cs_l
1002                 ? X86EMUL_MODE_PROT64 : cs_db
1003                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1004
1005         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1006                 emulate_ctxt.cs_base = 0;
1007                 emulate_ctxt.ds_base = 0;
1008                 emulate_ctxt.es_base = 0;
1009                 emulate_ctxt.ss_base = 0;
1010         } else {
1011                 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
1012                 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
1013                 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
1014                 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
1015         }
1016
1017         emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
1018         emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1019
1020         vcpu->mmio_is_write = 0;
1021         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1022
1023         if ((r || vcpu->mmio_is_write) && run) {
1024                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1025                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1026                 run->mmio.len = vcpu->mmio_size;
1027                 run->mmio.is_write = vcpu->mmio_is_write;
1028         }
1029
1030         if (r) {
1031                 if (!vcpu->mmio_needed) {
1032                         report_emulation_failure(&emulate_ctxt);
1033                         return EMULATE_FAIL;
1034                 }
1035                 return EMULATE_DO_MMIO;
1036         }
1037
1038         kvm_arch_ops->decache_regs(vcpu);
1039         kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1040
1041         if (vcpu->mmio_is_write)
1042                 return EMULATE_DO_MMIO;
1043
1044         return EMULATE_DONE;
1045 }
1046 EXPORT_SYMBOL_GPL(emulate_instruction);
1047
1048 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1049 {
1050         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1051 }
1052
1053 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1054 {
1055         struct descriptor_table dt = { limit, base };
1056
1057         kvm_arch_ops->set_gdt(vcpu, &dt);
1058 }
1059
1060 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1061 {
1062         struct descriptor_table dt = { limit, base };
1063
1064         kvm_arch_ops->set_idt(vcpu, &dt);
1065 }
1066
1067 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1068                    unsigned long *rflags)
1069 {
1070         lmsw(vcpu, msw);
1071         *rflags = kvm_arch_ops->get_rflags(vcpu);
1072 }
1073
1074 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1075 {
1076         switch (cr) {
1077         case 0:
1078                 return vcpu->cr0;
1079         case 2:
1080                 return vcpu->cr2;
1081         case 3:
1082                 return vcpu->cr3;
1083         case 4:
1084                 return vcpu->cr4;
1085         default:
1086                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1087                 return 0;
1088         }
1089 }
1090
1091 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1092                      unsigned long *rflags)
1093 {
1094         switch (cr) {
1095         case 0:
1096                 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1097                 *rflags = kvm_arch_ops->get_rflags(vcpu);
1098                 break;
1099         case 2:
1100                 vcpu->cr2 = val;
1101                 break;
1102         case 3:
1103                 set_cr3(vcpu, val);
1104                 break;
1105         case 4:
1106                 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1107                 break;
1108         default:
1109                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1110         }
1111 }
1112
1113 /*
1114  * Reads an msr value (of 'msr_index') into 'pdata'.
1115  * Returns 0 on success, non-0 otherwise.
1116  * Assumes vcpu_load() was already called.
1117  */
1118 static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1119 {
1120         return kvm_arch_ops->get_msr(vcpu, msr_index, pdata);
1121 }
1122
1123 #ifdef __x86_64__
1124
1125 void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1126 {
1127         struct vmx_msr_entry *msr;
1128
1129         if (efer & EFER_RESERVED_BITS) {
1130                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1131                        efer);
1132                 inject_gp(vcpu);
1133                 return;
1134         }
1135
1136         if (is_paging(vcpu)
1137             && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1138                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1139                 inject_gp(vcpu);
1140                 return;
1141         }
1142
1143         efer &= ~EFER_LMA;
1144         efer |= vcpu->shadow_efer & EFER_LMA;
1145
1146         vcpu->shadow_efer = efer;
1147
1148         msr = find_msr_entry(vcpu, MSR_EFER);
1149
1150         if (!(efer & EFER_LMA))
1151             efer &= ~EFER_LME;
1152         msr->data = efer;
1153 }
1154 EXPORT_SYMBOL_GPL(set_efer);
1155
1156 #endif
1157
1158 /*
1159  * Writes msr value into the appropriate "register".
1160  * Returns 0 on success, non-0 otherwise.
1161  * Assumes vcpu_load() was already called.
1162  */
1163 static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1164 {
1165         return kvm_arch_ops->set_msr(vcpu, msr_index, data);
1166 }
1167
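/*
 * Drop the vcpu across a voluntary reschedule point and re-load it
 * afterwards.
 */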
1168 void kvm_resched(struct kvm_vcpu *vcpu)
1169 {
1170         vcpu_put(vcpu);
1171         cond_resched();
1172         /* Cannot fail -  no vcpu unplug yet. */
1173         vcpu_load(vcpu->kvm, vcpu_slot(vcpu));
1174 }
1175 EXPORT_SYMBOL_GPL(kvm_resched);
1176
1177 void load_msrs(struct vmx_msr_entry *e, int n)
1178 {
1179         int i;
1180
1181         for (i = 0; i < n; ++i)
1182                 wrmsrl(e[i].index, e[i].data);
1183 }
1184 EXPORT_SYMBOL_GPL(load_msrs);
1185
1186 void save_msrs(struct vmx_msr_entry *e, int n)
1187 {
1188         int i;
1189
1190         for (i = 0; i < n; ++i)
1191                 rdmsrl(e[i].index, e[i].data);
1192 }
1193 EXPORT_SYMBOL_GPL(save_msrs);
1194
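/*
 * KVM_RUN ioctl: complete any pending emulated instruction or MMIO
 * transaction, then enter the guest until the next exit that needs
 * userspace attention.
 */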
1195 static int kvm_dev_ioctl_run(struct kvm *kvm, struct kvm_run *kvm_run)
1196 {
1197         struct kvm_vcpu *vcpu;
1198         int r;
1199
1200         if (kvm_run->vcpu < 0 || kvm_run->vcpu >= KVM_MAX_VCPUS)
1201                 return -EINVAL;
1202
1203         vcpu = vcpu_load(kvm, kvm_run->vcpu);
1204         if (!vcpu)
1205                 return -ENOENT;
1206
1207         if (kvm_run->emulated) {
1208                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1209                 kvm_run->emulated = 0;
1210         }
1211
1212         if (kvm_run->mmio_completed) {
1213                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
1214                 vcpu->mmio_read_completed = 1;
1215         }
1216
1217         vcpu->mmio_needed = 0;
1218
1219         r = kvm_arch_ops->run(vcpu, kvm_run);
1220
1221         vcpu_put(vcpu);
1222         return r;
1223 }
1224
1225 static int kvm_dev_ioctl_get_regs(struct kvm *kvm, struct kvm_regs *regs)
1226 {
1227         struct kvm_vcpu *vcpu;
1228
1229         if (regs->vcpu < 0 || regs->vcpu >= KVM_MAX_VCPUS)
1230                 return -EINVAL;
1231
1232         vcpu = vcpu_load(kvm, regs->vcpu);
1233         if (!vcpu)
1234                 return -ENOENT;
1235
1236         kvm_arch_ops->cache_regs(vcpu);
1237
1238         regs->rax = vcpu->regs[VCPU_REGS_RAX];
1239         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
1240         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
1241         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
1242         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
1243         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
1244         regs->rsp = vcpu->regs[VCPU_REGS_RSP];
1245         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
1246 #ifdef __x86_64__
1247         regs->r8 = vcpu->regs[VCPU_REGS_R8];
1248         regs->r9 = vcpu->regs[VCPU_REGS_R9];
1249         regs->r10 = vcpu->regs[VCPU_REGS_R10];
1250         regs->r11 = vcpu->regs[VCPU_REGS_R11];
1251         regs->r12 = vcpu->regs[VCPU_REGS_R12];
1252         regs->r13 = vcpu->regs[VCPU_REGS_R13];
1253         regs->r14 = vcpu->regs[VCPU_REGS_R14];
1254         regs->r15 = vcpu->regs[VCPU_REGS_R15];
1255 #endif
1256
1257         regs->rip = vcpu->rip;
1258         regs->rflags = kvm_arch_ops->get_rflags(vcpu);
1259
1260         /*
1261          * Don't leak debug flags in case they were set for guest debugging
1262          */
1263         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
1264                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1265
1266         vcpu_put(vcpu);
1267
1268         return 0;
1269 }
1270
1271 static int kvm_dev_ioctl_set_regs(struct kvm *kvm, struct kvm_regs *regs)
1272 {
1273         struct kvm_vcpu *vcpu;
1274
1275         if (regs->vcpu < 0 || regs->vcpu >= KVM_MAX_VCPUS)
1276                 return -EINVAL;
1277
1278         vcpu = vcpu_load(kvm, regs->vcpu);
1279         if (!vcpu)
1280                 return -ENOENT;
1281
1282         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
1283         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
1284         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
1285         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
1286         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
1287         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
1288         vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
1289         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
1290 #ifdef __x86_64__
1291         vcpu->regs[VCPU_REGS_R8] = regs->r8;
1292         vcpu->regs[VCPU_REGS_R9] = regs->r9;
1293         vcpu->regs[VCPU_REGS_R10] = regs->r10;
1294         vcpu->regs[VCPU_REGS_R11] = regs->r11;
1295         vcpu->regs[VCPU_REGS_R12] = regs->r12;
1296         vcpu->regs[VCPU_REGS_R13] = regs->r13;
1297         vcpu->regs[VCPU_REGS_R14] = regs->r14;
1298         vcpu->regs[VCPU_REGS_R15] = regs->r15;
1299 #endif
1300
1301         vcpu->rip = regs->rip;
1302         kvm_arch_ops->set_rflags(vcpu, regs->rflags);
1303
1304         kvm_arch_ops->decache_regs(vcpu);
1305
1306         vcpu_put(vcpu);
1307
1308         return 0;
1309 }
1310
1311 static void get_segment(struct kvm_vcpu *vcpu,
1312                         struct kvm_segment *var, int seg)
1313 {
1314         return kvm_arch_ops->get_segment(vcpu, var, seg);
1315 }
1316
1317 static int kvm_dev_ioctl_get_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
1318 {
1319         struct kvm_vcpu *vcpu;
1320         struct descriptor_table dt;
1321
1322         if (sregs->vcpu < 0 || sregs->vcpu >= KVM_MAX_VCPUS)
1323                 return -EINVAL;
1324         vcpu = vcpu_load(kvm, sregs->vcpu);
1325         if (!vcpu)
1326                 return -ENOENT;
1327
1328         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1329         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1330         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1331         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1332         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1333         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1334
1335         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1336         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1337
1338         kvm_arch_ops->get_idt(vcpu, &dt);
1339         sregs->idt.limit = dt.limit;
1340         sregs->idt.base = dt.base;
1341         kvm_arch_ops->get_gdt(vcpu, &dt);
1342         sregs->gdt.limit = dt.limit;
1343         sregs->gdt.base = dt.base;
1344
1345         sregs->cr0 = vcpu->cr0;
1346         sregs->cr2 = vcpu->cr2;
1347         sregs->cr3 = vcpu->cr3;
1348         sregs->cr4 = vcpu->cr4;
1349         sregs->cr8 = vcpu->cr8;
1350         sregs->efer = vcpu->shadow_efer;
1351         sregs->apic_base = vcpu->apic_base;
1352
1353         memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
1354                sizeof sregs->interrupt_bitmap);
1355
1356         vcpu_put(vcpu);
1357
1358         return 0;
1359 }
1360
1361 static void set_segment(struct kvm_vcpu *vcpu,
1362                         struct kvm_segment *var, int seg)
1363 {
1364         return kvm_arch_ops->set_segment(vcpu, var, seg);
1365 }
1366
1367 static int kvm_dev_ioctl_set_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
1368 {
1369         struct kvm_vcpu *vcpu;
1370         int mmu_reset_needed = 0;
1371         int i;
1372         struct descriptor_table dt;
1373
1374         if (sregs->vcpu < 0 || sregs->vcpu >= KVM_MAX_VCPUS)
1375                 return -EINVAL;
1376         vcpu = vcpu_load(kvm, sregs->vcpu);
1377         if (!vcpu)
1378                 return -ENOENT;
1379
1380         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1381         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1382         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1383         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1384         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1385         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1386
1387         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1388         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1389
1390         dt.limit = sregs->idt.limit;
1391         dt.base = sregs->idt.base;
1392         kvm_arch_ops->set_idt(vcpu, &dt);
1393         dt.limit = sregs->gdt.limit;
1394         dt.base = sregs->gdt.base;
1395         kvm_arch_ops->set_gdt(vcpu, &dt);
1396
1397         vcpu->cr2 = sregs->cr2;
1398         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
1399         vcpu->cr3 = sregs->cr3;
1400
1401         vcpu->cr8 = sregs->cr8;
1402
1403         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
1404 #ifdef __x86_64__
1405         kvm_arch_ops->set_efer(vcpu, sregs->efer);
1406 #endif
1407         vcpu->apic_base = sregs->apic_base;
1408
1409         mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
1410         kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0);
1411
1412         mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
1413         kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
1414
1415         if (mmu_reset_needed)
1416                 kvm_mmu_reset_context(vcpu);
1417
1418         memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
1419                sizeof vcpu->irq_pending);
1420         vcpu->irq_summary = 0;
1421         for (i = 0; i < NR_IRQ_WORDS; ++i)
1422                 if (vcpu->irq_pending[i])
1423                         __set_bit(i, &vcpu->irq_summary);
1424
1425         vcpu_put(vcpu);
1426
1427         return 0;
1428 }
1429
1430 /*
1431  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
1432  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
1433  */
1434 static u32 msrs_to_save[] = {
1435         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1436         MSR_K6_STAR,
1437 #ifdef __x86_64__
1438         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1439 #endif
1440         MSR_IA32_TIME_STAMP_COUNTER,
1441 };
1442
1443
1444 /*
1445  * Adapt set_msr() to msr_io()'s calling convention
1446  */
1447 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1448 {
1449         return set_msr(vcpu, index, *data);
1450 }
1451
1452 /*
1453  * Read or write a bunch of msrs. All parameters are kernel addresses.
1454  *
1455  * @return number of msrs set successfully.
1456  */
1457 static int __msr_io(struct kvm *kvm, struct kvm_msrs *msrs,
1458                     struct kvm_msr_entry *entries,
1459                     int (*do_msr)(struct kvm_vcpu *vcpu,
1460                                   unsigned index, u64 *data))
1461 {
1462         struct kvm_vcpu *vcpu;
1463         int i;
1464
1465         if (msrs->vcpu < 0 || msrs->vcpu >= KVM_MAX_VCPUS)
1466                 return -EINVAL;
1467
1468         vcpu = vcpu_load(kvm, msrs->vcpu);
1469         if (!vcpu)
1470                 return -ENOENT;
1471
1472         for (i = 0; i < msrs->nmsrs; ++i)
1473                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1474                         break;
1475
1476         vcpu_put(vcpu);
1477
1478         return i;
1479 }
1480
1481 /*
1482  * Read or write a bunch of msrs. Parameters are user addresses.
1483  *
1484  * @return number of msrs set successfully.
1485  */
1486 static int msr_io(struct kvm *kvm, struct kvm_msrs __user *user_msrs,
1487                   int (*do_msr)(struct kvm_vcpu *vcpu,
1488                                 unsigned index, u64 *data),
1489                   int writeback)
1490 {
1491         struct kvm_msrs msrs;
1492         struct kvm_msr_entry *entries;
1493         int r, n;
1494         unsigned size;
1495
1496         r = -EFAULT;
1497         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
1498                 goto out;
1499
1500         r = -E2BIG;
1501         if (msrs.nmsrs >= MAX_IO_MSRS)
1502                 goto out;
1503
1504         r = -ENOMEM;
1505         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
1506         entries = vmalloc(size);
1507         if (!entries)
1508                 goto out;
1509
1510         r = -EFAULT;
1511         if (copy_from_user(entries, user_msrs->entries, size))
1512                 goto out_free;
1513
1514         r = n = __msr_io(kvm, &msrs, entries, do_msr);
1515         if (r < 0)
1516                 goto out_free;
1517
1518         r = -EFAULT;
1519         if (writeback && copy_to_user(user_msrs->entries, entries, size))
1520                 goto out_free;
1521
1522         r = n;
1523
1524 out_free:
1525         vfree(entries);
1526 out:
1527         return r;
1528 }
1529
1530 /*
1531  * Translate a guest virtual address to a guest physical address.
1532  */
1533 static int kvm_dev_ioctl_translate(struct kvm *kvm, struct kvm_translation *tr)
1534 {
1535         unsigned long vaddr = tr->linear_address;
1536         struct kvm_vcpu *vcpu;
1537         gpa_t gpa;
1538
1539         vcpu = vcpu_load(kvm, tr->vcpu);
1540         if (!vcpu)
1541                 return -ENOENT;
1542         spin_lock(&kvm->lock);
1543         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
1544         tr->physical_address = gpa;
1545         tr->valid = gpa != UNMAPPED_GVA;
1546         tr->writeable = 1;
1547         tr->usermode = 0;
1548         spin_unlock(&kvm->lock);
1549         vcpu_put(vcpu);
1550
1551         return 0;
1552 }
1553
1554 static int kvm_dev_ioctl_interrupt(struct kvm *kvm, struct kvm_interrupt *irq)
1555 {
1556         struct kvm_vcpu *vcpu;
1557
1558         if (irq->vcpu < 0 || irq->vcpu >= KVM_MAX_VCPUS)
1559                 return -EINVAL;
1560         if (irq->irq < 0 || irq->irq >= 256)
1561                 return -EINVAL;
1562         vcpu = vcpu_load(kvm, irq->vcpu);
1563         if (!vcpu)
1564                 return -ENOENT;
1565
1566         set_bit(irq->irq, vcpu->irq_pending);
1567         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
1568
1569         vcpu_put(vcpu);
1570
1571         return 0;
1572 }
1573
1574 static int kvm_dev_ioctl_debug_guest(struct kvm *kvm,
1575                                      struct kvm_debug_guest *dbg)
1576 {
1577         struct kvm_vcpu *vcpu;
1578         int r;
1579
1580         if (dbg->vcpu < 0 || dbg->vcpu >= KVM_MAX_VCPUS)
1581                 return -EINVAL;
1582         vcpu = vcpu_load(kvm, dbg->vcpu);
1583         if (!vcpu)
1584                 return -ENOENT;
1585
1586         r = kvm_arch_ops->set_guest_debug(vcpu, dbg);
1587
1588         vcpu_put(vcpu);
1589
1590         return r;
1591 }
1592
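/*
 * Dispatch ioctls issued on a /dev/kvm file descriptor (one VM per
 * open): copy the argument structure in, call the handler, and copy
 * any results back to userspace.
 */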
1593 static long kvm_dev_ioctl(struct file *filp,
1594                           unsigned int ioctl, unsigned long arg)
1595 {
1596         struct kvm *kvm = filp->private_data;
1597         int r = -EINVAL;
1598
1599         switch (ioctl) {
1600         case KVM_CREATE_VCPU: {
1601                 r = kvm_dev_ioctl_create_vcpu(kvm, arg);
1602                 if (r)
1603                         goto out;
1604                 break;
1605         }
1606         case KVM_RUN: {
1607                 struct kvm_run kvm_run;
1608
1609                 r = -EFAULT;
1610                 if (copy_from_user(&kvm_run, (void *)arg, sizeof kvm_run))
1611                         goto out;
1612                 r = kvm_dev_ioctl_run(kvm, &kvm_run);
1613                 if (r < 0)
1614                         goto out;
1615                 r = -EFAULT;
1616                 if (copy_to_user((void *)arg, &kvm_run, sizeof kvm_run))
1617                         goto out;
1618                 r = 0;
1619                 break;
1620         }
1621         case KVM_GET_REGS: {
1622                 struct kvm_regs kvm_regs;
1623
1624                 r = -EFAULT;
1625                 if (copy_from_user(&kvm_regs, (void *)arg, sizeof kvm_regs))
1626                         goto out;
1627                 r = kvm_dev_ioctl_get_regs(kvm, &kvm_regs);
1628                 if (r)
1629                         goto out;
1630                 r = -EFAULT;
1631                 if (copy_to_user((void *)arg, &kvm_regs, sizeof kvm_regs))
1632                         goto out;
1633                 r = 0;
1634                 break;
1635         }
1636         case KVM_SET_REGS: {
1637                 struct kvm_regs kvm_regs;
1638
1639                 r = -EFAULT;
1640                 if (copy_from_user(&kvm_regs, (void *)arg, sizeof kvm_regs))
1641                         goto out;
1642                 r = kvm_dev_ioctl_set_regs(kvm, &kvm_regs);
1643                 if (r)
1644                         goto out;
1645                 r = 0;
1646                 break;
1647         }
1648         case KVM_GET_SREGS: {
1649                 struct kvm_sregs kvm_sregs;
1650
1651                 r = -EFAULT;
1652                 if (copy_from_user(&kvm_sregs, (void *)arg, sizeof kvm_sregs))
1653                         goto out;
1654                 r = kvm_dev_ioctl_get_sregs(kvm, &kvm_sregs);
1655                 if (r)
1656                         goto out;
1657                 r = -EFAULT;
1658                 if (copy_to_user((void *)arg, &kvm_sregs, sizeof kvm_sregs))
1659                         goto out;
1660                 r = 0;
1661                 break;
1662         }
1663         case KVM_SET_SREGS: {
1664                 struct kvm_sregs kvm_sregs;
1665
1666                 r = -EFAULT;
1667                 if (copy_from_user(&kvm_sregs, (void *)arg, sizeof kvm_sregs))
1668                         goto out;
1669                 r = kvm_dev_ioctl_set_sregs(kvm, &kvm_sregs);
1670                 if (r)
1671                         goto out;
1672                 r = 0;
1673                 break;
1674         }
1675         case KVM_TRANSLATE: {
1676                 struct kvm_translation tr;
1677
1678                 r = -EFAULT;
1679                 if (copy_from_user(&tr, (void *)arg, sizeof tr))
1680                         goto out;
1681                 r = kvm_dev_ioctl_translate(kvm, &tr);
1682                 if (r)
1683                         goto out;
1684                 r = -EFAULT;
1685                 if (copy_to_user((void *)arg, &tr, sizeof tr))
1686                         goto out;
1687                 r = 0;
1688                 break;
1689         }
1690         case KVM_INTERRUPT: {
1691                 struct kvm_interrupt irq;
1692
1693                 r = -EFAULT;
1694                 if (copy_from_user(&irq, (void __user *)arg, sizeof irq))
1695                         goto out;
1696                 r = kvm_dev_ioctl_interrupt(kvm, &irq);
1697                 if (r)
1698                         goto out;
1699                 r = 0;
1700                 break;
1701         }
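             /*
              * KVM_DEBUG_GUEST configures guest debugging (single-step and
              * breakpoints) for the vcpu.
              */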
1702         case KVM_DEBUG_GUEST: {
1703                 struct kvm_debug_guest dbg;
1704
1705                 r = -EFAULT;
1706                 if (copy_from_user(&dbg, (void __user *)arg, sizeof dbg))
1707                         goto out;
1708                 r = kvm_dev_ioctl_debug_guest(kvm, &dbg);
1709                 if (r)
1710                         goto out;
1711                 r = 0;
1712                 break;
1713         }
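             /*
              * KVM_SET_MEMORY_REGION defines a slot of guest physical
              * memory; the backing pages are allocated by the kernel and
              * reach userspace via mmap() on this fd (see kvm_dev_nopage()
              * below).
              */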
1714         case KVM_SET_MEMORY_REGION: {
1715                 struct kvm_memory_region kvm_mem;
1716
1717                 r = -EFAULT;
1718                 if (copy_from_user(&kvm_mem, (void __user *)arg, sizeof kvm_mem))
1719                         goto out;
1720                 r = kvm_dev_ioctl_set_memory_region(kvm, &kvm_mem);
1721                 if (r)
1722                         goto out;
1723                 break;
1724         }
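             /*
              * KVM_GET_DIRTY_LOG returns, for one memory slot, a bitmap of
              * guest pages dirtied since the log was last fetched.
              */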
1725         case KVM_GET_DIRTY_LOG: {
1726                 struct kvm_dirty_log log;
1727
1728                 r = -EFAULT;
1729                 if (copy_from_user(&log, (void __user *)arg, sizeof log))
1730                         goto out;
1731                 r = kvm_dev_ioctl_get_dirty_log(kvm, &log);
1732                 if (r)
1733                         goto out;
1734                 break;
1735         }
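             /*
              * KVM_GET_MSRS / KVM_SET_MSRS operate on a variable-length
              * struct kvm_msrs; msr_io() handles the user copies and applies
              * get_msr()/do_set_msr() to each entry in turn.
              */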
1736         case KVM_GET_MSRS:
1737                 r = msr_io(kvm, (void __user *)arg, get_msr, 1);
1738                 break;
1739         case KVM_SET_MSRS:
1740                 r = msr_io(kvm, (void __user *)arg, do_set_msr, 0);
1741                 break;
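             /*
              * KVM_GET_MSR_INDEX_LIST is a two-step capacity query: userspace
              * passes the size of its indices array in nmsrs, the kernel
              * writes back how many MSRs it actually saves, and returns
              * -E2BIG if the supplied array was too small.
              */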
1742         case KVM_GET_MSR_INDEX_LIST: {
1743                 struct kvm_msr_list __user *user_msr_list = (void __user *)arg;
1744                 struct kvm_msr_list msr_list;
1745                 unsigned n;
1746
1747                 r = -EFAULT;
1748                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
1749                         goto out;
1750                 n = msr_list.nmsrs;
1751                 msr_list.nmsrs = ARRAY_SIZE(msrs_to_save);
1752                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1753                         goto out;
1754                 r = -E2BIG;
1755                 if (n < ARRAY_SIZE(msrs_to_save))
1756                         goto out;
1757                 r = -EFAULT;
1758                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1759                                  sizeof msrs_to_save))
1760                         goto out;
1761                 r = 0;
                     break;
1762         }
1763         default:
1764                 ;
1765         }
1766 out:
1767         return r;
1768 }
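
     /*
      * Illustrative userspace sequence for the ioctls above -- a rough
      * sketch, not part of the driver; error handling is omitted and the
      * exact structure layouts are whatever <linux/kvm.h> defined at the
      * time:
      *
      *      int fd = open("/dev/kvm", O_RDWR);        // one fd == one VM
      *
      *      struct kvm_memory_region mem = { ... };   // describe slot 0
      *      ioctl(fd, KVM_SET_MEMORY_REGION, &mem);
      *
      *      ioctl(fd, KVM_CREATE_VCPU, 0);            // vcpu slot 0
      *
      *      struct kvm_run run = { ... };             // identifies the vcpu
      *      for (;;) {
      *              ioctl(fd, KVM_RUN, &run);
      *              // inspect the exit information in run, emulate the
      *              // requested I/O, then re-enter the guest
      *      }
      */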
1769
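     /*
      * Fault handler for mmap()s of /dev/kvm: the file offset, in pages, is
      * the guest frame number.  Look up the memory slot that covers it and
      * hand back the backing page (with an extra reference) so the fault is
      * satisfied with the guest's own memory.
      */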
1770 static struct page *kvm_dev_nopage(struct vm_area_struct *vma,
1771                                    unsigned long address,
1772                                    int *type)
1773 {
1774         struct kvm *kvm = vma->vm_file->private_data;
1775         unsigned long pgoff;
1776         struct kvm_memory_slot *slot;
1777         struct page *page;
1778
1779         *type = VM_FAULT_MINOR;
1780         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1781         slot = gfn_to_memslot(kvm, pgoff);
1782         if (!slot)
1783                 return NOPAGE_SIGBUS;
1784         page = gfn_to_page(slot, pgoff);
1785         if (!page)
1786                 return NOPAGE_SIGBUS;
1787         get_page(page);
1788         return page;
1789 }
1790
1791 static struct vm_operations_struct kvm_dev_vm_ops = {
1792         .nopage = kvm_dev_nopage,
1793 };
1794
1795 static int kvm_dev_mmap(struct file *file, struct vm_area_struct *vma)
1796 {
1797         vma->vm_ops = &kvm_dev_vm_ops;
1798         return 0;
1799 }
1800
1801 static struct file_operations kvm_chardev_ops = {
1802         .open           = kvm_dev_open,
1803         .release        = kvm_dev_release,
1804         .unlocked_ioctl = kvm_dev_ioctl,
1805         .compat_ioctl   = kvm_dev_ioctl,
1806         .mmap           = kvm_dev_mmap,
1807 };
1808
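     /*
      * /dev/kvm is a misc character device with a dynamically assigned
      * minor number; everything above hangs off its file operations.
      */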
1809 static struct miscdevice kvm_dev = {
1810         .minor = MISC_DYNAMIC_MINOR,
1811         .name  = "kvm",
1812         .fops  = &kvm_chardev_ops,
1813 };
1814
1815 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
1816                        void *v)
1817 {
1818         if (val == SYS_RESTART) {
1819                 /*
1820                  * Some (well, at least mine) BIOSes hang on reboot if
1821                  * in vmx root mode.
1822                  */
1823                 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
1824                 on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
1825         }
1826         return NOTIFY_OK;
1827 }
1828
1829 static struct notifier_block kvm_reboot_notifier = {
1830         .notifier_call = kvm_reboot,
1831         .priority = 0,
1832 };
1833
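     /*
      * Expose the counters in debugfs_entries[] as read-only u32 files
      * under the "kvm" debugfs directory.
      */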
1834 static __init void kvm_init_debug(void)
1835 {
1836         struct kvm_stats_debugfs_item *p;
1837
1838         debugfs_dir = debugfs_create_dir("kvm", 0);
1839         for (p = debugfs_entries; p->name; ++p)
1840                 p->dentry = debugfs_create_u32(p->name, 0444, debugfs_dir,
1841                                                p->data);
1842 }
1843
1844 static void kvm_exit_debug(void)
1845 {
1846         struct kvm_stats_debugfs_item *p;
1847
1848         for (p = debugfs_entries; p->name; ++p)
1849                 debugfs_remove(p->dentry);
1850         debugfs_remove(debugfs_dir);
1851 }
1852
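     /*
      * Physical address of a zero-filled page allocated at module init;
      * presumably used by the MMU code as a safe fallback frame when a
      * guest frame has no real backing page.
      */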
1853 hpa_t bad_page_address;
1854
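     /*
      * Called by the architecture-specific module (e.g. the VMX support)
      * once it has probed the CPU: record its ops table, enable hardware
      * virtualization on every online CPU, hook the reboot notifier and
      * register the /dev/kvm device.
      */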
1855 int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
1856 {
1857         int r;
1858
1859         kvm_arch_ops = ops;
1860
1861         if (!kvm_arch_ops->cpu_has_kvm_support()) {
1862                 printk(KERN_ERR "kvm: no hardware support\n");
1863                 return -EOPNOTSUPP;
1864         }
1865         if (kvm_arch_ops->disabled_by_bios()) {
1866                 printk(KERN_ERR "kvm: disabled by bios\n");
1867                 return -EOPNOTSUPP;
1868         }
1869
1870         r = kvm_arch_ops->hardware_setup();
1871         if (r < 0)
1872                 return r;
1873
1874         on_each_cpu(kvm_arch_ops->hardware_enable, 0, 0, 1);
1875         register_reboot_notifier(&kvm_reboot_notifier);
1876
1877         kvm_chardev_ops.owner = module;
1878
1879         r = misc_register(&kvm_dev);
1880         if (r) {
1881                 printk(KERN_ERR "kvm: misc device register failed\n");
1882                 goto out_free;
1883         }
1884
1885         return r;
1886
1887 out_free:
1888         unregister_reboot_notifier(&kvm_reboot_notifier);
1889         on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
1890         kvm_arch_ops->hardware_unsetup();
1891         return r;
1892 }
1893
1894 void kvm_exit_arch(void)
1895 {
1896         misc_deregister(&kvm_dev);
1897
1898         unregister_reboot_notifier(&kvm_reboot_notifier);
1899         on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
1900         kvm_arch_ops->hardware_unsetup();
1901 }
1902
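     /*
      * Module init/exit only set up debugfs and the fallback page; the
      * heavy lifting happens when an arch module calls kvm_init_arch().
      */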
1903 static __init int kvm_init(void)
1904 {
1905         static struct page *bad_page;
1906         int r = 0;
1907
1908         kvm_init_debug();
1909
1910         if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
1911                 r = -ENOMEM;
1912                 goto out;
1913         }
1914
1915         bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
1916         memset(__va(bad_page_address), 0, PAGE_SIZE);
1917
1918         return r;
1919
1920 out:
1921         kvm_exit_debug();
1922         return r;
1923 }
1924
1925 static __exit void kvm_exit(void)
1926 {
1927         kvm_exit_debug();
1928         __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
1929 }
1930
1931 module_init(kvm_init)
1932 module_exit(kvm_exit)
1933
1934 EXPORT_SYMBOL_GPL(kvm_init_arch);
1935 EXPORT_SYMBOL_GPL(kvm_exit_arch);