1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #include "kvm.h"
19
20 #include <linux/kvm.h>
21 #include <linux/module.h>
22 #include <linux/errno.h>
23 #include <linux/magic.h>
24 #include <asm/processor.h>
25 #include <linux/percpu.h>
26 #include <linux/gfp.h>
27 #include <asm/msr.h>
28 #include <linux/mm.h>
29 #include <linux/miscdevice.h>
30 #include <linux/vmalloc.h>
31 #include <asm/uaccess.h>
32 #include <linux/reboot.h>
33 #include <asm/io.h>
34 #include <linux/debugfs.h>
35 #include <linux/highmem.h>
36 #include <linux/file.h>
37 #include <asm/desc.h>
38 #include <linux/sysdev.h>
39 #include <linux/cpu.h>
40 #include <linux/file.h>
41 #include <linux/fs.h>
42 #include <linux/mount.h>
43 #include <linux/sched.h>
44
45 #include "x86_emulate.h"
46 #include "segment_descriptor.h"
47
48 MODULE_AUTHOR("Qumranet");
49 MODULE_LICENSE("GPL");
50
51 static DEFINE_SPINLOCK(kvm_lock);
52 static LIST_HEAD(vm_list);
53
54 struct kvm_arch_ops *kvm_arch_ops;
55
56 #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
57
58 static struct kvm_stats_debugfs_item {
59         const char *name;
60         int offset;
61         struct dentry *dentry;
62 } debugfs_entries[] = {
63         { "pf_fixed", STAT_OFFSET(pf_fixed) },
64         { "pf_guest", STAT_OFFSET(pf_guest) },
65         { "tlb_flush", STAT_OFFSET(tlb_flush) },
66         { "invlpg", STAT_OFFSET(invlpg) },
67         { "exits", STAT_OFFSET(exits) },
68         { "io_exits", STAT_OFFSET(io_exits) },
69         { "mmio_exits", STAT_OFFSET(mmio_exits) },
70         { "signal_exits", STAT_OFFSET(signal_exits) },
71         { "irq_window", STAT_OFFSET(irq_window_exits) },
72         { "halt_exits", STAT_OFFSET(halt_exits) },
73         { "request_irq", STAT_OFFSET(request_irq_exits) },
74         { "irq_exits", STAT_OFFSET(irq_exits) },
75         { NULL }
76 };
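/*
 * Illustrative sketch: each entry above records, via STAT_OFFSET(), the byte
 * offset of a counter inside struct kvm_vcpu, so a single generic debugfs
 * reader can serve every statistic.  Assuming 32-bit counters, such a reader
 * would do roughly:
 *
 *	int off = STAT_OFFSET(halt_exits);
 *	u32 value = *(u32 *)((char *)vcpu + off);
 */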
77
78 static struct dentry *debugfs_dir;
79
80 struct vfsmount *kvmfs_mnt;
81
82 #define MAX_IO_MSRS 256
83
84 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
85 #define LMSW_GUEST_MASK 0x0eULL
86 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
87 #define CR8_RESEVED_BITS (~0x0fULL)
88 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
89
90 #ifdef CONFIG_X86_64
91 /* LDT or TSS descriptor in the GDT. 16 bytes. */
92 struct segment_descriptor_64 {
93         struct segment_descriptor s;
94         u32 base_higher;
95         u32 pad_zero;
96 };
97
98 #endif
99
100 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
101                            unsigned long arg);
102
103 static struct inode *kvmfs_inode(struct file_operations *fops)
104 {
105         int error = -ENOMEM;
106         struct inode *inode = new_inode(kvmfs_mnt->mnt_sb);
107
108         if (!inode)
109                 goto eexit_1;
110
111         inode->i_fop = fops;
112
113         /*
114          * Mark the inode dirty from the very beginning,
115          * that way it will never be moved to the dirty
116          * list because mark_inode_dirty() will think
117          * that it already _is_ on the dirty list.
118          */
119         inode->i_state = I_DIRTY;
120         inode->i_mode = S_IRUSR | S_IWUSR;
121         inode->i_uid = current->fsuid;
122         inode->i_gid = current->fsgid;
123         inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
124         return inode;
125
126 eexit_1:
127         return ERR_PTR(error);
128 }
129
130 static struct file *kvmfs_file(struct inode *inode, void *private_data)
131 {
132         struct file *file = get_empty_filp();
133
134         if (!file)
135                 return ERR_PTR(-ENFILE);
136
137         file->f_path.mnt = mntget(kvmfs_mnt);
138         file->f_path.dentry = d_alloc_anon(inode);
139         if (!file->f_path.dentry)
140                 return ERR_PTR(-ENOMEM);
141         file->f_mapping = inode->i_mapping;
142
143         file->f_pos = 0;
144         file->f_flags = O_RDWR;
145         file->f_op = inode->i_fop;
146         file->f_mode = FMODE_READ | FMODE_WRITE;
147         file->f_version = 0;
148         file->private_data = private_data;
149         return file;
150 }
151
152 unsigned long segment_base(u16 selector)
153 {
154         struct descriptor_table gdt;
155         struct segment_descriptor *d;
156         unsigned long table_base;
157         typedef unsigned long ul;
158         unsigned long v;
159
160         if (selector == 0)
161                 return 0;
162
163         asm ("sgdt %0" : "=m"(gdt));
164         table_base = gdt.base;
165
166         if (selector & 4) {           /* from ldt */
167                 u16 ldt_selector;
168
169                 asm ("sldt %0" : "=g"(ldt_selector));
170                 table_base = segment_base(ldt_selector);
171         }
172         d = (struct segment_descriptor *)(table_base + (selector & ~7));
173         v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
174 #ifdef CONFIG_X86_64
175         if (d->system == 0
176             && (d->type == 2 || d->type == 9 || d->type == 11))
177                 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
178 #endif
179         return v;
180 }
181 EXPORT_SYMBOL_GPL(segment_base);
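/*
 * Illustrative note: a selector's bits 15:3 index 8-byte descriptors and bit 2
 * chooses GDT vs. LDT, so selector 0x10 names GDT entry 2 at byte offset 16.
 * segment_base() stitches the base back together from base_low, base_mid and
 * base_high, and on x86_64 also base_higher for LDT/TSS descriptors, which
 * occupy 16 bytes there.
 */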
182
183 static inline int valid_vcpu(int n)
184 {
185         return likely(n >= 0 && n < KVM_MAX_VCPUS);
186 }
187
188 int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
189                    void *dest)
190 {
191         unsigned char *host_buf = dest;
192         unsigned long req_size = size;
193
194         while (size) {
195                 hpa_t paddr;
196                 unsigned now;
197                 unsigned offset;
198                 hva_t guest_buf;
199
200                 paddr = gva_to_hpa(vcpu, addr);
201
202                 if (is_error_hpa(paddr))
203                         break;
204
205                 guest_buf = (hva_t)kmap_atomic(
206                                         pfn_to_page(paddr >> PAGE_SHIFT),
207                                         KM_USER0);
208                 offset = addr & ~PAGE_MASK;
209                 guest_buf |= offset;
210                 now = min(size, PAGE_SIZE - offset);
211                 memcpy(host_buf, (void *)guest_buf, now);
212                 host_buf += now;
213                 addr += now;
214                 size -= now;
215                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
216         }
217         return req_size - size;
218 }
219 EXPORT_SYMBOL_GPL(kvm_read_guest);
220
221 int kvm_write_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
222                     void *data)
223 {
224         unsigned char *host_buf = data;
225         unsigned long req_size = size;
226
227         while (size) {
228                 hpa_t paddr;
229                 unsigned now;
230                 unsigned offset;
231                 hva_t guest_buf;
232                 gfn_t gfn;
233
234                 paddr = gva_to_hpa(vcpu, addr);
235
236                 if (is_error_hpa(paddr))
237                         break;
238
239                 gfn = vcpu->mmu.gva_to_gpa(vcpu, addr) >> PAGE_SHIFT;
240                 mark_page_dirty(vcpu->kvm, gfn);
241                 guest_buf = (hva_t)kmap_atomic(
242                                 pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0);
243                 offset = addr & ~PAGE_MASK;
244                 guest_buf |= offset;
245                 now = min(size, PAGE_SIZE - offset);
246                 memcpy((void *)guest_buf, host_buf, now);
247                 host_buf += now;
248                 addr += now;
249                 size -= now;
250                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
251         }
252         return req_size - size;
253 }
254 EXPORT_SYMBOL_GPL(kvm_write_guest);
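/*
 * Illustrative note: both copy helpers above walk the guest-virtual range one
 * page at a time, since contiguous guest-virtual addresses may map to
 * discontiguous host pages.  A 6 KiB read starting at guest address 0x1ff0,
 * for example, is split into chunks of 0x10, 0x1000 and 0x7f0 bytes, each
 * translated with gva_to_hpa() and mapped with kmap_atomic() separately.  A
 * return value smaller than the requested size means a translation failed
 * partway through.
 */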
255
256 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
257 {
258         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
259                 return;
260
261         vcpu->guest_fpu_loaded = 1;
262         fx_save(vcpu->host_fx_image);
263         fx_restore(vcpu->guest_fx_image);
264 }
265 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
266
267 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
268 {
269         if (!vcpu->guest_fpu_loaded)
270                 return;
271
272         vcpu->guest_fpu_loaded = 0;
273         fx_save(vcpu->guest_fx_image);
274         fx_restore(vcpu->host_fx_image);
275 }
276 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
277
278 /*
279  * Switches to specified vcpu, until a matching vcpu_put()
280  */
281 static void vcpu_load(struct kvm_vcpu *vcpu)
282 {
283         mutex_lock(&vcpu->mutex);
284         kvm_arch_ops->vcpu_load(vcpu);
285 }
286
287 /*
288  * Switches to specified vcpu, until a matching vcpu_put(). Will return NULL
289  * if the slot is not populated.
290  */
291 static struct kvm_vcpu *vcpu_load_slot(struct kvm *kvm, int slot)
292 {
293         struct kvm_vcpu *vcpu = &kvm->vcpus[slot];
294
295         mutex_lock(&vcpu->mutex);
296         if (!vcpu->vmcs) {
297                 mutex_unlock(&vcpu->mutex);
298                 return NULL;
299         }
300         kvm_arch_ops->vcpu_load(vcpu);
301         return vcpu;
302 }
303
304 static void vcpu_put(struct kvm_vcpu *vcpu)
305 {
306         kvm_arch_ops->vcpu_put(vcpu);
307         mutex_unlock(&vcpu->mutex);
308 }
309
310 static struct kvm *kvm_create_vm(void)
311 {
312         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
313         int i;
314
315         if (!kvm)
316                 return ERR_PTR(-ENOMEM);
317
318         spin_lock_init(&kvm->lock);
319         INIT_LIST_HEAD(&kvm->active_mmu_pages);
320         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
321                 struct kvm_vcpu *vcpu = &kvm->vcpus[i];
322
323                 mutex_init(&vcpu->mutex);
324                 vcpu->cpu = -1;
325                 vcpu->kvm = kvm;
326                 vcpu->mmu.root_hpa = INVALID_PAGE;
327                 INIT_LIST_HEAD(&vcpu->free_pages);
328         }
329         spin_lock(&kvm_lock);
330         list_add(&kvm->vm_list, &vm_list);
331         spin_unlock(&kvm_lock);
332         return kvm;
333 }
334
335 static int kvm_dev_open(struct inode *inode, struct file *filp)
336 {
337         return 0;
338 }
339
340 /*
341  * Free any memory in @free but not in @dont.
342  */
343 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
344                                   struct kvm_memory_slot *dont)
345 {
346         int i;
347
348         if (!dont || free->phys_mem != dont->phys_mem)
349                 if (free->phys_mem) {
350                         for (i = 0; i < free->npages; ++i)
351                                 if (free->phys_mem[i])
352                                         __free_page(free->phys_mem[i]);
353                         vfree(free->phys_mem);
354                 }
355
356         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
357                 vfree(free->dirty_bitmap);
358
359         free->phys_mem = NULL;
360         free->npages = 0;
361         free->dirty_bitmap = NULL;
362 }
363
364 static void kvm_free_physmem(struct kvm *kvm)
365 {
366         int i;
367
368         for (i = 0; i < kvm->nmemslots; ++i)
369                 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
370 }
371
372 static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
373 {
374         int i;
375
376         for (i = 0; i < 2; ++i)
377                 if (vcpu->pio.guest_pages[i]) {
378                         __free_page(vcpu->pio.guest_pages[i]);
379                         vcpu->pio.guest_pages[i] = NULL;
380                 }
381 }
382
383 static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
384 {
385         if (!vcpu->vmcs)
386                 return;
387
388         vcpu_load(vcpu);
389         kvm_mmu_destroy(vcpu);
390         vcpu_put(vcpu);
391         kvm_arch_ops->vcpu_free(vcpu);
392         free_page((unsigned long)vcpu->run);
393         vcpu->run = NULL;
394         free_page((unsigned long)vcpu->pio_data);
395         vcpu->pio_data = NULL;
396         free_pio_guest_pages(vcpu);
397 }
398
399 static void kvm_free_vcpus(struct kvm *kvm)
400 {
401         unsigned int i;
402
403         for (i = 0; i < KVM_MAX_VCPUS; ++i)
404                 kvm_free_vcpu(&kvm->vcpus[i]);
405 }
406
407 static int kvm_dev_release(struct inode *inode, struct file *filp)
408 {
409         return 0;
410 }
411
412 static void kvm_destroy_vm(struct kvm *kvm)
413 {
414         spin_lock(&kvm_lock);
415         list_del(&kvm->vm_list);
416         spin_unlock(&kvm_lock);
417         kvm_free_vcpus(kvm);
418         kvm_free_physmem(kvm);
419         kfree(kvm);
420 }
421
422 static int kvm_vm_release(struct inode *inode, struct file *filp)
423 {
424         struct kvm *kvm = filp->private_data;
425
426         kvm_destroy_vm(kvm);
427         return 0;
428 }
429
430 static void inject_gp(struct kvm_vcpu *vcpu)
431 {
432         kvm_arch_ops->inject_gp(vcpu, 0);
433 }
434
435 /*
436  * Load the PAE pdptrs.  Return true if they are all valid.
437  */
438 static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
439 {
440         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
441         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
442         int i;
443         u64 pdpte;
444         u64 *pdpt;
445         int ret;
446         struct page *page;
447
448         spin_lock(&vcpu->kvm->lock);
449         page = gfn_to_page(vcpu->kvm, pdpt_gfn);
450         /* FIXME: !page - emulate? 0xff? */
451         pdpt = kmap_atomic(page, KM_USER0);
452
453         ret = 1;
454         for (i = 0; i < 4; ++i) {
455                 pdpte = pdpt[offset + i];
456                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) {
457                         ret = 0;
458                         goto out;
459                 }
460         }
461
462         for (i = 0; i < 4; ++i)
463                 vcpu->pdptrs[i] = pdpt[offset + i];
464
465 out:
466         kunmap_atomic(pdpt, KM_USER0);
467         spin_unlock(&vcpu->kvm->lock);
468
469         return ret;
470 }
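/*
 * Illustrative note: in PAE mode cr3 points at a 32-byte aligned table of four
 * 8-byte PDPTEs, so the offset computation above takes cr3's page offset,
 * divides by 32 to find the table slot and multiplies by 4 to index the u64
 * array.  For cr3 = 0x12345020 that gives ((0x020 >> 5) << 2) = 4, i.e. the
 * entries land in pdpt[4..7].
 */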
471
472 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
473 {
474         if (cr0 & CR0_RESEVED_BITS) {
475                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
476                        cr0, vcpu->cr0);
477                 inject_gp(vcpu);
478                 return;
479         }
480
481         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
482                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
483                 inject_gp(vcpu);
484                 return;
485         }
486
487         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
488                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
489                        "and a clear PE flag\n");
490                 inject_gp(vcpu);
491                 return;
492         }
493
494         if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) {
495 #ifdef CONFIG_X86_64
496                 if ((vcpu->shadow_efer & EFER_LME)) {
497                         int cs_db, cs_l;
498
499                         if (!is_pae(vcpu)) {
500                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
501                                        "in long mode while PAE is disabled\n");
502                                 inject_gp(vcpu);
503                                 return;
504                         }
505                         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
506                         if (cs_l) {
507                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
508                                        "in long mode while CS.L == 1\n");
509                                 inject_gp(vcpu);
510                                 return;
511
512                         }
513                 } else
514 #endif
515                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
516                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
517                                "reserved bits\n");
518                         inject_gp(vcpu);
519                         return;
520                 }
521
522         }
523
524         kvm_arch_ops->set_cr0(vcpu, cr0);
525         vcpu->cr0 = cr0;
526
527         spin_lock(&vcpu->kvm->lock);
528         kvm_mmu_reset_context(vcpu);
529         spin_unlock(&vcpu->kvm->lock);
530         return;
531 }
532 EXPORT_SYMBOL_GPL(set_cr0);
533
534 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
535 {
536         set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
537 }
538 EXPORT_SYMBOL_GPL(lmsw);
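/*
 * Illustrative note: LMSW may only touch the low four bits of cr0 (PE, MP,
 * EM, TS), which is why lmsw() masks both the old cr0 and the new machine
 * status word with 0x0f before handing the merged value to set_cr0().  With
 * cr0 = 0x80000011 and msw = 0x3, for instance, the result is 0x80000013.
 */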
539
540 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
541 {
542         if (cr4 & CR4_RESEVED_BITS) {
543                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
544                 inject_gp(vcpu);
545                 return;
546         }
547
548         if (is_long_mode(vcpu)) {
549                 if (!(cr4 & CR4_PAE_MASK)) {
550                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
551                                "in long mode\n");
552                         inject_gp(vcpu);
553                         return;
554                 }
555         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK)
556                    && !load_pdptrs(vcpu, vcpu->cr3)) {
557                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
558                 inject_gp(vcpu);
559         }
560
561         if (cr4 & CR4_VMXE_MASK) {
562                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
563                 inject_gp(vcpu);
564                 return;
565         }
566         kvm_arch_ops->set_cr4(vcpu, cr4);
567         spin_lock(&vcpu->kvm->lock);
568         kvm_mmu_reset_context(vcpu);
569         spin_unlock(&vcpu->kvm->lock);
570 }
571 EXPORT_SYMBOL_GPL(set_cr4);
572
573 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
574 {
575         if (is_long_mode(vcpu)) {
576                 if (cr3 & CR3_L_MODE_RESEVED_BITS) {
577                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
578                         inject_gp(vcpu);
579                         return;
580                 }
581         } else {
582                 if (cr3 & CR3_RESEVED_BITS) {
583                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
584                         inject_gp(vcpu);
585                         return;
586                 }
587                 if (is_paging(vcpu) && is_pae(vcpu) &&
588                     !load_pdptrs(vcpu, cr3)) {
589                         printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
590                                "reserved bits\n");
591                         inject_gp(vcpu);
592                         return;
593                 }
594         }
595
596         vcpu->cr3 = cr3;
597         spin_lock(&vcpu->kvm->lock);
598         /*
599          * Does the new cr3 value map to physical memory? (Note, we
600          * catch an invalid cr3 even in real-mode, because it would
601          * cause trouble later on when we turn on paging anyway.)
602          *
603          * A real CPU would silently accept an invalid cr3 and would
604          * attempt to use it - with largely undefined (and often hard
605          * to debug) behavior on the guest side.
606          */
607         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
608                 inject_gp(vcpu);
609         else
610                 vcpu->mmu.new_cr3(vcpu);
611         spin_unlock(&vcpu->kvm->lock);
612 }
613 EXPORT_SYMBOL_GPL(set_cr3);
614
615 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
616 {
617         if (cr8 & CR8_RESEVED_BITS) {
618                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
619                 inject_gp(vcpu);
620                 return;
621         }
622         vcpu->cr8 = cr8;
623 }
624 EXPORT_SYMBOL_GPL(set_cr8);
625
626 void fx_init(struct kvm_vcpu *vcpu)
627 {
628         struct __attribute__ ((__packed__)) fx_image_s {
629                 u16 control;    /* fcw */
630                 u16 status;     /* fsw */
631                 u16 tag;        /* ftw */
632                 u16 opcode;     /* fop */
633                 u64 ip;         /* fpu ip */
634                 u64 operand;    /* fpu dp */
635                 u32 mxcsr;
636                 u32 mxcsr_mask;
637
638         } *fx_image;
639
640         fx_save(vcpu->host_fx_image);
641         fpu_init();
642         fx_save(vcpu->guest_fx_image);
643         fx_restore(vcpu->host_fx_image);
644
645         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
646         fx_image->mxcsr = 0x1f80;
647         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
648                0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
649 }
650 EXPORT_SYMBOL_GPL(fx_init);
651
652 static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot)
653 {
654         spin_lock(&vcpu->kvm->lock);
655         kvm_mmu_slot_remove_write_access(vcpu, slot);
656         spin_unlock(&vcpu->kvm->lock);
657 }
658
659 /*
660  * Allocate some memory and give it an address in the guest physical address
661  * space.
662  *
663  * Discontiguous memory is allowed, mostly for framebuffers.
664  */
665 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
666                                           struct kvm_memory_region *mem)
667 {
668         int r;
669         gfn_t base_gfn;
670         unsigned long npages;
671         unsigned long i;
672         struct kvm_memory_slot *memslot;
673         struct kvm_memory_slot old, new;
674         int memory_config_version;
675
676         r = -EINVAL;
677         /* General sanity checks */
678         if (mem->memory_size & (PAGE_SIZE - 1))
679                 goto out;
680         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
681                 goto out;
682         if (mem->slot >= KVM_MEMORY_SLOTS)
683                 goto out;
684         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
685                 goto out;
686
687         memslot = &kvm->memslots[mem->slot];
688         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
689         npages = mem->memory_size >> PAGE_SHIFT;
690
691         if (!npages)
692                 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
693
694 raced:
695         spin_lock(&kvm->lock);
696
697         memory_config_version = kvm->memory_config_version;
698         new = old = *memslot;
699
700         new.base_gfn = base_gfn;
701         new.npages = npages;
702         new.flags = mem->flags;
703
704         /* Disallow changing a memory slot's size. */
705         r = -EINVAL;
706         if (npages && old.npages && npages != old.npages)
707                 goto out_unlock;
708
709         /* Check for overlaps */
710         r = -EEXIST;
711         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
712                 struct kvm_memory_slot *s = &kvm->memslots[i];
713
714                 if (s == memslot)
715                         continue;
716                 if (!((base_gfn + npages <= s->base_gfn) ||
717                       (base_gfn >= s->base_gfn + s->npages)))
718                         goto out_unlock;
719         }
720         /*
721          * Do memory allocations outside lock.  memory_config_version will
722          * detect any races.
723          */
724         spin_unlock(&kvm->lock);
725
726         /* Deallocate if slot is being removed */
727         if (!npages)
728                 new.phys_mem = NULL;
729
730         /* Free page dirty bitmap if unneeded */
731         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
732                 new.dirty_bitmap = NULL;
733
734         r = -ENOMEM;
735
736         /* Allocate if a slot is being created */
737         if (npages && !new.phys_mem) {
738                 new.phys_mem = vmalloc(npages * sizeof(struct page *));
739
740                 if (!new.phys_mem)
741                         goto out_free;
742
743                 memset(new.phys_mem, 0, npages * sizeof(struct page *));
744                 for (i = 0; i < npages; ++i) {
745                         new.phys_mem[i] = alloc_page(GFP_HIGHUSER
746                                                      | __GFP_ZERO);
747                         if (!new.phys_mem[i])
748                                 goto out_free;
749                         set_page_private(new.phys_mem[i], 0);
750                 }
751         }
752
753         /* Allocate page dirty bitmap if needed */
754         if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
755                 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
756
757                 new.dirty_bitmap = vmalloc(dirty_bytes);
758                 if (!new.dirty_bitmap)
759                         goto out_free;
760                 memset(new.dirty_bitmap, 0, dirty_bytes);
761         }
762
763         spin_lock(&kvm->lock);
764
765         if (memory_config_version != kvm->memory_config_version) {
766                 spin_unlock(&kvm->lock);
767                 kvm_free_physmem_slot(&new, &old);
768                 goto raced;
769         }
770
771         r = -EAGAIN;
772         if (kvm->busy)
773                 goto out_unlock;
774
775         if (mem->slot >= kvm->nmemslots)
776                 kvm->nmemslots = mem->slot + 1;
777
778         *memslot = new;
779         ++kvm->memory_config_version;
780
781         spin_unlock(&kvm->lock);
782
783         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
784                 struct kvm_vcpu *vcpu;
785
786                 vcpu = vcpu_load_slot(kvm, i);
787                 if (!vcpu)
788                         continue;
789                 if (new.flags & KVM_MEM_LOG_DIRTY_PAGES)
790                         do_remove_write_access(vcpu, mem->slot);
791                 kvm_mmu_reset_context(vcpu);
792                 vcpu_put(vcpu);
793         }
794
795         kvm_free_physmem_slot(&old, &new);
796         return 0;
797
798 out_unlock:
799         spin_unlock(&kvm->lock);
800 out_free:
801         kvm_free_physmem_slot(&new, &old);
802 out:
803         return r;
804 }
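/*
 * Illustrative note: the 'raced:' retry above exists so the large page-array
 * and dirty-bitmap allocations can happen without kvm->lock held; if
 * memory_config_version changed while the lock was dropped, the freshly
 * allocated slot is freed and the whole sequence restarts against the new
 * configuration.
 */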
805
806 /*
807  * Get (and clear) the dirty memory log for a memory slot.
808  */
809 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
810                                       struct kvm_dirty_log *log)
811 {
812         struct kvm_memory_slot *memslot;
813         int r, i;
814         int n;
815         int cleared;
816         unsigned long any = 0;
817
818         spin_lock(&kvm->lock);
819
820         /*
821          * Prevent changes to guest memory configuration even while the lock
822          * is not taken.
823          */
824         ++kvm->busy;
825         spin_unlock(&kvm->lock);
826         r = -EINVAL;
827         if (log->slot >= KVM_MEMORY_SLOTS)
828                 goto out;
829
830         memslot = &kvm->memslots[log->slot];
831         r = -ENOENT;
832         if (!memslot->dirty_bitmap)
833                 goto out;
834
835         n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
836
837         for (i = 0; !any && i < n/sizeof(long); ++i)
838                 any = memslot->dirty_bitmap[i];
839
840         r = -EFAULT;
841         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
842                 goto out;
843
844         if (any) {
845                 cleared = 0;
846                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
847                         struct kvm_vcpu *vcpu;
848
849                         vcpu = vcpu_load_slot(kvm, i);
850                         if (!vcpu)
851                                 continue;
852                         if (!cleared) {
853                                 do_remove_write_access(vcpu, log->slot);
854                                 memset(memslot->dirty_bitmap, 0, n);
855                                 cleared = 1;
856                         }
857                         kvm_arch_ops->tlb_flush(vcpu);
858                         vcpu_put(vcpu);
859                 }
860         }
861
862         r = 0;
863
864 out:
865         spin_lock(&kvm->lock);
866         --kvm->busy;
867         spin_unlock(&kvm->lock);
868         return r;
869 }
870
871 /*
872  * Set a new alias region.  Aliases map a portion of physical memory into
873  * another portion.  This is useful for memory windows, for example the PC
874  * VGA region.
875  */
876 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
877                                          struct kvm_memory_alias *alias)
878 {
879         int r, n;
880         struct kvm_mem_alias *p;
881
882         r = -EINVAL;
883         /* General sanity checks */
884         if (alias->memory_size & (PAGE_SIZE - 1))
885                 goto out;
886         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
887                 goto out;
888         if (alias->slot >= KVM_ALIAS_SLOTS)
889                 goto out;
890         if (alias->guest_phys_addr + alias->memory_size
891             < alias->guest_phys_addr)
892                 goto out;
893         if (alias->target_phys_addr + alias->memory_size
894             < alias->target_phys_addr)
895                 goto out;
896
897         spin_lock(&kvm->lock);
898
899         p = &kvm->aliases[alias->slot];
900         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
901         p->npages = alias->memory_size >> PAGE_SHIFT;
902         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
903
904         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
905                 if (kvm->aliases[n - 1].npages)
906                         break;
907         kvm->naliases = n;
908
909         spin_unlock(&kvm->lock);
910
911         vcpu_load(&kvm->vcpus[0]);
912         spin_lock(&kvm->lock);
913         kvm_mmu_zap_all(&kvm->vcpus[0]);
914         spin_unlock(&kvm->lock);
915         vcpu_put(&kvm->vcpus[0]);
916
917         return 0;
918
919 out:
920         return r;
921 }
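/*
 * Illustrative note: an alias redirects a window of guest physical frames
 * onto another region.  Aliasing guest_phys_addr 0xa0000 with size 0x10000
 * onto the start of a framebuffer slot, for example, makes unalias_gfn()
 * below translate gfns 0xa0..0xaf into the framebuffer's gfns before the
 * normal memslot lookup.
 */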
922
923 static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
924 {
925         int i;
926         struct kvm_mem_alias *alias;
927
928         for (i = 0; i < kvm->naliases; ++i) {
929                 alias = &kvm->aliases[i];
930                 if (gfn >= alias->base_gfn
931                     && gfn < alias->base_gfn + alias->npages)
932                         return alias->target_gfn + gfn - alias->base_gfn;
933         }
934         return gfn;
935 }
936
937 static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
938 {
939         int i;
940
941         for (i = 0; i < kvm->nmemslots; ++i) {
942                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
943
944                 if (gfn >= memslot->base_gfn
945                     && gfn < memslot->base_gfn + memslot->npages)
946                         return memslot;
947         }
948         return NULL;
949 }
950
951 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
952 {
953         gfn = unalias_gfn(kvm, gfn);
954         return __gfn_to_memslot(kvm, gfn);
955 }
956
957 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
958 {
959         struct kvm_memory_slot *slot;
960
961         gfn = unalias_gfn(kvm, gfn);
962         slot = __gfn_to_memslot(kvm, gfn);
963         if (!slot)
964                 return NULL;
965         return slot->phys_mem[gfn - slot->base_gfn];
966 }
967 EXPORT_SYMBOL_GPL(gfn_to_page);
968
969 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
970 {
971         int i;
972         struct kvm_memory_slot *memslot = NULL;
973         unsigned long rel_gfn;
974
975         for (i = 0; i < kvm->nmemslots; ++i) {
976                 memslot = &kvm->memslots[i];
977
978                 if (gfn >= memslot->base_gfn
979                     && gfn < memslot->base_gfn + memslot->npages) {
980
981                         if (!memslot->dirty_bitmap)
982                                 return;
983
984                         rel_gfn = gfn - memslot->base_gfn;
985
986                         /* avoid RMW */
987                         if (!test_bit(rel_gfn, memslot->dirty_bitmap))
988                                 set_bit(rel_gfn, memslot->dirty_bitmap);
989                         return;
990                 }
991         }
992 }
993
994 static int emulator_read_std(unsigned long addr,
995                              void *val,
996                              unsigned int bytes,
997                              struct x86_emulate_ctxt *ctxt)
998 {
999         struct kvm_vcpu *vcpu = ctxt->vcpu;
1000         void *data = val;
1001
1002         while (bytes) {
1003                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1004                 unsigned offset = addr & (PAGE_SIZE-1);
1005                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1006                 unsigned long pfn;
1007                 struct page *page;
1008                 void *page_virt;
1009
1010                 if (gpa == UNMAPPED_GVA)
1011                         return X86EMUL_PROPAGATE_FAULT;
1012                 pfn = gpa >> PAGE_SHIFT;
1013                 page = gfn_to_page(vcpu->kvm, pfn);
1014                 if (!page)
1015                         return X86EMUL_UNHANDLEABLE;
1016                 page_virt = kmap_atomic(page, KM_USER0);
1017
1018                 memcpy(data, page_virt + offset, tocopy);
1019
1020                 kunmap_atomic(page_virt, KM_USER0);
1021
1022                 bytes -= tocopy;
1023                 data += tocopy;
1024                 addr += tocopy;
1025         }
1026
1027         return X86EMUL_CONTINUE;
1028 }
1029
1030 static int emulator_write_std(unsigned long addr,
1031                               const void *val,
1032                               unsigned int bytes,
1033                               struct x86_emulate_ctxt *ctxt)
1034 {
1035         printk(KERN_ERR "emulator_write_std: addr %lx n %d\n",
1036                addr, bytes);
1037         return X86EMUL_UNHANDLEABLE;
1038 }
1039
1040 static int emulator_read_emulated(unsigned long addr,
1041                                   void *val,
1042                                   unsigned int bytes,
1043                                   struct x86_emulate_ctxt *ctxt)
1044 {
1045         struct kvm_vcpu *vcpu = ctxt->vcpu;
1046
1047         if (vcpu->mmio_read_completed) {
1048                 memcpy(val, vcpu->mmio_data, bytes);
1049                 vcpu->mmio_read_completed = 0;
1050                 return X86EMUL_CONTINUE;
1051         } else if (emulator_read_std(addr, val, bytes, ctxt)
1052                    == X86EMUL_CONTINUE)
1053                 return X86EMUL_CONTINUE;
1054         else {
1055                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1056
1057                 if (gpa == UNMAPPED_GVA)
1058                         return X86EMUL_PROPAGATE_FAULT;
1059                 vcpu->mmio_needed = 1;
1060                 vcpu->mmio_phys_addr = gpa;
1061                 vcpu->mmio_size = bytes;
1062                 vcpu->mmio_is_write = 0;
1063
1064                 return X86EMUL_UNHANDLEABLE;
1065         }
1066 }
1067
1068 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1069                                const void *val, int bytes)
1070 {
1071         struct page *page;
1072         void *virt;
1073
1074         if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
1075                 return 0;
1076         page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1077         if (!page)
1078                 return 0;
1079         kvm_mmu_pre_write(vcpu, gpa, bytes);
1080         mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
1081         virt = kmap_atomic(page, KM_USER0);
1082         memcpy(virt + offset_in_page(gpa), val, bytes);
1083         kunmap_atomic(virt, KM_USER0);
1084         kvm_mmu_post_write(vcpu, gpa, bytes);
1085         return 1;
1086 }
1087
1088 static int emulator_write_emulated(unsigned long addr,
1089                                    const void *val,
1090                                    unsigned int bytes,
1091                                    struct x86_emulate_ctxt *ctxt)
1092 {
1093         struct kvm_vcpu *vcpu = ctxt->vcpu;
1094         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1095
1096         if (gpa == UNMAPPED_GVA) {
1097                 kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
1098                 return X86EMUL_PROPAGATE_FAULT;
1099         }
1100
1101         if (emulator_write_phys(vcpu, gpa, val, bytes))
1102                 return X86EMUL_CONTINUE;
1103
1104         vcpu->mmio_needed = 1;
1105         vcpu->mmio_phys_addr = gpa;
1106         vcpu->mmio_size = bytes;
1107         vcpu->mmio_is_write = 1;
1108         memcpy(vcpu->mmio_data, val, bytes);
1109
1110         return X86EMUL_CONTINUE;
1111 }
1112
1113 static int emulator_cmpxchg_emulated(unsigned long addr,
1114                                      const void *old,
1115                                      const void *new,
1116                                      unsigned int bytes,
1117                                      struct x86_emulate_ctxt *ctxt)
1118 {
1119         static int reported;
1120
1121         if (!reported) {
1122                 reported = 1;
1123                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1124         }
1125         return emulator_write_emulated(addr, new, bytes, ctxt);
1126 }
1127
1128 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1129 {
1130         return kvm_arch_ops->get_segment_base(vcpu, seg);
1131 }
1132
1133 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1134 {
1135         return X86EMUL_CONTINUE;
1136 }
1137
1138 int emulate_clts(struct kvm_vcpu *vcpu)
1139 {
1140         unsigned long cr0;
1141
1142         cr0 = vcpu->cr0 & ~CR0_TS_MASK;
1143         kvm_arch_ops->set_cr0(vcpu, cr0);
1144         return X86EMUL_CONTINUE;
1145 }
1146
1147 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1148 {
1149         struct kvm_vcpu *vcpu = ctxt->vcpu;
1150
1151         switch (dr) {
1152         case 0 ... 3:
1153                 *dest = kvm_arch_ops->get_dr(vcpu, dr);
1154                 return X86EMUL_CONTINUE;
1155         default:
1156                 printk(KERN_DEBUG "%s: unexpected dr %u\n",
1157                        __FUNCTION__, dr);
1158                 return X86EMUL_UNHANDLEABLE;
1159         }
1160 }
1161
1162 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1163 {
1164         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1165         int exception;
1166
1167         kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1168         if (exception) {
1169                 /* FIXME: better handling */
1170                 return X86EMUL_UNHANDLEABLE;
1171         }
1172         return X86EMUL_CONTINUE;
1173 }
1174
1175 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
1176 {
1177         static int reported;
1178         u8 opcodes[4];
1179         unsigned long rip = ctxt->vcpu->rip;
1180         unsigned long rip_linear;
1181
1182         rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS);
1183
1184         if (reported)
1185                 return;
1186
1187         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
1188
1189         printk(KERN_ERR "emulation failed but !mmio_needed?"
1190                " rip %lx %02x %02x %02x %02x\n",
1191                rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1192         reported = 1;
1193 }
1194
1195 struct x86_emulate_ops emulate_ops = {
1196         .read_std            = emulator_read_std,
1197         .write_std           = emulator_write_std,
1198         .read_emulated       = emulator_read_emulated,
1199         .write_emulated      = emulator_write_emulated,
1200         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1201 };
1202
1203 int emulate_instruction(struct kvm_vcpu *vcpu,
1204                         struct kvm_run *run,
1205                         unsigned long cr2,
1206                         u16 error_code)
1207 {
1208         struct x86_emulate_ctxt emulate_ctxt;
1209         int r;
1210         int cs_db, cs_l;
1211
1212         vcpu->mmio_fault_cr2 = cr2;
1213         kvm_arch_ops->cache_regs(vcpu);
1214
1215         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1216
1217         emulate_ctxt.vcpu = vcpu;
1218         emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu);
1219         emulate_ctxt.cr2 = cr2;
1220         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1221                 ? X86EMUL_MODE_REAL : cs_l
1222                 ? X86EMUL_MODE_PROT64 : cs_db
1223                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1224
1225         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1226                 emulate_ctxt.cs_base = 0;
1227                 emulate_ctxt.ds_base = 0;
1228                 emulate_ctxt.es_base = 0;
1229                 emulate_ctxt.ss_base = 0;
1230         } else {
1231                 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
1232                 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
1233                 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
1234                 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
1235         }
1236
1237         emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
1238         emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1239
1240         vcpu->mmio_is_write = 0;
1241         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1242
1243         if ((r || vcpu->mmio_is_write) && run) {
1244                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1245                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1246                 run->mmio.len = vcpu->mmio_size;
1247                 run->mmio.is_write = vcpu->mmio_is_write;
1248         }
1249
1250         if (r) {
1251                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1252                         return EMULATE_DONE;
1253                 if (!vcpu->mmio_needed) {
1254                         report_emulation_failure(&emulate_ctxt);
1255                         return EMULATE_FAIL;
1256                 }
1257                 return EMULATE_DO_MMIO;
1258         }
1259
1260         kvm_arch_ops->decache_regs(vcpu);
1261         kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1262
1263         if (vcpu->mmio_is_write) {
1264                 vcpu->mmio_needed = 0;
1265                 return EMULATE_DO_MMIO;
1266         }
1267
1268         return EMULATE_DONE;
1269 }
1270 EXPORT_SYMBOL_GPL(emulate_instruction);
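/*
 * Illustrative note: the emulation mode above is chosen in priority order
 * from guest state: EFLAGS.VM selects real-mode (virtual-8086) emulation,
 * otherwise CS.L selects 64-bit mode, otherwise CS.D selects 32-bit, and a
 * clear CS.D leaves 16-bit protected mode.
 */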
1271
1272 int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1273 {
1274         unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
1275
1276         kvm_arch_ops->cache_regs(vcpu);
1277         ret = -KVM_EINVAL;
1278 #ifdef CONFIG_X86_64
1279         if (is_long_mode(vcpu)) {
1280                 nr = vcpu->regs[VCPU_REGS_RAX];
1281                 a0 = vcpu->regs[VCPU_REGS_RDI];
1282                 a1 = vcpu->regs[VCPU_REGS_RSI];
1283                 a2 = vcpu->regs[VCPU_REGS_RDX];
1284                 a3 = vcpu->regs[VCPU_REGS_RCX];
1285                 a4 = vcpu->regs[VCPU_REGS_R8];
1286                 a5 = vcpu->regs[VCPU_REGS_R9];
1287         } else
1288 #endif
1289         {
1290                 nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
1291                 a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
1292                 a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
1293                 a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
1294                 a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
1295                 a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
1296                 a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
1297         }
1298         switch (nr) {
1299         default:
1300                 run->hypercall.args[0] = a0;
1301                 run->hypercall.args[1] = a1;
1302                 run->hypercall.args[2] = a2;
1303                 run->hypercall.args[3] = a3;
1304                 run->hypercall.args[4] = a4;
1305                 run->hypercall.args[5] = a5;
1306                 run->hypercall.ret = ret;
1307                 run->hypercall.longmode = is_long_mode(vcpu);
1308                 kvm_arch_ops->decache_regs(vcpu);
1309                 return 0;
1310         }
1311         vcpu->regs[VCPU_REGS_RAX] = ret;
1312         kvm_arch_ops->decache_regs(vcpu);
1313         return 1;
1314 }
1315 EXPORT_SYMBOL_GPL(kvm_hypercall);
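/*
 * Illustrative sketch of the guest-side convention implied by the register
 * unpacking above: a 64-bit guest places the hypercall number in rax and
 * arguments in rdi, rsi, rdx, rcx, r8 and r9, then calls into the hypercall
 * page that patch_hypercall() fills at registration time; a 32-bit guest
 * passes the number in ebx and arguments in eax, ecx, edx, esi, edi and ebp.
 * Unrecognized numbers are handed to userspace through the run->hypercall
 * block with ret preset to -KVM_EINVAL.
 */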
1316
1317 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1318 {
1319         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1320 }
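/*
 * Illustrative note: mk_cr_64() keeps the upper 32 bits of the current
 * control register and replaces the lower 32 with the value a 32-bit
 * instruction supplied, e.g. curr_cr 0x100000000 and new_val 0x80000011
 * yield 0x180000011.
 */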
1321
1322 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1323 {
1324         struct descriptor_table dt = { limit, base };
1325
1326         kvm_arch_ops->set_gdt(vcpu, &dt);
1327 }
1328
1329 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1330 {
1331         struct descriptor_table dt = { limit, base };
1332
1333         kvm_arch_ops->set_idt(vcpu, &dt);
1334 }
1335
1336 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1337                    unsigned long *rflags)
1338 {
1339         lmsw(vcpu, msw);
1340         *rflags = kvm_arch_ops->get_rflags(vcpu);
1341 }
1342
1343 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1344 {
1345         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
1346         switch (cr) {
1347         case 0:
1348                 return vcpu->cr0;
1349         case 2:
1350                 return vcpu->cr2;
1351         case 3:
1352                 return vcpu->cr3;
1353         case 4:
1354                 return vcpu->cr4;
1355         default:
1356                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1357                 return 0;
1358         }
1359 }
1360
1361 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1362                      unsigned long *rflags)
1363 {
1364         switch (cr) {
1365         case 0:
1366                 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1367                 *rflags = kvm_arch_ops->get_rflags(vcpu);
1368                 break;
1369         case 2:
1370                 vcpu->cr2 = val;
1371                 break;
1372         case 3:
1373                 set_cr3(vcpu, val);
1374                 break;
1375         case 4:
1376                 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1377                 break;
1378         default:
1379                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1380         }
1381 }
1382
1383 /*
1384  * Register the para guest with the host:
1385  */
1386 static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
1387 {
1388         struct kvm_vcpu_para_state *para_state;
1389         hpa_t para_state_hpa, hypercall_hpa;
1390         struct page *para_state_page;
1391         unsigned char *hypercall;
1392         gpa_t hypercall_gpa;
1393
1394         printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
1395         printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
1396
1397         /*
1398          * Needs to be page aligned:
1399          */
1400         if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
1401                 goto err_gp;
1402
1403         para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
1404         printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
1405         if (is_error_hpa(para_state_hpa))
1406                 goto err_gp;
1407
1408         mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
1409         para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
1410         para_state = kmap_atomic(para_state_page, KM_USER0);
1411
1412         printk(KERN_DEBUG "....  guest version: %d\n", para_state->guest_version);
1413         printk(KERN_DEBUG "....           size: %d\n", para_state->size);
1414
1415         para_state->host_version = KVM_PARA_API_VERSION;
1416         /*
1417          * We cannot support guests that try to register themselves
1418          * with a newer API version than the host supports:
1419          */
1420         if (para_state->guest_version > KVM_PARA_API_VERSION) {
1421                 para_state->ret = -KVM_EINVAL;
1422                 goto err_kunmap_skip;
1423         }
1424
1425         hypercall_gpa = para_state->hypercall_gpa;
1426         hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
1427         printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
1428         if (is_error_hpa(hypercall_hpa)) {
1429                 para_state->ret = -KVM_EINVAL;
1430                 goto err_kunmap_skip;
1431         }
1432
1433         printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
1434         vcpu->para_state_page = para_state_page;
1435         vcpu->para_state_gpa = para_state_gpa;
1436         vcpu->hypercall_gpa = hypercall_gpa;
1437
1438         mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
1439         hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
1440                                 KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
1441         kvm_arch_ops->patch_hypercall(vcpu, hypercall);
1442         kunmap_atomic(hypercall, KM_USER1);
1443
1444         para_state->ret = 0;
1445 err_kunmap_skip:
1446         kunmap_atomic(para_state, KM_USER0);
1447         return 0;
1448 err_gp:
1449         return 1;
1450 }
1451
1452 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1453 {
1454         u64 data;
1455
1456         switch (msr) {
1457         case 0xc0010010: /* SYSCFG */
1458         case 0xc0010015: /* HWCR */
1459         case MSR_IA32_PLATFORM_ID:
1460         case MSR_IA32_P5_MC_ADDR:
1461         case MSR_IA32_P5_MC_TYPE:
1462         case MSR_IA32_MC0_CTL:
1463         case MSR_IA32_MCG_STATUS:
1464         case MSR_IA32_MCG_CAP:
1465         case MSR_IA32_MC0_MISC:
1466         case MSR_IA32_MC0_MISC+4:
1467         case MSR_IA32_MC0_MISC+8:
1468         case MSR_IA32_MC0_MISC+12:
1469         case MSR_IA32_MC0_MISC+16:
1470         case MSR_IA32_UCODE_REV:
1471         case MSR_IA32_PERF_STATUS:
1472                 /* MTRR registers */
1473         case 0xfe:
1474         case 0x200 ... 0x2ff:
1475                 data = 0;
1476                 break;
1477         case 0xcd: /* fsb frequency */
1478                 data = 3;
1479                 break;
1480         case MSR_IA32_APICBASE:
1481                 data = vcpu->apic_base;
1482                 break;
1483         case MSR_IA32_MISC_ENABLE:
1484                 data = vcpu->ia32_misc_enable_msr;
1485                 break;
1486 #ifdef CONFIG_X86_64
1487         case MSR_EFER:
1488                 data = vcpu->shadow_efer;
1489                 break;
1490 #endif
1491         default:
1492                 printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", msr);
1493                 return 1;
1494         }
1495         *pdata = data;
1496         return 0;
1497 }
1498 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1499
1500 /*
1501  * Reads an msr value (of 'msr_index') into 'pdata'.
1502  * Returns 0 on success, non-0 otherwise.
1503  * Assumes vcpu_load() was already called.
1504  */
1505 static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1506 {
1507         return kvm_arch_ops->get_msr(vcpu, msr_index, pdata);
1508 }
1509
1510 #ifdef CONFIG_X86_64
1511
1512 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1513 {
1514         if (efer & EFER_RESERVED_BITS) {
1515                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1516                        efer);
1517                 inject_gp(vcpu);
1518                 return;
1519         }
1520
1521         if (is_paging(vcpu)
1522             && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1523                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1524                 inject_gp(vcpu);
1525                 return;
1526         }
1527
1528         kvm_arch_ops->set_efer(vcpu, efer);
1529
1530         efer &= ~EFER_LMA;
1531         efer |= vcpu->shadow_efer & EFER_LMA;
1532
1533         vcpu->shadow_efer = efer;
1534 }
1535
1536 #endif
1537
1538 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1539 {
1540         switch (msr) {
1541 #ifdef CONFIG_X86_64
1542         case MSR_EFER:
1543                 set_efer(vcpu, data);
1544                 break;
1545 #endif
1546         case MSR_IA32_MC0_STATUS:
1547                 printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1548                        __FUNCTION__, data);
1549                 break;
1550         case MSR_IA32_MCG_STATUS:
1551                 printk(KERN_WARNING "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
1552                         __FUNCTION__, data);
1553                 break;
1554         case MSR_IA32_UCODE_REV:
1555         case MSR_IA32_UCODE_WRITE:
1556         case 0x200 ... 0x2ff: /* MTRRs */
1557                 break;
1558         case MSR_IA32_APICBASE:
1559                 vcpu->apic_base = data;
1560                 break;
1561         case MSR_IA32_MISC_ENABLE:
1562                 vcpu->ia32_misc_enable_msr = data;
1563                 break;
1564         /*
1565          * This is the 'probe whether the host is KVM' logic:
1566          */
1567         case MSR_KVM_API_MAGIC:
1568                 return vcpu_register_para(vcpu, data);
1569
1570         default:
1571                 printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr);
1572                 return 1;
1573         }
1574         return 0;
1575 }
1576 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1577
1578 /*
1579  * Writes the msr value into the appropriate "register".
1580  * Returns 0 on success, non-0 otherwise.
1581  * Assumes vcpu_load() was already called.
1582  */
1583 static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1584 {
1585         return kvm_arch_ops->set_msr(vcpu, msr_index, data);
1586 }
1587
1588 void kvm_resched(struct kvm_vcpu *vcpu)
1589 {
1590         if (!need_resched())
1591                 return;
1592         vcpu_put(vcpu);
1593         cond_resched();
1594         vcpu_load(vcpu);
1595 }
1596 EXPORT_SYMBOL_GPL(kvm_resched);
1597
1598 void load_msrs(struct vmx_msr_entry *e, int n)
1599 {
1600         int i;
1601
1602         for (i = 0; i < n; ++i)
1603                 wrmsrl(e[i].index, e[i].data);
1604 }
1605 EXPORT_SYMBOL_GPL(load_msrs);
1606
1607 void save_msrs(struct vmx_msr_entry *e, int n)
1608 {
1609         int i;
1610
1611         for (i = 0; i < n; ++i)
1612                 rdmsrl(e[i].index, e[i].data);
1613 }
1614 EXPORT_SYMBOL_GPL(save_msrs);
1615
1616 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1617 {
1618         int i;
1619         u32 function;
1620         struct kvm_cpuid_entry *e, *best;
1621
1622         kvm_arch_ops->cache_regs(vcpu);
1623         function = vcpu->regs[VCPU_REGS_RAX];
1624         vcpu->regs[VCPU_REGS_RAX] = 0;
1625         vcpu->regs[VCPU_REGS_RBX] = 0;
1626         vcpu->regs[VCPU_REGS_RCX] = 0;
1627         vcpu->regs[VCPU_REGS_RDX] = 0;
1628         best = NULL;
1629         for (i = 0; i < vcpu->cpuid_nent; ++i) {
1630                 e = &vcpu->cpuid_entries[i];
1631                 if (e->function == function) {
1632                         best = e;
1633                         break;
1634                 }
1635                 /*
1636                  * Both basic or both extended?
1637                  */
1638                 if (((e->function ^ function) & 0x80000000) == 0)
1639                         if (!best || e->function > best->function)
1640                                 best = e;
1641         }
1642         if (best) {
1643                 vcpu->regs[VCPU_REGS_RAX] = best->eax;
1644                 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1645                 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1646                 vcpu->regs[VCPU_REGS_RDX] = best->edx;
1647         }
1648         kvm_arch_ops->decache_regs(vcpu);
1649         kvm_arch_ops->skip_emulated_instruction(vcpu);
1650 }
1651 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
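/*
 * Illustrative note: when the requested leaf is absent, the loop above falls
 * back to the highest configured leaf of the same class (basic vs. 0x8000xxxx
 * extended).  A guest querying leaf 5 when only 0, 1 and 2 are configured
 * therefore gets leaf 2's values; with no usable entry at all, the four
 * result registers stay zero.
 */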
1652
1653 static int pio_copy_data(struct kvm_vcpu *vcpu)
1654 {
1655         void *p = vcpu->pio_data;
1656         void *q;
1657         unsigned bytes;
1658         int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
1659
1660         kvm_arch_ops->vcpu_put(vcpu);
1661         q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1662                  PAGE_KERNEL);
1663         if (!q) {
1664                 kvm_arch_ops->vcpu_load(vcpu);
1665                 free_pio_guest_pages(vcpu);
1666                 return -ENOMEM;
1667         }
1668         q += vcpu->pio.guest_page_offset;
1669         bytes = vcpu->pio.size * vcpu->pio.cur_count;
1670         if (vcpu->pio.in)
1671                 memcpy(q, p, bytes);
1672         else
1673                 memcpy(p, q, bytes);
1674         q -= vcpu->pio.guest_page_offset;
1675         vunmap(q);
1676         kvm_arch_ops->vcpu_load(vcpu);
1677         free_pio_guest_pages(vcpu);
1678         return 0;
1679 }
1680
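/*
 * Finish a port I/O operation after userspace has handled the
 * KVM_EXIT_IO exit: load RAX for a non-string 'in', copy string data
 * back into the guest, advance RSI/RDI (and decrement RCX for a rep
 * prefix), and skip the emulated instruction once the whole count has
 * been transferred.
 */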
1681 static int complete_pio(struct kvm_vcpu *vcpu)
1682 {
1683         struct kvm_pio_request *io = &vcpu->pio;
1684         long delta;
1685         int r;
1686
1687         kvm_arch_ops->cache_regs(vcpu);
1688
1689         if (!io->string) {
1690                 if (io->in)
1691                         memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
1692                                io->size);
1693         } else {
1694                 if (io->in) {
1695                         r = pio_copy_data(vcpu);
1696                         if (r) {
1697                                 kvm_arch_ops->cache_regs(vcpu);
1698                                 return r;
1699                         }
1700                 }
1701
1702                 delta = 1;
1703                 if (io->rep) {
1704                         delta *= io->cur_count;
1705                         /*
1706                          * The size of the register should really depend on
1707                          * current address size.
1708                          */
1709                         vcpu->regs[VCPU_REGS_RCX] -= delta;
1710                 }
1711                 if (io->down)
1712                         delta = -delta;
1713                 delta *= io->size;
1714                 if (io->in)
1715                         vcpu->regs[VCPU_REGS_RDI] += delta;
1716                 else
1717                         vcpu->regs[VCPU_REGS_RSI] += delta;
1718         }
1719
1720         kvm_arch_ops->decache_regs(vcpu);
1721
1722         io->count -= io->cur_count;
1723         io->cur_count = 0;
1724
1725         if (!io->count)
1726                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1727         return 0;
1728 }
1729
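/*
 * Set up a port I/O request: fill in the KVM_EXIT_IO fields of kvm_run
 * and the vcpu's pio state.  For string operations the guest pages
 * backing the data are pinned (two pages if the transfer straddles a
 * page boundary), and 'out' data is copied into pio_data immediately.
 * Returns 1 if the request was completed (or a fault injected) in the
 * kernel, 0 if userspace must complete the exit, or a negative error.
 */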
1730 int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1731                   int size, unsigned long count, int string, int down,
1732                   gva_t address, int rep, unsigned port)
1733 {
1734         unsigned now, in_page;
1735         int i;
1736         int nr_pages = 1;
1737         struct page *page;
1738
1739         vcpu->run->exit_reason = KVM_EXIT_IO;
1740         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1741         vcpu->run->io.size = size;
1742         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1743         vcpu->run->io.count = count;
1744         vcpu->run->io.port = port;
1745         vcpu->pio.count = count;
1746         vcpu->pio.cur_count = count;
1747         vcpu->pio.size = size;
1748         vcpu->pio.in = in;
1749         vcpu->pio.string = string;
1750         vcpu->pio.down = down;
1751         vcpu->pio.guest_page_offset = offset_in_page(address);
1752         vcpu->pio.rep = rep;
1753
1754         if (!string) {
1755                 kvm_arch_ops->cache_regs(vcpu);
1756                 memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
1757                 kvm_arch_ops->decache_regs(vcpu);
1758                 return 0;
1759         }
1760
1761         if (!count) {
1762                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1763                 return 1;
1764         }
1765
1766         now = min(count, PAGE_SIZE / size);
1767
1768         if (!down)
1769                 in_page = PAGE_SIZE - offset_in_page(address);
1770         else
1771                 in_page = offset_in_page(address) + size;
1772         now = min(count, (unsigned long)in_page / size);
1773         if (!now) {
1774                 /*
1775                  * String I/O straddles page boundary.  Pin two guest pages
1776                  * so that we satisfy atomicity constraints.  Do just one
1777                  * transaction to avoid complexity.
1778                  */
1779                 nr_pages = 2;
1780                 now = 1;
1781         }
1782         if (down) {
1783                 /*
1784                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
1785                  */
1786                 printk(KERN_ERR "kvm: guest string pio down\n");
1787                 inject_gp(vcpu);
1788                 return 1;
1789         }
1790         vcpu->run->io.count = now;
1791         vcpu->pio.cur_count = now;
1792
1793         for (i = 0; i < nr_pages; ++i) {
1794                 spin_lock(&vcpu->kvm->lock);
1795                 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
1796                 if (page)
1797                         get_page(page);
1798                 vcpu->pio.guest_pages[i] = page;
1799                 spin_unlock(&vcpu->kvm->lock);
1800                 if (!page) {
1801                         inject_gp(vcpu);
1802                         free_pio_guest_pages(vcpu);
1803                         return 1;
1804                 }
1805         }
1806
1807         if (!vcpu->pio.in)
1808                 return pio_copy_data(vcpu);
1809         return 0;
1810 }
1811 EXPORT_SYMBOL_GPL(kvm_setup_pio);
1812
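/*
 * The KVM_RUN ioctl: apply the caller's signal mask, re-sync the TPR
 * from userspace, complete any PIO or MMIO operation left over from the
 * previous exit, propagate a hypercall return value, and then enter the
 * guest until the next exit to userspace.
 */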
1813 static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1814 {
1815         int r;
1816         sigset_t sigsaved;
1817
1818         vcpu_load(vcpu);
1819
1820         if (vcpu->sigset_active)
1821                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
1822
1823         /* re-sync apic's tpr */
1824         vcpu->cr8 = kvm_run->cr8;
1825
1826         if (vcpu->pio.cur_count) {
1827                 r = complete_pio(vcpu);
1828                 if (r)
1829                         goto out;
1830         }
1831
1832         if (vcpu->mmio_needed) {
1833                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
1834                 vcpu->mmio_read_completed = 1;
1835                 vcpu->mmio_needed = 0;
1836                 r = emulate_instruction(vcpu, kvm_run,
1837                                         vcpu->mmio_fault_cr2, 0);
1838                 if (r == EMULATE_DO_MMIO) {
1839                         /*
1840                          * Read-modify-write.  Back to userspace.
1841                          */
1842                         kvm_run->exit_reason = KVM_EXIT_MMIO;
1843                         r = 0;
1844                         goto out;
1845                 }
1846         }
1847
1848         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
1849                 kvm_arch_ops->cache_regs(vcpu);
1850                 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
1851                 kvm_arch_ops->decache_regs(vcpu);
1852         }
1853
1854         r = kvm_arch_ops->run(vcpu, kvm_run);
1855
1856 out:
1857         if (vcpu->sigset_active)
1858                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1859
1860         vcpu_put(vcpu);
1861         return r;
1862 }
1863
1864 static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
1865                                    struct kvm_regs *regs)
1866 {
1867         vcpu_load(vcpu);
1868
1869         kvm_arch_ops->cache_regs(vcpu);
1870
1871         regs->rax = vcpu->regs[VCPU_REGS_RAX];
1872         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
1873         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
1874         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
1875         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
1876         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
1877         regs->rsp = vcpu->regs[VCPU_REGS_RSP];
1878         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
1879 #ifdef CONFIG_X86_64
1880         regs->r8 = vcpu->regs[VCPU_REGS_R8];
1881         regs->r9 = vcpu->regs[VCPU_REGS_R9];
1882         regs->r10 = vcpu->regs[VCPU_REGS_R10];
1883         regs->r11 = vcpu->regs[VCPU_REGS_R11];
1884         regs->r12 = vcpu->regs[VCPU_REGS_R12];
1885         regs->r13 = vcpu->regs[VCPU_REGS_R13];
1886         regs->r14 = vcpu->regs[VCPU_REGS_R14];
1887         regs->r15 = vcpu->regs[VCPU_REGS_R15];
1888 #endif
1889
1890         regs->rip = vcpu->rip;
1891         regs->rflags = kvm_arch_ops->get_rflags(vcpu);
1892
1893         /*
1894          * Don't leak debug flags in case they were set for guest debugging
1895          */
1896         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
1897                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1898
1899         vcpu_put(vcpu);
1900
1901         return 0;
1902 }
1903
1904 static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
1905                                    struct kvm_regs *regs)
1906 {
1907         vcpu_load(vcpu);
1908
1909         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
1910         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
1911         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
1912         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
1913         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
1914         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
1915         vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
1916         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
1917 #ifdef CONFIG_X86_64
1918         vcpu->regs[VCPU_REGS_R8] = regs->r8;
1919         vcpu->regs[VCPU_REGS_R9] = regs->r9;
1920         vcpu->regs[VCPU_REGS_R10] = regs->r10;
1921         vcpu->regs[VCPU_REGS_R11] = regs->r11;
1922         vcpu->regs[VCPU_REGS_R12] = regs->r12;
1923         vcpu->regs[VCPU_REGS_R13] = regs->r13;
1924         vcpu->regs[VCPU_REGS_R14] = regs->r14;
1925         vcpu->regs[VCPU_REGS_R15] = regs->r15;
1926 #endif
1927
1928         vcpu->rip = regs->rip;
1929         kvm_arch_ops->set_rflags(vcpu, regs->rflags);
1930
1931         kvm_arch_ops->decache_regs(vcpu);
1932
1933         vcpu_put(vcpu);
1934
1935         return 0;
1936 }
1937
1938 static void get_segment(struct kvm_vcpu *vcpu,
1939                         struct kvm_segment *var, int seg)
1940 {
1941         return kvm_arch_ops->get_segment(vcpu, var, seg);
1942 }
1943
1944 static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
1945                                     struct kvm_sregs *sregs)
1946 {
1947         struct descriptor_table dt;
1948
1949         vcpu_load(vcpu);
1950
1951         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1952         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1953         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1954         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1955         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1956         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1957
1958         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1959         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1960
1961         kvm_arch_ops->get_idt(vcpu, &dt);
1962         sregs->idt.limit = dt.limit;
1963         sregs->idt.base = dt.base;
1964         kvm_arch_ops->get_gdt(vcpu, &dt);
1965         sregs->gdt.limit = dt.limit;
1966         sregs->gdt.base = dt.base;
1967
1968         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
1969         sregs->cr0 = vcpu->cr0;
1970         sregs->cr2 = vcpu->cr2;
1971         sregs->cr3 = vcpu->cr3;
1972         sregs->cr4 = vcpu->cr4;
1973         sregs->cr8 = vcpu->cr8;
1974         sregs->efer = vcpu->shadow_efer;
1975         sregs->apic_base = vcpu->apic_base;
1976
1977         memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
1978                sizeof sregs->interrupt_bitmap);
1979
1980         vcpu_put(vcpu);
1981
1982         return 0;
1983 }
1984
1985 static void set_segment(struct kvm_vcpu *vcpu,
1986                         struct kvm_segment *var, int seg)
1987 {
1988         return kvm_arch_ops->set_segment(vcpu, var, seg);
1989 }
1990
1991 static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
1992                                     struct kvm_sregs *sregs)
1993 {
1994         int mmu_reset_needed = 0;
1995         int i;
1996         struct descriptor_table dt;
1997
1998         vcpu_load(vcpu);
1999
2000         dt.limit = sregs->idt.limit;
2001         dt.base = sregs->idt.base;
2002         kvm_arch_ops->set_idt(vcpu, &dt);
2003         dt.limit = sregs->gdt.limit;
2004         dt.base = sregs->gdt.base;
2005         kvm_arch_ops->set_gdt(vcpu, &dt);
2006
2007         vcpu->cr2 = sregs->cr2;
2008         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
2009         vcpu->cr3 = sregs->cr3;
2010
2011         vcpu->cr8 = sregs->cr8;
2012
2013         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
2014 #ifdef CONFIG_X86_64
2015         kvm_arch_ops->set_efer(vcpu, sregs->efer);
2016 #endif
2017         vcpu->apic_base = sregs->apic_base;
2018
2019         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
2020
2021         mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
2022         kvm_arch_ops->set_cr0(vcpu, sregs->cr0);
2023
2024         mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
2025         kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
2026         if (!is_long_mode(vcpu) && is_pae(vcpu))
2027                 load_pdptrs(vcpu, vcpu->cr3);
2028
2029         if (mmu_reset_needed)
2030                 kvm_mmu_reset_context(vcpu);
2031
2032         memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
2033                sizeof vcpu->irq_pending);
2034         vcpu->irq_summary = 0;
2035         for (i = 0; i < NR_IRQ_WORDS; ++i)
2036                 if (vcpu->irq_pending[i])
2037                         __set_bit(i, &vcpu->irq_summary);
2038
2039         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2040         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2041         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2042         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2043         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2044         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2045
2046         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2047         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2048
2049         vcpu_put(vcpu);
2050
2051         return 0;
2052 }
2053
2054 /*
2055  * List of msr numbers which we expose to userspace through KVM_GET_MSRS,
2056  * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
2057  *
2058  * This list is modified at module load time to reflect the
2059  * capabilities of the host cpu.
2060  */
2061 static u32 msrs_to_save[] = {
2062         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
2063         MSR_K6_STAR,
2064 #ifdef CONFIG_X86_64
2065         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
2066 #endif
2067         MSR_IA32_TIME_STAMP_COUNTER,
2068 };
2069
2070 static unsigned num_msrs_to_save;
2071
2072 static u32 emulated_msrs[] = {
2073         MSR_IA32_MISC_ENABLE,
2074 };
2075
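/*
 * Probe each MSR in msrs_to_save with rdmsr_safe() and compact the list
 * so that it only contains MSRs the host cpu actually implements.
 */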
2076 static __init void kvm_init_msr_list(void)
2077 {
2078         u32 dummy[2];
2079         unsigned i, j;
2080
2081         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2082                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2083                         continue;
2084                 if (j < i)
2085                         msrs_to_save[j] = msrs_to_save[i];
2086                 j++;
2087         }
2088         num_msrs_to_save = j;
2089 }
2090
2091 /*
2092  * Adapt set_msr() to msr_io()'s calling convention
2093  */
2094 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2095 {
2096         return set_msr(vcpu, index, *data);
2097 }
2098
2099 /*
2100  * Read or write a bunch of msrs. All parameters are kernel addresses.
2101  *
2102  * @return number of msrs set successfully.
2103  */
2104 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2105                     struct kvm_msr_entry *entries,
2106                     int (*do_msr)(struct kvm_vcpu *vcpu,
2107                                   unsigned index, u64 *data))
2108 {
2109         int i;
2110
2111         vcpu_load(vcpu);
2112
2113         for (i = 0; i < msrs->nmsrs; ++i)
2114                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2115                         break;
2116
2117         vcpu_put(vcpu);
2118
2119         return i;
2120 }
2121
2122 /*
2123  * Read or write a bunch of msrs. Parameters are user addresses.
2124  *
2125  * @return number of msrs set successfully.
2126  */
2127 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2128                   int (*do_msr)(struct kvm_vcpu *vcpu,
2129                                 unsigned index, u64 *data),
2130                   int writeback)
2131 {
2132         struct kvm_msrs msrs;
2133         struct kvm_msr_entry *entries;
2134         int r, n;
2135         unsigned size;
2136
2137         r = -EFAULT;
2138         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2139                 goto out;
2140
2141         r = -E2BIG;
2142         if (msrs.nmsrs >= MAX_IO_MSRS)
2143                 goto out;
2144
2145         r = -ENOMEM;
2146         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2147         entries = vmalloc(size);
2148         if (!entries)
2149                 goto out;
2150
2151         r = -EFAULT;
2152         if (copy_from_user(entries, user_msrs->entries, size))
2153                 goto out_free;
2154
2155         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2156         if (r < 0)
2157                 goto out_free;
2158
2159         r = -EFAULT;
2160         if (writeback && copy_to_user(user_msrs->entries, entries, size))
2161                 goto out_free;
2162
2163         r = n;
2164
2165 out_free:
2166         vfree(entries);
2167 out:
2168         return r;
2169 }
2170
2171 /*
2172  * Translate a guest virtual address to a guest physical address.
2173  */
2174 static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2175                                     struct kvm_translation *tr)
2176 {
2177         unsigned long vaddr = tr->linear_address;
2178         gpa_t gpa;
2179
2180         vcpu_load(vcpu);
2181         spin_lock(&vcpu->kvm->lock);
2182         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2183         tr->physical_address = gpa;
2184         tr->valid = gpa != UNMAPPED_GVA;
2185         tr->writeable = 1;
2186         tr->usermode = 0;
2187         spin_unlock(&vcpu->kvm->lock);
2188         vcpu_put(vcpu);
2189
2190         return 0;
2191 }
2192
2193 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2194                                     struct kvm_interrupt *irq)
2195 {
2196         if (irq->irq < 0 || irq->irq >= 256)
2197                 return -EINVAL;
2198         vcpu_load(vcpu);
2199
2200         set_bit(irq->irq, vcpu->irq_pending);
2201         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
2202
2203         vcpu_put(vcpu);
2204
2205         return 0;
2206 }
2207
2208 static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2209                                       struct kvm_debug_guest *dbg)
2210 {
2211         int r;
2212
2213         vcpu_load(vcpu);
2214
2215         r = kvm_arch_ops->set_guest_debug(vcpu, dbg);
2216
2217         vcpu_put(vcpu);
2218
2219         return r;
2220 }
2221
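/*
 * Fault handler for the vcpu mmap: page 0 of the mapping is the kvm_run
 * structure and page KVM_PIO_PAGE_OFFSET is the PIO data page.
 */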
2222 static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2223                                     unsigned long address,
2224                                     int *type)
2225 {
2226         struct kvm_vcpu *vcpu = vma->vm_file->private_data;
2227         unsigned long pgoff;
2228         struct page *page;
2229
2230         *type = VM_FAULT_MINOR;
2231         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2232         if (pgoff == 0)
2233                 page = virt_to_page(vcpu->run);
2234         else if (pgoff == KVM_PIO_PAGE_OFFSET)
2235                 page = virt_to_page(vcpu->pio_data);
2236         else
2237                 return NOPAGE_SIGBUS;
2238         get_page(page);
2239         return page;
2240 }
2241
2242 static struct vm_operations_struct kvm_vcpu_vm_ops = {
2243         .nopage = kvm_vcpu_nopage,
2244 };
2245
2246 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2247 {
2248         vma->vm_ops = &kvm_vcpu_vm_ops;
2249         return 0;
2250 }
2251
2252 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
2253 {
2254         struct kvm_vcpu *vcpu = filp->private_data;
2255
2256         fput(vcpu->kvm->filp);
2257         return 0;
2258 }
2259
2260 static struct file_operations kvm_vcpu_fops = {
2261         .release        = kvm_vcpu_release,
2262         .unlocked_ioctl = kvm_vcpu_ioctl,
2263         .compat_ioctl   = kvm_vcpu_ioctl,
2264         .mmap           = kvm_vcpu_mmap,
2265 };
2266
2267 /*
2268  * Allocates an inode for the vcpu.
2269  */
2270 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2271 {
2272         int fd, r;
2273         struct inode *inode;
2274         struct file *file;
2275
2276         atomic_inc(&vcpu->kvm->filp->f_count);
2277         inode = kvmfs_inode(&kvm_vcpu_fops);
2278         if (IS_ERR(inode)) {
2279                 r = PTR_ERR(inode);
2280                 goto out1;
2281         }
2282
2283         file = kvmfs_file(inode, vcpu);
2284         if (IS_ERR(file)) {
2285                 r = PTR_ERR(file);
2286                 goto out2;
2287         }
2288
2289         r = get_unused_fd();
2290         if (r < 0)
2291                 goto out3;
2292         fd = r;
2293         fd_install(fd, file);
2294
2295         return fd;
2296
2297 out3:
2298         fput(file);
2299 out2:
2300         iput(inode);
2301 out1:
2302         fput(vcpu->kvm->filp);
2303         return r;
2304 }
2305
2306 /*
2307  * Creates some virtual cpus.  Good luck creating more than one.
2308  */
2309 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
2310 {
2311         int r;
2312         struct kvm_vcpu *vcpu;
2313         struct page *page;
2314
2315         r = -EINVAL;
2316         if (!valid_vcpu(n))
2317                 goto out;
2318
2319         vcpu = &kvm->vcpus[n];
2320
2321         mutex_lock(&vcpu->mutex);
2322
2323         if (vcpu->vmcs) {
2324                 mutex_unlock(&vcpu->mutex);
2325                 return -EEXIST;
2326         }
2327
2328         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2329         r = -ENOMEM;
2330         if (!page)
2331                 goto out_unlock;
2332         vcpu->run = page_address(page);
2333
2334         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2335         r = -ENOMEM;
2336         if (!page)
2337                 goto out_free_run;
2338         vcpu->pio_data = page_address(page);
2339
2340         vcpu->host_fx_image = (char *)ALIGN((hva_t)vcpu->fx_buf,
2341                                            FX_IMAGE_ALIGN);
2342         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
2343         vcpu->cr0 = 0x10;
2344
2345         r = kvm_arch_ops->vcpu_create(vcpu);
2346         if (r < 0)
2347                 goto out_free_vcpus;
2348
2349         r = kvm_mmu_create(vcpu);
2350         if (r < 0)
2351                 goto out_free_vcpus;
2352
2353         kvm_arch_ops->vcpu_load(vcpu);
2354         r = kvm_mmu_setup(vcpu);
2355         if (r >= 0)
2356                 r = kvm_arch_ops->vcpu_setup(vcpu);
2357         vcpu_put(vcpu);
2358
2359         if (r < 0)
2360                 goto out_free_vcpus;
2361
2362         r = create_vcpu_fd(vcpu);
2363         if (r < 0)
2364                 goto out_free_vcpus;
2365
2366         return r;
2367
2368 out_free_vcpus:
2369         kvm_free_vcpu(vcpu);
2370 out_free_run:
2371         free_page((unsigned long)vcpu->run);
2372         vcpu->run = NULL;
2373 out_unlock:
2374         mutex_unlock(&vcpu->mutex);
2375 out:
2376         return r;
2377 }
2378
2379 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2380                                     struct kvm_cpuid *cpuid,
2381                                     struct kvm_cpuid_entry __user *entries)
2382 {
2383         int r;
2384
2385         r = -E2BIG;
2386         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2387                 goto out;
2388         r = -EFAULT;
2389         if (copy_from_user(&vcpu->cpuid_entries, entries,
2390                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2391                 goto out;
2392         vcpu->cpuid_nent = cpuid->nent;
2393         return 0;
2394
2395 out:
2396         return r;
2397 }
2398
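/*
 * Install (or clear, if sigset is NULL) the signal mask that is applied
 * around KVM_RUN.  SIGKILL and SIGSTOP can never be masked.
 */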
2399 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2400 {
2401         if (sigset) {
2402                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2403                 vcpu->sigset_active = 1;
2404                 vcpu->sigset = *sigset;
2405         } else
2406                 vcpu->sigset_active = 0;
2407         return 0;
2408 }
2409
2410 /*
2411  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
2412  * we have asm/x86/processor.h
2413  */
2414 struct fxsave {
2415         u16     cwd;
2416         u16     swd;
2417         u16     twd;
2418         u16     fop;
2419         u64     rip;
2420         u64     rdp;
2421         u32     mxcsr;
2422         u32     mxcsr_mask;
2423         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
2424 #ifdef CONFIG_X86_64
2425         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
2426 #else
2427         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
2428 #endif
2429 };
2430
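/*
 * KVM_GET_FPU/KVM_SET_FPU: copy between the guest's fxsave image and the
 * kvm_fpu structure exchanged with userspace.
 */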
2431 static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2432 {
2433         struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image;
2434
2435         vcpu_load(vcpu);
2436
2437         memcpy(fpu->fpr, fxsave->st_space, 128);
2438         fpu->fcw = fxsave->cwd;
2439         fpu->fsw = fxsave->swd;
2440         fpu->ftwx = fxsave->twd;
2441         fpu->last_opcode = fxsave->fop;
2442         fpu->last_ip = fxsave->rip;
2443         fpu->last_dp = fxsave->rdp;
2444         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2445
2446         vcpu_put(vcpu);
2447
2448         return 0;
2449 }
2450
2451 static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2452 {
2453         struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image;
2454
2455         vcpu_load(vcpu);
2456
2457         memcpy(fxsave->st_space, fpu->fpr, 128);
2458         fxsave->cwd = fpu->fcw;
2459         fxsave->swd = fpu->fsw;
2460         fxsave->twd = fpu->ftwx;
2461         fxsave->fop = fpu->last_opcode;
2462         fxsave->rip = fpu->last_ip;
2463         fxsave->rdp = fpu->last_dp;
2464         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2465
2466         vcpu_put(vcpu);
2467
2468         return 0;
2469 }
2470
2471 static long kvm_vcpu_ioctl(struct file *filp,
2472                            unsigned int ioctl, unsigned long arg)
2473 {
2474         struct kvm_vcpu *vcpu = filp->private_data;
2475         void __user *argp = (void __user *)arg;
2476         int r = -EINVAL;
2477
2478         switch (ioctl) {
2479         case KVM_RUN:
2480                 r = -EINVAL;
2481                 if (arg)
2482                         goto out;
2483                 r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
2484                 break;
2485         case KVM_GET_REGS: {
2486                 struct kvm_regs kvm_regs;
2487
2488                 memset(&kvm_regs, 0, sizeof kvm_regs);
2489                 r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
2490                 if (r)
2491                         goto out;
2492                 r = -EFAULT;
2493                 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
2494                         goto out;
2495                 r = 0;
2496                 break;
2497         }
2498         case KVM_SET_REGS: {
2499                 struct kvm_regs kvm_regs;
2500
2501                 r = -EFAULT;
2502                 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
2503                         goto out;
2504                 r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
2505                 if (r)
2506                         goto out;
2507                 r = 0;
2508                 break;
2509         }
2510         case KVM_GET_SREGS: {
2511                 struct kvm_sregs kvm_sregs;
2512
2513                 memset(&kvm_sregs, 0, sizeof kvm_sregs);
2514                 r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
2515                 if (r)
2516                         goto out;
2517                 r = -EFAULT;
2518                 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
2519                         goto out;
2520                 r = 0;
2521                 break;
2522         }
2523         case KVM_SET_SREGS: {
2524                 struct kvm_sregs kvm_sregs;
2525
2526                 r = -EFAULT;
2527                 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
2528                         goto out;
2529                 r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
2530                 if (r)
2531                         goto out;
2532                 r = 0;
2533                 break;
2534         }
2535         case KVM_TRANSLATE: {
2536                 struct kvm_translation tr;
2537
2538                 r = -EFAULT;
2539                 if (copy_from_user(&tr, argp, sizeof tr))
2540                         goto out;
2541                 r = kvm_vcpu_ioctl_translate(vcpu, &tr);
2542                 if (r)
2543                         goto out;
2544                 r = -EFAULT;
2545                 if (copy_to_user(argp, &tr, sizeof tr))
2546                         goto out;
2547                 r = 0;
2548                 break;
2549         }
2550         case KVM_INTERRUPT: {
2551                 struct kvm_interrupt irq;
2552
2553                 r = -EFAULT;
2554                 if (copy_from_user(&irq, argp, sizeof irq))
2555                         goto out;
2556                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2557                 if (r)
2558                         goto out;
2559                 r = 0;
2560                 break;
2561         }
2562         case KVM_DEBUG_GUEST: {
2563                 struct kvm_debug_guest dbg;
2564
2565                 r = -EFAULT;
2566                 if (copy_from_user(&dbg, argp, sizeof dbg))
2567                         goto out;
2568                 r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
2569                 if (r)
2570                         goto out;
2571                 r = 0;
2572                 break;
2573         }
2574         case KVM_GET_MSRS:
2575                 r = msr_io(vcpu, argp, get_msr, 1);
2576                 break;
2577         case KVM_SET_MSRS:
2578                 r = msr_io(vcpu, argp, do_set_msr, 0);
2579                 break;
2580         case KVM_SET_CPUID: {
2581                 struct kvm_cpuid __user *cpuid_arg = argp;
2582                 struct kvm_cpuid cpuid;
2583
2584                 r = -EFAULT;
2585                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2586                         goto out;
2587                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2588                 if (r)
2589                         goto out;
2590                 break;
2591         }
2592         case KVM_SET_SIGNAL_MASK: {
2593                 struct kvm_signal_mask __user *sigmask_arg = argp;
2594                 struct kvm_signal_mask kvm_sigmask;
2595                 sigset_t sigset, *p;
2596
2597                 p = NULL;
2598                 if (argp) {
2599                         r = -EFAULT;
2600                         if (copy_from_user(&kvm_sigmask, argp,
2601                                            sizeof kvm_sigmask))
2602                                 goto out;
2603                         r = -EINVAL;
2604                         if (kvm_sigmask.len != sizeof sigset)
2605                                 goto out;
2606                         r = -EFAULT;
2607                         if (copy_from_user(&sigset, sigmask_arg->sigset,
2608                                            sizeof sigset))
2609                                 goto out;
2610                         p = &sigset;
2611                 }
2612                 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
2613                 break;
2614         }
2615         case KVM_GET_FPU: {
2616                 struct kvm_fpu fpu;
2617
2618                 memset(&fpu, 0, sizeof fpu);
2619                 r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
2620                 if (r)
2621                         goto out;
2622                 r = -EFAULT;
2623                 if (copy_to_user(argp, &fpu, sizeof fpu))
2624                         goto out;
2625                 r = 0;
2626                 break;
2627         }
2628         case KVM_SET_FPU: {
2629                 struct kvm_fpu fpu;
2630
2631                 r = -EFAULT;
2632                 if (copy_from_user(&fpu, argp, sizeof fpu))
2633                         goto out;
2634                 r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
2635                 if (r)
2636                         goto out;
2637                 r = 0;
2638                 break;
2639         }
2640         default:
2641                 ;
2642         }
2643 out:
2644         return r;
2645 }
2646
2647 static long kvm_vm_ioctl(struct file *filp,
2648                            unsigned int ioctl, unsigned long arg)
2649 {
2650         struct kvm *kvm = filp->private_data;
2651         void __user *argp = (void __user *)arg;
2652         int r = -EINVAL;
2653
2654         switch (ioctl) {
2655         case KVM_CREATE_VCPU:
2656                 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
2657                 if (r < 0)
2658                         goto out;
2659                 break;
2660         case KVM_SET_MEMORY_REGION: {
2661                 struct kvm_memory_region kvm_mem;
2662
2663                 r = -EFAULT;
2664                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2665                         goto out;
2666                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
2667                 if (r)
2668                         goto out;
2669                 break;
2670         }
2671         case KVM_GET_DIRTY_LOG: {
2672                 struct kvm_dirty_log log;
2673
2674                 r = -EFAULT;
2675                 if (copy_from_user(&log, argp, sizeof log))
2676                         goto out;
2677                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2678                 if (r)
2679                         goto out;
2680                 break;
2681         }
2682         case KVM_SET_MEMORY_ALIAS: {
2683                 struct kvm_memory_alias alias;
2684
2685                 r = -EFAULT;
2686                 if (copy_from_user(&alias, argp, sizeof alias))
2687                         goto out;
2688                 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
2689                 if (r)
2690                         goto out;
2691                 break;
2692         }
2693         default:
2694                 ;
2695         }
2696 out:
2697         return r;
2698 }
2699
2700 static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
2701                                   unsigned long address,
2702                                   int *type)
2703 {
2704         struct kvm *kvm = vma->vm_file->private_data;
2705         unsigned long pgoff;
2706         struct page *page;
2707
2708         *type = VM_FAULT_MINOR;
2709         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2710         page = gfn_to_page(kvm, pgoff);
2711         if (!page)
2712                 return NOPAGE_SIGBUS;
2713         get_page(page);
2714         return page;
2715 }
2716
2717 static struct vm_operations_struct kvm_vm_vm_ops = {
2718         .nopage = kvm_vm_nopage,
2719 };
2720
2721 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
2722 {
2723         vma->vm_ops = &kvm_vm_vm_ops;
2724         return 0;
2725 }
2726
2727 static struct file_operations kvm_vm_fops = {
2728         .release        = kvm_vm_release,
2729         .unlocked_ioctl = kvm_vm_ioctl,
2730         .compat_ioctl   = kvm_vm_ioctl,
2731         .mmap           = kvm_vm_mmap,
2732 };
2733
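/*
 * KVM_CREATE_VM: allocate a new vm, back it with a kvmfs inode and file,
 * and return a file descriptor that accepts the VM ioctls.
 */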
2734 static int kvm_dev_ioctl_create_vm(void)
2735 {
2736         int fd, r;
2737         struct inode *inode;
2738         struct file *file;
2739         struct kvm *kvm;
2740
2741         inode = kvmfs_inode(&kvm_vm_fops);
2742         if (IS_ERR(inode)) {
2743                 r = PTR_ERR(inode);
2744                 goto out1;
2745         }
2746
2747         kvm = kvm_create_vm();
2748         if (IS_ERR(kvm)) {
2749                 r = PTR_ERR(kvm);
2750                 goto out2;
2751         }
2752
2753         file = kvmfs_file(inode, kvm);
2754         if (IS_ERR(file)) {
2755                 r = PTR_ERR(file);
2756                 goto out3;
2757         }
2758         kvm->filp = file;
2759
2760         r = get_unused_fd();
2761         if (r < 0)
2762                 goto out4;
2763         fd = r;
2764         fd_install(fd, file);
2765
2766         return fd;
2767
2768 out4:
2769         fput(file);
2770 out3:
2771         kvm_destroy_vm(kvm);
2772 out2:
2773         iput(inode);
2774 out1:
2775         return r;
2776 }
2777
2778 static long kvm_dev_ioctl(struct file *filp,
2779                           unsigned int ioctl, unsigned long arg)
2780 {
2781         void __user *argp = (void __user *)arg;
2782         long r = -EINVAL;
2783
2784         switch (ioctl) {
2785         case KVM_GET_API_VERSION:
2786                 r = -EINVAL;
2787                 if (arg)
2788                         goto out;
2789                 r = KVM_API_VERSION;
2790                 break;
2791         case KVM_CREATE_VM:
2792                 r = -EINVAL;
2793                 if (arg)
2794                         goto out;
2795                 r = kvm_dev_ioctl_create_vm();
2796                 break;
2797         case KVM_GET_MSR_INDEX_LIST: {
2798                 struct kvm_msr_list __user *user_msr_list = argp;
2799                 struct kvm_msr_list msr_list;
2800                 unsigned n;
2801
2802                 r = -EFAULT;
2803                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
2804                         goto out;
2805                 n = msr_list.nmsrs;
2806                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
2807                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
2808                         goto out;
2809                 r = -E2BIG;
2810                 if (n < num_msrs_to_save)
2811                         goto out;
2812                 r = -EFAULT;
2813                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
2814                                  num_msrs_to_save * sizeof(u32)))
2815                         goto out;
2816                 if (copy_to_user(user_msr_list->indices
2817                                  + num_msrs_to_save,
2818                                  &emulated_msrs,
2819                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
2820                         goto out;
2821                 r = 0;
2822                 break;
2823         }
2824         case KVM_CHECK_EXTENSION:
2825                 /*
2826                  * No extensions defined at present.
2827                  */
2828                 r = 0;
2829                 break;
2830         case KVM_GET_VCPU_MMAP_SIZE:
2831                 r = -EINVAL;
2832                 if (arg)
2833                         goto out;
2834                 r = 2 * PAGE_SIZE;
2835                 break;
2836         default:
2837                 ;
2838         }
2839 out:
2840         return r;
2841 }
2842
2843 static struct file_operations kvm_chardev_ops = {
2844         .open           = kvm_dev_open,
2845         .release        = kvm_dev_release,
2846         .unlocked_ioctl = kvm_dev_ioctl,
2847         .compat_ioctl   = kvm_dev_ioctl,
2848 };
2849
2850 static struct miscdevice kvm_dev = {
2851         KVM_MINOR,
2852         "kvm",
2853         &kvm_chardev_ops,
2854 };
2855
2856 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2857                        void *v)
2858 {
2859         if (val == SYS_RESTART) {
2860                 /*
2861                  * Some (well, at least mine) BIOSes hang on reboot if
2862                  * in vmx root mode.
2863                  */
2864                 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2865                 on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
2866         }
2867         return NOTIFY_OK;
2868 }
2869
2870 static struct notifier_block kvm_reboot_notifier = {
2871         .notifier_call = kvm_reboot,
2872         .priority = 0,
2873 };
2874
2875 /*
2876  * Make sure that a cpu that is being hot-unplugged does not have any vcpus
2877  * cached on it.
2878  */
2879 static void decache_vcpus_on_cpu(int cpu)
2880 {
2881         struct kvm *vm;
2882         struct kvm_vcpu *vcpu;
2883         int i;
2884
2885         spin_lock(&kvm_lock);
2886         list_for_each_entry(vm, &vm_list, vm_list)
2887                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2888                         vcpu = &vm->vcpus[i];
2889                         /*
2890                          * If the vcpu is locked, then it is running on some
2891                          * other cpu and therefore it is not cached on the
2892                          * cpu in question.
2893                          *
2894                          * If it's not locked, check the last cpu it executed
2895                          * on.
2896                          */
2897                         if (mutex_trylock(&vcpu->mutex)) {
2898                                 if (vcpu->cpu == cpu) {
2899                                         kvm_arch_ops->vcpu_decache(vcpu);
2900                                         vcpu->cpu = -1;
2901                                 }
2902                                 mutex_unlock(&vcpu->mutex);
2903                         }
2904                 }
2905         spin_unlock(&kvm_lock);
2906 }
2907
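/*
 * CPU hotplug callback: disable hardware virtualization (after flushing
 * cached vcpu state) on a cpu that is going down or whose bring-up was
 * cancelled, and re-enable it when a cpu comes online.
 */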
2908 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2909                            void *v)
2910 {
2911         int cpu = (long)v;
2912
2913         switch (val) {
2914         case CPU_DOWN_PREPARE:
2915         case CPU_DOWN_PREPARE_FROZEN:
2916         case CPU_UP_CANCELED:
2917         case CPU_UP_CANCELED_FROZEN:
2918                 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
2919                        cpu);
2920                 decache_vcpus_on_cpu(cpu);
2921                 smp_call_function_single(cpu, kvm_arch_ops->hardware_disable,
2922                                          NULL, 0, 1);
2923                 break;
2924         case CPU_ONLINE:
2925         case CPU_ONLINE_FROZEN:
2926                 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
2927                        cpu);
2928                 smp_call_function_single(cpu, kvm_arch_ops->hardware_enable,
2929                                          NULL, 0, 1);
2930                 break;
2931         }
2932         return NOTIFY_OK;
2933 }
2934
2935 static struct notifier_block kvm_cpu_notifier = {
2936         .notifier_call = kvm_cpu_hotplug,
2937         .priority = 20, /* must be > scheduler priority */
2938 };
2939
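/*
 * debugfs read callback: sum the per-vcpu counter at the given offset
 * over all vcpus of all VMs.
 */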
2940 static u64 stat_get(void *_offset)
2941 {
2942         unsigned offset = (long)_offset;
2943         u64 total = 0;
2944         struct kvm *kvm;
2945         struct kvm_vcpu *vcpu;
2946         int i;
2947
2948         spin_lock(&kvm_lock);
2949         list_for_each_entry(kvm, &vm_list, vm_list)
2950                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2951                         vcpu = &kvm->vcpus[i];
2952                         total += *(u32 *)((void *)vcpu + offset);
2953                 }
2954         spin_unlock(&kvm_lock);
2955         return total;
2956 }
2957
2958 static void stat_set(void *offset, u64 val)
2959 {
2960 }
2961
2962 DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, stat_set, "%llu\n");
2963
2964 static __init void kvm_init_debug(void)
2965 {
2966         struct kvm_stats_debugfs_item *p;
2967
2968         debugfs_dir = debugfs_create_dir("kvm", NULL);
2969         for (p = debugfs_entries; p->name; ++p)
2970                 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
2971                                                 (void *)(long)p->offset,
2972                                                 &stat_fops);
2973 }
2974
2975 static void kvm_exit_debug(void)
2976 {
2977         struct kvm_stats_debugfs_item *p;
2978
2979         for (p = debugfs_entries; p->name; ++p)
2980                 debugfs_remove(p->dentry);
2981         debugfs_remove(debugfs_dir);
2982 }
2983
2984 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
2985 {
2986         decache_vcpus_on_cpu(raw_smp_processor_id());
2987         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
2988         return 0;
2989 }
2990
2991 static int kvm_resume(struct sys_device *dev)
2992 {
2993         on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
2994         return 0;
2995 }
2996
2997 static struct sysdev_class kvm_sysdev_class = {
2998         set_kset_name("kvm"),
2999         .suspend = kvm_suspend,
3000         .resume = kvm_resume,
3001 };
3002
3003 static struct sys_device kvm_sysdev = {
3004         .id = 0,
3005         .cls = &kvm_sysdev_class,
3006 };
3007
3008 hpa_t bad_page_address;
3009
3010 static int kvmfs_get_sb(struct file_system_type *fs_type, int flags,
3011                         const char *dev_name, void *data, struct vfsmount *mnt)
3012 {
3013         return get_sb_pseudo(fs_type, "kvm:", NULL, KVMFS_SUPER_MAGIC, mnt);
3014 }
3015
3016 static struct file_system_type kvm_fs_type = {
3017         .name           = "kvmfs",
3018         .get_sb         = kvmfs_get_sb,
3019         .kill_sb        = kill_anon_super,
3020 };
3021
3022 int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
3023 {
3024         int r;
3025
3026         if (kvm_arch_ops) {
3027                 printk(KERN_ERR "kvm: already loaded the other module\n");
3028                 return -EEXIST;
3029         }
3030
3031         if (!ops->cpu_has_kvm_support()) {
3032                 printk(KERN_ERR "kvm: no hardware support\n");
3033                 return -EOPNOTSUPP;
3034         }
3035         if (ops->disabled_by_bios()) {
3036                 printk(KERN_ERR "kvm: disabled by bios\n");
3037                 return -EOPNOTSUPP;
3038         }
3039
3040         kvm_arch_ops = ops;
3041
3042         r = kvm_arch_ops->hardware_setup();
3043         if (r < 0)
3044                 goto out;
3045
3046         on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
3047         r = register_cpu_notifier(&kvm_cpu_notifier);
3048         if (r)
3049                 goto out_free_1;
3050         register_reboot_notifier(&kvm_reboot_notifier);
3051
3052         r = sysdev_class_register(&kvm_sysdev_class);
3053         if (r)
3054                 goto out_free_2;
3055
3056         r = sysdev_register(&kvm_sysdev);
3057         if (r)
3058                 goto out_free_3;
3059
3060         kvm_chardev_ops.owner = module;
3061
3062         r = misc_register(&kvm_dev);
3063         if (r) {
3064                 printk(KERN_ERR "kvm: misc device register failed\n");
3065                 goto out_free;
3066         }
3067
3068         return r;
3069
3070 out_free:
3071         sysdev_unregister(&kvm_sysdev);
3072 out_free_3:
3073         sysdev_class_unregister(&kvm_sysdev_class);
3074 out_free_2:
3075         unregister_reboot_notifier(&kvm_reboot_notifier);
3076         unregister_cpu_notifier(&kvm_cpu_notifier);
3077 out_free_1:
3078         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
3079         kvm_arch_ops->hardware_unsetup();
3080 out:
3081         kvm_arch_ops = NULL;
3082         return r;
3083 }
3084
3085 void kvm_exit_arch(void)
3086 {
3087         misc_deregister(&kvm_dev);
3088         sysdev_unregister(&kvm_sysdev);
3089         sysdev_class_unregister(&kvm_sysdev_class);
3090         unregister_reboot_notifier(&kvm_reboot_notifier);
3091         unregister_cpu_notifier(&kvm_cpu_notifier);
3092         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
3093         kvm_arch_ops->hardware_unsetup();
3094         kvm_arch_ops = NULL;
3095 }
3096
3097 static __init int kvm_init(void)
3098 {
3099         static struct page *bad_page;
3100         int r;
3101
3102         r = kvm_mmu_module_init();
3103         if (r)
3104                 goto out4;
3105
3106         r = register_filesystem(&kvm_fs_type);
3107         if (r)
3108                 goto out3;
3109
3110         kvmfs_mnt = kern_mount(&kvm_fs_type);
3111         r = PTR_ERR(kvmfs_mnt);
3112         if (IS_ERR(kvmfs_mnt))
3113                 goto out2;
3114         kvm_init_debug();
3115
3116         kvm_init_msr_list();
3117
3118         if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
3119                 r = -ENOMEM;
3120                 goto out;
3121         }
3122
3123         bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
3124         memset(__va(bad_page_address), 0, PAGE_SIZE);
3125
3126         return 0;
3127
3128 out:
3129         kvm_exit_debug();
3130         mntput(kvmfs_mnt);
3131 out2:
3132         unregister_filesystem(&kvm_fs_type);
3133 out3:
3134         kvm_mmu_module_exit();
3135 out4:
3136         return r;
3137 }
3138
3139 static __exit void kvm_exit(void)
3140 {
3141         kvm_exit_debug();
3142         __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3143         mntput(kvmfs_mnt);
3144         unregister_filesystem(&kvm_fs_type);
3145         kvm_mmu_module_exit();
3146 }
3147
3148 module_init(kvm_init)
3149 module_exit(kvm_exit)
3150
3151 EXPORT_SYMBOL_GPL(kvm_init_arch);
3152 EXPORT_SYMBOL_GPL(kvm_exit_arch);