KVM: nVMX: Introduce vmcs02: VMCS used to run L2
[pandora-kernel.git] / arch/x86/kvm/vmx.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9  *
10  * Authors:
11  *   Avi Kivity   <avi@qumranet.com>
12  *   Yaniv Kamay  <yaniv@qumranet.com>
13  *
14  * This work is licensed under the terms of the GNU GPL, version 2.  See
15  * the COPYING file in the top-level directory.
16  *
17  */
18
19 #include "irq.h"
20 #include "mmu.h"
21
22 #include <linux/kvm_host.h>
23 #include <linux/module.h>
24 #include <linux/kernel.h>
25 #include <linux/mm.h>
26 #include <linux/highmem.h>
27 #include <linux/sched.h>
28 #include <linux/moduleparam.h>
29 #include <linux/ftrace_event.h>
30 #include <linux/slab.h>
31 #include <linux/tboot.h>
32 #include "kvm_cache_regs.h"
33 #include "x86.h"
34
35 #include <asm/io.h>
36 #include <asm/desc.h>
37 #include <asm/vmx.h>
38 #include <asm/virtext.h>
39 #include <asm/mce.h>
40 #include <asm/i387.h>
41 #include <asm/xcr.h>
42
43 #include "trace.h"
44
45 #define __ex(x) __kvm_handle_fault_on_reboot(x)
46 #define __ex_clear(x, reg) \
47         ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
48
49 MODULE_AUTHOR("Qumranet");
50 MODULE_LICENSE("GPL");
51
52 static int __read_mostly bypass_guest_pf = 1;
53 module_param(bypass_guest_pf, bool, S_IRUGO);
54
55 static int __read_mostly enable_vpid = 1;
56 module_param_named(vpid, enable_vpid, bool, 0444);
57
58 static int __read_mostly flexpriority_enabled = 1;
59 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
60
61 static int __read_mostly enable_ept = 1;
62 module_param_named(ept, enable_ept, bool, S_IRUGO);
63
64 static int __read_mostly enable_unrestricted_guest = 1;
65 module_param_named(unrestricted_guest,
66                         enable_unrestricted_guest, bool, S_IRUGO);
67
68 static int __read_mostly emulate_invalid_guest_state = 0;
69 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
70
71 static int __read_mostly vmm_exclusive = 1;
72 module_param(vmm_exclusive, bool, S_IRUGO);
73
74 static int __read_mostly yield_on_hlt = 1;
75 module_param(yield_on_hlt, bool, S_IRUGO);
76
77 /*
78  * If nested=1, nested virtualization is supported, i.e., guests may use
79  * VMX and act as hypervisors for their own guests. If nested=0, guests may
80  * not use VMX instructions.
81  */
82 static int __read_mostly nested = 0;
83 module_param(nested, bool, S_IRUGO);
84
85 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST                           \
86         (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
87 #define KVM_GUEST_CR0_MASK                                              \
88         (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
89 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST                         \
90         (X86_CR0_WP | X86_CR0_NE)
91 #define KVM_VM_CR0_ALWAYS_ON                                            \
92         (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
93 #define KVM_CR4_GUEST_OWNED_BITS                                      \
94         (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
95          | X86_CR4_OSXMMEXCPT)
96
97 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
98 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
99
100 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
101
102 /*
103  * These two parameters configure the controls for Pause-Loop Exiting:
104  * ple_gap:    upper bound on the amount of time between two successive
105  *             executions of PAUSE in a loop. Also indicates whether PLE is
106  *             enabled. Tests show this time is usually below 128 cycles.
107  * ple_window: upper bound on the amount of time a guest is allowed to
108  *             execute in a PAUSE loop. Tests indicate that most spinlocks
109  *             are held for less than 2^12 cycles.
110  * Time is measured based on a counter that runs at the same rate as the TSC,
111  * refer to SDM volume 3b sections 21.6.13 & 22.1.3.
112  */
113 #define KVM_VMX_DEFAULT_PLE_GAP    128
114 #define KVM_VMX_DEFAULT_PLE_WINDOW 4096
115 static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
116 module_param(ple_gap, int, S_IRUGO);
117
118 static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
119 module_param(ple_window, int, S_IRUGO);
120
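/*
 * NR_AUTOLOAD_MSRS sizes the guest/host MSR auto-load arrays in struct
 * msr_autoload below, and VMCS02_POOL_SIZE bounds the per-vcpu vmcs02_pool
 * cache used for nested guests; both are kept at 1 for now, presumably as a
 * deliberate memory/speed trade-off.
 */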
121 #define NR_AUTOLOAD_MSRS 1
122 #define VMCS02_POOL_SIZE 1
123
124 struct vmcs {
125         u32 revision_id;
126         u32 abort;
127         char data[0];
128 };
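/*
 * The layout above mirrors the architectural VMCS region header: a revision
 * identifier, a VMX-abort indicator, and then processor implementation-
 * specific data (see the SDM chapter on Virtual Machine Control Structures).
 */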
129
130 /*
131  * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
132  * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
133  * loaded on this CPU (so we can clear them if the CPU goes down).
134  */
135 struct loaded_vmcs {
136         struct vmcs *vmcs;
137         int cpu;
138         int launched;
139         struct list_head loaded_vmcss_on_cpu_link;
140 };
141
142 struct shared_msr_entry {
143         unsigned index;
144         u64 data;
145         u64 mask;
146 };
147
148 /*
149  * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
150  * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
151  * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
152  * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
153  * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
154  * More than one of these structures may exist, if L1 runs multiple L2 guests.
155  * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
156  * underlying hardware which will be used to run L2.
157  * This structure is packed to ensure that its layout is identical across
158  * machines (necessary for live migration).
159  * If there are changes in this struct, VMCS12_REVISION must be changed.
160  */
161 struct __packed vmcs12 {
162         /* According to the Intel spec, a VMCS region must start with the
163          * following two fields. Then follow implementation-specific data.
164          */
165         u32 revision_id;
166         u32 abort;
167 };
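/*
 * Only the two architectural header fields are defined so far; the guest and
 * host state that L1 programs through VMREAD/VMWRITE is presumably added to
 * this struct by later patches in the nested VMX series. Any change to the
 * layout must also be reflected in VMCS12_REVISION below.
 */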
168
169 /*
170  * VMCS12_REVISION is an arbitrary id that should be changed if the content or
171  * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
172  * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
173  */
174 #define VMCS12_REVISION 0x11e57ed0
175
176 /*
177  * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
178  * and any VMCS region. Although only sizeof(struct vmcs12) is used by the
179  * current implementation, the full 4K is reserved to avoid future complications.
180  */
181 #define VMCS12_SIZE 0x1000
182
183 /* Remembers the vmcs02 last used to run each recently used vmcs12 */
184 struct vmcs02_list {
185         struct list_head list;
186         gpa_t vmptr;
187         struct loaded_vmcs vmcs02;
188 };
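/*
 * Each entry ties an L1 vmcs12, identified by the guest-physical address it
 * was VMPTRLDed from, to the hardware vmcs02 last used to run it, so that
 * switching back to a recently used vmcs12 can reuse that vmcs02 instead of
 * setting up a fresh one.
 */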
189
190 /*
191  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
192  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
193  */
194 struct nested_vmx {
195         /* Has the level-1 guest done VMXON? */
196         bool vmxon;
197
198         /* The guest-physical address of the current VMCS L1 keeps for L2 */
199         gpa_t current_vmptr;
200         /* The host-usable pointer to the above */
201         struct page *current_vmcs12_page;
202         struct vmcs12 *current_vmcs12;
203
204         /* vmcs02_list cache of VMCSs recently used to run L2 guests */
205         struct list_head vmcs02_pool;
206         int vmcs02_num;
207 };
208
209 struct vcpu_vmx {
210         struct kvm_vcpu       vcpu;
211         unsigned long         host_rsp;
212         u8                    fail;
213         u8                    cpl;
214         bool                  nmi_known_unmasked;
215         u32                   exit_intr_info;
216         u32                   idt_vectoring_info;
217         ulong                 rflags;
218         struct shared_msr_entry *guest_msrs;
219         int                   nmsrs;
220         int                   save_nmsrs;
221 #ifdef CONFIG_X86_64
222         u64                   msr_host_kernel_gs_base;
223         u64                   msr_guest_kernel_gs_base;
224 #endif
225         /*
226          * loaded_vmcs points to the VMCS currently used in this vcpu. For a
227          * non-nested (L1) guest, it always points to vmcs01. For a nested
228          * guest (L2), it points to a different VMCS.
229          */
230         struct loaded_vmcs    vmcs01;
231         struct loaded_vmcs   *loaded_vmcs;
232         bool                  __launched; /* temporary, used in vmx_vcpu_run */
233         struct msr_autoload {
234                 unsigned nr;
235                 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
236                 struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
237         } msr_autoload;
238         struct {
239                 int           loaded;
240                 u16           fs_sel, gs_sel, ldt_sel;
241                 int           gs_ldt_reload_needed;
242                 int           fs_reload_needed;
243         } host_state;
244         struct {
245                 int vm86_active;
246                 ulong save_rflags;
247                 struct kvm_save_segment {
248                         u16 selector;
249                         unsigned long base;
250                         u32 limit;
251                         u32 ar;
252                 } tr, es, ds, fs, gs;
253         } rmode;
254         struct {
255                 u32 bitmask; /* 4 bits per segment (1 bit per field) */
256                 struct kvm_save_segment seg[8];
257         } segment_cache;
258         int vpid;
259         bool emulation_required;
260
261         /* Support for vnmi-less CPUs */
262         int soft_vnmi_blocked;
263         ktime_t entry_time;
264         s64 vnmi_blocked_time;
265         u32 exit_reason;
266
267         bool rdtscp_enabled;
268
269         /* Support for a guest hypervisor (nested VMX) */
270         struct nested_vmx nested;
271 };
272
273 enum segment_cache_field {
274         SEG_FIELD_SEL = 0,
275         SEG_FIELD_BASE = 1,
276         SEG_FIELD_LIMIT = 2,
277         SEG_FIELD_AR = 3,
278
279         SEG_FIELD_NR = 4
280 };
281
282 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
283 {
284         return container_of(vcpu, struct vcpu_vmx, vcpu);
285 }
286
287 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
288 {
289         return to_vmx(vcpu)->nested.current_vmcs12;
290 }
291
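/*
 * Helpers for mapping guest-physical addresses that L1 hands us (e.g. the
 * current vmcs12 pointer) to host pages. nested_get_page() returns NULL on
 * failure; callers pair it with nested_release_page() or
 * nested_release_page_clean() depending on whether the page was dirtied.
 */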
292 static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
293 {
294         struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
295         if (is_error_page(page)) {
296                 kvm_release_page_clean(page);
297                 return NULL;
298         }
299         return page;
300 }
301
302 static void nested_release_page(struct page *page)
303 {
304         kvm_release_page_dirty(page);
305 }
306
307 static void nested_release_page_clean(struct page *page)
308 {
309         kvm_release_page_clean(page);
310 }
311
312 static u64 construct_eptp(unsigned long root_hpa);
313 static void kvm_cpu_vmxon(u64 addr);
314 static void kvm_cpu_vmxoff(void);
315 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
316 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
317
318 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
319 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
320 /*
321  * We maintain a per-CPU linked list of VMCSs loaded on that CPU. This is needed
322  * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
323  */
324 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
325 static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
326
327 static unsigned long *vmx_io_bitmap_a;
328 static unsigned long *vmx_io_bitmap_b;
329 static unsigned long *vmx_msr_bitmap_legacy;
330 static unsigned long *vmx_msr_bitmap_longmode;
331
332 static bool cpu_has_load_ia32_efer;
333
334 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
335 static DEFINE_SPINLOCK(vmx_vpid_lock);
336
337 static struct vmcs_config {
338         int size;
339         int order;
340         u32 revision_id;
341         u32 pin_based_exec_ctrl;
342         u32 cpu_based_exec_ctrl;
343         u32 cpu_based_2nd_exec_ctrl;
344         u32 vmexit_ctrl;
345         u32 vmentry_ctrl;
346 } vmcs_config;
347
348 static struct vmx_capability {
349         u32 ept;
350         u32 vpid;
351 } vmx_capability;
352
353 #define VMX_SEGMENT_FIELD(seg)                                  \
354         [VCPU_SREG_##seg] = {                                   \
355                 .selector = GUEST_##seg##_SELECTOR,             \
356                 .base = GUEST_##seg##_BASE,                     \
357                 .limit = GUEST_##seg##_LIMIT,                   \
358                 .ar_bytes = GUEST_##seg##_AR_BYTES,             \
359         }
360
361 static struct kvm_vmx_segment_field {
362         unsigned selector;
363         unsigned base;
364         unsigned limit;
365         unsigned ar_bytes;
366 } kvm_vmx_segment_fields[] = {
367         VMX_SEGMENT_FIELD(CS),
368         VMX_SEGMENT_FIELD(DS),
369         VMX_SEGMENT_FIELD(ES),
370         VMX_SEGMENT_FIELD(FS),
371         VMX_SEGMENT_FIELD(GS),
372         VMX_SEGMENT_FIELD(SS),
373         VMX_SEGMENT_FIELD(TR),
374         VMX_SEGMENT_FIELD(LDTR),
375 };
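/*
 * This table maps each VCPU_SREG_* index to the VMCS field encodings of the
 * corresponding guest segment, so segment state can be read by index (see
 * the vmx_read_guest_seg_*() helpers further down).
 */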
376
377 static u64 host_efer;
378
379 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
380
381 /*
382  * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
383  * away by decrementing the array size.
384  */
385 static const u32 vmx_msr_index[] = {
386 #ifdef CONFIG_X86_64
387         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
388 #endif
389         MSR_EFER, MSR_TSC_AUX, MSR_STAR,
390 };
391 #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
392
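/*
 * The following predicates decode the VM-exit interruption-information
 * field: bits 7:0 hold the vector, bits 10:8 the event type and bit 31 the
 * valid bit. Each helper masks exactly those bits and compares against the
 * expected type/vector combination.
 */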
393 static inline bool is_page_fault(u32 intr_info)
394 {
395         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
396                              INTR_INFO_VALID_MASK)) ==
397                 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
398 }
399
400 static inline bool is_no_device(u32 intr_info)
401 {
402         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
403                              INTR_INFO_VALID_MASK)) ==
404                 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
405 }
406
407 static inline bool is_invalid_opcode(u32 intr_info)
408 {
409         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
410                              INTR_INFO_VALID_MASK)) ==
411                 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
412 }
413
414 static inline bool is_external_interrupt(u32 intr_info)
415 {
416         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
417                 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
418 }
419
420 static inline bool is_machine_check(u32 intr_info)
421 {
422         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
423                              INTR_INFO_VALID_MASK)) ==
424                 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
425 }
426
427 static inline bool cpu_has_vmx_msr_bitmap(void)
428 {
429         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
430 }
431
432 static inline bool cpu_has_vmx_tpr_shadow(void)
433 {
434         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
435 }
436
437 static inline bool vm_need_tpr_shadow(struct kvm *kvm)
438 {
439         return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
440 }
441
442 static inline bool cpu_has_secondary_exec_ctrls(void)
443 {
444         return vmcs_config.cpu_based_exec_ctrl &
445                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
446 }
447
448 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
449 {
450         return vmcs_config.cpu_based_2nd_exec_ctrl &
451                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
452 }
453
454 static inline bool cpu_has_vmx_flexpriority(void)
455 {
456         return cpu_has_vmx_tpr_shadow() &&
457                 cpu_has_vmx_virtualize_apic_accesses();
458 }
459
460 static inline bool cpu_has_vmx_ept_execute_only(void)
461 {
462         return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
463 }
464
465 static inline bool cpu_has_vmx_eptp_uncacheable(void)
466 {
467         return vmx_capability.ept & VMX_EPTP_UC_BIT;
468 }
469
470 static inline bool cpu_has_vmx_eptp_writeback(void)
471 {
472         return vmx_capability.ept & VMX_EPTP_WB_BIT;
473 }
474
475 static inline bool cpu_has_vmx_ept_2m_page(void)
476 {
477         return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
478 }
479
480 static inline bool cpu_has_vmx_ept_1g_page(void)
481 {
482         return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
483 }
484
485 static inline bool cpu_has_vmx_ept_4levels(void)
486 {
487         return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
488 }
489
490 static inline bool cpu_has_vmx_invept_individual_addr(void)
491 {
492         return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
493 }
494
495 static inline bool cpu_has_vmx_invept_context(void)
496 {
497         return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
498 }
499
500 static inline bool cpu_has_vmx_invept_global(void)
501 {
502         return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
503 }
504
505 static inline bool cpu_has_vmx_invvpid_single(void)
506 {
507         return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
508 }
509
510 static inline bool cpu_has_vmx_invvpid_global(void)
511 {
512         return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
513 }
514
515 static inline bool cpu_has_vmx_ept(void)
516 {
517         return vmcs_config.cpu_based_2nd_exec_ctrl &
518                 SECONDARY_EXEC_ENABLE_EPT;
519 }
520
521 static inline bool cpu_has_vmx_unrestricted_guest(void)
522 {
523         return vmcs_config.cpu_based_2nd_exec_ctrl &
524                 SECONDARY_EXEC_UNRESTRICTED_GUEST;
525 }
526
527 static inline bool cpu_has_vmx_ple(void)
528 {
529         return vmcs_config.cpu_based_2nd_exec_ctrl &
530                 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
531 }
532
533 static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
534 {
535         return flexpriority_enabled && irqchip_in_kernel(kvm);
536 }
537
538 static inline bool cpu_has_vmx_vpid(void)
539 {
540         return vmcs_config.cpu_based_2nd_exec_ctrl &
541                 SECONDARY_EXEC_ENABLE_VPID;
542 }
543
544 static inline bool cpu_has_vmx_rdtscp(void)
545 {
546         return vmcs_config.cpu_based_2nd_exec_ctrl &
547                 SECONDARY_EXEC_RDTSCP;
548 }
549
550 static inline bool cpu_has_virtual_nmis(void)
551 {
552         return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
553 }
554
555 static inline bool cpu_has_vmx_wbinvd_exit(void)
556 {
557         return vmcs_config.cpu_based_2nd_exec_ctrl &
558                 SECONDARY_EXEC_WBINVD_EXITING;
559 }
560
561 static inline bool report_flexpriority(void)
562 {
563         return flexpriority_enabled;
564 }
565
566 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
567 {
568         int i;
569
570         for (i = 0; i < vmx->nmsrs; ++i)
571                 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
572                         return i;
573         return -1;
574 }
575
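/*
 * INVVPID and INVEPT take the invalidation type in a register and a 128-bit
 * descriptor in memory (built on the stack below). On failure they set CF or
 * ZF, so the "ja 1f; ud2" sequence turns a failed invalidation into an
 * invalid-opcode trap rather than silently leaving stale TLB entries behind.
 */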
576 static inline void __invvpid(int ext, u16 vpid, gva_t gva)
577 {
578         struct {
579                 u64 vpid : 16;
580                 u64 rsvd : 48;
581                 u64 gva;
582         } operand = { vpid, 0, gva };
583
584         asm volatile (__ex(ASM_VMX_INVVPID)
585                       /* CF==1 or ZF==1 --> rc = -1 */
586                       "; ja 1f ; ud2 ; 1:"
587                       : : "a"(&operand), "c"(ext) : "cc", "memory");
588 }
589
590 static inline void __invept(int ext, u64 eptp, gpa_t gpa)
591 {
592         struct {
593                 u64 eptp, gpa;
594         } operand = {eptp, gpa};
595
596         asm volatile (__ex(ASM_VMX_INVEPT)
597                         /* CF==1 or ZF==1 --> rc = -1 */
598                         "; ja 1f ; ud2 ; 1:\n"
599                         : : "a" (&operand), "c" (ext) : "cc", "memory");
600 }
601
602 static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
603 {
604         int i;
605
606         i = __find_msr_index(vmx, msr);
607         if (i >= 0)
608                 return &vmx->guest_msrs[i];
609         return NULL;
610 }
611
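/*
 * VMCLEAR and VMPTRLD report failure through CF or ZF; "setna" (set if not
 * above, i.e. CF=1 or ZF=1) captures either condition into 'error' so the
 * failure can at least be logged.
 */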
612 static void vmcs_clear(struct vmcs *vmcs)
613 {
614         u64 phys_addr = __pa(vmcs);
615         u8 error;
616
617         asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
618                       : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
619                       : "cc", "memory");
620         if (error)
621                 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
622                        vmcs, phys_addr);
623 }
624
625 static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
626 {
627         vmcs_clear(loaded_vmcs->vmcs);
628         loaded_vmcs->cpu = -1;
629         loaded_vmcs->launched = 0;
630 }
631
632 static void vmcs_load(struct vmcs *vmcs)
633 {
634         u64 phys_addr = __pa(vmcs);
635         u8 error;
636
637         asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
638                         : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
639                         : "cc", "memory");
640         if (error)
641                 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
642                        vmcs, phys_addr);
643 }
644
645 static void __loaded_vmcs_clear(void *arg)
646 {
647         struct loaded_vmcs *loaded_vmcs = arg;
648         int cpu = raw_smp_processor_id();
649
650         if (loaded_vmcs->cpu != cpu)
651                 return; /* vcpu migration can race with cpu offline */
652         if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
653                 per_cpu(current_vmcs, cpu) = NULL;
654         list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
655         loaded_vmcs_init(loaded_vmcs);
656 }
657
658 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
659 {
660         if (loaded_vmcs->cpu != -1)
661                 smp_call_function_single(
662                         loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
663 }
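/*
 * A VMCS is cleared on the CPU where it was last loaded, hence the cross-CPU
 * call above; __loaded_vmcs_clear() also unlinks the VMCS from that CPU's
 * loaded_vmcss_on_cpu list and resets the cpu/launched bookkeeping.
 */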
664
665 static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
666 {
667         if (vmx->vpid == 0)
668                 return;
669
670         if (cpu_has_vmx_invvpid_single())
671                 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
672 }
673
674 static inline void vpid_sync_vcpu_global(void)
675 {
676         if (cpu_has_vmx_invvpid_global())
677                 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
678 }
679
680 static inline void vpid_sync_context(struct vcpu_vmx *vmx)
681 {
682         if (cpu_has_vmx_invvpid_single())
683                 vpid_sync_vcpu_single(vmx);
684         else
685                 vpid_sync_vcpu_global();
686 }
687
688 static inline void ept_sync_global(void)
689 {
690         if (cpu_has_vmx_invept_global())
691                 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
692 }
693
694 static inline void ept_sync_context(u64 eptp)
695 {
696         if (enable_ept) {
697                 if (cpu_has_vmx_invept_context())
698                         __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
699                 else
700                         ept_sync_global();
701         }
702 }
703
704 static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
705 {
706         if (enable_ept) {
707                 if (cpu_has_vmx_invept_individual_addr())
708                         __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
709                                         eptp, gpa);
710                 else
711                         ept_sync_context(eptp);
712         }
713 }
714
715 static __always_inline unsigned long vmcs_readl(unsigned long field)
716 {
717         unsigned long value;
718
719         asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
720                       : "=a"(value) : "d"(field) : "cc");
721         return value;
722 }
723
724 static __always_inline u16 vmcs_read16(unsigned long field)
725 {
726         return vmcs_readl(field);
727 }
728
729 static __always_inline u32 vmcs_read32(unsigned long field)
730 {
731         return vmcs_readl(field);
732 }
733
734 static __always_inline u64 vmcs_read64(unsigned long field)
735 {
736 #ifdef CONFIG_X86_64
737         return vmcs_readl(field);
738 #else
739         return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
740 #endif
741 }
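/*
 * On 32-bit hosts a 64-bit VMCS field cannot be accessed in one go, so
 * vmcs_read64()/vmcs_write64() access it as two 32-bit halves; the high half
 * uses the field encoding plus one (the *_HIGH encodings).
 */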
742
743 static noinline void vmwrite_error(unsigned long field, unsigned long value)
744 {
745         printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
746                field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
747         dump_stack();
748 }
749
750 static void vmcs_writel(unsigned long field, unsigned long value)
751 {
752         u8 error;
753
754         asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
755                        : "=q"(error) : "a"(value), "d"(field) : "cc");
756         if (unlikely(error))
757                 vmwrite_error(field, value);
758 }
759
760 static void vmcs_write16(unsigned long field, u16 value)
761 {
762         vmcs_writel(field, value);
763 }
764
765 static void vmcs_write32(unsigned long field, u32 value)
766 {
767         vmcs_writel(field, value);
768 }
769
770 static void vmcs_write64(unsigned long field, u64 value)
771 {
772         vmcs_writel(field, value);
773 #ifndef CONFIG_X86_64
774         asm volatile ("");
775         vmcs_writel(field+1, value >> 32);
776 #endif
777 }
778
779 static void vmcs_clear_bits(unsigned long field, u32 mask)
780 {
781         vmcs_writel(field, vmcs_readl(field) & ~mask);
782 }
783
784 static void vmcs_set_bits(unsigned long field, u32 mask)
785 {
786         vmcs_writel(field, vmcs_readl(field) | mask);
787 }
788
789 static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
790 {
791         vmx->segment_cache.bitmask = 0;
792 }
793
794 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
795                                        unsigned field)
796 {
797         bool ret;
798         u32 mask = 1 << (seg * SEG_FIELD_NR + field);
799
800         if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
801                 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
802                 vmx->segment_cache.bitmask = 0;
803         }
804         ret = vmx->segment_cache.bitmask & mask;
805         vmx->segment_cache.bitmask |= mask;
806         return ret;
807 }
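/*
 * The segment cache keeps one validity bit per (segment, field) pair in
 * 'bitmask'. vmx_segment_cache_test_set() reports whether a field was
 * already cached while marking it cached, so each guest segment field is
 * VMREAD at most once until the cache is invalidated again.
 */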
808
809 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
810 {
811         u16 *p = &vmx->segment_cache.seg[seg].selector;
812
813         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
814                 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
815         return *p;
816 }
817
818 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
819 {
820         ulong *p = &vmx->segment_cache.seg[seg].base;
821
822         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
823                 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
824         return *p;
825 }
826
827 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
828 {
829         u32 *p = &vmx->segment_cache.seg[seg].limit;
830
831         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
832                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
833         return *p;
834 }
835
836 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
837 {
838         u32 *p = &vmx->segment_cache.seg[seg].ar;
839
840         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
841                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
842         return *p;
843 }
844
845 static void update_exception_bitmap(struct kvm_vcpu *vcpu)
846 {
847         u32 eb;
848
849         eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
850              (1u << NM_VECTOR) | (1u << DB_VECTOR);
851         if ((vcpu->guest_debug &
852              (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
853             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
854                 eb |= 1u << BP_VECTOR;
855         if (to_vmx(vcpu)->rmode.vm86_active)
856                 eb = ~0;
857         if (enable_ept)
858                 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
859         if (vcpu->fpu_active)
860                 eb &= ~(1u << NM_VECTOR);
861         vmcs_write32(EXCEPTION_BITMAP, eb);
862 }
863
864 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
865 {
866         unsigned i;
867         struct msr_autoload *m = &vmx->msr_autoload;
868
869         if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
870                 vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
871                 vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
872                 return;
873         }
874
875         for (i = 0; i < m->nr; ++i)
876                 if (m->guest[i].index == msr)
877                         break;
878
879         if (i == m->nr)
880                 return;
881         --m->nr;
882         m->guest[i] = m->guest[m->nr];
883         m->host[i] = m->host[m->nr];
884         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
885         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
886 }
887
888 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
889                                   u64 guest_val, u64 host_val)
890 {
891         unsigned i;
892         struct msr_autoload *m = &vmx->msr_autoload;
893
894         if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
895                 vmcs_write64(GUEST_IA32_EFER, guest_val);
896                 vmcs_write64(HOST_IA32_EFER, host_val);
897                 vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
898                 vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
899                 return;
900         }
901
902         for (i = 0; i < m->nr; ++i)
903                 if (m->guest[i].index == msr)
904                         break;
905
906         if (i == m->nr) {
907                 ++m->nr;
908                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
909                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
910         }
911
912         m->guest[i].index = msr;
913         m->guest[i].value = guest_val;
914         m->host[i].index = msr;
915         m->host[i].value = host_val;
916 }
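/*
 * The guest[]/host[] arrays above feed the VM-entry and VM-exit MSR-load
 * lists, letting the CPU swap the listed MSR values atomically on every
 * entry and exit. EFER is special-cased: when the dedicated "load IA32_EFER"
 * entry/exit controls exist they are used instead of burning one of the
 * NR_AUTOLOAD_MSRS slots.
 */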
917
918 static void reload_tss(void)
919 {
920         /*
921          * VT restores TR but not its size.  Useless.
922          */
923         struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
924         struct desc_struct *descs;
925
926         descs = (void *)gdt->address;
927         descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
928         load_TR_desc();
929 }
930
931 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
932 {
933         u64 guest_efer;
934         u64 ignore_bits;
935
936         guest_efer = vmx->vcpu.arch.efer;
937
938         /*
939          * NX is emulated; LMA and LME are handled by hardware; SCE is meaningless
940          * outside long mode
941          */
942         ignore_bits = EFER_NX | EFER_SCE;
943 #ifdef CONFIG_X86_64
944         ignore_bits |= EFER_LMA | EFER_LME;
945         /* SCE is meaningful only in long mode on Intel */
946         if (guest_efer & EFER_LMA)
947                 ignore_bits &= ~(u64)EFER_SCE;
948 #endif
949         guest_efer &= ~ignore_bits;
950         guest_efer |= host_efer & ignore_bits;
951         vmx->guest_msrs[efer_offset].data = guest_efer;
952         vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
953
954         clear_atomic_switch_msr(vmx, MSR_EFER);
955         /* On ept, can't emulate nx, and must switch nx atomically */
956         if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
957                 guest_efer = vmx->vcpu.arch.efer;
958                 if (!(guest_efer & EFER_LMA))
959                         guest_efer &= ~EFER_LME;
960                 add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
961                 return false;
962         }
963
964         return true;
965 }
966
967 static unsigned long segment_base(u16 selector)
968 {
969         struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
970         struct desc_struct *d;
971         unsigned long table_base;
972         unsigned long v;
973
974         if (!(selector & ~3))
975                 return 0;
976
977         table_base = gdt->address;
978
979         if (selector & 4) {           /* from ldt */
980                 u16 ldt_selector = kvm_read_ldt();
981
982                 if (!(ldt_selector & ~3))
983                         return 0;
984
985                 table_base = segment_base(ldt_selector);
986         }
987         d = (struct desc_struct *)(table_base + (selector & ~7));
988         v = get_desc_base(d);
989 #ifdef CONFIG_X86_64
990         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
991                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
992 #endif
993         return v;
994 }
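/*
 * segment_base() recovers a host segment base by walking the GDT, or the
 * LDT when the selector's TI bit (bit 2) is set; it backs the HOST_TR_BASE
 * and (on 32-bit) HOST_FS_BASE/HOST_GS_BASE writes below. On 64-bit, system
 * descriptors such as the TSS and LDT carry an extra base[63:32] dword,
 * handled by the ldttss_desc64 cast.
 */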
995
996 static inline unsigned long kvm_read_tr_base(void)
997 {
998         u16 tr;
999         asm("str %0" : "=g"(tr));
1000         return segment_base(tr);
1001 }
1002
1003 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
1004 {
1005         struct vcpu_vmx *vmx = to_vmx(vcpu);
1006         int i;
1007
1008         if (vmx->host_state.loaded)
1009                 return;
1010
1011         vmx->host_state.loaded = 1;
1012         /*
1013          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
1014          * allow segment selectors with cpl > 0 or ti == 1.
1015          */
1016         vmx->host_state.ldt_sel = kvm_read_ldt();
1017         vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
1018         savesegment(fs, vmx->host_state.fs_sel);
1019         if (!(vmx->host_state.fs_sel & 7)) {
1020                 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
1021                 vmx->host_state.fs_reload_needed = 0;
1022         } else {
1023                 vmcs_write16(HOST_FS_SELECTOR, 0);
1024                 vmx->host_state.fs_reload_needed = 1;
1025         }
1026         savesegment(gs, vmx->host_state.gs_sel);
1027         if (!(vmx->host_state.gs_sel & 7))
1028                 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
1029         else {
1030                 vmcs_write16(HOST_GS_SELECTOR, 0);
1031                 vmx->host_state.gs_ldt_reload_needed = 1;
1032         }
1033
1034 #ifdef CONFIG_X86_64
1035         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
1036         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
1037 #else
1038         vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
1039         vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
1040 #endif
1041
1042 #ifdef CONFIG_X86_64
1043         rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1044         if (is_long_mode(&vmx->vcpu))
1045                 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1046 #endif
1047         for (i = 0; i < vmx->save_nmsrs; ++i)
1048                 kvm_set_shared_msr(vmx->guest_msrs[i].index,
1049                                    vmx->guest_msrs[i].data,
1050                                    vmx->guest_msrs[i].mask);
1051 }
1052
1053 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
1054 {
1055         if (!vmx->host_state.loaded)
1056                 return;
1057
1058         ++vmx->vcpu.stat.host_state_reload;
1059         vmx->host_state.loaded = 0;
1060 #ifdef CONFIG_X86_64
1061         if (is_long_mode(&vmx->vcpu))
1062                 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1063 #endif
1064         if (vmx->host_state.gs_ldt_reload_needed) {
1065                 kvm_load_ldt(vmx->host_state.ldt_sel);
1066 #ifdef CONFIG_X86_64
1067                 load_gs_index(vmx->host_state.gs_sel);
1068 #else
1069                 loadsegment(gs, vmx->host_state.gs_sel);
1070 #endif
1071         }
1072         if (vmx->host_state.fs_reload_needed)
1073                 loadsegment(fs, vmx->host_state.fs_sel);
1074         reload_tss();
1075 #ifdef CONFIG_X86_64
1076         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1077 #endif
1078         if (current_thread_info()->status & TS_USEDFPU)
1079                 clts();
1080         load_gdt(&__get_cpu_var(host_gdt));
1081 }
1082
1083 static void vmx_load_host_state(struct vcpu_vmx *vmx)
1084 {
1085         preempt_disable();
1086         __vmx_load_host_state(vmx);
1087         preempt_enable();
1088 }
1089
1090 /*
1091  * Switches to the specified vcpu, until a matching vcpu_put(); assumes
1092  * the vcpu mutex is already taken.
1093  */
1094 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1095 {
1096         struct vcpu_vmx *vmx = to_vmx(vcpu);
1097         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1098
1099         if (!vmm_exclusive)
1100                 kvm_cpu_vmxon(phys_addr);
1101         else if (vmx->loaded_vmcs->cpu != cpu)
1102                 loaded_vmcs_clear(vmx->loaded_vmcs);
1103
1104         if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
1105                 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1106                 vmcs_load(vmx->loaded_vmcs->vmcs);
1107         }
1108
1109         if (vmx->loaded_vmcs->cpu != cpu) {
1110                 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
1111                 unsigned long sysenter_esp;
1112
1113                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1114                 local_irq_disable();
1115                 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1116                          &per_cpu(loaded_vmcss_on_cpu, cpu));
1117                 local_irq_enable();
1118
1119                 /*
1120                  * Linux uses per-cpu TSS and GDT, so set these when switching
1121                  * processors.
1122                  */
1123                 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
1124                 vmcs_writel(HOST_GDTR_BASE, gdt->address);   /* 22.2.4 */
1125
1126                 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
1127                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
1128                 vmx->loaded_vmcs->cpu = cpu;
1129         }
1130 }
1131
1132 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1133 {
1134         __vmx_load_host_state(to_vmx(vcpu));
1135         if (!vmm_exclusive) {
1136                 __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
1137                 vcpu->cpu = -1;
1138                 kvm_cpu_vmxoff();
1139         }
1140 }
1141
1142 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
1143 {
1144         ulong cr0;
1145
1146         if (vcpu->fpu_active)
1147                 return;
1148         vcpu->fpu_active = 1;
1149         cr0 = vmcs_readl(GUEST_CR0);
1150         cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
1151         cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
1152         vmcs_writel(GUEST_CR0, cr0);
1153         update_exception_bitmap(vcpu);
1154         vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
1155         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1156 }
1157
1158 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
1159
1160 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
1161 {
1162         vmx_decache_cr0_guest_bits(vcpu);
1163         vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
1164         update_exception_bitmap(vcpu);
1165         vcpu->arch.cr0_guest_owned_bits = 0;
1166         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1167         vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
1168 }
1169
1170 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1171 {
1172         unsigned long rflags, save_rflags;
1173
1174         if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
1175                 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1176                 rflags = vmcs_readl(GUEST_RFLAGS);
1177                 if (to_vmx(vcpu)->rmode.vm86_active) {
1178                         rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1179                         save_rflags = to_vmx(vcpu)->rmode.save_rflags;
1180                         rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1181                 }
1182                 to_vmx(vcpu)->rflags = rflags;
1183         }
1184         return to_vmx(vcpu)->rflags;
1185 }
1186
1187 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1188 {
1189         __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1190         __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
1191         to_vmx(vcpu)->rflags = rflags;
1192         if (to_vmx(vcpu)->rmode.vm86_active) {
1193                 to_vmx(vcpu)->rmode.save_rflags = rflags;
1194                 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1195         }
1196         vmcs_writel(GUEST_RFLAGS, rflags);
1197 }
1198
1199 static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1200 {
1201         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1202         int ret = 0;
1203
1204         if (interruptibility & GUEST_INTR_STATE_STI)
1205                 ret |= KVM_X86_SHADOW_INT_STI;
1206         if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1207                 ret |= KVM_X86_SHADOW_INT_MOV_SS;
1208
1209         return ret & mask;
1210 }
1211
1212 static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1213 {
1214         u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1215         u32 interruptibility = interruptibility_old;
1216
1217         interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1218
1219         if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1220                 interruptibility |= GUEST_INTR_STATE_MOV_SS;
1221         else if (mask & KVM_X86_SHADOW_INT_STI)
1222                 interruptibility |= GUEST_INTR_STATE_STI;
1223
1224                 if (interruptibility != interruptibility_old)
1225                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1226 }
1227
1228 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
1229 {
1230         unsigned long rip;
1231
1232         rip = kvm_rip_read(vcpu);
1233         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1234         kvm_rip_write(vcpu, rip);
1235
1236         /* skipping an emulated instruction also counts */
1237         vmx_set_interrupt_shadow(vcpu, 0);
1238 }
1239
1240 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1241 {
1242         /* Ensure that we clear the HLT state in the VMCS.  We don't need to
1243          * explicitly skip the instruction because if the HLT state is set, then
1244          * the instruction is already executing and RIP has already been
1245          * advanced. */
1246         if (!yield_on_hlt &&
1247             vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1248                 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1249 }
1250
1251 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1252                                 bool has_error_code, u32 error_code,
1253                                 bool reinject)
1254 {
1255         struct vcpu_vmx *vmx = to_vmx(vcpu);
1256         u32 intr_info = nr | INTR_INFO_VALID_MASK;
1257
1258         if (has_error_code) {
1259                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1260                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1261         }
1262
1263         if (vmx->rmode.vm86_active) {
1264                 int inc_eip = 0;
1265                 if (kvm_exception_is_soft(nr))
1266                         inc_eip = vcpu->arch.event_exit_inst_len;
1267                 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
1268                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
1269                 return;
1270         }
1271
1272         if (kvm_exception_is_soft(nr)) {
1273                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1274                              vmx->vcpu.arch.event_exit_inst_len);
1275                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1276         } else
1277                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1278
1279         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1280         vmx_clear_hlt(vcpu);
1281 }
1282
1283 static bool vmx_rdtscp_supported(void)
1284 {
1285         return cpu_has_vmx_rdtscp();
1286 }
1287
1288 /*
1289  * Swap MSR entry in host/guest MSR entry array.
1290  */
1291 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
1292 {
1293         struct shared_msr_entry tmp;
1294
1295         tmp = vmx->guest_msrs[to];
1296         vmx->guest_msrs[to] = vmx->guest_msrs[from];
1297         vmx->guest_msrs[from] = tmp;
1298 }
1299
1300 /*
1301  * Set up the vmcs to automatically save and restore system
1302  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
1303  * mode, as fiddling with msrs is very expensive.
1304  */
1305 static void setup_msrs(struct vcpu_vmx *vmx)
1306 {
1307         int save_nmsrs, index;
1308         unsigned long *msr_bitmap;
1309
1310         vmx_load_host_state(vmx);
1311         save_nmsrs = 0;
1312 #ifdef CONFIG_X86_64
1313         if (is_long_mode(&vmx->vcpu)) {
1314                 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
1315                 if (index >= 0)
1316                         move_msr_up(vmx, index, save_nmsrs++);
1317                 index = __find_msr_index(vmx, MSR_LSTAR);
1318                 if (index >= 0)
1319                         move_msr_up(vmx, index, save_nmsrs++);
1320                 index = __find_msr_index(vmx, MSR_CSTAR);
1321                 if (index >= 0)
1322                         move_msr_up(vmx, index, save_nmsrs++);
1323                 index = __find_msr_index(vmx, MSR_TSC_AUX);
1324                 if (index >= 0 && vmx->rdtscp_enabled)
1325                         move_msr_up(vmx, index, save_nmsrs++);
1326                 /*
1327                  * MSR_STAR is only needed on long mode guests, and only
1328                  * if efer.sce is enabled.
1329                  */
1330                 index = __find_msr_index(vmx, MSR_STAR);
1331                 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
1332                         move_msr_up(vmx, index, save_nmsrs++);
1333         }
1334 #endif
1335         index = __find_msr_index(vmx, MSR_EFER);
1336         if (index >= 0 && update_transition_efer(vmx, index))
1337                 move_msr_up(vmx, index, save_nmsrs++);
1338
1339         vmx->save_nmsrs = save_nmsrs;
1340
1341         if (cpu_has_vmx_msr_bitmap()) {
1342                 if (is_long_mode(&vmx->vcpu))
1343                         msr_bitmap = vmx_msr_bitmap_longmode;
1344                 else
1345                         msr_bitmap = vmx_msr_bitmap_legacy;
1346
1347                 vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
1348         }
1349 }
1350
1351 /*
1352  * reads and returns guest's timestamp counter "register"
1353  * guest_tsc = host_tsc + tsc_offset    -- 21.3
1354  */
1355 static u64 guest_read_tsc(void)
1356 {
1357         u64 host_tsc, tsc_offset;
1358
1359         rdtscll(host_tsc);
1360         tsc_offset = vmcs_read64(TSC_OFFSET);
1361         return host_tsc + tsc_offset;
1362 }
1363
1364 /*
1365  * Empty callback. It needs a real implementation once VMX supports the
1366  * SET_TSC_KHZ ioctl; at that point the callback should update internal vmx
1367  * state to make the change effective.
1368  */
1369 static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
1370 {
1371         /* Nothing to do here */
1372 }
1373
1374 /*
1375  * writes 'offset' into guest's timestamp counter offset register
1376  */
1377 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1378 {
1379         vmcs_write64(TSC_OFFSET, offset);
1380 }
1381
1382 static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
1383 {
1384         u64 offset = vmcs_read64(TSC_OFFSET);
1385         vmcs_write64(TSC_OFFSET, offset + adjustment);
1386 }
1387
1388 static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1389 {
1390         return target_tsc - native_read_tsc();
1391 }
1392
1393 static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
1394 {
1395         struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
1396         return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
1397 }
1398
1399 /*
1400  * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1401  * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1402  * all guests if the "nested" module option is off, and can also be disabled
1403  * for a single guest by disabling its VMX cpuid bit.
1404  */
1405 static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
1406 {
1407         return nested && guest_cpuid_has_vmx(vcpu);
1408 }
1409
1410 /*
1411  * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
1412  * returned for the various VMX controls MSRs when nested VMX is enabled.
1413  * The same values should also be used to verify that vmcs12 control fields are
1414  * valid during nested entry from L1 to L2.
1415  * Each of these control msrs has a low and high 32-bit half: A low bit is on
1416  * if the corresponding bit in the (32-bit) control field *must* be on, and a
1417  * bit in the high half is on if the corresponding bit in the control field
1418  * may be on. See also vmx_control_verify().
1419  * TODO: allow these variables to be modified (downgraded) by module options
1420  * or other means.
1421  */
1422 static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
1423 static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
1424 static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
1425 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
1426 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
1427 static __init void nested_vmx_setup_ctls_msrs(void)
1428 {
1429         /*
1430          * Note that as a general rule, the high half of the MSRs (bits in
1431          * the control fields which may be 1) should be initialized by the
1432          * intersection of the underlying hardware's MSR (i.e., features which
1433          * can be supported) and the list of features we want to expose -
1434          * because they are known to be properly supported in our code.
1435          * Also, usually, the low half of the MSRs (bits which must be 1) can
1436          * be set to 0, meaning that L1 may turn off any of these bits. The
1437          * reason is that if one of these bits is needed by L0 itself, it is
1438          * already set in vmcs01; prepare_vmcs02, which bitwise-ORs the control
1439          * fields of vmcs01 and vmcs12, then keeps it set in vmcs02 even if L1
1440          * cleared it - and nested_vmx_exit_handled() will not pass the related
1441          * exits to L1. These rules have exceptions below.
1442          */
1443
1444         /* pin-based controls */
1445         /*
1446          * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
1447          * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
1448          */
1449         nested_vmx_pinbased_ctls_low = 0x16;
1450         nested_vmx_pinbased_ctls_high = 0x16 |
1451                 PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
1452                 PIN_BASED_VIRTUAL_NMIS;
1453
1454         /* exit controls */
1455         nested_vmx_exit_ctls_low = 0;
1456 #ifdef CONFIG_X86_64
1457         nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
1458 #else
1459         nested_vmx_exit_ctls_high = 0;
1460 #endif
1461
1462         /* entry controls */
1463         rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
1464                 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
1465         nested_vmx_entry_ctls_low = 0;
1466         nested_vmx_entry_ctls_high &=
1467                 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
1468
1469         /* cpu-based controls */
1470         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
1471                 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
1472         nested_vmx_procbased_ctls_low = 0;
1473         nested_vmx_procbased_ctls_high &=
1474                 CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING |
1475                 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
1476                 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
1477                 CPU_BASED_CR3_STORE_EXITING |
1478 #ifdef CONFIG_X86_64
1479                 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
1480 #endif
1481                 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
1482                 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
1483                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1484         /*
1485          * We can allow some features even when not supported by the
1486          * hardware. For example, L1 can specify an MSR bitmap - and we
1487          * can use it to avoid exits to L1 - even when L0 runs L2
1488          * without MSR bitmaps.
1489          */
1490         nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS;
1491
1492         /* secondary cpu-based controls */
1493         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
1494                 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
1495         nested_vmx_secondary_ctls_low = 0;
1496         nested_vmx_secondary_ctls_high &=
1497                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1498 }
1499
1500 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
1501 {
1502         /*
1503          * Bits that are 0 in 'high' must be 0 in 'control'; bits that are 1 in 'low' must be 1.
1504          */
1505         return ((control & high) | low) == control;
1506 }
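/*
 * Worked example with made-up numbers: for low=0x16 and high=0x401e, control
 * values 0x16 or 0x1e verify, while 0x14 (required bit 0x2 cleared) or
 * 0x8016 (bit 0x8000 not allowed by 'high') do not.
 */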
1507
1508 static inline u64 vmx_control_msr(u32 low, u32 high)
1509 {
1510         return low | ((u64)high << 32);
1511 }
1512
1513 /*
1514  * If we allow our guest to use VMX instructions (i.e., nested VMX), we should
1515  * also let it use VMX-specific MSRs.
1516  * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a
1517  * VMX-specific MSR, or 0 when we haven't (and the caller should handle it
1518  * like all other MSRs).
1519  */
1520 static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1521 {
1522         if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC &&
1523                      msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) {
1524                 /*
1525                  * According to the spec, processors which do not support VMX
1526                  * should throw a #GP(0) when VMX capability MSRs are read.
1527                  */
1528                 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
1529                 return 1;
1530         }
1531
1532         switch (msr_index) {
1533         case MSR_IA32_FEATURE_CONTROL:
1534                 *pdata = 0;
1535                 break;
1536         case MSR_IA32_VMX_BASIC:
1537                 /*
1538                  * This MSR reports some information about VMX support. We
1539                  * should return information about the VMX we emulate for the
1540                  * guest, and the VMCS structure we give it - not about the
1541                  * VMX support of the underlying hardware.
1542                  */
1543                 *pdata = VMCS12_REVISION |
1544                            ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
1545                            (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
1546                 break;
1547         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1548         case MSR_IA32_VMX_PINBASED_CTLS:
1549                 *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
1550                                         nested_vmx_pinbased_ctls_high);
1551                 break;
1552         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1553         case MSR_IA32_VMX_PROCBASED_CTLS:
1554                 *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
1555                                         nested_vmx_procbased_ctls_high);
1556                 break;
1557         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1558         case MSR_IA32_VMX_EXIT_CTLS:
1559                 *pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
1560                                         nested_vmx_exit_ctls_high);
1561                 break;
1562         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1563         case MSR_IA32_VMX_ENTRY_CTLS:
1564                 *pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
1565                                         nested_vmx_entry_ctls_high);
1566                 break;
1567         case MSR_IA32_VMX_MISC:
1568                 *pdata = 0;
1569                 break;
1570         /*
1571          * These MSRs specify bits which the guest must keep fixed (on or off)
1572          * while L1 is in VMXON mode (in L1's root mode, or running an L2).
1573          * We picked the standard core2 setting.
1574          */
1575 #define VMXON_CR0_ALWAYSON      (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
1576 #define VMXON_CR4_ALWAYSON      X86_CR4_VMXE
1577         case MSR_IA32_VMX_CR0_FIXED0:
1578                 *pdata = VMXON_CR0_ALWAYSON;
1579                 break;
1580         case MSR_IA32_VMX_CR0_FIXED1:
1581                 *pdata = -1ULL;
1582                 break;
1583         case MSR_IA32_VMX_CR4_FIXED0:
1584                 *pdata = VMXON_CR4_ALWAYSON;
1585                 break;
1586         case MSR_IA32_VMX_CR4_FIXED1:
1587                 *pdata = -1ULL;
1588                 break;
1589         case MSR_IA32_VMX_VMCS_ENUM:
1590                 *pdata = 0x1f;
1591                 break;
1592         case MSR_IA32_VMX_PROCBASED_CTLS2:
1593                 *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
1594                                         nested_vmx_secondary_ctls_high);
1595                 break;
1596         case MSR_IA32_VMX_EPT_VPID_CAP:
1597                 /* Currently, no nested ept or nested vpid */
1598                 *pdata = 0;
1599                 break;
1600         default:
1601                 return 0;
1602         }
1603
1604         return 1;
1605 }
1606
1607 static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1608 {
1609         if (!nested_vmx_allowed(vcpu))
1610                 return 0;
1611
1612         if (msr_index == MSR_IA32_FEATURE_CONTROL)
1613                 /* TODO: the right thing. */
1614                 return 1;
1615         /*
1616          * No need to treat VMX capability MSRs specially: If we don't handle
1617          * them, handle_wrmsr will #GP(0), which is correct (they are read-only).
1618          */
1619         return 0;
1620 }
1621
1622 /*
1623  * Reads an msr value (of 'msr_index') into 'pdata'.
1624  * Returns 0 on success, non-0 otherwise.
1625  * Assumes vcpu_load() was already called.
1626  */
1627 static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1628 {
1629         u64 data;
1630         struct shared_msr_entry *msr;
1631
1632         if (!pdata) {
1633                 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
1634                 return -EINVAL;
1635         }
1636
1637         switch (msr_index) {
1638 #ifdef CONFIG_X86_64
1639         case MSR_FS_BASE:
1640                 data = vmcs_readl(GUEST_FS_BASE);
1641                 break;
1642         case MSR_GS_BASE:
1643                 data = vmcs_readl(GUEST_GS_BASE);
1644                 break;
1645         case MSR_KERNEL_GS_BASE:
1646                 vmx_load_host_state(to_vmx(vcpu));
1647                 data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
1648                 break;
1649 #endif
1650         case MSR_EFER:
1651                 return kvm_get_msr_common(vcpu, msr_index, pdata);
1652         case MSR_IA32_TSC:
1653                 data = guest_read_tsc();
1654                 break;
1655         case MSR_IA32_SYSENTER_CS:
1656                 data = vmcs_read32(GUEST_SYSENTER_CS);
1657                 break;
1658         case MSR_IA32_SYSENTER_EIP:
1659                 data = vmcs_readl(GUEST_SYSENTER_EIP);
1660                 break;
1661         case MSR_IA32_SYSENTER_ESP:
1662                 data = vmcs_readl(GUEST_SYSENTER_ESP);
1663                 break;
1664         case MSR_TSC_AUX:
1665                 if (!to_vmx(vcpu)->rdtscp_enabled)
1666                         return 1;
1667                 /* Otherwise falls through */
1668         default:
1669                 vmx_load_host_state(to_vmx(vcpu));
1670                 if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
1671                         return 0;
1672                 msr = find_msr_entry(to_vmx(vcpu), msr_index);
1673                 if (msr) {
1674                         vmx_load_host_state(to_vmx(vcpu));
1675                         data = msr->data;
1676                         break;
1677                 }
1678                 return kvm_get_msr_common(vcpu, msr_index, pdata);
1679         }
1680
1681         *pdata = data;
1682         return 0;
1683 }
1684
1685 /*
1686  * Writes an msr value into the appropriate "register".
1687  * Returns 0 on success, non-0 otherwise.
1688  * Assumes vcpu_load() was already called.
1689  */
1690 static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1691 {
1692         struct vcpu_vmx *vmx = to_vmx(vcpu);
1693         struct shared_msr_entry *msr;
1694         int ret = 0;
1695
1696         switch (msr_index) {
1697         case MSR_EFER:
1698                 vmx_load_host_state(vmx);
1699                 ret = kvm_set_msr_common(vcpu, msr_index, data);
1700                 break;
1701 #ifdef CONFIG_X86_64
1702         case MSR_FS_BASE:
1703                 vmx_segment_cache_clear(vmx);
1704                 vmcs_writel(GUEST_FS_BASE, data);
1705                 break;
1706         case MSR_GS_BASE:
1707                 vmx_segment_cache_clear(vmx);
1708                 vmcs_writel(GUEST_GS_BASE, data);
1709                 break;
1710         case MSR_KERNEL_GS_BASE:
1711                 vmx_load_host_state(vmx);
1712                 vmx->msr_guest_kernel_gs_base = data;
1713                 break;
1714 #endif
1715         case MSR_IA32_SYSENTER_CS:
1716                 vmcs_write32(GUEST_SYSENTER_CS, data);
1717                 break;
1718         case MSR_IA32_SYSENTER_EIP:
1719                 vmcs_writel(GUEST_SYSENTER_EIP, data);
1720                 break;
1721         case MSR_IA32_SYSENTER_ESP:
1722                 vmcs_writel(GUEST_SYSENTER_ESP, data);
1723                 break;
1724         case MSR_IA32_TSC:
1725                 kvm_write_tsc(vcpu, data);
1726                 break;
1727         case MSR_IA32_CR_PAT:
1728                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
1729                         vmcs_write64(GUEST_IA32_PAT, data);
1730                         vcpu->arch.pat = data;
1731                         break;
1732                 }
1733                 ret = kvm_set_msr_common(vcpu, msr_index, data);
1734                 break;
1735         case MSR_TSC_AUX:
1736                 if (!vmx->rdtscp_enabled)
1737                         return 1;
1738                 /* Check reserved bit, higher 32 bits should be zero */
1739                 if ((data >> 32) != 0)
1740                         return 1;
1741                 /* Otherwise falls through */
1742         default:
1743                 if (vmx_set_vmx_msr(vcpu, msr_index, data))
1744                         break;
1745                 msr = find_msr_entry(vmx, msr_index);
1746                 if (msr) {
1747                         vmx_load_host_state(vmx);
1748                         msr->data = data;
1749                         break;
1750                 }
1751                 ret = kvm_set_msr_common(vcpu, msr_index, data);
1752         }
1753
1754         return ret;
1755 }
1756
1757 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1758 {
1759         __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
1760         switch (reg) {
1761         case VCPU_REGS_RSP:
1762                 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
1763                 break;
1764         case VCPU_REGS_RIP:
1765                 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
1766                 break;
1767         case VCPU_EXREG_PDPTR:
1768                 if (enable_ept)
1769                         ept_save_pdptrs(vcpu);
1770                 break;
1771         default:
1772                 break;
1773         }
1774 }
1775
1776 static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1777 {
1778         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1779                 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
1780         else
1781                 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
1782
1783         update_exception_bitmap(vcpu);
1784 }
1785
1786 static __init int cpu_has_kvm_support(void)
1787 {
1788         return cpu_has_vmx();
1789 }
1790
1791 static __init int vmx_disabled_by_bios(void)
1792 {
1793         u64 msr;
1794
1795         rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1796         if (msr & FEATURE_CONTROL_LOCKED) {
1797                 /* launched w/ TXT and VMX disabled */
1798                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
1799                         && tboot_enabled())
1800                         return 1;
1801                 /* launched w/o TXT and VMX only enabled w/ TXT */
1802                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1803                         && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
1804                         && !tboot_enabled()) {
1805                         printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
1806                                 "activate TXT before enabling KVM\n");
1807                         return 1;
1808                 }
1809                 /* launched w/o TXT and VMX disabled */
1810                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1811                         && !tboot_enabled())
1812                         return 1;
1813         }
1814
1815         return 0;
1816 }
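
/*
 * Sketch of the IA32_FEATURE_CONTROL decoding above, with illustrative
 * values on a non-TXT boot (bit 0 = lock, bit 1 = VMXON inside SMX,
 * bit 2 = VMXON outside SMX):
 *
 *      msr == 0x5   locked, VMXON-outside-SMX enabled -> returns 0 (VMX usable)
 *      msr == 0x1   locked, both VMXON bits clear     -> returns 1 (disabled by BIOS)
 *      msr == 0x0   unlocked                          -> returns 0; hardware_enable()
 *                                                        sets and locks the bits later
 */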
1817
1818 static void kvm_cpu_vmxon(u64 addr)
1819 {
1820         asm volatile (ASM_VMX_VMXON_RAX
1821                         : : "a"(&addr), "m"(addr)
1822                         : "memory", "cc");
1823 }
1824
1825 static int hardware_enable(void *garbage)
1826 {
1827         int cpu = raw_smp_processor_id();
1828         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1829         u64 old, test_bits;
1830
1831         if (read_cr4() & X86_CR4_VMXE)
1832                 return -EBUSY;
1833
1834         INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
1835         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1836
1837         test_bits = FEATURE_CONTROL_LOCKED;
1838         test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1839         if (tboot_enabled())
1840                 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
1841
1842         if ((old & test_bits) != test_bits) {
1843                 /* enable and lock */
1844                 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1845         }
1846         write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1847
1848         if (vmm_exclusive) {
1849                 kvm_cpu_vmxon(phys_addr);
1850                 ept_sync_global();
1851         }
1852
1853         store_gdt(&__get_cpu_var(host_gdt));
1854
1855         return 0;
1856 }
1857
1858 static void vmclear_local_loaded_vmcss(void)
1859 {
1860         int cpu = raw_smp_processor_id();
1861         struct loaded_vmcs *v, *n;
1862
1863         list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
1864                                  loaded_vmcss_on_cpu_link)
1865                 __loaded_vmcs_clear(v);
1866 }
1867
1868
1869 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
1870  * tricks.
1871  */
1872 static void kvm_cpu_vmxoff(void)
1873 {
1874         asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
1875 }
1876
1877 static void hardware_disable(void *garbage)
1878 {
1879         if (vmm_exclusive) {
1880                 vmclear_local_loaded_vmcss();
1881                 kvm_cpu_vmxoff();
1882         }
1883         write_cr4(read_cr4() & ~X86_CR4_VMXE);
1884 }
1885
1886 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
1887                                       u32 msr, u32 *result)
1888 {
1889         u32 vmx_msr_low, vmx_msr_high;
1890         u32 ctl = ctl_min | ctl_opt;
1891
1892         rdmsr(msr, vmx_msr_low, vmx_msr_high);
1893
1894         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
1895         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
1896
1897         /* Ensure the minimum (required) set of control bits is supported. */
1898         if (ctl_min & ~ctl)
1899                 return -EIO;
1900
1901         *result = ctl;
1902         return 0;
1903 }
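
/*
 * A worked example of adjust_vmx_controls(), with hypothetical numbers:
 * ctl_min = 0x0a, ctl_opt = 0x14, and a capability MSR reporting
 * low = 0x02, high = 0x1e:
 *
 *      ctl  = 0x0a | 0x14   = 0x1e
 *      ctl &= high (0x1e)   = 0x1e   (optional bits the CPU lacks would drop out here)
 *      ctl |= low  (0x02)   = 0x1e   (bits that must be 1 are forced on)
 *      ctl_min & ~ctl       = 0      -> success, *result = 0x1e
 *
 * Had 'high' lacked one of the ctl_min bits, ctl_min & ~ctl would be
 * non-zero and the function would return -EIO.
 */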
1904
1905 static __init bool allow_1_setting(u32 msr, u32 ctl)
1906 {
1907         u32 vmx_msr_low, vmx_msr_high;
1908
1909         rdmsr(msr, vmx_msr_low, vmx_msr_high);
1910         return vmx_msr_high & ctl;
1911 }
1912
1913 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1914 {
1915         u32 vmx_msr_low, vmx_msr_high;
1916         u32 min, opt, min2, opt2;
1917         u32 _pin_based_exec_control = 0;
1918         u32 _cpu_based_exec_control = 0;
1919         u32 _cpu_based_2nd_exec_control = 0;
1920         u32 _vmexit_control = 0;
1921         u32 _vmentry_control = 0;
1922
1923         min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
1924         opt = PIN_BASED_VIRTUAL_NMIS;
1925         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
1926                                 &_pin_based_exec_control) < 0)
1927                 return -EIO;
1928
1929         min =
1930 #ifdef CONFIG_X86_64
1931               CPU_BASED_CR8_LOAD_EXITING |
1932               CPU_BASED_CR8_STORE_EXITING |
1933 #endif
1934               CPU_BASED_CR3_LOAD_EXITING |
1935               CPU_BASED_CR3_STORE_EXITING |
1936               CPU_BASED_USE_IO_BITMAPS |
1937               CPU_BASED_MOV_DR_EXITING |
1938               CPU_BASED_USE_TSC_OFFSETING |
1939               CPU_BASED_MWAIT_EXITING |
1940               CPU_BASED_MONITOR_EXITING |
1941               CPU_BASED_INVLPG_EXITING;
1942
1943         if (yield_on_hlt)
1944                 min |= CPU_BASED_HLT_EXITING;
1945
1946         opt = CPU_BASED_TPR_SHADOW |
1947               CPU_BASED_USE_MSR_BITMAPS |
1948               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1949         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
1950                                 &_cpu_based_exec_control) < 0)
1951                 return -EIO;
1952 #ifdef CONFIG_X86_64
1953         if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
1954                 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
1955                                            ~CPU_BASED_CR8_STORE_EXITING;
1956 #endif
1957         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
1958                 min2 = 0;
1959                 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
1960                         SECONDARY_EXEC_WBINVD_EXITING |
1961                         SECONDARY_EXEC_ENABLE_VPID |
1962                         SECONDARY_EXEC_ENABLE_EPT |
1963                         SECONDARY_EXEC_UNRESTRICTED_GUEST |
1964                         SECONDARY_EXEC_PAUSE_LOOP_EXITING |
1965                         SECONDARY_EXEC_RDTSCP;
1966                 if (adjust_vmx_controls(min2, opt2,
1967                                         MSR_IA32_VMX_PROCBASED_CTLS2,
1968                                         &_cpu_based_2nd_exec_control) < 0)
1969                         return -EIO;
1970         }
1971 #ifndef CONFIG_X86_64
1972         if (!(_cpu_based_2nd_exec_control &
1973                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
1974                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
1975 #endif
1976         if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
1977                 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
1978                    is enabled */
1979                 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
1980                                              CPU_BASED_CR3_STORE_EXITING |
1981                                              CPU_BASED_INVLPG_EXITING);
1982                 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
1983                       vmx_capability.ept, vmx_capability.vpid);
1984         }
1985
1986         min = 0;
1987 #ifdef CONFIG_X86_64
1988         min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
1989 #endif
1990         opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
1991         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
1992                                 &_vmexit_control) < 0)
1993                 return -EIO;
1994
1995         min = 0;
1996         opt = VM_ENTRY_LOAD_IA32_PAT;
1997         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
1998                                 &_vmentry_control) < 0)
1999                 return -EIO;
2000
2001         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
2002
2003         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2004         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
2005                 return -EIO;
2006
2007 #ifdef CONFIG_X86_64
2008         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
2009         if (vmx_msr_high & (1u<<16))
2010                 return -EIO;
2011 #endif
2012
2013         /* Require Write-Back (WB) memory type for VMCS accesses. */
2014         if (((vmx_msr_high >> 18) & 15) != 6)
2015                 return -EIO;
2016
2017         vmcs_conf->size = vmx_msr_high & 0x1fff;
2018         vmcs_conf->order = get_order(vmcs_config.size);
2019         vmcs_conf->revision_id = vmx_msr_low;
2020
2021         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2022         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2023         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2024         vmcs_conf->vmexit_ctrl         = _vmexit_control;
2025         vmcs_conf->vmentry_ctrl        = _vmentry_control;
2026
2027         cpu_has_load_ia32_efer =
2028                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
2029                                 VM_ENTRY_LOAD_IA32_EFER)
2030                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
2031                                    VM_EXIT_LOAD_IA32_EFER);
2032
2033         return 0;
2034 }
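
/*
 * For readers chasing the magic numbers in the IA32_VMX_BASIC decoding
 * above (field layout per the SDM):
 *
 *      vmx_msr_low                - MSR bits 31:0, the VMCS revision id
 *      vmx_msr_high & 0x1fff      - MSR bits 44:32, VMCS region size
 *      vmx_msr_high & (1u << 16)  - MSR bit 48, physical addresses limited
 *                                   to 32 bits (always 0 on 64-bit CPUs)
 *      (vmx_msr_high >> 18) & 15  - MSR bits 53:50, VMCS memory type,
 *                                   where 6 means write-back (WB)
 */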
2035
2036 static struct vmcs *alloc_vmcs_cpu(int cpu)
2037 {
2038         int node = cpu_to_node(cpu);
2039         struct page *pages;
2040         struct vmcs *vmcs;
2041
2042         pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
2043         if (!pages)
2044                 return NULL;
2045         vmcs = page_address(pages);
2046         memset(vmcs, 0, vmcs_config.size);
2047         vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
2048         return vmcs;
2049 }
2050
2051 static struct vmcs *alloc_vmcs(void)
2052 {
2053         return alloc_vmcs_cpu(raw_smp_processor_id());
2054 }
2055
2056 static void free_vmcs(struct vmcs *vmcs)
2057 {
2058         free_pages((unsigned long)vmcs, vmcs_config.order);
2059 }
2060
2061 /*
2062  * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2063  */
2064 static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2065 {
2066         if (!loaded_vmcs->vmcs)
2067                 return;
2068         loaded_vmcs_clear(loaded_vmcs);
2069         free_vmcs(loaded_vmcs->vmcs);
2070         loaded_vmcs->vmcs = NULL;
2071 }
2072
2073 static void free_kvm_area(void)
2074 {
2075         int cpu;
2076
2077         for_each_possible_cpu(cpu) {
2078                 free_vmcs(per_cpu(vmxarea, cpu));
2079                 per_cpu(vmxarea, cpu) = NULL;
2080         }
2081 }
2082
2083 static __init int alloc_kvm_area(void)
2084 {
2085         int cpu;
2086
2087         for_each_possible_cpu(cpu) {
2088                 struct vmcs *vmcs;
2089
2090                 vmcs = alloc_vmcs_cpu(cpu);
2091                 if (!vmcs) {
2092                         free_kvm_area();
2093                         return -ENOMEM;
2094                 }
2095
2096                 per_cpu(vmxarea, cpu) = vmcs;
2097         }
2098         return 0;
2099 }
2100
2101 static __init int hardware_setup(void)
2102 {
2103         if (setup_vmcs_config(&vmcs_config) < 0)
2104                 return -EIO;
2105
2106         if (boot_cpu_has(X86_FEATURE_NX))
2107                 kvm_enable_efer_bits(EFER_NX);
2108
2109         if (!cpu_has_vmx_vpid())
2110                 enable_vpid = 0;
2111
2112         if (!cpu_has_vmx_ept() ||
2113             !cpu_has_vmx_ept_4levels()) {
2114                 enable_ept = 0;
2115                 enable_unrestricted_guest = 0;
2116         }
2117
2118         if (!cpu_has_vmx_unrestricted_guest())
2119                 enable_unrestricted_guest = 0;
2120
2121         if (!cpu_has_vmx_flexpriority())
2122                 flexpriority_enabled = 0;
2123
2124         if (!cpu_has_vmx_tpr_shadow())
2125                 kvm_x86_ops->update_cr8_intercept = NULL;
2126
2127         if (enable_ept && !cpu_has_vmx_ept_2m_page())
2128                 kvm_disable_largepages();
2129
2130         if (!cpu_has_vmx_ple())
2131                 ple_gap = 0;
2132
2133         if (nested)
2134                 nested_vmx_setup_ctls_msrs();
2135
2136         return alloc_kvm_area();
2137 }
2138
2139 static __exit void hardware_unsetup(void)
2140 {
2141         free_kvm_area();
2142 }
2143
2144 static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
2145 {
2146         struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2147
2148         if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
2149                 vmcs_write16(sf->selector, save->selector);
2150                 vmcs_writel(sf->base, save->base);
2151                 vmcs_write32(sf->limit, save->limit);
2152                 vmcs_write32(sf->ar_bytes, save->ar);
2153         } else {
2154                 u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
2155                         << AR_DPL_SHIFT;
2156                 vmcs_write32(sf->ar_bytes, 0x93 | dpl);
2157         }
2158 }
2159
2160 static void enter_pmode(struct kvm_vcpu *vcpu)
2161 {
2162         unsigned long flags;
2163         struct vcpu_vmx *vmx = to_vmx(vcpu);
2164
2165         vmx->emulation_required = 1;
2166         vmx->rmode.vm86_active = 0;
2167
2168         vmx_segment_cache_clear(vmx);
2169
2170         vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
2171         vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
2172         vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
2173         vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
2174
2175         flags = vmcs_readl(GUEST_RFLAGS);
2176         flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
2177         flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
2178         vmcs_writel(GUEST_RFLAGS, flags);
2179
2180         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
2181                         (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
2182
2183         update_exception_bitmap(vcpu);
2184
2185         if (emulate_invalid_guest_state)
2186                 return;
2187
2188         fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es);
2189         fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds);
2190         fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
2191         fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
2192
2193         vmx_segment_cache_clear(vmx);
2194
2195         vmcs_write16(GUEST_SS_SELECTOR, 0);
2196         vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
2197
2198         vmcs_write16(GUEST_CS_SELECTOR,
2199                      vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
2200         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
2201 }
2202
2203 static gva_t rmode_tss_base(struct kvm *kvm)
2204 {
2205         if (!kvm->arch.tss_addr) {
2206                 struct kvm_memslots *slots;
2207                 gfn_t base_gfn;
2208
2209                 slots = kvm_memslots(kvm);
2210                 base_gfn = slots->memslots[0].base_gfn +
2211                                  slots->memslots[0].npages - 3;
2212                 return base_gfn << PAGE_SHIFT;
2213         }
2214         return kvm->arch.tss_addr;
2215 }
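
/*
 * A hypothetical example of the fallback above: if userspace never called
 * KVM_SET_TSS_ADDR and memslot 0 starts at gfn 0 with 0xa0 pages (640K),
 * then
 *
 *      base_gfn = 0 + 0xa0 - 3 = 0x9d   ->  TSS base 0x9d000
 *
 * i.e. the top three pages of the slot, matching the three pages that
 * init_rmode_tss() initializes.
 */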
2216
2217 static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
2218 {
2219         struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2220
2221         save->selector = vmcs_read16(sf->selector);
2222         save->base = vmcs_readl(sf->base);
2223         save->limit = vmcs_read32(sf->limit);
2224         save->ar = vmcs_read32(sf->ar_bytes);
2225         vmcs_write16(sf->selector, save->base >> 4);
2226         vmcs_write32(sf->base, save->base & 0xffff0);
2227         vmcs_write32(sf->limit, 0xffff);
2228         vmcs_write32(sf->ar_bytes, 0xf3);
2229         if (save->base & 0xf)
2230                 printk_once(KERN_WARNING "kvm: segment base is not paragraph"
2231                             " aligned when entering real mode (seg=%d)",
2232                             seg);
2233 }
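
/*
 * The real-mode arithmetic above, with a hypothetical saved base of
 * 0x12345:
 *
 *      selector = 0x12345 >> 4      = 0x1234
 *      base     = 0x12345 & 0xffff0 = 0x12340
 *
 * so the low nibble is lost, which is what the paragraph-alignment
 * warning is about.  0xf3 is the AR encoding of a present, DPL-3,
 * read/write data segment, the access rights vm86 code expects.
 */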
2234
2235 static void enter_rmode(struct kvm_vcpu *vcpu)
2236 {
2237         unsigned long flags;
2238         struct vcpu_vmx *vmx = to_vmx(vcpu);
2239
2240         if (enable_unrestricted_guest)
2241                 return;
2242
2243         vmx->emulation_required = 1;
2244         vmx->rmode.vm86_active = 1;
2245
2246         /*
2247          * Very old userspace does not call KVM_SET_TSS_ADDR before entering
2248          * vcpu. Call it here with phys address pointing 16M below 4G.
2249          */
2250         if (!vcpu->kvm->arch.tss_addr) {
2251                 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be "
2252                              "called before entering vcpu\n");
2253                 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
2254                 vmx_set_tss_addr(vcpu->kvm, 0xfeffd000);
2255                 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
2256         }
2257
2258         vmx_segment_cache_clear(vmx);
2259
2260         vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
2261         vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
2262         vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
2263
2264         vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
2265         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
2266
2267         vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
2268         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2269
2270         flags = vmcs_readl(GUEST_RFLAGS);
2271         vmx->rmode.save_rflags = flags;
2272
2273         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
2274
2275         vmcs_writel(GUEST_RFLAGS, flags);
2276         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
2277         update_exception_bitmap(vcpu);
2278
2279         if (emulate_invalid_guest_state)
2280                 goto continue_rmode;
2281
2282         vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
2283         vmcs_write32(GUEST_SS_LIMIT, 0xffff);
2284         vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
2285
2286         vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
2287         vmcs_write32(GUEST_CS_LIMIT, 0xffff);
2288         if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
2289                 vmcs_writel(GUEST_CS_BASE, 0xf0000);
2290         vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
2291
2292         fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
2293         fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
2294         fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
2295         fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
2296
2297 continue_rmode:
2298         kvm_mmu_reset_context(vcpu);
2299 }
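
/*
 * Note on the CS fix-up above: the architectural reset state has CS base
 * 0xffff0000 with selector 0xf000, which cannot satisfy the vm86 rule
 * base == selector << 4 (0xf000 << 4 == 0xf0000).  Rewriting the base to
 * 0xf0000, conventionally an alias of the same BIOS area below 1MB,
 * restores that invariant before the selector is recomputed from it.
 */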
2300
2301 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
2302 {
2303         struct vcpu_vmx *vmx = to_vmx(vcpu);
2304         struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
2305
2306         if (!msr)
2307                 return;
2308
2309         /*
2310          * Force kernel_gs_base reloading before EFER changes, as control
2311          * of this msr depends on is_long_mode().
2312          */
2313         vmx_load_host_state(to_vmx(vcpu));
2314         vcpu->arch.efer = efer;
2315         if (efer & EFER_LMA) {
2316                 vmcs_write32(VM_ENTRY_CONTROLS,
2317                              vmcs_read32(VM_ENTRY_CONTROLS) |
2318                              VM_ENTRY_IA32E_MODE);
2319                 msr->data = efer;
2320         } else {
2321                 vmcs_write32(VM_ENTRY_CONTROLS,
2322                              vmcs_read32(VM_ENTRY_CONTROLS) &
2323                              ~VM_ENTRY_IA32E_MODE);
2324
2325                 msr->data = efer & ~EFER_LME;
2326         }
2327         setup_msrs(vmx);
2328 }
2329
2330 #ifdef CONFIG_X86_64
2331
2332 static void enter_lmode(struct kvm_vcpu *vcpu)
2333 {
2334         u32 guest_tr_ar;
2335
2336         vmx_segment_cache_clear(to_vmx(vcpu));
2337
2338         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
2339         if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
2340                 printk(KERN_DEBUG "%s: tss fixup for long mode.\n",
2341                        __func__);
2342                 vmcs_write32(GUEST_TR_AR_BYTES,
2343                              (guest_tr_ar & ~AR_TYPE_MASK)
2344                              | AR_TYPE_BUSY_64_TSS);
2345         }
2346         vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
2347 }
2348
2349 static void exit_lmode(struct kvm_vcpu *vcpu)
2350 {
2351         vmcs_write32(VM_ENTRY_CONTROLS,
2352                      vmcs_read32(VM_ENTRY_CONTROLS)
2353                      & ~VM_ENTRY_IA32E_MODE);
2354         vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
2355 }
2356
2357 #endif
2358
2359 static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
2360 {
2361         vpid_sync_context(to_vmx(vcpu));
2362         if (enable_ept) {
2363                 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2364                         return;
2365                 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
2366         }
2367 }
2368
2369 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
2370 {
2371         ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
2372
2373         vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
2374         vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
2375 }
2376
2377 static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
2378 {
2379         if (enable_ept && is_paging(vcpu))
2380                 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2381         __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
2382 }
2383
2384 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
2385 {
2386         ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
2387
2388         vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
2389         vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
2390 }
2391
2392 static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
2393 {
2394         if (!test_bit(VCPU_EXREG_PDPTR,
2395                       (unsigned long *)&vcpu->arch.regs_dirty))
2396                 return;
2397
2398         if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
2399                 vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]);
2400                 vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]);
2401                 vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]);
2402                 vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]);
2403         }
2404 }
2405
2406 static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
2407 {
2408         if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
2409                 vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
2410                 vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
2411                 vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
2412                 vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
2413         }
2414
2415         __set_bit(VCPU_EXREG_PDPTR,
2416                   (unsigned long *)&vcpu->arch.regs_avail);
2417         __set_bit(VCPU_EXREG_PDPTR,
2418                   (unsigned long *)&vcpu->arch.regs_dirty);
2419 }
2420
2421 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
2422
2423 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
2424                                         unsigned long cr0,
2425                                         struct kvm_vcpu *vcpu)
2426 {
2427         if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
2428                 vmx_decache_cr3(vcpu);
2429         if (!(cr0 & X86_CR0_PG)) {
2430                 /* From paging/starting to nonpaging */
2431                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2432                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
2433                              (CPU_BASED_CR3_LOAD_EXITING |
2434                               CPU_BASED_CR3_STORE_EXITING));
2435                 vcpu->arch.cr0 = cr0;
2436                 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
2437         } else if (!is_paging(vcpu)) {
2438                 /* From nonpaging to paging */
2439                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2440                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
2441                              ~(CPU_BASED_CR3_LOAD_EXITING |
2442                                CPU_BASED_CR3_STORE_EXITING));
2443                 vcpu->arch.cr0 = cr0;
2444                 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
2445         }
2446
2447         if (!(cr0 & X86_CR0_WP))
2448                 *hw_cr0 &= ~X86_CR0_WP;
2449 }
2450
2451 static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
2452 {
2453         struct vcpu_vmx *vmx = to_vmx(vcpu);
2454         unsigned long hw_cr0;
2455
2456         if (enable_unrestricted_guest)
2457                 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST)
2458                         | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
2459         else
2460                 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
2461
2462         if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
2463                 enter_pmode(vcpu);
2464
2465         if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
2466                 enter_rmode(vcpu);
2467
2468 #ifdef CONFIG_X86_64
2469         if (vcpu->arch.efer & EFER_LME) {
2470                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
2471                         enter_lmode(vcpu);
2472                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
2473                         exit_lmode(vcpu);
2474         }
2475 #endif
2476
2477         if (enable_ept)
2478                 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
2479
2480         if (!vcpu->fpu_active)
2481                 hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
2482
2483         vmcs_writel(CR0_READ_SHADOW, cr0);
2484         vmcs_writel(GUEST_CR0, hw_cr0);
2485         vcpu->arch.cr0 = cr0;
2486         __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
2487 }
2488
2489 static u64 construct_eptp(unsigned long root_hpa)
2490 {
2491         u64 eptp;
2492
2493         /* TODO: take the memory type and walk length from the EPT capability MSR */
2494         eptp = VMX_EPT_DEFAULT_MT |
2495                 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
2496         eptp |= (root_hpa & PAGE_MASK);
2497
2498         return eptp;
2499 }
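
/*
 * Sketch of the EPTP layout built above: bits 2:0 hold the memory type,
 * bits 5:3 the page-walk length minus one, and bits 63:12 the PML4
 * physical address.  Assuming the usual write-back (6) and 4-level (3)
 * defaults, the low bits work out to 0x1e, so for a hypothetical root
 *
 *      construct_eptp(0x12345000) == 0x1234501e
 */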
2500
2501 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
2502 {
2503         unsigned long guest_cr3;
2504         u64 eptp;
2505
2506         guest_cr3 = cr3;
2507         if (enable_ept) {
2508                 eptp = construct_eptp(cr3);
2509                 vmcs_write64(EPT_POINTER, eptp);
2510                 guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
2511                         vcpu->kvm->arch.ept_identity_map_addr;
2512                 ept_load_pdptrs(vcpu);
2513         }
2514
2515         vmx_flush_tlb(vcpu);
2516         vmcs_writel(GUEST_CR3, guest_cr3);
2517 }
2518
2519 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2520 {
2521         unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
2522                     KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
2523
2524         if (cr4 & X86_CR4_VMXE) {
2525                 /*
2526                  * To use VMXON (and later other VMX instructions), a guest
2527                  * must first be able to turn on cr4.VMXE (see handle_vmon()).
2528                  * So the check on whether to allow nested VMX
2529                  * is here.
2530                  */
2531                 if (!nested_vmx_allowed(vcpu))
2532                         return 1;
2533         } else if (to_vmx(vcpu)->nested.vmxon)
2534                 return 1;
2535
2536         vcpu->arch.cr4 = cr4;
2537         if (enable_ept) {
2538                 if (!is_paging(vcpu)) {
2539                         hw_cr4 &= ~X86_CR4_PAE;
2540                         hw_cr4 |= X86_CR4_PSE;
2541                 } else if (!(cr4 & X86_CR4_PAE)) {
2542                         hw_cr4 &= ~X86_CR4_PAE;
2543                 }
2544         }
2545
2546         vmcs_writel(CR4_READ_SHADOW, cr4);
2547         vmcs_writel(GUEST_CR4, hw_cr4);
2548         return 0;
2549 }
2550
2551 static void vmx_get_segment(struct kvm_vcpu *vcpu,
2552                             struct kvm_segment *var, int seg)
2553 {
2554         struct vcpu_vmx *vmx = to_vmx(vcpu);
2555         struct kvm_save_segment *save;
2556         u32 ar;
2557
2558         if (vmx->rmode.vm86_active
2559             && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
2560                 || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
2561                 || seg == VCPU_SREG_GS)
2562             && !emulate_invalid_guest_state) {
2563                 switch (seg) {
2564                 case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
2565                 case VCPU_SREG_ES: save = &vmx->rmode.es; break;
2566                 case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
2567                 case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
2568                 case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
2569                 default: BUG();
2570                 }
2571                 var->selector = save->selector;
2572                 var->base = save->base;
2573                 var->limit = save->limit;
2574                 ar = save->ar;
2575                 if (seg == VCPU_SREG_TR
2576                     || var->selector == vmx_read_guest_seg_selector(vmx, seg))
2577                         goto use_saved_rmode_seg;
2578         }
2579         var->base = vmx_read_guest_seg_base(vmx, seg);
2580         var->limit = vmx_read_guest_seg_limit(vmx, seg);
2581         var->selector = vmx_read_guest_seg_selector(vmx, seg);
2582         ar = vmx_read_guest_seg_ar(vmx, seg);
2583 use_saved_rmode_seg:
2584         if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
2585                 ar = 0;
2586         var->type = ar & 15;
2587         var->s = (ar >> 4) & 1;
2588         var->dpl = (ar >> 5) & 3;
2589         var->present = (ar >> 7) & 1;
2590         var->avl = (ar >> 12) & 1;
2591         var->l = (ar >> 13) & 1;
2592         var->db = (ar >> 14) & 1;
2593         var->g = (ar >> 15) & 1;
2594         var->unusable = (ar >> 16) & 1;
2595 }
2596
2597 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
2598 {
2599         struct kvm_segment s;
2600
2601         if (to_vmx(vcpu)->rmode.vm86_active) {
2602                 vmx_get_segment(vcpu, &s, seg);
2603                 return s.base;
2604         }
2605         return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
2606 }
2607
2608 static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
2609 {
2610         if (!is_protmode(vcpu))
2611                 return 0;
2612
2613         if (!is_long_mode(vcpu)
2614             && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
2615                 return 3;
2616
2617         return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3;
2618 }
2619
2620 static int vmx_get_cpl(struct kvm_vcpu *vcpu)
2621 {
2622         if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
2623                 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
2624                 to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu);
2625         }
2626         return to_vmx(vcpu)->cpl;
2627 }
2628
2629
2630 static u32 vmx_segment_access_rights(struct kvm_segment *var)
2631 {
2632         u32 ar;
2633
2634         if (var->unusable)
2635                 ar = 1 << 16;
2636         else {
2637                 ar = var->type & 15;
2638                 ar |= (var->s & 1) << 4;
2639                 ar |= (var->dpl & 3) << 5;
2640                 ar |= (var->present & 1) << 7;
2641                 ar |= (var->avl & 1) << 12;
2642                 ar |= (var->l & 1) << 13;
2643                 ar |= (var->db & 1) << 14;
2644                 ar |= (var->g & 1) << 15;
2645         }
2646         if (ar == 0) /* a 0 value means unusable */
2647                 ar = AR_UNUSABLE_MASK;
2648
2649         return ar;
2650 }
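
/*
 * Example of the packing above: a present, DPL-3, read/write data segment
 * (type = 3, s = 1, dpl = 3, present = 1, everything else 0) packs to
 *
 *      0x3 | (1 << 4) | (3 << 5) | (1 << 7) = 0xf3
 *
 * which is the same AR value the real-mode paths write directly, while an
 * unusable segment packs to 1 << 16 (AR_UNUSABLE_MASK).
 */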
2651
2652 static void vmx_set_segment(struct kvm_vcpu *vcpu,
2653                             struct kvm_segment *var, int seg)
2654 {
2655         struct vcpu_vmx *vmx = to_vmx(vcpu);
2656         struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2657         u32 ar;
2658
2659         vmx_segment_cache_clear(vmx);
2660
2661         if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
2662                 vmcs_write16(sf->selector, var->selector);
2663                 vmx->rmode.tr.selector = var->selector;
2664                 vmx->rmode.tr.base = var->base;
2665                 vmx->rmode.tr.limit = var->limit;
2666                 vmx->rmode.tr.ar = vmx_segment_access_rights(var);
2667                 return;
2668         }
2669         vmcs_writel(sf->base, var->base);
2670         vmcs_write32(sf->limit, var->limit);
2671         vmcs_write16(sf->selector, var->selector);
2672         if (vmx->rmode.vm86_active && var->s) {
2673                 /*
2674                  * Hack real-mode segments into vm86 compatibility.
2675                  */
2676                 if (var->base == 0xffff0000 && var->selector == 0xf000)
2677                         vmcs_writel(sf->base, 0xf0000);
2678                 ar = 0xf3;
2679         } else
2680                 ar = vmx_segment_access_rights(var);
2681
2682         /*
2683          *   Fix the "Accessed" bit in AR field of segment registers for older
2684          * qemu binaries.
2685          *   IA32 arch specifies that at the time of processor reset the
2686          * "Accessed" bit in the AR field of segment registers is 1. And qemu
2687          * is setting it to 0 in the userland code. This causes invalid guest
2688          * state vmexit when "unrestricted guest" mode is turned on.
2689          *    Fix for this setup issue in cpu_reset is being pushed in the qemu
2690          * tree. Newer qemu binaries with that qemu fix would not need this
2691          * kvm hack.
2692          */
2693         if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
2694                 ar |= 0x1; /* Accessed */
2695
2696         vmcs_write32(sf->ar_bytes, ar);
2697         __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
2698 }
2699
2700 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2701 {
2702         u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
2703
2704         *db = (ar >> 14) & 1;
2705         *l = (ar >> 13) & 1;
2706 }
2707
2708 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2709 {
2710         dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
2711         dt->address = vmcs_readl(GUEST_IDTR_BASE);
2712 }
2713
2714 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2715 {
2716         vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
2717         vmcs_writel(GUEST_IDTR_BASE, dt->address);
2718 }
2719
2720 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2721 {
2722         dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
2723         dt->address = vmcs_readl(GUEST_GDTR_BASE);
2724 }
2725
2726 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2727 {
2728         vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
2729         vmcs_writel(GUEST_GDTR_BASE, dt->address);
2730 }
2731
2732 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
2733 {
2734         struct kvm_segment var;
2735         u32 ar;
2736
2737         vmx_get_segment(vcpu, &var, seg);
2738         ar = vmx_segment_access_rights(&var);
2739
2740         if (var.base != (var.selector << 4))
2741                 return false;
2742         if (var.limit != 0xffff)
2743                 return false;
2744         if (ar != 0xf3)
2745                 return false;
2746
2747         return true;
2748 }
2749
2750 static bool code_segment_valid(struct kvm_vcpu *vcpu)
2751 {
2752         struct kvm_segment cs;
2753         unsigned int cs_rpl;
2754
2755         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
2756         cs_rpl = cs.selector & SELECTOR_RPL_MASK;
2757
2758         if (cs.unusable)
2759                 return false;
2760         if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
2761                 return false;
2762         if (!cs.s)
2763                 return false;
2764         if (cs.type & AR_TYPE_WRITEABLE_MASK) {
2765                 if (cs.dpl > cs_rpl)
2766                         return false;
2767         } else {
2768                 if (cs.dpl != cs_rpl)
2769                         return false;
2770         }
2771         if (!cs.present)
2772                 return false;
2773
2774         /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
2775         return true;
2776 }
2777
2778 static bool stack_segment_valid(struct kvm_vcpu *vcpu)
2779 {
2780         struct kvm_segment ss;
2781         unsigned int ss_rpl;
2782
2783         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
2784         ss_rpl = ss.selector & SELECTOR_RPL_MASK;
2785
2786         if (ss.unusable)
2787                 return true;
2788         if (ss.type != 3 && ss.type != 7)
2789                 return false;
2790         if (!ss.s)
2791                 return false;
2792         if (ss.dpl != ss_rpl) /* DPL != RPL */
2793                 return false;
2794         if (!ss.present)
2795                 return false;
2796
2797         return true;
2798 }
2799
2800 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
2801 {
2802         struct kvm_segment var;
2803         unsigned int rpl;
2804
2805         vmx_get_segment(vcpu, &var, seg);
2806         rpl = var.selector & SELECTOR_RPL_MASK;
2807
2808         if (var.unusable)
2809                 return true;
2810         if (!var.s)
2811                 return false;
2812         if (!var.present)
2813                 return false;
2814         if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
2815                 if (var.dpl < rpl) /* DPL < RPL */
2816                         return false;
2817         }
2818
2819         /* TODO: Add other members to kvm_segment_field to allow checking for other access
2820          * rights flags
2821          */
2822         return true;
2823 }
2824
2825 static bool tr_valid(struct kvm_vcpu *vcpu)
2826 {
2827         struct kvm_segment tr;
2828
2829         vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
2830
2831         if (tr.unusable)
2832                 return false;
2833         if (tr.selector & SELECTOR_TI_MASK)     /* TI = 1 */
2834                 return false;
2835         if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
2836                 return false;
2837         if (!tr.present)
2838                 return false;
2839
2840         return true;
2841 }
2842
2843 static bool ldtr_valid(struct kvm_vcpu *vcpu)
2844 {
2845         struct kvm_segment ldtr;
2846
2847         vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
2848
2849         if (ldtr.unusable)
2850                 return true;
2851         if (ldtr.selector & SELECTOR_TI_MASK)   /* TI = 1 */
2852                 return false;
2853         if (ldtr.type != 2)
2854                 return false;
2855         if (!ldtr.present)
2856                 return false;
2857
2858         return true;
2859 }
2860
2861 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
2862 {
2863         struct kvm_segment cs, ss;
2864
2865         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
2866         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
2867
2868         return ((cs.selector & SELECTOR_RPL_MASK) ==
2869                  (ss.selector & SELECTOR_RPL_MASK));
2870 }
2871
2872 /*
2873  * Check if guest state is valid. Returns true if valid, false if
2874  * not.
2875  * We assume that registers are always usable
2876  */
2877 static bool guest_state_valid(struct kvm_vcpu *vcpu)
2878 {
2879         /* real mode guest state checks */
2880         if (!is_protmode(vcpu)) {
2881                 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
2882                         return false;
2883                 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
2884                         return false;
2885                 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
2886                         return false;
2887                 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
2888                         return false;
2889                 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
2890                         return false;
2891                 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
2892                         return false;
2893         } else {
2894         /* protected mode guest state checks */
2895                 if (!cs_ss_rpl_check(vcpu))
2896                         return false;
2897                 if (!code_segment_valid(vcpu))
2898                         return false;
2899                 if (!stack_segment_valid(vcpu))
2900                         return false;
2901                 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
2902                         return false;
2903                 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
2904                         return false;
2905                 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
2906                         return false;
2907                 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
2908                         return false;
2909                 if (!tr_valid(vcpu))
2910                         return false;
2911                 if (!ldtr_valid(vcpu))
2912                         return false;
2913         }
2914         /* TODO:
2915          * - Add checks on RIP
2916          * - Add checks on RFLAGS
2917          */
2918
2919         return true;
2920 }
2921
2922 static int init_rmode_tss(struct kvm *kvm)
2923 {
2924         gfn_t fn;
2925         u16 data = 0;
2926         int r, idx, ret = 0;
2927
2928         idx = srcu_read_lock(&kvm->srcu);
2929         fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
2930         r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
2931         if (r < 0)
2932                 goto out;
2933         data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
2934         r = kvm_write_guest_page(kvm, fn++, &data,
2935                         TSS_IOPB_BASE_OFFSET, sizeof(u16));
2936         if (r < 0)
2937                 goto out;
2938         r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
2939         if (r < 0)
2940                 goto out;
2941         r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
2942         if (r < 0)
2943                 goto out;
2944         data = ~0;
2945         r = kvm_write_guest_page(kvm, fn, &data,
2946                                  RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
2947                                  sizeof(u8));
2948         if (r < 0)
2949                 goto out;
2950
2951         ret = 1;
2952 out:
2953         srcu_read_unlock(&kvm->srcu, idx);
2954         return ret;
2955 }
2956
2957 static int init_rmode_identity_map(struct kvm *kvm)
2958 {
2959         int i, idx, r, ret;
2960         pfn_t identity_map_pfn;
2961         u32 tmp;
2962
2963         if (!enable_ept)
2964                 return 1;
2965         if (unlikely(!kvm->arch.ept_identity_pagetable)) {
2966                 printk(KERN_ERR "EPT: identity-mapping pagetable "
2967                         "hasn't been allocated!\n");
2968                 return 0;
2969         }
2970         if (likely(kvm->arch.ept_identity_pagetable_done))
2971                 return 1;
2972         ret = 0;
2973         identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
2974         idx = srcu_read_lock(&kvm->srcu);
2975         r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
2976         if (r < 0)
2977                 goto out;
2978         /* Set up identity-mapping pagetable for EPT in real mode */
2979         for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
2980                 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
2981                         _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
2982                 r = kvm_write_guest_page(kvm, identity_map_pfn,
2983                                 &tmp, i * sizeof(tmp), sizeof(tmp));
2984                 if (r < 0)
2985                         goto out;
2986         }
2987         kvm->arch.ept_identity_pagetable_done = true;
2988         ret = 1;
2989 out:
2990         srcu_read_unlock(&kvm->srcu, idx);
2991         return ret;
2992 }
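
/*
 * Sketch of the identity PTEs built above, assuming the standard x86 flag
 * values (_PAGE_PRESENT 0x001, _PAGE_RW 0x002, _PAGE_USER 0x004,
 * _PAGE_ACCESSED 0x020, _PAGE_DIRTY 0x040, _PAGE_PSE 0x080): entry i maps
 * the 4MB region starting at i << 22, e.g.
 *
 *      i = 0:  0x0000e7   (4MB page at 0)
 *      i = 1:  0x4000e7   (4MB page at 4MB)
 *
 * so the 1024 entries of this single page directory identity-map the low
 * 4GB for the guest's non-paged view under EPT.
 */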
2993
2994 static void seg_setup(int seg)
2995 {
2996         struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2997         unsigned int ar;
2998
2999         vmcs_write16(sf->selector, 0);
3000         vmcs_writel(sf->base, 0);
3001         vmcs_write32(sf->limit, 0xffff);
3002         if (enable_unrestricted_guest) {
3003                 ar = 0x93;
3004                 if (seg == VCPU_SREG_CS)
3005                         ar |= 0x08; /* code segment */
3006         } else
3007                 ar = 0xf3;
3008
3009         vmcs_write32(sf->ar_bytes, ar);
3010 }
3011
3012 static int alloc_apic_access_page(struct kvm *kvm)
3013 {
3014         struct kvm_userspace_memory_region kvm_userspace_mem;
3015         int r = 0;
3016
3017         mutex_lock(&kvm->slots_lock);
3018         if (kvm->arch.apic_access_page)
3019                 goto out;
3020         kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
3021         kvm_userspace_mem.flags = 0;
3022         kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
3023         kvm_userspace_mem.memory_size = PAGE_SIZE;
3024         r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
3025         if (r)
3026                 goto out;
3027
3028         kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
3029 out:
3030         mutex_unlock(&kvm->slots_lock);
3031         return r;
3032 }
3033
3034 static int alloc_identity_pagetable(struct kvm *kvm)
3035 {
3036         struct kvm_userspace_memory_region kvm_userspace_mem;
3037         int r = 0;
3038
3039         mutex_lock(&kvm->slots_lock);
3040         if (kvm->arch.ept_identity_pagetable)
3041                 goto out;
3042         kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
3043         kvm_userspace_mem.flags = 0;
3044         kvm_userspace_mem.guest_phys_addr =
3045                 kvm->arch.ept_identity_map_addr;
3046         kvm_userspace_mem.memory_size = PAGE_SIZE;
3047         r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
3048         if (r)
3049                 goto out;
3050
3051         kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
3052                         kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
3053 out:
3054         mutex_unlock(&kvm->slots_lock);
3055         return r;
3056 }
3057
3058 static void allocate_vpid(struct vcpu_vmx *vmx)
3059 {
3060         int vpid;
3061
3062         vmx->vpid = 0;
3063         if (!enable_vpid)
3064                 return;
3065         spin_lock(&vmx_vpid_lock);
3066         vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
3067         if (vpid < VMX_NR_VPIDS) {
3068                 vmx->vpid = vpid;
3069                 __set_bit(vpid, vmx_vpid_bitmap);
3070         }
3071         spin_unlock(&vmx_vpid_lock);
3072 }
3073
3074 static void free_vpid(struct vcpu_vmx *vmx)
3075 {
3076         if (!enable_vpid)
3077                 return;
3078         spin_lock(&vmx_vpid_lock);
3079         if (vmx->vpid != 0)
3080                 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
3081         spin_unlock(&vmx_vpid_lock);
3082 }
3083
3084 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
3085 {
3086         int f = sizeof(unsigned long);
3087
3088         if (!cpu_has_vmx_msr_bitmap())
3089                 return;
3090
3091         /*
3092          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
3093          * have the write-low and read-high bitmap offsets the wrong way round.
3094          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
3095          */
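        /*
         * A set bit causes a VM exit on the corresponding access; clearing it
         * lets the guest read or write the MSR directly.
         */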
3096         if (msr <= 0x1fff) {
3097                 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
3098                 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
3099         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
3100                 msr &= 0x1fff;
3101                 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
3102                 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
3103         }
3104 }
3105
3106 static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
3107 {
3108         if (!longmode_only)
3109                 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
3110         __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
3111 }
3112
3113 /*
3114  * Sets up the vmcs for emulated real mode.
3115  */
3116 static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
3117 {
3118         u32 host_sysenter_cs, msr_low, msr_high;
3119         u32 junk;
3120         u64 host_pat;
3121         unsigned long a;
3122         struct desc_ptr dt;
3123         int i;
3124         unsigned long kvm_vmx_return;
3125         u32 exec_control;
3126
3127         /* I/O */
3128         vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
3129         vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
3130
3131         if (cpu_has_vmx_msr_bitmap())
3132                 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
3133
3134         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
3135
3136         /* Control */
3137         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
3138                 vmcs_config.pin_based_exec_ctrl);
3139
3140         exec_control = vmcs_config.cpu_based_exec_ctrl;
3141         if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
3142                 exec_control &= ~CPU_BASED_TPR_SHADOW;
3143 #ifdef CONFIG_X86_64
3144                 exec_control |= CPU_BASED_CR8_STORE_EXITING |
3145                                 CPU_BASED_CR8_LOAD_EXITING;
3146 #endif
3147         }
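        /*
         * Without EPT we use shadow paging, so guest CR3 accesses and INVLPG
         * must exit to let KVM keep the shadow page tables in sync.
         */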
3148         if (!enable_ept)
3149                 exec_control |= CPU_BASED_CR3_STORE_EXITING |
3150                                 CPU_BASED_CR3_LOAD_EXITING  |
3151                                 CPU_BASED_INVLPG_EXITING;
3152         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
3153
3154         if (cpu_has_secondary_exec_ctrls()) {
3155                 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
3156                 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
3157                         exec_control &=
3158                                 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
3159                 if (vmx->vpid == 0)
3160                         exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
3161                 if (!enable_ept) {
3162                         exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
3163                         enable_unrestricted_guest = 0;
3164                 }
3165                 if (!enable_unrestricted_guest)
3166                         exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
3167                 if (!ple_gap)
3168                         exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
3169                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
3170         }
3171
3172         if (ple_gap) {
3173                 vmcs_write32(PLE_GAP, ple_gap);
3174                 vmcs_write32(PLE_WINDOW, ple_window);
3175         }
3176
3177         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
3178         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
3179         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
3180
3181         vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS);  /* 22.2.3 */
3182         vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
3183         vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
3184
3185         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
3186         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
3187         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
3188         vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
3189         vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
3190         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
3191 #ifdef CONFIG_X86_64
3192         rdmsrl(MSR_FS_BASE, a);
3193         vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
3194         rdmsrl(MSR_GS_BASE, a);
3195         vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
3196 #else
3197         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
3198         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
3199 #endif
3200
3201         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
3202
3203         native_store_idt(&dt);
3204         vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
3205
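        /*
         * HOST_RIP is where the CPU resumes after a VM exit; point it at the
         * .Lkvm_vmx_return label in vmx_vcpu_run()'s exit path.
         */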
3206         asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
3207         vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
3208         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
3209         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
3210         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
3211         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
3212         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
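        /*
         * The MSR autoload areas start out empty; the counts above are bumped
         * later if any MSRs need to be switched atomically on entry/exit.
         */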
3213
3214         rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
3215         vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
3216         rdmsrl(MSR_IA32_SYSENTER_ESP, a);
3217         vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
3218         rdmsrl(MSR_IA32_SYSENTER_EIP, a);
3219         vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
3220
3221         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
3222                 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
3223                 host_pat = msr_low | ((u64) msr_high << 32);
3224                 vmcs_write64(HOST_IA32_PAT, host_pat);
3225         }
3226         if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
3227                 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
3228                 host_pat = msr_low | ((u64) msr_high << 32);
3229                 /* Write the default value to follow the host PAT */
3230                 vmcs_write64(GUEST_IA32_PAT, host_pat);
3231                 /* Keep arch.pat in sync with GUEST_IA32_PAT */
3232                 vmx->vcpu.arch.pat = host_pat;
3233         }
3234
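        /*
         * Probe which of the candidate MSRs in vmx_msr_index[] actually exist
         * on this CPU (rdmsr/wrmsr must not fault) and record them so they
         * can be saved and restored for the guest.
         */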
3235         for (i = 0; i < NR_VMX_MSR; ++i) {
3236                 u32 index = vmx_msr_index[i];
3237                 u32 data_low, data_high;
3238                 int j = vmx->nmsrs;
3239
3240                 if (rdmsr_safe(index, &data_low, &data_high) < 0)
3241                         continue;
3242                 if (wrmsr_safe(index, data_low, data_high) < 0)
3243                         continue;
3244                 vmx->guest_msrs[j].index = i;
3245                 vmx->guest_msrs[j].data = 0;
3246                 vmx->guest_msrs[j].mask = -1ull;
3247                 ++vmx->nmsrs;
3248         }
3249
3250         vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
3251
3252         /* 22.2.1, 20.8.1 */
3253         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
3254
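        /*
         * Bits set in the guest/host masks are owned by the host: guest reads
         * see the read shadow and guest attempts to change them cause a VM
         * exit. CR0 is fully host-owned; for CR4 only the bits not in
         * cr4_guest_owned_bits trap.
         */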
3255         vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
3256         vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
3257         if (enable_ept)
3258                 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
3259         vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
3260
3261         kvm_write_tsc(&vmx->vcpu, 0);
3262
3263         return 0;
3264 }
3265
3266 static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
3267 {
3268         struct vcpu_vmx *vmx = to_vmx(vcpu);
3269         u64 msr;
3270         int ret;
3271
3272         vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
3273
3274         vmx->rmode.vm86_active = 0;
3275
3276         vmx->soft_vnmi_blocked = 0;
3277
3278         vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
3279         kvm_set_cr8(&vmx->vcpu, 0);
3280         msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
3281         if (kvm_vcpu_is_bsp(&vmx->vcpu))
3282                 msr |= MSR_IA32_APICBASE_BSP;
3283         kvm_set_apic_base(&vmx->vcpu, msr);
3284
3285         ret = fx_init(&vmx->vcpu);
3286         if (ret != 0)
3287                 goto out;
3288
3289         vmx_segment_cache_clear(vmx);
3290
3291         seg_setup(VCPU_SREG_CS);
3292         /*
3293          * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
3294          * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
3295          */
3296         if (kvm_vcpu_is_bsp(&vmx->vcpu)) {
3297                 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
3298                 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
3299         } else {
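                /*
                 * APs start in real mode at the SIPI vector: the CS selector
                 * is vector << 8, giving a base of selector << 4 = vector << 12.
                 */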
3300                 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
3301                 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
3302         }
3303
3304         seg_setup(VCPU_SREG_DS);
3305         seg_setup(VCPU_SREG_ES);
3306         seg_setup(VCPU_SREG_FS);
3307         seg_setup(VCPU_SREG_GS);
3308         seg_setup(VCPU_SREG_SS);
3309
3310         vmcs_write16(GUEST_TR_SELECTOR, 0);
3311         vmcs_writel(GUEST_TR_BASE, 0);
3312         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
3313         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
3314
3315         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
3316         vmcs_writel(GUEST_LDTR_BASE, 0);
3317         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
3318         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
3319
3320         vmcs_write32(GUEST_SYSENTER_CS, 0);
3321         vmcs_writel(GUEST_SYSENTER_ESP, 0);
3322         vmcs_writel(GUEST_SYSENTER_EIP, 0);
3323
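        /*
         * Bit 1 of RFLAGS is reserved and must be set. For the BSP, RIP 0xfff0
         * combined with the CS base programmed above yields the usual
         * f000:fff0 reset entry point.
         */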
3324         vmcs_writel(GUEST_RFLAGS, 0x02);
3325         if (kvm_vcpu_is_bsp(&vmx->vcpu))
3326                 kvm_rip_write(vcpu, 0xfff0);
3327         else
3328                 kvm_rip_write(vcpu, 0);
3329         kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
3330
3331         vmcs_writel(GUEST_DR7, 0x400);
3332
3333         vmcs_writel(GUEST_GDTR_BASE, 0);
3334         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
3335
3336         vmcs_writel(GUEST_IDTR_BASE, 0);
3337         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
3338
3339         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
3340         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
3341         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
3342
3343         /* Special registers */
3344         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
3345
3346         setup_msrs(vmx);
3347
3348         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
3349
3350         if (cpu_has_vmx_tpr_shadow()) {
3351                 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
3352                 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
3353                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
3354                                      __pa(vmx->vcpu.arch.apic->regs));
3355                 vmcs_write32(TPR_THRESHOLD, 0);
3356         }
3357
3358         if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
3359                 vmcs_write64(APIC_ACCESS_ADDR,
3360                              page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
3361
3362         if (vmx->vpid != 0)
3363                 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
3364
3365         vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
3366         vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
3367         vmx_set_cr4(&vmx->vcpu, 0);
3368         vmx_set_efer(&vmx->vcpu, 0);
3369         vmx_fpu_activate(&vmx->vcpu);
3370         update_exception_bitmap(&vmx->vcpu);
3371
3372         vpid_sync_context(vmx);
3373
3374         ret = 0;
3375
3376         /* HACK: Don't enable emulation on guest boot/reset */
3377         vmx->emulation_required = 0;
3378
3379 out:
3380         return ret;
3381 }
3382
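/*
 * Request "interrupt-window exiting": the CPU exits as soon as the guest is
 * able to accept an interrupt, so a pending injection can be delivered.
 */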
3383 static void enable_irq_window(struct kvm_vcpu *vcpu)
3384 {
3385         u32 cpu_based_vm_exec_control;
3386
3387         cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3388         cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
3389         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3390 }
3391
3392 static void enable_nmi_window(struct kvm_vcpu *vcpu)
3393 {
3394         u32 cpu_based_vm_exec_control;
3395
3396         if (!cpu_has_virtual_nmis()) {
3397                 enable_irq_window(vcpu);
3398                 return;
3399         }
3400
3401         if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
3402                 enable_irq_window(vcpu);
3403                 return;
3404         }
3405         cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3406         cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
3407         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3408 }
3409
3410 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
3411 {
3412         struct vcpu_vmx *vmx = to_vmx(vcpu);
3413         uint32_t intr;
3414         int irq = vcpu->arch.interrupt.nr;
3415
3416         trace_kvm_inj_virq(irq);
3417
3418         ++vcpu->stat.irq_injections;
3419         if (vmx->rmode.vm86_active) {
3420                 int inc_eip = 0;
3421                 if (vcpu->arch.interrupt.soft)
3422                         inc_eip = vcpu->arch.event_exit_inst_len;
3423                 if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
3424                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3425                 return;
3426         }
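        /*
         * VM-entry interruption-information format: vector in bits 7:0,
         * type in bits 10:8, valid bit 31.
         */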
3427         intr = irq | INTR_INFO_VALID_MASK;
3428         if (vcpu->arch.interrupt.soft) {
3429                 intr |= INTR_TYPE_SOFT_INTR;
3430                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
3431                              vmx->vcpu.arch.event_exit_inst_len);
3432         } else
3433                 intr |= INTR_TYPE_EXT_INTR;
3434         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
3435         vmx_clear_hlt(vcpu);
3436 }
3437
3438 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
3439 {
3440         struct vcpu_vmx *vmx = to_vmx(vcpu);
3441
3442         if (!cpu_has_virtual_nmis()) {
3443                 /*
3444                  * Tracking the NMI-blocked state in software is built upon
3445                  * finding the next open IRQ window. This, in turn, depends on
3446                  * well-behaving guests: They have to keep IRQs disabled at
3447                  * least as long as the NMI handler runs. Otherwise we may
3448                  * cause NMI nesting, maybe breaking the guest. But as this is
3449                  * highly unlikely, we can live with the residual risk.
3450                  */
3451                 vmx->soft_vnmi_blocked = 1;
3452                 vmx->vnmi_blocked_time = 0;
3453         }
3454
3455         ++vcpu->stat.nmi_injections;
3456         vmx->nmi_known_unmasked = false;
3457         if (vmx->rmode.vm86_active) {
3458                 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
3459                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3460                 return;
3461         }
3462         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
3463                         INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
3464         vmx_clear_hlt(vcpu);
3465 }
3466
3467 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
3468 {
3469         if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
3470                 return 0;
3471
3472         return  !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
3473                   (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
3474                    | GUEST_INTR_STATE_NMI));
3475 }
3476
3477 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
3478 {
3479         if (!cpu_has_virtual_nmis())
3480                 return to_vmx(vcpu)->soft_vnmi_blocked;
3481         if (to_vmx(vcpu)->nmi_known_unmasked)
3482                 return false;
3483         return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
3484 }
3485
3486 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3487 {
3488         struct vcpu_vmx *vmx = to_vmx(vcpu);
3489
3490         if (!cpu_has_virtual_nmis()) {
3491                 if (vmx->soft_vnmi_blocked != masked) {
3492                         vmx->soft_vnmi_blocked = masked;
3493                         vmx->vnmi_blocked_time = 0;
3494                 }
3495         } else {
3496                 vmx->nmi_known_unmasked = !masked;
3497                 if (masked)
3498                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3499                                       GUEST_INTR_STATE_NMI);
3500                 else
3501                         vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
3502                                         GUEST_INTR_STATE_NMI);
3503         }
3504 }
3505
3506 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
3507 {
3508         return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
3509                 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
3510                         (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
3511 }
3512
3513 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
3514 {
3515         int ret;
3516         struct kvm_userspace_memory_region tss_mem = {
3517                 .slot = TSS_PRIVATE_MEMSLOT,
3518                 .guest_phys_addr = addr,
3519                 .memory_size = PAGE_SIZE * 3,
3520                 .flags = 0,
3521         };
3522
3523         ret = kvm_set_memory_region(kvm, &tss_mem, 0);
3524         if (ret)
3525                 return ret;
3526         kvm->arch.tss_addr = addr;
3527         if (!init_rmode_tss(kvm))
3528                 return  -ENOMEM;
3529
3530         return 0;
3531 }
3532
3533 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
3534                                   int vec, u32 err_code)
3535 {
3536         /*
3537          * An instruction with the address-size override prefix (opcode
3538          * 0x67) causes an #SS fault with error code 0 in VM86 mode.
3539          */
3540         if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
3541                 if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
3542                         return 1;
3543         /*
3544          * Forward all other exceptions that are valid in real mode.
3545          * FIXME: Breaks guest debugging in real mode, needs to be fixed with
3546          *        the required debugging infrastructure rework.
3547          */
3548         switch (vec) {
3549         case DB_VECTOR:
3550                 if (vcpu->guest_debug &
3551                     (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
3552                         return 0;
3553                 kvm_queue_exception(vcpu, vec);
3554                 return 1;
3555         case BP_VECTOR:
3556                 /*
3557                  * Update instruction length as we may reinject the exception
3558                  * from user space while in guest debugging mode.
3559                  */
3560                 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
3561                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3562                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
3563                         return 0;
3564                 /* fall through */
3565         case DE_VECTOR:
3566         case OF_VECTOR:
3567         case BR_VECTOR:
3568         case UD_VECTOR:
3569         case DF_VECTOR:
3570         case SS_VECTOR:
3571         case GP_VECTOR:
3572         case MF_VECTOR:
3573                 kvm_queue_exception(vcpu, vec);
3574                 return 1;
3575         }
3576         return 0;
3577 }
3578
3579 /*
3580  * Trigger machine check on the host. We assume all the MSRs are already set up
3581  * by the CPU and that we still run on the same CPU as the MCE occurred on.
3582  * We pass a fake environment to the machine check handler because we want
3583  * the guest to be always treated like user space, no matter what context
3584  * it used internally.
3585  */
3586 static void kvm_machine_check(void)
3587 {
3588 #if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
3589         struct pt_regs regs = {
3590                 .cs = 3, /* Fake ring 3 no matter what the guest ran on */
3591                 .flags = X86_EFLAGS_IF,
3592         };
3593
3594         do_machine_check(&regs, 0);
3595 #endif
3596 }
3597
3598 static int handle_machine_check(struct kvm_vcpu *vcpu)
3599 {
3600         /* already handled by vcpu_run */
3601         return 1;
3602 }
3603
3604 static int handle_exception(struct kvm_vcpu *vcpu)
3605 {
3606         struct vcpu_vmx *vmx = to_vmx(vcpu);
3607         struct kvm_run *kvm_run = vcpu->run;
3608         u32 intr_info, ex_no, error_code;
3609         unsigned long cr2, rip, dr6;
3610         u32 vect_info;
3611         enum emulation_result er;
3612
3613         vect_info = vmx->idt_vectoring_info;
3614         intr_info = vmx->exit_intr_info;
3615
3616         if (is_machine_check(intr_info))
3617                 return handle_machine_check(vcpu);
3618
3619         if ((vect_info & VECTORING_INFO_VALID_MASK) &&
3620             !is_page_fault(intr_info)) {
3621                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3622                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
3623                 vcpu->run->internal.ndata = 2;
3624                 vcpu->run->internal.data[0] = vect_info;
3625                 vcpu->run->internal.data[1] = intr_info;
3626                 return 0;
3627         }
3628
3629         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
3630                 return 1;  /* already handled by vmx_vcpu_run() */
3631
3632         if (is_no_device(intr_info)) {
3633                 vmx_fpu_activate(vcpu);
3634                 return 1;
3635         }
3636
3637         if (is_invalid_opcode(intr_info)) {
3638                 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
3639                 if (er != EMULATE_DONE)
3640                         kvm_queue_exception(vcpu, UD_VECTOR);
3641                 return 1;
3642         }
3643
3644         error_code = 0;
3645         if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
3646                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
3647         if (is_page_fault(intr_info)) {
3648                 /* EPT won't cause page fault directly */
3649                 if (enable_ept)
3650                         BUG();
3651                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
3652                 trace_kvm_page_fault(cr2, error_code);
3653
3654                 if (kvm_event_needs_reinjection(vcpu))
3655                         kvm_mmu_unprotect_page_virt(vcpu, cr2);
3656                 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
3657         }
3658
3659         if (vmx->rmode.vm86_active &&
3660             handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
3661                                                                 error_code)) {
3662                 if (vcpu->arch.halt_request) {
3663                         vcpu->arch.halt_request = 0;
3664                         return kvm_emulate_halt(vcpu);
3665                 }
3666                 return 1;
3667         }
3668
3669         ex_no = intr_info & INTR_INFO_VECTOR_MASK;
3670         switch (ex_no) {
3671         case DB_VECTOR:
3672                 dr6 = vmcs_readl(EXIT_QUALIFICATION);
3673                 if (!(vcpu->guest_debug &
3674                       (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
3675                         vcpu->arch.dr6 = dr6 | DR6_FIXED_1;
3676                         kvm_queue_exception(vcpu, DB_VECTOR);
3677                         return 1;
3678                 }
3679                 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
3680                 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
3681                 /* fall through */
3682         case BP_VECTOR:
3683                 /*
3684                  * Update instruction length as we may reinject #BP from
3685                  * user space while in guest debugging mode. Reading it for
3686                  * #DB as well causes no harm, it is not used in that case.
3687                  */
3688                 vmx->vcpu.arch.event_exit_inst_len =
3689                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3690                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
3691                 rip = kvm_rip_read(vcpu);
3692                 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
3693                 kvm_run->debug.arch.exception = ex_no;
3694                 break;
3695         default:
3696                 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
3697                 kvm_run->ex.exception = ex_no;
3698                 kvm_run->ex.error_code = error_code;
3699                 break;
3700         }
3701         return 0;
3702 }
3703
3704 static int handle_external_interrupt(struct kvm_vcpu *vcpu)
3705 {
3706         ++vcpu->stat.irq_exits;
3707         return 1;
3708 }
3709
3710 static int handle_triple_fault(struct kvm_vcpu *vcpu)
3711 {
3712         vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
3713         return 0;
3714 }
3715
3716 static int handle_io(struct kvm_vcpu *vcpu)
3717 {
3718         unsigned long exit_qualification;
3719         int size, in, string;
3720         unsigned port;
3721
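        /*
         * I/O exit qualification: bits 2:0 = access size minus 1, bit 3 =
         * direction (1 = IN), bit 4 = string instruction, bits 31:16 = port.
         */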
3722         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3723         string = (exit_qualification & 16) != 0;
3724         in = (exit_qualification & 8) != 0;
3725
3726         ++vcpu->stat.io_exits;
3727
3728         if (string || in)
3729                 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3730
3731         port = exit_qualification >> 16;
3732         size = (exit_qualification & 7) + 1;
3733         skip_emulated_instruction(vcpu);
3734
3735         return kvm_fast_pio_out(vcpu, size, port);
3736 }
3737
3738 static void
3739 vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3740 {
3741         /*
3742          * Patch in the VMCALL instruction:
3743          */
3744         hypercall[0] = 0x0f;
3745         hypercall[1] = 0x01;
3746         hypercall[2] = 0xc1;
3747 }
3748
3749 static int handle_cr(struct kvm_vcpu *vcpu)
3750 {
3751         unsigned long exit_qualification, val;
3752         int cr;
3753         int reg;
3754         int err;
3755
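        /*
         * CR-access exit qualification: bits 3:0 = control register number,
         * bits 5:4 = access type (0 = mov to cr, 1 = mov from cr, 2 = clts,
         * 3 = lmsw), bits 11:8 = general purpose register operand.
         */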
3756         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3757         cr = exit_qualification & 15;
3758         reg = (exit_qualification >> 8) & 15;
3759         switch ((exit_qualification >> 4) & 3) {
3760         case 0: /* mov to cr */
3761                 val = kvm_register_read(vcpu, reg);
3762                 trace_kvm_cr_write(cr, val);
3763                 switch (cr) {
3764                 case 0:
3765                         err = kvm_set_cr0(vcpu, val);
3766                         kvm_complete_insn_gp(vcpu, err);
3767                         return 1;
3768                 case 3:
3769                         err = kvm_set_cr3(vcpu, val);
3770                         kvm_complete_insn_gp(vcpu, err);
3771                         return 1;
3772                 case 4:
3773                         err = kvm_set_cr4(vcpu, val);
3774                         kvm_complete_insn_gp(vcpu, err);
3775                         return 1;
3776                 case 8: {
3777                                 u8 cr8_prev = kvm_get_cr8(vcpu);
3778                                 u8 cr8 = kvm_register_read(vcpu, reg);
3779                                 err = kvm_set_cr8(vcpu, cr8);
3780                                 kvm_complete_insn_gp(vcpu, err);
3781                                 if (irqchip_in_kernel(vcpu->kvm))
3782                                         return 1;
3783                                 if (cr8_prev <= cr8)
3784                                         return 1;
3785                                 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
3786                                 return 0;
3787                         }
3788                 }
3789                 break;
3790         case 2: /* clts */
3791                 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
3792                 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
3793                 skip_emulated_instruction(vcpu);
3794                 vmx_fpu_activate(vcpu);
3795                 return 1;
3796         case 1: /*mov from cr*/
3797                 switch (cr) {
3798                 case 3:
3799                         val = kvm_read_cr3(vcpu);
3800                         kvm_register_write(vcpu, reg, val);
3801                         trace_kvm_cr_read(cr, val);
3802                         skip_emulated_instruction(vcpu);
3803                         return 1;
3804                 case 8:
3805                         val = kvm_get_cr8(vcpu);
3806                         kvm_register_write(vcpu, reg, val);
3807                         trace_kvm_cr_read(cr, val);
3808                         skip_emulated_instruction(vcpu);
3809                         return 1;
3810                 }
3811                 break;
3812         case 3: /* lmsw */
3813                 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
3814                 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
3815                 kvm_lmsw(vcpu, val);
3816
3817                 skip_emulated_instruction(vcpu);
3818                 return 1;
3819         default:
3820                 break;
3821         }
3822         vcpu->run->exit_reason = 0;
3823         pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
3824                (int)(exit_qualification >> 4) & 3, cr);
3825         return 0;
3826 }
3827
3828 static int handle_dr(struct kvm_vcpu *vcpu)
3829 {
3830         unsigned long exit_qualification;
3831         int dr, reg;
3832
3833         /* Do not handle if CPL > 0; a #GP is queued and delivered on re-entry */
3834         if (!kvm_require_cpl(vcpu, 0))
3835                 return 1;
3836         dr = vmcs_readl(GUEST_DR7);
3837         if (dr & DR7_GD) {
3838                 /*
3839                  * As the vm-exit takes precedence over the debug trap, we
3840                  * need to emulate the latter, either for the host or the
3841                  * guest debugging itself.
3842                  */
3843                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
3844                         vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
3845                         vcpu->run->debug.arch.dr7 = dr;
3846                         vcpu->run->debug.arch.pc =
3847                                 vmcs_readl(GUEST_CS_BASE) +
3848                                 vmcs_readl(GUEST_RIP);
3849                         vcpu->run->debug.arch.exception = DB_VECTOR;
3850                         vcpu->run->exit_reason = KVM_EXIT_DEBUG;
3851                         return 0;
3852                 } else {
3853                         vcpu->arch.dr7 &= ~DR7_GD;
3854                         vcpu->arch.dr6 |= DR6_BD;
3855                         vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
3856                         kvm_queue_exception(vcpu, DB_VECTOR);
3857                         return 1;
3858                 }
3859         }
3860
3861         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3862         dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
3863         reg = DEBUG_REG_ACCESS_REG(exit_qualification);
3864         if (exit_qualification & TYPE_MOV_FROM_DR) {
3865                 unsigned long val;
3866                 if (!kvm_get_dr(vcpu, dr, &val))
3867                         kvm_register_write(vcpu, reg, val);
3868         } else
3869                 kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
3870         skip_emulated_instruction(vcpu);
3871         return 1;
3872 }
3873
3874 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
3875 {
3876         vmcs_writel(GUEST_DR7, val);
3877 }
3878
3879 static int handle_cpuid(struct kvm_vcpu *vcpu)
3880 {
3881         kvm_emulate_cpuid(vcpu);
3882         return 1;
3883 }
3884
3885 static int handle_rdmsr(struct kvm_vcpu *vcpu)
3886 {
3887         u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3888         u64 data;
3889
3890         if (vmx_get_msr(vcpu, ecx, &data)) {
3891                 trace_kvm_msr_read_ex(ecx);
3892                 kvm_inject_gp(vcpu, 0);
3893                 return 1;
3894         }
3895
3896         trace_kvm_msr_read(ecx, data);
3897
3898         /* FIXME: handling of bits 32:63 of rax, rdx */
3899         vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
3900         vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
3901         skip_emulated_instruction(vcpu);
3902         return 1;
3903 }
3904
3905 static int handle_wrmsr(struct kvm_vcpu *vcpu)
3906 {
3907         u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3908         u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
3909                 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
3910
3911         if (vmx_set_msr(vcpu, ecx, data) != 0) {
3912                 trace_kvm_msr_write_ex(ecx, data);
3913                 kvm_inject_gp(vcpu, 0);
3914                 return 1;
3915         }
3916
3917         trace_kvm_msr_write(ecx, data);
3918         skip_emulated_instruction(vcpu);
3919         return 1;
3920 }
3921
3922 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
3923 {
3924         kvm_make_request(KVM_REQ_EVENT, vcpu);
3925         return 1;
3926 }
3927
3928 static int handle_interrupt_window(struct kvm_vcpu *vcpu)
3929 {
3930         u32 cpu_based_vm_exec_control;
3931
3932         /* clear pending irq */
3933         cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3934         cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
3935         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3936
3937         kvm_make_request(KVM_REQ_EVENT, vcpu);
3938
3939         ++vcpu->stat.irq_window_exits;
3940
3941         /*
3942          * If user space is waiting to inject interrupts, exit as soon as
3943          * possible.
3944          */
3945         if (!irqchip_in_kernel(vcpu->kvm) &&
3946             vcpu->run->request_interrupt_window &&
3947             !kvm_cpu_has_interrupt(vcpu)) {
3948                 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3949                 return 0;
3950         }
3951         return 1;
3952 }
3953
3954 static int handle_halt(struct kvm_vcpu *vcpu)
3955 {
3956         skip_emulated_instruction(vcpu);
3957         return kvm_emulate_halt(vcpu);
3958 }
3959
3960 static int handle_vmcall(struct kvm_vcpu *vcpu)
3961 {
3962         skip_emulated_instruction(vcpu);
3963         kvm_emulate_hypercall(vcpu);
3964         return 1;
3965 }
3966
3967 static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3968 {
3969         kvm_queue_exception(vcpu, UD_VECTOR);
3970         return 1;
3971 }
3972
3973 static int handle_invd(struct kvm_vcpu *vcpu)
3974 {
3975         return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3976 }
3977
3978 static int handle_invlpg(struct kvm_vcpu *vcpu)
3979 {
3980         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3981
3982         kvm_mmu_invlpg(vcpu, exit_qualification);
3983         skip_emulated_instruction(vcpu);
3984         return 1;
3985 }
3986
3987 static int handle_wbinvd(struct kvm_vcpu *vcpu)
3988 {
3989         skip_emulated_instruction(vcpu);
3990         kvm_emulate_wbinvd(vcpu);
3991         return 1;
3992 }
3993
3994 static int handle_xsetbv(struct kvm_vcpu *vcpu)
3995 {
3996         u64 new_bv = kvm_read_edx_eax(vcpu);
3997         u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3998
3999         if (kvm_set_xcr(vcpu, index, new_bv) == 0)
4000                 skip_emulated_instruction(vcpu);
4001         return 1;
4002 }
4003
4004 static int handle_apic_access(struct kvm_vcpu *vcpu)
4005 {
4006         return emulate_instruction(vcpu, 0) == EMULATE_DONE;
4007 }
4008
4009 static int handle_task_switch(struct kvm_vcpu *vcpu)
4010 {
4011         struct vcpu_vmx *vmx = to_vmx(vcpu);
4012         unsigned long exit_qualification;
4013         bool has_error_code = false;
4014         u32 error_code = 0;
4015         u16 tss_selector;
4016         int reason, type, idt_v;
4017
4018         idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
4019         type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
4020
4021         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4022
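        /*
         * Bits 31:30 of the exit qualification encode the task-switch source
         * (CALL, IRET, JMP or IDT task gate); bits 15:0 hold the new TSS
         * selector.
         */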
4023         reason = (u32)exit_qualification >> 30;
4024         if (reason == TASK_SWITCH_GATE && idt_v) {
4025                 switch (type) {
4026                 case INTR_TYPE_NMI_INTR:
4027                         vcpu->arch.nmi_injected = false;
4028                         vmx_set_nmi_mask(vcpu, true);
4029                         break;
4030                 case INTR_TYPE_EXT_INTR:
4031                 case INTR_TYPE_SOFT_INTR:
4032                         kvm_clear_interrupt_queue(vcpu);
4033                         break;
4034                 case INTR_TYPE_HARD_EXCEPTION:
4035                         if (vmx->idt_vectoring_info &
4036                             VECTORING_INFO_DELIVER_CODE_MASK) {
4037                                 has_error_code = true;
4038                                 error_code =
4039                                         vmcs_read32(IDT_VECTORING_ERROR_CODE);
4040                         }
4041                         /* fall through */
4042                 case INTR_TYPE_SOFT_EXCEPTION:
4043                         kvm_clear_exception_queue(vcpu);
4044                         break;
4045                 default:
4046                         break;
4047                 }
4048         }
4049         tss_selector = exit_qualification;
4050
4051         if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
4052                        type != INTR_TYPE_EXT_INTR &&
4053                        type != INTR_TYPE_NMI_INTR))
4054                 skip_emulated_instruction(vcpu);
4055
4056         if (kvm_task_switch(vcpu, tss_selector, reason,
4057                                 has_error_code, error_code) == EMULATE_FAIL) {
4058                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4059                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
4060                 vcpu->run->internal.ndata = 0;
4061                 return 0;
4062         }
4063
4064         /* clear all local breakpoint enable flags (bits 0, 2, 4 and 6) */
4065         vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55);
4066
4067         /*
4068          * TODO: What about debug traps on tss switch?
4069          *       Are we supposed to inject them and update dr6?
4070          */
4071
4072         return 1;
4073 }
4074
4075 static int handle_ept_violation(struct kvm_vcpu *vcpu)
4076 {
4077         unsigned long exit_qualification;
4078         gpa_t gpa;
4079         int gla_validity;
4080
4081         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4082
4083         if (exit_qualification & (1 << 6)) {
4084                 printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
4085                 return -EINVAL;
4086         }
4087
4088         gla_validity = (exit_qualification >> 7) & 0x3;
4089         if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
4090                 printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
4091                 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
4092                         (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
4093                         vmcs_readl(GUEST_LINEAR_ADDRESS));
4094                 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
4095                         (long unsigned int)exit_qualification);
4096                 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
4097                 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
4098                 return 0;
4099         }
4100
4101         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
4102         trace_kvm_page_fault(gpa, exit_qualification);
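        /*
         * Bits 0 and 1 of the exit qualification say whether the faulting
         * access was a read or a write; pass them on as the error code.
         */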
4103         return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
4104 }
4105
4106 static u64 ept_rsvd_mask(u64 spte, int level)
4107 {
4108         int i;
4109         u64 mask = 0;
4110
4111         for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
4112                 mask |= (1ULL << i);
4113
4114         if (level > 2)
4115                 /* bits 7:3 reserved */
4116                 mask |= 0xf8;
4117         else if (level == 2) {
4118                 if (spte & (1ULL << 7))
4119                         /* 2MB page, bits 20:12 reserved */
4120                         mask |= 0x1ff000;
4121                 else
4122                         /* bits 6:3 reserved */
4123                         mask |= 0x78;
4124         }
4125
4126         return mask;
4127 }
4128
4129 static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
4130                                        int level)
4131 {
4132         printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level);
4133
4134         /* 010b (write-only) */
4135         WARN_ON((spte & 0x7) == 0x2);
4136
4137         /* 110b (write/execute) */
4138         WARN_ON((spte & 0x7) == 0x6);
4139
4140         /* 100b (execute-only) and value not supported by logical processor */
4141         if (!cpu_has_vmx_ept_execute_only())
4142                 WARN_ON((spte & 0x7) == 0x4);
4143
4144         /* not 000b */
4145         if ((spte & 0x7)) {
4146                 u64 rsvd_bits = spte & ept_rsvd_mask(spte, level);
4147
4148                 if (rsvd_bits != 0) {
4149                         printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n",
4150                                          __func__, rsvd_bits);
4151                         WARN_ON(1);
4152                 }
4153
4154                 if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
4155                         u64 ept_mem_type = (spte & 0x38) >> 3;
4156
4157                         if (ept_mem_type == 2 || ept_mem_type == 3 ||
4158                             ept_mem_type == 7) {
4159                                 printk(KERN_ERR "%s: ept_mem_type=0x%llx\n",
4160                                                 __func__, ept_mem_type);
4161                                 WARN_ON(1);
4162                         }
4163                 }
4164         }
4165 }
4166
4167 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
4168 {
4169         u64 sptes[4];
4170         int nr_sptes, i;
4171         gpa_t gpa;
4172
4173         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
4174
4175         printk(KERN_ERR "EPT: Misconfiguration.\n");
4176         printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
4177
4178         nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
4179
4180         for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
4181                 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
4182
4183         vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
4184         vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
4185
4186         return 0;
4187 }
4188
4189 static int handle_nmi_window(struct kvm_vcpu *vcpu)
4190 {
4191         u32 cpu_based_vm_exec_control;
4192
4193         /* clear pending NMI */
4194         cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4195         cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
4196         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
4197         ++vcpu->stat.nmi_window_exits;
4198         kvm_make_request(KVM_REQ_EVENT, vcpu);
4199
4200         return 1;
4201 }
4202
4203 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
4204 {
4205         struct vcpu_vmx *vmx = to_vmx(vcpu);
4206         enum emulation_result err = EMULATE_DONE;
4207         int ret = 1;
4208         u32 cpu_exec_ctrl;
4209         bool intr_window_requested;
4210
4211         cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4212         intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
4213
4214         while (!guest_state_valid(vcpu)) {
4215                 if (intr_window_requested
4216                     && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
4217                         return handle_interrupt_window(&vmx->vcpu);
4218
4219                 err = emulate_instruction(vcpu, 0);
4220
4221                 if (err == EMULATE_DO_MMIO) {
4222                         ret = 0;
4223                         goto out;
4224                 }
4225
4226                 if (err != EMULATE_DONE)
4227                         return 0;
4228
4229                 if (signal_pending(current))
4230                         goto out;
4231                 if (need_resched())
4232                         schedule();
4233         }
4234
4235         vmx->emulation_required = 0;
4236 out:
4237         return ret;
4238 }
4239
4240 /*
4241  * Indicate a vcpu busy-waiting on a spinlock. We do not enable plain PAUSE
4242  * exiting, so we only get here on CPUs with PAUSE-loop exiting.
4243  */
4244 static int handle_pause(struct kvm_vcpu *vcpu)
4245 {
4246         skip_emulated_instruction(vcpu);
4247         kvm_vcpu_on_spin(vcpu);
4248
4249         return 1;
4250 }
4251
4252 static int handle_invalid_op(struct kvm_vcpu *vcpu)
4253 {
4254         kvm_queue_exception(vcpu, UD_VECTOR);
4255         return 1;
4256 }
4257
4258 /*
4259  * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
4260  * We could reuse a single VMCS for all the L2 guests, but we also want the
4261  * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
4262  * allows keeping them loaded on the processor, and in the future will allow
4263  * optimizations where prepare_vmcs02 doesn't need to set all the fields on
4264  * every entry if they never change.
4265  * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
4266  * (>=0) with a vmcs02 for each recently loaded vmcs12, most recent first.
4267  *
4268  * The following functions allocate and free a vmcs02 in this pool.
4269  */
4270
4271 /* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
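/*
 * The pool is kept in MRU order: a hit is moved to the head of the list, and
 * when the pool is full the entry at the tail (least recently used) is
 * recycled for the new vmcs12.
 */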
4272 static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
4273 {
4274         struct vmcs02_list *item;
4275         list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
4276                 if (item->vmptr == vmx->nested.current_vmptr) {
4277                         list_move(&item->list, &vmx->nested.vmcs02_pool);
4278                         return &item->vmcs02;
4279                 }
4280
4281         if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
4282                 /* Recycle the least recently used VMCS. */
4283                 item = list_entry(vmx->nested.vmcs02_pool.prev,
4284                         struct vmcs02_list, list);
4285                 item->vmptr = vmx->nested.current_vmptr;
4286                 list_move(&item->list, &vmx->nested.vmcs02_pool);
4287                 return &item->vmcs02;
4288         }
4289
4290         /* Create a new VMCS */
4291         item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
4293         if (!item)
4294                 return NULL;
4295         item->vmcs02.vmcs = alloc_vmcs();
4296         if (!item->vmcs02.vmcs) {
4297                 kfree(item);
4298                 return NULL;
4299         }
4300         loaded_vmcs_init(&item->vmcs02);
4301         item->vmptr = vmx->nested.current_vmptr;
4302         list_add(&(item->list), &(vmx->nested.vmcs02_pool));
4303         vmx->nested.vmcs02_num++;
4304         return &item->vmcs02;
4305 }
4306
4307 /* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
4308 static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
4309 {
4310         struct vmcs02_list *item;
4311         list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
4312                 if (item->vmptr == vmptr) {
4313                         free_loaded_vmcs(&item->vmcs02);
4314                         list_del(&item->list);
4315                         kfree(item);
4316                         vmx->nested.vmcs02_num--;
4317                         return;
4318                 }
4319 }
4320
4321 /*
4322  * Free all VMCSs saved for this vcpu, except the one pointed by
4323  * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one
4324  * currently used, if running L2), and vmcs01 when running L2.
4325  */
4326 static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
4327 {
4328         struct vmcs02_list *item, *n;
4329         list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
4330                 if (vmx->loaded_vmcs != &item->vmcs02)
4331                         free_loaded_vmcs(&item->vmcs02);
4332                 list_del(&item->list);
4333                 kfree(item);
4334         }
4335         vmx->nested.vmcs02_num = 0;
4336
4337         if (vmx->loaded_vmcs != &vmx->vmcs01)
4338                 free_loaded_vmcs(&vmx->vmcs01);
4339 }
4340
4341 /*
4342  * Emulate the VMXON instruction.
4343  * Currently, we just remember that VMX is active, and do not save or even
4344  * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4345  * do not currently need to store anything in that guest-allocated memory
4346  * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
4347  * argument is different from the VMXON pointer (which the spec says they do).
4348  */
4349 static int handle_vmon(struct kvm_vcpu *vcpu)
4350 {
4351         struct kvm_segment cs;
4352         struct vcpu_vmx *vmx = to_vmx(vcpu);
4353
4354         /* The Intel VMX Instruction Reference lists a bunch of bits that
4355          * are prerequisite to running VMXON, most notably cr4.VMXE must be
4356          * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
4357          * Otherwise, we should fail with #UD. We test these now:
4358          */
4359         if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
4360             !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
4361             (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
4362                 kvm_queue_exception(vcpu, UD_VECTOR);
4363                 return 1;
4364         }
4365
4366         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4367         if (is_long_mode(vcpu) && !cs.l) {
4368                 kvm_queue_exception(vcpu, UD_VECTOR);
4369                 return 1;
4370         }
4371
4372         if (vmx_get_cpl(vcpu)) {
4373                 kvm_inject_gp(vcpu, 0);
4374                 return 1;
4375         }
4376
4377         INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
4378         vmx->nested.vmcs02_num = 0;
4379
4380         vmx->nested.vmxon = true;
4381
4382         skip_emulated_instruction(vcpu);
4383         return 1;
4384 }
4385
4386 /*
4387  * Intel's VMX Instruction Reference specifies a common set of prerequisites
4388  * for running VMX instructions (except VMXON, whose prerequisites are
4389  * slightly different). It also specifies what exception to inject otherwise.
4390  */
4391 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
4392 {
4393         struct kvm_segment cs;
4394         struct vcpu_vmx *vmx = to_vmx(vcpu);
4395
4396         if (!vmx->nested.vmxon) {
4397                 kvm_queue_exception(vcpu, UD_VECTOR);
4398                 return 0;
4399         }
4400
4401         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4402         if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
4403             (is_long_mode(vcpu) && !cs.l)) {
4404                 kvm_queue_exception(vcpu, UD_VECTOR);
4405                 return 0;
4406         }
4407
4408         if (vmx_get_cpl(vcpu)) {
4409                 kvm_inject_gp(vcpu, 0);
4410                 return 0;
4411         }
4412
4413         return 1;
4414 }
4415
4416 /*
4417  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
4418  * just stops using VMX.
4419  */
4420 static void free_nested(struct vcpu_vmx *vmx)
4421 {
4422         if (!vmx->nested.vmxon)
4423                 return;
4424         vmx->nested.vmxon = false;
4425         if (vmx->nested.current_vmptr != -1ull) {
4426                 kunmap(vmx->nested.current_vmcs12_page);
4427                 nested_release_page(vmx->nested.current_vmcs12_page);
4428                 vmx->nested.current_vmptr = -1ull;
4429                 vmx->nested.current_vmcs12 = NULL;
4430         }
4431
4432         nested_free_all_saved_vmcss(vmx);
4433 }
4434
4435 /* Emulate the VMXOFF instruction */
4436 static int handle_vmoff(struct kvm_vcpu *vcpu)
4437 {
4438         if (!nested_vmx_check_permission(vcpu))
4439                 return 1;
4440         free_nested(to_vmx(vcpu));
4441         skip_emulated_instruction(vcpu);
4442         return 1;
4443 }
4444
4445 /*
4446  * Decode the memory-address operand of a vmx instruction, as recorded on an
4447  * exit caused by such an instruction (run by a guest hypervisor).
4448  * On success, returns 0. When the operand is invalid, returns 1 and throws
4449  * #UD or #GP.
4450  */
4451 static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
4452                                  unsigned long exit_qualification,
4453                                  u32 vmx_instruction_info, gva_t *ret)
4454 {
4455         /*
4456          * According to Vol. 3B, "Information for VM Exits Due to Instruction
4457          * Execution", on an exit, vmx_instruction_info holds most of the
4458          * addressing components of the operand. Only the displacement part
4459          * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4460          * For how an actual address is calculated from all these components,
4461          * refer to Vol. 1, "Operand Addressing".
4462          */
4463         int  scaling = vmx_instruction_info & 3;
4464         int  addr_size = (vmx_instruction_info >> 7) & 7;
4465         bool is_reg = vmx_instruction_info & (1u << 10);
4466         int  seg_reg = (vmx_instruction_info >> 15) & 7;
4467         int  index_reg = (vmx_instruction_info >> 18) & 0xf;
4468         bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4469         int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
4470         bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
4471
4472         if (is_reg) {
4473                 kvm_queue_exception(vcpu, UD_VECTOR);
4474                 return 1;
4475         }
4476
4477         /* Addr = segment_base + offset */
4478         /* offset = base + [index * scale] + displacement */
4479         *ret = vmx_get_segment_base(vcpu, seg_reg);
4480         if (base_is_valid)
4481                 *ret += kvm_register_read(vcpu, base_reg);
4482         if (index_is_valid)
4483                 *ret += kvm_register_read(vcpu, index_reg)<<scaling;
4484         *ret += exit_qualification; /* holds the displacement */
4485
4486         if (addr_size == 1) /* 32 bit */
4487                 *ret &= 0xffffffff;
4488
4489         /*
4490          * TODO: throw #GP (and return 1) in various cases that the VM*
4491          * instructions require it - e.g., offset beyond segment limit,
4492          * unusable or unreadable/unwritable segment, non-canonical 64-bit
4493          * address, and so on. Currently these are not checked.
4494          */
4495         return 0;
4496 }
4497
4498 /*
4499  * The exit handlers return 1 if the exit was handled fully and guest execution
4500  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
4501  * to be done to userspace and return 0.
4502  */
4503 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
4504         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
4505         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
4506         [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
4507         [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
4508         [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
4509         [EXIT_REASON_CR_ACCESS]               = handle_cr,
4510         [EXIT_REASON_DR_ACCESS]               = handle_dr,
4511         [EXIT_REASON_CPUID]                   = handle_cpuid,
4512         [EXIT_REASON_MSR_READ]                = handle_rdmsr,
4513         [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
4514         [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
4515         [EXIT_REASON_HLT]                     = handle_halt,
4516         [EXIT_REASON_INVD]                    = handle_invd,
4517         [EXIT_REASON_INVLPG]                  = handle_invlpg,
4518         [EXIT_REASON_VMCALL]                  = handle_vmcall,
4519         [EXIT_REASON_VMCLEAR]                 = handle_vmx_insn,
4520         [EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
4521         [EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
4522         [EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
4523         [EXIT_REASON_VMREAD]                  = handle_vmx_insn,
4524         [EXIT_REASON_VMRESUME]                = handle_vmx_insn,
4525         [EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
4526         [EXIT_REASON_VMOFF]                   = handle_vmoff,
4527         [EXIT_REASON_VMON]                    = handle_vmon,
4528         [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
4529         [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
4530         [EXIT_REASON_WBINVD]                  = handle_wbinvd,
4531         [EXIT_REASON_XSETBV]                  = handle_xsetbv,
4532         [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
4533         [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
4534         [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
4535         [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
4536         [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
4537         [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_invalid_op,
4538         [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
4539 };
4540
4541 static const int kvm_vmx_max_exit_handlers =
4542         ARRAY_SIZE(kvm_vmx_exit_handlers);
4543
4544 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
4545 {
4546         *info1 = vmcs_readl(EXIT_QUALIFICATION);
4547         *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
4548 }
4549
4550 /*
4551  * The guest has exited.  See if we can fix it or if we need userspace
4552  * assistance.
4553  */
4554 static int vmx_handle_exit(struct kvm_vcpu *vcpu)
4555 {
4556         struct vcpu_vmx *vmx = to_vmx(vcpu);
4557         u32 exit_reason = vmx->exit_reason;
4558         u32 vectoring_info = vmx->idt_vectoring_info;
4559
4560         trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
4561
4562         /* If guest state is invalid, start emulating */
4563         if (vmx->emulation_required && emulate_invalid_guest_state)
4564                 return handle_invalid_guest_state(vcpu);
4565
4566         if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
4567                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
4568                 vcpu->run->fail_entry.hardware_entry_failure_reason
4569                         = exit_reason;
4570                 return 0;
4571         }
4572
4573         if (unlikely(vmx->fail)) {
4574                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
4575                 vcpu->run->fail_entry.hardware_entry_failure_reason
4576                         = vmcs_read32(VM_INSTRUCTION_ERROR);
4577                 return 0;
4578         }
4579
4580         if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
4581                         (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
4582                         exit_reason != EXIT_REASON_EPT_VIOLATION &&
4583                         exit_reason != EXIT_REASON_TASK_SWITCH))
4584                 printk(KERN_WARNING "%s: unexpected, valid vectoring info "
4585                        "(0x%x) and exit reason is 0x%x\n",
4586                        __func__, vectoring_info, exit_reason);
4587
4588         if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
4589                 if (vmx_interrupt_allowed(vcpu)) {
4590                         vmx->soft_vnmi_blocked = 0;
4591                 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
4592                            vcpu->arch.nmi_pending) {
4593                         /*
4594                          * This CPU doesn't support us in finding the end of an
4595                          * NMI-blocked window if the guest runs with IRQs
4596                          * disabled. So we pull the trigger after 1 s of
4597                          * futile waiting, but inform the user about this.
4598                          */
4599                         printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
4600                                "state on VCPU %d after 1 s timeout\n",
4601                                __func__, vcpu->vcpu_id);
4602                         vmx->soft_vnmi_blocked = 0;
4603                 }
4604         }
4605
4606         if (exit_reason < kvm_vmx_max_exit_handlers
4607             && kvm_vmx_exit_handlers[exit_reason])
4608                 return kvm_vmx_exit_handlers[exit_reason](vcpu);
4609         else {
4610                 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
4611                 vcpu->run->hw.hardware_exit_reason = exit_reason;
4612         }
4613         return 0;
4614 }
4615
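/*
 * Note: irr below is the priority class of the highest pending interrupt,
 * as computed by the common x86 code.  When that interrupt is blocked by the
 * guest's TPR (tpr >= irr), programming TPR_THRESHOLD = irr makes the CPU
 * take a TPR-below-threshold exit as soon as the guest lowers its task
 * priority enough for the interrupt to become deliverable; otherwise the
 * threshold is cleared to avoid needless exits.
 */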
4616 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
4617 {
4618         if (irr == -1 || tpr < irr) {
4619                 vmcs_write32(TPR_THRESHOLD, 0);
4620                 return;
4621         }
4622
4623         vmcs_write32(TPR_THRESHOLD, irr);
4624 }
4625
4626 static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
4627 {
4628         u32 exit_intr_info;
4629
4630         if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
4631               || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI))
4632                 return;
4633
4634         vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
4635         exit_intr_info = vmx->exit_intr_info;
4636
4637         /* Handle machine checks before interrupts are enabled */
4638         if (is_machine_check(exit_intr_info))
4639                 kvm_machine_check();
4640
4641         /* We need to handle NMIs before interrupts are enabled */
4642         if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
4643             (exit_intr_info & INTR_INFO_VALID_MASK)) {
4644                 kvm_before_handle_nmi(&vmx->vcpu);
4645                 asm("int $2");
4646                 kvm_after_handle_nmi(&vmx->vcpu);
4647         }
4648 }
4649
4650 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
4651 {
4652         u32 exit_intr_info;
4653         bool unblock_nmi;
4654         u8 vector;
4655         bool idtv_info_valid;
4656
4657         idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
4658
4659         if (cpu_has_virtual_nmis()) {
4660                 if (vmx->nmi_known_unmasked)
4661                         return;
4662                 /*
4663                  * Can't use vmx->exit_intr_info since we're not sure what
4664                  * the exit reason is.
4665                  */
4666                 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
4667                 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
4668                 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
4669                 /*
4670                  * SDM 3: 27.7.1.2 (September 2008)
4671                  * Re-set bit "block by NMI" before VM entry if vmexit caused by
4672                  * a guest IRET fault.
4673                  * SDM 3: 23.2.2 (September 2008)
4674                  * Bit 12 is undefined in any of the following cases:
4675                  *  If the VM exit sets the valid bit in the IDT-vectoring
4676                  *   information field.
4677                  *  If the VM exit is due to a double fault.
4678                  */
4679                 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
4680                     vector != DF_VECTOR && !idtv_info_valid)
4681                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
4682                                       GUEST_INTR_STATE_NMI);
4683                 else
4684                         vmx->nmi_known_unmasked =
4685                                 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
4686                                   & GUEST_INTR_STATE_NMI);
4687         } else if (unlikely(vmx->soft_vnmi_blocked))
4688                 vmx->vnmi_blocked_time +=
4689                         ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
4690 }
4691
4692 static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
4693                                       u32 idt_vectoring_info,
4694                                       int instr_len_field,
4695                                       int error_code_field)
4696 {
4697         u8 vector;
4698         int type;
4699         bool idtv_info_valid;
4700
4701         idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
4702
4703         vmx->vcpu.arch.nmi_injected = false;
4704         kvm_clear_exception_queue(&vmx->vcpu);
4705         kvm_clear_interrupt_queue(&vmx->vcpu);
4706
4707         if (!idtv_info_valid)
4708                 return;
4709
4710         kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
4711
4712         vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
4713         type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
4714
4715         switch (type) {
4716         case INTR_TYPE_NMI_INTR:
4717                 vmx->vcpu.arch.nmi_injected = true;
4718                 /*
4719                  * SDM 3: 27.7.1.2 (September 2008)
4720                  * Clear bit "block by NMI" before VM entry if an NMI
4721                  * delivery faulted.
4722                  */
4723                 vmx_set_nmi_mask(&vmx->vcpu, false);
4724                 break;
4725         case INTR_TYPE_SOFT_EXCEPTION:
4726                 vmx->vcpu.arch.event_exit_inst_len =
4727                         vmcs_read32(instr_len_field);
4728                 /* fall through */
4729         case INTR_TYPE_HARD_EXCEPTION:
4730                 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
4731                         u32 err = vmcs_read32(error_code_field);
4732                         kvm_queue_exception_e(&vmx->vcpu, vector, err);
4733                 } else
4734                         kvm_queue_exception(&vmx->vcpu, vector);
4735                 break;
4736         case INTR_TYPE_SOFT_INTR:
4737                 vmx->vcpu.arch.event_exit_inst_len =
4738                         vmcs_read32(instr_len_field);
4739                 /* fall through */
4740         case INTR_TYPE_EXT_INTR:
4741                 kvm_queue_interrupt(&vmx->vcpu, vector,
4742                         type == INTR_TYPE_SOFT_INTR);
4743                 break;
4744         default:
4745                 break;
4746         }
4747 }
4748
4749 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
4750 {
4751         __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
4752                                   VM_EXIT_INSTRUCTION_LEN,
4753                                   IDT_VECTORING_ERROR_CODE);
4754 }
4755
4756 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
4757 {
4758         __vmx_complete_interrupts(to_vmx(vcpu),
4759                                   vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
4760                                   VM_ENTRY_INSTRUCTION_LEN,
4761                                   VM_ENTRY_EXCEPTION_ERROR_CODE);
4762
4763         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
4764 }
4765
4766 #ifdef CONFIG_X86_64
4767 #define R "r"
4768 #define Q "q"
4769 #else
4770 #define R "e"
4771 #define Q "l"
4772 #endif
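/*
 * Note: R and Q select the natural word size for the inline assembly below.
 * For example, "push %%"R"dx" assembles to "push %rdx" on x86-64 and
 * "push %edx" on i386, and the "pop"Q suffix yields popq and popl
 * respectively.
 */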
4773
4774 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4775 {
4776         struct vcpu_vmx *vmx = to_vmx(vcpu);
4777
4778         /* Record the guest's net vcpu time for enforced NMI injections. */
4779         if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
4780                 vmx->entry_time = ktime_get();
4781
4782         /* Don't enter the guest if its state is invalid; let the exit handler
4783            start emulation until we arrive back at a valid state */
4784         if (vmx->emulation_required && emulate_invalid_guest_state)
4785                 return;
4786
4787         if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
4788                 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
4789         if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
4790                 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
4791
4792         /* When single-stepping over STI and MOV SS, we must clear the
4793          * corresponding interruptibility bits in the guest state. Otherwise
4794          * vmentry fails as it then expects bit 14 (BS) in pending debug
4795          * exceptions being set, but that's not correct for the guest debugging
4796          * case. */
4797         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
4798                 vmx_set_interrupt_shadow(vcpu, 0);
4799
4800         vmx->__launched = vmx->loaded_vmcs->launched;
4801         asm(
4802                 /* Store host registers */
4803                 "push %%"R"dx; push %%"R"bp;"
4804                 "push %%"R"cx \n\t" /* placeholder for guest rcx */
4805                 "push %%"R"cx \n\t"
4806                 "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
4807                 "je 1f \n\t"
4808                 "mov %%"R"sp, %c[host_rsp](%0) \n\t"
4809                 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
4810                 "1: \n\t"
4811                 /* Reload cr2 if changed */
4812                 "mov %c[cr2](%0), %%"R"ax \n\t"
4813                 "mov %%cr2, %%"R"dx \n\t"
4814                 "cmp %%"R"ax, %%"R"dx \n\t"
4815                 "je 2f \n\t"
4816                 "mov %%"R"ax, %%cr2 \n\t"
4817                 "2: \n\t"
4818                 /* Check if vmlaunch or vmresume is needed */
4819                 "cmpl $0, %c[launched](%0) \n\t"
4820                 /* Load guest registers.  Don't clobber flags. */
4821                 "mov %c[rax](%0), %%"R"ax \n\t"
4822                 "mov %c[rbx](%0), %%"R"bx \n\t"
4823                 "mov %c[rdx](%0), %%"R"dx \n\t"
4824                 "mov %c[rsi](%0), %%"R"si \n\t"
4825                 "mov %c[rdi](%0), %%"R"di \n\t"
4826                 "mov %c[rbp](%0), %%"R"bp \n\t"
4827 #ifdef CONFIG_X86_64
4828                 "mov %c[r8](%0),  %%r8  \n\t"
4829                 "mov %c[r9](%0),  %%r9  \n\t"
4830                 "mov %c[r10](%0), %%r10 \n\t"
4831                 "mov %c[r11](%0), %%r11 \n\t"
4832                 "mov %c[r12](%0), %%r12 \n\t"
4833                 "mov %c[r13](%0), %%r13 \n\t"
4834                 "mov %c[r14](%0), %%r14 \n\t"
4835                 "mov %c[r15](%0), %%r15 \n\t"
4836 #endif
4837                 "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
4838
4839                 /* Enter guest mode */
4840                 "jne .Llaunched \n\t"
4841                 __ex(ASM_VMX_VMLAUNCH) "\n\t"
4842                 "jmp .Lkvm_vmx_return \n\t"
4843                 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
4844                 ".Lkvm_vmx_return: "
4845                 /* Save guest registers, load host registers, keep flags */
4846                 "mov %0, %c[wordsize](%%"R"sp) \n\t"
4847                 "pop %0 \n\t"
4848                 "mov %%"R"ax, %c[rax](%0) \n\t"
4849                 "mov %%"R"bx, %c[rbx](%0) \n\t"
4850                 "pop"Q" %c[rcx](%0) \n\t"
4851                 "mov %%"R"dx, %c[rdx](%0) \n\t"
4852                 "mov %%"R"si, %c[rsi](%0) \n\t"
4853                 "mov %%"R"di, %c[rdi](%0) \n\t"
4854                 "mov %%"R"bp, %c[rbp](%0) \n\t"
4855 #ifdef CONFIG_X86_64
4856                 "mov %%r8,  %c[r8](%0) \n\t"
4857                 "mov %%r9,  %c[r9](%0) \n\t"
4858                 "mov %%r10, %c[r10](%0) \n\t"
4859                 "mov %%r11, %c[r11](%0) \n\t"
4860                 "mov %%r12, %c[r12](%0) \n\t"
4861                 "mov %%r13, %c[r13](%0) \n\t"
4862                 "mov %%r14, %c[r14](%0) \n\t"
4863                 "mov %%r15, %c[r15](%0) \n\t"
4864 #endif
4865                 "mov %%cr2, %%"R"ax   \n\t"
4866                 "mov %%"R"ax, %c[cr2](%0) \n\t"
4867
4868                 "pop  %%"R"bp; pop  %%"R"dx \n\t"
4869                 "setbe %c[fail](%0) \n\t"
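                /* setbe records a VMX failure: it stores CF | ZF, covering
                   both VMfailInvalid (CF=1) and VMfailValid (ZF=1). */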
4870               : : "c"(vmx), "d"((unsigned long)HOST_RSP),
4871                 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
4872                 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
4873                 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
4874                 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
4875                 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
4876                 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
4877                 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
4878                 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
4879                 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
4880                 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
4881 #ifdef CONFIG_X86_64
4882                 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
4883                 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
4884                 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
4885                 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
4886                 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
4887                 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
4888                 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
4889                 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
4890 #endif
4891                 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
4892                 [wordsize]"i"(sizeof(ulong))
4893               : "cc", "memory"
4894                 , R"ax", R"bx", R"di", R"si"
4895 #ifdef CONFIG_X86_64
4896                 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
4897 #endif
4898               );
4899
4900         vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
4901                                   | (1 << VCPU_EXREG_RFLAGS)
4902                                   | (1 << VCPU_EXREG_CPL)
4903                                   | (1 << VCPU_EXREG_PDPTR)
4904                                   | (1 << VCPU_EXREG_SEGMENTS)
4905                                   | (1 << VCPU_EXREG_CR3));
4906         vcpu->arch.regs_dirty = 0;
4907
4908         vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
4909
4910         asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
4911         vmx->loaded_vmcs->launched = 1;
4912
4913         vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
4914
4915         vmx_complete_atomic_exit(vmx);
4916         vmx_recover_nmi_blocking(vmx);
4917         vmx_complete_interrupts(vmx);
4918 }
4919
4920 #undef R
4921 #undef Q
4922
4923 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
4924 {
4925         struct vcpu_vmx *vmx = to_vmx(vcpu);
4926
4927         free_vpid(vmx);
4928         free_nested(vmx);
4929         free_loaded_vmcs(vmx->loaded_vmcs);
4930         kfree(vmx->guest_msrs);
4931         kvm_vcpu_uninit(vcpu);
4932         kmem_cache_free(kvm_vcpu_cache, vmx);
4933 }
4934
4935 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4936 {
4937         int err;
4938         struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
4939         int cpu;
4940
4941         if (!vmx)
4942                 return ERR_PTR(-ENOMEM);
4943
4944         allocate_vpid(vmx);
4945
4946         err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
4947         if (err)
4948                 goto free_vcpu;
4949
4950         vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
4951         err = -ENOMEM;
4952         if (!vmx->guest_msrs) {
4953                 goto uninit_vcpu;
4954         }
4955
4956         vmx->loaded_vmcs = &vmx->vmcs01;
4957         vmx->loaded_vmcs->vmcs = alloc_vmcs();
4958         if (!vmx->loaded_vmcs->vmcs)
4959                 goto free_msrs;
4960         if (!vmm_exclusive)
4961                 kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
4962         loaded_vmcs_init(vmx->loaded_vmcs);
4963         if (!vmm_exclusive)
4964                 kvm_cpu_vmxoff();
4965
4966         cpu = get_cpu();
4967         vmx_vcpu_load(&vmx->vcpu, cpu);
4968         vmx->vcpu.cpu = cpu;
4969         err = vmx_vcpu_setup(vmx);
4970         vmx_vcpu_put(&vmx->vcpu);
4971         put_cpu();
4972         if (err)
4973                 goto free_vmcs;
4974         if (vm_need_virtualize_apic_accesses(kvm))
4975                 err = alloc_apic_access_page(kvm);
4976         if (err)
4977                 goto free_vmcs;
4978
4979         if (enable_ept) {
4980                 if (!kvm->arch.ept_identity_map_addr)
4981                         kvm->arch.ept_identity_map_addr =
4982                                 VMX_EPT_IDENTITY_PAGETABLE_ADDR;
4983                 err = -ENOMEM;
4984                 if (alloc_identity_pagetable(kvm) != 0)
4985                         goto free_vmcs;
4986                 if (!init_rmode_identity_map(kvm))
4987                         goto free_vmcs;
4988         }
4989
4990         vmx->nested.current_vmptr = -1ull;
4991         vmx->nested.current_vmcs12 = NULL;
4992
4993         return &vmx->vcpu;
4994
4995 free_vmcs:
4996         free_vmcs(vmx->loaded_vmcs->vmcs);
4997 free_msrs:
4998         kfree(vmx->guest_msrs);
4999 uninit_vcpu:
5000         kvm_vcpu_uninit(&vmx->vcpu);
5001 free_vcpu:
5002         free_vpid(vmx);
5003         kmem_cache_free(kvm_vcpu_cache, vmx);
5004         return ERR_PTR(err);
5005 }
5006
5007 static void __init vmx_check_processor_compat(void *rtn)
5008 {
5009         struct vmcs_config vmcs_conf;
5010
5011         *(int *)rtn = 0;
5012         if (setup_vmcs_config(&vmcs_conf) < 0)
5013                 *(int *)rtn = -EIO;
5014         if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
5015                 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
5016                                 smp_processor_id());
5017                 *(int *)rtn = -EIO;
5018         }
5019 }
5020
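/*
 * Note: VMX_EPT_DEFAULT_GAW is 3, the EPTP encoding for a 4-level page walk,
 * so the TDP level reported here is 3 + 1 = 4 (a 48-bit guest physical
 * address space).
 */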
5021 static int get_ept_level(void)
5022 {
5023         return VMX_EPT_DEFAULT_GAW + 1;
5024 }
5025
5026 static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
5027 {
5028         u64 ret;
5029
5030         /* For the VT-d and EPT combination:
5031          * 1. MMIO: always map as UC.
5032          * 2. EPT with VT-d:
5033          *   a. VT-d without snooping control: cache coherency cannot be
5034          *      guaranteed, so trust the guest's memory type.
5035          *   b. VT-d with snooping control: the VT-d engine guarantees cache
5036          *      correctness, so map as WB to stay consistent with the host,
5037          *      i.e. the same as item 3.
5038          * 3. EPT without VT-d: always map as WB and set IPAT=1 to stay
5039          *    consistent with the host MTRRs.
5040          */
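        /*
         * Illustration: for ordinary RAM without VT-d, the final branch below
         * evaluates to (6 << 3) | (1 << 6) = 0x70, i.e. memory type WB in EPT
         * PTE bits 5:3 with the "ignore PAT" bit (bit 6) set.
         */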
5041         if (is_mmio)
5042                 ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
5043         else if (vcpu->kvm->arch.iommu_domain &&
5044                 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
5045                 ret = kvm_get_guest_memory_type(vcpu, gfn) <<
5046                       VMX_EPT_MT_EPTE_SHIFT;
5047         else
5048                 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
5049                         | VMX_EPT_IPAT_BIT;
5050
5051         return ret;
5052 }
5053
5054 #define _ER(x) { EXIT_REASON_##x, #x }
5055
5056 static const struct trace_print_flags vmx_exit_reasons_str[] = {
5057         _ER(EXCEPTION_NMI),
5058         _ER(EXTERNAL_INTERRUPT),
5059         _ER(TRIPLE_FAULT),
5060         _ER(PENDING_INTERRUPT),
5061         _ER(NMI_WINDOW),
5062         _ER(TASK_SWITCH),
5063         _ER(CPUID),
5064         _ER(HLT),
5065         _ER(INVLPG),
5066         _ER(RDPMC),
5067         _ER(RDTSC),
5068         _ER(VMCALL),
5069         _ER(VMCLEAR),
5070         _ER(VMLAUNCH),
5071         _ER(VMPTRLD),
5072         _ER(VMPTRST),
5073         _ER(VMREAD),
5074         _ER(VMRESUME),
5075         _ER(VMWRITE),
5076         _ER(VMOFF),
5077         _ER(VMON),
5078         _ER(CR_ACCESS),
5079         _ER(DR_ACCESS),
5080         _ER(IO_INSTRUCTION),
5081         _ER(MSR_READ),
5082         _ER(MSR_WRITE),
5083         _ER(MWAIT_INSTRUCTION),
5084         _ER(MONITOR_INSTRUCTION),
5085         _ER(PAUSE_INSTRUCTION),
5086         _ER(MCE_DURING_VMENTRY),
5087         _ER(TPR_BELOW_THRESHOLD),
5088         _ER(APIC_ACCESS),
5089         _ER(EPT_VIOLATION),
5090         _ER(EPT_MISCONFIG),
5091         _ER(WBINVD),
5092         { -1, NULL }
5093 };
5094
5095 #undef _ER
5096
5097 static int vmx_get_lpage_level(void)
5098 {
5099         if (enable_ept && !cpu_has_vmx_ept_1g_page())
5100                 return PT_DIRECTORY_LEVEL;
5101         else
5102                 /* Shadow paging and 1GB-capable EPT both support 1GB pages */
5103                 return PT_PDPE_LEVEL;
5104 }
5105
5106 static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
5107 {
5108         struct kvm_cpuid_entry2 *best;
5109         struct vcpu_vmx *vmx = to_vmx(vcpu);
5110         u32 exec_control;
5111
5112         vmx->rdtscp_enabled = false;
5113         if (vmx_rdtscp_supported()) {
5114                 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
5115                 if (exec_control & SECONDARY_EXEC_RDTSCP) {
5116                         best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
5117                         if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
5118                                 vmx->rdtscp_enabled = true;
5119                         else {
5120                                 exec_control &= ~SECONDARY_EXEC_RDTSCP;
5121                                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
5122                                                 exec_control);
5123                         }
5124                 }
5125         }
5126 }
5127
5128 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
5129 {
5130 }
5131
5132 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
5133                                struct x86_instruction_info *info,
5134                                enum x86_intercept_stage stage)
5135 {
5136         return X86EMUL_CONTINUE;
5137 }
5138
5139 static struct kvm_x86_ops vmx_x86_ops = {
5140         .cpu_has_kvm_support = cpu_has_kvm_support,
5141         .disabled_by_bios = vmx_disabled_by_bios,
5142         .hardware_setup = hardware_setup,
5143         .hardware_unsetup = hardware_unsetup,
5144         .check_processor_compatibility = vmx_check_processor_compat,
5145         .hardware_enable = hardware_enable,
5146         .hardware_disable = hardware_disable,
5147         .cpu_has_accelerated_tpr = report_flexpriority,
5148
5149         .vcpu_create = vmx_create_vcpu,
5150         .vcpu_free = vmx_free_vcpu,
5151         .vcpu_reset = vmx_vcpu_reset,
5152
5153         .prepare_guest_switch = vmx_save_host_state,
5154         .vcpu_load = vmx_vcpu_load,
5155         .vcpu_put = vmx_vcpu_put,
5156
5157         .set_guest_debug = set_guest_debug,
5158         .get_msr = vmx_get_msr,
5159         .set_msr = vmx_set_msr,
5160         .get_segment_base = vmx_get_segment_base,
5161         .get_segment = vmx_get_segment,
5162         .set_segment = vmx_set_segment,
5163         .get_cpl = vmx_get_cpl,
5164         .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
5165         .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
5166         .decache_cr3 = vmx_decache_cr3,
5167         .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
5168         .set_cr0 = vmx_set_cr0,
5169         .set_cr3 = vmx_set_cr3,
5170         .set_cr4 = vmx_set_cr4,
5171         .set_efer = vmx_set_efer,
5172         .get_idt = vmx_get_idt,
5173         .set_idt = vmx_set_idt,
5174         .get_gdt = vmx_get_gdt,
5175         .set_gdt = vmx_set_gdt,
5176         .set_dr7 = vmx_set_dr7,
5177         .cache_reg = vmx_cache_reg,
5178         .get_rflags = vmx_get_rflags,
5179         .set_rflags = vmx_set_rflags,
5180         .fpu_activate = vmx_fpu_activate,
5181         .fpu_deactivate = vmx_fpu_deactivate,
5182
5183         .tlb_flush = vmx_flush_tlb,
5184
5185         .run = vmx_vcpu_run,
5186         .handle_exit = vmx_handle_exit,
5187         .skip_emulated_instruction = skip_emulated_instruction,
5188         .set_interrupt_shadow = vmx_set_interrupt_shadow,
5189         .get_interrupt_shadow = vmx_get_interrupt_shadow,
5190         .patch_hypercall = vmx_patch_hypercall,
5191         .set_irq = vmx_inject_irq,
5192         .set_nmi = vmx_inject_nmi,
5193         .queue_exception = vmx_queue_exception,
5194         .cancel_injection = vmx_cancel_injection,
5195         .interrupt_allowed = vmx_interrupt_allowed,
5196         .nmi_allowed = vmx_nmi_allowed,
5197         .get_nmi_mask = vmx_get_nmi_mask,
5198         .set_nmi_mask = vmx_set_nmi_mask,
5199         .enable_nmi_window = enable_nmi_window,
5200         .enable_irq_window = enable_irq_window,
5201         .update_cr8_intercept = update_cr8_intercept,
5202
5203         .set_tss_addr = vmx_set_tss_addr,
5204         .get_tdp_level = get_ept_level,
5205         .get_mt_mask = vmx_get_mt_mask,
5206
5207         .get_exit_info = vmx_get_exit_info,
5208         .exit_reasons_str = vmx_exit_reasons_str,
5209
5210         .get_lpage_level = vmx_get_lpage_level,
5211
5212         .cpuid_update = vmx_cpuid_update,
5213
5214         .rdtscp_supported = vmx_rdtscp_supported,
5215
5216         .set_supported_cpuid = vmx_set_supported_cpuid,
5217
5218         .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
5219
5220         .set_tsc_khz = vmx_set_tsc_khz,
5221         .write_tsc_offset = vmx_write_tsc_offset,
5222         .adjust_tsc_offset = vmx_adjust_tsc_offset,
5223         .compute_tsc_offset = vmx_compute_tsc_offset,
5224
5225         .set_tdp_cr3 = vmx_set_cr3,
5226
5227         .check_intercept = vmx_check_intercept,
5228 };
5229
5230 static int __init vmx_init(void)
5231 {
5232         int r, i;
5233
5234         rdmsrl_safe(MSR_EFER, &host_efer);
5235
5236         for (i = 0; i < NR_VMX_MSR; ++i)
5237                 kvm_define_shared_msr(i, vmx_msr_index[i]);
5238
5239         vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
5240         if (!vmx_io_bitmap_a)
5241                 return -ENOMEM;
5242
5243         vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
5244         if (!vmx_io_bitmap_b) {
5245                 r = -ENOMEM;
5246                 goto out;
5247         }
5248
5249         vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
5250         if (!vmx_msr_bitmap_legacy) {
5251                 r = -ENOMEM;
5252                 goto out1;
5253         }
5254
5255         vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
5256         if (!vmx_msr_bitmap_longmode) {
5257                 r = -ENOMEM;
5258                 goto out2;
5259         }
5260
5261         /*
5262          * Allow direct access to the PC debug port (it is often used for I/O
5263          * delays, but the vmexits simply slow things down).
5264          */
5265         memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
5266         clear_bit(0x80, vmx_io_bitmap_a);
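        /*
         * A set bit in an I/O bitmap means "exit on access to this port", so
         * starting from all ones and clearing only bit 0x80 intercepts every
         * port except the 0x80 POST/delay port.
         */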
5267
5268         memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
5269
5270         memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
5271         memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
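        /*
         * The MSR bitmaps work the same way (set bit = intercept), so all
         * MSRs trap by default; the vmx_disable_intercept_for_msr() calls
         * after kvm_init() below clear the bits for frequently accessed MSRs
         * such as FS/GS base and the SYSENTER MSRs.
         */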
5272
5273         set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
5274
5275         r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
5276                      __alignof__(struct vcpu_vmx), THIS_MODULE);
5277         if (r)
5278                 goto out3;
5279
5280         vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
5281         vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
5282         vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
5283         vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
5284         vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
5285         vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
5286
5287         if (enable_ept) {
5288                 bypass_guest_pf = 0;
5289                 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
5290                                 VMX_EPT_EXECUTABLE_MASK);
5291                 kvm_enable_tdp();
5292         } else
5293                 kvm_disable_tdp();
5294
5295         if (bypass_guest_pf)
5296                 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
5297
5298         return 0;
5299
5300 out3:
5301         free_page((unsigned long)vmx_msr_bitmap_longmode);
5302 out2:
5303         free_page((unsigned long)vmx_msr_bitmap_legacy);
5304 out1:
5305         free_page((unsigned long)vmx_io_bitmap_b);
5306 out:
5307         free_page((unsigned long)vmx_io_bitmap_a);
5308         return r;
5309 }
5310
5311 static void __exit vmx_exit(void)
5312 {
5313         free_page((unsigned long)vmx_msr_bitmap_legacy);
5314         free_page((unsigned long)vmx_msr_bitmap_longmode);
5315         free_page((unsigned long)vmx_io_bitmap_b);
5316         free_page((unsigned long)vmx_io_bitmap_a);
5317
5318         kvm_exit();
5319 }
5320
5321 module_init(vmx_init)
5322 module_exit(vmx_exit)