pandora-kernel.git: arch/x86/kvm/vmx.c (commit 9d2ec88eeed2efcbf4bae0189462406f6efa0864)
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9  *
10  * Authors:
11  *   Avi Kivity   <avi@qumranet.com>
12  *   Yaniv Kamay  <yaniv@qumranet.com>
13  *
14  * This work is licensed under the terms of the GNU GPL, version 2.  See
15  * the COPYING file in the top-level directory.
16  *
17  */
18
19 #include "irq.h"
20 #include "mmu.h"
21 #include "cpuid.h"
22
23 #include <linux/kvm_host.h>
24 #include <linux/module.h>
25 #include <linux/kernel.h>
26 #include <linux/mm.h>
27 #include <linux/highmem.h>
28 #include <linux/sched.h>
29 #include <linux/moduleparam.h>
30 #include <linux/mod_devicetable.h>
31 #include <linux/ftrace_event.h>
32 #include <linux/slab.h>
33 #include <linux/tboot.h>
34 #include "kvm_cache_regs.h"
35 #include "x86.h"
36
37 #include <asm/io.h>
38 #include <asm/desc.h>
39 #include <asm/vmx.h>
40 #include <asm/virtext.h>
41 #include <asm/mce.h>
42 #include <asm/i387.h>
43 #include <asm/xcr.h>
44 #include <asm/perf_event.h>
45 #include <asm/kexec.h>
46
47 #include "trace.h"
48
49 #define __ex(x) __kvm_handle_fault_on_reboot(x)
50 #define __ex_clear(x, reg) \
51         ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
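/*
 * Roughly: these wrappers (from asm/kvm_host.h) attach an exception fixup to
 * the wrapped VMX instruction, so that a fault taken because VMX has already
 * been disabled for an emergency reboot/kexec is tolerated instead of
 * oopsing; __ex_clear additionally zeroes the given output register in that
 * case (the "xor" cleanup above), so e.g. a faulting VMREAD reads back 0.
 */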
52
53 MODULE_AUTHOR("Qumranet");
54 MODULE_LICENSE("GPL");
55
56 static const struct x86_cpu_id vmx_cpu_id[] = {
57         X86_FEATURE_MATCH(X86_FEATURE_VMX),
58         {}
59 };
60 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
61
62 static bool __read_mostly enable_vpid = 1;
63 module_param_named(vpid, enable_vpid, bool, 0444);
64
65 static bool __read_mostly flexpriority_enabled = 1;
66 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
67
68 static bool __read_mostly enable_ept = 1;
69 module_param_named(ept, enable_ept, bool, S_IRUGO);
70
71 static bool __read_mostly enable_unrestricted_guest = 1;
72 module_param_named(unrestricted_guest,
73                         enable_unrestricted_guest, bool, S_IRUGO);
74
75 static bool __read_mostly enable_ept_ad_bits = 1;
76 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
77
78 static bool __read_mostly emulate_invalid_guest_state = true;
79 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
80
81 static bool __read_mostly vmm_exclusive = 1;
82 module_param(vmm_exclusive, bool, S_IRUGO);
83
84 static bool __read_mostly fasteoi = 1;
85 module_param(fasteoi, bool, S_IRUGO);
86
87 /*
88  * If nested=1, nested virtualization is supported, i.e., guests may use
89  * VMX and act as hypervisors for their own guests. If nested=0, guests may
90  * not use VMX instructions.
91  */
92 static bool __read_mostly nested = 0;
93 module_param(nested, bool, S_IRUGO);
94
95 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST                           \
96         (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
97 #define KVM_GUEST_CR0_MASK                                              \
98         (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
99 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST                         \
100         (X86_CR0_WP | X86_CR0_NE)
101 #define KVM_VM_CR0_ALWAYS_ON                                            \
102         (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
103 #define KVM_CR4_GUEST_OWNED_BITS                                      \
104         (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
105          | X86_CR4_OSXMMEXCPT)
106
107 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
108 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
109
110 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
111
112 /*
113  * These two parameters configure the controls for Pause-Loop Exiting:
114  * ple_gap:    upper bound on the amount of time between two successive
115  *             executions of PAUSE in a loop. Also indicates whether PLE is
116  *             enabled. According to tests, this time is usually below 128 cycles.
117  * ple_window: upper bound on the amount of time a guest is allowed to execute
118  *             in a PAUSE loop. Tests indicate that most spinlocks are held for
119  *             less than 2^12 cycles.
120  * Time is measured on a counter that runs at the same rate as the TSC; see
121  * SDM volume 3B, sections 21.6.13 and 22.1.3.
122  */
123 #define KVM_VMX_DEFAULT_PLE_GAP    128
124 #define KVM_VMX_DEFAULT_PLE_WINDOW 4096
125 static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
126 module_param(ple_gap, int, S_IRUGO);
127
128 static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
129 module_param(ple_window, int, S_IRUGO);
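/*
 * Usage sketch (not from this file): PLE can effectively be disabled at
 * module load time, e.g. with something like "modprobe kvm-intel ple_gap=0",
 * since a zero gap is treated as "PLE not enabled" (see the comment above).
 */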
130
131 extern const ulong vmx_return;
132
133 #define NR_AUTOLOAD_MSRS 8
134 #define VMCS02_POOL_SIZE 1
135
136 struct vmcs {
137         u32 revision_id;
138         u32 abort;
139         char data[0];
140 };
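/*
 * data[] stands for the rest of the VMCS region (vmcs_config.size bytes, at
 * most 4 KB, as reported by the IA32_VMX_BASIC MSR). Its internal layout is
 * implementation-specific and is only ever accessed through VMREAD/VMWRITE,
 * never directly.
 */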
141
142 /*
143  * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
144  * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
145  * loaded on this CPU (so we can clear them if the CPU goes down).
146  */
147 struct loaded_vmcs {
148         struct vmcs *vmcs;
149         int cpu;
150         int launched;
151         struct list_head loaded_vmcss_on_cpu_link;
152 };
153
154 struct shared_msr_entry {
155         unsigned index;
156         u64 data;
157         u64 mask;
158 };
159
160 /*
161  * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
162  * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
163  * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
164  * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
165  * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
166  * More than one of these structures may exist, if L1 runs multiple L2 guests.
167  * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
168  * underlying hardware which will be used to run L2.
169  * This structure is packed to ensure that its layout is identical across
170  * machines (necessary for live migration).
171  * If there are changes in this struct, VMCS12_REVISION must be changed.
172  */
173 typedef u64 natural_width;
174 struct __packed vmcs12 {
175         /* According to the Intel spec, a VMCS region must start with the
176          * following two fields. Then follow implementation-specific data.
177          */
178         u32 revision_id;
179         u32 abort;
180
181         u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
182         u32 padding[7]; /* room for future expansion */
183
184         u64 io_bitmap_a;
185         u64 io_bitmap_b;
186         u64 msr_bitmap;
187         u64 vm_exit_msr_store_addr;
188         u64 vm_exit_msr_load_addr;
189         u64 vm_entry_msr_load_addr;
190         u64 tsc_offset;
191         u64 virtual_apic_page_addr;
192         u64 apic_access_addr;
193         u64 ept_pointer;
194         u64 guest_physical_address;
195         u64 vmcs_link_pointer;
196         u64 guest_ia32_debugctl;
197         u64 guest_ia32_pat;
198         u64 guest_ia32_efer;
199         u64 guest_ia32_perf_global_ctrl;
200         u64 guest_pdptr0;
201         u64 guest_pdptr1;
202         u64 guest_pdptr2;
203         u64 guest_pdptr3;
204         u64 host_ia32_pat;
205         u64 host_ia32_efer;
206         u64 host_ia32_perf_global_ctrl;
207         u64 padding64[8]; /* room for future expansion */
208         /*
209          * To allow migration of L1 (complete with its L2 guests) between
210          * machines of different natural widths (32 or 64 bit), we cannot have
211          * unsigned long fields with no explicit size. We use u64 (aliased
212          * natural_width) instead. Luckily, x86 is little-endian.
213          */
214         natural_width cr0_guest_host_mask;
215         natural_width cr4_guest_host_mask;
216         natural_width cr0_read_shadow;
217         natural_width cr4_read_shadow;
218         natural_width cr3_target_value0;
219         natural_width cr3_target_value1;
220         natural_width cr3_target_value2;
221         natural_width cr3_target_value3;
222         natural_width exit_qualification;
223         natural_width guest_linear_address;
224         natural_width guest_cr0;
225         natural_width guest_cr3;
226         natural_width guest_cr4;
227         natural_width guest_es_base;
228         natural_width guest_cs_base;
229         natural_width guest_ss_base;
230         natural_width guest_ds_base;
231         natural_width guest_fs_base;
232         natural_width guest_gs_base;
233         natural_width guest_ldtr_base;
234         natural_width guest_tr_base;
235         natural_width guest_gdtr_base;
236         natural_width guest_idtr_base;
237         natural_width guest_dr7;
238         natural_width guest_rsp;
239         natural_width guest_rip;
240         natural_width guest_rflags;
241         natural_width guest_pending_dbg_exceptions;
242         natural_width guest_sysenter_esp;
243         natural_width guest_sysenter_eip;
244         natural_width host_cr0;
245         natural_width host_cr3;
246         natural_width host_cr4;
247         natural_width host_fs_base;
248         natural_width host_gs_base;
249         natural_width host_tr_base;
250         natural_width host_gdtr_base;
251         natural_width host_idtr_base;
252         natural_width host_ia32_sysenter_esp;
253         natural_width host_ia32_sysenter_eip;
254         natural_width host_rsp;
255         natural_width host_rip;
256         natural_width paddingl[8]; /* room for future expansion */
257         u32 pin_based_vm_exec_control;
258         u32 cpu_based_vm_exec_control;
259         u32 exception_bitmap;
260         u32 page_fault_error_code_mask;
261         u32 page_fault_error_code_match;
262         u32 cr3_target_count;
263         u32 vm_exit_controls;
264         u32 vm_exit_msr_store_count;
265         u32 vm_exit_msr_load_count;
266         u32 vm_entry_controls;
267         u32 vm_entry_msr_load_count;
268         u32 vm_entry_intr_info_field;
269         u32 vm_entry_exception_error_code;
270         u32 vm_entry_instruction_len;
271         u32 tpr_threshold;
272         u32 secondary_vm_exec_control;
273         u32 vm_instruction_error;
274         u32 vm_exit_reason;
275         u32 vm_exit_intr_info;
276         u32 vm_exit_intr_error_code;
277         u32 idt_vectoring_info_field;
278         u32 idt_vectoring_error_code;
279         u32 vm_exit_instruction_len;
280         u32 vmx_instruction_info;
281         u32 guest_es_limit;
282         u32 guest_cs_limit;
283         u32 guest_ss_limit;
284         u32 guest_ds_limit;
285         u32 guest_fs_limit;
286         u32 guest_gs_limit;
287         u32 guest_ldtr_limit;
288         u32 guest_tr_limit;
289         u32 guest_gdtr_limit;
290         u32 guest_idtr_limit;
291         u32 guest_es_ar_bytes;
292         u32 guest_cs_ar_bytes;
293         u32 guest_ss_ar_bytes;
294         u32 guest_ds_ar_bytes;
295         u32 guest_fs_ar_bytes;
296         u32 guest_gs_ar_bytes;
297         u32 guest_ldtr_ar_bytes;
298         u32 guest_tr_ar_bytes;
299         u32 guest_interruptibility_info;
300         u32 guest_activity_state;
301         u32 guest_sysenter_cs;
302         u32 host_ia32_sysenter_cs;
303         u32 padding32[8]; /* room for future expansion */
304         u16 virtual_processor_id;
305         u16 guest_es_selector;
306         u16 guest_cs_selector;
307         u16 guest_ss_selector;
308         u16 guest_ds_selector;
309         u16 guest_fs_selector;
310         u16 guest_gs_selector;
311         u16 guest_ldtr_selector;
312         u16 guest_tr_selector;
313         u16 host_es_selector;
314         u16 host_cs_selector;
315         u16 host_ss_selector;
316         u16 host_ds_selector;
317         u16 host_fs_selector;
318         u16 host_gs_selector;
319         u16 host_tr_selector;
320 };
321
322 /*
323  * VMCS12_REVISION is an arbitrary id that should be changed if the content or
324  * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
325  * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
326  */
327 #define VMCS12_REVISION 0x11e57ed0
328
329 /*
330  * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
331  * and any VMCS region. Although only sizeof(struct vmcs12) bytes are used by
332  * the current implementation, 4K is reserved to avoid future complications.
333  */
334 #define VMCS12_SIZE 0x1000
335
336 /* Used to remember the last vmcs02 used for some recently used vmcs12s */
337 struct vmcs02_list {
338         struct list_head list;
339         gpa_t vmptr;
340         struct loaded_vmcs vmcs02;
341 };
342
343 /*
344  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
345  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
346  */
347 struct nested_vmx {
348         /* Has the level1 guest done vmxon? */
349         bool vmxon;
350
351         /* The guest-physical address of the current VMCS L1 keeps for L2 */
352         gpa_t current_vmptr;
353         /* The host-usable pointer to the above */
354         struct page *current_vmcs12_page;
355         struct vmcs12 *current_vmcs12;
356
357         /* vmcs02_list cache of VMCSs recently used to run L2 guests */
358         struct list_head vmcs02_pool;
359         int vmcs02_num;
360         u64 vmcs01_tsc_offset;
361         /* L2 must run next, and mustn't decide to exit to L1. */
362         bool nested_run_pending;
363         /*
364          * Guest pages referred to in vmcs02 with host-physical pointers, so
365          * we must keep them pinned while L2 runs.
366          */
367         struct page *apic_access_page;
368 };
369
370 struct vcpu_vmx {
371         struct kvm_vcpu       vcpu;
372         unsigned long         host_rsp;
373         u8                    fail;
374         u8                    cpl;
375         bool                  nmi_known_unmasked;
376         u32                   exit_intr_info;
377         u32                   idt_vectoring_info;
378         ulong                 rflags;
379         struct shared_msr_entry *guest_msrs;
380         int                   nmsrs;
381         int                   save_nmsrs;
382 #ifdef CONFIG_X86_64
383         u64                   msr_host_kernel_gs_base;
384         u64                   msr_guest_kernel_gs_base;
385 #endif
386         /*
387          * loaded_vmcs points to the VMCS currently used in this vcpu. For a
388          * non-nested (L1) guest, it always points to vmcs01. For a nested
389          * guest (L2), it points to a different VMCS.
390          */
391         struct loaded_vmcs    vmcs01;
392         struct loaded_vmcs   *loaded_vmcs;
393         bool                  __launched; /* temporary, used in vmx_vcpu_run */
394         struct msr_autoload {
395                 unsigned nr;
396                 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
397                 struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
398         } msr_autoload;
399         struct {
400                 int           loaded;
401                 u16           fs_sel, gs_sel, ldt_sel;
402 #ifdef CONFIG_X86_64
403                 u16           ds_sel, es_sel;
404 #endif
405                 int           gs_ldt_reload_needed;
406                 int           fs_reload_needed;
407         } host_state;
408         struct {
409                 int vm86_active;
410                 ulong save_rflags;
411                 struct kvm_segment segs[8];
412         } rmode;
413         struct {
414                 u32 bitmask; /* 4 bits per segment (1 bit per field) */
415                 struct kvm_save_segment {
416                         u16 selector;
417                         unsigned long base;
418                         u32 limit;
419                         u32 ar;
420                 } seg[8];
421         } segment_cache;
422         int vpid;
423         bool emulation_required;
424
425         /* Support for vnmi-less CPUs */
426         int soft_vnmi_blocked;
427         ktime_t entry_time;
428         s64 vnmi_blocked_time;
429         u32 exit_reason;
430
431         bool rdtscp_enabled;
432
433         /* Support for a guest hypervisor (nested VMX) */
434         struct nested_vmx nested;
435 };
436
437 enum segment_cache_field {
438         SEG_FIELD_SEL = 0,
439         SEG_FIELD_BASE = 1,
440         SEG_FIELD_LIMIT = 2,
441         SEG_FIELD_AR = 3,
442
443         SEG_FIELD_NR = 4
444 };
445
446 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
447 {
448         return container_of(vcpu, struct vcpu_vmx, vcpu);
449 }
450
451 #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
452 #define FIELD(number, name)     [number] = VMCS12_OFFSET(name)
453 #define FIELD64(number, name)   [number] = VMCS12_OFFSET(name), \
454                                 [number##_HIGH] = VMCS12_OFFSET(name)+4
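/*
 * For example, FIELD64(IO_BITMAP_A, io_bitmap_a) expands to the two
 * initializers
 *         [IO_BITMAP_A]      = VMCS12_OFFSET(io_bitmap_a),
 *         [IO_BITMAP_A_HIGH] = VMCS12_OFFSET(io_bitmap_a) + 4,
 * so the "_HIGH" encoding of a 64-bit field maps to its upper 32 bits.
 */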
455
456 static const unsigned short vmcs_field_to_offset_table[] = {
457         FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
458         FIELD(GUEST_ES_SELECTOR, guest_es_selector),
459         FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
460         FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
461         FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
462         FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
463         FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
464         FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
465         FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
466         FIELD(HOST_ES_SELECTOR, host_es_selector),
467         FIELD(HOST_CS_SELECTOR, host_cs_selector),
468         FIELD(HOST_SS_SELECTOR, host_ss_selector),
469         FIELD(HOST_DS_SELECTOR, host_ds_selector),
470         FIELD(HOST_FS_SELECTOR, host_fs_selector),
471         FIELD(HOST_GS_SELECTOR, host_gs_selector),
472         FIELD(HOST_TR_SELECTOR, host_tr_selector),
473         FIELD64(IO_BITMAP_A, io_bitmap_a),
474         FIELD64(IO_BITMAP_B, io_bitmap_b),
475         FIELD64(MSR_BITMAP, msr_bitmap),
476         FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
477         FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
478         FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
479         FIELD64(TSC_OFFSET, tsc_offset),
480         FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
481         FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
482         FIELD64(EPT_POINTER, ept_pointer),
483         FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
484         FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
485         FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
486         FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
487         FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
488         FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
489         FIELD64(GUEST_PDPTR0, guest_pdptr0),
490         FIELD64(GUEST_PDPTR1, guest_pdptr1),
491         FIELD64(GUEST_PDPTR2, guest_pdptr2),
492         FIELD64(GUEST_PDPTR3, guest_pdptr3),
493         FIELD64(HOST_IA32_PAT, host_ia32_pat),
494         FIELD64(HOST_IA32_EFER, host_ia32_efer),
495         FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
496         FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
497         FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
498         FIELD(EXCEPTION_BITMAP, exception_bitmap),
499         FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
500         FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
501         FIELD(CR3_TARGET_COUNT, cr3_target_count),
502         FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
503         FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
504         FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
505         FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
506         FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
507         FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
508         FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
509         FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
510         FIELD(TPR_THRESHOLD, tpr_threshold),
511         FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
512         FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
513         FIELD(VM_EXIT_REASON, vm_exit_reason),
514         FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
515         FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
516         FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
517         FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
518         FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
519         FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
520         FIELD(GUEST_ES_LIMIT, guest_es_limit),
521         FIELD(GUEST_CS_LIMIT, guest_cs_limit),
522         FIELD(GUEST_SS_LIMIT, guest_ss_limit),
523         FIELD(GUEST_DS_LIMIT, guest_ds_limit),
524         FIELD(GUEST_FS_LIMIT, guest_fs_limit),
525         FIELD(GUEST_GS_LIMIT, guest_gs_limit),
526         FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
527         FIELD(GUEST_TR_LIMIT, guest_tr_limit),
528         FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
529         FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
530         FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
531         FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
532         FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
533         FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
534         FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
535         FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
536         FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
537         FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
538         FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
539         FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
540         FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
541         FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
542         FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
543         FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
544         FIELD(CR0_READ_SHADOW, cr0_read_shadow),
545         FIELD(CR4_READ_SHADOW, cr4_read_shadow),
546         FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
547         FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
548         FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
549         FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
550         FIELD(EXIT_QUALIFICATION, exit_qualification),
551         FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
552         FIELD(GUEST_CR0, guest_cr0),
553         FIELD(GUEST_CR3, guest_cr3),
554         FIELD(GUEST_CR4, guest_cr4),
555         FIELD(GUEST_ES_BASE, guest_es_base),
556         FIELD(GUEST_CS_BASE, guest_cs_base),
557         FIELD(GUEST_SS_BASE, guest_ss_base),
558         FIELD(GUEST_DS_BASE, guest_ds_base),
559         FIELD(GUEST_FS_BASE, guest_fs_base),
560         FIELD(GUEST_GS_BASE, guest_gs_base),
561         FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
562         FIELD(GUEST_TR_BASE, guest_tr_base),
563         FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
564         FIELD(GUEST_IDTR_BASE, guest_idtr_base),
565         FIELD(GUEST_DR7, guest_dr7),
566         FIELD(GUEST_RSP, guest_rsp),
567         FIELD(GUEST_RIP, guest_rip),
568         FIELD(GUEST_RFLAGS, guest_rflags),
569         FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
570         FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
571         FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
572         FIELD(HOST_CR0, host_cr0),
573         FIELD(HOST_CR3, host_cr3),
574         FIELD(HOST_CR4, host_cr4),
575         FIELD(HOST_FS_BASE, host_fs_base),
576         FIELD(HOST_GS_BASE, host_gs_base),
577         FIELD(HOST_TR_BASE, host_tr_base),
578         FIELD(HOST_GDTR_BASE, host_gdtr_base),
579         FIELD(HOST_IDTR_BASE, host_idtr_base),
580         FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
581         FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
582         FIELD(HOST_RSP, host_rsp),
583         FIELD(HOST_RIP, host_rip),
584 };
585 static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
586
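/*
 * Map a VMCS field encoding (as used by the guest's VMREAD/VMWRITE) to the
 * byte offset of the corresponding struct vmcs12 member, or -1 if the field
 * is not in the table above (i.e. not supported by our nested VMX emulation).
 */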
587 static inline short vmcs_field_to_offset(unsigned long field)
588 {
589         if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
590                 return -1;
591         return vmcs_field_to_offset_table[field];
592 }
593
594 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
595 {
596         return to_vmx(vcpu)->nested.current_vmcs12;
597 }
598
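/*
 * Translate an L1 guest-physical address into a host page with a reference
 * held (so it stays pinned), or NULL if the address does not map to a valid
 * guest frame. Callers release it with nested_release_page{,_clean}().
 */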
599 static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
600 {
601         struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
602         if (is_error_page(page))
603                 return NULL;
604
605         return page;
606 }
607
608 static void nested_release_page(struct page *page)
609 {
610         kvm_release_page_dirty(page);
611 }
612
613 static void nested_release_page_clean(struct page *page)
614 {
615         kvm_release_page_clean(page);
616 }
617
618 static u64 construct_eptp(unsigned long root_hpa);
619 static void kvm_cpu_vmxon(u64 addr);
620 static void kvm_cpu_vmxoff(void);
621 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
622 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
623 static void vmx_set_segment(struct kvm_vcpu *vcpu,
624                             struct kvm_segment *var, int seg);
625 static void vmx_get_segment(struct kvm_vcpu *vcpu,
626                             struct kvm_segment *var, int seg);
627 static bool guest_state_valid(struct kvm_vcpu *vcpu);
628 static u32 vmx_segment_access_rights(struct kvm_segment *var);
629
630 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
631 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
632 /*
633  * We maintain a per-CPU linked list of VMCSs loaded on that CPU. This is needed
634  * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
635  */
636 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
637 static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
638
639 static unsigned long *vmx_io_bitmap_a;
640 static unsigned long *vmx_io_bitmap_b;
641 static unsigned long *vmx_msr_bitmap_legacy;
642 static unsigned long *vmx_msr_bitmap_longmode;
643
644 static bool cpu_has_load_ia32_efer;
645 static bool cpu_has_load_perf_global_ctrl;
646
647 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
648 static DEFINE_SPINLOCK(vmx_vpid_lock);
649
650 static struct vmcs_config {
651         int size;
652         int order;
653         u32 revision_id;
654         u32 pin_based_exec_ctrl;
655         u32 cpu_based_exec_ctrl;
656         u32 cpu_based_2nd_exec_ctrl;
657         u32 vmexit_ctrl;
658         u32 vmentry_ctrl;
659 } vmcs_config;
660
661 static struct vmx_capability {
662         u32 ept;
663         u32 vpid;
664 } vmx_capability;
665
666 #define VMX_SEGMENT_FIELD(seg)                                  \
667         [VCPU_SREG_##seg] = {                                   \
668                 .selector = GUEST_##seg##_SELECTOR,             \
669                 .base = GUEST_##seg##_BASE,                     \
670                 .limit = GUEST_##seg##_LIMIT,                   \
671                 .ar_bytes = GUEST_##seg##_AR_BYTES,             \
672         }
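/*
 * For example, VMX_SEGMENT_FIELD(CS) expands to
 *         [VCPU_SREG_CS] = {
 *                 .selector = GUEST_CS_SELECTOR,
 *                 .base     = GUEST_CS_BASE,
 *                 .limit    = GUEST_CS_LIMIT,
 *                 .ar_bytes = GUEST_CS_AR_BYTES,
 *         },
 * i.e. the VMCS field encodings needed to read or write one guest segment.
 */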
673
674 static const struct kvm_vmx_segment_field {
675         unsigned selector;
676         unsigned base;
677         unsigned limit;
678         unsigned ar_bytes;
679 } kvm_vmx_segment_fields[] = {
680         VMX_SEGMENT_FIELD(CS),
681         VMX_SEGMENT_FIELD(DS),
682         VMX_SEGMENT_FIELD(ES),
683         VMX_SEGMENT_FIELD(FS),
684         VMX_SEGMENT_FIELD(GS),
685         VMX_SEGMENT_FIELD(SS),
686         VMX_SEGMENT_FIELD(TR),
687         VMX_SEGMENT_FIELD(LDTR),
688 };
689
690 static u64 host_efer;
691
692 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
693
694 /*
695  * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
696  * away by decrementing the array size.
697  */
698 static const u32 vmx_msr_index[] = {
699 #ifdef CONFIG_X86_64
700         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
701 #endif
702         MSR_EFER, MSR_TSC_AUX, MSR_STAR,
703 };
704 #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
705
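/*
 * The is_*() helpers below decode the VM-exit interruption-information
 * format: bits 7:0 hold the vector, bits 10:8 the event type (external
 * interrupt, NMI, hardware exception, ...) and bit 31 the "valid" flag; see
 * the INTR_INFO_* and INTR_TYPE_* definitions in asm/vmx.h.
 */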
706 static inline bool is_page_fault(u32 intr_info)
707 {
708         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
709                              INTR_INFO_VALID_MASK)) ==
710                 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
711 }
712
713 static inline bool is_no_device(u32 intr_info)
714 {
715         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
716                              INTR_INFO_VALID_MASK)) ==
717                 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
718 }
719
720 static inline bool is_invalid_opcode(u32 intr_info)
721 {
722         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
723                              INTR_INFO_VALID_MASK)) ==
724                 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
725 }
726
727 static inline bool is_external_interrupt(u32 intr_info)
728 {
729         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
730                 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
731 }
732
733 static inline bool is_machine_check(u32 intr_info)
734 {
735         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
736                              INTR_INFO_VALID_MASK)) ==
737                 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
738 }
739
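/*
 * The cpu_has_vmx_*() helpers below report which optional VMX execution
 * controls and EPT/VPID capabilities the host CPU supports. They only
 * consult vmcs_config and vmx_capability, which are filled in once from the
 * IA32_VMX_* capability MSRs at module load time (setup_vmcs_config()).
 */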
740 static inline bool cpu_has_vmx_msr_bitmap(void)
741 {
742         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
743 }
744
745 static inline bool cpu_has_vmx_tpr_shadow(void)
746 {
747         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
748 }
749
750 static inline bool vm_need_tpr_shadow(struct kvm *kvm)
751 {
752         return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
753 }
754
755 static inline bool cpu_has_secondary_exec_ctrls(void)
756 {
757         return vmcs_config.cpu_based_exec_ctrl &
758                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
759 }
760
761 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
762 {
763         return vmcs_config.cpu_based_2nd_exec_ctrl &
764                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
765 }
766
767 static inline bool cpu_has_vmx_flexpriority(void)
768 {
769         return cpu_has_vmx_tpr_shadow() &&
770                 cpu_has_vmx_virtualize_apic_accesses();
771 }
772
773 static inline bool cpu_has_vmx_ept_execute_only(void)
774 {
775         return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
776 }
777
778 static inline bool cpu_has_vmx_eptp_uncacheable(void)
779 {
780         return vmx_capability.ept & VMX_EPTP_UC_BIT;
781 }
782
783 static inline bool cpu_has_vmx_eptp_writeback(void)
784 {
785         return vmx_capability.ept & VMX_EPTP_WB_BIT;
786 }
787
788 static inline bool cpu_has_vmx_ept_2m_page(void)
789 {
790         return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
791 }
792
793 static inline bool cpu_has_vmx_ept_1g_page(void)
794 {
795         return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
796 }
797
798 static inline bool cpu_has_vmx_ept_4levels(void)
799 {
800         return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
801 }
802
803 static inline bool cpu_has_vmx_ept_ad_bits(void)
804 {
805         return vmx_capability.ept & VMX_EPT_AD_BIT;
806 }
807
808 static inline bool cpu_has_vmx_invept_context(void)
809 {
810         return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
811 }
812
813 static inline bool cpu_has_vmx_invept_global(void)
814 {
815         return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
816 }
817
818 static inline bool cpu_has_vmx_invvpid_single(void)
819 {
820         return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
821 }
822
823 static inline bool cpu_has_vmx_invvpid_global(void)
824 {
825         return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
826 }
827
828 static inline bool cpu_has_vmx_ept(void)
829 {
830         return vmcs_config.cpu_based_2nd_exec_ctrl &
831                 SECONDARY_EXEC_ENABLE_EPT;
832 }
833
834 static inline bool cpu_has_vmx_unrestricted_guest(void)
835 {
836         return vmcs_config.cpu_based_2nd_exec_ctrl &
837                 SECONDARY_EXEC_UNRESTRICTED_GUEST;
838 }
839
840 static inline bool cpu_has_vmx_ple(void)
841 {
842         return vmcs_config.cpu_based_2nd_exec_ctrl &
843                 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
844 }
845
846 static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
847 {
848         return flexpriority_enabled && irqchip_in_kernel(kvm);
849 }
850
851 static inline bool cpu_has_vmx_vpid(void)
852 {
853         return vmcs_config.cpu_based_2nd_exec_ctrl &
854                 SECONDARY_EXEC_ENABLE_VPID;
855 }
856
857 static inline bool cpu_has_vmx_rdtscp(void)
858 {
859         return vmcs_config.cpu_based_2nd_exec_ctrl &
860                 SECONDARY_EXEC_RDTSCP;
861 }
862
863 static inline bool cpu_has_vmx_invpcid(void)
864 {
865         return vmcs_config.cpu_based_2nd_exec_ctrl &
866                 SECONDARY_EXEC_ENABLE_INVPCID;
867 }
868
869 static inline bool cpu_has_virtual_nmis(void)
870 {
871         return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
872 }
873
874 static inline bool cpu_has_vmx_wbinvd_exit(void)
875 {
876         return vmcs_config.cpu_based_2nd_exec_ctrl &
877                 SECONDARY_EXEC_WBINVD_EXITING;
878 }
879
880 static inline bool report_flexpriority(void)
881 {
882         return flexpriority_enabled;
883 }
884
885 static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
886 {
887         return vmcs12->cpu_based_vm_exec_control & bit;
888 }
889
890 static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
891 {
892         return (vmcs12->cpu_based_vm_exec_control &
893                         CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
894                 (vmcs12->secondary_vm_exec_control & bit);
895 }
896
897 static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
898         struct kvm_vcpu *vcpu)
899 {
900         return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
901 }
902
903 static inline bool is_exception(u32 intr_info)
904 {
905         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
906                 == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
907 }
908
909 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
910 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
911                         struct vmcs12 *vmcs12,
912                         u32 reason, unsigned long qualification);
913
914 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
915 {
916         int i;
917
918         for (i = 0; i < vmx->nmsrs; ++i)
919                 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
920                         return i;
921         return -1;
922 }
923
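/*
 * Execute INVVPID: "operand" is the 128-bit INVVPID descriptor (VPID in bits
 * 15:0, reserved bits, then the linear address) and "ext" selects the
 * invalidation type. If the instruction fails (CF or ZF set), the
 * "ja 1f; ud2" sequence turns that into a BUG instead of continuing with
 * possibly stale TLB entries.
 */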
924 static inline void __invvpid(int ext, u16 vpid, gva_t gva)
925 {
926         struct {
927                 u64 vpid : 16;
928                 u64 rsvd : 48;
929                 u64 gva;
930         } operand = { vpid, 0, gva };
931
932         asm volatile (__ex(ASM_VMX_INVVPID)
933                       /* CF==1 or ZF==1 --> rc = -1 */
934                       "; ja 1f ; ud2 ; 1:"
935                       : : "a"(&operand), "c"(ext) : "cc", "memory");
936 }
937
938 static inline void __invept(int ext, u64 eptp, gpa_t gpa)
939 {
940         struct {
941                 u64 eptp, gpa;
942         } operand = {eptp, gpa};
943
944         asm volatile (__ex(ASM_VMX_INVEPT)
945                         /* CF==1 or ZF==1 --> rc = -1 */
946                         "; ja 1f ; ud2 ; 1:\n"
947                         : : "a" (&operand), "c" (ext) : "cc", "memory");
948 }
949
950 static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
951 {
952         int i;
953
954         i = __find_msr_index(vmx, msr);
955         if (i >= 0)
956                 return &vmx->guest_msrs[i];
957         return NULL;
958 }
959
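/*
 * VMCLEAR and VMPTRLD report failure through the flags register (CF or ZF
 * set); the "setna %0" in the asm below latches that condition into "error"
 * so the rare failure can be logged instead of silently ignored.
 */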
960 static void vmcs_clear(struct vmcs *vmcs)
961 {
962         u64 phys_addr = __pa(vmcs);
963         u8 error;
964
965         asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
966                       : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
967                       : "cc", "memory");
968         if (error)
969                 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
970                        vmcs, phys_addr);
971 }
972
973 static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
974 {
975         vmcs_clear(loaded_vmcs->vmcs);
976         loaded_vmcs->cpu = -1;
977         loaded_vmcs->launched = 0;
978 }
979
980 static void vmcs_load(struct vmcs *vmcs)
981 {
982         u64 phys_addr = __pa(vmcs);
983         u8 error;
984
985         asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
986                         : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
987                         : "cc", "memory");
988         if (error)
989                 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
990                        vmcs, phys_addr);
991 }
992
993 #ifdef CONFIG_KEXEC
994 /*
995  * This bitmap indicates, for each cpu, whether the crash-time vmclear
996  * operation is enabled on that cpu. All cpus are disabled by
997  * default.
998  */
999 static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
1000
1001 static inline void crash_enable_local_vmclear(int cpu)
1002 {
1003         cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
1004 }
1005
1006 static inline void crash_disable_local_vmclear(int cpu)
1007 {
1008         cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
1009 }
1010
1011 static inline int crash_local_vmclear_enabled(int cpu)
1012 {
1013         return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
1014 }
1015
1016 static void crash_vmclear_local_loaded_vmcss(void)
1017 {
1018         int cpu = raw_smp_processor_id();
1019         struct loaded_vmcs *v;
1020
1021         if (!crash_local_vmclear_enabled(cpu))
1022                 return;
1023
1024         list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
1025                             loaded_vmcss_on_cpu_link)
1026                 vmcs_clear(v->vmcs);
1027 }
1028 #else
1029 static inline void crash_enable_local_vmclear(int cpu) { }
1030 static inline void crash_disable_local_vmclear(int cpu) { }
1031 #endif /* CONFIG_KEXEC */
1032
1033 static void __loaded_vmcs_clear(void *arg)
1034 {
1035         struct loaded_vmcs *loaded_vmcs = arg;
1036         int cpu = raw_smp_processor_id();
1037
1038         if (loaded_vmcs->cpu != cpu)
1039                 return; /* vcpu migration can race with cpu offline */
1040         if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
1041                 per_cpu(current_vmcs, cpu) = NULL;
1042         crash_disable_local_vmclear(cpu);
1043         list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
1044
1045         /*
1046          * We should ensure that updating loaded_vmcs->loaded_vmcss_on_cpu_link
1047          * happens before setting loaded_vmcs->cpu to -1, which is done in
1048          * loaded_vmcs_init. Otherwise, another cpu could see cpu == -1 first
1049          * and then add the vmcs to its percpu list before it is deleted here.
1050          */
1051         smp_wmb();
1052
1053         loaded_vmcs_init(loaded_vmcs);
1054         crash_enable_local_vmclear(cpu);
1055 }
1056
1057 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
1058 {
1059         int cpu = loaded_vmcs->cpu;
1060
1061         if (cpu != -1)
1062                 smp_call_function_single(cpu,
1063                          __loaded_vmcs_clear, loaded_vmcs, 1);
1064 }
1065
1066 static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
1067 {
1068         if (vmx->vpid == 0)
1069                 return;
1070
1071         if (cpu_has_vmx_invvpid_single())
1072                 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
1073 }
1074
1075 static inline void vpid_sync_vcpu_global(void)
1076 {
1077         if (cpu_has_vmx_invvpid_global())
1078                 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
1079 }
1080
1081 static inline void vpid_sync_context(struct vcpu_vmx *vmx)
1082 {
1083         if (cpu_has_vmx_invvpid_single())
1084                 vpid_sync_vcpu_single(vmx);
1085         else
1086                 vpid_sync_vcpu_global();
1087 }
1088
1089 static inline void ept_sync_global(void)
1090 {
1091         if (cpu_has_vmx_invept_global())
1092                 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
1093 }
1094
1095 static inline void ept_sync_context(u64 eptp)
1096 {
1097         if (enable_ept) {
1098                 if (cpu_has_vmx_invept_context())
1099                         __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
1100                 else
1101                         ept_sync_global();
1102         }
1103 }
1104
1105 static __always_inline unsigned long vmcs_readl(unsigned long field)
1106 {
1107         unsigned long value;
1108
1109         asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
1110                       : "=a"(value) : "d"(field) : "cc");
1111         return value;
1112 }
1113
1114 static __always_inline u16 vmcs_read16(unsigned long field)
1115 {
1116         return vmcs_readl(field);
1117 }
1118
1119 static __always_inline u32 vmcs_read32(unsigned long field)
1120 {
1121         return vmcs_readl(field);
1122 }
1123
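/*
 * On 32-bit hosts a 64-bit VMCS field is accessed as two 32-bit halves: the
 * high half of a 64-bit field has the encoding of the field plus 1, hence
 * the vmcs_readl(field+1) below and the matching split in vmcs_write64().
 */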
1124 static __always_inline u64 vmcs_read64(unsigned long field)
1125 {
1126 #ifdef CONFIG_X86_64
1127         return vmcs_readl(field);
1128 #else
1129         return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
1130 #endif
1131 }
1132
1133 static noinline void vmwrite_error(unsigned long field, unsigned long value)
1134 {
1135         printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
1136                field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
1137         dump_stack();
1138 }
1139
1140 static void vmcs_writel(unsigned long field, unsigned long value)
1141 {
1142         u8 error;
1143
1144         asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
1145                        : "=q"(error) : "a"(value), "d"(field) : "cc");
1146         if (unlikely(error))
1147                 vmwrite_error(field, value);
1148 }
1149
1150 static void vmcs_write16(unsigned long field, u16 value)
1151 {
1152         vmcs_writel(field, value);
1153 }
1154
1155 static void vmcs_write32(unsigned long field, u32 value)
1156 {
1157         vmcs_writel(field, value);
1158 }
1159
1160 static void vmcs_write64(unsigned long field, u64 value)
1161 {
1162         vmcs_writel(field, value);
1163 #ifndef CONFIG_X86_64
1164         asm volatile ("");
1165         vmcs_writel(field+1, value >> 32);
1166 #endif
1167 }
1168
1169 static void vmcs_clear_bits(unsigned long field, u32 mask)
1170 {
1171         vmcs_writel(field, vmcs_readl(field) & ~mask);
1172 }
1173
1174 static void vmcs_set_bits(unsigned long field, u32 mask)
1175 {
1176         vmcs_writel(field, vmcs_readl(field) | mask);
1177 }
1178
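/*
 * The segment cache below holds guest segment fields read from the VMCS.
 * Each segment has SEG_FIELD_NR bits in "bitmask"; a set bit means the
 * corresponding entry in segment_cache.seg[] is valid, so repeated VMREADs
 * of the same field can be avoided until the cache is cleared.
 */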
1179 static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
1180 {
1181         vmx->segment_cache.bitmask = 0;
1182 }
1183
1184 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
1185                                        unsigned field)
1186 {
1187         bool ret;
1188         u32 mask = 1 << (seg * SEG_FIELD_NR + field);
1189
1190         if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
1191                 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
1192                 vmx->segment_cache.bitmask = 0;
1193         }
1194         ret = vmx->segment_cache.bitmask & mask;
1195         vmx->segment_cache.bitmask |= mask;
1196         return ret;
1197 }
1198
1199 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
1200 {
1201         u16 *p = &vmx->segment_cache.seg[seg].selector;
1202
1203         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
1204                 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
1205         return *p;
1206 }
1207
1208 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
1209 {
1210         ulong *p = &vmx->segment_cache.seg[seg].base;
1211
1212         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
1213                 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
1214         return *p;
1215 }
1216
1217 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
1218 {
1219         u32 *p = &vmx->segment_cache.seg[seg].limit;
1220
1221         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
1222                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
1223         return *p;
1224 }
1225
1226 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
1227 {
1228         u32 *p = &vmx->segment_cache.seg[seg].ar;
1229
1230         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
1231                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
1232         return *p;
1233 }
1234
1235 static void update_exception_bitmap(struct kvm_vcpu *vcpu)
1236 {
1237         u32 eb;
1238
1239         eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
1240              (1u << NM_VECTOR) | (1u << DB_VECTOR);
1241         if ((vcpu->guest_debug &
1242              (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
1243             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
1244                 eb |= 1u << BP_VECTOR;
1245         if (to_vmx(vcpu)->rmode.vm86_active)
1246                 eb = ~0;
1247         if (enable_ept)
1248                 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
1249         if (vcpu->fpu_active)
1250                 eb &= ~(1u << NM_VECTOR);
1251
1252         /* When we are running a nested L2 guest and L1 specified for it a
1253          * certain exception bitmap, we must trap the same exceptions and pass
1254          * them to L1. When running L2, we will only handle the exceptions
1255          * specified above if L1 did not want them.
1256          */
1257         if (is_guest_mode(vcpu))
1258                 eb |= get_vmcs12(vcpu)->exception_bitmap;
1259
1260         vmcs_write32(EXCEPTION_BITMAP, eb);
1261 }
1262
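/*
 * The helpers below manage the VMCS MSR autoload lists (msr_autoload.guest
 * and msr_autoload.host, limited to NR_AUTOLOAD_MSRS entries): listed MSRs
 * are loaded with the guest value on VM-entry and restored to the host value
 * on VM-exit. MSRs with dedicated VM-entry/VM-exit controls (EFER,
 * IA32_PERF_GLOBAL_CTRL) are special-cased to use those controls when the
 * CPU supports them.
 */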
1263 static void clear_atomic_switch_msr_special(unsigned long entry,
1264                 unsigned long exit)
1265 {
1266         vmcs_clear_bits(VM_ENTRY_CONTROLS, entry);
1267         vmcs_clear_bits(VM_EXIT_CONTROLS, exit);
1268 }
1269
1270 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
1271 {
1272         unsigned i;
1273         struct msr_autoload *m = &vmx->msr_autoload;
1274
1275         switch (msr) {
1276         case MSR_EFER:
1277                 if (cpu_has_load_ia32_efer) {
1278                         clear_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER,
1279                                         VM_EXIT_LOAD_IA32_EFER);
1280                         return;
1281                 }
1282                 break;
1283         case MSR_CORE_PERF_GLOBAL_CTRL:
1284                 if (cpu_has_load_perf_global_ctrl) {
1285                         clear_atomic_switch_msr_special(
1286                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1287                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
1288                         return;
1289                 }
1290                 break;
1291         }
1292
1293         for (i = 0; i < m->nr; ++i)
1294                 if (m->guest[i].index == msr)
1295                         break;
1296
1297         if (i == m->nr)
1298                 return;
1299         --m->nr;
1300         m->guest[i] = m->guest[m->nr];
1301         m->host[i] = m->host[m->nr];
1302         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
1303         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
1304 }
1305
1306 static void add_atomic_switch_msr_special(unsigned long entry,
1307                 unsigned long exit, unsigned long guest_val_vmcs,
1308                 unsigned long host_val_vmcs, u64 guest_val, u64 host_val)
1309 {
1310         vmcs_write64(guest_val_vmcs, guest_val);
1311         vmcs_write64(host_val_vmcs, host_val);
1312         vmcs_set_bits(VM_ENTRY_CONTROLS, entry);
1313         vmcs_set_bits(VM_EXIT_CONTROLS, exit);
1314 }
1315
1316 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1317                                   u64 guest_val, u64 host_val)
1318 {
1319         unsigned i;
1320         struct msr_autoload *m = &vmx->msr_autoload;
1321
1322         switch (msr) {
1323         case MSR_EFER:
1324                 if (cpu_has_load_ia32_efer) {
1325                         add_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER,
1326                                         VM_EXIT_LOAD_IA32_EFER,
1327                                         GUEST_IA32_EFER,
1328                                         HOST_IA32_EFER,
1329                                         guest_val, host_val);
1330                         return;
1331                 }
1332                 break;
1333         case MSR_CORE_PERF_GLOBAL_CTRL:
1334                 if (cpu_has_load_perf_global_ctrl) {
1335                         add_atomic_switch_msr_special(
1336                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1337                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1338                                         GUEST_IA32_PERF_GLOBAL_CTRL,
1339                                         HOST_IA32_PERF_GLOBAL_CTRL,
1340                                         guest_val, host_val);
1341                         return;
1342                 }
1343                 break;
1344         }
1345
1346         for (i = 0; i < m->nr; ++i)
1347                 if (m->guest[i].index == msr)
1348                         break;
1349
1350         if (i == NR_AUTOLOAD_MSRS) {
1351                 printk_once(KERN_WARNING "Not enough msr switch entries. "
1352                                 "Can't add msr %x\n", msr);
1353                 return;
1354         } else if (i == m->nr) {
1355                 ++m->nr;
1356                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
1357                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
1358         }
1359
1360         m->guest[i].index = msr;
1361         m->guest[i].value = guest_val;
1362         m->host[i].index = msr;
1363         m->host[i].value = host_val;
1364 }
1365
1366 static void reload_tss(void)
1367 {
1368         /*
1369          * VT restores TR but not its size.  Useless.
1370          */
1371         struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
1372         struct desc_struct *descs;
1373
1374         descs = (void *)gdt->address;
1375         descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
1376         load_TR_desc();
1377 }
1378
1379 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
1380 {
1381         u64 guest_efer;
1382         u64 ignore_bits;
1383
1384         guest_efer = vmx->vcpu.arch.efer;
1385
1386         /*
1387          * NX is emulated; LMA and LME handled by hardware; SCE meaningless
1388          * outside long mode
1389          */
1390         ignore_bits = EFER_NX | EFER_SCE;
1391 #ifdef CONFIG_X86_64
1392         ignore_bits |= EFER_LMA | EFER_LME;
1393         /* SCE is meaningful only in long mode on Intel */
1394         if (guest_efer & EFER_LMA)
1395                 ignore_bits &= ~(u64)EFER_SCE;
1396 #endif
1397         guest_efer &= ~ignore_bits;
1398         guest_efer |= host_efer & ignore_bits;
1399         vmx->guest_msrs[efer_offset].data = guest_efer;
1400         vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
1401
1402         clear_atomic_switch_msr(vmx, MSR_EFER);
1403         /* On ept, can't emulate nx, and must switch nx atomically */
1404         if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
1405                 guest_efer = vmx->vcpu.arch.efer;
1406                 if (!(guest_efer & EFER_LMA))
1407                         guest_efer &= ~EFER_LME;
1408                 add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
1409                 return false;
1410         }
1411
1412         return true;
1413 }
1414
1415 static unsigned long segment_base(u16 selector)
1416 {
1417         struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
1418         struct desc_struct *d;
1419         unsigned long table_base;
1420         unsigned long v;
1421
1422         if (!(selector & ~3))
1423                 return 0;
1424
1425         table_base = gdt->address;
1426
1427         if (selector & 4) {           /* from ldt */
1428                 u16 ldt_selector = kvm_read_ldt();
1429
1430                 if (!(ldt_selector & ~3))
1431                         return 0;
1432
1433                 table_base = segment_base(ldt_selector);
1434         }
1435         d = (struct desc_struct *)(table_base + (selector & ~7));
1436         v = get_desc_base(d);
1437 #ifdef CONFIG_X86_64
1438         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
1439                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
1440 #endif
1441         return v;
1442 }
1443
1444 static inline unsigned long kvm_read_tr_base(void)
1445 {
1446         u16 tr;
1447         asm("str %0" : "=g"(tr));
1448         return segment_base(tr);
1449 }
1450
1451 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
1452 {
1453         struct vcpu_vmx *vmx = to_vmx(vcpu);
1454         int i;
1455
1456         if (vmx->host_state.loaded)
1457                 return;
1458
1459         vmx->host_state.loaded = 1;
1460         /*
1461          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
1462          * allow segment selectors with cpl > 0 or ti == 1.
1463          */
1464         vmx->host_state.ldt_sel = kvm_read_ldt();
1465         vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
1466         savesegment(fs, vmx->host_state.fs_sel);
1467         if (!(vmx->host_state.fs_sel & 7)) {
1468                 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
1469                 vmx->host_state.fs_reload_needed = 0;
1470         } else {
1471                 vmcs_write16(HOST_FS_SELECTOR, 0);
1472                 vmx->host_state.fs_reload_needed = 1;
1473         }
1474         savesegment(gs, vmx->host_state.gs_sel);
1475         if (!(vmx->host_state.gs_sel & 7))
1476                 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
1477         else {
1478                 vmcs_write16(HOST_GS_SELECTOR, 0);
1479                 vmx->host_state.gs_ldt_reload_needed = 1;
1480         }
1481
1482 #ifdef CONFIG_X86_64
1483         savesegment(ds, vmx->host_state.ds_sel);
1484         savesegment(es, vmx->host_state.es_sel);
1485 #endif
1486
1487 #ifdef CONFIG_X86_64
1488         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
1489         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
1490 #else
1491         vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
1492         vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
1493 #endif
1494
1495 #ifdef CONFIG_X86_64
1496         rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1497         if (is_long_mode(&vmx->vcpu))
1498                 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1499 #endif
1500         for (i = 0; i < vmx->save_nmsrs; ++i)
1501                 kvm_set_shared_msr(vmx->guest_msrs[i].index,
1502                                    vmx->guest_msrs[i].data,
1503                                    vmx->guest_msrs[i].mask);
1504 }
1505
1506 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
1507 {
1508         if (!vmx->host_state.loaded)
1509                 return;
1510
1511         ++vmx->vcpu.stat.host_state_reload;
1512         vmx->host_state.loaded = 0;
1513 #ifdef CONFIG_X86_64
1514         if (is_long_mode(&vmx->vcpu))
1515                 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1516 #endif
1517         if (vmx->host_state.gs_ldt_reload_needed) {
1518                 kvm_load_ldt(vmx->host_state.ldt_sel);
1519 #ifdef CONFIG_X86_64
1520                 load_gs_index(vmx->host_state.gs_sel);
1521 #else
1522                 loadsegment(gs, vmx->host_state.gs_sel);
1523 #endif
1524         }
1525         if (vmx->host_state.fs_reload_needed)
1526                 loadsegment(fs, vmx->host_state.fs_sel);
1527 #ifdef CONFIG_X86_64
1528         if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
1529                 loadsegment(ds, vmx->host_state.ds_sel);
1530                 loadsegment(es, vmx->host_state.es_sel);
1531         }
1532 #endif
1533         reload_tss();
1534 #ifdef CONFIG_X86_64
1535         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1536 #endif
1537         /*
1538          * If the FPU is not active (through the host task or
1539          * the guest vcpu), then restore the cr0.TS bit.
1540          */
1541         if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded)
1542                 stts();
1543         load_gdt(&__get_cpu_var(host_gdt));
1544 }
1545
1546 static void vmx_load_host_state(struct vcpu_vmx *vmx)
1547 {
1548         preempt_disable();
1549         __vmx_load_host_state(vmx);
1550         preempt_enable();
1551 }
1552
1553 /*
1554  * Switches to the specified vcpu, until a matching vcpu_put(); assumes the
1555  * vcpu mutex is already taken.
1556  */
1557 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1558 {
1559         struct vcpu_vmx *vmx = to_vmx(vcpu);
1560         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1561
1562         if (!vmm_exclusive)
1563                 kvm_cpu_vmxon(phys_addr);
1564         else if (vmx->loaded_vmcs->cpu != cpu)
1565                 loaded_vmcs_clear(vmx->loaded_vmcs);
1566
1567         if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
1568                 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1569                 vmcs_load(vmx->loaded_vmcs->vmcs);
1570         }
1571
1572         if (vmx->loaded_vmcs->cpu != cpu) {
1573                 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
1574                 unsigned long sysenter_esp;
1575
1576                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1577                 local_irq_disable();
1578                 crash_disable_local_vmclear(cpu);
1579
1580                 /*
1581                  * The read of loaded_vmcs->cpu should happen before fetching
1582                  * loaded_vmcs->loaded_vmcss_on_cpu_link.
1583                  * See the comments in __loaded_vmcs_clear().
1584                  */
1585                 smp_rmb();
1586
1587                 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1588                          &per_cpu(loaded_vmcss_on_cpu, cpu));
1589                 crash_enable_local_vmclear(cpu);
1590                 local_irq_enable();
1591
1592                 /*
1593                  * Linux uses per-cpu TSS and GDT, so set these when switching
1594                  * processors.
1595                  */
1596                 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
1597                 vmcs_writel(HOST_GDTR_BASE, gdt->address);   /* 22.2.4 */
1598
1599                 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
1600                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
1601                 vmx->loaded_vmcs->cpu = cpu;
1602         }
1603 }
1604
1605 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1606 {
1607         __vmx_load_host_state(to_vmx(vcpu));
1608         if (!vmm_exclusive) {
1609                 __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
1610                 vcpu->cpu = -1;
1611                 kvm_cpu_vmxoff();
1612         }
1613 }
1614
1615 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
1616 {
1617         ulong cr0;
1618
1619         if (vcpu->fpu_active)
1620                 return;
1621         vcpu->fpu_active = 1;
1622         cr0 = vmcs_readl(GUEST_CR0);
1623         cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
1624         cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
1625         vmcs_writel(GUEST_CR0, cr0);
1626         update_exception_bitmap(vcpu);
1627         vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
1628         if (is_guest_mode(vcpu))
1629                 vcpu->arch.cr0_guest_owned_bits &=
1630                         ~get_vmcs12(vcpu)->cr0_guest_host_mask;
1631         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1632 }
1633
1634 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
1635
1636 /*
1637  * Return the cr0 value that a nested guest would read. This is a combination
1638  * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
1639  * its hypervisor (cr0_read_shadow).
1640  */
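     /*
      * For example, with cr0_guest_host_mask == X86_CR0_TS only the TS bit is
      * taken from cr0_read_shadow; every other bit comes from guest_cr0.
      */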
1641 static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
1642 {
1643         return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
1644                 (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
1645 }
1646 static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
1647 {
1648         return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
1649                 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
1650 }
1651
1652 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
1653 {
1654         /* Note that there is no vcpu->fpu_active = 0 here. The caller must
1655          * set this *before* calling this function.
1656          */
1657         vmx_decache_cr0_guest_bits(vcpu);
1658         vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
1659         update_exception_bitmap(vcpu);
1660         vcpu->arch.cr0_guest_owned_bits = 0;
1661         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1662         if (is_guest_mode(vcpu)) {
1663                 /*
1664                  * L1's specified read shadow might not contain the TS bit,
1665                  * so now that we turned on shadowing of this bit, we need to
1666                  * set this bit of the shadow. Like in nested_vmx_run we need
1667                  * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
1668                  * up-to-date here because we just decached cr0.TS (and we'll
1669                  * only update vmcs12->guest_cr0 on nested exit).
1670                  */
1671                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1672                 vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
1673                         (vcpu->arch.cr0 & X86_CR0_TS);
1674                 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
1675         } else
1676                 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
1677 }
1678
1679 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1680 {
1681         unsigned long rflags, save_rflags;
1682
1683         if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
1684                 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1685                 rflags = vmcs_readl(GUEST_RFLAGS);
1686                 if (to_vmx(vcpu)->rmode.vm86_active) {
1687                         rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1688                         save_rflags = to_vmx(vcpu)->rmode.save_rflags;
1689                         rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1690                 }
1691                 to_vmx(vcpu)->rflags = rflags;
1692         }
1693         return to_vmx(vcpu)->rflags;
1694 }
1695
1696 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1697 {
1698         __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1699         to_vmx(vcpu)->rflags = rflags;
1700         if (to_vmx(vcpu)->rmode.vm86_active) {
1701                 to_vmx(vcpu)->rmode.save_rflags = rflags;
1702                 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1703         }
1704         vmcs_writel(GUEST_RFLAGS, rflags);
1705 }
1706
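     /*
      * The "interrupt shadow" covers the one-instruction window after STI or
      * MOV SS during which the CPU blocks interrupt delivery; the two helpers
      * below translate between the VMCS interruptibility-state field and
      * KVM's KVM_X86_SHADOW_INT_STI / KVM_X86_SHADOW_INT_MOV_SS flags.
      */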
1707 static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1708 {
1709         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1710         int ret = 0;
1711
1712         if (interruptibility & GUEST_INTR_STATE_STI)
1713                 ret |= KVM_X86_SHADOW_INT_STI;
1714         if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1715                 ret |= KVM_X86_SHADOW_INT_MOV_SS;
1716
1717         return ret & mask;
1718 }
1719
1720 static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1721 {
1722         u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1723         u32 interruptibility = interruptibility_old;
1724
1725         interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1726
1727         if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1728                 interruptibility |= GUEST_INTR_STATE_MOV_SS;
1729         else if (mask & KVM_X86_SHADOW_INT_STI)
1730                 interruptibility |= GUEST_INTR_STATE_STI;
1731
1732         if (interruptibility != interruptibility_old)
1733                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1734 }
1735
1736 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
1737 {
1738         unsigned long rip;
1739
1740         rip = kvm_rip_read(vcpu);
1741         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1742         kvm_rip_write(vcpu, rip);
1743
1744         /* skipping an emulated instruction also counts */
1745         vmx_set_interrupt_shadow(vcpu, 0);
1746 }
1747
1748 /*
1749  * KVM wants to inject page faults that it received into the guest. This function
1750  * checks whether in a nested guest, we need to inject them to L1 or L2.
1751  * This function assumes it is called with the exit reason in vmcs02 being
1752  * a #PF exception (this is the only case in which KVM injects a #PF when L2
1753  * is running).
1754  */
1755 static int nested_pf_handled(struct kvm_vcpu *vcpu)
1756 {
1757         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1758
1759         /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
1760         if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
1761                 return 0;
1762
1763         nested_vmx_vmexit(vcpu);
1764         return 1;
1765 }
1766
1767 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1768                                 bool has_error_code, u32 error_code,
1769                                 bool reinject)
1770 {
1771         struct vcpu_vmx *vmx = to_vmx(vcpu);
1772         u32 intr_info = nr | INTR_INFO_VALID_MASK;
1773
1774         if (nr == PF_VECTOR && is_guest_mode(vcpu) &&
1775                 nested_pf_handled(vcpu))
1776                 return;
1777
1778         if (has_error_code) {
1779                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1780                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1781         }
1782
1783         if (vmx->rmode.vm86_active) {
1784                 int inc_eip = 0;
1785                 if (kvm_exception_is_soft(nr))
1786                         inc_eip = vcpu->arch.event_exit_inst_len;
1787                 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
1788                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
1789                 return;
1790         }
1791
1792         if (kvm_exception_is_soft(nr)) {
1793                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1794                              vmx->vcpu.arch.event_exit_inst_len);
1795                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1796         } else
1797                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1798
1799         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1800 }
1801
1802 static bool vmx_rdtscp_supported(void)
1803 {
1804         return cpu_has_vmx_rdtscp();
1805 }
1806
1807 static bool vmx_invpcid_supported(void)
1808 {
1809         return cpu_has_vmx_invpcid() && enable_ept;
1810 }
1811
1812 /*
1813  * Swap MSR entry in host/guest MSR entry array.
1814  */
1815 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
1816 {
1817         struct shared_msr_entry tmp;
1818
1819         tmp = vmx->guest_msrs[to];
1820         vmx->guest_msrs[to] = vmx->guest_msrs[from];
1821         vmx->guest_msrs[from] = tmp;
1822 }
1823
1824 /*
1825  * Set up the vmcs to automatically save and restore system
1826  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
1827  * mode, as fiddling with msrs is very expensive.
1828  */
1829 static void setup_msrs(struct vcpu_vmx *vmx)
1830 {
1831         int save_nmsrs, index;
1832         unsigned long *msr_bitmap;
1833
1834         save_nmsrs = 0;
1835 #ifdef CONFIG_X86_64
1836         if (is_long_mode(&vmx->vcpu)) {
1837                 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
1838                 if (index >= 0)
1839                         move_msr_up(vmx, index, save_nmsrs++);
1840                 index = __find_msr_index(vmx, MSR_LSTAR);
1841                 if (index >= 0)
1842                         move_msr_up(vmx, index, save_nmsrs++);
1843                 index = __find_msr_index(vmx, MSR_CSTAR);
1844                 if (index >= 0)
1845                         move_msr_up(vmx, index, save_nmsrs++);
1846                 index = __find_msr_index(vmx, MSR_TSC_AUX);
1847                 if (index >= 0 && vmx->rdtscp_enabled)
1848                         move_msr_up(vmx, index, save_nmsrs++);
1849                 /*
1850                  * MSR_STAR is only needed on long mode guests, and only
1851                  * if efer.sce is enabled.
1852                  */
1853                 index = __find_msr_index(vmx, MSR_STAR);
1854                 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
1855                         move_msr_up(vmx, index, save_nmsrs++);
1856         }
1857 #endif
1858         index = __find_msr_index(vmx, MSR_EFER);
1859         if (index >= 0 && update_transition_efer(vmx, index))
1860                 move_msr_up(vmx, index, save_nmsrs++);
1861
1862         vmx->save_nmsrs = save_nmsrs;
1863
1864         if (cpu_has_vmx_msr_bitmap()) {
1865                 if (is_long_mode(&vmx->vcpu))
1866                         msr_bitmap = vmx_msr_bitmap_longmode;
1867                 else
1868                         msr_bitmap = vmx_msr_bitmap_legacy;
1869
1870                 vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
1871         }
1872 }
1873
1874 /*
1875  * reads and returns guest's timestamp counter "register"
1876  * guest_tsc = host_tsc + tsc_offset    -- 21.3
1877  */
1878 static u64 guest_read_tsc(void)
1879 {
1880         u64 host_tsc, tsc_offset;
1881
1882         rdtscll(host_tsc);
1883         tsc_offset = vmcs_read64(TSC_OFFSET);
1884         return host_tsc + tsc_offset;
1885 }
1886
1887 /*
1888  * Like guest_read_tsc, but always returns L1's notion of the timestamp
1889  * counter, even if a nested guest (L2) is currently running.
1890  */
1891 u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
1892 {
1893         u64 tsc_offset;
1894
1895         tsc_offset = is_guest_mode(vcpu) ?
1896                 to_vmx(vcpu)->nested.vmcs01_tsc_offset :
1897                 vmcs_read64(TSC_OFFSET);
1898         return host_tsc + tsc_offset;
1899 }
1900
1901 /*
1902  * Engage any workarounds for mismatched TSC rates.  Currently limited to
1903  * software catchup for faster rates on slower CPUs.
1904  */
1905 static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1906 {
1907         if (!scale)
1908                 return;
1909
1910         if (user_tsc_khz > tsc_khz) {
1911                 vcpu->arch.tsc_catchup = 1;
1912                 vcpu->arch.tsc_always_catchup = 1;
1913         } else
1914                 WARN(1, "user requested TSC rate below hardware speed\n");
1915 }
1916
1917 static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
1918 {
1919         return vmcs_read64(TSC_OFFSET);
1920 }
1921
1922 /*
1923  * writes 'offset' into guest's timestamp counter offset register
1924  */
1925 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1926 {
1927         if (is_guest_mode(vcpu)) {
1928                 /*
1929                  * We're here if L1 chose not to trap WRMSR to TSC. According
1930                  * to the spec, this should set L1's TSC; The offset that L1
1931                  * to the spec, this should set L1's TSC; the offset that L1
1932                  * to the newly set TSC to get L2's TSC.
1933                  */
1934                 struct vmcs12 *vmcs12;
1935                 to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset;
1936                 /* recalculate vmcs02.TSC_OFFSET: */
1937                 vmcs12 = get_vmcs12(vcpu);
1938                 vmcs_write64(TSC_OFFSET, offset +
1939                         (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
1940                          vmcs12->tsc_offset : 0));
1941         } else {
1942                 vmcs_write64(TSC_OFFSET, offset);
1943         }
1944 }
1945
1946 static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
1947 {
1948         u64 offset = vmcs_read64(TSC_OFFSET);
1949         vmcs_write64(TSC_OFFSET, offset + adjustment);
1950         if (is_guest_mode(vcpu)) {
1951                 /* Even when running L2, the adjustment needs to apply to L1 */
1952                 to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
1953         }
1954 }
1955
1956 static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1957 {
1958         return target_tsc - native_read_tsc();
1959 }
1960
1961 static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
1962 {
1963         struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
1964         return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
1965 }
1966
1967 /*
1968  * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1969  * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1970  * all guests if the "nested" module option is off, and can also be disabled
1971  * for a single guest by disabling its VMX cpuid bit.
1972  */
1973 static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
1974 {
1975         return nested && guest_cpuid_has_vmx(vcpu);
1976 }
1977
1978 /*
1979  * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
1980  * returned for the various VMX controls MSRs when nested VMX is enabled.
1981  * The same values should also be used to verify that vmcs12 control fields are
1982  * valid during nested entry from L1 to L2.
1983  * Each of these control msrs has a low and high 32-bit half: A low bit is on
1984  * if the corresponding bit in the (32-bit) control field *must* be on, and a
1985  * bit in the high half is on if the corresponding bit in the control field
1986  * may be on. See also vmx_control_verify().
1987  * TODO: allow these variables to be modified (downgraded) by module options
1988  * or other means.
1989  */
1990 static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
1991 static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
1992 static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
1993 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
1994 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
1995 static __init void nested_vmx_setup_ctls_msrs(void)
1996 {
1997         /*
1998          * Note that as a general rule, the high half of the MSRs (bits in
1999          * the control fields which may be 1) should be initialized by the
2000          * intersection of the underlying hardware's MSR (i.e., features which
2001          * can be supported) and the list of features we want to expose -
2002          * because they are known to be properly supported in our code.
2003          * Also, usually, the low half of the MSRs (bits which must be 1) can
2004          * be set to 0, meaning that L1 may turn off any of these bits. The
2005          * reason is that if one of these bits is necessary, it will appear
2006          * in vmcs01, and prepare_vmcs02, which bitwise-or's the control
2007          * fields of vmcs01 and vmcs12, will keep these bits on - and
2008          * nested_vmx_exit_handled() will not pass the related exits to L1.
2009          * These rules have exceptions below.
2010          */
2011
2012         /* pin-based controls */
2013         /*
2014          * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
2015          * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
2016          */
2017         nested_vmx_pinbased_ctls_low = 0x16;
2018         nested_vmx_pinbased_ctls_high = 0x16 |
2019                 PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
2020                 PIN_BASED_VIRTUAL_NMIS;
2021
2022         /* exit controls */
2023         nested_vmx_exit_ctls_low = 0;
2024         /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
2025 #ifdef CONFIG_X86_64
2026         nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
2027 #else
2028         nested_vmx_exit_ctls_high = 0;
2029 #endif
2030
2031         /* entry controls */
2032         rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
2033                 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
2034         nested_vmx_entry_ctls_low = 0;
2035         nested_vmx_entry_ctls_high &=
2036                 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
2037
2038         /* cpu-based controls */
2039         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
2040                 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
2041         nested_vmx_procbased_ctls_low = 0;
2042         nested_vmx_procbased_ctls_high &=
2043                 CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING |
2044                 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
2045                 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
2046                 CPU_BASED_CR3_STORE_EXITING |
2047 #ifdef CONFIG_X86_64
2048                 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
2049 #endif
2050                 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
2051                 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
2052                 CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
2053                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2054         /*
2055          * We can allow some features even when not supported by the
2056          * hardware. For example, L1 can specify an MSR bitmap - and we
2057          * can use it to avoid exits to L1 - even when L0 runs L2
2058          * without MSR bitmaps.
2059          */
2060         nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS;
2061
2062         /* secondary cpu-based controls */
2063         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
2064                 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
2065         nested_vmx_secondary_ctls_low = 0;
2066         nested_vmx_secondary_ctls_high &=
2067                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2068 }
2069
2070 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
2071 {
2072         /*
2073          * Bits clear in 'high' must be clear in 'control'; bits set in 'low' must be set.
2074          */
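             /*
              * For example, with low == 0x16 and high == (0x16 | extra bits),
              * a control value that clears any bit of 0x16, or sets a bit
              * outside high, makes the expression below differ from control.
              */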
2075         return ((control & high) | low) == control;
2076 }
2077
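     /*
      * VMX capability MSR layout: bits 31:0 hold the allowed-0 settings (bits
      * that must be 1 in the control) and bits 63:32 the allowed-1 settings
      * (bits that may be 1), which is what vmx_control_msr() packs below.
      */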
2078 static inline u64 vmx_control_msr(u32 low, u32 high)
2079 {
2080         return low | ((u64)high << 32);
2081 }
2082
2083 /*
2084  * If we allow our guest to use VMX instructions (i.e., nested VMX), we should
2085  * also let it use VMX-specific MSRs.
2086  * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a
2087  * VMX-specific MSR, or 0 when we haven't (and the caller should handle it
2088  * like all other MSRs).
2089  */
2090 static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2091 {
2092         if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC &&
2093                      msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) {
2094                 /*
2095                  * According to the spec, processors which do not support VMX
2096                  * should throw a #GP(0) when VMX capability MSRs are read.
2097                  */
2098                 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
2099                 return 1;
2100         }
2101
2102         switch (msr_index) {
2103         case MSR_IA32_FEATURE_CONTROL:
2104                 *pdata = 0;
2105                 break;
2106         case MSR_IA32_VMX_BASIC:
2107                 /*
2108                  * This MSR reports some information about VMX support. We
2109                  * should return information about the VMX we emulate for the
2110                  * guest, and the VMCS structure we give it - not about the
2111                  * VMX support of the underlying hardware.
2112                  */
2113                 *pdata = VMCS12_REVISION |
2114                            ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
2115                            (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
2116                 break;
2117         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2118         case MSR_IA32_VMX_PINBASED_CTLS:
2119                 *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
2120                                         nested_vmx_pinbased_ctls_high);
2121                 break;
2122         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2123         case MSR_IA32_VMX_PROCBASED_CTLS:
2124                 *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
2125                                         nested_vmx_procbased_ctls_high);
2126                 break;
2127         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2128         case MSR_IA32_VMX_EXIT_CTLS:
2129                 *pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
2130                                         nested_vmx_exit_ctls_high);
2131                 break;
2132         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2133         case MSR_IA32_VMX_ENTRY_CTLS:
2134                 *pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
2135                                         nested_vmx_entry_ctls_high);
2136                 break;
2137         case MSR_IA32_VMX_MISC:
2138                 *pdata = 0;
2139                 break;
2140         /*
2141          * These MSRs specify bits which the guest must keep fixed (on or off)
2142          * while L1 is in VMXON mode (in L1's root mode, or running an L2).
2143          * We picked the standard core2 setting.
2144          */
2145 #define VMXON_CR0_ALWAYSON      (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
2146 #define VMXON_CR4_ALWAYSON      X86_CR4_VMXE
2147         case MSR_IA32_VMX_CR0_FIXED0:
2148                 *pdata = VMXON_CR0_ALWAYSON;
2149                 break;
2150         case MSR_IA32_VMX_CR0_FIXED1:
2151                 *pdata = -1ULL;
2152                 break;
2153         case MSR_IA32_VMX_CR4_FIXED0:
2154                 *pdata = VMXON_CR4_ALWAYSON;
2155                 break;
2156         case MSR_IA32_VMX_CR4_FIXED1:
2157                 *pdata = -1ULL;
2158                 break;
2159         case MSR_IA32_VMX_VMCS_ENUM:
2160                 *pdata = 0x1f;
2161                 break;
2162         case MSR_IA32_VMX_PROCBASED_CTLS2:
2163                 *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
2164                                         nested_vmx_secondary_ctls_high);
2165                 break;
2166         case MSR_IA32_VMX_EPT_VPID_CAP:
2167                 /* Currently, no nested ept or nested vpid */
2168                 *pdata = 0;
2169                 break;
2170         default:
2171                 return 0;
2172         }
2173
2174         return 1;
2175 }
2176
2177 static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2178 {
2179         if (!nested_vmx_allowed(vcpu))
2180                 return 0;
2181
2182         if (msr_index == MSR_IA32_FEATURE_CONTROL)
2183                 /* TODO: the right thing. */
2184                 return 1;
2185         /*
2186          * No need to treat VMX capability MSRs specially: If we don't handle
2187          * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
2188          * them, handle_wrmsr will #GP(0), which is correct (they are read-only).
2189         return 0;
2190 }
2191
2192 /*
2193  * Reads an msr value (of 'msr_index') into 'pdata'.
2194  * Returns 0 on success, non-0 otherwise.
2195  * Assumes vcpu_load() was already called.
2196  */
2197 static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2198 {
2199         u64 data;
2200         struct shared_msr_entry *msr;
2201
2202         if (!pdata) {
2203                 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
2204                 return -EINVAL;
2205         }
2206
2207         switch (msr_index) {
2208 #ifdef CONFIG_X86_64
2209         case MSR_FS_BASE:
2210                 data = vmcs_readl(GUEST_FS_BASE);
2211                 break;
2212         case MSR_GS_BASE:
2213                 data = vmcs_readl(GUEST_GS_BASE);
2214                 break;
2215         case MSR_KERNEL_GS_BASE:
2216                 vmx_load_host_state(to_vmx(vcpu));
2217                 data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
2218                 break;
2219 #endif
2220         case MSR_EFER:
2221                 return kvm_get_msr_common(vcpu, msr_index, pdata);
2222         case MSR_IA32_TSC:
2223                 data = guest_read_tsc();
2224                 break;
2225         case MSR_IA32_SYSENTER_CS:
2226                 data = vmcs_read32(GUEST_SYSENTER_CS);
2227                 break;
2228         case MSR_IA32_SYSENTER_EIP:
2229                 data = vmcs_readl(GUEST_SYSENTER_EIP);
2230                 break;
2231         case MSR_IA32_SYSENTER_ESP:
2232                 data = vmcs_readl(GUEST_SYSENTER_ESP);
2233                 break;
2234         case MSR_TSC_AUX:
2235                 if (!to_vmx(vcpu)->rdtscp_enabled)
2236                         return 1;
2237                 /* Otherwise falls through */
2238         default:
2239                 if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
2240                         return 0;
2241                 msr = find_msr_entry(to_vmx(vcpu), msr_index);
2242                 if (msr) {
2243                         data = msr->data;
2244                         break;
2245                 }
2246                 return kvm_get_msr_common(vcpu, msr_index, pdata);
2247         }
2248
2249         *pdata = data;
2250         return 0;
2251 }
2252
2253 /*
2254  * Writes msr value into the appropriate "register".
2255  * Returns 0 on success, non-0 otherwise.
2256  * Assumes vcpu_load() was already called.
2257  */
2258 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2259 {
2260         struct vcpu_vmx *vmx = to_vmx(vcpu);
2261         struct shared_msr_entry *msr;
2262         int ret = 0;
2263         u32 msr_index = msr_info->index;
2264         u64 data = msr_info->data;
2265
2266         switch (msr_index) {
2267         case MSR_EFER:
2268                 ret = kvm_set_msr_common(vcpu, msr_info);
2269                 break;
2270 #ifdef CONFIG_X86_64
2271         case MSR_FS_BASE:
2272                 vmx_segment_cache_clear(vmx);
2273                 vmcs_writel(GUEST_FS_BASE, data);
2274                 break;
2275         case MSR_GS_BASE:
2276                 vmx_segment_cache_clear(vmx);
2277                 vmcs_writel(GUEST_GS_BASE, data);
2278                 break;
2279         case MSR_KERNEL_GS_BASE:
2280                 vmx_load_host_state(vmx);
2281                 vmx->msr_guest_kernel_gs_base = data;
2282                 break;
2283 #endif
2284         case MSR_IA32_SYSENTER_CS:
2285                 vmcs_write32(GUEST_SYSENTER_CS, data);
2286                 break;
2287         case MSR_IA32_SYSENTER_EIP:
2288                 vmcs_writel(GUEST_SYSENTER_EIP, data);
2289                 break;
2290         case MSR_IA32_SYSENTER_ESP:
2291                 vmcs_writel(GUEST_SYSENTER_ESP, data);
2292                 break;
2293         case MSR_IA32_TSC:
2294                 kvm_write_tsc(vcpu, msr_info);
2295                 break;
2296         case MSR_IA32_CR_PAT:
2297                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2298                         vmcs_write64(GUEST_IA32_PAT, data);
2299                         vcpu->arch.pat = data;
2300                         break;
2301                 }
2302                 ret = kvm_set_msr_common(vcpu, msr_info);
2303                 break;
2304         case MSR_IA32_TSC_ADJUST:
2305                 ret = kvm_set_msr_common(vcpu, msr_info);
2306                 break;
2307         case MSR_TSC_AUX:
2308                 if (!vmx->rdtscp_enabled)
2309                         return 1;
2310                 /* Check reserved bit, higher 32 bits should be zero */
2311                 if ((data >> 32) != 0)
2312                         return 1;
2313                 /* Otherwise falls through */
2314         default:
2315                 if (vmx_set_vmx_msr(vcpu, msr_index, data))
2316                         break;
2317                 msr = find_msr_entry(vmx, msr_index);
2318                 if (msr) {
2319                         msr->data = data;
2320                         if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
2321                                 preempt_disable();
2322                                 kvm_set_shared_msr(msr->index, msr->data,
2323                                                    msr->mask);
2324                                 preempt_enable();
2325                         }
2326                         break;
2327                 }
2328                 ret = kvm_set_msr_common(vcpu, msr_info);
2329         }
2330
2331         return ret;
2332 }
2333
2334 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2335 {
2336         __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
2337         switch (reg) {
2338         case VCPU_REGS_RSP:
2339                 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2340                 break;
2341         case VCPU_REGS_RIP:
2342                 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2343                 break;
2344         case VCPU_EXREG_PDPTR:
2345                 if (enable_ept)
2346                         ept_save_pdptrs(vcpu);
2347                 break;
2348         default:
2349                 break;
2350         }
2351 }
2352
2353 static __init int cpu_has_kvm_support(void)
2354 {
2355         return cpu_has_vmx();
2356 }
2357
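     /*
      * IA32_FEATURE_CONTROL gates VMXON: once firmware sets the lock bit, VMX
      * may only be used in the operating modes (inside/outside SMX) whose
      * enable bits were set before locking.
      */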
2358 static __init int vmx_disabled_by_bios(void)
2359 {
2360         u64 msr;
2361
2362         rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
2363         if (msr & FEATURE_CONTROL_LOCKED) {
2364                 /* launched w/ TXT and VMX disabled */
2365                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
2366                         && tboot_enabled())
2367                         return 1;
2368                 /* launched w/o TXT and VMX only enabled w/ TXT */
2369                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
2370                         && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
2371                         && !tboot_enabled()) {
2372                         printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
2373                                 "activate TXT before enabling KVM\n");
2374                         return 1;
2375                 }
2376                 /* launched w/o TXT and VMX disabled */
2377                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
2378                         && !tboot_enabled())
2379                         return 1;
2380         }
2381
2382         return 0;
2383 }
2384
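     /*
      * VMXON takes the 64-bit physical address of the VMXON region as a
      * memory operand; the asm below passes a pointer to 'addr' for that.
      */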
2385 static void kvm_cpu_vmxon(u64 addr)
2386 {
2387         asm volatile (ASM_VMX_VMXON_RAX
2388                         : : "a"(&addr), "m"(addr)
2389                         : "memory", "cc");
2390 }
2391
2392 static int hardware_enable(void *garbage)
2393 {
2394         int cpu = raw_smp_processor_id();
2395         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2396         u64 old, test_bits;
2397
2398         if (read_cr4() & X86_CR4_VMXE)
2399                 return -EBUSY;
2400
2401         INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
2402
2403         /*
2404          * Now we can enable the vmclear operation in kdump
2405          * since the loaded_vmcss_on_cpu list on this cpu
2406          * has been initialized.
2407          *
2408          * Though the cpu is not in VMX operation now, it is
2409          * safe to enable the vmclear operation because the
2410          * loaded_vmcss_on_cpu list is empty.
2411          */
2412         crash_enable_local_vmclear(cpu);
2413
2414         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
2415
2416         test_bits = FEATURE_CONTROL_LOCKED;
2417         test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
2418         if (tboot_enabled())
2419                 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
2420
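             /*
              * If the BIOS left IA32_FEATURE_CONTROL unlocked, set the VMXON
              * enable bits we need and lock the MSR ourselves.
              */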
2421         if ((old & test_bits) != test_bits) {
2422                 /* enable and lock */
2423                 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
2424         }
2425         write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
2426
2427         if (vmm_exclusive) {
2428                 kvm_cpu_vmxon(phys_addr);
2429                 ept_sync_global();
2430         }
2431
2432         store_gdt(&__get_cpu_var(host_gdt));
2433
2434         return 0;
2435 }
2436
2437 static void vmclear_local_loaded_vmcss(void)
2438 {
2439         int cpu = raw_smp_processor_id();
2440         struct loaded_vmcs *v, *n;
2441
2442         list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2443                                  loaded_vmcss_on_cpu_link)
2444                 __loaded_vmcs_clear(v);
2445 }
2446
2447
2448 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
2449  * tricks.
2450  */
2451 static void kvm_cpu_vmxoff(void)
2452 {
2453         asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
2454 }
2455
2456 static void hardware_disable(void *garbage)
2457 {
2458         if (vmm_exclusive) {
2459                 vmclear_local_loaded_vmcss();
2460                 kvm_cpu_vmxoff();
2461         }
2462         write_cr4(read_cr4() & ~X86_CR4_VMXE);
2463 }
2464
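     /*
      * Combine the required (ctl_min) and optional (ctl_opt) bits with what
      * the capability MSR allows: optional bits the CPU cannot set are
      * dropped, bits the CPU insists on are forced, and only a lost required
      * bit is an error.
      */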
2465 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
2466                                       u32 msr, u32 *result)
2467 {
2468         u32 vmx_msr_low, vmx_msr_high;
2469         u32 ctl = ctl_min | ctl_opt;
2470
2471         rdmsr(msr, vmx_msr_low, vmx_msr_high);
2472
2473         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2474         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
2475
2476         /* Ensure minimum (required) set of control bits are supported. */
2477         if (ctl_min & ~ctl)
2478                 return -EIO;
2479
2480         *result = ctl;
2481         return 0;
2482 }
2483
2484 static __init bool allow_1_setting(u32 msr, u32 ctl)
2485 {
2486         u32 vmx_msr_low, vmx_msr_high;
2487
2488         rdmsr(msr, vmx_msr_low, vmx_msr_high);
2489         return vmx_msr_high & ctl;
2490 }
2491
2492 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2493 {
2494         u32 vmx_msr_low, vmx_msr_high;
2495         u32 min, opt, min2, opt2;
2496         u32 _pin_based_exec_control = 0;
2497         u32 _cpu_based_exec_control = 0;
2498         u32 _cpu_based_2nd_exec_control = 0;
2499         u32 _vmexit_control = 0;
2500         u32 _vmentry_control = 0;
2501
2502         min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
2503         opt = PIN_BASED_VIRTUAL_NMIS;
2504         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
2505                                 &_pin_based_exec_control) < 0)
2506                 return -EIO;
2507
2508         min = CPU_BASED_HLT_EXITING |
2509 #ifdef CONFIG_X86_64
2510               CPU_BASED_CR8_LOAD_EXITING |
2511               CPU_BASED_CR8_STORE_EXITING |
2512 #endif
2513               CPU_BASED_CR3_LOAD_EXITING |
2514               CPU_BASED_CR3_STORE_EXITING |
2515               CPU_BASED_USE_IO_BITMAPS |
2516               CPU_BASED_MOV_DR_EXITING |
2517               CPU_BASED_USE_TSC_OFFSETING |
2518               CPU_BASED_MWAIT_EXITING |
2519               CPU_BASED_MONITOR_EXITING |
2520               CPU_BASED_INVLPG_EXITING |
2521               CPU_BASED_RDPMC_EXITING;
2522
2523         opt = CPU_BASED_TPR_SHADOW |
2524               CPU_BASED_USE_MSR_BITMAPS |
2525               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2526         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
2527                                 &_cpu_based_exec_control) < 0)
2528                 return -EIO;
2529 #ifdef CONFIG_X86_64
2530         if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2531                 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
2532                                            ~CPU_BASED_CR8_STORE_EXITING;
2533 #endif
2534         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2535                 min2 = 0;
2536                 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2537                         SECONDARY_EXEC_WBINVD_EXITING |
2538                         SECONDARY_EXEC_ENABLE_VPID |
2539                         SECONDARY_EXEC_ENABLE_EPT |
2540                         SECONDARY_EXEC_UNRESTRICTED_GUEST |
2541                         SECONDARY_EXEC_PAUSE_LOOP_EXITING |
2542                         SECONDARY_EXEC_RDTSCP |
2543                         SECONDARY_EXEC_ENABLE_INVPCID;
2544                 if (adjust_vmx_controls(min2, opt2,
2545                                         MSR_IA32_VMX_PROCBASED_CTLS2,
2546                                         &_cpu_based_2nd_exec_control) < 0)
2547                         return -EIO;
2548         }
2549 #ifndef CONFIG_X86_64
2550         if (!(_cpu_based_2nd_exec_control &
2551                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2552                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2553 #endif
2554         if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
2555                 /* CR3 accesses and invlpg don't need to cause VM exits
2556                    when EPT is enabled */
2557                 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
2558                                              CPU_BASED_CR3_STORE_EXITING |
2559                                              CPU_BASED_INVLPG_EXITING);
2560                 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
2561                       vmx_capability.ept, vmx_capability.vpid);
2562         }
2563
2564         min = 0;
2565 #ifdef CONFIG_X86_64
2566         min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
2567 #endif
2568         opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
2569         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
2570                                 &_vmexit_control) < 0)
2571                 return -EIO;
2572
2573         min = 0;
2574         opt = VM_ENTRY_LOAD_IA32_PAT;
2575         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
2576                                 &_vmentry_control) < 0)
2577                 return -EIO;
2578
2579         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
2580
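             /*
              * IA32_VMX_BASIC, high dword: bits 12:0 hold the VMCS region
              * size, bit 16 is set when physical addresses are limited to
              * 32 bits, and bits 21:18 give the required VMCS memory type
              * (6 == write-back), which the checks below rely on.
              */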
2581         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2582         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
2583                 return -EIO;
2584
2585 #ifdef CONFIG_X86_64
2586         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
2587         if (vmx_msr_high & (1u<<16))
2588                 return -EIO;
2589 #endif
2590
2591         /* Require Write-Back (WB) memory type for VMCS accesses. */
2592         if (((vmx_msr_high >> 18) & 15) != 6)
2593                 return -EIO;
2594
2595         vmcs_conf->size = vmx_msr_high & 0x1fff;
2596         vmcs_conf->order = get_order(vmcs_config.size);
2597         vmcs_conf->revision_id = vmx_msr_low;
2598
2599         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2600         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2601         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2602         vmcs_conf->vmexit_ctrl         = _vmexit_control;
2603         vmcs_conf->vmentry_ctrl        = _vmentry_control;
2604
2605         cpu_has_load_ia32_efer =
2606                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
2607                                 VM_ENTRY_LOAD_IA32_EFER)
2608                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
2609                                    VM_EXIT_LOAD_IA32_EFER);
2610
2611         cpu_has_load_perf_global_ctrl =
2612                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
2613                                 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
2614                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
2615                                    VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
2616
2617         /*
2618          * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
2619          * but due to the errata below it can't be used. The workaround is to
2620          * use the MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2621          *
2622          * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
2623          *
2624          * AAK155             (model 26)
2625          * AAP115             (model 30)
2626          * AAT100             (model 37)
2627          * BC86,AAY89,BD102   (model 44)
2628          * BA97               (model 46)
2629          *
2630          */
2631         if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
2632                 switch (boot_cpu_data.x86_model) {
2633                 case 26:
2634                 case 30:
2635                 case 37:
2636                 case 44:
2637                 case 46:
2638                         cpu_has_load_perf_global_ctrl = false;
2639                         printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
2640                                         "does not work properly. Using workaround\n");
2641                         break;
2642                 default:
2643                         break;
2644                 }
2645         }
2646
2647         return 0;
2648 }
2649
2650 static struct vmcs *alloc_vmcs_cpu(int cpu)
2651 {
2652         int node = cpu_to_node(cpu);
2653         struct page *pages;
2654         struct vmcs *vmcs;
2655
2656         pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
2657         if (!pages)
2658                 return NULL;
2659         vmcs = page_address(pages);
2660         memset(vmcs, 0, vmcs_config.size);
2661         vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
2662         return vmcs;
2663 }
2664
2665 static struct vmcs *alloc_vmcs(void)
2666 {
2667         return alloc_vmcs_cpu(raw_smp_processor_id());
2668 }
2669
2670 static void free_vmcs(struct vmcs *vmcs)
2671 {
2672         free_pages((unsigned long)vmcs, vmcs_config.order);
2673 }
2674
2675 /*
2676  * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2677  */
2678 static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2679 {
2680         if (!loaded_vmcs->vmcs)
2681                 return;
2682         loaded_vmcs_clear(loaded_vmcs);
2683         free_vmcs(loaded_vmcs->vmcs);
2684         loaded_vmcs->vmcs = NULL;
2685 }
2686
2687 static void free_kvm_area(void)
2688 {
2689         int cpu;
2690
2691         for_each_possible_cpu(cpu) {
2692                 free_vmcs(per_cpu(vmxarea, cpu));
2693                 per_cpu(vmxarea, cpu) = NULL;
2694         }
2695 }
2696
2697 static __init int alloc_kvm_area(void)
2698 {
2699         int cpu;
2700
2701         for_each_possible_cpu(cpu) {
2702                 struct vmcs *vmcs;
2703
2704                 vmcs = alloc_vmcs_cpu(cpu);
2705                 if (!vmcs) {
2706                         free_kvm_area();
2707                         return -ENOMEM;
2708                 }
2709
2710                 per_cpu(vmxarea, cpu) = vmcs;
2711         }
2712         return 0;
2713 }
2714
2715 static __init int hardware_setup(void)
2716 {
2717         if (setup_vmcs_config(&vmcs_config) < 0)
2718                 return -EIO;
2719
2720         if (boot_cpu_has(X86_FEATURE_NX))
2721                 kvm_enable_efer_bits(EFER_NX);
2722
2723         if (!cpu_has_vmx_vpid())
2724                 enable_vpid = 0;
2725
2726         if (!cpu_has_vmx_ept() ||
2727             !cpu_has_vmx_ept_4levels()) {
2728                 enable_ept = 0;
2729                 enable_unrestricted_guest = 0;
2730                 enable_ept_ad_bits = 0;
2731         }
2732
2733         if (!cpu_has_vmx_ept_ad_bits())
2734                 enable_ept_ad_bits = 0;
2735
2736         if (!cpu_has_vmx_unrestricted_guest())
2737                 enable_unrestricted_guest = 0;
2738
2739         if (!cpu_has_vmx_flexpriority())
2740                 flexpriority_enabled = 0;
2741
2742         if (!cpu_has_vmx_tpr_shadow())
2743                 kvm_x86_ops->update_cr8_intercept = NULL;
2744
2745         if (enable_ept && !cpu_has_vmx_ept_2m_page())
2746                 kvm_disable_largepages();
2747
2748         if (!cpu_has_vmx_ple())
2749                 ple_gap = 0;
2750
2751         if (nested)
2752                 nested_vmx_setup_ctls_msrs();
2753
2754         return alloc_kvm_area();
2755 }
2756
2757 static __exit void hardware_unsetup(void)
2758 {
2759         free_kvm_area();
2760 }
2761
2762 static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg,
2763                 struct kvm_segment *save)
2764 {
2765         if (!emulate_invalid_guest_state) {
2766                 /*
2767                  * CS and SS RPL should be equal during guest entry according
2768                  * to VMX spec, but in reality it is not always so. Since vcpu
2769                  * is in the middle of the transition from real mode to
2770                  * protected mode it is safe to assume that RPL 0 is a good
2771                  * default value.
2772                  */
2773                 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
2774                         save->selector &= ~SELECTOR_RPL_MASK;
2775                 save->dpl = save->selector & SELECTOR_RPL_MASK;
2776                 save->s = 1;
2777         }
2778         vmx_set_segment(vcpu, save, seg);
2779 }
2780
2781 static void enter_pmode(struct kvm_vcpu *vcpu)
2782 {
2783         unsigned long flags;
2784         struct vcpu_vmx *vmx = to_vmx(vcpu);
2785
2786         /*
2787          * Update real mode segment cache. It may not be up-to-date if a segment
2788          * register was written while the vcpu was in guest mode.
2789          */
2790         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2791         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2792         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2793         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2794         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2795         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2796
2797         vmx->emulation_required = 1;
2798         vmx->rmode.vm86_active = 0;
2799
2800         vmx_segment_cache_clear(vmx);
2801
2802         vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2803
2804         flags = vmcs_readl(GUEST_RFLAGS);
2805         flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
2806         flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
2807         vmcs_writel(GUEST_RFLAGS, flags);
2808
2809         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
2810                         (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
2811
2812         update_exception_bitmap(vcpu);
2813
2814         fix_pmode_dataseg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2815         fix_pmode_dataseg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2816         fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2817         fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2818         fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2819         fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2820
2821         /* CPL is always 0 when CPU enters protected mode */
2822         __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
2823         vmx->cpl = 0;
2824 }
2825
2826 static gva_t rmode_tss_base(struct kvm *kvm)
2827 {
2828         if (!kvm->arch.tss_addr) {
2829                 struct kvm_memslots *slots;
2830                 struct kvm_memory_slot *slot;
2831                 gfn_t base_gfn;
2832
2833                 slots = kvm_memslots(kvm);
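                     /*
                      * No TSS address was set by userspace, so place the
                      * three-page real-mode TSS in the last pages of memory
                      * slot 0.
                      */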
2834                 slot = id_to_memslot(slots, 0);
2835                 base_gfn = slot->base_gfn + slot->npages - 3;
2836
2837                 return base_gfn << PAGE_SHIFT;
2838         }
2839         return kvm->arch.tss_addr;
2840 }
2841
2842 static void fix_rmode_seg(int seg, struct kvm_segment *save)
2843 {
2844         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2845         struct kvm_segment var = *save;
2846
2847         var.dpl = 0x3;
2848         if (seg == VCPU_SREG_CS)
2849                 var.type = 0x3;
2850
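             /*
              * Real-mode segmentation: base == selector << 4 and the limit is
              * 64K, which is what the !emulate_invalid_guest_state path below
              * reconstructs for the vm86 guest.
              */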
2851         if (!emulate_invalid_guest_state) {
2852                 var.selector = var.base >> 4;
2853                 var.base = var.base & 0xffff0;
2854                 var.limit = 0xffff;
2855                 var.g = 0;
2856                 var.db = 0;
2857                 var.present = 1;
2858                 var.s = 1;
2859                 var.l = 0;
2860                 var.unusable = 0;
2861                 var.type = 0x3;
2862                 var.avl = 0;
2863                 if (save->base & 0xf)
2864                         printk_once(KERN_WARNING "kvm: segment base is not "
2865                                         "paragraph aligned when entering "
2866                                         "protected mode (seg=%d)\n", seg);
2867         }
2868
2869         vmcs_write16(sf->selector, var.selector);
2870         vmcs_write32(sf->base, var.base);
2871         vmcs_write32(sf->limit, var.limit);
2872         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
2873 }
2874
2875 static void enter_rmode(struct kvm_vcpu *vcpu)
2876 {
2877         unsigned long flags;
2878         struct vcpu_vmx *vmx = to_vmx(vcpu);
2879
2880         if (enable_unrestricted_guest)
2881                 return;
2882
2883         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2884         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2885         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2886         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2887         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2888         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2889         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2890
2891         vmx->emulation_required = 1;
2892         vmx->rmode.vm86_active = 1;
2893
2894         /*
2895          * Very old userspace does not call KVM_SET_TSS_ADDR before entering
2896          * the vcpu. Call it here with a physical address pointing 16M below 4G.
2897          */
2898         if (!vcpu->kvm->arch.tss_addr) {
2899                 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be "
2900                              "called before entering vcpu\n");
2901                 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
2902                 vmx_set_tss_addr(vcpu->kvm, 0xfeffd000);
2903                 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
2904         }
2905
2906         vmx_segment_cache_clear(vmx);
2907
2908         vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
2909         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
2910         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2911
2912         flags = vmcs_readl(GUEST_RFLAGS);
2913         vmx->rmode.save_rflags = flags;
2914
2915         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
2916
2917         vmcs_writel(GUEST_RFLAGS, flags);
2918         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
2919         update_exception_bitmap(vcpu);
2920
2921         fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2922         fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2923         fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2924         fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2925         fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2926         fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2927
2928         kvm_mmu_reset_context(vcpu);
2929 }
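
/*
 * Worked arithmetic for the fallback TSS address above: 16M below 4G is
 * 0x100000000 - 0x1000000 = 0xff000000, and the real-mode TSS spans
 * three pages (see rmode_tss_base() and vmx_set_tss_addr()), so
 * 0xfeffd000 = 0xff000000 - 3 * 0x1000 places it immediately below that
 * boundary.
 */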
2930
2931 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
2932 {
2933         struct vcpu_vmx *vmx = to_vmx(vcpu);
2934         struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
2935
2936         if (!msr)
2937                 return;
2938
2939         /*
2940          * Force kernel_gs_base reloading before EFER changes, as control
2941          * of this MSR depends on is_long_mode().
2942          */
2943         vmx_load_host_state(to_vmx(vcpu));
2944         vcpu->arch.efer = efer;
2945         if (efer & EFER_LMA) {
2946                 vmcs_write32(VM_ENTRY_CONTROLS,
2947                              vmcs_read32(VM_ENTRY_CONTROLS) |
2948                              VM_ENTRY_IA32E_MODE);
2949                 msr->data = efer;
2950         } else {
2951                 vmcs_write32(VM_ENTRY_CONTROLS,
2952                              vmcs_read32(VM_ENTRY_CONTROLS) &
2953                              ~VM_ENTRY_IA32E_MODE);
2954
2955                 msr->data = efer & ~EFER_LME;
2956         }
2957         setup_msrs(vmx);
2958 }
2959
2960 #ifdef CONFIG_X86_64
2961
2962 static void enter_lmode(struct kvm_vcpu *vcpu)
2963 {
2964         u32 guest_tr_ar;
2965
2966         vmx_segment_cache_clear(to_vmx(vcpu));
2967
2968         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
2969         if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
2970                 pr_debug_ratelimited("%s: tss fixup for long mode.\n",
2971                                      __func__);
2972                 vmcs_write32(GUEST_TR_AR_BYTES,
2973                              (guest_tr_ar & ~AR_TYPE_MASK)
2974                              | AR_TYPE_BUSY_64_TSS);
2975         }
2976         vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
2977 }
2978
2979 static void exit_lmode(struct kvm_vcpu *vcpu)
2980 {
2981         vmcs_write32(VM_ENTRY_CONTROLS,
2982                      vmcs_read32(VM_ENTRY_CONTROLS)
2983                      & ~VM_ENTRY_IA32E_MODE);
2984         vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
2985 }
2986
2987 #endif
2988
2989 static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
2990 {
2991         vpid_sync_context(to_vmx(vcpu));
2992         if (enable_ept) {
2993                 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2994                         return;
2995                 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
2996         }
2997 }
2998
2999 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
3000 {
3001         ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
3002
3003         vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
3004         vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
3005 }
3006
3007 static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
3008 {
3009         if (enable_ept && is_paging(vcpu))
3010                 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3011         __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
3012 }
3013
3014 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
3015 {
3016         ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
3017
3018         vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
3019         vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
3020 }
3021
3022 static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
3023 {
3024         if (!test_bit(VCPU_EXREG_PDPTR,
3025                       (unsigned long *)&vcpu->arch.regs_dirty))
3026                 return;
3027
3028         if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
3029                 vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]);
3030                 vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]);
3031                 vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]);
3032                 vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]);
3033         }
3034 }
3035
3036 static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3037 {
3038         if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
3039                 vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3040                 vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3041                 vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3042                 vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
3043         }
3044
3045         __set_bit(VCPU_EXREG_PDPTR,
3046                   (unsigned long *)&vcpu->arch.regs_avail);
3047         __set_bit(VCPU_EXREG_PDPTR,
3048                   (unsigned long *)&vcpu->arch.regs_dirty);
3049 }
3050
3051 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
3052
3053 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
3054                                         unsigned long cr0,
3055                                         struct kvm_vcpu *vcpu)
3056 {
3057         if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
3058                 vmx_decache_cr3(vcpu);
3059         if (!(cr0 & X86_CR0_PG)) {
3060                 /* From paging/starting to nonpaging */
3061                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
3062                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
3063                              (CPU_BASED_CR3_LOAD_EXITING |
3064                               CPU_BASED_CR3_STORE_EXITING));
3065                 vcpu->arch.cr0 = cr0;
3066                 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3067         } else if (!is_paging(vcpu)) {
3068                 /* From nonpaging to paging */
3069                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
3070                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
3071                              ~(CPU_BASED_CR3_LOAD_EXITING |
3072                                CPU_BASED_CR3_STORE_EXITING));
3073                 vcpu->arch.cr0 = cr0;
3074                 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3075         }
3076
3077         if (!(cr0 & X86_CR0_WP))
3078                 *hw_cr0 &= ~X86_CR0_WP;
3079 }
3080
3081 static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3082 {
3083         struct vcpu_vmx *vmx = to_vmx(vcpu);
3084         unsigned long hw_cr0;
3085
3086         if (enable_unrestricted_guest)
3087                 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST)
3088                         | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3089         else
3090                 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
3091
3092         if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3093                 enter_pmode(vcpu);
3094
3095         if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3096                 enter_rmode(vcpu);
3097
3098 #ifdef CONFIG_X86_64
3099         if (vcpu->arch.efer & EFER_LME) {
3100                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
3101                         enter_lmode(vcpu);
3102                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
3103                         exit_lmode(vcpu);
3104         }
3105 #endif
3106
3107         if (enable_ept)
3108                 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
3109
3110         if (!vcpu->fpu_active)
3111                 hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
3112
3113         vmcs_writel(CR0_READ_SHADOW, cr0);
3114         vmcs_writel(GUEST_CR0, hw_cr0);
3115         vcpu->arch.cr0 = cr0;
3116 }
3117
3118 static u64 construct_eptp(unsigned long root_hpa)
3119 {
3120         u64 eptp;
3121
3122         /* TODO: write the value read from the MSR */
3123         eptp = VMX_EPT_DEFAULT_MT |
3124                 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
3125         if (enable_ept_ad_bits)
3126                 eptp |= VMX_EPT_AD_ENABLE_BIT;
3127         eptp |= (root_hpa & PAGE_MASK);
3128
3129         return eptp;
3130 }
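
/*
 * Worked example for construct_eptp(), assuming the usual encodings
 * (memory type 6 = write-back in bits 2:0, page-walk length minus one
 * in bits 5:3, A/D enable in bit 6): a root_hpa of 0x12345000 yields an
 * EPTP of 0x1234501e, or 0x1234505e when EPT A/D bits are enabled.
 */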
3131
3132 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
3133 {
3134         unsigned long guest_cr3;
3135         u64 eptp;
3136
3137         guest_cr3 = cr3;
3138         if (enable_ept) {
3139                 eptp = construct_eptp(cr3);
3140                 vmcs_write64(EPT_POINTER, eptp);
3141                 guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
3142                         vcpu->kvm->arch.ept_identity_map_addr;
3143                 ept_load_pdptrs(vcpu);
3144         }
3145
3146         vmx_flush_tlb(vcpu);
3147         vmcs_writel(GUEST_CR3, guest_cr3);
3148 }
3149
3150 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3151 {
3152         unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
3153                     KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
3154
3155         if (cr4 & X86_CR4_VMXE) {
3156                 /*
3157                  * To use VMXON (and later other VMX instructions), a guest
3158                  * must first be able to turn on cr4.VMXE (see handle_vmon()).
3159                  * So basically the check on whether to allow nested VMX
3160                  * is here.
3161                  */
3162                 if (!nested_vmx_allowed(vcpu))
3163                         return 1;
3164         } else if (to_vmx(vcpu)->nested.vmxon)
3165                 return 1;
3166
3167         vcpu->arch.cr4 = cr4;
3168         if (enable_ept) {
3169                 if (!is_paging(vcpu)) {
3170                         hw_cr4 &= ~X86_CR4_PAE;
3171                         hw_cr4 |= X86_CR4_PSE;
3172                 } else if (!(cr4 & X86_CR4_PAE)) {
3173                         hw_cr4 &= ~X86_CR4_PAE;
3174                 }
3175         }
3176
3177         vmcs_writel(CR4_READ_SHADOW, cr4);
3178         vmcs_writel(GUEST_CR4, hw_cr4);
3179         return 0;
3180 }
3181
3182 static void vmx_get_segment(struct kvm_vcpu *vcpu,
3183                             struct kvm_segment *var, int seg)
3184 {
3185         struct vcpu_vmx *vmx = to_vmx(vcpu);
3186         u32 ar;
3187
3188         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3189                 *var = vmx->rmode.segs[seg];
3190                 if (seg == VCPU_SREG_TR
3191                     || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3192                         return;
3193                 var->base = vmx_read_guest_seg_base(vmx, seg);
3194                 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3195                 return;
3196         }
3197         var->base = vmx_read_guest_seg_base(vmx, seg);
3198         var->limit = vmx_read_guest_seg_limit(vmx, seg);
3199         var->selector = vmx_read_guest_seg_selector(vmx, seg);
3200         ar = vmx_read_guest_seg_ar(vmx, seg);
3201         if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
3202                 ar = 0;
3203         var->type = ar & 15;
3204         var->s = (ar >> 4) & 1;
3205         var->dpl = (ar >> 5) & 3;
3206         var->present = (ar >> 7) & 1;
3207         var->avl = (ar >> 12) & 1;
3208         var->l = (ar >> 13) & 1;
3209         var->db = (ar >> 14) & 1;
3210         var->g = (ar >> 15) & 1;
3211         var->unusable = (ar >> 16) & 1;
3212 }
3213
3214 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3215 {
3216         struct kvm_segment s;
3217
3218         if (to_vmx(vcpu)->rmode.vm86_active) {
3219                 vmx_get_segment(vcpu, &s, seg);
3220                 return s.base;
3221         }
3222         return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3223 }
3224
3225 static int vmx_get_cpl(struct kvm_vcpu *vcpu)
3226 {
3227         struct vcpu_vmx *vmx = to_vmx(vcpu);
3228
3229         if (!is_protmode(vcpu))
3230                 return 0;
3231
3232         if (!is_long_mode(vcpu)
3233             && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
3234                 return 3;
3235
3236         if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
3237                 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3238                 vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3;
3239         }
3240
3241         return vmx->cpl;
3242 }
3243
3245 static u32 vmx_segment_access_rights(struct kvm_segment *var)
3246 {
3247         u32 ar;
3248
3249         if (var->unusable || !var->present)
3250                 ar = 1 << 16;
3251         else {
3252                 ar = var->type & 15;
3253                 ar |= (var->s & 1) << 4;
3254                 ar |= (var->dpl & 3) << 5;
3255                 ar |= (var->present & 1) << 7;
3256                 ar |= (var->avl & 1) << 12;
3257                 ar |= (var->l & 1) << 13;
3258                 ar |= (var->db & 1) << 14;
3259                 ar |= (var->g & 1) << 15;
3260         }
3261
3262         return ar;
3263 }
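
/*
 * Illustrative sketch, not used by the driver: the packing above turns
 * a flat protected-mode data segment (type 3, S=1, DPL=0, P=1, DB=1,
 * G=1) into the familiar AR value 0xc093, while the real-mode segments
 * written by seg_setup() below use 0x93 (G=0, DB=0).
 */
static inline u32 flat_data_segment_ar_example(void)
{
        struct kvm_segment var = {
                .type = 3, .s = 1, .dpl = 0, .present = 1,
                .db = 1, .g = 1,
        };

        return vmx_segment_access_rights(&var);        /* == 0xc093 */
}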
3264
3265 static void vmx_set_segment(struct kvm_vcpu *vcpu,
3266                             struct kvm_segment *var, int seg)
3267 {
3268         struct vcpu_vmx *vmx = to_vmx(vcpu);
3269         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3270
3271         vmx_segment_cache_clear(vmx);
3272         __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3273
3274         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3275                 vmx->rmode.segs[seg] = *var;
3276                 if (seg == VCPU_SREG_TR)
3277                         vmcs_write16(sf->selector, var->selector);
3278                 else if (var->s)
3279                         fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3280                 goto out;
3281         }
3282
3283         vmcs_writel(sf->base, var->base);
3284         vmcs_write32(sf->limit, var->limit);
3285         vmcs_write16(sf->selector, var->selector);
3286
3287         /*
3288          *   Fix the "Accessed" bit in the AR field of segment registers for
3289          * older qemu binaries.
3290          *   The IA-32 architecture specifies that at processor reset the
3291          * "Accessed" bit in the AR field of segment registers is 1, but qemu
3292          * sets it to 0 in its userland code. This causes an invalid-guest-state
3293          * vmexit when "unrestricted guest" mode is turned on.
3294          *   A fix for this setup issue in cpu_reset is being pushed into the
3295          * qemu tree. Newer qemu binaries with that fix will not need this
3296          * kvm hack.
3297          */
3298         if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
3299                 var->type |= 0x1; /* Accessed */
3300
3301         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3302
3303 out:
3304         if (!vmx->emulation_required)
3305                 vmx->emulation_required = !guest_state_valid(vcpu);
3306 }
3307
3308 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3309 {
3310         u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
3311
3312         *db = (ar >> 14) & 1;
3313         *l = (ar >> 13) & 1;
3314 }
3315
3316 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3317 {
3318         dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3319         dt->address = vmcs_readl(GUEST_IDTR_BASE);
3320 }
3321
3322 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3323 {
3324         vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3325         vmcs_writel(GUEST_IDTR_BASE, dt->address);
3326 }
3327
3328 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3329 {
3330         dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3331         dt->address = vmcs_readl(GUEST_GDTR_BASE);
3332 }
3333
3334 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3335 {
3336         vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3337         vmcs_writel(GUEST_GDTR_BASE, dt->address);
3338 }
3339
3340 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3341 {
3342         struct kvm_segment var;
3343         u32 ar;
3344
3345         vmx_get_segment(vcpu, &var, seg);
3346         var.dpl = 0x3;
3347         if (seg == VCPU_SREG_CS)
3348                 var.type = 0x3;
3349         ar = vmx_segment_access_rights(&var);
3350
3351         if (var.base != (var.selector << 4))
3352                 return false;
3353         if (var.limit != 0xffff)
3354                 return false;
3355         if (ar != 0xf3)
3356                 return false;
3357
3358         return true;
3359 }
3360
3361 static bool code_segment_valid(struct kvm_vcpu *vcpu)
3362 {
3363         struct kvm_segment cs;
3364         unsigned int cs_rpl;
3365
3366         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3367         cs_rpl = cs.selector & SELECTOR_RPL_MASK;
3368
3369         if (cs.unusable)
3370                 return false;
3371         if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
3372                 return false;
3373         if (!cs.s)
3374                 return false;
3375         if (cs.type & AR_TYPE_WRITEABLE_MASK) {
3376                 if (cs.dpl > cs_rpl)
3377                         return false;
3378         } else {
3379                 if (cs.dpl != cs_rpl)
3380                         return false;
3381         }
3382         if (!cs.present)
3383                 return false;
3384
3385         /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
3386         return true;
3387 }
3388
3389 static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3390 {
3391         struct kvm_segment ss;
3392         unsigned int ss_rpl;
3393
3394         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3395         ss_rpl = ss.selector & SELECTOR_RPL_MASK;
3396
3397         if (ss.unusable)
3398                 return true;
3399         if (ss.type != 3 && ss.type != 7)
3400                 return false;
3401         if (!ss.s)
3402                 return false;
3403         if (ss.dpl != ss_rpl) /* DPL != RPL */
3404                 return false;
3405         if (!ss.present)
3406                 return false;
3407
3408         return true;
3409 }
3410
3411 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3412 {
3413         struct kvm_segment var;
3414         unsigned int rpl;
3415
3416         vmx_get_segment(vcpu, &var, seg);
3417         rpl = var.selector & SELECTOR_RPL_MASK;
3418
3419         if (var.unusable)
3420                 return true;
3421         if (!var.s)
3422                 return false;
3423         if (!var.present)
3424                 return false;
3425         if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
3426                 if (var.dpl < rpl) /* DPL < RPL */
3427                         return false;
3428         }
3429
3430         /* TODO: Add other members to kvm_segment_field to allow checking
3431          * for other access-rights flags.
3432          */
3433         return true;
3434 }
3435
3436 static bool tr_valid(struct kvm_vcpu *vcpu)
3437 {
3438         struct kvm_segment tr;
3439
3440         vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3441
3442         if (tr.unusable)
3443                 return false;
3444         if (tr.selector & SELECTOR_TI_MASK)     /* TI = 1 */
3445                 return false;
3446         if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3447                 return false;
3448         if (!tr.present)
3449                 return false;
3450
3451         return true;
3452 }
3453
3454 static bool ldtr_valid(struct kvm_vcpu *vcpu)
3455 {
3456         struct kvm_segment ldtr;
3457
3458         vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3459
3460         if (ldtr.unusable)
3461                 return true;
3462         if (ldtr.selector & SELECTOR_TI_MASK)   /* TI = 1 */
3463                 return false;
3464         if (ldtr.type != 2)
3465                 return false;
3466         if (!ldtr.present)
3467                 return false;
3468
3469         return true;
3470 }
3471
3472 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3473 {
3474         struct kvm_segment cs, ss;
3475
3476         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3477         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3478
3479         return ((cs.selector & SELECTOR_RPL_MASK) ==
3480                  (ss.selector & SELECTOR_RPL_MASK));
3481 }
3482
3483 /*
3484  * Check if guest state is valid. Returns true if valid, false if
3485  * not.
3486  * We assume that registers are always usable.
3487  */
3488 static bool guest_state_valid(struct kvm_vcpu *vcpu)
3489 {
3490         /* real mode guest state checks */
3491         if (!is_protmode(vcpu)) {
3492                 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3493                         return false;
3494                 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
3495                         return false;
3496                 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
3497                         return false;
3498                 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
3499                         return false;
3500                 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
3501                         return false;
3502                 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
3503                         return false;
3504         } else {
3505         /* protected mode guest state checks */
3506                 if (!cs_ss_rpl_check(vcpu))
3507                         return false;
3508                 if (!code_segment_valid(vcpu))
3509                         return false;
3510                 if (!stack_segment_valid(vcpu))
3511                         return false;
3512                 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
3513                         return false;
3514                 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
3515                         return false;
3516                 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
3517                         return false;
3518                 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
3519                         return false;
3520                 if (!tr_valid(vcpu))
3521                         return false;
3522                 if (!ldtr_valid(vcpu))
3523                         return false;
3524         }
3525         /* TODO:
3526          * - Add checks on RIP
3527          * - Add checks on RFLAGS
3528          */
3529
3530         return true;
3531 }
3532
3533 static int init_rmode_tss(struct kvm *kvm)
3534 {
3535         gfn_t fn;
3536         u16 data = 0;
3537         int r, idx, ret = 0;
3538
3539         idx = srcu_read_lock(&kvm->srcu);
3540         fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
3541         r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
3542         if (r < 0)
3543                 goto out;
3544         data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
3545         r = kvm_write_guest_page(kvm, fn++, &data,
3546                         TSS_IOPB_BASE_OFFSET, sizeof(u16));
3547         if (r < 0)
3548                 goto out;
3549         r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
3550         if (r < 0)
3551                 goto out;
3552         r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
3553         if (r < 0)
3554                 goto out;
3555         data = ~0;
3556         r = kvm_write_guest_page(kvm, fn, &data,
3557                                  RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
3558                                  sizeof(u8));
3559         if (r < 0)
3560                 goto out;
3561
3562         ret = 1;
3563 out:
3564         srcu_read_unlock(&kvm->srcu, idx);
3565         return ret;
3566 }
3567
3568 static int init_rmode_identity_map(struct kvm *kvm)
3569 {
3570         int i, idx, r, ret;
3571         pfn_t identity_map_pfn;
3572         u32 tmp;
3573
3574         if (!enable_ept)
3575                 return 1;
3576         if (unlikely(!kvm->arch.ept_identity_pagetable)) {
3577                 printk(KERN_ERR "EPT: identity-mapping pagetable "
3578                         "hasn't been allocated!\n");
3579                 return 0;
3580         }
3581         if (likely(kvm->arch.ept_identity_pagetable_done))
3582                 return 1;
3583         ret = 0;
3584         identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
3585         idx = srcu_read_lock(&kvm->srcu);
3586         r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
3587         if (r < 0)
3588                 goto out;
3589         /* Set up identity-mapping pagetable for EPT in real mode */
3590         for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
3591                 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
3592                         _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
3593                 r = kvm_write_guest_page(kvm, identity_map_pfn,
3594                                 &tmp, i * sizeof(tmp), sizeof(tmp));
3595                 if (r < 0)
3596                         goto out;
3597         }
3598         kvm->arch.ept_identity_pagetable_done = true;
3599         ret = 1;
3600 out:
3601         srcu_read_unlock(&kvm->srcu, idx);
3602         return ret;
3603 }
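
/*
 * Worked value for the identity-map entries above, assuming the usual
 * flag encodings (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
 * _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE == 0xe7): entry i maps the
 * 4MB chunk starting at i << 22, so entry 0 is 0x000000e7 and entry 1
 * is 0x004000e7.
 */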
3604
3605 static void seg_setup(int seg)
3606 {
3607         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3608         unsigned int ar;
3609
3610         vmcs_write16(sf->selector, 0);
3611         vmcs_writel(sf->base, 0);
3612         vmcs_write32(sf->limit, 0xffff);
3613         ar = 0x93;
3614         if (seg == VCPU_SREG_CS)
3615                 ar |= 0x08; /* code segment */
3616
3617         vmcs_write32(sf->ar_bytes, ar);
3618 }
3619
3620 static int alloc_apic_access_page(struct kvm *kvm)
3621 {
3622         struct page *page;
3623         struct kvm_userspace_memory_region kvm_userspace_mem;
3624         int r = 0;
3625
3626         mutex_lock(&kvm->slots_lock);
3627         if (kvm->arch.apic_access_page)
3628                 goto out;
3629         kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
3630         kvm_userspace_mem.flags = 0;
3631         kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
3632         kvm_userspace_mem.memory_size = PAGE_SIZE;
3633         r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
3634         if (r)
3635                 goto out;
3636
3637         page = gfn_to_page(kvm, 0xfee00);
3638         if (is_error_page(page)) {
3639                 r = -EFAULT;
3640                 goto out;
3641         }
3642
3643         kvm->arch.apic_access_page = page;
3644 out:
3645         mutex_unlock(&kvm->slots_lock);
3646         return r;
3647 }
3648
3649 static int alloc_identity_pagetable(struct kvm *kvm)
3650 {
3651         struct page *page;
3652         struct kvm_userspace_memory_region kvm_userspace_mem;
3653         int r = 0;
3654
3655         mutex_lock(&kvm->slots_lock);
3656         if (kvm->arch.ept_identity_pagetable)
3657                 goto out;
3658         kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
3659         kvm_userspace_mem.flags = 0;
3660         kvm_userspace_mem.guest_phys_addr =
3661                 kvm->arch.ept_identity_map_addr;
3662         kvm_userspace_mem.memory_size = PAGE_SIZE;
3663         r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
3664         if (r)
3665                 goto out;
3666
3667         page = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
3668         if (is_error_page(page)) {
3669                 r = -EFAULT;
3670                 goto out;
3671         }
3672
3673         kvm->arch.ept_identity_pagetable = page;
3674 out:
3675         mutex_unlock(&kvm->slots_lock);
3676         return r;
3677 }
3678
3679 static void allocate_vpid(struct vcpu_vmx *vmx)
3680 {
3681         int vpid;
3682
3683         vmx->vpid = 0;
3684         if (!enable_vpid)
3685                 return;
3686         spin_lock(&vmx_vpid_lock);
3687         vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
3688         if (vpid < VMX_NR_VPIDS) {
3689                 vmx->vpid = vpid;
3690                 __set_bit(vpid, vmx_vpid_bitmap);
3691         }
3692         spin_unlock(&vmx_vpid_lock);
3693 }
3694
3695 static void free_vpid(struct vcpu_vmx *vmx)
3696 {
3697         if (!enable_vpid)
3698                 return;
3699         spin_lock(&vmx_vpid_lock);
3700         if (vmx->vpid != 0)
3701                 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
3702         spin_unlock(&vmx_vpid_lock);
3703 }
3704
3705 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
3706 {
3707         int f = sizeof(unsigned long);
3708
3709         if (!cpu_has_vmx_msr_bitmap())
3710                 return;
3711
3712         /*
3713          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
3714          * have the write-low and read-high bitmap offsets the wrong way round.
3715          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
3716          */
3717         if (msr <= 0x1fff) {
3718                 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
3719                 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
3720         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
3721                 msr &= 0x1fff;
3722                 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
3723                 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
3724         }
3725 }
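
/*
 * Illustrative sketch, not used by the driver: which bit of the 4K
 * bitmap covers a given MSR's reads.  Low MSRs (0x00000000-0x00001fff)
 * live in the 1K region at offset 0x000 and the 0xc0000000-0xc0001fff
 * range in the region at 0x400; the corresponding write regions follow
 * at 0x800 and 0xc00.  E.g. MSR_GS_BASE (0xc0000101) is bit 0x101 of
 * the region at offset 0x400.
 */
static inline int msr_read_bitmap_bit_example(u32 msr)
{
        if (msr <= 0x1fff)
                return 0x000 * 8 + msr;
        if (msr >= 0xc0000000 && msr <= 0xc0001fff)
                return 0x400 * 8 + (msr & 0x1fff);
        return -1;        /* not covered by the bitmap */
}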
3726
3727 static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
3728 {
3729         if (!longmode_only)
3730                 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
3731         __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
3732 }
3733
3734 /*
3735  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
3736  * will not change in the lifetime of the guest.
3737  * Note that host-state that does change is set elsewhere. E.g., host-state
3738  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
3739  */
3740 static void vmx_set_constant_host_state(void)
3741 {
3742         u32 low32, high32;
3743         unsigned long tmpl;
3744         struct desc_ptr dt;
3745
3746         vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS);  /* 22.2.3 */
3747         vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
3748         vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
3749
3750         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
3751 #ifdef CONFIG_X86_64
3752         /*
3753          * Load null selectors, so we can avoid reloading them in
3754          * __vmx_load_host_state(), in case userspace uses the null selectors
3755          * too (the expected case).
3756          */
3757         vmcs_write16(HOST_DS_SELECTOR, 0);
3758         vmcs_write16(HOST_ES_SELECTOR, 0);
3759 #else
3760         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
3761         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
3762 #endif
3763         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
3764         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
3765
3766         native_store_idt(&dt);
3767         vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
3768
3769         vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
3770
3771         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
3772         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
3773         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
3774         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
3775
3776         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
3777                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
3778                 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
3779         }
3780 }
3781
3782 static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
3783 {
3784         vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
3785         if (enable_ept)
3786                 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
3787         if (is_guest_mode(&vmx->vcpu))
3788                 vmx->vcpu.arch.cr4_guest_owned_bits &=
3789                         ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
3790         vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
3791 }
3792
3793 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
3794 {
3795         u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
3796         if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
3797                 exec_control &= ~CPU_BASED_TPR_SHADOW;
3798 #ifdef CONFIG_X86_64
3799                 exec_control |= CPU_BASED_CR8_STORE_EXITING |
3800                                 CPU_BASED_CR8_LOAD_EXITING;
3801 #endif
3802         }
3803         if (!enable_ept)
3804                 exec_control |= CPU_BASED_CR3_STORE_EXITING |
3805                                 CPU_BASED_CR3_LOAD_EXITING  |
3806                                 CPU_BASED_INVLPG_EXITING;
3807         return exec_control;
3808 }
3809
3810 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3811 {
3812         u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
3813         if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
3814                 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
3815         if (vmx->vpid == 0)
3816                 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
3817         if (!enable_ept) {
3818                 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
3819                 enable_unrestricted_guest = 0;
3820                 /* Enabling INVPCID for non-EPT guests may cause a performance regression. */
3821                 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
3822         }
3823         if (!enable_unrestricted_guest)
3824                 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
3825         if (!ple_gap)
3826                 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
3827         return exec_control;
3828 }
3829
3830 static void ept_set_mmio_spte_mask(void)
3831 {
3832         /*
3833          * EPT Misconfigurations can be generated if the value of bits 2:0
3834          * of an EPT paging-structure entry is 110b (write/execute).
3835          * Also, magic bits (0xffull << 49) are set to quickly identify MMIO
3836          * SPTEs.
3837          */
3838         kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
3839 }
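
/*
 * Worked value for the mask above: (0xffull << 49) | 0x6ull ==
 * 0x01fe000000000006, i.e. write+execute (110b) in bits 2:0 plus the
 * magic tag in bits 56:49.
 */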
3840
3841 /*
3842  * Sets up the vmcs for emulated real mode.
3843  */
3844 static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
3845 {
3846 #ifdef CONFIG_X86_64
3847         unsigned long a;
3848 #endif
3849         int i;
3850
3851         /* I/O */
3852         vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
3853         vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
3854
3855         if (cpu_has_vmx_msr_bitmap())
3856                 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
3857
3858         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
3859
3860         /* Control */
3861         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
3862                 vmcs_config.pin_based_exec_ctrl);
3863
3864         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
3865
3866         if (cpu_has_secondary_exec_ctrls()) {
3867                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
3868                                 vmx_secondary_exec_control(vmx));
3869         }
3870
3871         if (ple_gap) {
3872                 vmcs_write32(PLE_GAP, ple_gap);
3873                 vmcs_write32(PLE_WINDOW, ple_window);
3874         }
3875
3876         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
3877         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
3878         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
3879
3880         vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
3881         vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
3882         vmx_set_constant_host_state();
3883 #ifdef CONFIG_X86_64
3884         rdmsrl(MSR_FS_BASE, a);
3885         vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
3886         rdmsrl(MSR_GS_BASE, a);
3887         vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
3888 #else
3889         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
3890         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
3891 #endif
3892
3893         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
3894         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
3895         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
3896         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
3897         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
3898
3899         if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
3900                 u32 msr_low, msr_high;
3901                 u64 host_pat;
3902                 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
3903                 host_pat = msr_low | ((u64) msr_high << 32);
3904                 /* Write the default value, following the host PAT */
3905                 vmcs_write64(GUEST_IA32_PAT, host_pat);
3906                 /* Keep arch.pat in sync with GUEST_IA32_PAT */
3907                 vmx->vcpu.arch.pat = host_pat;
3908         }
3909
3910         for (i = 0; i < NR_VMX_MSR; ++i) {
3911                 u32 index = vmx_msr_index[i];
3912                 u32 data_low, data_high;
3913                 int j = vmx->nmsrs;
3914
3915                 if (rdmsr_safe(index, &data_low, &data_high) < 0)
3916                         continue;
3917                 if (wrmsr_safe(index, data_low, data_high) < 0)
3918                         continue;
3919                 vmx->guest_msrs[j].index = i;
3920                 vmx->guest_msrs[j].data = 0;
3921                 vmx->guest_msrs[j].mask = -1ull;
3922                 ++vmx->nmsrs;
3923         }
3924
3925         vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
3926
3927         /* 22.2.1, 20.8.1 */
3928         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
3929
3930         vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
3931         set_cr4_guest_host_mask(vmx);
3932
3933         return 0;
3934 }
3935
3936 static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
3937 {
3938         struct vcpu_vmx *vmx = to_vmx(vcpu);
3939         u64 msr;
3940         int ret;
3941
3942         vmx->rmode.vm86_active = 0;
3943
3944         vmx->soft_vnmi_blocked = 0;
3945
3946         vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
3947         kvm_set_cr8(&vmx->vcpu, 0);
3948         msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
3949         if (kvm_vcpu_is_bsp(&vmx->vcpu))
3950                 msr |= MSR_IA32_APICBASE_BSP;
3951         kvm_set_apic_base(&vmx->vcpu, msr);
3952
3953         vmx_segment_cache_clear(vmx);
3954
3955         seg_setup(VCPU_SREG_CS);
3956         if (kvm_vcpu_is_bsp(&vmx->vcpu))
3957                 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
3958         else {
3959                 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
3960                 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
3961         }
3962
3963         seg_setup(VCPU_SREG_DS);
3964         seg_setup(VCPU_SREG_ES);
3965         seg_setup(VCPU_SREG_FS);
3966         seg_setup(VCPU_SREG_GS);
3967         seg_setup(VCPU_SREG_SS);
3968
3969         vmcs_write16(GUEST_TR_SELECTOR, 0);
3970         vmcs_writel(GUEST_TR_BASE, 0);
3971         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
3972         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
3973
3974         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
3975         vmcs_writel(GUEST_LDTR_BASE, 0);
3976         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
3977         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
3978
3979         vmcs_write32(GUEST_SYSENTER_CS, 0);
3980         vmcs_writel(GUEST_SYSENTER_ESP, 0);
3981         vmcs_writel(GUEST_SYSENTER_EIP, 0);
3982
3983         vmcs_writel(GUEST_RFLAGS, 0x02);
3984         if (kvm_vcpu_is_bsp(&vmx->vcpu))
3985                 kvm_rip_write(vcpu, 0xfff0);
3986         else
3987                 kvm_rip_write(vcpu, 0);
3988
3989         vmcs_writel(GUEST_GDTR_BASE, 0);
3990         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
3991
3992         vmcs_writel(GUEST_IDTR_BASE, 0);
3993         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
3994
3995         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
3996         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
3997         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
3998
3999         /* Special registers */
4000         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4001
4002         setup_msrs(vmx);
4003
4004         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
4005
4006         if (cpu_has_vmx_tpr_shadow()) {
4007                 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
4008                 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
4009                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
4010                                      __pa(vmx->vcpu.arch.apic->regs));
4011                 vmcs_write32(TPR_THRESHOLD, 0);
4012         }
4013
4014         if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
4015                 vmcs_write64(APIC_ACCESS_ADDR,
4016                              page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
4017
4018         if (vmx->vpid != 0)
4019                 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
4020
4021         vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
4022         vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4023         vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
4024         srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4025         vmx_set_cr4(&vmx->vcpu, 0);
4026         vmx_set_efer(&vmx->vcpu, 0);
4027         vmx_fpu_activate(&vmx->vcpu);
4028         update_exception_bitmap(&vmx->vcpu);
4029
4030         vpid_sync_context(vmx);
4031
4032         ret = 0;
4033
4034         /* HACK: Don't enable emulation on guest boot/reset */
4035         vmx->emulation_required = 0;
4036
4037         return ret;
4038 }
4039
4040 /*
4041  * In nested virtualization, check if L1 asked to exit on external interrupts.
4042  * For most existing hypervisors, this will always return true.
4043  */
4044 static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
4045 {
4046         return get_vmcs12(vcpu)->pin_based_vm_exec_control &
4047                 PIN_BASED_EXT_INTR_MASK;
4048 }
4049
4050 static void enable_irq_window(struct kvm_vcpu *vcpu)
4051 {
4052         u32 cpu_based_vm_exec_control;
4053         if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
4054                 /*
4055                  * We get here if vmx_interrupt_allowed() said we can't
4056                  * inject to L1 now because L2 must run. Ask L2 to exit
4057                  * right after entry, so we can inject to L1 more promptly.
4058                  */
4059                 kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
4060                 return;
4061         }
4062
4063         cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4064         cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
4065         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
4066 }
4067
4068 static void enable_nmi_window(struct kvm_vcpu *vcpu)
4069 {
4070         u32 cpu_based_vm_exec_control;
4071
4072         if (!cpu_has_virtual_nmis()) {
4073                 enable_irq_window(vcpu);
4074                 return;
4075         }
4076
4077         if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
4078                 enable_irq_window(vcpu);
4079                 return;
4080         }
4081         cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4082         cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
4083         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
4084 }
4085
4086 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
4087 {
4088         struct vcpu_vmx *vmx = to_vmx(vcpu);
4089         uint32_t intr;
4090         int irq = vcpu->arch.interrupt.nr;
4091
4092         trace_kvm_inj_virq(irq);
4093
4094         ++vcpu->stat.irq_injections;
4095         if (vmx->rmode.vm86_active) {
4096                 int inc_eip = 0;
4097                 if (vcpu->arch.interrupt.soft)
4098                         inc_eip = vcpu->arch.event_exit_inst_len;
4099                 if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
4100                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4101                 return;
4102         }
4103         intr = irq | INTR_INFO_VALID_MASK;
4104         if (vcpu->arch.interrupt.soft) {
4105                 intr |= INTR_TYPE_SOFT_INTR;
4106                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
4107                              vmx->vcpu.arch.event_exit_inst_len);
4108         } else
4109                 intr |= INTR_TYPE_EXT_INTR;
4110         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
4111 }
4112
4113 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4114 {
4115         struct vcpu_vmx *vmx = to_vmx(vcpu);
4116
4117         if (is_guest_mode(vcpu))
4118                 return;
4119
4120         if (!cpu_has_virtual_nmis()) {
4121                 /*
4122                  * Tracking the NMI-blocked state in software is built upon
4123                  * finding the next open IRQ window. This, in turn, depends on
4124                  * well-behaving guests: They have to keep IRQs disabled at
4125                  * least as long as the NMI handler runs. Otherwise we may
4126                  * cause NMI nesting, maybe breaking the guest. But as this is
4127                  * highly unlikely, we can live with the residual risk.
4128                  */
4129                 vmx->soft_vnmi_blocked = 1;
4130                 vmx->vnmi_blocked_time = 0;
4131         }
4132
4133         ++vcpu->stat.nmi_injections;
4134         vmx->nmi_known_unmasked = false;
4135         if (vmx->rmode.vm86_active) {
4136                 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
4137                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4138                 return;
4139         }
4140         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
4141                         INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
4142 }
4143
4144 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
4145 {
4146         if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
4147                 return 0;
4148
4149         return  !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4150                   (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
4151                    | GUEST_INTR_STATE_NMI));
4152 }
4153
4154 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
4155 {
4156         if (!cpu_has_virtual_nmis())
4157                 return to_vmx(vcpu)->soft_vnmi_blocked;
4158         if (to_vmx(vcpu)->nmi_known_unmasked)
4159                 return false;
4160         return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
4161 }
4162
4163 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4164 {
4165         struct vcpu_vmx *vmx = to_vmx(vcpu);
4166
4167         if (!cpu_has_virtual_nmis()) {
4168                 if (vmx->soft_vnmi_blocked != masked) {
4169                         vmx->soft_vnmi_blocked = masked;
4170                         vmx->vnmi_blocked_time = 0;
4171                 }
4172         } else {
4173                 vmx->nmi_known_unmasked = !masked;
4174                 if (masked)
4175                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
4176                                       GUEST_INTR_STATE_NMI);
4177                 else
4178                         vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
4179                                         GUEST_INTR_STATE_NMI);
4180         }
4181 }
4182
4183 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
4184 {
4185         if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
4186                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4187                 if (to_vmx(vcpu)->nested.nested_run_pending ||
4188                     (vmcs12->idt_vectoring_info_field &
4189                      VECTORING_INFO_VALID_MASK))
4190                         return 0;
4191                 nested_vmx_vmexit(vcpu);
4192                 vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
4193                 vmcs12->vm_exit_intr_info = 0;
4194                 /* fall through to normal code, but now in L1, not L2 */
4195         }
4196
4197         return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
4198                 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4199                         (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
4200 }
4201
4202 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
4203 {
4204         int ret;
4205         struct kvm_userspace_memory_region tss_mem = {
4206                 .slot = TSS_PRIVATE_MEMSLOT,
4207                 .guest_phys_addr = addr,
4208                 .memory_size = PAGE_SIZE * 3,
4209                 .flags = 0,
4210         };
4211
4212         ret = kvm_set_memory_region(kvm, &tss_mem, false);
4213         if (ret)
4214                 return ret;
4215         kvm->arch.tss_addr = addr;
4216         if (!init_rmode_tss(kvm))
4217                 return  -ENOMEM;
4218
4219         return 0;
4220 }
4221
4222 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
4223 {
4224         switch (vec) {
4225         case BP_VECTOR:
4226                 /*
4227                  * Update instruction length as we may reinject the exception
4228                  * from user space while in guest debugging mode.
4229                  */
4230                 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
4231                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4232                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
4233                         return false;
4234                 /* fall through */
4235         case DB_VECTOR:
4236                 if (vcpu->guest_debug &
4237                         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
4238                         return false;
4239                 /* fall through */
4240         case DE_VECTOR:
4241         case OF_VECTOR:
4242         case BR_VECTOR:
4243         case UD_VECTOR:
4244         case DF_VECTOR:
4245         case SS_VECTOR:
4246         case GP_VECTOR:
4247         case MF_VECTOR:
4248                 return true;
4250         }
4251         return false;
4252 }
4253
4254 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
4255                                   int vec, u32 err_code)
4256 {
4257         /*
4258          * An instruction with the address-size override prefix (opcode 0x67)
4259          * causes a #SS fault with error code 0 in VM86 mode.
4260          */
4261         if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
4262                 if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
4263                         if (vcpu->arch.halt_request) {
4264                                 vcpu->arch.halt_request = 0;
4265                                 return kvm_emulate_halt(vcpu);
4266                         }
4267                         return 1;
4268                 }
4269                 return 0;
4270         }
4271
4272         /*
4273          * Forward all other exceptions that are valid in real mode.
4274          * FIXME: Breaks guest debugging in real mode, needs to be fixed with
4275          *        the required debugging infrastructure rework.
4276          */
4277         kvm_queue_exception(vcpu, vec);
4278         return 1;
4279 }
4280
4281 /*
4282  * Trigger machine check on the host. We assume all the MSRs are already set up
4283  * by the CPU and that we still run on the same CPU as the MCE occurred on.
4284  * We pass a fake environment to the machine check handler because we want
4285  * the guest to be always treated like user space, no matter what context
4286  * it used internally.
4287  */
4288 static void kvm_machine_check(void)
4289 {
4290 #if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
4291         struct pt_regs regs = {
4292                 .cs = 3, /* Fake ring 3 no matter what the guest ran on */
4293                 .flags = X86_EFLAGS_IF,
4294         };
4295
4296         do_machine_check(&regs, 0);
4297 #endif
4298 }
4299
4300 static int handle_machine_check(struct kvm_vcpu *vcpu)
4301 {
4302         /* already handled by vcpu_run */
4303         return 1;
4304 }
4305
4306 static int handle_exception(struct kvm_vcpu *vcpu)
4307 {
4308         struct vcpu_vmx *vmx = to_vmx(vcpu);
4309         struct kvm_run *kvm_run = vcpu->run;
4310         u32 intr_info, ex_no, error_code;
4311         unsigned long cr2, rip, dr6;
4312         u32 vect_info;
4313         enum emulation_result er;
4314
4315         vect_info = vmx->idt_vectoring_info;
4316         intr_info = vmx->exit_intr_info;
4317
4318         if (is_machine_check(intr_info))
4319                 return handle_machine_check(vcpu);
4320
4321         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
4322                 return 1;  /* already handled by vmx_vcpu_run() */
4323
4324         if (is_no_device(intr_info)) {
4325                 vmx_fpu_activate(vcpu);
4326                 return 1;
4327         }
4328
4329         if (is_invalid_opcode(intr_info)) {
4330                 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
4331                 if (er != EMULATE_DONE)
4332                         kvm_queue_exception(vcpu, UD_VECTOR);
4333                 return 1;
4334         }
4335
4336         error_code = 0;
4337         if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
4338                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
4339
4340         /*
4341          * A #PF with PFEC.RSVD = 1 indicates that the guest is accessing
4342          * MMIO; it is better to report an internal error.
4343          * See the comments in vmx_handle_exit.
4344          */
4345         if ((vect_info & VECTORING_INFO_VALID_MASK) &&
4346             !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
4347                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4348                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
4349                 vcpu->run->internal.ndata = 2;
4350                 vcpu->run->internal.data[0] = vect_info;
4351                 vcpu->run->internal.data[1] = intr_info;
4352                 return 0;
4353         }
4354
4355         if (is_page_fault(intr_info)) {
4356                 /* EPT won't cause page fault directly */
4357                 BUG_ON(enable_ept);
4358                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
4359                 trace_kvm_page_fault(cr2, error_code);
4360
4361                 if (kvm_event_needs_reinjection(vcpu))
4362                         kvm_mmu_unprotect_page_virt(vcpu, cr2);
4363                 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
4364         }
4365
4366         ex_no = intr_info & INTR_INFO_VECTOR_MASK;
4367
4368         if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
4369                 return handle_rmode_exception(vcpu, ex_no, error_code);
4370
4371         switch (ex_no) {
4372         case DB_VECTOR:
4373                 dr6 = vmcs_readl(EXIT_QUALIFICATION);
4374                 if (!(vcpu->guest_debug &
4375                       (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
4376                         vcpu->arch.dr6 = dr6 | DR6_FIXED_1;
4377                         kvm_queue_exception(vcpu, DB_VECTOR);
4378                         return 1;
4379                 }
4380                 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
4381                 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
4382                 /* fall through */
4383         case BP_VECTOR:
4384                 /*
4385                  * Update instruction length as we may reinject #BP from
4386                  * user space while in guest debugging mode. Reading it for
4387                  * #DB as well causes no harm; it is not used in that case.
4388                  */
4389                 vmx->vcpu.arch.event_exit_inst_len =
4390                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4391                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
4392                 rip = kvm_rip_read(vcpu);
4393                 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
4394                 kvm_run->debug.arch.exception = ex_no;
4395                 break;
4396         default:
4397                 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
4398                 kvm_run->ex.exception = ex_no;
4399                 kvm_run->ex.error_code = error_code;
4400                 break;
4401         }
4402         return 0;
4403 }
4404
4405 static int handle_external_interrupt(struct kvm_vcpu *vcpu)
4406 {
4407         ++vcpu->stat.irq_exits;
4408         return 1;
4409 }
4410
4411 static int handle_triple_fault(struct kvm_vcpu *vcpu)
4412 {
4413         vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
4414         return 0;
4415 }
4416
4417 static int handle_io(struct kvm_vcpu *vcpu)
4418 {
4419         unsigned long exit_qualification;
4420         int size, in, string;
4421         unsigned port;
4422
4423         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4424         string = (exit_qualification & 16) != 0;
4425         in = (exit_qualification & 8) != 0;
4426
4427         ++vcpu->stat.io_exits;
4428
4429         if (string || in)
4430                 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
4431
4432         port = exit_qualification >> 16;
4433         size = (exit_qualification & 7) + 1;
4434         skip_emulated_instruction(vcpu);
4435
4436         return kvm_fast_pio_out(vcpu, size, port);
4437 }
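
/*
 * Illustrative sketch, not used by the driver: how handle_io() above
 * interprets the I/O-instruction exit qualification. The layout relied on
 * above is bits 2:0 = access size - 1, bit 3 = direction (1 = IN),
 * bit 4 = string instruction, bits 31:16 = port number. The helper name
 * is hypothetical.
 */
static inline void example_decode_io_exit_qual(unsigned long qual, int *size,
					       int *in, int *string,
					       unsigned *port)
{
	*size   = (qual & 7) + 1;	/* access size in bytes (1, 2 or 4) */
	*in     = (qual & 8) != 0;	/* 1 = IN, 0 = OUT */
	*string = (qual & 16) != 0;	/* INS/OUTS needs full emulation */
	*port   = qual >> 16;		/* I/O port number */
	/* e.g. qual == 0x03f80001 decodes to a 2-byte OUT to port 0x3f8 */
}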
4438
4439 static void
4440 vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4441 {
4442         /*
4443          * Patch in the VMCALL instruction:
4444          */
4445         hypercall[0] = 0x0f;
4446         hypercall[1] = 0x01;
4447         hypercall[2] = 0xc1;
4448 }
4449
4450 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
4451 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
4452 {
4453         if (to_vmx(vcpu)->nested.vmxon &&
4454             ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
4455                 return 1;
4456
4457         if (is_guest_mode(vcpu)) {
4458                 /*
4459                  * We get here when L2 changed cr0 in a way that did not change
4460                  * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
4461                  * but did change L0 shadowed bits. This can currently happen
4462                  * with the TS bit: L0 may want to leave TS on (for lazy fpu
4463                  * loading) while pretending to allow the guest to change it.
4464                  */
4465                 if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) |
4466                          (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits)))
4467                         return 1;
4468                 vmcs_writel(CR0_READ_SHADOW, val);
4469                 return 0;
4470         } else
4471                 return kvm_set_cr0(vcpu, val);
4472 }
4473
4474 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
4475 {
4476         if (is_guest_mode(vcpu)) {
4477                 if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) |
4478                          (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits)))
4479                         return 1;
4480                 vmcs_writel(CR4_READ_SHADOW, val);
4481                 return 0;
4482         } else
4483                 return kvm_set_cr4(vcpu, val);
4484 }
4485
4486 /* called to set cr0 as appropriate for a clts instruction exit. */
4487 static void handle_clts(struct kvm_vcpu *vcpu)
4488 {
4489         if (is_guest_mode(vcpu)) {
4490                 /*
4491                  * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
4492                  * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
4493                  * but pretend it is off (also in arch.cr0 for fpu_activate).
4494                  */
4495                 vmcs_writel(CR0_READ_SHADOW,
4496                         vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
4497                 vcpu->arch.cr0 &= ~X86_CR0_TS;
4498         } else
4499                 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
4500 }
4501
4502 static int handle_cr(struct kvm_vcpu *vcpu)
4503 {
4504         unsigned long exit_qualification, val;
4505         int cr;
4506         int reg;
4507         int err;
4508
4509         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4510         cr = exit_qualification & 15;
4511         reg = (exit_qualification >> 8) & 15;
4512         switch ((exit_qualification >> 4) & 3) {
4513         case 0: /* mov to cr */
4514                 val = kvm_register_read(vcpu, reg);
4515                 trace_kvm_cr_write(cr, val);
4516                 switch (cr) {
4517                 case 0:
4518                         err = handle_set_cr0(vcpu, val);
4519                         kvm_complete_insn_gp(vcpu, err);
4520                         return 1;
4521                 case 3:
4522                         err = kvm_set_cr3(vcpu, val);
4523                         kvm_complete_insn_gp(vcpu, err);
4524                         return 1;
4525                 case 4:
4526                         err = handle_set_cr4(vcpu, val);
4527                         kvm_complete_insn_gp(vcpu, err);
4528                         return 1;
4529                 case 8: {
4530                                 u8 cr8_prev = kvm_get_cr8(vcpu);
4531                                 u8 cr8 = kvm_register_read(vcpu, reg);
4532                                 err = kvm_set_cr8(vcpu, cr8);
4533                                 kvm_complete_insn_gp(vcpu, err);
4534                                 if (irqchip_in_kernel(vcpu->kvm))
4535                                         return 1;
4536                                 if (cr8_prev <= cr8)
4537                                         return 1;
4538                                 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
4539                                 return 0;
4540                         }
4541                 }
4542                 break;
4543         case 2: /* clts */
4544                 handle_clts(vcpu);
4545                 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
4546                 skip_emulated_instruction(vcpu);
4547                 vmx_fpu_activate(vcpu);
4548                 return 1;
4549         case 1: /*mov from cr*/
4550                 switch (cr) {
4551                 case 3:
4552                         val = kvm_read_cr3(vcpu);
4553                         kvm_register_write(vcpu, reg, val);
4554                         trace_kvm_cr_read(cr, val);
4555                         skip_emulated_instruction(vcpu);
4556                         return 1;
4557                 case 8:
4558                         val = kvm_get_cr8(vcpu);
4559                         kvm_register_write(vcpu, reg, val);
4560                         trace_kvm_cr_read(cr, val);
4561                         skip_emulated_instruction(vcpu);
4562                         return 1;
4563                 }
4564                 break;
4565         case 3: /* lmsw */
4566                 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
4567                 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
4568                 kvm_lmsw(vcpu, val);
4569
4570                 skip_emulated_instruction(vcpu);
4571                 return 1;
4572         default:
4573                 break;
4574         }
4575         vcpu->run->exit_reason = 0;
4576         vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
4577                (int)(exit_qualification >> 4) & 3, cr);
4578         return 0;
4579 }
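
/*
 * Illustrative sketch, not used by the driver: the CR-access exit
 * qualification layout that handle_cr() above relies on. Bits 3:0 hold the
 * control register number, bits 5:4 the access type (0 = MOV to CR,
 * 1 = MOV from CR, 2 = CLTS, 3 = LMSW), bits 11:8 the general-purpose
 * register operand, and bits 31:16 the LMSW source data. The helper name
 * is hypothetical.
 */
static inline void example_decode_cr_exit_qual(unsigned long qual, int *cr,
					       int *type, int *reg)
{
	*cr   = qual & 15;		/* which control register */
	*type = (qual >> 4) & 3;	/* mov to/from cr, clts or lmsw */
	*reg  = (qual >> 8) & 15;	/* GPR used as source/destination */
	/* e.g. qual == 0x13 is "MOV from CR3 into RAX" (cr=3, type=1, reg=0) */
}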
4580
4581 static int handle_dr(struct kvm_vcpu *vcpu)
4582 {
4583         unsigned long exit_qualification;
4584         int dr, reg;
4585
4586         /* Do not handle if the CPL > 0, will trigger GP on re-entry */
4587         if (!kvm_require_cpl(vcpu, 0))
4588                 return 1;
4589         dr = vmcs_readl(GUEST_DR7);
4590         if (dr & DR7_GD) {
4591                 /*
4592                  * As the vm-exit takes precedence over the debug trap, we
4593                  * need to emulate the latter, either for the host or the
4594                  * guest debugging itself.
4595                  */
4596                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
4597                         vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
4598                         vcpu->run->debug.arch.dr7 = dr;
4599                         vcpu->run->debug.arch.pc =
4600                                 vmcs_readl(GUEST_CS_BASE) +
4601                                 vmcs_readl(GUEST_RIP);
4602                         vcpu->run->debug.arch.exception = DB_VECTOR;
4603                         vcpu->run->exit_reason = KVM_EXIT_DEBUG;
4604                         return 0;
4605                 } else {
4606                         vcpu->arch.dr7 &= ~DR7_GD;
4607                         vcpu->arch.dr6 |= DR6_BD;
4608                         vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
4609                         kvm_queue_exception(vcpu, DB_VECTOR);
4610                         return 1;
4611                 }
4612         }
4613
4614         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4615         dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
4616         reg = DEBUG_REG_ACCESS_REG(exit_qualification);
4617         if (exit_qualification & TYPE_MOV_FROM_DR) {
4618                 unsigned long val;
4619                 if (!kvm_get_dr(vcpu, dr, &val))
4620                         kvm_register_write(vcpu, reg, val);
4621         } else
4622                 kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
4623         skip_emulated_instruction(vcpu);
4624         return 1;
4625 }
4626
4627 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
4628 {
4629         vmcs_writel(GUEST_DR7, val);
4630 }
4631
4632 static int handle_cpuid(struct kvm_vcpu *vcpu)
4633 {
4634         kvm_emulate_cpuid(vcpu);
4635         return 1;
4636 }
4637
4638 static int handle_rdmsr(struct kvm_vcpu *vcpu)
4639 {
4640         u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
4641         u64 data;
4642
4643         if (vmx_get_msr(vcpu, ecx, &data)) {
4644                 trace_kvm_msr_read_ex(ecx);
4645                 kvm_inject_gp(vcpu, 0);
4646                 return 1;
4647         }
4648
4649         trace_kvm_msr_read(ecx, data);
4650
4651         /* FIXME: handling of bits 32:63 of rax, rdx */
4652         vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
4653         vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
4654         skip_emulated_instruction(vcpu);
4655         return 1;
4656 }
4657
4658 static int handle_wrmsr(struct kvm_vcpu *vcpu)
4659 {
4660         struct msr_data msr;
4661         u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
4662         u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
4663                 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
4664
4665         msr.data = data;
4666         msr.index = ecx;
4667         msr.host_initiated = false;
4668         if (vmx_set_msr(vcpu, &msr) != 0) {
4669                 trace_kvm_msr_write_ex(ecx, data);
4670                 kvm_inject_gp(vcpu, 0);
4671                 return 1;
4672         }
4673
4674         trace_kvm_msr_write(ecx, data);
4675         skip_emulated_instruction(vcpu);
4676         return 1;
4677 }
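
/*
 * Illustrative sketch, not used by the driver: handle_rdmsr() and
 * handle_wrmsr() above split and combine the 64-bit MSR value across
 * EDX:EAX exactly as the RDMSR/WRMSR instructions do. The helper names
 * are hypothetical.
 */
static inline u64 example_edx_eax_to_msr_value(u32 eax, u32 edx)
{
	/* e.g. eax = 0x00000d01, edx = 0 yields 0xd01 (a typical EFER value) */
	return (u64)eax | ((u64)edx << 32);
}

static inline void example_msr_value_to_edx_eax(u64 data, u32 *eax, u32 *edx)
{
	*eax = data & 0xffffffff;	/* low 32 bits go back in RAX */
	*edx = data >> 32;		/* high 32 bits go back in RDX */
}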
4678
4679 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
4680 {
4681         kvm_make_request(KVM_REQ_EVENT, vcpu);
4682         return 1;
4683 }
4684
4685 static int handle_interrupt_window(struct kvm_vcpu *vcpu)
4686 {
4687         u32 cpu_based_vm_exec_control;
4688
4689         /* clear pending irq */
4690         cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4691         cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
4692         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
4693
4694         kvm_make_request(KVM_REQ_EVENT, vcpu);
4695
4696         ++vcpu->stat.irq_window_exits;
4697
4698         /*
4699          * If user space is waiting to inject interrupts, exit to it as
4700          * soon as possible.
4701          */
4702         if (!irqchip_in_kernel(vcpu->kvm) &&
4703             vcpu->run->request_interrupt_window &&
4704             !kvm_cpu_has_interrupt(vcpu)) {
4705                 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
4706                 return 0;
4707         }
4708         return 1;
4709 }
4710
4711 static int handle_halt(struct kvm_vcpu *vcpu)
4712 {
4713         skip_emulated_instruction(vcpu);
4714         return kvm_emulate_halt(vcpu);
4715 }
4716
4717 static int handle_vmcall(struct kvm_vcpu *vcpu)
4718 {
4719         skip_emulated_instruction(vcpu);
4720         kvm_emulate_hypercall(vcpu);
4721         return 1;
4722 }
4723
4724 static int handle_invd(struct kvm_vcpu *vcpu)
4725 {
4726         return emulate_instruction(vcpu, 0) == EMULATE_DONE;
4727 }
4728
4729 static int handle_invlpg(struct kvm_vcpu *vcpu)
4730 {
4731         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4732
4733         kvm_mmu_invlpg(vcpu, exit_qualification);
4734         skip_emulated_instruction(vcpu);
4735         return 1;
4736 }
4737
4738 static int handle_rdpmc(struct kvm_vcpu *vcpu)
4739 {
4740         int err;
4741
4742         err = kvm_rdpmc(vcpu);
4743         kvm_complete_insn_gp(vcpu, err);
4744
4745         return 1;
4746 }
4747
4748 static int handle_wbinvd(struct kvm_vcpu *vcpu)
4749 {
4750         skip_emulated_instruction(vcpu);
4751         kvm_emulate_wbinvd(vcpu);
4752         return 1;
4753 }
4754
4755 static int handle_xsetbv(struct kvm_vcpu *vcpu)
4756 {
4757         u64 new_bv = kvm_read_edx_eax(vcpu);
4758         u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
4759
4760         if (kvm_set_xcr(vcpu, index, new_bv) == 0)
4761                 skip_emulated_instruction(vcpu);
4762         return 1;
4763 }
4764
4765 static int handle_apic_access(struct kvm_vcpu *vcpu)
4766 {
4767         if (likely(fasteoi)) {
4768                 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4769                 int access_type, offset;
4770
4771                 access_type = exit_qualification & APIC_ACCESS_TYPE;
4772                 offset = exit_qualification & APIC_ACCESS_OFFSET;
4773                 /*
4774                  * A sane guest uses MOV to write the EOI register; the
4775                  * written value does not matter, so short-circuit here and
4776                  * avoid the heavy instruction emulation.
4777                  */
4778                 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
4779                     (offset == APIC_EOI)) {
4780                         kvm_lapic_set_eoi(vcpu);
4781                         skip_emulated_instruction(vcpu);
4782                         return 1;
4783                 }
4784         }
4785         return emulate_instruction(vcpu, 0) == EMULATE_DONE;
4786 }
4787
4788 static int handle_task_switch(struct kvm_vcpu *vcpu)
4789 {
4790         struct vcpu_vmx *vmx = to_vmx(vcpu);
4791         unsigned long exit_qualification;
4792         bool has_error_code = false;
4793         u32 error_code = 0;
4794         u16 tss_selector;
4795         int reason, type, idt_v, idt_index;
4796
4797         idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
4798         idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
4799         type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
4800
4801         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4802
4803         reason = (u32)exit_qualification >> 30;
4804         if (reason == TASK_SWITCH_GATE && idt_v) {
4805                 switch (type) {
4806                 case INTR_TYPE_NMI_INTR:
4807                         vcpu->arch.nmi_injected = false;
4808                         vmx_set_nmi_mask(vcpu, true);
4809                         break;
4810                 case INTR_TYPE_EXT_INTR:
4811                 case INTR_TYPE_SOFT_INTR:
4812                         kvm_clear_interrupt_queue(vcpu);
4813                         break;
4814                 case INTR_TYPE_HARD_EXCEPTION:
4815                         if (vmx->idt_vectoring_info &
4816                             VECTORING_INFO_DELIVER_CODE_MASK) {
4817                                 has_error_code = true;
4818                                 error_code =
4819                                         vmcs_read32(IDT_VECTORING_ERROR_CODE);
4820                         }
4821                         /* fall through */
4822                 case INTR_TYPE_SOFT_EXCEPTION:
4823                         kvm_clear_exception_queue(vcpu);
4824                         break;
4825                 default:
4826                         break;
4827                 }
4828         }
4829         tss_selector = exit_qualification;
4830
4831         if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
4832                        type != INTR_TYPE_EXT_INTR &&
4833                        type != INTR_TYPE_NMI_INTR))
4834                 skip_emulated_instruction(vcpu);
4835
4836         if (kvm_task_switch(vcpu, tss_selector,
4837                             type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
4838                             has_error_code, error_code) == EMULATE_FAIL) {
4839                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4840                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
4841                 vcpu->run->internal.ndata = 0;
4842                 return 0;
4843         }
4844
4845         /* clear all local breakpoint enable flags */
4846         vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55);
4847
4848         /*
4849          * TODO: What about debug traps on tss switch?
4850          *       Are we supposed to inject them and update dr6?
4851          */
4852
4853         return 1;
4854 }
4855
4856 static int handle_ept_violation(struct kvm_vcpu *vcpu)
4857 {
4858         unsigned long exit_qualification;
4859         gpa_t gpa;
4860         u32 error_code;
4861         int gla_validity;
4862
4863         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4864
4865         gla_validity = (exit_qualification >> 7) & 0x3;
4866         if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
4867                 printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
4868                 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
4869                         (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
4870                         vmcs_readl(GUEST_LINEAR_ADDRESS));
4871                 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
4872                         (long unsigned int)exit_qualification);
4873                 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
4874                 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
4875                 return 0;
4876         }
4877
4878         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
4879         trace_kvm_page_fault(gpa, exit_qualification);
4880
4881         /* Is it a write fault? */
4882         error_code = exit_qualification & (1U << 1);
4883         /* Is the EPT page-table entry present? */
4884         error_code |= (exit_qualification >> 3) & 0x1;
4885
4886         return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
4887 }
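
/*
 * Illustrative sketch, not used by the driver: the error-code mapping done
 * by handle_ept_violation() above. In the EPT-violation exit qualification,
 * bit 1 means the access was a write and bit 3 reflects whether the
 * guest-physical address was mapped readable; these are folded into the
 * write and present bits of a #PF-style error code for
 * kvm_mmu_page_fault(). The helper name is hypothetical.
 */
static inline u32 example_ept_violation_error_code(unsigned long qual)
{
	u32 error_code;

	error_code  = qual & (1U << 1);		/* write access */
	error_code |= (qual >> 3) & 0x1;	/* translation was present */
	/* e.g. qual == 0x2 (write to an unmapped gpa) yields error_code 0x2 */
	return error_code;
}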
4888
4889 static u64 ept_rsvd_mask(u64 spte, int level)
4890 {
4891         int i;
4892         u64 mask = 0;
4893
4894         for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
4895                 mask |= (1ULL << i);
4896
4897         if (level > 2)
4898                 /* bits 7:3 reserved */
4899                 mask |= 0xf8;
4900         else if (level == 2) {
4901                 if (spte & (1ULL << 7))
4902                         /* 2MB page, bits 20:12 reserved */
4903                         mask |= 0x1ff000;
4904                 else
4905                         /* bits 6:3 reserved */
4906                         mask |= 0x78;
4907         }
4908
4909         return mask;
4910 }
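
/*
 * Illustrative worked example, not used by the driver, for ept_rsvd_mask()
 * above, assuming a hypothetical CPU with boot_cpu_data.x86_phys_bits == 40:
 *
 *   - the loop sets bits 51:41, i.e. 0x000ffe0000000000
 *   - for a level > 2 entry, bits 7:3 are added:           | 0xf8
 *   - for a level 2 entry with bit 7 set (a 2MB page),
 *     bits 20:12 are added instead:                        | 0x1ff000
 *
 * so a level-2 2MB mapping on such a CPU would use the reserved-bit mask
 * 0x000ffe00001ff000.
 */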
4911
4912 static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
4913                                        int level)
4914 {
4915         printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level);
4916
4917         /* 010b (write-only) */
4918         WARN_ON((spte & 0x7) == 0x2);
4919
4920         /* 110b (write/execute) */
4921         WARN_ON((spte & 0x7) == 0x6);
4922
4923         /* 100b (execute-only) and value not supported by logical processor */
4924         if (!cpu_has_vmx_ept_execute_only())
4925                 WARN_ON((spte & 0x7) == 0x4);
4926
4927         /* not 000b */
4928         if ((spte & 0x7)) {
4929                 u64 rsvd_bits = spte & ept_rsvd_mask(spte, level);
4930
4931                 if (rsvd_bits != 0) {
4932                         printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n",
4933                                          __func__, rsvd_bits);
4934                         WARN_ON(1);
4935                 }
4936
4937                 if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
4938                         u64 ept_mem_type = (spte & 0x38) >> 3;
4939
4940                         if (ept_mem_type == 2 || ept_mem_type == 3 ||
4941                             ept_mem_type == 7) {
4942                                 printk(KERN_ERR "%s: ept_mem_type=0x%llx\n",
4943                                                 __func__, ept_mem_type);
4944                                 WARN_ON(1);
4945                         }
4946                 }
4947         }
4948 }
4949
4950 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
4951 {
4952         u64 sptes[4];
4953         int nr_sptes, i, ret;
4954         gpa_t gpa;
4955
4956         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
4957
4958         ret = handle_mmio_page_fault_common(vcpu, gpa, true);
4959         if (likely(ret == 1))
4960                 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
4961                                               EMULATE_DONE;
4962         if (unlikely(!ret))
4963                 return 1;
4964
4965         /* It is a real EPT misconfig */
4966         printk(KERN_ERR "EPT: Misconfiguration.\n");
4967         printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
4968
4969         nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
4970
4971         for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
4972                 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
4973
4974         vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
4975         vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
4976
4977         return 0;
4978 }
4979
4980 static int handle_nmi_window(struct kvm_vcpu *vcpu)
4981 {
4982         u32 cpu_based_vm_exec_control;
4983
4984         /* clear pending NMI */
4985         cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4986         cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
4987         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
4988         ++vcpu->stat.nmi_window_exits;
4989         kvm_make_request(KVM_REQ_EVENT, vcpu);
4990
4991         return 1;
4992 }
4993
4994 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
4995 {
4996         struct vcpu_vmx *vmx = to_vmx(vcpu);
4997         enum emulation_result err = EMULATE_DONE;
4998         int ret = 1;
4999         u32 cpu_exec_ctrl;
5000         bool intr_window_requested;
5001         unsigned count = 130;
5002
5003         cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5004         intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
5005
5006         while (!guest_state_valid(vcpu) && count-- != 0) {
5007                 if (intr_window_requested && vmx_interrupt_allowed(vcpu))
5008                         return handle_interrupt_window(&vmx->vcpu);
5009
5010                 if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
5011                         return 1;
5012
5013                 err = emulate_instruction(vcpu, 0);
5014
5015                 if (err == EMULATE_DO_MMIO) {
5016                         ret = 0;
5017                         goto out;
5018                 }
5019
5020                 if (err != EMULATE_DONE) {
5021                         vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5022                         vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
5023                         vcpu->run->internal.ndata = 0;
5024                         return 0;
5025                 }
5026
5027                 if (signal_pending(current))
5028                         goto out;
5029                 if (need_resched())
5030                         schedule();
5031         }
5032
5033         vmx->emulation_required = !guest_state_valid(vcpu);
5034 out:
5035         return ret;
5036 }
5037
5038 /*
5039  * Indicate that the vcpu is busy-waiting on a spinlock. We do not enable
5040  * plain PAUSE exiting, so we only get here on CPUs with PAUSE-loop exiting.
5041  */
5042 static int handle_pause(struct kvm_vcpu *vcpu)
5043 {
5044         skip_emulated_instruction(vcpu);
5045         kvm_vcpu_on_spin(vcpu);
5046
5047         return 1;
5048 }
5049
5050 static int handle_invalid_op(struct kvm_vcpu *vcpu)
5051 {
5052         kvm_queue_exception(vcpu, UD_VECTOR);
5053         return 1;
5054 }
5055
5056 /*
5057  * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
5058  * We could reuse a single VMCS for all the L2 guests, but we also want the
5059  * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
5060  * allows keeping them loaded on the processor, and in the future will allow
5061  * optimizations where prepare_vmcs02 doesn't need to set all the fields on
5062  * every entry if they never change.
5063  * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
5064  * (>=0) with a vmcs02 for each recently loaded vmcs12, most recent first.
5065  *
5066  * The following functions allocate and free a vmcs02 in this pool.
5067  */
5068
5069 /* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
5070 static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
5071 {
5072         struct vmcs02_list *item;
5073         list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
5074                 if (item->vmptr == vmx->nested.current_vmptr) {
5075                         list_move(&item->list, &vmx->nested.vmcs02_pool);
5076                         return &item->vmcs02;
5077                 }
5078
5079         if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
5080                 /* Recycle the least recently used VMCS. */
5081                 item = list_entry(vmx->nested.vmcs02_pool.prev,
5082                         struct vmcs02_list, list);
5083                 item->vmptr = vmx->nested.current_vmptr;
5084                 list_move(&item->list, &vmx->nested.vmcs02_pool);
5085                 return &item->vmcs02;
5086         }
5087
5088         /* Create a new VMCS */
5089         item = (struct vmcs02_list *)
5090                 kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
5091         if (!item)
5092                 return NULL;
5093         item->vmcs02.vmcs = alloc_vmcs();
5094         if (!item->vmcs02.vmcs) {
5095                 kfree(item);
5096                 return NULL;
5097         }
5098         loaded_vmcs_init(&item->vmcs02);
5099         item->vmptr = vmx->nested.current_vmptr;
5100         list_add(&(item->list), &(vmx->nested.vmcs02_pool));
5101         vmx->nested.vmcs02_num++;
5102         return &item->vmcs02;
5103 }
5104
5105 /* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
5106 static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
5107 {
5108         struct vmcs02_list *item;
5109         list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
5110                 if (item->vmptr == vmptr) {
5111                         free_loaded_vmcs(&item->vmcs02);
5112                         list_del(&item->list);
5113                         kfree(item);
5114                         vmx->nested.vmcs02_num--;
5115                         return;
5116                 }
5117 }
5118
5119 /*
5120  * Free all VMCSs saved for this vcpu, except the one pointed to by
5121  * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one
5122  * currently used, if running L2), and vmcs01 when running L2.
5123  */
5124 static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
5125 {
5126         struct vmcs02_list *item, *n;
5127         list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
5128                 if (vmx->loaded_vmcs != &item->vmcs02)
5129                         free_loaded_vmcs(&item->vmcs02);
5130                 list_del(&item->list);
5131                 kfree(item);
5132         }
5133         vmx->nested.vmcs02_num = 0;
5134
5135         if (vmx->loaded_vmcs != &vmx->vmcs01)
5136                 free_loaded_vmcs(&vmx->vmcs01);
5137 }
5138
5139 /*
5140  * Emulate the VMXON instruction.
5141  * Currently, we just remember that VMX is active, and do not save or even
5142  * inspect the argument to VMXON (the so-called "VMXON pointer") because we
5143  * do not currently need to store anything in that guest-allocated memory
5144  * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
5145  * argument is different from the VMXON pointer (which the spec says they do).
5146  */
5147 static int handle_vmon(struct kvm_vcpu *vcpu)
5148 {
5149         struct kvm_segment cs;
5150         struct vcpu_vmx *vmx = to_vmx(vcpu);
5151
5152         /* The Intel VMX Instruction Reference lists a bunch of bits that
5153          * are prerequisite to running VMXON, most notably cr4.VMXE must be
5154          * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
5155          * Otherwise, we should fail with #UD. We test these now:
5156          */
5157         if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
5158             !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
5159             (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
5160                 kvm_queue_exception(vcpu, UD_VECTOR);
5161                 return 1;
5162         }
5163
5164         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
5165         if (is_long_mode(vcpu) && !cs.l) {
5166                 kvm_queue_exception(vcpu, UD_VECTOR);
5167                 return 1;
5168         }
5169
5170         if (vmx_get_cpl(vcpu)) {
5171                 kvm_inject_gp(vcpu, 0);
5172                 return 1;
5173         }
5174
5175         INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
5176         vmx->nested.vmcs02_num = 0;
5177
5178         vmx->nested.vmxon = true;
5179
5180         skip_emulated_instruction(vcpu);
5181         return 1;
5182 }
5183
5184 /*
5185  * Intel's VMX Instruction Reference specifies a common set of prerequisites
5186  * for running VMX instructions (except VMXON, whose prerequisites are
5187  * slightly different). It also specifies what exception to inject otherwise.
5188  */
5189 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
5190 {
5191         struct kvm_segment cs;
5192         struct vcpu_vmx *vmx = to_vmx(vcpu);
5193
5194         if (!vmx->nested.vmxon) {
5195                 kvm_queue_exception(vcpu, UD_VECTOR);
5196                 return 0;
5197         }
5198
5199         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
5200         if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
5201             (is_long_mode(vcpu) && !cs.l)) {
5202                 kvm_queue_exception(vcpu, UD_VECTOR);
5203                 return 0;
5204         }
5205
5206         if (vmx_get_cpl(vcpu)) {
5207                 kvm_inject_gp(vcpu, 0);
5208                 return 0;
5209         }
5210
5211         return 1;
5212 }
5213
5214 /*
5215  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
5216  * just stops using VMX.
5217  */
5218 static void free_nested(struct vcpu_vmx *vmx)
5219 {
5220         if (!vmx->nested.vmxon)
5221                 return;
5222         vmx->nested.vmxon = false;
5223         if (vmx->nested.current_vmptr != -1ull) {
5224                 kunmap(vmx->nested.current_vmcs12_page);
5225                 nested_release_page(vmx->nested.current_vmcs12_page);
5226                 vmx->nested.current_vmptr = -1ull;
5227                 vmx->nested.current_vmcs12 = NULL;
5228         }
5229         /* Unpin physical memory we referred to in current vmcs02 */
5230         if (vmx->nested.apic_access_page) {
5231                 nested_release_page(vmx->nested.apic_access_page);
5232                 vmx->nested.apic_access_page = 0;
5233         }
5234
5235         nested_free_all_saved_vmcss(vmx);
5236 }
5237
5238 /* Emulate the VMXOFF instruction */
5239 static int handle_vmoff(struct kvm_vcpu *vcpu)
5240 {
5241         if (!nested_vmx_check_permission(vcpu))
5242                 return 1;
5243         free_nested(to_vmx(vcpu));
5244         skip_emulated_instruction(vcpu);
5245         return 1;
5246 }
5247
5248 /*
5249  * Decode the memory-address operand of a vmx instruction, as recorded on an
5250  * exit caused by such an instruction (run by a guest hypervisor).
5251  * On success, returns 0. When the operand is invalid, returns 1 and throws
5252  * #UD or #GP.
5253  */
5254 static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
5255                                  unsigned long exit_qualification,
5256                                  u32 vmx_instruction_info, gva_t *ret)
5257 {
5258         /*
5259          * According to Vol. 3B, "Information for VM Exits Due to Instruction
5260          * Execution", on an exit, vmx_instruction_info holds most of the
5261          * addressing components of the operand. Only the displacement part
5262          * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
5263          * For how an actual address is calculated from all these components,
5264          * refer to Vol. 1, "Operand Addressing".
5265          */
5266         int  scaling = vmx_instruction_info & 3;
5267         int  addr_size = (vmx_instruction_info >> 7) & 7;
5268         bool is_reg = vmx_instruction_info & (1u << 10);
5269         int  seg_reg = (vmx_instruction_info >> 15) & 7;
5270         int  index_reg = (vmx_instruction_info >> 18) & 0xf;
5271         bool index_is_valid = !(vmx_instruction_info & (1u << 22));
5272         int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
5273         bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
5274
5275         if (is_reg) {
5276                 kvm_queue_exception(vcpu, UD_VECTOR);
5277                 return 1;
5278         }
5279
5280         /* Addr = segment_base + offset */
5281         /* offset = base + [index * scale] + displacement */
5282         *ret = vmx_get_segment_base(vcpu, seg_reg);
5283         if (base_is_valid)
5284                 *ret += kvm_register_read(vcpu, base_reg);
5285         if (index_is_valid)
5286                 *ret += kvm_register_read(vcpu, index_reg)<<scaling;
5287         *ret += exit_qualification; /* holds the displacement */
5288
5289         if (addr_size == 1) /* 32 bit */
5290                 *ret &= 0xffffffff;
5291
5292         /*
5293          * TODO: throw #GP (and return 1) in the various cases where the VM*
5294          * instructions require it - e.g., offset beyond segment limit,
5295          * unusable or unreadable/unwritable segment, non-canonical 64-bit
5296          * address, and so on. Currently these are not checked.
5297          */
5298         return 0;
5299 }
5300
5301 /*
5302  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
5303  * set the success or error code of an emulated VMX instruction, as specified
5304  * by Vol 2B, VMX Instruction Reference, "Conventions".
5305  */
5306 static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
5307 {
5308         vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
5309                         & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5310                             X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
5311 }
5312
5313 static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
5314 {
5315         vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5316                         & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
5317                             X86_EFLAGS_SF | X86_EFLAGS_OF))
5318                         | X86_EFLAGS_CF);
5319 }
5320
5321 static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5322                                         u32 vm_instruction_error)
5323 {
5324         if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
5325                 /*
5326                  * failValid writes the error number to the current VMCS, which
5327                  * can't be done if there isn't a current VMCS.
5328                  */
5329                 nested_vmx_failInvalid(vcpu);
5330                 return;
5331         }
5332         vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5333                         & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5334                             X86_EFLAGS_SF | X86_EFLAGS_OF))
5335                         | X86_EFLAGS_ZF);
5336         get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
5337 }
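
/*
 * Summary of the RFLAGS convention implemented by the three helpers above
 * (Vol 2B, VMX Instruction Reference, "Conventions"):
 *
 *   VMsucceed:     CF = PF = AF = ZF = SF = OF = 0
 *   VMfailInvalid: CF = 1, the other five flags cleared (no current VMCS)
 *   VMfailValid:   ZF = 1, the other five flags cleared, and the error
 *                  number written to the VM-instruction error field of the
 *                  current VMCS (here: the current vmcs12).
 */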
5338
5339 /* Emulate the VMCLEAR instruction */
5340 static int handle_vmclear(struct kvm_vcpu *vcpu)
5341 {
5342         struct vcpu_vmx *vmx = to_vmx(vcpu);
5343         gva_t gva;
5344         gpa_t vmptr;
5345         struct vmcs12 *vmcs12;
5346         struct page *page;
5347         struct x86_exception e;
5348
5349         if (!nested_vmx_check_permission(vcpu))
5350                 return 1;
5351
5352         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5353                         vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
5354                 return 1;
5355
5356         if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
5357                                 sizeof(vmptr), &e)) {
5358                 kvm_inject_page_fault(vcpu, &e);
5359                 return 1;
5360         }
5361
5362         if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
5363                 nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
5364                 skip_emulated_instruction(vcpu);
5365                 return 1;
5366         }
5367
5368         if (vmptr == vmx->nested.current_vmptr) {
5369                 kunmap(vmx->nested.current_vmcs12_page);
5370                 nested_release_page(vmx->nested.current_vmcs12_page);
5371                 vmx->nested.current_vmptr = -1ull;
5372                 vmx->nested.current_vmcs12 = NULL;
5373         }
5374
5375         page = nested_get_page(vcpu, vmptr);
5376         if (page == NULL) {
5377                 /*
5378                  * For accurate processor emulation, VMCLEAR beyond available
5379                  * physical memory should do nothing at all. However, it is
5380                  * possible that a nested vmx bug, not a guest hypervisor bug,
5381                  * resulted in this case, so let's shut down before doing any
5382                  * more damage:
5383                  */
5384                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5385                 return 1;
5386         }
5387         vmcs12 = kmap(page);
5388         vmcs12->launch_state = 0;
5389         kunmap(page);
5390         nested_release_page(page);
5391
5392         nested_free_vmcs02(vmx, vmptr);
5393
5394         skip_emulated_instruction(vcpu);
5395         nested_vmx_succeed(vcpu);
5396         return 1;
5397 }
5398
5399 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
5400
5401 /* Emulate the VMLAUNCH instruction */
5402 static int handle_vmlaunch(struct kvm_vcpu *vcpu)
5403 {
5404         return nested_vmx_run(vcpu, true);
5405 }
5406
5407 /* Emulate the VMRESUME instruction */
5408 static int handle_vmresume(struct kvm_vcpu *vcpu)
5409 {
5410
5411         return nested_vmx_run(vcpu, false);
5412 }
5413
5414 enum vmcs_field_type {
5415         VMCS_FIELD_TYPE_U16 = 0,
5416         VMCS_FIELD_TYPE_U64 = 1,
5417         VMCS_FIELD_TYPE_U32 = 2,
5418         VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
5419 };
5420
5421 static inline int vmcs_field_type(unsigned long field)
5422 {
5423         if (0x1 & field)        /* the *_HIGH fields are all 32 bit */
5424                 return VMCS_FIELD_TYPE_U32;
5425         return (field >> 13) & 0x3 ;
5426 }
5427
5428 static inline int vmcs_field_readonly(unsigned long field)
5429 {
5430         return (((field >> 10) & 0x3) == 1);
5431 }
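
/*
 * Illustrative worked examples, not used by the driver, for the two helpers
 * above. In the VMCS field encoding, bits 14:13 select the field width and
 * bits 11:10 select the field type (type 1 is the read-only data fields):
 *
 *   vmcs_field_type(0x4402)     == VMCS_FIELD_TYPE_U32           (VM_EXIT_REASON)
 *   vmcs_field_readonly(0x4402) == 1                             (read-only data)
 *   vmcs_field_type(0x681e)     == VMCS_FIELD_TYPE_NATURAL_WIDTH (GUEST_RIP)
 *   vmcs_field_readonly(0x681e) == 0                             (guest-state field)
 */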
5432
5433 /*
5434  * Read a vmcs12 field. Since these can have varying lengths and we return
5435  * one type, we chose the biggest type (u64) and zero-extend the return value
5436  * to that size. Note that the caller, handle_vmread, might need to use only
5437  * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
5438  * 64-bit fields are to be returned).
5439  */
5440 static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
5441                                         unsigned long field, u64 *ret)
5442 {
5443         short offset = vmcs_field_to_offset(field);
5444         char *p;
5445
5446         if (offset < 0)
5447                 return 0;
5448
5449         p = ((char *)(get_vmcs12(vcpu))) + offset;
5450
5451         switch (vmcs_field_type(field)) {
5452         case VMCS_FIELD_TYPE_NATURAL_WIDTH:
5453                 *ret = *((natural_width *)p);
5454                 return 1;
5455         case VMCS_FIELD_TYPE_U16:
5456                 *ret = *((u16 *)p);
5457                 return 1;
5458         case VMCS_FIELD_TYPE_U32:
5459                 *ret = *((u32 *)p);
5460                 return 1;
5461         case VMCS_FIELD_TYPE_U64:
5462                 *ret = *((u64 *)p);
5463                 return 1;
5464         default:
5465                 return 0; /* can never happen. */
5466         }
5467 }
5468
5469 /*
5470  * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
5471  * used before) all generate the same failure when it is missing.
5472  */
5473 static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
5474 {
5475         struct vcpu_vmx *vmx = to_vmx(vcpu);
5476         if (vmx->nested.current_vmptr == -1ull) {
5477                 nested_vmx_failInvalid(vcpu);
5478                 skip_emulated_instruction(vcpu);
5479                 return 0;
5480         }
5481         return 1;
5482 }
5483
5484 static int handle_vmread(struct kvm_vcpu *vcpu)
5485 {
5486         unsigned long field;
5487         u64 field_value;
5488         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5489         u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5490         gva_t gva = 0;
5491
5492         if (!nested_vmx_check_permission(vcpu) ||
5493             !nested_vmx_check_vmcs12(vcpu))
5494                 return 1;
5495
5496         /* Decode instruction info and find the field to read */
5497         field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5498         /* Read the field, zero-extended to a u64 field_value */
5499         if (!vmcs12_read_any(vcpu, field, &field_value)) {
5500                 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5501                 skip_emulated_instruction(vcpu);
5502                 return 1;
5503         }
5504         /*
5505          * Now copy part of this value to register or memory, as requested.
5506          * Note that the number of bits actually copied is 32 or 64 depending
5507          * on the guest's mode (32 or 64 bit), not on the given field's length.
5508          */
5509         if (vmx_instruction_info & (1u << 10)) {
5510                 kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
5511                         field_value);
5512         } else {
5513                 if (get_vmx_mem_address(vcpu, exit_qualification,
5514                                 vmx_instruction_info, &gva))
5515                         return 1;
5516                 /* _system ok, as nested_vmx_check_permission verified cpl=0 */
5517                 kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
5518                              &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
5519         }
5520
5521         nested_vmx_succeed(vcpu);
5522         skip_emulated_instruction(vcpu);
5523         return 1;
5524 }
5525
5526
5527 static int handle_vmwrite(struct kvm_vcpu *vcpu)
5528 {
5529         unsigned long field;
5530         gva_t gva;
5531         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5532         u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5533         char *p;
5534         short offset;
5535         /* The value to write might be 32 or 64 bits, depending on L1's long
5536          * mode, and eventually we need to write that into a field of several
5537          * possible lengths. The code below first zero-extends the value to 64
5538  * bit (field_value), and then copies only the appropriate number of
5539          * bits into the vmcs12 field.
5540          */
5541         u64 field_value = 0;
5542         struct x86_exception e;
5543
5544         if (!nested_vmx_check_permission(vcpu) ||
5545             !nested_vmx_check_vmcs12(vcpu))
5546                 return 1;
5547
5548         if (vmx_instruction_info & (1u << 10))
5549                 field_value = kvm_register_read(vcpu,
5550                         (((vmx_instruction_info) >> 3) & 0xf));
5551         else {
5552                 if (get_vmx_mem_address(vcpu, exit_qualification,
5553                                 vmx_instruction_info, &gva))
5554                         return 1;
5555                 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
5556                            &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) {
5557                         kvm_inject_page_fault(vcpu, &e);
5558                         return 1;
5559                 }
5560         }
5561
5562
5563         field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5564         if (vmcs_field_readonly(field)) {
5565                 nested_vmx_failValid(vcpu,
5566                         VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
5567                 skip_emulated_instruction(vcpu);
5568                 return 1;
5569         }
5570
5571         offset = vmcs_field_to_offset(field);
5572         if (offset < 0) {
5573                 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5574                 skip_emulated_instruction(vcpu);
5575                 return 1;
5576         }
5577         p = ((char *) get_vmcs12(vcpu)) + offset;
5578
5579         switch (vmcs_field_type(field)) {
5580         case VMCS_FIELD_TYPE_U16:
5581                 *(u16 *)p = field_value;
5582                 break;
5583         case VMCS_FIELD_TYPE_U32:
5584                 *(u32 *)p = field_value;
5585                 break;
5586         case VMCS_FIELD_TYPE_U64:
5587                 *(u64 *)p = field_value;
5588                 break;
5589         case VMCS_FIELD_TYPE_NATURAL_WIDTH:
5590                 *(natural_width *)p = field_value;
5591                 break;
5592         default:
5593                 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5594                 skip_emulated_instruction(vcpu);
5595                 return 1;
5596         }
5597
5598         nested_vmx_succeed(vcpu);
5599         skip_emulated_instruction(vcpu);
5600         return 1;
5601 }
5602
5603 /* Emulate the VMPTRLD instruction */
5604 static int handle_vmptrld(struct kvm_vcpu *vcpu)
5605 {
5606         struct vcpu_vmx *vmx = to_vmx(vcpu);
5607         gva_t gva;
5608         gpa_t vmptr;
5609         struct x86_exception e;
5610
5611         if (!nested_vmx_check_permission(vcpu))
5612                 return 1;
5613
5614         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5615                         vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
5616                 return 1;
5617
5618         if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
5619                                 sizeof(vmptr), &e)) {
5620                 kvm_inject_page_fault(vcpu, &e);
5621                 return 1;
5622         }
5623
5624         if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
5625                 nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
5626                 skip_emulated_instruction(vcpu);
5627                 return 1;
5628         }
5629
5630         if (vmx->nested.current_vmptr != vmptr) {
5631                 struct vmcs12 *new_vmcs12;
5632                 struct page *page;
5633                 page = nested_get_page(vcpu, vmptr);
5634                 if (page == NULL) {
5635                         nested_vmx_failInvalid(vcpu);
5636                         skip_emulated_instruction(vcpu);
5637                         return 1;
5638                 }
5639                 new_vmcs12 = kmap(page);
5640                 if (new_vmcs12->revision_id != VMCS12_REVISION) {
5641                         kunmap(page);
5642                         nested_release_page_clean(page);
5643                         nested_vmx_failValid(vcpu,
5644                                 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5645                         skip_emulated_instruction(vcpu);
5646                         return 1;
5647                 }
5648                 if (vmx->nested.current_vmptr != -1ull) {
5649                         kunmap(vmx->nested.current_vmcs12_page);
5650                         nested_release_page(vmx->nested.current_vmcs12_page);
5651                 }
5652
5653                 vmx->nested.current_vmptr = vmptr;
5654                 vmx->nested.current_vmcs12 = new_vmcs12;
5655                 vmx->nested.current_vmcs12_page = page;
5656         }
5657
5658         nested_vmx_succeed(vcpu);
5659         skip_emulated_instruction(vcpu);
5660         return 1;
5661 }
5662
5663 /* Emulate the VMPTRST instruction */
5664 static int handle_vmptrst(struct kvm_vcpu *vcpu)
5665 {
5666         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5667         u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5668         gva_t vmcs_gva;
5669         struct x86_exception e;
5670
5671         if (!nested_vmx_check_permission(vcpu))
5672                 return 1;
5673
5674         if (get_vmx_mem_address(vcpu, exit_qualification,
5675                         vmx_instruction_info, &vmcs_gva))
5676                 return 1;
5677         /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */
5678         if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
5679                                  (void *)&to_vmx(vcpu)->nested.current_vmptr,
5680                                  sizeof(u64), &e)) {
5681                 kvm_inject_page_fault(vcpu, &e);
5682                 return 1;
5683         }
5684         nested_vmx_succeed(vcpu);
5685         skip_emulated_instruction(vcpu);
5686         return 1;
5687 }
5688
5689 /*
5690  * The exit handlers return 1 if the exit was handled fully and guest execution
5691  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
5692  * to be done to userspace and return 0.
5693  */
5694 static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
5695         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
5696         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
5697         [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
5698         [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
5699         [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
5700         [EXIT_REASON_CR_ACCESS]               = handle_cr,
5701         [EXIT_REASON_DR_ACCESS]               = handle_dr,
5702         [EXIT_REASON_CPUID]                   = handle_cpuid,
5703         [EXIT_REASON_MSR_READ]                = handle_rdmsr,
5704         [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
5705         [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
5706         [EXIT_REASON_HLT]                     = handle_halt,
5707         [EXIT_REASON_INVD]                    = handle_invd,
5708         [EXIT_REASON_INVLPG]                  = handle_invlpg,
5709         [EXIT_REASON_RDPMC]                   = handle_rdpmc,
5710         [EXIT_REASON_VMCALL]                  = handle_vmcall,
5711         [EXIT_REASON_VMCLEAR]                 = handle_vmclear,
5712         [EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
5713         [EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
5714         [EXIT_REASON_VMPTRST]                 = handle_vmptrst,
5715         [EXIT_REASON_VMREAD]                  = handle_vmread,
5716         [EXIT_REASON_VMRESUME]                = handle_vmresume,
5717         [EXIT_REASON_VMWRITE]                 = handle_vmwrite,
5718         [EXIT_REASON_VMOFF]                   = handle_vmoff,
5719         [EXIT_REASON_VMON]                    = handle_vmon,
5720         [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
5721         [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
5722         [EXIT_REASON_WBINVD]                  = handle_wbinvd,
5723         [EXIT_REASON_XSETBV]                  = handle_xsetbv,
5724         [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
5725         [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
5726         [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
5727         [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
5728         [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
5729         [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_invalid_op,
5730         [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
5731 };
5732
5733 static const int kvm_vmx_max_exit_handlers =
5734         ARRAY_SIZE(kvm_vmx_exit_handlers);
5735
5736 /*
5737  * Return 1 if we should exit from L2 to L1 to handle an MSR access,
5738  * rather than handle it ourselves in L0. I.e., check whether L1 expressed
5739  * disinterest in the current event (read or write a specific MSR) by using an
5740  * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
5741  */
5742 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5743         struct vmcs12 *vmcs12, u32 exit_reason)
5744 {
5745         u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
5746         gpa_t bitmap;
5747
5748         if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS))
5749                 return 1;
5750
5751         /*
5752          * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5753          * for the four combinations of read/write and low/high MSR numbers.
5754          * First we need to figure out which of the four to use:
5755          */
5756         bitmap = vmcs12->msr_bitmap;
5757         if (exit_reason == EXIT_REASON_MSR_WRITE)
5758                 bitmap += 2048;
5759         if (msr_index >= 0xc0000000) {
5760                 msr_index -= 0xc0000000;
5761                 bitmap += 1024;
5762         }
5763
5764         /* Then read the msr_index'th bit from this bitmap: */
5765         if (msr_index < 1024*8) {
5766                 unsigned char b;
5767                 kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1);
5768                 return 1 & (b >> (msr_index & 7));
5769         } else
5770                 return 1; /* let L1 handle the wrong parameter */
5771 }
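
/*
 * Illustrative sketch, not used by the driver: the bitmap offset computed by
 * nested_vmx_exit_handled_msr() above. The helper name is hypothetical.
 */
static inline gpa_t example_nested_msr_bitmap_byte(gpa_t bitmap, u32 msr_index,
						   bool write)
{
	if (write)
		bitmap += 2048;			/* write bitmaps live in the upper half */
	if (msr_index >= 0xc0000000) {
		msr_index -= 0xc0000000;	/* high MSR range, e.g. MSR_EFER */
		bitmap += 1024;
	}
	/*
	 * e.g. a write to MSR_EFER (0xc0000080) is checked at byte
	 * bitmap + 2048 + 1024 + 0x80 / 8 = bitmap + 3088, bit 0.
	 */
	return bitmap + msr_index / 8;
}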
5772
5773 /*
5774  * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
5775  * rather than handle it ourselves in L0. I.e., check if L1 wanted to
5776  * intercept (via guest_host_mask etc.) the current event.
5777  */
5778 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
5779         struct vmcs12 *vmcs12)
5780 {
5781         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5782         int cr = exit_qualification & 15;
5783         int reg = (exit_qualification >> 8) & 15;
5784         unsigned long val = kvm_register_read(vcpu, reg);
5785
5786         switch ((exit_qualification >> 4) & 3) {
5787         case 0: /* mov to cr */
5788                 switch (cr) {
5789                 case 0:
5790                         if (vmcs12->cr0_guest_host_mask &
5791                             (val ^ vmcs12->cr0_read_shadow))
5792                                 return 1;
5793                         break;
5794                 case 3:
5795                         if ((vmcs12->cr3_target_count >= 1 &&
5796                                         vmcs12->cr3_target_value0 == val) ||
5797                                 (vmcs12->cr3_target_count >= 2 &&
5798                                         vmcs12->cr3_target_value1 == val) ||
5799                                 (vmcs12->cr3_target_count >= 3 &&
5800                                         vmcs12->cr3_target_value2 == val) ||
5801                                 (vmcs12->cr3_target_count >= 4 &&
5802                                         vmcs12->cr3_target_value3 == val))
5803                                 return 0;
5804                         if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
5805                                 return 1;
5806                         break;
5807                 case 4:
5808                         if (vmcs12->cr4_guest_host_mask &
5809                             (vmcs12->cr4_read_shadow ^ val))
5810                                 return 1;
5811                         break;
5812                 case 8:
5813                         if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
5814                                 return 1;
5815                         break;
5816                 }
5817                 break;
5818         case 2: /* clts */
5819                 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
5820                     (vmcs12->cr0_read_shadow & X86_CR0_TS))
5821                         return 1;
5822                 break;
5823         case 1: /* mov from cr */
5824                 switch (cr) {
5825                 case 3:
5826                         if (vmcs12->cpu_based_vm_exec_control &
5827                             CPU_BASED_CR3_STORE_EXITING)
5828                                 return 1;
5829                         break;
5830                 case 8:
5831                         if (vmcs12->cpu_based_vm_exec_control &
5832                             CPU_BASED_CR8_STORE_EXITING)
5833                                 return 1;
5834                         break;
5835                 }
5836                 break;
5837         case 3: /* lmsw */
5838                 /*
5839                  * lmsw can change bits 1..3 of cr0, and only set bit 0 of
5840                  * cr0. Other attempted changes are ignored, with no exit.
5841                  */
5842                 if (vmcs12->cr0_guest_host_mask & 0xe &
5843                     (val ^ vmcs12->cr0_read_shadow))
5844                         return 1;
5845                 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
5846                     !(vmcs12->cr0_read_shadow & 0x1) &&
5847                     (val & 0x1))
5848                         return 1;
5849                 break;
5850         }
5851         return 0;
5852 }
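
/*
 * A minimal sketch of the lmsw case above, spelled out with the CR0 bit
 * names behind the 0xe and 0x1 masks: lmsw writes only CR0[3:0] (PE, MP,
 * EM, TS) and can set, but never clear, PE.  The helper is hypothetical and
 * not used anywhere in this file; it only restates the "case 3" logic.
 */
static inline bool nested_lmsw_needs_exit(struct vmcs12 *vmcs12,
                                          unsigned long val)
{
        unsigned long mask = vmcs12->cr0_guest_host_mask;
        unsigned long shadow = vmcs12->cr0_read_shadow;

        /* MP/EM/TS (mask 0xe): exit if L1 owns a bit whose value would change */
        if (mask & (X86_CR0_MP | X86_CR0_EM | X86_CR0_TS) & (val ^ shadow))
                return true;
        /* PE (mask 0x1): exit only if L1 owns it and lmsw would newly set it */
        return (mask & X86_CR0_PE) && !(shadow & X86_CR0_PE) &&
               (val & X86_CR0_PE);
}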
5853
5854 /*
5855  * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
5856  * should handle it ourselves in L0 (and then continue L2). Only call this
5857  * when in is_guest_mode (L2).
5858  */
5859 static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
5860 {
5861         u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
5862         u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
5863         struct vcpu_vmx *vmx = to_vmx(vcpu);
5864         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5865
5866         if (vmx->nested.nested_run_pending)
5867                 return 0;
5868
5869         if (unlikely(vmx->fail)) {
5870                 pr_info_ratelimited("%s failed vm entry %x\n", __func__,
5871                                     vmcs_read32(VM_INSTRUCTION_ERROR));
5872                 return 1;
5873         }
5874
5875         switch (exit_reason) {
5876         case EXIT_REASON_EXCEPTION_NMI:
5877                 if (!is_exception(intr_info))
5878                         return 0;
5879                 else if (is_page_fault(intr_info))
5880                         return enable_ept;
5881                 return vmcs12->exception_bitmap &
5882                                 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
5883         case EXIT_REASON_EXTERNAL_INTERRUPT:
5884                 return 0;
5885         case EXIT_REASON_TRIPLE_FAULT:
5886                 return 1;
5887         case EXIT_REASON_PENDING_INTERRUPT:
5888         case EXIT_REASON_NMI_WINDOW:
5889                 /*
5890                  * prepare_vmcs02() sets the CPU_BASED_VIRTUAL_INTR_PENDING bit
5891                  * (aka Interrupt Window Exiting) only when L1 turned it on,
5892                  * so if we got a PENDING_INTERRUPT exit, this must be for L1.
5893                  * Same for NMI Window Exiting.
5894                  */
5895                 return 1;
5896         case EXIT_REASON_TASK_SWITCH:
5897                 return 1;
5898         case EXIT_REASON_CPUID:
5899                 return 1;
5900         case EXIT_REASON_HLT:
5901                 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
5902         case EXIT_REASON_INVD:
5903                 return 1;
5904         case EXIT_REASON_INVLPG:
5905                 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5906         case EXIT_REASON_RDPMC:
5907                 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
5908         case EXIT_REASON_RDTSC:
5909                 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
5910         case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
5911         case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
5912         case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
5913         case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
5914         case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
5915                 /*
5916                  * VMX instructions trap unconditionally. This allows L1 to
5917                  * emulate them for its L2 guest, i.e., allows 3-level nesting!
5918                  */
5919                 return 1;
5920         case EXIT_REASON_CR_ACCESS:
5921                 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
5922         case EXIT_REASON_DR_ACCESS:
5923                 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
5924         case EXIT_REASON_IO_INSTRUCTION:
5925                 /* TODO: support IO bitmaps */
5926                 return 1;
5927         case EXIT_REASON_MSR_READ:
5928         case EXIT_REASON_MSR_WRITE:
5929                 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
5930         case EXIT_REASON_INVALID_STATE:
5931                 return 1;
5932         case EXIT_REASON_MWAIT_INSTRUCTION:
5933                 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5934         case EXIT_REASON_MONITOR_INSTRUCTION:
5935                 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
5936         case EXIT_REASON_PAUSE_INSTRUCTION:
5937                 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
5938                         nested_cpu_has2(vmcs12,
5939                                 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
5940         case EXIT_REASON_MCE_DURING_VMENTRY:
5941                 return 0;
5942         case EXIT_REASON_TPR_BELOW_THRESHOLD:
5943                 return 1;
5944         case EXIT_REASON_APIC_ACCESS:
5945                 return nested_cpu_has2(vmcs12,
5946                         SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
5947         case EXIT_REASON_EPT_VIOLATION:
5948         case EXIT_REASON_EPT_MISCONFIG:
5949                 return 0;
5950         case EXIT_REASON_WBINVD:
5951                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
5952         case EXIT_REASON_XSETBV:
5953                 return 1;
5954         default:
5955                 return 1;
5956         }
5957 }
5958
5959 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
5960 {
5961         *info1 = vmcs_readl(EXIT_QUALIFICATION);
5962         *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
5963 }
5964
5965 /*
5966  * The guest has exited.  See if we can fix it or if we need userspace
5967  * assistance.
5968  */
5969 static int vmx_handle_exit(struct kvm_vcpu *vcpu)
5970 {
5971         struct vcpu_vmx *vmx = to_vmx(vcpu);
5972         u32 exit_reason = vmx->exit_reason;
5973         u32 vectoring_info = vmx->idt_vectoring_info;
5974
5975         /* If guest state is invalid, start emulating */
5976         if (vmx->emulation_required && emulate_invalid_guest_state)
5977                 return handle_invalid_guest_state(vcpu);
5978
5979         /*
5980          * The KVM_REQ_EVENT optimization bit is only on for one entry, and if
5981          * we did not inject a still-pending event to L1 now because of
5982          * nested_run_pending, we need to re-enable this bit.
5983          */
5984         if (vmx->nested.nested_run_pending)
5985                 kvm_make_request(KVM_REQ_EVENT, vcpu);
5986
5987         if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH ||
5988             exit_reason == EXIT_REASON_VMRESUME))
5989                 vmx->nested.nested_run_pending = 1;
5990         else
5991                 vmx->nested.nested_run_pending = 0;
5992
5993         if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
5994                 nested_vmx_vmexit(vcpu);
5995                 return 1;
5996         }
5997
5998         if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
5999                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6000                 vcpu->run->fail_entry.hardware_entry_failure_reason
6001                         = exit_reason;
6002                 return 0;
6003         }
6004
6005         if (unlikely(vmx->fail)) {
6006                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6007                 vcpu->run->fail_entry.hardware_entry_failure_reason
6008                         = vmcs_read32(VM_INSTRUCTION_ERROR);
6009                 return 0;
6010         }
6011
6012         /*
6013          * Note:
6014          * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
6015          * event delivery, since that indicates the guest is accessing MMIO.
6016          * The vm-exit could be triggered again after returning to the guest,
6017          * which would cause an infinite loop.
6018          */
6019         if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
6020                         (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
6021                         exit_reason != EXIT_REASON_EPT_VIOLATION &&
6022                         exit_reason != EXIT_REASON_TASK_SWITCH)) {
6023                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6024                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
6025                 vcpu->run->internal.ndata = 2;
6026                 vcpu->run->internal.data[0] = vectoring_info;
6027                 vcpu->run->internal.data[1] = exit_reason;
6028                 return 0;
6029         }
6030
6031         if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
6032             !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
6033                                         get_vmcs12(vcpu), vcpu)))) {
6034                 if (vmx_interrupt_allowed(vcpu)) {
6035                         vmx->soft_vnmi_blocked = 0;
6036                 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
6037                            vcpu->arch.nmi_pending) {
6038                         /*
6039                          * This CPU doesn't support us in finding the end of an
6040                          * NMI-blocked window if the guest runs with IRQs
6041                          * disabled. So we pull the trigger after 1 s of
6042                          * futile waiting, but inform the user about this.
6043                          */
6044                         printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
6045                                "state on VCPU %d after 1 s timeout\n",
6046                                __func__, vcpu->vcpu_id);
6047                         vmx->soft_vnmi_blocked = 0;
6048                 }
6049         }
6050
6051         if (exit_reason < kvm_vmx_max_exit_handlers
6052             && kvm_vmx_exit_handlers[exit_reason])
6053                 return kvm_vmx_exit_handlers[exit_reason](vcpu);
6054         else {
6055                 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
6056                 vcpu->run->hw.hardware_exit_reason = exit_reason;
6057         }
6058         return 0;
6059 }
6060
6061 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6062 {
6063         if (irr == -1 || tpr < irr) {
6064                 vmcs_write32(TPR_THRESHOLD, 0);
6065                 return;
6066         }
6067
6068         vmcs_write32(TPR_THRESHOLD, irr);
6069 }
6070
6071 static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
6072 {
6073         u32 exit_intr_info;
6074
6075         if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
6076               || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI))
6077                 return;
6078
6079         vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
6080         exit_intr_info = vmx->exit_intr_info;
6081
6082         /* Handle machine checks before interrupts are enabled */
6083         if (is_machine_check(exit_intr_info))
6084                 kvm_machine_check();
6085
6086         /* We need to handle NMIs before interrupts are enabled */
6087         if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
6088             (exit_intr_info & INTR_INFO_VALID_MASK)) {
6089                 kvm_before_handle_nmi(&vmx->vcpu);
6090                 asm("int $2");
6091                 kvm_after_handle_nmi(&vmx->vcpu);
6092         }
6093 }
6094
6095 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
6096 {
6097         u32 exit_intr_info;
6098         bool unblock_nmi;
6099         u8 vector;
6100         bool idtv_info_valid;
6101
6102         idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
6103
6104         if (cpu_has_virtual_nmis()) {
6105                 if (vmx->nmi_known_unmasked)
6106                         return;
6107                 /*
6108                  * Can't use vmx->exit_intr_info since we're not sure what
6109                  * the exit reason is.
6110                  */
6111                 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
6112                 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
6113                 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
6114                 /*
6115                  * SDM 3: 27.7.1.2 (September 2008)
6116                  * Re-set bit "block by NMI" before VM entry if vmexit caused by
6117                  * a guest IRET fault.
6118                  * SDM 3: 23.2.2 (September 2008)
6119                  * Bit 12 is undefined in any of the following cases:
6120                  *  If the VM exit sets the valid bit in the IDT-vectoring
6121                  *   information field.
6122                  *  If the VM exit is due to a double fault.
6123                  */
6124                 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
6125                     vector != DF_VECTOR && !idtv_info_valid)
6126                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6127                                       GUEST_INTR_STATE_NMI);
6128                 else
6129                         vmx->nmi_known_unmasked =
6130                                 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
6131                                   & GUEST_INTR_STATE_NMI);
6132         } else if (unlikely(vmx->soft_vnmi_blocked))
6133                 vmx->vnmi_blocked_time +=
6134                         ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
6135 }
6136
6137 static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
6138                                       u32 idt_vectoring_info,
6139                                       int instr_len_field,
6140                                       int error_code_field)
6141 {
6142         u8 vector;
6143         int type;
6144         bool idtv_info_valid;
6145
6146         idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
6147
6148         vmx->vcpu.arch.nmi_injected = false;
6149         kvm_clear_exception_queue(&vmx->vcpu);
6150         kvm_clear_interrupt_queue(&vmx->vcpu);
6151
6152         if (!idtv_info_valid)
6153                 return;
6154
6155         kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
6156
6157         vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
6158         type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
6159
6160         switch (type) {
6161         case INTR_TYPE_NMI_INTR:
6162                 vmx->vcpu.arch.nmi_injected = true;
6163                 /*
6164                  * SDM 3: 27.7.1.2 (September 2008)
6165                  * Clear bit "block by NMI" before VM entry if a NMI
6166                  * delivery faulted.
6167                  */
6168                 vmx_set_nmi_mask(&vmx->vcpu, false);
6169                 break;
6170         case INTR_TYPE_SOFT_EXCEPTION:
6171                 vmx->vcpu.arch.event_exit_inst_len =
6172                         vmcs_read32(instr_len_field);
6173                 /* fall through */
6174         case INTR_TYPE_HARD_EXCEPTION:
6175                 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
6176                         u32 err = vmcs_read32(error_code_field);
6177                         kvm_queue_exception_e(&vmx->vcpu, vector, err);
6178                 } else
6179                         kvm_queue_exception(&vmx->vcpu, vector);
6180                 break;
6181         case INTR_TYPE_SOFT_INTR:
6182                 vmx->vcpu.arch.event_exit_inst_len =
6183                         vmcs_read32(instr_len_field);
6184                 /* fall through */
6185         case INTR_TYPE_EXT_INTR:
6186                 kvm_queue_interrupt(&vmx->vcpu, vector,
6187                         type == INTR_TYPE_SOFT_INTR);
6188                 break;
6189         default:
6190                 break;
6191         }
6192 }
6193
6194 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
6195 {
6196         if (is_guest_mode(&vmx->vcpu))
6197                 return;
6198         __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
6199                                   VM_EXIT_INSTRUCTION_LEN,
6200                                   IDT_VECTORING_ERROR_CODE);
6201 }
6202
6203 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
6204 {
6205         if (is_guest_mode(vcpu))
6206                 return;
6207         __vmx_complete_interrupts(to_vmx(vcpu),
6208                                   vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6209                                   VM_ENTRY_INSTRUCTION_LEN,
6210                                   VM_ENTRY_EXCEPTION_ERROR_CODE);
6211
6212         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
6213 }
6214
6215 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
6216 {
6217         int i, nr_msrs;
6218         struct perf_guest_switch_msr *msrs;
6219
6220         msrs = perf_guest_get_msrs(&nr_msrs);
6221
6222         if (!msrs)
6223                 return;
6224
6225         for (i = 0; i < nr_msrs; i++)
6226                 if (msrs[i].host == msrs[i].guest)
6227                         clear_atomic_switch_msr(vmx, msrs[i].msr);
6228                 else
6229                         add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
6230                                         msrs[i].host);
6231 }
6232
6233 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6234 {
6235         struct vcpu_vmx *vmx = to_vmx(vcpu);
6236         unsigned long debugctlmsr;
6237
6238         if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
6239                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6240                 if (vmcs12->idt_vectoring_info_field &
6241                                 VECTORING_INFO_VALID_MASK) {
6242                         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
6243                                 vmcs12->idt_vectoring_info_field);
6244                         vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
6245                                 vmcs12->vm_exit_instruction_len);
6246                         if (vmcs12->idt_vectoring_info_field &
6247                                         VECTORING_INFO_DELIVER_CODE_MASK)
6248                                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
6249                                         vmcs12->idt_vectoring_error_code);
6250                 }
6251         }
6252
6253         /* Record the guest's net vcpu time for enforced NMI injections. */
6254         if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
6255                 vmx->entry_time = ktime_get();
6256
6257         /* Don't enter VMX if guest state is invalid; let the exit handler
6258            start emulation until we arrive back at a valid state */
6259         if (vmx->emulation_required && emulate_invalid_guest_state)
6260                 return;
6261
6262         if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
6263                 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
6264         if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
6265                 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
6266
6267         /* When single-stepping over STI and MOV SS, we must clear the
6268          * corresponding interruptibility bits in the guest state. Otherwise
6269          * vmentry fails as it then expects bit 14 (BS) of the pending debug
6270          * exceptions field to be set, but that's not correct for the guest
6271          * debugging case. */
6272         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
6273                 vmx_set_interrupt_shadow(vcpu, 0);
6274
6275         atomic_switch_perf_msrs(vmx);
6276         debugctlmsr = get_debugctlmsr();
6277
6278         vmx->__launched = vmx->loaded_vmcs->launched;
6279         asm(
6280                 /* Store host registers */
6281                 "push %%" _ASM_DX "; push %%" _ASM_BP ";"
6282                 "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
6283                 "push %%" _ASM_CX " \n\t"
6284                 "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
6285                 "je 1f \n\t"
6286                 "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
6287                 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
6288                 "1: \n\t"
6289                 /* Reload cr2 if changed */
6290                 "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
6291                 "mov %%cr2, %%" _ASM_DX " \n\t"
6292                 "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
6293                 "je 2f \n\t"
6294                 "mov %%" _ASM_AX", %%cr2 \n\t"
6295                 "2: \n\t"
6296                 /* Check if vmlaunch or vmresume is needed */
6297                 "cmpl $0, %c[launched](%0) \n\t"
6298                 /* Load guest registers.  Don't clobber flags. */
6299                 "mov %c[rax](%0), %%" _ASM_AX " \n\t"
6300                 "mov %c[rbx](%0), %%" _ASM_BX " \n\t"
6301                 "mov %c[rdx](%0), %%" _ASM_DX " \n\t"
6302                 "mov %c[rsi](%0), %%" _ASM_SI " \n\t"
6303                 "mov %c[rdi](%0), %%" _ASM_DI " \n\t"
6304                 "mov %c[rbp](%0), %%" _ASM_BP " \n\t"
6305 #ifdef CONFIG_X86_64
6306                 "mov %c[r8](%0),  %%r8  \n\t"
6307                 "mov %c[r9](%0),  %%r9  \n\t"
6308                 "mov %c[r10](%0), %%r10 \n\t"
6309                 "mov %c[r11](%0), %%r11 \n\t"
6310                 "mov %c[r12](%0), %%r12 \n\t"
6311                 "mov %c[r13](%0), %%r13 \n\t"
6312                 "mov %c[r14](%0), %%r14 \n\t"
6313                 "mov %c[r15](%0), %%r15 \n\t"
6314 #endif
6315                 "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */
6316
6317                 /* Enter guest mode */
6318                 "jne 1f \n\t"
6319                 __ex(ASM_VMX_VMLAUNCH) "\n\t"
6320                 "jmp 2f \n\t"
6321                 "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
6322                 "2: "
6323                 /* Save guest registers, load host registers, keep flags */
6324                 "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
6325                 "pop %0 \n\t"
6326                 "mov %%" _ASM_AX ", %c[rax](%0) \n\t"
6327                 "mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
6328                 __ASM_SIZE(pop) " %c[rcx](%0) \n\t"
6329                 "mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
6330                 "mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
6331                 "mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
6332                 "mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
6333 #ifdef CONFIG_X86_64
6334                 "mov %%r8,  %c[r8](%0) \n\t"
6335                 "mov %%r9,  %c[r9](%0) \n\t"
6336                 "mov %%r10, %c[r10](%0) \n\t"
6337                 "mov %%r11, %c[r11](%0) \n\t"
6338                 "mov %%r12, %c[r12](%0) \n\t"
6339                 "mov %%r13, %c[r13](%0) \n\t"
6340                 "mov %%r14, %c[r14](%0) \n\t"
6341                 "mov %%r15, %c[r15](%0) \n\t"
6342 #endif
6343                 "mov %%cr2, %%" _ASM_AX "   \n\t"
6344                 "mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
6345
6346                 "pop  %%" _ASM_BP "; pop  %%" _ASM_DX " \n\t"
6347                 "setbe %c[fail](%0) \n\t"
6348                 ".pushsection .rodata \n\t"
6349                 ".global vmx_return \n\t"
6350                 "vmx_return: " _ASM_PTR " 2b \n\t"
6351                 ".popsection"
6352               : : "c"(vmx), "d"((unsigned long)HOST_RSP),
6353                 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
6354                 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
6355                 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
6356                 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
6357                 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
6358                 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
6359                 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
6360                 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
6361                 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
6362                 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
6363 #ifdef CONFIG_X86_64
6364                 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
6365                 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
6366                 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
6367                 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
6368                 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
6369                 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
6370                 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
6371                 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
6372 #endif
6373                 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
6374                 [wordsize]"i"(sizeof(ulong))
6375               : "cc", "memory"
6376 #ifdef CONFIG_X86_64
6377                 , "rax", "rbx", "rdi", "rsi"
6378                 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
6379 #else
6380                 , "eax", "ebx", "edi", "esi"
6381 #endif
6382               );
6383
6384         /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
6385         if (debugctlmsr)
6386                 update_debugctlmsr(debugctlmsr);
6387
6388 #ifndef CONFIG_X86_64
6389         /*
6390          * The sysexit path does not restore ds/es, so we must set them to
6391          * a reasonable value ourselves.
6392          *
6393          * We can't defer this to vmx_load_host_state() since that function
6394          * may be executed in interrupt context, which saves and restores segments
6395          * around it, nullifying its effect.
6396          */
6397         loadsegment(ds, __USER_DS);
6398         loadsegment(es, __USER_DS);
6399 #endif
6400
6401         vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
6402                                   | (1 << VCPU_EXREG_RFLAGS)
6403                                   | (1 << VCPU_EXREG_CPL)
6404                                   | (1 << VCPU_EXREG_PDPTR)
6405                                   | (1 << VCPU_EXREG_SEGMENTS)
6406                                   | (1 << VCPU_EXREG_CR3));
6407         vcpu->arch.regs_dirty = 0;
6408
6409         vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
6410
6411         if (is_guest_mode(vcpu)) {
6412                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6413                 vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
6414                 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
6415                         vmcs12->idt_vectoring_error_code =
6416                                 vmcs_read32(IDT_VECTORING_ERROR_CODE);
6417                         vmcs12->vm_exit_instruction_len =
6418                                 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6419                 }
6420         }
6421
6422         vmx->loaded_vmcs->launched = 1;
6423
6424         vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
6425         trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
6426
6427         vmx_complete_atomic_exit(vmx);
6428         vmx_recover_nmi_blocking(vmx);
6429         vmx_complete_interrupts(vmx);
6430 }
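
/*
 * A minimal sketch of the lazy register caching that the regs_avail reset
 * above feeds: registers whose bits are cleared in regs_avail (here RIP,
 * RSP, RFLAGS, PDPTRs, CR3, segments, CPL) are fetched from the VMCS only
 * on first use after the exit.  The pattern follows kvm_cache_regs.h; the
 * helper itself is hypothetical and not used anywhere in this file.
 */
static inline unsigned long cached_guest_reg(struct kvm_vcpu *vcpu,
                                             enum kvm_reg reg)
{
        if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
                vmx_cache_reg(vcpu, reg);       /* e.g. vmcs_readl(GUEST_RIP) */
        return vcpu->arch.regs[reg];
}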
6431
6432 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
6433 {
6434         struct vcpu_vmx *vmx = to_vmx(vcpu);
6435
6436         free_vpid(vmx);
6437         free_nested(vmx);
6438         free_loaded_vmcs(vmx->loaded_vmcs);
6439         kfree(vmx->guest_msrs);
6440         kvm_vcpu_uninit(vcpu);
6441         kmem_cache_free(kvm_vcpu_cache, vmx);
6442 }
6443
6444 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
6445 {
6446         int err;
6447         struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
6448         int cpu;
6449
6450         if (!vmx)
6451                 return ERR_PTR(-ENOMEM);
6452
6453         allocate_vpid(vmx);
6454
6455         err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
6456         if (err)
6457                 goto free_vcpu;
6458
6459         vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
6460         err = -ENOMEM;
6461         if (!vmx->guest_msrs) {
6462                 goto uninit_vcpu;
6463         }
6464
6465         vmx->loaded_vmcs = &vmx->vmcs01;
6466         vmx->loaded_vmcs->vmcs = alloc_vmcs();
6467         if (!vmx->loaded_vmcs->vmcs)
6468                 goto free_msrs;
6469         if (!vmm_exclusive)
6470                 kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
6471         loaded_vmcs_init(vmx->loaded_vmcs);
6472         if (!vmm_exclusive)
6473                 kvm_cpu_vmxoff();
6474
6475         cpu = get_cpu();
6476         vmx_vcpu_load(&vmx->vcpu, cpu);
6477         vmx->vcpu.cpu = cpu;
6478         err = vmx_vcpu_setup(vmx);
6479         vmx_vcpu_put(&vmx->vcpu);
6480         put_cpu();
6481         if (err)
6482                 goto free_vmcs;
6483         if (vm_need_virtualize_apic_accesses(kvm)) {
6484                 err = alloc_apic_access_page(kvm);
6485                 if (err)
6486                         goto free_vmcs;
        }
6487
6488         if (enable_ept) {
6489                 if (!kvm->arch.ept_identity_map_addr)
6490                         kvm->arch.ept_identity_map_addr =
6491                                 VMX_EPT_IDENTITY_PAGETABLE_ADDR;
6492                 err = -ENOMEM;
6493                 if (alloc_identity_pagetable(kvm) != 0)
6494                         goto free_vmcs;
6495                 if (!init_rmode_identity_map(kvm))
6496                         goto free_vmcs;
6497         }
6498
6499         vmx->nested.current_vmptr = -1ull;
6500         vmx->nested.current_vmcs12 = NULL;
6501
6502         return &vmx->vcpu;
6503
6504 free_vmcs:
6505         free_loaded_vmcs(vmx->loaded_vmcs);
6506 free_msrs:
6507         kfree(vmx->guest_msrs);
6508 uninit_vcpu:
6509         kvm_vcpu_uninit(&vmx->vcpu);
6510 free_vcpu:
6511         free_vpid(vmx);
6512         kmem_cache_free(kvm_vcpu_cache, vmx);
6513         return ERR_PTR(err);
6514 }
6515
6516 static void __init vmx_check_processor_compat(void *rtn)
6517 {
6518         struct vmcs_config vmcs_conf;
6519
6520         *(int *)rtn = 0;
6521         if (setup_vmcs_config(&vmcs_conf) < 0)
6522                 *(int *)rtn = -EIO;
6523         if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
6524                 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
6525                                 smp_processor_id());
6526                 *(int *)rtn = -EIO;
6527         }
6528 }
6529
6530 static int get_ept_level(void)
6531 {
6532         return VMX_EPT_DEFAULT_GAW + 1;
6533 }
6534
6535 static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
6536 {
6537         u64 ret;
6538
6539         /* For the VT-d and EPT combination:
6540          * 1. MMIO: always map as UC
6541          * 2. EPT with VT-d:
6542          *   a. VT-d without the snooping control feature: we can't guarantee
6543          *      cache coherency, so try to trust the guest's memory type.
6544          *   b. VT-d with the snooping control feature: the snooping control of
6545          *      the VT-d engine guarantees cache correctness, so just map as WB
6546          *      to stay consistent with the host (i.e., the same as item 3).
6547          * 3. EPT without VT-d: always map as WB and set IPAT=1 to stay
6548          *    consistent with the host MTRR.
6549          */
6550         if (is_mmio)
6551                 ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
6552         else if (vcpu->kvm->arch.iommu_domain &&
6553                 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
6554                 ret = kvm_get_guest_memory_type(vcpu, gfn) <<
6555                       VMX_EPT_MT_EPTE_SHIFT;
6556         else
6557                 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
6558                         | VMX_EPT_IPAT_BIT;
6559
6560         return ret;
6561 }
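
/*
 * A minimal sketch of where the value computed above ends up: in an EPT
 * leaf PTE, bits 5:3 hold the memory type and bit 6 ("ignore PAT", IPAT)
 * makes the CPU disregard the guest PAT for the mapping.  The helper is
 * hypothetical and only illustrates the bit layout implied by
 * VMX_EPT_MT_EPTE_SHIFT and VMX_EPT_IPAT_BIT; it is not used in this file.
 */
static inline u64 ept_pte_set_memtype(u64 pte, u8 memtype, bool ignore_pat)
{
        pte |= (u64)memtype << VMX_EPT_MT_EPTE_SHIFT;   /* bits 5:3 */
        if (ignore_pat)
                pte |= VMX_EPT_IPAT_BIT;                /* bit 6 */
        return pte;
}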
6562
6563 static int vmx_get_lpage_level(void)
6564 {
6565         if (enable_ept && !cpu_has_vmx_ept_1g_page())
6566                 return PT_DIRECTORY_LEVEL;
6567         else
6568                 /* Shadow paging, or EPT with 1GB page support */
6569                 return PT_PDPE_LEVEL;
6570 }
6571
6572 static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
6573 {
6574         struct kvm_cpuid_entry2 *best;
6575         struct vcpu_vmx *vmx = to_vmx(vcpu);
6576         u32 exec_control;
6577
6578         vmx->rdtscp_enabled = false;
6579         if (vmx_rdtscp_supported()) {
6580                 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6581                 if (exec_control & SECONDARY_EXEC_RDTSCP) {
6582                         best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
6583                         if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
6584                                 vmx->rdtscp_enabled = true;
6585                         else {
6586                                 exec_control &= ~SECONDARY_EXEC_RDTSCP;
6587                                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
6588                                                 exec_control);
6589                         }
6590                 }
6591         }
6592
6593         /* Exposing INVPCID only when PCID is exposed */
6594         best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
6595         if (vmx_invpcid_supported() &&
6596             best && (best->ebx & bit(X86_FEATURE_INVPCID)) &&
6597             guest_cpuid_has_pcid(vcpu)) {
6598                 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6599                 exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
6600                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
6601                              exec_control);
6602         } else {
6603                 if (cpu_has_secondary_exec_ctrls()) {
6604                         exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6605                         exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
6606                         vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
6607                                      exec_control);
6608                 }
6609                 if (best)
6610                         best->ebx &= ~bit(X86_FEATURE_INVPCID);
6611         }
6612 }
6613
6614 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
6615 {
6616         if (func == 1 && nested)
6617                 entry->ecx |= bit(X86_FEATURE_VMX);
6618 }
6619
6620 /*
6621  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
6622  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
6623  * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
6624  * guest in a way that will both be appropriate to L1's requests, and our
6625  * needs. In addition to modifying the active vmcs (which is vmcs02), this
6626  * function also has other necessary side effects, such as setting various
6627  * vcpu->arch fields.
6628  */
6629 static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6630 {
6631         struct vcpu_vmx *vmx = to_vmx(vcpu);
6632         u32 exec_control;
6633
6634         vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
6635         vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
6636         vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
6637         vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
6638         vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
6639         vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
6640         vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
6641         vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
6642         vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
6643         vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
6644         vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
6645         vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
6646         vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
6647         vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
6648         vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
6649         vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
6650         vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
6651         vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
6652         vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
6653         vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
6654         vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
6655         vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
6656         vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
6657         vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
6658         vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
6659         vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
6660         vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
6661         vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
6662         vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
6663         vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
6664         vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
6665         vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
6666         vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
6667         vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
6668         vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
6669         vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
6670
6671         vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
6672         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
6673                 vmcs12->vm_entry_intr_info_field);
6674         vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
6675                 vmcs12->vm_entry_exception_error_code);
6676         vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
6677                 vmcs12->vm_entry_instruction_len);
6678         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
6679                 vmcs12->guest_interruptibility_info);
6680         vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
6681         vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
6682         vmcs_writel(GUEST_DR7, vmcs12->guest_dr7);
6683         vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
6684         vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
6685                 vmcs12->guest_pending_dbg_exceptions);
6686         vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
6687         vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
6688
6689         vmcs_write64(VMCS_LINK_POINTER, -1ull);
6690
6691         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
6692                 (vmcs_config.pin_based_exec_ctrl |
6693                  vmcs12->pin_based_vm_exec_control));
6694
6695         /*
6696          * Whether page-faults are trapped is determined by a combination of
6697          * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
6698          * If enable_ept, L0 doesn't care about page faults and we should
6699          * set all of these to L1's desires. However, if !enable_ept, L0 does
6700          * care about (at least some) page faults, and because it is not easy
6701          * (if at all possible?) to merge L0 and L1's desires, we simply ask
6702          * to exit on each and every L2 page fault. This is done by setting
6703          * MASK=MATCH=0 and (see below) EB.PF=1.
6704          * Note that below we don't need special code to set EB.PF beyond the
6705          * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
6706          * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
6707          * !enable_ept, EB.PF is 1, so the "or" will always be 1.
6708          *
6709          * A problem with this approach (when !enable_ept) is that L1 may be
6710          * injected with more page faults than it asked for. This could have
6711          * caused problems, but in practice existing hypervisors don't care.
6712          * To fix this, we will need to emulate the PFEC checking (on the L1
6713          * page tables), using walk_addr(), when injecting PFs to L1.
6714          */
6715         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
6716                 enable_ept ? vmcs12->page_fault_error_code_mask : 0);
6717         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
6718                 enable_ept ? vmcs12->page_fault_error_code_match : 0);
6719
6720         if (cpu_has_secondary_exec_ctrls()) {
6721                 u32 exec_control = vmx_secondary_exec_control(vmx);
6722                 if (!vmx->rdtscp_enabled)
6723                         exec_control &= ~SECONDARY_EXEC_RDTSCP;
6724                 /* Take the following fields only from vmcs12 */
6725                 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6726                 if (nested_cpu_has(vmcs12,
6727                                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
6728                         exec_control |= vmcs12->secondary_vm_exec_control;
6729
6730                 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
6731                         /*
6732                          * Translate L1 physical address to host physical
6733                          * address for vmcs02. Keep the page pinned, so this
6734                          * physical address remains valid. We keep a reference
6735                          * to it so we can release it later.
6736                          */
6737                         if (vmx->nested.apic_access_page) /* shouldn't happen */
6738                                 nested_release_page(vmx->nested.apic_access_page);
6739                         vmx->nested.apic_access_page =
6740                                 nested_get_page(vcpu, vmcs12->apic_access_addr);
6741                         /*
6742                          * If translation failed, no matter: This feature asks
6743                          * to exit when accessing the given address, and if it
6744                          * can never be accessed, this feature won't do
6745                          * anything anyway.
6746                          */
6747                         if (!vmx->nested.apic_access_page)
6748                                 exec_control &=
6749                                   ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6750                         else
6751                                 vmcs_write64(APIC_ACCESS_ADDR,
6752                                   page_to_phys(vmx->nested.apic_access_page));
6753                 }
6754
6755                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
6756         }
6757
6758
6759         /*
6760          * Set host-state according to L0's settings (vmcs12 is irrelevant here)
6761          * Some constant fields are set here by vmx_set_constant_host_state().
6762          * Other fields are different per CPU, and will be set later when
6763          * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
6764          */
6765         vmx_set_constant_host_state();
6766
6767         /*
6768          * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
6769          * entry, but only if the current (host) sp changed from the value
6770          * we wrote last (vmx->host_rsp). This cache is no longer relevant
6771          * if we switch vmcs, and rather than hold a separate cache per vmcs,
6772          * here we just force the write to happen on entry.
6773          */
6774         vmx->host_rsp = 0;
6775
6776         exec_control = vmx_exec_control(vmx); /* L0's desires */
6777         exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
6778         exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
6779         exec_control &= ~CPU_BASED_TPR_SHADOW;
6780         exec_control |= vmcs12->cpu_based_vm_exec_control;
6781         /*
6782          * Merging of IO and MSR bitmaps not currently supported.
6783          * Rather, exit every time.
6784          */
6785         exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
6786         exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
6787         exec_control |= CPU_BASED_UNCOND_IO_EXITING;
6788
6789         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
6790
6791         /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
6792          * bitwise-or of what L1 wants to trap for L2, and what we want to
6793          * trap. Note that CR0.TS also needs updating - we do this later.
6794          */
6795         update_exception_bitmap(vcpu);
6796         vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
6797         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
6798
6799         /* Note: IA32E_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
6800         vmcs_write32(VM_EXIT_CONTROLS,
6801                 vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
6802         vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
6803                 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
6804
6805         if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
6806                 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
6807         else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
6808                 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
6809
6810
6811         set_cr4_guest_host_mask(vmx);
6812
6813         if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
6814                 vmcs_write64(TSC_OFFSET,
6815                         vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
6816         else
6817                 vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
6818
6819         if (enable_vpid) {
6820                 /*
6821                  * Trivially support vpid by letting L2s share their parent
6822                  * L1's vpid. TODO: move to a more elaborate solution, giving
6823                  * each L2 its own vpid and exposing the vpid feature to L1.
6824                  */
6825                 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
6826                 vmx_flush_tlb(vcpu);
6827         }
6828
6829         if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
6830                 vcpu->arch.efer = vmcs12->guest_ia32_efer;
6831         if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
6832                 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
6833         else
6834                 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
6835         /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
6836         vmx_set_efer(vcpu, vcpu->arch.efer);
6837
6838         /*
6839          * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
6840          * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
6841          * The CR0_READ_SHADOW is what L2 should have expected to read given
6842  * the specifications by L1; it's not enough to take
6843  * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may
6844  * have more bits set than L1 expected.
6845          */
6846         vmx_set_cr0(vcpu, vmcs12->guest_cr0);
6847         vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
6848
6849         vmx_set_cr4(vcpu, vmcs12->guest_cr4);
6850         vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
6851
6852         /* shadow page tables on either EPT or shadow page tables */
6853         kvm_set_cr3(vcpu, vmcs12->guest_cr3);
6854         kvm_mmu_reset_context(vcpu);
6855
6856         kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
6857         kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
6858 }
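
/*
 * A minimal sketch of the SDM rule behind the PFEC_MASK/PFEC_MATCH comment
 * in prepare_vmcs02() above: a guest page fault causes a VM exit iff bit PF
 * of the exception bitmap equals the outcome of the mask/match test on the
 * fault's error code.  With MASK=MATCH=0 the test is always true, so
 * EB.PF=1 then means "exit on every page fault".  The helper is
 * hypothetical and not used anywhere in this file.
 */
static inline bool pf_would_cause_vmexit(u32 exception_bitmap, u32 pfec,
                                         u32 pfec_mask, u32 pfec_match)
{
        bool eb_pf = exception_bitmap & (1u << PF_VECTOR);
        bool match = (pfec & pfec_mask) == pfec_match;

        return eb_pf == match;
}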
6859
6860 /*
6861  * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
6862  * for running an L2 nested guest.
6863  */
6864 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
6865 {
6866         struct vmcs12 *vmcs12;
6867         struct vcpu_vmx *vmx = to_vmx(vcpu);
6868         int cpu;
6869         struct loaded_vmcs *vmcs02;
6870
6871         if (!nested_vmx_check_permission(vcpu) ||
6872             !nested_vmx_check_vmcs12(vcpu))
6873                 return 1;
6874
6875         skip_emulated_instruction(vcpu);
6876         vmcs12 = get_vmcs12(vcpu);
6877
6878         /*
6879          * The nested entry process starts with enforcing various prerequisites
6880          * on vmcs12 as required by the Intel SDM, and acting appropriately when
6881          * they fail: As the SDM explains, some conditions should cause the
6882          * instruction to fail, while others will cause the instruction to seem
6883          * to succeed, but return an EXIT_REASON_INVALID_STATE.
6884          * To speed up the normal (success) code path, we should avoid checking
6885          * for misconfigurations which will anyway be caught by the processor
6886          * when using the merged vmcs02.
6887          */
6888         if (vmcs12->launch_state == launch) {
6889                 nested_vmx_failValid(vcpu,
6890                         launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
6891                                : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
6892                 return 1;
6893         }
6894
6895         if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
6896                         !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) {
6897                 /*TODO: Also verify bits beyond physical address width are 0*/
6898                 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6899                 return 1;
6900         }
6901
6902         if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
6903                         !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) {
6904                 /*TODO: Also verify bits beyond physical address width are 0*/
6905                 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6906                 return 1;
6907         }
6908
6909         if (vmcs12->vm_entry_msr_load_count > 0 ||
6910             vmcs12->vm_exit_msr_load_count > 0 ||
6911             vmcs12->vm_exit_msr_store_count > 0) {
6912                 pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n",
6913                                     __func__);
6914                 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6915                 return 1;
6916         }
6917
6918         if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
6919               nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) ||
6920             !vmx_control_verify(vmcs12->secondary_vm_exec_control,
6921               nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
6922             !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
6923               nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
6924             !vmx_control_verify(vmcs12->vm_exit_controls,
6925               nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) ||
6926             !vmx_control_verify(vmcs12->vm_entry_controls,
6927               nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high))
6928         {
6929                 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6930                 return 1;
6931         }
6932
6933         if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
6934             ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
6935                 nested_vmx_failValid(vcpu,
6936                         VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
6937                 return 1;
6938         }
6939
6940         if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
6941             ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
6942                 nested_vmx_entry_failure(vcpu, vmcs12,
6943                         EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
6944                 return 1;
6945         }
6946         if (vmcs12->vmcs_link_pointer != -1ull) {
6947                 nested_vmx_entry_failure(vcpu, vmcs12,
6948                         EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
6949                 return 1;
6950         }
6951
6952         /*
6953          * We're finally done with prerequisite checking, and can start with
6954          * the nested entry.
6955          */
6956
6957         vmcs02 = nested_get_current_vmcs02(vmx);
6958         if (!vmcs02)
6959                 return -ENOMEM;
6960
6961         enter_guest_mode(vcpu);
6962
6963         vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
6964
6965         cpu = get_cpu();
6966         vmx->loaded_vmcs = vmcs02;
6967         vmx_vcpu_put(vcpu);
6968         vmx_vcpu_load(vcpu, cpu);
6969         vcpu->cpu = cpu;
6970         put_cpu();
6971
6972         vmcs12->launch_state = 1;
6973
6974         prepare_vmcs02(vcpu, vmcs12);
6975
6976         /*
6977          * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
6978          * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
6979          * returned as far as L1 is concerned. It will only return (and set
6980          * the success flag) when L2 exits (see nested_vmx_vmexit()).
6981          */
6982         return 1;
6983 }
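
/*
 * A minimal sketch of the control-field checks performed above: each VMX
 * capability MSR pair yields an "allowed-0" word (bits that must be 1) and
 * an "allowed-1" word (bits that may be 1), and a vmcs12 control value is
 * acceptable only if it honours both.  The helper is hypothetical; the code
 * above relies on vmx_control_verify() for this purpose.
 */
static inline bool nested_control_in_range(u32 control, u32 allowed0,
                                           u32 allowed1)
{
        return (control & allowed0) == allowed0 &&      /* mandatory bits set */
               (control & ~allowed1) == 0;              /* no disallowed bits */
}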
6984
6985 /*
6986  * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
6987  * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
6988  * This function returns the new value we should put in vmcs12.guest_cr0.
6989  * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
6990  *  1. Bits that neither L0 nor L1 trapped were set directly by L2 and are now
6991  *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
6992  *     didn't trap the bit, because if L1 did, so would L0).
6993  *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
6994  *     been modified by L2, and L1 knows it. So just leave the old value of
6995  *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
6996  *     isn't relevant, because if L0 traps this bit it can set it to anything.
6997  *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
6998  *     changed these bits, and therefore they need to be updated, but L0
6999  *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
7000  *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
7001  */
7002 static inline unsigned long
7003 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7004 {
7005         return
7006         /*1*/   (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
7007         /*2*/   (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
7008         /*3*/   (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
7009                         vcpu->arch.cr0_guest_owned_bits));
7010 }
7011
7012 static inline unsigned long
7013 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7014 {
7015         return
7016         /*1*/   (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
7017         /*2*/   (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
7018         /*3*/   (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
7019                         vcpu->arch.cr4_guest_owned_bits));
7020 }
7021
7022 /*
7023  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
7024  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
7025  * and this function updates it to reflect the changes to the guest state while
7026  * L2 was running (and perhaps made some exits which were handled directly by L0
7027  * without going back to L1), and to reflect the exit reason.
7028  * Note that we do not have to copy all VMCS fields here, just those that
7029  * could have been changed by the L2 guest or by the exit - i.e., the
7030  * guest-state and exit-information fields only. Other fields are modified
7031  * by L1 with VMWRITE, which already writes to vmcs12 directly.
7032  */
7033 void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7034 {
7035         /* update guest state fields: */
7036         vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
7037         vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
7038
7039         kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
7040         vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
7041         vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
7042         vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
7043
7044         vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
7045         vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
7046         vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
7047         vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
7048         vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
7049         vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
7050         vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
7051         vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
7052         vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
7053         vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
7054         vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
7055         vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
7056         vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
7057         vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
7058         vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
7059         vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
7060         vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
7061         vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
7062         vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
7063         vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
7064         vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
7065         vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
7066         vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
7067         vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
7068         vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
7069         vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
7070         vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
7071         vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
7072         vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
7073         vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
7074         vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
7075         vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
7076         vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
7077         vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
7078         vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
7079         vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
7080
7081         vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
7082         vmcs12->guest_interruptibility_info =
7083                 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
7084         vmcs12->guest_pending_dbg_exceptions =
7085                 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
7086
7087         /* TODO: These cannot have changed unless we have MSR bitmaps and
7088          * the relevant bit asks not to trap the change */
7089         vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
7090         if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
7091                 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
7092         vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
7093         vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
7094         vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
7095
7096         /* update exit information fields: */
7097
7098         vmcs12->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
7099         vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7100
7101         vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
7102         vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
7103         vmcs12->idt_vectoring_info_field =
7104                 vmcs_read32(IDT_VECTORING_INFO_FIELD);
7105         vmcs12->idt_vectoring_error_code =
7106                 vmcs_read32(IDT_VECTORING_ERROR_CODE);
7107         vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
7108         vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7109
7110         /* clear vm-entry fields which are to be cleared on exit */
7111         if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
7112                 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
7113 }
7114
7115 /*
7116  * A part of what we need to do when the nested L2 guest exits and we want
7117  * to run its L1 parent is to reset L1's guest state to the host state
7118  * specified in vmcs12.
7119  * This function is to be called not only on normal nested exit, but also on
7120  * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
7121  * Failures During or After Loading Guest State").
7122  * This function should be called when the active VMCS is L1's (vmcs01).
7123  */
7124 void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7125 {
7126         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
7127                 vcpu->arch.efer = vmcs12->host_ia32_efer;
7128         if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
7129                 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
7130         else
7131                 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
7132         vmx_set_efer(vcpu, vcpu->arch.efer);
7133
7134         kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
7135         kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
7136         /*
7137          * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
7138          * actually changed, because it depends on the current state of
7139          * fpu_active (which may have changed).
7140          * Note that vmx_set_cr0 refers to efer set above.
7141          */
7142         kvm_set_cr0(vcpu, vmcs12->host_cr0);
7143         /*
7144          * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
7145          * to apply the same changes to L1's vmcs. We just set cr0 correctly,
7146          * but we also need to update cr0_guest_host_mask and exception_bitmap.
7147          */
7148         update_exception_bitmap(vcpu);
7149         vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
7150         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
7151
7152         /*
7153          * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
7154  * (KVM doesn't change it) - no reason to call set_cr4_guest_host_mask();
7155          */
7156         vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
7157         kvm_set_cr4(vcpu, vmcs12->host_cr4);
7158
7159         /* Load vmcs12's host CR3; works for both EPT and shadow page tables. */
7160         kvm_set_cr3(vcpu, vmcs12->host_cr3);
7161         kvm_mmu_reset_context(vcpu);
7162
7163         if (enable_vpid) {
7164                 /*
7165                  * Trivially support vpid by letting L2s share their parent
7166                  * L1's vpid. TODO: move to a more elaborate solution, giving
7167                  * each L2 its own vpid and exposing the vpid feature to L1.
7168                  */
7169                 vmx_flush_tlb(vcpu);
7170         }
7171
7172
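             /*
              * Load vmcs12's remaining host-state fields into vmcs01's
              * guest-state fields: from L0's point of view L1, "the host", is
              * itself just a guest, so its state lives in the GUEST_* fields.
              */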
7173         vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
7174         vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
7175         vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
7176         vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
7177         vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
7178         vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base);
7179         vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base);
7180         vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
7181         vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
7182         vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
7183         vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
7184         vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
7185         vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
7186         vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
7187         vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
7188
7189         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
7190                 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
7191         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
7192                 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
7193                         vmcs12->host_ia32_perf_global_ctrl);
7194 }
7195
7196 /*
7197  * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
7198  * and modify vmcs12 to make it see what it would expect to see there if
7199  * L2 were its real guest. Must only be called when in L2 (is_guest_mode()).
7200  */
7201 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
7202 {
7203         struct vcpu_vmx *vmx = to_vmx(vcpu);
7204         int cpu;
7205         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7206
7207         leave_guest_mode(vcpu);
7208         prepare_vmcs12(vcpu, vmcs12);
7209
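             /* Switch the loaded VMCS from vmcs02 back to L1's vmcs01. */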
7210         cpu = get_cpu();
7211         vmx->loaded_vmcs = &vmx->vmcs01;
7212         vmx_vcpu_put(vcpu);
7213         vmx_vcpu_load(vcpu, cpu);
7214         vcpu->cpu = cpu;
7215         put_cpu();
7216
7217         /* if no vmcs02 cache requested, remove the one we used */
7218         if (VMCS02_POOL_SIZE == 0)
7219                 nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
7220
7221         load_vmcs12_host_state(vcpu, vmcs12);
7222
7223         /* Update TSC_OFFSET if TSC was changed while L2 ran */
7224         vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
7225
7226         /* This is needed for the same reason as it was in prepare_vmcs02 */
7227         vmx->host_rsp = 0;
7228
7229         /* Unpin physical memory we referred to in vmcs02 */
7230         if (vmx->nested.apic_access_page) {
7231                 nested_release_page(vmx->nested.apic_access_page);
7232                 vmx->nested.apic_access_page = NULL;
7233         }
7234
7235         /*
7236          * Exiting from L2 to L1, we're now back to L1 which thinks it just
7237          * finished a VMLAUNCH or VMRESUME instruction, so we need to set the
7238          * success or failure flag accordingly.
7239          */
7240         if (unlikely(vmx->fail)) {
7241                 vmx->fail = 0;
7242                 nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
7243         } else
7244                 nested_vmx_succeed(vcpu);
7245 }
7246
7247 /*
7248  * L1's failure to enter L2 is a subset of a normal exit, as explained in
7249  * 23.7 "VM-entry failures during or after loading guest state" (this also
7250  * lists the acceptable exit-reason and exit-qualification parameters).
7251  * It should only be called before L2 has actually started to run, and when
7252  * vmcs01 is current (it doesn't leave_guest_mode() or switch VMCSs).
7253  */
7254 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
7255                         struct vmcs12 *vmcs12,
7256                         u32 reason, unsigned long qualification)
7257 {
7258         load_vmcs12_host_state(vcpu, vmcs12);
7259         vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
7260         vmcs12->exit_qualification = qualification;
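             /*
              * A failure during or after loading guest state is delivered as a
              * VM exit rather than a VMfail, so from L1's point of view the
              * VMLAUNCH/VMRESUME instruction itself succeeded.
              */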
7261         nested_vmx_succeed(vcpu);
7262 }
7263
7264 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
7265                                struct x86_instruction_info *info,
7266                                enum x86_intercept_stage stage)
7267 {
7268         return X86EMUL_CONTINUE;
7269 }
7270
7271 static struct kvm_x86_ops vmx_x86_ops = {
7272         .cpu_has_kvm_support = cpu_has_kvm_support,
7273         .disabled_by_bios = vmx_disabled_by_bios,
7274         .hardware_setup = hardware_setup,
7275         .hardware_unsetup = hardware_unsetup,
7276         .check_processor_compatibility = vmx_check_processor_compat,
7277         .hardware_enable = hardware_enable,
7278         .hardware_disable = hardware_disable,
7279         .cpu_has_accelerated_tpr = report_flexpriority,
7280
7281         .vcpu_create = vmx_create_vcpu,
7282         .vcpu_free = vmx_free_vcpu,
7283         .vcpu_reset = vmx_vcpu_reset,
7284
7285         .prepare_guest_switch = vmx_save_host_state,
7286         .vcpu_load = vmx_vcpu_load,
7287         .vcpu_put = vmx_vcpu_put,
7288
7289         .update_db_bp_intercept = update_exception_bitmap,
7290         .get_msr = vmx_get_msr,
7291         .set_msr = vmx_set_msr,
7292         .get_segment_base = vmx_get_segment_base,
7293         .get_segment = vmx_get_segment,
7294         .set_segment = vmx_set_segment,
7295         .get_cpl = vmx_get_cpl,
7296         .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
7297         .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
7298         .decache_cr3 = vmx_decache_cr3,
7299         .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
7300         .set_cr0 = vmx_set_cr0,
7301         .set_cr3 = vmx_set_cr3,
7302         .set_cr4 = vmx_set_cr4,
7303         .set_efer = vmx_set_efer,
7304         .get_idt = vmx_get_idt,
7305         .set_idt = vmx_set_idt,
7306         .get_gdt = vmx_get_gdt,
7307         .set_gdt = vmx_set_gdt,
7308         .set_dr7 = vmx_set_dr7,
7309         .cache_reg = vmx_cache_reg,
7310         .get_rflags = vmx_get_rflags,
7311         .set_rflags = vmx_set_rflags,
7312         .fpu_activate = vmx_fpu_activate,
7313         .fpu_deactivate = vmx_fpu_deactivate,
7314
7315         .tlb_flush = vmx_flush_tlb,
7316
7317         .run = vmx_vcpu_run,
7318         .handle_exit = vmx_handle_exit,
7319         .skip_emulated_instruction = skip_emulated_instruction,
7320         .set_interrupt_shadow = vmx_set_interrupt_shadow,
7321         .get_interrupt_shadow = vmx_get_interrupt_shadow,
7322         .patch_hypercall = vmx_patch_hypercall,
7323         .set_irq = vmx_inject_irq,
7324         .set_nmi = vmx_inject_nmi,
7325         .queue_exception = vmx_queue_exception,
7326         .cancel_injection = vmx_cancel_injection,
7327         .interrupt_allowed = vmx_interrupt_allowed,
7328         .nmi_allowed = vmx_nmi_allowed,
7329         .get_nmi_mask = vmx_get_nmi_mask,
7330         .set_nmi_mask = vmx_set_nmi_mask,
7331         .enable_nmi_window = enable_nmi_window,
7332         .enable_irq_window = enable_irq_window,
7333         .update_cr8_intercept = update_cr8_intercept,
7334
7335         .set_tss_addr = vmx_set_tss_addr,
7336         .get_tdp_level = get_ept_level,
7337         .get_mt_mask = vmx_get_mt_mask,
7338
7339         .get_exit_info = vmx_get_exit_info,
7340
7341         .get_lpage_level = vmx_get_lpage_level,
7342
7343         .cpuid_update = vmx_cpuid_update,
7344
7345         .rdtscp_supported = vmx_rdtscp_supported,
7346         .invpcid_supported = vmx_invpcid_supported,
7347
7348         .set_supported_cpuid = vmx_set_supported_cpuid,
7349
7350         .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
7351
7352         .set_tsc_khz = vmx_set_tsc_khz,
7353         .read_tsc_offset = vmx_read_tsc_offset,
7354         .write_tsc_offset = vmx_write_tsc_offset,
7355         .adjust_tsc_offset = vmx_adjust_tsc_offset,
7356         .compute_tsc_offset = vmx_compute_tsc_offset,
7357         .read_l1_tsc = vmx_read_l1_tsc,
7358
7359         .set_tdp_cr3 = vmx_set_cr3,
7360
7361         .check_intercept = vmx_check_intercept,
7362 };
7363
7364 static int __init vmx_init(void)
7365 {
7366         int r, i;
7367
7368         rdmsrl_safe(MSR_EFER, &host_efer);
7369
7370         for (i = 0; i < NR_VMX_MSR; ++i)
7371                 kvm_define_shared_msr(i, vmx_msr_index[i]);
7372
7373         vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
7374         if (!vmx_io_bitmap_a)
7375                 return -ENOMEM;
7376
7377         r = -ENOMEM;
7378
7379         vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
7380         if (!vmx_io_bitmap_b)
7381                 goto out;
7382
7383         vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
7384         if (!vmx_msr_bitmap_legacy)
7385                 goto out1;
7386
7387
7388         vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
7389         if (!vmx_msr_bitmap_longmode)
7390                 goto out2;
7391
7392
7393         /*
7394          * Allow direct access to the PC debug port (it is often used for I/O
7395          * delays, but the vmexits simply slow things down).
7396          */
7397         memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
7398         clear_bit(0x80, vmx_io_bitmap_a);
7399
7400         memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
7401
7402         memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
7403         memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
7404
7405         set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
7406
7407         r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
7408                      __alignof__(struct vcpu_vmx), THIS_MODULE);
7409         if (r)
7410                 goto out3;
7411
7412 #ifdef CONFIG_KEXEC
7413         rcu_assign_pointer(crash_vmclear_loaded_vmcss,
7414                            crash_vmclear_local_loaded_vmcss);
7415 #endif
7416
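             /*
              * The MSR bitmaps were filled with all-ones above (intercept every
              * MSR); clear the bits for these frequently accessed MSRs so that
              * guest reads/writes of them do not cause VM exits.
              */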
7417         vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
7418         vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
7419         vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
7420         vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
7421         vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
7422         vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
7423
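             /*
              * With EPT, tell the MMU which bits to use for accessed/dirty
              * tracking (only when EPT A/D bits are enabled) and switch to
              * two-dimensional paging; otherwise fall back to shadow paging.
              */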
7424         if (enable_ept) {
7425                 kvm_mmu_set_mask_ptes(0ull,
7426                         (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
7427                         (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
7428                         0ull, VMX_EPT_EXECUTABLE_MASK);
7429                 ept_set_mmio_spte_mask();
7430                 kvm_enable_tdp();
7431         } else
7432                 kvm_disable_tdp();
7433
7434         return 0;
7435
7436 out3:
7437         free_page((unsigned long)vmx_msr_bitmap_longmode);
7438 out2:
7439         free_page((unsigned long)vmx_msr_bitmap_legacy);
7440 out1:
7441         free_page((unsigned long)vmx_io_bitmap_b);
7442 out:
7443         free_page((unsigned long)vmx_io_bitmap_a);
7444         return r;
7445 }
7446
7447 static void __exit vmx_exit(void)
7448 {
7449         free_page((unsigned long)vmx_msr_bitmap_legacy);
7450         free_page((unsigned long)vmx_msr_bitmap_longmode);
7451         free_page((unsigned long)vmx_io_bitmap_b);
7452         free_page((unsigned long)vmx_io_bitmap_a);
7453
7454 #ifdef CONFIG_KEXEC
7455         rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL);
7456         synchronize_rcu();
7457 #endif
7458
7459         kvm_exit();
7460 }
7461
7462 module_init(vmx_init)
7463 module_exit(vmx_exit)