Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input
[pandora-kernel.git] / arch / x86 / kvm / svm.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * AMD SVM support
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8  *
9  * Authors:
10  *   Yaniv Kamay  <yaniv@qumranet.com>
11  *   Avi Kivity   <avi@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17 #include <linux/kvm_host.h>
18
19 #include "irq.h"
20 #include "mmu.h"
21 #include "kvm_cache_regs.h"
22 #include "x86.h"
23
24 #include <linux/module.h>
25 #include <linux/kernel.h>
26 #include <linux/vmalloc.h>
27 #include <linux/highmem.h>
28 #include <linux/sched.h>
29 #include <linux/ftrace_event.h>
30 #include <linux/slab.h>
31
32 #include <asm/tlbflush.h>
33 #include <asm/desc.h>
34 #include <asm/kvm_para.h>
35
36 #include <asm/virtext.h>
37 #include "trace.h"
38
39 #define __ex(x) __kvm_handle_fault_on_reboot(x)
40
41 MODULE_AUTHOR("Qumranet");
42 MODULE_LICENSE("GPL");
43
44 #define IOPM_ALLOC_ORDER 2
45 #define MSRPM_ALLOC_ORDER 1
46
47 #define SEG_TYPE_LDT 2
48 #define SEG_TYPE_BUSY_TSS16 3
49
50 #define SVM_FEATURE_NPT            (1 <<  0)
51 #define SVM_FEATURE_LBRV           (1 <<  1)
52 #define SVM_FEATURE_SVML           (1 <<  2)
53 #define SVM_FEATURE_NRIP           (1 <<  3)
54 #define SVM_FEATURE_TSC_RATE       (1 <<  4)
55 #define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
56 #define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
57 #define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
58 #define SVM_FEATURE_PAUSE_FILTER   (1 << 10)
59
60 #define NESTED_EXIT_HOST        0       /* Exit handled on host level */
61 #define NESTED_EXIT_DONE        1       /* Exit caused nested vmexit  */
62 #define NESTED_EXIT_CONTINUE    2       /* Further checks needed      */
63
64 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
65
66 static bool erratum_383_found __read_mostly;
67
68 static const u32 host_save_user_msrs[] = {
69 #ifdef CONFIG_X86_64
70         MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
71         MSR_FS_BASE,
72 #endif
73         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
74 };
75
76 #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
77
78 struct kvm_vcpu;
79
80 struct nested_state {
81         struct vmcb *hsave;
82         u64 hsave_msr;
83         u64 vm_cr_msr;
84         u64 vmcb;
85
86         /* These are the merged vectors */
87         u32 *msrpm;
88
89         /* gpa pointers to the real vectors */
90         u64 vmcb_msrpm;
91         u64 vmcb_iopm;
92
93         /* A VMEXIT is required but not yet emulated */
94         bool exit_required;
95
96         /*
97          * If we vmexit during an instruction emulation we need this to restore
98          * the l1 guest rip after the emulation
99          */
100         unsigned long vmexit_rip;
101         unsigned long vmexit_rsp;
102         unsigned long vmexit_rax;
103
104         /* cache for intercepts of the guest */
105         u32 intercept_cr;
106         u32 intercept_dr;
107         u32 intercept_exceptions;
108         u64 intercept;
109
110         /* Nested Paging related state */
111         u64 nested_cr3;
112 };
113
114 #define MSRPM_OFFSETS   16
115 static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
116
117 struct vcpu_svm {
118         struct kvm_vcpu vcpu;
119         struct vmcb *vmcb;
120         unsigned long vmcb_pa;
121         struct svm_cpu_data *svm_data;
122         uint64_t asid_generation;
123         uint64_t sysenter_esp;
124         uint64_t sysenter_eip;
125
126         u64 next_rip;
127
128         u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
129         struct {
130                 u16 fs;
131                 u16 gs;
132                 u16 ldt;
133                 u64 gs_base;
134         } host;
135
136         u32 *msrpm;
137
138         ulong nmi_iret_rip;
139
140         struct nested_state nested;
141
142         bool nmi_singlestep;
143
144         unsigned int3_injected;
145         unsigned long int3_rip;
146         u32 apf_reason;
147 };
148
149 #define MSR_INVALID                     0xffffffffU
150
151 static struct svm_direct_access_msrs {
152         u32 index;   /* Index of the MSR */
153         bool always; /* True if intercept is always on */
154 } direct_access_msrs[] = {
155         { .index = MSR_STAR,                            .always = true  },
156         { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
157 #ifdef CONFIG_X86_64
158         { .index = MSR_GS_BASE,                         .always = true  },
159         { .index = MSR_FS_BASE,                         .always = true  },
160         { .index = MSR_KERNEL_GS_BASE,                  .always = true  },
161         { .index = MSR_LSTAR,                           .always = true  },
162         { .index = MSR_CSTAR,                           .always = true  },
163         { .index = MSR_SYSCALL_MASK,                    .always = true  },
164 #endif
165         { .index = MSR_IA32_LASTBRANCHFROMIP,           .always = false },
166         { .index = MSR_IA32_LASTBRANCHTOIP,             .always = false },
167         { .index = MSR_IA32_LASTINTFROMIP,              .always = false },
168         { .index = MSR_IA32_LASTINTTOIP,                .always = false },
169         { .index = MSR_INVALID,                         .always = false },
170 };
171
172 /* enable NPT for AMD64 and X86 with PAE */
173 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
174 static bool npt_enabled = true;
175 #else
176 static bool npt_enabled;
177 #endif
178 static int npt = 1;
179
180 module_param(npt, int, S_IRUGO);
181
182 static int nested = 1;
183 module_param(nested, int, S_IRUGO);
184
185 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
186 static void svm_complete_interrupts(struct vcpu_svm *svm);
187
188 static int nested_svm_exit_handled(struct vcpu_svm *svm);
189 static int nested_svm_intercept(struct vcpu_svm *svm);
190 static int nested_svm_vmexit(struct vcpu_svm *svm);
191 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
192                                       bool has_error_code, u32 error_code);
193
/*
 * Bit positions inside vmcb->control.clean. Each bit covers the group of
 * VMCB fields named in its comment; VMCB_DIRTY_MAX is the number of bits.
 */
enum {
	VMCB_INTERCEPTS,	/* Intercept vectors, TSC offset,
				   pause filter count */
	VMCB_PERM_MAP,		/* IOPM Base and MSRPM Base */
	VMCB_ASID,		/* ASID */
	VMCB_INTR,		/* int_ctl, int_vector */
	VMCB_NPT,		/* npt_en, nCR3, gPAT */
	VMCB_CR,		/* CR0, CR3, CR4, EFER */
	VMCB_DR,		/* DR6, DR7 */
	VMCB_DT,		/* GDT, IDT */
	VMCB_SEG,		/* CS, DS, SS, ES, CPL */
	VMCB_CR2,		/* CR2 only */
	VMCB_LBR,		/* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
	VMCB_DIRTY_MAX,
};
209
210 /* TPR and CR2 are always written before VMRUN */
211 #define VMCB_ALWAYS_DIRTY_MASK  ((1U << VMCB_INTR) | (1U << VMCB_CR2))
212
213 static inline void mark_all_dirty(struct vmcb *vmcb)
214 {
215         vmcb->control.clean = 0;
216 }
217
218 static inline void mark_all_clean(struct vmcb *vmcb)
219 {
220         vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
221                                & ~VMCB_ALWAYS_DIRTY_MASK;
222 }
223
224 static inline void mark_dirty(struct vmcb *vmcb, int bit)
225 {
226         vmcb->control.clean &= ~(1 << bit);
227 }
228
229 static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
230 {
231         return container_of(vcpu, struct vcpu_svm, vcpu);
232 }
233
234 static void recalc_intercepts(struct vcpu_svm *svm)
235 {
236         struct vmcb_control_area *c, *h;
237         struct nested_state *g;
238
239         mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
240
241         if (!is_guest_mode(&svm->vcpu))
242                 return;
243
244         c = &svm->vmcb->control;
245         h = &svm->nested.hsave->control;
246         g = &svm->nested;
247
248         c->intercept_cr = h->intercept_cr | g->intercept_cr;
249         c->intercept_dr = h->intercept_dr | g->intercept_dr;
250         c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
251         c->intercept = h->intercept | g->intercept;
252 }
253
254 static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
255 {
256         if (is_guest_mode(&svm->vcpu))
257                 return svm->nested.hsave;
258         else
259                 return svm->vmcb;
260 }
261
262 static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
263 {
264         struct vmcb *vmcb = get_host_vmcb(svm);
265
266         vmcb->control.intercept_cr |= (1U << bit);
267
268         recalc_intercepts(svm);
269 }
270
271 static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
272 {
273         struct vmcb *vmcb = get_host_vmcb(svm);
274
275         vmcb->control.intercept_cr &= ~(1U << bit);
276
277         recalc_intercepts(svm);
278 }
279
280 static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
281 {
282         struct vmcb *vmcb = get_host_vmcb(svm);
283
284         return vmcb->control.intercept_cr & (1U << bit);
285 }
286
287 static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
288 {
289         struct vmcb *vmcb = get_host_vmcb(svm);
290
291         vmcb->control.intercept_dr |= (1U << bit);
292
293         recalc_intercepts(svm);
294 }
295
296 static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
297 {
298         struct vmcb *vmcb = get_host_vmcb(svm);
299
300         vmcb->control.intercept_dr &= ~(1U << bit);
301
302         recalc_intercepts(svm);
303 }
304
305 static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
306 {
307         struct vmcb *vmcb = get_host_vmcb(svm);
308
309         vmcb->control.intercept_exceptions |= (1U << bit);
310
311         recalc_intercepts(svm);
312 }
313
314 static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
315 {
316         struct vmcb *vmcb = get_host_vmcb(svm);
317
318         vmcb->control.intercept_exceptions &= ~(1U << bit);
319
320         recalc_intercepts(svm);
321 }
322
323 static inline void set_intercept(struct vcpu_svm *svm, int bit)
324 {
325         struct vmcb *vmcb = get_host_vmcb(svm);
326
327         vmcb->control.intercept |= (1ULL << bit);
328
329         recalc_intercepts(svm);
330 }
331
332 static inline void clr_intercept(struct vcpu_svm *svm, int bit)
333 {
334         struct vmcb *vmcb = get_host_vmcb(svm);
335
336         vmcb->control.intercept &= ~(1ULL << bit);
337
338         recalc_intercepts(svm);
339 }
340
341 static inline void enable_gif(struct vcpu_svm *svm)
342 {
343         svm->vcpu.arch.hflags |= HF_GIF_MASK;
344 }
345
346 static inline void disable_gif(struct vcpu_svm *svm)
347 {
348         svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
349 }
350
351 static inline bool gif_set(struct vcpu_svm *svm)
352 {
353         return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
354 }
355
356 static unsigned long iopm_base;
357
358 struct kvm_ldttss_desc {
359         u16 limit0;
360         u16 base0;
361         unsigned base1:8, type:5, dpl:2, p:1;
362         unsigned limit1:4, zero0:3, g:1, base2:8;
363         u32 base3;
364         u32 zero1;
365 } __attribute__((packed));
366
367 struct svm_cpu_data {
368         int cpu;
369
370         u64 asid_generation;
371         u32 max_asid;
372         u32 next_asid;
373         struct kvm_ldttss_desc *tss_desc;
374
375         struct page *save_area;
376 };
377
378 static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
379 static uint32_t svm_features;
380
/*
 * NOTE(review): not referenced in the visible part of this file; presumably
 * a CPU number paired with a result code - confirm against its users.
 */
struct svm_init_data {
	int cpu;
	int r;
};
385
386 static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
387
388 #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
389 #define MSRS_RANGE_SIZE 2048
390 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
391
392 static u32 svm_msrpm_offset(u32 msr)
393 {
394         u32 offset;
395         int i;
396
397         for (i = 0; i < NUM_MSR_MAPS; i++) {
398                 if (msr < msrpm_ranges[i] ||
399                     msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
400                         continue;
401
402                 offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
403                 offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
404
405                 /* Now we have the u8 offset - but need the u32 offset */
406                 return offset / 4;
407         }
408
409         /* MSR not in any range */
410         return MSR_INVALID;
411 }
412
413 #define MAX_INST_SIZE 15
414
415 static inline void clgi(void)
416 {
417         asm volatile (__ex(SVM_CLGI));
418 }
419
420 static inline void stgi(void)
421 {
422         asm volatile (__ex(SVM_STGI));
423 }
424
425 static inline void invlpga(unsigned long addr, u32 asid)
426 {
427         asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
428 }
429
430 static int get_npt_level(void)
431 {
432 #ifdef CONFIG_X86_64
433         return PT64_ROOT_LEVEL;
434 #else
435         return PT32E_ROOT_LEVEL;
436 #endif
437 }
438
439 static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
440 {
441         vcpu->arch.efer = efer;
442         if (!npt_enabled && !(efer & EFER_LMA))
443                 efer &= ~EFER_LME;
444
445         to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
446         mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
447 }
448
449 static int is_external_interrupt(u32 info)
450 {
451         info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
452         return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
453 }
454
455 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
456 {
457         struct vcpu_svm *svm = to_svm(vcpu);
458         u32 ret = 0;
459
460         if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
461                 ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
462         return ret & mask;
463 }
464
465 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
466 {
467         struct vcpu_svm *svm = to_svm(vcpu);
468
469         if (mask == 0)
470                 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
471         else
472                 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
473
474 }
475
476 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
477 {
478         struct vcpu_svm *svm = to_svm(vcpu);
479
480         if (svm->vmcb->control.next_rip != 0)
481                 svm->next_rip = svm->vmcb->control.next_rip;
482
483         if (!svm->next_rip) {
484                 if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
485                                 EMULATE_DONE)
486                         printk(KERN_DEBUG "%s: NOP\n", __func__);
487                 return;
488         }
489         if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
490                 printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
491                        __func__, kvm_rip_read(vcpu), svm->next_rip);
492
493         kvm_rip_write(vcpu, svm->next_rip);
494         svm_set_interrupt_shadow(vcpu, 0);
495 }
496
497 static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
498                                 bool has_error_code, u32 error_code,
499                                 bool reinject)
500 {
501         struct vcpu_svm *svm = to_svm(vcpu);
502
503         /*
504          * If we are within a nested VM we'd better #VMEXIT and let the guest
505          * handle the exception
506          */
507         if (!reinject &&
508             nested_svm_check_exception(svm, nr, has_error_code, error_code))
509                 return;
510
511         if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
512                 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
513
514                 /*
515                  * For guest debugging where we have to reinject #BP if some
516                  * INT3 is guest-owned:
517                  * Emulate nRIP by moving RIP forward. Will fail if injection
518                  * raises a fault that is not intercepted. Still better than
519                  * failing in all cases.
520                  */
521                 skip_emulated_instruction(&svm->vcpu);
522                 rip = kvm_rip_read(&svm->vcpu);
523                 svm->int3_rip = rip + svm->vmcb->save.cs.base;
524                 svm->int3_injected = rip - old_rip;
525         }
526
527         svm->vmcb->control.event_inj = nr
528                 | SVM_EVTINJ_VALID
529                 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
530                 | SVM_EVTINJ_TYPE_EXEPT;
531         svm->vmcb->control.event_inj_err = error_code;
532 }
533
534 static void svm_init_erratum_383(void)
535 {
536         u32 low, high;
537         int err;
538         u64 val;
539
540         if (!cpu_has_amd_erratum(amd_erratum_383))
541                 return;
542
543         /* Use _safe variants to not break nested virtualization */
544         val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
545         if (err)
546                 return;
547
548         val |= (1ULL << 47);
549
550         low  = lower_32_bits(val);
551         high = upper_32_bits(val);
552
553         native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
554
555         erratum_383_found = true;
556 }
557
558 static int has_svm(void)
559 {
560         const char *msg;
561
562         if (!cpu_has_svm(&msg)) {
563                 printk(KERN_INFO "has_svm: %s\n", msg);
564                 return 0;
565         }
566
567         return 1;
568 }
569
/* Disable SVM on the calling CPU via cpu_svm_disable(); @garbage is unused. */
static void svm_hardware_disable(void *garbage)
{
	(void)garbage;

	cpu_svm_disable();
}
574
575 static int svm_hardware_enable(void *garbage)
576 {
577
578         struct svm_cpu_data *sd;
579         uint64_t efer;
580         struct desc_ptr gdt_descr;
581         struct desc_struct *gdt;
582         int me = raw_smp_processor_id();
583
584         rdmsrl(MSR_EFER, efer);
585         if (efer & EFER_SVME)
586                 return -EBUSY;
587
588         if (!has_svm()) {
589                 printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
590                        me);
591                 return -EINVAL;
592         }
593         sd = per_cpu(svm_data, me);
594
595         if (!sd) {
596                 printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
597                        me);
598                 return -EINVAL;
599         }
600
601         sd->asid_generation = 1;
602         sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
603         sd->next_asid = sd->max_asid + 1;
604
605         native_store_gdt(&gdt_descr);
606         gdt = (struct desc_struct *)gdt_descr.address;
607         sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
608
609         wrmsrl(MSR_EFER, efer | EFER_SVME);
610
611         wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
612
613         svm_init_erratum_383();
614
615         return 0;
616 }
617
618 static void svm_cpu_uninit(int cpu)
619 {
620         struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
621
622         if (!sd)
623                 return;
624
625         per_cpu(svm_data, raw_smp_processor_id()) = NULL;
626         __free_page(sd->save_area);
627         kfree(sd);
628 }
629
630 static int svm_cpu_init(int cpu)
631 {
632         struct svm_cpu_data *sd;
633         int r;
634
635         sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
636         if (!sd)
637                 return -ENOMEM;
638         sd->cpu = cpu;
639         sd->save_area = alloc_page(GFP_KERNEL);
640         r = -ENOMEM;
641         if (!sd->save_area)
642                 goto err_1;
643
644         per_cpu(svm_data, cpu) = sd;
645
646         return 0;
647
648 err_1:
649         kfree(sd);
650         return r;
651
652 }
653
654 static bool valid_msr_intercept(u32 index)
655 {
656         int i;
657
658         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
659                 if (direct_access_msrs[i].index == index)
660                         return true;
661
662         return false;
663 }
664
665 static void set_msr_interception(u32 *msrpm, unsigned msr,
666                                  int read, int write)
667 {
668         u8 bit_read, bit_write;
669         unsigned long tmp;
670         u32 offset;
671
672         /*
673          * If this warning triggers extend the direct_access_msrs list at the
674          * beginning of the file
675          */
676         WARN_ON(!valid_msr_intercept(msr));
677
678         offset    = svm_msrpm_offset(msr);
679         bit_read  = 2 * (msr & 0x0f);
680         bit_write = 2 * (msr & 0x0f) + 1;
681         tmp       = msrpm[offset];
682
683         BUG_ON(offset == MSR_INVALID);
684
685         read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
686         write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
687
688         msrpm[offset] = tmp;
689 }
690
691 static void svm_vcpu_init_msrpm(u32 *msrpm)
692 {
693         int i;
694
695         memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
696
697         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
698                 if (!direct_access_msrs[i].always)
699                         continue;
700
701                 set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
702         }
703 }
704
705 static void add_msr_offset(u32 offset)
706 {
707         int i;
708
709         for (i = 0; i < MSRPM_OFFSETS; ++i) {
710
711                 /* Offset already in list? */
712                 if (msrpm_offsets[i] == offset)
713                         return;
714
715                 /* Slot used by another offset? */
716                 if (msrpm_offsets[i] != MSR_INVALID)
717                         continue;
718
719                 /* Add offset to list */
720                 msrpm_offsets[i] = offset;
721
722                 return;
723         }
724
725         /*
726          * If this BUG triggers the msrpm_offsets table has an overflow. Just
727          * increase MSRPM_OFFSETS in this case.
728          */
729         BUG();
730 }
731
732 static void init_msrpm_offsets(void)
733 {
734         int i;
735
736         memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
737
738         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
739                 u32 offset;
740
741                 offset = svm_msrpm_offset(direct_access_msrs[i].index);
742                 BUG_ON(offset == MSR_INVALID);
743
744                 add_msr_offset(offset);
745         }
746 }
747
748 static void svm_enable_lbrv(struct vcpu_svm *svm)
749 {
750         u32 *msrpm = svm->msrpm;
751
752         svm->vmcb->control.lbr_ctl = 1;
753         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
754         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
755         set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
756         set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
757 }
758
759 static void svm_disable_lbrv(struct vcpu_svm *svm)
760 {
761         u32 *msrpm = svm->msrpm;
762
763         svm->vmcb->control.lbr_ctl = 0;
764         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
765         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
766         set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
767         set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
768 }
769
770 static __init int svm_hardware_setup(void)
771 {
772         int cpu;
773         struct page *iopm_pages;
774         void *iopm_va;
775         int r;
776
777         iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
778
779         if (!iopm_pages)
780                 return -ENOMEM;
781
782         iopm_va = page_address(iopm_pages);
783         memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
784         iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
785
786         init_msrpm_offsets();
787
788         if (boot_cpu_has(X86_FEATURE_NX))
789                 kvm_enable_efer_bits(EFER_NX);
790
791         if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
792                 kvm_enable_efer_bits(EFER_FFXSR);
793
794         if (nested) {
795                 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
796                 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
797         }
798
799         for_each_possible_cpu(cpu) {
800                 r = svm_cpu_init(cpu);
801                 if (r)
802                         goto err;
803         }
804
805         svm_features = cpuid_edx(SVM_CPUID_FUNC);
806
807         if (!boot_cpu_has(X86_FEATURE_NPT))
808                 npt_enabled = false;
809
810         if (npt_enabled && !npt) {
811                 printk(KERN_INFO "kvm: Nested Paging disabled\n");
812                 npt_enabled = false;
813         }
814
815         if (npt_enabled) {
816                 printk(KERN_INFO "kvm: Nested Paging enabled\n");
817                 kvm_enable_tdp();
818         } else
819                 kvm_disable_tdp();
820
821         return 0;
822
823 err:
824         __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
825         iopm_base = 0;
826         return r;
827 }
828
829 static __exit void svm_hardware_unsetup(void)
830 {
831         int cpu;
832
833         for_each_possible_cpu(cpu)
834                 svm_cpu_uninit(cpu);
835
836         __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
837         iopm_base = 0;
838 }
839
840 static void init_seg(struct vmcb_seg *seg)
841 {
842         seg->selector = 0;
843         seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
844                       SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
845         seg->limit = 0xffff;
846         seg->base = 0;
847 }
848
849 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
850 {
851         seg->selector = 0;
852         seg->attrib = SVM_SELECTOR_P_MASK | type;
853         seg->limit = 0xffff;
854         seg->base = 0;
855 }
856
857 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
858 {
859         struct vcpu_svm *svm = to_svm(vcpu);
860         u64 g_tsc_offset = 0;
861
862         if (is_guest_mode(vcpu)) {
863                 g_tsc_offset = svm->vmcb->control.tsc_offset -
864                                svm->nested.hsave->control.tsc_offset;
865                 svm->nested.hsave->control.tsc_offset = offset;
866         }
867
868         svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
869
870         mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
871 }
872
873 static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
874 {
875         struct vcpu_svm *svm = to_svm(vcpu);
876
877         svm->vmcb->control.tsc_offset += adjustment;
878         if (is_guest_mode(vcpu))
879                 svm->nested.hsave->control.tsc_offset += adjustment;
880         mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
881 }
882
883 static void init_vmcb(struct vcpu_svm *svm)
884 {
885         struct vmcb_control_area *control = &svm->vmcb->control;
886         struct vmcb_save_area *save = &svm->vmcb->save;
887
888         svm->vcpu.fpu_active = 1;
889         svm->vcpu.arch.hflags = 0;
890
891         set_cr_intercept(svm, INTERCEPT_CR0_READ);
892         set_cr_intercept(svm, INTERCEPT_CR3_READ);
893         set_cr_intercept(svm, INTERCEPT_CR4_READ);
894         set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
895         set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
896         set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
897         set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
898
899         set_dr_intercept(svm, INTERCEPT_DR0_READ);
900         set_dr_intercept(svm, INTERCEPT_DR1_READ);
901         set_dr_intercept(svm, INTERCEPT_DR2_READ);
902         set_dr_intercept(svm, INTERCEPT_DR3_READ);
903         set_dr_intercept(svm, INTERCEPT_DR4_READ);
904         set_dr_intercept(svm, INTERCEPT_DR5_READ);
905         set_dr_intercept(svm, INTERCEPT_DR6_READ);
906         set_dr_intercept(svm, INTERCEPT_DR7_READ);
907
908         set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
909         set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
910         set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
911         set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
912         set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
913         set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
914         set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
915         set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
916
917         set_exception_intercept(svm, PF_VECTOR);
918         set_exception_intercept(svm, UD_VECTOR);
919         set_exception_intercept(svm, MC_VECTOR);
920
921         set_intercept(svm, INTERCEPT_INTR);
922         set_intercept(svm, INTERCEPT_NMI);
923         set_intercept(svm, INTERCEPT_SMI);
924         set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
925         set_intercept(svm, INTERCEPT_CPUID);
926         set_intercept(svm, INTERCEPT_INVD);
927         set_intercept(svm, INTERCEPT_HLT);
928         set_intercept(svm, INTERCEPT_INVLPG);
929         set_intercept(svm, INTERCEPT_INVLPGA);
930         set_intercept(svm, INTERCEPT_IOIO_PROT);
931         set_intercept(svm, INTERCEPT_MSR_PROT);
932         set_intercept(svm, INTERCEPT_TASK_SWITCH);
933         set_intercept(svm, INTERCEPT_SHUTDOWN);
934         set_intercept(svm, INTERCEPT_VMRUN);
935         set_intercept(svm, INTERCEPT_VMMCALL);
936         set_intercept(svm, INTERCEPT_VMLOAD);
937         set_intercept(svm, INTERCEPT_VMSAVE);
938         set_intercept(svm, INTERCEPT_STGI);
939         set_intercept(svm, INTERCEPT_CLGI);
940         set_intercept(svm, INTERCEPT_SKINIT);
941         set_intercept(svm, INTERCEPT_WBINVD);
942         set_intercept(svm, INTERCEPT_MONITOR);
943         set_intercept(svm, INTERCEPT_MWAIT);
944         set_intercept(svm, INTERCEPT_XSETBV);
945
946         control->iopm_base_pa = iopm_base;
947         control->msrpm_base_pa = __pa(svm->msrpm);
948         control->int_ctl = V_INTR_MASKING_MASK;
949
950         init_seg(&save->es);
951         init_seg(&save->ss);
952         init_seg(&save->ds);
953         init_seg(&save->fs);
954         init_seg(&save->gs);
955
956         save->cs.selector = 0xf000;
957         /* Executable/Readable Code Segment */
958         save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
959                 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
960         save->cs.limit = 0xffff;
961         /*
962          * cs.base should really be 0xffff0000, but vmx can't handle that, so
963          * be consistent with it.
964          *
965          * Replace when we have real mode working for vmx.
966          */
967         save->cs.base = 0xf0000;
968
969         save->gdtr.limit = 0xffff;
970         save->idtr.limit = 0xffff;
971
972         init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
973         init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
974
975         svm_set_efer(&svm->vcpu, 0);
976         save->dr6 = 0xffff0ff0;
977         save->dr7 = 0x400;
978         save->rflags = 2;
979         save->rip = 0x0000fff0;
980         svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
981
982         /*
983          * This is the guest-visible cr0 value.
984          * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
985          */
986         svm->vcpu.arch.cr0 = 0;
987         (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
988
989         save->cr4 = X86_CR4_PAE;
990         /* rdx = ?? */
991
992         if (npt_enabled) {
993                 /* Setup VMCB for Nested Paging */
994                 control->nested_ctl = 1;
995                 clr_intercept(svm, INTERCEPT_TASK_SWITCH);
996                 clr_intercept(svm, INTERCEPT_INVLPG);
997                 clr_exception_intercept(svm, PF_VECTOR);
998                 clr_cr_intercept(svm, INTERCEPT_CR3_READ);
999                 clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1000                 save->g_pat = 0x0007040600070406ULL;
1001                 save->cr3 = 0;
1002                 save->cr4 = 0;
1003         }
1004         svm->asid_generation = 0;
1005
1006         svm->nested.vmcb = 0;
1007         svm->vcpu.arch.hflags = 0;
1008
1009         if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
1010                 control->pause_filter_count = 3000;
1011                 set_intercept(svm, INTERCEPT_PAUSE);
1012         }
1013
1014         mark_all_dirty(svm->vmcb);
1015
1016         enable_gif(svm);
1017 }
1018
1019 static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
1020 {
1021         struct vcpu_svm *svm = to_svm(vcpu);
1022
1023         init_vmcb(svm);
1024
1025         if (!kvm_vcpu_is_bsp(vcpu)) {
1026                 kvm_rip_write(vcpu, 0);
1027                 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
1028                 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
1029         }
1030         vcpu->arch.regs_avail = ~0;
1031         vcpu->arch.regs_dirty = ~0;
1032
1033         return 0;
1034 }
1035
1036 static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1037 {
1038         struct vcpu_svm *svm;
1039         struct page *page;
1040         struct page *msrpm_pages;
1041         struct page *hsave_page;
1042         struct page *nested_msrpm_pages;
1043         int err;
1044
1045         svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
1046         if (!svm) {
1047                 err = -ENOMEM;
1048                 goto out;
1049         }
1050
1051         err = kvm_vcpu_init(&svm->vcpu, kvm, id);
1052         if (err)
1053                 goto free_svm;
1054
1055         err = -ENOMEM;
1056         page = alloc_page(GFP_KERNEL);
1057         if (!page)
1058                 goto uninit;
1059
1060         msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
1061         if (!msrpm_pages)
1062                 goto free_page1;
1063
1064         nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
1065         if (!nested_msrpm_pages)
1066                 goto free_page2;
1067
1068         hsave_page = alloc_page(GFP_KERNEL);
1069         if (!hsave_page)
1070                 goto free_page3;
1071
1072         svm->nested.hsave = page_address(hsave_page);
1073
1074         svm->msrpm = page_address(msrpm_pages);
1075         svm_vcpu_init_msrpm(svm->msrpm);
1076
1077         svm->nested.msrpm = page_address(nested_msrpm_pages);
1078         svm_vcpu_init_msrpm(svm->nested.msrpm);
1079
1080         svm->vmcb = page_address(page);
1081         clear_page(svm->vmcb);
1082         svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
1083         svm->asid_generation = 0;
1084         init_vmcb(svm);
1085         kvm_write_tsc(&svm->vcpu, 0);
1086
1087         err = fx_init(&svm->vcpu);
1088         if (err)
1089                 goto free_page4;
1090
1091         svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1092         if (kvm_vcpu_is_bsp(&svm->vcpu))
1093                 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1094
1095         return &svm->vcpu;
1096
1097 free_page4:
1098         __free_page(hsave_page);
1099 free_page3:
1100         __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
1101 free_page2:
1102         __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
1103 free_page1:
1104         __free_page(page);
1105 uninit:
1106         kvm_vcpu_uninit(&svm->vcpu);
1107 free_svm:
1108         kmem_cache_free(kvm_vcpu_cache, svm);
1109 out:
1110         return ERR_PTR(err);
1111 }
1112
1113 static void svm_free_vcpu(struct kvm_vcpu *vcpu)
1114 {
1115         struct vcpu_svm *svm = to_svm(vcpu);
1116
1117         __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
1118         __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
1119         __free_page(virt_to_page(svm->nested.hsave));
1120         __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
1121         kvm_vcpu_uninit(vcpu);
1122         kmem_cache_free(kvm_vcpu_cache, svm);
1123 }
1124
1125 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1126 {
1127         struct vcpu_svm *svm = to_svm(vcpu);
1128         int i;
1129
1130         if (unlikely(cpu != vcpu->cpu)) {
1131                 svm->asid_generation = 0;
1132                 mark_all_dirty(svm->vmcb);
1133         }
1134
1135 #ifdef CONFIG_X86_64
1136         rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
1137 #endif
1138         savesegment(fs, svm->host.fs);
1139         savesegment(gs, svm->host.gs);
1140         svm->host.ldt = kvm_read_ldt();
1141
1142         for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1143                 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1144 }
1145
1146 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1147 {
1148         struct vcpu_svm *svm = to_svm(vcpu);
1149         int i;
1150
1151         ++vcpu->stat.host_state_reload;
1152         kvm_load_ldt(svm->host.ldt);
1153 #ifdef CONFIG_X86_64
1154         loadsegment(fs, svm->host.fs);
1155         wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
1156         load_gs_index(svm->host.gs);
1157 #else
1158 #ifdef CONFIG_X86_32_LAZY_GS
1159         loadsegment(gs, svm->host.gs);
1160 #endif
1161 #endif
1162         for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1163                 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1164 }
1165
1166 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1167 {
1168         return to_svm(vcpu)->vmcb->save.rflags;
1169 }
1170
1171 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1172 {
1173         to_svm(vcpu)->vmcb->save.rflags = rflags;
1174 }
1175
1176 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1177 {
1178         switch (reg) {
1179         case VCPU_EXREG_PDPTR:
1180                 BUG_ON(!npt_enabled);
1181                 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
1182                 break;
1183         default:
1184                 BUG();
1185         }
1186 }
1187
1188 static void svm_set_vintr(struct vcpu_svm *svm)
1189 {
1190         set_intercept(svm, INTERCEPT_VINTR);
1191 }
1192
1193 static void svm_clear_vintr(struct vcpu_svm *svm)
1194 {
1195         clr_intercept(svm, INTERCEPT_VINTR);
1196 }
1197
1198 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1199 {
1200         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1201
1202         switch (seg) {
1203         case VCPU_SREG_CS: return &save->cs;
1204         case VCPU_SREG_DS: return &save->ds;
1205         case VCPU_SREG_ES: return &save->es;
1206         case VCPU_SREG_FS: return &save->fs;
1207         case VCPU_SREG_GS: return &save->gs;
1208         case VCPU_SREG_SS: return &save->ss;
1209         case VCPU_SREG_TR: return &save->tr;
1210         case VCPU_SREG_LDTR: return &save->ldtr;
1211         }
1212         BUG();
1213         return NULL;
1214 }
1215
1216 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1217 {
1218         struct vmcb_seg *s = svm_seg(vcpu, seg);
1219
1220         return s->base;
1221 }
1222
1223 static void svm_get_segment(struct kvm_vcpu *vcpu,
1224                             struct kvm_segment *var, int seg)
1225 {
1226         struct vmcb_seg *s = svm_seg(vcpu, seg);
1227
1228         var->base = s->base;
1229         var->limit = s->limit;
1230         var->selector = s->selector;
1231         var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1232         var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1233         var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1234         var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1235         var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1236         var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1237         var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1238         var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
1239
1240         /*
1241          * AMD's VMCB does not have an explicit unusable field, so emulate it
1242          * for cross vendor migration purposes by "not present"
1243          */
1244         var->unusable = !var->present || (var->type == 0);
1245
1246         switch (seg) {
1247         case VCPU_SREG_CS:
1248                 /*
1249                  * SVM always stores 0 for the 'G' bit in the CS selector in
1250                  * the VMCB on a VMEXIT. This hurts cross-vendor migration:
1251                  * Intel's VMENTRY has a check on the 'G' bit.
1252                  */
1253                 var->g = s->limit > 0xfffff;
1254                 break;
1255         case VCPU_SREG_TR:
1256                 /*
1257                  * Work around a bug where the busy flag in the tr selector
1258                  * isn't exposed
1259                  */
1260                 var->type |= 0x2;
1261                 break;
1262         case VCPU_SREG_DS:
1263         case VCPU_SREG_ES:
1264         case VCPU_SREG_FS:
1265         case VCPU_SREG_GS:
1266                 /*
1267                  * The accessed bit must always be set in the segment
1268                  * descriptor cache, although it can be cleared in the
1269                  * descriptor, the cached bit always remains at 1. Since
1270                  * Intel has a check on this, set it here to support
1271                  * cross-vendor migration.
1272                  */
1273                 if (!var->unusable)
1274                         var->type |= 0x1;
1275                 break;
1276         case VCPU_SREG_SS:
1277                 /*
1278                  * On AMD CPUs sometimes the DB bit in the segment
1279                  * descriptor is left as 1, although the whole segment has
1280                  * been made unusable. Clear it here to pass an Intel VMX
1281                  * entry check when cross vendor migrating.
1282                  */
1283                 if (var->unusable)
1284                         var->db = 0;
1285                 break;
1286         }
1287 }
1288
1289 static int svm_get_cpl(struct kvm_vcpu *vcpu)
1290 {
1291         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1292
1293         return save->cpl;
1294 }
1295
1296 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1297 {
1298         struct vcpu_svm *svm = to_svm(vcpu);
1299
1300         dt->size = svm->vmcb->save.idtr.limit;
1301         dt->address = svm->vmcb->save.idtr.base;
1302 }
1303
1304 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1305 {
1306         struct vcpu_svm *svm = to_svm(vcpu);
1307
1308         svm->vmcb->save.idtr.limit = dt->size;
1309         svm->vmcb->save.idtr.base = dt->address ;
1310         mark_dirty(svm->vmcb, VMCB_DT);
1311 }
1312
1313 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1314 {
1315         struct vcpu_svm *svm = to_svm(vcpu);
1316
1317         dt->size = svm->vmcb->save.gdtr.limit;
1318         dt->address = svm->vmcb->save.gdtr.base;
1319 }
1320
1321 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1322 {
1323         struct vcpu_svm *svm = to_svm(vcpu);
1324
1325         svm->vmcb->save.gdtr.limit = dt->size;
1326         svm->vmcb->save.gdtr.base = dt->address ;
1327         mark_dirty(svm->vmcb, VMCB_DT);
1328 }
1329
/* Intentionally empty: this function does nothing on SVM. */
static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
}
1333
/* Intentionally empty: this function does nothing on SVM. */
static void svm_decache_cr3(struct kvm_vcpu *vcpu)
{
}
1337
/* Intentionally empty: this function does nothing on SVM. */
static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
}
1341
1342 static void update_cr0_intercept(struct vcpu_svm *svm)
1343 {
1344         ulong gcr0 = svm->vcpu.arch.cr0;
1345         u64 *hcr0 = &svm->vmcb->save.cr0;
1346
1347         if (!svm->vcpu.fpu_active)
1348                 *hcr0 |= SVM_CR0_SELECTIVE_MASK;
1349         else
1350                 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1351                         | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1352
1353         mark_dirty(svm->vmcb, VMCB_CR);
1354
1355         if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
1356                 clr_cr_intercept(svm, INTERCEPT_CR0_READ);
1357                 clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1358         } else {
1359                 set_cr_intercept(svm, INTERCEPT_CR0_READ);
1360                 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1361         }
1362 }
1363
1364 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1365 {
1366         struct vcpu_svm *svm = to_svm(vcpu);
1367
1368         if (is_guest_mode(vcpu)) {
1369                 /*
1370                  * We are here because we run in nested mode, the host kvm
1371                  * intercepts cr0 writes but the l1 hypervisor does not.
1372                  * But the L1 hypervisor may intercept selective cr0 writes.
1373                  * This needs to be checked here.
1374                  */
1375                 unsigned long old, new;
1376
1377                 /* Remove bits that would trigger a real cr0 write intercept */
1378                 old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
1379                 new = cr0 & SVM_CR0_SELECTIVE_MASK;
1380
1381                 if (old == new) {
1382                         /* cr0 write with ts and mp unchanged */
1383                         svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
1384                         if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) {
1385                                 svm->nested.vmexit_rip = kvm_rip_read(vcpu);
1386                                 svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
1387                                 svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
1388                                 return;
1389                         }
1390                 }
1391         }
1392
1393 #ifdef CONFIG_X86_64
1394         if (vcpu->arch.efer & EFER_LME) {
1395                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1396                         vcpu->arch.efer |= EFER_LMA;
1397                         svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1398                 }
1399
1400                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1401                         vcpu->arch.efer &= ~EFER_LMA;
1402                         svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1403                 }
1404         }
1405 #endif
1406         vcpu->arch.cr0 = cr0;
1407
1408         if (!npt_enabled)
1409                 cr0 |= X86_CR0_PG | X86_CR0_WP;
1410
1411         if (!vcpu->fpu_active)
1412                 cr0 |= X86_CR0_TS;
1413         /*
1414          * re-enable caching here because the QEMU bios
1415          * does not do it - this results in some delay at
1416          * reboot
1417          */
1418         cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1419         svm->vmcb->save.cr0 = cr0;
1420         mark_dirty(svm->vmcb, VMCB_CR);
1421         update_cr0_intercept(svm);
1422 }
1423
1424 static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1425 {
1426         unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
1427         unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1428
1429         if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1430                 svm_flush_tlb(vcpu);
1431
1432         vcpu->arch.cr4 = cr4;
1433         if (!npt_enabled)
1434                 cr4 |= X86_CR4_PAE;
1435         cr4 |= host_cr4_mce;
1436         to_svm(vcpu)->vmcb->save.cr4 = cr4;
1437         mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1438 }
1439
1440 static void svm_set_segment(struct kvm_vcpu *vcpu,
1441                             struct kvm_segment *var, int seg)
1442 {
1443         struct vcpu_svm *svm = to_svm(vcpu);
1444         struct vmcb_seg *s = svm_seg(vcpu, seg);
1445
1446         s->base = var->base;
1447         s->limit = var->limit;
1448         s->selector = var->selector;
1449         if (var->unusable)
1450                 s->attrib = 0;
1451         else {
1452                 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1453                 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1454                 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1455                 s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
1456                 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1457                 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1458                 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1459                 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1460         }
1461         if (seg == VCPU_SREG_CS)
1462                 svm->vmcb->save.cpl
1463                         = (svm->vmcb->save.cs.attrib
1464                            >> SVM_SELECTOR_DPL_SHIFT) & 3;
1465
1466         mark_dirty(svm->vmcb, VMCB_SEG);
1467 }
1468
1469 static void update_db_intercept(struct kvm_vcpu *vcpu)
1470 {
1471         struct vcpu_svm *svm = to_svm(vcpu);
1472
1473         clr_exception_intercept(svm, DB_VECTOR);
1474         clr_exception_intercept(svm, BP_VECTOR);
1475
1476         if (svm->nmi_singlestep)
1477                 set_exception_intercept(svm, DB_VECTOR);
1478
1479         if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1480                 if (vcpu->guest_debug &
1481                     (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
1482                         set_exception_intercept(svm, DB_VECTOR);
1483                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1484                         set_exception_intercept(svm, BP_VECTOR);
1485         } else
1486                 vcpu->guest_debug = 0;
1487 }
1488
1489 static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1490 {
1491         struct vcpu_svm *svm = to_svm(vcpu);
1492
1493         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1494                 svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
1495         else
1496                 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1497
1498         mark_dirty(svm->vmcb, VMCB_DR);
1499
1500         update_db_intercept(vcpu);
1501 }
1502
1503 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1504 {
1505         if (sd->next_asid > sd->max_asid) {
1506                 ++sd->asid_generation;
1507                 sd->next_asid = 1;
1508                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1509         }
1510
1511         svm->asid_generation = sd->asid_generation;
1512         svm->vmcb->control.asid = sd->next_asid++;
1513
1514         mark_dirty(svm->vmcb, VMCB_ASID);
1515 }
1516
1517 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1518 {
1519         struct vcpu_svm *svm = to_svm(vcpu);
1520
1521         svm->vmcb->save.dr7 = value;
1522         mark_dirty(svm->vmcb, VMCB_DR);
1523 }
1524
1525 static int pf_interception(struct vcpu_svm *svm)
1526 {
1527         u64 fault_address = svm->vmcb->control.exit_info_2;
1528         u32 error_code;
1529         int r = 1;
1530
1531         switch (svm->apf_reason) {
1532         default:
1533                 error_code = svm->vmcb->control.exit_info_1;
1534
1535                 trace_kvm_page_fault(fault_address, error_code);
1536                 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
1537                         kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1538                 r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
1539                         svm->vmcb->control.insn_bytes,
1540                         svm->vmcb->control.insn_len);
1541                 break;
1542         case KVM_PV_REASON_PAGE_NOT_PRESENT:
1543                 svm->apf_reason = 0;
1544                 local_irq_disable();
1545                 kvm_async_pf_task_wait(fault_address);
1546                 local_irq_enable();
1547                 break;
1548         case KVM_PV_REASON_PAGE_READY:
1549                 svm->apf_reason = 0;
1550                 local_irq_disable();
1551                 kvm_async_pf_task_wake(fault_address);
1552                 local_irq_enable();
1553                 break;
1554         }
1555         return r;
1556 }
1557
1558 static int db_interception(struct vcpu_svm *svm)
1559 {
1560         struct kvm_run *kvm_run = svm->vcpu.run;
1561
1562         if (!(svm->vcpu.guest_debug &
1563               (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1564                 !svm->nmi_singlestep) {
1565                 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1566                 return 1;
1567         }
1568
1569         if (svm->nmi_singlestep) {
1570                 svm->nmi_singlestep = false;
1571                 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1572                         svm->vmcb->save.rflags &=
1573                                 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1574                 update_db_intercept(&svm->vcpu);
1575         }
1576
1577         if (svm->vcpu.guest_debug &
1578             (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1579                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1580                 kvm_run->debug.arch.pc =
1581                         svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1582                 kvm_run->debug.arch.exception = DB_VECTOR;
1583                 return 0;
1584         }
1585
1586         return 1;
1587 }
1588
1589 static int bp_interception(struct vcpu_svm *svm)
1590 {
1591         struct kvm_run *kvm_run = svm->vcpu.run;
1592
1593         kvm_run->exit_reason = KVM_EXIT_DEBUG;
1594         kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1595         kvm_run->debug.arch.exception = BP_VECTOR;
1596         return 0;
1597 }
1598
1599 static int ud_interception(struct vcpu_svm *svm)
1600 {
1601         int er;
1602
1603         er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
1604         if (er != EMULATE_DONE)
1605                 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1606         return 1;
1607 }
1608
1609 static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1610 {
1611         struct vcpu_svm *svm = to_svm(vcpu);
1612
1613         clr_exception_intercept(svm, NM_VECTOR);
1614
1615         svm->vcpu.fpu_active = 1;
1616         update_cr0_intercept(svm);
1617 }
1618
1619 static int nm_interception(struct vcpu_svm *svm)
1620 {
1621         svm_fpu_activate(&svm->vcpu);
1622         return 1;
1623 }
1624
1625 static bool is_erratum_383(void)
1626 {
1627         int err, i;
1628         u64 value;
1629
1630         if (!erratum_383_found)
1631                 return false;
1632
1633         value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
1634         if (err)
1635                 return false;
1636
1637         /* Bit 62 may or may not be set for this mce */
1638         value &= ~(1ULL << 62);
1639
1640         if (value != 0xb600000000010015ULL)
1641                 return false;
1642
1643         /* Clear MCi_STATUS registers */
1644         for (i = 0; i < 6; ++i)
1645                 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
1646
1647         value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
1648         if (!err) {
1649                 u32 low, high;
1650
1651                 value &= ~(1ULL << 2);
1652                 low    = lower_32_bits(value);
1653                 high   = upper_32_bits(value);
1654
1655                 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
1656         }
1657
1658         /* Flush tlb to evict multi-match entries */
1659         __flush_tlb_all();
1660
1661         return true;
1662 }
1663
1664 static void svm_handle_mce(struct vcpu_svm *svm)
1665 {
1666         if (is_erratum_383()) {
1667                 /*
1668                  * Erratum 383 triggered. Guest state is corrupt so kill the
1669                  * guest.
1670                  */
1671                 pr_err("KVM: Guest triggered AMD Erratum 383\n");
1672
1673                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
1674
1675                 return;
1676         }
1677
1678         /*
1679          * On an #MC intercept the MCE handler is not called automatically in
1680          * the host. So do it by hand here.
1681          */
1682         asm volatile (
1683                 "int $0x12\n");
1684         /* not sure if we ever come back to this point */
1685
1686         return;
1687 }
1688
/* #MC intercept handler: does nothing here and returns 1. */
static int mc_interception(struct vcpu_svm *svm)
{
        return 1;
}
1693
1694 static int shutdown_interception(struct vcpu_svm *svm)
1695 {
1696         struct kvm_run *kvm_run = svm->vcpu.run;
1697
1698         /*
1699          * VMCB is undefined after a SHUTDOWN intercept
1700          * so reinitialize it.
1701          */
1702         clear_page(svm->vmcb);
1703         init_vmcb(svm);
1704
1705         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1706         return 0;
1707 }
1708
1709 static int io_interception(struct vcpu_svm *svm)
1710 {
1711         struct kvm_vcpu *vcpu = &svm->vcpu;
1712         u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1713         int size, in, string;
1714         unsigned port;
1715
1716         ++svm->vcpu.stat.io_exits;
1717         string = (io_info & SVM_IOIO_STR_MASK) != 0;
1718         in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1719         if (string || in)
1720                 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
1721
1722         port = io_info >> 16;
1723         size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1724         svm->next_rip = svm->vmcb->control.exit_info_2;
1725         skip_emulated_instruction(&svm->vcpu);
1726
1727         return kvm_fast_pio_out(vcpu, size, port);
1728 }
1729
/* NMI intercept handler: does nothing here and returns 1. */
static int nmi_interception(struct vcpu_svm *svm)
{
        return 1;
}
1734
1735 static int intr_interception(struct vcpu_svm *svm)
1736 {
1737         ++svm->vcpu.stat.irq_exits;
1738         return 1;
1739 }
1740
/* Generic handler for intercepts that need no action: returns 1. */
static int nop_on_interception(struct vcpu_svm *svm)
{
        return 1;
}
1745
1746 static int halt_interception(struct vcpu_svm *svm)
1747 {
1748         svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1749         skip_emulated_instruction(&svm->vcpu);
1750         return kvm_emulate_halt(&svm->vcpu);
1751 }
1752
1753 static int vmmcall_interception(struct vcpu_svm *svm)
1754 {
1755         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1756         skip_emulated_instruction(&svm->vcpu);
1757         kvm_emulate_hypercall(&svm->vcpu);
1758         return 1;
1759 }
1760
1761 static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
1762 {
1763         struct vcpu_svm *svm = to_svm(vcpu);
1764
1765         return svm->nested.nested_cr3;
1766 }
1767
1768 static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
1769                                    unsigned long root)
1770 {
1771         struct vcpu_svm *svm = to_svm(vcpu);
1772
1773         svm->vmcb->control.nested_cr3 = root;
1774         mark_dirty(svm->vmcb, VMCB_NPT);
1775         svm_flush_tlb(vcpu);
1776 }
1777
1778 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
1779                                        struct x86_exception *fault)
1780 {
1781         struct vcpu_svm *svm = to_svm(vcpu);
1782
1783         svm->vmcb->control.exit_code = SVM_EXIT_NPF;
1784         svm->vmcb->control.exit_code_hi = 0;
1785         svm->vmcb->control.exit_info_1 = fault->error_code;
1786         svm->vmcb->control.exit_info_2 = fault->address;
1787
1788         nested_svm_vmexit(svm);
1789 }
1790
1791 static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
1792 {
1793         int r;
1794
1795         r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
1796
1797         vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
1798         vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
1799         vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
1800         vcpu->arch.mmu.shadow_root_level = get_npt_level();
1801         vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
1802
1803         return r;
1804 }
1805
1806 static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
1807 {
1808         vcpu->arch.walk_mmu = &vcpu->arch.mmu;
1809 }
1810
1811 static int nested_svm_check_permissions(struct vcpu_svm *svm)
1812 {
1813         if (!(svm->vcpu.arch.efer & EFER_SVME)
1814             || !is_paging(&svm->vcpu)) {
1815                 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1816                 return 1;
1817         }
1818
1819         if (svm->vmcb->save.cpl) {
1820                 kvm_inject_gp(&svm->vcpu, 0);
1821                 return 1;
1822         }
1823
1824        return 0;
1825 }
1826
1827 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1828                                       bool has_error_code, u32 error_code)
1829 {
1830         int vmexit;
1831
1832         if (!is_guest_mode(&svm->vcpu))
1833                 return 0;
1834
1835         svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
1836         svm->vmcb->control.exit_code_hi = 0;
1837         svm->vmcb->control.exit_info_1 = error_code;
1838         svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
1839
1840         vmexit = nested_svm_intercept(svm);
1841         if (vmexit == NESTED_EXIT_DONE)
1842                 svm->nested.exit_required = true;
1843
1844         return vmexit;
1845 }
1846
1847 /* This function returns true if it is save to enable the irq window */
1848 static inline bool nested_svm_intr(struct vcpu_svm *svm)
1849 {
1850         if (!is_guest_mode(&svm->vcpu))
1851                 return true;
1852
1853         if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
1854                 return true;
1855
1856         if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
1857                 return false;
1858
1859         /*
1860          * if vmexit was already requested (by intercepted exception
1861          * for instance) do not overwrite it with "external interrupt"
1862          * vmexit.
1863          */
1864         if (svm->nested.exit_required)
1865                 return false;
1866
1867         svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
1868         svm->vmcb->control.exit_info_1 = 0;
1869         svm->vmcb->control.exit_info_2 = 0;
1870
1871         if (svm->nested.intercept & 1ULL) {
1872                 /*
1873                  * The #vmexit can't be emulated here directly because this
1874                  * code path runs with irqs and preemtion disabled. A
1875                  * #vmexit emulation might sleep. Only signal request for
1876                  * the #vmexit here.
1877                  */
1878                 svm->nested.exit_required = true;
1879                 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
1880                 return false;
1881         }
1882
1883         return true;
1884 }
1885
1886 /* This function returns true if it is save to enable the nmi window */
1887 static inline bool nested_svm_nmi(struct vcpu_svm *svm)
1888 {
1889         if (!is_guest_mode(&svm->vcpu))
1890                 return true;
1891
1892         if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
1893                 return true;
1894
1895         svm->vmcb->control.exit_code = SVM_EXIT_NMI;
1896         svm->nested.exit_required = true;
1897
1898         return false;
1899 }
1900
1901 static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
1902 {
1903         struct page *page;
1904
1905         might_sleep();
1906
1907         page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
1908         if (is_error_page(page))
1909                 goto error;
1910
1911         *_page = page;
1912
1913         return kmap(page);
1914
1915 error:
1916         kvm_release_page_clean(page);
1917         kvm_inject_gp(&svm->vcpu, 0);
1918
1919         return NULL;
1920 }
1921
/*
 * Counterpart of nested_svm_map(): drop the kernel mapping of @page and
 * release the page, marking it dirty.
 */
static void nested_svm_unmap(struct page *page)
{
	kunmap(page);
	kvm_release_page_dirty(page);
}
1927
1928 static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
1929 {
1930         unsigned port;
1931         u8 val, bit;
1932         u64 gpa;
1933
1934         if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
1935                 return NESTED_EXIT_HOST;
1936
1937         port = svm->vmcb->control.exit_info_1 >> 16;
1938         gpa  = svm->nested.vmcb_iopm + (port / 8);
1939         bit  = port % 8;
1940         val  = 0;
1941
1942         if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
1943                 val &= (1 << bit);
1944
1945         return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
1946 }
1947
1948 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
1949 {
1950         u32 offset, msr, value;
1951         int write, mask;
1952
1953         if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
1954                 return NESTED_EXIT_HOST;
1955
1956         msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1957         offset = svm_msrpm_offset(msr);
1958         write  = svm->vmcb->control.exit_info_1 & 1;
1959         mask   = 1 << ((2 * (msr & 0xf)) + write);
1960
1961         if (offset == MSR_INVALID)
1962                 return NESTED_EXIT_DONE;
1963
1964         /* Offset is in 32 bit units but need in 8 bit units */
1965         offset *= 4;
1966
1967         if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
1968                 return NESTED_EXIT_DONE;
1969
1970         return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
1971 }
1972
1973 static int nested_svm_exit_special(struct vcpu_svm *svm)
1974 {
1975         u32 exit_code = svm->vmcb->control.exit_code;
1976
1977         switch (exit_code) {
1978         case SVM_EXIT_INTR:
1979         case SVM_EXIT_NMI:
1980         case SVM_EXIT_EXCP_BASE + MC_VECTOR:
1981                 return NESTED_EXIT_HOST;
1982         case SVM_EXIT_NPF:
1983                 /* For now we are always handling NPFs when using them */
1984                 if (npt_enabled)
1985                         return NESTED_EXIT_HOST;
1986                 break;
1987         case SVM_EXIT_EXCP_BASE + PF_VECTOR:
1988                 /* When we're shadowing, trap PFs, but not async PF */
1989                 if (!npt_enabled && svm->apf_reason == 0)
1990                         return NESTED_EXIT_HOST;
1991                 break;
1992         case SVM_EXIT_EXCP_BASE + NM_VECTOR:
1993                 nm_interception(svm);
1994                 break;
1995         default:
1996                 break;
1997         }
1998
1999         return NESTED_EXIT_CONTINUE;
2000 }
2001
2002 /*
2003  * If this function returns true, this #vmexit was already handled
2004  */
2005 static int nested_svm_intercept(struct vcpu_svm *svm)
2006 {
2007         u32 exit_code = svm->vmcb->control.exit_code;
2008         int vmexit = NESTED_EXIT_HOST;
2009
2010         switch (exit_code) {
2011         case SVM_EXIT_MSR:
2012                 vmexit = nested_svm_exit_handled_msr(svm);
2013                 break;
2014         case SVM_EXIT_IOIO:
2015                 vmexit = nested_svm_intercept_ioio(svm);
2016                 break;
2017         case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
2018                 u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
2019                 if (svm->nested.intercept_cr & bit)
2020                         vmexit = NESTED_EXIT_DONE;
2021                 break;
2022         }
2023         case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
2024                 u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
2025                 if (svm->nested.intercept_dr & bit)
2026                         vmexit = NESTED_EXIT_DONE;
2027                 break;
2028         }
2029         case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
2030                 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
2031                 if (svm->nested.intercept_exceptions & excp_bits)
2032                         vmexit = NESTED_EXIT_DONE;
2033                 /* async page fault always cause vmexit */
2034                 else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
2035                          svm->apf_reason != 0)
2036                         vmexit = NESTED_EXIT_DONE;
2037                 break;
2038         }
2039         case SVM_EXIT_ERR: {
2040                 vmexit = NESTED_EXIT_DONE;
2041                 break;
2042         }
2043         default: {
2044                 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
2045                 if (svm->nested.intercept & exit_bits)
2046                         vmexit = NESTED_EXIT_DONE;
2047         }
2048         }
2049
2050         return vmexit;
2051 }
2052
2053 static int nested_svm_exit_handled(struct vcpu_svm *svm)
2054 {
2055         int vmexit;
2056
2057         vmexit = nested_svm_intercept(svm);
2058
2059         if (vmexit == NESTED_EXIT_DONE)
2060                 nested_svm_vmexit(svm);
2061
2062         return vmexit;
2063 }
2064
2065 static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
2066 {
2067         struct vmcb_control_area *dst  = &dst_vmcb->control;
2068         struct vmcb_control_area *from = &from_vmcb->control;
2069
2070         dst->intercept_cr         = from->intercept_cr;
2071         dst->intercept_dr         = from->intercept_dr;
2072         dst->intercept_exceptions = from->intercept_exceptions;
2073         dst->intercept            = from->intercept;
2074         dst->iopm_base_pa         = from->iopm_base_pa;
2075         dst->msrpm_base_pa        = from->msrpm_base_pa;
2076         dst->tsc_offset           = from->tsc_offset;
2077         dst->asid                 = from->asid;
2078         dst->tlb_ctl              = from->tlb_ctl;
2079         dst->int_ctl              = from->int_ctl;
2080         dst->int_vector           = from->int_vector;
2081         dst->int_state            = from->int_state;
2082         dst->exit_code            = from->exit_code;
2083         dst->exit_code_hi         = from->exit_code_hi;
2084         dst->exit_info_1          = from->exit_info_1;
2085         dst->exit_info_2          = from->exit_info_2;
2086         dst->exit_int_info        = from->exit_int_info;
2087         dst->exit_int_info_err    = from->exit_int_info_err;
2088         dst->nested_ctl           = from->nested_ctl;
2089         dst->event_inj            = from->event_inj;
2090         dst->event_inj_err        = from->event_inj_err;
2091         dst->nested_cr3           = from->nested_cr3;
2092         dst->lbr_ctl              = from->lbr_ctl;
2093 }
2094
2095 static int nested_svm_vmexit(struct vcpu_svm *svm)
2096 {
2097         struct vmcb *nested_vmcb;
2098         struct vmcb *hsave = svm->nested.hsave;
2099         struct vmcb *vmcb = svm->vmcb;
2100         struct page *page;
2101
2102         trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
2103                                        vmcb->control.exit_info_1,
2104                                        vmcb->control.exit_info_2,
2105                                        vmcb->control.exit_int_info,
2106                                        vmcb->control.exit_int_info_err);
2107
2108         nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
2109         if (!nested_vmcb)
2110                 return 1;
2111
2112         /* Exit Guest-Mode */
2113         leave_guest_mode(&svm->vcpu);
2114         svm->nested.vmcb = 0;
2115
2116         /* Give the current vmcb to the guest */
2117         disable_gif(svm);
2118
2119         nested_vmcb->save.es     = vmcb->save.es;
2120         nested_vmcb->save.cs     = vmcb->save.cs;
2121         nested_vmcb->save.ss     = vmcb->save.ss;
2122         nested_vmcb->save.ds     = vmcb->save.ds;
2123         nested_vmcb->save.gdtr   = vmcb->save.gdtr;
2124         nested_vmcb->save.idtr   = vmcb->save.idtr;
2125         nested_vmcb->save.efer   = svm->vcpu.arch.efer;
2126         nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
2127         nested_vmcb->save.cr3    = kvm_read_cr3(&svm->vcpu);
2128         nested_vmcb->save.cr2    = vmcb->save.cr2;
2129         nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
2130         nested_vmcb->save.rflags = vmcb->save.rflags;
2131         nested_vmcb->save.rip    = vmcb->save.rip;
2132         nested_vmcb->save.rsp    = vmcb->save.rsp;
2133         nested_vmcb->save.rax    = vmcb->save.rax;
2134         nested_vmcb->save.dr7    = vmcb->save.dr7;
2135         nested_vmcb->save.dr6    = vmcb->save.dr6;
2136         nested_vmcb->save.cpl    = vmcb->save.cpl;
2137
2138         nested_vmcb->control.int_ctl           = vmcb->control.int_ctl;
2139         nested_vmcb->control.int_vector        = vmcb->control.int_vector;
2140         nested_vmcb->control.int_state         = vmcb->control.int_state;
2141         nested_vmcb->control.exit_code         = vmcb->control.exit_code;
2142         nested_vmcb->control.exit_code_hi      = vmcb->control.exit_code_hi;
2143         nested_vmcb->control.exit_info_1       = vmcb->control.exit_info_1;
2144         nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
2145         nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
2146         nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
2147         nested_vmcb->control.next_rip          = vmcb->control.next_rip;
2148
2149         /*
2150          * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
2151          * to make sure that we do not lose injected events. So check event_inj
2152          * here and copy it to exit_int_info if it is valid.
2153          * Exit_int_info and event_inj can't be both valid because the case
2154          * below only happens on a VMRUN instruction intercept which has
2155          * no valid exit_int_info set.
2156          */
2157         if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
2158                 struct vmcb_control_area *nc = &nested_vmcb->control;
2159
2160                 nc->exit_int_info     = vmcb->control.event_inj;
2161                 nc->exit_int_info_err = vmcb->control.event_inj_err;
2162         }
2163
2164         nested_vmcb->control.tlb_ctl           = 0;
2165         nested_vmcb->control.event_inj         = 0;
2166         nested_vmcb->control.event_inj_err     = 0;
2167
2168         /* We always set V_INTR_MASKING and remember the old value in hflags */
2169         if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
2170                 nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
2171
2172         /* Restore the original control entries */
2173         copy_vmcb_control_area(vmcb, hsave);
2174
2175         kvm_clear_exception_queue(&svm->vcpu);
2176         kvm_clear_interrupt_queue(&svm->vcpu);
2177
2178         svm->nested.nested_cr3 = 0;
2179
2180         /* Restore selected save entries */
2181         svm->vmcb->save.es = hsave->save.es;
2182         svm->vmcb->save.cs = hsave->save.cs;
2183         svm->vmcb->save.ss = hsave->save.ss;
2184         svm->vmcb->save.ds = hsave->save.ds;
2185         svm->vmcb->save.gdtr = hsave->save.gdtr;
2186         svm->vmcb->save.idtr = hsave->save.idtr;
2187         svm->vmcb->save.rflags = hsave->save.rflags;
2188         svm_set_efer(&svm->vcpu, hsave->save.efer);
2189         svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
2190         svm_set_cr4(&svm->vcpu, hsave->save.cr4);
2191         if (npt_enabled) {
2192                 svm->vmcb->save.cr3 = hsave->save.cr3;
2193                 svm->vcpu.arch.cr3 = hsave->save.cr3;
2194         } else {
2195                 (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
2196         }
2197         kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
2198         kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
2199         kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
2200         svm->vmcb->save.dr7 = 0;
2201         svm->vmcb->save.cpl = 0;
2202         svm->vmcb->control.exit_int_info = 0;
2203
2204         mark_all_dirty(svm->vmcb);
2205
2206         nested_svm_unmap(page);
2207
2208         nested_svm_uninit_mmu_context(&svm->vcpu);
2209         kvm_mmu_reset_context(&svm->vcpu);
2210         kvm_mmu_load(&svm->vcpu);
2211
2212         return 0;
2213 }
2214
2215 static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
2216 {
2217         /*
2218          * This function merges the msr permission bitmaps of kvm and the
2219          * nested vmcb. It is omptimized in that it only merges the parts where
2220          * the kvm msr permission bitmap may contain zero bits
2221          */
2222         int i;
2223
2224         if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
2225                 return true;
2226
2227         for (i = 0; i < MSRPM_OFFSETS; i++) {
2228                 u32 value, p;
2229                 u64 offset;
2230
2231                 if (msrpm_offsets[i] == 0xffffffff)
2232                         break;
2233
2234                 p      = msrpm_offsets[i];
2235                 offset = svm->nested.vmcb_msrpm + (p * 4);
2236
2237                 if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
2238                         return false;
2239
2240                 svm->nested.msrpm[p] = svm->msrpm[p] | value;
2241         }
2242
2243         svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
2244
2245         return true;
2246 }
2247
2248 static bool nested_vmcb_checks(struct vmcb *vmcb)
2249 {
2250         if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
2251                 return false;
2252
2253         if (vmcb->control.asid == 0)
2254                 return false;
2255
2256         if (vmcb->control.nested_ctl && !npt_enabled)
2257                 return false;
2258
2259         return true;
2260 }
2261
2262 static bool nested_svm_vmrun(struct vcpu_svm *svm)
2263 {
2264         struct vmcb *nested_vmcb;
2265         struct vmcb *hsave = svm->nested.hsave;
2266         struct vmcb *vmcb = svm->vmcb;
2267         struct page *page;
2268         u64 vmcb_gpa;
2269
2270         vmcb_gpa = svm->vmcb->save.rax;
2271
2272         nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2273         if (!nested_vmcb)
2274                 return false;
2275
2276         if (!nested_vmcb_checks(nested_vmcb)) {
2277                 nested_vmcb->control.exit_code    = SVM_EXIT_ERR;
2278                 nested_vmcb->control.exit_code_hi = 0;
2279                 nested_vmcb->control.exit_info_1  = 0;
2280                 nested_vmcb->control.exit_info_2  = 0;
2281
2282                 nested_svm_unmap(page);
2283
2284                 return false;
2285         }
2286
2287         trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
2288                                nested_vmcb->save.rip,
2289                                nested_vmcb->control.int_ctl,
2290                                nested_vmcb->control.event_inj,
2291                                nested_vmcb->control.nested_ctl);
2292
2293         trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
2294                                     nested_vmcb->control.intercept_cr >> 16,
2295                                     nested_vmcb->control.intercept_exceptions,
2296                                     nested_vmcb->control.intercept);
2297
2298         /* Clear internal status */
2299         kvm_clear_exception_queue(&svm->vcpu);
2300         kvm_clear_interrupt_queue(&svm->vcpu);
2301
2302         /*
2303          * Save the old vmcb, so we don't need to pick what we save, but can
2304          * restore everything when a VMEXIT occurs
2305          */
2306         hsave->save.es     = vmcb->save.es;
2307         hsave->save.cs     = vmcb->save.cs;
2308         hsave->save.ss     = vmcb->save.ss;
2309         hsave->save.ds     = vmcb->save.ds;
2310         hsave->save.gdtr   = vmcb->save.gdtr;
2311         hsave->save.idtr   = vmcb->save.idtr;
2312         hsave->save.efer   = svm->vcpu.arch.efer;
2313         hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
2314         hsave->save.cr4    = svm->vcpu.arch.cr4;
2315         hsave->save.rflags = vmcb->save.rflags;
2316         hsave->save.rip    = kvm_rip_read(&svm->vcpu);
2317         hsave->save.rsp    = vmcb->save.rsp;
2318         hsave->save.rax    = vmcb->save.rax;
2319         if (npt_enabled)
2320                 hsave->save.cr3    = vmcb->save.cr3;
2321         else
2322                 hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
2323
2324         copy_vmcb_control_area(hsave, vmcb);
2325
2326         if (svm->vmcb->save.rflags & X86_EFLAGS_IF)
2327                 svm->vcpu.arch.hflags |= HF_HIF_MASK;
2328         else
2329                 svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
2330
2331         if (nested_vmcb->control.nested_ctl) {
2332                 kvm_mmu_unload(&svm->vcpu);
2333                 svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
2334                 nested_svm_init_mmu_context(&svm->vcpu);
2335         }
2336
2337         /* Load the nested guest state */
2338         svm->vmcb->save.es = nested_vmcb->save.es;
2339         svm->vmcb->save.cs = nested_vmcb->save.cs;
2340         svm->vmcb->save.ss = nested_vmcb->save.ss;
2341         svm->vmcb->save.ds = nested_vmcb->save.ds;
2342         svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
2343         svm->vmcb->save.idtr = nested_vmcb->save.idtr;
2344         svm->vmcb->save.rflags = nested_vmcb->save.rflags;
2345         svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
2346         svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
2347         svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
2348         if (npt_enabled) {
2349                 svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
2350                 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
2351         } else
2352                 (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
2353
2354         /* Guest paging mode is active - reset mmu */
2355         kvm_mmu_reset_context(&svm->vcpu);
2356
2357         svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
2358         kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
2359         kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
2360         kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
2361
2362         /* In case we don't even reach vcpu_run, the fields are not updated */
2363         svm->vmcb->save.rax = nested_vmcb->save.rax;
2364         svm->vmcb->save.rsp = nested_vmcb->save.rsp;
2365         svm->vmcb->save.rip = nested_vmcb->save.rip;
2366         svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
2367         svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
2368         svm->vmcb->save.cpl = nested_vmcb->save.cpl;
2369
2370         svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
2371         svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
2372
2373         /* cache intercepts */
2374         svm->nested.intercept_cr         = nested_vmcb->control.intercept_cr;
2375         svm->nested.intercept_dr         = nested_vmcb->control.intercept_dr;
2376         svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
2377         svm->nested.intercept            = nested_vmcb->control.intercept;
2378
2379         svm_flush_tlb(&svm->vcpu);
2380         svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
2381         if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
2382                 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
2383         else
2384                 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
2385
2386         if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2387                 /* We only want the cr8 intercept bits of the guest */
2388                 clr_cr_intercept(svm, INTERCEPT_CR8_READ);
2389                 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2390         }
2391
2392         /* We don't want to see VMMCALLs from a nested guest */
2393         clr_intercept(svm, INTERCEPT_VMMCALL);
2394
2395         svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
2396         svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
2397         svm->vmcb->control.int_state = nested_vmcb->control.int_state;
2398         svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
2399         svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
2400         svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
2401
2402         nested_svm_unmap(page);
2403
2404         /* Enter Guest-Mode */
2405         enter_guest_mode(&svm->vcpu);
2406
2407         /*
2408          * Merge guest and host intercepts - must be called  with vcpu in
2409          * guest-mode to take affect here
2410          */
2411         recalc_intercepts(svm);
2412
2413         svm->nested.vmcb = vmcb_gpa;
2414
2415         enable_gif(svm);
2416
2417         mark_all_dirty(svm->vmcb);
2418
2419         return true;
2420 }
2421
2422 static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
2423 {
2424         to_vmcb->save.fs = from_vmcb->save.fs;
2425         to_vmcb->save.gs = from_vmcb->save.gs;
2426         to_vmcb->save.tr = from_vmcb->save.tr;
2427         to_vmcb->save.ldtr = from_vmcb->save.ldtr;
2428         to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
2429         to_vmcb->save.star = from_vmcb->save.star;
2430         to_vmcb->save.lstar = from_vmcb->save.lstar;
2431         to_vmcb->save.cstar = from_vmcb->save.cstar;
2432         to_vmcb->save.sfmask = from_vmcb->save.sfmask;
2433         to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
2434         to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
2435         to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
2436 }
2437
2438 static int vmload_interception(struct vcpu_svm *svm)
2439 {
2440         struct vmcb *nested_vmcb;
2441         struct page *page;
2442
2443         if (nested_svm_check_permissions(svm))
2444                 return 1;
2445
2446         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2447         skip_emulated_instruction(&svm->vcpu);
2448
2449         nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2450         if (!nested_vmcb)
2451                 return 1;
2452
2453         nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
2454         nested_svm_unmap(page);
2455
2456         return 1;
2457 }
2458
2459 static int vmsave_interception(struct vcpu_svm *svm)
2460 {
2461         struct vmcb *nested_vmcb;
2462         struct page *page;
2463
2464         if (nested_svm_check_permissions(svm))
2465                 return 1;
2466
2467         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2468         skip_emulated_instruction(&svm->vcpu);
2469
2470         nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2471         if (!nested_vmcb)
2472                 return 1;
2473
2474         nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
2475         nested_svm_unmap(page);
2476
2477         return 1;
2478 }
2479
2480 static int vmrun_interception(struct vcpu_svm *svm)
2481 {
2482         if (nested_svm_check_permissions(svm))
2483                 return 1;
2484
2485         /* Save rip after vmrun instruction */
2486         kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
2487
2488         if (!nested_svm_vmrun(svm))
2489                 return 1;
2490
2491         if (!nested_svm_vmrun_msrpm(svm))
2492                 goto failed;
2493
2494         return 1;
2495
2496 failed:
2497
2498         svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
2499         svm->vmcb->control.exit_code_hi = 0;
2500         svm->vmcb->control.exit_info_1  = 0;
2501         svm->vmcb->control.exit_info_2  = 0;
2502
2503         nested_svm_vmexit(svm);
2504
2505         return 1;
2506 }
2507
2508 static int stgi_interception(struct vcpu_svm *svm)
2509 {
2510         if (nested_svm_check_permissions(svm))
2511                 return 1;
2512
2513         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2514         skip_emulated_instruction(&svm->vcpu);
2515         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2516
2517         enable_gif(svm);
2518
2519         return 1;
2520 }
2521
2522 static int clgi_interception(struct vcpu_svm *svm)
2523 {
2524         if (nested_svm_check_permissions(svm))
2525                 return 1;
2526
2527         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2528         skip_emulated_instruction(&svm->vcpu);
2529
2530         disable_gif(svm);
2531
2532         /* After a CLGI no interrupts should come */
2533         svm_clear_vintr(svm);
2534         svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2535
2536         mark_dirty(svm->vmcb, VMCB_INTR);
2537
2538         return 1;
2539 }
2540
2541 static int invlpga_interception(struct vcpu_svm *svm)
2542 {
2543         struct kvm_vcpu *vcpu = &svm->vcpu;
2544
2545         trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
2546                           vcpu->arch.regs[VCPU_REGS_RAX]);
2547
2548         /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2549         kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
2550
2551         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2552         skip_emulated_instruction(&svm->vcpu);
2553         return 1;
2554 }
2555
2556 static int skinit_interception(struct vcpu_svm *svm)
2557 {
2558         trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
2559
2560         kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2561         return 1;
2562 }
2563
2564 static int xsetbv_interception(struct vcpu_svm *svm)
2565 {
2566         u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
2567         u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
2568
2569         if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
2570                 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2571                 skip_emulated_instruction(&svm->vcpu);
2572         }
2573
2574         return 1;
2575 }
2576
2577 static int invalid_op_interception(struct vcpu_svm *svm)
2578 {
2579         kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2580         return 1;
2581 }
2582
2583 static int task_switch_interception(struct vcpu_svm *svm)
2584 {
2585         u16 tss_selector;
2586         int reason;
2587         int int_type = svm->vmcb->control.exit_int_info &
2588                 SVM_EXITINTINFO_TYPE_MASK;
2589         int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2590         uint32_t type =
2591                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2592         uint32_t idt_v =
2593                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2594         bool has_error_code = false;
2595         u32 error_code = 0;
2596
2597         tss_selector = (u16)svm->vmcb->control.exit_info_1;
2598
2599         if (svm->vmcb->control.exit_info_2 &
2600             (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2601                 reason = TASK_SWITCH_IRET;
2602         else if (svm->vmcb->control.exit_info_2 &
2603                  (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2604                 reason = TASK_SWITCH_JMP;
2605         else if (idt_v)
2606                 reason = TASK_SWITCH_GATE;
2607         else
2608                 reason = TASK_SWITCH_CALL;
2609
2610         if (reason == TASK_SWITCH_GATE) {
2611                 switch (type) {
2612                 case SVM_EXITINTINFO_TYPE_NMI:
2613                         svm->vcpu.arch.nmi_injected = false;
2614                         break;
2615                 case SVM_EXITINTINFO_TYPE_EXEPT:
2616                         if (svm->vmcb->control.exit_info_2 &
2617                             (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2618                                 has_error_code = true;
2619                                 error_code =
2620                                         (u32)svm->vmcb->control.exit_info_2;
2621                         }
2622                         kvm_clear_exception_queue(&svm->vcpu);
2623                         break;
2624                 case SVM_EXITINTINFO_TYPE_INTR:
2625                         kvm_clear_interrupt_queue(&svm->vcpu);
2626                         break;
2627                 default:
2628                         break;
2629                 }
2630         }
2631
2632         if (reason != TASK_SWITCH_GATE ||
2633             int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2634             (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2635              (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
2636                 skip_emulated_instruction(&svm->vcpu);
2637
2638         if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
2639                                 has_error_code, error_code) == EMULATE_FAIL) {
2640                 svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2641                 svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2642                 svm->vcpu.run->internal.ndata = 0;
2643                 return 0;
2644         }
2645         return 1;
2646 }
2647
2648 static int cpuid_interception(struct vcpu_svm *svm)
2649 {
2650         svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2651         kvm_emulate_cpuid(&svm->vcpu);
2652         return 1;
2653 }
2654
2655 static int iret_interception(struct vcpu_svm *svm)
2656 {
2657         ++svm->vcpu.stat.nmi_window_exits;
2658         clr_intercept(svm, INTERCEPT_IRET);
2659         svm->vcpu.arch.hflags |= HF_IRET_MASK;
2660         svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
2661         return 1;
2662 }
2663
2664 static int invlpg_interception(struct vcpu_svm *svm)
2665 {
2666         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2667                 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2668
2669         kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
2670         skip_emulated_instruction(&svm->vcpu);
2671         return 1;
2672 }
2673
2674 static int emulate_on_interception(struct vcpu_svm *svm)
2675 {
2676         return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2677 }
2678
2679 #define CR_VALID (1ULL << 63)
2680
2681 static int cr_interception(struct vcpu_svm *svm)
2682 {
2683         int reg, cr;
2684         unsigned long val;
2685         int err;
2686
2687         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2688                 return emulate_on_interception(svm);
2689
2690         if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2691                 return emulate_on_interception(svm);
2692
2693         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2694         cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2695
2696         err = 0;
2697         if (cr >= 16) { /* mov to cr */
2698                 cr -= 16;
2699                 val = kvm_register_read(&svm->vcpu, reg);
2700                 switch (cr) {
2701                 case 0:
2702                         err = kvm_set_cr0(&svm->vcpu, val);
2703                         break;
2704                 case 3:
2705                         err = kvm_set_cr3(&svm->vcpu, val);
2706                         break;
2707                 case 4:
2708                         err = kvm_set_cr4(&svm->vcpu, val);
2709                         break;
2710                 case 8:
2711                         err = kvm_set_cr8(&svm->vcpu, val);
2712                         break;
2713                 default:
2714                         WARN(1, "unhandled write to CR%d", cr);
2715                         kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2716                         return 1;
2717                 }
2718         } else { /* mov from cr */
2719                 switch (cr) {
2720                 case 0:
2721                         val = kvm_read_cr0(&svm->vcpu);
2722                         break;
2723                 case 2:
2724                         val = svm->vcpu.arch.cr2;
2725                         break;
2726                 case 3:
2727                         val = kvm_read_cr3(&svm->vcpu);
2728                         break;
2729                 case 4:
2730                         val = kvm_read_cr4(&svm->vcpu);
2731                         break;
2732                 case 8:
2733                         val = kvm_get_cr8(&svm->vcpu);
2734                         break;
2735                 default:
2736                         WARN(1, "unhandled read from CR%d", cr);
2737                         kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2738                         return 1;
2739                 }
2740                 kvm_register_write(&svm->vcpu, reg, val);
2741         }
2742         kvm_complete_insn_gp(&svm->vcpu, err);
2743
2744         return 1;
2745 }
2746
2747 static int cr0_write_interception(struct vcpu_svm *svm)
2748 {
2749         struct kvm_vcpu *vcpu = &svm->vcpu;
2750         int r;
2751
2752         r = cr_interception(svm);
2753
2754         if (svm->nested.vmexit_rip) {
2755                 kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
2756                 kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp);
2757                 kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax);
2758                 svm->nested.vmexit_rip = 0;
2759         }
2760
2761         return r;
2762 }
2763
2764 static int dr_interception(struct vcpu_svm *svm)
2765 {
2766         int reg, dr;
2767         unsigned long val;
2768         int err;
2769
2770         if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2771                 return emulate_on_interception(svm);
2772
2773         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2774         dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2775
2776         if (dr >= 16) { /* mov to DRn */
2777                 val = kvm_register_read(&svm->vcpu, reg);
2778                 kvm_set_dr(&svm->vcpu, dr - 16, val);
2779         } else {
2780                 err = kvm_get_dr(&svm->vcpu, dr, &val);
2781                 if (!err)
2782                         kvm_register_write(&svm->vcpu, reg, val);
2783         }
2784
2785         skip_emulated_instruction(&svm->vcpu);
2786
2787         return 1;
2788 }
2789
2790 static int cr8_write_interception(struct vcpu_svm *svm)
2791 {
2792         struct kvm_run *kvm_run = svm->vcpu.run;
2793         int r;
2794
2795         u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
2796         /* instruction emulation calls kvm_set_cr8() */
2797         r = cr_interception(svm);
2798         if (irqchip_in_kernel(svm->vcpu.kvm)) {
2799                 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2800                 return r;
2801         }
2802         if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
2803                 return r;
2804         kvm_run->exit_reason = KVM_EXIT_SET_TPR;
2805         return 0;
2806 }
2807
2808 static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2809 {
2810         struct vcpu_svm *svm = to_svm(vcpu);
2811
2812         switch (ecx) {
2813         case MSR_IA32_TSC: {
2814                 struct vmcb *vmcb = get_host_vmcb(svm);
2815
2816                 *data = vmcb->control.tsc_offset + native_read_tsc();
2817                 break;
2818         }
2819         case MSR_STAR:
2820                 *data = svm->vmcb->save.star;
2821                 break;
2822 #ifdef CONFIG_X86_64
2823         case MSR_LSTAR:
2824                 *data = svm->vmcb->save.lstar;
2825                 break;
2826         case MSR_CSTAR:
2827                 *data = svm->vmcb->save.cstar;
2828                 break;
2829         case MSR_KERNEL_GS_BASE:
2830                 *data = svm->vmcb->save.kernel_gs_base;
2831                 break;
2832         case MSR_SYSCALL_MASK:
2833                 *data = svm->vmcb->save.sfmask;
2834                 break;
2835 #endif
2836         case MSR_IA32_SYSENTER_CS:
2837                 *data = svm->vmcb->save.sysenter_cs;
2838                 break;
2839         case MSR_IA32_SYSENTER_EIP:
2840                 *data = svm->sysenter_eip;
2841                 break;
2842         case MSR_IA32_SYSENTER_ESP:
2843                 *data = svm->sysenter_esp;
2844                 break;
2845         /*
2846          * Nobody will change the following 5 values in the VMCB so we can
2847          * safely return them on rdmsr. They will always be 0 until LBRV is
2848          * implemented.
2849          */
2850         case MSR_IA32_DEBUGCTLMSR:
2851                 *data = svm->vmcb->save.dbgctl;
2852                 break;
2853         case MSR_IA32_LASTBRANCHFROMIP:
2854                 *data = svm->vmcb->save.br_from;
2855                 break;
2856         case MSR_IA32_LASTBRANCHTOIP:
2857                 *data = svm->vmcb->save.br_to;
2858                 break;
2859         case MSR_IA32_LASTINTFROMIP:
2860                 *data = svm->vmcb->save.last_excp_from;
2861                 break;
2862         case MSR_IA32_LASTINTTOIP:
2863                 *data = svm->vmcb->save.last_excp_to;
2864                 break;
2865         case MSR_VM_HSAVE_PA:
2866                 *data = svm->nested.hsave_msr;
2867                 break;
2868         case MSR_VM_CR:
2869                 *data = svm->nested.vm_cr_msr;
2870                 break;
2871         case MSR_IA32_UCODE_REV:
2872                 *data = 0x01000065;
2873                 break;
2874         default:
2875                 return kvm_get_msr_common(vcpu, ecx, data);
2876         }
2877         return 0;
2878 }
2879
2880 static int rdmsr_interception(struct vcpu_svm *svm)
2881 {
2882         u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2883         u64 data;
2884
2885         if (svm_get_msr(&svm->vcpu, ecx, &data)) {
2886                 trace_kvm_msr_read_ex(ecx);
2887                 kvm_inject_gp(&svm->vcpu, 0);
2888         } else {
2889                 trace_kvm_msr_read(ecx, data);
2890
2891                 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
2892                 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
2893                 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2894                 skip_emulated_instruction(&svm->vcpu);
2895         }
2896         return 1;
2897 }
2898
2899 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2900 {
2901         struct vcpu_svm *svm = to_svm(vcpu);
2902         int svm_dis, chg_mask;
2903
2904         if (data & ~SVM_VM_CR_VALID_MASK)
2905                 return 1;
2906
2907         chg_mask = SVM_VM_CR_VALID_MASK;
2908
2909         if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2910                 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2911
2912         svm->nested.vm_cr_msr &= ~chg_mask;
2913         svm->nested.vm_cr_msr |= (data & chg_mask);
2914
2915         svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2916
2917         /* check for svm_disable while efer.svme is set */
2918         if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2919                 return 1;
2920
2921         return 0;
2922 }
2923
2924 static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2925 {
2926         struct vcpu_svm *svm = to_svm(vcpu);
2927
2928         switch (ecx) {
2929         case MSR_IA32_TSC:
2930                 kvm_write_tsc(vcpu, data);
2931                 break;
2932         case MSR_STAR:
2933                 svm->vmcb->save.star = data;
2934                 break;
2935 #ifdef CONFIG_X86_64
2936         case MSR_LSTAR:
2937                 svm->vmcb->save.lstar = data;
2938                 break;
2939         case MSR_CSTAR:
2940                 svm->vmcb->save.cstar = data;
2941                 break;
2942         case MSR_KERNEL_GS_BASE:
2943                 svm->vmcb->save.kernel_gs_base = data;
2944                 break;
2945         case MSR_SYSCALL_MASK:
2946                 svm->vmcb->save.sfmask = data;
2947                 break;
2948 #endif
2949         case MSR_IA32_SYSENTER_CS:
2950                 svm->vmcb->save.sysenter_cs = data;
2951                 break;
2952         case MSR_IA32_SYSENTER_EIP:
2953                 svm->sysenter_eip = data;
2954                 svm->vmcb->save.sysenter_eip = data;
2955                 break;
2956         case MSR_IA32_SYSENTER_ESP:
2957                 svm->sysenter_esp = data;
2958                 svm->vmcb->save.sysenter_esp = data;
2959                 break;
2960         case MSR_IA32_DEBUGCTLMSR:
2961                 if (!boot_cpu_has(X86_FEATURE_LBRV)) {
2962                         pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
2963                                         __func__, data);
2964                         break;
2965                 }
2966                 if (data & DEBUGCTL_RESERVED_BITS)
2967                         return 1;
2968
2969                 svm->vmcb->save.dbgctl = data;
2970                 mark_dirty(svm->vmcb, VMCB_LBR);
2971                 if (data & (1ULL<<0))
2972                         svm_enable_lbrv(svm);
2973                 else
2974                         svm_disable_lbrv(svm);
2975                 break;
2976         case MSR_VM_HSAVE_PA:
2977                 svm->nested.hsave_msr = data;
2978                 break;
2979         case MSR_VM_CR:
2980                 return svm_set_vm_cr(vcpu, data);
2981         case MSR_VM_IGNNE:
2982                 pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
2983                 break;
2984         default:
2985                 return kvm_set_msr_common(vcpu, ecx, data);
2986         }
2987         return 0;
2988 }
2989
2990 static int wrmsr_interception(struct vcpu_svm *svm)
2991 {
2992         u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2993         u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
2994                 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2995
2996
2997         svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2998         if (svm_set_msr(&svm->vcpu, ecx, data)) {
2999                 trace_kvm_msr_write_ex(ecx, data);
3000                 kvm_inject_gp(&svm->vcpu, 0);
3001         } else {
3002                 trace_kvm_msr_write(ecx, data);
3003                 skip_emulated_instruction(&svm->vcpu);
3004         }
3005         return 1;
3006 }
3007
3008 static int msr_interception(struct vcpu_svm *svm)
3009 {
3010         if (svm->vmcb->control.exit_info_1)
3011                 return wrmsr_interception(svm);
3012         else
3013                 return rdmsr_interception(svm);
3014 }
3015
3016 static int interrupt_window_interception(struct vcpu_svm *svm)
3017 {
3018         struct kvm_run *kvm_run = svm->vcpu.run;
3019
3020         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3021         svm_clear_vintr(svm);
3022         svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3023         mark_dirty(svm->vmcb, VMCB_INTR);
3024         /*
3025          * If the user space waits to inject interrupts, exit as soon as
3026          * possible
3027          */
3028         if (!irqchip_in_kernel(svm->vcpu.kvm) &&
3029             kvm_run->request_interrupt_window &&
3030             !kvm_cpu_has_interrupt(&svm->vcpu)) {
3031                 ++svm->vcpu.stat.irq_window_exits;
3032                 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3033                 return 0;
3034         }
3035
3036         return 1;
3037 }
3038
3039 static int pause_interception(struct vcpu_svm *svm)
3040 {
3041         kvm_vcpu_on_spin(&(svm->vcpu));
3042         return 1;
3043 }
3044
3045 static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
3046         [SVM_EXIT_READ_CR0]                     = cr_interception,
3047         [SVM_EXIT_READ_CR3]                     = cr_interception,
3048         [SVM_EXIT_READ_CR4]                     = cr_interception,
3049         [SVM_EXIT_READ_CR8]                     = cr_interception,
3050         [SVM_EXIT_CR0_SEL_WRITE]                = emulate_on_interception,
3051         [SVM_EXIT_WRITE_CR0]                    = cr0_write_interception,
3052         [SVM_EXIT_WRITE_CR3]                    = cr_interception,
3053         [SVM_EXIT_WRITE_CR4]                    = cr_interception,
3054         [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
3055         [SVM_EXIT_READ_DR0]                     = dr_interception,
3056         [SVM_EXIT_READ_DR1]                     = dr_interception,
3057         [SVM_EXIT_READ_DR2]                     = dr_interception,
3058         [SVM_EXIT_READ_DR3]                     = dr_interception,
3059         [SVM_EXIT_READ_DR4]                     = dr_interception,
3060         [SVM_EXIT_READ_DR5]                     = dr_interception,
3061         [SVM_EXIT_READ_DR6]                     = dr_interception,
3062         [SVM_EXIT_READ_DR7]                     = dr_interception,
3063         [SVM_EXIT_WRITE_DR0]                    = dr_interception,
3064         [SVM_EXIT_WRITE_DR1]                    = dr_interception,
3065         [SVM_EXIT_WRITE_DR2]                    = dr_interception,
3066         [SVM_EXIT_WRITE_DR3]                    = dr_interception,
3067         [SVM_EXIT_WRITE_DR4]                    = dr_interception,
3068         [SVM_EXIT_WRITE_DR5]                    = dr_interception,
3069         [SVM_EXIT_WRITE_DR6]                    = dr_interception,
3070         [SVM_EXIT_WRITE_DR7]                    = dr_interception,
3071         [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
3072         [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
3073         [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
3074         [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
3075         [SVM_EXIT_EXCP_BASE + NM_VECTOR]        = nm_interception,
3076         [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
3077         [SVM_EXIT_INTR]                         = intr_interception,
3078         [SVM_EXIT_NMI]                          = nmi_interception,
3079         [SVM_EXIT_SMI]                          = nop_on_interception,
3080         [SVM_EXIT_INIT]                         = nop_on_interception,
3081         [SVM_EXIT_VINTR]                        = interrupt_window_interception,
3082         [SVM_EXIT_CPUID]                        = cpuid_interception,
3083         [SVM_EXIT_IRET]                         = iret_interception,
3084         [SVM_EXIT_INVD]                         = emulate_on_interception,
3085         [SVM_EXIT_PAUSE]                        = pause_interception,
3086         [SVM_EXIT_HLT]                          = halt_interception,
3087         [SVM_EXIT_INVLPG]                       = invlpg_interception,
3088         [SVM_EXIT_INVLPGA]                      = invlpga_interception,
3089         [SVM_EXIT_IOIO]                         = io_interception,
3090         [SVM_EXIT_MSR]                          = msr_interception,
3091         [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
3092         [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
3093         [SVM_EXIT_VMRUN]                        = vmrun_interception,
3094         [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
3095         [SVM_EXIT_VMLOAD]                       = vmload_interception,
3096         [SVM_EXIT_VMSAVE]                       = vmsave_interception,
3097         [SVM_EXIT_STGI]                         = stgi_interception,
3098         [SVM_EXIT_CLGI]                         = clgi_interception,
3099         [SVM_EXIT_SKINIT]                       = skinit_interception,
3100         [SVM_EXIT_WBINVD]                       = emulate_on_interception,
3101         [SVM_EXIT_MONITOR]                      = invalid_op_interception,
3102         [SVM_EXIT_MWAIT]                        = invalid_op_interception,
3103         [SVM_EXIT_XSETBV]                       = xsetbv_interception,
3104         [SVM_EXIT_NPF]                          = pf_interception,
3105 };
3106
3107 void dump_vmcb(struct kvm_vcpu *vcpu)
3108 {
3109         struct vcpu_svm *svm = to_svm(vcpu);
3110         struct vmcb_control_area *control = &svm->vmcb->control;
3111         struct vmcb_save_area *save = &svm->vmcb->save;
3112
3113         pr_err("VMCB Control Area:\n");
3114         pr_err("cr_read:            %04x\n", control->intercept_cr & 0xffff);
3115         pr_err("cr_write:           %04x\n", control->intercept_cr >> 16);
3116         pr_err("dr_read:            %04x\n", control->intercept_dr & 0xffff);
3117         pr_err("dr_write:           %04x\n", control->intercept_dr >> 16);
3118         pr_err("exceptions:         %08x\n", control->intercept_exceptions);
3119         pr_err("intercepts:         %016llx\n", control->intercept);
3120         pr_err("pause filter count: %d\n", control->pause_filter_count);
3121         pr_err("iopm_base_pa:       %016llx\n", control->iopm_base_pa);
3122         pr_err("msrpm_base_pa:      %016llx\n", control->msrpm_base_pa);
3123         pr_err("tsc_offset:         %016llx\n", control->tsc_offset);
3124         pr_err("asid:               %d\n", control->asid);
3125         pr_err("tlb_ctl:            %d\n", control->tlb_ctl);
3126         pr_err("int_ctl:            %08x\n", control->int_ctl);
3127         pr_err("int_vector:         %08x\n", control->int_vector);
3128         pr_err("int_state:          %08x\n", control->int_state);
3129         pr_err("exit_code:          %08x\n", control->exit_code);
3130         pr_err("exit_info1:         %016llx\n", control->exit_info_1);
3131         pr_err("exit_info2:         %016llx\n", control->exit_info_2);
3132         pr_err("exit_int_info:      %08x\n", control->exit_int_info);
3133         pr_err("exit_int_info_err:  %08x\n", control->exit_int_info_err);
3134         pr_err("nested_ctl:         %lld\n", control->nested_ctl);
3135         pr_err("nested_cr3:         %016llx\n", control->nested_cr3);
3136         pr_err("event_inj:          %08x\n", control->event_inj);
3137         pr_err("event_inj_err:      %08x\n", control->event_inj_err);
3138         pr_err("lbr_ctl:            %lld\n", control->lbr_ctl);
3139         pr_err("next_rip:           %016llx\n", control->next_rip);
3140         pr_err("VMCB State Save Area:\n");
3141         pr_err("es:   s: %04x a: %04x l: %08x b: %016llx\n",
3142                 save->es.selector, save->es.attrib,
3143                 save->es.limit, save->es.base);
3144         pr_err("cs:   s: %04x a: %04x l: %08x b: %016llx\n",
3145                 save->cs.selector, save->cs.attrib,
3146                 save->cs.limit, save->cs.base);
3147         pr_err("ss:   s: %04x a: %04x l: %08x b: %016llx\n",
3148                 save->ss.selector, save->ss.attrib,
3149                 save->ss.limit, save->ss.base);
3150         pr_err("ds:   s: %04x a: %04x l: %08x b: %016llx\n",
3151                 save->ds.selector, save->ds.attrib,
3152                 save->ds.limit, save->ds.base);
3153         pr_err("fs:   s: %04x a: %04x l: %08x b: %016llx\n",
3154                 save->fs.selector, save->fs.attrib,
3155                 save->fs.limit, save->fs.base);
3156         pr_err("gs:   s: %04x a: %04x l: %08x b: %016llx\n",
3157                 save->gs.selector, save->gs.attrib,
3158                 save->gs.limit, save->gs.base);
3159         pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n",
3160                 save->gdtr.selector, save->gdtr.attrib,
3161                 save->gdtr.limit, save->gdtr.base);
3162         pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n",
3163                 save->ldtr.selector, save->ldtr.attrib,
3164                 save->ldtr.limit, save->ldtr.base);
3165         pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n",
3166                 save->idtr.selector, save->idtr.attrib,
3167                 save->idtr.limit, save->idtr.base);
3168         pr_err("tr:   s: %04x a: %04x l: %08x b: %016llx\n",
3169                 save->tr.selector, save->tr.attrib,
3170                 save->tr.limit, save->tr.base);
3171         pr_err("cpl:            %d                efer:         %016llx\n",
3172                 save->cpl, save->efer);
3173         pr_err("cr0:            %016llx cr2:          %016llx\n",
3174                 save->cr0, save->cr2);
3175         pr_err("cr3:            %016llx cr4:          %016llx\n",
3176                 save->cr3, save->cr4);
3177         pr_err("dr6:            %016llx dr7:          %016llx\n",
3178                 save->dr6, save->dr7);
3179         pr_err("rip:            %016llx rflags:       %016llx\n",
3180                 save->rip, save->rflags);
3181         pr_err("rsp:            %016llx rax:          %016llx\n",
3182                 save->rsp, save->rax);
3183         pr_err("star:           %016llx lstar:        %016llx\n",
3184                 save->star, save->lstar);
3185         pr_err("cstar:          %016llx sfmask:       %016llx\n",
3186                 save->cstar, save->sfmask);
3187         pr_err("kernel_gs_base: %016llx sysenter_cs:  %016llx\n",
3188                 save->kernel_gs_base, save->sysenter_cs);
3189         pr_err("sysenter_esp:   %016llx sysenter_eip: %016llx\n",
3190                 save->sysenter_esp, save->sysenter_eip);
3191         pr_err("gpat:           %016llx dbgctl:       %016llx\n",
3192                 save->g_pat, save->dbgctl);
3193         pr_err("br_from:        %016llx br_to:        %016llx\n",
3194                 save->br_from, save->br_to);
3195         pr_err("excp_from:      %016llx excp_to:      %016llx\n",
3196                 save->last_excp_from, save->last_excp_to);
3197
3198 }
3199
3200 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3201 {
3202         struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3203
3204         *info1 = control->exit_info_1;
3205         *info2 = control->exit_info_2;
3206 }
3207
3208 static int handle_exit(struct kvm_vcpu *vcpu)
3209 {
3210         struct vcpu_svm *svm = to_svm(vcpu);
3211         struct kvm_run *kvm_run = vcpu->run;
3212         u32 exit_code = svm->vmcb->control.exit_code;
3213
3214         trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
3215
3216         if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
3217                 vcpu->arch.cr0 = svm->vmcb->save.cr0;
3218         if (npt_enabled)
3219                 vcpu->arch.cr3 = svm->vmcb->save.cr3;
3220
3221         if (unlikely(svm->nested.exit_required)) {
3222                 nested_svm_vmexit(svm);
3223                 svm->nested.exit_required = false;
3224
3225                 return 1;
3226         }
3227
3228         if (is_guest_mode(vcpu)) {
3229                 int vmexit;
3230
3231                 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
3232                                         svm->vmcb->control.exit_info_1,
3233                                         svm->vmcb->control.exit_info_2,
3234                                         svm->vmcb->control.exit_int_info,
3235                                         svm->vmcb->control.exit_int_info_err);
3236
3237                 vmexit = nested_svm_exit_special(svm);
3238
3239                 if (vmexit == NESTED_EXIT_CONTINUE)
3240                         vmexit = nested_svm_exit_handled(svm);
3241
3242                 if (vmexit == NESTED_EXIT_DONE)
3243                         return 1;
3244         }
3245
3246         svm_complete_interrupts(svm);
3247
3248         if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3249                 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3250                 kvm_run->fail_entry.hardware_entry_failure_reason
3251                         = svm->vmcb->control.exit_code;
3252                 pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
3253                 dump_vmcb(vcpu);
3254                 return 0;
3255         }
3256
3257         if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
3258             exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
3259             exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3260             exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
3261                 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
3262                        "exit_code 0x%x\n",
3263                        __func__, svm->vmcb->control.exit_int_info,
3264                        exit_code);
3265
3266         if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
3267             || !svm_exit_handlers[exit_code]) {
3268                 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
3269                 kvm_run->hw.hardware_exit_reason = exit_code;
3270                 return 0;
3271         }
3272
3273         return svm_exit_handlers[exit_code](svm);
3274 }
3275
3276 static void reload_tss(struct kvm_vcpu *vcpu)
3277 {
3278         int cpu = raw_smp_processor_id();
3279
3280         struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
3281         sd->tss_desc->type = 9; /* available 32/64-bit TSS */
3282         load_TR_desc();
3283 }
3284
3285 static void pre_svm_run(struct vcpu_svm *svm)
3286 {
3287         int cpu = raw_smp_processor_id();
3288
3289         struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
3290
3291         /* FIXME: handle wraparound of asid_generation */
3292         if (svm->asid_generation != sd->asid_generation)
3293                 new_asid(svm, sd);
3294 }
3295
3296 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3297 {
3298         struct vcpu_svm *svm = to_svm(vcpu);
3299
3300         svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3301         vcpu->arch.hflags |= HF_NMI_MASK;
3302         set_intercept(svm, INTERCEPT_IRET);
3303         ++vcpu->stat.nmi_injections;
3304 }
3305
3306 static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
3307 {
3308         struct vmcb_control_area *control;
3309
3310         control = &svm->vmcb->control;
3311         control->int_vector = irq;
3312         control->int_ctl &= ~V_INTR_PRIO_MASK;
3313         control->int_ctl |= V_IRQ_MASK |
3314                 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
3315         mark_dirty(svm->vmcb, VMCB_INTR);
3316 }
3317
3318 static void svm_set_irq(struct kvm_vcpu *vcpu)
3319 {
3320         struct vcpu_svm *svm = to_svm(vcpu);
3321
3322         BUG_ON(!(gif_set(svm)));
3323
3324         trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
3325         ++vcpu->stat.irq_injections;
3326
3327         svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3328                 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
3329 }
3330
3331 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3332 {
3333         struct vcpu_svm *svm = to_svm(vcpu);
3334
3335         if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3336                 return;
3337
3338         if (irr == -1)
3339                 return;
3340
3341         if (tpr >= irr)
3342                 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
3343 }
3344
3345 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
3346 {
3347         struct vcpu_svm *svm = to_svm(vcpu);
3348         struct vmcb *vmcb = svm->vmcb;
3349         int ret;
3350         ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
3351               !(svm->vcpu.arch.hflags & HF_NMI_MASK);
3352         ret = ret && gif_set(svm) && nested_svm_nmi(svm);
3353
3354         return ret;
3355 }
3356
3357 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3358 {
3359         struct vcpu_svm *svm = to_svm(vcpu);
3360
3361         return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
3362 }
3363
3364 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3365 {
3366         struct vcpu_svm *svm = to_svm(vcpu);
3367
3368         if (masked) {
3369                 svm->vcpu.arch.hflags |= HF_NMI_MASK;
3370                 set_intercept(svm, INTERCEPT_IRET);
3371         } else {
3372                 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
3373                 clr_intercept(svm, INTERCEPT_IRET);
3374         }
3375 }
3376
3377 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
3378 {
3379         struct vcpu_svm *svm = to_svm(vcpu);
3380         struct vmcb *vmcb = svm->vmcb;
3381         int ret;
3382
3383         if (!gif_set(svm) ||
3384              (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
3385                 return 0;
3386
3387         ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
3388
3389         if (is_guest_mode(vcpu))
3390                 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
3391
3392         return ret;
3393 }
3394
/*
 * Request an interrupt-window exit by arming the VINTR intercept and
 * programming a virtual interrupt with vector 0.
 */
static void enable_irq_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * With GIF=0 the CPU cannot tell us when GIF becomes 1, because that
	 * is a separate STGI/VMRUN intercept.  This function is called again
	 * on the next such intercept, and the vintr intercept is set up then.
	 */
	if (!gif_set(svm) || !nested_svm_intr(svm))
		return;

	svm_set_vintr(svm);
	svm_inject_irq(svm, 0x0);
}
3410
3411 static void enable_nmi_window(struct kvm_vcpu *vcpu)
3412 {
3413         struct vcpu_svm *svm = to_svm(vcpu);
3414
3415         if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
3416             == HF_NMI_MASK)
3417                 return; /* IRET will cause a vm exit */
3418
3419         /*
3420          * Something prevents NMI from been injected. Single step over possible
3421          * problem (IRET or exception injection or interrupt shadow)
3422          */
3423         svm->nmi_singlestep = true;
3424         svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3425         update_db_intercept(vcpu);
3426 }
3427
/* Nothing to do here: @kvm and @addr are ignored and 0 is returned. */
static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
	/* intentionally a no-op */
	return 0;
}
3432
3433 static void svm_flush_tlb(struct kvm_vcpu *vcpu)
3434 {
3435         struct vcpu_svm *svm = to_svm(vcpu);
3436
3437         if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3438                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3439         else
3440                 svm->asid_generation--;
3441 }
3442
/* Intentionally empty: no work is done here for SVM. */
static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
{
	/* nothing to do */
}
3446
3447 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3448 {
3449         struct vcpu_svm *svm = to_svm(vcpu);
3450
3451         if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3452                 return;
3453
3454         if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
3455                 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3456                 kvm_set_cr8(vcpu, cr8);
3457         }
3458 }
3459
3460 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3461 {
3462         struct vcpu_svm *svm = to_svm(vcpu);
3463         u64 cr8;
3464
3465         if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3466                 return;
3467
3468         cr8 = kvm_get_cr8(vcpu);
3469         svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3470         svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3471 }
3472
3473 static void svm_complete_interrupts(struct vcpu_svm *svm)
3474 {
3475         u8 vector;
3476         int type;
3477         u32 exitintinfo = svm->vmcb->control.exit_int_info;
3478         unsigned int3_injected = svm->int3_injected;
3479
3480         svm->int3_injected = 0;
3481
3482         /*
3483          * If we've made progress since setting HF_IRET_MASK, we've
3484          * executed an IRET and can allow NMI injection.
3485          */
3486         if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
3487             && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
3488                 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3489                 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3490         }
3491
3492         svm->vcpu.arch.nmi_injected = false;
3493         kvm_clear_exception_queue(&svm->vcpu);
3494         kvm_clear_interrupt_queue(&svm->vcpu);
3495
3496         if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3497                 return;
3498
3499         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3500
3501         vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3502         type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3503
3504         switch (type) {
3505         case SVM_EXITINTINFO_TYPE_NMI:
3506                 svm->vcpu.arch.nmi_injected = true;
3507                 break;
3508         case SVM_EXITINTINFO_TYPE_EXEPT:
3509                 /*
3510                  * In case of software exceptions, do not reinject the vector,
3511                  * but re-execute the instruction instead. Rewind RIP first
3512                  * if we emulated INT3 before.
3513                  */
3514                 if (kvm_exception_is_soft(vector)) {
3515                         if (vector == BP_VECTOR && int3_injected &&
3516                             kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
3517                                 kvm_rip_write(&svm->vcpu,
3518                                               kvm_rip_read(&svm->vcpu) -
3519                                               int3_injected);
3520                         break;
3521                 }
3522                 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
3523                         u32 err = svm->vmcb->control.exit_int_info_err;
3524                         kvm_requeue_exception_e(&svm->vcpu, vector, err);
3525
3526                 } else
3527                         kvm_requeue_exception(&svm->vcpu, vector);
3528                 break;
3529         case SVM_EXITINTINFO_TYPE_INTR:
3530                 kvm_queue_interrupt(&svm->vcpu, vector, false);
3531                 break;
3532         default:
3533                 break;
3534         }
3535 }
3536
3537 static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3538 {
3539         struct vcpu_svm *svm = to_svm(vcpu);
3540         struct vmcb_control_area *control = &svm->vmcb->control;
3541
3542         control->exit_int_info = control->event_inj;
3543         control->exit_int_info_err = control->event_inj_err;
3544         control->event_inj = 0;
3545         svm_complete_interrupts(svm);
3546 }
3547
/*
 * Register-name prefix used to build the inline assembly in svm_vcpu_run():
 * "r" on 64-bit builds (rbx, rcx, ...) and "e" on 32-bit builds (ebx, ...).
 * The macro is undefined again right after the function.
 */
#ifdef CONFIG_X86_64
#define R "r"
#else
#define R "e"
#endif

/*
 * Enter the guest once and collect its state after the exit.
 *
 * Before the run, RAX, RSP, RIP and CR2 are copied from vcpu->arch into the
 * VMCB save area.  The inline assembly loads the remaining general purpose
 * registers from vcpu->arch.regs, executes VMLOAD/VMRUN/VMSAVE, and stores
 * the registers back.  Afterwards host segment state is reloaded, the VMCB
 * copies of CR2/RAX/RSP/RIP are written back to vcpu->arch, and several
 * exit-code dependent fixups (NMI, page fault, machine check) are applied.
 *
 * The function returns without entering the guest when
 * svm->nested.exit_required is set.
 */
static void svm_vcpu_run(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        /* These three registers are taken from the VMCB save area. */
        svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
        svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
        svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];

        /*
         * A vmexit emulation is required before the vcpu can be executed
         * again.
         */
        if (unlikely(svm->nested.exit_required))
                return;

        pre_svm_run(svm);

        sync_lapic_to_cr8(vcpu);

        svm->vmcb->save.cr2 = vcpu->arch.cr2;

        /*
         * NOTE(review): clgi()/stgi() presumably clear/set the global
         * interrupt flag so that enabling local IRQs below does not let
         * interrupts in before the guest runs -- confirm against the helper
         * definitions.
         */
        clgi();

        local_irq_enable();

        asm volatile (
                /* Save the host frame pointer, then load the guest GPRs. */
                "push %%"R"bp; \n\t"
                "mov %c[rbx](%[svm]), %%"R"bx \n\t"
                "mov %c[rcx](%[svm]), %%"R"cx \n\t"
                "mov %c[rdx](%[svm]), %%"R"dx \n\t"
                "mov %c[rsi](%[svm]), %%"R"si \n\t"
                "mov %c[rdi](%[svm]), %%"R"di \n\t"
                "mov %c[rbp](%[svm]), %%"R"bp \n\t"
#ifdef CONFIG_X86_64
                "mov %c[r8](%[svm]),  %%r8  \n\t"
                "mov %c[r9](%[svm]),  %%r9  \n\t"
                "mov %c[r10](%[svm]), %%r10 \n\t"
                "mov %c[r11](%[svm]), %%r11 \n\t"
                "mov %c[r12](%[svm]), %%r12 \n\t"
                "mov %c[r13](%[svm]), %%r13 \n\t"
                "mov %c[r14](%[svm]), %%r14 \n\t"
                "mov %c[r15](%[svm]), %%r15 \n\t"
#endif

                /* Enter guest mode */
                /*
                 * [svm] is passed in the ax register ("a" constraint).  It
                 * is parked on the stack while ax holds svm->vmcb_pa
                 * (presumably the VMCB's physical address) for
                 * VMLOAD/VMRUN/VMSAVE, and popped back afterwards.
                 */
                "push %%"R"ax \n\t"
                "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
                __ex(SVM_VMLOAD) "\n\t"
                __ex(SVM_VMRUN) "\n\t"
                __ex(SVM_VMSAVE) "\n\t"
                "pop %%"R"ax \n\t"

                /* Save guest registers, load host registers */
                "mov %%"R"bx, %c[rbx](%[svm]) \n\t"
                "mov %%"R"cx, %c[rcx](%[svm]) \n\t"
                "mov %%"R"dx, %c[rdx](%[svm]) \n\t"
                "mov %%"R"si, %c[rsi](%[svm]) \n\t"
                "mov %%"R"di, %c[rdi](%[svm]) \n\t"
                "mov %%"R"bp, %c[rbp](%[svm]) \n\t"
#ifdef CONFIG_X86_64
                "mov %%r8,  %c[r8](%[svm]) \n\t"
                "mov %%r9,  %c[r9](%[svm]) \n\t"
                "mov %%r10, %c[r10](%[svm]) \n\t"
                "mov %%r11, %c[r11](%[svm]) \n\t"
                "mov %%r12, %c[r12](%[svm]) \n\t"
                "mov %%r13, %c[r13](%[svm]) \n\t"
                "mov %%r14, %c[r14](%[svm]) \n\t"
                "mov %%r15, %c[r15](%[svm]) \n\t"
#endif
                "pop %%"R"bp"
                :
                /* Inputs: the svm pointer plus compile-time field offsets. */
                : [svm]"a"(svm),
                  [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
                  [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
                  [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
                  [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
                  [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
                  [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
                  [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
#ifdef CONFIG_X86_64
                  , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
                  [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
                  [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
                  [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
                  [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
                  [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
                  [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
                  [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
#endif
                /*
                 * Every register loaded with guest state is listed as
                 * clobbered; bp is not listed because it is saved and
                 * restored by the push/pop pair in the asm itself.
                 */
                : "cc", "memory"
                , R"bx", R"cx", R"dx", R"si", R"di"
#ifdef CONFIG_X86_64
                , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
#endif
                );

        /*
         * Reload host segment state from svm->host: the GS base MSR on
         * 64-bit, the fs (and, unless lazy GS is configured, gs) selectors
         * on 32-bit.
         */
#ifdef CONFIG_X86_64
        wrmsrl(MSR_GS_BASE, svm->host.gs_base);
#else
        loadsegment(fs, svm->host.fs);
#ifndef CONFIG_X86_32_LAZY_GS
        loadsegment(gs, svm->host.gs);
#endif
#endif

        reload_tss(vcpu);

        local_irq_disable();

        /* Copy the guest state held in the VMCB back to vcpu->arch. */
        vcpu->arch.cr2 = svm->vmcb->save.cr2;
        vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
        vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
        vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;

        /* Bracket the stgi() below when the exit was caused by an NMI. */
        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
                kvm_before_handle_nmi(&svm->vcpu);

        stgi();

        /* Any pending NMI will happen here */

        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
                kvm_after_handle_nmi(&svm->vcpu);

        sync_cr8_to_lapic(vcpu);

        svm->next_rip = 0;

        /* A flush requested through tlb_ctl applies to one run only. */
        svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;

        /* if exit due to PF check for async PF */
        if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
                svm->apf_reason = kvm_read_and_reset_pf_reason();

        /* With NPT, mark the cached PDPTRs as neither available nor dirty. */
        if (npt_enabled) {
                vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
                vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
        }

        /*
         * We need to handle MC intercepts here before the vcpu has a chance to
         * change the physical cpu
         */
        if (unlikely(svm->vmcb->control.exit_code ==
                     SVM_EXIT_EXCP_BASE + MC_VECTOR))
                svm_handle_mce(svm);

        /*
         * NOTE(review): presumably resets the VMCB dirty tracking done via
         * mark_dirty() -- confirm against mark_all_clean().
         */
        mark_all_clean(svm->vmcb);
}

#undef R
3705
3706 static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3707 {
3708         struct vcpu_svm *svm = to_svm(vcpu);
3709
3710         svm->vmcb->save.cr3 = root;
3711         mark_dirty(svm->vmcb, VMCB_CR);
3712         svm_flush_tlb(vcpu);
3713 }
3714
3715 static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3716 {
3717         struct vcpu_svm *svm = to_svm(vcpu);
3718
3719         svm->vmcb->control.nested_cr3 = root;
3720         mark_dirty(svm->vmcb, VMCB_NPT);
3721
3722         /* Also sync guest cr3 here in case we live migrate */
3723         svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
3724         mark_dirty(svm->vmcb, VMCB_CR);
3725
3726         svm_flush_tlb(vcpu);
3727 }
3728
3729 static int is_disabled(void)
3730 {
3731         u64 vm_cr;
3732
3733         rdmsrl(MSR_VM_CR, vm_cr);
3734         if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
3735                 return 1;
3736
3737         return 0;
3738 }
3739
/*
 * Write the three-byte VMMCALL instruction (0f 01 d9) to @hypercall.
 *
 * @vcpu is not used.  @hypercall must have room for at least three bytes;
 * bytes beyond the third are left untouched.
 */
static void
svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
        /* Encoding of the VMMCALL instruction. */
        static const unsigned char vmmcall_insn[] = { 0x0f, 0x01, 0xd9 };
        unsigned int i;

        for (i = 0; i < sizeof(vmmcall_insn); i++)
                hypercall[i] = vmmcall_insn[i];
}
3750
/*
 * Processor compatibility check for SVM.
 *
 * @rtn points to an int that receives the result; it is always set to 0.
 */
static void svm_check_processor_compat(void *rtn)
{
        int *result = rtn;

        *result = 0;
}
3755
/* cpu_has_accelerated_tpr hook for SVM: always returns false. */
static bool svm_cpu_has_accelerated_tpr(void)
{
        return false;
}
3760
3761 static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3762 {
3763         return 0;
3764 }
3765
/* cpuid_update hook for SVM: intentionally does nothing. */
static void svm_cpuid_update(struct kvm_vcpu *vcpu)
{
}
3769
3770 static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3771 {
3772         switch (func) {
3773         case 0x80000001:
3774                 if (nested)
3775                         entry->ecx |= (1 << 2); /* Set SVM bit */
3776                 break;
3777         case 0x8000000A:
3778                 entry->eax = 1; /* SVM revision 1 */
3779                 entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
3780                                    ASID emulation to nested SVM */
3781                 entry->ecx = 0; /* Reserved */
3782                 entry->edx = 0; /* Per default do not support any
3783                                    additional features */
3784
3785                 /* Support next_rip if host supports it */
3786                 if (boot_cpu_has(X86_FEATURE_NRIPS))
3787                         entry->edx |= SVM_FEATURE_NRIP;
3788
3789                 /* Support NPT for the guest if enabled */
3790                 if (npt_enabled)
3791                         entry->edx |= SVM_FEATURE_NPT;
3792
3793                 break;
3794         }
3795 }
3796
3797 static const struct trace_print_flags svm_exit_reasons_str[] = {
3798         { SVM_EXIT_READ_CR0,                    "read_cr0" },
3799         { SVM_EXIT_READ_CR3,                    "read_cr3" },
3800         { SVM_EXIT_READ_CR4,                    "read_cr4" },
3801         { SVM_EXIT_READ_CR8,                    "read_cr8" },
3802         { SVM_EXIT_WRITE_CR0,                   "write_cr0" },
3803         { SVM_EXIT_WRITE_CR3,                   "write_cr3" },
3804         { SVM_EXIT_WRITE_CR4,                   "write_cr4" },
3805         { SVM_EXIT_WRITE_CR8,                   "write_cr8" },
3806         { SVM_EXIT_READ_DR0,                    "read_dr0" },
3807         { SVM_EXIT_READ_DR1,                    "read_dr1" },
3808         { SVM_EXIT_READ_DR2,                    "read_dr2" },
3809         { SVM_EXIT_READ_DR3,                    "read_dr3" },
3810         { SVM_EXIT_WRITE_DR0,                   "write_dr0" },
3811         { SVM_EXIT_WRITE_DR1,                   "write_dr1" },
3812         { SVM_EXIT_WRITE_DR2,                   "write_dr2" },
3813         { SVM_EXIT_WRITE_DR3,                   "write_dr3" },
3814         { SVM_EXIT_WRITE_DR5,                   "write_dr5" },
3815         { SVM_EXIT_WRITE_DR7,                   "write_dr7" },
3816         { SVM_EXIT_EXCP_BASE + DB_VECTOR,       "DB excp" },
3817         { SVM_EXIT_EXCP_BASE + BP_VECTOR,       "BP excp" },
3818         { SVM_EXIT_EXCP_BASE + UD_VECTOR,       "UD excp" },
3819         { SVM_EXIT_EXCP_BASE + PF_VECTOR,       "PF excp" },
3820         { SVM_EXIT_EXCP_BASE + NM_VECTOR,       "NM excp" },
3821         { SVM_EXIT_EXCP_BASE + MC_VECTOR,       "MC excp" },
3822         { SVM_EXIT_INTR,                        "interrupt" },
3823         { SVM_EXIT_NMI,                         "nmi" },
3824         { SVM_EXIT_SMI,                         "smi" },
3825         { SVM_EXIT_INIT,                        "init" },
3826         { SVM_EXIT_VINTR,                       "vintr" },
3827         { SVM_EXIT_CPUID,                       "cpuid" },
3828         { SVM_EXIT_INVD,                        "invd" },
3829         { SVM_EXIT_HLT,                         "hlt" },
3830         { SVM_EXIT_INVLPG,                      "invlpg" },
3831         { SVM_EXIT_INVLPGA,                     "invlpga" },
3832         { SVM_EXIT_IOIO,                        "io" },
3833         { SVM_EXIT_MSR,                         "msr" },
3834         { SVM_EXIT_TASK_SWITCH,                 "task_switch" },
3835         { SVM_EXIT_SHUTDOWN,                    "shutdown" },
3836         { SVM_EXIT_VMRUN,                       "vmrun" },
3837         { SVM_EXIT_VMMCALL,                     "hypercall" },
3838         { SVM_EXIT_VMLOAD,                      "vmload" },
3839         { SVM_EXIT_VMSAVE,                      "vmsave" },
3840         { SVM_EXIT_STGI,                        "stgi" },
3841         { SVM_EXIT_CLGI,                        "clgi" },
3842         { SVM_EXIT_SKINIT,                      "skinit" },
3843         { SVM_EXIT_WBINVD,                      "wbinvd" },
3844         { SVM_EXIT_MONITOR,                     "monitor" },
3845         { SVM_EXIT_MWAIT,                       "mwait" },
3846         { SVM_EXIT_XSETBV,                      "xsetbv" },
3847         { SVM_EXIT_NPF,                         "npf" },
3848         { -1, NULL }
3849 };
3850
3851 static int svm_get_lpage_level(void)
3852 {
3853         return PT_PDPE_LEVEL;
3854 }
3855
/* rdtscp_supported hook for SVM: always returns false. */
static bool svm_rdtscp_supported(void)
{
        return false;
}
3860
/* has_wbinvd_exit hook for SVM: always returns true. */
static bool svm_has_wbinvd_exit(void)
{
        return true;
}
3865
3866 static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
3867 {
3868         struct vcpu_svm *svm = to_svm(vcpu);
3869
3870         set_exception_intercept(svm, NM_VECTOR);
3871         update_cr0_intercept(svm);
3872 }
3873
3874 static struct kvm_x86_ops svm_x86_ops = {
3875         .cpu_has_kvm_support = has_svm,
3876         .disabled_by_bios = is_disabled,
3877         .hardware_setup = svm_hardware_setup,
3878         .hardware_unsetup = svm_hardware_unsetup,
3879         .check_processor_compatibility = svm_check_processor_compat,
3880         .hardware_enable = svm_hardware_enable,
3881         .hardware_disable = svm_hardware_disable,
3882         .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
3883
3884         .vcpu_create = svm_create_vcpu,
3885         .vcpu_free = svm_free_vcpu,
3886         .vcpu_reset = svm_vcpu_reset,
3887
3888         .prepare_guest_switch = svm_prepare_guest_switch,
3889         .vcpu_load = svm_vcpu_load,
3890         .vcpu_put = svm_vcpu_put,
3891
3892         .set_guest_debug = svm_guest_debug,
3893         .get_msr = svm_get_msr,
3894         .set_msr = svm_set_msr,
3895         .get_segment_base = svm_get_segment_base,
3896         .get_segment = svm_get_segment,
3897         .set_segment = svm_set_segment,
3898         .get_cpl = svm_get_cpl,
3899         .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
3900         .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
3901         .decache_cr3 = svm_decache_cr3,
3902         .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
3903         .set_cr0 = svm_set_cr0,
3904         .set_cr3 = svm_set_cr3,
3905         .set_cr4 = svm_set_cr4,
3906         .set_efer = svm_set_efer,
3907         .get_idt = svm_get_idt,
3908         .set_idt = svm_set_idt,
3909         .get_gdt = svm_get_gdt,
3910         .set_gdt = svm_set_gdt,
3911         .set_dr7 = svm_set_dr7,
3912         .cache_reg = svm_cache_reg,
3913         .get_rflags = svm_get_rflags,
3914         .set_rflags = svm_set_rflags,
3915         .fpu_activate = svm_fpu_activate,
3916         .fpu_deactivate = svm_fpu_deactivate,
3917
3918         .tlb_flush = svm_flush_tlb,
3919
3920         .run = svm_vcpu_run,
3921         .handle_exit = handle_exit,
3922         .skip_emulated_instruction = skip_emulated_instruction,
3923         .set_interrupt_shadow = svm_set_interrupt_shadow,
3924         .get_interrupt_shadow = svm_get_interrupt_shadow,
3925         .patch_hypercall = svm_patch_hypercall,
3926         .set_irq = svm_set_irq,
3927         .set_nmi = svm_inject_nmi,
3928         .queue_exception = svm_queue_exception,
3929         .cancel_injection = svm_cancel_injection,
3930         .interrupt_allowed = svm_interrupt_allowed,
3931         .nmi_allowed = svm_nmi_allowed,
3932         .get_nmi_mask = svm_get_nmi_mask,
3933         .set_nmi_mask = svm_set_nmi_mask,
3934         .enable_nmi_window = enable_nmi_window,
3935         .enable_irq_window = enable_irq_window,
3936         .update_cr8_intercept = update_cr8_intercept,
3937
3938         .set_tss_addr = svm_set_tss_addr,
3939         .get_tdp_level = get_npt_level,
3940         .get_mt_mask = svm_get_mt_mask,
3941
3942         .get_exit_info = svm_get_exit_info,
3943         .exit_reasons_str = svm_exit_reasons_str,
3944
3945         .get_lpage_level = svm_get_lpage_level,
3946
3947         .cpuid_update = svm_cpuid_update,
3948
3949         .rdtscp_supported = svm_rdtscp_supported,
3950
3951         .set_supported_cpuid = svm_set_supported_cpuid,
3952
3953         .has_wbinvd_exit = svm_has_wbinvd_exit,
3954
3955         .write_tsc_offset = svm_write_tsc_offset,
3956         .adjust_tsc_offset = svm_adjust_tsc_offset,
3957
3958         .set_tdp_cr3 = set_tdp_cr3,
3959 };
3960
3961 static int __init svm_init(void)
3962 {
3963         return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
3964                         __alignof__(struct vcpu_svm), THIS_MODULE);
3965 }
3966
3967 static void __exit svm_exit(void)
3968 {
3969         kvm_exit();
3970 }
3971
3972 module_init(svm_init)
3973 module_exit(svm_exit)